diff mbox series

[v4,06/17] PCI: add SIOV and IMS capability detection

Message ID 160408389228.912050.10790556040702698268.stgit@djiang5-desk3.ch.intel.com
State New
Headers show
Series Add VFIO mediated device support and DEV-MSI support for the idxd driver | expand

Commit Message

Dave Jiang Oct. 30, 2020, 6:51 p.m. UTC
Intel Scalable I/O Virtualization (SIOV) enables sharing of I/O devices
across isolated domains through PASID based sub-device partitioning.
Interrupt Message Storage (IMS) enables devices to store the interrupt
messages in a device-specific optimized manner without the scalability
restrictions of the PCIe defined MSI-X capability. IMS is one of the
features supported under SIOV.

Move SIOV detection code from Intel iommu driver code to common PCI. Making
the detection code common allows supported accelerator drivers to query the
PCI core for SIOV and IMS capabilities. The support code will add the
ability to query the PCI DVSEC capabilities for the SIOV cap.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Baolu Lu <baolu.lu@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ashok Raj <ashok.raj@intel.com>
---
 drivers/iommu/intel/iommu.c   |   31 ++-----------------------
 drivers/pci/Kconfig           |   15 ++++++++++++
 drivers/pci/Makefile          |    2 ++
 drivers/pci/dvsec.c           |   40 +++++++++++++++++++++++++++++++++
 drivers/pci/siov.c            |   50 +++++++++++++++++++++++++++++++++++++++++
 include/linux/pci-siov.h      |   18 +++++++++++++++
 include/linux/pci.h           |    3 ++
 include/uapi/linux/pci_regs.h |    4 +++
 8 files changed, 134 insertions(+), 29 deletions(-)
 create mode 100644 drivers/pci/dvsec.c
 create mode 100644 drivers/pci/siov.c
 create mode 100644 include/linux/pci-siov.h

Comments

Bjorn Helgaas Oct. 30, 2020, 7:51 p.m. UTC | #1
On Fri, Oct 30, 2020 at 11:51:32AM -0700, Dave Jiang wrote:
> Intel Scalable I/O Virtualization (SIOV) enables sharing of I/O devices
> across isolated domains through PASID based sub-device partitioning.
> Interrupt Message Storage (IMS) enables devices to store the interrupt
> messages in a device-specific optimized manner without the scalability
> restrictions of the PCIe defined MSI-X capability. IMS is one of the
> features supported under SIOV.
>
> Move SIOV detection code from Intel iommu driver code to common PCI. Making
> the detection code common allows supported accelerator drivers to query the
> PCI core for SIOV and IMS capabilities. The support code will add the
> ability to query the PCI DVSEC capabilities for the SIOV cap.

This patch really does not include anything related to SIOV other than
adding a little code to *find* the capability.  It doesn't add
anything that actually *uses* it.  I think this patch should simply
add pci_find_dvsec(), and it doesn't need any of this SIOV or IMS
description.

> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: Baolu Lu <baolu.lu@intel.com>
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> Reviewed-by: Ashok Raj <ashok.raj@intel.com>
> ---
>  drivers/iommu/intel/iommu.c   |   31 ++-----------------------
>  drivers/pci/Kconfig           |   15 ++++++++++++
>  drivers/pci/Makefile          |    2 ++
>  drivers/pci/dvsec.c           |   40 +++++++++++++++++++++++++++++++++
>  drivers/pci/siov.c            |   50 +++++++++++++++++++++++++++++++++++++++++
>  include/linux/pci-siov.h      |   18 +++++++++++++++
>  include/linux/pci.h           |    3 ++
>  include/uapi/linux/pci_regs.h |    4 +++
>  8 files changed, 134 insertions(+), 29 deletions(-)
>  create mode 100644 drivers/pci/dvsec.c
>  create mode 100644 drivers/pci/siov.c
>  create mode 100644 include/linux/pci-siov.h
> 
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index 3e77a88b236c..d9335f590b42 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -36,6 +36,7 @@
>  #include <linux/tboot.h>
>  #include <linux/dmi.h>
>  #include <linux/pci-ats.h>
> +#include <linux/pci-siov.h>
>  #include <linux/memblock.h>
>  #include <linux/dma-map-ops.h>
>  #include <linux/dma-direct.h>
> @@ -5883,34 +5884,6 @@ static int intel_iommu_disable_auxd(struct device *dev)
>  	return 0;
>  }
>  
> -/*
> - * A PCI express designated vendor specific extended capability is defined
> - * in the section 3.7 of Intel scalable I/O virtualization technical spec
> - * for system software and tools to detect endpoint devices supporting the
> - * Intel scalable IO virtualization without host driver dependency.
> - *
> - * Returns the address of the matching extended capability structure within
> - * the device's PCI configuration space or 0 if the device does not support
> - * it.
> - */
> -static int siov_find_pci_dvsec(struct pci_dev *pdev)
> -{
> -	int pos;
> -	u16 vendor, id;
> -
> -	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
> -	while (pos) {
> -		pci_read_config_word(pdev, pos + 4, &vendor);
> -		pci_read_config_word(pdev, pos + 8, &id);
> -		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
> -			return pos;
> -
> -		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
> -	}
> -
> -	return 0;
> -}
> -
>  static bool
>  intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
>  {
> @@ -5925,7 +5898,7 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
>  		if (ret < 0)
>  			return false;
>  
> -		return !!siov_find_pci_dvsec(to_pci_dev(dev));
> +		return pci_siov_supported(to_pci_dev(dev));
>  	}
>  
>  	if (feat == IOMMU_DEV_FEAT_SVA) {
> diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
> index 0c473d75e625..cf7f4d17d8cc 100644
> --- a/drivers/pci/Kconfig
> +++ b/drivers/pci/Kconfig
> @@ -161,6 +161,21 @@ config PCI_PASID
>  
>  	  If unsure, say N.
>  
> +config PCI_DVSEC
> +	bool
> +
> +config PCI_SIOV
> +	select PCI_PASID

This patch has nothing to do with PCI_PASID.  If you want to add this
select later in a patch that *does* add something that requires
PCI_PASID, that's OK.

> +	select PCI_DVSEC
> +	bool "PCI SIOV support"
> +	help
> +	  Scalable I/O Virtualzation enables sharing of I/O devices across isolated
> +	  domains through PASID based sub-device partitioning. One of the sub features
> +	  supported by SIOV is Inetrrupt Message Storage (IMS). Select this option if
> +	  you want to compile the support into your kernel.
> +	  If unsure, say N.
> +
>  config PCI_P2PDMA
>  	bool "PCI peer-to-peer transfer support"
>  	depends on ZONE_DEVICE
> diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
> index 522d2b974e91..653a1d69b0fc 100644
> --- a/drivers/pci/Makefile
> +++ b/drivers/pci/Makefile
> @@ -20,6 +20,8 @@ obj-$(CONFIG_PCI_QUIRKS)	+= quirks.o
>  obj-$(CONFIG_HOTPLUG_PCI)	+= hotplug/
>  obj-$(CONFIG_PCI_MSI)		+= msi.o
>  obj-$(CONFIG_PCI_ATS)		+= ats.o
> +obj-$(CONFIG_PCI_DVSEC)		+= dvsec.o
> +obj-$(CONFIG_PCI_SIOV)		+= siov.o
>  obj-$(CONFIG_PCI_IOV)		+= iov.o
>  obj-$(CONFIG_PCI_BRIDGE_EMUL)	+= pci-bridge-emul.o
>  obj-$(CONFIG_PCI_LABEL)		+= pci-label.o
> diff --git a/drivers/pci/dvsec.c b/drivers/pci/dvsec.c
> new file mode 100644
> index 000000000000..e49b079f0717
> --- /dev/null
> +++ b/drivers/pci/dvsec.c
> @@ -0,0 +1,40 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * PCI DVSEC helper functions
> + * Copyright (C) 2020 Intel Corp.
> + */
> +
> +#include <linux/export.h>
> +#include <linux/pci.h>
> +#include <uapi/linux/pci_regs.h>
> +#include "pci.h"
> +
> +/**
> + * pci_find_dvsec - return position of DVSEC with provided vendor and dvsec id
> + * @dev: the PCI device
> + * @vendor: Vendor for the DVSEC
> + * @id: the DVSEC cap id
> + *
> + * Return the offset of DVSEC on success or -ENOTSUPP if not found

s/vendor/Vendor/
s/dvsec/DVSEC/
s/id/ID/ twice above

Please put this function in drivers/pci/pci.c next to
pci_find_ext_capability().  I don't think it's worth making a new file
just for this.

> + */
> +int pci_find_dvsec(struct pci_dev *dev, u16 vendor, u16 id)
> +{
> +	u16 dev_vendor, dev_id;
> +	int pos;
> +
> +	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DVSEC);
> +	if (!pos)
> +		return -ENOTSUPP;
> +
> +	while (pos) {
> +		pci_read_config_word(dev, pos + PCI_DVSEC_HEADER1, &dev_vendor);
> +		pci_read_config_word(dev, pos + PCI_DVSEC_HEADER2, &dev_id);
> +		if (dev_vendor == vendor && dev_id == id)
> +			return pos;
> +
> +		pos = pci_find_next_ext_capability(dev, pos, PCI_EXT_CAP_ID_DVSEC);
> +	}
> +
> +	return -ENOTSUPP;
> +}
> +EXPORT_SYMBOL_GPL(pci_find_dvsec);
> diff --git a/drivers/pci/siov.c b/drivers/pci/siov.c
> new file mode 100644
> index 000000000000..6147e6ae5832
> --- /dev/null
> +++ b/drivers/pci/siov.c
> @@ -0,0 +1,50 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Intel Scalable I/O Virtualization support
> + * Copyright (C) 2020 Intel Corp.
> + */
> +
> +#include <linux/export.h>
> +#include <linux/pci.h>
> +#include <linux/pci-siov.h>
> +#include <uapi/linux/pci_regs.h>
> +#include "pci.h"
> +
> +/*
> + * A PCI express designated vendor specific extended capability is defined
> + * in the section 3.7 of Intel scalable I/O virtualization technical spec
> + * for system software and tools to detect endpoint devices supporting the
> + * Intel scalable IO virtualization without host driver dependency.
> + */
> +
> +/**
> + * pci_siov_supported - check if the device can use SIOV
> + * @dev: the PCI device
> + *
> + * Returns true if the device supports SIOV,  false otherwise.
> + */
> +bool pci_siov_supported(struct pci_dev *dev)
> +{
> +	return pci_find_dvsec(dev, PCI_VENDOR_ID_INTEL, PCI_DVSEC_ID_INTEL_SIOV) < 0 ? false : true;
> +}
> +EXPORT_SYMBOL_GPL(pci_siov_supported);
> +
> +/**
> + * pci_ims_supported - check if the device can use IMS
> + * @dev: the PCI device
> + *
> + * Returns true if the device supports IMS, false otherwise.
> + */
> +bool pci_ims_supported(struct pci_dev *dev)
> +{
> +	int pos;
> +	u32 caps;
> +
> +	pos = pci_find_dvsec(dev, PCI_VENDOR_ID_INTEL, PCI_DVSEC_ID_INTEL_SIOV);
> +	if (pos < 0)
> +		return false;
> +
> +	pci_read_config_dword(dev, pos + PCI_DVSEC_INTEL_SIOV_CAP, &caps);
> +	return (caps & PCI_DVSEC_INTEL_SIOV_CAP_IMS) ? true : false;
> +}
> +EXPORT_SYMBOL_GPL(pci_ims_supported);

I don't really see the point of these *_supported() functions.  If the
caller wants to use them, I would expect it to call
pci_find_dvsec(PCI_DVSEC_ID_INTEL_SIOV) itself anyway.

But there *are* no calls to pci_find_dvsec(PCI_DVSEC_ID_INTEL_SIOV).
So apparently all you care about is whether the capability *exists*,
and you don't need any information at all from the capability
registers except PCI_DVSEC_INTEL_SIOV_CAP_IMS?  That seems a little
weird.

I don't think it's worth adding a whole new file just for this.  The
only value the PCI core is adding here is a way to locate the
PCI_DVSEC_ID_INTEL_SIOV capability.

> diff --git a/include/linux/pci-siov.h b/include/linux/pci-siov.h
> new file mode 100644
> index 000000000000..a8a4eb5f4634
> --- /dev/null
> +++ b/include/linux/pci-siov.h
> @@ -0,0 +1,18 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef LINUX_PCI_SIOV_H
> +#define LINUX_PCI_SIOV_H
> +
> +#include <linux/pci.h>
> +
> +#ifdef CONFIG_PCI_SIOV
> +/* Scalable I/O Virtualization */
> +bool pci_siov_supported(struct pci_dev *dev);
> +bool pci_ims_supported(struct pci_dev *dev);
> +#else /* CONFIG_PCI_SIOV */
> +static inline bool pci_siov_supported(struct pci_dev *d)
> +{ return false; }
> +static inline bool pci_ims_supported(struct pci_dev *d)
> +{ return false; }
> +#endif /* CONFIG_PCI_SIOV */
> +
> +#endif /* LINUX_PCI_SIOV_H */

What's the benefit to putting these declarations in a separate
pci-siov.h as opposed to putting them in pci.h itself?  That's what we
do for things like MSI, IOV, etc.

> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 22207a79762c..4710f09b43b1 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1070,6 +1070,7 @@ int pci_find_next_ext_capability(struct pci_dev *dev, int pos, int cap);
>  int pci_find_ht_capability(struct pci_dev *dev, int ht_cap);
>  int pci_find_next_ht_capability(struct pci_dev *dev, int pos, int ht_cap);
>  struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
> +int pci_find_dvsec(struct pci_dev *dev, u16 vendor, u16 id);
>  
>  u64 pci_get_dsn(struct pci_dev *dev);
>  
> @@ -1726,6 +1727,8 @@ static inline int pci_find_next_capability(struct pci_dev *dev, u8 post,
>  { return 0; }
>  static inline int pci_find_ext_capability(struct pci_dev *dev, int cap)
>  { return 0; }
> +static inline int pci_find_dvsec(struct pci_dev *dev, u16 vendor, u16 id)
> +{ return 0; }
>  
>  static inline u64 pci_get_dsn(struct pci_dev *dev)
>  { return 0; }
> diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
> index 8f8bd2318c6c..3532528441ef 100644
> --- a/include/uapi/linux/pci_regs.h
> +++ b/include/uapi/linux/pci_regs.h
> @@ -1071,6 +1071,10 @@
>  #define PCI_DVSEC_HEADER1		0x4 /* Designated Vendor-Specific Header1 */
>  #define PCI_DVSEC_HEADER2		0x8 /* Designated Vendor-Specific Header2 */
>  
> +#define PCI_DVSEC_ID_INTEL_SIOV		0x5
> +#define PCI_DVSEC_INTEL_SIOV_CAP	0x14
> +#define PCI_DVSEC_INTEL_SIOV_CAP_IMS	0x1

Convention in this file is to write constants in the register width,
e.g.,

  #define PCI_DVSEC_ID_INTEL_SIOV		0x0005
  #define PCI_DVSEC_INTEL_SIOV_CAP_IMS	0x00000001

You can learn this by looking at the surrounding definitions.

>  /* Data Link Feature */
>  #define PCI_DLF_CAP		0x04	/* Capabilities Register */
>  #define  PCI_DLF_EXCHANGE_ENABLE	0x80000000  /* Data Link Feature Exchange Enable */
> 
>
Dave Jiang Oct. 30, 2020, 9:20 p.m. UTC | #2
On 10/30/2020 12:51 PM, Bjorn Helgaas wrote:
> On Fri, Oct 30, 2020 at 11:51:32AM -0700, Dave Jiang wrote:
>> Intel Scalable I/O Virtualization (SIOV) enables sharing of I/O devices
>> across isolated domains through PASID based sub-device partitioning.
>> Interrupt Message Storage (IMS) enables devices to store the interrupt
>> messages in a device-specific optimized manner without the scalability
>> restrictions of the PCIe defined MSI-X capability. IMS is one of the
>> features supported under SIOV.
>>
>> Move SIOV detection code from Intel iommu driver code to common PCI. Making
>> the detection code common allows supported accelerator drivers to query the
>> PCI core for SIOV and IMS capabilities. The support code will add the
>> ability to query the PCI DVSEC capabilities for the SIOV cap.
> 
> This patch really does not include anything related to SIOV other than
> adding a little code to *find* the capability.  It doesn't add
> anything that actually *uses* it.  I think this patch should simply
> add pci_find_dvsec(), and it doesn't need any of this SIOV or IMS
> description.
> 

Thanks for the review Bjorn! I'll carve out a patch with just find_dvsec() and 
apply your comments and recommendations.

So the intel-iommu driver checks for the SIOV cap. And the idxd driver checks 
for SIOV and IMS cap. There will be other upcoming drivers that will check for 
such cap too. It is Intel vendor specific right now, but SIOV is public and 
other vendors may implement to the spec. Is there a good place to put the common 
capability check for that?

There are some other fields in the SIOV dvsec cap, but presently they are not 
being utilized. The idxd driver is only interested in making sure that SIOV and 
IMS (sub feature) support are present at this point.

- Dave

>> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Baolu Lu <baolu.lu@intel.com>
>> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>> Reviewed-by: Ashok Raj <ashok.raj@intel.com>
>> ---
>>   drivers/iommu/intel/iommu.c   |   31 ++-----------------------
>>   drivers/pci/Kconfig           |   15 ++++++++++++
>>   drivers/pci/Makefile          |    2 ++
>>   drivers/pci/dvsec.c           |   40 +++++++++++++++++++++++++++++++++
>>   drivers/pci/siov.c            |   50 +++++++++++++++++++++++++++++++++++++++++
>>   include/linux/pci-siov.h      |   18 +++++++++++++++
>>   include/linux/pci.h           |    3 ++
>>   include/uapi/linux/pci_regs.h |    4 +++
>>   8 files changed, 134 insertions(+), 29 deletions(-)
>>   create mode 100644 drivers/pci/dvsec.c
>>   create mode 100644 drivers/pci/siov.c
>>   create mode 100644 include/linux/pci-siov.h
>>
>> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
>> index 3e77a88b236c..d9335f590b42 100644
>> --- a/drivers/iommu/intel/iommu.c
>> +++ b/drivers/iommu/intel/iommu.c
>> @@ -36,6 +36,7 @@
>>   #include <linux/tboot.h>
>>   #include <linux/dmi.h>
>>   #include <linux/pci-ats.h>
>> +#include <linux/pci-siov.h>
>>   #include <linux/memblock.h>
>>   #include <linux/dma-map-ops.h>
>>   #include <linux/dma-direct.h>
>> @@ -5883,34 +5884,6 @@ static int intel_iommu_disable_auxd(struct device *dev)
>>   	return 0;
>>   }
>>   
>> -/*
>> - * A PCI express designated vendor specific extended capability is defined
>> - * in the section 3.7 of Intel scalable I/O virtualization technical spec
>> - * for system software and tools to detect endpoint devices supporting the
>> - * Intel scalable IO virtualization without host driver dependency.
>> - *
>> - * Returns the address of the matching extended capability structure within
>> - * the device's PCI configuration space or 0 if the device does not support
>> - * it.
>> - */
>> -static int siov_find_pci_dvsec(struct pci_dev *pdev)
>> -{
>> -	int pos;
>> -	u16 vendor, id;
>> -
>> -	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
>> -	while (pos) {
>> -		pci_read_config_word(pdev, pos + 4, &vendor);
>> -		pci_read_config_word(pdev, pos + 8, &id);
>> -		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
>> -			return pos;
>> -
>> -		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
>> -	}
>> -
>> -	return 0;
>> -}
>> -
>>   static bool
>>   intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
>>   {
>> @@ -5925,7 +5898,7 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
>>   		if (ret < 0)
>>   			return false;
>>   
>> -		return !!siov_find_pci_dvsec(to_pci_dev(dev));
>> +		return pci_siov_supported(to_pci_dev(dev));
>>   	}
>>   
>>   	if (feat == IOMMU_DEV_FEAT_SVA) {
>> diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
>> index 0c473d75e625..cf7f4d17d8cc 100644
>> --- a/drivers/pci/Kconfig
>> +++ b/drivers/pci/Kconfig
>> @@ -161,6 +161,21 @@ config PCI_PASID
>>   
>>   	  If unsure, say N.
>>   
>> +config PCI_DVSEC
>> +	bool
>> +
>> +config PCI_SIOV
>> +	select PCI_PASID
> 
> This patch has nothing to do with PCI_PASID.  If you want to add this
> select later in a patch that *does* add something that requires
> PCI_PASID, that's OK.
> 
>> +	select PCI_DVSEC
>> +	bool "PCI SIOV support"
>> +	help
>> +	  Scalable I/O Virtualzation enables sharing of I/O devices across isolated
>> +	  domains through PASID based sub-device partitioning. One of the sub features
>> +	  supported by SIOV is Inetrrupt Message Storage (IMS). Select this option if
>> +	  you want to compile the support into your kernel.
>> +	  If unsure, say N.
>> +
>>   config PCI_P2PDMA
>>   	bool "PCI peer-to-peer transfer support"
>>   	depends on ZONE_DEVICE
>> diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
>> index 522d2b974e91..653a1d69b0fc 100644
>> --- a/drivers/pci/Makefile
>> +++ b/drivers/pci/Makefile
>> @@ -20,6 +20,8 @@ obj-$(CONFIG_PCI_QUIRKS)	+= quirks.o
>>   obj-$(CONFIG_HOTPLUG_PCI)	+= hotplug/
>>   obj-$(CONFIG_PCI_MSI)		+= msi.o
>>   obj-$(CONFIG_PCI_ATS)		+= ats.o
>> +obj-$(CONFIG_PCI_DVSEC)		+= dvsec.o
>> +obj-$(CONFIG_PCI_SIOV)		+= siov.o
>>   obj-$(CONFIG_PCI_IOV)		+= iov.o
>>   obj-$(CONFIG_PCI_BRIDGE_EMUL)	+= pci-bridge-emul.o
>>   obj-$(CONFIG_PCI_LABEL)		+= pci-label.o
>> diff --git a/drivers/pci/dvsec.c b/drivers/pci/dvsec.c
>> new file mode 100644
>> index 000000000000..e49b079f0717
>> --- /dev/null
>> +++ b/drivers/pci/dvsec.c
>> @@ -0,0 +1,40 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * PCI DVSEC helper functions
>> + * Copyright (C) 2020 Intel Corp.
>> + */
>> +
>> +#include <linux/export.h>
>> +#include <linux/pci.h>
>> +#include <uapi/linux/pci_regs.h>
>> +#include "pci.h"
>> +
>> +/**
>> + * pci_find_dvsec - return position of DVSEC with provided vendor and dvsec id
>> + * @dev: the PCI device
>> + * @vendor: Vendor for the DVSEC
>> + * @id: the DVSEC cap id
>> + *
>> + * Return the offset of DVSEC on success or -ENOTSUPP if not found
> 
> s/vendor/Vendor/
> s/dvsec/DVSEC/
> s/id/ID/ twice above
> 
> Please put this function in drivers/pci/pci.c next to
> pci_find_ext_capability().  I don't think it's worth making a new file
> just for this.
> 
>> + */
>> +int pci_find_dvsec(struct pci_dev *dev, u16 vendor, u16 id)
>> +{
>> +	u16 dev_vendor, dev_id;
>> +	int pos;
>> +
>> +	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DVSEC);
>> +	if (!pos)
>> +		return -ENOTSUPP;
>> +
>> +	while (pos) {
>> +		pci_read_config_word(dev, pos + PCI_DVSEC_HEADER1, &dev_vendor);
>> +		pci_read_config_word(dev, pos + PCI_DVSEC_HEADER2, &dev_id);
>> +		if (dev_vendor == vendor && dev_id == id)
>> +			return pos;
>> +
>> +		pos = pci_find_next_ext_capability(dev, pos, PCI_EXT_CAP_ID_DVSEC);
>> +	}
>> +
>> +	return -ENOTSUPP;
>> +}
>> +EXPORT_SYMBOL_GPL(pci_find_dvsec);
>> diff --git a/drivers/pci/siov.c b/drivers/pci/siov.c
>> new file mode 100644
>> index 000000000000..6147e6ae5832
>> --- /dev/null
>> +++ b/drivers/pci/siov.c
>> @@ -0,0 +1,50 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Intel Scalable I/O Virtualization support
>> + * Copyright (C) 2020 Intel Corp.
>> + */
>> +
>> +#include <linux/export.h>
>> +#include <linux/pci.h>
>> +#include <linux/pci-siov.h>
>> +#include <uapi/linux/pci_regs.h>
>> +#include "pci.h"
>> +
>> +/*
>> + * A PCI express designated vendor specific extended capability is defined
>> + * in the section 3.7 of Intel scalable I/O virtualization technical spec
>> + * for system software and tools to detect endpoint devices supporting the
>> + * Intel scalable IO virtualization without host driver dependency.
>> + */
>> +
>> +/**
>> + * pci_siov_supported - check if the device can use SIOV
>> + * @dev: the PCI device
>> + *
>> + * Returns true if the device supports SIOV,  false otherwise.
>> + */
>> +bool pci_siov_supported(struct pci_dev *dev)
>> +{
>> +	return pci_find_dvsec(dev, PCI_VENDOR_ID_INTEL, PCI_DVSEC_ID_INTEL_SIOV) < 0 ? false : true;
>> +}
>> +EXPORT_SYMBOL_GPL(pci_siov_supported);
>> +
>> +/**
>> + * pci_ims_supported - check if the device can use IMS
>> + * @dev: the PCI device
>> + *
>> + * Returns true if the device supports IMS, false otherwise.
>> + */
>> +bool pci_ims_supported(struct pci_dev *dev)
>> +{
>> +	int pos;
>> +	u32 caps;
>> +
>> +	pos = pci_find_dvsec(dev, PCI_VENDOR_ID_INTEL, PCI_DVSEC_ID_INTEL_SIOV);
>> +	if (pos < 0)
>> +		return false;
>> +
>> +	pci_read_config_dword(dev, pos + PCI_DVSEC_INTEL_SIOV_CAP, &caps);
>> +	return (caps & PCI_DVSEC_INTEL_SIOV_CAP_IMS) ? true : false;
>> +}
>> +EXPORT_SYMBOL_GPL(pci_ims_supported);
> 
> I don't really see the point of these *_supported() functions.  If the
> caller wants to use them, I would expect it to call
> pci_find_dvsec(PCI_DVSEC_ID_INTEL_SIOV) itself anyway.
> 
> But there *are* no calls to pci_find_dvsec(PCI_DVSEC_ID_INTEL_SIOV).
> So apparently all you care about is whether the capability *exists*,
> and you don't need any information at all from the capability
> registers except PCI_DVSEC_INTEL_SIOV_CAP_IMS?  That seems a little
> weird.
> 
> I don't think it's worth adding a whole new file just for this.  The
> only value the PCI core is adding here is a way to locate the
> PCI_DVSEC_ID_INTEL_SIOV capability.
> 
>> diff --git a/include/linux/pci-siov.h b/include/linux/pci-siov.h
>> new file mode 100644
>> index 000000000000..a8a4eb5f4634
>> --- /dev/null
>> +++ b/include/linux/pci-siov.h
>> @@ -0,0 +1,18 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef LINUX_PCI_SIOV_H
>> +#define LINUX_PCI_SIOV_H
>> +
>> +#include <linux/pci.h>
>> +
>> +#ifdef CONFIG_PCI_SIOV
>> +/* Scalable I/O Virtualization */
>> +bool pci_siov_supported(struct pci_dev *dev);
>> +bool pci_ims_supported(struct pci_dev *dev);
>> +#else /* CONFIG_PCI_SIOV */
>> +static inline bool pci_siov_supported(struct pci_dev *d)
>> +{ return false; }
>> +static inline bool pci_ims_supported(struct pci_dev *d)
>> +{ return false; }
>> +#endif /* CONFIG_PCI_SIOV */
>> +
>> +#endif /* LINUX_PCI_SIOV_H */
> 
> What's the benefit to putting these declarations in a separate
> pci-siov.h as opposed to putting them in pci.h itself?  That's what we
> do for things like MSI, IOV, etc.
> 
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 22207a79762c..4710f09b43b1 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -1070,6 +1070,7 @@ int pci_find_next_ext_capability(struct pci_dev *dev, int pos, int cap);
>>   int pci_find_ht_capability(struct pci_dev *dev, int ht_cap);
>>   int pci_find_next_ht_capability(struct pci_dev *dev, int pos, int ht_cap);
>>   struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
>> +int pci_find_dvsec(struct pci_dev *dev, u16 vendor, u16 id);
>>   
>>   u64 pci_get_dsn(struct pci_dev *dev);
>>   
>> @@ -1726,6 +1727,8 @@ static inline int pci_find_next_capability(struct pci_dev *dev, u8 post,
>>   { return 0; }
>>   static inline int pci_find_ext_capability(struct pci_dev *dev, int cap)
>>   { return 0; }
>> +static inline int pci_find_dvsec(struct pci_dev *dev, u16 vendor, u16 id)
>> +{ return 0; }
>>   
>>   static inline u64 pci_get_dsn(struct pci_dev *dev)
>>   { return 0; }
>> diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
>> index 8f8bd2318c6c..3532528441ef 100644
>> --- a/include/uapi/linux/pci_regs.h
>> +++ b/include/uapi/linux/pci_regs.h
>> @@ -1071,6 +1071,10 @@
>>   #define PCI_DVSEC_HEADER1		0x4 /* Designated Vendor-Specific Header1 */
>>   #define PCI_DVSEC_HEADER2		0x8 /* Designated Vendor-Specific Header2 */
>>   
>> +#define PCI_DVSEC_ID_INTEL_SIOV		0x5
>> +#define PCI_DVSEC_INTEL_SIOV_CAP	0x14
>> +#define PCI_DVSEC_INTEL_SIOV_CAP_IMS	0x1
> 
> Convention in this file is to write constants in the register width,
> e.g.,
> 
>    #define PCI_DVSEC_ID_INTEL_SIOV		0x0005
>    #define PCI_DVSEC_INTEL_SIOV_CAP_IMS	0x00000001
> 
> You can learn this by looking at the surrounding definitions.
> 
>>   /* Data Link Feature */
>>   #define PCI_DLF_CAP		0x04	/* Capabilities Register */
>>   #define  PCI_DLF_EXCHANGE_ENABLE	0x80000000  /* Data Link Feature Exchange Enable */
>>
>>
Bjorn Helgaas Oct. 30, 2020, 9:50 p.m. UTC | #3
On Fri, Oct 30, 2020 at 02:20:03PM -0700, Dave Jiang wrote:
> 
> 
> On 10/30/2020 12:51 PM, Bjorn Helgaas wrote:
> > On Fri, Oct 30, 2020 at 11:51:32AM -0700, Dave Jiang wrote:
> > > Intel Scalable I/O Virtualization (SIOV) enables sharing of I/O devices
> > > across isolated domains through PASID based sub-device partitioning.
> > > Interrupt Message Storage (IMS) enables devices to store the interrupt
> > > messages in a device-specific optimized manner without the scalability
> > > restrictions of the PCIe defined MSI-X capability. IMS is one of the
> > > features supported under SIOV.
> > > 
> > > Move SIOV detection code from Intel iommu driver code to common PCI. Making
> > > the detection code common allows supported accelerator drivers to query the
> > > PCI core for SIOV and IMS capabilities. The support code will add the
> > > ability to query the PCI DVSEC capabilities for the SIOV cap.
> > 
> > This patch really does not include anything related to SIOV other than
> > adding a little code to *find* the capability.  It doesn't add
> > anything that actually *uses* it.  I think this patch should simply
> > add pci_find_dvsec(), and it doesn't need any of this SIOV or IMS
> > description.
> 
> Thanks for the review Bjorn! I'll carve out a patch with just find_dvsec()
> and apply your comments and recommendations.
> 
> So the intel-iommu driver checks for the SIOV cap. And the idxd driver
> checks for SIOV and IMS cap. There will be other upcoming drivers that will
> check for such cap too. It is Intel vendor specific right now, but SIOV is
> public and other vendors may implement to the spec. Is there a good place to
> put the common capability check for that?

Let's wait and see what that code looks like and figure it out then.
We can always move it to the PCI core if it turns out to be generic.

Right now the code only finds a capability and checks a bit in it.
None of that is anything the PCI core is interested in.

> There are some other fields in the SIOV dvsec cap, but presently they are
> not being utilized. The idxd driver is only interested in making sure that
> SIOV and IMS (sub feature) support are present at this point.

I'm a little dubious about code that checks whether support is present
but doesn't actually *do* anything with that support, but as long as
it's outside the PCI core, that's up to you :)

Bjorn
Jason Gunthorpe Oct. 30, 2020, 10:45 p.m. UTC | #4
On Fri, Oct 30, 2020 at 02:20:03PM -0700, Dave Jiang wrote:
> So the intel-iommu driver checks for the SIOV cap. And the idxd driver
> checks for SIOV and IMS cap. There will be other upcoming drivers that will
> check for such cap too. It is Intel vendor specific right now, but SIOV is
> public and other vendors may implement to the spec. Is there a good place to
> put the common capability check for that?

I'm still really unhappy with these SIOV caps. It was explained this
is just a hack to make up for pci_ims_array_create_msi_irq_domain()
succeeding in VM cases when it doesn't actually work.

Someday this is likely to get fixed, so tying platform behavior to PCI
caps is completely wrong.

This needs to be solved in the platform code,
pci_ims_array_create_msi_irq_domain() should not succeed in these
cases.

Jason
Dave Jiang Oct. 30, 2020, 10:49 p.m. UTC | #5
On 10/30/2020 3:45 PM, Jason Gunthorpe wrote:
> On Fri, Oct 30, 2020 at 02:20:03PM -0700, Dave Jiang wrote:
>> So the intel-iommu driver checks for the SIOV cap. And the idxd driver
>> checks for SIOV and IMS cap. There will be other upcoming drivers that will
>> check for such cap too. It is Intel vendor specific right now, but SIOV is
>> public and other vendors may implement to the spec. Is there a good place to
>> put the common capability check for that?
> 
> I'm still really unhappy with these SIOV caps. It was explained this
> is just a hack to make up for pci_ims_array_create_msi_irq_domain()
> succeeding in VM cases when it doesn't actually work.
> 
> Someday this is likely to get fixed, so tying platform behavior to PCI
> caps is completely wrong.
> 
> This needs to be solved in the platform code,
> pci_ims_array_create_msi_irq_domain() should not succeed in these
> cases.

That sounds reasonable. Are you asking that the IMS cap check should gate the 
success/failure of pci_ims_array_create_msi_irq_domain() rather than the driver?

> 
> Jason
>
Jason Gunthorpe Nov. 2, 2020, 1:21 p.m. UTC | #6
On Fri, Oct 30, 2020 at 03:49:22PM -0700, Dave Jiang wrote:
> 
> 
> On 10/30/2020 3:45 PM, Jason Gunthorpe wrote:
> > On Fri, Oct 30, 2020 at 02:20:03PM -0700, Dave Jiang wrote:
> > > So the intel-iommu driver checks for the SIOV cap. And the idxd driver
> > > checks for SIOV and IMS cap. There will be other upcoming drivers that will
> > > check for such cap too. It is Intel vendor specific right now, but SIOV is
> > > public and other vendors may implement to the spec. Is there a good place to
> > > put the common capability check for that?
> > 
> > I'm still really unhappy with these SIOV caps. It was explained this
> > is just a hack to make up for pci_ims_array_create_msi_irq_domain()
> > succeeding in VM cases when it doesn't actually work.
> > 
> > Someday this is likely to get fixed, so tying platform behavior to PCI
> > caps is completely wrong.
> > 
> > This needs to be solved in the platform code,
> > pci_ims_array_create_msi_irq_domain() should not succeed in these
> > cases.
> 
> That sounds reasonable. Are you asking that the IMS cap check should gate
> the success/failure of pci_ims_array_create_msi_irq_domain() rather than the
> driver?

There shouldn't be an IMS cap at all

As I understand, the problem here is the only way to establish new
VT-d IRQ routing is by trapping and emulating MSI/MSI-X related
activities and triggering routing of the vectors into the guest.

There is a missing hypercall to allow the guest to do this on its own,
presumably it will someday be fixed so IMS can work in guests.

Until the hypercall is added pci_ims_array_create_msi_irq_domain()
should simply fail in guests. No PCI cap check required.

Jason
Tian, Kevin Nov. 3, 2020, 2:49 a.m. UTC | #7
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Monday, November 2, 2020 9:22 PM
> 
> On Fri, Oct 30, 2020 at 03:49:22PM -0700, Dave Jiang wrote:
> >
> >
> > On 10/30/2020 3:45 PM, Jason Gunthorpe wrote:
> > > On Fri, Oct 30, 2020 at 02:20:03PM -0700, Dave Jiang wrote:
> > > > So the intel-iommu driver checks for the SIOV cap. And the idxd driver
> > > > checks for SIOV and IMS cap. There will be other upcoming drivers that
> will
> > > > check for such cap too. It is Intel vendor specific right now, but SIOV is
> > > > public and other vendors may implement to the spec. Is there a good
> place to
> > > > put the common capability check for that?
> > >
> > > I'm still really unhappy with these SIOV caps. It was explained this
> > > is just a hack to make up for pci_ims_array_create_msi_irq_domain()
> > > succeeding in VM cases when it doesn't actually work.
> > >
> > > Someday this is likely to get fixed, so tying platform behavior to PCI
> > > caps is completely wrong.
> > >
> > > This needs to be solved in the platform code,
> > > pci_ims_array_create_msi_irq_domain() should not succeed in these
> > > cases.
> >
> > That sounds reasonable. Are you asking that the IMS cap check should gate
> > the success/failure of pci_ims_array_create_msi_irq_domain() rather than
> the
> > driver?
> 
> There shouldn't be an IMS cap at all
> 
> As I understand, the problem here is the only way to establish new
> VT-d IRQ routing is by trapping and emulating MSI/MSI-X related
> activities and triggering routing of the vectors into the guest.
> 
> There is a missing hypercall to allow the guest to do this on its own,
> presumably it will someday be fixed so IMS can work in guests.

Hypercall is VMM specific, while IMS cap provides a VMM-agnostic
interface so any guest driver (if following the spec) can seamlessly
work on all hypervisors.

Thanks
Kevin
Jason Gunthorpe Nov. 3, 2020, 12:43 p.m. UTC | #8
On Tue, Nov 03, 2020 at 02:49:27AM +0000, Tian, Kevin wrote:

> > There is a missing hypercall to allow the guest to do this on its own,
> > presumably it will someday be fixed so IMS can work in guests.
> 
> Hypercall is VMM specific, while IMS cap provides a VMM-agnostic
> interface so any guest driver (if following the spec) can seamlessly
> work on all hypervisors.

It is a *VMM* issue, not PCI. Adding a PCI cap to describe a VMM issue
is architecturally wrong.

IMS *can not work* in any hypervsior without some special
hypercall. Just block it in the platform code and forget about the PCI
cap.

Jason
Tian, Kevin Nov. 4, 2020, 3:41 a.m. UTC | #9
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, November 3, 2020 8:44 PM
> 
> On Tue, Nov 03, 2020 at 02:49:27AM +0000, Tian, Kevin wrote:
> 
> > > There is a missing hypercall to allow the guest to do this on its own,
> > > presumably it will someday be fixed so IMS can work in guests.
> >
> > Hypercall is VMM specific, while IMS cap provides a VMM-agnostic
> > interface so any guest driver (if following the spec) can seamlessly
> > work on all hypervisors.
> 
> It is a *VMM* issue, not PCI. Adding a PCI cap to describe a VMM issue
> is architecturally wrong.
> 
> IMS *can not work* in any hypervsior without some special
> hypercall. Just block it in the platform code and forget about the PCI
> cap.
> 

It's per-device thing instead of platform thing. If the VMM understands
the IMS format of a specific device and virtualize it to the guest, the
guest can use IMS w/o any hypercall. If the VMM doesn't understand, it
simply clears the IMS cap bit for this device which forces the guest to
use the standard PCI MSI/MSI-X interface. In VMM side the decision is
based on device virtualization knowledge, e.g. in VFIO, instead of in 
platform virtualization logic. Your platform argument is based on the 
hypercall assumption, which is what we want to avoid instead.

Thanks
Kevin
Jason Gunthorpe Nov. 4, 2020, 12:40 p.m. UTC | #10
On Wed, Nov 04, 2020 at 03:41:33AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, November 3, 2020 8:44 PM
> > 
> > On Tue, Nov 03, 2020 at 02:49:27AM +0000, Tian, Kevin wrote:
> > 
> > > > There is a missing hypercall to allow the guest to do this on its own,
> > > > presumably it will someday be fixed so IMS can work in guests.
> > >
> > > Hypercall is VMM specific, while IMS cap provides a VMM-agnostic
> > > interface so any guest driver (if following the spec) can seamlessly
> > > work on all hypervisors.
> > 
> > It is a *VMM* issue, not PCI. Adding a PCI cap to describe a VMM issue
> > is architecturally wrong.
> > 
> > IMS *can not work* in any hypervsior without some special
> > hypercall. Just block it in the platform code and forget about the PCI
> > cap.
> > 
> 
> It's per-device thing instead of platform thing. If the VMM understands
> the IMS format of a specific device and virtualize it to the guest,

Please no! Adding device specific emulation is just going down deeper
into this bad architecture.

Interrupts is a platform issue. Using emulation of MSI to dynamically
insert vectors to a VM was a reasonable, but hacky thing. Now it needs
proper platform support.

Jason
Tian, Kevin Nov. 4, 2020, 1:34 p.m. UTC | #11
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, November 4, 2020 8:40 PM
> 
> On Wed, Nov 04, 2020 at 03:41:33AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Tuesday, November 3, 2020 8:44 PM
> > >
> > > On Tue, Nov 03, 2020 at 02:49:27AM +0000, Tian, Kevin wrote:
> > >
> > > > > There is a missing hypercall to allow the guest to do this on its own,
> > > > > presumably it will someday be fixed so IMS can work in guests.
> > > >
> > > > Hypercall is VMM specific, while IMS cap provides a VMM-agnostic
> > > > interface so any guest driver (if following the spec) can seamlessly
> > > > work on all hypervisors.
> > >
> > > It is a *VMM* issue, not PCI. Adding a PCI cap to describe a VMM issue
> > > is architecturally wrong.
> > >
> > > IMS *can not work* in any hypervsior without some special
> > > hypercall. Just block it in the platform code and forget about the PCI
> > > cap.
> > >
> >
> > It's per-device thing instead of platform thing. If the VMM understands
> > the IMS format of a specific device and virtualize it to the guest,
> 
> Please no! Adding device specific emulation is just going down deeper
> into this bad architecture.
> 
> Interrupts is a platform issue. Using emulation of MSI to dynamically

Interrupt controller is a platform issue. Interrupt source is about device.

> insert vectors to a VM was a reasonable, but hacky thing. Now it needs
> proper platform support.
> 

why is MSI emulation a hacky thing? isn't it defined by PCISIG? I guess
that I must misunderstand your real point here...

Thanks
Kevin
Jason Gunthorpe Nov. 4, 2020, 1:54 p.m. UTC | #12
On Wed, Nov 04, 2020 at 01:34:08PM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Wednesday, November 4, 2020 8:40 PM
> > 
> > On Wed, Nov 04, 2020 at 03:41:33AM +0000, Tian, Kevin wrote:
> > > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > > Sent: Tuesday, November 3, 2020 8:44 PM
> > > >
> > > > On Tue, Nov 03, 2020 at 02:49:27AM +0000, Tian, Kevin wrote:
> > > >
> > > > > > There is a missing hypercall to allow the guest to do this on its own,
> > > > > > presumably it will someday be fixed so IMS can work in guests.
> > > > >
> > > > > Hypercall is VMM specific, while IMS cap provides a VMM-agnostic
> > > > > interface so any guest driver (if following the spec) can seamlessly
> > > > > work on all hypervisors.
> > > >
> > > > It is a *VMM* issue, not PCI. Adding a PCI cap to describe a VMM issue
> > > > is architecturally wrong.
> > > >
> > > > IMS *can not work* in any hypervsior without some special
> > > > hypercall. Just block it in the platform code and forget about the PCI
> > > > cap.
> > > >
> > >
> > > It's per-device thing instead of platform thing. If the VMM understands
> > > the IMS format of a specific device and virtualize it to the guest,
> > 
> > Please no! Adding device specific emulation is just going down deeper
> > into this bad architecture.
> > 
> > Interrupts is a platform issue. Using emulation of MSI to dynamically
> 
> Interrupt controller is a platform issue. Interrupt source is about device.

The interrupt controller is responsible to create an addr/data pair
for an interrupt message. It sets the message format and ensures it
routes to the proper CPU interrupt handler. Everything about the
addr/data pair is owned by the platform interrupt controller.

Devices do not create interrupts. They only trigger the addr/data pair
the platform gives them.

> > insert vectors to a VM was a reasonable, but hacky thing. Now it needs
> > proper platform support.
>
> why is MSI emulation a hacky thing? isn't it defined by PCISIG? I guess
> that I must misunderstand your real point here...

It means the interrupt controller in the VM's platform is a fiction,
the addr/data pairs it creates are not real.

A PCI device assigned to a VM is supposed to be fully contained by the
IOMMU, interrupts included, so there is no reason to do MSI emulation
if the VM's interrupt controller is aware of what addr/data pairs it
can use with the device - eg by getting them through a hypercall. This
is much cleaner and supports things like IMS

Trying to do IMS emulation is nutz, the entire point of IMS is the
device can do what it likes, and emulating that is not going to
feasible. For instance go read the discussion I had with Thomas how a
object-centric device would manage interrupts.

Jason
Tian, Kevin Nov. 6, 2020, 9:48 a.m. UTC | #13
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, November 4, 2020 9:54 PM
> 
> On Wed, Nov 04, 2020 at 01:34:08PM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Wednesday, November 4, 2020 8:40 PM
> > >
> > > On Wed, Nov 04, 2020 at 03:41:33AM +0000, Tian, Kevin wrote:
> > > > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > > > Sent: Tuesday, November 3, 2020 8:44 PM
> > > > >
> > > > > On Tue, Nov 03, 2020 at 02:49:27AM +0000, Tian, Kevin wrote:
> > > > >
> > > > > > > There is a missing hypercall to allow the guest to do this on its own,
> > > > > > > presumably it will someday be fixed so IMS can work in guests.
> > > > > >
> > > > > > Hypercall is VMM specific, while IMS cap provides a VMM-agnostic
> > > > > > interface so any guest driver (if following the spec) can seamlessly
> > > > > > work on all hypervisors.
> > > > >
> > > > > It is a *VMM* issue, not PCI. Adding a PCI cap to describe a VMM
> issue
> > > > > is architecturally wrong.
> > > > >
> > > > > IMS *can not work* in any hypervsior without some special
> > > > > hypercall. Just block it in the platform code and forget about the PCI
> > > > > cap.
> > > > >
> > > >
> > > > It's per-device thing instead of platform thing. If the VMM understands
> > > > the IMS format of a specific device and virtualize it to the guest,
> > >
> > > Please no! Adding device specific emulation is just going down deeper
> > > into this bad architecture.
> > >
> > > Interrupts is a platform issue. Using emulation of MSI to dynamically
> >
> > Interrupt controller is a platform issue. Interrupt source is about device.
> 
> The interrupt controller is responsible to create an addr/data pair
> for an interrupt message. It sets the message format and ensures it
> routes to the proper CPU interrupt handler. Everything about the
> addr/data pair is owned by the platform interrupt controller.
> 
> Devices do not create interrupts. They only trigger the addr/data pair
> the platform gives them.

I guess that we may just view it from different angles. On x86 platform,
a MSI/IMS capable device directly composes interrupt messages, with 
addr/data pair filled by OS. If there is no IOMMU remapping enabled in 
the middle, the message just hits the CPU. Your description possibly
is from software side, e.g. describing the hierarchical IRQ domain
concept?

> 
> > > insert vectors to a VM was a reasonable, but hacky thing. Now it needs
> > > proper platform support.
> >
> > why is MSI emulation a hacky thing? isn't it defined by PCISIG? I guess
> > that I must misunderstand your real point here...
> 
> It means the interrupt controller in the VM's platform is a fiction,
> the addr/data pairs it creates are not real.
> 
> A PCI device assigned to a VM is supposed to be fully contained by the
> IOMMU, interrupts included, so there is no reason to do MSI emulation
> if the VM's interrupt controller is aware of what addr/data pairs it
> can use with the device - eg by getting them through a hypercall. This
> is much cleaner and supports things like IMS

I agree with this point, just as how pci-hyperv.c works. In concept Linux
guest driver should be able to use IMS when running on Hyper-v. There
is no such thing for KVM, but possibly one day we will need similar stuff.
Before that happens the guest could choose to simply disallow devmsi 
by default in the platform code (inventing a hypercall just for 'disable' 
doesn't make sense) and ignore the IMS cap. One small open is whether
this can be done in one central-place. The detection of running as guest
is done in arch-specific code. Do we need disabling devmsi for every arch?

But when talking about virtualization it's not good to assume the guest
behavior. It's perfectly sane to run a guest OS which doesn't implement 
any PV stuff (thus don't know running in a VM) but do support IMS. In 
such scenario the IMS cap allows the hypervisor to educate the guest 
driver to use MSI instead of IMS, as long as the driver follows the device 
spec. In this regard I don't think that the IMS cap will be a short-term 
thing, although Linux may choose to not use it.

> 
> Trying to do IMS emulation is nutz, the entire point of IMS is the
> device can do what it likes, and emulating that is not going to
> feasible. For instance go read the discussion I had with Thomas how a
> object-centric device would manage interrupts.
> 

Do you mind providing the link? There were lots of discussions between
you and Thomas. I failed to locate the exact mail when searching above
keywords. 

Thanks
Kevin
Jason Gunthorpe Nov. 6, 2020, 1:14 p.m. UTC | #14
On Fri, Nov 06, 2020 at 09:48:34AM +0000, Tian, Kevin wrote:
> > The interrupt controller is responsible to create an addr/data pair
> > for an interrupt message. It sets the message format and ensures it
> > routes to the proper CPU interrupt handler. Everything about the
> > addr/data pair is owned by the platform interrupt controller.
> > 
> > Devices do not create interrupts. They only trigger the addr/data pair
> > the platform gives them.
> 
> I guess that we may just view it from different angles. On x86 platform,
> a MSI/IMS capable device directly composes interrupt messages, with 
> addr/data pair filled by OS.

Yes, all platforms work like that. The addr/data pair is *opaque* to
the device. Only the platform interrupt controller component
understands how to form those values.

> If there is no IOMMU remapping enabled in the middle, the message
> just hits the CPU. Your description possibly is from software side,
> e.g. describing the hierarchical IRQ domain concept?

I suppose you could say that. Technically the APIC doesn't form any
addr/data pairs, but the configuration of the APIC, IOMMU and other
platform components define what addr/data pairs are acceptable.

The IRQ domain stuff broadly puts responsibilty to form these values
in the IRQ layer which abstracts all the platform detatils. In Linux
we expect the platform to provide the IRQ Domain tha can specify
working addr/data pairs.

> I agree with this point, just as how pci-hyperv.c works. In concept Linux
> guest driver should be able to use IMS when running on Hyper-v. There
> is no such thing for KVM, but possibly one day we will need similar stuff.
> Before that happens the guest could choose to simply disallow devmsi
> by default in the platform code (inventing a hypercall just for 'disable' 
> doesn't make sense) and ignore the IMS cap. One small open is whether
> this can be done in one central-place. The detection of running as guest
> is done in arch-specific code. Do we need disabling devmsi for every arch?
>
> But when talking about virtualization it's not good to assume the guest
> behavior. It's perfectly sane to run a guest OS which doesn't implement 
> any PV stuff (thus don't know running in a VM) but do support IMS. In 
> such scenario the IMS cap allows the hypervisor to educate the guest 
> driver to use MSI instead of IMS, as long as the driver follows the device 
> spec. In this regard I don't think that the IMS cap will be a short-term 
> thing, although Linux may choose to not use it.

The IMS flag belongs in the platform not in the devices.

For instance you could put a "disable IMS" flag in the ACPI tables, in
the config space of the emuulated root port, or any other areas that
clearly belong to the platform.

The OS logic would be
 - If no IMS information found then use IMS (Bare metal)
 - If the IMS disable flag is found then
   - If (future) hypercall available and the OS knows how to use it
     then use IMS
   - If no hypercall found, or no OS knowledge, fail IMS

Our devices can use IMS even in a pure no-emulation
configurations. Saying that we need to insert complicated security
sensitive emulation just to get IMS in the guest is absolutely crazy.

> Do you mind providing the link? There were lots of discussions between
> you and Thomas. I failed to locate the exact mail when searching above
> keywords. 

Read through these two threads:

https://lore.kernel.org/linux-hyperv/20200821002949.049867339@linutronix.de/
https://lore.kernel.org/dmaengine/159534734833.28840.10067945890695808535.stgit@djiang5-desk3.ch.intel.com/

Jason
Raj, Ashok Nov. 6, 2020, 4:48 p.m. UTC | #15
Hi Jason

On Fri, Nov 06, 2020 at 09:14:15AM -0400, Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 09:48:34AM +0000, Tian, Kevin wrote:
> > > The interrupt controller is responsible to create an addr/data pair
> > > for an interrupt message. It sets the message format and ensures it
> > > routes to the proper CPU interrupt handler. Everything about the
> > > addr/data pair is owned by the platform interrupt controller.
> > > 
> > > Devices do not create interrupts. They only trigger the addr/data pair
> > > the platform gives them.
> > 
> > I guess that we may just view it from different angles. On x86 platform,
> > a MSI/IMS capable device directly composes interrupt messages, with 
> > addr/data pair filled by OS.
> 
> Yes, all platforms work like that. The addr/data pair is *opaque* to
> the device. Only the platform interrupt controller component
> understands how to form those values.

True, the addr/data pair is opaque. IMS doesn't dictate what the contents
of addr/data pair is made of. That is still a platform attribute. IMS simply 
controls where the pair is physically stored. Which only the device dictates.

> 
> > If there is no IOMMU remapping enabled in the middle, the message
> > just hits the CPU. Your description possibly is from software side,
> > e.g. describing the hierarchical IRQ domain concept?
> 
> I suppose you could say that. Technically the APIC doesn't form any
> addr/data pairs, but the configuration of the APIC, IOMMU and other
> platform components define what addr/data pairs are acceptable.
> 
> The IRQ domain stuff broadly puts responsibilty to form these values
> in the IRQ layer which abstracts all the platform detatils. In Linux
> we expect the platform to provide the IRQ Domain tha can specify
> working addr/data pairs.
> 
> > I agree with this point, just as how pci-hyperv.c works. In concept Linux
> > guest driver should be able to use IMS when running on Hyper-v. There
> > is no such thing for KVM, but possibly one day we will need similar stuff.
> > Before that happens the guest could choose to simply disallow devmsi
> > by default in the platform code (inventing a hypercall just for 'disable' 
> > doesn't make sense) and ignore the IMS cap. One small open is whether
> > this can be done in one central-place. The detection of running as guest
> > is done in arch-specific code. Do we need disabling devmsi for every arch?
> >
> > But when talking about virtualization it's not good to assume the guest
> > behavior. It's perfectly sane to run a guest OS which doesn't implement 
> > any PV stuff (thus don't know running in a VM) but do support IMS. In 
> > such scenario the IMS cap allows the hypervisor to educate the guest 
> > driver to use MSI instead of IMS, as long as the driver follows the device 
> > spec. In this regard I don't think that the IMS cap will be a short-term 
> > thing, although Linux may choose to not use it.
> 
> The IMS flag belongs in the platform not in the devices.

This support is mostly a SW thing right? we don't need to muck with
platform/ACPI for that matter. 

> 
> For instance you could put a "disable IMS" flag in the ACPI tables, in
> the config space of the emuulated root port, or any other areas that
> clearly belong to the platform.

Maybe there is a different interpretation for IMS that I'm missing. Devices
that need more interrupt support than supported by PCIe standards, and how
device has grouped the storage needs for the addr/data pair is a device
attribute.

I missed why ACPI tables should carry such information. If kernel doesn't
want to support those devices its within kernel control. Which means kernel
will only use the available MSIx interfaces. This is legacy support.

> 
> The OS logic would be
>  - If no IMS information found then use IMS (Bare metal)
>  - If the IMS disable flag is found then
>    - If (future) hypercall available and the OS knows how to use it
>      then use IMS
>    - If no hypercall found, or no OS knowledge, fail IMS
> 
> Our devices can use IMS even in a pure no-emulation

This is true for IMS as well. But probably not implemented in the kernel as
such. From a HW point of view (take idxd for instance) the facility is
available to native OS as well. The early RFC supported this for native.

Native devices can have both MSIx and IMS capability. But as I understand this
isn't how we have partitioned things in SW today. We left IMS only for
mdev's. And I agree this would be very useful.

In cases where we want to support interrupt handles for user space
notification (when application specifies that in the descriptor). Those
could be IMS. The device HW has support for it.

Remember the "Why PASID in IMS entry" discussion?

https://lore.kernel.org/lkml/20201008233210.GH4734@nvidia.com/

Cheers,
Ashok
Jason Gunthorpe Nov. 6, 2020, 5:51 p.m. UTC | #16
On Fri, Nov 06, 2020 at 08:48:50AM -0800, Raj, Ashok wrote:
> > The IMS flag belongs in the platform not in the devices.
> 
> This support is mostly a SW thing right? we don't need to muck with
> platform/ACPI for that matter. 

Something needs to tell the guest OS platform what to do, so you need
a place to put it.

Putting it in a per-device PCI cap is horrible and hacky from an
architectural perspective.

> I missed why ACPI tables should carry such information. If kernel doesn't
> want to support those devices its within kernel control. Which means kernel
> will only use the available MSIx interfaces. This is legacy support.

The platform flag tells the guest that it can (or can't) support IMS
*at all*

Primarily a guest would be blocked because the VMM provides no way for
the guest to create addr/data pairs.

Has nothing to do with individual devices.

 
> > The OS logic would be
> >  - If no IMS information found then use IMS (Bare metal)
> >  - If the IMS disable flag is found then
> >    - If (future) hypercall available and the OS knows how to use it
> >      then use IMS
> >    - If no hypercall found, or no OS knowledge, fail IMS
> > 
> > Our devices can use IMS even in a pure no-emulation
> 
> This is true for IMS as well. But probably not implemented in the kernel as
> such. From a HW point of view (take idxd for instance) the facility is
> available to native OS as well. The early RFC supported this for native.

I can't follow what you are trying to say here.

Dave said the IMS cap was to indicate that the VMM supported emulation
of IMS so that the VMM can do the MSI addr/data translation as part of
the emulation.

I'm saying emulation will be too horrible for our devices that don't
require *any* emulation.

It is a bad architecture. The platform needs to handle this globally
for all devices, not special hacky emulations things custom made for
every device out there.

> Native devices can have both MSIx and IMS capability. But as I
> understand this isn't how we have partitioned things in SW today. We
> left IMS only for mdev's. And I agree this would be very useful.

That split is just some decision idxd did, we are thinking about doing
other things in our devices.

Jason
Dan Williams Nov. 6, 2020, 11:47 p.m. UTC | #17
On Fri, Nov 6, 2020 at 9:51 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
[..]
> > This is true for IMS as well. But probably not implemented in the kernel as
> > such. From a HW point of view (take idxd for instance) the facility is
> > available to native OS as well. The early RFC supported this for native.
>
> I can't follow what you are trying to say here.

I'm having a hard time following the technical cruxes of this debate.
I grokked your feedback on the original IMS proposal way back at the
beginning of this effort (pre-COVID even!), so maybe I can mediate
here as well. Although, SIOV is that much harder for me to spell than
IMS, so bear with me.

> Dave said the IMS cap was to indicate that the VMM supported emulation
> of IMS so that the VMM can do the MSI addr/data translation as part of
> the emulation.
>
> I'm saying emulation will be too horrible for our devices that don't
> require *any* emulation.

This part I think I understand, i.e. why spend any logic emulating IMS
as MSI since the IMS capability can be a paravirtualized interface
from guest to VMM with none of the compromises that MSI would enforce.
Did I get that right?

> It is a bad architecture. The platform needs to handle this globally
> for all devices, not special hacky emulations things custom made for
> every device out there.

I confess I don't quite understand the shape of what "platform needs
to handle this globally" means, but I understand the desired end
result of "no emulation added where not needed". However, would this
mean that the bare-metal idxd driver can not be used directly in the
guest without modification? For example, as I understand from talking
to Ashok, idxd has some device events like error notification hard
wired to MSI while data patch interrupts are IMS. So even if the IMS
side does not hook up MSI emulation doesn't idxd still need MSI
emulation to reuse the bare metal driver directly?

> > Native devices can have both MSIx and IMS capability. But as I
> > understand this isn't how we have partitioned things in SW today. We
> > left IMS only for mdev's. And I agree this would be very useful.
>
> That split is just some decision idxd did, we are thinking about doing
> other things in our devices.

Where does the collision happen between what you need for a clean
implementation of an IMS-like capability (/me misses his "dev-msi"
name that got thrown out in the Thomas rewrite), and emulation needed
to not have VF special casing in the idxd driver.

Also feel free to straighten me out (Jason or Ashok) if I've botched
the understanding of this.
Jason Gunthorpe Nov. 7, 2020, 12:12 a.m. UTC | #18
On Fri, Nov 06, 2020 at 03:47:00PM -0800, Dan Williams wrote:

> Also feel free to straighten me out (Jason or Ashok) if I've botched
> the understanding of this.

It is pretty simple when you get down to it.

We have a new kernel API that Thomas added:

  pci_subdevice_msi_create_irq_domain()

This creates an IRQ domain that hands out addr/data pairs that
trigger interrupts.

On bare metal the addr/data pairs from the IRQ domain are programmed
into the HW in some HW specific way by the device driver that calls
the above function.

On (kvm) virtualization the addr/data pair the IRQ domain hands out
doesn't work. It is some fake thing.

To make this work on normal MSI/MSI-X the VMM implements emulation of
the standard MSI/MSI-X programming and swaps the fake addr/data pair
for a real one obtained from the hypervisor IRQ domain.

To "deal" with this issue the SIOV spec suggests to add a per-device
PCI Capability that says "IMS works". Which means either:
 - This is bare metal, so of course it works
 - The VMM is trapping and emulating whatever the device specific IMS
   programming is.

The idea being that a VMM can never advertise the IMS cap flag to the
guest unles the VMM provides a device specific driver that does device
specific emulation to capture the addr/data pair. Remeber IMS doesn't
say how to program the addr/data pair! Every device is unique!

On something like IDXD this emulation is not so hard, on something
like mlx5 this is completely unworkable. Further we never do
emulation on our devices, they always pass native hardware through,
even for SIOV-like cases.

In the end pci_subdevice_msi_create_irq_domain() is a platform
function. Either it should work completely on every device with no
device-specific emulation required in the VMM, or it should not work
at all and return -EOPNOTSUPP.

The only sane way to implement this generically is for the VMM to
provide a hypercall to obtain a real *working* addr/data pair(s) and
then have the platform hand those out from
pci_subdevice_msi_create_irq_domain(). 

All IMS device drivers will work correctly. No VMM device emulation is
ever needed to translate addr/data pairs.

Earlier in this thread Kevin said hyper-v is already working this way,
even for MSI/MSI-X. To me this says it is fundamentally a KVM platform
problem and it should not be solved by PCI capability flags.

Jason
Thomas Gleixner Nov. 7, 2020, 12:32 a.m. UTC | #19
On Fri, Nov 06 2020 at 09:48, Kevin Tian wrote:
>> From: Jason Gunthorpe <jgg@nvidia.com>
>> On Wed, Nov 04, 2020 at 01:34:08PM +0000, Tian, Kevin wrote:
>> The interrupt controller is responsible to create an addr/data pair
>> for an interrupt message. It sets the message format and ensures it
>> routes to the proper CPU interrupt handler. Everything about the
>> addr/data pair is owned by the platform interrupt controller.
>> 
>> Devices do not create interrupts. They only trigger the addr/data pair
>> the platform gives them.
>
> I guess that we may just view it from different angles. On x86 platform,
> a MSI/IMS capable device directly composes interrupt messages, with 
> addr/data pair filled by OS. If there is no IOMMU remapping enabled in 
> the middle, the message just hits the CPU. Your description possibly
> is from software side, e.g. describing the hierarchical IRQ domain
> concept?

No. The device composes nothing. If the interrupt is raised in the
device then the MSI block sends the message which was composed by the OS
and stored in the device's message store. For PCI/MSI that's the MSI or
MSIX table and for IMS that's either on device memory (as IDXD uses) or
some completely different location which Jason described.

This has absolutely nothing to do with the X86 platform. MSI is a
architecture independent mechanism: Send whatever the OS put into the
storage to raise an interrupt in the CPU. The device does neither know
whether that message is going to be intercepted by an interrupt
remapping unit or not.

Stop claiming that any of this has anything to do with x86. It has
absolutely nothing to do with x86 and looking at MSI from an x86
perspective instead of looking at it from the architecture agnostic
technical reality of MSI is the reason why we have this discussion at
all.

We had a similar discussion vs. the way how IMS interrupts have to be
dealt with in terms of irq domains. Can you finally stop looking at
everything as a big x86/intel/platform lump and understand that things
are very well structured and seperated both at the hardware and at the
software level? 

> Do you mind providing the link? There were lots of discussions between
> you and Thomas. I failed to locate the exact mail when searching above
> keywords. 

In this thread: 20200821002424.119492231@linutronix.de and you were on
Cc

Thanks,

        tglx
Dan Williams Nov. 7, 2020, 1:42 a.m. UTC | #20
On Fri, Nov 6, 2020 at 4:12 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Fri, Nov 06, 2020 at 03:47:00PM -0800, Dan Williams wrote:
[..]
> The only sane way to implement this generically is for the VMM to
> provide a hypercall to obtain a real *working* addr/data pair(s) and
> then have the platform hand those out from
> pci_subdevice_msi_create_irq_domain().

Yeah, that seems a logical attach point for this magic. Appreciate you
taking the time to lay it out.
Raj, Ashok Nov. 8, 2020, 6:11 p.m. UTC | #21
Hi Jason

Thanks, its now clear what you had mentioned earlier.

I had couple questions/clarifications below. Thanks for working 
through this.

On Fri, Nov 06, 2020 at 08:12:07PM -0400, Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 03:47:00PM -0800, Dan Williams wrote:
> 
> > Also feel free to straighten me out (Jason or Ashok) if I've botched
> > the understanding of this.
> 
> It is pretty simple when you get down to it.
> 
> We have a new kernel API that Thomas added:
> 
>   pci_subdevice_msi_create_irq_domain()
> 
> This creates an IRQ domain that hands out addr/data pairs that
> trigger interrupts.
> 
> On bare metal the addr/data pairs from the IRQ domain are programmed
> into the HW in some HW specific way by the device driver that calls
> the above function.
> 
> On (kvm) virtualization the addr/data pair the IRQ domain hands out
> doesn't work. It is some fake thing.

Is it really some fake thing? I thought the vCPU and vector are real
for a guest, and VMM ensures when interrupts are delivered they are either.

1. Handled by VMM first and then injected to guest
2. Handled in a Posted Interrupt manner, and injected to guest
   when it resumes. It can be delivered directly if guest was running
   when the interrupt arrived.

> 
> To make this work on normal MSI/MSI-X the VMM implements emulation of
> the standard MSI/MSI-X programming and swaps the fake addr/data pair
> for a real one obtained from the hypervisor IRQ domain.
> 
> To "deal" with this issue the SIOV spec suggests to add a per-device
> PCI Capability that says "IMS works". Which means either:
>  - This is bare metal, so of course it works
>  - The VMM is trapping and emulating whatever the device specific IMS
>    programming is.
> 
> The idea being that a VMM can never advertise the IMS cap flag to the
> guest unles the VMM provides a device specific driver that does device
> specific emulation to capture the addr/data pair. Remeber IMS doesn't
> say how to program the addr/data pair! Every device is unique!
> 
> On something like IDXD this emulation is not so hard, on something
> like mlx5 this is completely unworkable. Further we never do
> emulation on our devices, they always pass native hardware through,
> even for SIOV-like cases.

So is that true for interrupts too? Possibly you have the interrupt
entries sitting in memory resident on the device? Don't we need the 
VMM to ensure they are brokered by VMM in either one of the two ways 
above? What if the guest creates some addr in the 0xfee... range
how do we take care of interrupt remapping and such without any VMM 
assist?

Its probably a gap in my understanding. 

> 
> In the end pci_subdevice_msi_create_irq_domain() is a platform
> function. Either it should work completely on every device with no
> device-specific emulation required in the VMM, or it should not work
> at all and return -EOPNOTSUPP.
> 
> The only sane way to implement this generically is for the VMM to
> provide a hypercall to obtain a real *working* addr/data pair(s) and
> then have the platform hand those out from
> pci_subdevice_msi_create_irq_domain(). 
> 
> All IMS device drivers will work correctly. No VMM device emulation is
> ever needed to translate addr/data pairs.
> 

That's true. Probably this can work the same even for MSIx types too then?

When we do interrupt remapping support in guest which would be required 
if we support x2apic in guest, I think this is something we should look into more 
carefully to make this work.

One criteria that we generally tried to follow is driver that runs in host
and guest are the same, and if needed they need some functionality make it
work around some capability  detection so the alternate path can be plummed in
a generic way. 

I agree with the overall idea and we should certainly take that into consideration
when we need IMS in guest support and in context of interrupt remapping.

Hopefully I understood the overall concept. If I mis-understood any of this
please let me know.

Cheers,
Ashok
David Woodhouse Nov. 8, 2020, 6:34 p.m. UTC | #22
On Sun, 2020-11-08 at 10:11 -0800, Raj, Ashok wrote:
> Hi Jason
> 
> Thanks, its now clear what you had mentioned earlier.
> 
> I had couple questions/clarifications below. Thanks for working 
> through this.
> 
> On Fri, Nov 06, 2020 at 08:12:07PM -0400, Jason Gunthorpe wrote:
> > On Fri, Nov 06, 2020 at 03:47:00PM -0800, Dan Williams wrote:
> > 
> > > Also feel free to straighten me out (Jason or Ashok) if I've botched
> > > the understanding of this.
> > 
> > It is pretty simple when you get down to it.
> > 
> > We have a new kernel API that Thomas added:
> > 
> >   pci_subdevice_msi_create_irq_domain()
> > 
> > This creates an IRQ domain that hands out addr/data pairs that
> > trigger interrupts.
> > 
> > On bare metal the addr/data pairs from the IRQ domain are programmed
> > into the HW in some HW specific way by the device driver that calls
> > the above function.
> > 
> > On (kvm) virtualization the addr/data pair the IRQ domain hands out
> > doesn't work. It is some fake thing.
> 
> Is it really some fake thing? I thought the vCPU and vector are real
> for a guest, and VMM ensures when interrupts are delivered they are either.
> 
> 1. Handled by VMM first and then injected to guest
> 2. Handled in a Posted Interrupt manner, and injected to guest
>    when it resumes. It can be delivered directly if guest was running
>    when the interrupt arrived.
> 
> > 
> > To make this work on normal MSI/MSI-X the VMM implements emulation of
> > the standard MSI/MSI-X programming and swaps the fake addr/data pair
> > for a real one obtained from the hypervisor IRQ domain.
> > 
> > To "deal" with this issue the SIOV spec suggests to add a per-device
> > PCI Capability that says "IMS works". Which means either:
> >  - This is bare metal, so of course it works
> >  - The VMM is trapping and emulating whatever the device specific IMS
> >    programming is.
> > 
> > The idea being that a VMM can never advertise the IMS cap flag to the
> > guest unles the VMM provides a device specific driver that does device
> > specific emulation to capture the addr/data pair. Remeber IMS doesn't
> > say how to program the addr/data pair! Every device is unique!
> > 
> > On something like IDXD this emulation is not so hard, on something
> > like mlx5 this is completely unworkable. Further we never do
> > emulation on our devices, they always pass native hardware through,
> > even for SIOV-like cases.
> 
> So is that true for interrupts too? Possibly you have the interrupt
> entries sitting in memory resident on the device? Don't we need the 
> VMM to ensure they are brokered by VMM in either one of the two ways 
> above? What if the guest creates some addr in the 0xfee... range
> how do we take care of interrupt remapping and such without any VMM 
> assist?
> 
> Its probably a gap in my understanding. 
> 
> > 
> > In the end pci_subdevice_msi_create_irq_domain() is a platform
> > function. Either it should work completely on every device with no
> > device-specific emulation required in the VMM, or it should not work
> > at all and return -EOPNOTSUPP.
> > 
> > The only sane way to implement this generically is for the VMM to
> > provide a hypercall to obtain a real *working* addr/data pair(s) and
> > then have the platform hand those out from
> > pci_subdevice_msi_create_irq_domain(). 
> > 
> > All IMS device drivers will work correctly. No VMM device emulation is
> > ever needed to translate addr/data pairs.
> > 
> 
> That's true. Probably this can work the same even for MSIx types too then?
> 
> When we do interrupt remapping support in guest which would be required 
> if we support x2apic in guest, I think this is something we should look into more 
> carefully to make this work.

No, interrupt remapping is not required for X2APIC in guests

They can have X2APIC and up to 32768 CPUs without needing interrupt
remapping at all. Only if they want more than 32768 vCPUs, or to do
nested virtualisation and actually remap for the benefit of *their*
(L2+) guests would they need IR.
Thomas Gleixner Nov. 8, 2020, 6:47 p.m. UTC | #23
On Fri, Nov 06 2020 at 20:12, Jason Gunthorpe wrote:
> All IMS device drivers will work correctly. No VMM device emulation is
> ever needed to translate addr/data pairs.
>
> Earlier in this thread Kevin said hyper-v is already working this way,
> even for MSI/MSI-X. To me this says it is fundamentally a KVM platform
> problem and it should not be solved by PCI capability flags.

I mostly agree but want to add a few clarifications about the
terminology and the boundaries because I think there is where lot of the
confusion comes from.

Let me go back to the basic structure both at the hardware and at the
software level.

The basic structure is:

  [CPU] -- [Bridge] -- Bus -- [Device]

This applies to all kind of buses where the bridge directly translates
into the CPUs address space. Now let's look at the boundaries:

                |
                |
  [CPU] -- [Bri | dge] -- Bus -- [Device]
                |   
                |

The boundary is in the middle of the bridge because the CPU side of the
bridge is obviously CPU and therefore architecture specific. The Bus
side of the bridge is architecture agnostic.

Now let's add an IOMMU:

  [CPU] -- [IOMMU] -- [Bridge] -- Bus -- [Device]

and in theory the boundary moves now to:

               |
               |
  [CPU] -- [IO | MMU] -- [Bridge] -- Bus -- [Device]
               |
               |

because with an IOMMU the bridge could become CPU and architecture
agnostic. In reality this is not the case as the bridge is still the
same thing.

Now let's look at MSI. As established above, the Bus and the Device are
CPU and architecture agnostic and the Device merily uses a composed
message which is stored at some place accessible to the device to send
that message when it raises an interrupt. So where is this message
composed?

The basic case:

                   |
                   |
  [CPU]    -- [Bri | dge] -- Bus -- [Device]
                   |
  Alloc +           
  Compose                   Store     Use

The Bridge is irrelevant here as it just is involved in the
transport. Nevertheless the Bridge is only transport in the view of the
interrupt subsystem.

The IOMMU case:

               |
               |
  [CPU] -- [IO | MMU] -- [Bridge] -- Bus -- [Device]
               |
            Alloc +
  Alloc     Compose                 Store     Use


That's exactly reflected in hierarchical irq domains:

                       |
                       |
  [CPU]        -- [Bri | dge] --    Bus    -- [Device]
                       |   
  Alloc +           
  Compose                         Store        Use

  Vectordomain                   Busdomain

and:

                     |
                     |
  [CPU]       -- [IO | MMU]  -- [Bridge] --    Bus    -- [Device]
                     |
                  Alloc +   
  Alloc           Compose                    Store       Use

  Vectordomain   Remapdomain                Busdomain


Now if we look at the virtualization scenario and device hand through
then the structure in the guest view is not any different from the basic
case. This works with PCI-MSI[X] and the IDXD IMS variant because the
hypervisor can trap the access to the storage and translate the message:

                   |
                   |
  [CPU]    -- [Bri | dge] -- Bus -- [Device]
                   |
  Alloc +
  Compose                   Store     Use
                             |
                             | Trap
                             v
                             Hypervisor translates and stores

But obviously with an IMS storage location which is software controlled
by the guest side driver (the case Jason is interested in) the above
cannot work for obvious reasons.

That means the guest needs a way to ask the hypervisor for a proper
translation, i.e. a hypercall. Now where to do that? Looking at the
above remapping case it's pretty obvious:


                     |
                     |
  [CPU]       -- [VI | RT]  -- [Bridge] --    Bus    -- [Device]
                     |
  Alloc          "Compose"                   Store         Use

  Vectordomain   HCALLdomain                Busdomain
                 |        ^
                 |        |
                 v        | 
            Hypervisor    
               Alloc + Compose

Why? Because it reflects the boundaries and leaves the busdomain part
agnostic as it should be. And it works for _all_ variants of Busdomains.

Now the question which I can't answer is whether this can work correctly
in terms of isolation. If the IMS storage is in guest memory (queue
storage) then the guest driver can obviously write random crap into it
which the device will happily send. (For MSI and IDXD style IMS it
still can trap the store).

Is the IOMMU/Interrupt remapping unit able to catch such messages which
go outside the space to which the guest is allowed to signal to? If yes,
problem solved. If no, then IMS storage in guest memory can't ever work.

Coming back to this:

> In the end pci_subdevice_msi_create_irq_domain() is a platform
> function. Either it should work completely on every device with no
> device-specific emulation required in the VMM, or it should not work
> at all and return -EOPNOTSUPP.

The subdevice domain is a 'Busdomain' according to the structure
above. It does not and should never have any clue about the underlying
system. It's in the agnostic part and always works. It simply does not
care what's underneath. So it won't return -EOPNOTSUPP.

What it has to do is to transport the IMS in queue memory requirement to
the underlying parent domain.

So in case that the HCALL domain is missing, the Vector domain needs
return an error code on domain creation. If the HCALL domain is there
then the domain creation works and in case of actual interrupt
allocation the hypercall either returns a valid composed message or an
appropriate error code.

But there's a catch:

This only works when the guest OS actually knows that it runs in a
VM. If the guest can't figure that out, i.e. via CPUID, this cannot be
solved because from the guest OS view that's the same as running on bare
metal. Obviously on bare metal the Vector domain can and must handle
this.

So this needs some thought.

Thanks,

        tglx
David Woodhouse Nov. 8, 2020, 7:36 p.m. UTC | #24
On Sun, 2020-11-08 at 19:47 +0100, Thomas Gleixner wrote:
> This only works when the guest OS actually knows that it runs in a
> VM. If the guest can't figure that out, i.e. via CPUID, this cannot be
> solved because from the guest OS view that's the same as running on bare
> metal. Obviously on bare metal the Vector domain can and must handle
> this.
> 
> So this needs some thought.

The problem here is that Intel implemented interrupt remapping in a way
which is anathema to structured, ordered IRQ domains.

When a guest writes an MSI message (addr/data) to the MSI table of a
PCI device which has been assigned to that guest, it *doesn't* properly
inherit the MSI composition from a parent irqdomain which knows about
the (host-side) IOMMU.

What actually happens is the hypervisor *traps* the writes to the
device's MSI table, and translates them *then*. In *precisely* the
fashion which we're trying to avoid for IMS.

Now, you can imagine a world where it wasn't like this, where
Remappable Format MSI messages don't exist, and where we let guests
write native MSI message to the device without trapping — and where the
IOMMU then sees the incoming interrupt and has to map the APIC ID to a
*virtual* CPU for that guest, based on the PCI source-id of the device.

In that world, IMS would work naturally. But that isn't how Intel
designed interrupt remapping. They *designed* to have to trap and
translate as the message is written to the device.

So it does look like we're going to need a hypercall interface to
compose an MSI message on behalf of the guest, for IMS to use. In fact
PCI devices assigned to a guest could use that too, and then we'd only
need to trap-and-remap any attempt to write a Compatibility Format MSI
to the device's MSI table, while letting Remappable Format messages get
written directly.

We'd also need a way for an OS running on bare metal to *know* that
it's on bare metal and can just compose MSI messages for itself. Since
we do expect bare metal to have an IOMMU, perhaps that is just a
feature flag on the IOMMU?

That or Intel needs to fix the IOMMU to do proper virtualisation and
actually translate "Compatibility Format" MSIs for a guest too.
Thomas Gleixner Nov. 8, 2020, 9:18 p.m. UTC | #25
On Fri, Nov 06 2020 at 09:14, Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 09:48:34AM +0000, Tian, Kevin wrote:
> For instance you could put a "disable IMS" flag in the ACPI tables, in
> the config space of the emuulated root port, or any other areas that
> clearly belong to the platform.
>
> The OS logic would be
>  - If no IMS information found then use IMS (Bare metal)
>  - If the IMS disable flag is found then
>    - If (future) hypercall available and the OS knows how to use it
>      then use IMS
>    - If no hypercall found, or no OS knowledge, fail IMS

That does not work because an older hypervisor would not have that
disable flag and the guest kernel would assume to be on bare metal (if
no other indicators are there).

Thanks

        tglx
David Woodhouse Nov. 8, 2020, 10:09 p.m. UTC | #26
> On Fri, Nov 06 2020 at 09:14, Jason Gunthorpe wrote:
>> On Fri, Nov 06, 2020 at 09:48:34AM +0000, Tian, Kevin wrote:
>> For instance you could put a "disable IMS" flag in the ACPI tables, in
>> the config space of the emuulated root port, or any other areas that
>> clearly belong to the platform.
>>
>> The OS logic would be
>>  - If no IMS information found then use IMS (Bare metal)
>>  - If the IMS disable flag is found then
>>    - If (future) hypercall available and the OS knows how to use it
>>      then use IMS
>>    - If no hypercall found, or no OS knowledge, fail IMS
>
> That does not work because an older hypervisor would not have that
> disable flag and the guest kernel would assume to be on bare metal (if
> no other indicators are there).

In the absence of a forward-thinking design from Intel perhaps we could
use the existence of an IOMMU with interrupt remapping and not caching
mode as the indication that it's bare metal?
Thomas Gleixner Nov. 8, 2020, 10:47 p.m. UTC | #27
On Sun, Nov 08 2020 at 19:36, David Woodhouse wrote:
> On Sun, 2020-11-08 at 19:47 +0100, Thomas Gleixner wrote:
>> So this needs some thought.
>
> The problem here is that Intel implemented interrupt remapping in a way
> which is anathema to structured, ordered IRQ domains.
>
> When a guest writes an MSI message (addr/data) to the MSI table of a
> PCI device which has been assigned to that guest, it *doesn't* properly
> inherit the MSI composition from a parent irqdomain which knows about
> the (host-side) IOMMU.
>
> What actually happens is the hypervisor *traps* the writes to the
> device's MSI table, and translates them *then*.

That's what I showed in the ascii art :)

> In *precisely* the fashion which we're trying to avoid for IMS.

At least for the IMS variant where the storage is not in trappable
device memory.

> Now, you can imagine a world where it wasn't like this, where
> Remappable Format MSI messages don't exist, and where we let guests
> write native MSI message to the device without trapping — and where the
> IOMMU then sees the incoming interrupt and has to map the APIC ID to a
> *virtual* CPU for that guest, based on the PCI source-id of the
> device.

That would be not convoluted enough and make too much sense.

> In that world, IMS would work naturally. But that isn't how Intel
> designed interrupt remapping. They *designed* to have to trap and
> translate as the message is written to the device.
>
> So it does look like we're going to need a hypercall interface to
> compose an MSI message on behalf of the guest, for IMS to use. In fact
> PCI devices assigned to a guest could use that too, and then we'd only
> need to trap-and-remap any attempt to write a Compatibility Format MSI
> to the device's MSI table, while letting Remappable Format messages get
> written directly.

Yes, if we have the HCALL domain then the message composed by the
hypervisor is valid for everything not only IMS. That's why I left out
any specifics on the Busdomain side. It does not matter which kind of
bus that is. The only mechanics which is provided by the busdomain is
to store the precomposed message and eventually provide mask/unmask at
that level.

> We'd also need a way for an OS running on bare metal to *know* that
> it's on bare metal and can just compose MSI messages for itself. Since
> we do expect bare metal to have an IOMMU, perhaps that is just a
> feature flag on the IOMMU?

There are still CPUs w/o IOMMU out there and new ones are shipped.

So you would basically mandate that IMS with memory storage can only
work on bare metal when the CPU has an IOMMU.

Jason said in [1]: "For x86 I think we could accept linking this to
IOMMU, if really necessary."

OTOH, what's the chance that a guest runs on something which

  1) Does not have X86_FEATURE_HYPERVISOR set in cpuid 1/EDX

and

  2) Cannot be identified as Xen domain

and

  3) Does not have a DMI vendor entry which identifies the
     virtualization solution (we don't use that today, but
     adding that table is trivial enough)

and

  4) Has such an IMS device passed through?

Possible, yes. Likely, no. Do we care?

> That or Intel needs to fix the IOMMU to do proper virtualisation and
> actually translate "Compatibility Format" MSIs for a guest too.

Is that going to happen before I retire?

Thanks,

        tglx

[1] https://lore.kernel.org/r/20200822005125.GB1152540@nvidia.com
Thomas Gleixner Nov. 8, 2020, 10:52 p.m. UTC | #28
On Sun, Nov 08 2020 at 22:09, David Woodhouse wrote:

>> On Fri, Nov 06 2020 at 09:14, Jason Gunthorpe wrote:
>>> On Fri, Nov 06, 2020 at 09:48:34AM +0000, Tian, Kevin wrote:
>>> For instance you could put a "disable IMS" flag in the ACPI tables, in
>>> the config space of the emuulated root port, or any other areas that
>>> clearly belong to the platform.
>>>
>>> The OS logic would be
>>>  - If no IMS information found then use IMS (Bare metal)
>>>  - If the IMS disable flag is found then
>>>    - If (future) hypercall available and the OS knows how to use it
>>>      then use IMS
>>>    - If no hypercall found, or no OS knowledge, fail IMS
>>
>> That does not work because an older hypervisor would not have that
>> disable flag and the guest kernel would assume to be on bare metal (if
>> no other indicators are there).
>
> In the absence of a forward-thinking design from Intel perhaps we could

Just to be fair the AMD interrupt remapping is not any better in that
regard.

Thanks,

        tglx
Jason Gunthorpe Nov. 8, 2020, 11:23 p.m. UTC | #29
On Sun, Nov 08, 2020 at 07:47:24PM +0100, Thomas Gleixner wrote:
> 
> That means the guest needs a way to ask the hypervisor for a proper
> translation, i.e. a hypercall. Now where to do that? Looking at the
> above remapping case it's pretty obvious:
> 
> 
>                      |
>                      |
>   [CPU]       -- [VI | RT]  -- [Bridge] --    Bus    -- [Device]
>                      |
>   Alloc          "Compose"                   Store         Use
> 
>   Vectordomain   HCALLdomain                Busdomain
>                  |        ^
>                  |        |
>                  v        | 
>             Hypervisor    
>                Alloc + Compose

Yes, this will describes what I have been thinking

> Now the question which I can't answer is whether this can work correctly
> in terms of isolation. If the IMS storage is in guest memory (queue
> storage) then the guest driver can obviously write random crap into it
> which the device will happily send. (For MSI and IDXD style IMS it
> still can trap the store).

There are four cases of interest here:

 1) Bare metal, PF and VF devices just deliver whatever addr/data pairs
    to the APIC. IMS works perfectly with pci_subdevice_msi_create_irq_domain()

 2) SRIOV VF assigned to the guest.

    The guest can cause any MemWr TLP to any addr/data pair
    and the iommu/platform/vmm is supposed to use the
    Bus/device/function to isolate & secure the interrupt address
    range.

    IMS can work in the guest if the guest knows the details of the
    address range and can make hypercalls to setup routing. So
    pci_subdevice_msi_create_irq_domain() works if the hypercalls
    exist and fails if they don't.

 3) SIOV sub device assigned to the guest.

    The difference between SIOV and SRIOV is the device must attach a
    PASID to every TLP triggered by the guest. Logically we'd expect
    when IMS is used in this situation the interrupt MemWr is tagged
    with bus/device/function/PASID to uniquly ID the guest and the same
    security protection scheme from #2 applies.

 4) SIOV sub device assigned to the guest, but with emulation.

    This SIOV device cannot tag interrupts with PASID so cannot do #2
    (or the platform cannot recieve a PASID tagged interrupt message).

    Since the interrupts are being delivered with TLPs pointing at the
    hypervisor the only solution is for the hypervisor to exclusively
    control the interrupt table. MSI table like emulation for IMS is
    needed and the hypervisor will use pci_subdevice_msi_create_irq_domain()
    to get the real interrupts.

    pci_subdevice_msi_create_irq_domain() needs to return the 'fake'
    addr/data pairs which are actually an ABI between the guest and
    hypervisor carried in the hidden hypercall of the emulation.
    (ie it works like MSI works today)

IDXD is worring about case #4, I think, but I didn't follow in that
whole discussion about the IMS table layout if they PASID tag the IMS
MemWr or not?? Ashok can you clarify?

> Is the IOMMU/Interrupt remapping unit able to catch such messages which
> go outside the space to which the guest is allowed to signal to? If yes,
> problem solved. If no, then IMS storage in guest memory can't ever work.

Right. Only PASID on the interrupt messages can resolve this securely.

> So in case that the HCALL domain is missing, the Vector domain needs
> return an error code on domain creation. If the HCALL domain is there
> then the domain creation works and in case of actual interrupt
> allocation the hypercall either returns a valid composed message or an
> appropriate error code.

Yes
 
> But there's a catch:
> 
> This only works when the guest OS actually knows that it runs in a
> VM. If the guest can't figure that out, i.e. via CPUID, this cannot be
> solved because from the guest OS view that's the same as running on bare
> metal. Obviously on bare metal the Vector domain can and must handle
> this.

Yes

The flip side is today, the way pci_subdevice_msi_create_irq_domain()
works a VF using it on baremetal will succeed and if that same VF is
assigned to a guest then pci_subdevice_msi_create_irq_domain()
succeeds but the interrupt never comes - so the driver is broken.

Yes, if we add some ACPI/etc flag that is not going to magically fix
old kvm's, but at least *something* exists that works right and
generically.

If we follow Intel's path then we need special KVM level support for
*every* device, PCI cap mangling and so on. Forever. Sounds horrible
to me..

This feels like one of these things where no matter what we do
something is broken. Picking the least breakage is the challenge here.

> So this needs some thought.

I think your HAOLL diagram is the only sane architecture.

If go that way then case #4 will still work, in this case the HCALL
will return addr/data pairs that conform to what the emulation
expects. Or fail if the VMM can't do emulation for the device.

Jason
Raj, Ashok Nov. 8, 2020, 11:25 p.m. UTC | #30
On Sun, Nov 08, 2020 at 06:34:55PM +0000, David Woodhouse wrote:
> > 
> > When we do interrupt remapping support in guest which would be required 
> > if we support x2apic in guest, I think this is something we should look into more 
> > carefully to make this work.
> 
> No, interrupt remapping is not required for X2APIC in guests
> 
> They can have X2APIC and up to 32768 CPUs without needing interrupt

How is this made available today without interrupt remapping? 

I thought without IR, the destination ID is still limited to only 8 bits?

On native, even if you have less than 255 cpu's but the APICID are sparsly 
distributed due to platform rules, the x2apic id could be more than 8 bits. 
Which is why the spec requires IR when x2apic is enabled.

> remapping at all. Only if they want more than 32768 vCPUs, or to do
> nested virtualisation and actually remap for the benefit of *their*
> (L2+) guests would they need IR.
Jason Gunthorpe Nov. 8, 2020, 11:29 p.m. UTC | #31
On Sun, Nov 08, 2020 at 11:47:13PM +0100, Thomas Gleixner wrote:

> OTOH, what's the chance that a guest runs on something which
> 
>   1) Does not have X86_FEATURE_HYPERVISOR set in cpuid 1/EDX
> 
> and
> 
>   2) Cannot be identified as Xen domain
> 
> and
> 
>   3) Does not have a DMI vendor entry which identifies the
>      virtualization solution (we don't use that today, but
>      adding that table is trivial enough)
> 
> and
> 
>   4) Has such an IMS device passed through?
> 
> Possible, yes. Likely, no. Do we care?

This is exactly my thinking too. IMS is still very new, if we add some
platform flag to disable it then yes there are broken cases but enough
options for an unlucky user to deal with it:

 - Have their VMM set X86_FEATURE_HYPERVISOR
 - Updating the VMM to set the global disable flag
 - Add some "disable_subdevice_msi" kernel comand line flag in the guest

In exchange we get a much cleaner architecture for the next 10 years..

Jason
Raj, Ashok Nov. 8, 2020, 11:36 p.m. UTC | #32
Hi Jason,

On Sun, Nov 08, 2020 at 07:23:41PM -0400, Jason Gunthorpe wrote:
> 
> IDXD is worring about case #4, I think, but I didn't follow in that
> whole discussion about the IMS table layout if they PASID tag the IMS
> MemWr or not?? Ashok can you clarify?
> 

The PASID in the interrupt store is for the IDXD to verify the interrupt handle
that came with the ENQCMD. User applications can obtain an interrupt handle and
ask for interrupt to be generated for transactions submitted via ENQCMD.

IDXD will compare the PASID that came with ENQCMD and verify if the PASID matches
the one stored in the Interrupt Table before generating the MemWr.

So MemWr for interrupts remains unchanged for IDXD on the wire. PASID is present in interrupt
store because the value was programmed by user space, and needs OS/hardware to ensure 
the entity asking for interrupts has ownership for the interrupt handle.
Jason Gunthorpe Nov. 8, 2020, 11:41 p.m. UTC | #33
On Sun, Nov 08, 2020 at 10:11:24AM -0800, Raj, Ashok wrote:

> > On (kvm) virtualization the addr/data pair the IRQ domain hands out
> > doesn't work. It is some fake thing.
> 
> Is it really some fake thing? I thought the vCPU and vector are real
> for a guest, and VMM ensures when interrupts are delivered they are either.

It is fake in the sense it is programmed into no hardware.
 
It is real in the sense it is an ABI contract with the VMM.

> > On something like IDXD this emulation is not so hard, on something
> > like mlx5 this is completely unworkable. Further we never do
> > emulation on our devices, they always pass native hardware through,
> > even for SIOV-like cases.
> 
> So is that true for interrupts too? 

There is no *mlx5* emulation. We ride on the generic MSI emulation KVM
is going.

> Possibly you have the interrupt entries sitting in memory resident
> on the device?

For SRIOV, yes. The appeal of IMS is to move away from that.

> Don't we need the VMM to ensure they are brokered by VMM in either
> one of the two ways above?

Yes, no matter what the VMM has to know the guest wants an interrupt
routed in and setup the VMM part of the equation. With SRIOV this is
all done with the MSI trapping.

> What if the guest creates some addr in the 0xfee... range how do we
> take care of interrupt remapping and such without any VMM assist?

Not sure I understand this?

> That's true. Probably this can work the same even for MSIx types too then?

Yes, once you have the ability to hypercall to create the addr/data
pair then it can work with MSI and the VMM can stop emulation. It
would be a nice bit of uniformity to close this, but switching the VMM
from legacy to new mode is going to be tricky, I fear.

> I agree with the overall idea and we should certainly take that into
> consideration when we need IMS in guest support and in context of
> interrupt remapping.

The issue with things, as they sit now, is SRIOV.

If any driver starts using pci_subdevice_msi_create_irq_domain() then
it fails if the VF is assigned to a guest with SRVIO. This is a real
and important, use case for many devices today!

The "solution" can't be to go back and retroactively change every
shipping device to add PCI capability blocks, and ensure that every
existing VMM strips them out before assigning the device (including
Hyper-V!!)  :(

Jason
Raj, Ashok Nov. 8, 2020, 11:58 p.m. UTC | #34
Hi Thomas,

[-] Jing, She isn't working at Intel anymore.

Now this is getting compiled as a book :-).. Thanks a ton!

One question on the hypercall case that isn't immediately
clear to me.

On Sun, Nov 08, 2020 at 07:47:24PM +0100, Thomas Gleixner wrote:
> 
> 
> Now if we look at the virtualization scenario and device hand through
> then the structure in the guest view is not any different from the basic
> case. This works with PCI-MSI[X] and the IDXD IMS variant because the
> hypervisor can trap the access to the storage and translate the message:
> 
>                    |
>                    |
>   [CPU]    -- [Bri | dge] -- Bus -- [Device]
>                    |
>   Alloc +
>   Compose                   Store     Use
>                              |
>                              | Trap
>                              v
>                              Hypervisor translates and stores
> 

The above case, VMM is responsible for writing to the message
store. In both cases if its IMS or Legacy MSI/MSIx. VMM handles
the writes to the device interrupt region and to the IRTE tables.

> But obviously with an IMS storage location which is software controlled
> by the guest side driver (the case Jason is interested in) the above
> cannot work for obvious reasons.
> 
> That means the guest needs a way to ask the hypervisor for a proper
> translation, i.e. a hypercall. Now where to do that? Looking at the
> above remapping case it's pretty obvious:
> 
> 
>                      |
>                      |
>   [CPU]       -- [VI | RT]  -- [Bridge] --    Bus    -- [Device]
>                      |
>   Alloc          "Compose"                   Store         Use
> 
>   Vectordomain   HCALLdomain                Busdomain
>                  |        ^
>                  |        |
>                  v        | 
>             Hypervisor    
>                Alloc + Compose
> 
> Why? Because it reflects the boundaries and leaves the busdomain part
> agnostic as it should be. And it works for _all_ variants of Busdomains.
> 
> Now the question which I can't answer is whether this can work correctly
> in terms of isolation. If the IMS storage is in guest memory (queue
> storage) then the guest driver can obviously write random crap into it
> which the device will happily send. (For MSI and IDXD style IMS it
> still can trap the store).

The isolation problem is not just the guest memory being used as interrrupt
store right? If the Store to device region is not trapped and controlled by 
VMM, there is no gaurantee the guest OS has done the right thing?


Thinking about it, guest memory might be more problematic since its not
trappable and VMM can't enforce what is written. This is something that
needs more attension. But for now the devices supporting memory on device
the trap and store by VMM seems to satisfy the security properties you
highlight here.

> 
> Is the IOMMU/Interrupt remapping unit able to catch such messages which
> go outside the space to which the guest is allowed to signal to? If yes,
> problem solved. If no, then IMS storage in guest memory can't ever work.

This can probably work for SRIOV devices where guest owns the entire device.
interrupt remap does have RID checks if interrupt arrives at an Interrupt handle
not allocated for that BDF.

But for SIOV devices there is no PASID filtering at the remap level since
interrupt messages don't carry PASID in the TLP.

> 
> Coming back to this:
> 
> > In the end pci_subdevice_msi_create_irq_domain() is a platform
> > function. Either it should work completely on every device with no
> > device-specific emulation required in the VMM, or it should not work
> > at all and return -EOPNOTSUPP.
> 
> The subdevice domain is a 'Busdomain' according to the structure
> above. It does not and should never have any clue about the underlying
> system. It's in the agnostic part and always works. It simply does not
> care what's underneath. So it won't return -EOPNOTSUPP.
> 
> What it has to do is to transport the IMS in queue memory requirement to
> the underlying parent domain.
> 
> So in case that the HCALL domain is missing, the Vector domain needs
> return an error code on domain creation. If the HCALL domain is there
> then the domain creation works and in case of actual interrupt
> allocation the hypercall either returns a valid composed message or an
> appropriate error code.
> 
> But there's a catch:
> 
> This only works when the guest OS actually knows that it runs in a
> VM. If the guest can't figure that out, i.e. via CPUID, this cannot be

Precicely!. It might work if the OS is new, but for legacy the trap-emulate
seems both safe and works for legacy as well?


Cheers,
Ashok
Raj, Ashok Nov. 9, 2020, 12:05 a.m. UTC | #35
Hi Jason

On Sun, Nov 08, 2020 at 07:41:42PM -0400, Jason Gunthorpe wrote:
> On Sun, Nov 08, 2020 at 10:11:24AM -0800, Raj, Ashok wrote:
> 
> > > On (kvm) virtualization the addr/data pair the IRQ domain hands out
> > > doesn't work. It is some fake thing.
> > 
> > Is it really some fake thing? I thought the vCPU and vector are real
> > for a guest, and VMM ensures when interrupts are delivered they are either.
> 
> It is fake in the sense it is programmed into no hardware.
>  
> It is real in the sense it is an ABI contract with the VMM.

Ah.. its clear now. That clears up my question below as well.

> 
> Yes, no matter what the VMM has to know the guest wants an interrupt
> routed in and setup the VMM part of the equation. With SRIOV this is
> all done with the MSI trapping.
> 
> > What if the guest creates some addr in the 0xfee... range how do we
> > take care of interrupt remapping and such without any VMM assist?
> 
> Not sure I understand this?
> 

My question was based on mis-conception that interrupt entries are directly
written by guest OS for mlx*. My concern was about security isolation if guest OS
has full control of device interrupt store. 

I think you clarified it, that interrupts still are marshalled by the VMM
and not in direct control of guest OS. That makes my question moot.

Cheers,
Ashok
Tian, Kevin Nov. 9, 2020, 5:25 a.m. UTC | #36
> -----Original Message-----
> From: Thomas Gleixner <tglx@linutronix.de>
> Sent: Saturday, November 7, 2020 8:32 AM
> To: Tian, Kevin <kevin.tian@intel.com>; Jason Gunthorpe <jgg@nvidia.com>
> Cc: Jiang, Dave <dave.jiang@intel.com>; Bjorn Helgaas <helgaas@kernel.org>;
> vkoul@kernel.org; Dey, Megha <megha.dey@intel.com>; maz@kernel.org;
> bhelgaas@google.com; alex.williamson@redhat.com; Pan, Jacob jun
> <jacob.jun.pan@intel.com>; Raj, Ashok <ashok.raj@intel.com>; Liu, Yi L
> <yi.l.liu@intel.com>; Lu, Baolu <baolu.lu@intel.com>; Kumar, Sanjay K
> <sanjay.k.kumar@intel.com>; Luck, Tony <tony.luck@intel.com>;
> jing.lin@intel.com; Williams, Dan J <dan.j.williams@intel.com>;
> kwankhede@nvidia.com; eric.auger@redhat.com; parav@mellanox.com;
> rafael@kernel.org; netanelg@mellanox.com; shahafs@mellanox.com;
> yan.y.zhao@linux.intel.com; pbonzini@redhat.com; Ortiz, Samuel
> <samuel.ortiz@intel.com>; Hossain, Mona <mona.hossain@intel.com>;
> dmaengine@vger.kernel.org; linux-kernel@vger.kernel.org; linux-
> pci@vger.kernel.org; kvm@vger.kernel.org
> Subject: RE: [PATCH v4 06/17] PCI: add SIOV and IMS capability detection
> 
> On Fri, Nov 06 2020 at 09:48, Kevin Tian wrote:
> >> From: Jason Gunthorpe <jgg@nvidia.com>
> >> On Wed, Nov 04, 2020 at 01:34:08PM +0000, Tian, Kevin wrote:
> >> The interrupt controller is responsible to create an addr/data pair
> >> for an interrupt message. It sets the message format and ensures it
> >> routes to the proper CPU interrupt handler. Everything about the
> >> addr/data pair is owned by the platform interrupt controller.
> >>
> >> Devices do not create interrupts. They only trigger the addr/data pair
> >> the platform gives them.
> >
> > I guess that we may just view it from different angles. On x86 platform,
> > a MSI/IMS capable device directly composes interrupt messages, with
> > addr/data pair filled by OS. If there is no IOMMU remapping enabled in
> > the middle, the message just hits the CPU. Your description possibly
> > is from software side, e.g. describing the hierarchical IRQ domain
> > concept?
> 
> No. The device composes nothing. If the interrupt is raised in the
> device then the MSI block sends the message which was composed by the OS
> and stored in the device's message store. For PCI/MSI that's the MSI or
> MSIX table and for IMS that's either on device memory (as IDXD uses) or
> some completely different location which Jason described.

Sorry being inaccurate here. I actually meant the same thing as
you described since I did mention addr/data pair filled by OS. 
Unfortunately I mistakenly thought that 'compose' has similar
meaning to 'send' in English but clearly it's not and instead it's
just about the message content. and for sure I also agree with your
other clarifications regarding to architecture independent  manner.

Thanks
Kevin

> 
> This has absolutely nothing to do with the X86 platform. MSI is a
> architecture independent mechanism: Send whatever the OS put into the
> storage to raise an interrupt in the CPU. The device does neither know
> whether that message is going to be intercepted by an interrupt
> remapping unit or not.
> 
> Stop claiming that any of this has anything to do with x86. It has
> absolutely nothing to do with x86 and looking at MSI from an x86
> perspective instead of looking at it from the architecture agnostic
> technical reality of MSI is the reason why we have this discussion at
> all.
> 
> We had a similar discussion vs. the way how IMS interrupts have to be
> dealt with in terms of irq domains. Can you finally stop looking at
> everything as a big x86/intel/platform lump and understand that things
> are very well structured and seperated both at the hardware and at the
> software level?
> 
> > Do you mind providing the link? There were lots of discussions between
> > you and Thomas. I failed to locate the exact mail when searching above
> > keywords.
> 
> In this thread: 20200821002424.119492231@linutronix.de and you were on
> Cc
> 
> Thanks,
> 
>         tglx
>
Tian, Kevin Nov. 9, 2020, 7:37 a.m. UTC | #37
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Monday, November 9, 2020 7:24 AM
> 
> On Sun, Nov 08, 2020 at 07:47:24PM +0100, Thomas Gleixner wrote:
> >
> > That means the guest needs a way to ask the hypervisor for a proper
> > translation, i.e. a hypercall. Now where to do that? Looking at the
> > above remapping case it's pretty obvious:
> >
> >
> >                      |
> >                      |
> >   [CPU]       -- [VI | RT]  -- [Bridge] --    Bus    -- [Device]
> >                      |
> >   Alloc          "Compose"                   Store         Use
> >
> >   Vectordomain   HCALLdomain                Busdomain
> >                  |        ^
> >                  |        |
> >                  v        |
> >             Hypervisor
> >                Alloc + Compose
> 
> Yes, this will describes what I have been thinking

Agree

> 
> > Now the question which I can't answer is whether this can work correctly
> > in terms of isolation. If the IMS storage is in guest memory (queue
> > storage) then the guest driver can obviously write random crap into it
> > which the device will happily send. (For MSI and IDXD style IMS it
> > still can trap the store).
> 
> There are four cases of interest here:
> 
>  1) Bare metal, PF and VF devices just deliver whatever addr/data pairs
>     to the APIC. IMS works perfectly with
> pci_subdevice_msi_create_irq_domain()
> 
>  2) SRIOV VF assigned to the guest.
> 
>     The guest can cause any MemWr TLP to any addr/data pair
>     and the iommu/platform/vmm is supposed to use the
>     Bus/device/function to isolate & secure the interrupt address
>     range.
> 
>     IMS can work in the guest if the guest knows the details of the
>     address range and can make hypercalls to setup routing. So
>     pci_subdevice_msi_create_irq_domain() works if the hypercalls
>     exist and fails if they don't.
> 
>  3) SIOV sub device assigned to the guest.
> 
>     The difference between SIOV and SRIOV is the device must attach a
>     PASID to every TLP triggered by the guest. Logically we'd expect
>     when IMS is used in this situation the interrupt MemWr is tagged
>     with bus/device/function/PASID to uniquly ID the guest and the same
>     security protection scheme from #2 applies.

Unfortunately no. Intel VT-d only treats MemWr w/o PASID to 0xFEExxxxx
as interrupt request. MemWr w/ PASID, even to 0xFEE, is translated
normally through DMA remapping page table. I don't know other IOMMU
vendors. But at least on Intel platform such device would not get the 
desired effect, since the IOMMU only guarantees interrupt isolation in 
BDF-level.

Does your device already implement such capability? We can bring this 
request back to the hardware team. 

> 
>  4) SIOV sub device assigned to the guest, but with emulation.
> 
>     This SIOV device cannot tag interrupts with PASID so cannot do #2
>     (or the platform cannot recieve a PASID tagged interrupt message).
> 
>     Since the interrupts are being delivered with TLPs pointing at the
>     hypervisor the only solution is for the hypervisor to exclusively
>     control the interrupt table. MSI table like emulation for IMS is
>     needed and the hypervisor will use
> pci_subdevice_msi_create_irq_domain()
>     to get the real interrupts.
> 
>     pci_subdevice_msi_create_irq_domain() needs to return the 'fake'
>     addr/data pairs which are actually an ABI between the guest and
>     hypervisor carried in the hidden hypercall of the emulation.
>     (ie it works like MSI works today)
> 
> IDXD is worring about case #4, I think, but I didn't follow in that
> whole discussion about the IMS table layout if they PASID tag the IMS
> MemWr or not?? Ashok can you clarify?
> 
> > Is the IOMMU/Interrupt remapping unit able to catch such messages which
> > go outside the space to which the guest is allowed to signal to? If yes,
> > problem solved. If no, then IMS storage in guest memory can't ever work.
> 
> Right. Only PASID on the interrupt messages can resolve this securely.
> 
> > So in case that the HCALL domain is missing, the Vector domain needs
> > return an error code on domain creation. If the HCALL domain is there
> > then the domain creation works and in case of actual interrupt
> > allocation the hypercall either returns a valid composed message or an
> > appropriate error code.
> 
> Yes
> 
> > But there's a catch:
> >
> > This only works when the guest OS actually knows that it runs in a
> > VM. If the guest can't figure that out, i.e. via CPUID, this cannot be
> > solved because from the guest OS view that's the same as running on bare
> > metal. Obviously on bare metal the Vector domain can and must handle
> > this.
> 
> Yes
> 
> The flip side is today, the way pci_subdevice_msi_create_irq_domain()
> works a VF using it on baremetal will succeed and if that same VF is
> assigned to a guest then pci_subdevice_msi_create_irq_domain()
> succeeds but the interrupt never comes - so the driver is broken.

Yes, this is the main worry here. While all agree that using hypercall is 
the proper way to virtualize IMS, how to disable it when hypercall is
not available is a more urgent demand at current stage.

> 
> Yes, if we add some ACPI/etc flag that is not going to magically fix
> old kvm's, but at least *something* exists that works right and
> generically.

Agree. We can work together on this definition. 

btw in reality such ACPI extension doesn't exist yet, which likely will
take some time. In the meantime we already have pending usages 
like IDXD. Do you suggest holding these patches until we get ASWG 
to accept the extension, or accept using Intel IMS cap as a vendor
specific mitigation to move forward while the platform flag is being 
worked on? Anyway the IMS cap is already defined and can help fix 
some broken cases.

> 
> If we follow Intel's path then we need special KVM level support for
> *every* device, PCI cap mangling and so on. Forever. Sounds horrible
> to me..
> 
> This feels like one of these things where no matter what we do
> something is broken. Picking the least breakage is the challenge here.
> 
> > So this needs some thought.
> 
> I think your HAOLL diagram is the only sane architecture.
> 
> If go that way then case #4 will still work, in this case the HCALL
> will return addr/data pairs that conform to what the emulation
> expects. Or fail if the VMM can't do emulation for the device.
> 

Thanks
Kevin
Tian, Kevin Nov. 9, 2020, 7:59 a.m. UTC | #38
> From: Raj, Ashok <ashok.raj@intel.com>
> Sent: Monday, November 9, 2020 7:59 AM
> 
> Hi Thomas,
> 
> [-] Jing, She isn't working at Intel anymore.
> 
> Now this is getting compiled as a book :-).. Thanks a ton!
> 
> One question on the hypercall case that isn't immediately
> clear to me.
> 
> On Sun, Nov 08, 2020 at 07:47:24PM +0100, Thomas Gleixner wrote:
> >
> >
> > Now if we look at the virtualization scenario and device hand through
> > then the structure in the guest view is not any different from the basic
> > case. This works with PCI-MSI[X] and the IDXD IMS variant because the
> > hypervisor can trap the access to the storage and translate the message:
> >
> >                    |
> >                    |
> >   [CPU]    -- [Bri | dge] -- Bus -- [Device]
> >                    |
> >   Alloc +
> >   Compose                   Store     Use
> >                              |
> >                              | Trap
> >                              v
> >                              Hypervisor translates and stores
> >
> 
> The above case, VMM is responsible for writing to the message
> store. In both cases if its IMS or Legacy MSI/MSIx. VMM handles
> the writes to the device interrupt region and to the IRTE tables.
> 
> > But obviously with an IMS storage location which is software controlled
> > by the guest side driver (the case Jason is interested in) the above
> > cannot work for obvious reasons.
> >
> > That means the guest needs a way to ask the hypervisor for a proper
> > translation, i.e. a hypercall. Now where to do that? Looking at the
> > above remapping case it's pretty obvious:
> >
> >
> >                      |
> >                      |
> >   [CPU]       -- [VI | RT]  -- [Bridge] --    Bus    -- [Device]
> >                      |
> >   Alloc          "Compose"                   Store         Use
> >
> >   Vectordomain   HCALLdomain                Busdomain
> >                  |        ^
> >                  |        |
> >                  v        |
> >             Hypervisor
> >                Alloc + Compose
> >
> > Why? Because it reflects the boundaries and leaves the busdomain part
> > agnostic as it should be. And it works for _all_ variants of Busdomains.
> >
> > Now the question which I can't answer is whether this can work correctly
> > in terms of isolation. If the IMS storage is in guest memory (queue
> > storage) then the guest driver can obviously write random crap into it
> > which the device will happily send. (For MSI and IDXD style IMS it
> > still can trap the store).
> 
> The isolation problem is not just the guest memory being used as interrrupt
> store right? If the Store to device region is not trapped and controlled by
> VMM, there is no gaurantee the guest OS has done the right thing?
> 
> 
> Thinking about it, guest memory might be more problematic since its not
> trappable and VMM can't enforce what is written. This is something that
> needs more attension. But for now the devices supporting memory on device
> the trap and store by VMM seems to satisfy the security properties you
> highlight here.
> 

Just want to clarify the trap part.

Guest memory is not trappable in Jason's example, which has queue/IMS
storage swapped between device/memory and requires special command 
to sync the state.

But there is also other forms of in-memory IMS implementation. e.g. Some
devices serve work requests based on command buffers instead of HW work
queues. The command buffers are linked in per-process contexts (both in 
memory) thus similarly IMS could be stored in each context too. There is no
swap per se. The context is allocated by the driver and then registered to 
the device through a mgmt. interface. When the mgmt. interface is mediated, 
the hypervisor knows the IMS location and could mark it as read-only in 
EPT page table to enable trapping of guest writes. Of course this approach
is awkward if the complexity is paid just for virtualizing IMS.

Thanks
Kevin
Thomas Gleixner Nov. 9, 2020, 11:21 a.m. UTC | #39
On Sun, Nov 08 2020 at 15:58, Ashok Raj wrote:
> On Sun, Nov 08, 2020 at 07:47:24PM +0100, Thomas Gleixner wrote:
>> 
>> 
>> Now if we look at the virtualization scenario and device hand through
>> then the structure in the guest view is not any different from the basic
>> case. This works with PCI-MSI[X] and the IDXD IMS variant because the
>> hypervisor can trap the access to the storage and translate the message:
>> 
>>                    |
>>                    |
>>   [CPU]    -- [Bri | dge] -- Bus -- [Device]
>>                    |
>>   Alloc +
>>   Compose                   Store     Use
>>                              |
>>                              | Trap
>>                              v
>>                              Hypervisor translates and stores
>> 
>
> The above case, VMM is responsible for writing to the message
> store. In both cases if its IMS or Legacy MSI/MSIx. VMM handles
> the writes to the device interrupt region and to the IRTE tables.

Yes, but that's just how it's done today and there is no real need to do
so.

>> Now the question which I can't answer is whether this can work correctly
>> in terms of isolation. If the IMS storage is in guest memory (queue
>> storage) then the guest driver can obviously write random crap into it
>> which the device will happily send. (For MSI and IDXD style IMS it
>> still can trap the store).
>
> The isolation problem is not just the guest memory being used as interrrupt
> store right? If the Store to device region is not trapped and controlled by 
> VMM, there is no gaurantee the guest OS has done the right thing?
>
> Thinking about it, guest memory might be more problematic since its not
> trappable and VMM can't enforce what is written. This is something that
> needs more attension. But for now the devices supporting memory on device
> the trap and store by VMM seems to satisfy the security properties you
> highlight here.

That's not the problem at all. The VMM is not responsible for the
correctness of the guest OS at all. All the VMM cares about is that the
guest cannot access anything which does not belong to the guest.

If the guest OS screws up the message (by stupidity or malice), then the
MSI sent from the passed through device has to be caught by the
IOMMU/remap unit if an _only_ if it writes to something which it is not
allowed to.

If it overwrites the guests memory then so be it. The VMM cannot prevent
the guest OS doing so by a stray pointer either. So why would it worry
about the MSI going into guest owned lala land?

>> Is the IOMMU/Interrupt remapping unit able to catch such messages which
>> go outside the space to which the guest is allowed to signal to? If yes,
>> problem solved. If no, then IMS storage in guest memory can't ever work.
>
> This can probably work for SRIOV devices where guest owns the entire device.
> interrupt remap does have RID checks if interrupt arrives at an Interrupt handle
> not allocated for that BDF.
>
> But for SIOV devices there is no PASID filtering at the remap level since
> interrupt messages don't carry PASID in the TLP.

PASID is irrelevant here.

If the device sends a message then the remap unit will see the requester
ID of the device and if the message it sends is not matching the remap
tables then it's caught and the guest is terminated. At least that's how
it should be.

>> But there's a catch:
>> 
>> This only works when the guest OS actually knows that it runs in a
>> VM. If the guest can't figure that out, i.e. via CPUID, this cannot be
>
> Precicely!. It might work if the OS is new, but for legacy the trap-emulate
> seems both safe and works for legacy as well?

Again, trap emulate does not work for IMS when the IMS store is software
managed guest memory and not part of the device. And that's the whole
reason why we are discussing this.

Thanks,

        tglx
Jason Gunthorpe Nov. 9, 2020, 4:46 p.m. UTC | #40
On Mon, Nov 09, 2020 at 07:37:03AM +0000, Tian, Kevin wrote:
> >  3) SIOV sub device assigned to the guest.
> > 
> >     The difference between SIOV and SRIOV is the device must attach a
> >     PASID to every TLP triggered by the guest. Logically we'd expect
> >     when IMS is used in this situation the interrupt MemWr is tagged
> >     with bus/device/function/PASID to uniquly ID the guest and the same
> >     security protection scheme from #2 applies.
> 
> Unfortunately no. Intel VT-d only treats MemWr w/o PASID to 0xFEExxxxx
> as interrupt request. MemWr w/ PASID, even to 0xFEE, is translated
> normally through DMA remapping page table. 

I've heard that current IOMMUs are limited as well, but IMHO, as I
describe, if you want full symmetry then you want to route interrupts
via PASID for SIOV. Otherwise the architecture is incomplete.

At least from a Linux and VMM perspective this should be planned
for. It is the only generic way to have a sub device assigned to a
guest and still have access to IMS.

> Does your device already implement such capability? We can bring this 
> request back to the hardware team. 

In some cases we can generate PASID tagged TLPs for interrupt
messages, if there was a reason to do that.

> Yes, this is the main worry here. While all agree that using hypercall is 
> the proper way to virtualize IMS, how to disable it when hypercall is
> not available is a more urgent demand at current stage.

Hopefully Thomas's note about checking for virtualization will help..

> btw in reality such ACPI extension doesn't exist yet, which likely will
> take some time. In the meantime we already have pending usages 
> like IDXD. Do you suggest holding these patches until we get ASWG 
> to accept the extension, or accept using Intel IMS cap as a vendor
> specific mitigation to move forward while the platform flag is being 
> worked on? Anyway the IMS cap is already defined and can help fix 
> some broken cases.

I think you need to sort something generic out, these half baked
architectures just make it some other teams problem.

Thomas's suggestion to check cpuid seems reasonably workable

Jason
Jason Gunthorpe Nov. 9, 2020, 5:30 p.m. UTC | #41
On Mon, Nov 09, 2020 at 12:21:22PM +0100, Thomas Gleixner wrote:

> >> Is the IOMMU/Interrupt remapping unit able to catch such messages which
> >> go outside the space to which the guest is allowed to signal to? If yes,
> >> problem solved. If no, then IMS storage in guest memory can't ever work.
> >
> > This can probably work for SRIOV devices where guest owns the entire device.
> > interrupt remap does have RID checks if interrupt arrives at an Interrupt handle
> > not allocated for that BDF.
> >
> > But for SIOV devices there is no PASID filtering at the remap level since
> > interrupt messages don't carry PASID in the TLP.
> 
> PASID is irrelevant here.
> 
> If the device sends a message then the remap unit will see the requester
> ID of the device and if the message it sends is not matching the remap
> tables then it's caught and the guest is terminated. At least that's how
> it should be.

The SIOV case is to take a single RID and split it to multiple
VMs and also to the hypervisor. All these things concurrently use the
same RID, and the IOMMU can't tell them apart.

The hypervisor security domain owns TLPs with no PASID. Each PASID is
assigned to a VM.

For interrupts, today, they are all generated, with no PASID, to the
same RID. There is no way for remapping to protect against a guest
without checking also PASID.

The relavance of PASID is this:

> Again, trap emulate does not work for IMS when the IMS store is software
> managed guest memory and not part of the device. And that's the whole
> reason why we are discussing this.

With PASID tagged interrupts and a IOMMU interrupt remapping
capability that can trigger on PASID, then the platform can provide
the same level of security as SRIOV - the above is no problem.

The device ensures that all DMAs and all interrupts program by the
guest are PASID tagged and the platform provides security by checking
the PASID when delivering the interrupt. Intel IOMMU doesn't work this
way today, but it makes alot of design sense.

Otherwise the interrupt is effectively delivered to the hypervisor. A
secure device can *never* allow a guest to specify an addr/data pair
for a non-PASID tagged TLP, so the device cannot offer IMS to the
guest.

Jason
Raj, Ashok Nov. 9, 2020, 10:40 p.m. UTC | #42
On Mon, Nov 09, 2020 at 01:30:34PM -0400, Jason Gunthorpe wrote:
> 
> > Again, trap emulate does not work for IMS when the IMS store is software
> > managed guest memory and not part of the device. And that's the whole
> > reason why we are discussing this.
> 
> With PASID tagged interrupts and a IOMMU interrupt remapping
> capability that can trigger on PASID, then the platform can provide
> the same level of security as SRIOV - the above is no problem.

You mean even if its stored in memory, as long as the MemWr comes with
PASID, and the hypercall has provisioned the IRTE properly?

that seems like a possiblity.

> 
> The device ensures that all DMAs and all interrupts program by the
> guest are PASID tagged and the platform provides security by checking
> the PASID when delivering the interrupt. Intel IOMMU doesn't work this
> way today, but it makes alot of design sense.
> 
> Otherwise the interrupt is effectively delivered to the hypervisor. A
> secure device can *never* allow a guest to specify an addr/data pair
> for a non-PASID tagged TLP, so the device cannot offer IMS to the
> guest.

Right, it seems like that's a limitation today.
Thomas Gleixner Nov. 9, 2020, 10:42 p.m. UTC | #43
On Mon, Nov 09 2020 at 13:30, Jason Gunthorpe wrote:
> On Mon, Nov 09, 2020 at 12:21:22PM +0100, Thomas Gleixner wrote:
>> >> Is the IOMMU/Interrupt remapping unit able to catch such messages which
>> >> go outside the space to which the guest is allowed to signal to? If yes,
>> >> problem solved. If no, then IMS storage in guest memory can't ever
>> >> work.

> The SIOV case is to take a single RID and split it to multiple
> VMs and also to the hypervisor. All these things concurrently use the
> same RID, and the IOMMU can't tell them apart.
>
> The hypervisor security domain owns TLPs with no PASID. Each PASID is
> assigned to a VM.
>
> For interrupts, today, they are all generated, with no PASID, to the
> same RID. There is no way for remapping to protect against a guest
> without checking also PASID.
>
> The relavance of PASID is this:
>
>> Again, trap emulate does not work for IMS when the IMS store is software
>> managed guest memory and not part of the device. And that's the whole
>> reason why we are discussing this.
>
> With PASID tagged interrupts and a IOMMU interrupt remapping
> capability that can trigger on PASID, then the platform can provide
> the same level of security as SRIOV - the above is no problem.
>
> The device ensures that all DMAs and all interrupts program by the
> guest are PASID tagged and the platform provides security by checking
> the PASID when delivering the interrupt.

Correct.

> Intel IOMMU doesn't work this way today, but it makes alot of design
> sense.

Right.

> Otherwise the interrupt is effectively delivered to the hypervisor. A
> secure device can *never* allow a guest to specify an addr/data pair
> for a non-PASID tagged TLP, so the device cannot offer IMS to the
> guest.

Ok. Let me summarize the current state of supported scenarios:

 1) SRIOV works with any form of IMS storage because it does not require
    PASID and the VF devices have unique requester ids, which allows the
    remap unit to sanity check the message.

 2) SIOV with IMS when the hypervisor can manage the IMS store
    exclusively.

So #2 prevents a device which handles IMS storage in queue memory to
utilize IMS for SIOV in a guest because the hypervisor cannot manage the
IMS message store and the guest can write arbitrary crap to it which
violates the isolation principle.

And here is the relevant part of the SIOV spec:

 "IMS is managed by host driver software and is not accessible directly
  from guest or user-mode drivers.

  Within the device, IMS storage is not accessible from the ADIs. ADIs
  can request interrupt generation only through the device’s ‘Interrupt
  Message Generation Logic’, which allows an ADI to only generate
  interrupt messages that are associated with that specific ADI. These
  restrictions ensure that the host driver has complete control over
  which interrupt messages can be generated by each ADI.

  On Intel 64 architecture platforms, message signaled interrupts are
  issued as DWORD size untranslated memory writes without a PASID TLP
  Prefix, to address range 0xFEExxxxx. Since all memory requests
  generated by ADIs include a PASID TLP Prefix, it is not possible for
  an ADI to generate a DMA write that would be interpreted by the
  platform as an interrupt message."

That's the reductio ad absurdum for this sentence in the first paragraph
of the preceding chapter describing the concept of IMS:

  "IMS enables devices to store the interrupt messages for ADIs in a
   device-specific optimized manner without the scalability restrictions
   of the PCI Express defined MSI-X capability."

"Device-specific optimized manner" is either wishful thinking or
marketing induced verbal diarrhoea.

The current specification puts massive restrictions on IMS storage which
are _not_ allowing to optimize it in a device specific manner as
demonstrated in this discussion.

It also precludes obvious use cases like passing a full device to a
guest and let the guest manage SIOV subdevices for containers or nested
guests.

TBH, to me this is just another hastily cobbled together half thought
out misfeature cast in silicon. The proposed software support is
following the exactly same principle.

So before we go anywhere with this, I want to see a proper way forward
to support _all_ sensible use cases and to fulfil the promise of
"device-specific optimized manner" at the conceptual and specification
and also at the code level.

I'm not at all interested to rush in support for a half baken Intel
centric solution which other people have to clean up after the fact
(again).

IOW, it's time to go back to the drawing board.

Thanks,

        tglx
Raj, Ashok Nov. 10, 2020, 5:14 a.m. UTC | #44
Hi Thomas,

On Mon, Nov 09, 2020 at 11:42:29PM +0100, Thomas Gleixner wrote:
> On Mon, Nov 09 2020 at 13:30, Jason Gunthorpe wrote:
> >
> > The relavance of PASID is this:
> >
> >> Again, trap emulate does not work for IMS when the IMS store is software
> >> managed guest memory and not part of the device. And that's the whole
> >> reason why we are discussing this.
> >
> > With PASID tagged interrupts and a IOMMU interrupt remapping
> > capability that can trigger on PASID, then the platform can provide
> > the same level of security as SRIOV - the above is no problem.
> >
> > The device ensures that all DMAs and all interrupts program by the
> > guest are PASID tagged and the platform provides security by checking
> > the PASID when delivering the interrupt.
> 
> Correct.
> 
> > Intel IOMMU doesn't work this way today, but it makes alot of design
> > sense.

Approach to IMS is more of a phased approach. 

#1 Allow physical device to scale beyond limits of PCIe MSIx
   Follows current methodology for guest interrupt programming and
   evolutionary changes rather than drastic.
#2 Long term we should work together on enabling IMS in guest which
   requires changes in both HW and SW eco-system.

For #1, the immediate need is to find a way to limit guest from using IMS
due to current limitations. We have couple options.

a) CPUID based method to disallow IMS when running in a guest OS. Limiting
   use to existing virtual MSIx to guest devices. (Both you/Jason alluded)
b) We can extend DMAR table to have a flag for opt-out. So in real platform
   this flag is clear and in guest VMM will ensure vDMAR will have this flag
   set. Along the lines as Jason alluded, platform level and via ACPI
   methods. We have similar use for x2apic_optout today.

Think a) is probably more generic.

For #2 Long term goal of allowing IMS in guest for devices that require
them. This requires some extensive eco-system enabling. 

- Extending HW to understand PASID-tagged interrupt messages.
- Appropriate extensions to IOMMU to enforce such PASID based isolation.

From SW improvements:

- Hypercall to retrieve addr/data from host
- Ensure SW can provide guarantee that the interrupt address range will not
  be mapped in process space when SVM is in play. Otherwise its hard to
  distinguish between DMA and Interrupt. OS needs to opt-in to this
  behavior. Today we ensure IOVA space has this 0xFEExxxxx range carve out
  of the IOVA space.


Devices such as idxd that do not have these entries on page-boundaries for
isolation to permit direct programming from GuestOS will continue to use
trap-emulate as used today.

In the end, virtualizing IMS requires eco-system collaboration, and we are
very open to change hw when all the relevant pieces are in place.

Until then, IMS will be restricted to host VMM only, and we can use the
methods above to prevent IMS in guest and continue to use the legacy
virtual MSIx.

> 
> Right.
> 
> > Otherwise the interrupt is effectively delivered to the hypervisor. A
> > secure device can *never* allow a guest to specify an addr/data pair
> > for a non-PASID tagged TLP, so the device cannot offer IMS to the
> > guest.
> 
> Ok. Let me summarize the current state of supported scenarios:
> 
>  1) SRIOV works with any form of IMS storage because it does not require
>     PASID and the VF devices have unique requester ids, which allows the
>     remap unit to sanity check the message.
> 
>  2) SIOV with IMS when the hypervisor can manage the IMS store
>     exclusively.

Today this is true for all interrupt types, MSI/MSIx/IMS.

> 
> So #2 prevents a device which handles IMS storage in queue memory to
> utilize IMS for SIOV in a guest because the hypervisor cannot manage the
> IMS message store and the guest can write arbitrary crap to it which
> violates the isolation principle.
> 
> And here is the relevant part of the SIOV spec:
> 
>  "IMS is managed by host driver software and is not accessible directly
>   from guest or user-mode drivers.
> 
>   Within the device, IMS storage is not accessible from the ADIs. ADIs
>   can request interrupt generation only through the device’s ‘Interrupt
>   Message Generation Logic’, which allows an ADI to only generate
>   interrupt messages that are associated with that specific ADI. These
>   restrictions ensure that the host driver has complete control over
>   which interrupt messages can be generated by each ADI.
> 
>   On Intel 64 architecture platforms, message signaled interrupts are
>   issued as DWORD size untranslated memory writes without a PASID TLP
>   Prefix, to address range 0xFEExxxxx. Since all memory requests
>   generated by ADIs include a PASID TLP Prefix, it is not possible for
>   an ADI to generate a DMA write that would be interpreted by the
>   platform as an interrupt message."
> 
> That's the reductio ad absurdum for this sentence in the first paragraph
> of the preceding chapter describing the concept of IMS:
> 
>   "IMS enables devices to store the interrupt messages for ADIs in a
>    device-specific optimized manner without the scalability restrictions
>    of the PCI Express defined MSI-X capability."
> 
> "Device-specific optimized manner" is either wishful thinking or
> marketing induced verbal diarrhoea.

No comment on the adjectives above :-)

> 
> The current specification puts massive restrictions on IMS storage which
> are _not_ allowing to optimize it in a device specific manner as
> demonstrated in this discussion.

IMS doesn't restrict this optimization, but to allow it requires more OS support as
you had mentioned.

> 
> It also precludes obvious use cases like passing a full device to a
> guest and let the guest manage SIOV subdevices for containers or nested
> guests.
> 
> TBH, to me this is just another hastily cobbled together half thought
> out misfeature cast in silicon. The proposed software support is
> following the exactly same principle.

Current IMS support adds incremental feature capability. Works pretty much
following everything that was created for MSIx, but just adds some device
flexibility. 

Here are some reasons why PASID isn't used today for tagging interrupts.

Interrupt messages (as specified by MSI/MSI-X in PCI specification) are 
currently defined as DWORD DMA writes to a platform/architecture specific 
address (0xFEExxxxx on Intel platforms). Existing root-complexes detect
DWORD writes to 0xFEExxxxx (without a PASID in the transaction) as interrupt 
messages and route them to interrupt-remapping logic (as opposed to other 
DMA requests that are routed to IOMMU's DMA remapping logic). 

There are multiple tools (such as logic analyzers) and OEM test validation 
harnesses that depend on such DWORD sized DMA writes with no PASID as interrupt
messages. One of the feedback we had received in the development of the
specification was to avoid impacting such tools irrespective of MSI-X or IMS 
was used for interrupt message storage (on the wire they follow the same format), 
and also to ensure interoperability of devices supporting IMS across CPU vendors 
(who may not support PASID TLP prefix).  This is one reason that led to interrupts 
from IMS to not use PASID (and match the wire format of MSI/MSI-X generated interrupts). 
The other problem was disambiguation between DMA to SVM v/s interrupts.

> 
> So before we go anywhere with this, I want to see a proper way forward
> to support _all_ sensible use cases and to fulfil the promise of
> "device-specific optimized manner" at the conceptual and specification
> and also at the code level.
> 
> I'm not at all interested to rush in support for a half baken Intel
> centric solution which other people have to clean up after the fact
> (again).

Intel had published the specification almost 2 years back and have
comprehended all the feedback received from the ecosystem 
(both open-source and others), along with offering the specification 
to be implemented by any vendors (both device and CPU vendors). 
There are few device vendors who are implementing to the spec already and 
are being explored for support by other CPU vendors

Cheers,
Ashok
Thomas Gleixner Nov. 10, 2020, 10:27 a.m. UTC | #45
Ashok,

On Mon, Nov 09 2020 at 21:14, Ashok Raj wrote:
> On Mon, Nov 09, 2020 at 11:42:29PM +0100, Thomas Gleixner wrote:
>> On Mon, Nov 09 2020 at 13:30, Jason Gunthorpe wrote:
> Approach to IMS is more of a phased approach. 
>
> #1 Allow physical device to scale beyond limits of PCIe MSIx
>    Follows current methodology for guest interrupt programming and
>    evolutionary changes rather than drastic.

Trapping MSI[X] writes is there because it allows to hand a device to an
unmodified guest OS and to handle the case where the MSI[X] entries
storage cannot be mapped exclusively to the guest.

But aside of this, it's not required if the storage can be mapped
exclusively, the guest is hypervisor aware and can get a host composed
message via a hypercall. That works for physical functions and SRIOV,
but not for SIOV.

> #2 Long term we should work together on enabling IMS in guest which
>    requires changes in both HW and SW eco-system.
>
> For #1, the immediate need is to find a way to limit guest from using IMS
> due to current limitations. We have couple options.
>
> a) CPUID based method to disallow IMS when running in a guest OS. Limiting
>    use to existing virtual MSIx to guest devices. (Both you/Jason alluded)
> b) We can extend DMAR table to have a flag for opt-out. So in real platform
>    this flag is clear and in guest VMM will ensure vDMAR will have this flag
>    set. Along the lines as Jason alluded, platform level and via ACPI
>    methods. We have similar use for x2apic_optout today.
>
> Think a) is probably more generic.

But incomplete as I explained before. If the VMM does not set the
hypervisor bit in CPUID then the guest OS assumes to run on bare
metal. It needs more than just relying on CPUID.

Aside of that neither Jason nor myself said that IMS cannot be supported
in a guest. PF and VF IMS can and has to be supported. SIOV is a
different story due to the PASID requirement which obviously needs to be
managed host side and needs HW changes.

> From SW improvements:
>
> - Hypercall to retrieve addr/data from host

You need to have that even for the non SIOV case in order to hand in a
full device which has the IMS storage in queue memory.

> Devices such as idxd that do not have these entries on page-boundaries for
> isolation to permit direct programming from GuestOS will continue to use
> trap-emulate as used today.

That's a restriction of that particular hardware.

> Until then, IMS will be restricted to host VMM only, and we can use the
> methods above to prevent IMS in guest and continue to use the legacy
> virtual MSIx.

SIOV IMS.

But as things stand now not even PF/VF pass through are possible. This
might not be an issue for IDXD, but it's an issue in general and this
want's the be thought of _now_ before we put a lot of infrastructure in
to place which needs then to be ripped apart again.

>> The current specification puts massive restrictions on IMS storage which
>> are _not_ allowing to optimize it in a device specific manner as
>> demonstrated in this discussion.
>
> IMS doesn't restrict this optimization, but to allow it requires more
> OS support as you had mentioned.

Right, IMS per se does not put an restriction on it.

The specification and the HW limitations on the remapping unit put that
restriction into place.

OS support is an obvious requirement, but OS support cannot make
the restrictions of HW go away magically.

But again, we need to think about the path forward _now_.

Just slapping some 'works for IDXD' solution into place can severly
restrict the options for going beyond these limitations simply because
we have to support that 'works for IDXD thing' forever.

Thanks,

        tglx
Raj, Ashok Nov. 10, 2020, 2:13 p.m. UTC | #46
Thomas,

With all these interrupt message storms ;-), I'm missing how to move towards
an end goal.

On Tue, Nov 10, 2020 at 11:27:29AM +0100, Thomas Gleixner wrote:
> Ashok,
> 
> On Mon, Nov 09 2020 at 21:14, Ashok Raj wrote:
> > On Mon, Nov 09, 2020 at 11:42:29PM +0100, Thomas Gleixner wrote:
> >> On Mon, Nov 09 2020 at 13:30, Jason Gunthorpe wrote:
> > Approach to IMS is more of a phased approach. 
> >
> > #1 Allow physical device to scale beyond limits of PCIe MSIx
> >    Follows current methodology for guest interrupt programming and
> >    evolutionary changes rather than drastic.
> 
> Trapping MSI[X] writes is there because it allows to hand a device to an
> unmodified guest OS and to handle the case where the MSI[X] entries
> storage cannot be mapped exclusively to the guest.
> 
> But aside of this, it's not required if the storage can be mapped
> exclusively, the guest is hypervisor aware and can get a host composed
> message via a hypercall. That works for physical functions and SRIOV,
> but not for SIOV.

It would greatly help if you can put down what you see is blocking 
to move forward in the following areas.

Address Gaps in Spec: 

Specs can accomodate change after review, as the number of ECN's that go on
with PCIe ;-). Please add what you like to see in the spec if you beleive
is a gap today.

Hardware Gaps?
- PASID tagged Interrupts.
- IOMMU Support for PASID based IR.

As i had called out, there are a lot of moving parts, and requires more
attention.

OS Gaps?
- Lack of ability to identify if platform can use IMS.
- Lack of hypercall.

We will always have devices that have more interrupts but their use doesn't
need IMS to be directly manipulated by the guest, or the fact those usages
require more than what is allowed by PCIe in a guest. These devices can 
scale by adding another sub-device and you get another block of 2048 if needed.

This isn't just for idxd, as I mentioned earlier, there are vendors other
than Intel already working on this. In all cases the need for guest direct
manipulation of interrupt store hasn't come up. From the discussion, it
seems like there are devices today or in future that will require direct
manipulation of interrupt store in the guest. This needs additional work
in both the device hardware providing the right plumbing and OS work to
comprehend those.

Cheers,
Ashok
Jason Gunthorpe Nov. 10, 2020, 2:19 p.m. UTC | #47
On Mon, Nov 09, 2020 at 09:14:12PM -0800, Raj, Ashok wrote:

> There are multiple tools (such as logic analyzers) and OEM test validation 
> harnesses that depend on such DWORD sized DMA writes with no PASID as interrupt
> messages. One of the feedback we had received in the development of the
> specification was to avoid impacting such tools irrespective of
> MSI-X or IMS

This is a really bad reason to make a poor decision for system
security. Relying on trapping/emulation increases the attack surface
and complexity of the VMM and the device which now have to create this
artificial split, which does not exist in SRIOV.

Hopefully we won't see devices get this wrong, but any path that
allows the guest to cause the device to create TLPs outside its IOMMU
containment is security worrysome.

> was used for interrupt message storage (on the wire they follow the
> same format), and also to ensure interoperability of devices
> supporting IMS across CPU vendors (who may not support PASID TLP
> prefix).  This is one reason that led to interrupts from IMS to not
> use PASID (and match the wire format of MSI/MSI-X generated
> interrupts).  The other problem was disambiguation between DMA to
> SVM v/s interrupts.

This is a defect in the IOMMU, not something fundamental.

The IOMMU needs to know if the interrupt range is active or not for
each PASID. Process based SVA will, of course, not enable interrupts
on the PASID, VM Guest based PASID will.

> Intel had published the specification almost 2 years back and have
> comprehended all the feedback received from the ecosystem 
> (both open-source and others), along with offering the specification 
> to be implemented by any vendors (both device and CPU vendors). 
> There are few device vendors who are implementing to the spec already and 
> are being explored for support by other CPU vendors

Which is why it is such a shame that including PASID in the MSI was
deliberately skipped in the document, the ecosystem could have been
much aligned to this solution by now :(

Jason
Raj, Ashok Nov. 10, 2020, 2:19 p.m. UTC | #48
Hi David

I did't follow the support for 32768 CPUs in guest without IR support.

Can you tell me how that is done?

On Sun, Nov 08, 2020 at 03:25:57PM -0800, Ashok Raj wrote:
> On Sun, Nov 08, 2020 at 06:34:55PM +0000, David Woodhouse wrote:
> > > 
> > > When we do interrupt remapping support in guest which would be required 
> > > if we support x2apic in guest, I think this is something we should look into more 
> > > carefully to make this work.
> > 
> > No, interrupt remapping is not required for X2APIC in guests
> > 
> > They can have X2APIC and up to 32768 CPUs without needing interrupt
> 
> How is this made available today without interrupt remapping? 
> 
> I thought without IR, the destination ID is still limited to only 8 bits?
> 
> On native, even if you have less than 255 cpu's but the APICID are sparsly 
> distributed due to platform rules, the x2apic id could be more than 8 bits. 
> Which is why the spec requires IR when x2apic is enabled.
> 
> > remapping at all. Only if they want more than 32768 vCPUs, or to do
> > nested virtualisation and actually remap for the benefit of *their*
> > (L2+) guests would they need IR.
Jason Gunthorpe Nov. 10, 2020, 2:23 p.m. UTC | #49
On Tue, Nov 10, 2020 at 06:13:23AM -0800, Raj, Ashok wrote:

> This isn't just for idxd, as I mentioned earlier, there are vendors other
> than Intel already working on this. In all cases the need for guest direct
> manipulation of interrupt store hasn't come up. From the discussion, it
> seems like there are devices today or in future that will require direct
> manipulation of interrupt store in the guest. This needs additional work
> in both the device hardware providing the right plumbing and OS work to
> comprehend those.

We'd want to see SRIOV's assigned to guests to be able to use
IMS. This allows a SRIOV instance in a guest to spawn SIOV's which is
useful.

SIOV's assigned to guests could use IMS, but the use cases we see in
the short term can be handled by using SRIOV instead.

I would expect in general for SIOV to use MSI-X emulation to expose
interrupts - it would be really weird for a SIOV emulator to do
something else and we should probably discourage that.

Jason
David Woodhouse Nov. 10, 2020, 2:41 p.m. UTC | #50
> Hi David
>
> I did't follow the support for 32768 CPUs in guest without IR support.
>
> Can you tell me how that is done?



Using bits 11-5 of the MSI address bits (the other 7 bits of "Extended
Destination ID" that aren't the Remappable Format indicator).

And physical addressing mode, which is no loss for external interrupts
since they're all unicast dest_Fixed these days anyway.
Tian, Kevin Nov. 11, 2020, 2:17 a.m. UTC | #51
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, November 10, 2020 10:24 PM
> 
> On Tue, Nov 10, 2020 at 06:13:23AM -0800, Raj, Ashok wrote:
> 
> > This isn't just for idxd, as I mentioned earlier, there are vendors other
> > than Intel already working on this. In all cases the need for guest direct
> > manipulation of interrupt store hasn't come up. From the discussion, it
> > seems like there are devices today or in future that will require direct
> > manipulation of interrupt store in the guest. This needs additional work
> > in both the device hardware providing the right plumbing and OS work to
> > comprehend those.
> 
> We'd want to see SRIOV's assigned to guests to be able to use
> IMS. This allows a SRIOV instance in a guest to spawn SIOV's which is
> useful.

Does your VF support both MSI/IMS or IMS only? If it is the former can't
we adopt a phased approach or parallel effort between forcing guest
to use MSI and adding hypercall to enable IMS on VF? Finding a way
to disable IMS is anyway required per earlier discussion when hypercall
is not available, and it could still provide a functional though suboptimal
model for such VFs.

> 
> SIOV's assigned to guests could use IMS, but the use cases we see in
> the short term can be handled by using SRIOV instead.
> 
> I would expect in general for SIOV to use MSI-X emulation to expose
> interrupts - it would be really weird for a SIOV emulator to do
> something else and we should probably discourage that.
> 

I agree with this point. This leaves hardware gaps in IOMMU and root
complex less an immediate blocker and to be addressed in the long term.

Thanks
Kevin
Tian, Kevin Nov. 11, 2020, 2:35 a.m. UTC | #52
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, November 10, 2020 10:19 PM
> On Mon, Nov 09, 2020 at 09:14:12PM -0800, Raj, Ashok wrote:
> 
> > was used for interrupt message storage (on the wire they follow the
> > same format), and also to ensure interoperability of devices
> > supporting IMS across CPU vendors (who may not support PASID TLP
> > prefix).  This is one reason that led to interrupts from IMS to not
> > use PASID (and match the wire format of MSI/MSI-X generated
> > interrupts).  The other problem was disambiguation between DMA to
> > SVM v/s interrupts.
> 
> This is a defect in the IOMMU, not something fundamental.
> 
> The IOMMU needs to know if the interrupt range is active or not for
> each PASID. Process based SVA will, of course, not enable interrupts
> on the PASID, VM Guest based PASID will.
> 

Unfortunately it's more than that. The interrupt message is firstly recognized
at root complex today and then routed to the IOMMU, unlike other DMA
requests. I'm not saying it's an unsolvable limitation, but just wants to point
out that to achieve such goal there are more things to be considered beyond 
the IOMMU.

Thanks
Kevin
Tian, Kevin Nov. 11, 2020, 7:14 a.m. UTC | #53
> From: Raj, Ashok <ashok.raj@intel.com>
> Sent: Tuesday, November 10, 2020 10:13 PM
> 
> Thomas,
> 
> With all these interrupt message storms ;-), I'm missing how to move
> towards
> an end goal.
> 
> On Tue, Nov 10, 2020 at 11:27:29AM +0100, Thomas Gleixner wrote:
> > Ashok,
> >
> > On Mon, Nov 09 2020 at 21:14, Ashok Raj wrote:
> > > On Mon, Nov 09, 2020 at 11:42:29PM +0100, Thomas Gleixner wrote:
> > >> On Mon, Nov 09 2020 at 13:30, Jason Gunthorpe wrote:
> > > Approach to IMS is more of a phased approach.
> > >
> > > #1 Allow physical device to scale beyond limits of PCIe MSIx
> > >    Follows current methodology for guest interrupt programming and
> > >    evolutionary changes rather than drastic.
> >
> > Trapping MSI[X] writes is there because it allows to hand a device to an
> > unmodified guest OS and to handle the case where the MSI[X] entries
> > storage cannot be mapped exclusively to the guest.
> >
> > But aside of this, it's not required if the storage can be mapped
> > exclusively, the guest is hypervisor aware and can get a host composed
> > message via a hypercall. That works for physical functions and SRIOV,
> > but not for SIOV.
> 
> It would greatly help if you can put down what you see is blocking
> to move forward in the following areas.
> 

Agree. We really need some guidance on how to move forward. I think all
people in this thread are aligned now that it's not Intel or IDXD specific thing,
e.g. need architectural solution, enabling IMS on PF/VF is important, etc. But
what we are not sure is whether we need complete all requirements in one
batch, or could evolve step-by-step as long as the growing path is clearly
defined. 

IMHO finding a way to disable IMS in guest is more important than supporting
IMS on PF/VF, since the latter requires hypercall which is not always available
in all scenarios. Even if Linux includes hypercall support for all existing archs
and hypervisors, it could run as an unmodified guest on a new hypervisor 
before this hypervisor gets its enlightenments into the Linux. So it is more
prominent to find a way to force using MSI/MSI-x inside guest, as it allows
such PFs/VFs still functional though not benefiting all scalability merits of IMS.

If such two-step plans can be agreed, then the next open is about how to
disable IMS in guest. We need a sane solution when checking in the initial 
host-only-IMS support. There are several options discussed in this thread:

1. Industry standard (e.g. a vendor-agnostic ACPI flag) followed by all 
platforms, hypervisors and OSes. It will require collaboration beyond 
Linux community;

2. IOMMU-vendor specific standards (DMAR, IORT, etc.) to report whether
IMS is allowed, implying that IMS is tied to the IOMMU. This tradeoff is 
acceptable since IMS alone cannot make SIOV working which relies on the 
IOMMU anyway. and this might be an easier path to move forward and
even not require to wait for all vendors to extend their tables together.
On physical platform the FW always reports IMS as 'allowed' and there is
time to change it. On virtual platform the hypervisor can choose to hide 
IMS in three ways:
	a) do not expose IOMMU
	b) expose IOMMU, but using the old format
	c) expose IOMMU, using the new format with IMS reported 'disallowed'

a/b can well support legacy software stack.

However, there is one potential issue with option 1/2. The construction
of the virtual ACPI table is at VM creation time, likely based on whether a 
PV interrupt controller is exposed to this guest. However, in most cases the
hypervisor doesn't know which guest OS is running and whether it will
use the PV controller when the VM is being created. If IMS is marked as
'allowed' in the virtual DMAR table, an unmodified guest might just go to 
enable it as if it's on the native platform. Maybe what we really required is 
a flag to tell the guest that although IMS is available you cannot use it with 
traditional interrupt controllers?

3. Use IOMMU 'caching mode' as the hint of running as guest and disable
IMS by default as long as 'caching mode' is detected. iirc all IOMMU vendors 
provide such capability for constructing shadow IOMMU page table. Later
when hypercall support is detected for a specific hypervisor/arch, that path 
can override the IOMMU hint to enable IMS.

Unlike the first two options, this will be a Linux-specific policy but self
contained. Other guest OSes may not follow this way though.

4. Using CPUID to detect running as guest. But as Thomas pointed out, this
approach is less reliable as not all hypervisors do this way.

Thoughts?

Thanks
Kevin
Christoph Hellwig Nov. 11, 2020, 3:41 p.m. UTC | #54
On Sun, Nov 08, 2020 at 07:36:34PM +0000, David Woodhouse wrote:
> So it does look like we're going to need a hypercall interface to
> compose an MSI message on behalf of the guest, for IMS to use. In fact
> PCI devices assigned to a guest could use that too, and then we'd only
> need to trap-and-remap any attempt to write a Compatibility Format MSI
> to the device's MSI table, while letting Remappable Format messages get
> written directly.
> 
> We'd also need a way for an OS running on bare metal to *know* that
> it's on bare metal and can just compose MSI messages for itself. Since
> we do expect bare metal to have an IOMMU, perhaps that is just a
> feature flag on the IOMMU?

Have the platform firmware advertise if it needs native or virtualized
IMS handling.  If it advertises neither don't support IMS?
Raj, Ashok Nov. 11, 2020, 4:09 p.m. UTC | #55
On Wed, Nov 11, 2020 at 03:41:59PM +0000, Christoph Hellwig wrote:
> On Sun, Nov 08, 2020 at 07:36:34PM +0000, David Woodhouse wrote:
> > So it does look like we're going to need a hypercall interface to
> > compose an MSI message on behalf of the guest, for IMS to use. In fact
> > PCI devices assigned to a guest could use that too, and then we'd only
> > need to trap-and-remap any attempt to write a Compatibility Format MSI
> > to the device's MSI table, while letting Remappable Format messages get
> > written directly.
> > 
> > We'd also need a way for an OS running on bare metal to *know* that
> > it's on bare metal and can just compose MSI messages for itself. Since
> > we do expect bare metal to have an IOMMU, perhaps that is just a
> > feature flag on the IOMMU?
> 
> Have the platform firmware advertise if it needs native or virtualized
> IMS handling.  If it advertises neither don't support IMS?

The platform hint can be easily accomplished via DMAR table flags. We could
have an IMS_OPTOUT(similart to x2apic optout flag) flag, when 0 its native 
and IMS is supported.

When vIOMMU is presented to guest, virtual DMAR table will have this flag
set to 1. Indicates to GuestOS, native IMS isn't supported.
Thomas Gleixner Nov. 11, 2020, 10:27 p.m. UTC | #56
On Wed, Nov 11 2020 at 08:09, Ashok Raj wrote:
> On Wed, Nov 11, 2020 at 03:41:59PM +0000, Christoph Hellwig wrote:
>> On Sun, Nov 08, 2020 at 07:36:34PM +0000, David Woodhouse wrote:
>> > So it does look like we're going to need a hypercall interface to
>> > compose an MSI message on behalf of the guest, for IMS to use. In fact
>> > PCI devices assigned to a guest could use that too, and then we'd only
>> > need to trap-and-remap any attempt to write a Compatibility Format MSI
>> > to the device's MSI table, while letting Remappable Format messages get
>> > written directly.
>> > 
>> > We'd also need a way for an OS running on bare metal to *know* that
>> > it's on bare metal and can just compose MSI messages for itself. Since
>> > we do expect bare metal to have an IOMMU, perhaps that is just a
>> > feature flag on the IOMMU?
>> 
>> Have the platform firmware advertise if it needs native or virtualized
>> IMS handling.  If it advertises neither don't support IMS?
>
> The platform hint can be easily accomplished via DMAR table flags. We could
> have an IMS_OPTOUT(similart to x2apic optout flag) flag, when 0 its native 
> and IMS is supported.
>
> When vIOMMU is presented to guest, virtual DMAR table will have this flag
> set to 1. Indicates to GuestOS, native IMS isn't supported.

These opt-out bits suck by definition. It comes all back to the fact
that the whole virt thing didn't have a hardware defined way to tell
that the OS runs in a VM and not on bare metal. It wouldn't have been
rocket science to do so.

And because that does not exist, we need magic opt-out bits for every
other piece of functionality which gets added. Can we please stop this
and provide a well defined way to tell the OS whether it runs on bare
metal or not?

The point is that you really want opt-in bits so that decisions come
down to

     if (!virt || virt->supports_X)

which is the obvious sane and safe logic. But sure, why am I asking for
sane and safe in the context of virtualization?

Thanks,

        tglx
Raj, Ashok Nov. 11, 2020, 11:03 p.m. UTC | #57
On Wed, Nov 11, 2020 at 11:27:28PM +0100, Thomas Gleixner wrote:
> On Wed, Nov 11 2020 at 08:09, Ashok Raj wrote:
> >> > We'd also need a way for an OS running on bare metal to *know* that
> >> > it's on bare metal and can just compose MSI messages for itself. Since
> >> > we do expect bare metal to have an IOMMU, perhaps that is just a
> >> > feature flag on the IOMMU?
> >> 
> >> Have the platform firmware advertise if it needs native or virtualized
> >> IMS handling.  If it advertises neither don't support IMS?
> >
> > The platform hint can be easily accomplished via DMAR table flags. We could
> > have an IMS_OPTOUT(similart to x2apic optout flag) flag, when 0 its native 
> > and IMS is supported.
> >
> > When vIOMMU is presented to guest, virtual DMAR table will have this flag
> > set to 1. Indicates to GuestOS, native IMS isn't supported.
> 
> These opt-out bits suck by definition. It comes all back to the fact
> that the whole virt thing didn't have a hardware defined way to tell
> that the OS runs in a VM and not on bare metal. It wouldn't have been
> rocket science to do so.

I'm sure everybody dislikes (hate being a strong word :-)). 
DVSEC capability. Real hardware always sets it to 1 for the IMS capability.

By default the DVSEC is not presented to guest even when the full PF is
presented to guest. I believe VFIO only builds and presents known standard
capabilities and specific extended capabilities. I'm a bit weak but maybe
@AlexWilliamson can confirm if I'm off track.

This tells the driver in guest that IMS is not available and will not
create those new dev_msi calls. 

Only if the VMM has build support to expose IMS for this device, guest SW
can even see DVSEC.SIOV.IMS=1. This also means the required plumbing, say
vIOMMU, or a hypercall has been provisioned, and adminstrator knows the
guest is compatible for these options. 

There maybe better ways to do this. If this has to be done differently
we certainly can and will do. 

> 
> And because that does not exist, we need magic opt-out bits for every
> other piece of functionality which gets added. Can we please stop this
> and provide a well defined way to tell the OS whether it runs on bare
> metal or not?
> 
> The point is that you really want opt-in bits so that decisions come
> down to

How would we opt-in when the feature is not available? You need someway to
tell the capability is available in the guest?, but then there is no reason
to opt-in though.. its ready for use isn't it?

> 
>      if (!virt || virt->supports_X)

The only closest thing that comes to mind is the CPUID bits, you had
mentioned they aren't reliable if the VMM didn't set those in an earlier
mail. If you want a platform level generic support.

- DMAR table optout's you had mentioned that's ugly
- We could use caching mode, but its not a platform level thing, and vendor
  specific. I'm not sure if other vendors have a similar feature. If there
  is a generic capabilty, we could expose via the iommu api's if we are in
  virt or real platform.
> 
> which is the obvious sane and safe logic. But sure, why am I asking for
> sane and safe in the context of virtualization?

We can pick how to solve this, and just waiting for you to tell, what
mechanism you prefer that's less painful and architecturally acceptible for
virtualization and linux. We are all ears!

Cheers,
Ashok
Thomas Gleixner Nov. 12, 2020, 1:13 a.m. UTC | #58
Ashok,

On Wed, Nov 11 2020 at 15:03, Ashok Raj wrote:
> On Wed, Nov 11, 2020 at 11:27:28PM +0100, Thomas Gleixner wrote:
>> which is the obvious sane and safe logic. But sure, why am I asking for
>> sane and safe in the context of virtualization?
>
> We can pick how to solve this, and just waiting for you to tell, what
> mechanism you prefer that's less painful and architecturally acceptible for
> virtualization and linux. We are all ears!

Obviously we can't turn the time back. The point I was trying to make is
that the general approach of just bolting things on top of the exiting
maze is bad in general.

Opt-out bits are error prone simply because anything which exists before
that point does not know that it should set that bit. Obvious, right?

CPUID bits are 'Feature available' and not 'Feature not longer
available' for a reason.

So with the introduction of VT this stringent road was left and the
approach was: Don't tell the guest OS that it's not running on bare
metal.

That's a perfectly fine approach for running existing legacy OSes which
do not care at all because they don't know about anything of this
newfangled stuff.

But it's a falls flat on it's nose for anything which comes past that
point simply because there is no reliable way to tell in which context
the OS runs.

The VMM can decide not to set or is not having support for setting the
software CPUID bit which tells the guest OS that it does NOT run on bare
metal and still hand in new fangled PCI devices for which the guest OS
happens to have a driver which then falls flat on it's nose because some
magic functionality is not there.

So we have the following matrix:

VMM   		Guest OS
Old             Old             -> Fine, does not support any of that
New             Old             -> Fine, does not support any of that
New             New             -> Fine, works as expected
Old             New             -> FAIL

To fix this we have to come up with heuristics again to figure out which
context we are running in and whether some magic feature can be
supported or not:

probably_on_bare_metal()
{
        if (CPUID(FEATURE_HYPERVISOR))
        	return false;
       	if (dmi_match_hypervisor_vendor())
        	return false;

        return PROBABLY_RUNNING_ON_BARE_METAL;
}

Yes, it works probably in most cases, but it still works by chance and
that's what I really hate about this; indeed 'hate' is not a strong
enough word.

Why on earth did VT not introduce a reliable way (instruction, CPUID
leaf, MSR, whatever, which can't be manipulated by the VMM to let the OS
figure out where it runs?)

Just because the general approach to these problems is: We can fix that
in software.

No, you can't fix inconsistency in software at all.

This is not the first time that we tell HW folks to stop this 'Fix this
in software' attitude which has caused more problems than it solved.

And you can argue in circles until you are blue, that inconsistency is
not going away. 

Everytime new (mis)features are added which need awareness of the OS
whether it runs on bare-metal or in a VM we have this unsolvable dance
of requiring that the underlying VMM has to tell the guest OS NOT to use
it instead of having the guest OS making the simple decision:

   if (!definitely_on_bare_metal())
   	return -ENOTSUPP;

or with a newer version of the guest OS:

   if (!definitely_on_bare_metal() && !hypervisor->supportsthis())
   	return -ENOTSUPP;

I'm halfways content to go with the above probably_on_bare_metal()
function as a replacement for definitely_on_bare_metal() to go forward,
but only for the very simple reason that this is the only option we
have.

Thanks,

        tglx
Jason Gunthorpe Nov. 12, 2020, 1:10 p.m. UTC | #59
On Wed, Nov 11, 2020 at 03:03:21PM -0800, Raj, Ashok wrote:

> By default the DVSEC is not presented to guest even when the full PF is
> presented to guest. I believe VFIO only builds and presents known standard
> capabilities and specific extended capabilities. I'm a bit weak but maybe
> @AlexWilliamson can confirm if I'm off track.

This also need to work on Hyper-V and all other cases, you can't just
assume everything is vfio and kvm.

It is horrible to ask people to go back an retroactively change their
config space in a device just to work around all the design failings
Thomas eloquantly describes :(

Jason
Jason Gunthorpe Nov. 12, 2020, 1:46 p.m. UTC | #60
On Wed, Nov 11, 2020 at 02:17:48AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, November 10, 2020 10:24 PM
> > 
> > On Tue, Nov 10, 2020 at 06:13:23AM -0800, Raj, Ashok wrote:
> > 
> > > This isn't just for idxd, as I mentioned earlier, there are vendors other
> > > than Intel already working on this. In all cases the need for guest direct
> > > manipulation of interrupt store hasn't come up. From the discussion, it
> > > seems like there are devices today or in future that will require direct
> > > manipulation of interrupt store in the guest. This needs additional work
> > > in both the device hardware providing the right plumbing and OS work to
> > > comprehend those.
> > 
> > We'd want to see SRIOV's assigned to guests to be able to use
> > IMS. This allows a SRIOV instance in a guest to spawn SIOV's which is
> > useful.
> 
> Does your VF support both MSI/IMS or IMS only? 

Of course VF's support MSI..

> If it is the former can't we adopt a phased approach or parallel
> effort between forcing guest to use MSI and adding hypercall to
> enable IMS on VF? Finding a way to disable IMS is anyway required
> per earlier discussion when hypercall is not available, and it could
> still provide a functional though suboptimal model for such VFs.

Sure, I view that as the bare minimum

Jason
Konrad Rzeszutek Wilk Nov. 12, 2020, 7:32 p.m. UTC | #61
.monster snip..

> 4. Using CPUID to detect running as guest. But as Thomas pointed out, this
> approach is less reliable as not all hypervisors do this way.

Is that truly true? It is the first time I see the argument that extra
steps are needed and that checking for X86_FEATURE_HYPERVISOR is not enough.

Or is it more "Some hypervisor probably forgot about it, so lets make sure we patch
over that possible hole?"


Also is there anything in this spec that precludes this from working
on non-X86 architectures, say ARM systems?
Thomas Gleixner Nov. 12, 2020, 10:42 p.m. UTC | #62
On Thu, Nov 12 2020 at 14:32, Konrad Rzeszutek Wilk wrote:
>> 4. Using CPUID to detect running as guest. But as Thomas pointed out, this
>> approach is less reliable as not all hypervisors do this way.
>
> Is that truly true? It is the first time I see the argument that extra
> steps are needed and that checking for X86_FEATURE_HYPERVISOR is not enough.
>
> Or is it more "Some hypervisor probably forgot about it, so lets make sure we patch
> over that possible hole?"

Nothing enforces that bit to be set. The bit is a pure software
convention and was proposed by VMWare in 2008 with the following
changelog:

 "This patch proposes to use a cpuid interface to detect if we are
  running on an hypervisor.

  The discovery of a hypervisor is determined by bit 31 of CPUID#1_ECX,
  which is defined to be "hypervisor present bit". For a VM, the bit is
  1, otherwise it is set to 0. This bit is not officially documented by
  either Intel/AMD yet, but they plan to do so some time soon, in the
  meanwhile they have promised to keep it reserved for virtualization."

The reserved promise seems to hold. AMDs APM has it documented. The
Intel SDM not so.

Also the kernel side of KVM does not enforce that bit, it's up to the user
space management to set it.

And yes, I've tripped over this with some hypervisors and even qemu KVM
failed to set it in the early days because it was masked with host CPUID
trimming as there the bit is obviously 0.

DMI vendor name is pretty good final check when the bit is 0. The
strings I'm aware of are:

QEMU, Bochs, KVM, Xen, VMware, VMW, VMware Inc., innotek GmbH, Oracle
Corporation, Parallels, BHYVE, Microsoft Corporation

which is not complete but better than nothing ;)

Thanks,

        tglx
Tian, Kevin Nov. 13, 2020, 2:42 a.m. UTC | #63
> From: Thomas Gleixner <tglx@linutronix.de>
> Sent: Friday, November 13, 2020 6:43 AM
> 
> On Thu, Nov 12 2020 at 14:32, Konrad Rzeszutek Wilk wrote:
> >> 4. Using CPUID to detect running as guest. But as Thomas pointed out, this
> >> approach is less reliable as not all hypervisors do this way.
> >
> > Is that truly true? It is the first time I see the argument that extra
> > steps are needed and that checking for X86_FEATURE_HYPERVISOR is not
> enough.
> >
> > Or is it more "Some hypervisor probably forgot about it, so lets make sure
> we patch
> > over that possible hole?"
> 
> Nothing enforces that bit to be set. The bit is a pure software
> convention and was proposed by VMWare in 2008 with the following
> changelog:
> 
>  "This patch proposes to use a cpuid interface to detect if we are
>   running on an hypervisor.
> 
>   The discovery of a hypervisor is determined by bit 31 of CPUID#1_ECX,
>   which is defined to be "hypervisor present bit". For a VM, the bit is
>   1, otherwise it is set to 0. This bit is not officially documented by
>   either Intel/AMD yet, but they plan to do so some time soon, in the
>   meanwhile they have promised to keep it reserved for virtualization."
> 
> The reserved promise seems to hold. AMDs APM has it documented. The
> Intel SDM not so.
> 
> Also the kernel side of KVM does not enforce that bit, it's up to the user
> space management to set it.
> 
> And yes, I've tripped over this with some hypervisors and even qemu KVM
> failed to set it in the early days because it was masked with host CPUID
> trimming as there the bit is obviously 0.
> 
> DMI vendor name is pretty good final check when the bit is 0. The
> strings I'm aware of are:
> 
> QEMU, Bochs, KVM, Xen, VMware, VMW, VMware Inc., innotek GmbH,
> Oracle
> Corporation, Parallels, BHYVE, Microsoft Corporation
> 
> which is not complete but better than nothing ;)
> 
> Thanks,
> 
>         tglx

Hi, Thomas,

CPUID#1_ECX is a x86 thing. Do we need to figure out probably_on_
bare_metal for every architecture altogether, or is it OK to just
handle it for x86 arch at this stage? Based on previous discussions 
ims is just one piece of multiple technologies to enable SIOV-like
scalability. Ideally arch-specific enablement beyond ims (e.g. the 
IOMMU part) will be required for such scaled usage thus we 
may just leave ims disabled for non-x86 and wait until that time to 
figure out arch specific probably_on_bare_metal?

Thanks
Kevin
Jason Gunthorpe Nov. 13, 2020, 12:57 p.m. UTC | #64
On Fri, Nov 13, 2020 at 02:42:02AM +0000, Tian, Kevin wrote:

> CPUID#1_ECX is a x86 thing. Do we need to figure out probably_on_
> bare_metal for every architecture altogether, or is it OK to just
> handle it for x86 arch at this stage? Based on previous discussions 
> ims is just one piece of multiple technologies to enable SIOV-like
> scalability. Ideally arch-specific enablement beyond ims (e.g. the 
> IOMMU part) will be required for such scaled usage thus we 
> may just leave ims disabled for non-x86 and wait until that time to 
> figure out arch specific probably_on_bare_metal?

At the very least you need to ensure that
pci_subdevice_msi_create_irq_domain() fails entirely on other
architectures until they can sort out these sorts of issues..

Jason
Thomas Gleixner Nov. 13, 2020, 1:32 p.m. UTC | #65
On Fri, Nov 13 2020 at 02:42, Kevin Tian wrote:
>> From: Thomas Gleixner <tglx@linutronix.de>
> CPUID#1_ECX is a x86 thing. Do we need to figure out probably_on_
> bare_metal for every architecture altogether, or is it OK to just
> handle it for x86 arch at this stage? Based on previous discussions 
> ims is just one piece of multiple technologies to enable SIOV-like
> scalability. Ideally arch-specific enablement beyond ims (e.g. the 
> IOMMU part) will be required for such scaled usage thus we 
> may just leave ims disabled for non-x86 and wait until that time to 
> figure out arch specific probably_on_bare_metal?

Of course is this not only an x86 problem. Every architecture which
supports virtualization has the same issue. ARM(64) has no way to tell
for sure whether the machine runs bare metal either. No idea about the
other architectures.

Thanks,

        tglx
Luck, Tony Nov. 13, 2020, 4:12 p.m. UTC | #66
> Of course is this not only an x86 problem. Every architecture which
> supports virtualization has the same issue. ARM(64) has no way to tell
> for sure whether the machine runs bare metal either. No idea about the
> other architectures.

Sounds like a hypervisor problem. If the VMM provides perfect emulation
of every weird quirk of h/w, then it is OK to let the guest believe that it is
running on bare metal.

If it isn't perfect, then it should make sure the guest knows *for sure*, so that
the guest can take appropriate actions to avoid the sharp edges.

-Tony
Raj, Ashok Nov. 13, 2020, 5:38 p.m. UTC | #67
On Fri, Nov 13, 2020 at 08:12:39AM -0800, Luck, Tony wrote:
> > Of course is this not only an x86 problem. Every architecture which
> > supports virtualization has the same issue. ARM(64) has no way to tell
> > for sure whether the machine runs bare metal either. No idea about the
> > other architectures.
> 
> Sounds like a hypervisor problem. If the VMM provides perfect emulation
> of every weird quirk of h/w, then it is OK to let the guest believe that it is
> running on bare metal.

That's true, which is why there isn't an immutable bit in cpuid or
otherwise telling you are running under a hypervisor. Providing something
like that would make certain features not virtualizable. Apparently before we
had faulting cpuid, what you had in guest was the real raw cpuid. 

Waiver: I'm not saying this is perfect, I'm just replaying the reason
behind it. Not trying to defend it... flames > /dev/null
> 
> If it isn't perfect, then it should make sure the guest knows *for sure*, so that
> the guest can take appropriate actions to avoid the sharp edges.
> 

There are indeed 2 problems to solve.

1. How does device driver know if device is IMS capable.

   IMS is a device attribute. Each vendor can provide its own method to
   provide that indication. One such mechanism is the DVSEC.SIOV.IMS
   property. Some might believe this is for use only by Intel. For DVSEC I
   don't believe there is such a connection as in device vendor id in
   standard header. TBH, there are other device vendors using the exact
   same method to indicate SIOV and IMS propeties. What a DVSEC vendor ID
   states is "As defined by Vendor X". 

   Why we choose a config vs something in device specific mmio is because
   today VFIO being that one common mechanism, it only exposes known
   standard and some extended headers to guest. When we expose a full PF,
   the guest doens't see the DVSEC, so drivers know this isn't available.

   This is our mechanism to stop drivers from calling
   pci_ims_array_create_msi_irq_domain(). It may not be perfect for all
   devices, it is a device specific mechanism. For devices under
   consideration following the SIOV spec it meets the sprit of the
   requirement even without #2 below. When devices have no way to detect
   this, #2 is required as a second way to block IMS.

2. How does platform component (IOMMU) inform if they can support all forms
   of IMS. (On device, or in memory). 
   
   On device would require some form trap/emulate. Legacy MSIx already has
   that solved, but for device specific store you need some additional
   work.

   When its system memory (say IMS is in GPA space), you need some form of
   hypercall. There is no way around it since we can't intercept. Yes, you
   can maybe map those as RO and trap, but its not pretty.

   To solve this rather than a generic platform capability, maybe we should
   flip this to IOMMU instead, because that's the one that offers this
   capability today.

   iommu_ims_supported() 
   	When platform has no IOMMU or no hypervisor calls, it returns
	false. So device driver can tell, even if it supports IMS
	capability deduction, does the platform support IMS.
   
        On platforms where iommu supports capability.

	Either there is a vIOMMU with a Virtual Command Register that can
	provide a way to get the interrupt handle similar to what you would
	get from an hypercall for instance. Or there is a real hypercall
	that supports giving the guest OS the physical IRTE handle.
Christoph Hellwig Nov. 14, 2020, 10:34 a.m. UTC | #68
On Thu, Nov 12, 2020 at 11:42:46PM +0100, Thomas Gleixner wrote:
> DMI vendor name is pretty good final check when the bit is 0. The
> strings I'm aware of are:
> 
> QEMU, Bochs, KVM, Xen, VMware, VMW, VMware Inc., innotek GmbH, Oracle
> Corporation, Parallels, BHYVE, Microsoft Corporation
> 
> which is not complete but better than nothing ;)

Which is why I really think we need explicit opt-ins for "native"
SIOV handling and for paravirtualized SIOV handling, with the kernel
not offering support at all without either or a manual override on
the command line.
Raj, Ashok Nov. 14, 2020, 9:18 p.m. UTC | #69
On Sat, Nov 14, 2020 at 10:34:30AM +0000, Christoph Hellwig wrote:
> On Thu, Nov 12, 2020 at 11:42:46PM +0100, Thomas Gleixner wrote:
> > DMI vendor name is pretty good final check when the bit is 0. The
> > strings I'm aware of are:
> > 
> > QEMU, Bochs, KVM, Xen, VMware, VMW, VMware Inc., innotek GmbH, Oracle
> > Corporation, Parallels, BHYVE, Microsoft Corporation
> > 
> > which is not complete but better than nothing ;)
> 
> Which is why I really think we need explicit opt-ins for "native"
> SIOV handling and for paravirtualized SIOV handling, with the kernel
> not offering support at all without either or a manual override on
> the command line.

opt-in by device or kernel? The way we are planning to support this is:

Device support for IMS - Can discover in device specific means
Kernel support for IMS. - Supported by IOMMU driver.

each driver can check 

if (dev_supports_ims() && iommu_supports_ims()) {
	/* Then IMS is supported in the platform.*/
}


until we have vIOMMU support or a hypercall, iommu_supports_ims() will
check if X86_FEATURE_HYPERVISOR in addition to the platform id's Thomas
mentioned. or on intel platform check for cap.caching_mode=1 and return false.

When we add support for getting a native interrupt handle then we will plumb that
appropriately.

Does this match what you wanted?
Thomas Gleixner Nov. 15, 2020, 11:26 a.m. UTC | #70
On Sat, Nov 14 2020 at 13:18, Ashok Raj wrote:
> On Sat, Nov 14, 2020 at 10:34:30AM +0000, Christoph Hellwig wrote:
>> On Thu, Nov 12, 2020 at 11:42:46PM +0100, Thomas Gleixner wrote:
>> Which is why I really think we need explicit opt-ins for "native"
>> SIOV handling and for paravirtualized SIOV handling, with the kernel
>> not offering support at all without either or a manual override on
>> the command line.
>
> opt-in by device or kernel? The way we are planning to support this is:
>
> Device support for IMS - Can discover in device specific means
> Kernel support for IMS. - Supported by IOMMU driver.

And why exactly do we have to enforce IOMMU support? Please stop looking
at IMS purely from the IDXD perspective. We are talking about the
general concept here and not about the restricted Intel universe.

> each driver can check 
>
> if (dev_supports_ims() && iommu_supports_ims()) {
> 	/* Then IMS is supported in the platform.*/
> }

Please forget this 'each driver can check'. That's just wrong.

The only thing the driver has to check is whether the device supports
IMS or not. Everything else has to be handled by the underlying
infrastructure.

That's pretty much the same thing like PCI/MSI[X]. The driver does not
have to check 'device_has_msix() && platform_supports_msix()'. Enabling
MSI[X] will simply fail if it's not supported.

So for IMS creating the underlying irqdomain has to fail when the
platform does not support it and the driver can act upon the fail and
fallback to MSI[X] or just refuse to load when IMS is required for the
device to be functional.

Thanks,

        tglx
Raj, Ashok Nov. 15, 2020, 7:31 p.m. UTC | #71
On Sun, Nov 15, 2020 at 12:26:22PM +0100, Thomas Gleixner wrote:
> On Sat, Nov 14 2020 at 13:18, Ashok Raj wrote:
> > On Sat, Nov 14, 2020 at 10:34:30AM +0000, Christoph Hellwig wrote:
> >> On Thu, Nov 12, 2020 at 11:42:46PM +0100, Thomas Gleixner wrote:
> >> Which is why I really think we need explicit opt-ins for "native"
> >> SIOV handling and for paravirtualized SIOV handling, with the kernel
> >> not offering support at all without either or a manual override on
> >> the command line.
> >
> > opt-in by device or kernel? The way we are planning to support this is:
> >
> > Device support for IMS - Can discover in device specific means
> > Kernel support for IMS. - Supported by IOMMU driver.
> 
> And why exactly do we have to enforce IOMMU support? Please stop looking
> at IMS purely from the IDXD perspective. We are talking about the
> general concept here and not about the restricted Intel universe.

I think you have mentioned it almost every reply :-)..Got that! Point taken
several emails ago!! :-)

I didn't mean just for idxd, I said for *ANY* device driver that wants to
use IMS.

> 
> > each driver can check 
> >
> > if (dev_supports_ims() && iommu_supports_ims()) {
> > 	/* Then IMS is supported in the platform.*/
> > }
> 
> Please forget this 'each driver can check'. That's just wrong.

Ok.

> 
> The only thing the driver has to check is whether the device supports
> IMS or not. Everything else has to be handled by the underlying
> infrastructure.

That's pretty much the same thing.. I guess you wanted to add 
"Does infrastructure support IMS" to be someplace else, instead
of device driver checking it. That's perfectly fine.

Until we support this natively via hypercall or vIOMMU we can use your
varient of finding if you are not on bare_metal to decide support for IMS.

How you highligted below:

https://lore.kernel.org/lkml/877dqrnzr3.fsf@nanos.tec.linutronix.de/

probably_on_bare_metal()
{
        if (CPUID(FEATURE_HYPERVISOR))
        	return false;
       	if (dmi_match_hypervisor_vendor())
        	return false;

        return PROBABLY_RUNNING_ON_BARE_METAL;
}

The above is all we need for now and will work in almost all cases. 
We will move forward with just the above in the next series.

Below is for future consideration.

Even the above isn't fool proof if both HYPERVISOR feature flag isn't set,
and the dmi_string doesn't match, say some new hypervisor. The only way 
we can figure that is

- If no iommu support, or iommu can tell if this is a virtualized iommu.
The presence of caching_mode is one such indication for Intel. 

PS: Other IOMMU's must have something like this to support virtualization.
    I'm not saying this is an Intel only feature just in case you interpret
    it that way! I'm only saying if there is a mechanism to distinguish
    native vs emulated platform.

When vIOMMU supports getting native interrupt handle via a virtual command
interface for Intel IOMMU's. OR some equivalent when other vedors provide
such capability. Even without a hypercall virtualizing IOMMU can provide
the same solution.

If we support hypercall then its more generic so it would fall into the
native all platforms/vendors. Certainly the most scalable long term
solution.


Cheers,
Ashok
Thomas Gleixner Nov. 15, 2020, 10:11 p.m. UTC | #72
On Sun, Nov 15 2020 at 11:31, Ashok Raj wrote:
> On Sun, Nov 15, 2020 at 12:26:22PM +0100, Thomas Gleixner wrote:
>> > opt-in by device or kernel? The way we are planning to support this is:
>> >
>> > Device support for IMS - Can discover in device specific means
>> > Kernel support for IMS. - Supported by IOMMU driver.
>> 
>> And why exactly do we have to enforce IOMMU support? Please stop looking
>> at IMS purely from the IDXD perspective. We are talking about the
>> general concept here and not about the restricted Intel universe.
>
> I think you have mentioned it almost every reply :-)..Got that! Point taken
> several emails ago!! :-)

You sure? I _try_ to not mention it again then. No promise though. :)

> I didn't mean just for idxd, I said for *ANY* device driver that wants to
> use IMS.

Which is wrong. Again:

A) For PF/VF on bare metal there is absolutely no IOMMU dependency
   because it does not have a PASID requirement. It's just an
   alternative solution to MSI[X], which allows optimizations like
   storing the message in driver manages queue memory or lifting the
   restriction of 2048 interrupts per device. Nothing else.

B) For PF/VF in a guest the IOMMU dependency of IMS is a red herring.
   There is no direct dependency on the IOMMU.

   The problem is the inability of the VMM to trap the message write to
   the IMS storage if the storage is in guest driver managed memory.
   This can be solved with either

   - a hypercall which translates the guest MSI message
   or
   - a vIOMMU which uses a hypercall or whatever to translate the guest
     MSI message

C) Subdevices ala mdev are a different story. They require PASID which
   enforces IOMMU and the IMS part is not managed by the users anyway.

So we have a couple of problems to solve:

  1) Figure out whether the OS runs on bare metal

     There is no reliable answer to that, so we either:

      - Use heuristics and assume that failure is unlikely and in case
        of failure blame the incompetence of VMM authors and/or
        sysadmins

     or
     
      - Default to IMS disabled and let the sysadmin enable it via
        command line option.

        If the kernel detects to run in a VM it yells and disables it
        unless the OS and the hypervisor agree to provide support for
        that scenario (see #2).

        That's fails as well if the sysadmin does so when the OS runs on
        a VMM which is not identifiable, but at least we can rightfully
        blame the sysadmin in that case.

     or

      - Declare that IMS always depends on IOMMU

        I personaly don't care, but people working on these kind of
        device already said, that they want to avoid it when possible.
        
        If you want to go that route, then please talk to those folks
        and ask them to agree in public.

     You also need to take into account that this must work on all
     architectures which support virtualization because IMS is
     architecture independent.

  2) Guest support for PF/VF

     Again we have several scenarios depending on the IMS storage
     type.

      - If the storage type is device memory then it's pretty much the
        same as MSI[X] just a different location.

      - If the storage is in driver managed memory then this needs
        #1 plus guest OS and hypervisor support (hypercall/vIOMMU)
        
  3) Guest support for PF/VF and guest managed subdevice (mdev)

     Depends on #1 and #2 and is an orthogonal problem if I'm not
     missing something.

To move forward we need to make a decision about #1 and #2 now.

This needs to be well thought out as changing it after the fact is
going to be a nightmare.

/me grudgingly refrains from mentioning the obvious once more.

Thanks,

        tglx
Raj, Ashok Nov. 16, 2020, 12:22 a.m. UTC | #73
On Sun, Nov 15, 2020 at 11:11:27PM +0100, Thomas Gleixner wrote:
> On Sun, Nov 15 2020 at 11:31, Ashok Raj wrote:
> > On Sun, Nov 15, 2020 at 12:26:22PM +0100, Thomas Gleixner wrote:
> >> > opt-in by device or kernel? The way we are planning to support this is:
> >> >
> >> > Device support for IMS - Can discover in device specific means
> >> > Kernel support for IMS. - Supported by IOMMU driver.
> >> 
> >> And why exactly do we have to enforce IOMMU support? Please stop looking
> >> at IMS purely from the IDXD perspective. We are talking about the
> >> general concept here and not about the restricted Intel universe.
> >
> > I think you have mentioned it almost every reply :-)..Got that! Point taken
> > several emails ago!! :-)
> 
> You sure? I _try_ to not mention it again then. No promise though. :)

Hey.. anything that's entertaining go for it :-)

> 
> > I didn't mean just for idxd, I said for *ANY* device driver that wants to
> > use IMS.
> 
> Which is wrong. Again:
> 
> A) For PF/VF on bare metal there is absolutely no IOMMU dependency
>    because it does not have a PASID requirement. It's just an
>    alternative solution to MSI[X], which allows optimizations like
>    storing the message in driver manages queue memory or lifting the
>    restriction of 2048 interrupts per device. Nothing else.

You are right.. my eyes were clouded by virtualization.. no dependency for
native absolutely.

> 
> B) For PF/VF in a guest the IOMMU dependency of IMS is a red herring.
>    There is no direct dependency on the IOMMU.
> 
>    The problem is the inability of the VMM to trap the message write to
>    the IMS storage if the storage is in guest driver managed memory.
>    This can be solved with either
> 
>    - a hypercall which translates the guest MSI message
>    or
>    - a vIOMMU which uses a hypercall or whatever to translate the guest
>      MSI message
> 
> C) Subdevices ala mdev are a different story. They require PASID which
>    enforces IOMMU and the IMS part is not managed by the users anyway.

You are right again :)

The subdevices require PASID & IOMMU in native, but inside the guest there is no
need for IOMMU unless you want to build SVM on top. subdevices work without
any vIOMMU or hypercall in the guest. Only because they look like normal
PCI devices we could map interrupts to legacy MSIx.

> 
> So we have a couple of problems to solve:
> 
>   1) Figure out whether the OS runs on bare metal
> 
>      There is no reliable answer to that, so we either:
> 
>       - Use heuristics and assume that failure is unlikely and in case
>         of failure blame the incompetence of VMM authors and/or
>         sysadmins
> 
>      or
>      
>       - Default to IMS disabled and let the sysadmin enable it via
>         command line option.
> 
>         If the kernel detects to run in a VM it yells and disables it
>         unless the OS and the hypervisor agree to provide support for
>         that scenario (see #2).
> 
>         That's fails as well if the sysadmin does so when the OS runs on
>         a VMM which is not identifiable, but at least we can rightfully
>         blame the sysadmin in that case.

cmdline isn't nice, best to have this functional out of box.

> 
>      or
> 
>       - Declare that IMS always depends on IOMMU

As you had mentioned IMS has no real dependency on IOMMU in native.

we just need to make sure if running in guest we have support for it
plumbed.

> 
>         I personaly don't care, but people working on these kind of
>         device already said, that they want to avoid it when possible.
>         
>         If you want to go that route, then please talk to those folks
>         and ask them to agree in public.
> 
>      You also need to take into account that this must work on all
>      architectures which support virtualization because IMS is
>      architecture independent.

What you suggest makes perfect sense. We can certainly get buy in from
iommu list and have this co-ordinated between all existing iommu varients.

> 
>   2) Guest support for PF/VF
> 
>      Again we have several scenarios depending on the IMS storage
>      type.
> 
>       - If the storage type is device memory then it's pretty much the
>         same as MSI[X] just a different location.

True, but still need to have some special handling for trapping those mmio
access. Unlike for MSIx VFIO already traps them and everything is
pre-plummbed. It isn't seamless as its for MSIx.

> 
>       - If the storage is in driver managed memory then this needs
>         #1 plus guest OS and hypervisor support (hypercall/vIOMMU)

Violent agreement here :-)

>         
>   3) Guest support for PF/VF and guest managed subdevice (mdev)
> 
>      Depends on #1 and #2 and is an orthogonal problem if I'm not
>      missing something.
> 
> To move forward we need to make a decision about #1 and #2 now.

Mostly in agreement. Except for mdev (current considered use case) have no
need for IMS in the guest. (Don't get me wrong, I'm not saying some odd
device managing sub-devices would need IMS in addition and that the 2048
MSIx emulation. 
> 
> This needs to be well thought out as changing it after the fact is
> going to be a nightmare.
> 
> /me grudgingly refrains from mentioning the obvious once more.
> 

So this isn't an idxd and Intel only thing :-)... 

Cheers,
Ashok
Tian, Kevin Nov. 16, 2020, 7:31 a.m. UTC | #74
> From: Raj, Ashok <ashok.raj@intel.com>
> Sent: Monday, November 16, 2020 8:23 AM
> 
> On Sun, Nov 15, 2020 at 11:11:27PM +0100, Thomas Gleixner wrote:
> > On Sun, Nov 15 2020 at 11:31, Ashok Raj wrote:
> > > On Sun, Nov 15, 2020 at 12:26:22PM +0100, Thomas Gleixner wrote:
> > >> > opt-in by device or kernel? The way we are planning to support this is:
> > >> >
> > >> > Device support for IMS - Can discover in device specific means
> > >> > Kernel support for IMS. - Supported by IOMMU driver.
> > >>
> > >> And why exactly do we have to enforce IOMMU support? Please stop
> looking
> > >> at IMS purely from the IDXD perspective. We are talking about the
> > >> general concept here and not about the restricted Intel universe.
> > >
> > > I think you have mentioned it almost every reply :-)..Got that! Point taken
> > > several emails ago!! :-)
> >
> > You sure? I _try_ to not mention it again then. No promise though. :)
> 
> Hey.. anything that's entertaining go for it :-)
> 
> >
> > > I didn't mean just for idxd, I said for *ANY* device driver that wants to
> > > use IMS.
> >
> > Which is wrong. Again:
> >
> > A) For PF/VF on bare metal there is absolutely no IOMMU dependency
> >    because it does not have a PASID requirement. It's just an
> >    alternative solution to MSI[X], which allows optimizations like
> >    storing the message in driver manages queue memory or lifting the
> >    restriction of 2048 interrupts per device. Nothing else.
> 
> You are right.. my eyes were clouded by virtualization.. no dependency for
> native absolutely.
> 
> >
> > B) For PF/VF in a guest the IOMMU dependency of IMS is a red herring.
> >    There is no direct dependency on the IOMMU.
> >
> >    The problem is the inability of the VMM to trap the message write to
> >    the IMS storage if the storage is in guest driver managed memory.
> >    This can be solved with either
> >
> >    - a hypercall which translates the guest MSI message
> >    or
> >    - a vIOMMU which uses a hypercall or whatever to translate the guest
> >      MSI message
> >
> > C) Subdevices ala mdev are a different story. They require PASID which
> >    enforces IOMMU and the IMS part is not managed by the users anyway.
> 
> You are right again :)
> 
> The subdevices require PASID & IOMMU in native, but inside the guest there
> is no
> need for IOMMU unless you want to build SVM on top. subdevices work
> without
> any vIOMMU or hypercall in the guest. Only because they look like normal
> PCI devices we could map interrupts to legacy MSIx.

Guest managed subdevices on PF/VF requires vIOMMU. Anyway I think
Thomas was just pointing out that subdevices are the only category out
of above three which may have business tied to IOMMU. 😊

> 
> >
> > So we have a couple of problems to solve:
> >
> >   1) Figure out whether the OS runs on bare metal
> >
> >      There is no reliable answer to that, so we either:
> >
> >       - Use heuristics and assume that failure is unlikely and in case
> >         of failure blame the incompetence of VMM authors and/or
> >         sysadmins
> >
> >      or
> >
> >       - Default to IMS disabled and let the sysadmin enable it via
> >         command line option.
> >
> >         If the kernel detects to run in a VM it yells and disables it
> >         unless the OS and the hypervisor agree to provide support for
> >         that scenario (see #2).
> >
> >         That's fails as well if the sysadmin does so when the OS runs on
> >         a VMM which is not identifiable, but at least we can rightfully
> >         blame the sysadmin in that case.
> 
> cmdline isn't nice, best to have this functional out of box.
> 
> >
> >      or
> >
> >       - Declare that IMS always depends on IOMMU
> 
> As you had mentioned IMS has no real dependency on IOMMU in native.
> 
> we just need to make sure if running in guest we have support for it
> plumbed.
> 
> >
> >         I personaly don't care, but people working on these kind of
> >         device already said, that they want to avoid it when possible.
> >
> >         If you want to go that route, then please talk to those folks
> >         and ask them to agree in public.
> >
> >      You also need to take into account that this must work on all
> >      architectures which support virtualization because IMS is
> >      architecture independent.
> 
> What you suggest makes perfect sense. We can certainly get buy in from
> iommu list and have this co-ordinated between all existing iommu varients.

Does a hybrid scheme sound good here?

- Say a cmdline parameter: ims=[auto|on|off], with 'auto' as default;

- if ims=auto:

    * If arch doesn't implement probably_on_bare_metal, disallow ims;

    * If probably_on_bare_metal returns false, disallow ims;
	# (future) if hypercall is supported, allow ims;

    * If probably_on_bare_metal returns true, allow ims with caveat on
possible mis-interception of running on an old hypervisor. Sysadmin
may need to double-confirm in other means 
	# (future) if definitely_on_bare_metal is supported, no caveat;

- if ims=on:

    * If probably_on_bare_metal return false, yell and disable it until
hypercall is supported;

    * In all other cases allow ims. Sysadmin should be blamed if any
failure as doing so implies that extra confirmation has been done;

- if ims=off, then leave it off.

It's not necessary to claim strict dependency between ims and iommu.
Instead, we could leave iommu being an arch specific check when it
applies:

probably_on_bare_metal()
{
       if (CPUID(FEATURE_HYPERVISOR))
        	return false;
       if (dmi_match_hypervisor_vendor())
        	return false;
       if (iommu_existing() && iommu_in_guest())
        	return false;

        return PROBABLY_RUNNING_ON_BARE_METAL;
}
 
> 
> >
> >   2) Guest support for PF/VF
> >
> >      Again we have several scenarios depending on the IMS storage
> >      type.
> >
> >       - If the storage type is device memory then it's pretty much the
> >         same as MSI[X] just a different location.
> 
> True, but still need to have some special handling for trapping those mmio
> access. Unlike for MSIx VFIO already traps them and everything is
> pre-plummbed. It isn't seamless as its for MSIx.

yes. So what about tying guest IMS to hypercall even when emulation
is possible on some devices?  It's difficult for the guest to know that
its IMS is emulated by hypervisor. Adopting an unified policy for all
IMS-capable devices might be an easier path.

> 
> >
> >       - If the storage is in driver managed memory then this needs
> >         #1 plus guest OS and hypervisor support (hypercall/vIOMMU)
> 
> Violent agreement here :-)
> 
> >
> >   3) Guest support for PF/VF and guest managed subdevice (mdev)
> >
> >      Depends on #1 and #2 and is an orthogonal problem if I'm not
> >      missing something.
> >
> > To move forward we need to make a decision about #1 and #2 now.
> 
> Mostly in agreement. Except for mdev (current considered use case) have no
> need for IMS in the guest. (Don't get me wrong, I'm not saying some odd
> device managing sub-devices would need IMS in addition and that the 2048
> MSIx emulation.
> >
> > This needs to be well thought out as changing it after the fact is
> > going to be a nightmare.
> >
> > /me grudgingly refrains from mentioning the obvious once more.
> >
> 
> So this isn't an idxd and Intel only thing :-)...
> 
> Cheers,
> Ashok

Thanks
Kevin
Christoph Hellwig Nov. 16, 2020, 8:25 a.m. UTC | #75
On Sat, Nov 14, 2020 at 01:18:37PM -0800, Raj, Ashok wrote:
> On Sat, Nov 14, 2020 at 10:34:30AM +0000, Christoph Hellwig wrote:
> > On Thu, Nov 12, 2020 at 11:42:46PM +0100, Thomas Gleixner wrote:
> > > DMI vendor name is pretty good final check when the bit is 0. The
> > > strings I'm aware of are:
> > > 
> > > QEMU, Bochs, KVM, Xen, VMware, VMW, VMware Inc., innotek GmbH, Oracle
> > > Corporation, Parallels, BHYVE, Microsoft Corporation
> > > 
> > > which is not complete but better than nothing ;)
> > 
> > Which is why I really think we need explicit opt-ins for "native"
> > SIOV handling and for paravirtualized SIOV handling, with the kernel
> > not offering support at all without either or a manual override on
> > the command line.
> 
> opt-in by device or kernel? The way we are planning to support this is:

opt-in by the platform.  Not sure if an ACPI interface or something else
would be best.  But basically the kernel needs to be able to query:

Does this platform claim to support IMS, and if yes how.  If there is no
answer we need assume the platform doesn't.
Jason Gunthorpe Nov. 16, 2020, 3:46 p.m. UTC | #76
On Mon, Nov 16, 2020 at 07:31:49AM +0000, Tian, Kevin wrote:

> > The subdevices require PASID & IOMMU in native, but inside the guest there
> > is no
> > need for IOMMU unless you want to build SVM on top. subdevices work
> > without
> > any vIOMMU or hypercall in the guest. Only because they look like normal
> > PCI devices we could map interrupts to legacy MSIx.
> 
> Guest managed subdevices on PF/VF requires vIOMMU. 

Why? I've never heard we need vIOMMU for our existing SRIOV flows in
VMs??

Jason
Thomas Gleixner Nov. 16, 2020, 5:56 p.m. UTC | #77
On Mon, Nov 16 2020 at 11:46, Jason Gunthorpe wrote:

> On Mon, Nov 16, 2020 at 07:31:49AM +0000, Tian, Kevin wrote:
>
>> > The subdevices require PASID & IOMMU in native, but inside the guest there
>> > is no
>> > need for IOMMU unless you want to build SVM on top. subdevices work
>> > without
>> > any vIOMMU or hypercall in the guest. Only because they look like normal
>> > PCI devices we could map interrupts to legacy MSIx.
>> 
>> Guest managed subdevices on PF/VF requires vIOMMU. 
>
> Why? I've never heard we need vIOMMU for our existing SRIOV flows in
> VMs??

Handing PF/VF into the guest does not require it.

But if the PF/VF driver in the guest wants to create and manage the
magic mdev subdevices which require PASID support then you surely need
it.

Thanks,

        tglx
Jason Gunthorpe Nov. 16, 2020, 6:02 p.m. UTC | #78
On Mon, Nov 16, 2020 at 06:56:33PM +0100, Thomas Gleixner wrote:
> On Mon, Nov 16 2020 at 11:46, Jason Gunthorpe wrote:
> 
> > On Mon, Nov 16, 2020 at 07:31:49AM +0000, Tian, Kevin wrote:
> >
> >> > The subdevices require PASID & IOMMU in native, but inside the guest there
> >> > is no
> >> > need for IOMMU unless you want to build SVM on top. subdevices work
> >> > without
> >> > any vIOMMU or hypercall in the guest. Only because they look like normal
> >> > PCI devices we could map interrupts to legacy MSIx.
> >> 
> >> Guest managed subdevices on PF/VF requires vIOMMU. 
> >
> > Why? I've never heard we need vIOMMU for our existing SRIOV flows in
> > VMs??
> 
> Handing PF/VF into the guest does not require it.
> 
> But if the PF/VF driver in the guest wants to create and manage the
> magic mdev subdevices which require PASID support then you surely need
> it.

'magic mdevs' are only one reason to use IMS in a guest. On mlx5 we
might want to use IMS for VPDA devices. mlx5 can spawn a VDPA device
in a guest, against a 'ADI', without ever requiring an IOMMU to do it.

We don't even need IOMMU in the hypervisor to create the ADI, mlx5 has
an internal secure IOMMU that can be used instead of the platform
IOMMU.

Not saying this is a major use case, or a reason not to link things to
IOMMU detection, but lets be clear that a hard need for IOMMU is a
another IDXD thing, not general.

Jason
Thomas Gleixner Nov. 16, 2020, 8:37 p.m. UTC | #79
On Mon, Nov 16 2020 at 14:02, Jason Gunthorpe wrote:
> On Mon, Nov 16, 2020 at 06:56:33PM +0100, Thomas Gleixner wrote:
>> On Mon, Nov 16 2020 at 11:46, Jason Gunthorpe wrote:
>> 
>> > On Mon, Nov 16, 2020 at 07:31:49AM +0000, Tian, Kevin wrote:
>> >
>> >> > The subdevices require PASID & IOMMU in native, but inside the guest there
>> >> > is no
>> >> > need for IOMMU unless you want to build SVM on top. subdevices work
>> >> > without
>> >> > any vIOMMU or hypercall in the guest. Only because they look like normal
>> >> > PCI devices we could map interrupts to legacy MSIx.
>> >> 
>> >> Guest managed subdevices on PF/VF requires vIOMMU. 
>> >
>> > Why? I've never heard we need vIOMMU for our existing SRIOV flows in
>> > VMs??
>> 
>> Handing PF/VF into the guest does not require it.
>> 
>> But if the PF/VF driver in the guest wants to create and manage the
>> magic mdev subdevices which require PASID support then you surely need
>> it.
>
> 'magic mdevs' are only one reason to use IMS in a guest. On mlx5 we
> might want to use IMS for VPDA devices. mlx5 can spawn a VDPA device
> in a guest, against a 'ADI', without ever requiring an IOMMU to do it.
>
> We don't even need IOMMU in the hypervisor to create the ADI, mlx5 has
> an internal secure IOMMU that can be used instead of the platform
> IOMMU.
>
> Not saying this is a major use case, or a reason not to link things to
> IOMMU detection, but lets be clear that a hard need for IOMMU is a
> another IDXD thing, not general.

Fair enough.

Thanks,

        tglx
Tian, Kevin Nov. 16, 2020, 11:51 p.m. UTC | #80
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, November 17, 2020 2:03 AM
> 
> On Mon, Nov 16, 2020 at 06:56:33PM +0100, Thomas Gleixner wrote:
> > On Mon, Nov 16 2020 at 11:46, Jason Gunthorpe wrote:
> >
> > > On Mon, Nov 16, 2020 at 07:31:49AM +0000, Tian, Kevin wrote:
> > >
> > >> > The subdevices require PASID & IOMMU in native, but inside the guest
> there
> > >> > is no
> > >> > need for IOMMU unless you want to build SVM on top. subdevices
> work
> > >> > without
> > >> > any vIOMMU or hypercall in the guest. Only because they look like
> normal
> > >> > PCI devices we could map interrupts to legacy MSIx.
> > >>
> > >> Guest managed subdevices on PF/VF requires vIOMMU.
> > >
> > > Why? I've never heard we need vIOMMU for our existing SRIOV flows in
> > > VMs??
> >
> > Handing PF/VF into the guest does not require it.
> >
> > But if the PF/VF driver in the guest wants to create and manage the
> > magic mdev subdevices which require PASID support then you surely need
> > it.
> 
> 'magic mdevs' are only one reason to use IMS in a guest. On mlx5 we
> might want to use IMS for VPDA devices. mlx5 can spawn a VDPA device
> in a guest, against a 'ADI', without ever requiring an IOMMU to do it.
> 
> We don't even need IOMMU in the hypervisor to create the ADI, mlx5 has
> an internal secure IOMMU that can be used instead of the platform
> IOMMU.
> 
> Not saying this is a major use case, or a reason not to link things to
> IOMMU detection, but lets be clear that a hard need for IOMMU is a
> another IDXD thing, not general.
> 

I should use "may require" in original post. and one thing that I obviously
mixed is the requirement of PASID-granular interrupt isolation in the
physical IOMMU instead of virtual IOMMU. But anyway, I didn't attempt
to use above to build hard need for IOMMU, just the opposite when looking
at all three cases together.

btw Jason/Thomas, how do you think about the proposal down in this
thread (ims=[auto|on|off])? Does it sound a good tradeoff to move forward?

Thanks
Kevin
Thomas Gleixner Nov. 17, 2020, 9:21 a.m. UTC | #81
On Mon, Nov 16 2020 at 23:51, Kevin Tian wrote:
>> From: Jason Gunthorpe <jgg@nvidia.com>
> btw Jason/Thomas, how do you think about the proposal down in this
> thread (ims=[auto|on|off])? Does it sound a good tradeoff to move forward?

What does it solve? It defaults to auto and then you still need to solve
the problem of figuring out whether it's safe to use it or not.

The command line option is not a solution per se. It's the last resort
when the logic which decides whether IMS can be used or not fails to do
the right thing. Nothing more.

We clearly have outlined what needs to be done and you can come up with
as many magic bullets you want, they won't make the real problems go
away.

Thanks,

        tglx
diff mbox series

Patch

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 3e77a88b236c..d9335f590b42 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -36,6 +36,7 @@ 
 #include <linux/tboot.h>
 #include <linux/dmi.h>
 #include <linux/pci-ats.h>
+#include <linux/pci-siov.h>
 #include <linux/memblock.h>
 #include <linux/dma-map-ops.h>
 #include <linux/dma-direct.h>
@@ -5883,34 +5884,6 @@  static int intel_iommu_disable_auxd(struct device *dev)
 	return 0;
 }
 
-/*
- * A PCI express designated vendor specific extended capability is defined
- * in the section 3.7 of Intel scalable I/O virtualization technical spec
- * for system software and tools to detect endpoint devices supporting the
- * Intel scalable IO virtualization without host driver dependency.
- *
- * Returns the address of the matching extended capability structure within
- * the device's PCI configuration space or 0 if the device does not support
- * it.
- */
-static int siov_find_pci_dvsec(struct pci_dev *pdev)
-{
-	int pos;
-	u16 vendor, id;
-
-	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
-	while (pos) {
-		pci_read_config_word(pdev, pos + 4, &vendor);
-		pci_read_config_word(pdev, pos + 8, &id);
-		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
-			return pos;
-
-		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
-	}
-
-	return 0;
-}
-
 static bool
 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
 {
@@ -5925,7 +5898,7 @@  intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
 		if (ret < 0)
 			return false;
 
-		return !!siov_find_pci_dvsec(to_pci_dev(dev));
+		return pci_siov_supported(to_pci_dev(dev));
 	}
 
 	if (feat == IOMMU_DEV_FEAT_SVA) {
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 0c473d75e625..cf7f4d17d8cc 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -161,6 +161,21 @@  config PCI_PASID
 
 	  If unsure, say N.
 
+config PCI_DVSEC
+	bool
+
+config PCI_SIOV
+	select PCI_PASID
+	select PCI_DVSEC
+	bool "PCI SIOV support"
+	help
+	  Scalable I/O Virtualzation enables sharing of I/O devices across isolated
+	  domains through PASID based sub-device partitioning. One of the sub features
+	  supported by SIOV is Inetrrupt Message Storage (IMS). Select this option if
+	  you want to compile the support into your kernel.
+
+	  If unsure, say N.
+
 config PCI_P2PDMA
 	bool "PCI peer-to-peer transfer support"
 	depends on ZONE_DEVICE
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 522d2b974e91..653a1d69b0fc 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -20,6 +20,8 @@  obj-$(CONFIG_PCI_QUIRKS)	+= quirks.o
 obj-$(CONFIG_HOTPLUG_PCI)	+= hotplug/
 obj-$(CONFIG_PCI_MSI)		+= msi.o
 obj-$(CONFIG_PCI_ATS)		+= ats.o
+obj-$(CONFIG_PCI_DVSEC)		+= dvsec.o
+obj-$(CONFIG_PCI_SIOV)		+= siov.o
 obj-$(CONFIG_PCI_IOV)		+= iov.o
 obj-$(CONFIG_PCI_BRIDGE_EMUL)	+= pci-bridge-emul.o
 obj-$(CONFIG_PCI_LABEL)		+= pci-label.o
diff --git a/drivers/pci/dvsec.c b/drivers/pci/dvsec.c
new file mode 100644
index 000000000000..e49b079f0717
--- /dev/null
+++ b/drivers/pci/dvsec.c
@@ -0,0 +1,40 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI DVSEC helper functions
+ * Copyright (C) 2020 Intel Corp.
+ */
+
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <uapi/linux/pci_regs.h>
+#include "pci.h"
+
+/**
+ * pci_find_dvsec - return position of DVSEC with provided vendor and dvsec id
+ * @dev: the PCI device
+ * @vendor: Vendor for the DVSEC
+ * @id: the DVSEC cap id
+ *
+ * Return the offset of DVSEC on success or -ENOTSUPP if not found
+ */
+int pci_find_dvsec(struct pci_dev *dev, u16 vendor, u16 id)
+{
+	u16 dev_vendor, dev_id;
+	int pos;
+
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DVSEC);
+	if (!pos)
+		return -ENOTSUPP;
+
+	while (pos) {
+		pci_read_config_word(dev, pos + PCI_DVSEC_HEADER1, &dev_vendor);
+		pci_read_config_word(dev, pos + PCI_DVSEC_HEADER2, &dev_id);
+		if (dev_vendor == vendor && dev_id == id)
+			return pos;
+
+		pos = pci_find_next_ext_capability(dev, pos, PCI_EXT_CAP_ID_DVSEC);
+	}
+
+	return -ENOTSUPP;
+}
+EXPORT_SYMBOL_GPL(pci_find_dvsec);
diff --git a/drivers/pci/siov.c b/drivers/pci/siov.c
new file mode 100644
index 000000000000..6147e6ae5832
--- /dev/null
+++ b/drivers/pci/siov.c
@@ -0,0 +1,50 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Scalable I/O Virtualization support
+ * Copyright (C) 2020 Intel Corp.
+ */
+
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/pci-siov.h>
+#include <uapi/linux/pci_regs.h>
+#include "pci.h"
+
+/*
+ * A PCI express designated vendor specific extended capability is defined
+ * in the section 3.7 of Intel scalable I/O virtualization technical spec
+ * for system software and tools to detect endpoint devices supporting the
+ * Intel scalable IO virtualization without host driver dependency.
+ */
+
+/**
+ * pci_siov_supported - check if the device can use SIOV
+ * @dev: the PCI device
+ *
+ * Returns true if the device supports SIOV,  false otherwise.
+ */
+bool pci_siov_supported(struct pci_dev *dev)
+{
+	return pci_find_dvsec(dev, PCI_VENDOR_ID_INTEL, PCI_DVSEC_ID_INTEL_SIOV) < 0 ? false : true;
+}
+EXPORT_SYMBOL_GPL(pci_siov_supported);
+
+/**
+ * pci_ims_supported - check if the device can use IMS
+ * @dev: the PCI device
+ *
+ * Returns true if the device supports IMS, false otherwise.
+ */
+bool pci_ims_supported(struct pci_dev *dev)
+{
+	int pos;
+	u32 caps;
+
+	pos = pci_find_dvsec(dev, PCI_VENDOR_ID_INTEL, PCI_DVSEC_ID_INTEL_SIOV);
+	if (pos < 0)
+		return false;
+
+	pci_read_config_dword(dev, pos + PCI_DVSEC_INTEL_SIOV_CAP, &caps);
+	return (caps & PCI_DVSEC_INTEL_SIOV_CAP_IMS) ? true : false;
+}
+EXPORT_SYMBOL_GPL(pci_ims_supported);
diff --git a/include/linux/pci-siov.h b/include/linux/pci-siov.h
new file mode 100644
index 000000000000..a8a4eb5f4634
--- /dev/null
+++ b/include/linux/pci-siov.h
@@ -0,0 +1,18 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_PCI_SIOV_H
+#define LINUX_PCI_SIOV_H
+
+#include <linux/pci.h>
+
+#ifdef CONFIG_PCI_SIOV
+/* Scalable I/O Virtualization */
+bool pci_siov_supported(struct pci_dev *dev);
+bool pci_ims_supported(struct pci_dev *dev);
+#else /* CONFIG_PCI_SIOV */
+static inline bool pci_siov_supported(struct pci_dev *d)
+{ return false; }
+static inline bool pci_ims_supported(struct pci_dev *d)
+{ return false; }
+#endif /* CONFIG_PCI_SIOV */
+
+#endif /* LINUX_PCI_SIOV_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 22207a79762c..4710f09b43b1 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1070,6 +1070,7 @@  int pci_find_next_ext_capability(struct pci_dev *dev, int pos, int cap);
 int pci_find_ht_capability(struct pci_dev *dev, int ht_cap);
 int pci_find_next_ht_capability(struct pci_dev *dev, int pos, int ht_cap);
 struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
+int pci_find_dvsec(struct pci_dev *dev, u16 vendor, u16 id);
 
 u64 pci_get_dsn(struct pci_dev *dev);
 
@@ -1726,6 +1727,8 @@  static inline int pci_find_next_capability(struct pci_dev *dev, u8 post,
 { return 0; }
 static inline int pci_find_ext_capability(struct pci_dev *dev, int cap)
 { return 0; }
+static inline int pci_find_dvsec(struct pci_dev *dev, u16 vendor, u16 id)
+{ return 0; }
 
 static inline u64 pci_get_dsn(struct pci_dev *dev)
 { return 0; }
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 8f8bd2318c6c..3532528441ef 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -1071,6 +1071,10 @@ 
 #define PCI_DVSEC_HEADER1		0x4 /* Designated Vendor-Specific Header1 */
 #define PCI_DVSEC_HEADER2		0x8 /* Designated Vendor-Specific Header2 */
 
+#define PCI_DVSEC_ID_INTEL_SIOV		0x5
+#define PCI_DVSEC_INTEL_SIOV_CAP	0x14
+#define PCI_DVSEC_INTEL_SIOV_CAP_IMS	0x1
+
 /* Data Link Feature */
 #define PCI_DLF_CAP		0x04	/* Capabilities Register */
 #define  PCI_DLF_EXCHANGE_ENABLE	0x80000000  /* Data Link Feature Exchange Enable */