diff mbox series

[v2,1/3] iommu/virtio: Add topology description to virtio-iommu config space

Message ID 20200228172537.377327-2-jean-philippe@linaro.org
State New
Headers show
Series virtio-iommu on x86 and non-devicetree platforms | expand

Commit Message

Jean-Philippe Brucker Feb. 28, 2020, 5:25 p.m. UTC
Platforms without device-tree do not currently have a method for
describing the vIOMMU topology. Provide a topology description embedded
into the virtio device.

Use PCI FIXUP to probe the config space early, because we need to
discover the topology before any DMA configuration takes place, and the
virtio driver may be loaded much later. Since we discover the topology
description when probing the PCI hierarchy, the virtual IOMMU cannot
manage other platform devices discovered earlier.

This solution isn't elegant nor foolproof, but is the best we can do at
the moment and works with existing virtio-iommu implementations. It also
enables an IOMMU for lightweight hypervisors that do not rely on
firmware methods for booting.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
---
 MAINTAINERS                           |   2 +
 drivers/iommu/Kconfig                 |  10 +
 drivers/iommu/Makefile                |   1 +
 drivers/iommu/virtio-iommu-topology.c | 343 ++++++++++++++++++++++++++
 drivers/iommu/virtio-iommu.c          |   3 +
 include/linux/virt_iommu.h            |  19 ++
 include/uapi/linux/virtio_iommu.h     |  26 ++
 7 files changed, 404 insertions(+)
 create mode 100644 drivers/iommu/virtio-iommu-topology.c
 create mode 100644 include/linux/virt_iommu.h

Comments

Michael S. Tsirkin March 1, 2020, 11:17 a.m. UTC | #1
On Fri, Feb 28, 2020 at 06:25:36PM +0100, Jean-Philippe Brucker wrote:
> Platforms without device-tree do not currently have a method for
> describing the vIOMMU topology. Provide a topology description embedded
> into the virtio device.
> 
> Use PCI FIXUP to probe the config space early, because we need to
> discover the topology before any DMA configuration takes place, and the
> virtio driver may be loaded much later. Since we discover the topology
> description when probing the PCI hierarchy, the virtual IOMMU cannot
> manage other platform devices discovered earlier.
> 
> This solution isn't elegant nor foolproof, but is the best we can do at
> the moment and works with existing virtio-iommu implementations. It also
> enables an IOMMU for lightweight hypervisors that do not rely on
> firmware methods for booting.
> 
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
> ---
>  MAINTAINERS                           |   2 +
>  drivers/iommu/Kconfig                 |  10 +
>  drivers/iommu/Makefile                |   1 +
>  drivers/iommu/virtio-iommu-topology.c | 343 ++++++++++++++++++++++++++
>  drivers/iommu/virtio-iommu.c          |   3 +
>  include/linux/virt_iommu.h            |  19 ++
>  include/uapi/linux/virtio_iommu.h     |  26 ++
>  7 files changed, 404 insertions(+)
>  create mode 100644 drivers/iommu/virtio-iommu-topology.c
>  create mode 100644 include/linux/virt_iommu.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index fcd79fc38928..65a03ce53096 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -17781,6 +17781,8 @@ M:	Jean-Philippe Brucker <jean-philippe@linaro.org>
>  L:	virtualization@lists.linux-foundation.org
>  S:	Maintained
>  F:	drivers/iommu/virtio-iommu.c
> +F:	drivers/iommu/virtio-iommu-topology.c
> +F:	include/linux/virt_iommu.h
>  F:	include/uapi/linux/virtio_iommu.h
>  
>  VIRTUAL BOX GUEST DEVICE DRIVER
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index c5df570ef84a..f8cb45d84bb0 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -516,4 +516,14 @@ config VIRTIO_IOMMU
>  
>  	  Say Y here if you intend to run this kernel as a guest.
>  
> +config VIRTIO_IOMMU_TOPOLOGY
> +	bool "Topology properties for the virtio-iommu"
> +	depends on VIRTIO_IOMMU
> +	default y
> +	help
> +	  Enable early probing of the virtio-iommu device, to detect the
> +	  built-in topology description.
> +
> +	  Say Y here if you intend to run this kernel as a guest.
> +
>  endif # IOMMU_SUPPORT
> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
> index 9f33fdb3bb05..5da24280d08c 100644
> --- a/drivers/iommu/Makefile
> +++ b/drivers/iommu/Makefile
> @@ -37,3 +37,4 @@ obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
>  obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
>  obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
>  obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
> +obj-$(CONFIG_VIRTIO_IOMMU_TOPOLOGY) += virtio-iommu-topology.o
> diff --git a/drivers/iommu/virtio-iommu-topology.c b/drivers/iommu/virtio-iommu-topology.c
> new file mode 100644
> index 000000000000..2188624ef216
> --- /dev/null
> +++ b/drivers/iommu/virtio-iommu-topology.c
> @@ -0,0 +1,343 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/dma-iommu.h>
> +#include <linux/list.h>
> +#include <linux/pci.h>
> +#include <linux/virt_iommu.h>
> +#include <linux/virtio_ids.h>
> +#include <linux/virtio_pci.h>
> +#include <uapi/linux/virtio_iommu.h>
> +
> +struct viommu_cap_config {
> +	u8 bar;
> +	u32 length; /* structure size */
> +	u32 offset; /* structure offset within the bar */
> +};
> +
> +union viommu_topo_cfg {
> +	__le16					type;
> +	struct virtio_iommu_topo_pci_range	pci;
> +	struct virtio_iommu_topo_endpoint	ep;
> +};
> +
> +struct viommu_spec {
> +	struct device				*dev; /* transport device */
> +	struct fwnode_handle			*fwnode;
> +	struct iommu_ops			*ops;
> +	struct list_head			list;
> +	size_t					num_items;
> +	/* The config array of length num_items follows */
> +	union viommu_topo_cfg			cfg[];
> +};
> +
> +static LIST_HEAD(viommus);
> +static DEFINE_MUTEX(viommus_lock);
> +
> +#define VPCI_FIELD(field) offsetof(struct virtio_pci_cap, field)
> +
> +static inline int viommu_pci_find_capability(struct pci_dev *dev, u8 cfg_type,
> +					     struct viommu_cap_config *cap)
> +{
> +	int pos;
> +	u8 bar;
> +
> +	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
> +	     pos > 0;
> +	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
> +		u8 type;
> +
> +		pci_read_config_byte(dev, pos + VPCI_FIELD(cfg_type), &type);
> +		if (type != cfg_type)
> +			continue;
> +
> +		pci_read_config_byte(dev, pos + VPCI_FIELD(bar), &bar);
> +
> +		/* Ignore structures with reserved BAR values */
> +		if (type != VIRTIO_PCI_CAP_PCI_CFG && bar > 0x5)
> +			continue;
> +
> +		cap->bar = bar;
> +		pci_read_config_dword(dev, pos + VPCI_FIELD(length),
> +				      &cap->length);
> +		pci_read_config_dword(dev, pos + VPCI_FIELD(offset),
> +				      &cap->offset);
> +
> +		return pos;
> +	}
> +	return 0;
> +}
> +
> +static void viommu_ccopy(__le32 *dst, u32 __iomem *src, size_t length)
> +{
> +	size_t i;
> +
> +	/* For the moment all our config structures align on 32b */
> +	if (WARN_ON(length % 4))
> +		return;
> +
> +	for (i = 0; i < length / 4; i++)
> +		/* Keep little-endian data */
> +		dst[i] = cpu_to_le32(ioread32(src + i));
> +}
> +
> +static int viommu_parse_topology(struct device *dev,
> +				 struct virtio_iommu_config __iomem *cfg)


why is this int? always succeeds and callers don't use the return code
...


> +{
> +	size_t i;
> +	size_t spec_length;
> +	struct viommu_spec *viommu_spec;
> +	u32 offset, item_length, num_items;
> +
> +	offset = ioread32(&cfg->topo_config.offset);
> +	item_length = ioread32(&cfg->topo_config.item_length);
> +	num_items = ioread32(&cfg->topo_config.num_items);
> +	if (!offset || !num_items || !item_length)
> +		return 0;
> +
> +	spec_length = sizeof(*viommu_spec) + num_items *
> +					     sizeof(union viommu_topo_cfg);


This can overflow. Worth detecting and failing init if it does?


> +	viommu_spec = kzalloc(spec_length, GFP_KERNEL);
> +	if (!viommu_spec)
> +		return -ENOMEM;
> +
> +	viommu_spec->dev = dev;
> +
> +	/* Copy in the whole array, sort it out later */
> +	for (i = 0; i < num_items; i++) {
> +		size_t read_length = min_t(size_t, item_length,
> +					   sizeof(union viommu_topo_cfg));
> +
> +		viommu_ccopy((__le32 *)&viommu_spec->cfg[i],

Doesn't this need __force?

> +			     (void __iomem *)cfg + offset,
> +			     read_length);

> +
> +		offset += item_length;
> +	}
> +	viommu_spec->num_items = num_items;
> +
> +	mutex_lock(&viommus_lock);
> +	list_add(&viommu_spec->list, &viommus);
> +	mutex_unlock(&viommus_lock);
> +
> +	return 0;
> +}
> +
> +static void viommu_pci_parse_topology(struct pci_dev *dev)
> +{
> +	int pos;
> +	u32 features;
> +	void __iomem *regs;
> +	struct viommu_cap_config cap = {0};
> +	struct virtio_pci_common_cfg __iomem *common_cfg;
> +
> +	/*
> +	 * The virtio infrastructure might not be loaded at this point. we need
> +	 * to access the BARs ourselves.
> +	 */
> +	pos = viommu_pci_find_capability(dev, VIRTIO_PCI_CAP_COMMON_CFG, &cap);
> +	if (!pos) {
> +		pci_warn(dev, "common capability not found\n");
> +		return;
> +	}
> +
> +	if (pci_enable_device_mem(dev))
> +		return;
> +
> +	regs = pci_iomap(dev, cap.bar, 0);
> +	if (!regs)
> +		return;
> +
> +	common_cfg = regs + cap.offset;
> +

So the virtio spec says this:

The driver MUST follow this sequence to initialize a device:

\begin{enumerate}


\item Reset the device.

\item Set the ACKNOWLEDGE status bit: the guest OS has noticed the device.

\item Set the DRIVER status bit: the guest OS knows how to drive the device.

Let's follow this unless there's a good reason not to.
And I guess let's reset at the end?


> +	/* Find out if the device supports topology description */
> +	writel(0, &common_cfg->device_feature_select);
> +	features = ioread32(&common_cfg->device_feature);
> +
> +	pci_iounmap(dev, regs);
> +
> +	if (!(features & BIT(VIRTIO_IOMMU_F_TOPOLOGY))) {
> +		pci_dbg(dev, "device doesn't have topology description");
> +		return;
> +	}
> +
> +	pos = viommu_pci_find_capability(dev, VIRTIO_PCI_CAP_DEVICE_CFG, &cap);
> +	if (!pos) {
> +		pci_warn(dev, "device config capability not found\n");
> +		return;
> +	}
> +
> +	regs = pci_iomap(dev, cap.bar, 0);
> +	if (!regs)
> +		return;
> +
> +	pci_info(dev, "parsing virtio-iommu topology\n");
> +	viommu_parse_topology(&dev->dev, regs + cap.offset);
> +	pci_iounmap(dev, regs);
> +}
> +
> +/*
> + * Catch a PCI virtio-iommu implementation early to get the topology description
> + * before we start probing other endpoints.
> + */
> +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1040 + VIRTIO_ID_IOMMU,
> +			viommu_pci_parse_topology);
> +
> +/*
> + * Return true if the device matches this topology structure. Write the endpoint
> + * ID into epid if it's the case.
> + */
> +static bool viommu_parse_pci(struct pci_dev *pdev, union viommu_topo_cfg *cfg,
> +			     u32 *epid)
> +{
> +	u32 endpoint_start;
> +	u16 start, end, domain;
> +	u16 devid = pci_dev_id(pdev);
> +	u16 type = le16_to_cpu(cfg->type);
> +
> +	if (type != VIRTIO_IOMMU_TOPO_PCI_RANGE)
> +		return false;
> +
> +	start		= le16_to_cpu(cfg->pci.requester_start);
> +	end		= le16_to_cpu(cfg->pci.requester_end);
> +	domain		= le16_to_cpu(cfg->pci.hierarchy);
> +	endpoint_start	= le32_to_cpu(cfg->pci.endpoint_start);
> +
> +	if (pci_domain_nr(pdev->bus) == domain &&
> +	    devid >= start && devid <= end) {
> +		*epid = devid - start + endpoint_start;
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static const struct iommu_ops *virt_iommu_setup(struct device *dev)
> +{
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	const struct iommu_ops *viommu_ops = NULL;
> +	struct fwnode_handle *viommu_fwnode;
> +	struct viommu_spec *viommu_spec;
> +	struct pci_dev *pci_dev = NULL;
> +	struct device *viommu_dev;
> +	bool found = false;
> +	size_t i;
> +	u32 epid;
> +	int ret;
> +
> +	/* Already translated? */
> +	if (fwspec && fwspec->ops)
> +		return NULL;
> +
> +	if (dev_is_pci(dev)) {
> +		pci_dev = to_pci_dev(dev);
> +	} else {
> +		/* At the moment we don't support platform devices */
> +		return NULL;
> +	}
> +
> +	mutex_lock(&viommus_lock);
> +	list_for_each_entry(viommu_spec, &viommus, list) {
> +		for (i = 0; i < viommu_spec->num_items; i++) {
> +			union viommu_topo_cfg *cfg = &viommu_spec->cfg[i];
> +
> +			found = viommu_parse_pci(pci_dev, cfg, &epid);
> +			if (found)
> +				break;
> +		}
> +		if (found) {
> +			viommu_ops = viommu_spec->ops;
> +			viommu_fwnode = viommu_spec->fwnode;
> +			viommu_dev = viommu_spec->dev;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&viommus_lock);
> +	if (!found)
> +		return NULL;
> +
> +	/* We're not translating ourselves. */
> +	if (viommu_dev == dev)
> +		return NULL;
> +
> +	/*
> +	 * If we found a PCI range managed by the viommu, we're the ones that
> +	 * have to request ACS.
> +	 */
> +	if (pci_dev)
> +		pci_request_acs();
> +
> +	if (!viommu_ops)
> +		return ERR_PTR(-EPROBE_DEFER);
> +
> +	ret = iommu_fwspec_init(dev, viommu_fwnode, viommu_ops);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	iommu_fwspec_add_ids(dev, &epid, 1);
> +
> +	return viommu_ops;
> +}
> +
> +/**
> + * virt_dma_configure - Configure DMA of virtualized devices
> + * @dev: the endpoint
> + *
> + * Setup the DMA and IOMMU ops of a virtual device, for platforms without DT or
> + * ACPI.
> + *
> + * Return: -EPROBE_DEFER if the device is managed by an IOMMU that hasn't been
> + *   probed yet, 0 otherwise
> + */
> +int virt_dma_configure(struct device *dev)
> +{
> +	const struct iommu_ops *iommu_ops;
> +
> +	iommu_ops = virt_iommu_setup(dev);
> +	if (IS_ERR_OR_NULL(iommu_ops)) {
> +		int ret = PTR_ERR(iommu_ops);
> +
> +		if (ret == -EPROBE_DEFER || ret == 0)
> +			return ret;
> +		dev_err(dev, "error %d while setting up virt IOMMU\n", ret);
> +		return 0;
> +	}
> +
> +	/*
> +	 * If we have reason to believe the IOMMU driver missed the initial
> +	 * add_device callback for dev, replay it to get things in order.
> +	 */
> +	if (dev->bus && !device_iommu_mapped(dev))
> +		iommu_probe_device(dev);
> +
> +	/* Assume coherent, as well as full 64-bit addresses. */
> +#ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
> +	arch_setup_dma_ops(dev, 0, ~0ULL, iommu_ops, true);
> +#else
> +	iommu_setup_dma_ops(dev, 0, ~0ULL);
> +#endif
> +	return 0;
> +}
> +
> +/**
> + * virt_set_iommu_ops - Set the IOMMU ops of a virtual IOMMU device
> + * @dev: the IOMMU device (transport)
> + * @ops: the new IOMMU ops or NULL
> + *
> + * Setup the iommu_ops associated to a viommu_spec, once the driver is loaded
> + * and the device probed.
> + */
> +void virt_set_iommu_ops(struct device *dev, struct iommu_ops *ops)
> +{
> +	struct viommu_spec *viommu_spec;
> +
> +	mutex_lock(&viommus_lock);
> +	list_for_each_entry(viommu_spec, &viommus, list) {
> +		if (viommu_spec->dev == dev) {
> +			viommu_spec->ops = ops;
> +			viommu_spec->fwnode = ops ? dev->fwnode : NULL;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&viommus_lock);
> +}
> +EXPORT_SYMBOL_GPL(virt_set_iommu_ops);
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index 93ff58632452..5429c12c879b 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -21,6 +21,7 @@
>  #include <linux/virtio.h>
>  #include <linux/virtio_config.h>
>  #include <linux/virtio_ids.h>
> +#include <linux/virt_iommu.h>
>  #include <linux/wait.h>
>  
>  #include <uapi/linux/virtio_iommu.h>
> @@ -1075,6 +1076,7 @@ static int viommu_probe(struct virtio_device *vdev)
>  	if (ret)
>  		goto err_free_vqs;
>  
> +	virt_set_iommu_ops(dev->parent, &viommu_ops);
>  	iommu_device_set_ops(&viommu->iommu, &viommu_ops);
>  	iommu_device_set_fwnode(&viommu->iommu, parent_dev->fwnode);
>  
> @@ -1121,6 +1123,7 @@ static void viommu_remove(struct virtio_device *vdev)
>  {
>  	struct viommu_dev *viommu = vdev->priv;
>  
> +	virt_set_iommu_ops(vdev->dev.parent, NULL);
>  	iommu_device_sysfs_remove(&viommu->iommu);
>  	iommu_device_unregister(&viommu->iommu);
>  
> diff --git a/include/linux/virt_iommu.h b/include/linux/virt_iommu.h
> new file mode 100644
> index 000000000000..c68b03ec75ba
> --- /dev/null
> +++ b/include/linux/virt_iommu.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef VIRTIO_IOMMU_H_
> +#define VIRTIO_IOMMU_H_
> +
> +#if IS_ENABLED(CONFIG_VIRTIO_IOMMU_TOPOLOGY)
> +int virt_dma_configure(struct device *dev);
> +void virt_set_iommu_ops(struct device *dev, struct iommu_ops *ops);
> +#else /* !CONFIG_VIRTIO_IOMMU_TOPOLOGY */
> +static inline int virt_dma_configure(struct device *dev)
> +{
> +	/* Don't disturb the normal DMA configuration methods */
> +	return 0;
> +}
> +
> +static inline void virt_set_iommu_ops(struct device *dev, struct iommu_ops *ops)
> +{ }
> +#endif /* !CONFIG_VIRTIO_IOMMU_TOPOLOGY */
> +
> +#endif /* VIRTIO_IOMMU_H_ */
> diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h
> index 237e36a280cb..ec57d215086a 100644
> --- a/include/uapi/linux/virtio_iommu.h
> +++ b/include/uapi/linux/virtio_iommu.h
> @@ -16,6 +16,7 @@
>  #define VIRTIO_IOMMU_F_BYPASS			3
>  #define VIRTIO_IOMMU_F_PROBE			4
>  #define VIRTIO_IOMMU_F_MMIO			5
> +#define VIRTIO_IOMMU_F_TOPOLOGY			6
>  
>  struct virtio_iommu_range_64 {
>  	__le64					start;
> @@ -27,6 +28,12 @@ struct virtio_iommu_range_32 {
>  	__le32					end;
>  };
>  
> +struct virtio_iommu_topo_config {
> +	__le32					offset;
> +	__le32					num_items;
> +	__le32					item_length;
> +};
> +
>  struct virtio_iommu_config {
>  	/* Supported page sizes */
>  	__le64					page_size_mask;
> @@ -36,6 +43,25 @@ struct virtio_iommu_config {
>  	struct virtio_iommu_range_32		domain_range;
>  	/* Probe buffer size */
>  	__le32					probe_size;
> +	struct virtio_iommu_topo_config		topo_config;
> +};
> +
> +#define VIRTIO_IOMMU_TOPO_PCI_RANGE		0x1
> +#define VIRTIO_IOMMU_TOPO_ENDPOINT		0x2
> +
> +struct virtio_iommu_topo_pci_range {
> +	__le16					type;
> +	__le16					hierarchy;
> +	__le16					requester_start;
> +	__le16					requester_end;
> +	__le32					endpoint_start;
> +};
> +
> +struct virtio_iommu_topo_endpoint {
> +	__le16					type;
> +	__le16					reserved;
> +	__le32					endpoint;
> +	__le64					address;
>  };
>  
>  /* Request types */
> -- 
> 2.25.0
Joerg Roedel March 2, 2020, 4:16 p.m. UTC | #2
On Fri, Feb 28, 2020 at 06:25:36PM +0100, Jean-Philippe Brucker wrote:
> This solution isn't elegant nor foolproof, but is the best we can do at
> the moment and works with existing virtio-iommu implementations. It also
> enables an IOMMU for lightweight hypervisors that do not rely on
> firmware methods for booting.

I appreciate the enablement on x86, but putting the conmfiguration into
mmio-space isn't really something I want to see upstream. What is the
problem with defining an ACPI table instead? This would also make things
work on AARCH64 UEFI machines.

Regards,

	Joerg
Auger Eric March 3, 2020, 10:19 a.m. UTC | #3
Hi Joerg,

On 3/2/20 5:16 PM, Joerg Roedel wrote:
> On Fri, Feb 28, 2020 at 06:25:36PM +0100, Jean-Philippe Brucker wrote:
>> This solution isn't elegant nor foolproof, but is the best we can do at
>> the moment and works with existing virtio-iommu implementations. It also
>> enables an IOMMU for lightweight hypervisors that do not rely on
>> firmware methods for booting.
> 
> I appreciate the enablement on x86, but putting the conmfiguration into
> mmio-space isn't really something I want to see upstream. What is the
> problem with defining an ACPI table instead? This would also make things
> work on AARCH64 UEFI machines.
Michael has pushed this solution (putting the "configuration in the PCI
config space"), I think for those main reasons:
- ACPI may not be supported on some archs/hyps
- the virtio-iommu is a PCIe device so binding should not need ACPI
description

Another issue with ACPI integration is we have different flavours of
tables that exist: IORT, DMAR, IVRS

x86 ACPI integration was suggested with IORT. But it seems ARM is
reluctant to extend IORT to support para-virtualized IOMMU. So the VIOT
was proposed as a different atternative in "[RFC 00/13] virtio-iommu on
non-devicetree platforms"
(https://patchwork.kernel.org/cover/11257727/). Proposing a table that
may be used by different vendors seems to be a challenging issue here.

So even if the above solution does not look perfect, it looked a
sensible compromise given the above arguments. Please could you explain
what are the most compelling arguments against it?

Thanks

Eric
> 
> Regards,
> 
> 	Joerg
>
Joerg Roedel March 3, 2020, 1:01 p.m. UTC | #4
Hi Eric,

On Tue, Mar 03, 2020 at 11:19:20AM +0100, Auger Eric wrote:
> Michael has pushed this solution (putting the "configuration in the PCI
> config space"), I think for those main reasons:
> - ACPI may not be supported on some archs/hyps

But on those there is device-tree, right?

> - the virtio-iommu is a PCIe device so binding should not need ACPI
> description

The other x86 IOMMUs are PCI devices too and they definitly need a ACPI
table to be configured.

> Another issue with ACPI integration is we have different flavours of
> tables that exist: IORT, DMAR, IVRS

An integration with IORT might be the best, but if it is not possible
ther can be a new table-type for Virtio-iommu. That would still follow
platform best practices.

> x86 ACPI integration was suggested with IORT. But it seems ARM is
> reluctant to extend IORT to support para-virtualized IOMMU. So the VIOT
> was proposed as a different atternative in "[RFC 00/13] virtio-iommu on
> non-devicetree platforms"
> (https://patchwork.kernel.org/cover/11257727/). Proposing a table that
> may be used by different vendors seems to be a challenging issue here.

Yeah, if I am reading that right this proposes a one-fits-all solution.
That is not needed as the other x86 IOMMUs already have their tables
defined and implemented. There is no need to change anything there.

> So even if the above solution does not look perfect, it looked a
> sensible compromise given the above arguments. Please could you explain
> what are the most compelling arguments against it?

It is a hack and should be avoided if at all possible. And defining an
own ACPI table type seems very much possible.


Regards,

	Joerg
Michael S. Tsirkin March 3, 2020, 2 p.m. UTC | #5
On Tue, Mar 03, 2020 at 02:01:56PM +0100, Joerg Roedel wrote:
> Hi Eric,
> 
> On Tue, Mar 03, 2020 at 11:19:20AM +0100, Auger Eric wrote:
> > Michael has pushed this solution (putting the "configuration in the PCI
> > config space"), I think for those main reasons:
> > - ACPI may not be supported on some archs/hyps
> 
> But on those there is device-tree, right?

Not necessarily. E.g. some power systems have neither.
There are also systems looking to bypass ACPI e.g. for boot speed.


> > - the virtio-iommu is a PCIe device so binding should not need ACPI
> > description
> 
> The other x86 IOMMUs are PCI devices too and they definitly need a ACPI
> table to be configured.
> 
> > Another issue with ACPI integration is we have different flavours of
> > tables that exist: IORT, DMAR, IVRS
> 
> An integration with IORT might be the best, but if it is not possible
> ther can be a new table-type for Virtio-iommu. That would still follow
> platform best practices.
> 
> > x86 ACPI integration was suggested with IORT. But it seems ARM is
> > reluctant to extend IORT to support para-virtualized IOMMU. So the VIOT
> > was proposed as a different atternative in "[RFC 00/13] virtio-iommu on
> > non-devicetree platforms"
> > (https://patchwork.kernel.org/cover/11257727/). Proposing a table that
> > may be used by different vendors seems to be a challenging issue here.
> 
> Yeah, if I am reading that right this proposes a one-fits-all solution.
> That is not needed as the other x86 IOMMUs already have their tables
> defined and implemented. There is no need to change anything there.
> 
> > So even if the above solution does not look perfect, it looked a
> > sensible compromise given the above arguments. Please could you explain
> > what are the most compelling arguments against it?
> 
> It is a hack and should be avoided if at all possible.

That sentence doesn't really answer the question, does it?

> And defining an
> own ACPI table type seems very much possible.

Frankly with platform specific interfaces like ACPI, virtio-iommu is
much less compelling.  Describing topology as part of the device in a
way that is first, portable, and second, is a good fit for hypervisors,
is to me one of the main reasons virtio-iommu makes sense at all.

> 
> Regards,
> 
> 	Joerg
Michael S. Tsirkin March 3, 2020, 2:02 p.m. UTC | #6
On Mon, Mar 02, 2020 at 05:16:12PM +0100, Joerg Roedel wrote:
> On Fri, Feb 28, 2020 at 06:25:36PM +0100, Jean-Philippe Brucker wrote:
> > This solution isn't elegant nor foolproof, but is the best we can do at
> > the moment and works with existing virtio-iommu implementations. It also
> > enables an IOMMU for lightweight hypervisors that do not rely on
> > firmware methods for booting.
> 
> I appreciate the enablement on x86, but putting the conmfiguration into
> mmio-space isn't really something I want to see upstream.

It's in virtio config space, not in mmio-space. With a PCI virtio-IOMMU
device this will be in memory.

> What is the
> problem with defining an ACPI table instead? This would also make things
> work on AARCH64 UEFI machines.
> 
> Regards,
> 
> 	Joerg
Joerg Roedel March 3, 2020, 3:53 p.m. UTC | #7
On Tue, Mar 03, 2020 at 09:00:05AM -0500, Michael S. Tsirkin wrote:
> Not necessarily. E.g. some power systems have neither.
> There are also systems looking to bypass ACPI e.g. for boot speed.

If there is no firmware layer between the hardware and the OS the
necessary information the OS needs to run on the hardware is probably
hard-coded into the kernel? In that case the same can be done with
virtio-iommu tolopology.

> That sentence doesn't really answer the question, does it?

To be more elaborate, putting this information into config space is a
layering violation. Hardware is never completly self-descriptive and
that is why there is the firmware which provides the information about
the hardware to the OS in a generic way.

> Frankly with platform specific interfaces like ACPI, virtio-iommu is
> much less compelling.  Describing topology as part of the device in a
> way that is first, portable, and second, is a good fit for hypervisors,
> is to me one of the main reasons virtio-iommu makes sense at all.

Virtio-IOMMU makes sense in the first place because it is much faster
than emulating one of the hardware IOMMUs. And an ACPI table is also
portable to all ACPI platforms, same with device-tree.

Regards,

	Joerg
Michael S. Tsirkin March 3, 2020, 4:09 p.m. UTC | #8
On Tue, Mar 03, 2020 at 04:53:19PM +0100, Joerg Roedel wrote:
> On Tue, Mar 03, 2020 at 09:00:05AM -0500, Michael S. Tsirkin wrote:
> > Not necessarily. E.g. some power systems have neither.
> > There are also systems looking to bypass ACPI e.g. for boot speed.
> 
> If there is no firmware layer between the hardware and the OS the
> necessary information the OS needs to run on the hardware is probably
> hard-coded into the kernel?

No. It's coded into the hardware. Which might even be practical
for bare-metal (e.g. on-board flash), but is very practical
when the device is part of a hypervisor.

> In that case the same can be done with
> virtio-iommu tolopology.
> 
> > That sentence doesn't really answer the question, does it?
> 
> To be more elaborate, putting this information into config space is a
> layering violation. Hardware is never completly self-descriptive


This "hardware" is actually part of hypervisor so there's no
reason it can't be completely self-descriptive. It's specified
by the same entity as the "firmware".


> and
> that is why there is the firmware which provides the information about
> the hardware to the OS in a generic way.
>
> > Frankly with platform specific interfaces like ACPI, virtio-iommu is
> > much less compelling.  Describing topology as part of the device in a
> > way that is first, portable, and second, is a good fit for hypervisors,
> > is to me one of the main reasons virtio-iommu makes sense at all.
> 
> Virtio-IOMMU makes sense in the first place because it is much faster
> than emulating one of the hardware IOMMUs.

I don't see why it would be much faster. The interface isn't that
different from command queues of VTD.

> And an ACPI table is also
> portable to all ACPI platforms, same with device-tree.
> 
> Regards,
> 
> 	Joerg

Making ACPI meet the goals of embedded projects such as kata containers
would be a gigantic task with huge stability implications.  By
comparison this 400-line parser is well contained and does the job.  I
didn't yet see compelling reasons not to merge this, but I'll be
interested to see some more specific concerns.
Auger Eric March 3, 2020, 4:21 p.m. UTC | #9
Hi Michael, Joerg,

On 3/3/20 5:09 PM, Michael S. Tsirkin wrote:
> On Tue, Mar 03, 2020 at 04:53:19PM +0100, Joerg Roedel wrote:
>> On Tue, Mar 03, 2020 at 09:00:05AM -0500, Michael S. Tsirkin wrote:
>>> Not necessarily. E.g. some power systems have neither.
>>> There are also systems looking to bypass ACPI e.g. for boot speed.
>>
>> If there is no firmware layer between the hardware and the OS the
>> necessary information the OS needs to run on the hardware is probably
>> hard-coded into the kernel?
> 
> No. It's coded into the hardware. Which might even be practical
> for bare-metal (e.g. on-board flash), but is very practical
> when the device is part of a hypervisor.
> 
>> In that case the same can be done with
>> virtio-iommu tolopology.
>>
>>> That sentence doesn't really answer the question, does it?
>>
>> To be more elaborate, putting this information into config space is a
>> layering violation. Hardware is never completly self-descriptive
> 
> 
> This "hardware" is actually part of hypervisor so there's no
> reason it can't be completely self-descriptive. It's specified
> by the same entity as the "firmware".

Yes in the QEMU case the machine code would fill this information as it
knows all the devices connected to the iommu. In the same way it would
generate the DT description or the ACPI tables
> 
>> and
>> that is why there is the firmware which provides the information about
>> the hardware to the OS in a generic way.
>>
>>> Frankly with platform specific interfaces like ACPI, virtio-iommu is
>>> much less compelling.  Describing topology as part of the device in a
>>> way that is first, portable, and second, is a good fit for hypervisors,
>>> is to me one of the main reasons virtio-iommu makes sense at all.
>>
>> Virtio-IOMMU makes sense in the first place because it is much faster
>> than emulating one of the hardware IOMMUs.
> 
> I don't see why it would be much faster. The interface isn't that
> different from command queues of VTD.

> 
>> And an ACPI table is also
>> portable to all ACPI platforms, same with device-tree.
>>
>> Regards,
>>
>> 	Joerg
> 
> Making ACPI meet the goals of embedded projects such as kata containers
> would be a gigantic task with huge stability implications.  By
> comparison this 400-line parser is well contained and does the job.  I
> didn't yet see compelling reasons not to merge this, but I'll be
> interested to see some more specific concerns.
> 
> 
Thanks

Eric
Joerg Roedel March 4, 2020, 1:37 p.m. UTC | #10
Hi Michael,

On Tue, Mar 03, 2020 at 11:09:41AM -0500, Michael S. Tsirkin wrote:
> No. It's coded into the hardware. Which might even be practical
> for bare-metal (e.g. on-board flash), but is very practical
> when the device is part of a hypervisor.

If its that way on PPC, than fine for them. But since this is enablement
for x86, it should follow the x86 platform best practices, and that
means describing hardware through ACPI.

> This "hardware" is actually part of hypervisor so there's no
> reason it can't be completely self-descriptive. It's specified
> by the same entity as the "firmware".

That is just an implementation detail. Yes, QEMU emulates the hardware
and builds the ACPI tables. But it could also be implemented in a way
where the ACPI tables are build by guest firmware.

> I don't see why it would be much faster. The interface isn't that
> different from command queues of VTD.

VirtIO IOMMU doesn't need to build page-tables that the hypervisor then
has to shadow, which makes things much faster. If you emulate one of the
other IOMMUs (VT-d or AMD-Vi) the code has to shadow the full page-table
at once when device passthrough is used. VirtIO-IOMMU doesn't need that,
and that makes it much faster and efficient.

> Making ACPI meet the goals of embedded projects such as kata containers
> would be a gigantic task with huge stability implications.  By
> comparison this 400-line parser is well contained and does the job.  I
> didn't yet see compelling reasons not to merge this, but I'll be
> interested to see some more specific concerns.

An ACPI table parse wouldn't need more lines of code. For embedded
systems there is still the DT way of describing things.

Regards,

	Joerg
Jean-Philippe Brucker March 4, 2020, 3:38 p.m. UTC | #11
On Wed, Mar 04, 2020 at 02:37:08PM +0100, Joerg Roedel wrote:
> Hi Michael,
> 
> On Tue, Mar 03, 2020 at 11:09:41AM -0500, Michael S. Tsirkin wrote:
> > No. It's coded into the hardware. Which might even be practical
> > for bare-metal (e.g. on-board flash), but is very practical
> > when the device is part of a hypervisor.
> 
> If its that way on PPC, than fine for them. But since this is enablement
> for x86, it should follow the x86 platform best practices, and that
> means describing hardware through ACPI.

I agree with this. The problem is I don't know how to get a new ACPI table
or change an existing one. It needs to go through the UEFI forum in order
to be accepted, and I don't have any weight there. I've been trying to get
the tiny change into IORT for ages. I haven't been given any convincing
reason against it or offered any alternative, it's just stalled. The
topology description introduced here wasn't my first choice either but
unless someone can help finding another way into ACPI, I don't have a
better idea.

Thanks,
Jean
Jacob Pan (Jun) March 4, 2020, 3:48 p.m. UTC | #12
On Wed, 4 Mar 2020 14:37:08 +0100
Joerg Roedel <joro@8bytes.org> wrote:

> Hi Michael,
> 
> On Tue, Mar 03, 2020 at 11:09:41AM -0500, Michael S. Tsirkin wrote:
> > No. It's coded into the hardware. Which might even be practical
> > for bare-metal (e.g. on-board flash), but is very practical
> > when the device is part of a hypervisor.  
> 
> If its that way on PPC, than fine for them. But since this is
> enablement for x86, it should follow the x86 platform best practices,
> and that means describing hardware through ACPI.
> 
> > This "hardware" is actually part of hypervisor so there's no
> > reason it can't be completely self-descriptive. It's specified
> > by the same entity as the "firmware".  
> 
> That is just an implementation detail. Yes, QEMU emulates the hardware
> and builds the ACPI tables. But it could also be implemented in a way
> where the ACPI tables are build by guest firmware.
> 
> > I don't see why it would be much faster. The interface isn't that
> > different from command queues of VTD.  
> 
> VirtIO IOMMU doesn't need to build page-tables that the hypervisor
> then has to shadow, which makes things much faster. If you emulate
> one of the other IOMMUs (VT-d or AMD-Vi) the code has to shadow the
> full page-table at once when device passthrough is used. VirtIO-IOMMU
> doesn't need that, and that makes it much faster and efficient.
> 
For emulated VT-d IOMMU, GIOVA can also be build as first level page
tables then pass to the host IOMMU to bind. There is no need to shadow
in this case, pIOMMU will do nested translation and walk guest page
tables.

> > Making ACPI meet the goals of embedded projects such as kata
> > containers would be a gigantic task with huge stability
> > implications.  By comparison this 400-line parser is well contained
> > and does the job.  I didn't yet see compelling reasons not to merge
> > this, but I'll be interested to see some more specific concerns.  
> 
> An ACPI table parse wouldn't need more lines of code. For embedded
> systems there is still the DT way of describing things.
> 
I thought we have the universal device properties to abstract DT and
ACPI (via _DSD). Is that an option?

> Regards,
> 
> 	Joerg
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu

[Jacob Pan]
Joerg Roedel March 4, 2020, 5:34 p.m. UTC | #13
On Wed, Mar 04, 2020 at 07:48:54AM -0800, Jacob Pan wrote:
> For emulated VT-d IOMMU, GIOVA can also be build as first level page
> tables then pass to the host IOMMU to bind. There is no need to shadow
> in this case, pIOMMU will do nested translation and walk guest page
> tables.

Right, but that requires hardware support. A pure software emulation of
VT-d requires the full shadow of the guest io-page-table.

> I thought we have the universal device properties to abstract DT and
> ACPI (via _DSD). Is that an option?

I don't know whether this was considered, Jean-Philippe?


Regards,

	Joerg
Joerg Roedel March 4, 2020, 5:40 p.m. UTC | #14
On Wed, Mar 04, 2020 at 04:38:21PM +0100, Jean-Philippe Brucker wrote:
> I agree with this. The problem is I don't know how to get a new ACPI table
> or change an existing one. It needs to go through the UEFI forum in order
> to be accepted, and I don't have any weight there. I've been trying to get
> the tiny change into IORT for ages. I haven't been given any convincing
> reason against it or offered any alternative, it's just stalled. The
> topology description introduced here wasn't my first choice either but
> unless someone can help finding another way into ACPI, I don't have a
> better idea.

A quote from the ACPI Specification (Version 6.3, Section 5.2.6,
Page 119):

	Table signatures will be reserved by the ACPI promoters and
	posted independently of this specification in ACPI errata and
	clarification documents on the ACPI web site. Requests to
	reserve a 4-byte alphanumeric table signature should be sent to
	the email address info@acpi.info and should include the purpose
	of the table and reference URL to a document that describes the
	table format. Tables defined outside of the ACPI specification
	may define data value encodings in either little endian or big
	endian format. For the purpose of clarity, external table
	definition documents should include the endian-ness of their
	data value encodings.

So it sounds like you need to specifiy the table format and send a
request to info@acpi.info to get a table signature for it.

Regards,

	Joerg
Michael S. Tsirkin March 4, 2020, 7:34 p.m. UTC | #15
On Wed, Mar 04, 2020 at 02:37:08PM +0100, Joerg Roedel wrote:
> Hi Michael,
> 
> On Tue, Mar 03, 2020 at 11:09:41AM -0500, Michael S. Tsirkin wrote:
> > No. It's coded into the hardware. Which might even be practical
> > for bare-metal (e.g. on-board flash), but is very practical
> > when the device is part of a hypervisor.
> 
> If its that way on PPC, than fine for them. But since this is enablement
> for x86, it should follow the x86 platform best practices, and that
> means describing hardware through ACPI.

For hardware, sure.  Hypervisors aren't hardware
though and a bunch of hypervisors don't use ACPI.


> > This "hardware" is actually part of hypervisor so there's no
> > reason it can't be completely self-descriptive. It's specified
> > by the same entity as the "firmware".
> 
> That is just an implementation detail. Yes, QEMU emulates the hardware
> and builds the ACPI tables. But it could also be implemented in a way
> where the ACPI tables are build by guest firmware.

All these extra levels of indirection is one of the reasons
hypervisors such as kata try to avoid ACPI.

> > I don't see why it would be much faster. The interface isn't that
> > different from command queues of VTD.
> 
> VirtIO IOMMU doesn't need to build page-tables that the hypervisor then
> has to shadow, which makes things much faster. If you emulate one of the
> other IOMMUs (VT-d or AMD-Vi) the code has to shadow the full page-table
> at once when device passthrough is used. VirtIO-IOMMU doesn't need that,
> and that makes it much faster and efficient.


IIUC VT-d at least supports range invalidations.

> 
> > Making ACPI meet the goals of embedded projects such as kata containers
> > would be a gigantic task with huge stability implications.  By
> > comparison this 400-line parser is well contained and does the job.  I
> > didn't yet see compelling reasons not to merge this, but I'll be
> > interested to see some more specific concerns.
> 
> An ACPI table parse wouldn't need more lines of code.

It realies on ACPI OSPM itself to handle ACPI bytecode, which is huge.


> For embedded
> systems there is still the DT way of describing things.

For some embedded systems.

> Regards,
> 
> 	Joerg
Jacob Pan (Jun) March 4, 2020, 9:37 p.m. UTC | #16
On Wed, 4 Mar 2020 18:40:46 +0100
Joerg Roedel <joro@8bytes.org> wrote:

> On Wed, Mar 04, 2020 at 04:38:21PM +0100, Jean-Philippe Brucker wrote:
> > I agree with this. The problem is I don't know how to get a new
> > ACPI table or change an existing one. It needs to go through the
> > UEFI forum in order to be accepted, and I don't have any weight
> > there. I've been trying to get the tiny change into IORT for ages.
> > I haven't been given any convincing reason against it or offered
> > any alternative, it's just stalled. The topology description
> > introduced here wasn't my first choice either but unless someone
> > can help finding another way into ACPI, I don't have a better
> > idea.  
> 
> A quote from the ACPI Specification (Version 6.3, Section 5.2.6,
> Page 119):
> 
> 	Table signatures will be reserved by the ACPI promoters and
> 	posted independently of this specification in ACPI errata and
> 	clarification documents on the ACPI web site. Requests to
> 	reserve a 4-byte alphanumeric table signature should be sent
> to the email address info@acpi.info and should include the purpose
> 	of the table and reference URL to a document that describes
> the table format. Tables defined outside of the ACPI specification
> 	may define data value encodings in either little endian or big
> 	endian format. For the purpose of clarity, external table
> 	definition documents should include the endian-ness of their
> 	data value encodings.
> 
> So it sounds like you need to specifiy the table format and send a
> request to info@acpi.info to get a table signature for it.
> 
+ Mike and Erik who work closely on UEFI and ACPICA.

Copy paste Erik's initial response below on how to get a new table,
seems to confirm with the process you stated above.

"Fairly easy. You reserve a 4-letter symbol by sending a message
requesting to reserve the signature to Mike or the ASWG mailing list or
info@acpi.info

There is also another option. You can have ASWG own this new table so
that not one entity or company owns the new table."

> Regards,
> 
> 	Joerg
Joerg Roedel March 4, 2020, 9:50 p.m. UTC | #17
On Wed, Mar 04, 2020 at 02:34:33PM -0500, Michael S. Tsirkin wrote:
> All these extra levels of indirection is one of the reasons
> hypervisors such as kata try to avoid ACPI.

Platforms that don't use ACPI need another hardware detection mechanism,
which can also be supported. But the first step here is to enable the
general case, and for x86 platforms this means ACPI.

Regards,

	Joerg
Joerg Roedel March 4, 2020, 9:54 p.m. UTC | #18
On Wed, Mar 04, 2020 at 01:37:44PM -0800, Jacob Pan (Jun) wrote:
> + Mike and Erik who work closely on UEFI and ACPICA.
> 
> Copy paste Erik's initial response below on how to get a new table,
> seems to confirm with the process you stated above.
> 
> "Fairly easy. You reserve a 4-letter symbol by sending a message
> requesting to reserve the signature to Mike or the ASWG mailing list or
> info@acpi.info

Great! I think this is going to be the path forward here.

Regards,

	Joerg

> 
> There is also another option. You can have ASWG own this new table so
> that not one entity or company owns the new table."
Tian, Kevin March 5, 2020, 8:07 a.m. UTC | #19
> From: Jean-Philippe Brucker
> Sent: Saturday, February 29, 2020 1:26 AM
> 
> Platforms without device-tree do not currently have a method for
> describing the vIOMMU topology. Provide a topology description embedded
> into the virtio device.
> 
> Use PCI FIXUP to probe the config space early, because we need to
> discover the topology before any DMA configuration takes place, and the
> virtio driver may be loaded much later. Since we discover the topology
> description when probing the PCI hierarchy, the virtual IOMMU cannot
> manage other platform devices discovered earlier.
> 
> This solution isn't elegant nor foolproof, but is the best we can do at

can you elaborate "isn't elegant nor foolproof" part? is there any other 
limitation (beside pci fixup) along the route, when comparing it to 
the ACPI-approach?

> the moment and works with existing virtio-iommu implementations. It also
> enables an IOMMU for lightweight hypervisors that do not rely on
> firmware methods for booting.
> 
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
> ---
>  MAINTAINERS                           |   2 +
>  drivers/iommu/Kconfig                 |  10 +
>  drivers/iommu/Makefile                |   1 +
>  drivers/iommu/virtio-iommu-topology.c | 343
> ++++++++++++++++++++++++++
>  drivers/iommu/virtio-iommu.c          |   3 +
>  include/linux/virt_iommu.h            |  19 ++
>  include/uapi/linux/virtio_iommu.h     |  26 ++
>  7 files changed, 404 insertions(+)
>  create mode 100644 drivers/iommu/virtio-iommu-topology.c
>  create mode 100644 include/linux/virt_iommu.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index fcd79fc38928..65a03ce53096 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -17781,6 +17781,8 @@ M:	Jean-Philippe Brucker <jean-
> philippe@linaro.org>
>  L:	virtualization@lists.linux-foundation.org
>  S:	Maintained
>  F:	drivers/iommu/virtio-iommu.c
> +F:	drivers/iommu/virtio-iommu-topology.c
> +F:	include/linux/virt_iommu.h
>  F:	include/uapi/linux/virtio_iommu.h
> 
>  VIRTUAL BOX GUEST DEVICE DRIVER
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index c5df570ef84a..f8cb45d84bb0 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -516,4 +516,14 @@ config VIRTIO_IOMMU
> 
>  	  Say Y here if you intend to run this kernel as a guest.
> 
> +config VIRTIO_IOMMU_TOPOLOGY
> +	bool "Topology properties for the virtio-iommu"
> +	depends on VIRTIO_IOMMU
> +	default y
> +	help
> +	  Enable early probing of the virtio-iommu device, to detect the
> +	  built-in topology description.
> +
> +	  Say Y here if you intend to run this kernel as a guest.
> +
>  endif # IOMMU_SUPPORT
> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
> index 9f33fdb3bb05..5da24280d08c 100644
> --- a/drivers/iommu/Makefile
> +++ b/drivers/iommu/Makefile
> @@ -37,3 +37,4 @@ obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
>  obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
>  obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
>  obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
> +obj-$(CONFIG_VIRTIO_IOMMU_TOPOLOGY) += virtio-iommu-topology.o
> diff --git a/drivers/iommu/virtio-iommu-topology.c b/drivers/iommu/virtio-
> iommu-topology.c
> new file mode 100644
> index 000000000000..2188624ef216
> --- /dev/null
> +++ b/drivers/iommu/virtio-iommu-topology.c
> @@ -0,0 +1,343 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/dma-iommu.h>
> +#include <linux/list.h>
> +#include <linux/pci.h>
> +#include <linux/virt_iommu.h>
> +#include <linux/virtio_ids.h>
> +#include <linux/virtio_pci.h>
> +#include <uapi/linux/virtio_iommu.h>
> +
> +struct viommu_cap_config {
> +	u8 bar;
> +	u32 length; /* structure size */
> +	u32 offset; /* structure offset within the bar */
> +};
> +
> +union viommu_topo_cfg {
> +	__le16					type;
> +	struct virtio_iommu_topo_pci_range	pci;
> +	struct virtio_iommu_topo_endpoint	ep;
> +};
> +
> +struct viommu_spec {
> +	struct device				*dev; /* transport device */
> +	struct fwnode_handle			*fwnode;
> +	struct iommu_ops			*ops;
> +	struct list_head			list;
> +	size_t					num_items;
> +	/* The config array of length num_items follows */
> +	union viommu_topo_cfg			cfg[];
> +};
> +
> +static LIST_HEAD(viommus);
> +static DEFINE_MUTEX(viommus_lock);
> +
> +#define VPCI_FIELD(field) offsetof(struct virtio_pci_cap, field)
> +
> +static inline int viommu_pci_find_capability(struct pci_dev *dev, u8 cfg_type,
> +					     struct viommu_cap_config *cap)
> +{
> +	int pos;
> +	u8 bar;
> +
> +	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
> +	     pos > 0;
> +	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
> +		u8 type;
> +
> +		pci_read_config_byte(dev, pos + VPCI_FIELD(cfg_type),
> &type);
> +		if (type != cfg_type)
> +			continue;
> +
> +		pci_read_config_byte(dev, pos + VPCI_FIELD(bar), &bar);
> +
> +		/* Ignore structures with reserved BAR values */
> +		if (type != VIRTIO_PCI_CAP_PCI_CFG && bar > 0x5)
> +			continue;
> +
> +		cap->bar = bar;
> +		pci_read_config_dword(dev, pos + VPCI_FIELD(length),
> +				      &cap->length);
> +		pci_read_config_dword(dev, pos + VPCI_FIELD(offset),
> +				      &cap->offset);
> +
> +		return pos;
> +	}
> +	return 0;
> +}
> +
> +static void viommu_ccopy(__le32 *dst, u32 __iomem *src, size_t length)
> +{
> +	size_t i;
> +
> +	/* For the moment all our config structures align on 32b */
> +	if (WARN_ON(length % 4))
> +		return;
> +
> +	for (i = 0; i < length / 4; i++)
> +		/* Keep little-endian data */
> +		dst[i] = cpu_to_le32(ioread32(src + i));
> +}
> +
> +static int viommu_parse_topology(struct device *dev,
> +				 struct virtio_iommu_config __iomem *cfg)
> +{
> +	size_t i;
> +	size_t spec_length;
> +	struct viommu_spec *viommu_spec;
> +	u32 offset, item_length, num_items;
> +
> +	offset = ioread32(&cfg->topo_config.offset);
> +	item_length = ioread32(&cfg->topo_config.item_length);
> +	num_items = ioread32(&cfg->topo_config.num_items);
> +	if (!offset || !num_items || !item_length)
> +		return 0;
> +
> +	spec_length = sizeof(*viommu_spec) + num_items *
> +					     sizeof(union viommu_topo_cfg);
> +	viommu_spec = kzalloc(spec_length, GFP_KERNEL);
> +	if (!viommu_spec)
> +		return -ENOMEM;
> +
> +	viommu_spec->dev = dev;
> +
> +	/* Copy in the whole array, sort it out later */
> +	for (i = 0; i < num_items; i++) {
> +		size_t read_length = min_t(size_t, item_length,
> +					   sizeof(union viommu_topo_cfg));
> +
> +		viommu_ccopy((__le32 *)&viommu_spec->cfg[i],
> +			     (void __iomem *)cfg + offset,
> +			     read_length);
> +
> +		offset += item_length;
> +	}
> +	viommu_spec->num_items = num_items;
> +
> +	mutex_lock(&viommus_lock);
> +	list_add(&viommu_spec->list, &viommus);
> +	mutex_unlock(&viommus_lock);
> +
> +	return 0;
> +}
> +
> +static void viommu_pci_parse_topology(struct pci_dev *dev)
> +{
> +	int pos;
> +	u32 features;
> +	void __iomem *regs;
> +	struct viommu_cap_config cap = {0};
> +	struct virtio_pci_common_cfg __iomem *common_cfg;
> +
> +	/*
> +	 * The virtio infrastructure might not be loaded at this point. we need
> +	 * to access the BARs ourselves.
> +	 */
> +	pos = viommu_pci_find_capability(dev,
> VIRTIO_PCI_CAP_COMMON_CFG, &cap);
> +	if (!pos) {
> +		pci_warn(dev, "common capability not found\n");
> +		return;
> +	}
> +
> +	if (pci_enable_device_mem(dev))
> +		return;
> +
> +	regs = pci_iomap(dev, cap.bar, 0);
> +	if (!regs)
> +		return;
> +
> +	common_cfg = regs + cap.offset;
> +
> +	/* Find out if the device supports topology description */
> +	writel(0, &common_cfg->device_feature_select);
> +	features = ioread32(&common_cfg->device_feature);
> +
> +	pci_iounmap(dev, regs);
> +
> +	if (!(features & BIT(VIRTIO_IOMMU_F_TOPOLOGY))) {
> +		pci_dbg(dev, "device doesn't have topology description");
> +		return;
> +	}
> +
> +	pos = viommu_pci_find_capability(dev,
> VIRTIO_PCI_CAP_DEVICE_CFG, &cap);
> +	if (!pos) {
> +		pci_warn(dev, "device config capability not found\n");
> +		return;
> +	}
> +
> +	regs = pci_iomap(dev, cap.bar, 0);
> +	if (!regs)
> +		return;
> +
> +	pci_info(dev, "parsing virtio-iommu topology\n");
> +	viommu_parse_topology(&dev->dev, regs + cap.offset);
> +	pci_iounmap(dev, regs);
> +}
> +
> +/*
> + * Catch a PCI virtio-iommu implementation early to get the topology
> description
> + * before we start probing other endpoints.
> + */
> +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1040
> + VIRTIO_ID_IOMMU,
> +			viommu_pci_parse_topology);
> +
> +/*
> + * Return true if the device matches this topology structure. Write the
> endpoint
> + * ID into epid if it's the case.
> + */
> +static bool viommu_parse_pci(struct pci_dev *pdev, union
> viommu_topo_cfg *cfg,
> +			     u32 *epid)
> +{
> +	u32 endpoint_start;
> +	u16 start, end, domain;
> +	u16 devid = pci_dev_id(pdev);
> +	u16 type = le16_to_cpu(cfg->type);
> +
> +	if (type != VIRTIO_IOMMU_TOPO_PCI_RANGE)
> +		return false;
> +
> +	start		= le16_to_cpu(cfg->pci.requester_start);
> +	end		= le16_to_cpu(cfg->pci.requester_end);
> +	domain		= le16_to_cpu(cfg->pci.hierarchy);
> +	endpoint_start	= le32_to_cpu(cfg->pci.endpoint_start);
> +
> +	if (pci_domain_nr(pdev->bus) == domain &&
> +	    devid >= start && devid <= end) {
> +		*epid = devid - start + endpoint_start;
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static const struct iommu_ops *virt_iommu_setup(struct device *dev)
> +{
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	const struct iommu_ops *viommu_ops = NULL;
> +	struct fwnode_handle *viommu_fwnode;
> +	struct viommu_spec *viommu_spec;
> +	struct pci_dev *pci_dev = NULL;
> +	struct device *viommu_dev;
> +	bool found = false;
> +	size_t i;
> +	u32 epid;
> +	int ret;
> +
> +	/* Already translated? */
> +	if (fwspec && fwspec->ops)
> +		return NULL;
> +
> +	if (dev_is_pci(dev)) {
> +		pci_dev = to_pci_dev(dev);
> +	} else {
> +		/* At the moment we don't support platform devices */
> +		return NULL;
> +	}
> +
> +	mutex_lock(&viommus_lock);
> +	list_for_each_entry(viommu_spec, &viommus, list) {
> +		for (i = 0; i < viommu_spec->num_items; i++) {
> +			union viommu_topo_cfg *cfg = &viommu_spec-
> >cfg[i];
> +
> +			found = viommu_parse_pci(pci_dev, cfg, &epid);
> +			if (found)
> +				break;
> +		}
> +		if (found) {
> +			viommu_ops = viommu_spec->ops;
> +			viommu_fwnode = viommu_spec->fwnode;
> +			viommu_dev = viommu_spec->dev;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&viommus_lock);
> +	if (!found)
> +		return NULL;
> +
> +	/* We're not translating ourselves. */
> +	if (viommu_dev == dev)
> +		return NULL;
> +
> +	/*
> +	 * If we found a PCI range managed by the viommu, we're the ones
> that
> +	 * have to request ACS.
> +	 */
> +	if (pci_dev)
> +		pci_request_acs();
> +
> +	if (!viommu_ops)
> +		return ERR_PTR(-EPROBE_DEFER);
> +
> +	ret = iommu_fwspec_init(dev, viommu_fwnode, viommu_ops);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	iommu_fwspec_add_ids(dev, &epid, 1);
> +
> +	return viommu_ops;
> +}
> +
> +/**
> + * virt_dma_configure - Configure DMA of virtualized devices
> + * @dev: the endpoint
> + *
> + * Setup the DMA and IOMMU ops of a virtual device, for platforms without
> DT or
> + * ACPI.
> + *
> + * Return: -EPROBE_DEFER if the device is managed by an IOMMU that
> hasn't been
> + *   probed yet, 0 otherwise
> + */
> +int virt_dma_configure(struct device *dev)
> +{
> +	const struct iommu_ops *iommu_ops;
> +
> +	iommu_ops = virt_iommu_setup(dev);
> +	if (IS_ERR_OR_NULL(iommu_ops)) {
> +		int ret = PTR_ERR(iommu_ops);
> +
> +		if (ret == -EPROBE_DEFER || ret == 0)
> +			return ret;
> +		dev_err(dev, "error %d while setting up virt IOMMU\n", ret);
> +		return 0;
> +	}
> +
> +	/*
> +	 * If we have reason to believe the IOMMU driver missed the initial
> +	 * add_device callback for dev, replay it to get things in order.
> +	 */
> +	if (dev->bus && !device_iommu_mapped(dev))
> +		iommu_probe_device(dev);
> +
> +	/* Assume coherent, as well as full 64-bit addresses. */
> +#ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
> +	arch_setup_dma_ops(dev, 0, ~0ULL, iommu_ops, true);
> +#else
> +	iommu_setup_dma_ops(dev, 0, ~0ULL);
> +#endif
> +	return 0;
> +}
> +
> +/**
> + * virt_set_iommu_ops - Set the IOMMU ops of a virtual IOMMU device
> + * @dev: the IOMMU device (transport)
> + * @ops: the new IOMMU ops or NULL
> + *
> + * Setup the iommu_ops associated to a viommu_spec, once the driver is
> loaded
> + * and the device probed.
> + */
> +void virt_set_iommu_ops(struct device *dev, struct iommu_ops *ops)
> +{
> +	struct viommu_spec *viommu_spec;
> +
> +	mutex_lock(&viommus_lock);
> +	list_for_each_entry(viommu_spec, &viommus, list) {
> +		if (viommu_spec->dev == dev) {
> +			viommu_spec->ops = ops;
> +			viommu_spec->fwnode = ops ? dev->fwnode : NULL;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&viommus_lock);
> +}
> +EXPORT_SYMBOL_GPL(virt_set_iommu_ops);
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index 93ff58632452..5429c12c879b 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -21,6 +21,7 @@
>  #include <linux/virtio.h>
>  #include <linux/virtio_config.h>
>  #include <linux/virtio_ids.h>
> +#include <linux/virt_iommu.h>
>  #include <linux/wait.h>
> 
>  #include <uapi/linux/virtio_iommu.h>
> @@ -1075,6 +1076,7 @@ static int viommu_probe(struct virtio_device *vdev)
>  	if (ret)
>  		goto err_free_vqs;
> 
> +	virt_set_iommu_ops(dev->parent, &viommu_ops);
>  	iommu_device_set_ops(&viommu->iommu, &viommu_ops);
>  	iommu_device_set_fwnode(&viommu->iommu, parent_dev-
> >fwnode);
> 
> @@ -1121,6 +1123,7 @@ static void viommu_remove(struct virtio_device
> *vdev)
>  {
>  	struct viommu_dev *viommu = vdev->priv;
> 
> +	virt_set_iommu_ops(vdev->dev.parent, NULL);
>  	iommu_device_sysfs_remove(&viommu->iommu);
>  	iommu_device_unregister(&viommu->iommu);
> 
> diff --git a/include/linux/virt_iommu.h b/include/linux/virt_iommu.h
> new file mode 100644
> index 000000000000..c68b03ec75ba
> --- /dev/null
> +++ b/include/linux/virt_iommu.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef VIRTIO_IOMMU_H_
> +#define VIRTIO_IOMMU_H_
> +
> +#if IS_ENABLED(CONFIG_VIRTIO_IOMMU_TOPOLOGY)
> +int virt_dma_configure(struct device *dev);
> +void virt_set_iommu_ops(struct device *dev, struct iommu_ops *ops);
> +#else /* !CONFIG_VIRTIO_IOMMU_TOPOLOGY */
> +static inline int virt_dma_configure(struct device *dev)
> +{
> +	/* Don't disturb the normal DMA configuration methods */
> +	return 0;
> +}
> +
> +static inline void virt_set_iommu_ops(struct device *dev, struct iommu_ops
> *ops)
> +{ }
> +#endif /* !CONFIG_VIRTIO_IOMMU_TOPOLOGY */
> +
> +#endif /* VIRTIO_IOMMU_H_ */
> diff --git a/include/uapi/linux/virtio_iommu.h
> b/include/uapi/linux/virtio_iommu.h
> index 237e36a280cb..ec57d215086a 100644
> --- a/include/uapi/linux/virtio_iommu.h
> +++ b/include/uapi/linux/virtio_iommu.h
> @@ -16,6 +16,7 @@
>  #define VIRTIO_IOMMU_F_BYPASS			3
>  #define VIRTIO_IOMMU_F_PROBE			4
>  #define VIRTIO_IOMMU_F_MMIO			5
> +#define VIRTIO_IOMMU_F_TOPOLOGY			6
> 
>  struct virtio_iommu_range_64 {
>  	__le64					start;
> @@ -27,6 +28,12 @@ struct virtio_iommu_range_32 {
>  	__le32					end;
>  };
> 
> +struct virtio_iommu_topo_config {
> +	__le32					offset;
> +	__le32					num_items;
> +	__le32					item_length;
> +};
> +
>  struct virtio_iommu_config {
>  	/* Supported page sizes */
>  	__le64					page_size_mask;
> @@ -36,6 +43,25 @@ struct virtio_iommu_config {
>  	struct virtio_iommu_range_32		domain_range;
>  	/* Probe buffer size */
>  	__le32					probe_size;
> +	struct virtio_iommu_topo_config		topo_config;
> +};
> +
> +#define VIRTIO_IOMMU_TOPO_PCI_RANGE		0x1
> +#define VIRTIO_IOMMU_TOPO_ENDPOINT		0x2
> +
> +struct virtio_iommu_topo_pci_range {
> +	__le16					type;
> +	__le16					hierarchy;
> +	__le16					requester_start;
> +	__le16					requester_end;
> +	__le32					endpoint_start;
> +};
> +
> +struct virtio_iommu_topo_endpoint {
> +	__le16					type;
> +	__le16					reserved;
> +	__le32					endpoint;
> +	__le64					address;
>  };
> 
>  /* Request types */
> --
> 2.25.0
> 
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
Michael S. Tsirkin March 5, 2020, 3:39 p.m. UTC | #20
On Wed, Mar 04, 2020 at 10:50:02PM +0100, Joerg Roedel wrote:
> On Wed, Mar 04, 2020 at 02:34:33PM -0500, Michael S. Tsirkin wrote:
> > All these extra levels of indirection is one of the reasons
> > hypervisors such as kata try to avoid ACPI.
> 
> Platforms that don't use ACPI need another hardware detection mechanism,
> which can also be supported. But the first step here is to enable the
> general case, and for x86 platforms this means ACPI.
> 
> Regards,
> 
> 	Joerg

Frankly since a portable way to do it is needed anyway I don't see why
we also need ACPI.
Michael S. Tsirkin March 5, 2020, 3:42 p.m. UTC | #21
On Wed, Mar 04, 2020 at 10:54:49PM +0100, Joerg Roedel wrote:
> On Wed, Mar 04, 2020 at 01:37:44PM -0800, Jacob Pan (Jun) wrote:
> > + Mike and Erik who work closely on UEFI and ACPICA.
> > 
> > Copy paste Erik's initial response below on how to get a new table,
> > seems to confirm with the process you stated above.
> > 
> > "Fairly easy. You reserve a 4-letter symbol by sending a message
> > requesting to reserve the signature to Mike or the ASWG mailing list or
> > info@acpi.info
> 
> Great! I think this is going to be the path forward here.
> 
> Regards,
> 
> 	Joerg

I don't, I don't see why we should bother with ACPI. This is a PV device
anyway, we can just make it self-describing. The need for extra firmware
on some platforms is a bug not a feature. No point in emulating that.

> > 
> > There is also another option. You can have ASWG own this new table so
> > that not one entity or company owns the new table."
Jean-Philippe Brucker March 11, 2020, 5:48 p.m. UTC | #22
On Thu, Mar 05, 2020 at 08:07:32AM +0000, Tian, Kevin wrote:
> > From: Jean-Philippe Brucker
> > Sent: Saturday, February 29, 2020 1:26 AM
> > 
> > Platforms without device-tree do not currently have a method for
> > describing the vIOMMU topology. Provide a topology description embedded
> > into the virtio device.
> > 
> > Use PCI FIXUP to probe the config space early, because we need to
> > discover the topology before any DMA configuration takes place, and the
> > virtio driver may be loaded much later. Since we discover the topology
> > description when probing the PCI hierarchy, the virtual IOMMU cannot
> > manage other platform devices discovered earlier.
> > 
> > This solution isn't elegant nor foolproof, but is the best we can do at
> 
> can you elaborate "isn't elegant nor foolproof" part? is there any other 
> limitation (beside pci fixup) along the route, when comparing it to 
> the ACPI-approach?

Yes "not elegant" in part because of the PCI fixup. Fixups are used to
work around bugs, and it seems strange to have one for a normal use-case.
We also have to copy some of the virtio infrastructure since this code
runs before module load. And we have to add a third DMA configuration
method.

I don't believe anymore that the "not foolproof" part is right. After
studying the device infrastructure a little more this solution seems less
fragile than I previously thought, but it's still a big hack, and it's
only half of the story.

This patch only handles PCI-based endpoints and viommu. On ACPI platforms,
where the virtio-mmio device is specified by an object with _HID LNRO0005,
supporting virtio-iommu on MMIO requires installing a bus notifier. There
I'd rather use an ACPI table for the topology. Platforms that don't have
ACPI such as microvm specify virtio-mmio devices on the command-line.
There devices are only created when the virtio-mmio module is loaded,
which is too late. In that case I think we need to add an early pass on
the command-line, instead of a bus notifier.

Thanks,
Jean
Michael S. Tsirkin March 11, 2020, 9:48 p.m. UTC | #23
On Wed, Mar 11, 2020 at 06:48:22PM +0100, Jean-Philippe Brucker wrote:
> Yes "not elegant" in part because of the PCI fixup. Fixups are used to
> work around bugs

Not really - they are for anything unusual that common PCI code can not
handle on its own.
Michael S. Tsirkin April 13, 2020, 1:22 p.m. UTC | #24
On Fri, Feb 28, 2020 at 06:25:36PM +0100, Jean-Philippe Brucker wrote:
> diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h
> index 237e36a280cb..ec57d215086a 100644
> --- a/include/uapi/linux/virtio_iommu.h
> +++ b/include/uapi/linux/virtio_iommu.h
> @@ -16,6 +16,7 @@
>  #define VIRTIO_IOMMU_F_BYPASS			3
>  #define VIRTIO_IOMMU_F_PROBE			4
>  #define VIRTIO_IOMMU_F_MMIO			5
> +#define VIRTIO_IOMMU_F_TOPOLOGY			6
>  
>  struct virtio_iommu_range_64 {
>  	__le64					start;
> @@ -27,6 +28,12 @@ struct virtio_iommu_range_32 {
>  	__le32					end;
>  };
>  
> +struct virtio_iommu_topo_config {
> +	__le32					offset;

Any restrictions on offset? E.g. alignment?

> +	__le32					num_items;
> +	__le32					item_length;
> +};
> +
>  struct virtio_iommu_config {
>  	/* Supported page sizes */
>  	__le64					page_size_mask;
> @@ -36,6 +43,25 @@ struct virtio_iommu_config {
>  	struct virtio_iommu_range_32		domain_range;
>  	/* Probe buffer size */
>  	__le32					probe_size;
> +	struct virtio_iommu_topo_config		topo_config;
> +};
> +
> +#define VIRTIO_IOMMU_TOPO_PCI_RANGE		0x1
> +#define VIRTIO_IOMMU_TOPO_ENDPOINT		0x2
> +
> +struct virtio_iommu_topo_pci_range {
> +	__le16					type;
> +	__le16					hierarchy;
> +	__le16					requester_start;
> +	__le16					requester_end;
> +	__le32					endpoint_start;
> +};
> +
> +struct virtio_iommu_topo_endpoint {
> +	__le16					type;
> +	__le16					reserved;
> +	__le32					endpoint;
> +	__le64					address;
>  };
>  
>  /* Request types */

As any UAPI change, this needs to be copied to virtio TC.

I believe an old version of QEMU patches was published there
but I don't think it was the latest one you tested against.

Description should preferably be added to spec too.

In partucular please add comments (in this header, too)
documenting the new fields, values and structures.

> -- 
> 2.25.0
Tian, Kevin April 21, 2020, 7:31 a.m. UTC | #25
> From: Jean-Philippe Brucker
> Sent: Saturday, February 29, 2020 1:26 AM
> 
> Platforms without device-tree do not currently have a method for
> describing the vIOMMU topology. Provide a topology description embedded
> into the virtio device.
> 
> Use PCI FIXUP to probe the config space early, because we need to
> discover the topology before any DMA configuration takes place, and the
> virtio driver may be loaded much later. Since we discover the topology
> description when probing the PCI hierarchy, the virtual IOMMU cannot
> manage other platform devices discovered earlier.
> 
> This solution isn't elegant nor foolproof, but is the best we can do at
> the moment and works with existing virtio-iommu implementations. It also
> enables an IOMMU for lightweight hypervisors that do not rely on
> firmware methods for booting.
> 
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
> ---
>  MAINTAINERS                           |   2 +
>  drivers/iommu/Kconfig                 |  10 +
>  drivers/iommu/Makefile                |   1 +
>  drivers/iommu/virtio-iommu-topology.c | 343
> ++++++++++++++++++++++++++
>  drivers/iommu/virtio-iommu.c          |   3 +
>  include/linux/virt_iommu.h            |  19 ++
>  include/uapi/linux/virtio_iommu.h     |  26 ++
>  7 files changed, 404 insertions(+)
>  create mode 100644 drivers/iommu/virtio-iommu-topology.c
>  create mode 100644 include/linux/virt_iommu.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index fcd79fc38928..65a03ce53096 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -17781,6 +17781,8 @@ M:	Jean-Philippe Brucker <jean-
> philippe@linaro.org>
>  L:	virtualization@lists.linux-foundation.org
>  S:	Maintained
>  F:	drivers/iommu/virtio-iommu.c
> +F:	drivers/iommu/virtio-iommu-topology.c
> +F:	include/linux/virt_iommu.h
>  F:	include/uapi/linux/virtio_iommu.h
> 
>  VIRTUAL BOX GUEST DEVICE DRIVER
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index c5df570ef84a..f8cb45d84bb0 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -516,4 +516,14 @@ config VIRTIO_IOMMU
> 
>  	  Say Y here if you intend to run this kernel as a guest.
> 
> +config VIRTIO_IOMMU_TOPOLOGY
> +	bool "Topology properties for the virtio-iommu"
> +	depends on VIRTIO_IOMMU
> +	default y
> +	help
> +	  Enable early probing of the virtio-iommu device, to detect the
> +	  built-in topology description.
> +
> +	  Say Y here if you intend to run this kernel as a guest.
> +
>  endif # IOMMU_SUPPORT
> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
> index 9f33fdb3bb05..5da24280d08c 100644
> --- a/drivers/iommu/Makefile
> +++ b/drivers/iommu/Makefile
> @@ -37,3 +37,4 @@ obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
>  obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
>  obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
>  obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
> +obj-$(CONFIG_VIRTIO_IOMMU_TOPOLOGY) += virtio-iommu-topology.o
> diff --git a/drivers/iommu/virtio-iommu-topology.c b/drivers/iommu/virtio-
> iommu-topology.c
> new file mode 100644
> index 000000000000..2188624ef216
> --- /dev/null
> +++ b/drivers/iommu/virtio-iommu-topology.c
> @@ -0,0 +1,343 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/dma-iommu.h>
> +#include <linux/list.h>
> +#include <linux/pci.h>
> +#include <linux/virt_iommu.h>
> +#include <linux/virtio_ids.h>
> +#include <linux/virtio_pci.h>
> +#include <uapi/linux/virtio_iommu.h>
> +
> +struct viommu_cap_config {
> +	u8 bar;
> +	u32 length; /* structure size */
> +	u32 offset; /* structure offset within the bar */
> +};
> +
> +union viommu_topo_cfg {
> +	__le16					type;
> +	struct virtio_iommu_topo_pci_range	pci;
> +	struct virtio_iommu_topo_endpoint	ep;
> +};
> +
> +struct viommu_spec {
> +	struct device				*dev; /* transport device */
> +	struct fwnode_handle			*fwnode;
> +	struct iommu_ops			*ops;
> +	struct list_head			list;
> +	size_t					num_items;

Intel DMAR allows an IOMMU to claim INCLUDE_ALL thus avoid listing
every endpoint one-by-one. It is especially useful when there is only
one IOMMU device in the system. Do you think whether making sense
to allow such optimization in this spec? It doesn't work for ARM since
you need ID mapping to find the MSI doorbell. But for architectures
where only topology info is required, it makes the enumeration process
much simpler.

Thanks
Kevin

> +	/* The config array of length num_items follows */
> +	union viommu_topo_cfg			cfg[];
> +};
> +
> +static LIST_HEAD(viommus);
> +static DEFINE_MUTEX(viommus_lock);
> +
> +#define VPCI_FIELD(field) offsetof(struct virtio_pci_cap, field)
> +
> +static inline int viommu_pci_find_capability(struct pci_dev *dev, u8 cfg_type,
> +					     struct viommu_cap_config *cap)
> +{
> +	int pos;
> +	u8 bar;
> +
> +	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
> +	     pos > 0;
> +	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
> +		u8 type;
> +
> +		pci_read_config_byte(dev, pos + VPCI_FIELD(cfg_type),
> &type);
> +		if (type != cfg_type)
> +			continue;
> +
> +		pci_read_config_byte(dev, pos + VPCI_FIELD(bar), &bar);
> +
> +		/* Ignore structures with reserved BAR values */
> +		if (type != VIRTIO_PCI_CAP_PCI_CFG && bar > 0x5)
> +			continue;
> +
> +		cap->bar = bar;
> +		pci_read_config_dword(dev, pos + VPCI_FIELD(length),
> +				      &cap->length);
> +		pci_read_config_dword(dev, pos + VPCI_FIELD(offset),
> +				      &cap->offset);
> +
> +		return pos;
> +	}
> +	return 0;
> +}
> +
> +static void viommu_ccopy(__le32 *dst, u32 __iomem *src, size_t length)
> +{
> +	size_t i;
> +
> +	/* For the moment all our config structures align on 32b */
> +	if (WARN_ON(length % 4))
> +		return;
> +
> +	for (i = 0; i < length / 4; i++)
> +		/* Keep little-endian data */
> +		dst[i] = cpu_to_le32(ioread32(src + i));
> +}
> +
> +static int viommu_parse_topology(struct device *dev,
> +				 struct virtio_iommu_config __iomem *cfg)
> +{
> +	size_t i;
> +	size_t spec_length;
> +	struct viommu_spec *viommu_spec;
> +	u32 offset, item_length, num_items;
> +
> +	offset = ioread32(&cfg->topo_config.offset);
> +	item_length = ioread32(&cfg->topo_config.item_length);
> +	num_items = ioread32(&cfg->topo_config.num_items);
> +	if (!offset || !num_items || !item_length)
> +		return 0;
> +
> +	spec_length = sizeof(*viommu_spec) + num_items *
> +					     sizeof(union viommu_topo_cfg);
> +	viommu_spec = kzalloc(spec_length, GFP_KERNEL);
> +	if (!viommu_spec)
> +		return -ENOMEM;
> +
> +	viommu_spec->dev = dev;
> +
> +	/* Copy in the whole array, sort it out later */
> +	for (i = 0; i < num_items; i++) {
> +		size_t read_length = min_t(size_t, item_length,
> +					   sizeof(union viommu_topo_cfg));
> +
> +		viommu_ccopy((__le32 *)&viommu_spec->cfg[i],
> +			     (void __iomem *)cfg + offset,
> +			     read_length);
> +
> +		offset += item_length;
> +	}
> +	viommu_spec->num_items = num_items;
> +
> +	mutex_lock(&viommus_lock);
> +	list_add(&viommu_spec->list, &viommus);
> +	mutex_unlock(&viommus_lock);
> +
> +	return 0;
> +}
> +
> +static void viommu_pci_parse_topology(struct pci_dev *dev)
> +{
> +	int pos;
> +	u32 features;
> +	void __iomem *regs;
> +	struct viommu_cap_config cap = {0};
> +	struct virtio_pci_common_cfg __iomem *common_cfg;
> +
> +	/*
> +	 * The virtio infrastructure might not be loaded at this point. we need
> +	 * to access the BARs ourselves.
> +	 */
> +	pos = viommu_pci_find_capability(dev,
> VIRTIO_PCI_CAP_COMMON_CFG, &cap);
> +	if (!pos) {
> +		pci_warn(dev, "common capability not found\n");
> +		return;
> +	}
> +
> +	if (pci_enable_device_mem(dev))
> +		return;
> +
> +	regs = pci_iomap(dev, cap.bar, 0);
> +	if (!regs)
> +		return;
> +
> +	common_cfg = regs + cap.offset;
> +
> +	/* Find out if the device supports topology description */
> +	writel(0, &common_cfg->device_feature_select);
> +	features = ioread32(&common_cfg->device_feature);
> +
> +	pci_iounmap(dev, regs);
> +
> +	if (!(features & BIT(VIRTIO_IOMMU_F_TOPOLOGY))) {
> +		pci_dbg(dev, "device doesn't have topology description");
> +		return;
> +	}
> +
> +	pos = viommu_pci_find_capability(dev,
> VIRTIO_PCI_CAP_DEVICE_CFG, &cap);
> +	if (!pos) {
> +		pci_warn(dev, "device config capability not found\n");
> +		return;
> +	}
> +
> +	regs = pci_iomap(dev, cap.bar, 0);
> +	if (!regs)
> +		return;
> +
> +	pci_info(dev, "parsing virtio-iommu topology\n");
> +	viommu_parse_topology(&dev->dev, regs + cap.offset);
> +	pci_iounmap(dev, regs);
> +}
> +
> +/*
> + * Catch a PCI virtio-iommu implementation early to get the topology
> description
> + * before we start probing other endpoints.
> + */
> +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1040
> + VIRTIO_ID_IOMMU,
> +			viommu_pci_parse_topology);
> +
> +/*
> + * Return true if the device matches this topology structure. Write the
> endpoint
> + * ID into epid if it's the case.
> + */
> +static bool viommu_parse_pci(struct pci_dev *pdev, union
> viommu_topo_cfg *cfg,
> +			     u32 *epid)
> +{
> +	u32 endpoint_start;
> +	u16 start, end, domain;
> +	u16 devid = pci_dev_id(pdev);
> +	u16 type = le16_to_cpu(cfg->type);
> +
> +	if (type != VIRTIO_IOMMU_TOPO_PCI_RANGE)
> +		return false;
> +
> +	start		= le16_to_cpu(cfg->pci.requester_start);
> +	end		= le16_to_cpu(cfg->pci.requester_end);
> +	domain		= le16_to_cpu(cfg->pci.hierarchy);
> +	endpoint_start	= le32_to_cpu(cfg->pci.endpoint_start);
> +
> +	if (pci_domain_nr(pdev->bus) == domain &&
> +	    devid >= start && devid <= end) {
> +		*epid = devid - start + endpoint_start;
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static const struct iommu_ops *virt_iommu_setup(struct device *dev)
> +{
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	const struct iommu_ops *viommu_ops = NULL;
> +	struct fwnode_handle *viommu_fwnode;
> +	struct viommu_spec *viommu_spec;
> +	struct pci_dev *pci_dev = NULL;
> +	struct device *viommu_dev;
> +	bool found = false;
> +	size_t i;
> +	u32 epid;
> +	int ret;
> +
> +	/* Already translated? */
> +	if (fwspec && fwspec->ops)
> +		return NULL;
> +
> +	if (dev_is_pci(dev)) {
> +		pci_dev = to_pci_dev(dev);
> +	} else {
> +		/* At the moment we don't support platform devices */
> +		return NULL;
> +	}
> +
> +	mutex_lock(&viommus_lock);
> +	list_for_each_entry(viommu_spec, &viommus, list) {
> +		for (i = 0; i < viommu_spec->num_items; i++) {
> +			union viommu_topo_cfg *cfg = &viommu_spec-
> >cfg[i];
> +
> +			found = viommu_parse_pci(pci_dev, cfg, &epid);
> +			if (found)
> +				break;
> +		}
> +		if (found) {
> +			viommu_ops = viommu_spec->ops;
> +			viommu_fwnode = viommu_spec->fwnode;
> +			viommu_dev = viommu_spec->dev;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&viommus_lock);
> +	if (!found)
> +		return NULL;
> +
> +	/* We're not translating ourselves. */
> +	if (viommu_dev == dev)
> +		return NULL;
> +
> +	/*
> +	 * If we found a PCI range managed by the viommu, we're the ones
> that
> +	 * have to request ACS.
> +	 */
> +	if (pci_dev)
> +		pci_request_acs();
> +
> +	if (!viommu_ops)
> +		return ERR_PTR(-EPROBE_DEFER);
> +
> +	ret = iommu_fwspec_init(dev, viommu_fwnode, viommu_ops);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	iommu_fwspec_add_ids(dev, &epid, 1);
> +
> +	return viommu_ops;
> +}
> +
> +/**
> + * virt_dma_configure - Configure DMA of virtualized devices
> + * @dev: the endpoint
> + *
> + * Setup the DMA and IOMMU ops of a virtual device, for platforms without
> DT or
> + * ACPI.
> + *
> + * Return: -EPROBE_DEFER if the device is managed by an IOMMU that
> hasn't been
> + *   probed yet, 0 otherwise
> + */
> +int virt_dma_configure(struct device *dev)
> +{
> +	const struct iommu_ops *iommu_ops;
> +
> +	iommu_ops = virt_iommu_setup(dev);
> +	if (IS_ERR_OR_NULL(iommu_ops)) {
> +		int ret = PTR_ERR(iommu_ops);
> +
> +		if (ret == -EPROBE_DEFER || ret == 0)
> +			return ret;
> +		dev_err(dev, "error %d while setting up virt IOMMU\n", ret);
> +		return 0;
> +	}
> +
> +	/*
> +	 * If we have reason to believe the IOMMU driver missed the initial
> +	 * add_device callback for dev, replay it to get things in order.
> +	 */
> +	if (dev->bus && !device_iommu_mapped(dev))
> +		iommu_probe_device(dev);
> +
> +	/* Assume coherent, as well as full 64-bit addresses. */
> +#ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
> +	arch_setup_dma_ops(dev, 0, ~0ULL, iommu_ops, true);
> +#else
> +	iommu_setup_dma_ops(dev, 0, ~0ULL);
> +#endif
> +	return 0;
> +}
> +
> +/**
> + * virt_set_iommu_ops - Set the IOMMU ops of a virtual IOMMU device
> + * @dev: the IOMMU device (transport)
> + * @ops: the new IOMMU ops or NULL
> + *
> + * Setup the iommu_ops associated to a viommu_spec, once the driver is
> loaded
> + * and the device probed.
> + */
> +void virt_set_iommu_ops(struct device *dev, struct iommu_ops *ops)
> +{
> +	struct viommu_spec *viommu_spec;
> +
> +	mutex_lock(&viommus_lock);
> +	list_for_each_entry(viommu_spec, &viommus, list) {
> +		if (viommu_spec->dev == dev) {
> +			viommu_spec->ops = ops;
> +			viommu_spec->fwnode = ops ? dev->fwnode : NULL;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&viommus_lock);
> +}
> +EXPORT_SYMBOL_GPL(virt_set_iommu_ops);
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index 93ff58632452..5429c12c879b 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -21,6 +21,7 @@
>  #include <linux/virtio.h>
>  #include <linux/virtio_config.h>
>  #include <linux/virtio_ids.h>
> +#include <linux/virt_iommu.h>
>  #include <linux/wait.h>
> 
>  #include <uapi/linux/virtio_iommu.h>
> @@ -1075,6 +1076,7 @@ static int viommu_probe(struct virtio_device *vdev)
>  	if (ret)
>  		goto err_free_vqs;
> 
> +	virt_set_iommu_ops(dev->parent, &viommu_ops);
>  	iommu_device_set_ops(&viommu->iommu, &viommu_ops);
>  	iommu_device_set_fwnode(&viommu->iommu, parent_dev-
> >fwnode);
> 
> @@ -1121,6 +1123,7 @@ static void viommu_remove(struct virtio_device
> *vdev)
>  {
>  	struct viommu_dev *viommu = vdev->priv;
> 
> +	virt_set_iommu_ops(vdev->dev.parent, NULL);
>  	iommu_device_sysfs_remove(&viommu->iommu);
>  	iommu_device_unregister(&viommu->iommu);
> 
> diff --git a/include/linux/virt_iommu.h b/include/linux/virt_iommu.h
> new file mode 100644
> index 000000000000..c68b03ec75ba
> --- /dev/null
> +++ b/include/linux/virt_iommu.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef VIRTIO_IOMMU_H_
> +#define VIRTIO_IOMMU_H_
> +
> +#if IS_ENABLED(CONFIG_VIRTIO_IOMMU_TOPOLOGY)
> +int virt_dma_configure(struct device *dev);
> +void virt_set_iommu_ops(struct device *dev, struct iommu_ops *ops);
> +#else /* !CONFIG_VIRTIO_IOMMU_TOPOLOGY */
> +static inline int virt_dma_configure(struct device *dev)
> +{
> +	/* Don't disturb the normal DMA configuration methods */
> +	return 0;
> +}
> +
> +static inline void virt_set_iommu_ops(struct device *dev, struct iommu_ops
> *ops)
> +{ }
> +#endif /* !CONFIG_VIRTIO_IOMMU_TOPOLOGY */
> +
> +#endif /* VIRTIO_IOMMU_H_ */
> diff --git a/include/uapi/linux/virtio_iommu.h
> b/include/uapi/linux/virtio_iommu.h
> index 237e36a280cb..ec57d215086a 100644
> --- a/include/uapi/linux/virtio_iommu.h
> +++ b/include/uapi/linux/virtio_iommu.h
> @@ -16,6 +16,7 @@
>  #define VIRTIO_IOMMU_F_BYPASS			3
>  #define VIRTIO_IOMMU_F_PROBE			4
>  #define VIRTIO_IOMMU_F_MMIO			5
> +#define VIRTIO_IOMMU_F_TOPOLOGY			6
> 
>  struct virtio_iommu_range_64 {
>  	__le64					start;
> @@ -27,6 +28,12 @@ struct virtio_iommu_range_32 {
>  	__le32					end;
>  };
> 
> +struct virtio_iommu_topo_config {
> +	__le32					offset;
> +	__le32					num_items;
> +	__le32					item_length;
> +};
> +
>  struct virtio_iommu_config {
>  	/* Supported page sizes */
>  	__le64					page_size_mask;
> @@ -36,6 +43,25 @@ struct virtio_iommu_config {
>  	struct virtio_iommu_range_32		domain_range;
>  	/* Probe buffer size */
>  	__le32					probe_size;
> +	struct virtio_iommu_topo_config		topo_config;
> +};
> +
> +#define VIRTIO_IOMMU_TOPO_PCI_RANGE		0x1
> +#define VIRTIO_IOMMU_TOPO_ENDPOINT		0x2
> +
> +struct virtio_iommu_topo_pci_range {
> +	__le16					type;
> +	__le16					hierarchy;
> +	__le16					requester_start;
> +	__le16					requester_end;
> +	__le32					endpoint_start;
> +};
> +
> +struct virtio_iommu_topo_endpoint {
> +	__le16					type;
> +	__le16					reserved;
> +	__le32					endpoint;
> +	__le64					address;
>  };
> 
>  /* Request types */
> --
> 2.25.0
> 
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
Jean-Philippe Brucker Aug. 21, 2020, 8:39 a.m. UTC | #26
Hi,

While preparing the next version I noticed I forgot to send this reply.
Better late than never I suppose...

On Tue, Apr 21, 2020 at 07:31:12AM +0000, Tian, Kevin wrote:
> > From: Jean-Philippe Brucker
> > Sent: Saturday, February 29, 2020 1:26 AM
> > 
> > Platforms without device-tree do not currently have a method for
> > describing the vIOMMU topology. Provide a topology description embedded
> > into the virtio device.
[...]
> > diff --git a/drivers/iommu/virtio-iommu-topology.c b/drivers/iommu/virtio-
> > iommu-topology.c
> > new file mode 100644
> > index 000000000000..2188624ef216
> > --- /dev/null
> > +++ b/drivers/iommu/virtio-iommu-topology.c
> > @@ -0,0 +1,343 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> > +
> > +#include <linux/dma-iommu.h>
> > +#include <linux/list.h>
> > +#include <linux/pci.h>
> > +#include <linux/virt_iommu.h>
> > +#include <linux/virtio_ids.h>
> > +#include <linux/virtio_pci.h>
> > +#include <uapi/linux/virtio_iommu.h>
> > +
> > +struct viommu_cap_config {
> > +	u8 bar;
> > +	u32 length; /* structure size */
> > +	u32 offset; /* structure offset within the bar */
> > +};
> > +
> > +union viommu_topo_cfg {
> > +	__le16					type;
> > +	struct virtio_iommu_topo_pci_range	pci;
> > +	struct virtio_iommu_topo_endpoint	ep;
> > +};
> > +
> > +struct viommu_spec {
> > +	struct device				*dev; /* transport device */
> > +	struct fwnode_handle			*fwnode;
> > +	struct iommu_ops			*ops;
> > +	struct list_head			list;
> > +	size_t					num_items;
> 
> Intel DMAR allows an IOMMU to claim INCLUDE_ALL thus avoid listing
> every endpoint one-by-one. It is especially useful when there is only
> one IOMMU device in the system. Do you think whether making sense
> to allow such optimization in this spec?

The DMAR INCLUDE_PCI_ALL is for a single PCI domain, so I think is
equivalent to having a single virtio_iommu_topo_pci_range structure with
start=0 and end=0xffff. That only takes 16 bytes of config space and is
pretty easy to parse, so a special case doesn't seem necessary to me.

If more than one PCI domain is managed by the IOMMU, then INCLUDE_ALL
isn't sufficient since we need to describe how endpoint IDs are associated
to domain:RID (one of the domains would have its endpoint IDs = RID +
0x10000 for example). Furthermore non-PCI devices don't have an implicit
endpoint ID like the RID.

Thanks,
Jean

> It doesn't work for ARM since
> you need ID mapping to find the MSI doorbell. But for architectures
> where only topology info is required, it makes the enumeration process
> much simpler.
> 
> Thanks
> Kevin
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index fcd79fc38928..65a03ce53096 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17781,6 +17781,8 @@  M:	Jean-Philippe Brucker <jean-philippe@linaro.org>
 L:	virtualization@lists.linux-foundation.org
 S:	Maintained
 F:	drivers/iommu/virtio-iommu.c
+F:	drivers/iommu/virtio-iommu-topology.c
+F:	include/linux/virt_iommu.h
 F:	include/uapi/linux/virtio_iommu.h
 
 VIRTUAL BOX GUEST DEVICE DRIVER
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index c5df570ef84a..f8cb45d84bb0 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -516,4 +516,14 @@  config VIRTIO_IOMMU
 
 	  Say Y here if you intend to run this kernel as a guest.
 
+config VIRTIO_IOMMU_TOPOLOGY
+	bool "Topology properties for the virtio-iommu"
+	depends on VIRTIO_IOMMU
+	default y
+	help
+	  Enable early probing of the virtio-iommu device, to detect the
+	  built-in topology description.
+
+	  Say Y here if you intend to run this kernel as a guest.
+
 endif # IOMMU_SUPPORT
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 9f33fdb3bb05..5da24280d08c 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -37,3 +37,4 @@  obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
 obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
 obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
 obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
+obj-$(CONFIG_VIRTIO_IOMMU_TOPOLOGY) += virtio-iommu-topology.o
diff --git a/drivers/iommu/virtio-iommu-topology.c b/drivers/iommu/virtio-iommu-topology.c
new file mode 100644
index 000000000000..2188624ef216
--- /dev/null
+++ b/drivers/iommu/virtio-iommu-topology.c
@@ -0,0 +1,343 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/dma-iommu.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/virt_iommu.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_pci.h>
+#include <uapi/linux/virtio_iommu.h>
+
+struct viommu_cap_config {
+	u8 bar;
+	u32 length; /* structure size */
+	u32 offset; /* structure offset within the bar */
+};
+
+union viommu_topo_cfg {
+	__le16					type;
+	struct virtio_iommu_topo_pci_range	pci;
+	struct virtio_iommu_topo_endpoint	ep;
+};
+
+struct viommu_spec {
+	struct device				*dev; /* transport device */
+	struct fwnode_handle			*fwnode;
+	struct iommu_ops			*ops;
+	struct list_head			list;
+	size_t					num_items;
+	/* The config array of length num_items follows */
+	union viommu_topo_cfg			cfg[];
+};
+
+static LIST_HEAD(viommus);
+static DEFINE_MUTEX(viommus_lock);
+
+#define VPCI_FIELD(field) offsetof(struct virtio_pci_cap, field)
+
+static inline int viommu_pci_find_capability(struct pci_dev *dev, u8 cfg_type,
+					     struct viommu_cap_config *cap)
+{
+	int pos;
+	u8 bar;
+
+	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
+	     pos > 0;
+	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
+		u8 type;
+
+		pci_read_config_byte(dev, pos + VPCI_FIELD(cfg_type), &type);
+		if (type != cfg_type)
+			continue;
+
+		pci_read_config_byte(dev, pos + VPCI_FIELD(bar), &bar);
+
+		/* Ignore structures with reserved BAR values */
+		if (type != VIRTIO_PCI_CAP_PCI_CFG && bar > 0x5)
+			continue;
+
+		cap->bar = bar;
+		pci_read_config_dword(dev, pos + VPCI_FIELD(length),
+				      &cap->length);
+		pci_read_config_dword(dev, pos + VPCI_FIELD(offset),
+				      &cap->offset);
+
+		return pos;
+	}
+	return 0;
+}
+
+static void viommu_ccopy(__le32 *dst, u32 __iomem *src, size_t length)
+{
+	size_t i;
+
+	/* For the moment all our config structures align on 32b */
+	if (WARN_ON(length % 4))
+		return;
+
+	for (i = 0; i < length / 4; i++)
+		/* Keep little-endian data */
+		dst[i] = cpu_to_le32(ioread32(src + i));
+}
+
+static int viommu_parse_topology(struct device *dev,
+				 struct virtio_iommu_config __iomem *cfg)
+{
+	size_t i;
+	size_t spec_length;
+	struct viommu_spec *viommu_spec;
+	u32 offset, item_length, num_items;
+
+	offset = ioread32(&cfg->topo_config.offset);
+	item_length = ioread32(&cfg->topo_config.item_length);
+	num_items = ioread32(&cfg->topo_config.num_items);
+	if (!offset || !num_items || !item_length)
+		return 0;
+
+	spec_length = sizeof(*viommu_spec) + num_items *
+					     sizeof(union viommu_topo_cfg);
+	viommu_spec = kzalloc(spec_length, GFP_KERNEL);
+	if (!viommu_spec)
+		return -ENOMEM;
+
+	viommu_spec->dev = dev;
+
+	/* Copy in the whole array, sort it out later */
+	for (i = 0; i < num_items; i++) {
+		size_t read_length = min_t(size_t, item_length,
+					   sizeof(union viommu_topo_cfg));
+
+		viommu_ccopy((__le32 *)&viommu_spec->cfg[i],
+			     (void __iomem *)cfg + offset,
+			     read_length);
+
+		offset += item_length;
+	}
+	viommu_spec->num_items = num_items;
+
+	mutex_lock(&viommus_lock);
+	list_add(&viommu_spec->list, &viommus);
+	mutex_unlock(&viommus_lock);
+
+	return 0;
+}
+
+static void viommu_pci_parse_topology(struct pci_dev *dev)
+{
+	int pos;
+	u32 features;
+	void __iomem *regs;
+	struct viommu_cap_config cap = {0};
+	struct virtio_pci_common_cfg __iomem *common_cfg;
+
+	/*
+	 * The virtio infrastructure might not be loaded at this point. we need
+	 * to access the BARs ourselves.
+	 */
+	pos = viommu_pci_find_capability(dev, VIRTIO_PCI_CAP_COMMON_CFG, &cap);
+	if (!pos) {
+		pci_warn(dev, "common capability not found\n");
+		return;
+	}
+
+	if (pci_enable_device_mem(dev))
+		return;
+
+	regs = pci_iomap(dev, cap.bar, 0);
+	if (!regs)
+		return;
+
+	common_cfg = regs + cap.offset;
+
+	/* Find out if the device supports topology description */
+	writel(0, &common_cfg->device_feature_select);
+	features = ioread32(&common_cfg->device_feature);
+
+	pci_iounmap(dev, regs);
+
+	if (!(features & BIT(VIRTIO_IOMMU_F_TOPOLOGY))) {
+		pci_dbg(dev, "device doesn't have topology description");
+		return;
+	}
+
+	pos = viommu_pci_find_capability(dev, VIRTIO_PCI_CAP_DEVICE_CFG, &cap);
+	if (!pos) {
+		pci_warn(dev, "device config capability not found\n");
+		return;
+	}
+
+	regs = pci_iomap(dev, cap.bar, 0);
+	if (!regs)
+		return;
+
+	pci_info(dev, "parsing virtio-iommu topology\n");
+	viommu_parse_topology(&dev->dev, regs + cap.offset);
+	pci_iounmap(dev, regs);
+}
+
+/*
+ * Catch a PCI virtio-iommu implementation early to get the topology description
+ * before we start probing other endpoints.
+ */
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1040 + VIRTIO_ID_IOMMU,
+			viommu_pci_parse_topology);
+
+/*
+ * Return true if the device matches this topology structure. Write the endpoint
+ * ID into epid if it's the case.
+ */
+static bool viommu_parse_pci(struct pci_dev *pdev, union viommu_topo_cfg *cfg,
+			     u32 *epid)
+{
+	u32 endpoint_start;
+	u16 start, end, domain;
+	u16 devid = pci_dev_id(pdev);
+	u16 type = le16_to_cpu(cfg->type);
+
+	if (type != VIRTIO_IOMMU_TOPO_PCI_RANGE)
+		return false;
+
+	start		= le16_to_cpu(cfg->pci.requester_start);
+	end		= le16_to_cpu(cfg->pci.requester_end);
+	domain		= le16_to_cpu(cfg->pci.hierarchy);
+	endpoint_start	= le32_to_cpu(cfg->pci.endpoint_start);
+
+	if (pci_domain_nr(pdev->bus) == domain &&
+	    devid >= start && devid <= end) {
+		*epid = devid - start + endpoint_start;
+		return true;
+	}
+	return false;
+}
+
+static const struct iommu_ops *virt_iommu_setup(struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	const struct iommu_ops *viommu_ops = NULL;
+	struct fwnode_handle *viommu_fwnode;
+	struct viommu_spec *viommu_spec;
+	struct pci_dev *pci_dev = NULL;
+	struct device *viommu_dev;
+	bool found = false;
+	size_t i;
+	u32 epid;
+	int ret;
+
+	/* Already translated? */
+	if (fwspec && fwspec->ops)
+		return NULL;
+
+	if (dev_is_pci(dev)) {
+		pci_dev = to_pci_dev(dev);
+	} else {
+		/* At the moment we don't support platform devices */
+		return NULL;
+	}
+
+	mutex_lock(&viommus_lock);
+	list_for_each_entry(viommu_spec, &viommus, list) {
+		for (i = 0; i < viommu_spec->num_items; i++) {
+			union viommu_topo_cfg *cfg = &viommu_spec->cfg[i];
+
+			found = viommu_parse_pci(pci_dev, cfg, &epid);
+			if (found)
+				break;
+		}
+		if (found) {
+			viommu_ops = viommu_spec->ops;
+			viommu_fwnode = viommu_spec->fwnode;
+			viommu_dev = viommu_spec->dev;
+			break;
+		}
+	}
+	mutex_unlock(&viommus_lock);
+	if (!found)
+		return NULL;
+
+	/* We're not translating ourselves. */
+	if (viommu_dev == dev)
+		return NULL;
+
+	/*
+	 * If we found a PCI range managed by the viommu, we're the ones that
+	 * have to request ACS.
+	 */
+	if (pci_dev)
+		pci_request_acs();
+
+	if (!viommu_ops)
+		return ERR_PTR(-EPROBE_DEFER);
+
+	ret = iommu_fwspec_init(dev, viommu_fwnode, viommu_ops);
+	if (ret)
+		return ERR_PTR(ret);
+
+	iommu_fwspec_add_ids(dev, &epid, 1);
+
+	return viommu_ops;
+}
+
+/**
+ * virt_dma_configure - Configure DMA of virtualized devices
+ * @dev: the endpoint
+ *
+ * Setup the DMA and IOMMU ops of a virtual device, for platforms without DT or
+ * ACPI.
+ *
+ * Return: -EPROBE_DEFER if the device is managed by an IOMMU that hasn't been
+ *   probed yet, 0 otherwise
+ */
+int virt_dma_configure(struct device *dev)
+{
+	const struct iommu_ops *iommu_ops;
+
+	iommu_ops = virt_iommu_setup(dev);
+	if (IS_ERR_OR_NULL(iommu_ops)) {
+		int ret = PTR_ERR(iommu_ops);
+
+		if (ret == -EPROBE_DEFER || ret == 0)
+			return ret;
+		dev_err(dev, "error %d while setting up virt IOMMU\n", ret);
+		return 0;
+	}
+
+	/*
+	 * If we have reason to believe the IOMMU driver missed the initial
+	 * add_device callback for dev, replay it to get things in order.
+	 */
+	if (dev->bus && !device_iommu_mapped(dev))
+		iommu_probe_device(dev);
+
+	/* Assume coherent, as well as full 64-bit addresses. */
+#ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
+	arch_setup_dma_ops(dev, 0, ~0ULL, iommu_ops, true);
+#else
+	iommu_setup_dma_ops(dev, 0, ~0ULL);
+#endif
+	return 0;
+}
+
+/**
+ * virt_set_iommu_ops - Set the IOMMU ops of a virtual IOMMU device
+ * @dev: the IOMMU device (transport)
+ * @ops: the new IOMMU ops or NULL
+ *
+ * Setup the iommu_ops associated to a viommu_spec, once the driver is loaded
+ * and the device probed.
+ */
+void virt_set_iommu_ops(struct device *dev, struct iommu_ops *ops)
+{
+	struct viommu_spec *viommu_spec;
+
+	mutex_lock(&viommus_lock);
+	list_for_each_entry(viommu_spec, &viommus, list) {
+		if (viommu_spec->dev == dev) {
+			viommu_spec->ops = ops;
+			viommu_spec->fwnode = ops ? dev->fwnode : NULL;
+			break;
+		}
+	}
+	mutex_unlock(&viommus_lock);
+}
+EXPORT_SYMBOL_GPL(virt_set_iommu_ops);
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 93ff58632452..5429c12c879b 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -21,6 +21,7 @@ 
 #include <linux/virtio.h>
 #include <linux/virtio_config.h>
 #include <linux/virtio_ids.h>
+#include <linux/virt_iommu.h>
 #include <linux/wait.h>
 
 #include <uapi/linux/virtio_iommu.h>
@@ -1075,6 +1076,7 @@  static int viommu_probe(struct virtio_device *vdev)
 	if (ret)
 		goto err_free_vqs;
 
+	virt_set_iommu_ops(dev->parent, &viommu_ops);
 	iommu_device_set_ops(&viommu->iommu, &viommu_ops);
 	iommu_device_set_fwnode(&viommu->iommu, parent_dev->fwnode);
 
@@ -1121,6 +1123,7 @@  static void viommu_remove(struct virtio_device *vdev)
 {
 	struct viommu_dev *viommu = vdev->priv;
 
+	virt_set_iommu_ops(vdev->dev.parent, NULL);
 	iommu_device_sysfs_remove(&viommu->iommu);
 	iommu_device_unregister(&viommu->iommu);
 
diff --git a/include/linux/virt_iommu.h b/include/linux/virt_iommu.h
new file mode 100644
index 000000000000..c68b03ec75ba
--- /dev/null
+++ b/include/linux/virt_iommu.h
@@ -0,0 +1,19 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef VIRTIO_IOMMU_H_
+#define VIRTIO_IOMMU_H_
+
+#if IS_ENABLED(CONFIG_VIRTIO_IOMMU_TOPOLOGY)
+int virt_dma_configure(struct device *dev);
+void virt_set_iommu_ops(struct device *dev, struct iommu_ops *ops);
+#else /* !CONFIG_VIRTIO_IOMMU_TOPOLOGY */
+static inline int virt_dma_configure(struct device *dev)
+{
+	/* Don't disturb the normal DMA configuration methods */
+	return 0;
+}
+
+static inline void virt_set_iommu_ops(struct device *dev, struct iommu_ops *ops)
+{ }
+#endif /* !CONFIG_VIRTIO_IOMMU_TOPOLOGY */
+
+#endif /* VIRTIO_IOMMU_H_ */
diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h
index 237e36a280cb..ec57d215086a 100644
--- a/include/uapi/linux/virtio_iommu.h
+++ b/include/uapi/linux/virtio_iommu.h
@@ -16,6 +16,7 @@ 
 #define VIRTIO_IOMMU_F_BYPASS			3
 #define VIRTIO_IOMMU_F_PROBE			4
 #define VIRTIO_IOMMU_F_MMIO			5
+#define VIRTIO_IOMMU_F_TOPOLOGY			6
 
 struct virtio_iommu_range_64 {
 	__le64					start;
@@ -27,6 +28,12 @@  struct virtio_iommu_range_32 {
 	__le32					end;
 };
 
+struct virtio_iommu_topo_config {
+	__le32					offset;
+	__le32					num_items;
+	__le32					item_length;
+};
+
 struct virtio_iommu_config {
 	/* Supported page sizes */
 	__le64					page_size_mask;
@@ -36,6 +43,25 @@  struct virtio_iommu_config {
 	struct virtio_iommu_range_32		domain_range;
 	/* Probe buffer size */
 	__le32					probe_size;
+	struct virtio_iommu_topo_config		topo_config;
+};
+
+#define VIRTIO_IOMMU_TOPO_PCI_RANGE		0x1
+#define VIRTIO_IOMMU_TOPO_ENDPOINT		0x2
+
+struct virtio_iommu_topo_pci_range {
+	__le16					type;
+	__le16					hierarchy;
+	__le16					requester_start;
+	__le16					requester_end;
+	__le32					endpoint_start;
+};
+
+struct virtio_iommu_topo_endpoint {
+	__le16					type;
+	__le16					reserved;
+	__le32					endpoint;
+	__le64					address;
 };
 
 /* Request types */