diff mbox series

[v5,9/9] vfio/pci: Add dma-buf export support for MMIO regions

Message ID 72ecaa13864ca346797e342d23a7929562788148.1760368250.git.leon@kernel.org
State New
Headers show
Series vfio/pci: Allow MMIO regions to be exported through dma-buf | expand

Commit Message

Leon Romanovsky Oct. 13, 2025, 3:26 p.m. UTC
From: Leon Romanovsky <leonro@nvidia.com>

Add support for exporting PCI device MMIO regions through dma-buf,
enabling safe sharing of non-struct page memory with controlled
lifetime management. This allows RDMA and other subsystems to import
dma-buf FDs and build them into memory regions for PCI P2P operations.

The implementation provides a revocable attachment mechanism using
dma-buf move operations. MMIO regions are normally pinned as BARs
don't change physical addresses, but access is revoked when the VFIO
device is closed or a PCI reset is issued. This ensures kernel
self-defense against potentially hostile userspace.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/vfio/pci/Kconfig           |   3 +
 drivers/vfio/pci/Makefile          |   2 +
 drivers/vfio/pci/vfio_pci_config.c |  22 +-
 drivers/vfio/pci/vfio_pci_core.c   |  28 ++
 drivers/vfio/pci/vfio_pci_dmabuf.c | 446 +++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci_priv.h   |  23 ++
 include/linux/vfio_pci_core.h      |   1 +
 include/uapi/linux/vfio.h          |  25 ++
 8 files changed, 546 insertions(+), 4 deletions(-)
 create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c

Comments

Jason Gunthorpe Oct. 16, 2025, 11:53 p.m. UTC | #1
On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
> +
> +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> +				   struct dma_buf_attachment *attachment)
> +{
> +	struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> +	if (!attachment->peer2peer)
> +		return -EOPNOTSUPP;
> +
> +	if (priv->revoked)
> +		return -ENODEV;
> +
> +	switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) {
> +	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
> +		break;
> +	case PCI_P2PDMA_MAP_BUS_ADDR:
> +		/*
> +		 * There is no need in IOVA at all for this flow.
> +		 * We rely on attachment->priv == NULL as a marker
> +		 * for this mode.
> +		 */
> +		return 0;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL);
> +	if (!attachment->priv)
> +		return -ENOMEM;
> +
> +	dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->size);

The lifetime of this isn't good..

> +	return 0;
> +}
> +
> +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf,
> +				    struct dma_buf_attachment *attachment)
> +{
> +	kfree(attachment->priv);
> +}

If the caller fails to call map then it leaks the iova.

> +static struct sg_table *
> +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> +		     enum dma_data_direction dir)
> +{
[..]


> +err_unmap_dma:
> +	if (!i || !state)
> +		; /* Do nothing */
> +	else if (dma_use_iova(state))
> +		dma_iova_destroy(attachment->dev, state, mapped_len, dir,
> +				 attrs);

If we hit this error path then it is freed..

> +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> +				   struct sg_table *sgt,
> +				   enum dma_data_direction dir)
> +{
> +	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +	struct dma_iova_state *state = attachment->priv;
> +	unsigned long attrs = DMA_ATTR_MMIO;
> +	struct scatterlist *sgl;
> +	int i;
> +
> +	if (!state)
> +		; /* Do nothing */
> +	else if (dma_use_iova(state))
> +		dma_iova_destroy(attachment->dev, state, priv->size, dir,
> +				 attrs);

It is freed here too, but we can call map multiple times. Every time a
move_notify happens can trigger another call to map.

I think just call unlink in those two and put dma_iova_free in detach

Jason
Jason Gunthorpe Oct. 17, 2025, 12:01 a.m. UTC | #2
On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> Add support for exporting PCI device MMIO regions through dma-buf,
> enabling safe sharing of non-struct page memory with controlled
> lifetime management. This allows RDMA and other subsystems to import
> dma-buf FDs and build them into memory regions for PCI P2P operations.
> 
> The implementation provides a revocable attachment mechanism using
> dma-buf move operations. MMIO regions are normally pinned as BARs
> don't change physical addresses, but access is revoked when the VFIO
> device is closed or a PCI reset is issued. This ensures kernel
> self-defense against potentially hostile userspace.

I have drafted the iommufd importer side of this using the private
interconnect approach for now.

https://github.com/jgunthorpe/linux/commits/iommufd_dmabuf/

Due to this iommufd never calls map and we run into trouble here:

> +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> +				   struct dma_buf_attachment *attachment)
> +{
> +	struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> +	if (!attachment->peer2peer)
> +		return -EOPNOTSUPP;
> +
> +	if (priv->revoked)
> +		return -ENODEV;
> +
> +	switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) {
> +	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
> +		break;
> +	case PCI_P2PDMA_MAP_BUS_ADDR:
> +		/*
> +		 * There is no need in IOVA at all for this flow.
> +		 * We rely on attachment->priv == NULL as a marker
> +		 * for this mode.
> +		 */
> +		return 0;
> +	default:
> +		return -EINVAL;

Where the dev from iommufd is also not p2p capable so the attach
fails.

This is OK since it won't call map.

So I reworked this logic to succeed attach but block map in this
case.. Can we fold this in for the next version? This diff has the
fixing for the iova lifecycle too.

I have a few more checks to make but so far it looks Ok and with some
luck we can get some iommufd p2p support this cycle..

Jason

diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index eaba010777f3b7..a0650bd816d99b 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -20,10 +20,21 @@ struct vfio_pci_dma_buf {
 	u8 revoked : 1;
 };
 
+struct vfio_pci_attach {
+	struct dma_iova_state state;
+	enum {
+		VFIO_ATTACH_NONE,
+		VFIO_ATTACH_HOST_BRIDGE_DMA,
+		VFIO_ATTACH_HOST_BRIDGE_IOVA,
+		VFIO_ATTACH_BUS
+	} kind;
+};
+
 static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
 				   struct dma_buf_attachment *attachment)
 {
 	struct vfio_pci_dma_buf *priv = dmabuf->priv;
+	struct vfio_pci_attach *attach;
 
 	if (!attachment->peer2peer)
 		return -EOPNOTSUPP;
@@ -31,32 +42,38 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
 	if (priv->revoked)
 		return -ENODEV;
 
+	attach = kzalloc(sizeof(*attach), GFP_KERNEL);
+	if (!attach)
+		return -ENOMEM;
+	attachment->priv = attach;
+
 	switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) {
 	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
-		break;
+		if (dma_iova_try_alloc(attachment->dev, &attach->state, 0,
+				       priv->size))
+			attach->kind = VFIO_ATTACH_HOST_BRIDGE_IOVA;
+		else
+			attach->kind = VFIO_ATTACH_HOST_BRIDGE_DMA;
+		return 0;
 	case PCI_P2PDMA_MAP_BUS_ADDR:
-		/*
-		 * There is no need in IOVA at all for this flow.
-		 * We rely on attachment->priv == NULL as a marker
-		 * for this mode.
-		 */
+		/* There is no need in IOVA at all for this flow. */
+		attach->kind = VFIO_ATTACH_BUS;
 		return 0;
 	default:
-		return -EINVAL;
+		attach->kind = VFIO_ATTACH_NONE;
+		return 0;
 	}
-
-	attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL);
-	if (!attachment->priv)
-		return -ENOMEM;
-
-	dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->size);
 	return 0;
 }
 
 static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf,
 				    struct dma_buf_attachment *attachment)
 {
-	kfree(attachment->priv);
+	struct vfio_pci_attach *attach = attachment->priv;
+
+	if (attach->kind == VFIO_ATTACH_HOST_BRIDGE_IOVA)
+		dma_iova_free(attachment->dev, &attach->state);
+	kfree(attach);
 }
 
 static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, u64 length,
@@ -83,22 +100,23 @@ static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, u64 length,
 }
 
 static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv,
-				  struct dma_iova_state *state)
+				  struct vfio_pci_attach *attach)
 {
 	struct phys_vec *phys_vec = priv->phys_vec;
 	unsigned int nents = 0;
 	u32 i;
 
-	if (!state || !dma_use_iova(state))
+	if (attach->kind != VFIO_ATTACH_HOST_BRIDGE_IOVA) {
 		for (i = 0; i < priv->nr_ranges; i++)
 			nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX);
-	else
+	} else {
 		/*
 		 * In IOVA case, there is only one SG entry which spans
 		 * for whole IOVA address space, but we need to make sure
 		 * that it fits sg->length, maybe we need more.
 		 */
 		nents = DIV_ROUND_UP(priv->size, UINT_MAX);
+	}
 
 	return nents;
 }
@@ -108,7 +126,7 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
 		     enum dma_data_direction dir)
 {
 	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
-	struct dma_iova_state *state = attachment->priv;
+	struct vfio_pci_attach *attach = attachment->priv;
 	struct phys_vec *phys_vec = priv->phys_vec;
 	unsigned long attrs = DMA_ATTR_MMIO;
 	unsigned int nents, mapped_len = 0;
@@ -127,7 +145,7 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
 	if (!sgt)
 		return ERR_PTR(-ENOMEM);
 
-	nents = calc_sg_nents(priv, state);
+	nents = calc_sg_nents(priv, attach);
 	ret = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
 	if (ret)
 		goto err_kfree_sgt;
@@ -135,35 +153,42 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
 	sgl = sgt->sgl;
 
 	for (i = 0; i < priv->nr_ranges; i++) {
-		if (!state) {
+		switch (attach->kind) {
+		case VFIO_ATTACH_BUS:
 			addr = pci_p2pdma_bus_addr_map(priv->provider,
 						       phys_vec[i].paddr);
-		} else if (dma_use_iova(state)) {
-			ret = dma_iova_link(attachment->dev, state,
+			break;
+		case VFIO_ATTACH_HOST_BRIDGE_IOVA:
+			ret = dma_iova_link(attachment->dev, &attach->state,
 					    phys_vec[i].paddr, 0,
 					    phys_vec[i].len, dir, attrs);
 			if (ret)
 				goto err_unmap_dma;
 
 			mapped_len += phys_vec[i].len;
-		} else {
+			break;
+		case VFIO_ATTACH_HOST_BRIDGE_DMA:
 			addr = dma_map_phys(attachment->dev, phys_vec[i].paddr,
 					    phys_vec[i].len, dir, attrs);
 			ret = dma_mapping_error(attachment->dev, addr);
 			if (ret)
 				goto err_unmap_dma;
+			break;
+		default:
+			ret = -EINVAL;
+			goto err_unmap_dma;
 		}
 
-		if (!state || !dma_use_iova(state))
+		if (attach->kind != VFIO_ATTACH_HOST_BRIDGE_IOVA)
 			sgl = fill_sg_entry(sgl, phys_vec[i].len, addr);
 	}
 
-	if (state && dma_use_iova(state)) {
+	if (attach->kind == VFIO_ATTACH_HOST_BRIDGE_IOVA) {
 		WARN_ON_ONCE(mapped_len != priv->size);
-		ret = dma_iova_sync(attachment->dev, state, 0, mapped_len);
+		ret = dma_iova_sync(attachment->dev, &attach->state, 0, mapped_len);
 		if (ret)
 			goto err_unmap_dma;
-		sgl = fill_sg_entry(sgl, mapped_len, state->addr);
+		sgl = fill_sg_entry(sgl, mapped_len, attach->state.addr);
 	}
 
 	/*
@@ -174,15 +199,22 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
 	return sgt;
 
 err_unmap_dma:
-	if (!i || !state)
-		; /* Do nothing */
-	else if (dma_use_iova(state))
-		dma_iova_destroy(attachment->dev, state, mapped_len, dir,
-				 attrs);
-	else
+	switch (attach->kind) {
+	case VFIO_ATTACH_HOST_BRIDGE_IOVA:
+		if (mapped_len)
+			dma_iova_unlink(attachment->dev, &attach->state, 0,
+					mapped_len, dir, attrs);
+		break;
+	case VFIO_ATTACH_HOST_BRIDGE_DMA:
+		if (!i)
+			break;
 		for_each_sgtable_dma_sg(sgt, sgl, i)
 			dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
-					sg_dma_len(sgl), dir, attrs);
+				       sg_dma_len(sgl), dir, attrs);
+		break;
+	default:
+		break;
+	}
 	sg_free_table(sgt);
 err_kfree_sgt:
 	kfree(sgt);
@@ -194,20 +226,24 @@ static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
 				   enum dma_data_direction dir)
 {
 	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
-	struct dma_iova_state *state = attachment->priv;
+	struct vfio_pci_attach *attach = attachment->priv;
 	unsigned long attrs = DMA_ATTR_MMIO;
 	struct scatterlist *sgl;
 	int i;
 
-	if (!state)
-		; /* Do nothing */
-	else if (dma_use_iova(state))
-		dma_iova_destroy(attachment->dev, state, priv->size, dir,
-				 attrs);
-	else
+	switch (attach->kind) {
+	case VFIO_ATTACH_HOST_BRIDGE_IOVA:
+		dma_iova_destroy(attachment->dev, &attach->state, priv->size,
+				 dir, attrs);
+		break;
+	case VFIO_ATTACH_HOST_BRIDGE_DMA:
 		for_each_sgtable_dma_sg(sgt, sgl, i)
 			dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
 				       sg_dma_len(sgl), dir, attrs);
+		break;
+	default:
+		break;
+	}
 
 	sg_free_table(sgt);
 	kfree(sgt);
Leon Romanovsky Oct. 17, 2025, 5:40 a.m. UTC | #3
On Thu, Oct 16, 2025 at 08:53:32PM -0300, Jason Gunthorpe wrote:
> On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
> > +
> > +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> > +				   struct dma_buf_attachment *attachment)
> > +{
> > +	struct vfio_pci_dma_buf *priv = dmabuf->priv;
> > +
> > +	if (!attachment->peer2peer)
> > +		return -EOPNOTSUPP;
> > +
> > +	if (priv->revoked)
> > +		return -ENODEV;
> > +
> > +	switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) {
> > +	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
> > +		break;
> > +	case PCI_P2PDMA_MAP_BUS_ADDR:
> > +		/*
> > +		 * There is no need in IOVA at all for this flow.
> > +		 * We rely on attachment->priv == NULL as a marker
> > +		 * for this mode.
> > +		 */
> > +		return 0;
> > +	default:
> > +		return -EINVAL;
> > +	}
> > +
> > +	attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL);
> > +	if (!attachment->priv)
> > +		return -ENOMEM;
> > +
> > +	dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->size);
> 
> The lifetime of this isn't good..
> 
> > +	return 0;
> > +}
> > +
> > +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf,
> > +				    struct dma_buf_attachment *attachment)
> > +{
> > +	kfree(attachment->priv);
> > +}
> 
> If the caller fails to call map then it leaks the iova.

I'm relying on dmabuf code and documentation:

   926 /**
   927  * dma_buf_dynamic_attach - Add the device to dma_buf's attachments list
...   
   932  *
   933  * Returns struct dma_buf_attachment pointer for this attachment. Attachments
   934  * must be cleaned up by calling dma_buf_detach().

Successful call to vfio_pci_dma_buf_attach() MUST be accompanied by call
to vfio_pci_dma_buf_detach(), so as far as dmabuf implementation follows
it, there is no leak.

> 
> > +static struct sg_table *
> > +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> > +		     enum dma_data_direction dir)
> > +{
> [..]
> 
> 
> > +err_unmap_dma:
> > +	if (!i || !state)
> > +		; /* Do nothing */
> > +	else if (dma_use_iova(state))
> > +		dma_iova_destroy(attachment->dev, state, mapped_len, dir,
> > +				 attrs);
> 
> If we hit this error path then it is freed..
> 
> > +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> > +				   struct sg_table *sgt,
> > +				   enum dma_data_direction dir)
> > +{
> > +	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> > +	struct dma_iova_state *state = attachment->priv;
> > +	unsigned long attrs = DMA_ATTR_MMIO;
> > +	struct scatterlist *sgl;
> > +	int i;
> > +
> > +	if (!state)
> > +		; /* Do nothing */
> > +	else if (dma_use_iova(state))
> > +		dma_iova_destroy(attachment->dev, state, priv->size, dir,
> > +				 attrs);
> 
> It is freed here too, but we can call map multiple times. Every time a
> move_notify happens can trigger another call to map.
> 
> I think just call unlink in those two and put dma_iova_free in detach

Yes, it can work.

Thanks

> 
> Jason
Christoph Hellwig Oct. 17, 2025, 6:33 a.m. UTC | #4
On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> Add support for exporting PCI device MMIO regions through dma-buf,
> enabling safe sharing of non-struct page memory with controlled
> lifetime management. This allows RDMA and other subsystems to import
> dma-buf FDs and build them into memory regions for PCI P2P operations.
> 
> The implementation provides a revocable attachment mechanism using
> dma-buf move operations. MMIO regions are normally pinned as BARs
> don't change physical addresses, but access is revoked when the VFIO
> device is closed or a PCI reset is issued. This ensures kernel
> self-defense against potentially hostile userspace.

This still completely fails to explain why you think that it actually
is safe without the proper pgmap handling.
Jason Gunthorpe Oct. 17, 2025, 12:16 p.m. UTC | #5
On Thu, Oct 16, 2025 at 11:33:52PM -0700, Christoph Hellwig wrote:
> On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> > 
> > Add support for exporting PCI device MMIO regions through dma-buf,
> > enabling safe sharing of non-struct page memory with controlled
> > lifetime management. This allows RDMA and other subsystems to import
> > dma-buf FDs and build them into memory regions for PCI P2P operations.
> > 
> > The implementation provides a revocable attachment mechanism using
> > dma-buf move operations. MMIO regions are normally pinned as BARs
> > don't change physical addresses, but access is revoked when the VFIO
> > device is closed or a PCI reset is issued. This ensures kernel
> > self-defense against potentially hostile userspace.
> 
> This still completely fails to explain why you think that it actually
> is safe without the proper pgmap handling.

Leon, this keeps coming up, please clean up and copy the text from my
prior email into the commit message & cover letter explaining in
detail how lifetime magement works by using revocation/invalidation
driven by dmabuf move_notify callbacks.

Jason
Jason Gunthorpe Oct. 17, 2025, 1:02 p.m. UTC | #6
On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
> +static void dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
> +				   struct vfio_device_feature_dma_buf *dma_buf,
> +				   struct vfio_region_dma_range *dma_ranges,
> +				   struct p2pdma_provider *provider)
> +{
> +	struct pci_dev *pdev = priv->vdev->pdev;
> +	phys_addr_t pci_start;
> +	u32 i;
> +
> +	pci_start = pci_resource_start(pdev, dma_buf->region_index);
> +	for (i = 0; i < dma_buf->nr_ranges; i++) {
> +		priv->phys_vec[i].len = dma_ranges[i].length;
> +		priv->phys_vec[i].paddr = pci_start + dma_ranges[i].offset;
> +		priv->size += priv->phys_vec[i].len;
> +	}

This is missing validation, the userspace can pass in any
length/offset but the resource is of limited size. Like this:

static int dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
				  struct vfio_device_feature_dma_buf *dma_buf,
				  struct vfio_region_dma_range *dma_ranges,
				  struct p2pdma_provider *provider)
{
	struct pci_dev *pdev = priv->vdev->pdev;
	phys_addr_t len = pci_resource_len(pdev, dma_buf->region_index);
	phys_addr_t pci_start;
	phys_addr_t pci_last;
	u32 i;

	if (!len)
		return -EINVAL;
	pci_start = pci_resource_start(pdev, dma_buf->region_index);
	pci_last = pci_start + len - 1;
	for (i = 0; i < dma_buf->nr_ranges; i++) {
		phys_addr_t last;

		if (!dma_ranges[i].length)
			return -EINVAL;

		if (check_add_overflow(pci_start, dma_ranges[i].offset,
				       &priv->phys_vec[i].paddr) ||
		    check_add_overflow(priv->phys_vec[i].paddr,
				       dma_ranges[i].length - 1, &last))
			return -EOVERFLOW;
		if (last > pci_last)
			return -EINVAL;

		priv->phys_vec[i].len = dma_ranges[i].length;
		priv->size += priv->phys_vec[i].len;
	}
	priv->nr_ranges = dma_buf->nr_ranges;
	priv->provider = provider;
	return 0;
}

Jason
Jason Gunthorpe Oct. 17, 2025, 3:58 p.m. UTC | #7
On Fri, Oct 17, 2025 at 08:40:07AM +0300, Leon Romanovsky wrote:
> > > +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf,
> > > +				    struct dma_buf_attachment *attachment)
> > > +{
> > > +	kfree(attachment->priv);
> > > +}
> > 
> > If the caller fails to call map then it leaks the iova.
> 
> I'm relying on dmabuf code and documentation:
> 
>    926 /**
>    927  * dma_buf_dynamic_attach - Add the device to dma_buf's attachments list
> ...   
>    932  *
>    933  * Returns struct dma_buf_attachment pointer for this attachment. Attachments
>    934  * must be cleaned up by calling dma_buf_detach().
> 
> Successful call to vfio_pci_dma_buf_attach() MUST be accompanied by call
> to vfio_pci_dma_buf_detach(), so as far as dmabuf implementation follows
> it, there is no leak.

It leaks the ivoa because there is no dma_iova_destroy() unless you
call unmap. detach is not unmap and unmap is not mandatory to call.

Jason
Jason Gunthorpe Oct. 17, 2025, 4:01 p.m. UTC | #8
On Fri, Oct 17, 2025 at 12:58:50PM -0300, Jason Gunthorpe wrote:
> On Fri, Oct 17, 2025 at 08:40:07AM +0300, Leon Romanovsky wrote:
> > > > +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf,
> > > > +				    struct dma_buf_attachment *attachment)
> > > > +{
> > > > +	kfree(attachment->priv);
> > > > +}
> > > 
> > > If the caller fails to call map then it leaks the iova.
> > 
> > I'm relying on dmabuf code and documentation:
> > 
> >    926 /**
> >    927  * dma_buf_dynamic_attach - Add the device to dma_buf's attachments list
> > ...   
> >    932  *
> >    933  * Returns struct dma_buf_attachment pointer for this attachment. Attachments
> >    934  * must be cleaned up by calling dma_buf_detach().
> > 
> > Successful call to vfio_pci_dma_buf_attach() MUST be accompanied by call
> > to vfio_pci_dma_buf_detach(), so as far as dmabuf implementation follows
> > it, there is no leak.
> 
> It leaks the ivoa because there is no dma_iova_destroy() unless you
> call unmap. detach is not unmap and unmap is not mandatory to call.

Though putting iova free in detach is problematic for the hot-unplug
case. In that instance we need to ensure the iova is cleaned up prior
to returning from vfio's remove(). detached is called on the importers
timeline but unmap is required to be called in move_notify..

So I guess some kind of flag to trigger the unmap after cleanup to
free the iova?

Jason
Leon Romanovsky Oct. 17, 2025, 4:13 p.m. UTC | #9
On Fri, Oct 17, 2025 at 10:02:49AM -0300, Jason Gunthorpe wrote:
> On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
> > +static void dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
> > +				   struct vfio_device_feature_dma_buf *dma_buf,
> > +				   struct vfio_region_dma_range *dma_ranges,
> > +				   struct p2pdma_provider *provider)
> > +{
> > +	struct pci_dev *pdev = priv->vdev->pdev;
> > +	phys_addr_t pci_start;
> > +	u32 i;
> > +
> > +	pci_start = pci_resource_start(pdev, dma_buf->region_index);
> > +	for (i = 0; i < dma_buf->nr_ranges; i++) {
> > +		priv->phys_vec[i].len = dma_ranges[i].length;
> > +		priv->phys_vec[i].paddr = pci_start + dma_ranges[i].offset;
> > +		priv->size += priv->phys_vec[i].len;
> > +	}
> 
> This is missing validation, the userspace can pass in any
> length/offset but the resource is of limited size. Like this:
> 
> static int dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
> 				  struct vfio_device_feature_dma_buf *dma_buf,
> 				  struct vfio_region_dma_range *dma_ranges,
> 				  struct p2pdma_provider *provider)
> {
> 	struct pci_dev *pdev = priv->vdev->pdev;
> 	phys_addr_t len = pci_resource_len(pdev, dma_buf->region_index);
> 	phys_addr_t pci_start;
> 	phys_addr_t pci_last;
> 	u32 i;
> 
> 	if (!len)
> 		return -EINVAL;
> 	pci_start = pci_resource_start(pdev, dma_buf->region_index);
> 	pci_last = pci_start + len - 1;
> 	for (i = 0; i < dma_buf->nr_ranges; i++) {
> 		phys_addr_t last;
> 
> 		if (!dma_ranges[i].length)
> 			return -EINVAL;
> 
> 		if (check_add_overflow(pci_start, dma_ranges[i].offset,
> 				       &priv->phys_vec[i].paddr) ||
> 		    check_add_overflow(priv->phys_vec[i].paddr,
> 				       dma_ranges[i].length - 1, &last))
> 			return -EOVERFLOW;
> 		if (last > pci_last)
> 			return -EINVAL;
> 
> 		priv->phys_vec[i].len = dma_ranges[i].length;
> 		priv->size += priv->phys_vec[i].len;
> 	}
> 	priv->nr_ranges = dma_buf->nr_ranges;
> 	priv->provider = provider;
> 	return 0;
> }

I have these checks in validate_dmabuf_input(). Do you think that I need
to add extra checks?

Thanks

> 
> Jason
Jason Gunthorpe Oct. 17, 2025, 11:40 p.m. UTC | #10
On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> Add support for exporting PCI device MMIO regions through dma-buf,
> enabling safe sharing of non-struct page memory with controlled
> lifetime management. This allows RDMA and other subsystems to import
> dma-buf FDs and build them into memory regions for PCI P2P operations.

I was looking at how to address Alex's note about not all drivers
being compatible, and how to enable the non-compatible drivers.

It looks like the simplest thing is to make dma_ranges_to_p2p_phys
into an ops and have the driver provide it. If not provided the no
support.

Drivers with special needs can fill in phys in their own way and get
their own provider.

Sort of like this:

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index ac10f14417f2f3..6d41cf26b53994 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -147,6 +147,10 @@ static const struct vfio_device_ops vfio_pci_ops = {
 	.pasid_detach_ioas	= vfio_iommufd_physical_pasid_detach_ioas,
 };
 
+static const struct vfio_pci_device_ops vfio_pci_dev_ops = {
+	.get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys,
+};
+
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct vfio_pci_core_device *vdev;
@@ -161,6 +165,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		return PTR_ERR(vdev);
 
 	dev_set_drvdata(&pdev->dev, vdev);
+	vdev->pci_ops = &vfio_pci_dev_ops;
 	ret = vfio_pci_core_register_device(vdev);
 	if (ret)
 		goto out_put_vdev;
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index 358856e6b8a820..dad880781a9352 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -309,47 +309,52 @@ int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
 }
 EXPORT_SYMBOL_GPL(vfio_pci_dma_buf_iommufd_map);
 
-static int dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
-				  struct vfio_device_feature_dma_buf *dma_buf,
+int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev,
+				  struct p2pdma_provider **provider,
+				  unsigned int region_index,
+				  struct phys_vec *phys_vec,
 				  struct vfio_region_dma_range *dma_ranges,
-				  struct p2pdma_provider *provider)
+				  size_t nr_ranges)
 {
-	struct pci_dev *pdev = priv->vdev->pdev;
-	phys_addr_t len = pci_resource_len(pdev, dma_buf->region_index);
+	struct pci_dev *pdev = vdev->pdev;
+	phys_addr_t len = pci_resource_len(pdev, region_index);
 	phys_addr_t pci_start;
 	phys_addr_t pci_last;
 	u32 i;
 
 	if (!len)
 		return -EINVAL;
-	pci_start = pci_resource_start(pdev, dma_buf->region_index);
+
+	*provider = pcim_p2pdma_provider(pdev, region_index);
+	if (!*provider)
+		return -EINVAL;
+
+	pci_start = pci_resource_start(pdev, region_index);
 	pci_last = pci_start + len - 1;
-	for (i = 0; i < dma_buf->nr_ranges; i++) {
+	for (i = 0; i < nr_ranges; i++) {
 		phys_addr_t last;
 
 		if (!dma_ranges[i].length)
 			return -EINVAL;
 
 		if (check_add_overflow(pci_start, dma_ranges[i].offset,
-				       &priv->phys_vec[i].paddr) ||
-		    check_add_overflow(priv->phys_vec[i].paddr,
+				       &phys_vec[i].paddr) ||
+		    check_add_overflow(phys_vec[i].paddr,
 				       dma_ranges[i].length - 1, &last))
 			return -EOVERFLOW;
 		if (last > pci_last)
 			return -EINVAL;
 
-		priv->phys_vec[i].len = dma_ranges[i].length;
-		priv->size += priv->phys_vec[i].len;
+		phys_vec[i].len = dma_ranges[i].length;
 	}
-	priv->nr_ranges = dma_buf->nr_ranges;
-	priv->provider = provider;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys);
 
 static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
 				 struct vfio_device_feature_dma_buf *dma_buf,
 				 struct vfio_region_dma_range *dma_ranges,
-				 struct p2pdma_provider **provider)
+				 size_t *lengthp)
 {
 	struct pci_dev *pdev = vdev->pdev;
 	u32 bar = dma_buf->region_index;
@@ -365,10 +370,6 @@ static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
 	if (bar >= VFIO_PCI_ROM_REGION_INDEX)
 		return -ENODEV;
 
-	*provider = pcim_p2pdma_provider(pdev, bar);
-	if (!*provider)
-		return -EINVAL;
-
 	bar_size = pci_resource_len(pdev, bar);
 	for (i = 0; i < dma_buf->nr_ranges; i++) {
 		u64 offset = dma_ranges[i].offset;
@@ -397,6 +398,7 @@ static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
 	if (overflows_type(length, size_t) || length & DMA_IOVA_USE_SWIOTLB)
 		return -EINVAL;
 
+	*lengthp = length;
 	return 0;
 }
 
@@ -407,10 +409,13 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
 	struct vfio_device_feature_dma_buf get_dma_buf = {};
 	struct vfio_region_dma_range *dma_ranges;
 	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
-	struct p2pdma_provider *provider;
 	struct vfio_pci_dma_buf *priv;
+	size_t length;
 	int ret;
 
+	if (!vdev->pci_ops->get_dmabuf_phys)
+		return -EOPNOTSUPP;
+
 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
 				 sizeof(get_dma_buf));
 	if (ret != 1)
@@ -427,7 +432,7 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
 	if (IS_ERR(dma_ranges))
 		return PTR_ERR(dma_ranges);
 
-	ret = validate_dmabuf_input(vdev, &get_dma_buf, dma_ranges, &provider);
+	ret = validate_dmabuf_input(vdev, &get_dma_buf, dma_ranges, &length);
 	if (ret)
 		goto err_free_ranges;
 
@@ -444,10 +449,16 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
 	}
 
 	priv->vdev = vdev;
-	ret = dma_ranges_to_p2p_phys(priv, &get_dma_buf, dma_ranges, provider);
+	priv->nr_ranges = get_dma_buf.nr_ranges;
+	priv->size = length;
+	ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider,
+					     get_dma_buf.region_index,
+					     priv->phys_vec, dma_ranges,
+					     priv->nr_ranges);
 	if (ret)
 		goto err_free_phys;
 
+
 	kfree(dma_ranges);
 	dma_ranges = NULL;
 
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 37ce02e30c7632..4ea2095381eb24 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -26,6 +26,7 @@
 
 struct vfio_pci_core_device;
 struct vfio_pci_region;
+struct p2pdma_provider;
 
 struct vfio_pci_regops {
 	ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf,
@@ -49,9 +50,26 @@ struct vfio_pci_region {
 	u32				flags;
 };
 
+struct vfio_pci_device_ops {
+	int (*get_dmabuf_phys)(struct vfio_pci_core_device *vdev,
+			       struct p2pdma_provider **provider,
+			       unsigned int region_index,
+			       struct phys_vec *phys_vec,
+			       struct vfio_region_dma_range *dma_ranges,
+			       size_t nr_ranges);
+};
+
+int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev,
+				  struct p2pdma_provider **provider,
+				  unsigned int region_index,
+				  struct phys_vec *phys_vec,
+				  struct vfio_region_dma_range *dma_ranges,
+				  size_t nr_ranges);
+
 struct vfio_pci_core_device {
 	struct vfio_device	vdev;
 	struct pci_dev		*pdev;
+	const struct vfio_pci_device_ops *pci_ops;
 	void __iomem		*barmap[PCI_STD_NUM_BARS];
 	bool			bar_mmap_supported[PCI_STD_NUM_BARS];
 	u8			*pci_config_map;
Jason Gunthorpe Oct. 20, 2025, 4:15 p.m. UTC | #11
On Fri, Oct 17, 2025 at 07:13:58PM +0300, Leon Romanovsky wrote:
> > static int dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
> > 				  struct vfio_device_feature_dma_buf *dma_buf,
> > 				  struct vfio_region_dma_range *dma_ranges,
> > 				  struct p2pdma_provider *provider)
> > {
> > 	struct pci_dev *pdev = priv->vdev->pdev;
> > 	phys_addr_t len = pci_resource_len(pdev, dma_buf->region_index);
> > 	phys_addr_t pci_start;
> > 	phys_addr_t pci_last;
> > 	u32 i;
> > 
> > 	if (!len)
> > 		return -EINVAL;
> > 	pci_start = pci_resource_start(pdev, dma_buf->region_index);
> > 	pci_last = pci_start + len - 1;
> > 	for (i = 0; i < dma_buf->nr_ranges; i++) {
> > 		phys_addr_t last;
> > 
> > 		if (!dma_ranges[i].length)
> > 			return -EINVAL;
> > 
> > 		if (check_add_overflow(pci_start, dma_ranges[i].offset,
> > 				       &priv->phys_vec[i].paddr) ||
> > 		    check_add_overflow(priv->phys_vec[i].paddr,
> > 				       dma_ranges[i].length - 1, &last))
> > 			return -EOVERFLOW;
> > 		if (last > pci_last)
> > 			return -EINVAL;
> > 
> > 		priv->phys_vec[i].len = dma_ranges[i].length;
> > 		priv->size += priv->phys_vec[i].len;
> > 	}
> > 	priv->nr_ranges = dma_buf->nr_ranges;
> > 	priv->provider = provider;
> > 	return 0;
> > }
> 
> I have these checks in validate_dmabuf_input(). 
> Do you think that I need to add extra checks?

I think they work better in this function, so I'd move them here.

Jason
Leon Romanovsky Oct. 20, 2025, 4:44 p.m. UTC | #12
On Mon, Oct 20, 2025 at 01:15:16PM -0300, Jason Gunthorpe wrote:
> On Fri, Oct 17, 2025 at 07:13:58PM +0300, Leon Romanovsky wrote:
> > > static int dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
> > > 				  struct vfio_device_feature_dma_buf *dma_buf,
> > > 				  struct vfio_region_dma_range *dma_ranges,
> > > 				  struct p2pdma_provider *provider)
> > > {
> > > 	struct pci_dev *pdev = priv->vdev->pdev;
> > > 	phys_addr_t len = pci_resource_len(pdev, dma_buf->region_index);
> > > 	phys_addr_t pci_start;
> > > 	phys_addr_t pci_last;
> > > 	u32 i;
> > > 
> > > 	if (!len)
> > > 		return -EINVAL;
> > > 	pci_start = pci_resource_start(pdev, dma_buf->region_index);
> > > 	pci_last = pci_start + len - 1;
> > > 	for (i = 0; i < dma_buf->nr_ranges; i++) {
> > > 		phys_addr_t last;
> > > 
> > > 		if (!dma_ranges[i].length)
> > > 			return -EINVAL;
> > > 
> > > 		if (check_add_overflow(pci_start, dma_ranges[i].offset,
> > > 				       &priv->phys_vec[i].paddr) ||
> > > 		    check_add_overflow(priv->phys_vec[i].paddr,
> > > 				       dma_ranges[i].length - 1, &last))
> > > 			return -EOVERFLOW;
> > > 		if (last > pci_last)
> > > 			return -EINVAL;
> > > 
> > > 		priv->phys_vec[i].len = dma_ranges[i].length;
> > > 		priv->size += priv->phys_vec[i].len;
> > > 	}
> > > 	priv->nr_ranges = dma_buf->nr_ranges;
> > > 	priv->provider = provider;
> > > 	return 0;
> > > }
> > 
> > I have these checks in validate_dmabuf_input(). 
> > Do you think that I need to add extra checks?
> 
> I think they work better in this function, so I'd move them here.

The main idea for validate_dmabuf_input() is to perform as much as
possible checks before allocating kernel memory.

Thanks

> 
> Jason
Jason Gunthorpe Oct. 20, 2025, 4:51 p.m. UTC | #13
On Mon, Oct 20, 2025 at 07:44:57PM +0300, Leon Romanovsky wrote:
> On Mon, Oct 20, 2025 at 01:15:16PM -0300, Jason Gunthorpe wrote:
> > On Fri, Oct 17, 2025 at 07:13:58PM +0300, Leon Romanovsky wrote:
> > > > static int dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
> > > > 				  struct vfio_device_feature_dma_buf *dma_buf,
> > > > 				  struct vfio_region_dma_range *dma_ranges,
> > > > 				  struct p2pdma_provider *provider)
> > > > {
> > > > 	struct pci_dev *pdev = priv->vdev->pdev;
> > > > 	phys_addr_t len = pci_resource_len(pdev, dma_buf->region_index);
> > > > 	phys_addr_t pci_start;
> > > > 	phys_addr_t pci_last;
> > > > 	u32 i;
> > > > 
> > > > 	if (!len)
> > > > 		return -EINVAL;
> > > > 	pci_start = pci_resource_start(pdev, dma_buf->region_index);
> > > > 	pci_last = pci_start + len - 1;
> > > > 	for (i = 0; i < dma_buf->nr_ranges; i++) {
> > > > 		phys_addr_t last;
> > > > 
> > > > 		if (!dma_ranges[i].length)
> > > > 			return -EINVAL;
> > > > 
> > > > 		if (check_add_overflow(pci_start, dma_ranges[i].offset,
> > > > 				       &priv->phys_vec[i].paddr) ||
> > > > 		    check_add_overflow(priv->phys_vec[i].paddr,
> > > > 				       dma_ranges[i].length - 1, &last))
> > > > 			return -EOVERFLOW;
> > > > 		if (last > pci_last)
> > > > 			return -EINVAL;
> > > > 
> > > > 		priv->phys_vec[i].len = dma_ranges[i].length;
> > > > 		priv->size += priv->phys_vec[i].len;
> > > > 	}
> > > > 	priv->nr_ranges = dma_buf->nr_ranges;
> > > > 	priv->provider = provider;
> > > > 	return 0;
> > > > }
> > > 
> > > I have these checks in validate_dmabuf_input(). 
> > > Do you think that I need to add extra checks?
> > 
> > I think they work better in this function, so I'd move them here.
> 
> The main idea for validate_dmabuf_input() is to perform as much as
> possible checks before allocating kernel memory.

Yeah, but it's fine, it can just be turned into a function to safely
compute the total size. It makes more sense to try to validate once we
have the kernel memory and got the physical range from the driver to
copy into the phys.

Jason
Jason Gunthorpe Oct. 22, 2025, 12:50 p.m. UTC | #14
On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> Add support for exporting PCI device MMIO regions through dma-buf,
> enabling safe sharing of non-struct page memory with controlled
> lifetime management. This allows RDMA and other subsystems to import
> dma-buf FDs and build them into memory regions for PCI P2P operations.
> 
> The implementation provides a revocable attachment mechanism using
> dma-buf move operations. MMIO regions are normally pinned as BARs
> don't change physical addresses, but access is revoked when the VFIO
> device is closed or a PCI reset is issued. This ensures kernel
> self-defense against potentially hostile userspace.

Let's enhance this:

Currently VFIO can take MMIO regions from the device's BAR and map
them into a PFNMAP VMA with special PTEs. This mapping type ensures
the memory cannot be used with things like pin_user_pages(), hmm, and
so on. In practice only the user process CPU and KVM can safely make
use of these VMA. When VFIO shuts down these VMAs are cleaned by
unmap_mapping_range() to prevent any UAF of the MMIO beyond driver
unbind.

However, VFIO type 1 has an insecure behavior where it uses
follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back
into the IOMMU. This has a long history of enabling P2P DMA inside
VMs, but has serious lifetime problems by allowing a UAF of the MMIO
after the VFIO driver has been unbound.

Introduce DMABUF as a new safe way to export a FD based handle for the
MMIO regions. This can be consumed by existing DMABUF importers like
RDMA or DRM without opening an UAF. A following series will add an
importer to iommufd to obsolete the type 1 code and allow safe
UAF-free MMIO P2P in VM cases.

DMABUF has a built in synchronous invalidation mechanism called
move_notify. VFIO keeps track of all drivers importing its MMIO and
can invoke a synchronous invalidation callback to tell the importing
drivers to DMA unmap and forget about the MMIO pfns. This process is
being called revoke. This synchronous invalidation fully prevents any
lifecycle problems. VFIO will do this before unbinding its driver
ensuring there is no UAF of the MMIO beyond the driver lifecycle.

Further, VFIO has additional behavior to block access to the MMIO
during things like Function Level Reset. This is because some poor
platforms may experience a MCE type crash when touching MMIO of a PCI
device that is undergoing a reset. Today this is done by using
unmap_mapping_range() on the VMAs. Extend that into the DMABUF world
and temporarily revoke the MMIO from the DMABUF importers during FLR
as well. This will more robustly prevent an errant P2P from possibly
upsetting the platform.

A DMABUF FD is a prefered handle for MMIO compared to using something
like a pgmap because:
 - VFIO is supported, including its P2P feature, on archs that don't
   support pgmap
 - PCI devices have all sorts of BAR sizes, including ones smaller
   than a section so a pgmap cannot always be created
 - It is undesirable to waste alot of memory for struct pages,
   especially for a case like a GPU with ~100GB of BAR size
 - We want a synchronous revoke semantic to support FLR with light
   hardware requirements

Use the P2P subsystem to help generate the DMA mapping. This is a
significant upgrade over the abuse of dma_map_resource() that has
historically been used by DMABUF exporters. Experience with an OOT
version of this patch shows that real systems do need this. This
approach deals with all the P2P scenarios:
 - Non-zero PCI bus_offset
 - ACS flags routing traffic to the IOMMU
 - ACS flags that bypass the IOMMU - though vfio noiommu is required
   to hit this.

There will be further work to formalize the revoke semantic in
DMABUF. For now this acts like a move_notify dynamic exporter where
importer fault handling will get a failure when they attempt to map.
This means that only fully restartable fault capable importers can
import the VFIO DMABUFs. A future revoke semantic should open this up
to more HW as the HW only needs to invalidate, not handle restartable
faults.

Jason
Shuai Xue Oct. 26, 2025, 7:55 a.m. UTC | #15
在 2025/10/22 20:50, Jason Gunthorpe 写道:
> On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
>> From: Leon Romanovsky <leonro@nvidia.com>
>>
>> Add support for exporting PCI device MMIO regions through dma-buf,
>> enabling safe sharing of non-struct page memory with controlled
>> lifetime management. This allows RDMA and other subsystems to import
>> dma-buf FDs and build them into memory regions for PCI P2P operations.
>>
>> The implementation provides a revocable attachment mechanism using
>> dma-buf move operations. MMIO regions are normally pinned as BARs
>> don't change physical addresses, but access is revoked when the VFIO
>> device is closed or a PCI reset is issued. This ensures kernel
>> self-defense against potentially hostile userspace.
> 
> Let's enhance this:
> 
> Currently VFIO can take MMIO regions from the device's BAR and map
> them into a PFNMAP VMA with special PTEs. This mapping type ensures
> the memory cannot be used with things like pin_user_pages(), hmm, and
> so on. In practice only the user process CPU and KVM can safely make
> use of these VMA. When VFIO shuts down these VMAs are cleaned by
> unmap_mapping_range() to prevent any UAF of the MMIO beyond driver
> unbind.
> 
> However, VFIO type 1 has an insecure behavior where it uses
> follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back
> into the IOMMU. This has a long history of enabling P2P DMA inside
> VMs, but has serious lifetime problems by allowing a UAF of the MMIO
> after the VFIO driver has been unbound.

Hi, Jason,

Can you elaborate on this more?

 From my understanding of the VFIO type 1 implementation:

- When a device is opened through VFIO type 1, it increments the
   device->refcount
- During unbind, the driver waits for this refcount to drop to zero via
   wait_for_completion(&device->comp)
- This should prevent the unbind() from completing while the device is
   still in use

Given this refcount mechanism, I do not figure out how the UAF can
occur.

Thanks.
Jason Gunthorpe Oct. 27, 2025, 12:09 p.m. UTC | #16
On Sun, Oct 26, 2025 at 03:55:04PM +0800, Shuai Xue wrote:
> 
> 
> 在 2025/10/22 20:50, Jason Gunthorpe 写道:
> > On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
> > > From: Leon Romanovsky <leonro@nvidia.com>
> > > 
> > > Add support for exporting PCI device MMIO regions through dma-buf,
> > > enabling safe sharing of non-struct page memory with controlled
> > > lifetime management. This allows RDMA and other subsystems to import
> > > dma-buf FDs and build them into memory regions for PCI P2P operations.
> > > 
> > > The implementation provides a revocable attachment mechanism using
> > > dma-buf move operations. MMIO regions are normally pinned as BARs
> > > don't change physical addresses, but access is revoked when the VFIO
> > > device is closed or a PCI reset is issued. This ensures kernel
> > > self-defense against potentially hostile userspace.
> > 
> > Let's enhance this:
> > 
> > Currently VFIO can take MMIO regions from the device's BAR and map
> > them into a PFNMAP VMA with special PTEs. This mapping type ensures
> > the memory cannot be used with things like pin_user_pages(), hmm, and
> > so on. In practice only the user process CPU and KVM can safely make
> > use of these VMA. When VFIO shuts down these VMAs are cleaned by
> > unmap_mapping_range() to prevent any UAF of the MMIO beyond driver
> > unbind.
> > 
> > However, VFIO type 1 has an insecure behavior where it uses
> > follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back
> > into the IOMMU. This has a long history of enabling P2P DMA inside
> > VMs, but has serious lifetime problems by allowing a UAF of the MMIO
> > after the VFIO driver has been unbound.
> 
> Hi, Jason,
> 
> Can you elaborate on this more?
> 
> From my understanding of the VFIO type 1 implementation:
> 
> - When a device is opened through VFIO type 1, it increments the
>   device->refcount
> - During unbind, the driver waits for this refcount to drop to zero via
>   wait_for_completion(&device->comp)
> - This should prevent the unbind() from completing while the device is
>   still in use
> 
> Given this refcount mechanism, I do not figure out how the UAF can
> occur.

A second vfio device can be opened and then use follow_pfnmap_*() to
read the first vfio device's PTEs. There is no relationship betweent
the first and second VFIO devices, so once the first is unbound it
sails through the device->comp while the second device retains the PFN
in its type1 iommu_domain.

Jason
David Matlack Oct. 27, 2025, 11:13 p.m. UTC | #17
On Mon, Oct 13, 2025 at 8:44 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> From: Leon Romanovsky <leonro@nvidia.com>
>
> Add support for exporting PCI device MMIO regions through dma-buf,
> enabling safe sharing of non-struct page memory with controlled
> lifetime management. This allows RDMA and other subsystems to import
> dma-buf FDs and build them into memory regions for PCI P2P operations.

> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> + * regions selected.
> + *
> + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
> + * etc. offset/length specify a slice of the region to create the dmabuf from.
> + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
> + *
> + * Return: The fd number on success, -1 and errno is set on failure.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> +
> +struct vfio_region_dma_range {
> +       __u64 offset;
> +       __u64 length;
> +};
> +
> +struct vfio_device_feature_dma_buf {
> +       __u32   region_index;
> +       __u32   open_flags;
> +       __u32   flags;
> +       __u32   nr_ranges;
> +       struct vfio_region_dma_range dma_ranges[];
> +};

This uAPI would be a good candidate for a VFIO selftest. You can test
that it returns an error when it's supposed to, and a valid fd when
it's supposed to. And once the iommufd importer side is ready, we can
extend the test and verify that the fd can be mapped into iommufd.

It will probably be challenging to meaningfully exercise device P2P
through a selftest, I haven't thought about how to extend the driver
framework for that yet... But you can at least test that all the
ioctls behave like they should.
Leon Romanovsky Oct. 28, 2025, 12:02 p.m. UTC | #18
On Mon, Oct 27, 2025 at 04:13:05PM -0700, David Matlack wrote:
> On Mon, Oct 13, 2025 at 8:44 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > Add support for exporting PCI device MMIO regions through dma-buf,
> > enabling safe sharing of non-struct page memory with controlled
> > lifetime management. This allows RDMA and other subsystems to import
> > dma-buf FDs and build them into memory regions for PCI P2P operations.
> 
> > +/**
> > + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> > + * regions selected.
> > + *
> > + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
> > + * etc. offset/length specify a slice of the region to create the dmabuf from.
> > + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
> > + *
> > + * Return: The fd number on success, -1 and errno is set on failure.
> > + */
> > +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> > +
> > +struct vfio_region_dma_range {
> > +       __u64 offset;
> > +       __u64 length;
> > +};
> > +
> > +struct vfio_device_feature_dma_buf {
> > +       __u32   region_index;
> > +       __u32   open_flags;
> > +       __u32   flags;
> > +       __u32   nr_ranges;
> > +       struct vfio_region_dma_range dma_ranges[];
> > +};
> 
> This uAPI would be a good candidate for a VFIO selftest. You can test
> that it returns an error when it's supposed to, and a valid fd when
> it's supposed to. And once the iommufd importer side is ready, we can
> extend the test and verify that the fd can be mapped into iommufd.

No problem, I'll add such test, but let's focus on making sure that this
series is accepted first.

Thanks
Shuai Xue Oct. 28, 2025, 1:46 p.m. UTC | #19
在 2025/10/27 20:09, Jason Gunthorpe 写道:
> On Sun, Oct 26, 2025 at 03:55:04PM +0800, Shuai Xue wrote:
>>
>>
>> 在 2025/10/22 20:50, Jason Gunthorpe 写道:
>>> On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
>>>> From: Leon Romanovsky <leonro@nvidia.com>
>>>>
>>>> Add support for exporting PCI device MMIO regions through dma-buf,
>>>> enabling safe sharing of non-struct page memory with controlled
>>>> lifetime management. This allows RDMA and other subsystems to import
>>>> dma-buf FDs and build them into memory regions for PCI P2P operations.
>>>>
>>>> The implementation provides a revocable attachment mechanism using
>>>> dma-buf move operations. MMIO regions are normally pinned as BARs
>>>> don't change physical addresses, but access is revoked when the VFIO
>>>> device is closed or a PCI reset is issued. This ensures kernel
>>>> self-defense against potentially hostile userspace.
>>>
>>> Let's enhance this:
>>>
>>> Currently VFIO can take MMIO regions from the device's BAR and map
>>> them into a PFNMAP VMA with special PTEs. This mapping type ensures
>>> the memory cannot be used with things like pin_user_pages(), hmm, and
>>> so on. In practice only the user process CPU and KVM can safely make
>>> use of these VMA. When VFIO shuts down these VMAs are cleaned by
>>> unmap_mapping_range() to prevent any UAF of the MMIO beyond driver
>>> unbind.
>>>
>>> However, VFIO type 1 has an insecure behavior where it uses
>>> follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back
>>> into the IOMMU. This has a long history of enabling P2P DMA inside
>>> VMs, but has serious lifetime problems by allowing a UAF of the MMIO
>>> after the VFIO driver has been unbound.
>>
>> Hi, Jason,
>>
>> Can you elaborate on this more?
>>
>>  From my understanding of the VFIO type 1 implementation:
>>
>> - When a device is opened through VFIO type 1, it increments the
>>    device->refcount
>> - During unbind, the driver waits for this refcount to drop to zero via
>>    wait_for_completion(&device->comp)
>> - This should prevent the unbind() from completing while the device is
>>    still in use
>>
>> Given this refcount mechanism, I do not figure out how the UAF can
>> occur.
> 
> A second vfio device can be opened and then use follow_pfnmap_*() to
> read the first vfio device's PTEs. There is no relationship betweent
> the first and second VFIO devices, so once the first is unbound it
> sails through the device->comp while the second device retains the PFN
> in its type1 iommu_domain.
> 
> Jason

I see.

Thanks.

Best Regard,
Shuai
Leon Romanovsky Oct. 29, 2025, 6:21 p.m. UTC | #20
On Wed, Oct 29, 2025, at 18:50, Alex Mastro wrote:
> On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote:
>> +	/*
>> +	 * dma_buf_fd() consumes the reference, when the file closes the dmabuf
>> +	 * will be released.
>> +	 */
>> +	return dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
>
> I think this still needs to unwind state on fd allocation error. Reference
> ownership is only transferred on success.

Yes, you are correct, i need to call to dma_buf_put() in case of error. I will fix.

Thanks

>
>> +
>> +err_dev_put:
>> +	vfio_device_put_registration(&vdev->vdev);
>> +err_free_phys:
>> +	kfree(priv->phys_vec);
>> +err_free_priv:
>> +	kfree(priv);
>> +err_free_ranges:
>> +	kfree(dma_ranges);
>> +	return ret;
>> +}
Samiullah Khawaja Oct. 30, 2025, 12:25 a.m. UTC | #21
On Mon, Oct 13, 2025 at 8:27 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> From: Leon Romanovsky <leonro@nvidia.com>
>
> Add support for exporting PCI device MMIO regions through dma-buf,
> enabling safe sharing of non-struct page memory with controlled
> lifetime management. This allows RDMA and other subsystems to import
> dma-buf FDs and build them into memory regions for PCI P2P operations.
>
> The implementation provides a revocable attachment mechanism using
> dma-buf move operations. MMIO regions are normally pinned as BARs
> don't change physical addresses, but access is revoked when the VFIO
> device is closed or a PCI reset is issued. This ensures kernel
> self-defense against potentially hostile userspace.
>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  drivers/vfio/pci/Kconfig           |   3 +
>  drivers/vfio/pci/Makefile          |   2 +
>  drivers/vfio/pci/vfio_pci_config.c |  22 +-
>  drivers/vfio/pci/vfio_pci_core.c   |  28 ++
>  drivers/vfio/pci/vfio_pci_dmabuf.c | 446 +++++++++++++++++++++++++++++
>  drivers/vfio/pci/vfio_pci_priv.h   |  23 ++
>  include/linux/vfio_pci_core.h      |   1 +
>  include/uapi/linux/vfio.h          |  25 ++
>  8 files changed, 546 insertions(+), 4 deletions(-)
>  create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c
>
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 2b0172f54665..2b9fca00e9e8 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM
>
>           To enable s390x KVM vfio-pci extensions, say Y.
>
> +config VFIO_PCI_DMABUF
> +       def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER
> +
>  source "drivers/vfio/pci/mlx5/Kconfig"
>
>  source "drivers/vfio/pci/hisilicon/Kconfig"
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index cf00c0a7e55c..f9155e9c5f63 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -2,7 +2,9 @@
>
>  vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
>  vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
> +
>  obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
> +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
>
>  vfio-pci-y := vfio_pci.o
>  vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
> diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
> index 8f02f236b5b4..1f6008eabf23 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
>                 virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
>                 new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
>
> -               if (!new_mem)
> +               if (!new_mem) {
>                         vfio_pci_zap_and_down_write_memory_lock(vdev);
> -               else
> +                       vfio_pci_dma_buf_move(vdev, true);
> +               } else {
>                         down_write(&vdev->memory_lock);
> +               }
>
>                 /*
>                  * If the user is writing mem/io enable (new_mem/io) and we
> @@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
>                 *virt_cmd &= cpu_to_le16(~mask);
>                 *virt_cmd |= cpu_to_le16(new_cmd & mask);
>
> +               if (__vfio_pci_memory_enabled(vdev))
> +                       vfio_pci_dma_buf_move(vdev, false);
>                 up_write(&vdev->memory_lock);
>         }
>
> @@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
>  static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev,
>                                           pci_power_t state)
>  {
> -       if (state >= PCI_D3hot)
> +       if (state >= PCI_D3hot) {
>                 vfio_pci_zap_and_down_write_memory_lock(vdev);
> -       else
> +               vfio_pci_dma_buf_move(vdev, true);
> +       } else {
>                 down_write(&vdev->memory_lock);
> +       }
>
>         vfio_pci_set_power_state(vdev, state);
> +       if (__vfio_pci_memory_enabled(vdev))
> +               vfio_pci_dma_buf_move(vdev, false);
>         up_write(&vdev->memory_lock);
>  }
>
> @@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos,
>
>                 if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) {
>                         vfio_pci_zap_and_down_write_memory_lock(vdev);
> +                       vfio_pci_dma_buf_move(vdev, true);
>                         pci_try_reset_function(vdev->pdev);
> +                       if (__vfio_pci_memory_enabled(vdev))
> +                               vfio_pci_dma_buf_move(vdev, false);
>                         up_write(&vdev->memory_lock);
>                 }
>         }
> @@ -982,7 +993,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos,
>
>                 if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) {
>                         vfio_pci_zap_and_down_write_memory_lock(vdev);
> +                       vfio_pci_dma_buf_move(vdev, true);
>                         pci_try_reset_function(vdev->pdev);
> +                       if (__vfio_pci_memory_enabled(vdev))
> +                               vfio_pci_dma_buf_move(vdev, false);
>                         up_write(&vdev->memory_lock);
>                 }
>         }
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index fe247d0e2831..56b1320238a9 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -287,6 +287,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
>          * semaphore.
>          */
>         vfio_pci_zap_and_down_write_memory_lock(vdev);
> +       vfio_pci_dma_buf_move(vdev, true);
> +
>         if (vdev->pm_runtime_engaged) {
>                 up_write(&vdev->memory_lock);
>                 return -EINVAL;
> @@ -370,6 +372,8 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
>          */
>         down_write(&vdev->memory_lock);
>         __vfio_pci_runtime_pm_exit(vdev);
> +       if (__vfio_pci_memory_enabled(vdev))
> +               vfio_pci_dma_buf_move(vdev, false);
>         up_write(&vdev->memory_lock);
>  }
>
> @@ -690,6 +694,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
>  #endif
>         vfio_pci_core_disable(vdev);
>
> +       vfio_pci_dma_buf_cleanup(vdev);
> +
>         mutex_lock(&vdev->igate);
>         if (vdev->err_trigger) {
>                 eventfd_ctx_put(vdev->err_trigger);
> @@ -1222,7 +1228,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
>          */
>         vfio_pci_set_power_state(vdev, PCI_D0);
>
> +       vfio_pci_dma_buf_move(vdev, true);
>         ret = pci_try_reset_function(vdev->pdev);
> +       if (__vfio_pci_memory_enabled(vdev))
> +               vfio_pci_dma_buf_move(vdev, false);
>         up_write(&vdev->memory_lock);
>
>         return ret;
> @@ -1511,6 +1520,19 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>                 return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
>         case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>                 return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> +       case VFIO_DEVICE_FEATURE_DMA_BUF:
> +               if (device->ops->ioctl != vfio_pci_core_ioctl)
> +                       /*
> +                        * Devices that overwrite general .ioctl() callback
> +                        * usually do it to implement their own
> +                        * VFIO_DEVICE_GET_REGION_INFO handlerm and they present
> +                        * different BAR information from the real PCI.
> +                        *
> +                        * DMABUF relies on real PCI information.
> +                        */
> +                       return -EOPNOTSUPP;
> +
> +               return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
>         default:
>                 return -ENOTTY;
>         }
> @@ -2095,6 +2117,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
>         ret = pcim_p2pdma_init(vdev->pdev);
>         if (ret != -EOPNOTSUPP)
>                 return ret;
> +       INIT_LIST_HEAD(&vdev->dmabufs);
>         init_rwsem(&vdev->memory_lock);
>         xa_init(&vdev->ctx);
>
> @@ -2459,6 +2482,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>                         break;
>                 }
>
> +               vfio_pci_dma_buf_move(vdev, true);
>                 vfio_pci_zap_bars(vdev);
>         }
>
> @@ -2482,6 +2506,10 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>
>         ret = pci_reset_bus(pdev);
>
> +       list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> +               if (__vfio_pci_memory_enabled(vdev))
> +                       vfio_pci_dma_buf_move(vdev, false);
> +
>         vdev = list_last_entry(&dev_set->device_list,
>                                struct vfio_pci_core_device, vdev.dev_set_list);
>
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> new file mode 100644
> index 000000000000..eaba010777f3
> --- /dev/null
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -0,0 +1,446 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
> + */
> +#include <linux/dma-buf.h>
> +#include <linux/pci-p2pdma.h>
> +#include <linux/dma-resv.h>
> +
> +#include "vfio_pci_priv.h"
> +
> +MODULE_IMPORT_NS("DMA_BUF");
> +
> +struct vfio_pci_dma_buf {
> +       struct dma_buf *dmabuf;
> +       struct vfio_pci_core_device *vdev;
> +       struct list_head dmabufs_elm;
> +       size_t size;
> +       struct phys_vec *phys_vec;
> +       struct p2pdma_provider *provider;
> +       u32 nr_ranges;
> +       u8 revoked : 1;
> +};
> +
> +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> +                                  struct dma_buf_attachment *attachment)
> +{
> +       struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> +       if (!attachment->peer2peer)
> +               return -EOPNOTSUPP;
> +
> +       if (priv->revoked)
> +               return -ENODEV;
> +
> +       switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) {
> +       case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
> +               break;
> +       case PCI_P2PDMA_MAP_BUS_ADDR:
> +               /*
> +                * There is no need in IOVA at all for this flow.
> +                * We rely on attachment->priv == NULL as a marker
> +                * for this mode.
> +                */
> +               return 0;
> +       default:
> +               return -EINVAL;
> +       }
> +
> +       attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL);
> +       if (!attachment->priv)
> +               return -ENOMEM;
> +
> +       dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->size);
> +       return 0;
> +}
> +
> +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf,
> +                                   struct dma_buf_attachment *attachment)
> +{
> +       kfree(attachment->priv);
> +}
> +
> +static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, u64 length,
> +                                        dma_addr_t addr)
> +{
> +       unsigned int len, nents;
> +       int i;
> +
> +       nents = DIV_ROUND_UP(length, UINT_MAX);
> +       for (i = 0; i < nents; i++) {
> +               len = min_t(u64, length, UINT_MAX);
> +               length -= len;
> +               /*
> +                * Follow the DMABUF rules for scatterlist, the struct page can
> +                * be NULL'd for MMIO only memory.
> +                */
> +               sg_set_page(sgl, NULL, len, 0);
> +               sg_dma_address(sgl) = addr + i * UINT_MAX;
> +               sg_dma_len(sgl) = len;
> +               sgl = sg_next(sgl);
> +       }
> +
> +       return sgl;
> +}
> +
> +static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv,
> +                                 struct dma_iova_state *state)
> +{
> +       struct phys_vec *phys_vec = priv->phys_vec;
> +       unsigned int nents = 0;
> +       u32 i;
> +
> +       if (!state || !dma_use_iova(state))
> +               for (i = 0; i < priv->nr_ranges; i++)
> +                       nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX);
> +       else
> +               /*
> +                * In IOVA case, there is only one SG entry which spans
> +                * for whole IOVA address space, but we need to make sure
> +                * that it fits sg->length, maybe we need more.
> +                */
> +               nents = DIV_ROUND_UP(priv->size, UINT_MAX);
> +
> +       return nents;
> +}
> +
> +static struct sg_table *
> +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> +                    enum dma_data_direction dir)
> +{
> +       struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +       struct dma_iova_state *state = attachment->priv;
> +       struct phys_vec *phys_vec = priv->phys_vec;
> +       unsigned long attrs = DMA_ATTR_MMIO;
> +       unsigned int nents, mapped_len = 0;
> +       struct scatterlist *sgl;
> +       struct sg_table *sgt;
> +       dma_addr_t addr;
> +       int ret;
> +       u32 i;
> +
> +       dma_resv_assert_held(priv->dmabuf->resv);
> +
> +       if (priv->revoked)
> +               return ERR_PTR(-ENODEV);
> +
> +       sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> +       if (!sgt)
> +               return ERR_PTR(-ENOMEM);
> +
> +       nents = calc_sg_nents(priv, state);
> +       ret = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
> +       if (ret)
> +               goto err_kfree_sgt;
> +
> +       sgl = sgt->sgl;
> +
> +       for (i = 0; i < priv->nr_ranges; i++) {
> +               if (!state) {
> +                       addr = pci_p2pdma_bus_addr_map(priv->provider,
> +                                                      phys_vec[i].paddr);
> +               } else if (dma_use_iova(state)) {
> +                       ret = dma_iova_link(attachment->dev, state,
> +                                           phys_vec[i].paddr, 0,
> +                                           phys_vec[i].len, dir, attrs);
> +                       if (ret)
> +                               goto err_unmap_dma;
> +
> +                       mapped_len += phys_vec[i].len;
> +               } else {
> +                       addr = dma_map_phys(attachment->dev, phys_vec[i].paddr,
> +                                           phys_vec[i].len, dir, attrs);
> +                       ret = dma_mapping_error(attachment->dev, addr);
> +                       if (ret)
> +                               goto err_unmap_dma;
> +               }
> +
> +               if (!state || !dma_use_iova(state))
> +                       sgl = fill_sg_entry(sgl, phys_vec[i].len, addr);
> +       }
> +
> +       if (state && dma_use_iova(state)) {
> +               WARN_ON_ONCE(mapped_len != priv->size);
> +               ret = dma_iova_sync(attachment->dev, state, 0, mapped_len);
> +               if (ret)
> +                       goto err_unmap_dma;
> +               sgl = fill_sg_entry(sgl, mapped_len, state->addr);
> +       }
> +
> +       /*
> +        * SGL must be NULL to indicate that SGL is the last one
> +        * and we allocated correct number of entries in sg_alloc_table()
> +        */
> +       WARN_ON_ONCE(sgl);
> +       return sgt;
> +
> +err_unmap_dma:
> +       if (!i || !state)
> +               ; /* Do nothing */
> +       else if (dma_use_iova(state))
> +               dma_iova_destroy(attachment->dev, state, mapped_len, dir,
> +                                attrs);
> +       else
> +               for_each_sgtable_dma_sg(sgt, sgl, i)
> +                       dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> +                                       sg_dma_len(sgl), dir, attrs);
> +       sg_free_table(sgt);
> +err_kfree_sgt:
> +       kfree(sgt);
> +       return ERR_PTR(ret);
> +}
> +
> +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> +                                  struct sg_table *sgt,
> +                                  enum dma_data_direction dir)
> +{
> +       struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +       struct dma_iova_state *state = attachment->priv;
> +       unsigned long attrs = DMA_ATTR_MMIO;
> +       struct scatterlist *sgl;
> +       int i;
> +
> +       if (!state)
> +               ; /* Do nothing */
> +       else if (dma_use_iova(state))
> +               dma_iova_destroy(attachment->dev, state, priv->size, dir,
> +                                attrs);
> +       else
> +               for_each_sgtable_dma_sg(sgt, sgl, i)
> +                       dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> +                                      sg_dma_len(sgl), dir, attrs);
> +
> +       sg_free_table(sgt);
> +       kfree(sgt);
> +}
> +
> +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> +{
> +       struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> +       /*
> +        * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
> +        * The refcount prevents both.
> +        */
> +       if (priv->vdev) {
> +               down_write(&priv->vdev->memory_lock);
> +               list_del_init(&priv->dmabufs_elm);
> +               up_write(&priv->vdev->memory_lock);
> +               vfio_device_put_registration(&priv->vdev->vdev);
> +       }
> +       kfree(priv->phys_vec);
> +       kfree(priv);
> +}
> +
> +static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
> +       .attach = vfio_pci_dma_buf_attach,
> +       .detach = vfio_pci_dma_buf_detach,
> +       .map_dma_buf = vfio_pci_dma_buf_map,
> +       .release = vfio_pci_dma_buf_release,
> +       .unmap_dma_buf = vfio_pci_dma_buf_unmap,
> +};
> +
> +static void dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
> +                                  struct vfio_device_feature_dma_buf *dma_buf,
> +                                  struct vfio_region_dma_range *dma_ranges,
> +                                  struct p2pdma_provider *provider)
> +{
> +       struct pci_dev *pdev = priv->vdev->pdev;
> +       phys_addr_t pci_start;
> +       u32 i;
> +
> +       pci_start = pci_resource_start(pdev, dma_buf->region_index);
> +       for (i = 0; i < dma_buf->nr_ranges; i++) {
> +               priv->phys_vec[i].len = dma_ranges[i].length;
> +               priv->phys_vec[i].paddr = pci_start + dma_ranges[i].offset;
> +               priv->size += priv->phys_vec[i].len;
> +       }
> +       priv->nr_ranges = dma_buf->nr_ranges;
> +       priv->provider = provider;
> +}
> +
> +static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
> +                                struct vfio_device_feature_dma_buf *dma_buf,
> +                                struct vfio_region_dma_range *dma_ranges,
> +                                struct p2pdma_provider **provider)
> +{
> +       struct pci_dev *pdev = vdev->pdev;
> +       u32 bar = dma_buf->region_index;
> +       resource_size_t bar_size;
> +       u64 length = 0, sum;
> +       u32 i;
> +
> +       if (dma_buf->flags)
> +               return -EINVAL;
> +       /*
> +        * For PCI the region_index is the BAR number like  everything else.
> +        */
> +       if (bar >= VFIO_PCI_ROM_REGION_INDEX)
> +               return -ENODEV;
> +
> +       *provider = pcim_p2pdma_provider(pdev, bar);
> +       if (!*provider)
> +               return -EINVAL;
> +
> +       bar_size = pci_resource_len(pdev, bar);
> +       for (i = 0; i < dma_buf->nr_ranges; i++) {
> +               u64 offset = dma_ranges[i].offset;
> +               u64 len = dma_ranges[i].length;
> +
> +               if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> +                       return -EINVAL;
> +
> +               if (check_add_overflow(offset, len, &sum) || sum > bar_size)
> +                       return -EINVAL;
> +
> +               /* Total requested length can't overflow IOVA size */
> +               if (check_add_overflow(length, len, &sum))
> +                       return -EINVAL;
> +
> +               length = sum;
> +       }
> +
> +       /*
> +        * DMA API uses size_t, so make sure that requested region length
> +        * can fit into size_t variable, which can be unsigned int (32bits).
> +        *
> +        * In addition make sure that high bit of total length is not used too
> +        * as it is used as a marker for DMA IOVA API.
> +        */
> +       if (overflows_type(length, size_t) || length & DMA_IOVA_USE_SWIOTLB)
> +               return -EINVAL;
> +
> +       return 0;
> +}
> +
> +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> +                                 struct vfio_device_feature_dma_buf __user *arg,
> +                                 size_t argsz)
> +{
> +       struct vfio_device_feature_dma_buf get_dma_buf = {};
> +       struct vfio_region_dma_range *dma_ranges;
> +       DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> +       struct p2pdma_provider *provider;
> +       struct vfio_pci_dma_buf *priv;
> +       int ret;
> +
> +       ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> +                                sizeof(get_dma_buf));
> +       if (ret != 1)
> +               return ret;
> +
> +       if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
> +               return -EFAULT;
> +
> +       if (!get_dma_buf.nr_ranges)
> +               return -EINVAL;
> +
> +       dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges,
> +                                      sizeof(*dma_ranges));
> +       if (IS_ERR(dma_ranges))
> +               return PTR_ERR(dma_ranges);
> +
> +       ret = validate_dmabuf_input(vdev, &get_dma_buf, dma_ranges, &provider);
> +       if (ret)
> +               goto err_free_ranges;
> +
> +       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> +       if (!priv) {
> +               ret = -ENOMEM;
> +               goto err_free_ranges;
> +       }
> +       priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec),
> +                                GFP_KERNEL);
> +       if (!priv->phys_vec) {
> +               ret = -ENOMEM;
> +               goto err_free_priv;
> +       }
> +
> +       priv->vdev = vdev;
> +       dma_ranges_to_p2p_phys(priv, &get_dma_buf, dma_ranges, provider);
> +       kfree(dma_ranges);
> +       dma_ranges = NULL;
> +
> +       if (!vfio_device_try_get_registration(&vdev->vdev)) {
> +               ret = -ENODEV;
> +               goto err_free_phys;
> +       }
> +
> +       exp_info.ops = &vfio_pci_dmabuf_ops;
> +       exp_info.size = priv->size;
> +       exp_info.flags = get_dma_buf.open_flags;
> +       exp_info.priv = priv;
> +
> +       priv->dmabuf = dma_buf_export(&exp_info);
> +       if (IS_ERR(priv->dmabuf)) {
> +               ret = PTR_ERR(priv->dmabuf);
> +               goto err_dev_put;
> +       }
> +
> +       /* dma_buf_put() now frees priv */
> +       INIT_LIST_HEAD(&priv->dmabufs_elm);
> +       down_write(&vdev->memory_lock);
> +       dma_resv_lock(priv->dmabuf->resv, NULL);
> +       priv->revoked = !__vfio_pci_memory_enabled(vdev);
> +       list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
> +       dma_resv_unlock(priv->dmabuf->resv);
> +       up_write(&vdev->memory_lock);
> +
> +       /*
> +        * dma_buf_fd() consumes the reference, when the file closes the dmabuf
> +        * will be released.
> +        */
> +       return dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
> +
> +err_dev_put:
> +       vfio_device_put_registration(&vdev->vdev);
> +err_free_phys:
> +       kfree(priv->phys_vec);
> +err_free_priv:
> +       kfree(priv);
> +err_free_ranges:
> +       kfree(dma_ranges);
> +       return ret;
> +}
> +
> +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> +{
> +       struct vfio_pci_dma_buf *priv;
> +       struct vfio_pci_dma_buf *tmp;
> +
> +       lockdep_assert_held_write(&vdev->memory_lock);
> +
> +       list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> +               if (!get_file_active(&priv->dmabuf->file))
> +                       continue;
> +
> +               if (priv->revoked != revoked) {
> +                       dma_resv_lock(priv->dmabuf->resv, NULL);
> +                       priv->revoked = revoked;
> +                       dma_buf_move_notify(priv->dmabuf);

I think this should only be called when revoked is true, otherwise
this will be calling move_notify on the already revoked dmabuf
attachments.
> +                       dma_resv_unlock(priv->dmabuf->resv);
> +               }
> +               dma_buf_put(priv->dmabuf);
> +       }
> +}
> +
> +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> +{
> +       struct vfio_pci_dma_buf *priv;
> +       struct vfio_pci_dma_buf *tmp;
> +
> +       down_write(&vdev->memory_lock);
> +       list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> +               if (!get_file_active(&priv->dmabuf->file))
> +                       continue;
> +
> +               dma_resv_lock(priv->dmabuf->resv, NULL);
> +               list_del_init(&priv->dmabufs_elm);
> +               priv->vdev = NULL;
> +               priv->revoked = true;
> +               dma_buf_move_notify(priv->dmabuf);
> +               dma_resv_unlock(priv->dmabuf->resv);
> +               vfio_device_put_registration(&vdev->vdev);
> +               dma_buf_put(priv->dmabuf);
> +       }
> +       up_write(&vdev->memory_lock);
> +}
> diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
> index a9972eacb293..28a405f8b97c 100644
> --- a/drivers/vfio/pci/vfio_pci_priv.h
> +++ b/drivers/vfio/pci/vfio_pci_priv.h
> @@ -107,4 +107,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
>         return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
>  }
>
> +#ifdef CONFIG_VFIO_PCI_DMABUF
> +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> +                                 struct vfio_device_feature_dma_buf __user *arg,
> +                                 size_t argsz);
> +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
> +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
> +#else
> +static inline int
> +vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> +                             struct vfio_device_feature_dma_buf __user *arg,
> +                             size_t argsz)
> +{
> +       return -ENOTTY;
> +}
> +static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> +{
> +}
> +static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev,
> +                                        bool revoked)
> +{
> +}
> +#endif
> +
>  #endif
> diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
> index f541044e42a2..30d74b364f25 100644
> --- a/include/linux/vfio_pci_core.h
> +++ b/include/linux/vfio_pci_core.h
> @@ -94,6 +94,7 @@ struct vfio_pci_core_device {
>         struct vfio_pci_core_device     *sriov_pf_core_dev;
>         struct notifier_block   nb;
>         struct rw_semaphore     memory_lock;
> +       struct list_head        dmabufs;
>  };
>
>  /* Will be exported for vfio pci drivers usage */
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 75100bf009ba..63214467c875 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1478,6 +1478,31 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>
> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> + * regions selected.
> + *
> + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
> + * etc. offset/length specify a slice of the region to create the dmabuf from.
> + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
> + *
> + * Return: The fd number on success, -1 and errno is set on failure.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> +
> +struct vfio_region_dma_range {
> +       __u64 offset;
> +       __u64 length;
> +};
> +
> +struct vfio_device_feature_dma_buf {
> +       __u32   region_index;
> +       __u32   open_flags;
> +       __u32   flags;
> +       __u32   nr_ranges;
> +       struct vfio_region_dma_range dma_ranges[];
> +};
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>
>  /**
> --
> 2.51.0
>
>
Leon Romanovsky Oct. 30, 2025, 6:48 a.m. UTC | #22
On Wed, Oct 29, 2025 at 05:25:03PM -0700, Samiullah Khawaja wrote:
> On Mon, Oct 13, 2025 at 8:27 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > Add support for exporting PCI device MMIO regions through dma-buf,
> > enabling safe sharing of non-struct page memory with controlled
> > lifetime management. This allows RDMA and other subsystems to import
> > dma-buf FDs and build them into memory regions for PCI P2P operations.
> >
> > The implementation provides a revocable attachment mechanism using
> > dma-buf move operations. MMIO regions are normally pinned as BARs
> > don't change physical addresses, but access is revoked when the VFIO
> > device is closed or a PCI reset is issued. This ensures kernel
> > self-defense against potentially hostile userspace.
> >
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> >  drivers/vfio/pci/Kconfig           |   3 +
> >  drivers/vfio/pci/Makefile          |   2 +
> >  drivers/vfio/pci/vfio_pci_config.c |  22 +-
> >  drivers/vfio/pci/vfio_pci_core.c   |  28 ++
> >  drivers/vfio/pci/vfio_pci_dmabuf.c | 446 +++++++++++++++++++++++++++++
> >  drivers/vfio/pci/vfio_pci_priv.h   |  23 ++
> >  include/linux/vfio_pci_core.h      |   1 +
> >  include/uapi/linux/vfio.h          |  25 ++
> >  8 files changed, 546 insertions(+), 4 deletions(-)
> >  create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c

<...>

> > +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> > +{
> > +       struct vfio_pci_dma_buf *priv;
> > +       struct vfio_pci_dma_buf *tmp;
> > +
> > +       lockdep_assert_held_write(&vdev->memory_lock);
> > +
> > +       list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> > +               if (!get_file_active(&priv->dmabuf->file))
> > +                       continue;
> > +
> > +               if (priv->revoked != revoked) {
> > +                       dma_resv_lock(priv->dmabuf->resv, NULL);
> > +                       priv->revoked = revoked;
> > +                       dma_buf_move_notify(priv->dmabuf);
> 
> I think this should only be called when revoked is true, otherwise
> this will be calling move_notify on the already revoked dmabuf
> attachments.

This case is protected by "if (priv->revoked)" check both in vfio_pci_dma_buf_map
and vfio_pci_dma_buf_attach. They will prevent DMABUF recreation if revoked is false.

VTW, please trim your replies, it is time consuming to find your reply
among 600 lines of unrelated text.

Thanks

> > +                       dma_resv_unlock(priv->dmabuf->resv);
> > +               }
> > +               dma_buf_put(priv->dmabuf);
> > +       }
> > +}
Jason Gunthorpe Oct. 30, 2025, 12:57 p.m. UTC | #23
On Thu, Oct 30, 2025 at 08:48:18AM +0200, Leon Romanovsky wrote:
> > > +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> > > +{
> > > +       struct vfio_pci_dma_buf *priv;
> > > +       struct vfio_pci_dma_buf *tmp;
> > > +
> > > +       lockdep_assert_held_write(&vdev->memory_lock);
> > > +
> > > +       list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> > > +               if (!get_file_active(&priv->dmabuf->file))
> > > +                       continue;
> > > +
> > > +               if (priv->revoked != revoked) {
> > > +                       dma_resv_lock(priv->dmabuf->resv, NULL);
> > > +                       priv->revoked = revoked;
> > > +                       dma_buf_move_notify(priv->dmabuf);
> > 
> > I think this should only be called when revoked is true, otherwise
> > this will be calling move_notify on the already revoked dmabuf
> > attachments.
> 
> This case is protected by "if (priv->revoked)" check both in
> vfio_pci_dma_buf_map and vfio_pci_dma_buf_attach. They will prevent
> DMABUF recreation if revoked is false.

The point was to call it when revoked becomes false as well, as that
gives the importing driver an opportunity to restore any mapping it
had.

Jason
Alex Williamson Oct. 30, 2025, 8:38 p.m. UTC | #24
On Mon, 13 Oct 2025 18:26:11 +0300
Leon Romanovsky <leon@kernel.org> wrote:
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index fe247d0e2831..56b1320238a9 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1511,6 +1520,19 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  		return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
>  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>  		return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> +	case VFIO_DEVICE_FEATURE_DMA_BUF:
> +		if (device->ops->ioctl != vfio_pci_core_ioctl)
> +			/*
> +			 * Devices that overwrite general .ioctl() callback
> +			 * usually do it to implement their own
> +			 * VFIO_DEVICE_GET_REGION_INFO handlerm and they present

Typo, "handlerm"

> +			 * different BAR information from the real PCI.
> +			 *
> +			 * DMABUF relies on real PCI information.
> +			 */
> +			return -EOPNOTSUPP;
> +
> +		return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
>  	default:
>  		return -ENOTTY;
>  	}
...
> @@ -2459,6 +2482,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>  			break;
>  		}
>  
> +		vfio_pci_dma_buf_move(vdev, true);
>  		vfio_pci_zap_bars(vdev);
>  	}
>  
> @@ -2482,6 +2506,10 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>  
>  	ret = pci_reset_bus(pdev);
>  
> +	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> +		if (__vfio_pci_memory_enabled(vdev))
> +			vfio_pci_dma_buf_move(vdev, false);
> +
>  	vdev = list_last_entry(&dev_set->device_list,
>  			       struct vfio_pci_core_device, vdev.dev_set_list);
>  

This needs to be placed in the existing undo loop with the up_write(),
otherwise it can be missed in the error case.

> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> new file mode 100644
> index 000000000000..eaba010777f3
> --- /dev/null
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> +static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv,
> +				  struct dma_iova_state *state)
> +{
> +	struct phys_vec *phys_vec = priv->phys_vec;
> +	unsigned int nents = 0;
> +	u32 i;
> +
> +	if (!state || !dma_use_iova(state))
> +		for (i = 0; i < priv->nr_ranges; i++)
> +			nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX);
> +	else
> +		/*
> +		 * In IOVA case, there is only one SG entry which spans
> +		 * for whole IOVA address space, but we need to make sure
> +		 * that it fits sg->length, maybe we need more.
> +		 */
> +		nents = DIV_ROUND_UP(priv->size, UINT_MAX);

I think we're arguably running afoul of the coding style standard here
that this is not a single simple statement and should use braces.

> +
> +	return nents;
> +}
> +
> +static struct sg_table *
> +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> +		     enum dma_data_direction dir)
> +{
> +	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +	struct dma_iova_state *state = attachment->priv;
> +	struct phys_vec *phys_vec = priv->phys_vec;
> +	unsigned long attrs = DMA_ATTR_MMIO;
> +	unsigned int nents, mapped_len = 0;
> +	struct scatterlist *sgl;
> +	struct sg_table *sgt;
> +	dma_addr_t addr;
> +	int ret;
> +	u32 i;
> +
> +	dma_resv_assert_held(priv->dmabuf->resv);
> +
> +	if (priv->revoked)
> +		return ERR_PTR(-ENODEV);
> +
> +	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> +	if (!sgt)
> +		return ERR_PTR(-ENOMEM);
> +
> +	nents = calc_sg_nents(priv, state);
> +	ret = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
> +	if (ret)
> +		goto err_kfree_sgt;
> +
> +	sgl = sgt->sgl;
> +
> +	for (i = 0; i < priv->nr_ranges; i++) {
> +		if (!state) {
> +			addr = pci_p2pdma_bus_addr_map(priv->provider,
> +						       phys_vec[i].paddr);
> +		} else if (dma_use_iova(state)) {
> +			ret = dma_iova_link(attachment->dev, state,
> +					    phys_vec[i].paddr, 0,
> +					    phys_vec[i].len, dir, attrs);
> +			if (ret)
> +				goto err_unmap_dma;
> +
> +			mapped_len += phys_vec[i].len;
> +		} else {
> +			addr = dma_map_phys(attachment->dev, phys_vec[i].paddr,
> +					    phys_vec[i].len, dir, attrs);
> +			ret = dma_mapping_error(attachment->dev, addr);
> +			if (ret)
> +				goto err_unmap_dma;
> +		}
> +
> +		if (!state || !dma_use_iova(state))
> +			sgl = fill_sg_entry(sgl, phys_vec[i].len, addr);
> +	}
> +
> +	if (state && dma_use_iova(state)) {
> +		WARN_ON_ONCE(mapped_len != priv->size);
> +		ret = dma_iova_sync(attachment->dev, state, 0, mapped_len);
> +		if (ret)
> +			goto err_unmap_dma;
> +		sgl = fill_sg_entry(sgl, mapped_len, state->addr);
> +	}
> +
> +	/*
> +	 * SGL must be NULL to indicate that SGL is the last one
> +	 * and we allocated correct number of entries in sg_alloc_table()
> +	 */
> +	WARN_ON_ONCE(sgl);
> +	return sgt;
> +
> +err_unmap_dma:
> +	if (!i || !state)
> +		; /* Do nothing */
> +	else if (dma_use_iova(state))
> +		dma_iova_destroy(attachment->dev, state, mapped_len, dir,
> +				 attrs);
> +	else
> +		for_each_sgtable_dma_sg(sgt, sgl, i)
> +			dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> +					sg_dma_len(sgl), dir, attrs);

Same, here for braces.

> +	sg_free_table(sgt);
> +err_kfree_sgt:
> +	kfree(sgt);
> +	return ERR_PTR(ret);
> +}
> +
> +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> +				   struct sg_table *sgt,
> +				   enum dma_data_direction dir)
> +{
> +	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +	struct dma_iova_state *state = attachment->priv;
> +	unsigned long attrs = DMA_ATTR_MMIO;
> +	struct scatterlist *sgl;
> +	int i;
> +
> +	if (!state)
> +		; /* Do nothing */
> +	else if (dma_use_iova(state))
> +		dma_iova_destroy(attachment->dev, state, priv->size, dir,
> +				 attrs);
> +	else
> +		for_each_sgtable_dma_sg(sgt, sgl, i)
> +			dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> +				       sg_dma_len(sgl), dir, attrs);
> +

Here too.

> +	sg_free_table(sgt);
> +	kfree(sgt);
> +}
...
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 75100bf009ba..63214467c875 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1478,6 +1478,31 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>  
> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> + * regions selected.
> + *
> + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
> + * etc. offset/length specify a slice of the region to create the dmabuf from.
> + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
> + *

Probably worth noting that .flags should be zero, I see we enforce
that.  Thanks,

Alex

> + * Return: The fd number on success, -1 and errno is set on failure.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> +
> +struct vfio_region_dma_range {
> +	__u64 offset;
> +	__u64 length;
> +};
> +
> +struct vfio_device_feature_dma_buf {
> +	__u32	region_index;
> +	__u32	open_flags;
> +	__u32   flags;
> +	__u32   nr_ranges;
> +	struct vfio_region_dma_range dma_ranges[];
> +};
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**
Alex Mastro Oct. 30, 2025, 10:28 p.m. UTC | #25
> On Mon, Oct 27, 2025 at 04:13:05PM -0700, David Matlack wrote:
> > On Mon, Oct 13, 2025 at 8:44 AM Leon Romanovsky <leon@kernel.org> wrote:
> > This uAPI would be a good candidate for a VFIO selftest. You can test
> > that it returns an error when it's supposed to, and a valid fd when
> > it's supposed to. And once the iommufd importer side is ready, we can
> > extend the test and verify that the fd can be mapped into iommufd.
> 
> No problem, I'll add such test, but let's focus on making sure that this
> series is accepted first.

Along similar lines, perhaps adding vfio as src/vfio_memory.c dmabuf provider
to perftest [1] could be useful for low-level end-to-end validation.

https://github.com/linux-rdma/perftest
Leon Romanovsky Oct. 31, 2025, 6:48 a.m. UTC | #26
On Thu, Oct 30, 2025 at 02:38:36PM -0600, Alex Williamson wrote:
> On Mon, 13 Oct 2025 18:26:11 +0300
> Leon Romanovsky <leon@kernel.org> wrote:
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index fe247d0e2831..56b1320238a9 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -1511,6 +1520,19 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> >  		return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
> >  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
> >  		return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> > +	case VFIO_DEVICE_FEATURE_DMA_BUF:
> > +		if (device->ops->ioctl != vfio_pci_core_ioctl)
> > +			/*
> > +			 * Devices that overwrite general .ioctl() callback
> > +			 * usually do it to implement their own
> > +			 * VFIO_DEVICE_GET_REGION_INFO handlerm and they present
> 
> Typo, "handlerm"

Thanks, this part of code is going to be different in v6.

> 

<...>

> > @@ -2482,6 +2506,10 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> >  
> >  	ret = pci_reset_bus(pdev);
> >  
> > +	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> > +		if (__vfio_pci_memory_enabled(vdev))
> > +			vfio_pci_dma_buf_move(vdev, false);
> > +
> >  	vdev = list_last_entry(&dev_set->device_list,
> >  			       struct vfio_pci_core_device, vdev.dev_set_list);
> >  
> 
> This needs to be placed in the existing undo loop with the up_write(),
> otherwise it can be missed in the error case.

I'll move, but it caused me to wonder what did you want to achieve with
this "vdev = list_last_entry ..." line? vdev is overwritten immediately
after that line.

> 
> > diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > new file mode 100644
> > index 000000000000..eaba010777f3
> > --- /dev/null
> > +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > +static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv,
> > +				  struct dma_iova_state *state)
> > +{
> > +	struct phys_vec *phys_vec = priv->phys_vec;
> > +	unsigned int nents = 0;
> > +	u32 i;
> > +
> > +	if (!state || !dma_use_iova(state))
> > +		for (i = 0; i < priv->nr_ranges; i++)
> > +			nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX);
> > +	else
> > +		/*
> > +		 * In IOVA case, there is only one SG entry which spans
> > +		 * for whole IOVA address space, but we need to make sure
> > +		 * that it fits sg->length, maybe we need more.
> > +		 */
> > +		nents = DIV_ROUND_UP(priv->size, UINT_MAX);
> 
> I think we're arguably running afoul of the coding style standard here
> that this is not a single simple statement and should use braces.
> 

<...>

> > +err_unmap_dma:
> > +	if (!i || !state)
> > +		; /* Do nothing */
> > +	else if (dma_use_iova(state))
> > +		dma_iova_destroy(attachment->dev, state, mapped_len, dir,
> > +				 attrs);
> > +	else
> > +		for_each_sgtable_dma_sg(sgt, sgl, i)
> > +			dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> > +					sg_dma_len(sgl), dir, attrs);
> 
> Same, here for braces.
> 

<...>

> > +	if (!state)
> > +		; /* Do nothing */
> > +	else if (dma_use_iova(state))
> > +		dma_iova_destroy(attachment->dev, state, priv->size, dir,
> > +				 attrs);
> > +	else
> > +		for_each_sgtable_dma_sg(sgt, sgl, i)
> > +			dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> > +				       sg_dma_len(sgl), dir, attrs);
> > +
> 
> Here too.

I will change it, but it is worth to admit that I'm consistent in my
coding style.

> 
> > +	sg_free_table(sgt);
> > +	kfree(sgt);
> > +}
> ...
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 75100bf009ba..63214467c875 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -1478,6 +1478,31 @@ struct vfio_device_feature_bus_master {
> >  };
> >  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
> >  
> > +/**
> > + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> > + * regions selected.
> > + *
> > + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
> > + * etc. offset/length specify a slice of the region to create the dmabuf from.
> > + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
> > + *
> 
> Probably worth noting that .flags should be zero, I see we enforce
> that.  Thanks,

Added, thanks

> 
> Alex
>
diff mbox series

Patch

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 2b0172f54665..2b9fca00e9e8 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -55,6 +55,9 @@  config VFIO_PCI_ZDEV_KVM
 
 	  To enable s390x KVM vfio-pci extensions, say Y.
 
+config VFIO_PCI_DMABUF
+	def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER
+
 source "drivers/vfio/pci/mlx5/Kconfig"
 
 source "drivers/vfio/pci/hisilicon/Kconfig"
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index cf00c0a7e55c..f9155e9c5f63 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -2,7 +2,9 @@ 
 
 vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
+
 obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
+vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
 
 vfio-pci-y := vfio_pci.o
 vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 8f02f236b5b4..1f6008eabf23 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -589,10 +589,12 @@  static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
 		virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
 		new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
 
-		if (!new_mem)
+		if (!new_mem) {
 			vfio_pci_zap_and_down_write_memory_lock(vdev);
-		else
+			vfio_pci_dma_buf_move(vdev, true);
+		} else {
 			down_write(&vdev->memory_lock);
+		}
 
 		/*
 		 * If the user is writing mem/io enable (new_mem/io) and we
@@ -627,6 +629,8 @@  static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
 		*virt_cmd &= cpu_to_le16(~mask);
 		*virt_cmd |= cpu_to_le16(new_cmd & mask);
 
+		if (__vfio_pci_memory_enabled(vdev))
+			vfio_pci_dma_buf_move(vdev, false);
 		up_write(&vdev->memory_lock);
 	}
 
@@ -707,12 +711,16 @@  static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
 static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev,
 					  pci_power_t state)
 {
-	if (state >= PCI_D3hot)
+	if (state >= PCI_D3hot) {
 		vfio_pci_zap_and_down_write_memory_lock(vdev);
-	else
+		vfio_pci_dma_buf_move(vdev, true);
+	} else {
 		down_write(&vdev->memory_lock);
+	}
 
 	vfio_pci_set_power_state(vdev, state);
+	if (__vfio_pci_memory_enabled(vdev))
+		vfio_pci_dma_buf_move(vdev, false);
 	up_write(&vdev->memory_lock);
 }
 
@@ -900,7 +908,10 @@  static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos,
 
 		if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) {
 			vfio_pci_zap_and_down_write_memory_lock(vdev);
+			vfio_pci_dma_buf_move(vdev, true);
 			pci_try_reset_function(vdev->pdev);
+			if (__vfio_pci_memory_enabled(vdev))
+				vfio_pci_dma_buf_move(vdev, false);
 			up_write(&vdev->memory_lock);
 		}
 	}
@@ -982,7 +993,10 @@  static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos,
 
 		if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) {
 			vfio_pci_zap_and_down_write_memory_lock(vdev);
+			vfio_pci_dma_buf_move(vdev, true);
 			pci_try_reset_function(vdev->pdev);
+			if (__vfio_pci_memory_enabled(vdev))
+				vfio_pci_dma_buf_move(vdev, false);
 			up_write(&vdev->memory_lock);
 		}
 	}
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index fe247d0e2831..56b1320238a9 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -287,6 +287,8 @@  static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
 	 * semaphore.
 	 */
 	vfio_pci_zap_and_down_write_memory_lock(vdev);
+	vfio_pci_dma_buf_move(vdev, true);
+
 	if (vdev->pm_runtime_engaged) {
 		up_write(&vdev->memory_lock);
 		return -EINVAL;
@@ -370,6 +372,8 @@  static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
 	 */
 	down_write(&vdev->memory_lock);
 	__vfio_pci_runtime_pm_exit(vdev);
+	if (__vfio_pci_memory_enabled(vdev))
+		vfio_pci_dma_buf_move(vdev, false);
 	up_write(&vdev->memory_lock);
 }
 
@@ -690,6 +694,8 @@  void vfio_pci_core_close_device(struct vfio_device *core_vdev)
 #endif
 	vfio_pci_core_disable(vdev);
 
+	vfio_pci_dma_buf_cleanup(vdev);
+
 	mutex_lock(&vdev->igate);
 	if (vdev->err_trigger) {
 		eventfd_ctx_put(vdev->err_trigger);
@@ -1222,7 +1228,10 @@  static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
 	 */
 	vfio_pci_set_power_state(vdev, PCI_D0);
 
+	vfio_pci_dma_buf_move(vdev, true);
 	ret = pci_try_reset_function(vdev->pdev);
+	if (__vfio_pci_memory_enabled(vdev))
+		vfio_pci_dma_buf_move(vdev, false);
 	up_write(&vdev->memory_lock);
 
 	return ret;
@@ -1511,6 +1520,19 @@  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 		return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
 		return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
+	case VFIO_DEVICE_FEATURE_DMA_BUF:
+		if (device->ops->ioctl != vfio_pci_core_ioctl)
+			/*
+			 * Devices that overwrite general .ioctl() callback
+			 * usually do it to implement their own
+			 * VFIO_DEVICE_GET_REGION_INFO handlerm and they present
+			 * different BAR information from the real PCI.
+			 *
+			 * DMABUF relies on real PCI information.
+			 */
+			return -EOPNOTSUPP;
+
+		return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
 	default:
 		return -ENOTTY;
 	}
@@ -2095,6 +2117,7 @@  int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
 	ret = pcim_p2pdma_init(vdev->pdev);
 	if (ret != -EOPNOTSUPP)
 		return ret;
+	INIT_LIST_HEAD(&vdev->dmabufs);
 	init_rwsem(&vdev->memory_lock);
 	xa_init(&vdev->ctx);
 
@@ -2459,6 +2482,7 @@  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 			break;
 		}
 
+		vfio_pci_dma_buf_move(vdev, true);
 		vfio_pci_zap_bars(vdev);
 	}
 
@@ -2482,6 +2506,10 @@  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 
 	ret = pci_reset_bus(pdev);
 
+	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+		if (__vfio_pci_memory_enabled(vdev))
+			vfio_pci_dma_buf_move(vdev, false);
+
 	vdev = list_last_entry(&dev_set->device_list,
 			       struct vfio_pci_core_device, vdev.dev_set_list);
 
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
new file mode 100644
index 000000000000..eaba010777f3
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -0,0 +1,446 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ */
+#include <linux/dma-buf.h>
+#include <linux/pci-p2pdma.h>
+#include <linux/dma-resv.h>
+
+#include "vfio_pci_priv.h"
+
+MODULE_IMPORT_NS("DMA_BUF");
+
+struct vfio_pci_dma_buf {
+	struct dma_buf *dmabuf;
+	struct vfio_pci_core_device *vdev;
+	struct list_head dmabufs_elm;
+	size_t size;
+	struct phys_vec *phys_vec;
+	struct p2pdma_provider *provider;
+	u32 nr_ranges;
+	u8 revoked : 1;
+};
+
+static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
+				   struct dma_buf_attachment *attachment)
+{
+	struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+	if (!attachment->peer2peer)
+		return -EOPNOTSUPP;
+
+	if (priv->revoked)
+		return -ENODEV;
+
+	switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) {
+	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+		break;
+	case PCI_P2PDMA_MAP_BUS_ADDR:
+		/*
+		 * There is no need in IOVA at all for this flow.
+		 * We rely on attachment->priv == NULL as a marker
+		 * for this mode.
+		 */
+		return 0;
+	default:
+		return -EINVAL;
+	}
+
+	attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL);
+	if (!attachment->priv)
+		return -ENOMEM;
+
+	dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->size);
+	return 0;
+}
+
+static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf,
+				    struct dma_buf_attachment *attachment)
+{
+	kfree(attachment->priv);
+}
+
+static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, u64 length,
+					 dma_addr_t addr)
+{
+	unsigned int len, nents;
+	int i;
+
+	nents = DIV_ROUND_UP(length, UINT_MAX);
+	for (i = 0; i < nents; i++) {
+		len = min_t(u64, length, UINT_MAX);
+		length -= len;
+		/*
+		 * Follow the DMABUF rules for scatterlist, the struct page can
+		 * be NULL'd for MMIO only memory.
+		 */
+		sg_set_page(sgl, NULL, len, 0);
+		sg_dma_address(sgl) = addr + i * UINT_MAX;
+		sg_dma_len(sgl) = len;
+		sgl = sg_next(sgl);
+	}
+
+	return sgl;
+}
+
+static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv,
+				  struct dma_iova_state *state)
+{
+	struct phys_vec *phys_vec = priv->phys_vec;
+	unsigned int nents = 0;
+	u32 i;
+
+	if (!state || !dma_use_iova(state))
+		for (i = 0; i < priv->nr_ranges; i++)
+			nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX);
+	else
+		/*
+		 * In IOVA case, there is only one SG entry which spans
+		 * for whole IOVA address space, but we need to make sure
+		 * that it fits sg->length, maybe we need more.
+		 */
+		nents = DIV_ROUND_UP(priv->size, UINT_MAX);
+
+	return nents;
+}
+
+static struct sg_table *
+vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
+		     enum dma_data_direction dir)
+{
+	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+	struct dma_iova_state *state = attachment->priv;
+	struct phys_vec *phys_vec = priv->phys_vec;
+	unsigned long attrs = DMA_ATTR_MMIO;
+	unsigned int nents, mapped_len = 0;
+	struct scatterlist *sgl;
+	struct sg_table *sgt;
+	dma_addr_t addr;
+	int ret;
+	u32 i;
+
+	dma_resv_assert_held(priv->dmabuf->resv);
+
+	if (priv->revoked)
+		return ERR_PTR(-ENODEV);
+
+	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
+	if (!sgt)
+		return ERR_PTR(-ENOMEM);
+
+	nents = calc_sg_nents(priv, state);
+	ret = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
+	if (ret)
+		goto err_kfree_sgt;
+
+	sgl = sgt->sgl;
+
+	for (i = 0; i < priv->nr_ranges; i++) {
+		if (!state) {
+			addr = pci_p2pdma_bus_addr_map(priv->provider,
+						       phys_vec[i].paddr);
+		} else if (dma_use_iova(state)) {
+			ret = dma_iova_link(attachment->dev, state,
+					    phys_vec[i].paddr, 0,
+					    phys_vec[i].len, dir, attrs);
+			if (ret)
+				goto err_unmap_dma;
+
+			mapped_len += phys_vec[i].len;
+		} else {
+			addr = dma_map_phys(attachment->dev, phys_vec[i].paddr,
+					    phys_vec[i].len, dir, attrs);
+			ret = dma_mapping_error(attachment->dev, addr);
+			if (ret)
+				goto err_unmap_dma;
+		}
+
+		if (!state || !dma_use_iova(state))
+			sgl = fill_sg_entry(sgl, phys_vec[i].len, addr);
+	}
+
+	if (state && dma_use_iova(state)) {
+		WARN_ON_ONCE(mapped_len != priv->size);
+		ret = dma_iova_sync(attachment->dev, state, 0, mapped_len);
+		if (ret)
+			goto err_unmap_dma;
+		sgl = fill_sg_entry(sgl, mapped_len, state->addr);
+	}
+
+	/*
+	 * SGL must be NULL to indicate that SGL is the last one
+	 * and we allocated correct number of entries in sg_alloc_table()
+	 */
+	WARN_ON_ONCE(sgl);
+	return sgt;
+
+err_unmap_dma:
+	if (!i || !state)
+		; /* Do nothing */
+	else if (dma_use_iova(state))
+		dma_iova_destroy(attachment->dev, state, mapped_len, dir,
+				 attrs);
+	else
+		for_each_sgtable_dma_sg(sgt, sgl, i)
+			dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
+					sg_dma_len(sgl), dir, attrs);
+	sg_free_table(sgt);
+err_kfree_sgt:
+	kfree(sgt);
+	return ERR_PTR(ret);
+}
+
+static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
+				   struct sg_table *sgt,
+				   enum dma_data_direction dir)
+{
+	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+	struct dma_iova_state *state = attachment->priv;
+	unsigned long attrs = DMA_ATTR_MMIO;
+	struct scatterlist *sgl;
+	int i;
+
+	if (!state)
+		; /* Do nothing */
+	else if (dma_use_iova(state))
+		dma_iova_destroy(attachment->dev, state, priv->size, dir,
+				 attrs);
+	else
+		for_each_sgtable_dma_sg(sgt, sgl, i)
+			dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
+				       sg_dma_len(sgl), dir, attrs);
+
+	sg_free_table(sgt);
+	kfree(sgt);
+}
+
+static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
+{
+	struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+	/*
+	 * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
+	 * The refcount prevents both.
+	 */
+	if (priv->vdev) {
+		down_write(&priv->vdev->memory_lock);
+		list_del_init(&priv->dmabufs_elm);
+		up_write(&priv->vdev->memory_lock);
+		vfio_device_put_registration(&priv->vdev->vdev);
+	}
+	kfree(priv->phys_vec);
+	kfree(priv);
+}
+
+static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
+	.attach = vfio_pci_dma_buf_attach,
+	.detach = vfio_pci_dma_buf_detach,
+	.map_dma_buf = vfio_pci_dma_buf_map,
+	.release = vfio_pci_dma_buf_release,
+	.unmap_dma_buf = vfio_pci_dma_buf_unmap,
+};
+
+static void dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
+				   struct vfio_device_feature_dma_buf *dma_buf,
+				   struct vfio_region_dma_range *dma_ranges,
+				   struct p2pdma_provider *provider)
+{
+	struct pci_dev *pdev = priv->vdev->pdev;
+	phys_addr_t pci_start;
+	u32 i;
+
+	pci_start = pci_resource_start(pdev, dma_buf->region_index);
+	for (i = 0; i < dma_buf->nr_ranges; i++) {
+		priv->phys_vec[i].len = dma_ranges[i].length;
+		priv->phys_vec[i].paddr = pci_start + dma_ranges[i].offset;
+		priv->size += priv->phys_vec[i].len;
+	}
+	priv->nr_ranges = dma_buf->nr_ranges;
+	priv->provider = provider;
+}
+
+static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
+				 struct vfio_device_feature_dma_buf *dma_buf,
+				 struct vfio_region_dma_range *dma_ranges,
+				 struct p2pdma_provider **provider)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u32 bar = dma_buf->region_index;
+	resource_size_t bar_size;
+	u64 length = 0, sum;
+	u32 i;
+
+	if (dma_buf->flags)
+		return -EINVAL;
+	/*
+	 * For PCI the region_index is the BAR number like  everything else.
+	 */
+	if (bar >= VFIO_PCI_ROM_REGION_INDEX)
+		return -ENODEV;
+
+	*provider = pcim_p2pdma_provider(pdev, bar);
+	if (!*provider)
+		return -EINVAL;
+
+	bar_size = pci_resource_len(pdev, bar);
+	for (i = 0; i < dma_buf->nr_ranges; i++) {
+		u64 offset = dma_ranges[i].offset;
+		u64 len = dma_ranges[i].length;
+
+		if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+			return -EINVAL;
+
+		if (check_add_overflow(offset, len, &sum) || sum > bar_size)
+			return -EINVAL;
+
+		/* Total requested length can't overflow IOVA size */
+		if (check_add_overflow(length, len, &sum))
+			return -EINVAL;
+
+		length = sum;
+	}
+
+	/*
+	 * DMA API uses size_t, so make sure that requested region length
+	 * can fit into size_t variable, which can be unsigned int (32bits).
+	 *
+	 * In addition make sure that high bit of total length is not used too
+	 * as it is used as a marker for DMA IOVA API.
+	 */
+	if (overflows_type(length, size_t) || length & DMA_IOVA_USE_SWIOTLB)
+		return -EINVAL;
+
+	return 0;
+}
+
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+				  struct vfio_device_feature_dma_buf __user *arg,
+				  size_t argsz)
+{
+	struct vfio_device_feature_dma_buf get_dma_buf = {};
+	struct vfio_region_dma_range *dma_ranges;
+	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+	struct p2pdma_provider *provider;
+	struct vfio_pci_dma_buf *priv;
+	int ret;
+
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+				 sizeof(get_dma_buf));
+	if (ret != 1)
+		return ret;
+
+	if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
+		return -EFAULT;
+
+	if (!get_dma_buf.nr_ranges)
+		return -EINVAL;
+
+	dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges,
+				       sizeof(*dma_ranges));
+	if (IS_ERR(dma_ranges))
+		return PTR_ERR(dma_ranges);
+
+	ret = validate_dmabuf_input(vdev, &get_dma_buf, dma_ranges, &provider);
+	if (ret)
+		goto err_free_ranges;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv) {
+		ret = -ENOMEM;
+		goto err_free_ranges;
+	}
+	priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec),
+				 GFP_KERNEL);
+	if (!priv->phys_vec) {
+		ret = -ENOMEM;
+		goto err_free_priv;
+	}
+
+	priv->vdev = vdev;
+	dma_ranges_to_p2p_phys(priv, &get_dma_buf, dma_ranges, provider);
+	kfree(dma_ranges);
+	dma_ranges = NULL;
+
+	if (!vfio_device_try_get_registration(&vdev->vdev)) {
+		ret = -ENODEV;
+		goto err_free_phys;
+	}
+
+	exp_info.ops = &vfio_pci_dmabuf_ops;
+	exp_info.size = priv->size;
+	exp_info.flags = get_dma_buf.open_flags;
+	exp_info.priv = priv;
+
+	priv->dmabuf = dma_buf_export(&exp_info);
+	if (IS_ERR(priv->dmabuf)) {
+		ret = PTR_ERR(priv->dmabuf);
+		goto err_dev_put;
+	}
+
+	/* dma_buf_put() now frees priv */
+	INIT_LIST_HEAD(&priv->dmabufs_elm);
+	down_write(&vdev->memory_lock);
+	dma_resv_lock(priv->dmabuf->resv, NULL);
+	priv->revoked = !__vfio_pci_memory_enabled(vdev);
+	list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
+	dma_resv_unlock(priv->dmabuf->resv);
+	up_write(&vdev->memory_lock);
+
+	/*
+	 * dma_buf_fd() consumes the reference, when the file closes the dmabuf
+	 * will be released.
+	 */
+	return dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
+
+err_dev_put:
+	vfio_device_put_registration(&vdev->vdev);
+err_free_phys:
+	kfree(priv->phys_vec);
+err_free_priv:
+	kfree(priv);
+err_free_ranges:
+	kfree(dma_ranges);
+	return ret;
+}
+
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
+{
+	struct vfio_pci_dma_buf *priv;
+	struct vfio_pci_dma_buf *tmp;
+
+	lockdep_assert_held_write(&vdev->memory_lock);
+
+	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+		if (!get_file_active(&priv->dmabuf->file))
+			continue;
+
+		if (priv->revoked != revoked) {
+			dma_resv_lock(priv->dmabuf->resv, NULL);
+			priv->revoked = revoked;
+			dma_buf_move_notify(priv->dmabuf);
+			dma_resv_unlock(priv->dmabuf->resv);
+		}
+		dma_buf_put(priv->dmabuf);
+	}
+}
+
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_dma_buf *priv;
+	struct vfio_pci_dma_buf *tmp;
+
+	down_write(&vdev->memory_lock);
+	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+		if (!get_file_active(&priv->dmabuf->file))
+			continue;
+
+		dma_resv_lock(priv->dmabuf->resv, NULL);
+		list_del_init(&priv->dmabufs_elm);
+		priv->vdev = NULL;
+		priv->revoked = true;
+		dma_buf_move_notify(priv->dmabuf);
+		dma_resv_unlock(priv->dmabuf->resv);
+		vfio_device_put_registration(&vdev->vdev);
+		dma_buf_put(priv->dmabuf);
+	}
+	up_write(&vdev->memory_lock);
+}
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index a9972eacb293..28a405f8b97c 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -107,4 +107,27 @@  static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
 	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
 }
 
+#ifdef CONFIG_VFIO_PCI_DMABUF
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+				  struct vfio_device_feature_dma_buf __user *arg,
+				  size_t argsz);
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
+#else
+static inline int
+vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+			      struct vfio_device_feature_dma_buf __user *arg,
+			      size_t argsz)
+{
+	return -ENOTTY;
+}
+static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+}
+static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev,
+					 bool revoked)
+{
+}
+#endif
+
 #endif
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index f541044e42a2..30d74b364f25 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -94,6 +94,7 @@  struct vfio_pci_core_device {
 	struct vfio_pci_core_device	*sriov_pf_core_dev;
 	struct notifier_block	nb;
 	struct rw_semaphore	memory_lock;
+	struct list_head	dmabufs;
 };
 
 /* Will be exported for vfio pci drivers usage */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 75100bf009ba..63214467c875 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1478,6 +1478,31 @@  struct vfio_device_feature_bus_master {
 };
 #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
 
+/**
+ * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
+ * regions selected.
+ *
+ * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
+ * etc. offset/length specify a slice of the region to create the dmabuf from.
+ * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
+ *
+ * Return: The fd number on success, -1 and errno is set on failure.
+ */
+#define VFIO_DEVICE_FEATURE_DMA_BUF 11
+
+struct vfio_region_dma_range {
+	__u64 offset;
+	__u64 length;
+};
+
+struct vfio_device_feature_dma_buf {
+	__u32	region_index;
+	__u32	open_flags;
+	__u32   flags;
+	__u32   nr_ranges;
+	struct vfio_region_dma_range dma_ranges[];
+};
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**