| Message ID | 72ecaa13864ca346797e342d23a7929562788148.1760368250.git.leon@kernel.org |
|---|---|
| State | New |
| Headers | show |
| Series | vfio/pci: Allow MMIO regions to be exported through dma-buf | expand |
On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: > + > +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, > + struct dma_buf_attachment *attachment) > +{ > + struct vfio_pci_dma_buf *priv = dmabuf->priv; > + > + if (!attachment->peer2peer) > + return -EOPNOTSUPP; > + > + if (priv->revoked) > + return -ENODEV; > + > + switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) { > + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: > + break; > + case PCI_P2PDMA_MAP_BUS_ADDR: > + /* > + * There is no need in IOVA at all for this flow. > + * We rely on attachment->priv == NULL as a marker > + * for this mode. > + */ > + return 0; > + default: > + return -EINVAL; > + } > + > + attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL); > + if (!attachment->priv) > + return -ENOMEM; > + > + dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->size); The lifetime of this isn't good.. > + return 0; > +} > + > +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf, > + struct dma_buf_attachment *attachment) > +{ > + kfree(attachment->priv); > +} If the caller fails to call map then it leaks the iova. > +static struct sg_table * > +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, > + enum dma_data_direction dir) > +{ [..] > +err_unmap_dma: > + if (!i || !state) > + ; /* Do nothing */ > + else if (dma_use_iova(state)) > + dma_iova_destroy(attachment->dev, state, mapped_len, dir, > + attrs); If we hit this error path then it is freed.. > +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, > + struct sg_table *sgt, > + enum dma_data_direction dir) > +{ > + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; > + struct dma_iova_state *state = attachment->priv; > + unsigned long attrs = DMA_ATTR_MMIO; > + struct scatterlist *sgl; > + int i; > + > + if (!state) > + ; /* Do nothing */ > + else if (dma_use_iova(state)) > + dma_iova_destroy(attachment->dev, state, priv->size, dir, > + attrs); It is freed here too, but we can call map multiple times. Every time a move_notify happens can trigger another call to map. I think just call unlink in those two and put dma_iova_free in detach Jason
On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: > From: Leon Romanovsky <leonro@nvidia.com> > > Add support for exporting PCI device MMIO regions through dma-buf, > enabling safe sharing of non-struct page memory with controlled > lifetime management. This allows RDMA and other subsystems to import > dma-buf FDs and build them into memory regions for PCI P2P operations. > > The implementation provides a revocable attachment mechanism using > dma-buf move operations. MMIO regions are normally pinned as BARs > don't change physical addresses, but access is revoked when the VFIO > device is closed or a PCI reset is issued. This ensures kernel > self-defense against potentially hostile userspace. I have drafted the iommufd importer side of this using the private interconnect approach for now. https://github.com/jgunthorpe/linux/commits/iommufd_dmabuf/ Due to this iommufd never calls map and we run into trouble here: > +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, > + struct dma_buf_attachment *attachment) > +{ > + struct vfio_pci_dma_buf *priv = dmabuf->priv; > + > + if (!attachment->peer2peer) > + return -EOPNOTSUPP; > + > + if (priv->revoked) > + return -ENODEV; > + > + switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) { > + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: > + break; > + case PCI_P2PDMA_MAP_BUS_ADDR: > + /* > + * There is no need in IOVA at all for this flow. > + * We rely on attachment->priv == NULL as a marker > + * for this mode. > + */ > + return 0; > + default: > + return -EINVAL; Where the dev from iommufd is also not p2p capable so the attach fails. This is OK since it won't call map. So I reworked this logic to succeed attach but block map in this case.. Can we fold this in for the next version? This diff has the fixing for the iova lifecycle too. I have a few more checks to make but so far it looks Ok and with some luck we can get some iommufd p2p support this cycle.. Jason diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index eaba010777f3b7..a0650bd816d99b 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -20,10 +20,21 @@ struct vfio_pci_dma_buf { u8 revoked : 1; }; +struct vfio_pci_attach { + struct dma_iova_state state; + enum { + VFIO_ATTACH_NONE, + VFIO_ATTACH_HOST_BRIDGE_DMA, + VFIO_ATTACH_HOST_BRIDGE_IOVA, + VFIO_ATTACH_BUS + } kind; +}; + static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, struct dma_buf_attachment *attachment) { struct vfio_pci_dma_buf *priv = dmabuf->priv; + struct vfio_pci_attach *attach; if (!attachment->peer2peer) return -EOPNOTSUPP; @@ -31,32 +42,38 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, if (priv->revoked) return -ENODEV; + attach = kzalloc(sizeof(*attach), GFP_KERNEL); + if (!attach) + return -ENOMEM; + attachment->priv = attach; + switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) { case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - break; + if (dma_iova_try_alloc(attachment->dev, &attach->state, 0, + priv->size)) + attach->kind = VFIO_ATTACH_HOST_BRIDGE_IOVA; + else + attach->kind = VFIO_ATTACH_HOST_BRIDGE_DMA; + return 0; case PCI_P2PDMA_MAP_BUS_ADDR: - /* - * There is no need in IOVA at all for this flow. - * We rely on attachment->priv == NULL as a marker - * for this mode. - */ + /* There is no need in IOVA at all for this flow. */ + attach->kind = VFIO_ATTACH_BUS; return 0; default: - return -EINVAL; + attach->kind = VFIO_ATTACH_NONE; + return 0; } - - attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL); - if (!attachment->priv) - return -ENOMEM; - - dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->size); return 0; } static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attachment) { - kfree(attachment->priv); + struct vfio_pci_attach *attach = attachment->priv; + + if (attach->kind == VFIO_ATTACH_HOST_BRIDGE_IOVA) + dma_iova_free(attachment->dev, &attach->state); + kfree(attach); } static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, u64 length, @@ -83,22 +100,23 @@ static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, u64 length, } static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv, - struct dma_iova_state *state) + struct vfio_pci_attach *attach) { struct phys_vec *phys_vec = priv->phys_vec; unsigned int nents = 0; u32 i; - if (!state || !dma_use_iova(state)) + if (attach->kind != VFIO_ATTACH_HOST_BRIDGE_IOVA) { for (i = 0; i < priv->nr_ranges; i++) nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX); - else + } else { /* * In IOVA case, there is only one SG entry which spans * for whole IOVA address space, but we need to make sure * that it fits sg->length, maybe we need more. */ nents = DIV_ROUND_UP(priv->size, UINT_MAX); + } return nents; } @@ -108,7 +126,7 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, enum dma_data_direction dir) { struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; - struct dma_iova_state *state = attachment->priv; + struct vfio_pci_attach *attach = attachment->priv; struct phys_vec *phys_vec = priv->phys_vec; unsigned long attrs = DMA_ATTR_MMIO; unsigned int nents, mapped_len = 0; @@ -127,7 +145,7 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, if (!sgt) return ERR_PTR(-ENOMEM); - nents = calc_sg_nents(priv, state); + nents = calc_sg_nents(priv, attach); ret = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO); if (ret) goto err_kfree_sgt; @@ -135,35 +153,42 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, sgl = sgt->sgl; for (i = 0; i < priv->nr_ranges; i++) { - if (!state) { + switch (attach->kind) { + case VFIO_ATTACH_BUS: addr = pci_p2pdma_bus_addr_map(priv->provider, phys_vec[i].paddr); - } else if (dma_use_iova(state)) { - ret = dma_iova_link(attachment->dev, state, + break; + case VFIO_ATTACH_HOST_BRIDGE_IOVA: + ret = dma_iova_link(attachment->dev, &attach->state, phys_vec[i].paddr, 0, phys_vec[i].len, dir, attrs); if (ret) goto err_unmap_dma; mapped_len += phys_vec[i].len; - } else { + break; + case VFIO_ATTACH_HOST_BRIDGE_DMA: addr = dma_map_phys(attachment->dev, phys_vec[i].paddr, phys_vec[i].len, dir, attrs); ret = dma_mapping_error(attachment->dev, addr); if (ret) goto err_unmap_dma; + break; + default: + ret = -EINVAL; + goto err_unmap_dma; } - if (!state || !dma_use_iova(state)) + if (attach->kind != VFIO_ATTACH_HOST_BRIDGE_IOVA) sgl = fill_sg_entry(sgl, phys_vec[i].len, addr); } - if (state && dma_use_iova(state)) { + if (attach->kind == VFIO_ATTACH_HOST_BRIDGE_IOVA) { WARN_ON_ONCE(mapped_len != priv->size); - ret = dma_iova_sync(attachment->dev, state, 0, mapped_len); + ret = dma_iova_sync(attachment->dev, &attach->state, 0, mapped_len); if (ret) goto err_unmap_dma; - sgl = fill_sg_entry(sgl, mapped_len, state->addr); + sgl = fill_sg_entry(sgl, mapped_len, attach->state.addr); } /* @@ -174,15 +199,22 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, return sgt; err_unmap_dma: - if (!i || !state) - ; /* Do nothing */ - else if (dma_use_iova(state)) - dma_iova_destroy(attachment->dev, state, mapped_len, dir, - attrs); - else + switch (attach->kind) { + case VFIO_ATTACH_HOST_BRIDGE_IOVA: + if (mapped_len) + dma_iova_unlink(attachment->dev, &attach->state, 0, + mapped_len, dir, attrs); + break; + case VFIO_ATTACH_HOST_BRIDGE_DMA: + if (!i) + break; for_each_sgtable_dma_sg(sgt, sgl, i) dma_unmap_phys(attachment->dev, sg_dma_address(sgl), - sg_dma_len(sgl), dir, attrs); + sg_dma_len(sgl), dir, attrs); + break; + default: + break; + } sg_free_table(sgt); err_kfree_sgt: kfree(sgt); @@ -194,20 +226,24 @@ static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, enum dma_data_direction dir) { struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; - struct dma_iova_state *state = attachment->priv; + struct vfio_pci_attach *attach = attachment->priv; unsigned long attrs = DMA_ATTR_MMIO; struct scatterlist *sgl; int i; - if (!state) - ; /* Do nothing */ - else if (dma_use_iova(state)) - dma_iova_destroy(attachment->dev, state, priv->size, dir, - attrs); - else + switch (attach->kind) { + case VFIO_ATTACH_HOST_BRIDGE_IOVA: + dma_iova_destroy(attachment->dev, &attach->state, priv->size, + dir, attrs); + break; + case VFIO_ATTACH_HOST_BRIDGE_DMA: for_each_sgtable_dma_sg(sgt, sgl, i) dma_unmap_phys(attachment->dev, sg_dma_address(sgl), sg_dma_len(sgl), dir, attrs); + break; + default: + break; + } sg_free_table(sgt); kfree(sgt);
On Thu, Oct 16, 2025 at 08:53:32PM -0300, Jason Gunthorpe wrote: > On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: > > + > > +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, > > + struct dma_buf_attachment *attachment) > > +{ > > + struct vfio_pci_dma_buf *priv = dmabuf->priv; > > + > > + if (!attachment->peer2peer) > > + return -EOPNOTSUPP; > > + > > + if (priv->revoked) > > + return -ENODEV; > > + > > + switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) { > > + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: > > + break; > > + case PCI_P2PDMA_MAP_BUS_ADDR: > > + /* > > + * There is no need in IOVA at all for this flow. > > + * We rely on attachment->priv == NULL as a marker > > + * for this mode. > > + */ > > + return 0; > > + default: > > + return -EINVAL; > > + } > > + > > + attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL); > > + if (!attachment->priv) > > + return -ENOMEM; > > + > > + dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->size); > > The lifetime of this isn't good.. > > > + return 0; > > +} > > + > > +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf, > > + struct dma_buf_attachment *attachment) > > +{ > > + kfree(attachment->priv); > > +} > > If the caller fails to call map then it leaks the iova. I'm relying on dmabuf code and documentation: 926 /** 927 * dma_buf_dynamic_attach - Add the device to dma_buf's attachments list ... 932 * 933 * Returns struct dma_buf_attachment pointer for this attachment. Attachments 934 * must be cleaned up by calling dma_buf_detach(). Successful call to vfio_pci_dma_buf_attach() MUST be accompanied by call to vfio_pci_dma_buf_detach(), so as far as dmabuf implementation follows it, there is no leak. > > > +static struct sg_table * > > +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, > > + enum dma_data_direction dir) > > +{ > [..] > > > > +err_unmap_dma: > > + if (!i || !state) > > + ; /* Do nothing */ > > + else if (dma_use_iova(state)) > > + dma_iova_destroy(attachment->dev, state, mapped_len, dir, > > + attrs); > > If we hit this error path then it is freed.. > > > +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, > > + struct sg_table *sgt, > > + enum dma_data_direction dir) > > +{ > > + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; > > + struct dma_iova_state *state = attachment->priv; > > + unsigned long attrs = DMA_ATTR_MMIO; > > + struct scatterlist *sgl; > > + int i; > > + > > + if (!state) > > + ; /* Do nothing */ > > + else if (dma_use_iova(state)) > > + dma_iova_destroy(attachment->dev, state, priv->size, dir, > > + attrs); > > It is freed here too, but we can call map multiple times. Every time a > move_notify happens can trigger another call to map. > > I think just call unlink in those two and put dma_iova_free in detach Yes, it can work. Thanks > > Jason
On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: > From: Leon Romanovsky <leonro@nvidia.com> > > Add support for exporting PCI device MMIO regions through dma-buf, > enabling safe sharing of non-struct page memory with controlled > lifetime management. This allows RDMA and other subsystems to import > dma-buf FDs and build them into memory regions for PCI P2P operations. > > The implementation provides a revocable attachment mechanism using > dma-buf move operations. MMIO regions are normally pinned as BARs > don't change physical addresses, but access is revoked when the VFIO > device is closed or a PCI reset is issued. This ensures kernel > self-defense against potentially hostile userspace. This still completely fails to explain why you think that it actually is safe without the proper pgmap handling.
On Thu, Oct 16, 2025 at 11:33:52PM -0700, Christoph Hellwig wrote: > On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: > > From: Leon Romanovsky <leonro@nvidia.com> > > > > Add support for exporting PCI device MMIO regions through dma-buf, > > enabling safe sharing of non-struct page memory with controlled > > lifetime management. This allows RDMA and other subsystems to import > > dma-buf FDs and build them into memory regions for PCI P2P operations. > > > > The implementation provides a revocable attachment mechanism using > > dma-buf move operations. MMIO regions are normally pinned as BARs > > don't change physical addresses, but access is revoked when the VFIO > > device is closed or a PCI reset is issued. This ensures kernel > > self-defense against potentially hostile userspace. > > This still completely fails to explain why you think that it actually > is safe without the proper pgmap handling. Leon, this keeps coming up, please clean up and copy the text from my prior email into the commit message & cover letter explaining in detail how lifetime magement works by using revocation/invalidation driven by dmabuf move_notify callbacks. Jason
On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: > +static void dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv, > + struct vfio_device_feature_dma_buf *dma_buf, > + struct vfio_region_dma_range *dma_ranges, > + struct p2pdma_provider *provider) > +{ > + struct pci_dev *pdev = priv->vdev->pdev; > + phys_addr_t pci_start; > + u32 i; > + > + pci_start = pci_resource_start(pdev, dma_buf->region_index); > + for (i = 0; i < dma_buf->nr_ranges; i++) { > + priv->phys_vec[i].len = dma_ranges[i].length; > + priv->phys_vec[i].paddr = pci_start + dma_ranges[i].offset; > + priv->size += priv->phys_vec[i].len; > + } This is missing validation, the userspace can pass in any length/offset but the resource is of limited size. Like this: static int dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv, struct vfio_device_feature_dma_buf *dma_buf, struct vfio_region_dma_range *dma_ranges, struct p2pdma_provider *provider) { struct pci_dev *pdev = priv->vdev->pdev; phys_addr_t len = pci_resource_len(pdev, dma_buf->region_index); phys_addr_t pci_start; phys_addr_t pci_last; u32 i; if (!len) return -EINVAL; pci_start = pci_resource_start(pdev, dma_buf->region_index); pci_last = pci_start + len - 1; for (i = 0; i < dma_buf->nr_ranges; i++) { phys_addr_t last; if (!dma_ranges[i].length) return -EINVAL; if (check_add_overflow(pci_start, dma_ranges[i].offset, &priv->phys_vec[i].paddr) || check_add_overflow(priv->phys_vec[i].paddr, dma_ranges[i].length - 1, &last)) return -EOVERFLOW; if (last > pci_last) return -EINVAL; priv->phys_vec[i].len = dma_ranges[i].length; priv->size += priv->phys_vec[i].len; } priv->nr_ranges = dma_buf->nr_ranges; priv->provider = provider; return 0; } Jason
On Fri, Oct 17, 2025 at 08:40:07AM +0300, Leon Romanovsky wrote: > > > +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf, > > > + struct dma_buf_attachment *attachment) > > > +{ > > > + kfree(attachment->priv); > > > +} > > > > If the caller fails to call map then it leaks the iova. > > I'm relying on dmabuf code and documentation: > > 926 /** > 927 * dma_buf_dynamic_attach - Add the device to dma_buf's attachments list > ... > 932 * > 933 * Returns struct dma_buf_attachment pointer for this attachment. Attachments > 934 * must be cleaned up by calling dma_buf_detach(). > > Successful call to vfio_pci_dma_buf_attach() MUST be accompanied by call > to vfio_pci_dma_buf_detach(), so as far as dmabuf implementation follows > it, there is no leak. It leaks the ivoa because there is no dma_iova_destroy() unless you call unmap. detach is not unmap and unmap is not mandatory to call. Jason
On Fri, Oct 17, 2025 at 12:58:50PM -0300, Jason Gunthorpe wrote: > On Fri, Oct 17, 2025 at 08:40:07AM +0300, Leon Romanovsky wrote: > > > > +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf, > > > > + struct dma_buf_attachment *attachment) > > > > +{ > > > > + kfree(attachment->priv); > > > > +} > > > > > > If the caller fails to call map then it leaks the iova. > > > > I'm relying on dmabuf code and documentation: > > > > 926 /** > > 927 * dma_buf_dynamic_attach - Add the device to dma_buf's attachments list > > ... > > 932 * > > 933 * Returns struct dma_buf_attachment pointer for this attachment. Attachments > > 934 * must be cleaned up by calling dma_buf_detach(). > > > > Successful call to vfio_pci_dma_buf_attach() MUST be accompanied by call > > to vfio_pci_dma_buf_detach(), so as far as dmabuf implementation follows > > it, there is no leak. > > It leaks the ivoa because there is no dma_iova_destroy() unless you > call unmap. detach is not unmap and unmap is not mandatory to call. Though putting iova free in detach is problematic for the hot-unplug case. In that instance we need to ensure the iova is cleaned up prior to returning from vfio's remove(). detached is called on the importers timeline but unmap is required to be called in move_notify.. So I guess some kind of flag to trigger the unmap after cleanup to free the iova? Jason
On Fri, Oct 17, 2025 at 10:02:49AM -0300, Jason Gunthorpe wrote: > On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: > > +static void dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv, > > + struct vfio_device_feature_dma_buf *dma_buf, > > + struct vfio_region_dma_range *dma_ranges, > > + struct p2pdma_provider *provider) > > +{ > > + struct pci_dev *pdev = priv->vdev->pdev; > > + phys_addr_t pci_start; > > + u32 i; > > + > > + pci_start = pci_resource_start(pdev, dma_buf->region_index); > > + for (i = 0; i < dma_buf->nr_ranges; i++) { > > + priv->phys_vec[i].len = dma_ranges[i].length; > > + priv->phys_vec[i].paddr = pci_start + dma_ranges[i].offset; > > + priv->size += priv->phys_vec[i].len; > > + } > > This is missing validation, the userspace can pass in any > length/offset but the resource is of limited size. Like this: > > static int dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv, > struct vfio_device_feature_dma_buf *dma_buf, > struct vfio_region_dma_range *dma_ranges, > struct p2pdma_provider *provider) > { > struct pci_dev *pdev = priv->vdev->pdev; > phys_addr_t len = pci_resource_len(pdev, dma_buf->region_index); > phys_addr_t pci_start; > phys_addr_t pci_last; > u32 i; > > if (!len) > return -EINVAL; > pci_start = pci_resource_start(pdev, dma_buf->region_index); > pci_last = pci_start + len - 1; > for (i = 0; i < dma_buf->nr_ranges; i++) { > phys_addr_t last; > > if (!dma_ranges[i].length) > return -EINVAL; > > if (check_add_overflow(pci_start, dma_ranges[i].offset, > &priv->phys_vec[i].paddr) || > check_add_overflow(priv->phys_vec[i].paddr, > dma_ranges[i].length - 1, &last)) > return -EOVERFLOW; > if (last > pci_last) > return -EINVAL; > > priv->phys_vec[i].len = dma_ranges[i].length; > priv->size += priv->phys_vec[i].len; > } > priv->nr_ranges = dma_buf->nr_ranges; > priv->provider = provider; > return 0; > } I have these checks in validate_dmabuf_input(). Do you think that I need to add extra checks? Thanks > > Jason
On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: > From: Leon Romanovsky <leonro@nvidia.com> > > Add support for exporting PCI device MMIO regions through dma-buf, > enabling safe sharing of non-struct page memory with controlled > lifetime management. This allows RDMA and other subsystems to import > dma-buf FDs and build them into memory regions for PCI P2P operations. I was looking at how to address Alex's note about not all drivers being compatible, and how to enable the non-compatible drivers. It looks like the simplest thing is to make dma_ranges_to_p2p_phys into an ops and have the driver provide it. If not provided the no support. Drivers with special needs can fill in phys in their own way and get their own provider. Sort of like this: diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac10f14417f2f3..6d41cf26b53994 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -147,6 +147,10 @@ static const struct vfio_device_ops vfio_pci_ops = { .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas, }; +static const struct vfio_pci_device_ops vfio_pci_dev_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_core_device *vdev; @@ -161,6 +165,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return PTR_ERR(vdev); dev_set_drvdata(&pdev->dev, vdev); + vdev->pci_ops = &vfio_pci_dev_ops; ret = vfio_pci_core_register_device(vdev); if (ret) goto out_put_vdev; diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index 358856e6b8a820..dad880781a9352 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -309,47 +309,52 @@ int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, } EXPORT_SYMBOL_GPL(vfio_pci_dma_buf_iommufd_map); -static int dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv, - struct vfio_device_feature_dma_buf *dma_buf, +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct phys_vec *phys_vec, struct vfio_region_dma_range *dma_ranges, - struct p2pdma_provider *provider) + size_t nr_ranges) { - struct pci_dev *pdev = priv->vdev->pdev; - phys_addr_t len = pci_resource_len(pdev, dma_buf->region_index); + struct pci_dev *pdev = vdev->pdev; + phys_addr_t len = pci_resource_len(pdev, region_index); phys_addr_t pci_start; phys_addr_t pci_last; u32 i; if (!len) return -EINVAL; - pci_start = pci_resource_start(pdev, dma_buf->region_index); + + *provider = pcim_p2pdma_provider(pdev, region_index); + if (!*provider) + return -EINVAL; + + pci_start = pci_resource_start(pdev, region_index); pci_last = pci_start + len - 1; - for (i = 0; i < dma_buf->nr_ranges; i++) { + for (i = 0; i < nr_ranges; i++) { phys_addr_t last; if (!dma_ranges[i].length) return -EINVAL; if (check_add_overflow(pci_start, dma_ranges[i].offset, - &priv->phys_vec[i].paddr) || - check_add_overflow(priv->phys_vec[i].paddr, + &phys_vec[i].paddr) || + check_add_overflow(phys_vec[i].paddr, dma_ranges[i].length - 1, &last)) return -EOVERFLOW; if (last > pci_last) return -EINVAL; - priv->phys_vec[i].len = dma_ranges[i].length; - priv->size += priv->phys_vec[i].len; + phys_vec[i].len = dma_ranges[i].length; } - priv->nr_ranges = dma_buf->nr_ranges; - priv->provider = provider; return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys); static int validate_dmabuf_input(struct vfio_pci_core_device *vdev, struct vfio_device_feature_dma_buf *dma_buf, struct vfio_region_dma_range *dma_ranges, - struct p2pdma_provider **provider) + size_t *lengthp) { struct pci_dev *pdev = vdev->pdev; u32 bar = dma_buf->region_index; @@ -365,10 +370,6 @@ static int validate_dmabuf_input(struct vfio_pci_core_device *vdev, if (bar >= VFIO_PCI_ROM_REGION_INDEX) return -ENODEV; - *provider = pcim_p2pdma_provider(pdev, bar); - if (!*provider) - return -EINVAL; - bar_size = pci_resource_len(pdev, bar); for (i = 0; i < dma_buf->nr_ranges; i++) { u64 offset = dma_ranges[i].offset; @@ -397,6 +398,7 @@ static int validate_dmabuf_input(struct vfio_pci_core_device *vdev, if (overflows_type(length, size_t) || length & DMA_IOVA_USE_SWIOTLB) return -EINVAL; + *lengthp = length; return 0; } @@ -407,10 +409,13 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, struct vfio_device_feature_dma_buf get_dma_buf = {}; struct vfio_region_dma_range *dma_ranges; DEFINE_DMA_BUF_EXPORT_INFO(exp_info); - struct p2pdma_provider *provider; struct vfio_pci_dma_buf *priv; + size_t length; int ret; + if (!vdev->pci_ops->get_dmabuf_phys) + return -EOPNOTSUPP; + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, sizeof(get_dma_buf)); if (ret != 1) @@ -427,7 +432,7 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, if (IS_ERR(dma_ranges)) return PTR_ERR(dma_ranges); - ret = validate_dmabuf_input(vdev, &get_dma_buf, dma_ranges, &provider); + ret = validate_dmabuf_input(vdev, &get_dma_buf, dma_ranges, &length); if (ret) goto err_free_ranges; @@ -444,10 +449,16 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, } priv->vdev = vdev; - ret = dma_ranges_to_p2p_phys(priv, &get_dma_buf, dma_ranges, provider); + priv->nr_ranges = get_dma_buf.nr_ranges; + priv->size = length; + ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider, + get_dma_buf.region_index, + priv->phys_vec, dma_ranges, + priv->nr_ranges); if (ret) goto err_free_phys; + kfree(dma_ranges); dma_ranges = NULL; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 37ce02e30c7632..4ea2095381eb24 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -26,6 +26,7 @@ struct vfio_pci_core_device; struct vfio_pci_region; +struct p2pdma_provider; struct vfio_pci_regops { ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, @@ -49,9 +50,26 @@ struct vfio_pci_region { u32 flags; }; +struct vfio_pci_device_ops { + int (*get_dmabuf_phys)(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges); +}; + +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges); + struct vfio_pci_core_device { struct vfio_device vdev; struct pci_dev *pdev; + const struct vfio_pci_device_ops *pci_ops; void __iomem *barmap[PCI_STD_NUM_BARS]; bool bar_mmap_supported[PCI_STD_NUM_BARS]; u8 *pci_config_map;
On Fri, Oct 17, 2025 at 07:13:58PM +0300, Leon Romanovsky wrote: > > static int dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv, > > struct vfio_device_feature_dma_buf *dma_buf, > > struct vfio_region_dma_range *dma_ranges, > > struct p2pdma_provider *provider) > > { > > struct pci_dev *pdev = priv->vdev->pdev; > > phys_addr_t len = pci_resource_len(pdev, dma_buf->region_index); > > phys_addr_t pci_start; > > phys_addr_t pci_last; > > u32 i; > > > > if (!len) > > return -EINVAL; > > pci_start = pci_resource_start(pdev, dma_buf->region_index); > > pci_last = pci_start + len - 1; > > for (i = 0; i < dma_buf->nr_ranges; i++) { > > phys_addr_t last; > > > > if (!dma_ranges[i].length) > > return -EINVAL; > > > > if (check_add_overflow(pci_start, dma_ranges[i].offset, > > &priv->phys_vec[i].paddr) || > > check_add_overflow(priv->phys_vec[i].paddr, > > dma_ranges[i].length - 1, &last)) > > return -EOVERFLOW; > > if (last > pci_last) > > return -EINVAL; > > > > priv->phys_vec[i].len = dma_ranges[i].length; > > priv->size += priv->phys_vec[i].len; > > } > > priv->nr_ranges = dma_buf->nr_ranges; > > priv->provider = provider; > > return 0; > > } > > I have these checks in validate_dmabuf_input(). > Do you think that I need to add extra checks? I think they work better in this function, so I'd move them here. Jason
On Mon, Oct 20, 2025 at 01:15:16PM -0300, Jason Gunthorpe wrote: > On Fri, Oct 17, 2025 at 07:13:58PM +0300, Leon Romanovsky wrote: > > > static int dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv, > > > struct vfio_device_feature_dma_buf *dma_buf, > > > struct vfio_region_dma_range *dma_ranges, > > > struct p2pdma_provider *provider) > > > { > > > struct pci_dev *pdev = priv->vdev->pdev; > > > phys_addr_t len = pci_resource_len(pdev, dma_buf->region_index); > > > phys_addr_t pci_start; > > > phys_addr_t pci_last; > > > u32 i; > > > > > > if (!len) > > > return -EINVAL; > > > pci_start = pci_resource_start(pdev, dma_buf->region_index); > > > pci_last = pci_start + len - 1; > > > for (i = 0; i < dma_buf->nr_ranges; i++) { > > > phys_addr_t last; > > > > > > if (!dma_ranges[i].length) > > > return -EINVAL; > > > > > > if (check_add_overflow(pci_start, dma_ranges[i].offset, > > > &priv->phys_vec[i].paddr) || > > > check_add_overflow(priv->phys_vec[i].paddr, > > > dma_ranges[i].length - 1, &last)) > > > return -EOVERFLOW; > > > if (last > pci_last) > > > return -EINVAL; > > > > > > priv->phys_vec[i].len = dma_ranges[i].length; > > > priv->size += priv->phys_vec[i].len; > > > } > > > priv->nr_ranges = dma_buf->nr_ranges; > > > priv->provider = provider; > > > return 0; > > > } > > > > I have these checks in validate_dmabuf_input(). > > Do you think that I need to add extra checks? > > I think they work better in this function, so I'd move them here. The main idea for validate_dmabuf_input() is to perform as much as possible checks before allocating kernel memory. Thanks > > Jason
On Mon, Oct 20, 2025 at 07:44:57PM +0300, Leon Romanovsky wrote: > On Mon, Oct 20, 2025 at 01:15:16PM -0300, Jason Gunthorpe wrote: > > On Fri, Oct 17, 2025 at 07:13:58PM +0300, Leon Romanovsky wrote: > > > > static int dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv, > > > > struct vfio_device_feature_dma_buf *dma_buf, > > > > struct vfio_region_dma_range *dma_ranges, > > > > struct p2pdma_provider *provider) > > > > { > > > > struct pci_dev *pdev = priv->vdev->pdev; > > > > phys_addr_t len = pci_resource_len(pdev, dma_buf->region_index); > > > > phys_addr_t pci_start; > > > > phys_addr_t pci_last; > > > > u32 i; > > > > > > > > if (!len) > > > > return -EINVAL; > > > > pci_start = pci_resource_start(pdev, dma_buf->region_index); > > > > pci_last = pci_start + len - 1; > > > > for (i = 0; i < dma_buf->nr_ranges; i++) { > > > > phys_addr_t last; > > > > > > > > if (!dma_ranges[i].length) > > > > return -EINVAL; > > > > > > > > if (check_add_overflow(pci_start, dma_ranges[i].offset, > > > > &priv->phys_vec[i].paddr) || > > > > check_add_overflow(priv->phys_vec[i].paddr, > > > > dma_ranges[i].length - 1, &last)) > > > > return -EOVERFLOW; > > > > if (last > pci_last) > > > > return -EINVAL; > > > > > > > > priv->phys_vec[i].len = dma_ranges[i].length; > > > > priv->size += priv->phys_vec[i].len; > > > > } > > > > priv->nr_ranges = dma_buf->nr_ranges; > > > > priv->provider = provider; > > > > return 0; > > > > } > > > > > > I have these checks in validate_dmabuf_input(). > > > Do you think that I need to add extra checks? > > > > I think they work better in this function, so I'd move them here. > > The main idea for validate_dmabuf_input() is to perform as much as > possible checks before allocating kernel memory. Yeah, but it's fine, it can just be turned into a function to safely compute the total size. It makes more sense to try to validate once we have the kernel memory and got the physical range from the driver to copy into the phys. Jason
On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: > From: Leon Romanovsky <leonro@nvidia.com> > > Add support for exporting PCI device MMIO regions through dma-buf, > enabling safe sharing of non-struct page memory with controlled > lifetime management. This allows RDMA and other subsystems to import > dma-buf FDs and build them into memory regions for PCI P2P operations. > > The implementation provides a revocable attachment mechanism using > dma-buf move operations. MMIO regions are normally pinned as BARs > don't change physical addresses, but access is revoked when the VFIO > device is closed or a PCI reset is issued. This ensures kernel > self-defense against potentially hostile userspace. Let's enhance this: Currently VFIO can take MMIO regions from the device's BAR and map them into a PFNMAP VMA with special PTEs. This mapping type ensures the memory cannot be used with things like pin_user_pages(), hmm, and so on. In practice only the user process CPU and KVM can safely make use of these VMA. When VFIO shuts down these VMAs are cleaned by unmap_mapping_range() to prevent any UAF of the MMIO beyond driver unbind. However, VFIO type 1 has an insecure behavior where it uses follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back into the IOMMU. This has a long history of enabling P2P DMA inside VMs, but has serious lifetime problems by allowing a UAF of the MMIO after the VFIO driver has been unbound. Introduce DMABUF as a new safe way to export a FD based handle for the MMIO regions. This can be consumed by existing DMABUF importers like RDMA or DRM without opening an UAF. A following series will add an importer to iommufd to obsolete the type 1 code and allow safe UAF-free MMIO P2P in VM cases. DMABUF has a built in synchronous invalidation mechanism called move_notify. VFIO keeps track of all drivers importing its MMIO and can invoke a synchronous invalidation callback to tell the importing drivers to DMA unmap and forget about the MMIO pfns. This process is being called revoke. This synchronous invalidation fully prevents any lifecycle problems. VFIO will do this before unbinding its driver ensuring there is no UAF of the MMIO beyond the driver lifecycle. Further, VFIO has additional behavior to block access to the MMIO during things like Function Level Reset. This is because some poor platforms may experience a MCE type crash when touching MMIO of a PCI device that is undergoing a reset. Today this is done by using unmap_mapping_range() on the VMAs. Extend that into the DMABUF world and temporarily revoke the MMIO from the DMABUF importers during FLR as well. This will more robustly prevent an errant P2P from possibly upsetting the platform. A DMABUF FD is a prefered handle for MMIO compared to using something like a pgmap because: - VFIO is supported, including its P2P feature, on archs that don't support pgmap - PCI devices have all sorts of BAR sizes, including ones smaller than a section so a pgmap cannot always be created - It is undesirable to waste alot of memory for struct pages, especially for a case like a GPU with ~100GB of BAR size - We want a synchronous revoke semantic to support FLR with light hardware requirements Use the P2P subsystem to help generate the DMA mapping. This is a significant upgrade over the abuse of dma_map_resource() that has historically been used by DMABUF exporters. Experience with an OOT version of this patch shows that real systems do need this. This approach deals with all the P2P scenarios: - Non-zero PCI bus_offset - ACS flags routing traffic to the IOMMU - ACS flags that bypass the IOMMU - though vfio noiommu is required to hit this. There will be further work to formalize the revoke semantic in DMABUF. For now this acts like a move_notify dynamic exporter where importer fault handling will get a failure when they attempt to map. This means that only fully restartable fault capable importers can import the VFIO DMABUFs. A future revoke semantic should open this up to more HW as the HW only needs to invalidate, not handle restartable faults. Jason
在 2025/10/22 20:50, Jason Gunthorpe 写道: > On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: >> From: Leon Romanovsky <leonro@nvidia.com> >> >> Add support for exporting PCI device MMIO regions through dma-buf, >> enabling safe sharing of non-struct page memory with controlled >> lifetime management. This allows RDMA and other subsystems to import >> dma-buf FDs and build them into memory regions for PCI P2P operations. >> >> The implementation provides a revocable attachment mechanism using >> dma-buf move operations. MMIO regions are normally pinned as BARs >> don't change physical addresses, but access is revoked when the VFIO >> device is closed or a PCI reset is issued. This ensures kernel >> self-defense against potentially hostile userspace. > > Let's enhance this: > > Currently VFIO can take MMIO regions from the device's BAR and map > them into a PFNMAP VMA with special PTEs. This mapping type ensures > the memory cannot be used with things like pin_user_pages(), hmm, and > so on. In practice only the user process CPU and KVM can safely make > use of these VMA. When VFIO shuts down these VMAs are cleaned by > unmap_mapping_range() to prevent any UAF of the MMIO beyond driver > unbind. > > However, VFIO type 1 has an insecure behavior where it uses > follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back > into the IOMMU. This has a long history of enabling P2P DMA inside > VMs, but has serious lifetime problems by allowing a UAF of the MMIO > after the VFIO driver has been unbound. Hi, Jason, Can you elaborate on this more? From my understanding of the VFIO type 1 implementation: - When a device is opened through VFIO type 1, it increments the device->refcount - During unbind, the driver waits for this refcount to drop to zero via wait_for_completion(&device->comp) - This should prevent the unbind() from completing while the device is still in use Given this refcount mechanism, I do not figure out how the UAF can occur. Thanks.
On Sun, Oct 26, 2025 at 03:55:04PM +0800, Shuai Xue wrote: > > > 在 2025/10/22 20:50, Jason Gunthorpe 写道: > > On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: > > > From: Leon Romanovsky <leonro@nvidia.com> > > > > > > Add support for exporting PCI device MMIO regions through dma-buf, > > > enabling safe sharing of non-struct page memory with controlled > > > lifetime management. This allows RDMA and other subsystems to import > > > dma-buf FDs and build them into memory regions for PCI P2P operations. > > > > > > The implementation provides a revocable attachment mechanism using > > > dma-buf move operations. MMIO regions are normally pinned as BARs > > > don't change physical addresses, but access is revoked when the VFIO > > > device is closed or a PCI reset is issued. This ensures kernel > > > self-defense against potentially hostile userspace. > > > > Let's enhance this: > > > > Currently VFIO can take MMIO regions from the device's BAR and map > > them into a PFNMAP VMA with special PTEs. This mapping type ensures > > the memory cannot be used with things like pin_user_pages(), hmm, and > > so on. In practice only the user process CPU and KVM can safely make > > use of these VMA. When VFIO shuts down these VMAs are cleaned by > > unmap_mapping_range() to prevent any UAF of the MMIO beyond driver > > unbind. > > > > However, VFIO type 1 has an insecure behavior where it uses > > follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back > > into the IOMMU. This has a long history of enabling P2P DMA inside > > VMs, but has serious lifetime problems by allowing a UAF of the MMIO > > after the VFIO driver has been unbound. > > Hi, Jason, > > Can you elaborate on this more? > > From my understanding of the VFIO type 1 implementation: > > - When a device is opened through VFIO type 1, it increments the > device->refcount > - During unbind, the driver waits for this refcount to drop to zero via > wait_for_completion(&device->comp) > - This should prevent the unbind() from completing while the device is > still in use > > Given this refcount mechanism, I do not figure out how the UAF can > occur. A second vfio device can be opened and then use follow_pfnmap_*() to read the first vfio device's PTEs. There is no relationship betweent the first and second VFIO devices, so once the first is unbound it sails through the device->comp while the second device retains the PFN in its type1 iommu_domain. Jason
On Mon, Oct 13, 2025 at 8:44 AM Leon Romanovsky <leon@kernel.org> wrote: > > From: Leon Romanovsky <leonro@nvidia.com> > > Add support for exporting PCI device MMIO regions through dma-buf, > enabling safe sharing of non-struct page memory with controlled > lifetime management. This allows RDMA and other subsystems to import > dma-buf FDs and build them into memory regions for PCI P2P operations. > +/** > + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the > + * regions selected. > + * > + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC, > + * etc. offset/length specify a slice of the region to create the dmabuf from. > + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf. > + * > + * Return: The fd number on success, -1 and errno is set on failure. > + */ > +#define VFIO_DEVICE_FEATURE_DMA_BUF 11 > + > +struct vfio_region_dma_range { > + __u64 offset; > + __u64 length; > +}; > + > +struct vfio_device_feature_dma_buf { > + __u32 region_index; > + __u32 open_flags; > + __u32 flags; > + __u32 nr_ranges; > + struct vfio_region_dma_range dma_ranges[]; > +}; This uAPI would be a good candidate for a VFIO selftest. You can test that it returns an error when it's supposed to, and a valid fd when it's supposed to. And once the iommufd importer side is ready, we can extend the test and verify that the fd can be mapped into iommufd. It will probably be challenging to meaningfully exercise device P2P through a selftest, I haven't thought about how to extend the driver framework for that yet... But you can at least test that all the ioctls behave like they should.
On Mon, Oct 27, 2025 at 04:13:05PM -0700, David Matlack wrote: > On Mon, Oct 13, 2025 at 8:44 AM Leon Romanovsky <leon@kernel.org> wrote: > > > > From: Leon Romanovsky <leonro@nvidia.com> > > > > Add support for exporting PCI device MMIO regions through dma-buf, > > enabling safe sharing of non-struct page memory with controlled > > lifetime management. This allows RDMA and other subsystems to import > > dma-buf FDs and build them into memory regions for PCI P2P operations. > > > +/** > > + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the > > + * regions selected. > > + * > > + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC, > > + * etc. offset/length specify a slice of the region to create the dmabuf from. > > + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf. > > + * > > + * Return: The fd number on success, -1 and errno is set on failure. > > + */ > > +#define VFIO_DEVICE_FEATURE_DMA_BUF 11 > > + > > +struct vfio_region_dma_range { > > + __u64 offset; > > + __u64 length; > > +}; > > + > > +struct vfio_device_feature_dma_buf { > > + __u32 region_index; > > + __u32 open_flags; > > + __u32 flags; > > + __u32 nr_ranges; > > + struct vfio_region_dma_range dma_ranges[]; > > +}; > > This uAPI would be a good candidate for a VFIO selftest. You can test > that it returns an error when it's supposed to, and a valid fd when > it's supposed to. And once the iommufd importer side is ready, we can > extend the test and verify that the fd can be mapped into iommufd. No problem, I'll add such test, but let's focus on making sure that this series is accepted first. Thanks
在 2025/10/27 20:09, Jason Gunthorpe 写道: > On Sun, Oct 26, 2025 at 03:55:04PM +0800, Shuai Xue wrote: >> >> >> 在 2025/10/22 20:50, Jason Gunthorpe 写道: >>> On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: >>>> From: Leon Romanovsky <leonro@nvidia.com> >>>> >>>> Add support for exporting PCI device MMIO regions through dma-buf, >>>> enabling safe sharing of non-struct page memory with controlled >>>> lifetime management. This allows RDMA and other subsystems to import >>>> dma-buf FDs and build them into memory regions for PCI P2P operations. >>>> >>>> The implementation provides a revocable attachment mechanism using >>>> dma-buf move operations. MMIO regions are normally pinned as BARs >>>> don't change physical addresses, but access is revoked when the VFIO >>>> device is closed or a PCI reset is issued. This ensures kernel >>>> self-defense against potentially hostile userspace. >>> >>> Let's enhance this: >>> >>> Currently VFIO can take MMIO regions from the device's BAR and map >>> them into a PFNMAP VMA with special PTEs. This mapping type ensures >>> the memory cannot be used with things like pin_user_pages(), hmm, and >>> so on. In practice only the user process CPU and KVM can safely make >>> use of these VMA. When VFIO shuts down these VMAs are cleaned by >>> unmap_mapping_range() to prevent any UAF of the MMIO beyond driver >>> unbind. >>> >>> However, VFIO type 1 has an insecure behavior where it uses >>> follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back >>> into the IOMMU. This has a long history of enabling P2P DMA inside >>> VMs, but has serious lifetime problems by allowing a UAF of the MMIO >>> after the VFIO driver has been unbound. >> >> Hi, Jason, >> >> Can you elaborate on this more? >> >> From my understanding of the VFIO type 1 implementation: >> >> - When a device is opened through VFIO type 1, it increments the >> device->refcount >> - During unbind, the driver waits for this refcount to drop to zero via >> wait_for_completion(&device->comp) >> - This should prevent the unbind() from completing while the device is >> still in use >> >> Given this refcount mechanism, I do not figure out how the UAF can >> occur. > > A second vfio device can be opened and then use follow_pfnmap_*() to > read the first vfio device's PTEs. There is no relationship betweent > the first and second VFIO devices, so once the first is unbound it > sails through the device->comp while the second device retains the PFN > in its type1 iommu_domain. > > Jason I see. Thanks. Best Regard, Shuai
On Wed, Oct 29, 2025, at 18:50, Alex Mastro wrote: > On Mon, Oct 13, 2025 at 06:26:11PM +0300, Leon Romanovsky wrote: >> + /* >> + * dma_buf_fd() consumes the reference, when the file closes the dmabuf >> + * will be released. >> + */ >> + return dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags); > > I think this still needs to unwind state on fd allocation error. Reference > ownership is only transferred on success. Yes, you are correct, i need to call to dma_buf_put() in case of error. I will fix. Thanks > >> + >> +err_dev_put: >> + vfio_device_put_registration(&vdev->vdev); >> +err_free_phys: >> + kfree(priv->phys_vec); >> +err_free_priv: >> + kfree(priv); >> +err_free_ranges: >> + kfree(dma_ranges); >> + return ret; >> +}
On Mon, Oct 13, 2025 at 8:27 AM Leon Romanovsky <leon@kernel.org> wrote: > > From: Leon Romanovsky <leonro@nvidia.com> > > Add support for exporting PCI device MMIO regions through dma-buf, > enabling safe sharing of non-struct page memory with controlled > lifetime management. This allows RDMA and other subsystems to import > dma-buf FDs and build them into memory regions for PCI P2P operations. > > The implementation provides a revocable attachment mechanism using > dma-buf move operations. MMIO regions are normally pinned as BARs > don't change physical addresses, but access is revoked when the VFIO > device is closed or a PCI reset is issued. This ensures kernel > self-defense against potentially hostile userspace. > > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > --- > drivers/vfio/pci/Kconfig | 3 + > drivers/vfio/pci/Makefile | 2 + > drivers/vfio/pci/vfio_pci_config.c | 22 +- > drivers/vfio/pci/vfio_pci_core.c | 28 ++ > drivers/vfio/pci/vfio_pci_dmabuf.c | 446 +++++++++++++++++++++++++++++ > drivers/vfio/pci/vfio_pci_priv.h | 23 ++ > include/linux/vfio_pci_core.h | 1 + > include/uapi/linux/vfio.h | 25 ++ > 8 files changed, 546 insertions(+), 4 deletions(-) > create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c > > diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig > index 2b0172f54665..2b9fca00e9e8 100644 > --- a/drivers/vfio/pci/Kconfig > +++ b/drivers/vfio/pci/Kconfig > @@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM > > To enable s390x KVM vfio-pci extensions, say Y. > > +config VFIO_PCI_DMABUF > + def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER > + > source "drivers/vfio/pci/mlx5/Kconfig" > > source "drivers/vfio/pci/hisilicon/Kconfig" > diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile > index cf00c0a7e55c..f9155e9c5f63 100644 > --- a/drivers/vfio/pci/Makefile > +++ b/drivers/vfio/pci/Makefile > @@ -2,7 +2,9 @@ > > vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o > vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o > + > obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o > +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o > > vfio-pci-y := vfio_pci.o > vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o > diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c > index 8f02f236b5b4..1f6008eabf23 100644 > --- a/drivers/vfio/pci/vfio_pci_config.c > +++ b/drivers/vfio/pci/vfio_pci_config.c > @@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, > virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); > new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); > > - if (!new_mem) > + if (!new_mem) { > vfio_pci_zap_and_down_write_memory_lock(vdev); > - else > + vfio_pci_dma_buf_move(vdev, true); > + } else { > down_write(&vdev->memory_lock); > + } > > /* > * If the user is writing mem/io enable (new_mem/io) and we > @@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, > *virt_cmd &= cpu_to_le16(~mask); > *virt_cmd |= cpu_to_le16(new_cmd & mask); > > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > up_write(&vdev->memory_lock); > } > > @@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) > static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev, > pci_power_t state) > { > - if (state >= PCI_D3hot) > + if (state >= PCI_D3hot) { > vfio_pci_zap_and_down_write_memory_lock(vdev); > - else > + vfio_pci_dma_buf_move(vdev, true); > + } else { > down_write(&vdev->memory_lock); > + } > > vfio_pci_set_power_state(vdev, state); > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > up_write(&vdev->memory_lock); > } > > @@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, > > if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { > vfio_pci_zap_and_down_write_memory_lock(vdev); > + vfio_pci_dma_buf_move(vdev, true); > pci_try_reset_function(vdev->pdev); > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > up_write(&vdev->memory_lock); > } > } > @@ -982,7 +993,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, > > if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { > vfio_pci_zap_and_down_write_memory_lock(vdev); > + vfio_pci_dma_buf_move(vdev, true); > pci_try_reset_function(vdev->pdev); > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > up_write(&vdev->memory_lock); > } > } > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c > index fe247d0e2831..56b1320238a9 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -287,6 +287,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, > * semaphore. > */ > vfio_pci_zap_and_down_write_memory_lock(vdev); > + vfio_pci_dma_buf_move(vdev, true); > + > if (vdev->pm_runtime_engaged) { > up_write(&vdev->memory_lock); > return -EINVAL; > @@ -370,6 +372,8 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) > */ > down_write(&vdev->memory_lock); > __vfio_pci_runtime_pm_exit(vdev); > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > up_write(&vdev->memory_lock); > } > > @@ -690,6 +694,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) > #endif > vfio_pci_core_disable(vdev); > > + vfio_pci_dma_buf_cleanup(vdev); > + > mutex_lock(&vdev->igate); > if (vdev->err_trigger) { > eventfd_ctx_put(vdev->err_trigger); > @@ -1222,7 +1228,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, > */ > vfio_pci_set_power_state(vdev, PCI_D0); > > + vfio_pci_dma_buf_move(vdev, true); > ret = pci_try_reset_function(vdev->pdev); > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > up_write(&vdev->memory_lock); > > return ret; > @@ -1511,6 +1520,19 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, > return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); > case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: > return vfio_pci_core_feature_token(vdev, flags, arg, argsz); > + case VFIO_DEVICE_FEATURE_DMA_BUF: > + if (device->ops->ioctl != vfio_pci_core_ioctl) > + /* > + * Devices that overwrite general .ioctl() callback > + * usually do it to implement their own > + * VFIO_DEVICE_GET_REGION_INFO handlerm and they present > + * different BAR information from the real PCI. > + * > + * DMABUF relies on real PCI information. > + */ > + return -EOPNOTSUPP; > + > + return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); > default: > return -ENOTTY; > } > @@ -2095,6 +2117,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) > ret = pcim_p2pdma_init(vdev->pdev); > if (ret != -EOPNOTSUPP) > return ret; > + INIT_LIST_HEAD(&vdev->dmabufs); > init_rwsem(&vdev->memory_lock); > xa_init(&vdev->ctx); > > @@ -2459,6 +2482,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, > break; > } > > + vfio_pci_dma_buf_move(vdev, true); > vfio_pci_zap_bars(vdev); > } > > @@ -2482,6 +2506,10 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, > > ret = pci_reset_bus(pdev); > > + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > + > vdev = list_last_entry(&dev_set->device_list, > struct vfio_pci_core_device, vdev.dev_set_list); > > diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c > new file mode 100644 > index 000000000000..eaba010777f3 > --- /dev/null > +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c > @@ -0,0 +1,446 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. > + */ > +#include <linux/dma-buf.h> > +#include <linux/pci-p2pdma.h> > +#include <linux/dma-resv.h> > + > +#include "vfio_pci_priv.h" > + > +MODULE_IMPORT_NS("DMA_BUF"); > + > +struct vfio_pci_dma_buf { > + struct dma_buf *dmabuf; > + struct vfio_pci_core_device *vdev; > + struct list_head dmabufs_elm; > + size_t size; > + struct phys_vec *phys_vec; > + struct p2pdma_provider *provider; > + u32 nr_ranges; > + u8 revoked : 1; > +}; > + > +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, > + struct dma_buf_attachment *attachment) > +{ > + struct vfio_pci_dma_buf *priv = dmabuf->priv; > + > + if (!attachment->peer2peer) > + return -EOPNOTSUPP; > + > + if (priv->revoked) > + return -ENODEV; > + > + switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) { > + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: > + break; > + case PCI_P2PDMA_MAP_BUS_ADDR: > + /* > + * There is no need in IOVA at all for this flow. > + * We rely on attachment->priv == NULL as a marker > + * for this mode. > + */ > + return 0; > + default: > + return -EINVAL; > + } > + > + attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL); > + if (!attachment->priv) > + return -ENOMEM; > + > + dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->size); > + return 0; > +} > + > +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf, > + struct dma_buf_attachment *attachment) > +{ > + kfree(attachment->priv); > +} > + > +static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, u64 length, > + dma_addr_t addr) > +{ > + unsigned int len, nents; > + int i; > + > + nents = DIV_ROUND_UP(length, UINT_MAX); > + for (i = 0; i < nents; i++) { > + len = min_t(u64, length, UINT_MAX); > + length -= len; > + /* > + * Follow the DMABUF rules for scatterlist, the struct page can > + * be NULL'd for MMIO only memory. > + */ > + sg_set_page(sgl, NULL, len, 0); > + sg_dma_address(sgl) = addr + i * UINT_MAX; > + sg_dma_len(sgl) = len; > + sgl = sg_next(sgl); > + } > + > + return sgl; > +} > + > +static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv, > + struct dma_iova_state *state) > +{ > + struct phys_vec *phys_vec = priv->phys_vec; > + unsigned int nents = 0; > + u32 i; > + > + if (!state || !dma_use_iova(state)) > + for (i = 0; i < priv->nr_ranges; i++) > + nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX); > + else > + /* > + * In IOVA case, there is only one SG entry which spans > + * for whole IOVA address space, but we need to make sure > + * that it fits sg->length, maybe we need more. > + */ > + nents = DIV_ROUND_UP(priv->size, UINT_MAX); > + > + return nents; > +} > + > +static struct sg_table * > +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, > + enum dma_data_direction dir) > +{ > + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; > + struct dma_iova_state *state = attachment->priv; > + struct phys_vec *phys_vec = priv->phys_vec; > + unsigned long attrs = DMA_ATTR_MMIO; > + unsigned int nents, mapped_len = 0; > + struct scatterlist *sgl; > + struct sg_table *sgt; > + dma_addr_t addr; > + int ret; > + u32 i; > + > + dma_resv_assert_held(priv->dmabuf->resv); > + > + if (priv->revoked) > + return ERR_PTR(-ENODEV); > + > + sgt = kzalloc(sizeof(*sgt), GFP_KERNEL); > + if (!sgt) > + return ERR_PTR(-ENOMEM); > + > + nents = calc_sg_nents(priv, state); > + ret = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO); > + if (ret) > + goto err_kfree_sgt; > + > + sgl = sgt->sgl; > + > + for (i = 0; i < priv->nr_ranges; i++) { > + if (!state) { > + addr = pci_p2pdma_bus_addr_map(priv->provider, > + phys_vec[i].paddr); > + } else if (dma_use_iova(state)) { > + ret = dma_iova_link(attachment->dev, state, > + phys_vec[i].paddr, 0, > + phys_vec[i].len, dir, attrs); > + if (ret) > + goto err_unmap_dma; > + > + mapped_len += phys_vec[i].len; > + } else { > + addr = dma_map_phys(attachment->dev, phys_vec[i].paddr, > + phys_vec[i].len, dir, attrs); > + ret = dma_mapping_error(attachment->dev, addr); > + if (ret) > + goto err_unmap_dma; > + } > + > + if (!state || !dma_use_iova(state)) > + sgl = fill_sg_entry(sgl, phys_vec[i].len, addr); > + } > + > + if (state && dma_use_iova(state)) { > + WARN_ON_ONCE(mapped_len != priv->size); > + ret = dma_iova_sync(attachment->dev, state, 0, mapped_len); > + if (ret) > + goto err_unmap_dma; > + sgl = fill_sg_entry(sgl, mapped_len, state->addr); > + } > + > + /* > + * SGL must be NULL to indicate that SGL is the last one > + * and we allocated correct number of entries in sg_alloc_table() > + */ > + WARN_ON_ONCE(sgl); > + return sgt; > + > +err_unmap_dma: > + if (!i || !state) > + ; /* Do nothing */ > + else if (dma_use_iova(state)) > + dma_iova_destroy(attachment->dev, state, mapped_len, dir, > + attrs); > + else > + for_each_sgtable_dma_sg(sgt, sgl, i) > + dma_unmap_phys(attachment->dev, sg_dma_address(sgl), > + sg_dma_len(sgl), dir, attrs); > + sg_free_table(sgt); > +err_kfree_sgt: > + kfree(sgt); > + return ERR_PTR(ret); > +} > + > +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, > + struct sg_table *sgt, > + enum dma_data_direction dir) > +{ > + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; > + struct dma_iova_state *state = attachment->priv; > + unsigned long attrs = DMA_ATTR_MMIO; > + struct scatterlist *sgl; > + int i; > + > + if (!state) > + ; /* Do nothing */ > + else if (dma_use_iova(state)) > + dma_iova_destroy(attachment->dev, state, priv->size, dir, > + attrs); > + else > + for_each_sgtable_dma_sg(sgt, sgl, i) > + dma_unmap_phys(attachment->dev, sg_dma_address(sgl), > + sg_dma_len(sgl), dir, attrs); > + > + sg_free_table(sgt); > + kfree(sgt); > +} > + > +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) > +{ > + struct vfio_pci_dma_buf *priv = dmabuf->priv; > + > + /* > + * Either this or vfio_pci_dma_buf_cleanup() will remove from the list. > + * The refcount prevents both. > + */ > + if (priv->vdev) { > + down_write(&priv->vdev->memory_lock); > + list_del_init(&priv->dmabufs_elm); > + up_write(&priv->vdev->memory_lock); > + vfio_device_put_registration(&priv->vdev->vdev); > + } > + kfree(priv->phys_vec); > + kfree(priv); > +} > + > +static const struct dma_buf_ops vfio_pci_dmabuf_ops = { > + .attach = vfio_pci_dma_buf_attach, > + .detach = vfio_pci_dma_buf_detach, > + .map_dma_buf = vfio_pci_dma_buf_map, > + .release = vfio_pci_dma_buf_release, > + .unmap_dma_buf = vfio_pci_dma_buf_unmap, > +}; > + > +static void dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv, > + struct vfio_device_feature_dma_buf *dma_buf, > + struct vfio_region_dma_range *dma_ranges, > + struct p2pdma_provider *provider) > +{ > + struct pci_dev *pdev = priv->vdev->pdev; > + phys_addr_t pci_start; > + u32 i; > + > + pci_start = pci_resource_start(pdev, dma_buf->region_index); > + for (i = 0; i < dma_buf->nr_ranges; i++) { > + priv->phys_vec[i].len = dma_ranges[i].length; > + priv->phys_vec[i].paddr = pci_start + dma_ranges[i].offset; > + priv->size += priv->phys_vec[i].len; > + } > + priv->nr_ranges = dma_buf->nr_ranges; > + priv->provider = provider; > +} > + > +static int validate_dmabuf_input(struct vfio_pci_core_device *vdev, > + struct vfio_device_feature_dma_buf *dma_buf, > + struct vfio_region_dma_range *dma_ranges, > + struct p2pdma_provider **provider) > +{ > + struct pci_dev *pdev = vdev->pdev; > + u32 bar = dma_buf->region_index; > + resource_size_t bar_size; > + u64 length = 0, sum; > + u32 i; > + > + if (dma_buf->flags) > + return -EINVAL; > + /* > + * For PCI the region_index is the BAR number like everything else. > + */ > + if (bar >= VFIO_PCI_ROM_REGION_INDEX) > + return -ENODEV; > + > + *provider = pcim_p2pdma_provider(pdev, bar); > + if (!*provider) > + return -EINVAL; > + > + bar_size = pci_resource_len(pdev, bar); > + for (i = 0; i < dma_buf->nr_ranges; i++) { > + u64 offset = dma_ranges[i].offset; > + u64 len = dma_ranges[i].length; > + > + if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) > + return -EINVAL; > + > + if (check_add_overflow(offset, len, &sum) || sum > bar_size) > + return -EINVAL; > + > + /* Total requested length can't overflow IOVA size */ > + if (check_add_overflow(length, len, &sum)) > + return -EINVAL; > + > + length = sum; > + } > + > + /* > + * DMA API uses size_t, so make sure that requested region length > + * can fit into size_t variable, which can be unsigned int (32bits). > + * > + * In addition make sure that high bit of total length is not used too > + * as it is used as a marker for DMA IOVA API. > + */ > + if (overflows_type(length, size_t) || length & DMA_IOVA_USE_SWIOTLB) > + return -EINVAL; > + > + return 0; > +} > + > +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, > + struct vfio_device_feature_dma_buf __user *arg, > + size_t argsz) > +{ > + struct vfio_device_feature_dma_buf get_dma_buf = {}; > + struct vfio_region_dma_range *dma_ranges; > + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); > + struct p2pdma_provider *provider; > + struct vfio_pci_dma_buf *priv; > + int ret; > + > + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, > + sizeof(get_dma_buf)); > + if (ret != 1) > + return ret; > + > + if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf))) > + return -EFAULT; > + > + if (!get_dma_buf.nr_ranges) > + return -EINVAL; > + > + dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges, > + sizeof(*dma_ranges)); > + if (IS_ERR(dma_ranges)) > + return PTR_ERR(dma_ranges); > + > + ret = validate_dmabuf_input(vdev, &get_dma_buf, dma_ranges, &provider); > + if (ret) > + goto err_free_ranges; > + > + priv = kzalloc(sizeof(*priv), GFP_KERNEL); > + if (!priv) { > + ret = -ENOMEM; > + goto err_free_ranges; > + } > + priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec), > + GFP_KERNEL); > + if (!priv->phys_vec) { > + ret = -ENOMEM; > + goto err_free_priv; > + } > + > + priv->vdev = vdev; > + dma_ranges_to_p2p_phys(priv, &get_dma_buf, dma_ranges, provider); > + kfree(dma_ranges); > + dma_ranges = NULL; > + > + if (!vfio_device_try_get_registration(&vdev->vdev)) { > + ret = -ENODEV; > + goto err_free_phys; > + } > + > + exp_info.ops = &vfio_pci_dmabuf_ops; > + exp_info.size = priv->size; > + exp_info.flags = get_dma_buf.open_flags; > + exp_info.priv = priv; > + > + priv->dmabuf = dma_buf_export(&exp_info); > + if (IS_ERR(priv->dmabuf)) { > + ret = PTR_ERR(priv->dmabuf); > + goto err_dev_put; > + } > + > + /* dma_buf_put() now frees priv */ > + INIT_LIST_HEAD(&priv->dmabufs_elm); > + down_write(&vdev->memory_lock); > + dma_resv_lock(priv->dmabuf->resv, NULL); > + priv->revoked = !__vfio_pci_memory_enabled(vdev); > + list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs); > + dma_resv_unlock(priv->dmabuf->resv); > + up_write(&vdev->memory_lock); > + > + /* > + * dma_buf_fd() consumes the reference, when the file closes the dmabuf > + * will be released. > + */ > + return dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags); > + > +err_dev_put: > + vfio_device_put_registration(&vdev->vdev); > +err_free_phys: > + kfree(priv->phys_vec); > +err_free_priv: > + kfree(priv); > +err_free_ranges: > + kfree(dma_ranges); > + return ret; > +} > + > +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) > +{ > + struct vfio_pci_dma_buf *priv; > + struct vfio_pci_dma_buf *tmp; > + > + lockdep_assert_held_write(&vdev->memory_lock); > + > + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { > + if (!get_file_active(&priv->dmabuf->file)) > + continue; > + > + if (priv->revoked != revoked) { > + dma_resv_lock(priv->dmabuf->resv, NULL); > + priv->revoked = revoked; > + dma_buf_move_notify(priv->dmabuf); I think this should only be called when revoked is true, otherwise this will be calling move_notify on the already revoked dmabuf attachments. > + dma_resv_unlock(priv->dmabuf->resv); > + } > + dma_buf_put(priv->dmabuf); > + } > +} > + > +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) > +{ > + struct vfio_pci_dma_buf *priv; > + struct vfio_pci_dma_buf *tmp; > + > + down_write(&vdev->memory_lock); > + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { > + if (!get_file_active(&priv->dmabuf->file)) > + continue; > + > + dma_resv_lock(priv->dmabuf->resv, NULL); > + list_del_init(&priv->dmabufs_elm); > + priv->vdev = NULL; > + priv->revoked = true; > + dma_buf_move_notify(priv->dmabuf); > + dma_resv_unlock(priv->dmabuf->resv); > + vfio_device_put_registration(&vdev->vdev); > + dma_buf_put(priv->dmabuf); > + } > + up_write(&vdev->memory_lock); > +} > diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h > index a9972eacb293..28a405f8b97c 100644 > --- a/drivers/vfio/pci/vfio_pci_priv.h > +++ b/drivers/vfio/pci/vfio_pci_priv.h > @@ -107,4 +107,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) > return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; > } > > +#ifdef CONFIG_VFIO_PCI_DMABUF > +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, > + struct vfio_device_feature_dma_buf __user *arg, > + size_t argsz); > +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev); > +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); > +#else > +static inline int > +vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, > + struct vfio_device_feature_dma_buf __user *arg, > + size_t argsz) > +{ > + return -ENOTTY; > +} > +static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) > +{ > +} > +static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, > + bool revoked) > +{ > +} > +#endif > + > #endif > diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h > index f541044e42a2..30d74b364f25 100644 > --- a/include/linux/vfio_pci_core.h > +++ b/include/linux/vfio_pci_core.h > @@ -94,6 +94,7 @@ struct vfio_pci_core_device { > struct vfio_pci_core_device *sriov_pf_core_dev; > struct notifier_block nb; > struct rw_semaphore memory_lock; > + struct list_head dmabufs; > }; > > /* Will be exported for vfio pci drivers usage */ > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index 75100bf009ba..63214467c875 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -1478,6 +1478,31 @@ struct vfio_device_feature_bus_master { > }; > #define VFIO_DEVICE_FEATURE_BUS_MASTER 10 > > +/** > + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the > + * regions selected. > + * > + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC, > + * etc. offset/length specify a slice of the region to create the dmabuf from. > + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf. > + * > + * Return: The fd number on success, -1 and errno is set on failure. > + */ > +#define VFIO_DEVICE_FEATURE_DMA_BUF 11 > + > +struct vfio_region_dma_range { > + __u64 offset; > + __u64 length; > +}; > + > +struct vfio_device_feature_dma_buf { > + __u32 region_index; > + __u32 open_flags; > + __u32 flags; > + __u32 nr_ranges; > + struct vfio_region_dma_range dma_ranges[]; > +}; > + > /* -------- API for Type1 VFIO IOMMU -------- */ > > /** > -- > 2.51.0 > >
On Wed, Oct 29, 2025 at 05:25:03PM -0700, Samiullah Khawaja wrote: > On Mon, Oct 13, 2025 at 8:27 AM Leon Romanovsky <leon@kernel.org> wrote: > > > > From: Leon Romanovsky <leonro@nvidia.com> > > > > Add support for exporting PCI device MMIO regions through dma-buf, > > enabling safe sharing of non-struct page memory with controlled > > lifetime management. This allows RDMA and other subsystems to import > > dma-buf FDs and build them into memory regions for PCI P2P operations. > > > > The implementation provides a revocable attachment mechanism using > > dma-buf move operations. MMIO regions are normally pinned as BARs > > don't change physical addresses, but access is revoked when the VFIO > > device is closed or a PCI reset is issued. This ensures kernel > > self-defense against potentially hostile userspace. > > > > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> > > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com> > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > > --- > > drivers/vfio/pci/Kconfig | 3 + > > drivers/vfio/pci/Makefile | 2 + > > drivers/vfio/pci/vfio_pci_config.c | 22 +- > > drivers/vfio/pci/vfio_pci_core.c | 28 ++ > > drivers/vfio/pci/vfio_pci_dmabuf.c | 446 +++++++++++++++++++++++++++++ > > drivers/vfio/pci/vfio_pci_priv.h | 23 ++ > > include/linux/vfio_pci_core.h | 1 + > > include/uapi/linux/vfio.h | 25 ++ > > 8 files changed, 546 insertions(+), 4 deletions(-) > > create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c <...> > > +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) > > +{ > > + struct vfio_pci_dma_buf *priv; > > + struct vfio_pci_dma_buf *tmp; > > + > > + lockdep_assert_held_write(&vdev->memory_lock); > > + > > + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { > > + if (!get_file_active(&priv->dmabuf->file)) > > + continue; > > + > > + if (priv->revoked != revoked) { > > + dma_resv_lock(priv->dmabuf->resv, NULL); > > + priv->revoked = revoked; > > + dma_buf_move_notify(priv->dmabuf); > > I think this should only be called when revoked is true, otherwise > this will be calling move_notify on the already revoked dmabuf > attachments. This case is protected by "if (priv->revoked)" check both in vfio_pci_dma_buf_map and vfio_pci_dma_buf_attach. They will prevent DMABUF recreation if revoked is false. VTW, please trim your replies, it is time consuming to find your reply among 600 lines of unrelated text. Thanks > > + dma_resv_unlock(priv->dmabuf->resv); > > + } > > + dma_buf_put(priv->dmabuf); > > + } > > +}
On Thu, Oct 30, 2025 at 08:48:18AM +0200, Leon Romanovsky wrote: > > > +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) > > > +{ > > > + struct vfio_pci_dma_buf *priv; > > > + struct vfio_pci_dma_buf *tmp; > > > + > > > + lockdep_assert_held_write(&vdev->memory_lock); > > > + > > > + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { > > > + if (!get_file_active(&priv->dmabuf->file)) > > > + continue; > > > + > > > + if (priv->revoked != revoked) { > > > + dma_resv_lock(priv->dmabuf->resv, NULL); > > > + priv->revoked = revoked; > > > + dma_buf_move_notify(priv->dmabuf); > > > > I think this should only be called when revoked is true, otherwise > > this will be calling move_notify on the already revoked dmabuf > > attachments. > > This case is protected by "if (priv->revoked)" check both in > vfio_pci_dma_buf_map and vfio_pci_dma_buf_attach. They will prevent > DMABUF recreation if revoked is false. The point was to call it when revoked becomes false as well, as that gives the importing driver an opportunity to restore any mapping it had. Jason
On Mon, 13 Oct 2025 18:26:11 +0300 Leon Romanovsky <leon@kernel.org> wrote: > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c > index fe247d0e2831..56b1320238a9 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -1511,6 +1520,19 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, > return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); > case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: > return vfio_pci_core_feature_token(vdev, flags, arg, argsz); > + case VFIO_DEVICE_FEATURE_DMA_BUF: > + if (device->ops->ioctl != vfio_pci_core_ioctl) > + /* > + * Devices that overwrite general .ioctl() callback > + * usually do it to implement their own > + * VFIO_DEVICE_GET_REGION_INFO handlerm and they present Typo, "handlerm" > + * different BAR information from the real PCI. > + * > + * DMABUF relies on real PCI information. > + */ > + return -EOPNOTSUPP; > + > + return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); > default: > return -ENOTTY; > } ... > @@ -2459,6 +2482,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, > break; > } > > + vfio_pci_dma_buf_move(vdev, true); > vfio_pci_zap_bars(vdev); > } > > @@ -2482,6 +2506,10 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, > > ret = pci_reset_bus(pdev); > > + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) > + if (__vfio_pci_memory_enabled(vdev)) > + vfio_pci_dma_buf_move(vdev, false); > + > vdev = list_last_entry(&dev_set->device_list, > struct vfio_pci_core_device, vdev.dev_set_list); > This needs to be placed in the existing undo loop with the up_write(), otherwise it can be missed in the error case. > diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c > new file mode 100644 > index 000000000000..eaba010777f3 > --- /dev/null > +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c > +static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv, > + struct dma_iova_state *state) > +{ > + struct phys_vec *phys_vec = priv->phys_vec; > + unsigned int nents = 0; > + u32 i; > + > + if (!state || !dma_use_iova(state)) > + for (i = 0; i < priv->nr_ranges; i++) > + nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX); > + else > + /* > + * In IOVA case, there is only one SG entry which spans > + * for whole IOVA address space, but we need to make sure > + * that it fits sg->length, maybe we need more. > + */ > + nents = DIV_ROUND_UP(priv->size, UINT_MAX); I think we're arguably running afoul of the coding style standard here that this is not a single simple statement and should use braces. > + > + return nents; > +} > + > +static struct sg_table * > +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, > + enum dma_data_direction dir) > +{ > + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; > + struct dma_iova_state *state = attachment->priv; > + struct phys_vec *phys_vec = priv->phys_vec; > + unsigned long attrs = DMA_ATTR_MMIO; > + unsigned int nents, mapped_len = 0; > + struct scatterlist *sgl; > + struct sg_table *sgt; > + dma_addr_t addr; > + int ret; > + u32 i; > + > + dma_resv_assert_held(priv->dmabuf->resv); > + > + if (priv->revoked) > + return ERR_PTR(-ENODEV); > + > + sgt = kzalloc(sizeof(*sgt), GFP_KERNEL); > + if (!sgt) > + return ERR_PTR(-ENOMEM); > + > + nents = calc_sg_nents(priv, state); > + ret = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO); > + if (ret) > + goto err_kfree_sgt; > + > + sgl = sgt->sgl; > + > + for (i = 0; i < priv->nr_ranges; i++) { > + if (!state) { > + addr = pci_p2pdma_bus_addr_map(priv->provider, > + phys_vec[i].paddr); > + } else if (dma_use_iova(state)) { > + ret = dma_iova_link(attachment->dev, state, > + phys_vec[i].paddr, 0, > + phys_vec[i].len, dir, attrs); > + if (ret) > + goto err_unmap_dma; > + > + mapped_len += phys_vec[i].len; > + } else { > + addr = dma_map_phys(attachment->dev, phys_vec[i].paddr, > + phys_vec[i].len, dir, attrs); > + ret = dma_mapping_error(attachment->dev, addr); > + if (ret) > + goto err_unmap_dma; > + } > + > + if (!state || !dma_use_iova(state)) > + sgl = fill_sg_entry(sgl, phys_vec[i].len, addr); > + } > + > + if (state && dma_use_iova(state)) { > + WARN_ON_ONCE(mapped_len != priv->size); > + ret = dma_iova_sync(attachment->dev, state, 0, mapped_len); > + if (ret) > + goto err_unmap_dma; > + sgl = fill_sg_entry(sgl, mapped_len, state->addr); > + } > + > + /* > + * SGL must be NULL to indicate that SGL is the last one > + * and we allocated correct number of entries in sg_alloc_table() > + */ > + WARN_ON_ONCE(sgl); > + return sgt; > + > +err_unmap_dma: > + if (!i || !state) > + ; /* Do nothing */ > + else if (dma_use_iova(state)) > + dma_iova_destroy(attachment->dev, state, mapped_len, dir, > + attrs); > + else > + for_each_sgtable_dma_sg(sgt, sgl, i) > + dma_unmap_phys(attachment->dev, sg_dma_address(sgl), > + sg_dma_len(sgl), dir, attrs); Same, here for braces. > + sg_free_table(sgt); > +err_kfree_sgt: > + kfree(sgt); > + return ERR_PTR(ret); > +} > + > +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, > + struct sg_table *sgt, > + enum dma_data_direction dir) > +{ > + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; > + struct dma_iova_state *state = attachment->priv; > + unsigned long attrs = DMA_ATTR_MMIO; > + struct scatterlist *sgl; > + int i; > + > + if (!state) > + ; /* Do nothing */ > + else if (dma_use_iova(state)) > + dma_iova_destroy(attachment->dev, state, priv->size, dir, > + attrs); > + else > + for_each_sgtable_dma_sg(sgt, sgl, i) > + dma_unmap_phys(attachment->dev, sg_dma_address(sgl), > + sg_dma_len(sgl), dir, attrs); > + Here too. > + sg_free_table(sgt); > + kfree(sgt); > +} ... > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index 75100bf009ba..63214467c875 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -1478,6 +1478,31 @@ struct vfio_device_feature_bus_master { > }; > #define VFIO_DEVICE_FEATURE_BUS_MASTER 10 > > +/** > + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the > + * regions selected. > + * > + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC, > + * etc. offset/length specify a slice of the region to create the dmabuf from. > + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf. > + * Probably worth noting that .flags should be zero, I see we enforce that. Thanks, Alex > + * Return: The fd number on success, -1 and errno is set on failure. > + */ > +#define VFIO_DEVICE_FEATURE_DMA_BUF 11 > + > +struct vfio_region_dma_range { > + __u64 offset; > + __u64 length; > +}; > + > +struct vfio_device_feature_dma_buf { > + __u32 region_index; > + __u32 open_flags; > + __u32 flags; > + __u32 nr_ranges; > + struct vfio_region_dma_range dma_ranges[]; > +}; > + > /* -------- API for Type1 VFIO IOMMU -------- */ > > /**
> On Mon, Oct 27, 2025 at 04:13:05PM -0700, David Matlack wrote: > > On Mon, Oct 13, 2025 at 8:44 AM Leon Romanovsky <leon@kernel.org> wrote: > > This uAPI would be a good candidate for a VFIO selftest. You can test > > that it returns an error when it's supposed to, and a valid fd when > > it's supposed to. And once the iommufd importer side is ready, we can > > extend the test and verify that the fd can be mapped into iommufd. > > No problem, I'll add such test, but let's focus on making sure that this > series is accepted first. Along similar lines, perhaps adding vfio as src/vfio_memory.c dmabuf provider to perftest [1] could be useful for low-level end-to-end validation. https://github.com/linux-rdma/perftest
On Thu, Oct 30, 2025 at 02:38:36PM -0600, Alex Williamson wrote: > On Mon, 13 Oct 2025 18:26:11 +0300 > Leon Romanovsky <leon@kernel.org> wrote: > > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c > > index fe247d0e2831..56b1320238a9 100644 > > --- a/drivers/vfio/pci/vfio_pci_core.c > > +++ b/drivers/vfio/pci/vfio_pci_core.c > > @@ -1511,6 +1520,19 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, > > return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); > > case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: > > return vfio_pci_core_feature_token(vdev, flags, arg, argsz); > > + case VFIO_DEVICE_FEATURE_DMA_BUF: > > + if (device->ops->ioctl != vfio_pci_core_ioctl) > > + /* > > + * Devices that overwrite general .ioctl() callback > > + * usually do it to implement their own > > + * VFIO_DEVICE_GET_REGION_INFO handlerm and they present > > Typo, "handlerm" Thanks, this part of code is going to be different in v6. > <...> > > @@ -2482,6 +2506,10 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, > > > > ret = pci_reset_bus(pdev); > > > > + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) > > + if (__vfio_pci_memory_enabled(vdev)) > > + vfio_pci_dma_buf_move(vdev, false); > > + > > vdev = list_last_entry(&dev_set->device_list, > > struct vfio_pci_core_device, vdev.dev_set_list); > > > > This needs to be placed in the existing undo loop with the up_write(), > otherwise it can be missed in the error case. I'll move, but it caused me to wonder what did you want to achieve with this "vdev = list_last_entry ..." line? vdev is overwritten immediately after that line. > > > diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c > > new file mode 100644 > > index 000000000000..eaba010777f3 > > --- /dev/null > > +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c > > +static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv, > > + struct dma_iova_state *state) > > +{ > > + struct phys_vec *phys_vec = priv->phys_vec; > > + unsigned int nents = 0; > > + u32 i; > > + > > + if (!state || !dma_use_iova(state)) > > + for (i = 0; i < priv->nr_ranges; i++) > > + nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX); > > + else > > + /* > > + * In IOVA case, there is only one SG entry which spans > > + * for whole IOVA address space, but we need to make sure > > + * that it fits sg->length, maybe we need more. > > + */ > > + nents = DIV_ROUND_UP(priv->size, UINT_MAX); > > I think we're arguably running afoul of the coding style standard here > that this is not a single simple statement and should use braces. > <...> > > +err_unmap_dma: > > + if (!i || !state) > > + ; /* Do nothing */ > > + else if (dma_use_iova(state)) > > + dma_iova_destroy(attachment->dev, state, mapped_len, dir, > > + attrs); > > + else > > + for_each_sgtable_dma_sg(sgt, sgl, i) > > + dma_unmap_phys(attachment->dev, sg_dma_address(sgl), > > + sg_dma_len(sgl), dir, attrs); > > Same, here for braces. > <...> > > + if (!state) > > + ; /* Do nothing */ > > + else if (dma_use_iova(state)) > > + dma_iova_destroy(attachment->dev, state, priv->size, dir, > > + attrs); > > + else > > + for_each_sgtable_dma_sg(sgt, sgl, i) > > + dma_unmap_phys(attachment->dev, sg_dma_address(sgl), > > + sg_dma_len(sgl), dir, attrs); > > + > > Here too. I will change it, but it is worth to admit that I'm consistent in my coding style. > > > + sg_free_table(sgt); > > + kfree(sgt); > > +} > ... > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > index 75100bf009ba..63214467c875 100644 > > --- a/include/uapi/linux/vfio.h > > +++ b/include/uapi/linux/vfio.h > > @@ -1478,6 +1478,31 @@ struct vfio_device_feature_bus_master { > > }; > > #define VFIO_DEVICE_FEATURE_BUS_MASTER 10 > > > > +/** > > + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the > > + * regions selected. > > + * > > + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC, > > + * etc. offset/length specify a slice of the region to create the dmabuf from. > > + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf. > > + * > > Probably worth noting that .flags should be zero, I see we enforce > that. Thanks, Added, thanks > > Alex >
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2b0172f54665..2b9fca00e9e8 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM To enable s390x KVM vfio-pci extensions, say Y. +config VFIO_PCI_DMABUF + def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER + source "drivers/vfio/pci/mlx5/Kconfig" source "drivers/vfio/pci/hisilicon/Kconfig" diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index cf00c0a7e55c..f9155e9c5f63 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,7 +2,9 @@ vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o + obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o vfio-pci-y := vfio_pci.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 8f02f236b5b4..1f6008eabf23 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); - if (!new_mem) + if (!new_mem) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } /* * If the user is writing mem/io enable (new_mem/io) and we @@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, *virt_cmd &= cpu_to_le16(~mask); *virt_cmd |= cpu_to_le16(new_cmd & mask); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state) { - if (state >= PCI_D3hot) + if (state >= PCI_D3hot) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } vfio_pci_set_power_state(vdev, state); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } @@ -982,7 +993,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index fe247d0e2831..56b1320238a9 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -287,6 +287,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, * semaphore. */ vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); + if (vdev->pm_runtime_engaged) { up_write(&vdev->memory_lock); return -EINVAL; @@ -370,6 +372,8 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) */ down_write(&vdev->memory_lock); __vfio_pci_runtime_pm_exit(vdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -690,6 +694,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) #endif vfio_pci_core_disable(vdev); + vfio_pci_dma_buf_cleanup(vdev); + mutex_lock(&vdev->igate); if (vdev->err_trigger) { eventfd_ctx_put(vdev->err_trigger); @@ -1222,7 +1228,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, */ vfio_pci_set_power_state(vdev, PCI_D0); + vfio_pci_dma_buf_move(vdev, true); ret = pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); return ret; @@ -1511,6 +1520,19 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: return vfio_pci_core_feature_token(vdev, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_DMA_BUF: + if (device->ops->ioctl != vfio_pci_core_ioctl) + /* + * Devices that overwrite general .ioctl() callback + * usually do it to implement their own + * VFIO_DEVICE_GET_REGION_INFO handlerm and they present + * different BAR information from the real PCI. + * + * DMABUF relies on real PCI information. + */ + return -EOPNOTSUPP; + + return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); default: return -ENOTTY; } @@ -2095,6 +2117,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) ret = pcim_p2pdma_init(vdev->pdev); if (ret != -EOPNOTSUPP) return ret; + INIT_LIST_HEAD(&vdev->dmabufs); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2459,6 +2482,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, break; } + vfio_pci_dma_buf_move(vdev, true); vfio_pci_zap_bars(vdev); } @@ -2482,6 +2506,10 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, ret = pci_reset_bus(pdev); + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); + vdev = list_last_entry(&dev_set->device_list, struct vfio_pci_core_device, vdev.dev_set_list); diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c new file mode 100644 index 000000000000..eaba010777f3 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + */ +#include <linux/dma-buf.h> +#include <linux/pci-p2pdma.h> +#include <linux/dma-resv.h> + +#include "vfio_pci_priv.h" + +MODULE_IMPORT_NS("DMA_BUF"); + +struct vfio_pci_dma_buf { + struct dma_buf *dmabuf; + struct vfio_pci_core_device *vdev; + struct list_head dmabufs_elm; + size_t size; + struct phys_vec *phys_vec; + struct p2pdma_provider *provider; + u32 nr_ranges; + u8 revoked : 1; +}; + +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + if (!attachment->peer2peer) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) { + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + /* + * There is no need in IOVA at all for this flow. + * We rely on attachment->priv == NULL as a marker + * for this mode. + */ + return 0; + default: + return -EINVAL; + } + + attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL); + if (!attachment->priv) + return -ENOMEM; + + dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->size); + return 0; +} + +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + kfree(attachment->priv); +} + +static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, u64 length, + dma_addr_t addr) +{ + unsigned int len, nents; + int i; + + nents = DIV_ROUND_UP(length, UINT_MAX); + for (i = 0; i < nents; i++) { + len = min_t(u64, length, UINT_MAX); + length -= len; + /* + * Follow the DMABUF rules for scatterlist, the struct page can + * be NULL'd for MMIO only memory. + */ + sg_set_page(sgl, NULL, len, 0); + sg_dma_address(sgl) = addr + i * UINT_MAX; + sg_dma_len(sgl) = len; + sgl = sg_next(sgl); + } + + return sgl; +} + +static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv, + struct dma_iova_state *state) +{ + struct phys_vec *phys_vec = priv->phys_vec; + unsigned int nents = 0; + u32 i; + + if (!state || !dma_use_iova(state)) + for (i = 0; i < priv->nr_ranges; i++) + nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX); + else + /* + * In IOVA case, there is only one SG entry which spans + * for whole IOVA address space, but we need to make sure + * that it fits sg->length, maybe we need more. + */ + nents = DIV_ROUND_UP(priv->size, UINT_MAX); + + return nents; +} + +static struct sg_table * +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; + struct dma_iova_state *state = attachment->priv; + struct phys_vec *phys_vec = priv->phys_vec; + unsigned long attrs = DMA_ATTR_MMIO; + unsigned int nents, mapped_len = 0; + struct scatterlist *sgl; + struct sg_table *sgt; + dma_addr_t addr; + int ret; + u32 i; + + dma_resv_assert_held(priv->dmabuf->resv); + + if (priv->revoked) + return ERR_PTR(-ENODEV); + + sgt = kzalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) + return ERR_PTR(-ENOMEM); + + nents = calc_sg_nents(priv, state); + ret = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO); + if (ret) + goto err_kfree_sgt; + + sgl = sgt->sgl; + + for (i = 0; i < priv->nr_ranges; i++) { + if (!state) { + addr = pci_p2pdma_bus_addr_map(priv->provider, + phys_vec[i].paddr); + } else if (dma_use_iova(state)) { + ret = dma_iova_link(attachment->dev, state, + phys_vec[i].paddr, 0, + phys_vec[i].len, dir, attrs); + if (ret) + goto err_unmap_dma; + + mapped_len += phys_vec[i].len; + } else { + addr = dma_map_phys(attachment->dev, phys_vec[i].paddr, + phys_vec[i].len, dir, attrs); + ret = dma_mapping_error(attachment->dev, addr); + if (ret) + goto err_unmap_dma; + } + + if (!state || !dma_use_iova(state)) + sgl = fill_sg_entry(sgl, phys_vec[i].len, addr); + } + + if (state && dma_use_iova(state)) { + WARN_ON_ONCE(mapped_len != priv->size); + ret = dma_iova_sync(attachment->dev, state, 0, mapped_len); + if (ret) + goto err_unmap_dma; + sgl = fill_sg_entry(sgl, mapped_len, state->addr); + } + + /* + * SGL must be NULL to indicate that SGL is the last one + * and we allocated correct number of entries in sg_alloc_table() + */ + WARN_ON_ONCE(sgl); + return sgt; + +err_unmap_dma: + if (!i || !state) + ; /* Do nothing */ + else if (dma_use_iova(state)) + dma_iova_destroy(attachment->dev, state, mapped_len, dir, + attrs); + else + for_each_sgtable_dma_sg(sgt, sgl, i) + dma_unmap_phys(attachment->dev, sg_dma_address(sgl), + sg_dma_len(sgl), dir, attrs); + sg_free_table(sgt); +err_kfree_sgt: + kfree(sgt); + return ERR_PTR(ret); +} + +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; + struct dma_iova_state *state = attachment->priv; + unsigned long attrs = DMA_ATTR_MMIO; + struct scatterlist *sgl; + int i; + + if (!state) + ; /* Do nothing */ + else if (dma_use_iova(state)) + dma_iova_destroy(attachment->dev, state, priv->size, dir, + attrs); + else + for_each_sgtable_dma_sg(sgt, sgl, i) + dma_unmap_phys(attachment->dev, sg_dma_address(sgl), + sg_dma_len(sgl), dir, attrs); + + sg_free_table(sgt); + kfree(sgt); +} + +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + /* + * Either this or vfio_pci_dma_buf_cleanup() will remove from the list. + * The refcount prevents both. + */ + if (priv->vdev) { + down_write(&priv->vdev->memory_lock); + list_del_init(&priv->dmabufs_elm); + up_write(&priv->vdev->memory_lock); + vfio_device_put_registration(&priv->vdev->vdev); + } + kfree(priv->phys_vec); + kfree(priv); +} + +static const struct dma_buf_ops vfio_pci_dmabuf_ops = { + .attach = vfio_pci_dma_buf_attach, + .detach = vfio_pci_dma_buf_detach, + .map_dma_buf = vfio_pci_dma_buf_map, + .release = vfio_pci_dma_buf_release, + .unmap_dma_buf = vfio_pci_dma_buf_unmap, +}; + +static void dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv, + struct vfio_device_feature_dma_buf *dma_buf, + struct vfio_region_dma_range *dma_ranges, + struct p2pdma_provider *provider) +{ + struct pci_dev *pdev = priv->vdev->pdev; + phys_addr_t pci_start; + u32 i; + + pci_start = pci_resource_start(pdev, dma_buf->region_index); + for (i = 0; i < dma_buf->nr_ranges; i++) { + priv->phys_vec[i].len = dma_ranges[i].length; + priv->phys_vec[i].paddr = pci_start + dma_ranges[i].offset; + priv->size += priv->phys_vec[i].len; + } + priv->nr_ranges = dma_buf->nr_ranges; + priv->provider = provider; +} + +static int validate_dmabuf_input(struct vfio_pci_core_device *vdev, + struct vfio_device_feature_dma_buf *dma_buf, + struct vfio_region_dma_range *dma_ranges, + struct p2pdma_provider **provider) +{ + struct pci_dev *pdev = vdev->pdev; + u32 bar = dma_buf->region_index; + resource_size_t bar_size; + u64 length = 0, sum; + u32 i; + + if (dma_buf->flags) + return -EINVAL; + /* + * For PCI the region_index is the BAR number like everything else. + */ + if (bar >= VFIO_PCI_ROM_REGION_INDEX) + return -ENODEV; + + *provider = pcim_p2pdma_provider(pdev, bar); + if (!*provider) + return -EINVAL; + + bar_size = pci_resource_len(pdev, bar); + for (i = 0; i < dma_buf->nr_ranges; i++) { + u64 offset = dma_ranges[i].offset; + u64 len = dma_ranges[i].length; + + if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) + return -EINVAL; + + if (check_add_overflow(offset, len, &sum) || sum > bar_size) + return -EINVAL; + + /* Total requested length can't overflow IOVA size */ + if (check_add_overflow(length, len, &sum)) + return -EINVAL; + + length = sum; + } + + /* + * DMA API uses size_t, so make sure that requested region length + * can fit into size_t variable, which can be unsigned int (32bits). + * + * In addition make sure that high bit of total length is not used too + * as it is used as a marker for DMA IOVA API. + */ + if (overflows_type(length, size_t) || length & DMA_IOVA_USE_SWIOTLB) + return -EINVAL; + + return 0; +} + +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + struct vfio_device_feature_dma_buf get_dma_buf = {}; + struct vfio_region_dma_range *dma_ranges; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct p2pdma_provider *provider; + struct vfio_pci_dma_buf *priv; + int ret; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(get_dma_buf)); + if (ret != 1) + return ret; + + if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf))) + return -EFAULT; + + if (!get_dma_buf.nr_ranges) + return -EINVAL; + + dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges, + sizeof(*dma_ranges)); + if (IS_ERR(dma_ranges)) + return PTR_ERR(dma_ranges); + + ret = validate_dmabuf_input(vdev, &get_dma_buf, dma_ranges, &provider); + if (ret) + goto err_free_ranges; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + goto err_free_ranges; + } + priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec), + GFP_KERNEL); + if (!priv->phys_vec) { + ret = -ENOMEM; + goto err_free_priv; + } + + priv->vdev = vdev; + dma_ranges_to_p2p_phys(priv, &get_dma_buf, dma_ranges, provider); + kfree(dma_ranges); + dma_ranges = NULL; + + if (!vfio_device_try_get_registration(&vdev->vdev)) { + ret = -ENODEV; + goto err_free_phys; + } + + exp_info.ops = &vfio_pci_dmabuf_ops; + exp_info.size = priv->size; + exp_info.flags = get_dma_buf.open_flags; + exp_info.priv = priv; + + priv->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(priv->dmabuf)) { + ret = PTR_ERR(priv->dmabuf); + goto err_dev_put; + } + + /* dma_buf_put() now frees priv */ + INIT_LIST_HEAD(&priv->dmabufs_elm); + down_write(&vdev->memory_lock); + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = !__vfio_pci_memory_enabled(vdev); + list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs); + dma_resv_unlock(priv->dmabuf->resv); + up_write(&vdev->memory_lock); + + /* + * dma_buf_fd() consumes the reference, when the file closes the dmabuf + * will be released. + */ + return dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags); + +err_dev_put: + vfio_device_put_registration(&vdev->vdev); +err_free_phys: + kfree(priv->phys_vec); +err_free_priv: + kfree(priv); +err_free_ranges: + kfree(dma_ranges); + return ret; +} + +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + lockdep_assert_held_write(&vdev->memory_lock); + + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + if (priv->revoked != revoked) { + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = revoked; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + } + dma_buf_put(priv->dmabuf); + } +} + +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + down_write(&vdev->memory_lock); + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + dma_resv_lock(priv->dmabuf->resv, NULL); + list_del_init(&priv->dmabufs_elm); + priv->vdev = NULL; + priv->revoked = true; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + vfio_device_put_registration(&vdev->vdev); + dma_buf_put(priv->dmabuf); + } + up_write(&vdev->memory_lock); +} diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index a9972eacb293..28a405f8b97c 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -107,4 +107,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; } +#ifdef CONFIG_VFIO_PCI_DMABUF +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz); +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev); +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); +#else +static inline int +vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + return -ENOTTY; +} +static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ +} +static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, + bool revoked) +{ +} +#endif + #endif diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index f541044e42a2..30d74b364f25 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -94,6 +94,7 @@ struct vfio_pci_core_device { struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; struct rw_semaphore memory_lock; + struct list_head dmabufs; }; /* Will be exported for vfio pci drivers usage */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 75100bf009ba..63214467c875 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1478,6 +1478,31 @@ struct vfio_device_feature_bus_master { }; #define VFIO_DEVICE_FEATURE_BUS_MASTER 10 +/** + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the + * regions selected. + * + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC, + * etc. offset/length specify a slice of the region to create the dmabuf from. + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf. + * + * Return: The fd number on success, -1 and errno is set on failure. + */ +#define VFIO_DEVICE_FEATURE_DMA_BUF 11 + +struct vfio_region_dma_range { + __u64 offset; + __u64 length; +}; + +struct vfio_device_feature_dma_buf { + __u32 region_index; + __u32 open_flags; + __u32 flags; + __u32 nr_ranges; + struct vfio_region_dma_range dma_ranges[]; +}; + /* -------- API for Type1 VFIO IOMMU -------- */ /**