Message ID | 4ED43CFE.8040009@au1.ibm.com |
---|---|
State | New |
Headers | show |
On Tue, 2011-11-29 at 13:01 +1100, Alexey Kardashevskiy wrote: > Hi all, > > Another problem I hit on POWER - MSI interrupts allocation. The existing VFIO does not expect a PBH > to support less interrupts that a device might request. In my case, PHB's limit is 8 interrupts > while my test card (10Gb ethernet CXGB3) wants 9. Below are the patches to demonstrate the idea. Seems reasonable. I assume we'd need similar for vfio_pci_setup_msi, though I haven't seen anything use more than a single MSI interrupt. Thanks, Alex > KERNEL patch: > > diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c > index 7d45c6b..d44b9bf 100644 > --- a/drivers/vfio/pci/vfio_pci_intrs.c > +++ b/drivers/vfio/pci/vfio_pci_intrs.c > @@ -458,17 +458,32 @@ int vfio_pci_setup_msix(struct vfio_pci_device *vdev, int nvec, int __user *inta > vdev->msix[i].entry = i; > vdev->ev_msix[i] = ctx; > } > - if (!ret) > + if (!ret) { > ret = pci_enable_msix(pdev, vdev->msix, nvec); > + /* > + The kernel is unable to allocate requested number of IRQs > + and returned the available number. 
> + */ > + if (0 < ret) { > + ret = pci_enable_msix(pdev, vdev->msix, ret); > + } > + } > vdev->msix_nvec = 0; > - for (i = 0; i < nvec && !ret; i++) { > - ret = request_irq(vdev->msix[i].vector, msihandler, 0, > - "vfio", vdev->ev_msix[i]); > - if (ret) > - break; > - vdev->msix_nvec = i+1; > + if (0 == ret) { > + vdev->msix_nvec = 0; > + ret = 0; > + for (i = 0; i < nvec && !ret; i++) { > + ret = request_irq(vdev->msix[i].vector, msihandler, 0, > + "vfio", vdev->ev_msix[i]); > + if (ret) > + break; > + vdev->msix_nvec = i+1; > + } > + if ((0 == vdev->msix_nvec) && (0 != ret)) > + vfio_pci_drop_msix(vdev); > + else > + ret = vdev->msix_nvec; > } > - if (ret) > - vfio_pci_drop_msix(vdev); > + > return ret; > } > > === end === > > > QEMU patch: > > diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c > index 020961a..980eec7 100644 > --- a/hw/vfio_pci.c > +++ b/hw/vfio_pci.c > @@ -341,7 +341,8 @@ static void vfio_enable_msi(VFIODevice *vdev, bool msix) > } > } > > - if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds)) { > + ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds); > + if (0 > ret) { > fprintf(stderr, "vfio: Error: Failed to setup MSI/X fds %s\n", > strerror(errno)); > for (i = 0; i < vdev->nr_vectors; i++) { > @@ -355,6 +356,8 @@ static void vfio_enable_msi(VFIODevice *vdev, bool msix) > qemu_free(vdev->msi_vectors); > vdev->nr_vectors = 0; > return; > + } else if (0 < ret) { > + vdev->nr_vectors = ret; > } > > vdev->interrupt = msix ? INT_MSIX : INT_MSI; > > > === end ===
On Mon, 2011-11-28 at 20:54 -0700, Alex Williamson wrote: > On Tue, 2011-11-29 at 13:01 +1100, Alexey Kardashevskiy wrote: > > Hi all, > > > > Another problem I hit on POWER - MSI interrupts allocation. The existing VFIO does not expect a PBH > > to support less interrupts that a device might request. In my case, PHB's limit is 8 interrupts > > while my test card (10Gb ethernet CXGB3) wants 9. Below are the patches to demonstrate the idea. > > Seems reasonable. I assume we'd need similar for vfio_pci_setup_msi, > though I haven't seen anything use more than a single MSI interrupt. > Thanks, Hmm, I'm thinking maybe we should reflect the pci_enable_msix() behavior directly and let the caller decide if they want to make do with fewer MSI vectors. So the ioctl will return <0: error, 0:success, >0: number of MSIs we think we can setup, without actually setting them. Sound good? BTW, github now has updated trees: git://github.com/awilliam/linux-vfio.git vfio-next-20111129 git://github.com/awilliam/qemu-vfio.git vfio-ng Thanks, Alex > > KERNEL patch: > > > > diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c > > index 7d45c6b..d44b9bf 100644 > > --- a/drivers/vfio/pci/vfio_pci_intrs.c > > +++ b/drivers/vfio/pci/vfio_pci_intrs.c > > @@ -458,17 +458,32 @@ int vfio_pci_setup_msix(struct vfio_pci_device *vdev, int nvec, int __user *inta > > vdev->msix[i].entry = i; > > vdev->ev_msix[i] = ctx; > > } > > - if (!ret) > > + if (!ret) { > > ret = pci_enable_msix(pdev, vdev->msix, nvec); > > + /* > > + The kernel is unable to allocate requested number of IRQs > > + and returned the available number. 
> > + */ > > + if (0 < ret) { > > + ret = pci_enable_msix(pdev, vdev->msix, ret); > > + } > > + } > > vdev->msix_nvec = 0; > > - for (i = 0; i < nvec && !ret; i++) { > > - ret = request_irq(vdev->msix[i].vector, msihandler, 0, > > - "vfio", vdev->ev_msix[i]); > > - if (ret) > > - break; > > - vdev->msix_nvec = i+1; > > + if (0 == ret) { > > + vdev->msix_nvec = 0; > > + ret = 0; > > + for (i = 0; i < nvec && !ret; i++) { > > + ret = request_irq(vdev->msix[i].vector, msihandler, 0, > > + "vfio", vdev->ev_msix[i]); > > + if (ret) > > + break; > > + vdev->msix_nvec = i+1; > > + } > > + if ((0 == vdev->msix_nvec) && (0 != ret)) > > + vfio_pci_drop_msix(vdev); > > + else > > + ret = vdev->msix_nvec; > > } > > - if (ret) > > - vfio_pci_drop_msix(vdev); > > + > > return ret; > > } > > > > === end === > > > > > > QEMU patch: > > > > diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c > > index 020961a..980eec7 100644 > > --- a/hw/vfio_pci.c > > +++ b/hw/vfio_pci.c > > @@ -341,7 +341,8 @@ static void vfio_enable_msi(VFIODevice *vdev, bool msix) > > } > > } > > > > - if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds)) { > > + ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds); > > + if (0 > ret) { > > fprintf(stderr, "vfio: Error: Failed to setup MSI/X fds %s\n", > > strerror(errno)); > > for (i = 0; i < vdev->nr_vectors; i++) { > > @@ -355,6 +356,8 @@ static void vfio_enable_msi(VFIODevice *vdev, bool msix) > > qemu_free(vdev->msi_vectors); > > vdev->nr_vectors = 0; > > return; > > + } else if (0 < ret) { > > + vdev->nr_vectors = ret; > > } > > > > vdev->interrupt = msix ? INT_MSIX : INT_MSI; > > > > > > === end === >
> > BTW, github now has updated trees: > > git://github.com/awilliam/linux-vfio.git vfio-next-20111129 > git://github.com/awilliam/qemu-vfio.git vfio-ng Hi Alex, Have been looking at vfio a bit. A few observations and things we'll need to figure out as it relates to the Freescale iommu. __vfio_dma_map() assumes that mappings are broken into 4KB pages. That will not be true for us. We normally will be mapping much larger physically contiguous chunks for our guests. Guests will get hugetlbfs backed memory with very large pages (e.g. 16MB, 64MB) or very large chunks allocated by some proprietary means. Also, mappings will have additional Freescale-specific attributes that need to get passed through to dma_map somehow. For example, the iommu can stash directly into a CPU's cache and we have iommu mapping properties like the cache stash target id and an operation mapping attribute. How do you envision handling proprietary attributes in struct vfio_dma_map? Stuart
On Tue, 2011-11-29 at 17:20 -0600, Stuart Yoder wrote: > > > > BTW, github now has updated trees: > > > > git://github.com/awilliam/linux-vfio.git vfio-next-20111129 > > git://github.com/awilliam/qemu-vfio.git vfio-ng > > Hi Alex, > > Have been looking at vfio a bit. A few observations and things > we'll need to figure out as it relates to the Freescale iommu. > > __vfio_dma_map() assumes that mappings are broken into > 4KB pages. That will not be true for us. We normally will be mapping > much larger physically contiguous chunks for our guests. Guests will > get hugetlbfs backed memory with very large pages (e.g. 16MB, > 64MB) or very large chunks allocated by some proprietary > means. Hi Stuart, I think practically everyone has commented on the 4k mappings ;) There are a few problems around this. The first is that iommu drivers don't necessarily support sub-region unmapping, so if we map 1GB and later want to unmap 4k, we can't do it atomically. 4k gives us the most flexibility for supporting fine granularities. Another problem is that we're using get_user_pages to pin memory. It's been suggested that we should use mlock for this, but I can't find anything that prevents a user from later munlock'ing the memory and then getting access to memory they shouldn't have. Those kind of limit us, but I don't see it being an API problem for VFIO, just implementation. > Also, mappings will have additional Freescale-specific attributes > that need to get passed through to dma_map somehow. For > example, the iommu can stash directly into a CPU's cache > and we have iommu mapping properties like the cache stash > target id and an operation mapping attribute. > > How do you envision handling proprietary attributes > in struct vfio_dma_map? Let me turn the question around, how do you plan to support proprietary attributes in the IOMMU API? Is the user level the appropriate place to specify them, or are they an intrinsic feature of the domain? 
We've designed struct vfio_dma_map for extension, so depending on how many bits you need, we can make a conduit using the flags directly or setting a new flag to indicate presence of an arch specific attributes field. Thanks, Alex
On Tue, Nov 29, 2011 at 5:44 PM, Alex Williamson <alex.williamson@redhat.com> wrote: > On Tue, 2011-11-29 at 17:20 -0600, Stuart Yoder wrote: >> > >> > BTW, github now has updated trees: >> > >> > git://github.com/awilliam/linux-vfio.git vfio-next-20111129 >> > git://github.com/awilliam/qemu-vfio.git vfio-ng >> >> Hi Alex, >> >> Have been looking at vfio a bit. A few observations and things >> we'll need to figure out as it relates to the Freescale iommu. >> >> __vfio_dma_map() assumes that mappings are broken into >> 4KB pages. That will not be true for us. We normally will be mapping >> much larger physically contiguous chunks for our guests. Guests will >> get hugetlbfs backed memory with very large pages (e.g. 16MB, >> 64MB) or very large chunks allocated by some proprietary >> means. > > Hi Stuart, > > I think practically everyone has commented on the 4k mappings ;) There > are a few problems around this. The first is that iommu drivers don't > necessarily support sub-region unmapping, so if we map 1GB and later > want to unmap 4k, we can't do it atomically. 4k gives us the most > flexibility for supporting fine granularities. Another problem is that > we're using get_user_pages to pin memory. It's been suggested that we > should use mlock for this, but I can't find anything that prevents a > user from later munlock'ing the memory and then getting access to memory > they shouldn't have. Those kind of limit us, but I don't see it being > an API problem for VFIO, just implementation. Ok. >> Also, mappings will have additional Freescale-specific attributes >> that need to get passed through to dma_map somehow. For >> example, the iommu can stash directly into a CPU's cache >> and we have iommu mapping properties like the cache stash >> target id and an operation mapping attribute. >> >> How do you envision handling proprietary attributes >> in struct vfio_dma_map? 
> > Let me turn the question around, how do you plan to support proprietary > attributes in the IOMMU API? Is the user level the appropriate place to > specify them, or are they an intrinsic feature of the domain? We've > designed struct vfio_dma_map for extension, so depending on how many > bits you need, we can make a conduit using the flags directly or setting > a new flag to indicate presence of an arch specific attributes field. The attributes are not intrinsic features of the domain. User space will need to set them. But in thinking about it a bit more I think the attributes are more properties of the domain rather than a per map() operation characteristic. I think a separate API might be appropriate. Define a new set_domain_attrs() op in the iommu_ops. In user space, perhaps a new vfio group API-- VFIO_GROUP_SET_ATTRS, VFIO_GROUP_GET_ATTRS. Stuart
On Wed, 2011-11-30 at 09:41 -0600, Stuart Yoder wrote: > On Tue, Nov 29, 2011 at 5:44 PM, Alex Williamson > <alex.williamson@redhat.com> wrote: > > On Tue, 2011-11-29 at 17:20 -0600, Stuart Yoder wrote: > >> > > >> > BTW, github now has updated trees: > >> > > >> > git://github.com/awilliam/linux-vfio.git vfio-next-20111129 > >> > git://github.com/awilliam/qemu-vfio.git vfio-ng > >> > >> Hi Alex, > >> > >> Have been looking at vfio a bit. A few observations and things > >> we'll need to figure out as it relates to the Freescale iommu. > >> > >> __vfio_dma_map() assumes that mappings are broken into > >> 4KB pages. That will not be true for us. We normally will be mapping > >> much larger physically contiguous chunks for our guests. Guests will > >> get hugetlbfs backed memory with very large pages (e.g. 16MB, > >> 64MB) or very large chunks allocated by some proprietary > >> means. > > > > Hi Stuart, > > > > I think practically everyone has commented on the 4k mappings ;) There > > are a few problems around this. The first is that iommu drivers don't > > necessarily support sub-region unmapping, so if we map 1GB and later > > want to unmap 4k, we can't do it atomically. 4k gives us the most > > flexibility for supporting fine granularities. Another problem is that > > we're using get_user_pages to pin memory. It's been suggested that we > > should use mlock for this, but I can't find anything that prevents a > > user from later munlock'ing the memory and then getting access to memory > > they shouldn't have. Those kind of limit us, but I don't see it being > > an API problem for VFIO, just implementation. > > Ok. > > >> Also, mappings will have additional Freescale-specific attributes > >> that need to get passed through to dma_map somehow. For > >> example, the iommu can stash directly into a CPU's cache > >> and we have iommu mapping properties like the cache stash > >> target id and an operation mapping attribute. 
> >> > >> How do you envision handling proprietary attributes > >> in struct vfio_dma_map? > > > > Let me turn the question around, how do you plan to support proprietary > > attributes in the IOMMU API? Is the user level the appropriate place to > > specify them, or are they an intrinsic feature of the domain? We've > > designed struct vfio_dma_map for extension, so depending on how many > > bits you need, we can make a conduit using the flags directly or setting > > a new flag to indicate presence of an arch specific attributes field. > > The attributes are not intrinsic features of the domain. User space will > need to set them. But in thinking about it a bit more I think the attributes > are more properties of the domain rather than a per map() operation > characteristic. I think a separate API might be appropriate. Define a > new set_domain_attrs() op in the iommu_ops. In user space, perhaps > a new vfio group API-- VFIO_GROUP_SET_ATTRS, > VFIO_GROUP_GET_ATTRS. In that case, you should definitely be following what Alexey is thinking about with an iommu_setup IOMMU API callback. I think it's shaping up to do: x86: - Report any IOVA range restrictions imposed by hw implementation POWER: - Request IOVA window size, report size and base powerpc: - Set domain attributes, probably report range as well. Thanks, Alex
>> The attributes are not intrinsic features of the domain. User space will >> need to set them. But in thinking about it a bit more I think the attributes >> are more properties of the domain rather than a per map() operation >> characteristic. I think a separate API might be appropriate. Define a >> new set_domain_attrs() op in the iommu_ops. In user space, perhaps >> a new vfio group API-- VFIO_GROUP_SET_ATTRS, >> VFIO_GROUP_GET_ATTRS. > > In that case, you should definitely be following what Alexey is thinking > about with an iommu_setup IOMMU API callback. I think it's shaping up > to do: > > x86: > - Report any IOVA range restrictions imposed by hw implementation > POWER: > - Request IOVA window size, report size and base > powerpc: > - Set domain attributes, probably report range as well. One other mechanism we need as well is the ability to enable/disable a domain. For example-- suppose a device is assigned to a VM, the device is in use when the VM is abruptly terminated. The VM terminate would shut off DMA at the IOMMU, but now the device is in an indeterminate state. Some devices have no simple reset bit and getting the device back into a sane state could be complicated-- something the hypervisor doesn't want to do. So now KVM restarts the VM, vfio init happens for the device and the IOMMU for that device is re-configured, etc, but we really can't re-enable DMA until the guest OS tells us (via an hcall) that it is ready. The guest needs to get the assigned device in a sane state before DMA is enabled. Does this warrant a new domain enable/disable API, or should we make this part of the setup API we are discussing here? Stuart
On Thu, 2011-12-01 at 14:58 -0600, Stuart Yoder wrote: > >> The attributes are not intrinsic features of the domain. User space will > >> need to set them. But in thinking about it a bit more I think the attributes > >> are more properties of the domain rather than a per map() operation > >> characteristic. I think a separate API might be appropriate. Define a > >> new set_domain_attrs() op in the iommu_ops. In user space, perhaps > >> a new vfio group API-- VFIO_GROUP_SET_ATTRS, > >> VFIO_GROUP_GET_ATTRS. > > > > In that case, you should definitely be following what Alexey is thinking > > about with an iommu_setup IOMMU API callback. I think it's shaping up > > to do: > > > > x86: > > - Report any IOVA range restrictions imposed by hw implementation > > POWER: > > - Request IOVA window size, report size and base > > powerpc: > > - Set domain attributes, probably report range as well. > > One other mechanism we need as well is the ability to > enable/disable a domain. > > For example-- suppose a device is assigned to a VM, the > device is in use when the VM is abruptly terminated. The > VM terminate would shut off DMA at the IOMMU, but now > the device is in an indeterminate state. Some devices > have no simple reset bit and getting the device back into > a sane state could be complicated-- something the hypervisor > doesn't want to do. > > So now KVM restarts the VM, vfio init happens for the device > and the IOMMU for that device is re-configured, > etc, but we really can't re-enable DMA until the guest OS tells us > (via an hcall) that it is ready. The guest needs to get the > assigned device in a sane state before DMA is enabled. Giant red flag. We need to paravirtualize the guest? Not on x86. Some devices are better for assignment than others. PCI devices are moving towards supporting standard reset mechanisms. > Does this warrant a new domain enable/disable API, or should > we make this part of the setup API we are discussing > here? 
What's wrong with simply not adding any DMA mapping entries until you think your guest is ready? Isn't that effectively the same thing? Unmap ~= disable. If the IOMMU API had a mechanism to toggle the iommu domain on and off, I wouldn't be opposed to adding an ioctl to do it, but it really seems like just a shortcut vs map/unmap. Thanks, Alex
On Thu, Dec 1, 2011 at 3:25 PM, Alex Williamson <alex.williamson@redhat.com> wrote: > On Thu, 2011-12-01 at 14:58 -0600, Stuart Yoder wrote: >> One other mechanism we need as well is the ability to >> enable/disable a domain. >> >> For example-- suppose a device is assigned to a VM, the >> device is in use when the VM is abruptly terminated. The >> VM terminate would shut off DMA at the IOMMU, but now >> the device is in an indeterminate state. Some devices >> have no simple reset bit and getting the device back into >> a sane state could be complicated-- something the hypervisor >> doesn't want to do. >> >> So now KVM restarts the VM, vfio init happens for the device >> and the IOMMU for that device is re-configured, >> etc, but we really can't re-enable DMA until the guest OS tells us >> (via an hcall) that it is ready. The guest needs to get the >> assigned device in a sane state before DMA is enabled. > > Giant red flag. We need to paravirtualize the guest? Not on x86. It's the reality we have to deal with, but doing this would obviously only apply to platforms that need it. > Some > devices are better for assignment than others. PCI devices are moving > towards supporting standard reset mechanisms. > >> Does this warrant a new domain enable/disable API, or should >> we make this part of the setup API we are discussing >> here? > > What's wrong with simply not adding any DMA mapping entries until you > think your guest is ready? Isn't that effectively the same thing? > Unmap ~= disable. If the IOMMU API had a mechanism to toggle the iommu > domain on and off, I wouldn't be opposed to adding an ioctl to do it, > but it really seems like just a shortcut vs map/unmap. Thanks, Yes, we could do something like that I guess. Stuart
> -----Original Message----- > From: kvm-owner@vger.kernel.org [mailto:kvm-owner@vger.kernel.org] On > Behalf Of Stuart Yoder > Sent: Friday, December 02, 2011 8:11 PM > To: Alex Williamson > Cc: Alexey Kardashevskiy; aafabbri@cisco.com; kvm@vger.kernel.org; > pmac@au1.ibm.com; qemu-devel@nongnu.org; joerg.roedel@amd.com; > konrad.wilk@oracle.com; agraf@suse.de; dwg@au1.ibm.com; chrisw@sous- > sol.org; Yoder Stuart-B08248; iommu@lists.linux-foundation.org; > avi@redhat.com; linux-pci@vger.kernel.org; Wood Scott-B07421; > benve@cisco.com > Subject: Re: [Qemu-devel] [RFC PATCH] vfio: VFIO Driver core framework > > On Thu, Dec 1, 2011 at 3:25 PM, Alex Williamson > <alex.williamson@redhat.com> wrote: > > On Thu, 2011-12-01 at 14:58 -0600, Stuart Yoder wrote: > >> One other mechanism we need as well is the ability to enable/disable > >> a domain. > >> > >> For example-- suppose a device is assigned to a VM, the device is in > >> use when the VM is abruptly terminated. The VM terminate would shut > >> off DMA at the IOMMU, but now the device is in an indeterminate > >> state. Some devices have no simple reset bit and getting the device > >> back into a sane state could be complicated-- something the > >> hypervisor doesn't want to do. > >> > >> So now KVM restarts the VM, vfio init happens for the device and the > >> IOMMU for that device is re-configured, etc, but we really can't > >> re-enable DMA until the guest OS tells us (via an hcall) that it is > >> ready. The guest needs to get the assigned device in a sane state > >> before DMA is enabled. > > > > Giant red flag. We need to paravirtualize the guest? Not on x86. > > It's the reality we have to deal with, but doing this would obviously > only apply to platforms that need it. > > > Some > > devices are better for assignment than others. PCI devices are moving > > towards supporting standard reset mechanisms. 
> > > >> Does this warrant a new domain enable/disable API, or should we make > >> this part of the setup API we are discussing here? > > > > What's wrong with simply not adding any DMA mapping entries until you > > think your guest is ready? Isn't that effectively the same thing? > > Unmap ~= disable. If the IOMMU API had a mechanism to toggle the > > iommu domain on and off, I wouldn't be opposed to adding an ioctl to > > do it, but it really seems like just a shortcut vs map/unmap. Thanks, > > Yes, we could do something like that I guess. How do we determine whether the guest is ready or not? There can be multiple devices getting ready at different times. Further, if the guest has given the device to its user-level process or its own guest, should there not be some mechanism for a guest to enable/disable on a per-device or per-group basis? Thanks -Bharat
On 12/02/2011 08:40 AM, Stuart Yoder wrote: > On Thu, Dec 1, 2011 at 3:25 PM, Alex Williamson > <alex.williamson@redhat.com> wrote: >> On Thu, 2011-12-01 at 14:58 -0600, Stuart Yoder wrote: >>> One other mechanism we need as well is the ability to >>> enable/disable a domain. >>> >>> For example-- suppose a device is assigned to a VM, the >>> device is in use when the VM is abruptly terminated. The >>> VM terminate would shut off DMA at the IOMMU, but now >>> the device is in an indeterminate state. Some devices >>> have no simple reset bit and getting the device back into >>> a sane state could be complicated-- something the hypervisor >>> doesn't want to do. >>> >>> So now KVM restarts the VM, vfio init happens for the device >>> and the IOMMU for that device is re-configured, >>> etc, but we really can't re-enable DMA until the guest OS tells us >>> (via an hcall) that it is ready. The guest needs to get the >>> assigned device in a sane state before DMA is enabled. >> >> Giant red flag. We need to paravirtualize the guest? Not on x86. > > It's the reality we have to deal with, but doing this would obviously > only apply to platforms that need it. By "x86" I assume you mean "PCI" and thus a bus-master enable flag that you rely on the guest not setting until the device has been reset or otherwise quiesced from any previous activity, in the absence of function-level reset. We don't have such a thing on our non-PCI devices. >> Some >> devices are better for assignment than others. PCI devices are moving >> towards supporting standard reset mechanisms. >> >>> Does this warrant a new domain enable/disable API, or should >>> we make this part of the setup API we are discussing >>> here? >> >> What's wrong with simply not adding any DMA mapping entries until you >> think your guest is ready? Isn't that effectively the same thing? >> Unmap ~= disable. 
If the IOMMU API had a mechanism to toggle the iommu >> domain on and off, I wouldn't be opposed to adding an ioctl to do it, >> but it really seems like just a shortcut vs map/unmap. Thanks, > > Yes, we could do something like that I guess. It would mean that we don't see any errors relating to impossible map requests until after the guest is running and decides to enable DMA. Depending on how PAMU table allocation is handled, it could introduce a risk of failing even later when a guest reboots and we need to temporarily disable DMA (e.g. if another vfio user consumes the same table space for another group in the meantime). It would add latency to failovers -- some customers have somewhat tight requirements there. -Scott
On 12/02/2011 12:11 PM, Bhushan Bharat-R65777 wrote: > How do we determine whether guest is ready or not? There can be multiple device get ready at different time. The guest makes a hypercall with a device handle -- at least that's how we do it in Topaz. > Further if guest have given the device to it user level process or its guest. Should not there be some mechanism for a guest to enable/disable on per device or group? Yes, the same mechanism can be used for that -- though in that case we'll also want the ability for the guest to be able to control another layer of mapping (which will get quite tricky with PAMU's limitations). This isn't really VFIO's concern, though (unless you're talking about the VFIO implementation in the guest). -Scott
> -----Original Message----- > From: Wood Scott-B07421 > Sent: Friday, December 02, 2011 11:57 PM > To: Bhushan Bharat-R65777 > Cc: Stuart Yoder; Alex Williamson; Alexey Kardashevskiy; > aafabbri@cisco.com; kvm@vger.kernel.org; pmac@au1.ibm.com; qemu- > devel@nongnu.org; joerg.roedel@amd.com; konrad.wilk@oracle.com; > agraf@suse.de; dwg@au1.ibm.com; chrisw@sous-sol.org; Yoder Stuart-B08248; > iommu@lists.linux-foundation.org; avi@redhat.com; linux- > pci@vger.kernel.org; Wood Scott-B07421; benve@cisco.com > Subject: Re: [Qemu-devel] [RFC PATCH] vfio: VFIO Driver core framework > > On 12/02/2011 12:11 PM, Bhushan Bharat-R65777 wrote: > > How do we determine whether guest is ready or not? There can be > multiple device get ready at different time. > > The guest makes a hypercall with a device handle -- at least that's how > we do it in Topaz. Yes, it is OK from the hcall-with-device-handle perspective. But I could not understand how cleanly this can be handled with the idea of enabling the iommu when the guest is ready. Thanks -Bharat > > > Further if guest have given the device to it user level process or its > guest. Should not there be some mechanism for a guest to enable/disable > on per device or group? > > Yes, the same mechanism can be used for that -- though in that case we'll > also want the ability for the guest to be able to control another layer > of mapping (which will get quite tricky with PAMU's limitations). > This isn't really VFIO's concern, though (unless you're talking about > the VFIO implementation in the guest). Maybe I am thinking too far ahead, but will not something like this be needed for nested virtualization? Thanks -Bharat
> -----Original Message----- > From: Wood Scott-B07421 > Sent: Friday, December 02, 2011 11:57 PM > To: Bhushan Bharat-R65777 > Cc: Stuart Yoder; Alex Williamson; Alexey Kardashevskiy; > aafabbri@cisco.com; kvm@vger.kernel.org; pmac@au1.ibm.com; qemu- > devel@nongnu.org; joerg.roedel@amd.com; konrad.wilk@oracle.com; > agraf@suse.de; dwg@au1.ibm.com; chrisw@sous-sol.org; Yoder Stuart-B08248; > iommu@lists.linux-foundation.org; avi@redhat.com; linux- > pci@vger.kernel.org; Wood Scott-B07421; benve@cisco.com > Subject: Re: [Qemu-devel] [RFC PATCH] vfio: VFIO Driver core framework > > On 12/02/2011 12:11 PM, Bhushan Bharat-R65777 wrote: > > How do we determine whether guest is ready or not? There can be > multiple device get ready at different time. > > The guest makes a hypercall with a device handle -- at least that's how > we do it in Topaz. > > > Further if guest have given the device to it user level process or its > guest. Should not there be some mechanism for a guest to enable/disable > on per device or group? > > Yes, the same mechanism can be used for that -- though in that case we'll > also want the ability for the guest to be able to control another layer > of mapping (which will get quite tricky with PAMU's limitations). > This isn't really VFIO's concern, though (unless you're talking about > the VFIO implementation in the guest). Scott, I am not sure if there is any real use case where a device needs to be assigned beyond two levels (host + immediate guest) in nested virtualization. But if any exists, then would it not be better to virtualize the iommu (PAMU for Freescale)? Thanks -Bharat
On 12/02/2011 12:45 PM, Bhushan Bharat-R65777 wrote: > Scott, I am not sure if there is any real use case where device needed to assigned beyond 2 level (host + immediate guest) in nested virtualization. Userspace drivers in the guest is a more likely scenario than nested virtualization, at least for us. Our hardware doesn't support nested virtualization, so it would have to be some slow emulation-based approach (worse than e500v2, since we don't have multiple PID registers). > But if there any exists then will not it be better to virtualizes the iommu (PAMU for Freescale)? We can't virtualize the PAMU in any sort of transparent manner. It's not flexible enough to handle arbitrary mappings. The guest will need to cooperate with the host to figure out what mappings it can do. -Scott
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 7d45c6b..d44b9bf 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -458,17 +458,32 @@ int vfio_pci_setup_msix(struct vfio_pci_device *vdev, int nvec, int __user *inta vdev->msix[i].entry = i; vdev->ev_msix[i] = ctx; } - if (!ret) + if (!ret) { ret = pci_enable_msix(pdev, vdev->msix, nvec); + /* + The kernel is unable to allocate requested number of IRQs + and returned the available number. + */ + if (0 < ret) { + ret = pci_enable_msix(pdev, vdev->msix, ret); + } + } vdev->msix_nvec = 0; - for (i = 0; i < nvec && !ret; i++) { - ret = request_irq(vdev->msix[i].vector, msihandler, 0, - "vfio", vdev->ev_msix[i]); - if (ret) - break; - vdev->msix_nvec = i+1; + if (0 == ret) { + vdev->msix_nvec = 0; + ret = 0; + for (i = 0; i < nvec && !ret; i++) { + ret = request_irq(vdev->msix[i].vector, msihandler, 0, + "vfio", vdev->ev_msix[i]); + if (ret) + break; + vdev->msix_nvec = i+1; + } + if ((0 == vdev->msix_nvec) && (0 != ret)) + vfio_pci_drop_msix(vdev); + else + ret = vdev->msix_nvec; } - if (ret) - vfio_pci_drop_msix(vdev); + return ret; } === end === QEMU patch: diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index 020961a..980eec7 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -341,7 +341,8 @@ static void vfio_enable_msi(VFIODevice *vdev, bool msix) } } - if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds)) { + ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds); + if (0 > ret) { fprintf(stderr, "vfio: Error: Failed to setup MSI/X fds %s\n", strerror(errno)); for (i = 0; i < vdev->nr_vectors; i++) { @@ -355,6 +356,8 @@ static void vfio_enable_msi(VFIODevice *vdev, bool msix) qemu_free(vdev->msi_vectors); vdev->nr_vectors = 0; return; + } else if (0 < ret) { + vdev->nr_vectors = ret; } vdev->interrupt = msix ? INT_MSIX : INT_MSI;