diff mbox

[v4,04/12] spapr vfio: add vfio_container_spapr_get_info()

Message ID 1377857738-14789-5-git-send-email-aik@ozlabs.ru
State New
Headers show

Commit Message

Alexey Kardashevskiy Aug. 30, 2013, 10:15 a.m. UTC
As sPAPR platform supports DMA windows on a PCI bus, the information
about their location and size should be passed into the guest via
the device tree.

The patch adds a helper to read this info from the container fd.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v4:
* fixed possible leaks on error paths
---
 hw/misc/vfio.c         | 45 +++++++++++++++++++++++++++++++++++++++++++++
 include/hw/misc/vfio.h | 11 +++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 include/hw/misc/vfio.h

Comments

Alex Williamson Sept. 5, 2013, 7:01 p.m. UTC | #1
On Fri, 2013-08-30 at 20:15 +1000, Alexey Kardashevskiy wrote:
> As sPAPR platform supports DMA windows on a PCI bus, the information
> about their location and size should be passed into the guest via
> the device tree.
> 
> The patch adds a helper to read this info from the container fd.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v4:
> * fixed possible leaks on error paths
> ---
>  hw/misc/vfio.c         | 45 +++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/misc/vfio.h | 11 +++++++++++
>  2 files changed, 56 insertions(+)
>  create mode 100644 include/hw/misc/vfio.h
> 
> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
> index 53791fb..4210471 100644
> --- a/hw/misc/vfio.c
> +++ b/hw/misc/vfio.c
> @@ -39,6 +39,7 @@
>  #include "qemu/range.h"
>  #include "sysemu/kvm.h"
>  #include "sysemu/sysemu.h"
> +#include "hw/misc/vfio.h"
>  
>  /* #define DEBUG_VFIO */
>  #ifdef DEBUG_VFIO
> @@ -3490,3 +3491,47 @@ static void register_vfio_pci_dev_type(void)
>  }
>  
>  type_init(register_vfio_pci_dev_type)
> +
> +int vfio_container_spapr_get_info(AddressSpace *as, int32_t groupid,
> +                                  struct vfio_iommu_spapr_tce_info *info,
> +                                  int *group_fd)
> +{
> +    VFIOAddressSpace *space;
> +    VFIOGroup *group;
> +    VFIOContainer *container;
> +    int ret, fd;
> +
> +    space = vfio_get_address_space(as);
> +    if (!space) {
> +        return -1;
> +    }
> +    group = vfio_get_group(groupid, space);
> +    if (!group) {
> +        goto put_as_exit;
> +    }
> +    container = group->container;
> +    if (!group->container) {
> +        goto put_group_exit;
> +    }
> +    fd = container->fd;
> +    if (!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
> +        goto put_group_exit;
> +    }
> +    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, info);
> +    if (ret) {
> +        error_report("vfio: failed to get iommu info for container: %s",
> +                     strerror(errno));
> +        goto put_group_exit;
> +    }
> +    *group_fd = group->fd;

The above gets don't actually increment a reference count, so copying
the fd seems risky here.

> +
> +    return 0;
> +
> +put_group_exit:
> +    vfio_put_group(group);
> +
> +put_as_exit:
> +    vfio_put_address_space(space);

But put_group calls disconnect_container which calls
put_address_space... so it get's put twice.  The lack of symmetry
already bites us with a bug.

> +
> +    return -1;
> +}
> diff --git a/include/hw/misc/vfio.h b/include/hw/misc/vfio.h
> new file mode 100644
> index 0000000..ac9a971
> --- /dev/null
> +++ b/include/hw/misc/vfio.h
> @@ -0,0 +1,11 @@
> +#ifndef VFIO_API_H
> +#define VFIO_API_H
> +
> +#include "qemu/typedefs.h"
> +#include <linux/vfio.h>
> +
> +extern int vfio_container_spapr_get_info(AddressSpace *as, int32_t groupid,
> +                                         struct vfio_iommu_spapr_tce_info *info,
> +                                         int *group_fd);
> +
> +#endif
Alexey Kardashevskiy Sept. 10, 2013, 8:36 a.m. UTC | #2
On 09/06/2013 05:01 AM, Alex Williamson wrote:
> On Fri, 2013-08-30 at 20:15 +1000, Alexey Kardashevskiy wrote:
>> As sPAPR platform supports DMA windows on a PCI bus, the information
>> about their location and size should be passed into the guest via
>> the device tree.
>>
>> The patch adds a helper to read this info from the container fd.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>> Changes:
>> v4:
>> * fixed possible leaks on error paths
>> ---
>>  hw/misc/vfio.c         | 45 +++++++++++++++++++++++++++++++++++++++++++++
>>  include/hw/misc/vfio.h | 11 +++++++++++
>>  2 files changed, 56 insertions(+)
>>  create mode 100644 include/hw/misc/vfio.h
>>
>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
>> index 53791fb..4210471 100644
>> --- a/hw/misc/vfio.c
>> +++ b/hw/misc/vfio.c
>> @@ -39,6 +39,7 @@
>>  #include "qemu/range.h"
>>  #include "sysemu/kvm.h"
>>  #include "sysemu/sysemu.h"
>> +#include "hw/misc/vfio.h"
>>  
>>  /* #define DEBUG_VFIO */
>>  #ifdef DEBUG_VFIO
>> @@ -3490,3 +3491,47 @@ static void register_vfio_pci_dev_type(void)
>>  }
>>  
>>  type_init(register_vfio_pci_dev_type)
>> +
>> +int vfio_container_spapr_get_info(AddressSpace *as, int32_t groupid,
>> +                                  struct vfio_iommu_spapr_tce_info *info,
>> +                                  int *group_fd)
>> +{
>> +    VFIOAddressSpace *space;
>> +    VFIOGroup *group;
>> +    VFIOContainer *container;
>> +    int ret, fd;
>> +
>> +    space = vfio_get_address_space(as);
>> +    if (!space) {
>> +        return -1;
>> +    }
>> +    group = vfio_get_group(groupid, space);
>> +    if (!group) {
>> +        goto put_as_exit;
>> +    }
>> +    container = group->container;
>> +    if (!group->container) {
>> +        goto put_group_exit;
>> +    }
>> +    fd = container->fd;
>> +    if (!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
>> +        goto put_group_exit;
>> +    }
>> +    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, info);
>> +    if (ret) {
>> +        error_report("vfio: failed to get iommu info for container: %s",
>> +                     strerror(errno));
>> +        goto put_group_exit;
>> +    }
>> +    *group_fd = group->fd;
> 
> The above gets don't actually increment a reference count, so copying
> the fd seems risky here.


If fd is gone while I am carrying it to my "external VFIO user" to call
kvmppc_vfio_group_get_external_user() on it, then the guest just shut
itself in a foot, no?
And I do not see how I would make it no risky, do you?


>> +
>> +    return 0;
>> +
>> +put_group_exit:
>> +    vfio_put_group(group);
>> +
>> +put_as_exit:
>> +    vfio_put_address_space(space);
> 
> But put_group calls disconnect_container which calls
> put_address_space... so it get's put twice.  The lack of symmetry
> already bites us with a bug.

True. This will be fixed by moving vfio_get_address_space() into
vfio_get_group().
Alex Williamson Sept. 10, 2013, 10:11 p.m. UTC | #3
On Tue, 2013-09-10 at 18:36 +1000, Alexey Kardashevskiy wrote:
> On 09/06/2013 05:01 AM, Alex Williamson wrote:
> > On Fri, 2013-08-30 at 20:15 +1000, Alexey Kardashevskiy wrote:
> >> As sPAPR platform supports DMA windows on a PCI bus, the information
> >> about their location and size should be passed into the guest via
> >> the device tree.
> >>
> >> The patch adds a helper to read this info from the container fd.
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >> Changes:
> >> v4:
> >> * fixed possible leaks on error paths
> >> ---
> >>  hw/misc/vfio.c         | 45 +++++++++++++++++++++++++++++++++++++++++++++
> >>  include/hw/misc/vfio.h | 11 +++++++++++
> >>  2 files changed, 56 insertions(+)
> >>  create mode 100644 include/hw/misc/vfio.h
> >>
> >> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
> >> index 53791fb..4210471 100644
> >> --- a/hw/misc/vfio.c
> >> +++ b/hw/misc/vfio.c
> >> @@ -39,6 +39,7 @@
> >>  #include "qemu/range.h"
> >>  #include "sysemu/kvm.h"
> >>  #include "sysemu/sysemu.h"
> >> +#include "hw/misc/vfio.h"
> >>  
> >>  /* #define DEBUG_VFIO */
> >>  #ifdef DEBUG_VFIO
> >> @@ -3490,3 +3491,47 @@ static void register_vfio_pci_dev_type(void)
> >>  }
> >>  
> >>  type_init(register_vfio_pci_dev_type)
> >> +
> >> +int vfio_container_spapr_get_info(AddressSpace *as, int32_t groupid,
> >> +                                  struct vfio_iommu_spapr_tce_info *info,
> >> +                                  int *group_fd)
> >> +{
> >> +    VFIOAddressSpace *space;
> >> +    VFIOGroup *group;
> >> +    VFIOContainer *container;
> >> +    int ret, fd;
> >> +
> >> +    space = vfio_get_address_space(as);
> >> +    if (!space) {
> >> +        return -1;
> >> +    }
> >> +    group = vfio_get_group(groupid, space);
> >> +    if (!group) {
> >> +        goto put_as_exit;
> >> +    }
> >> +    container = group->container;
> >> +    if (!group->container) {
> >> +        goto put_group_exit;
> >> +    }
> >> +    fd = container->fd;
> >> +    if (!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
> >> +        goto put_group_exit;
> >> +    }
> >> +    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, info);
> >> +    if (ret) {
> >> +        error_report("vfio: failed to get iommu info for container: %s",
> >> +                     strerror(errno));
> >> +        goto put_group_exit;
> >> +    }
> >> +    *group_fd = group->fd;
> > 
> > The above gets don't actually increment a reference count, so copying
> > the fd seems risky here.
> 
> 
> If fd is gone while I am carrying it to my "external VFIO user" to call
> kvmppc_vfio_group_get_external_user() on it, then the guest just shut
> itself in a foot, no?
> And I do not see how I would make it no risky, do you?

We've handled the case in the kernel where the IOMMU code has a
reference to the group so the group won't go away as long as that
reference is in place, but we don't have that in QEMU.  If you supported
hotplug, how would QEMU vfio notify spapr code to release the group?  I
think you'd be left with the spapr kernel code holding the group
reference and possibly a bogus file descriptor in QEMU if the group is
close()'d and you've cached it from the above code.  Perhaps it's
sufficient to note that you don't support hot remove, but do you
actually do anything to prevent it?  Thanks,

Alex

> >> +
> >> +    return 0;
> >> +
> >> +put_group_exit:
> >> +    vfio_put_group(group);
> >> +
> >> +put_as_exit:
> >> +    vfio_put_address_space(space);
> > 
> > But put_group calls disconnect_container which calls
> > put_address_space... so it get's put twice.  The lack of symmetry
> > already bites us with a bug.
> 
> True. This will be fixed by moving vfio_get_address_space() into
> vfio_get_group().
> 
> 
> 
>
Alexey Kardashevskiy Sept. 13, 2013, 10:11 a.m. UTC | #4
On 09/11/2013 08:11 AM, Alex Williamson wrote:
> On Tue, 2013-09-10 at 18:36 +1000, Alexey Kardashevskiy wrote:
>> On 09/06/2013 05:01 AM, Alex Williamson wrote:
>>> On Fri, 2013-08-30 at 20:15 +1000, Alexey Kardashevskiy wrote:
>>>> As sPAPR platform supports DMA windows on a PCI bus, the information
>>>> about their location and size should be passed into the guest via
>>>> the device tree.
>>>>
>>>> The patch adds a helper to read this info from the container fd.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>> Changes:
>>>> v4:
>>>> * fixed possible leaks on error paths
>>>> ---
>>>>  hw/misc/vfio.c         | 45 +++++++++++++++++++++++++++++++++++++++++++++
>>>>  include/hw/misc/vfio.h | 11 +++++++++++
>>>>  2 files changed, 56 insertions(+)
>>>>  create mode 100644 include/hw/misc/vfio.h
>>>>
>>>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
>>>> index 53791fb..4210471 100644
>>>> --- a/hw/misc/vfio.c
>>>> +++ b/hw/misc/vfio.c
>>>> @@ -39,6 +39,7 @@
>>>>  #include "qemu/range.h"
>>>>  #include "sysemu/kvm.h"
>>>>  #include "sysemu/sysemu.h"
>>>> +#include "hw/misc/vfio.h"
>>>>  
>>>>  /* #define DEBUG_VFIO */
>>>>  #ifdef DEBUG_VFIO
>>>> @@ -3490,3 +3491,47 @@ static void register_vfio_pci_dev_type(void)
>>>>  }
>>>>  
>>>>  type_init(register_vfio_pci_dev_type)
>>>> +
>>>> +int vfio_container_spapr_get_info(AddressSpace *as, int32_t groupid,
>>>> +                                  struct vfio_iommu_spapr_tce_info *info,
>>>> +                                  int *group_fd)
>>>> +{
>>>> +    VFIOAddressSpace *space;
>>>> +    VFIOGroup *group;
>>>> +    VFIOContainer *container;
>>>> +    int ret, fd;
>>>> +
>>>> +    space = vfio_get_address_space(as);
>>>> +    if (!space) {
>>>> +        return -1;
>>>> +    }
>>>> +    group = vfio_get_group(groupid, space);
>>>> +    if (!group) {
>>>> +        goto put_as_exit;
>>>> +    }
>>>> +    container = group->container;
>>>> +    if (!group->container) {
>>>> +        goto put_group_exit;
>>>> +    }
>>>> +    fd = container->fd;
>>>> +    if (!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
>>>> +        goto put_group_exit;
>>>> +    }
>>>> +    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, info);
>>>> +    if (ret) {
>>>> +        error_report("vfio: failed to get iommu info for container: %s",
>>>> +                     strerror(errno));
>>>> +        goto put_group_exit;
>>>> +    }
>>>> +    *group_fd = group->fd;
>>>
>>> The above gets don't actually increment a reference count, so copying
>>> the fd seems risky here.
>>
>>
>> If fd is gone while I am carrying it to my "external VFIO user" to call
>> kvmppc_vfio_group_get_external_user() on it, then the guest just shut
>> itself in a foot, no?
>> And I do not see how I would make it no risky, do you?
> 
> We've handled the case in the kernel where the IOMMU code has a
> reference to the group so the group won't go away as long as that
> reference is in place, but we don't have that in QEMU.  If you supported
> hotplug, how would QEMU vfio notify spapr code to release the group?  I
> think you'd be left with the spapr kernel code holding the group
> reference and possibly a bogus file descriptor in QEMU if the group is
> close()'d and you've cached it from the above code.  Perhaps it's
> sufficient to note that you don't support hot remove, but do you
> actually do anything to prevent it?  Thanks,


I do not cache group_fd, I copy iе from VFIOGroup and immediately pass it
to KVM which immediately calls fget() on it. This is really short distance
and the only thing for protection here would be:

-    *group_fd = group->fd;
+    *group_fd = dup(group->fd);

and then close(group_fd) after I passed it to KVM. I guess it has to be
done anyway. But I suspect this is not what you are talking about...



> 
> Alex
> 
>>>> +
>>>> +    return 0;
>>>> +
>>>> +put_group_exit:
>>>> +    vfio_put_group(group);
>>>> +
>>>> +put_as_exit:
>>>> +    vfio_put_address_space(space);
>>>
>>> But put_group calls disconnect_container which calls
>>> put_address_space... so it get's put twice.  The lack of symmetry
>>> already bites us with a bug.
>>
>> True. This will be fixed by moving vfio_get_address_space() into
>> vfio_get_group().
Alex Williamson Sept. 25, 2013, 8:29 p.m. UTC | #5
On Fri, 2013-09-13 at 20:11 +1000, Alexey Kardashevskiy wrote:
> On 09/11/2013 08:11 AM, Alex Williamson wrote:
> > On Tue, 2013-09-10 at 18:36 +1000, Alexey Kardashevskiy wrote:
> >> On 09/06/2013 05:01 AM, Alex Williamson wrote:
> >>> On Fri, 2013-08-30 at 20:15 +1000, Alexey Kardashevskiy wrote:
> >>>> As sPAPR platform supports DMA windows on a PCI bus, the information
> >>>> about their location and size should be passed into the guest via
> >>>> the device tree.
> >>>>
> >>>> The patch adds a helper to read this info from the container fd.
> >>>>
> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>> ---
> >>>> Changes:
> >>>> v4:
> >>>> * fixed possible leaks on error paths
> >>>> ---
> >>>>  hw/misc/vfio.c         | 45 +++++++++++++++++++++++++++++++++++++++++++++
> >>>>  include/hw/misc/vfio.h | 11 +++++++++++
> >>>>  2 files changed, 56 insertions(+)
> >>>>  create mode 100644 include/hw/misc/vfio.h
> >>>>
> >>>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
> >>>> index 53791fb..4210471 100644
> >>>> --- a/hw/misc/vfio.c
> >>>> +++ b/hw/misc/vfio.c
> >>>> @@ -39,6 +39,7 @@
> >>>>  #include "qemu/range.h"
> >>>>  #include "sysemu/kvm.h"
> >>>>  #include "sysemu/sysemu.h"
> >>>> +#include "hw/misc/vfio.h"
> >>>>  
> >>>>  /* #define DEBUG_VFIO */
> >>>>  #ifdef DEBUG_VFIO
> >>>> @@ -3490,3 +3491,47 @@ static void register_vfio_pci_dev_type(void)
> >>>>  }
> >>>>  
> >>>>  type_init(register_vfio_pci_dev_type)
> >>>> +
> >>>> +int vfio_container_spapr_get_info(AddressSpace *as, int32_t groupid,
> >>>> +                                  struct vfio_iommu_spapr_tce_info *info,
> >>>> +                                  int *group_fd)
> >>>> +{
> >>>> +    VFIOAddressSpace *space;
> >>>> +    VFIOGroup *group;
> >>>> +    VFIOContainer *container;
> >>>> +    int ret, fd;
> >>>> +
> >>>> +    space = vfio_get_address_space(as);
> >>>> +    if (!space) {
> >>>> +        return -1;
> >>>> +    }
> >>>> +    group = vfio_get_group(groupid, space);
> >>>> +    if (!group) {
> >>>> +        goto put_as_exit;
> >>>> +    }
> >>>> +    container = group->container;
> >>>> +    if (!group->container) {
> >>>> +        goto put_group_exit;
> >>>> +    }
> >>>> +    fd = container->fd;
> >>>> +    if (!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
> >>>> +        goto put_group_exit;
> >>>> +    }
> >>>> +    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, info);
> >>>> +    if (ret) {
> >>>> +        error_report("vfio: failed to get iommu info for container: %s",
> >>>> +                     strerror(errno));
> >>>> +        goto put_group_exit;
> >>>> +    }
> >>>> +    *group_fd = group->fd;
> >>>
> >>> The above gets don't actually increment a reference count, so copying
> >>> the fd seems risky here.
> >>
> >>
> >> If fd is gone while I am carrying it to my "external VFIO user" to call
> >> kvmppc_vfio_group_get_external_user() on it, then the guest just shut
> >> itself in a foot, no?
> >> And I do not see how I would make it no risky, do you?
> > 
> > We've handled the case in the kernel where the IOMMU code has a
> > reference to the group so the group won't go away as long as that
> > reference is in place, but we don't have that in QEMU.  If you supported
> > hotplug, how would QEMU vfio notify spapr code to release the group?  I
> > think you'd be left with the spapr kernel code holding the group
> > reference and possibly a bogus file descriptor in QEMU if the group is
> > close()'d and you've cached it from the above code.  Perhaps it's
> > sufficient to note that you don't support hot remove, but do you
> > actually do anything to prevent it?  Thanks,
> 
> 
> I do not cache group_fd, I copy iе from VFIOGroup and immediately pass it
> to KVM which immediately calls fget() on it. This is really short distance
> and the only thing for protection here would be:
> 
> -    *group_fd = group->fd;
> +    *group_fd = dup(group->fd);
> 
> and then close(group_fd) after I passed it to KVM. I guess it has to be
> done anyway. But I suspect this is not what you are talking about...

Meanwhile each of the processors has executed several million
instructions during this sequence of "immediate" events.  Besides, this
just creates the interface, who uses it and how is outside of our
control after this is in place.  Rather than creating an interface where
you can ask for info, some of which may be closely tied to the lifecycle
of a specific device, why not make an interface where vfio-pci can
register and unregister information about a device as part of it's
lifecycle?  That at least gives you an end point after which you know
the data is no longer valid.  Thanks,

Alex

> >>>> +
> >>>> +    return 0;
> >>>> +
> >>>> +put_group_exit:
> >>>> +    vfio_put_group(group);
> >>>> +
> >>>> +put_as_exit:
> >>>> +    vfio_put_address_space(space);
> >>>
> >>> But put_group calls disconnect_container which calls
> >>> put_address_space... so it get's put twice.  The lack of symmetry
> >>> already bites us with a bug.
> >>
> >> True. This will be fixed by moving vfio_get_address_space() into
> >> vfio_get_group().
> 
>
Alexey Kardashevskiy Sept. 26, 2013, 10:22 a.m. UTC | #6
On 09/26/2013 06:29 AM, Alex Williamson wrote:
> On Fri, 2013-09-13 at 20:11 +1000, Alexey Kardashevskiy wrote:
>> On 09/11/2013 08:11 AM, Alex Williamson wrote:
>>> On Tue, 2013-09-10 at 18:36 +1000, Alexey Kardashevskiy wrote:
>>>> On 09/06/2013 05:01 AM, Alex Williamson wrote:
>>>>> On Fri, 2013-08-30 at 20:15 +1000, Alexey Kardashevskiy wrote:
>>>>>> As sPAPR platform supports DMA windows on a PCI bus, the information
>>>>>> about their location and size should be passed into the guest via
>>>>>> the device tree.
>>>>>>
>>>>>> The patch adds a helper to read this info from the container fd.
>>>>>>
>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>> ---
>>>>>> Changes:
>>>>>> v4:
>>>>>> * fixed possible leaks on error paths
>>>>>> ---
>>>>>>  hw/misc/vfio.c         | 45 +++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  include/hw/misc/vfio.h | 11 +++++++++++
>>>>>>  2 files changed, 56 insertions(+)
>>>>>>  create mode 100644 include/hw/misc/vfio.h
>>>>>>
>>>>>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
>>>>>> index 53791fb..4210471 100644
>>>>>> --- a/hw/misc/vfio.c
>>>>>> +++ b/hw/misc/vfio.c
>>>>>> @@ -39,6 +39,7 @@
>>>>>>  #include "qemu/range.h"
>>>>>>  #include "sysemu/kvm.h"
>>>>>>  #include "sysemu/sysemu.h"
>>>>>> +#include "hw/misc/vfio.h"
>>>>>>  
>>>>>>  /* #define DEBUG_VFIO */
>>>>>>  #ifdef DEBUG_VFIO
>>>>>> @@ -3490,3 +3491,47 @@ static void register_vfio_pci_dev_type(void)
>>>>>>  }
>>>>>>  
>>>>>>  type_init(register_vfio_pci_dev_type)
>>>>>> +
>>>>>> +int vfio_container_spapr_get_info(AddressSpace *as, int32_t groupid,
>>>>>> +                                  struct vfio_iommu_spapr_tce_info *info,
>>>>>> +                                  int *group_fd)
>>>>>> +{
>>>>>> +    VFIOAddressSpace *space;
>>>>>> +    VFIOGroup *group;
>>>>>> +    VFIOContainer *container;
>>>>>> +    int ret, fd;
>>>>>> +
>>>>>> +    space = vfio_get_address_space(as);
>>>>>> +    if (!space) {
>>>>>> +        return -1;
>>>>>> +    }
>>>>>> +    group = vfio_get_group(groupid, space);
>>>>>> +    if (!group) {
>>>>>> +        goto put_as_exit;
>>>>>> +    }
>>>>>> +    container = group->container;
>>>>>> +    if (!group->container) {
>>>>>> +        goto put_group_exit;
>>>>>> +    }
>>>>>> +    fd = container->fd;
>>>>>> +    if (!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
>>>>>> +        goto put_group_exit;
>>>>>> +    }
>>>>>> +    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, info);
>>>>>> +    if (ret) {
>>>>>> +        error_report("vfio: failed to get iommu info for container: %s",
>>>>>> +                     strerror(errno));
>>>>>> +        goto put_group_exit;
>>>>>> +    }
>>>>>> +    *group_fd = group->fd;
>>>>>
>>>>> The above gets don't actually increment a reference count, so copying
>>>>> the fd seems risky here.
>>>>
>>>>
>>>> If fd is gone while I am carrying it to my "external VFIO user" to call
>>>> kvmppc_vfio_group_get_external_user() on it, then the guest just shut
>>>> itself in a foot, no?
>>>> And I do not see how I would make it no risky, do you?
>>>
>>> We've handled the case in the kernel where the IOMMU code has a
>>> reference to the group so the group won't go away as long as that
>>> reference is in place, but we don't have that in QEMU.  If you supported
>>> hotplug, how would QEMU vfio notify spapr code to release the group?  I
>>> think you'd be left with the spapr kernel code holding the group
>>> reference and possibly a bogus file descriptor in QEMU if the group is
>>> close()'d and you've cached it from the above code.  Perhaps it's
>>> sufficient to note that you don't support hot remove, but do you
>>> actually do anything to prevent it?  Thanks,
>>
>>
>> I do not cache group_fd, I copy iе from VFIOGroup and immediately pass it
>> to KVM which immediately calls fget() on it. This is really short distance
>> and the only thing for protection here would be:
>>
>> -    *group_fd = group->fd;
>> +    *group_fd = dup(group->fd);
>>
>> and then close(group_fd) after I passed it to KVM. I guess it has to be
>> done anyway. But I suspect this is not what you are talking about...
> 
> Meanwhile each of the processors has executed several million
> instructions during this sequence of "immediate" events.  Besides, this
> just creates the interface, who uses it and how is outside of our
> control after this is in place.  Rather than creating an interface where
> you can ask for info, some of which may be closely tied to the lifecycle
> of a specific device, why not make an interface where vfio-pci can
> register and unregister information about a device as part of it's
> lifecycle?  That at least gives you an end point after which you know
> the data is no longer valid.  Thanks,

Sorry, I am not sure I understood you here.

As I understand the whole VFIO external API thing will move from spapr to
vfio so all I'll have to do will be just passing LIOBN to vfio so
vfio_container_spapr_get_info() will become
vfio_container_spapr_register_liobn_and_get_info() and no business with any
group fd. Is that correct?

Anyway it would be useful to see any rough QEMU patch or some git tree with
it. Thanks!





> 
> Alex
> 
>>>>>> +
>>>>>> +    return 0;
>>>>>> +
>>>>>> +put_group_exit:
>>>>>> +    vfio_put_group(group);
>>>>>> +
>>>>>> +put_as_exit:
>>>>>> +    vfio_put_address_space(space);
>>>>>
>>>>> But put_group calls disconnect_container which calls
>>>>> put_address_space... so it get's put twice.  The lack of symmetry
>>>>> already bites us with a bug.
>>>>
>>>> True. This will be fixed by moving vfio_get_address_space() into
>>>> vfio_get_group().
>>
>>
> 
> 
>
diff mbox

Patch

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 53791fb..4210471 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -39,6 +39,7 @@ 
 #include "qemu/range.h"
 #include "sysemu/kvm.h"
 #include "sysemu/sysemu.h"
+#include "hw/misc/vfio.h"
 
 /* #define DEBUG_VFIO */
 #ifdef DEBUG_VFIO
@@ -3490,3 +3491,47 @@  static void register_vfio_pci_dev_type(void)
 }
 
 type_init(register_vfio_pci_dev_type)
+
+int vfio_container_spapr_get_info(AddressSpace *as, int32_t groupid,
+                                  struct vfio_iommu_spapr_tce_info *info,
+                                  int *group_fd)
+{
+    VFIOAddressSpace *space;
+    VFIOGroup *group;
+    VFIOContainer *container;
+    int ret, fd;
+
+    space = vfio_get_address_space(as);
+    if (!space) {
+        return -1;
+    }
+    group = vfio_get_group(groupid, space);
+    if (!group) {
+        goto put_as_exit;
+    }
+    container = group->container;
+    if (!group->container) {
+        goto put_group_exit;
+    }
+    fd = container->fd;
+    if (!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
+        goto put_group_exit;
+    }
+    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, info);
+    if (ret) {
+        error_report("vfio: failed to get iommu info for container: %s",
+                     strerror(errno));
+        goto put_group_exit;
+    }
+    *group_fd = group->fd;
+
+    return 0;
+
+put_group_exit:
+    vfio_put_group(group);
+
+put_as_exit:
+    vfio_put_address_space(space);
+
+    return -1;
+}
diff --git a/include/hw/misc/vfio.h b/include/hw/misc/vfio.h
new file mode 100644
index 0000000..ac9a971
--- /dev/null
+++ b/include/hw/misc/vfio.h
@@ -0,0 +1,11 @@ 
+#ifndef VFIO_API_H
+#define VFIO_API_H
+
+#include "qemu/typedefs.h"
+#include <linux/vfio.h>
+
+extern int vfio_container_spapr_get_info(AddressSpace *as, int32_t groupid,
+                                         struct vfio_iommu_spapr_tce_info *info,
+                                         int *group_fd);
+
+#endif