diff mbox

[13/13] qdev-properties: Add pci-devaddr property

Message ID b5771ff2f8bad4231fee1ef74d9cbbb1448150d3.1338799936.git.jan.kiszka@siemens.com
State New
Headers show

Commit Message

Jan Kiszka June 4, 2012, 8:52 a.m. UTC
Add a property to receive a fully qualified PCI device address.

Will be used by KVM device assignment.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 hw/qdev-properties.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
 hw/qdev.h            |    3 +++
 2 files changed, 51 insertions(+), 0 deletions(-)

Comments

Michael S. Tsirkin June 10, 2012, 9:35 a.m. UTC | #1
On Mon, Jun 04, 2012 at 10:52:21AM +0200, Jan Kiszka wrote:
> Add a property to receive a fully qualified PCI device address.
> 
> Will be used by KVM device assignment.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>

I'd like to ponder this a bit more.  What bothers me is that this mixes
two things:
	- addressing of qemu devices
		Using full device addresses there is a legacy feature,
		users really should supply the parent bus and
		the bus local address.
	- addressing devices on the linux host for assignment
		It so happens that the syntax matches
		the legacy naming very closely,
		but conceptually is completely unrelated

> ---
>  hw/qdev-properties.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/qdev.h            |    3 +++
>  2 files changed, 51 insertions(+), 0 deletions(-)
> 
> diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
> index 32e41f1..6634f22 100644
> --- a/hw/qdev-properties.c
> +++ b/hw/qdev-properties.c
> @@ -946,6 +946,54 @@ PropertyInfo qdev_prop_pci_devfn = {
>      .max   = 0xFFFFFFFFULL,
>  };
>  
> +static void get_pci_devaddr(Object *obj, Visitor *v, void *opaque,
> +                            const char *name, Error **errp)
> +{
> +    DeviceState *dev = DEVICE(obj);
> +    Property *prop = opaque;
> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
> +    char buffer[10 + 3 + 1];
> +    char *p = buffer;
> +
> +    snprintf(buffer, sizeof(buffer), "%04x:%02x:%02x.%02x",
> +             addr->domain, addr->bus, addr->slot, addr->function);
> +
> +    visit_type_str(v, &p, name, errp);
> +}
> +
> +static void set_pci_devaddr(Object *obj, Visitor *v, void *opaque,
> +                            const char *name, Error **errp)
> +{
> +    DeviceState *dev = DEVICE(obj);
> +    Property *prop = opaque;
> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
> +    Error *local_err = NULL;
> +    char *str;
> +
> +    if (dev->state != DEV_STATE_CREATED) {
> +        error_set(errp, QERR_PERMISSION_DENIED);
> +        return;
> +    }
> +
> +    visit_type_str(v, &str, name, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +
> +    if (qemu_parse_pci_devaddr(str, addr,
> +                               PCI_DEVADDR_WITH_DOM_BUS_OPT |
> +                               PCI_DEVADDR_WITH_FUNC) < 0) {
> +        error_set_from_qdev_prop_error(errp, EINVAL, dev, prop, str);
> +    }
> +}
> +
> +PropertyInfo qdev_prop_pci_devaddr = {
> +    .name  = "pci-devaddr",

This is a very confusing name.  Something like host-pci-address?
This also should be built on linux only.
Can this be part of device assignment code instead of qdev?



> +    .get   = get_pci_devaddr,
> +    .set   = set_pci_devaddr,
> +};
> +
>  /* --- blocksize --- */
>  
>  static void set_blocksize(Object *obj, Visitor *v, void *opaque,
> diff --git a/hw/qdev.h b/hw/qdev.h
> index 15acfca..d2c87a0 100644
> --- a/hw/qdev.h
> +++ b/hw/qdev.h
> @@ -224,6 +224,7 @@ extern PropertyInfo qdev_prop_drive;
>  extern PropertyInfo qdev_prop_netdev;
>  extern PropertyInfo qdev_prop_vlan;
>  extern PropertyInfo qdev_prop_pci_devfn;
> +extern PropertyInfo qdev_prop_pci_devaddr;
>  extern PropertyInfo qdev_prop_blocksize;
>  
>  #define DEFINE_PROP(_name, _state, _field, _prop, _type) { \
> @@ -288,6 +289,8 @@ extern PropertyInfo qdev_prop_blocksize;
>                          LostTickPolicy)
>  #define DEFINE_PROP_BLOCKSIZE(_n, _s, _f, _d) \
>      DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_blocksize, uint16_t)
> +#define DEFINE_PROP_PCI_DEVADDR(_n, _s, _f) \
> +    DEFINE_PROP(_n, _s, _f, qdev_prop_pci_devaddr, PCIDeviceAddress)
>  
>  #define DEFINE_PROP_END_OF_LIST()               \
>      {}
> -- 
> 1.7.3.4
Jan Kiszka June 10, 2012, 10:14 a.m. UTC | #2
On 2012-06-10 11:35, Michael S. Tsirkin wrote:
> On Mon, Jun 04, 2012 at 10:52:21AM +0200, Jan Kiszka wrote:
>> Add a property to receive a fully qualified PCI device address.
>>
>> Will be used by KVM device assignment.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> 
> I'd like to ponder this a bit more.  What bothers me is that this mixes
> two things:
> 	- addressing of qemu devices
> 		Using full device addresses there is a legacy feature,
> 		users really should supply the parent bus and
> 		the bus local address.
> 	- addressing devices on the linux host for assignment
> 		It so happens that the syntax matches
> 		the legacy naming very closely,
> 		but conceptually is completely unrelated

We can keep code duplications, of course.

> 
>> ---
>>  hw/qdev-properties.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  hw/qdev.h            |    3 +++
>>  2 files changed, 51 insertions(+), 0 deletions(-)
>>
>> diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
>> index 32e41f1..6634f22 100644
>> --- a/hw/qdev-properties.c
>> +++ b/hw/qdev-properties.c
>> @@ -946,6 +946,54 @@ PropertyInfo qdev_prop_pci_devfn = {
>>      .max   = 0xFFFFFFFFULL,
>>  };
>>  
>> +static void get_pci_devaddr(Object *obj, Visitor *v, void *opaque,
>> +                            const char *name, Error **errp)
>> +{
>> +    DeviceState *dev = DEVICE(obj);
>> +    Property *prop = opaque;
>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
>> +    char buffer[10 + 3 + 1];
>> +    char *p = buffer;
>> +
>> +    snprintf(buffer, sizeof(buffer), "%04x:%02x:%02x.%02x",
>> +             addr->domain, addr->bus, addr->slot, addr->function);
>> +
>> +    visit_type_str(v, &p, name, errp);
>> +}
>> +
>> +static void set_pci_devaddr(Object *obj, Visitor *v, void *opaque,
>> +                            const char *name, Error **errp)
>> +{
>> +    DeviceState *dev = DEVICE(obj);
>> +    Property *prop = opaque;
>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
>> +    Error *local_err = NULL;
>> +    char *str;
>> +
>> +    if (dev->state != DEV_STATE_CREATED) {
>> +        error_set(errp, QERR_PERMISSION_DENIED);
>> +        return;
>> +    }
>> +
>> +    visit_type_str(v, &str, name, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +
>> +    if (qemu_parse_pci_devaddr(str, addr,
>> +                               PCI_DEVADDR_WITH_DOM_BUS_OPT |
>> +                               PCI_DEVADDR_WITH_FUNC) < 0) {
>> +        error_set_from_qdev_prop_error(errp, EINVAL, dev, prop, str);
>> +    }
>> +}
>> +
>> +PropertyInfo qdev_prop_pci_devaddr = {
>> +    .name  = "pci-devaddr",
> 
> This is a very confusing name.  Something like host-pci-address?

That might be an option.

> This also should be built on linux only.

Why, what do we gain with #ifdefs? And isn't the addressing concept generic?

> Can this be part of device assignment code instead of qdev?

How does VFIO address their host devices? And Xen?

Jan
Michael S. Tsirkin June 10, 2012, 10:49 a.m. UTC | #3
On Sun, Jun 10, 2012 at 12:14:36PM +0200, Jan Kiszka wrote:
> On 2012-06-10 11:35, Michael S. Tsirkin wrote:
> > On Mon, Jun 04, 2012 at 10:52:21AM +0200, Jan Kiszka wrote:
> >> Add a property to receive a fully qualified PCI device address.
> >>
> >> Will be used by KVM device assignment.
> >>
> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> > 
> > I'd like to ponder this a bit more.  What bothers me is that this mixes
> > two things:
> > 	- addressing of qemu devices
> > 		Using full device addresses there is a legacy feature,
> > 		users really should supply the parent bus and
> > 		the bus local address.
> > 	- addressing devices on the linux host for assignment
> > 		It so happens that the syntax matches
> > 		the legacy naming very closely,
> > 		but conceptually is completely unrelated
> 
> We can keep code duplications, of course.
> 
> > 
> >> ---
> >>  hw/qdev-properties.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
> >>  hw/qdev.h            |    3 +++
> >>  2 files changed, 51 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
> >> index 32e41f1..6634f22 100644
> >> --- a/hw/qdev-properties.c
> >> +++ b/hw/qdev-properties.c
> >> @@ -946,6 +946,54 @@ PropertyInfo qdev_prop_pci_devfn = {
> >>      .max   = 0xFFFFFFFFULL,
> >>  };
> >>  
> >> +static void get_pci_devaddr(Object *obj, Visitor *v, void *opaque,
> >> +                            const char *name, Error **errp)
> >> +{
> >> +    DeviceState *dev = DEVICE(obj);
> >> +    Property *prop = opaque;
> >> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
> >> +    char buffer[10 + 3 + 1];
> >> +    char *p = buffer;
> >> +
> >> +    snprintf(buffer, sizeof(buffer), "%04x:%02x:%02x.%02x",
> >> +             addr->domain, addr->bus, addr->slot, addr->function);
> >> +
> >> +    visit_type_str(v, &p, name, errp);
> >> +}
> >> +
> >> +static void set_pci_devaddr(Object *obj, Visitor *v, void *opaque,
> >> +                            const char *name, Error **errp)
> >> +{
> >> +    DeviceState *dev = DEVICE(obj);
> >> +    Property *prop = opaque;
> >> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
> >> +    Error *local_err = NULL;
> >> +    char *str;
> >> +
> >> +    if (dev->state != DEV_STATE_CREATED) {
> >> +        error_set(errp, QERR_PERMISSION_DENIED);
> >> +        return;
> >> +    }
> >> +
> >> +    visit_type_str(v, &str, name, &local_err);
> >> +    if (local_err) {
> >> +        error_propagate(errp, local_err);
> >> +        return;
> >> +    }
> >> +
> >> +    if (qemu_parse_pci_devaddr(str, addr,
> >> +                               PCI_DEVADDR_WITH_DOM_BUS_OPT |
> >> +                               PCI_DEVADDR_WITH_FUNC) < 0) {
> >> +        error_set_from_qdev_prop_error(errp, EINVAL, dev, prop, str);
> >> +    }
> >> +}
> >> +
> >> +PropertyInfo qdev_prop_pci_devaddr = {
> >> +    .name  = "pci-devaddr",
> > 
> > This is a very confusing name.  Something like host-pci-address?
> 
> That might be an option.
> 
> > This also should be built on linux only.
> 
> Why, what do we gain with #ifdefs? And isn't the addressing concept generic?

Not the XXX:XX.X format. And not the concept of a domain.

> > Can this be part of device assignment code instead of qdev?
> 
> How does VFIO address their host devices?

You get an fd I think. I think you don't need to know the host address.

> And Xen?
> 
> Jan
> 
>
Jan Kiszka June 10, 2012, 10:52 a.m. UTC | #4
On 2012-06-10 12:49, Michael S. Tsirkin wrote:
> On Sun, Jun 10, 2012 at 12:14:36PM +0200, Jan Kiszka wrote:
>> On 2012-06-10 11:35, Michael S. Tsirkin wrote:
>>> On Mon, Jun 04, 2012 at 10:52:21AM +0200, Jan Kiszka wrote:
>>>> Add a property to receive a fully qualified PCI device address.
>>>>
>>>> Will be used by KVM device assignment.
>>>>
>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>
>>> I'd like to ponder this a bit more.  What bothers me is that this mixes
>>> two things:
>>> 	- addressing of qemu devices
>>> 		Using full device addresses there is a legacy feature,
>>> 		users really should supply the parent bus and
>>> 		the bus local address.
>>> 	- addressing devices on the linux host for assignment
>>> 		It so happens that the syntax matches
>>> 		the legacy naming very closely,
>>> 		but conceptually is completely unrelated
>>
>> We can keep code duplications, of course.
>>
>>>
>>>> ---
>>>>  hw/qdev-properties.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  hw/qdev.h            |    3 +++
>>>>  2 files changed, 51 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
>>>> index 32e41f1..6634f22 100644
>>>> --- a/hw/qdev-properties.c
>>>> +++ b/hw/qdev-properties.c
>>>> @@ -946,6 +946,54 @@ PropertyInfo qdev_prop_pci_devfn = {
>>>>      .max   = 0xFFFFFFFFULL,
>>>>  };
>>>>  
>>>> +static void get_pci_devaddr(Object *obj, Visitor *v, void *opaque,
>>>> +                            const char *name, Error **errp)
>>>> +{
>>>> +    DeviceState *dev = DEVICE(obj);
>>>> +    Property *prop = opaque;
>>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
>>>> +    char buffer[10 + 3 + 1];
>>>> +    char *p = buffer;
>>>> +
>>>> +    snprintf(buffer, sizeof(buffer), "%04x:%02x:%02x.%02x",
>>>> +             addr->domain, addr->bus, addr->slot, addr->function);
>>>> +
>>>> +    visit_type_str(v, &p, name, errp);
>>>> +}
>>>> +
>>>> +static void set_pci_devaddr(Object *obj, Visitor *v, void *opaque,
>>>> +                            const char *name, Error **errp)
>>>> +{
>>>> +    DeviceState *dev = DEVICE(obj);
>>>> +    Property *prop = opaque;
>>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
>>>> +    Error *local_err = NULL;
>>>> +    char *str;
>>>> +
>>>> +    if (dev->state != DEV_STATE_CREATED) {
>>>> +        error_set(errp, QERR_PERMISSION_DENIED);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    visit_type_str(v, &str, name, &local_err);
>>>> +    if (local_err) {
>>>> +        error_propagate(errp, local_err);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    if (qemu_parse_pci_devaddr(str, addr,
>>>> +                               PCI_DEVADDR_WITH_DOM_BUS_OPT |
>>>> +                               PCI_DEVADDR_WITH_FUNC) < 0) {
>>>> +        error_set_from_qdev_prop_error(errp, EINVAL, dev, prop, str);
>>>> +    }
>>>> +}
>>>> +
>>>> +PropertyInfo qdev_prop_pci_devaddr = {
>>>> +    .name  = "pci-devaddr",
>>>
>>> This is a very confusing name.  Something like host-pci-address?
>>
>> That might be an option.
>>
>>> This also should be built on linux only.
>>
>> Why, what do we gain with #ifdefs? And isn't the addressing concept generic?
> 
> Not the XXX:XX.X format. And not the concept of a domain.
> 
>>> Can this be part of device assignment code instead of qdev?
>>
>> How does VFIO address their host devices?
> 
> You get an fd I think. I think you don't need to know the host address.

vfio_pci.c contains a nice function called "parse_hostaddr". You may
guess what it does. ;)

So we need this generic service. Let's called it pci-host-devaddr and be
fine?

Jan
Michael S. Tsirkin June 10, 2012, 10:58 a.m. UTC | #5
On Sun, Jun 10, 2012 at 12:52:45PM +0200, Jan Kiszka wrote:
> On 2012-06-10 12:49, Michael S. Tsirkin wrote:
> > On Sun, Jun 10, 2012 at 12:14:36PM +0200, Jan Kiszka wrote:
> >> On 2012-06-10 11:35, Michael S. Tsirkin wrote:
> >>> On Mon, Jun 04, 2012 at 10:52:21AM +0200, Jan Kiszka wrote:
> >>>> Add a property to receive a fully qualified PCI device address.
> >>>>
> >>>> Will be used by KVM device assignment.
> >>>>
> >>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >>>
> >>> I'd like to ponder this a bit more.  What bothers me is that this mixes
> >>> two things:
> >>> 	- addressing of qemu devices
> >>> 		Using full device addresses there is a legacy feature,
> >>> 		users really should supply the parent bus and
> >>> 		the bus local address.
> >>> 	- addressing devices on the linux host for assignment
> >>> 		It so happens that the syntax matches
> >>> 		the legacy naming very closely,
> >>> 		but conceptually is completely unrelated
> >>
> >> We can keep code duplications, of course.
> >>
> >>>
> >>>> ---
> >>>>  hw/qdev-properties.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>  hw/qdev.h            |    3 +++
> >>>>  2 files changed, 51 insertions(+), 0 deletions(-)
> >>>>
> >>>> diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
> >>>> index 32e41f1..6634f22 100644
> >>>> --- a/hw/qdev-properties.c
> >>>> +++ b/hw/qdev-properties.c
> >>>> @@ -946,6 +946,54 @@ PropertyInfo qdev_prop_pci_devfn = {
> >>>>      .max   = 0xFFFFFFFFULL,
> >>>>  };
> >>>>  
> >>>> +static void get_pci_devaddr(Object *obj, Visitor *v, void *opaque,
> >>>> +                            const char *name, Error **errp)
> >>>> +{
> >>>> +    DeviceState *dev = DEVICE(obj);
> >>>> +    Property *prop = opaque;
> >>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
> >>>> +    char buffer[10 + 3 + 1];
> >>>> +    char *p = buffer;
> >>>> +
> >>>> +    snprintf(buffer, sizeof(buffer), "%04x:%02x:%02x.%02x",
> >>>> +             addr->domain, addr->bus, addr->slot, addr->function);
> >>>> +
> >>>> +    visit_type_str(v, &p, name, errp);
> >>>> +}
> >>>> +
> >>>> +static void set_pci_devaddr(Object *obj, Visitor *v, void *opaque,
> >>>> +                            const char *name, Error **errp)
> >>>> +{
> >>>> +    DeviceState *dev = DEVICE(obj);
> >>>> +    Property *prop = opaque;
> >>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
> >>>> +    Error *local_err = NULL;
> >>>> +    char *str;
> >>>> +
> >>>> +    if (dev->state != DEV_STATE_CREATED) {
> >>>> +        error_set(errp, QERR_PERMISSION_DENIED);
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    visit_type_str(v, &str, name, &local_err);
> >>>> +    if (local_err) {
> >>>> +        error_propagate(errp, local_err);
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    if (qemu_parse_pci_devaddr(str, addr,
> >>>> +                               PCI_DEVADDR_WITH_DOM_BUS_OPT |
> >>>> +                               PCI_DEVADDR_WITH_FUNC) < 0) {
> >>>> +        error_set_from_qdev_prop_error(errp, EINVAL, dev, prop, str);
> >>>> +    }
> >>>> +}
> >>>> +
> >>>> +PropertyInfo qdev_prop_pci_devaddr = {
> >>>> +    .name  = "pci-devaddr",
> >>>
> >>> This is a very confusing name.  Something like host-pci-address?
> >>
> >> That might be an option.
> >>
> >>> This also should be built on linux only.
> >>
> >> Why, what do we gain with #ifdefs? And isn't the addressing concept generic?
> > 
> > Not the XXX:XX.X format. And not the concept of a domain.
> > 
> >>> Can this be part of device assignment code instead of qdev?
> >>
> >> How does VFIO address their host devices?
> > 
> > You get an fd I think. I think you don't need to know the host address.
> 
> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> guess what it does. ;)

Interesting. Why? This looks strange to me:
I would expect the admin to bind a device to vfio
the way it's now bound to a stub.
The pass /dev/vfioXXX to qemu.

> 
> So we need this generic service. Let's called it pci-host-devaddr and be
> fine?
> 
> Jan
>
Jan Kiszka June 10, 2012, 11 a.m. UTC | #6
On 2012-06-10 12:58, Michael S. Tsirkin wrote:
> On Sun, Jun 10, 2012 at 12:52:45PM +0200, Jan Kiszka wrote:
>> On 2012-06-10 12:49, Michael S. Tsirkin wrote:
>>> On Sun, Jun 10, 2012 at 12:14:36PM +0200, Jan Kiszka wrote:
>>>> On 2012-06-10 11:35, Michael S. Tsirkin wrote:
>>>>> On Mon, Jun 04, 2012 at 10:52:21AM +0200, Jan Kiszka wrote:
>>>>>> Add a property to receive a fully qualified PCI device address.
>>>>>>
>>>>>> Will be used by KVM device assignment.
>>>>>>
>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>
>>>>> I'd like to ponder this a bit more.  What bothers me is that this mixes
>>>>> two things:
>>>>> 	- addressing of qemu devices
>>>>> 		Using full device addresses there is a legacy feature,
>>>>> 		users really should supply the parent bus and
>>>>> 		the bus local address.
>>>>> 	- addressing devices on the linux host for assignment
>>>>> 		It so happens that the syntax matches
>>>>> 		the legacy naming very closely,
>>>>> 		but conceptually is completely unrelated
>>>>
>>>> We can keep code duplications, of course.
>>>>
>>>>>
>>>>>> ---
>>>>>>  hw/qdev-properties.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  hw/qdev.h            |    3 +++
>>>>>>  2 files changed, 51 insertions(+), 0 deletions(-)
>>>>>>
>>>>>> diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
>>>>>> index 32e41f1..6634f22 100644
>>>>>> --- a/hw/qdev-properties.c
>>>>>> +++ b/hw/qdev-properties.c
>>>>>> @@ -946,6 +946,54 @@ PropertyInfo qdev_prop_pci_devfn = {
>>>>>>      .max   = 0xFFFFFFFFULL,
>>>>>>  };
>>>>>>  
>>>>>> +static void get_pci_devaddr(Object *obj, Visitor *v, void *opaque,
>>>>>> +                            const char *name, Error **errp)
>>>>>> +{
>>>>>> +    DeviceState *dev = DEVICE(obj);
>>>>>> +    Property *prop = opaque;
>>>>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
>>>>>> +    char buffer[10 + 3 + 1];
>>>>>> +    char *p = buffer;
>>>>>> +
>>>>>> +    snprintf(buffer, sizeof(buffer), "%04x:%02x:%02x.%02x",
>>>>>> +             addr->domain, addr->bus, addr->slot, addr->function);
>>>>>> +
>>>>>> +    visit_type_str(v, &p, name, errp);
>>>>>> +}
>>>>>> +
>>>>>> +static void set_pci_devaddr(Object *obj, Visitor *v, void *opaque,
>>>>>> +                            const char *name, Error **errp)
>>>>>> +{
>>>>>> +    DeviceState *dev = DEVICE(obj);
>>>>>> +    Property *prop = opaque;
>>>>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
>>>>>> +    Error *local_err = NULL;
>>>>>> +    char *str;
>>>>>> +
>>>>>> +    if (dev->state != DEV_STATE_CREATED) {
>>>>>> +        error_set(errp, QERR_PERMISSION_DENIED);
>>>>>> +        return;
>>>>>> +    }
>>>>>> +
>>>>>> +    visit_type_str(v, &str, name, &local_err);
>>>>>> +    if (local_err) {
>>>>>> +        error_propagate(errp, local_err);
>>>>>> +        return;
>>>>>> +    }
>>>>>> +
>>>>>> +    if (qemu_parse_pci_devaddr(str, addr,
>>>>>> +                               PCI_DEVADDR_WITH_DOM_BUS_OPT |
>>>>>> +                               PCI_DEVADDR_WITH_FUNC) < 0) {
>>>>>> +        error_set_from_qdev_prop_error(errp, EINVAL, dev, prop, str);
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +PropertyInfo qdev_prop_pci_devaddr = {
>>>>>> +    .name  = "pci-devaddr",
>>>>>
>>>>> This is a very confusing name.  Something like host-pci-address?
>>>>
>>>> That might be an option.
>>>>
>>>>> This also should be built on linux only.
>>>>
>>>> Why, what do we gain with #ifdefs? And isn't the addressing concept generic?
>>>
>>> Not the XXX:XX.X format. And not the concept of a domain.
>>>
>>>>> Can this be part of device assignment code instead of qdev?
>>>>
>>>> How does VFIO address their host devices?
>>>
>>> You get an fd I think. I think you don't need to know the host address.
>>
>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
>> guess what it does. ;)
> 
> Interesting. Why? This looks strange to me:
> I would expect the admin to bind a device to vfio
> the way it's now bound to a stub.
> The pass /dev/vfioXXX to qemu.

That's the "libvirt way". We surely also want the "qemu command line
way" for which this kind of service is needed.

Jan
Michael S. Tsirkin June 10, 2012, 11:17 a.m. UTC | #7
On Sun, Jun 10, 2012 at 01:00:35PM +0200, Jan Kiszka wrote:
> On 2012-06-10 12:58, Michael S. Tsirkin wrote:
> > On Sun, Jun 10, 2012 at 12:52:45PM +0200, Jan Kiszka wrote:
> >> On 2012-06-10 12:49, Michael S. Tsirkin wrote:
> >>> On Sun, Jun 10, 2012 at 12:14:36PM +0200, Jan Kiszka wrote:
> >>>> On 2012-06-10 11:35, Michael S. Tsirkin wrote:
> >>>>> On Mon, Jun 04, 2012 at 10:52:21AM +0200, Jan Kiszka wrote:
> >>>>>> Add a property to receive a fully qualified PCI device address.
> >>>>>>
> >>>>>> Will be used by KVM device assignment.
> >>>>>>
> >>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >>>>>
> >>>>> I'd like to ponder this a bit more.  What bothers me is that this mixes
> >>>>> two things:
> >>>>> 	- addressing of qemu devices
> >>>>> 		Using full device addresses there is a legacy feature,
> >>>>> 		users really should supply the parent bus and
> >>>>> 		the bus local address.
> >>>>> 	- addressing devices on the linux host for assignment
> >>>>> 		It so happens that the syntax matches
> >>>>> 		the legacy naming very closely,
> >>>>> 		but conceptually is completely unrelated
> >>>>
> >>>> We can keep code duplications, of course.
> >>>>
> >>>>>
> >>>>>> ---
> >>>>>>  hw/qdev-properties.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>>>  hw/qdev.h            |    3 +++
> >>>>>>  2 files changed, 51 insertions(+), 0 deletions(-)
> >>>>>>
> >>>>>> diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
> >>>>>> index 32e41f1..6634f22 100644
> >>>>>> --- a/hw/qdev-properties.c
> >>>>>> +++ b/hw/qdev-properties.c
> >>>>>> @@ -946,6 +946,54 @@ PropertyInfo qdev_prop_pci_devfn = {
> >>>>>>      .max   = 0xFFFFFFFFULL,
> >>>>>>  };
> >>>>>>  
> >>>>>> +static void get_pci_devaddr(Object *obj, Visitor *v, void *opaque,
> >>>>>> +                            const char *name, Error **errp)
> >>>>>> +{
> >>>>>> +    DeviceState *dev = DEVICE(obj);
> >>>>>> +    Property *prop = opaque;
> >>>>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
> >>>>>> +    char buffer[10 + 3 + 1];
> >>>>>> +    char *p = buffer;
> >>>>>> +
> >>>>>> +    snprintf(buffer, sizeof(buffer), "%04x:%02x:%02x.%02x",
> >>>>>> +             addr->domain, addr->bus, addr->slot, addr->function);
> >>>>>> +
> >>>>>> +    visit_type_str(v, &p, name, errp);
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void set_pci_devaddr(Object *obj, Visitor *v, void *opaque,
> >>>>>> +                            const char *name, Error **errp)
> >>>>>> +{
> >>>>>> +    DeviceState *dev = DEVICE(obj);
> >>>>>> +    Property *prop = opaque;
> >>>>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
> >>>>>> +    Error *local_err = NULL;
> >>>>>> +    char *str;
> >>>>>> +
> >>>>>> +    if (dev->state != DEV_STATE_CREATED) {
> >>>>>> +        error_set(errp, QERR_PERMISSION_DENIED);
> >>>>>> +        return;
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    visit_type_str(v, &str, name, &local_err);
> >>>>>> +    if (local_err) {
> >>>>>> +        error_propagate(errp, local_err);
> >>>>>> +        return;
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    if (qemu_parse_pci_devaddr(str, addr,
> >>>>>> +                               PCI_DEVADDR_WITH_DOM_BUS_OPT |
> >>>>>> +                               PCI_DEVADDR_WITH_FUNC) < 0) {
> >>>>>> +        error_set_from_qdev_prop_error(errp, EINVAL, dev, prop, str);
> >>>>>> +    }
> >>>>>> +}
> >>>>>> +
> >>>>>> +PropertyInfo qdev_prop_pci_devaddr = {
> >>>>>> +    .name  = "pci-devaddr",
> >>>>>
> >>>>> This is a very confusing name.  Something like host-pci-address?
> >>>>
> >>>> That might be an option.
> >>>>
> >>>>> This also should be built on linux only.
> >>>>
> >>>> Why, what do we gain with #ifdefs? And isn't the addressing concept generic?
> >>>
> >>> Not the XXX:XX.X format. And not the concept of a domain.
> >>>
> >>>>> Can this be part of device assignment code instead of qdev?
> >>>>
> >>>> How does VFIO address their host devices?
> >>>
> >>> You get an fd I think. I think you don't need to know the host address.
> >>
> >> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> >> guess what it does. ;)
> > 
> > Interesting. Why? This looks strange to me:
> > I would expect the admin to bind a device to vfio
> > the way it's now bound to a stub.
> > The pass /dev/vfioXXX to qemu.
> 
> That's the "libvirt way". We surely also want the "qemu command line
> way" for which this kind of service is needed.
> 
> Jan
> 

Yes, I imagine the qemu command line passing in /dev/vfioXXX,
the libvirt way will pass in an fd for above. No?
Jan Kiszka June 10, 2012, 11:25 a.m. UTC | #8
On 2012-06-10 13:17, Michael S. Tsirkin wrote:
> On Sun, Jun 10, 2012 at 01:00:35PM +0200, Jan Kiszka wrote:
>> On 2012-06-10 12:58, Michael S. Tsirkin wrote:
>>> On Sun, Jun 10, 2012 at 12:52:45PM +0200, Jan Kiszka wrote:
>>>> On 2012-06-10 12:49, Michael S. Tsirkin wrote:
>>>>> On Sun, Jun 10, 2012 at 12:14:36PM +0200, Jan Kiszka wrote:
>>>>>> On 2012-06-10 11:35, Michael S. Tsirkin wrote:
>>>>>>> On Mon, Jun 04, 2012 at 10:52:21AM +0200, Jan Kiszka wrote:
>>>>>>>> Add a property to receive a fully qualified PCI device address.
>>>>>>>>
>>>>>>>> Will be used by KVM device assignment.
>>>>>>>>
>>>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>>>
>>>>>>> I'd like to ponder this a bit more.  What bothers me is that this mixes
>>>>>>> two things:
>>>>>>> 	- addressing of qemu devices
>>>>>>> 		Using full device addresses there is a legacy feature,
>>>>>>> 		users really should supply the parent bus and
>>>>>>> 		the bus local address.
>>>>>>> 	- addressing devices on the linux host for assignment
>>>>>>> 		It so happens that the syntax matches
>>>>>>> 		the legacy naming very closely,
>>>>>>> 		but conceptually is completely unrelated
>>>>>>
>>>>>> We can keep code duplications, of course.
>>>>>>
>>>>>>>
>>>>>>>> ---
>>>>>>>>  hw/qdev-properties.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>  hw/qdev.h            |    3 +++
>>>>>>>>  2 files changed, 51 insertions(+), 0 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
>>>>>>>> index 32e41f1..6634f22 100644
>>>>>>>> --- a/hw/qdev-properties.c
>>>>>>>> +++ b/hw/qdev-properties.c
>>>>>>>> @@ -946,6 +946,54 @@ PropertyInfo qdev_prop_pci_devfn = {
>>>>>>>>      .max   = 0xFFFFFFFFULL,
>>>>>>>>  };
>>>>>>>>  
>>>>>>>> +static void get_pci_devaddr(Object *obj, Visitor *v, void *opaque,
>>>>>>>> +                            const char *name, Error **errp)
>>>>>>>> +{
>>>>>>>> +    DeviceState *dev = DEVICE(obj);
>>>>>>>> +    Property *prop = opaque;
>>>>>>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
>>>>>>>> +    char buffer[10 + 3 + 1];
>>>>>>>> +    char *p = buffer;
>>>>>>>> +
>>>>>>>> +    snprintf(buffer, sizeof(buffer), "%04x:%02x:%02x.%02x",
>>>>>>>> +             addr->domain, addr->bus, addr->slot, addr->function);
>>>>>>>> +
>>>>>>>> +    visit_type_str(v, &p, name, errp);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void set_pci_devaddr(Object *obj, Visitor *v, void *opaque,
>>>>>>>> +                            const char *name, Error **errp)
>>>>>>>> +{
>>>>>>>> +    DeviceState *dev = DEVICE(obj);
>>>>>>>> +    Property *prop = opaque;
>>>>>>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
>>>>>>>> +    Error *local_err = NULL;
>>>>>>>> +    char *str;
>>>>>>>> +
>>>>>>>> +    if (dev->state != DEV_STATE_CREATED) {
>>>>>>>> +        error_set(errp, QERR_PERMISSION_DENIED);
>>>>>>>> +        return;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    visit_type_str(v, &str, name, &local_err);
>>>>>>>> +    if (local_err) {
>>>>>>>> +        error_propagate(errp, local_err);
>>>>>>>> +        return;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    if (qemu_parse_pci_devaddr(str, addr,
>>>>>>>> +                               PCI_DEVADDR_WITH_DOM_BUS_OPT |
>>>>>>>> +                               PCI_DEVADDR_WITH_FUNC) < 0) {
>>>>>>>> +        error_set_from_qdev_prop_error(errp, EINVAL, dev, prop, str);
>>>>>>>> +    }
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +PropertyInfo qdev_prop_pci_devaddr = {
>>>>>>>> +    .name  = "pci-devaddr",
>>>>>>>
>>>>>>> This is a very confusing name.  Something like host-pci-address?
>>>>>>
>>>>>> That might be an option.
>>>>>>
>>>>>>> This also should be built on linux only.
>>>>>>
>>>>>> Why, what do we gain with #ifdefs? And isn't the addressing concept generic?
>>>>>
>>>>> Not the XXX:XX.X format. And not the concept of a domain.
>>>>>
>>>>>>> Can this be part of device assignment code instead of qdev?
>>>>>>
>>>>>> How does VFIO address their host devices?
>>>>>
>>>>> You get an fd I think. I think you don't need to know the host address.
>>>>
>>>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
>>>> guess what it does. ;)
>>>
>>> Interesting. Why? This looks strange to me:
>>> I would expect the admin to bind a device to vfio
>>> the way it's now bound to a stub.
>>> The pass /dev/vfioXXX to qemu.
>>
>> That's the "libvirt way". We surely also want the "qemu command line
>> way" for which this kind of service is needed.
>>
>> Jan
>>
> 
> Yes, I imagine the qemu command line passing in /dev/vfioXXX,
> the libvirt way will pass in an fd for above. No?

As far as I understand the API, there is no device file per assigned
device. Also, this [domain:]bus:dev.fn format is more handy for the
command line.

Jan
Michael S. Tsirkin June 10, 2012, 12:01 p.m. UTC | #9
On Sun, Jun 10, 2012 at 01:25:41PM +0200, Jan Kiszka wrote:
> On 2012-06-10 13:17, Michael S. Tsirkin wrote:
> > On Sun, Jun 10, 2012 at 01:00:35PM +0200, Jan Kiszka wrote:
> >> On 2012-06-10 12:58, Michael S. Tsirkin wrote:
> >>> On Sun, Jun 10, 2012 at 12:52:45PM +0200, Jan Kiszka wrote:
> >>>> On 2012-06-10 12:49, Michael S. Tsirkin wrote:
> >>>>> On Sun, Jun 10, 2012 at 12:14:36PM +0200, Jan Kiszka wrote:
> >>>>>> On 2012-06-10 11:35, Michael S. Tsirkin wrote:
> >>>>>>> On Mon, Jun 04, 2012 at 10:52:21AM +0200, Jan Kiszka wrote:
> >>>>>>>> Add a property to receive a fully qualified PCI device address.
> >>>>>>>>
> >>>>>>>> Will be used by KVM device assignment.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >>>>>>>
> >>>>>>> I'd like to ponder this a bit more.  What bothers me is that this mixes
> >>>>>>> two things:
> >>>>>>> 	- addressing of qemu devices
> >>>>>>> 		Using full device addresses there is a legacy feature,
> >>>>>>> 		users really should supply the parent bus and
> >>>>>>> 		the bus local address.
> >>>>>>> 	- addressing devices on the linux host for assignment
> >>>>>>> 		It so happens that the syntax matches
> >>>>>>> 		the legacy naming very closely,
> >>>>>>> 		but conceptually is completely unrelated
> >>>>>>
> >>>>>> We can keep code duplications, of course.
> >>>>>>
> >>>>>>>
> >>>>>>>> ---
> >>>>>>>>  hw/qdev-properties.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>>>>>  hw/qdev.h            |    3 +++
> >>>>>>>>  2 files changed, 51 insertions(+), 0 deletions(-)
> >>>>>>>>
> >>>>>>>> diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
> >>>>>>>> index 32e41f1..6634f22 100644
> >>>>>>>> --- a/hw/qdev-properties.c
> >>>>>>>> +++ b/hw/qdev-properties.c
> >>>>>>>> @@ -946,6 +946,54 @@ PropertyInfo qdev_prop_pci_devfn = {
> >>>>>>>>      .max   = 0xFFFFFFFFULL,
> >>>>>>>>  };
> >>>>>>>>  
> >>>>>>>> +static void get_pci_devaddr(Object *obj, Visitor *v, void *opaque,
> >>>>>>>> +                            const char *name, Error **errp)
> >>>>>>>> +{
> >>>>>>>> +    DeviceState *dev = DEVICE(obj);
> >>>>>>>> +    Property *prop = opaque;
> >>>>>>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
> >>>>>>>> +    char buffer[10 + 3 + 1];
> >>>>>>>> +    char *p = buffer;
> >>>>>>>> +
> >>>>>>>> +    snprintf(buffer, sizeof(buffer), "%04x:%02x:%02x.%02x",
> >>>>>>>> +             addr->domain, addr->bus, addr->slot, addr->function);
> >>>>>>>> +
> >>>>>>>> +    visit_type_str(v, &p, name, errp);
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +static void set_pci_devaddr(Object *obj, Visitor *v, void *opaque,
> >>>>>>>> +                            const char *name, Error **errp)
> >>>>>>>> +{
> >>>>>>>> +    DeviceState *dev = DEVICE(obj);
> >>>>>>>> +    Property *prop = opaque;
> >>>>>>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
> >>>>>>>> +    Error *local_err = NULL;
> >>>>>>>> +    char *str;
> >>>>>>>> +
> >>>>>>>> +    if (dev->state != DEV_STATE_CREATED) {
> >>>>>>>> +        error_set(errp, QERR_PERMISSION_DENIED);
> >>>>>>>> +        return;
> >>>>>>>> +    }
> >>>>>>>> +
> >>>>>>>> +    visit_type_str(v, &str, name, &local_err);
> >>>>>>>> +    if (local_err) {
> >>>>>>>> +        error_propagate(errp, local_err);
> >>>>>>>> +        return;
> >>>>>>>> +    }
> >>>>>>>> +
> >>>>>>>> +    if (qemu_parse_pci_devaddr(str, addr,
> >>>>>>>> +                               PCI_DEVADDR_WITH_DOM_BUS_OPT |
> >>>>>>>> +                               PCI_DEVADDR_WITH_FUNC) < 0) {
> >>>>>>>> +        error_set_from_qdev_prop_error(errp, EINVAL, dev, prop, str);
> >>>>>>>> +    }
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +PropertyInfo qdev_prop_pci_devaddr = {
> >>>>>>>> +    .name  = "pci-devaddr",
> >>>>>>>
> >>>>>>> This is a very confusing name.  Something like host-pci-address?
> >>>>>>
> >>>>>> That might be an option.
> >>>>>>
> >>>>>>> This also should be built on linux only.
> >>>>>>
> >>>>>> Why, what do we gain with #ifdefs? And isn't the addressing concept generic?
> >>>>>
> >>>>> Not the XXX:XX.X format. And not the concept of a domain.
> >>>>>
> >>>>>>> Can this be part of device assignment code instead of qdev?
> >>>>>>
> >>>>>> How does VFIO address their host devices?
> >>>>>
> >>>>> You get an fd I think. I think you don't need to know the host address.
> >>>>
> >>>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> >>>> guess what it does. ;)
> >>>
> >>> Interesting. Why? This looks strange to me:
> >>> I would expect the admin to bind a device to vfio
> >>> the way it's now bound to a stub.
> >>> The pass /dev/vfioXXX to qemu.
> >>
> >> That's the "libvirt way". We surely also want the "qemu command line
> >> way" for which this kind of service is needed.
> >>
> >> Jan
> >>
> > 
> > Yes, I imagine the qemu command line passing in /dev/vfioXXX,
> > the libvirt way will pass in an fd for above. No?
> 
> As far as I understand the API, there is no device file per assigned
> device.

Does it do pci_get_domain_bus_and_slot like kvm then?
With all the warts like you have to remember to bind pci stub
or you get two drivers for one device?
If true that's unfortunate IMHO.

> Also, this [domain:]bus:dev.fn format is more handy for the
> command line.
> 
> Jan
> 

Then users could add udev rules that will name vfio devices
like this.  Another interesting option: /dev/vfio/eth0/vf1.
That's better I think: no one really likes running lspci
and guessing the address from there.
Alex Williamson June 10, 2012, 1:41 p.m. UTC | #10
On Sun, 2012-06-10 at 15:01 +0300, Michael S. Tsirkin wrote:
> On Sun, Jun 10, 2012 at 01:25:41PM +0200, Jan Kiszka wrote:
> > On 2012-06-10 13:17, Michael S. Tsirkin wrote:
> > > On Sun, Jun 10, 2012 at 01:00:35PM +0200, Jan Kiszka wrote:
> > >> On 2012-06-10 12:58, Michael S. Tsirkin wrote:
> > >>> On Sun, Jun 10, 2012 at 12:52:45PM +0200, Jan Kiszka wrote:
> > >>>> On 2012-06-10 12:49, Michael S. Tsirkin wrote:
> > >>>>> On Sun, Jun 10, 2012 at 12:14:36PM +0200, Jan Kiszka wrote:
> > >>>>>> On 2012-06-10 11:35, Michael S. Tsirkin wrote:
> > >>>>>>> On Mon, Jun 04, 2012 at 10:52:21AM +0200, Jan Kiszka wrote:
> > >>>>>>>> Add a property to receive a fully qualified PCI device address.
> > >>>>>>>>
> > >>>>>>>> Will be used by KVM device assignment.
> > >>>>>>>>
> > >>>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> > >>>>>>>
> > >>>>>>> I'd like to ponder this a bit more.  What bothers me is that this mixes
> > >>>>>>> two things:
> > >>>>>>> 	- addressing of qemu devices
> > >>>>>>> 		Using full device addresses there is a legacy feature,
> > >>>>>>> 		users really should supply the parent bus and
> > >>>>>>> 		the bus local address.
> > >>>>>>> 	- addressing devices on the linux host for assignment
> > >>>>>>> 		It so happens that the syntax matches
> > >>>>>>> 		the legacy naming very closely,
> > >>>>>>> 		but conceptually is completely unrelated
> > >>>>>>
> > >>>>>> We can keep code duplications, of course.
> > >>>>>>
> > >>>>>>>
> > >>>>>>>> ---
> > >>>>>>>>  hw/qdev-properties.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
> > >>>>>>>>  hw/qdev.h            |    3 +++
> > >>>>>>>>  2 files changed, 51 insertions(+), 0 deletions(-)
> > >>>>>>>>
> > >>>>>>>> diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
> > >>>>>>>> index 32e41f1..6634f22 100644
> > >>>>>>>> --- a/hw/qdev-properties.c
> > >>>>>>>> +++ b/hw/qdev-properties.c
> > >>>>>>>> @@ -946,6 +946,54 @@ PropertyInfo qdev_prop_pci_devfn = {
> > >>>>>>>>      .max   = 0xFFFFFFFFULL,
> > >>>>>>>>  };
> > >>>>>>>>  
> > >>>>>>>> +static void get_pci_devaddr(Object *obj, Visitor *v, void *opaque,
> > >>>>>>>> +                            const char *name, Error **errp)
> > >>>>>>>> +{
> > >>>>>>>> +    DeviceState *dev = DEVICE(obj);
> > >>>>>>>> +    Property *prop = opaque;
> > >>>>>>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
> > >>>>>>>> +    char buffer[10 + 3 + 1];
> > >>>>>>>> +    char *p = buffer;
> > >>>>>>>> +
> > >>>>>>>> +    snprintf(buffer, sizeof(buffer), "%04x:%02x:%02x.%02x",
> > >>>>>>>> +             addr->domain, addr->bus, addr->slot, addr->function);
> > >>>>>>>> +
> > >>>>>>>> +    visit_type_str(v, &p, name, errp);
> > >>>>>>>> +}
> > >>>>>>>> +
> > >>>>>>>> +static void set_pci_devaddr(Object *obj, Visitor *v, void *opaque,
> > >>>>>>>> +                            const char *name, Error **errp)
> > >>>>>>>> +{
> > >>>>>>>> +    DeviceState *dev = DEVICE(obj);
> > >>>>>>>> +    Property *prop = opaque;
> > >>>>>>>> +    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
> > >>>>>>>> +    Error *local_err = NULL;
> > >>>>>>>> +    char *str;
> > >>>>>>>> +
> > >>>>>>>> +    if (dev->state != DEV_STATE_CREATED) {
> > >>>>>>>> +        error_set(errp, QERR_PERMISSION_DENIED);
> > >>>>>>>> +        return;
> > >>>>>>>> +    }
> > >>>>>>>> +
> > >>>>>>>> +    visit_type_str(v, &str, name, &local_err);
> > >>>>>>>> +    if (local_err) {
> > >>>>>>>> +        error_propagate(errp, local_err);
> > >>>>>>>> +        return;
> > >>>>>>>> +    }
> > >>>>>>>> +
> > >>>>>>>> +    if (qemu_parse_pci_devaddr(str, addr,
> > >>>>>>>> +                               PCI_DEVADDR_WITH_DOM_BUS_OPT |
> > >>>>>>>> +                               PCI_DEVADDR_WITH_FUNC) < 0) {
> > >>>>>>>> +        error_set_from_qdev_prop_error(errp, EINVAL, dev, prop, str);
> > >>>>>>>> +    }
> > >>>>>>>> +}
> > >>>>>>>> +
> > >>>>>>>> +PropertyInfo qdev_prop_pci_devaddr = {
> > >>>>>>>> +    .name  = "pci-devaddr",
> > >>>>>>>
> > >>>>>>> This is a very confusing name.  Something like host-pci-address?
> > >>>>>>
> > >>>>>> That might be an option.
> > >>>>>>
> > >>>>>>> This also should be built on linux only.
> > >>>>>>
> > >>>>>> Why, what do we gain with #ifdefs? And isn't the addressing concept generic?
> > >>>>>
> > >>>>> Not the XXX:XX.X format. And not the concept of a domain.
> > >>>>>
> > >>>>>>> Can this be part of device assignment code instead of qdev?
> > >>>>>>
> > >>>>>> How does VFIO address their host devices?
> > >>>>>
> > >>>>> You get an fd I think. I think you don't need to know the host address.
> > >>>>
> > >>>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> > >>>> guess what it does. ;)
> > >>>
> > >>> Interesting. Why? This looks strange to me:
> > >>> I would expect the admin to bind a device to vfio
> > >>> the way it's now bound to a stub.
> > >>> The pass /dev/vfioXXX to qemu.
> > >>
> > >> That's the "libvirt way". We surely also want the "qemu command line
> > >> way" for which this kind of service is needed.
> > >>
> > >> Jan
> > >>
> > > 
> > > Yes, I imagine the qemu command line passing in /dev/vfioXXX,
> > > the libvirt way will pass in an fd for above. No?
> > 
> > As far as I understand the API, there is no device file per assigned
> > device.
> 
> Does it do pci_get_domain_bus_and_slot like kvm then?
> With all the warts like you have to remember to bind pci stub
> or you get two drivers for one device?
> If true that's unfortunate IMHO.
> 
> > Also, this [domain:]bus:dev.fn format is more handy for the
> > command line.
> > 
> > Jan
> > 
> 
> Then users could add udev rules that will name vfio devices
> like this.  Another interesting option: /dev/vfio/eth0/vf1.
> That's better I think: no one really likes running lspci
> and guessing the address from there.

That's not at all how VFIO works.  /dev/vfio/# represents a group, which
may contain one or more devices.  Even if libvirt passes a file
descriptor for the group, qemu needs to know which device in the group
to add to the guest, so parsing a device address is still necessary.
Thanks,

Alex
Michael S. Tsirkin June 10, 2012, 1:49 p.m. UTC | #11
On Sun, Jun 10, 2012 at 12:14:36PM +0200, Jan Kiszka wrote:
> On 2012-06-10 11:35, Michael S. Tsirkin wrote:
> > On Mon, Jun 04, 2012 at 10:52:21AM +0200, Jan Kiszka wrote:
> >> Add a property to receive a fully qualified PCI device address.
> >>
> >> Will be used by KVM device assignment.
> >>
> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> > 
> > I'd like to ponder this a bit more.  What bothers me is that this mixes
> > two things:
> > 	- addressing of qemu devices
> > 		Using full device addresses there is a legacy feature,
> > 		users really should supply the parent bus and
> > 		the bus local address.
> > 	- addressing devices on the linux host for assignment
> > 		It so happens that the syntax matches
> > 		the legacy naming very closely,
> > 		but conceptually is completely unrelated
> 
> We can keep code duplications, of course.

Just note that the parsing that we have is not even consistent. For
example what does it mean not to supply an optional field, e.g. a
function number, a domain number, etc? lspci and friends all treat any
missing number as a wildcard:

$ lspci -s 0.1
15:00.1 FireWire (IEEE 1394): Ricoh Co Ltd R5C832 IEEE 1394 Controller (rev 04)

$ lspci -s .1
00:02.1 Display controller: Intel Corporation Mobile 4 Series Chipset Integrated Graphics Controller (rev 07)
00:1a.1 USB Controller: Intel Corporation 82801I (ICH9 Family) USB UHCI Controller #5 (rev 03)
00:1c.1 PCI bridge: Intel Corporation 82801I (ICH9 Family) PCI Express Port 2 (rev 03)
00:1d.1 USB Controller: Intel Corporation 82801I (ICH9 Family) USB UHCI Controller #2 (rev 03)
15:00.1 FireWire (IEEE 1394): Ricoh Co Ltd R5C832 IEEE 1394 Controller (rev 04)

We don't, and the only reason for that is that it's been buggy
for a long time.
Michael S. Tsirkin June 10, 2012, 2:03 p.m. UTC | #12
On Sun, Jun 10, 2012 at 07:41:51AM -0600, Alex Williamson wrote:
> > > >>>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> > > >>>> guess what it does. ;)
> > > >>>
> > > >>> Interesting. Why? This looks strange to me:
> > > >>> I would expect the admin to bind a device to vfio
> > > >>> the way it's now bound to a stub.
> > > >>> The pass /dev/vfioXXX to qemu.
> > > >>
> > > >> That's the "libvirt way". We surely also want the "qemu command line
> > > >> way" for which this kind of service is needed.
> > > >>
> > > >> Jan
> > > >>
> > > > 
> > > > Yes, I imagine the qemu command line passing in /dev/vfioXXX,
> > > > the libvirt way will pass in an fd for above. No?
> > > 
> > > As far as I understand the API, there is no device file per assigned
> > > device.
> > 
> > Does it do pci_get_domain_bus_and_slot like kvm then?
> > With all the warts like you have to remember to bind pci stub
> > or you get two drivers for one device?
> > If true that's unfortunate IMHO.

I hope the answer to the above is no?

> > > Also, this [domain:]bus:dev.fn format is more handy for the
> > > command line.
> > > 
> > > Jan
> > > 
> > 
> > Then users could add udev rules that will name vfio devices
> > like this.  Another interesting option: /dev/vfio/eth0/vf1.
> > That's better I think: no one really likes running lspci
> > and guessing the address from there.
> 
> That's not at all how VFIO works.  /dev/vfio/# represents a group, which
> may contain one or more devices.  Even if libvirt passes a file
> descriptor for the group, qemu needs to know which device in the group
> to add to the guest, so parsing a device address is still necessary.
> Thanks,
> 
> Alex

That's very unusual, and unfortunate.  For example this means that I
must update applications just because I move a card to another slot.
UIO does not have this problem.
The fact that it's broken in kvm ATM seems to have made people
think it's okay, but it really is a bug. We didn't fix it
because vfio was supposed to be the solution.

I do realize you want to represent a group of devices somehow but can't
this be solved without breaking naming devices with udev? For example, the
device could be a file as well. You would then use the fd to identify the
device within the group. And in a somewhat common case of a single device
within the group, you can even make opening the group optional.
Don't know if this fix I suggest makes sense at all but it's a real
problem all the same.
Alex Williamson June 10, 2012, 2:41 p.m. UTC | #13
On Sun, 2012-06-10 at 17:03 +0300, Michael S. Tsirkin wrote:
> On Sun, Jun 10, 2012 at 07:41:51AM -0600, Alex Williamson wrote:
> > > > >>>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> > > > >>>> guess what it does. ;)
> > > > >>>
> > > > >>> Interesting. Why? This looks strange to me:
> > > > >>> I would expect the admin to bind a device to vfio
> > > > >>> the way it's now bound to a stub.
> > > > >>> The pass /dev/vfioXXX to qemu.
> > > > >>
> > > > >> That's the "libvirt way". We surely also want the "qemu command line
> > > > >> way" for which this kind of service is needed.
> > > > >>
> > > > >> Jan
> > > > >>
> > > > > 
> > > > > Yes, I imagine the qemu command line passing in /dev/vfioXXX,
> > > > > the libvirt way will pass in an fd for above. No?
> > > > 
> > > > As far as I understand the API, there is no device file per assigned
> > > > device.
> > > 
> > > Does it do pci_get_domain_bus_and_slot like kvm then?
> > > With all the warts like you have to remember to bind pci stub
> > > or you get two drivers for one device?
> > > If true that's unfortunate IMHO.
> 
> I hope the answer to the above is no?

No, it does a probe for devices.  We need the devaddr to compare against
dev_name of the device to figure out which device the user is attempting
to identify.

> > > > Also, this [domain:]bus:dev.fn format is more handy for the
> > > > command line.
> > > > 
> > > > Jan
> > > > 
> > > 
> > > Then users could add udev rules that will name vfio devices
> > > like this.  Another interesting option: /dev/vfio/eth0/vf1.
> > > That's better I think: no one really likes running lspci
> > > and guessing the address from there.
> > 
> > That's not at all how VFIO works.  /dev/vfio/# represents a group, which
> > may contain one or more devices.  Even if libvirt passes a file
> > descriptor for the group, qemu needs to know which device in the group
> > to add to the guest, so parsing a device address is still necessary.
> > Thanks,
> > 
> > Alex
> 
> That's very unusual, and unfortunate.  For example this means that I
> must update applications just because I move a card to another slot.
> UIO does not have this problem.
> The fact that it's broken in kvm ATM seems to have made people
> think it's okay, but it really is a bug. We didn't fix it
> because vfio was supposed to be the solution.

I don't know what you're talking about here.  Are you suggesting that
needing to specify -device pci-assign,host=3.0 changing to host=4.0 when
you move a card is broken?  How does UIO avoid such a problem.  UIO-pci
requires the user to use pci-sysfs for resource access, so it surely
cares about the device address.

> I do realize you want to represent a group of devices somehow but can't
> this be solved without breaking naming devices with udev? For example, the
> device could be a file as well. You would then use the fd to identify the
> device within the group. And in a somewhat common case of a single device
> within the group, you can even make opening the group optional.
> Don't know if this fix I suggest makes sense at all but it's a real
> problem all the same.

Unfortunately, exposing individual devices just confuses the ownership
model we require for groups.  It would provide the illusion of being
able to assign an individual device, without the reality of the
grouping.  Groups are owned either by _a_ user or by the kernel, they
can't be split across multiple users (at least not with any guarantees
of isolation).  The current interface makes this clear.  Thanks,

Alex
Michael S. Tsirkin June 10, 2012, 2:54 p.m. UTC | #14
On Sun, Jun 10, 2012 at 08:41:03AM -0600, Alex Williamson wrote:
> On Sun, 2012-06-10 at 17:03 +0300, Michael S. Tsirkin wrote:
> > On Sun, Jun 10, 2012 at 07:41:51AM -0600, Alex Williamson wrote:
> > > > > >>>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> > > > > >>>> guess what it does. ;)
> > > > > >>>
> > > > > >>> Interesting. Why? This looks strange to me:
> > > > > >>> I would expect the admin to bind a device to vfio
> > > > > >>> the way it's now bound to a stub.
> > > > > >>> The pass /dev/vfioXXX to qemu.
> > > > > >>
> > > > > >> That's the "libvirt way". We surely also want the "qemu command line
> > > > > >> way" for which this kind of service is needed.
> > > > > >>
> > > > > >> Jan
> > > > > >>
> > > > > > 
> > > > > > Yes, I imagine the qemu command line passing in /dev/vfioXXX,
> > > > > > the libvirt way will pass in an fd for above. No?
> > > > > 
> > > > > As far as I understand the API, there is no device file per assigned
> > > > > device.
> > > > 
> > > > Does it do pci_get_domain_bus_and_slot like kvm then?
> > > > With all the warts like you have to remember to bind pci stub
> > > > or you get two drivers for one device?
> > > > If true that's unfortunate IMHO.
> > 
> > I hope the answer to the above is no?
> 
> No, it does a probe for devices.  We need the devaddr to compare against
> dev_name of the device to figure out which device the user is attempting
> to identify.
> 
> > > > > Also, this [domain:]bus:dev.fn format is more handy for the
> > > > > command line.
> > > > > 
> > > > > Jan
> > > > > 
> > > > 
> > > > Then users could add udev rules that will name vfio devices
> > > > like this.  Another interesting option: /dev/vfio/eth0/vf1.
> > > > That's better I think: no one really likes running lspci
> > > > and guessing the address from there.
> > > 
> > > That's not at all how VFIO works.  /dev/vfio/# represents a group, which
> > > may contain one or more devices.  Even if libvirt passes a file
> > > descriptor for the group, qemu needs to know which device in the group
> > > to add to the guest, so parsing a device address is still necessary.
> > > Thanks,
> > > 
> > > Alex
> > 
> > That's very unusual, and unfortunate.  For example this means that I
> > must update applications just because I move a card to another slot.
> > UIO does not have this problem.
> > The fact that it's broken in kvm ATM seems to have made people
> > think it's okay, but it really is a bug. We didn't fix it
> > because vfio was supposed to be the solution.
> 
> I don't know what you're talking about here.  Are you suggesting that
> needing to specify -device pci-assign,host=3.0 changing to host=4.0 when
> you move a card is broken?

Yes. Absolutely. Admin should be able to abstract it away without users
knowing anything about it.

>  How does UIO avoid such a problem.

Normally you use a misc device that you can name with udev.

>  UIO-pci
> requires the user to use pci-sysfs for resource access, so it surely
> cares about the device address.

Only uio_pci_generic. Other uio devices let you drive the
device.

> > I do realize you want to represent a group of devices somehow but can't
> > this be solved without breaking naming devices with udev? For example, the
> > device could be a file as well. You would then use the fd to identify the
> > device within the group. And in a somewhat common case of a single device
> > within the group, you can even make opening the group optional.
> > Don't know if this fix I suggest makes sense at all but it's a real
> > problem all the same.
> 
> Unfortunately, exposing individual devices just confuses the ownership
> model we require for groups.  It would provide the illusion of being
> able to assign an individual device, without the reality of the
> grouping.  Groups are owned either by _a_ user or by the kernel, they
> can't be split across multiple users (at least not with any guarantees
> of isolation).  The current interface makes this clear.  Thanks,
> 
> Alex

So do users pass in group=/dev/vfio/1,host=0:3.0 then?
Alex Williamson June 10, 2012, 3:15 p.m. UTC | #15
On Sun, 2012-06-10 at 17:54 +0300, Michael S. Tsirkin wrote:
> On Sun, Jun 10, 2012 at 08:41:03AM -0600, Alex Williamson wrote:
> > On Sun, 2012-06-10 at 17:03 +0300, Michael S. Tsirkin wrote:
> > > On Sun, Jun 10, 2012 at 07:41:51AM -0600, Alex Williamson wrote:
> > > > > > >>>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> > > > > > >>>> guess what it does. ;)
> > > > > > >>>
> > > > > > >>> Interesting. Why? This looks strange to me:
> > > > > > >>> I would expect the admin to bind a device to vfio
> > > > > > >>> the way it's now bound to a stub.
> > > > > > >>> The pass /dev/vfioXXX to qemu.
> > > > > > >>
> > > > > > >> That's the "libvirt way". We surely also want the "qemu command line
> > > > > > >> way" for which this kind of service is needed.
> > > > > > >>
> > > > > > >> Jan
> > > > > > >>
> > > > > > > 
> > > > > > > Yes, I imagine the qemu command line passing in /dev/vfioXXX,
> > > > > > > the libvirt way will pass in an fd for above. No?
> > > > > > 
> > > > > > As far as I understand the API, there is no device file per assigned
> > > > > > device.
> > > > > 
> > > > > Does it do pci_get_domain_bus_and_slot like kvm then?
> > > > > With all the warts like you have to remember to bind pci stub
> > > > > or you get two drivers for one device?
> > > > > If true that's unfortunate IMHO.
> > > 
> > > I hope the answer to the above is no?
> > 
> > No, it does a probe for devices.  We need the devaddr to compare against
> > dev_name of the device to figure out which device the user is attempting
> > to identify.
> > 
> > > > > > Also, this [domain:]bus:dev.fn format is more handy for the
> > > > > > command line.
> > > > > > 
> > > > > > Jan
> > > > > > 
> > > > > 
> > > > > Then users could add udev rules that will name vfio devices
> > > > > like this.  Another interesting option: /dev/vfio/eth0/vf1.
> > > > > That's better I think: no one really likes running lspci
> > > > > and guessing the address from there.
> > > > 
> > > > That's not at all how VFIO works.  /dev/vfio/# represents a group, which
> > > > may contain one or more devices.  Even if libvirt passes a file
> > > > descriptor for the group, qemu needs to know which device in the group
> > > > to add to the guest, so parsing a device address is still necessary.
> > > > Thanks,
> > > > 
> > > > Alex
> > > 
> > > That's very unusual, and unfortunate.  For example this means that I
> > > must update applications just because I move a card to another slot.
> > > UIO does not have this problem.
> > > The fact that it's broken in kvm ATM seems to have made people
> > > think it's okay, but it really is a bug. We didn't fix it
> > > because vfio was supposed to be the solution.
> > 
> > I don't know what you're talking about here.  Are you suggesting that
> > needing to specify -device pci-assign,host=3.0 changing to host=4.0 when
> > you move a card is broken?
> 
> Yes. Absolutely. Admin should be able to abstract it away without users
> knowing anything about it.

We don't have UUIDs on PCI devices, so who's to say that the device that
was in slot 3 is the same device that's now in slot 4 and the user
should still have access to it?  That sounds even more problematic.

> >  How does UIO avoid such a problem.
> 
> Normally you use a misc device that you can name with udev.
> 
> >  UIO-pci
> > requires the user to use pci-sysfs for resource access, so it surely
> > cares about the device address.
> 
> Only uio_pci_generic. Other uio devices let you drive the
> device.

If this is actually a problem, this is the first ever complaint I've
heard about it.  As above, I don't think we can assume the same access
when a device is moved.

> > > I do realize you want to represent a group of devices somehow but can't
> > > this be solved without breaking naming devices with udev? For example, the
> > > device could be a file as well. You would then use the fd to identify the
> > > device within the group. And in a somewhat common case of a single device
> > > within the group, you can even make opening the group optional.
> > > Don't know if this fix I suggest makes sense at all but it's a real
> > > problem all the same.
> > 
> > Unfortunately, exposing individual devices just confuses the ownership
> > model we require for groups.  It would provide the illusion of being
> > able to assign an individual device, without the reality of the
> > grouping.  Groups are owned either by _a_ user or by the kernel, they
> > can't be split across multiple users (at least not with any guarantees
> > of isolation).  The current interface makes this clear.  Thanks,
> > 
> > Alex
> 
> So do users pass in group=/dev/vfio/1,host=0:3.0 then?

No, vfio syntax is -device vfio-pci,host=0:3.0, just like pci-assign.
Qemu will figure out which group that device belongs to and "do the
right thing".  If we add support for libvirt passing a groupfd, it will
be mostly the same, just using scm_rights to get the groupfd instead of
opening it directly.  Thanks,

Alex
Michael S. Tsirkin June 10, 2012, 3:37 p.m. UTC | #16
On Sun, Jun 10, 2012 at 09:15:10AM -0600, Alex Williamson wrote:
> On Sun, 2012-06-10 at 17:54 +0300, Michael S. Tsirkin wrote:
> > On Sun, Jun 10, 2012 at 08:41:03AM -0600, Alex Williamson wrote:
> > > On Sun, 2012-06-10 at 17:03 +0300, Michael S. Tsirkin wrote:
> > > > On Sun, Jun 10, 2012 at 07:41:51AM -0600, Alex Williamson wrote:
> > > > > > > >>>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> > > > > > > >>>> guess what it does. ;)
> > > > > > > >>>
> > > > > > > >>> Interesting. Why? This looks strange to me:
> > > > > > > >>> I would expect the admin to bind a device to vfio
> > > > > > > >>> the way it's now bound to a stub.
> > > > > > > >>> The pass /dev/vfioXXX to qemu.
> > > > > > > >>
> > > > > > > >> That's the "libvirt way". We surely also want the "qemu command line
> > > > > > > >> way" for which this kind of service is needed.
> > > > > > > >>
> > > > > > > >> Jan
> > > > > > > >>
> > > > > > > > 
> > > > > > > > Yes, I imagine the qemu command line passing in /dev/vfioXXX,
> > > > > > > > the libvirt way will pass in an fd for above. No?
> > > > > > > 
> > > > > > > As far as I understand the API, there is no device file per assigned
> > > > > > > device.
> > > > > > 
> > > > > > Does it do pci_get_domain_bus_and_slot like kvm then?
> > > > > > With all the warts like you have to remember to bind pci stub
> > > > > > or you get two drivers for one device?
> > > > > > If true that's unfortunate IMHO.
> > > > 
> > > > I hope the answer to the above is no?
> > > 
> > > No, it does a probe for devices.  We need the devaddr to compare against
> > > dev_name of the device to figure out which device the user is attempting
> > > to identify.
> > > 
> > > > > > > Also, this [domain:]bus:dev.fn format is more handy for the
> > > > > > > command line.
> > > > > > > 
> > > > > > > Jan
> > > > > > > 
> > > > > > 
> > > > > > Then users could add udev rules that will name vfio devices
> > > > > > like this.  Another interesting option: /dev/vfio/eth0/vf1.
> > > > > > That's better I think: no one really likes running lspci
> > > > > > and guessing the address from there.
> > > > > 
> > > > > That's not at all how VFIO works.  /dev/vfio/# represents a group, which
> > > > > may contain one or more devices.  Even if libvirt passes a file
> > > > > descriptor for the group, qemu needs to know which device in the group
> > > > > to add to the guest, so parsing a device address is still necessary.
> > > > > Thanks,
> > > > > 
> > > > > Alex
> > > > 
> > > > That's very unusual, and unfortunate.  For example this means that I
> > > > must update applications just because I move a card to another slot.
> > > > UIO does not have this problem.
> > > > The fact that it's broken in kvm ATM seems to have made people
> > > > think it's okay, but it really is a bug. We didn't fix it
> > > > because vfio was supposed to be the solution.
> > > 
> > > I don't know what you're talking about here.  Are you suggesting that
> > > needing to specify -device pci-assign,host=3.0 changing to host=4.0 when
> > > you move a card is broken?
> > 
> > Yes. Absolutely. Admin should be able to abstract it away without users
> > knowing anything about it.
> 
> We don't have UUIDs on PCI devices, so who's to say that the device that
> was in slot 3 is the same device that's now in slot 4 and the user
> should still have access to it?  That sounds even more problematic.

PF has a driver loaded so you can identify that, and
identify the VF through it. Again this is really policy,
it should be up to the admin how to name the device.

> > >  How does UIO avoid such a problem.
> > 
> > Normally you use a misc device that you can name with udev.
> > 
> > >  UIO-pci
> > > requires the user to use pci-sysfs for resource access, so it surely
> > > cares about the device address.
> > 
> > Only uio_pci_generic. Other uio devices let you drive the
> > device.
> 
> If this is actually a problem, this is the first ever complaint I've
> heard about it.  As above, I don't think we can assume the same access
> when a device is moved.

I thought need for sane naming and for sysfs interface was discussed
multiple times. But maybe I'm misremembering.

> > > > I do realize you want to represent a group of devices somehow but can't
> > > > this be solved without breaking naming devices with udev? For example, the
> > > > device could be a file as well. You would then use the fd to identify the
> > > > device within the group. And in a somewhat common case of a single device
> > > > within the group, you can even make opening the group optional.
> > > > Don't know if this fix I suggest makes sense at all but it's a real
> > > > problem all the same.
> > > 
> > > Unfortunately, exposing individual devices just confuses the ownership
> > > model we require for groups.  It would provide the illusion of being
> > > able to assign an individual device, without the reality of the
> > > grouping.  Groups are owned either by _a_ user or by the kernel, they
> > > can't be split across multiple users (at least not with any guarantees
> > > of isolation).  The current interface makes this clear.  Thanks,
> > > 
> > > Alex
> > 
> > So do users pass in group=/dev/vfio/1,host=0:3.0 then?
> 
> No, vfio syntax is -device vfio-pci,host=0:3.0, just like pci-assign.
> Qemu will figure out which group that device belongs to and "do the
> right thing".  If we add support for libvirt passing a groupfd, it will
> be mostly the same, just using scm_rights to get the groupfd instead of
> opening it directly.  Thanks,
> 
> Alex

Then how do you know which /dev/vfio/# to open?
Alex Williamson June 10, 2012, 3:58 p.m. UTC | #17
On Sun, 2012-06-10 at 18:37 +0300, Michael S. Tsirkin wrote:
> On Sun, Jun 10, 2012 at 09:15:10AM -0600, Alex Williamson wrote:
> > On Sun, 2012-06-10 at 17:54 +0300, Michael S. Tsirkin wrote:
> > > On Sun, Jun 10, 2012 at 08:41:03AM -0600, Alex Williamson wrote:
> > > > On Sun, 2012-06-10 at 17:03 +0300, Michael S. Tsirkin wrote:
> > > > > On Sun, Jun 10, 2012 at 07:41:51AM -0600, Alex Williamson wrote:
> > > > > > > > >>>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> > > > > > > > >>>> guess what it does. ;)
> > > > > > > > >>>
> > > > > > > > >>> Interesting. Why? This looks strange to me:
> > > > > > > > >>> I would expect the admin to bind a device to vfio
> > > > > > > > >>> the way it's now bound to a stub.
> > > > > > > > >>> The pass /dev/vfioXXX to qemu.
> > > > > > > > >>
> > > > > > > > >> That's the "libvirt way". We surely also want the "qemu command line
> > > > > > > > >> way" for which this kind of service is needed.
> > > > > > > > >>
> > > > > > > > >> Jan
> > > > > > > > >>
> > > > > > > > > 
> > > > > > > > > Yes, I imagine the qemu command line passing in /dev/vfioXXX,
> > > > > > > > > the libvirt way will pass in an fd for above. No?
> > > > > > > > 
> > > > > > > > As far as I understand the API, there is no device file per assigned
> > > > > > > > device.
> > > > > > > 
> > > > > > > Does it do pci_get_domain_bus_and_slot like kvm then?
> > > > > > > With all the warts like you have to remember to bind pci stub
> > > > > > > or you get two drivers for one device?
> > > > > > > If true that's unfortunate IMHO.
> > > > > 
> > > > > I hope the answer to the above is no?
> > > > 
> > > > No, it does a probe for devices.  We need the devaddr to compare against
> > > > dev_name of the device to figure out which device the user is attempting
> > > > to identify.
> > > > 
> > > > > > > > Also, this [domain:]bus:dev.fn format is more handy for the
> > > > > > > > command line.
> > > > > > > > 
> > > > > > > > Jan
> > > > > > > > 
> > > > > > > 
> > > > > > > Then users could add udev rules that will name vfio devices
> > > > > > > like this.  Another interesting option: /dev/vfio/eth0/vf1.
> > > > > > > That's better I think: no one really likes running lspci
> > > > > > > and guessing the address from there.
> > > > > > 
> > > > > > That's not at all how VFIO works.  /dev/vfio/# represents a group, which
> > > > > > may contain one or more devices.  Even if libvirt passes a file
> > > > > > descriptor for the group, qemu needs to know which device in the group
> > > > > > to add to the guest, so parsing a device address is still necessary.
> > > > > > Thanks,
> > > > > > 
> > > > > > Alex
> > > > > 
> > > > > That's very unusual, and unfortunate.  For example this means that I
> > > > > must update applications just because I move a card to another slot.
> > > > > UIO does not have this problem.
> > > > > The fact that it's broken in kvm ATM seems to have made people
> > > > > think it's okay, but it really is a bug. We didn't fix it
> > > > > because vfio was supposed to be the solution.
> > > > 
> > > > I don't know what you're talking about here.  Are you suggesting that
> > > > needing to specify -device pci-assign,host=3.0 changing to host=4.0 when
> > > > you move a card is broken?
> > > 
> > > Yes. Absolutely. Admin should be able to abstract it away without users
> > > knowing anything about it.
> > 
> > We don't have UUIDs on PCI devices, so who's to say that the device that
> > was in slot 3 is the same device that's now in slot 4 and the user
> > should still have access to it?  That sounds even more problematic.
> 
> PF has a driver loaded so you can identify that, and
> identify the VF through it. Again this is really policy,
> it should be up to the admin how to name the device.

Do PFs have a UUID?  Some devices support a serial number, but that's
not related to being a PF vs VF.  We need to support both PFs and VFs
regardless of whether they have any kind of UUID.  I think we're
inventing a problem though.

> > > >  How does UIO avoid such a problem.
> > > 
> > > Normally you use a misc device that you can name with udev.
> > > 
> > > >  UIO-pci
> > > > requires the user to use pci-sysfs for resource access, so it surely
> > > > cares about the device address.
> > > 
> > > Only uio_pci_generic. Other uio devices let you drive the
> > > device.
> > 
> > If this is actually a problem, this is the first ever complaint I've
> > heard about it.  As above, I don't think we can assume the same access
> > when a device is moved.
> 
> I thought need for sane naming and for sysfs interface was discussed
> multiple times. But maybe I'm misremembering.

There is sane naming and a sysfs interface...

> > > > > I do realize you want to represent a group of devices somehow but can't
> > > > > this be solved without breaking naming devices with udev? For example, the
> > > > > device could be a file as well. You would then use the fd to identify the
> > > > > device within the group. And in a somewhat common case of a single device
> > > > > within the group, you can even make opening the group optional.
> > > > > Don't know if this fix I suggest makes sense at all but it's a real
> > > > > problem all the same.
> > > > 
> > > > Unfortunately, exposing individual devices just confuses the ownership
> > > > model we require for groups.  It would provide the illusion of being
> > > > able to assign an individual device, without the reality of the
> > > > grouping.  Groups are owned either by _a_ user or by the kernel, they
> > > > can't be split across multiple users (at least not with any guarantees
> > > > of isolation).  The current interface makes this clear.  Thanks,
> > > > 
> > > > Alex
> > > 
> > > So do users pass in group=/dev/vfio/1,host=0:3.0 then?
> > 
> > No, vfio syntax is -device vfio-pci,host=0:3.0, just like pci-assign.
> > Qemu will figure out which group that device belongs to and "do the
> > right thing".  If we add support for libvirt passing a groupfd, it will
> > be mostly the same, just using scm_rights to get the groupfd instead of
> > opening it directly.  Thanks,
> > 
> > Alex
> 
> Then how do you know which /dev/vfio/# to open?

This is all in the documentation patch... groups are exposed in sysfs
in /sys/kernel/iommu_groups.  Each group has a unique number which is
exposed as a directory.  Each group directory has a subdirectory called
devices which links to all devices in the group.  Each device within a
group as an iommu_group link back to the group directory.
The /dev/vfio/# entry matches the group number in sysfs.  So it's all
pretty easy.  Thanks,

Alex
Michael S. Tsirkin June 10, 2012, 4:22 p.m. UTC | #18
On Sun, Jun 10, 2012 at 09:58:17AM -0600, Alex Williamson wrote:
> On Sun, 2012-06-10 at 18:37 +0300, Michael S. Tsirkin wrote:
> > On Sun, Jun 10, 2012 at 09:15:10AM -0600, Alex Williamson wrote:
> > > On Sun, 2012-06-10 at 17:54 +0300, Michael S. Tsirkin wrote:
> > > > On Sun, Jun 10, 2012 at 08:41:03AM -0600, Alex Williamson wrote:
> > > > > On Sun, 2012-06-10 at 17:03 +0300, Michael S. Tsirkin wrote:
> > > > > > On Sun, Jun 10, 2012 at 07:41:51AM -0600, Alex Williamson wrote:
> > > > > > > > > >>>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> > > > > > > > > >>>> guess what it does. ;)
> > > > > > > > > >>>
> > > > > > > > > >>> Interesting. Why? This looks strange to me:
> > > > > > > > > >>> I would expect the admin to bind a device to vfio
> > > > > > > > > >>> the way it's now bound to a stub.
> > > > > > > > > >>> The pass /dev/vfioXXX to qemu.
> > > > > > > > > >>
> > > > > > > > > >> That's the "libvirt way". We surely also want the "qemu command line
> > > > > > > > > >> way" for which this kind of service is needed.
> > > > > > > > > >>
> > > > > > > > > >> Jan
> > > > > > > > > >>
> > > > > > > > > > 
> > > > > > > > > > Yes, I imagine the qemu command line passing in /dev/vfioXXX,
> > > > > > > > > > the libvirt way will pass in an fd for above. No?
> > > > > > > > > 
> > > > > > > > > As far as I understand the API, there is no device file per assigned
> > > > > > > > > device.
> > > > > > > > 
> > > > > > > > Does it do pci_get_domain_bus_and_slot like kvm then?
> > > > > > > > With all the warts like you have to remember to bind pci stub
> > > > > > > > or you get two drivers for one device?
> > > > > > > > If true that's unfortunate IMHO.
> > > > > > 
> > > > > > I hope the answer to the above is no?
> > > > > 
> > > > > No, it does a probe for devices.  We need the devaddr to compare against
> > > > > dev_name of the device to figure out which device the user is attempting
> > > > > to identify.
> > > > > 
> > > > > > > > > Also, this [domain:]bus:dev.fn format is more handy for the
> > > > > > > > > command line.
> > > > > > > > > 
> > > > > > > > > Jan
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > Then users could add udev rules that will name vfio devices
> > > > > > > > like this.  Another interesting option: /dev/vfio/eth0/vf1.
> > > > > > > > That's better I think: no one really likes running lspci
> > > > > > > > and guessing the address from there.
> > > > > > > 
> > > > > > > That's not at all how VFIO works.  /dev/vfio/# represents a group, which
> > > > > > > may contain one or more devices.  Even if libvirt passes a file
> > > > > > > descriptor for the group, qemu needs to know which device in the group
> > > > > > > to add to the guest, so parsing a device address is still necessary.
> > > > > > > Thanks,
> > > > > > > 
> > > > > > > Alex
> > > > > > 
> > > > > > That's very unusual, and unfortunate.  For example this means that I
> > > > > > must update applications just because I move a card to another slot.
> > > > > > UIO does not have this problem.
> > > > > > The fact that it's broken in kvm ATM seems to have made people
> > > > > > think it's okay, but it really is a bug. We didn't fix it
> > > > > > because vfio was supposed to be the solution.
> > > > > 
> > > > > I don't know what you're talking about here.  Are you suggesting that
> > > > > needing to specify -device pci-assign,host=3.0 changing to host=4.0 when
> > > > > you move a card is broken?
> > > > 
> > > > Yes. Absolutely. Admin should be able to abstract it away without users
> > > > knowing anything about it.
> > > 
> > > We don't have UUIDs on PCI devices, so who's to say that the device that
> > > was in slot 3 is the same device that's now in slot 4 and the user
> > > should still have access to it?  That sounds even more problematic.
> > 
> > PF has a driver loaded so you can identify that, and
> > identify the VF through it. Again this is really policy,
> > it should be up to the admin how to name the device.
> 
> Do PFs have a UUID?  Some devices support a serial number, but that's
> not related to being a PF vs VF.  We need to support both PFs and VFs
> regardless of whether they have any kind of UUID.

This is a solved problem. udev has class-specific ways to
get the device id and use it to find the name.

> I think we're inventing a problem though.

You think persistent names in udev were a solution in search of a
problem?

> > > > >  How does UIO avoid such a problem.
> > > > 
> > > > Normally you use a misc device that you can name with udev.
> > > > 
> > > > >  UIO-pci
> > > > > requires the user to use pci-sysfs for resource access, so it surely
> > > > > cares about the device address.
> > > > 
> > > > Only uio_pci_generic. Other uio devices let you drive the
> > > > device.
> > > 
> > > If this is actually a problem, this is the first ever complaint I've
> > > heard about it.  As above, I don't think we can assume the same access
> > > when a device is moved.
> > 
> > I thought need for sane naming and for sysfs interface was discussed
> > multiple times. But maybe I'm misremembering.
> 
> There is sane naming and a sysfs interface...

It's not sane if the admin can't rename the device without breaking
applications.

> > > > > > I do realize you want to represent a group of devices somehow but can't
> > > > > > this be solved without breaking naming devices with udev? For example, the
> > > > > > device could be a file as well. You would then use the fd to identify the
> > > > > > device within the group. And in a somewhat common case of a single device
> > > > > > within the group, you can even make opening the group optional.
> > > > > > Don't know if this fix I suggest makes sense at all but it's a real
> > > > > > problem all the same.
> > > > > 
> > > > > Unfortunately, exposing individual devices just confuses the ownership
> > > > > model we require for groups.  It would provide the illusion of being
> > > > > able to assign an individual device, without the reality of the
> > > > > grouping.  Groups are owned either by _a_ user or by the kernel, they
> > > > > can't be split across multiple users (at least not with any guarantees
> > > > > of isolation).  The current interface makes this clear.  Thanks,
> > > > > 
> > > > > Alex
> > > > 
> > > > So do users pass in group=/dev/vfio/1,host=0:3.0 then?
> > > 
> > > No, vfio syntax is -device vfio-pci,host=0:3.0, just like pci-assign.
> > > Qemu will figure out which group that device belongs to and "do the
> > > right thing".  If we add support for libvirt passing a groupfd, it will
> > > be mostly the same, just using scm_rights to get the groupfd instead of
> > > opening it directly.  Thanks,
> > > 
> > > Alex
> > 
> > Then how do you know which /dev/vfio/# to open?
> 
> This is all in the documentation patch... groups are exposed in sysfs
> in /sys/kernel/iommu_groups.  Each group has a unique number which is
> exposed as a directory.  Each group directory has a subdirectory called
> devices which links to all devices in the group.  Each device within a
> group as an iommu_group link back to the group directory.
> The /dev/vfio/# entry matches the group number in sysfs.  So it's all
> pretty easy.  Thanks,
> 
> Alex

So what's the problem to have devices in sysfs linked e.g. from
/sys/class/vfio/ ?  udev could create the nodes e.g. in
/dev/vfio/devices/.  User can then pass the device name and qemu can
figure out the group from sysfs.
Alex Williamson June 10, 2012, 5:29 p.m. UTC | #19
On Sun, 2012-06-10 at 19:22 +0300, Michael S. Tsirkin wrote:
> On Sun, Jun 10, 2012 at 09:58:17AM -0600, Alex Williamson wrote:
> > On Sun, 2012-06-10 at 18:37 +0300, Michael S. Tsirkin wrote:
> > > On Sun, Jun 10, 2012 at 09:15:10AM -0600, Alex Williamson wrote:
> > > > On Sun, 2012-06-10 at 17:54 +0300, Michael S. Tsirkin wrote:
> > > > > On Sun, Jun 10, 2012 at 08:41:03AM -0600, Alex Williamson wrote:
> > > > > > On Sun, 2012-06-10 at 17:03 +0300, Michael S. Tsirkin wrote:
> > > > > > > On Sun, Jun 10, 2012 at 07:41:51AM -0600, Alex Williamson wrote:
> > > > > > > > > > >>>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> > > > > > > > > > >>>> guess what it does. ;)
> > > > > > > > > > >>>
> > > > > > > > > > >>> Interesting. Why? This looks strange to me:
> > > > > > > > > > >>> I would expect the admin to bind a device to vfio
> > > > > > > > > > >>> the way it's now bound to a stub.
> > > > > > > > > > >>> The pass /dev/vfioXXX to qemu.
> > > > > > > > > > >>
> > > > > > > > > > >> That's the "libvirt way". We surely also want the "qemu command line
> > > > > > > > > > >> way" for which this kind of service is needed.
> > > > > > > > > > >>
> > > > > > > > > > >> Jan
> > > > > > > > > > >>
> > > > > > > > > > > 
> > > > > > > > > > > Yes, I imagine the qemu command line passing in /dev/vfioXXX,
> > > > > > > > > > > the libvirt way will pass in an fd for above. No?
> > > > > > > > > > 
> > > > > > > > > > As far as I understand the API, there is no device file per assigned
> > > > > > > > > > device.
> > > > > > > > > 
> > > > > > > > > Does it do pci_get_domain_bus_and_slot like kvm then?
> > > > > > > > > With all the warts like you have to remember to bind pci stub
> > > > > > > > > or you get two drivers for one device?
> > > > > > > > > If true that's unfortunate IMHO.
> > > > > > > 
> > > > > > > I hope the answer to the above is no?
> > > > > > 
> > > > > > No, it does a probe for devices.  We need the devaddr to compare against
> > > > > > dev_name of the device to figure out which device the user is attempting
> > > > > > to identify.
> > > > > > 
> > > > > > > > > > Also, this [domain:]bus:dev.fn format is more handy for the
> > > > > > > > > > command line.
> > > > > > > > > > 
> > > > > > > > > > Jan
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Then users could add udev rules that will name vfio devices
> > > > > > > > > like this.  Another interesting option: /dev/vfio/eth0/vf1.
> > > > > > > > > That's better I think: no one really likes running lspci
> > > > > > > > > and guessing the address from there.
> > > > > > > > 
> > > > > > > > That's not at all how VFIO works.  /dev/vfio/# represents a group, which
> > > > > > > > may contain one or more devices.  Even if libvirt passes a file
> > > > > > > > descriptor for the group, qemu needs to know which device in the group
> > > > > > > > to add to the guest, so parsing a device address is still necessary.
> > > > > > > > Thanks,
> > > > > > > > 
> > > > > > > > Alex
> > > > > > > 
> > > > > > > That's very unusual, and unfortunate.  For example this means that I
> > > > > > > must update applications just because I move a card to another slot.
> > > > > > > UIO does not have this problem.
> > > > > > > The fact that it's broken in kvm ATM seems to have made people
> > > > > > > think it's okay, but it really is a bug. We didn't fix it
> > > > > > > because vfio was supposed to be the solution.
> > > > > > 
> > > > > > I don't know what you're talking about here.  Are you suggesting that
> > > > > > needing to specify -device pci-assign,host=3.0 changing to host=4.0 when
> > > > > > you move a card is broken?
> > > > > 
> > > > > Yes. Absolutely. Admin should be able to abstract it away without users
> > > > > knowing anything about it.
> > > > 
> > > > We don't have UUIDs on PCI devices, so who's to say that the device that
> > > > was in slot 3 is the same device that's now in slot 4 and the user
> > > > should still have access to it?  That sounds even more problematic.
> > > 
> > > PF has a driver loaded so you can identify that, and
> > > identify the VF through it. Again this is really policy,
> > > it should be up to the admin how to name the device.
> > 
> > Do PFs have a UUID?  Some devices support a serial number, but that's
> > not related to being a PF vs VF.  We need to support both PFs and VFs
> > regardless of whether they have any kind of UUID.
> 
> This is a solved problem. udev has class-specific ways to
> get the device id and use it to find the name.

Usually via things discovered via device specific drivers, ex. MACs,
disk UUIDs, etc.  We don't have those in VFIO.  Don't forget, VFIO is
also device agnostic, VFIO-pci is just one possible driver backend.

> > I think we're inventing a problem though.
> 
> You think persistent names in udev were a solution in search of a
> problem?

Of course not, but I've never heard any indication that this is a real
problem for device assignment.  If it is, like udev, why not solve it in
userspace?  The kernel doesn't provide the persistence that udev
exposes.

> > > > > >  How does UIO avoid such a problem.
> > > > > 
> > > > > Normally you use a misc device that you can name with udev.
> > > > > 
> > > > > >  UIO-pci
> > > > > > requires the user to use pci-sysfs for resource access, so it surely
> > > > > > cares about the device address.
> > > > > 
> > > > > Only uio_pci_generic. Other uio devices let you drive the
> > > > > device.
> > > > 
> > > > If this is actually a problem, this is the first ever complaint I've
> > > > heard about it.  As above, I don't think we can assume the same access
> > > > when a device is moved.
> > > 
> > > I thought need for sane naming and for sysfs interface was discussed
> > > multiple times. But maybe I'm misremembering.
> > 
> > There is sane naming and a sysfs interface...
> 
> It's not sane if the admin can't rename the device without breaking
> applications.

Again, udev provides that persistence, not the kernel.

> > > > > > > I do realize you want to represent a group of devices somehow but can't
> > > > > > > this be solved without breaking naming devices with udev? For example, the
> > > > > > > device could be a file as well. You would then use the fd to identify the
> > > > > > > device within the group. And in a somewhat common case of a single device
> > > > > > > within the group, you can even make opening the group optional.
> > > > > > > Don't know if this fix I suggest makes sense at all but it's a real
> > > > > > > problem all the same.
> > > > > > 
> > > > > > Unfortunately, exposing individual devices just confuses the ownership
> > > > > > model we require for groups.  It would provide the illusion of being
> > > > > > able to assign an individual device, without the reality of the
> > > > > > grouping.  Groups are owned either by _a_ user or by the kernel, they
> > > > > > can't be split across multiple users (at least not with any guarantees
> > > > > > of isolation).  The current interface makes this clear.  Thanks,
> > > > > > 
> > > > > > Alex
> > > > > 
> > > > > So do users pass in group=/dev/vfio/1,host=0:3.0 then?
> > > > 
> > > > No, vfio syntax is -device vfio-pci,host=0:3.0, just like pci-assign.
> > > > Qemu will figure out which group that device belongs to and "do the
> > > > right thing".  If we add support for libvirt passing a groupfd, it will
> > > > be mostly the same, just using scm_rights to get the groupfd instead of
> > > > opening it directly.  Thanks,
> > > > 
> > > > Alex
> > > 
> > > Then how do you know which /dev/vfio/# to open?
> > 
> > This is all in the documentation patch... groups are exposed in sysfs
> > in /sys/kernel/iommu_groups.  Each group has a unique number which is
> > exposed as a directory.  Each group directory has a subdirectory called
> > devices which links to all devices in the group.  Each device within a
> > group as an iommu_group link back to the group directory.
> > The /dev/vfio/# entry matches the group number in sysfs.  So it's all
> > pretty easy.  Thanks,
> > 
> > Alex
> 
> So what's the problem to have devices in sysfs linked e.g. from
> /sys/class/vfio/ ?

They have to be bound to a VFIO driver, ex. vfio-pci.  We have iommu
groups in sysfs and can use that for this purpose.  I did have a class
for this before iommu_group were more integrated at the device level,
now it just seems redundant.

>   udev could create the nodes e.g. in
> /dev/vfio/devices/.  User can then pass the device name and qemu can
> figure out the group from sysfs.

As previously noted, we don't, and I don't think it makes sense to
expose individual devices in VFIO as the interface is fundamentally
group based by necessity.  Groups are just containers and they do
support a name, which is exposed in sysfs when it's used, but groups are
created by topology and don't follow the devices, so I don't think
that's what you're looking for.  It seems that if someone wants to
create persistence for VFIO, they just need to identify something unique
about a device in sysfs, then follow the device to the group.  Userpsace
is far more capable of providing this than the kernel.  Thanks,

Alex
Michael S. Tsirkin June 10, 2012, 5:57 p.m. UTC | #20
On Sun, Jun 10, 2012 at 11:29:10AM -0600, Alex Williamson wrote:
> On Sun, 2012-06-10 at 19:22 +0300, Michael S. Tsirkin wrote:
> > On Sun, Jun 10, 2012 at 09:58:17AM -0600, Alex Williamson wrote:
> > > On Sun, 2012-06-10 at 18:37 +0300, Michael S. Tsirkin wrote:
> > > > On Sun, Jun 10, 2012 at 09:15:10AM -0600, Alex Williamson wrote:
> > > > > On Sun, 2012-06-10 at 17:54 +0300, Michael S. Tsirkin wrote:
> > > > > > On Sun, Jun 10, 2012 at 08:41:03AM -0600, Alex Williamson wrote:
> > > > > > > On Sun, 2012-06-10 at 17:03 +0300, Michael S. Tsirkin wrote:
> > > > > > > > On Sun, Jun 10, 2012 at 07:41:51AM -0600, Alex Williamson wrote:
> > > > > > > > > > > >>>> vfio_pci.c contains a nice function called "parse_hostaddr". You may
> > > > > > > > > > > >>>> guess what it does. ;)
> > > > > > > > > > > >>>
> > > > > > > > > > > >>> Interesting. Why? This looks strange to me:
> > > > > > > > > > > >>> I would expect the admin to bind a device to vfio
> > > > > > > > > > > >>> the way it's now bound to a stub.
> > > > > > > > > > > >>> The pass /dev/vfioXXX to qemu.
> > > > > > > > > > > >>
> > > > > > > > > > > >> That's the "libvirt way". We surely also want the "qemu command line
> > > > > > > > > > > >> way" for which this kind of service is needed.
> > > > > > > > > > > >>
> > > > > > > > > > > >> Jan
> > > > > > > > > > > >>
> > > > > > > > > > > > 
> > > > > > > > > > > > Yes, I imagine the qemu command line passing in /dev/vfioXXX,
> > > > > > > > > > > > the libvirt way will pass in an fd for above. No?
> > > > > > > > > > > 
> > > > > > > > > > > As far as I understand the API, there is no device file per assigned
> > > > > > > > > > > device.
> > > > > > > > > > 
> > > > > > > > > > Does it do pci_get_domain_bus_and_slot like kvm then?
> > > > > > > > > > With all the warts like you have to remember to bind pci stub
> > > > > > > > > > or you get two drivers for one device?
> > > > > > > > > > If true that's unfortunate IMHO.
> > > > > > > > 
> > > > > > > > I hope the answer to the above is no?
> > > > > > > 
> > > > > > > No, it does a probe for devices.  We need the devaddr to compare against
> > > > > > > dev_name of the device to figure out which device the user is attempting
> > > > > > > to identify.
> > > > > > > 
> > > > > > > > > > > Also, this [domain:]bus:dev.fn format is more handy for the
> > > > > > > > > > > command line.
> > > > > > > > > > > 
> > > > > > > > > > > Jan
> > > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Then users could add udev rules that will name vfio devices
> > > > > > > > > > like this.  Another interesting option: /dev/vfio/eth0/vf1.
> > > > > > > > > > That's better I think: no one really likes running lspci
> > > > > > > > > > and guessing the address from there.
> > > > > > > > > 
> > > > > > > > > That's not at all how VFIO works.  /dev/vfio/# represents a group, which
> > > > > > > > > may contain one or more devices.  Even if libvirt passes a file
> > > > > > > > > descriptor for the group, qemu needs to know which device in the group
> > > > > > > > > to add to the guest, so parsing a device address is still necessary.
> > > > > > > > > Thanks,
> > > > > > > > > 
> > > > > > > > > Alex
> > > > > > > > 
> > > > > > > > That's very unusual, and unfortunate.  For example this means that I
> > > > > > > > must update applications just because I move a card to another slot.
> > > > > > > > UIO does not have this problem.
> > > > > > > > The fact that it's broken in kvm ATM seems to have made people
> > > > > > > > think it's okay, but it really is a bug. We didn't fix it
> > > > > > > > because vfio was supposed to be the solution.
> > > > > > > 
> > > > > > > I don't know what you're talking about here.  Are you suggesting that
> > > > > > > needing to specify -device pci-assign,host=3.0 changing to host=4.0 when
> > > > > > > you move a card is broken?
> > > > > > 
> > > > > > Yes. Absolutely. Admin should be able to abstract it away without users
> > > > > > knowing anything about it.
> > > > > 
> > > > > We don't have UUIDs on PCI devices, so who's to say that the device that
> > > > > was in slot 3 is the same device that's now in slot 4 and the user
> > > > > should still have access to it?  That sounds even more problematic.
> > > > 
> > > > PF has a driver loaded so you can identify that, and
> > > > identify the VF through it. Again this is really policy,
> > > > it should be up to the admin how to name the device.
> > > 
> > > Do PFs have a UUID?  Some devices support a serial number, but that's
> > > not related to being a PF vs VF.  We need to support both PFs and VFs
> > > regardless of whether they have any kind of UUID.
> > 
> > This is a solved problem. udev has class-specific ways to
> > get the device id and use it to find the name.
> 
> Usually via things discovered via device specific drivers, ex. MACs,
> disk UUIDs, etc.  We don't have those in VFIO.  Don't forget, VFIO is
> also device agnostic, VFIO-pci is just one possible driver backend.

Exactly. But udev can see that what it got is a VF anf query the PF.

> > > I think we're inventing a problem though.
> > 
> > You think persistent names in udev were a solution in search of a
> > problem?
> 
> Of course not, but I've never heard any indication that this is a real
> problem for device assignment.

How is it different from any other workload?

>  If it is, like udev, why not solve it in
> userspace?  The kernel doesn't provide the persistence that udev
> exposes.

To solve it you need to find the PF. Admin does it easily
but VFIO conceptually handles one VF at a time.
So let the admin set the name.
And it is better using standard udev tools than some
hacked interface specific to vfio.

> > > > > > >  How does UIO avoid such a problem.
> > > > > > 
> > > > > > Normally you use a misc device that you can name with udev.
> > > > > > 
> > > > > > >  UIO-pci
> > > > > > > requires the user to use pci-sysfs for resource access, so it surely
> > > > > > > cares about the device address.
> > > > > > 
> > > > > > Only uio_pci_generic. Other uio devices let you drive the
> > > > > > device.
> > > > > 
> > > > > If this is actually a problem, this is the first ever complaint I've
> > > > > heard about it.  As above, I don't think we can assume the same access
> > > > > when a device is moved.
> > > > 
> > > > I thought need for sane naming and for sysfs interface was discussed
> > > > multiple times. But maybe I'm misremembering.
> > > 
> > > There is sane naming and a sysfs interface...
> > 
> > It's not sane if the admin can't rename the device without breaking
> > applications.
> 
> Again, udev provides that persistence, not the kernel.

So let it do its thing.

> > > > > > > > I do realize you want to represent a group of devices somehow but can't
> > > > > > > > this be solved without breaking naming devices with udev? For example, the
> > > > > > > > device could be a file as well. You would then use the fd to identify the
> > > > > > > > device within the group. And in a somewhat common case of a single device
> > > > > > > > within the group, you can even make opening the group optional.
> > > > > > > > Don't know if this fix I suggest makes sense at all but it's a real
> > > > > > > > problem all the same.
> > > > > > > 
> > > > > > > Unfortunately, exposing individual devices just confuses the ownership
> > > > > > > model we require for groups.  It would provide the illusion of being
> > > > > > > able to assign an individual device, without the reality of the
> > > > > > > grouping.  Groups are owned either by _a_ user or by the kernel, they
> > > > > > > can't be split across multiple users (at least not with any guarantees
> > > > > > > of isolation).  The current interface makes this clear.  Thanks,
> > > > > > > 
> > > > > > > Alex
> > > > > > 
> > > > > > So do users pass in group=/dev/vfio/1,host=0:3.0 then?
> > > > > 
> > > > > No, vfio syntax is -device vfio-pci,host=0:3.0, just like pci-assign.
> > > > > Qemu will figure out which group that device belongs to and "do the
> > > > > right thing".  If we add support for libvirt passing a groupfd, it will
> > > > > be mostly the same, just using scm_rights to get the groupfd instead of
> > > > > opening it directly.  Thanks,
> > > > > 
> > > > > Alex
> > > > 
> > > > Then how do you know which /dev/vfio/# to open?
> > > 
> > > This is all in the documentation patch... groups are exposed in sysfs
> > > in /sys/kernel/iommu_groups.  Each group has a unique number which is
> > > exposed as a directory.  Each group directory has a subdirectory called
> > > devices which links to all devices in the group.  Each device within a
> > > group as an iommu_group link back to the group directory.
> > > The /dev/vfio/# entry matches the group number in sysfs.  So it's all
> > > pretty easy.  Thanks,
> > > 
> > > Alex
> > 
> > So what's the problem to have devices in sysfs linked e.g. from
> > /sys/class/vfio/ ?
> 
> They have to be bound to a VFIO driver, ex. vfio-pci.  We have iommu
> groups in sysfs and can use that for this purpose.  I did have a class
> for this before iommu_group were more integrated at the device level,
> now it just seems redundant.
> 
> >   udev could create the nodes e.g. in
> > /dev/vfio/devices/.  User can then pass the device name and qemu can
> > figure out the group from sysfs.
> 
> As previously noted, we don't, and I don't think it makes sense to
> expose individual devices in VFIO as the interface is fundamentally
> group based by necessity.  Groups are just containers and they do
> support a name, which is exposed in sysfs when it's used, but groups are
> created by topology and don't follow the devices, so I don't think
> that's what you're looking for.  It seems that if someone wants to
> create persistence for VFIO, they just need to identify something unique
> about a device in sysfs, then follow the device to the group.

So you are forced to code this all upfront in qemu.
And since you don't in practice, no percictence and no way to provide it.

>  Userpsace
> is far more capable of providing this than the kernel.

It's the admin that knows. Not applications.

>  Thanks,
> 
> Alex
diff mbox

Patch

diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
index 32e41f1..6634f22 100644
--- a/hw/qdev-properties.c
+++ b/hw/qdev-properties.c
@@ -946,6 +946,54 @@  PropertyInfo qdev_prop_pci_devfn = {
     .max   = 0xFFFFFFFFULL,
 };
 
+static void get_pci_devaddr(Object *obj, Visitor *v, void *opaque,
+                            const char *name, Error **errp)
+{
+    DeviceState *dev = DEVICE(obj);
+    Property *prop = opaque;
+    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
+    char buffer[10 + 3 + 1];
+    char *p = buffer;
+
+    snprintf(buffer, sizeof(buffer), "%04x:%02x:%02x.%02x",
+             addr->domain, addr->bus, addr->slot, addr->function);
+
+    visit_type_str(v, &p, name, errp);
+}
+
+static void set_pci_devaddr(Object *obj, Visitor *v, void *opaque,
+                            const char *name, Error **errp)
+{
+    DeviceState *dev = DEVICE(obj);
+    Property *prop = opaque;
+    PCIDeviceAddress *addr = qdev_get_prop_ptr(dev, prop);
+    Error *local_err = NULL;
+    char *str;
+
+    if (dev->state != DEV_STATE_CREATED) {
+        error_set(errp, QERR_PERMISSION_DENIED);
+        return;
+    }
+
+    visit_type_str(v, &str, name, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    if (qemu_parse_pci_devaddr(str, addr,
+                               PCI_DEVADDR_WITH_DOM_BUS_OPT |
+                               PCI_DEVADDR_WITH_FUNC) < 0) {
+        error_set_from_qdev_prop_error(errp, EINVAL, dev, prop, str);
+    }
+}
+
+PropertyInfo qdev_prop_pci_devaddr = {
+    .name  = "pci-devaddr",
+    .get   = get_pci_devaddr,
+    .set   = set_pci_devaddr,
+};
+
 /* --- blocksize --- */
 
 static void set_blocksize(Object *obj, Visitor *v, void *opaque,
diff --git a/hw/qdev.h b/hw/qdev.h
index 15acfca..d2c87a0 100644
--- a/hw/qdev.h
+++ b/hw/qdev.h
@@ -224,6 +224,7 @@  extern PropertyInfo qdev_prop_drive;
 extern PropertyInfo qdev_prop_netdev;
 extern PropertyInfo qdev_prop_vlan;
 extern PropertyInfo qdev_prop_pci_devfn;
+extern PropertyInfo qdev_prop_pci_devaddr;
 extern PropertyInfo qdev_prop_blocksize;
 
 #define DEFINE_PROP(_name, _state, _field, _prop, _type) { \
@@ -288,6 +289,8 @@  extern PropertyInfo qdev_prop_blocksize;
                         LostTickPolicy)
 #define DEFINE_PROP_BLOCKSIZE(_n, _s, _f, _d) \
     DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_blocksize, uint16_t)
+#define DEFINE_PROP_PCI_DEVADDR(_n, _s, _f) \
+    DEFINE_PROP(_n, _s, _f, qdev_prop_pci_devaddr, PCIDeviceAddress)
 
 #define DEFINE_PROP_END_OF_LIST()               \
     {}