diff mbox series

[v5,02/11] pci: add option for net failover

Message ID 20191023082711.16694-3-jfreimann@redhat.com
State New
Headers show
Series add failover feature for assigned network devices | expand

Commit Message

Jens Freimann Oct. 23, 2019, 8:27 a.m. UTC
This patch adds a net_failover_pair_id property to PCIDev which is
used to link the primary device in a failover pair (the PCI dev) to
a standby (a virtio-net-pci) device.

It only supports ethernet devices. Also currently it only supports
PCIe devices. QEMU will exit with an error message otherwise.

Signed-off-by: Jens Freimann <jfreimann@redhat.com>
---
 hw/pci/pci.c         | 17 +++++++++++++++++
 include/hw/pci/pci.h |  3 +++
 2 files changed, 20 insertions(+)

Comments

Alex Williamson Oct. 23, 2019, 6:06 p.m. UTC | #1
On Wed, 23 Oct 2019 10:27:02 +0200
Jens Freimann <jfreimann@redhat.com> wrote:

> This patch adds a net_failover_pair_id property to PCIDev which is
> used to link the primary device in a failover pair (the PCI dev) to
> a standby (a virtio-net-pci) device.
> 
> It only supports ethernet devices. Also currently it only supports
> PCIe devices. QEMU will exit with an error message otherwise.
> 
> Signed-off-by: Jens Freimann <jfreimann@redhat.com>
> ---
>  hw/pci/pci.c         | 17 +++++++++++++++++
>  include/hw/pci/pci.h |  3 +++
>  2 files changed, 20 insertions(+)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index aa05c2b9b2..fa9b5219f8 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -75,6 +75,8 @@ static Property pci_props[] = {
>                      QEMU_PCIE_LNKSTA_DLLLA_BITNR, true),
>      DEFINE_PROP_BIT("x-pcie-extcap-init", PCIDevice, cap_present,
>                      QEMU_PCIE_EXTCAP_INIT_BITNR, true),
> +    DEFINE_PROP_STRING("net_failover_pair_id", PCIDevice,
> +            net_failover_pair_id),

Nit, white space appears broken here.

>      DEFINE_PROP_END_OF_LIST()
>  };
>  
> @@ -2077,6 +2079,7 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>      ObjectClass *klass = OBJECT_CLASS(pc);
>      Error *local_err = NULL;
>      bool is_default_rom;
> +    uint16_t class_id;
>  
>      /* initialize cap_present for pci_is_express() and pci_config_size(),
>       * Note that hybrid PCIs are not set automatically and need to manage
> @@ -2101,6 +2104,20 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>          }
>      }
>  
> +    if (pci_dev->net_failover_pair_id) {
> +        if (!pci_is_express(pci_dev)) {
> +            error_setg(errp, "failover device is not a PCIExpress device");
> +            error_propagate(errp, local_err);
> +            return;
> +        }

Did we decide we don't need to test that the device is also in a
hotpluggable slot?  Are there also multi-function considerations that
should be prevented or documented?  For example, if a user tries to
configure both the primary and failover NICs in the same slot, I assume
bad things will happen.

> +        class_id = pci_get_word(pci_dev->config + PCI_CLASS_DEVICE);
> +        if (class_id != PCI_CLASS_NETWORK_ETHERNET) {
> +            error_setg(errp, "failover device is not an Ethernet device");
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }

Looks like cleanup is missing both both error cases, the option rom
error path below this does a pci_qdev_unrealize() before returning so
I'd assume we want the same here.  Thanks,

Alex

> +
>      /* rom loading */
>      is_default_rom = false;
>      if (pci_dev->romfile == NULL && pc->romfile != NULL) {
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index f3f0ffd5fb..def5435685 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -352,6 +352,9 @@ struct PCIDevice {
>      MSIVectorUseNotifier msix_vector_use_notifier;
>      MSIVectorReleaseNotifier msix_vector_release_notifier;
>      MSIVectorPollNotifier msix_vector_poll_notifier;
> +
> +    /* ID of standby device in net_failover pair */
> +    char *net_failover_pair_id;
>  };
>  
>  void pci_register_bar(PCIDevice *pci_dev, int region_num,
Jens Freimann Oct. 23, 2019, 7:30 p.m. UTC | #2
On Wed, Oct 23, 2019 at 12:06:48PM -0600, Alex Williamson wrote:
>On Wed, 23 Oct 2019 10:27:02 +0200
>Jens Freimann <jfreimann@redhat.com> wrote:
>
>> This patch adds a net_failover_pair_id property to PCIDev which is
>> used to link the primary device in a failover pair (the PCI dev) to
>> a standby (a virtio-net-pci) device.
>>
>> It only supports ethernet devices. Also currently it only supports
>> PCIe devices. QEMU will exit with an error message otherwise.
>>
>> Signed-off-by: Jens Freimann <jfreimann@redhat.com>
>> ---
>>  hw/pci/pci.c         | 17 +++++++++++++++++
>>  include/hw/pci/pci.h |  3 +++
>>  2 files changed, 20 insertions(+)
>>
>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>> index aa05c2b9b2..fa9b5219f8 100644
>> --- a/hw/pci/pci.c
>> +++ b/hw/pci/pci.c
>> @@ -75,6 +75,8 @@ static Property pci_props[] = {
>>                      QEMU_PCIE_LNKSTA_DLLLA_BITNR, true),
>>      DEFINE_PROP_BIT("x-pcie-extcap-init", PCIDevice, cap_present,
>>                      QEMU_PCIE_EXTCAP_INIT_BITNR, true),
>> +    DEFINE_PROP_STRING("net_failover_pair_id", PCIDevice,
>> +            net_failover_pair_id),
>
>Nit, white space appears broken here.

I'll fix it.

>>      DEFINE_PROP_END_OF_LIST()
>>  };
>>
>> @@ -2077,6 +2079,7 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>>      ObjectClass *klass = OBJECT_CLASS(pc);
>>      Error *local_err = NULL;
>>      bool is_default_rom;
>> +    uint16_t class_id;
>>
>>      /* initialize cap_present for pci_is_express() and pci_config_size(),
>>       * Note that hybrid PCIs are not set automatically and need to manage
>> @@ -2101,6 +2104,20 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>>          }
>>      }
>>
>> +    if (pci_dev->net_failover_pair_id) {
>> +        if (!pci_is_express(pci_dev)) {
>> +            error_setg(errp, "failover device is not a PCIExpress device");
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>
>Did we decide we don't need to test that the device is also in a
>hotpluggable slot?  

Hmm, my reply to you was never sent. I thought the check for
qdev_device_add() was sufficient but you said that works only
after qdev_hotplug is set (after machine creation). I modified
the check to this:

    hide = should_hide_device(opts);                                                                      
                                                                                                          
    if ((hide || qdev_hotplug) && bus && !qbus_is_hotpluggable(bus)) {                                    
        error_setg(errp, QERR_BUS_NO_HOTPLUG, bus->name);                                                 
        return NULL;                                                                                      
    }                                                                                                     
                                                                                                          
    if (hide) {                                                                                           
        return NULL;                                                                                      
    }

This will make qemu fail when we have the device in the initial
configuration or when we hotplug it to a bus that doesn't support it.
I tested both with a device on pcie.0. Am I missing something? 

>Are there also multi-function considerations that
>should be prevented or documented?  For example, if a user tries to
>configure both the primary and failover NICs in the same slot, I assume
>bad things will happen.

  I would have expected that this is already checked in pci code, but
it is not. I tried it and when I put both devices into the same slot
they are both unplugged from the guest during boot but nothing else
happens. I don't know what triggers that unplug of the devices.

I'm not aware of any other problems regarding multi-function, which
doesn't mean there aren't any. 

>
>> +        class_id = pci_get_word(pci_dev->config + PCI_CLASS_DEVICE);
>> +        if (class_id != PCI_CLASS_NETWORK_ETHERNET) {
>> +            error_setg(errp, "failover device is not an Ethernet device");
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>> +    }
>
>Looks like cleanup is missing both both error cases, the option rom
>error path below this does a pci_qdev_unrealize() before returning so
>I'd assume we want the same here.  Thanks,

Thanks, I'll fix this too.

regards,
Jens
Alex Williamson Oct. 23, 2019, 8:02 p.m. UTC | #3
On Wed, 23 Oct 2019 21:30:35 +0200
Jens Freimann <jfreimann@redhat.com> wrote:

> On Wed, Oct 23, 2019 at 12:06:48PM -0600, Alex Williamson wrote:
> >On Wed, 23 Oct 2019 10:27:02 +0200
> >Jens Freimann <jfreimann@redhat.com> wrote:
> >  
> >> This patch adds a net_failover_pair_id property to PCIDev which is
> >> used to link the primary device in a failover pair (the PCI dev) to
> >> a standby (a virtio-net-pci) device.
> >>
> >> It only supports ethernet devices. Also currently it only supports
> >> PCIe devices. QEMU will exit with an error message otherwise.
> >>
> >> Signed-off-by: Jens Freimann <jfreimann@redhat.com>
> >> ---
> >>  hw/pci/pci.c         | 17 +++++++++++++++++
> >>  include/hw/pci/pci.h |  3 +++
> >>  2 files changed, 20 insertions(+)
> >>
> >> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> >> index aa05c2b9b2..fa9b5219f8 100644
> >> --- a/hw/pci/pci.c
> >> +++ b/hw/pci/pci.c
> >> @@ -75,6 +75,8 @@ static Property pci_props[] = {
> >>                      QEMU_PCIE_LNKSTA_DLLLA_BITNR, true),
> >>      DEFINE_PROP_BIT("x-pcie-extcap-init", PCIDevice, cap_present,
> >>                      QEMU_PCIE_EXTCAP_INIT_BITNR, true),
> >> +    DEFINE_PROP_STRING("net_failover_pair_id", PCIDevice,
> >> +            net_failover_pair_id),  
> >
> >Nit, white space appears broken here.  
> 
> I'll fix it.
> 
> >>      DEFINE_PROP_END_OF_LIST()
> >>  };
> >>
> >> @@ -2077,6 +2079,7 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
> >>      ObjectClass *klass = OBJECT_CLASS(pc);
> >>      Error *local_err = NULL;
> >>      bool is_default_rom;
> >> +    uint16_t class_id;
> >>
> >>      /* initialize cap_present for pci_is_express() and pci_config_size(),
> >>       * Note that hybrid PCIs are not set automatically and need to manage
> >> @@ -2101,6 +2104,20 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
> >>          }
> >>      }
> >>
> >> +    if (pci_dev->net_failover_pair_id) {
> >> +        if (!pci_is_express(pci_dev)) {
> >> +            error_setg(errp, "failover device is not a PCIExpress device");
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +        }  
> >
> >Did we decide we don't need to test that the device is also in a
> >hotpluggable slot?    
> 
> Hmm, my reply to you was never sent. I thought the check for
> qdev_device_add() was sufficient but you said that works only
> after qdev_hotplug is set (after machine creation). I modified
> the check to this:
> 
>     hide = should_hide_device(opts);                                                                      
>                                                                                                           
>     if ((hide || qdev_hotplug) && bus && !qbus_is_hotpluggable(bus)) {                                    
>         error_setg(errp, QERR_BUS_NO_HOTPLUG, bus->name);                                                 
>         return NULL;                                                                                      
>     }                                                                                                     
>                                                                                                           
>     if (hide) {                                                                                           
>         return NULL;                                                                                      
>     }
> 
> This will make qemu fail when we have the device in the initial
> configuration or when we hotplug it to a bus that doesn't support it.
> I tested both with a device on pcie.0. Am I missing something? 

Nope, sorry, I was expecting the check here and didn't see that you
perform it elsewhere.  Seems good enough for me.
 
> >Are there also multi-function considerations that
> >should be prevented or documented?  For example, if a user tries to
> >configure both the primary and failover NICs in the same slot, I assume
> >bad things will happen.  
> 
>   I would have expected that this is already checked in pci code, but
> it is not. I tried it and when I put both devices into the same slot
> they are both unplugged from the guest during boot but nothing else
> happens. I don't know what triggers that unplug of the devices.
> 
> I'm not aware of any other problems regarding multi-function, which
> doesn't mean there aren't any. 

Hmm, was the hidden device at function #0?  The guest won't find any
functions if function #0 isn't present, but I don't know what would
trigger the hotplug.  The angle I'm thinking is that we only have slot
level granularity for hotplug, so any sort of automatic hotplug of a
slot should probably think about bystander devices within the slot.
Thanks,

Alex

> >> +        class_id = pci_get_word(pci_dev->config + PCI_CLASS_DEVICE);
> >> +        if (class_id != PCI_CLASS_NETWORK_ETHERNET) {
> >> +            error_setg(errp, "failover device is not an Ethernet device");
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +        }
> >> +    }  
> >
> >Looks like cleanup is missing both both error cases, the option rom
> >error path below this does a pci_qdev_unrealize() before returning so
> >I'd assume we want the same here.  Thanks,  
> 
> Thanks, I'll fix this too.
> 
> regards,
> Jens
Jens Freimann Oct. 23, 2019, 8:31 p.m. UTC | #4
On Wed, Oct 23, 2019 at 02:02:11PM -0600, Alex Williamson wrote:
>On Wed, 23 Oct 2019 21:30:35 +0200
>Jens Freimann <jfreimann@redhat.com> wrote:
>
>> On Wed, Oct 23, 2019 at 12:06:48PM -0600, Alex Williamson wrote:
>> >On Wed, 23 Oct 2019 10:27:02 +0200
>> >Jens Freimann <jfreimann@redhat.com> wrote:
[...]
>> >Are there also multi-function considerations that
>> >should be prevented or documented?  For example, if a user tries to
>> >configure both the primary and failover NICs in the same slot, I assume
>> >bad things will happen.
>>
>>   I would have expected that this is already checked in pci code, but
>> it is not. I tried it and when I put both devices into the same slot
>> they are both unplugged from the guest during boot but nothing else
>> happens. I don't know what triggers that unplug of the devices.
>>
>> I'm not aware of any other problems regarding multi-function, which
>> doesn't mean there aren't any.
>
>Hmm, was the hidden device at function #0?  The guest won't find any
>functions if function #0 isn't present, but I don't know what would
>trigger the hotplug.  The angle I'm thinking is that we only have slot
>level granularity for hotplug, so any sort of automatic hotplug of a
>slot should probably think about bystander devices within the slot.

Yes that would be a problem, but isn't it the same in the non-failover case
where a user configures it wrong? The slot where the device is plugged is not
chosen automatically it's configured by the user, no? I might be mixing something
up here.  I have no idea yet how to check if a slot is already populated, but
I'll think about it. 

Thanks!

regards,
Jens
Alex Williamson Oct. 23, 2019, 9:15 p.m. UTC | #5
On Wed, 23 Oct 2019 22:31:37 +0200
Jens Freimann <jfreimann@redhat.com> wrote:

> On Wed, Oct 23, 2019 at 02:02:11PM -0600, Alex Williamson wrote:
> >On Wed, 23 Oct 2019 21:30:35 +0200
> >Jens Freimann <jfreimann@redhat.com> wrote:
> >  
> >> On Wed, Oct 23, 2019 at 12:06:48PM -0600, Alex Williamson wrote:  
> >> >On Wed, 23 Oct 2019 10:27:02 +0200
> >> >Jens Freimann <jfreimann@redhat.com> wrote:  
> [...]
> >> >Are there also multi-function considerations that
> >> >should be prevented or documented?  For example, if a user tries to
> >> >configure both the primary and failover NICs in the same slot, I assume
> >> >bad things will happen.  
> >>
> >>   I would have expected that this is already checked in pci code, but
> >> it is not. I tried it and when I put both devices into the same slot
> >> they are both unplugged from the guest during boot but nothing else
> >> happens. I don't know what triggers that unplug of the devices.
> >>
> >> I'm not aware of any other problems regarding multi-function, which
> >> doesn't mean there aren't any.  
> >
> >Hmm, was the hidden device at function #0?  The guest won't find any
> >functions if function #0 isn't present, but I don't know what would
> >trigger the hotplug.  The angle I'm thinking is that we only have slot
> >level granularity for hotplug, so any sort of automatic hotplug of a
> >slot should probably think about bystander devices within the slot.  
> 
> Yes that would be a problem, but isn't it the same in the non-failover case
> where a user configures it wrong? The slot where the device is plugged is not
> chosen automatically it's configured by the user, no? I might be mixing something
> up here.  I have no idea yet how to check if a slot is already populated, but
> I'll think about it. 

I don't think libvirt will automatically make use of multifunction
endpoints, except maybe for some built-in devices, so yes it probably
would be up to the user to explicitly create a multifunction device.
But are there other scenarios that generate an automatic hot-unplug?
If a user creates a multifunction slot and then triggers a hot-unplug
themselves, it's easy to place the blame on the user if the result is
unexpected, but is it so obviously a user configuration error if the
hotplug occurs as an automatic response to a migration?  I'm not as
sure about that.

As indicated, I don't know whether this should just be documented or if
we should spend time preventing it, but someone, somewhere will
probably think it's a good idea to put their primary and failover NIC
in the same slot and be confused that the underlying mechanisms cannot
support it.  It doesn't appear that it would be too difficult to test
QEMU_PCI_CAP_MULTIFUNCTION (not set) and PCI_FUNC (is 0) for the
primary, but maybe I'm just being paranoid.  Thanks,

Alex
Parav Pandit Oct. 24, 2019, 5:03 a.m. UTC | #6
> -----Original Message-----
> From: Jens Freimann <jfreimann@redhat.com>
> Sent: Wednesday, October 23, 2019 3:27 AM
> To: qemu-devel@nongnu.org
> Cc: ehabkost@redhat.com; mst@redhat.com; berrange@redhat.com;
> pkrempa@redhat.com; laine@redhat.com; aadam@redhat.com;
> ailan@redhat.com; Parav Pandit <parav@mellanox.com>;
> dgilbert@redhat.com; alex.williamson@redhat.com
> Subject: [PATCH v5 02/11] pci: add option for net failover
> 
> This patch adds a net_failover_pair_id property to PCIDev which is used to
> link the primary device in a failover pair (the PCI dev) to a standby (a virtio-
> net-pci) device.
> 
> It only supports ethernet devices. Also currently it only supports PCIe
> devices. QEMU will exit with an error message otherwise.
> 
> Signed-off-by: Jens Freimann <jfreimann@redhat.com>
> ---
>  hw/pci/pci.c         | 17 +++++++++++++++++
>  include/hw/pci/pci.h |  3 +++
>  2 files changed, 20 insertions(+)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c index aa05c2b9b2..fa9b5219f8 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -75,6 +75,8 @@ static Property pci_props[] = {
>                      QEMU_PCIE_LNKSTA_DLLLA_BITNR, true),
>      DEFINE_PROP_BIT("x-pcie-extcap-init", PCIDevice, cap_present,
>                      QEMU_PCIE_EXTCAP_INIT_BITNR, true),
> +    DEFINE_PROP_STRING("net_failover_pair_id", PCIDevice,
> +            net_failover_pair_id),
>      DEFINE_PROP_END_OF_LIST()
>  };
> 
> @@ -2077,6 +2079,7 @@ static void pci_qdev_realize(DeviceState *qdev,
> Error **errp)
>      ObjectClass *klass = OBJECT_CLASS(pc);
>      Error *local_err = NULL;
>      bool is_default_rom;
> +    uint16_t class_id;
> 
>      /* initialize cap_present for pci_is_express() and pci_config_size(),
>       * Note that hybrid PCIs are not set automatically and need to manage
> @@ -2101,6 +2104,20 @@ static void pci_qdev_realize(DeviceState *qdev,
> Error **errp)
>          }
>      }
> 
> +    if (pci_dev->net_failover_pair_id) {
> +        if (!pci_is_express(pci_dev)) {

I am testing and integrating this piece with mlx5 devices.
I see that pci_is_express() return true only for first PCI function.
Didn't yet dig the API.
Commenting out this check and below class check progresses further.

While reviewing, I realized that we shouldn't have this check for below reasons.

1. It is user's responsibility to pass networking device.
But its ok to check the class, if PCI Device is passed.
So class comparison should be inside the pci_check().

2. It is limiting to only consider PCI devices. 
Automated and regression tests should be able validate this feature without PCI Device.
This will enhance the stability of feature in long run.

3. net failover driver doesn't limit it to have it over only PCI device.
So similarly hypervisor should be limiting.

> +            error_setg(errp, "failover device is not a PCIExpress device");
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +        class_id = pci_get_word(pci_dev->config + PCI_CLASS_DEVICE);
> +        if (class_id != PCI_CLASS_NETWORK_ETHERNET) {
> +            error_setg(errp, "failover device is not an Ethernet device");
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +
>      /* rom loading */
>      is_default_rom = false;
>      if (pci_dev->romfile == NULL && pc->romfile != NULL) { diff --git
> a/include/hw/pci/pci.h b/include/hw/pci/pci.h index f3f0ffd5fb..def5435685
> 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -352,6 +352,9 @@ struct PCIDevice {
>      MSIVectorUseNotifier msix_vector_use_notifier;
>      MSIVectorReleaseNotifier msix_vector_release_notifier;
>      MSIVectorPollNotifier msix_vector_poll_notifier;
> +
> +    /* ID of standby device in net_failover pair */
> +    char *net_failover_pair_id;
>  };
> 
>  void pci_register_bar(PCIDevice *pci_dev, int region_num,
> --
> 2.21.0
Jens Freimann Oct. 24, 2019, 9:37 a.m. UTC | #7
On Thu, Oct 24, 2019 at 05:03:46AM +0000, Parav Pandit wrote:
>> @@ -2101,6 +2104,20 @@ static void pci_qdev_realize(DeviceState *qdev,
>> Error **errp)
>>          }
>>      }
>>
>> +    if (pci_dev->net_failover_pair_id) {
>> +        if (!pci_is_express(pci_dev)) {
>
>I am testing and integrating this piece with mlx5 devices.
>I see that pci_is_express() return true only for first PCI function.
>Didn't yet dig the API.
>Commenting out this check and below class check progresses further.

First of all, thanks for testing this!
Could you share your commandline please? I can't reproduce it.
>
>While reviewing, I realized that we shouldn't have this check for below reasons.
>
>1. It is user's responsibility to pass networking device.
>But its ok to check the class, if PCI Device is passed.
>So class comparison should be inside the pci_check().

I'm not sure I understand this point, could you please elaborate?
You're suggesting to move the check for the class into the check for
pci_is_express?

>2. It is limiting to only consider PCI devices.
>Automated and regression tests should be able validate this feature without PCI Device.
>This will enhance the stability of feature in long run.
>
>3. net failover driver doesn't limit it to have it over only PCI device.
>So similarly hypervisor should be limiting.

I agree that we don't have to limit it to PCI(e) forever. But for this
first shot I think we should and then extend it continually. There are
more things we can support in the future like other hotplug types etc.

regards,
Jens
Parav Pandit Oct. 24, 2019, 4:34 p.m. UTC | #8
> -----Original Message-----
> From: Jens Freimann <jfreimann@redhat.com>
> Sent: Thursday, October 24, 2019 4:38 AM
> To: Parav Pandit <parav@mellanox.com>
> Cc: qemu-devel@nongnu.org; ehabkost@redhat.com; mst@redhat.com;
> berrange@redhat.com; pkrempa@redhat.com; laine@redhat.com;
> aadam@redhat.com; ailan@redhat.com; dgilbert@redhat.com;
> alex.williamson@redhat.com
> Subject: Re: [PATCH v5 02/11] pci: add option for net failover
> 
> On Thu, Oct 24, 2019 at 05:03:46AM +0000, Parav Pandit wrote:
> >> @@ -2101,6 +2104,20 @@ static void pci_qdev_realize(DeviceState
> >> *qdev, Error **errp)
> >>          }
> >>      }
> >>
> >> +    if (pci_dev->net_failover_pair_id) {
> >> +        if (!pci_is_express(pci_dev)) {
> >
> >I am testing and integrating this piece with mlx5 devices.
> >I see that pci_is_express() return true only for first PCI function.
> >Didn't yet dig the API.
> >Commenting out this check and below class check progresses further.
> 
> First of all, thanks for testing this!
> Could you share your commandline please? I can't reproduce it.
> >
I added debug prints to get the difference between VF1 and VF2 behavior.
What I see is, vfio_populate_device() below code is activated for VF2 where qemu claims that its not a PCIe device.

    vdev->config_size = reg_info->size;
    if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
        vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
        printf("%s clearing QEMU PCI bits\n", __func__);
    }

Command line:
/usr/local/bin/qemu-system-x86_64 -enable-kvm -m 3072 -smp 3 \
               -machine q35,usb=off,vmport=off,dump-guest-core=off -cpu Haswell-noTSX-IBRS \
           -net none \
               -qmp unix:/tmp/qmp.socket,server,nowait \
        -monitor telnet:127.0.0.1:5556,server,nowait \
        -device pcie-root-port,id=root0,multifunction=on,chassis=0,addr=0xa \
        -device pcie-root-port,id=root1,bus=pcie.0,chassis=1 \
        -device pcie-root-port,id=root2,bus=pcie.0,chassis=2 \
        -netdev tap,id=hostnet1,fd=4 4<>/dev/tap49\
        -device virtio-net-pci,netdev=hostnet1,id=net1,mac=52:54:00:02:02:02,bus=root2,failover=on \
        -device vfio-pci,id=hostdev0,host=05:00.2,bus=root1,net_failover_pair_id=net1 \
        /var/lib/libvirt/images/sriov-lm-02.qcow2

> >While reviewing, I realized that we shouldn't have this check for below
> reasons.
> >
> >1. It is user's responsibility to pass networking device.
> >But its ok to check the class, if PCI Device is passed.
> >So class comparison should be inside the pci_check().
> 
> I'm not sure I understand this point, could you please elaborate?
> You're suggesting to move the check for the class into the check for
> pci_is_express?
> 
No. Below is the suggestion.

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 8fbf32d68c..8004309973 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2105,17 +2105,14 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
     }

     if (pci_dev->net_failover_pair_id) {
-        if (!pci_is_express(pci_dev)) {
-            error_setg(errp, "failover device is not a PCIExpress device");
-            error_propagate(errp, local_err);
-            return;
-        }
-        class_id = pci_get_word(pci_dev->config + PCI_CLASS_DEVICE);
-        if (class_id != PCI_CLASS_NETWORK_ETHERNET) {
-            error_setg(errp, "failover device is not an Ethernet device");
-            error_propagate(errp, local_err);
-            return;
-        }
+        if (pci_is_express(pci_dev)) {
+            class_id = pci_get_word(pci_dev->config + PCI_CLASS_DEVICE);
+            if (class_id != PCI_CLASS_NETWORK_ETHERNET) {
+                error_setg(errp, "failover device is not an Ethernet device");
+                error_propagate(errp, local_err);
+                return;
+            }
+       }

This will allow to map non PCI device as failover too.
After writing above hunk I think that when code reaches to check for 
If (pci_dev->net_failover_pair_id),... it is already gone gone through do_pci_register_device().
There should not be any check needed again for pci_is_express().
Isn't it?


> >2. It is limiting to only consider PCI devices.
> >Automated and regression tests should be able validate this feature without
> PCI Device.
> >This will enhance the stability of feature in long run.
> >
> >3. net failover driver doesn't limit it to have it over only PCI device.
> >So similarly hypervisor should be limiting.
> 
> I agree that we don't have to limit it to PCI(e) forever. But for this first shot I
> think we should and then extend it continually. There are more things we can
> support in the future like other hotplug types etc.
> 
o.k. But probably net_failover_pair_id field should be in DeviceState instead of PCIDevice at minimum?
Or you want to refactor it later?
Alex Williamson Oct. 24, 2019, 4:52 p.m. UTC | #9
On Thu, 24 Oct 2019 11:37:54 +0200
Jens Freimann <jfreimann@redhat.com> wrote:

> On Thu, Oct 24, 2019 at 05:03:46AM +0000, Parav Pandit wrote:
> >> @@ -2101,6 +2104,20 @@ static void pci_qdev_realize(DeviceState *qdev,
> >> Error **errp)
> >>          }
> >>      }
> >>
> >> +    if (pci_dev->net_failover_pair_id) {
> >> +        if (!pci_is_express(pci_dev)) {  
> >
> >I am testing and integrating this piece with mlx5 devices.
> >I see that pci_is_express() return true only for first PCI function.
> >Didn't yet dig the API.
> >Commenting out this check and below class check progresses further.  
> 
> First of all, thanks for testing this!
> Could you share your commandline please? I can't reproduce it.
> >
> >While reviewing, I realized that we shouldn't have this check for below reasons.
> >
> >1. It is user's responsibility to pass networking device.
> >But its ok to check the class, if PCI Device is passed.
> >So class comparison should be inside the pci_check().  
> 
> I'm not sure I understand this point, could you please elaborate?
> You're suggesting to move the check for the class into the check for
> pci_is_express?

Seems like the suggestion is that net_failover_pair_id should be an
option on the base class of PCIDevice (DeviceState?) and only if it's a
PCI device would we check the class code.  But there are dependencies
at the hotplug controller, which I think is why this is currently
specific to PCI.

However, it's an interesting point about pci_is_express().  This test
is really just meant to check whether the hotplug controller supports
this feature, which is only implemented in pciehp via this series.
There's a bit of a mismatch though that pcie_is_express() checks
whether the device is express, not whether the bus it sits on is
express.  I think we really want the latter, so maybe this should be:

pci_bus_is_express(pci_get_bus(dev)

For example this feature should work if I plug an e1000 (not e1000e)
into an express slot, but not if I plug an e1000e into a conventional
slot.
 
> >2. It is limiting to only consider PCI devices.
> >Automated and regression tests should be able validate this feature without PCI Device.
> >This will enhance the stability of feature in long run.
> >
> >3. net failover driver doesn't limit it to have it over only PCI device.
> >So similarly hypervisor should be limiting.  
> 
> I agree that we don't have to limit it to PCI(e) forever. But for this
> first shot I think we should and then extend it continually. There are
> more things we can support in the future like other hotplug types etc.

Yep, long term it seems very generic, but there's a dependency in the
hotplug controller and it is beneficial that PCI has a class code
feature that allows us to error if this is specified on a non-net
device.  Thanks,

Alex
Alex Williamson Oct. 24, 2019, 5:06 p.m. UTC | #10
On Thu, 24 Oct 2019 16:34:01 +0000
Parav Pandit <parav@mellanox.com> wrote:

> > -----Original Message-----
> > From: Jens Freimann <jfreimann@redhat.com>
> > Sent: Thursday, October 24, 2019 4:38 AM
> > To: Parav Pandit <parav@mellanox.com>
> > Cc: qemu-devel@nongnu.org; ehabkost@redhat.com; mst@redhat.com;
> > berrange@redhat.com; pkrempa@redhat.com; laine@redhat.com;
> > aadam@redhat.com; ailan@redhat.com; dgilbert@redhat.com;
> > alex.williamson@redhat.com
> > Subject: Re: [PATCH v5 02/11] pci: add option for net failover
> > 
> > On Thu, Oct 24, 2019 at 05:03:46AM +0000, Parav Pandit wrote:  
> > >> @@ -2101,6 +2104,20 @@ static void pci_qdev_realize(DeviceState
> > >> *qdev, Error **errp)
> > >>          }
> > >>      }
> > >>
> > >> +    if (pci_dev->net_failover_pair_id) {
> > >> +        if (!pci_is_express(pci_dev)) {  
> > >
> > >I am testing and integrating this piece with mlx5 devices.
> > >I see that pci_is_express() return true only for first PCI function.
> > >Didn't yet dig the API.
> > >Commenting out this check and below class check progresses further.  
> > 
> > First of all, thanks for testing this!
> > Could you share your commandline please? I can't reproduce it.  
> > >  
> I added debug prints to get the difference between VF1 and VF2 behavior.
> What I see is, vfio_populate_device() below code is activated for VF2 where qemu claims that its not a PCIe device.
> 
>     vdev->config_size = reg_info->size;
>     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
>         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
>         printf("%s clearing QEMU PCI bits\n", __func__);
>     }
> 
> Command line:
> /usr/local/bin/qemu-system-x86_64 -enable-kvm -m 3072 -smp 3 \
>                -machine q35,usb=off,vmport=off,dump-guest-core=off -cpu Haswell-noTSX-IBRS \
>            -net none \
>                -qmp unix:/tmp/qmp.socket,server,nowait \
>         -monitor telnet:127.0.0.1:5556,server,nowait \
>         -device pcie-root-port,id=root0,multifunction=on,chassis=0,addr=0xa \
>         -device pcie-root-port,id=root1,bus=pcie.0,chassis=1 \
>         -device pcie-root-port,id=root2,bus=pcie.0,chassis=2 \
>         -netdev tap,id=hostnet1,fd=4 4<>/dev/tap49\
>         -device virtio-net-pci,netdev=hostnet1,id=net1,mac=52:54:00:02:02:02,bus=root2,failover=on \
>         -device vfio-pci,id=hostdev0,host=05:00.2,bus=root1,net_failover_pair_id=net1 \
>         /var/lib/libvirt/images/sriov-lm-02.qcow2
> 
> > >While reviewing, I realized that we shouldn't have this check for below  
> > reasons.  
> > >
> > >1. It is user's responsibility to pass networking device.
> > >But its ok to check the class, if PCI Device is passed.
> > >So class comparison should be inside the pci_check().  
> > 
> > I'm not sure I understand this point, could you please elaborate?
> > You're suggesting to move the check for the class into the check for
> > pci_is_express?
> >   
> No. Below is the suggestion.
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 8fbf32d68c..8004309973 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -2105,17 +2105,14 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>      }
> 
>      if (pci_dev->net_failover_pair_id) {
> -        if (!pci_is_express(pci_dev)) {
> -            error_setg(errp, "failover device is not a PCIExpress device");
> -            error_propagate(errp, local_err);
> -            return;
> -        }
> -        class_id = pci_get_word(pci_dev->config + PCI_CLASS_DEVICE);
> -        if (class_id != PCI_CLASS_NETWORK_ETHERNET) {
> -            error_setg(errp, "failover device is not an Ethernet device");
> -            error_propagate(errp, local_err);
> -            return;
> -        }
> +        if (pci_is_express(pci_dev)) {
> +            class_id = pci_get_word(pci_dev->config + PCI_CLASS_DEVICE);
> +            if (class_id != PCI_CLASS_NETWORK_ETHERNET) {
> +                error_setg(errp, "failover device is not an Ethernet device");
> +                error_propagate(errp, local_err);
> +                return;
> +            }
> +       }
> 
> This will allow to map non PCI device as failover too.

As in previous email, the point of the check was to exclude devices
when the hotplug controller is known not to support the feature.  It's
a topology check masked as a device check, it only exists because
support at the hotplug controller is not ubiquitous.  Thanks,

Alex

> After writing above hunk I think that when code reaches to check for 
> If (pci_dev->net_failover_pair_id),... it is already gone gone through do_pci_register_device().
> There should not be any check needed again for pci_is_express().
> Isn't it?
> 
> 
> > >2. It is limiting to only consider PCI devices.
> > >Automated and regression tests should be able validate this feature without  
> > PCI Device.  
> > >This will enhance the stability of feature in long run.
> > >
> > >3. net failover driver doesn't limit it to have it over only PCI device.
> > >So similarly hypervisor should be limiting.  
> > 
> > I agree that we don't have to limit it to PCI(e) forever. But for this first shot I
> > think we should and then extend it continually. There are more things we can
> > support in the future like other hotplug types etc.
> >   
> o.k. But probably net_failover_pair_id field should be in DeviceState instead of PCIDevice at minimum?
> Or you want to refactor it later?
Dr. David Alan Gilbert Oct. 24, 2019, 5:22 p.m. UTC | #11
* Jens Freimann (jfreimann@redhat.com) wrote:
> This patch adds a net_failover_pair_id property to PCIDev which is
> used to link the primary device in a failover pair (the PCI dev) to
> a standby (a virtio-net-pci) device.
> 
> It only supports ethernet devices. Also currently it only supports
> PCIe devices. QEMU will exit with an error message otherwise.
> 
> Signed-off-by: Jens Freimann <jfreimann@redhat.com>
> ---
>  hw/pci/pci.c         | 17 +++++++++++++++++
>  include/hw/pci/pci.h |  3 +++
>  2 files changed, 20 insertions(+)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index aa05c2b9b2..fa9b5219f8 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -75,6 +75,8 @@ static Property pci_props[] = {
>                      QEMU_PCIE_LNKSTA_DLLLA_BITNR, true),
>      DEFINE_PROP_BIT("x-pcie-extcap-init", PCIDevice, cap_present,
>                      QEMU_PCIE_EXTCAP_INIT_BITNR, true),
> +    DEFINE_PROP_STRING("net_failover_pair_id", PCIDevice,
> +            net_failover_pair_id),

Should we just make this 'failover_pair_id' - then when someone in the
future figures out how to make it work for something else (e.g.
multipath block devices) then it's all good?

Dave

>      DEFINE_PROP_END_OF_LIST()
>  };
>  
> @@ -2077,6 +2079,7 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>      ObjectClass *klass = OBJECT_CLASS(pc);
>      Error *local_err = NULL;
>      bool is_default_rom;
> +    uint16_t class_id;
>  
>      /* initialize cap_present for pci_is_express() and pci_config_size(),
>       * Note that hybrid PCIs are not set automatically and need to manage
> @@ -2101,6 +2104,20 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>          }
>      }
>  
> +    if (pci_dev->net_failover_pair_id) {
> +        if (!pci_is_express(pci_dev)) {
> +            error_setg(errp, "failover device is not a PCIExpress device");
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +        class_id = pci_get_word(pci_dev->config + PCI_CLASS_DEVICE);
> +        if (class_id != PCI_CLASS_NETWORK_ETHERNET) {
> +            error_setg(errp, "failover device is not an Ethernet device");
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +
>      /* rom loading */
>      is_default_rom = false;
>      if (pci_dev->romfile == NULL && pc->romfile != NULL) {
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index f3f0ffd5fb..def5435685 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -352,6 +352,9 @@ struct PCIDevice {
>      MSIVectorUseNotifier msix_vector_use_notifier;
>      MSIVectorReleaseNotifier msix_vector_release_notifier;
>      MSIVectorPollNotifier msix_vector_poll_notifier;
> +
> +    /* ID of standby device in net_failover pair */
> +    char *net_failover_pair_id;
>  };
>  
>  void pci_register_bar(PCIDevice *pci_dev, int region_num,
> -- 
> 2.21.0
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Laine Stump Oct. 24, 2019, 5:57 p.m. UTC | #12
On 10/23/19 5:15 PM, Alex Williamson wrote:
> On Wed, 23 Oct 2019 22:31:37 +0200
> Jens Freimann <jfreimann@redhat.com> wrote:
> 
>> On Wed, Oct 23, 2019 at 02:02:11PM -0600, Alex Williamson wrote:
>>> On Wed, 23 Oct 2019 21:30:35 +0200
>>> Jens Freimann <jfreimann@redhat.com> wrote:
>>>   
>>>> On Wed, Oct 23, 2019 at 12:06:48PM -0600, Alex Williamson wrote:
>>>>> On Wed, 23 Oct 2019 10:27:02 +0200
>>>>> Jens Freimann <jfreimann@redhat.com> wrote:
>> [...]
>>>>> Are there also multi-function considerations that
>>>>> should be prevented or documented?  For example, if a user tries to
>>>>> configure both the primary and failover NICs in the same slot, I assume
>>>>> bad things will happen.
>>>>
>>>>    I would have expected that this is already checked in pci code, but
>>>> it is not. I tried it and when I put both devices into the same slot
>>>> they are both unplugged from the guest during boot but nothing else
>>>> happens. I don't know what triggers that unplug of the devices.
>>>>
>>>> I'm not aware of any other problems regarding multi-function, which
>>>> doesn't mean there aren't any.
>>>
>>> Hmm, was the hidden device at function #0?  The guest won't find any
>>> functions if function #0 isn't present, but I don't know what would
>>> trigger the hotplug.  The angle I'm thinking is that we only have slot
>>> level granularity for hotplug, so any sort of automatic hotplug of a
>>> slot should probably think about bystander devices within the slot.
>>
>> Yes that would be a problem, but isn't it the same in the non-failover case
>> where a user configures it wrong? The slot where the device is plugged is not
>> chosen automatically it's configured by the user, no? I might be mixing something
>> up here.  I have no idea yet how to check if a slot is already populated, but
>> I'll think about it.
> 
> I don't think libvirt will automatically make use of multifunction
> endpoints, except maybe for some built-in devices, so yes it probably
> would be up to the user to explicitly create a multifunction device.

Correct. The only place libvirt will ever assign devices anywhere except 
function 0 is when we are adding pcie-root-ports - those are combined 
8-per-slot in order to conserve space on pcie.0 (this permits us to have 
up to 240 PCIe devices without needing to resort to upstream/downstream 
switches).


> But are there other scenarios that generate an automatic hot-unplug?
> If a user creates a multifunction slot and then triggers a hot-unplug
> themselves, it's easy to place the blame on the user if the result is
> unexpected, but is it so obviously a user configuration error if the
> hotplug occurs as an automatic response to a migration?  I'm not as
> sure about that.

I guess that's all a matter of opinion. If the user never enters in any 
PCI address info and it's all handled by someone else, then I wouldn't 
expect them to know exactly where the devices were (and only vaguely 
understand that their hostdev network interface is going to be unplugged 
during migration). In that case (as long as it's libvirt assigning the 
PCI addresses) the situation we're considering would never ever happen, 
so it's a non-issue.

If, on the other hand, the user wants to mess around assigning PCI 
addresses themselves, then they get to pick up all the pieces. It might 
be nice if they could be given a clue about why it broke though.

> 
> As indicated, I don't know whether this should just be documented or if
> we should spend time preventing it, but someone, somewhere will
> probably think it's a good idea to put their primary and failover NIC
> in the same slot and be confused that the underlying mechanisms cannot
> support it.  It doesn't appear that it would be too difficult to test
> QEMU_PCI_CAP_MULTIFUNCTION (not set) and PCI_FUNC (is 0) for the
> primary, but maybe I'm just being paranoid.  Thanks,

If, as you claim, it's not difficult, then I guess why not?
Jens Freimann Oct. 24, 2019, 7:56 p.m. UTC | #13
On Thu, Oct 24, 2019 at 06:22:27PM +0100, Dr. David Alan Gilbert wrote:
>* Jens Freimann (jfreimann@redhat.com) wrote:
>> This patch adds a net_failover_pair_id property to PCIDev which is
>> used to link the primary device in a failover pair (the PCI dev) to
>> a standby (a virtio-net-pci) device.
>>
>> It only supports ethernet devices. Also currently it only supports
>> PCIe devices. QEMU will exit with an error message otherwise.
>>
>> Signed-off-by: Jens Freimann <jfreimann@redhat.com>
>> ---
>>  hw/pci/pci.c         | 17 +++++++++++++++++
>>  include/hw/pci/pci.h |  3 +++
>>  2 files changed, 20 insertions(+)
>>
>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>> index aa05c2b9b2..fa9b5219f8 100644
>> --- a/hw/pci/pci.c
>> +++ b/hw/pci/pci.c
>> @@ -75,6 +75,8 @@ static Property pci_props[] = {
>>                      QEMU_PCIE_LNKSTA_DLLLA_BITNR, true),
>>      DEFINE_PROP_BIT("x-pcie-extcap-init", PCIDevice, cap_present,
>>                      QEMU_PCIE_EXTCAP_INIT_BITNR, true),
>> +    DEFINE_PROP_STRING("net_failover_pair_id", PCIDevice,
>> +            net_failover_pair_id),
>
>Should we just make this 'failover_pair_id' - then when someone in the
>future figures out how to make it work for something else (e.g.
>multipath block devices) then it's all good?

Yes, I see no reason why not to rename it. 

Thanks!

regards,
Jens
Jens Freimann Oct. 24, 2019, 8:08 p.m. UTC | #14
On Thu, Oct 24, 2019 at 10:52:36AM -0600, Alex Williamson wrote:
>On Thu, 24 Oct 2019 11:37:54 +0200
>Jens Freimann <jfreimann@redhat.com> wrote:
[...]
> >
>> >While reviewing, I realized that we shouldn't have this check for below reasons.
>> >
>> >1. It is user's responsibility to pass networking device.
>> >But its ok to check the class, if PCI Device is passed.
>> >So class comparison should be inside the pci_check().
>>
>> I'm not sure I understand this point, could you please elaborate?
>> You're suggesting to move the check for the class into the check for
>> pci_is_express?
>
>Seems like the suggestion is that net_failover_pair_id should be an
>option on the base class of PCIDevice (DeviceState?) and only if it's a
>PCI device would we check the class code.  But there are dependencies
>at the hotplug controller, which I think is why this is currently
>specific to PCI.

Yes, It doesn't support acpi, shpc,... hotplug as of now. It
shouldn't be hard to add but I'd like to do it as a follow-on series.

>However, it's an interesting point about pci_is_express().  This test
>is really just meant to check whether the hotplug controller supports
>this feature, which is only implemented in pciehp via this series.
>There's a bit of a mismatch though that pcie_is_express() checks
>whether the device is express, not whether the bus it sits on is
>express.  I think we really want the latter, so maybe this should be:
>
>pci_bus_is_express(pci_get_bus(dev)
>
>For example this feature should work if I plug an e1000 (not e1000e)
>into an express slot, but not if I plug an e1000e into a conventional
>slot.

I'll try this and test. 

Thanks!

regards,
Jens
Jens Freimann Oct. 25, 2019, 10:52 a.m. UTC | #15
On Wed, Oct 23, 2019 at 12:06:48PM -0600, Alex Williamson wrote:
>On Wed, 23 Oct 2019 10:27:02 +0200
>Jens Freimann <jfreimann@redhat.com> wrote:
[...]
>> @@ -2101,6 +2104,20 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>>          }
>>      }
>>
>> +    if (pci_dev->net_failover_pair_id) {
>> +        if (!pci_is_express(pci_dev)) {
>> +            error_setg(errp, "failover device is not a PCIExpress device");
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>
>Did we decide we don't need to test that the device is also in a
>hotpluggable slot?  Are there also multi-function considerations that
>should be prevented or documented?  For example, if a user tries to
>configure both the primary and failover NICs in the same slot, I assume
>bad things will happen.

I added this check

        if (!(pci_dev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION)
            && (PCI_FUNC(pci_dev->devfn) == 0)) {
            qdev->allow_unplug_during_migration = true;
        } else {
            error_setg(errp, "failover: primary device must be in its own "
                              "PCI slot");
            error_propagate(errp, local_err);
            pci_qdev_unrealize(DEVICE(pci_dev), NULL);
            return;
        }

When I first add a vfio-pci with net_failover_pair_id=x,multifunction=on
and addr=0.0 I will now get an error.

(qemu) device_add vfio-pci,...,bus=root2,net_failover_pair_id=net1,multifunction=on,addr=0.0
Error: failover: primary device must be in its own PCI slot

If I put in a virtio-net device in slot 0 and then try to add a
vfio-pci device in the same slot I get the following error message:

-device virtio-net-pci,...id=net1bus=root1,failover=on,multifunction=on,addr=0.0
(qemu) device_add vfio-pci,id=hostdev1,host=0000:5e:00.2,bus=root1,net_failover_pair_id=net1,addr=0.1
Error: PCI: slot 0 function 0 already ocuppied by virtio-net-pci,
   new func vfio-pci cannot be exposed to guest.


regards,
Jens
Alex Williamson Oct. 25, 2019, 2:56 p.m. UTC | #16
On Fri, 25 Oct 2019 12:52:32 +0200
Jens Freimann <jfreimann@redhat.com> wrote:

> On Wed, Oct 23, 2019 at 12:06:48PM -0600, Alex Williamson wrote:
> >On Wed, 23 Oct 2019 10:27:02 +0200
> >Jens Freimann <jfreimann@redhat.com> wrote:  
> [...]
> >> @@ -2101,6 +2104,20 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
> >>          }
> >>      }
> >>
> >> +    if (pci_dev->net_failover_pair_id) {
> >> +        if (!pci_is_express(pci_dev)) {
> >> +            error_setg(errp, "failover device is not a PCIExpress device");
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +        }  
> >
> >Did we decide we don't need to test that the device is also in a
> >hotpluggable slot?  Are there also multi-function considerations that
> >should be prevented or documented?  For example, if a user tries to
> >configure both the primary and failover NICs in the same slot, I assume
> >bad things will happen.  
> 
> I added this check
> 
>         if (!(pci_dev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION)
>             && (PCI_FUNC(pci_dev->devfn) == 0)) {
>             qdev->allow_unplug_during_migration = true;
>         } else {
>             error_setg(errp, "failover: primary device must be in its own "
>                               "PCI slot");
>             error_propagate(errp, local_err);
>             pci_qdev_unrealize(DEVICE(pci_dev), NULL);
>             return;
>         }
> 
> When I first add a vfio-pci with net_failover_pair_id=x,multifunction=on
> and addr=0.0 I will now get an error.
> 
> (qemu) device_add vfio-pci,...,bus=root2,net_failover_pair_id=net1,multifunction=on,addr=0.0
> Error: failover: primary device must be in its own PCI slot
> 
> If I put in a virtio-net device in slot 0 and then try to add a
> vfio-pci device in the same slot I get the following error message:
> 
> -device virtio-net-pci,...id=net1bus=root1,failover=on,multifunction=on,addr=0.0
> (qemu) device_add vfio-pci,id=hostdev1,host=0000:5e:00.2,bus=root1,net_failover_pair_id=net1,addr=0.1
> Error: PCI: slot 0 function 0 already ocuppied by virtio-net-pci,
>    new func vfio-pci cannot be exposed to guest.

Cool, looks good.  Thanks,

Alex
diff mbox series

Patch

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index aa05c2b9b2..fa9b5219f8 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -75,6 +75,8 @@  static Property pci_props[] = {
                     QEMU_PCIE_LNKSTA_DLLLA_BITNR, true),
     DEFINE_PROP_BIT("x-pcie-extcap-init", PCIDevice, cap_present,
                     QEMU_PCIE_EXTCAP_INIT_BITNR, true),
+    DEFINE_PROP_STRING("net_failover_pair_id", PCIDevice,
+            net_failover_pair_id),
     DEFINE_PROP_END_OF_LIST()
 };
 
@@ -2077,6 +2079,7 @@  static void pci_qdev_realize(DeviceState *qdev, Error **errp)
     ObjectClass *klass = OBJECT_CLASS(pc);
     Error *local_err = NULL;
     bool is_default_rom;
+    uint16_t class_id;
 
     /* initialize cap_present for pci_is_express() and pci_config_size(),
      * Note that hybrid PCIs are not set automatically and need to manage
@@ -2101,6 +2104,20 @@  static void pci_qdev_realize(DeviceState *qdev, Error **errp)
         }
     }
 
+    if (pci_dev->net_failover_pair_id) {
+        if (!pci_is_express(pci_dev)) {
+            error_setg(errp, "failover device is not a PCIExpress device");
+            error_propagate(errp, local_err);
+            return;
+        }
+        class_id = pci_get_word(pci_dev->config + PCI_CLASS_DEVICE);
+        if (class_id != PCI_CLASS_NETWORK_ETHERNET) {
+            error_setg(errp, "failover device is not an Ethernet device");
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+
     /* rom loading */
     is_default_rom = false;
     if (pci_dev->romfile == NULL && pc->romfile != NULL) {
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index f3f0ffd5fb..def5435685 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -352,6 +352,9 @@  struct PCIDevice {
     MSIVectorUseNotifier msix_vector_use_notifier;
     MSIVectorReleaseNotifier msix_vector_release_notifier;
     MSIVectorPollNotifier msix_vector_poll_notifier;
+
+    /* ID of standby device in net_failover pair */
+    char *net_failover_pair_id;
 };
 
 void pci_register_bar(PCIDevice *pci_dev, int region_num,