diff mbox series

[RFC,1/3] qemu: virtio-bypass should explicitly bind to a passthrough device

Message ID 1522573990-5242-2-git-send-email-si-wei.liu@oracle.com
State RFC, archived
Delegated to: David Miller
Headers show
Series Userspace compatible driver model for virtio_bypass | expand

Commit Message

Si-Wei Liu April 1, 2018, 9:13 a.m. UTC
The new backup option allows guest virtio-bypass driver to explicitly
bind to a corresponding passthrough instance, which is identifiable by
the <bus>:<slot>.<function> notation. MAC address is still validated
in the guest but not the only criteria for pairing two devices.
MAC address is more a matter of network configuration than a (virtual)
device identifier, the latter of which needs to be unique as part of
VM configuration. Techinically it's possible there exists more than
one device in the guest configured with the same MAC, but each belongs
to completely isolated network.

The direct benefit as a result of the explicit binding (or pairing),
apparently, is the prohibition of improper binding or malicious pairing
due to any flexiblility in terms of guest MAC address config.

What's more important, the indicator of guest device location can even
be used as a means to reserve the slot for the corresponding passthrough
device in the PCI bus tree if such device is temporarily absent, but
yet to be hot plugged into the VM. We'd need to preserve the slot for
the passthrough device to which virtio-bypass is bound, such that once
it is plugged out as a result of migration we can ensure the slot
wouldn't be occupied by other devices, and any user-space application
assumes consistent device location in the bus tree still works.

The usage for the backup option is as follows:

   -device virtio-net-pci, ... ,backup=<bus-id>:<slot>[.<function>]

for e.g.

   -device virtio-net-pci,id=net0,mac=52:54:00:e0:58:80,backup=pci.2:0x3
   ...
   -device vfio-pci,host=02:10.1,id=hostdev0,bus=pci.2,addr=0x3

Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
---
 hw/net/virtio-net.c                         | 29 ++++++++++++-
 include/hw/pci/pci.h                        |  3 ++
 include/hw/virtio/virtio-net.h              |  2 +
 include/standard-headers/linux/virtio_net.h |  1 +
 qdev-monitor.c                              | 64 +++++++++++++++++++++++++++++
 5 files changed, 97 insertions(+), 2 deletions(-)

Comments

Michael S. Tsirkin April 3, 2018, 12:25 p.m. UTC | #1
On Sun, Apr 01, 2018 at 05:13:08AM -0400, Si-Wei Liu wrote:
> @@ -896,6 +898,68 @@ void qmp_device_del(const char *id, Error **errp)
>      }
>  }
>  
> +int pci_get_busdevfn_by_id(const char *id, uint16_t *busnr,
> +                           uint16_t *devfn, Error **errp)
> +{
> +    uint16_t busnum = 0, slot = 0, func = 0;
> +    const char *pc, *pd, *pe;
> +    Error *local_err = NULL;
> +    ObjectClass *class;
> +    char value[1024];
> +    BusState *bus;
> +    uint64_t u64;
> +
> +    if (!(pc = strchr(id, ':'))) {
> +        error_setg(errp, "Invalid id: backup=%s, "
> +                   "correct format should be backup="
> +                   "'<bus-id>:<slot>[.<function>]'", id);
> +        return -1;
> +    }
> +    get_opt_name(value, sizeof(value), id, ':');
> +    if (pc != id + 1) {
> +        bus = qbus_find(value, errp);
> +        if (!bus)
> +            return -1;
> +
> +        class = object_get_class(OBJECT(bus));
> +        if (class != object_class_by_name(TYPE_PCI_BUS) &&
> +            class != object_class_by_name(TYPE_PCIE_BUS)) {
> +            error_setg(errp, "%s is not a device on pci bus", id);
> +            return -1;
> +        }
> +        busnum = (uint16_t)pci_bus_num(PCI_BUS(bus));
> +    }

pci_bus_num is almost always a bug if not done within
a context of a PCI host, bridge, etc.

In particular this will not DTRT if run before guest assigns bus
numbers.


> +
> +    if (!devfn)
> +        goto out;
> +
> +    pd = strchr(pc, '.');
> +    pe = get_opt_name(value, sizeof(value), pc + 1, '.');
> +    if (pe != pc + 1) {
> +        parse_option_number("slot", value, &u64, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return -1;
> +        }
> +        slot = (uint16_t)u64;
> +    }
> +    if (pd && *(pd + 1) != '\0') {
> +        parse_option_number("function", pd, &u64, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return -1;
> +        }
> +        func = (uint16_t)u64;
> +    }
> +
> +out:
> +    if (busnr)
> +        *busnr = busnum;
> +    if (devfn)
> +        *devfn = ((slot & 0x1F) << 3) | (func & 0x7);
> +    return 0;
> +}
> +
>  BlockBackend *blk_by_qdev_id(const char *id, Error **errp)
>  {
>      DeviceState *dev;
> -- 
> 1.8.3.1
Siwei Liu April 4, 2018, 8:02 a.m. UTC | #2
On Tue, Apr 3, 2018 at 5:25 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Sun, Apr 01, 2018 at 05:13:08AM -0400, Si-Wei Liu wrote:
>> @@ -896,6 +898,68 @@ void qmp_device_del(const char *id, Error **errp)
>>      }
>>  }
>>
>> +int pci_get_busdevfn_by_id(const char *id, uint16_t *busnr,
>> +                           uint16_t *devfn, Error **errp)
>> +{
>> +    uint16_t busnum = 0, slot = 0, func = 0;
>> +    const char *pc, *pd, *pe;
>> +    Error *local_err = NULL;
>> +    ObjectClass *class;
>> +    char value[1024];
>> +    BusState *bus;
>> +    uint64_t u64;
>> +
>> +    if (!(pc = strchr(id, ':'))) {
>> +        error_setg(errp, "Invalid id: backup=%s, "
>> +                   "correct format should be backup="
>> +                   "'<bus-id>:<slot>[.<function>]'", id);
>> +        return -1;
>> +    }
>> +    get_opt_name(value, sizeof(value), id, ':');
>> +    if (pc != id + 1) {
>> +        bus = qbus_find(value, errp);
>> +        if (!bus)
>> +            return -1;
>> +
>> +        class = object_get_class(OBJECT(bus));
>> +        if (class != object_class_by_name(TYPE_PCI_BUS) &&
>> +            class != object_class_by_name(TYPE_PCIE_BUS)) {
>> +            error_setg(errp, "%s is not a device on pci bus", id);
>> +            return -1;
>> +        }
>> +        busnum = (uint16_t)pci_bus_num(PCI_BUS(bus));
>> +    }
>
> pci_bus_num is almost always a bug if not done within
> a context of a PCI host, bridge, etc.
>
> In particular this will not DTRT if run before guest assigns bus
> numbers.
>
I was seeking means to reserve a specific pci bus slot from drivers,
and update the driver when guest assigns the bus number but it seems
there's no low-hanging fruits. Because of that reason the bus_num is
only obtained until it's really needed (during get_config) and I
assume at that point the pci bus assignment is already done. I know
the current one is not perfect, but we need that information (PCI
bus:slot.func number) to name the guest device correctly.

Regards,
-Siwei
>
>> +
>> +    if (!devfn)
>> +        goto out;
>> +
>> +    pd = strchr(pc, '.');
>> +    pe = get_opt_name(value, sizeof(value), pc + 1, '.');
>> +    if (pe != pc + 1) {
>> +        parse_option_number("slot", value, &u64, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return -1;
>> +        }
>> +        slot = (uint16_t)u64;
>> +    }
>> +    if (pd && *(pd + 1) != '\0') {
>> +        parse_option_number("function", pd, &u64, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return -1;
>> +        }
>> +        func = (uint16_t)u64;
>> +    }
>> +
>> +out:
>> +    if (busnr)
>> +        *busnr = busnum;
>> +    if (devfn)
>> +        *devfn = ((slot & 0x1F) << 3) | (func & 0x7);
>> +    return 0;
>> +}
>> +
>>  BlockBackend *blk_by_qdev_id(const char *id, Error **errp)
>>  {
>>      DeviceState *dev;
>> --
>> 1.8.3.1
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: virtio-dev-unsubscribe@lists.oasis-open.org
> For additional commands, e-mail: virtio-dev-help@lists.oasis-open.org
>
Paolo Bonzini April 5, 2018, 3:31 p.m. UTC | #3
On 04/04/2018 10:02, Siwei Liu wrote:
>> pci_bus_num is almost always a bug if not done within
>> a context of a PCI host, bridge, etc.
>>
>> In particular this will not DTRT if run before guest assigns bus
>> numbers.
>>
> I was seeking means to reserve a specific pci bus slot from drivers,
> and update the driver when guest assigns the bus number but it seems
> there's no low-hanging fruits. Because of that reason the bus_num is
> only obtained until it's really needed (during get_config) and I
> assume at that point the pci bus assignment is already done. I know
> the current one is not perfect, but we need that information (PCI
> bus:slot.func number) to name the guest device correctly.

Can you use the -device "id", and look it up as

    devices = container_get(qdev_get_machine(), "/peripheral");
    return object_resolve_path_component(devices, id);

?

Thanks,

Paolo
Siwei Liu April 7, 2018, 2:54 a.m. UTC | #4
(click the wrong reply button again, sorry)


On Thu, Apr 5, 2018 at 8:31 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 04/04/2018 10:02, Siwei Liu wrote:
>>> pci_bus_num is almost always a bug if not done within
>>> a context of a PCI host, bridge, etc.
>>>
>>> In particular this will not DTRT if run before guest assigns bus
>>> numbers.
>>>
>> I was seeking means to reserve a specific pci bus slot from drivers,
>> and update the driver when guest assigns the bus number but it seems
>> there's no low-hanging fruits. Because of that reason the bus_num is
>> only obtained until it's really needed (during get_config) and I
>> assume at that point the pci bus assignment is already done. I know
>> the current one is not perfect, but we need that information (PCI
>> bus:slot.func number) to name the guest device correctly.
>
> Can you use the -device "id", and look it up as
>
>     devices = container_get(qdev_get_machine(), "/peripheral");
>     return object_resolve_path_component(devices, id);


No. The problem of using device id is that the vfio device may come
and go at any time, this is particularly true when live migration is
happening. There's no gurantee we can get the bus:device.func info if
that device is gone. Currently the binding between vfio and virtio-net
is weakly coupled through the backup property, there's no better way
than specifying the bus id and addr property directly.

Regards,
-Siwei

>
> ?
>
> Thanks,
>
> Paolo
diff mbox series

Patch

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index de31b1b98c..a36b169958 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -26,6 +26,7 @@ 
 #include "qapi-event.h"
 #include "hw/virtio/virtio-access.h"
 #include "migration/misc.h"
+#include "hw/pci/pci.h"
 
 #define VIRTIO_NET_VM_VERSION    11
 
@@ -61,6 +62,8 @@  static VirtIOFeature feature_sizes[] = {
      .end = endof(struct virtio_net_config, max_virtqueue_pairs)},
     {.flags = 1 << VIRTIO_NET_F_MTU,
      .end = endof(struct virtio_net_config, mtu)},
+    {.flags = 1 << VIRTIO_NET_F_BACKUP,
+     .end = endof(struct virtio_net_config, bsf2backup)},
     {}
 };
 
@@ -84,10 +87,24 @@  static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
 {
     VirtIONet *n = VIRTIO_NET(vdev);
     struct virtio_net_config netcfg;
+    uint16_t busdevfn;
 
     virtio_stw_p(vdev, &netcfg.status, n->status);
     virtio_stw_p(vdev, &netcfg.max_virtqueue_pairs, n->max_queues);
     virtio_stw_p(vdev, &netcfg.mtu, n->net_conf.mtu);
+    if (n->net_conf.backup) {
+        /* Below function should not fail as the backup ID string
+         * has been validated when device is being realized.
+         * Until guest starts to run we can can get to the
+         * effective bus num in use from pci config space where
+         * guest had written to.
+         */
+        pci_get_busdevfn_by_id(n->net_conf.backup, &busdevfn,
+                               NULL, NULL);
+        busdevfn <<= 8;
+        busdevfn |= (n->backup_devfn & 0xFF);
+        virtio_stw_p(vdev, &netcfg.bsf2backup, busdevfn);
+    }
     memcpy(netcfg.mac, n->mac, ETH_ALEN);
     memcpy(config, &netcfg, n->config_size);
 }
@@ -1935,11 +1952,19 @@  static void virtio_net_device_realize(DeviceState *dev, Error **errp)
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIONet *n = VIRTIO_NET(dev);
     NetClientState *nc;
+    uint16_t bdevfn;
     int i;
 
     if (n->net_conf.mtu) {
         n->host_features |= (0x1 << VIRTIO_NET_F_MTU);
     }
+    if (n->net_conf.backup) {
+        if (pci_get_busdevfn_by_id(n->net_conf.backup, NULL,
+                                   &bdevfn, errp))
+            return;
+        n->backup_devfn = bdevfn;
+        n->host_features |= (0x1 << VIRTIO_NET_F_BACKUP);
+    }
 
     virtio_net_set_config_size(n, n->host_features);
     virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size);
@@ -2160,8 +2185,8 @@  static Property virtio_net_properties[] = {
     DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0),
     DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend,
                      true),
-    DEFINE_PROP_BIT("backup", VirtIONet, host_features,
-                     VIRTIO_NET_F_BACKUP, false),
+    DEFINE_PROP_STRING("backup", VirtIONet, net_conf.backup),
+
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index d8c18c7fa4..dbb910d162 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -431,6 +431,9 @@  PCIDevice *pci_nic_init_nofail(NICInfo *nd, PCIBus *rootbus,
 
 PCIDevice *pci_vga_init(PCIBus *bus);
 
+int pci_get_busdevfn_by_id(const char *id, uint16_t *busnr,
+                           uint16_t *devfn, Error **errp);
+
 static inline PCIBus *pci_get_bus(const PCIDevice *dev)
 {
     return PCI_BUS(qdev_get_parent_bus(DEVICE(dev)));
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index b81b6a4624..276b39f64f 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -38,6 +38,7 @@  typedef struct virtio_net_conf
     uint16_t rx_queue_size;
     uint16_t tx_queue_size;
     uint16_t mtu;
+    char *backup;
 } virtio_net_conf;
 
 /* Maximum packet size we can receive from tap device: header + 64k */
@@ -99,6 +100,7 @@  typedef struct VirtIONet {
     int announce_counter;
     bool needs_vnet_hdr_swap;
     bool mtu_bypass_backend;
+    uint16_t backup_devfn;
 } VirtIONet;
 
 void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
diff --git a/include/standard-headers/linux/virtio_net.h b/include/standard-headers/linux/virtio_net.h
index 65dde3209d..cd936e5521 100644
--- a/include/standard-headers/linux/virtio_net.h
+++ b/include/standard-headers/linux/virtio_net.h
@@ -79,6 +79,7 @@  struct virtio_net_config {
 	uint16_t max_virtqueue_pairs;
 	/* Default maximum transmit unit advice */
 	uint16_t mtu;
+	uint16_t bsf2backup;
 } QEMU_PACKED;
 
 /*
diff --git a/qdev-monitor.c b/qdev-monitor.c
index 846238175f..600a81c73e 100644
--- a/qdev-monitor.c
+++ b/qdev-monitor.c
@@ -32,6 +32,8 @@ 
 #include "qemu/help_option.h"
 #include "qemu/option.h"
 #include "sysemu/block-backend.h"
+#include "hw/pci/pci.h"
+#include "hw/vfio/pci.h"
 #include "migration/misc.h"
 
 /*
@@ -896,6 +898,68 @@  void qmp_device_del(const char *id, Error **errp)
     }
 }
 
+int pci_get_busdevfn_by_id(const char *id, uint16_t *busnr,
+                           uint16_t *devfn, Error **errp)
+{
+    uint16_t busnum = 0, slot = 0, func = 0;
+    const char *pc, *pd, *pe;
+    Error *local_err = NULL;
+    ObjectClass *class;
+    char value[1024];
+    BusState *bus;
+    uint64_t u64;
+
+    if (!(pc = strchr(id, ':'))) {
+        error_setg(errp, "Invalid id: backup=%s, "
+                   "correct format should be backup="
+                   "'<bus-id>:<slot>[.<function>]'", id);
+        return -1;
+    }
+    get_opt_name(value, sizeof(value), id, ':');
+    if (pc != id + 1) {
+        bus = qbus_find(value, errp);
+        if (!bus)
+            return -1;
+
+        class = object_get_class(OBJECT(bus));
+        if (class != object_class_by_name(TYPE_PCI_BUS) &&
+            class != object_class_by_name(TYPE_PCIE_BUS)) {
+            error_setg(errp, "%s is not a device on pci bus", id);
+            return -1;
+        }
+        busnum = (uint16_t)pci_bus_num(PCI_BUS(bus));
+    }
+
+    if (!devfn)
+        goto out;
+
+    pd = strchr(pc, '.');
+    pe = get_opt_name(value, sizeof(value), pc + 1, '.');
+    if (pe != pc + 1) {
+        parse_option_number("slot", value, &u64, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return -1;
+        }
+        slot = (uint16_t)u64;
+    }
+    if (pd && *(pd + 1) != '\0') {
+        parse_option_number("function", pd, &u64, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return -1;
+        }
+        func = (uint16_t)u64;
+    }
+
+out:
+    if (busnr)
+        *busnr = busnum;
+    if (devfn)
+        *devfn = ((slot & 0x1F) << 3) | (func & 0x7);
+    return 0;
+}
+
 BlockBackend *blk_by_qdev_id(const char *id, Error **errp)
 {
     DeviceState *dev;