
[ovs-dev,3/4] netdev-dpdk: Add vHost User PMD

Message ID 1468592799-19251-4-git-send-email-ciara.loftus@intel.com
State Changes Requested
Delegated to: Daniele Di Proietto

Commit Message

Ciara Loftus July 15, 2016, 2:26 p.m. UTC
DPDK 16.04 introduces the vHost PMD which allows 'dpdkvhostuser' ports
to be controlled by the librte_ether API, like physical 'dpdk' ports and
IVSHM 'dpdkr' ports. This commit integrates this PMD into OVS and
removes direct calls to the librte_vhost DPDK library.

This commit removes extended statistics support for vHost User ports
until such a time that this becomes available in the vHost PMD in a
DPDK release supported by OVS.

Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
---
 INSTALL.DPDK.md   |  10 +
 NEWS              |   2 +
 lib/netdev-dpdk.c | 856 ++++++++++++++++++------------------------------------
 3 files changed, 302 insertions(+), 566 deletions(-)

Comments

Daniele Di Proietto July 25, 2016, 11:45 p.m. UTC | #1
Thanks for the patch

This needs a little bit of rebasing. I did it myself in order to review, but
it'd be nice to have an updated version.

I like the simplification that this brings especially to the fast path.

If we merge this before we merge DPDK 16.07, we won't have to deal with
the vid change.

Thanks,

Daniele

2016-07-15 7:26 GMT-07:00 Ciara Loftus <ciara.loftus@intel.com>:

> DPDK 16.04 introduces the vHost PMD which allows 'dpdkvhostuser' ports
> to be controlled by the librte_ether API, like physical 'dpdk' ports and
> IVSHM 'dpdkr' ports. This commit integrates this PMD into OVS and
> removes direct calls to the librte_vhost DPDK library.
>
> This commit removes extended statistics support for vHost User ports
> until such a time that this becomes available in the vHost PMD in a
> DPDK release supported by OVS.
>
> Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
> ---
>  INSTALL.DPDK.md   |  10 +
>  NEWS              |   2 +
>  lib/netdev-dpdk.c | 856
> ++++++++++++++++++------------------------------------
>  3 files changed, 302 insertions(+), 566 deletions(-)
>
> diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
> index 5407794..29b6f91 100644
> --- a/INSTALL.DPDK.md
> +++ b/INSTALL.DPDK.md
> @@ -561,6 +561,16 @@ can be found in [Vhost Walkthrough].
>
>      http://dpdk.org/doc/guides/rel_notes/release_16_04.html
>
> +  - dpdk, dpdkr and dpdkvhostuser ports are 'eth' type ports in the
> context of
> +    DPDK as they are all managed by the rte_ether API. This means that
> they
> +    adhere to the DPDK configuration option CONFIG_RTE_MAX_ETHPORTS which
> by
> +    default is set to 32. This means by default the combined total number
> of
> +    dpdk, dpdkr and dpdkvhostuser ports allowable in OVS with DPDK is 32.
> This
> +    value can be changed if desired by modifying the configuration file in
> +    DPDK, or by overriding the default value on the command line when
> building
> +    DPDK. eg.
> +
> +        `make install CONFIG_RTE_MAX_ETHPORTS=64`
>

Again, I hope this doesn't cause problems for a lot of users.  I'd like to
see the limit increased by default, but I think we can merge this patch as
it is.


>
>  Bug Reporting:
>  --------------
> diff --git a/NEWS b/NEWS
> index aa1b915..b3791ed 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -59,6 +59,8 @@ Post-v2.5.0
>         node that device memory is located on if
> CONFIG_RTE_LIBRTE_VHOST_NUMA
>         is enabled in DPDK.
>       * Remove dpdkvhostcuse port type.
> +     * vHost PMD integration brings vhost-user ports under control of the
> +       rte_ether DPDK API.
>     - Increase number of registers to 16.
>     - ovs-benchmark: This utility has been removed due to lack of use and
>       bitrot.
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index b4f82af..5de806a 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -56,6 +56,7 @@
>  #include "unixctl.h"
>
>  #include "rte_config.h"
> +#include "rte_eth_vhost.h"
>  #include "rte_mbuf.h"
>  #include "rte_meter.h"
>  #include "rte_virtio_net.h"
> @@ -141,6 +142,11 @@ static char *vhost_sock_dir = NULL;   /* Location of
> vhost-user sockets */
>
>  #define VHOST_ENQ_RETRY_NUM 8
>
> +/* Array that tracks the used & unused vHost user driver IDs */
> +static unsigned int vhost_drv_ids[RTE_MAX_ETHPORTS];
> +/* Maximum string length allowed to provide to rte_eth_attach function */
> +#define DEVARGS_MAX (RTE_ETH_NAME_MAX_LEN + PATH_MAX + 18)
> +
>

I think this is not needed if we use xasprintf() below.


>  static const struct rte_eth_conf port_conf = {
>      .rxmode = {
>          .mq_mode = ETH_MQ_RX_RSS,
> @@ -353,12 +359,15 @@ struct netdev_dpdk {
>       * always true.  */
>      bool txq_needs_locking;
>
> -    /* virtio-net structure for vhost device */
> -    OVSRCU_TYPE(struct virtio_net *) virtio_dev;
> +    /* Number of virtqueue pairs reported by the guest */
> +    uint32_t vhost_qp_nb;
>
>      /* Identifier used to distinguish vhost devices from each other */
>      char vhost_id[PATH_MAX];
>
> +    /* ID of vhost user port given to the PMD driver */
> +    unsigned int vhost_pmd_id;
> +
>      /* In dpdk_list. */
>      struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
>
> @@ -389,16 +398,25 @@ struct netdev_rxq_dpdk {
>  static bool dpdk_thread_is_pmd(void);
>
>  static int netdev_dpdk_construct(struct netdev *);
> +static int netdev_dpdk_vhost_construct(struct netdev *);
>
>  struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);
>
>  struct ingress_policer *
>  netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
>
> +void link_status_changed_callback(uint8_t port_id,
> +        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
> +void vring_state_changed_callback(uint8_t port_id,
> +        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
>

Minor: I think we can avoid OVS_UNUSED on the declaration and keep it only
on the definition.

Also, these two functions can be static.

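For example, a minimal sketch of what the declarations could look like
(keeping OVS_UNUSED only on the definitions; names are the ones from this
patch):

    static void link_status_changed_callback(uint8_t port_id,
                                             enum rte_eth_event_type type,
                                             void *param);
    static void vring_state_changed_callback(uint8_t port_id,
                                             enum rte_eth_event_type type,
                                             void *param);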

> +static void netdev_dpdk_remap_txqs(struct netdev_dpdk *dev);
> +static void netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev);
> +
>  static bool
> -is_dpdk_class(const struct netdev_class *class)
> +is_dpdk_eth_class(const struct netdev_class *class)
>  {
> -    return class->construct == netdev_dpdk_construct;
> +    return ((class->construct == netdev_dpdk_construct) ||
> +            (class->construct == netdev_dpdk_vhost_construct));
>  }
>
>  /* DPDK NIC drivers allocate RX buffers at a particular granularity,
> typically
> @@ -622,8 +640,13 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int
> n_rxq, int n_txq)
>              continue;
>          }
>
> -        dev->up.n_rxq = n_rxq;
> -        dev->up.n_txq = n_txq;
> +        /* Only set n_*xq for physical devices. vHost User devices will
> set
> +         * this value correctly using info from the virtio backend.
> +         */
> +        if (dev->type == DPDK_DEV_ETH) {
> +            dev->up.n_rxq = n_rxq;
> +            dev->up.n_txq = n_txq;
> +        }
>
>          return 0;
>      }
> @@ -647,8 +670,14 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
> OVS_REQUIRES(dpdk_mutex)
>
>      rte_eth_dev_info_get(dev->port_id, &info);
>
> -    n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
> -    n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
> +    if (dev->type == DPDK_DEV_VHOST) {
> +        /* We don't know how many queues QEMU will use so set up the max
> */
> +        n_rxq = OVS_VHOST_MAX_QUEUE_NUM;
> +        n_txq = OVS_VHOST_MAX_QUEUE_NUM;
>

It's not very clear to me how multiqueue is handled now.

Do we really configure 1024 queues on the PMD?  Wouldn't it be simpler to
call this function again from _reconfigure()?


> +    } else {
> +        n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
> +        n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
> +    }
>
>      diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq);
>      if (diag) {
> @@ -715,6 +744,85 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *dev,
> unsigned int n_txqs)
>      }
>  }
>
> +void
> +link_status_changed_callback(uint8_t port_id,
> +                             enum rte_eth_event_type type OVS_UNUSED,
> +                             void *param OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev;
> +    int socket_id = -1;
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> +        if (port_id == dev->port_id) {
> +            ovs_mutex_lock(&dev->mutex);
> +            check_link_status(dev);
> +            if (dev->link.link_status == ETH_LINK_UP) {
> +                /* new device */
> +                /* Get NUMA information */
> +                socket_id = rte_eth_dev_socket_id(dev->port_id);
> +                if (socket_id != -1 && socket_id != dev->socket_id) {
> +                    dev->requested_socket_id = socket_id;
> +                }
> +                netdev_request_reconfigure(&dev->up);
> +                netdev_change_seq_changed(&dev->up);
> +                VLOG_INFO("vHost Device '%s' has been added on numa node
> %i",
> +                          dev->vhost_id, socket_id);
> +            } else {
> +                /* destroy device */
> +                /* Clear tx/rx queue settings. */
> +                netdev_dpdk_txq_map_clear(dev);
> +                netdev_request_reconfigure(&dev->up);
> +                netdev_change_seq_changed(&dev->up);
> +                VLOG_INFO("vHost Device '%s' has been removed",
> dev->vhost_id);
> +            }
> +            ovs_mutex_unlock(&dev->mutex);
> +            break;
> +        }
> +    }
> +
> +    ovs_mutex_unlock(&dpdk_mutex);
> +
> +    return;
> +}
> +
> +void
> +vring_state_changed_callback(uint8_t port_id,
> +                             enum rte_eth_event_type type OVS_UNUSED,
> +                             void *param OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev;
> +    struct rte_eth_vhost_queue_event event;
> +    int err = 0;
> +
> +    err = rte_eth_vhost_get_queue_event(port_id, &event);
> +    if (err || (event.rx)) {
>

Minor: extra parentheses


> +        return;
> +    }
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> +        if (port_id == dev->port_id) {
> +            ovs_mutex_lock(&dev->mutex);
> +            if (event.enable) {
> +                dev->tx_q[event.queue_id].map = event.queue_id;
> +                dev->vhost_qp_nb++;
> +            } else {
> +                dev->tx_q[event.queue_id].map = OVS_VHOST_QUEUE_DISABLED;
> +                dev->vhost_qp_nb--;
> +            }
> +            dev->requested_n_rxq = dev->vhost_qp_nb;
> +            dev->requested_n_txq = dev->vhost_qp_nb;
> +            netdev_request_reconfigure(&dev->up);
>

On master, vring_state_changed() doesn't call _reconfigure(); it just
updates the map.

Can we keep the same behavior?

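For example, a sketch mirroring what the removed vring_state_changed() did
(update the map and remap the txqs, without requesting a reconfiguration):

    if (event.enable) {
        dev->tx_q[event.queue_id].map = event.queue_id;
    } else {
        dev->tx_q[event.queue_id].map = OVS_VHOST_QUEUE_DISABLED;
    }
    netdev_dpdk_remap_txqs(dev);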

> +            ovs_mutex_unlock(&dev->mutex);
> +            break;
> +        }
> +    }
> +    ovs_mutex_unlock(&dpdk_mutex);
> +
> +    return;
> +}
> +
>  static int
>  netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>                   enum dpdk_dev_type type)
> @@ -724,6 +832,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int
> port_no,
>      int sid;
>      int err = 0;
>      uint32_t buf_size;
> +    unsigned int nr_q = 0;
>
>      ovs_mutex_init(&dev->mutex);
>      ovs_mutex_lock(&dev->mutex);
> @@ -733,11 +842,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int
> port_no,
>      /* If the 'sid' is negative, it means that the kernel fails
>       * to obtain the pci numa info.  In that situation, always
>       * use 'SOCKET0'. */
> -    if (type == DPDK_DEV_ETH) {
> -        sid = rte_eth_dev_socket_id(port_no);
> -    } else {
> -        sid = rte_lcore_to_socket_id(rte_get_master_lcore());
> -    }
> +    sid = rte_eth_dev_socket_id(port_no);
>
>      dev->socket_id = sid < 0 ? SOCKET0 : sid;
>      dev->requested_socket_id = dev->socket_id;
> @@ -767,19 +872,23 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int
> port_no,
>      netdev->n_txq = NR_QUEUE;
>      dev->requested_n_rxq = netdev->n_rxq;
>      dev->requested_n_txq = netdev->n_txq;
> +    dev->vhost_qp_nb = 0;
>
> -    if (type == DPDK_DEV_ETH) {
> -        err = dpdk_eth_dev_init(dev);
> -        if (err) {
> -            goto unlock;
> -        }
> -        netdev_dpdk_alloc_txq(dev, netdev->n_txq);
> -        dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;
> -    } else {
> -        netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);
> -        dev->txq_needs_locking = true;
> -        /* Enable DPDK_DEV_VHOST device and set promiscuous mode flag. */
> -        dev->flags = NETDEV_UP | NETDEV_PROMISC;
> +    err = dpdk_eth_dev_init(dev);
> +    if (err) {
> +        goto unlock;
> +    }
> +    nr_q = (type == DPDK_DEV_ETH ? 1 : RTE_MAX_QUEUES_PER_PORT);
> +    netdev_dpdk_alloc_txq(dev, nr_q);
> +    dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;
> +
> +    if (type == DPDK_DEV_VHOST) {
> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_QUEUE_STATE,
> +                                     (void*)vring_state_changed_callback,
>

Can we avoid the cast to "void *"?  The cast lets API changes go
undetected.

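For example, a sketch of the registration without the cast (assuming the
DPDK 16.04 rte_eth_dev_cb_fn signature, i.e. void (*)(uint8_t,
enum rte_eth_event_type, void *)):

    /* If the callbacks match rte_eth_dev_cb_fn exactly, no cast is needed
     * and an incompatible API change becomes a compile error. */
    rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_QUEUE_STATE,
                                  vring_state_changed_callback, NULL);
    rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,
                                  link_status_changed_callback, NULL);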

> +                                      NULL);
> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,
> +                                      (void*)link_status_changed_callback,
>

same


> +                                      NULL);
>      }
>
>      ovs_list_push_back(&dpdk_list, &dev->list_node);
> @@ -810,17 +919,48 @@ dpdk_dev_parse_name(const char dev_name[], const
> char prefix[],
>      }
>  }
>
> +/* When attaching a vhost device to DPDK, a unique name of the format
> + * 'eth_vhostX' is expected, where X is a unique identifier.
> + * get_vhost_drv_id returns a valid X value to provide to DPDK.
> + */
> +static int
> +get_vhost_drv_id(void)
> +{
> +    int i = 0;
> +
> +    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
> +        if (vhost_drv_ids[i] == 0) {
> +            return i;
> +        }
> +    }
> +
> +    return -1;
> +}
> +
> +static void
> +set_vhost_drv_id(int id, int val)
> +{
> +    vhost_drv_ids[id] = val;
> +}
> +
>  static int
>  netdev_dpdk_vhost_construct(struct netdev *netdev)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      const char *name = netdev->name;
>      int err;
> +    uint8_t port_no = 0;
> +    char devargs[DEVARGS_MAX];
>

If we use xasprintf(), we don't need to provide a fixed-size buffer.

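For example, a sketch of the attach path using xasprintf() (so both the
fixed-size buffer and DEVARGS_MAX go away):

    /* Build the devargs string dynamically; xasprintf() allocates the
     * buffer, so no DEVARGS_MAX sizing is needed. */
    char *devargs = xasprintf("eth_vhost%u,iface=%s,queues=%i",
                              driver_id, dev->vhost_id,
                              RTE_MAX_QUEUES_PER_PORT);
    err = rte_eth_dev_attach(devargs, &port_no);
    free(devargs);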

> +    int driver_id = 0;
> +
> +    if (rte_eal_init_ret) {
> +        return rte_eal_init_ret;
> +    }
>
>      /* 'name' is appended to 'vhost_sock_dir' and used to create a socket
> in
>       * the file system. '/' or '\' would traverse directories, so they're
> not
>       * acceptable in 'name'. */
> -    if (strchr(name, '/') || strchr(name, '\\')) {
> +    if (strchr(name, '/') || strchr(name, '\\') || strchr(name, ',')) {
>          VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
>                   "A valid name must not include '/' or '\\'",
>                   name);
> @@ -837,18 +977,32 @@ netdev_dpdk_vhost_construct(struct netdev *netdev)
>       */
>      snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s",
>               vhost_sock_dir, name);
> +    driver_id = get_vhost_drv_id();
> +    if (driver_id == -1) {
> +        VLOG_ERR("Unable to create vhost-user device %s - too many
> vhost-user"
> +                 "devices registered with PMD", dev->vhost_id);
> +        err = ENODEV;
> +        goto out;
> +
> +    } else {
> +        snprintf(devargs, sizeof(devargs),
> "eth_vhost%u,iface=%s,queues=%i",
> +                 driver_id, dev->vhost_id, RTE_MAX_QUEUES_PER_PORT);
> +        err = rte_eth_dev_attach(devargs, &port_no);
> +    }
>
> -    err = rte_vhost_driver_register(dev->vhost_id);
>      if (err) {
> -        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
> +        VLOG_ERR("Failed to attach vhost-user device %s to DPDK",
>                   dev->vhost_id);
>      } else {
>          fatal_signal_add_file_to_unlink(dev->vhost_id);
>          VLOG_INFO("Socket %s created for vhost-user port %s\n",
>                    dev->vhost_id, name);
> -        err = netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);
> +        dev->vhost_pmd_id = driver_id;
> +        set_vhost_drv_id(driver_id, 1);
> +        err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_VHOST);
>      }
>
> +out:
>      ovs_mutex_unlock(&dpdk_mutex);
>      return err;
>  }
> @@ -876,15 +1030,11 @@ netdev_dpdk_construct(struct netdev *netdev)
>  }
>
>  static void
> -netdev_dpdk_destruct(struct netdev *netdev)
> +dpdk_destruct_helper(struct netdev_dpdk *dev)
>  {
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -
> -    ovs_mutex_lock(&dev->mutex);
>      rte_eth_dev_stop(dev->port_id);
>      free(ovsrcu_get_protected(struct ingress_policer *,
>                                &dev->ingress_policer));
> -    ovs_mutex_unlock(&dev->mutex);
>
>      ovs_mutex_lock(&dpdk_mutex);
>      rte_free(dev->tx_q);
> @@ -894,35 +1044,31 @@ netdev_dpdk_destruct(struct netdev *netdev)
>  }
>
>  static void
> +netdev_dpdk_destruct(struct netdev *netdev)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    ovs_mutex_lock(&dev->mutex);
> +    dpdk_destruct_helper(dev);
> +    ovs_mutex_unlock(&dev->mutex);
> +}
> +
> +static void
>  netdev_dpdk_vhost_destruct(struct netdev *netdev)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
> -    /* Guest becomes an orphan if still attached. */
> -    if (netdev_dpdk_get_virtio(dev) != NULL) {
> -        VLOG_ERR("Removing port '%s' while vhost device still attached.",
> -                 netdev->name);
> -        VLOG_ERR("To restore connectivity after re-adding of port, VM on
> socket"
> -                 " '%s' must be restarted.",
> -                 dev->vhost_id);
> -    }
> +    ovs_mutex_lock(&dev->mutex);
>
> -    if (rte_vhost_driver_unregister(dev->vhost_id)) {
> -        VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);
> +    if (rte_eth_dev_detach(dev->port_id, dev->vhost_id)) {
> +        VLOG_ERR("Error removing vhost device %s", dev->vhost_id);
>      } else {
>          fatal_signal_remove_file_to_unlink(dev->vhost_id);
>      }
> +    set_vhost_drv_id(dev->vhost_pmd_id, 0);
>
> -    ovs_mutex_lock(&dev->mutex);
> -    free(ovsrcu_get_protected(struct ingress_policer *,
> -                              &dev->ingress_policer));
> +    dpdk_destruct_helper(dev);
>      ovs_mutex_unlock(&dev->mutex);
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    rte_free(dev->tx_q);
> -    ovs_list_remove(&dev->list_node);
> -    dpdk_mp_put(dev->dpdk_mp);
> -    ovs_mutex_unlock(&dpdk_mutex);
>  }
>

With this refactoring 'dpdk_mutex' is nested inside 'dev->mutex'.  In the
rest of the code 'dev->mutex' nests inside 'dpdk_mutex'.
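
A sketch of one way to keep the usual nesting (sketch only; the helper would
then do no locking of its own):

    /* Take dpdk_mutex first, then dev->mutex, matching the rest of the
     * file, and let dpdk_destruct_helper() run with both held. */
    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);
    dpdk_destruct_helper(dev);
    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);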


>
>  static void
> @@ -1113,114 +1259,6 @@ ingress_policer_run(struct ingress_policer
> *policer, struct rte_mbuf **pkts,
>      return cnt;
>  }
>
> -static bool
> -is_vhost_running(struct virtio_net *virtio_dev)
> -{
> -    return (virtio_dev != NULL && (virtio_dev->flags &
> VIRTIO_DEV_RUNNING));
> -}
> -
> -static inline void
> -netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
> -                                          unsigned int packet_size)
> -{
> -    /* Hard-coded search for the size bucket. */
> -    if (packet_size < 256) {
> -        if (packet_size >= 128) {
> -            stats->rx_128_to_255_packets++;
> -        } else if (packet_size <= 64) {
> -            stats->rx_1_to_64_packets++;
> -        } else {
> -            stats->rx_65_to_127_packets++;
> -        }
> -    } else {
> -        if (packet_size >= 1523) {
> -            stats->rx_1523_to_max_packets++;
> -        } else if (packet_size >= 1024) {
> -            stats->rx_1024_to_1522_packets++;
> -        } else if (packet_size < 512) {
> -            stats->rx_256_to_511_packets++;
> -        } else {
> -            stats->rx_512_to_1023_packets++;
> -        }
> -    }
> -}
> -
> -static inline void
> -netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
> -                                     struct dp_packet **packets, int
> count,
> -                                     int dropped)
> -{
> -    int i;
> -    unsigned int packet_size;
> -    struct dp_packet *packet;
> -
> -    stats->rx_packets += count;
> -    stats->rx_dropped += dropped;
> -    for (i = 0; i < count; i++) {
> -        packet = packets[i];
> -        packet_size = dp_packet_size(packet);
> -
> -        if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
> -            /* This only protects the following multicast counting from
> -             * too short packets, but it does not stop the packet from
> -             * further processing. */
> -            stats->rx_errors++;
> -            stats->rx_length_errors++;
> -            continue;
> -        }
> -
> -        netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);
> -
> -        struct eth_header *eh = (struct eth_header *)
> dp_packet_data(packet);
> -        if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
> -            stats->multicast++;
> -        }
> -
> -        stats->rx_bytes += packet_size;
> -    }
> -}
> -
> -/*
> - * The receive path for the vhost port is the TX path out from guest.
> - */
> -static int
> -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
> -                           struct dp_packet **packets, int *c)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> -    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
> -    int qid = rxq->queue_id;
> -    struct ingress_policer *policer =
> netdev_dpdk_get_ingress_policer(dev);
> -    uint16_t nb_rx = 0;
> -    uint16_t dropped = 0;
> -
> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev)
> -                     || !(dev->flags & NETDEV_UP))) {
> -        return EAGAIN;
> -    }
> -
> -    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM +
> VIRTIO_TXQ,
> -                                    dev->dpdk_mp->mp,
> -                                    (struct rte_mbuf **)packets,
> -                                    NETDEV_MAX_BURST);
> -    if (!nb_rx) {
> -        return EAGAIN;
> -    }
> -
> -    if (policer) {
> -        dropped = nb_rx;
> -        nb_rx = ingress_policer_run(policer, (struct rte_mbuf **)packets,
> nb_rx);
> -        dropped -= nb_rx;
> -    }
> -
> -    rte_spinlock_lock(&dev->stats_lock);
> -    netdev_dpdk_vhost_update_rx_counters(&dev->stats, packets, nb_rx,
> dropped);
> -    rte_spinlock_unlock(&dev->stats_lock);
> -
> -    *c = (int) nb_rx;
> -    return 0;
> -}
> -
>  static int
>  netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet **packets,
>                       int *c)
> @@ -1273,85 +1311,6 @@ netdev_dpdk_qos_run__(struct netdev_dpdk *dev,
> struct rte_mbuf **pkts,
>      return cnt;
>  }
>
> -static inline void
> -netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
> -                                     struct dp_packet **packets,
> -                                     int attempted,
> -                                     int dropped)
> -{
> -    int i;
> -    int sent = attempted - dropped;
> -
> -    stats->tx_packets += sent;
> -    stats->tx_dropped += dropped;
> -
> -    for (i = 0; i < sent; i++) {
> -        stats->tx_bytes += dp_packet_size(packets[i]);
> -    }
> -}
> -
> -static void
> -__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> -                         struct dp_packet **pkts, int cnt,
> -                         bool may_steal)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
> -    struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
> -    unsigned int total_pkts = cnt;
> -    unsigned int qos_pkts = cnt;
> -    int retries = 0;
> -
> -    qid = dev->tx_q[qid % netdev->n_txq].map;
> -
> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0
> -                     || !(dev->flags & NETDEV_UP))) {
> -        rte_spinlock_lock(&dev->stats_lock);
> -        dev->stats.tx_dropped+= cnt;
> -        rte_spinlock_unlock(&dev->stats_lock);
> -        goto out;
> -    }
> -
> -    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> -
> -    /* Check has QoS has been configured for the netdev */
> -    cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt);
> -    qos_pkts -= cnt;
> -
> -    do {
> -        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
> -        unsigned int tx_pkts;
> -
> -        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid,
> -                                          cur_pkts, cnt);
> -        if (OVS_LIKELY(tx_pkts)) {
> -            /* Packets have been sent.*/
> -            cnt -= tx_pkts;
> -            /* Prepare for possible retry.*/
> -            cur_pkts = &cur_pkts[tx_pkts];
> -        } else {
> -            /* No packets sent - do not retry.*/
> -            break;
> -        }
> -    } while (cnt && (retries++ < VHOST_ENQ_RETRY_NUM));
> -
> -    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> -
> -    rte_spinlock_lock(&dev->stats_lock);
> -    cnt += qos_pkts;
> -    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts,
> cnt);
> -    rte_spinlock_unlock(&dev->stats_lock);
> -
> -out:
> -    if (may_steal) {
> -        int i;
> -
> -        for (i = 0; i < total_pkts; i++) {
> -            dp_packet_delete(pkts[i]);
> -        }
> -    }
> -}
> -
>  /* Tx function. Transmit packets indefinitely */
>  static void
>  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
> @@ -1408,17 +1367,13 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid,
> struct dp_packet **pkts,
>          newcnt++;
>      }
>
> -    if (dev->type == DPDK_DEV_VHOST) {
> -        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **)
> mbufs, newcnt, true);
> -    } else {
> -        unsigned int qos_pkts = newcnt;
> +    unsigned int qos_pkts = newcnt;
>
> -        /* Check if QoS has been configured for this netdev. */
> -        newcnt = netdev_dpdk_qos_run__(dev, mbufs, newcnt);
> +    /* Check if QoS has been configured for this netdev. */
> +    newcnt = netdev_dpdk_qos_run__(dev, mbufs, newcnt);
>
> -        dropped += qos_pkts - newcnt;
> -        netdev_dpdk_eth_tx_burst(dev, qid, mbufs, newcnt);
> -    }
> +    dropped += qos_pkts - newcnt;
> +    netdev_dpdk_eth_tx_burst(dev, qid, mbufs, newcnt);
>
>      if (OVS_UNLIKELY(dropped)) {
>          rte_spinlock_lock(&dev->stats_lock);
> @@ -1431,44 +1386,12 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid,
> struct dp_packet **pkts,
>      }
>  }
>
> -static int
> -netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet
> **pkts,
> -                 int cnt, bool may_steal)
> -{
> -    if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {
> -        int i;
> -
> -        dpdk_do_tx_copy(netdev, qid, pkts, cnt);
> -        if (may_steal) {
> -            for (i = 0; i < cnt; i++) {
> -                dp_packet_delete(pkts[i]);
> -            }
> -        }
> -    } else {
> -        int i;
> -
> -        for (i = 0; i < cnt; i++) {
> -            int cutlen = dp_packet_get_cutlen(pkts[i]);
> -
> -            dp_packet_set_size(pkts[i], dp_packet_size(pkts[i]) - cutlen);
> -            dp_packet_reset_cutlen(pkts[i]);
> -        }
> -        __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);
> -    }
> -    return 0;
> -}
> -
>  static inline void
>  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>                     struct dp_packet **pkts, int cnt, bool may_steal)
>  {
>      int i;
>
> -    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
> -        qid = qid % dev->up.n_txq;
> -        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> -    }
> -
>

Did you forget to remove the rte_spinlock_unlock() from this function?


>      if (OVS_UNLIKELY(!may_steal ||
>                       pkts[0]->source != DPBUF_DPDK)) {
>          struct netdev *netdev = &dev->up;
> @@ -1543,7 +1466,44 @@ netdev_dpdk_eth_send(struct netdev *netdev, int qid,
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
> +    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
> +        qid = qid % dev->up.n_txq;
> +        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +    }
> +
>      netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
> +
> +    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
> +        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> +    }
> +
> +    return 0;
> +}
> +
> +static int
> +netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> +                     struct dp_packet **pkts, int cnt, bool may_steal)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    qid = dev->tx_q[qid % netdev->n_txq].map;
> +    if (qid == -1) {
> +        rte_spinlock_lock(&dev->stats_lock);
> +        dev->stats.tx_dropped+= cnt;
> +        rte_spinlock_unlock(&dev->stats_lock);
> +        if (may_steal) {
> +            int i;
> +
> +            for (i = 0; i < cnt; i++) {
> +                dp_packet_delete(pkts[i]);
> +            }
> +        }
>

It wasn't on master when this patch was sent, but now we can use
dp_packet_delete_batch().


> +    } else {
> +        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +        netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
> +        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> +    }
> +
>      return 0;
>  }
>
> @@ -1640,41 +1600,6 @@ out:
>  static int
>  netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
>
> -static int
> -netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
> -                            struct netdev_stats *stats)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -
> -    ovs_mutex_lock(&dev->mutex);
> -
> -    rte_spinlock_lock(&dev->stats_lock);
> -    /* Supported Stats */
> -    stats->rx_packets += dev->stats.rx_packets;
> -    stats->tx_packets += dev->stats.tx_packets;
> -    stats->rx_dropped = dev->stats.rx_dropped;
> -    stats->tx_dropped += dev->stats.tx_dropped;
> -    stats->multicast = dev->stats.multicast;
> -    stats->rx_bytes = dev->stats.rx_bytes;
> -    stats->tx_bytes = dev->stats.tx_bytes;
> -    stats->rx_errors = dev->stats.rx_errors;
> -    stats->rx_length_errors = dev->stats.rx_length_errors;
> -
> -    stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
> -    stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
> -    stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
> -    stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
> -    stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
> -    stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
> -    stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;
> -
> -    rte_spinlock_unlock(&dev->stats_lock);
> -
> -    ovs_mutex_unlock(&dev->mutex);
> -
> -    return 0;
> -}
> -
>  static void
>  netdev_dpdk_convert_xstats(struct netdev_stats *stats,
>                             const struct rte_eth_xstats *xstats,
> @@ -1755,28 +1680,40 @@ netdev_dpdk_get_stats(const struct netdev *netdev,
> struct netdev_stats *stats)
>          return EPROTO;
>      }
>
> -    rte_xstats_len = rte_eth_xstats_get(dev->port_id, NULL, 0);
> -    if (rte_xstats_len > 0) {
> -        rte_xstats = dpdk_rte_mzalloc(sizeof(*rte_xstats) *
> rte_xstats_len);
> -        memset(rte_xstats, 0xff, sizeof(*rte_xstats) * rte_xstats_len);
> -        rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
> -                                            rte_xstats_len);
> -        if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
> -            netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_ret);
> +    /* Extended statistics are not yet available for vHost User PMD */
> +    if (dev->type == DPDK_DEV_ETH) {
> +        rte_xstats_len = rte_eth_xstats_get(dev->port_id, NULL, 0);
> +        if (rte_xstats_len > 0) {
> +            rte_xstats = dpdk_rte_mzalloc(sizeof(*rte_xstats)
> +                                          * rte_xstats_len);
> +            memset(rte_xstats, 0xff, sizeof(*rte_xstats) *
> rte_xstats_len);
> +            rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
> +                                                rte_xstats_len);
> +            if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
> +                netdev_dpdk_convert_xstats(stats, rte_xstats,
> rte_xstats_ret);
> +            }
> +            rte_free(rte_xstats);
> +        } else {
> +            VLOG_WARN("Can't get XSTATS counters for port: %i.",
> dev->port_id);
>          }
> -        rte_free(rte_xstats);
> -    } else {
> -        VLOG_WARN("Can't get XSTATS counters for port: %i.",
> dev->port_id);
>      }
>
>      stats->rx_packets = rte_stats.ipackets;
>      stats->tx_packets = rte_stats.opackets;
>      stats->rx_bytes = rte_stats.ibytes;
>      stats->tx_bytes = rte_stats.obytes;
> -    /* DPDK counts imissed as errors, but count them here as dropped
> instead */
> -    stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;
> -    stats->tx_errors = rte_stats.oerrors;
> -    stats->multicast = rte_stats.imcasts;
> +
> +    if (dev->type == DPDK_DEV_ETH) {
> +        /* DPDK counts imissed as errors, but count them here as dropped
> +         * instead */
> +        stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;
> +        stats->tx_errors = rte_stats.oerrors;
> +        stats->multicast = rte_stats.imcasts;
> +    } else {
> +        stats->rx_errors = UINT64_MAX;
> +        stats->tx_errors = UINT64_MAX;
> +        stats->multicast = UINT64_MAX;
> +    }
>
>      rte_spinlock_lock(&dev->stats_lock);
>      stats->tx_dropped = dev->stats.tx_dropped;
> @@ -1939,25 +1876,6 @@ netdev_dpdk_get_carrier(const struct netdev
> *netdev, bool *carrier)
>      return 0;
>  }
>
> -static int
> -netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
> -
> -    ovs_mutex_lock(&dev->mutex);
> -
> -    if (is_vhost_running(virtio_dev)) {
> -        *carrier = 1;
> -    } else {
> -        *carrier = 0;
> -    }
> -
> -    ovs_mutex_unlock(&dev->mutex);
> -
> -    return 0;
> -}
> -
>  static long long int
>  netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
>  {
> @@ -1993,6 +1911,7 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
>      dev->flags |= on;
>      dev->flags &= ~off;
>
> +
>

Minor: extra newline


>      if (dev->flags == *old_flagsp) {
>          return 0;
>      }
> @@ -2012,13 +1931,10 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
>              rte_eth_dev_stop(dev->port_id);
>          }
>      } else {
> -        /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and
> vhost is
> -         * running then change netdev's change_seq to trigger link state
> -         * update. */
> -        struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
> +        /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed then
> change
> +         * netdev's change_seq to trigger link state update. */
>
> -        if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
> -            && is_vhost_running(virtio_dev)) {
> +        if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))) {
>              netdev_change_seq_changed(&dev->up);
>
>              /* Clear statistics if device is getting up. */
> @@ -2115,7 +2031,7 @@ netdev_dpdk_set_admin_state(struct unixctl_conn
> *conn, int argc,
>
>      if (argc > 2) {
>          struct netdev *netdev = netdev_from_name(argv[1]);
> -        if (netdev && is_dpdk_class(netdev->netdev_class)) {
> +        if (netdev && is_dpdk_eth_class(netdev->netdev_class)) {
>

I think this will leave out ring devices.
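
For example, a sketch that also covers ring ports (netdev_dpdk_ring_construct
is the existing dpdkr constructor):

    static bool
    is_dpdk_eth_class(const struct netdev_class *class)
    {
        return class->construct == netdev_dpdk_construct
               || class->construct == netdev_dpdk_vhost_construct
               || class->construct == netdev_dpdk_ring_construct;
    }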


>              struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
>
>              ovs_mutex_lock(&dpdk_dev->mutex);
> @@ -2143,22 +2059,6 @@ netdev_dpdk_set_admin_state(struct unixctl_conn
> *conn, int argc,
>  }
>
>  /*
> - * Set virtqueue flags so that we do not receive interrupts.
> - */
> -static void
> -set_irq_status(struct virtio_net *virtio_dev)
> -{
> -    uint32_t i;
> -    uint64_t idx;
> -
> -    for (i = 0; i < virtio_dev->virt_qp_nb; i++) {
> -        idx = i * VIRTIO_QNUM;
> -        rte_vhost_enable_guest_notification(virtio_dev, idx + VIRTIO_RXQ,
> 0);
> -        rte_vhost_enable_guest_notification(virtio_dev, idx + VIRTIO_TXQ,
> 0);
> -    }
> -}
> -
> -/*
>   * Fixes mapping for vhost-user tx queues. Must be called after each
>   * enabling/disabling of queues and n_txq modifications.
>   */
> @@ -2199,62 +2099,6 @@ netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
>      rte_free(enabled_queues);
>  }
>
> -/*
> - * A new virtio-net device is added to a vhost port.
> - */
> -static int
> -new_device(struct virtio_net *virtio_dev)
> -{
> -    struct netdev_dpdk *dev;
> -    bool exists = false;
> -    int newnode = 0;
> -    long err = 0;
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    /* Add device to the vhost port with the same name as that passed
> down. */
> -    LIST_FOR_EACH(dev, list_node, &dpdk_list) {
> -        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
> -            uint32_t qp_num = virtio_dev->virt_qp_nb;
> -
> -            ovs_mutex_lock(&dev->mutex);
> -            /* Get NUMA information */
> -            err = get_mempolicy(&newnode, NULL, 0, virtio_dev,
> -                                MPOL_F_NODE | MPOL_F_ADDR);
> -            if (err) {
> -                VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
> -                        virtio_dev->ifname);
> -                newnode = dev->socket_id;
> -            }
> -
> -            dev->requested_socket_id = newnode;
> -            dev->requested_n_rxq = qp_num;
> -            dev->requested_n_txq = qp_num;
> -            netdev_request_reconfigure(&dev->up);
> -
> -            ovsrcu_set(&dev->virtio_dev, virtio_dev);
> -            exists = true;
> -
> -            /* Disable notifications. */
> -            set_irq_status(virtio_dev);
> -            netdev_change_seq_changed(&dev->up);
> -            ovs_mutex_unlock(&dev->mutex);
> -            break;
> -        }
> -    }
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    if (!exists) {
> -        VLOG_INFO("vHost Device '%s' %"PRIu64" can't be added - name not "
> -                  "found", virtio_dev->ifname, virtio_dev->device_fh);
> -
> -        return -1;
> -    }
> -
> -    VLOG_INFO("vHost Device '%s' %"PRIu64" has been added on numa node
> %i",
> -              virtio_dev->ifname, virtio_dev->device_fh, newnode);
> -    return 0;
> -}
> -
>  /* Clears mapping for all available queues of vhost interface. */
>  static void
>  netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
> @@ -2267,144 +2111,18 @@ netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
>      }
>  }
>
> -/*
> - * Remove a virtio-net device from the specific vhost port.  Use
> dev->remove
> - * flag to stop any more packets from being sent or received to/from a VM
> and
> - * ensure all currently queued packets have been sent/received before
> removing
> - *  the device.
> - */
> -static void
> -destroy_device(volatile struct virtio_net *virtio_dev)
> -{
> -    struct netdev_dpdk *dev;
> -    bool exists = false;
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> -        if (netdev_dpdk_get_virtio(dev) == virtio_dev) {
> -
> -            ovs_mutex_lock(&dev->mutex);
> -            virtio_dev->flags &= ~VIRTIO_DEV_RUNNING;
> -            ovsrcu_set(&dev->virtio_dev, NULL);
> -            /* Clear tx/rx queue settings. */
> -            netdev_dpdk_txq_map_clear(dev);
> -            dev->requested_n_rxq = NR_QUEUE;
> -            dev->requested_n_txq = NR_QUEUE;
> -            netdev_request_reconfigure(&dev->up);
> -
> -            netdev_change_seq_changed(&dev->up);
> -            ovs_mutex_unlock(&dev->mutex);
> -            exists = true;
> -            break;
> -        }
> -    }
> -
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    if (exists == true) {
> -        /*
> -         * Wait for other threads to quiesce after setting the
> 'virtio_dev'
> -         * to NULL, before returning.
> -         */
> -        ovsrcu_synchronize();
> -        /*
> -         * As call to ovsrcu_synchronize() will end the quiescent state,
> -         * put thread back into quiescent state before returning.
> -         */
> -        ovsrcu_quiesce_start();
> -        VLOG_INFO("vHost Device '%s' %"PRIu64" has been removed",
> -                  virtio_dev->ifname, virtio_dev->device_fh);
> -    } else {
> -        VLOG_INFO("vHost Device '%s' %"PRIu64" not found",
> virtio_dev->ifname,
> -                  virtio_dev->device_fh);
> -    }
> -}
> -
> -static int
> -vring_state_changed(struct virtio_net *virtio_dev, uint16_t queue_id,
> -                    int enable)
> -{
> -    struct netdev_dpdk *dev;
> -    bool exists = false;
> -    int qid = queue_id / VIRTIO_QNUM;
> -
> -    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
> -        return 0;
> -    }
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> -        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
> -            ovs_mutex_lock(&dev->mutex);
> -            if (enable) {
> -                dev->tx_q[qid].map = qid;
> -            } else {
> -                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
> -            }
> -            netdev_dpdk_remap_txqs(dev);
> -            exists = true;
> -            ovs_mutex_unlock(&dev->mutex);
> -            break;
> -        }
> -    }
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    if (exists) {
> -        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"
> -                  PRIu64" changed to \'%s\'", queue_id, qid,
> -                  virtio_dev->ifname, virtio_dev->device_fh,
> -                  (enable == 1) ? "enabled" : "disabled");
> -    } else {
> -        VLOG_INFO("vHost Device '%s' %"PRIu64" not found",
> virtio_dev->ifname,
> -                  virtio_dev->device_fh);
> -        return -1;
> -    }
> -
> -    return 0;
> -}
> -
> -struct virtio_net *
> -netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
> -{
> -    return ovsrcu_get(struct virtio_net *, &dev->virtio_dev);
> -}
> -
>  struct ingress_policer *
>  netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
>  {
>      return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
>  }
>
> -/*
> - * These callbacks allow virtio-net devices to be added to vhost ports
> when
> - * configuration has been fully complete.
> - */
> -static const struct virtio_net_device_ops virtio_net_device_ops =
> -{
> -    .new_device =  new_device,
> -    .destroy_device = destroy_device,
> -    .vring_state_changed = vring_state_changed
> -};
> -
> -static void *
> -start_vhost_loop(void *dummy OVS_UNUSED)
> -{
> -     pthread_detach(pthread_self());
> -     /* Put the vhost thread into quiescent state. */
> -     ovsrcu_quiesce_start();
> -     rte_vhost_driver_session_start();
> -     return NULL;
> -}
> -
>  static int
>  dpdk_vhost_class_init(void)
>  {
> -    rte_vhost_driver_callback_register(&virtio_net_device_ops);
> -    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> -                            | 1ULL << VIRTIO_NET_F_HOST_TSO6
> -                            | 1ULL << VIRTIO_NET_F_CSUM);
> -
> -    ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
> +    rte_eth_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> +                                | 1ULL << VIRTIO_NET_F_HOST_TSO6
> +                                | 1ULL << VIRTIO_NET_F_CSUM);
>      return 0;
>  }
>
> @@ -2515,7 +2233,16 @@ netdev_dpdk_ring_send(struct netdev *netdev, int
> qid,
>          dp_packet_rss_invalidate(pkts[i]);
>      }
>
> +    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
> +        qid = qid % dev->up.n_txq;
> +        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +    }
> +
>      netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
> +
> +    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
> +        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> +    }
>      return 0;
>  }
>
> @@ -2806,7 +2533,6 @@ static int
>  netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
>      int err = 0;
>
>      ovs_mutex_lock(&dpdk_mutex);
> @@ -2822,6 +2548,8 @@ netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
>
>      netdev_dpdk_remap_txqs(dev);
>
> +    dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;
> +
>      if (dev->requested_socket_id != dev->socket_id) {
>          dev->socket_id = dev->requested_socket_id;
>          /* Change mempool to new NUMA Node */
> @@ -2832,10 +2560,6 @@ netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
>          }
>      }
>
> -    if (virtio_dev) {
> -        virtio_dev->flags |= VIRTIO_DEV_RUNNING;
> -    }
> -
>      ovs_mutex_unlock(&dev->mutex);
>      ovs_mutex_unlock(&dpdk_mutex);
>
> @@ -3329,12 +3053,12 @@ static const struct netdev_class OVS_UNUSED
> dpdk_vhost_class =
>          NULL,
>          NULL,
>          netdev_dpdk_vhost_send,
> -        netdev_dpdk_vhost_get_carrier,
> -        netdev_dpdk_vhost_get_stats,
> +        netdev_dpdk_get_carrier,
> +        netdev_dpdk_get_stats,
>          NULL,
>          NULL,
>          netdev_dpdk_vhost_reconfigure,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_rxq_recv);
>
>  void
>  netdev_dpdk_register(void)
> --
> 2.4.3
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
>
Ciara Loftus July 26, 2016, 4:15 p.m. UTC | #2
> Subject: Re: [ovs-dev] [PATCH 3/4] netdev-dpdk: Add vHost User PMD

> 

> Thanks for the patch

> This needs a little bit of rebasing, I did it myself to review, but it'd be nice to

> have an updated version.


I've submitted a new set here: http://openvswitch.org/pipermail/dev/2016-July/076245.html

> I like the simplification that this brings especially to the fast path.

> If we merge this before we merge the DPDK 16.07 we won't have to deal

> with the vid change.


Sounds good. Thanks for the review. Replies inline.

Thanks,
Ciara

> Thanks,

> Daniele

> 

> 2016-07-15 7:26 GMT-07:00 Ciara Loftus <ciara.loftus@intel.com>:

> DPDK 16.04 introduces the vHost PMD which allows 'dpdkvhostuser' ports

> to be controlled by the librte_ether API, like physical 'dpdk' ports and

> IVSHM 'dpdkr' ports. This commit integrates this PMD into OVS and

> removes direct calls to the librte_vhost DPDK library.

> 

> This commit removes extended statistics support for vHost User ports

> until such a time that this becomes available in the vHost PMD in a

> DPDK release supported by OVS.

> 

> Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>

> ---

>  INSTALL.DPDK.md   |  10 +

>  NEWS              |   2 +

>  lib/netdev-dpdk.c | 856 ++++++++++++++++++-----------------------------------

> -

>  3 files changed, 302 insertions(+), 566 deletions(-)

> 

> diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md

> index 5407794..29b6f91 100644

> --- a/INSTALL.DPDK.md

> +++ b/INSTALL.DPDK.md

> @@ -561,6 +561,16 @@ can be found in [Vhost Walkthrough].

> 

>      http://dpdk.org/doc/guides/rel_notes/release_16_04.html

> 

> +  - dpdk, dpdkr and dpdkvhostuser ports are 'eth' type ports in the context

> of

> +    DPDK as they are all managed by the rte_ether API. This means that they

> +    adhere to the DPDK configuration option CONFIG_RTE_MAX_ETHPORTS

> which by

> +    default is set to 32. This means by default the combined total number of

> +    dpdk, dpdkr and dpdkvhostuser ports allowable in OVS with DPDK is 32.

> This

> +    value can be changed if desired by modifying the configuration file in

> +    DPDK, or by overriding the default value on the command line when

> building

> +    DPDK. eg.

> +

> +        `make install CONFIG_RTE_MAX_ETHPORTS=64`

> 

> Again, I hope this doesn't cause problems to a lot of users.  I'd like to see the

> limit increased by default, but I think we can merge this patch as it is.


Agreed.

> 

> 

>  Bug Reporting:

>  --------------

> diff --git a/NEWS b/NEWS

> index aa1b915..b3791ed 100644

> --- a/NEWS

> +++ b/NEWS

> @@ -59,6 +59,8 @@ Post-v2.5.0

>         node that device memory is located on if

> CONFIG_RTE_LIBRTE_VHOST_NUMA

>         is enabled in DPDK.

>       * Remove dpdkvhostcuse port type.

> +     * vHost PMD integration brings vhost-user ports under control of the

> +       rte_ether DPDK API.

>     - Increase number of registers to 16.

>     - ovs-benchmark: This utility has been removed due to lack of use and

>       bitrot.

> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c

> index b4f82af..5de806a 100644

> --- a/lib/netdev-dpdk.c

> +++ b/lib/netdev-dpdk.c

> @@ -56,6 +56,7 @@

>  #include "unixctl.h"

> 

>  #include "rte_config.h"

> +#include "rte_eth_vhost.h"

>  #include "rte_mbuf.h"

>  #include "rte_meter.h"

>  #include "rte_virtio_net.h"

> @@ -141,6 +142,11 @@ static char *vhost_sock_dir = NULL;   /* Location of

> vhost-user sockets */

> 

>  #define VHOST_ENQ_RETRY_NUM 8

> 

> +/* Array that tracks the used & unused vHost user driver IDs */

> +static unsigned int vhost_drv_ids[RTE_MAX_ETHPORTS];

> +/* Maximum string length allowed to provide to rte_eth_attach function */

> +#define DEVARGS_MAX (RTE_ETH_NAME_MAX_LEN + PATH_MAX + 18)

> +

> 

> I think this is not needed if we use xasprintf() below.

Removed in v2

> 

>  static const struct rte_eth_conf port_conf = {

>      .rxmode = {

>          .mq_mode = ETH_MQ_RX_RSS,

> @@ -353,12 +359,15 @@ struct netdev_dpdk {

>       * always true.  */

>      bool txq_needs_locking;

> 

> -    /* virtio-net structure for vhost device */

> -    OVSRCU_TYPE(struct virtio_net *) virtio_dev;

> +    /* Number of virtqueue pairs reported by the guest */

> +    uint32_t vhost_qp_nb;

> 

>      /* Identifier used to distinguish vhost devices from each other */

>      char vhost_id[PATH_MAX];

> 

> +    /* ID of vhost user port given to the PMD driver */

> +    unsigned int vhost_pmd_id;

> +

>      /* In dpdk_list. */

>      struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);

> 

> @@ -389,16 +398,25 @@ struct netdev_rxq_dpdk {

>  static bool dpdk_thread_is_pmd(void);

> 

>  static int netdev_dpdk_construct(struct netdev *);

> +static int netdev_dpdk_vhost_construct(struct netdev *);

> 

>  struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);

> 

>  struct ingress_policer *

>  netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);

> 

> +void link_status_changed_callback(uint8_t port_id,

> +        enum rte_eth_event_type type OVS_UNUSED, void *param

> OVS_UNUSED);

> +void vring_state_changed_callback(uint8_t port_id,

> +        enum rte_eth_event_type type OVS_UNUSED, void *param

> OVS_UNUSED);

> 

> Minor: I think we can avoid OVS_UNUSED on the declaration and keep it only

> on the definition.

> Also, these two functions can be static.

Ok

> 

> +static void netdev_dpdk_remap_txqs(struct netdev_dpdk *dev);

> +static void netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev);

> +

>  static bool

> -is_dpdk_class(const struct netdev_class *class)

> +is_dpdk_eth_class(const struct netdev_class *class)

>  {

> -    return class->construct == netdev_dpdk_construct;

> +    return ((class->construct == netdev_dpdk_construct) ||

> +            (class->construct == netdev_dpdk_vhost_construct));

>  }

> 

>  /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically

> @@ -622,8 +640,13 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk

> *dev, int n_rxq, int n_txq)

>              continue;

>          }

> 

> -        dev->up.n_rxq = n_rxq;

> -        dev->up.n_txq = n_txq;

> +        /* Only set n_*xq for physical devices. vHost User devices will set

> +         * this value correctly using info from the virtio backend.

> +         */

> +        if (dev->type == DPDK_DEV_ETH) {

> +            dev->up.n_rxq = n_rxq;

> +            dev->up.n_txq = n_txq;

> +        }

> 

>          return 0;

>      }

> @@ -647,8 +670,14 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)

> OVS_REQUIRES(dpdk_mutex)

> 

>      rte_eth_dev_info_get(dev->port_id, &info);

> 

> -    n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);

> -    n_txq = MIN(info.max_tx_queues, dev->up.n_txq);

> +    if (dev->type == DPDK_DEV_VHOST) {

> +        /* We don't know how many queues QEMU will use so set up the max

> */

> +        n_rxq = OVS_VHOST_MAX_QUEUE_NUM;

> +        n_txq = OVS_VHOST_MAX_QUEUE_NUM;

> 

> It's not very clear to me how multiqueue is handled now.

> Do we really configure 1024 queues on the PMD? Wouldn't it be simpler to

> call again this function from _reconfigure?

This is where it gets complicated. We need to configure the max due to the way the PMD works. It's kind of a workaround. I'll do my best to explain:

The value we use for n_*xq is used in DPDK's implementation of new_device() (snip below):

	/* Snip from rte_eth_vhost.c's new_device(); '*' stands for rx/tx,
	 * i.e. nb_rx_queues/rx_queues and nb_tx_queues/tx_queues. */
	struct vhost_queue *vq;

	for (i = 0; i < eth_dev->data->n_*xq; i++) {
		vq = eth_dev->data->*x_queues[i];
		if (vq == NULL)
			continue;
		/* Only the first n_*xq queues ever get allow_queuing set. */
		rte_atomic32_set(&vq->allow_queuing, 1);
	}

The flag 'allow_queuing' is set for n_*xq virtqueues. This flag is checked before every enqueue/dequeue.

Take, for example, the case where we don't initialise the max but instead use MIN(info.max_rx_queues, dev->up.n_rxq) like physical devices. (This value usually ends up as dev->up.n_rxq.)
When we boot the VM, usually only one queue is enabled until we enable more, for example with ethtool. So when we hit new_device(), dev->up.n_*xq = 1 and 'allow_queuing' is set on only one queue. Later on we may enable more queues, e.g. via ethtool. Another call to new_device() is not guaranteed, so we may never set 'allow_queuing' for the newly enabled queues. If that's the case, when we try to enqueue/dequeue on those queues, 'allow_queuing' is still 0, so we can never send/receive and multiqueue doesn't work.

See rte_eth_dev_*x_queue_config() in rte_ethdev.c and new_device() in rte_eth_vhost.c.

Let me know if you need further clarification, or if you have any ideas for improving this, as I know it's not ideal.

> 

> +    } else {

> +        n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);

> +        n_txq = MIN(info.max_tx_queues, dev->up.n_txq);

> +    }

> 

>      diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq);

>      if (diag) {

> @@ -715,6 +744,85 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *dev,

> unsigned int n_txqs)

>      }

>  }

> 

> +void

> +link_status_changed_callback(uint8_t port_id,

> +                             enum rte_eth_event_type type OVS_UNUSED,

> +                             void *param OVS_UNUSED)

> +{

> +    struct netdev_dpdk *dev;

> +    int socket_id = -1;

> +

> +    ovs_mutex_lock(&dpdk_mutex);

> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {

> +        if (port_id == dev->port_id) {

> +            ovs_mutex_lock(&dev->mutex);

> +            check_link_status(dev);

> +            if (dev->link.link_status == ETH_LINK_UP) {

> +                /* new device */

> +                /* Get NUMA information */

> +                socket_id = rte_eth_dev_socket_id(dev->port_id);

> +                if (socket_id != -1 && socket_id != dev->socket_id) {

> +                    dev->requested_socket_id = socket_id;

> +                }

> +                netdev_request_reconfigure(&dev->up);

> +                netdev_change_seq_changed(&dev->up);

> +                VLOG_INFO("vHost Device '%s' has been added on numa node %i",

> +                          dev->vhost_id, socket_id);

> +            } else {

> +                /* destroy device */

> +                /* Clear tx/rx queue settings. */

> +                netdev_dpdk_txq_map_clear(dev);

> +                netdev_request_reconfigure(&dev->up);

> +                netdev_change_seq_changed(&dev->up);

> +                VLOG_INFO("vHost Device '%s' has been removed", dev-

> >vhost_id);

> +            }

> +            ovs_mutex_unlock(&dev->mutex);

> +            break;

> +        }

> +    }

> +

> +    ovs_mutex_unlock(&dpdk_mutex);

> +

> +    return;

> +}

> +

> +void

> +vring_state_changed_callback(uint8_t port_id,

> +                             enum rte_eth_event_type type OVS_UNUSED,

> +                             void *param OVS_UNUSED)

> +{

> +    struct netdev_dpdk *dev;

> +    struct rte_eth_vhost_queue_event event;

> +    int err = 0;

> +

> +    err = rte_eth_vhost_get_queue_event(port_id, &event);

> +    if (err || (event.rx)) {

> 

> Minor: extra parentheses

> 

> +        return;

> +    }

> +

> +    ovs_mutex_lock(&dpdk_mutex);

> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {

> +        if (port_id == dev->port_id) {

> +            ovs_mutex_lock(&dev->mutex);

> +            if (event.enable) {

> +                dev->tx_q[event.queue_id].map = event.queue_id;

> +                dev->vhost_qp_nb++;

> +            } else {

> +                dev->tx_q[event.queue_id].map = OVS_VHOST_QUEUE_DISABLED;

> +                dev->vhost_qp_nb--;

> +            }

> +            dev->requested_n_rxq = dev->vhost_qp_nb;

> +            dev->requested_n_txq = dev->vhost_qp_nb;

> +            netdev_request_reconfigure(&dev->up);

> 

> On master vring_state_changed() doesn't call _reconfigure(), but it just

> updates the map.

> Can we keep the same behavior?

vring_state_changed now tells us how many queues we need to configure; previously this was determined in new_device(), followed by a call to reconfigure. Since we change requested_n_rxq/requested_n_txq here, we need the reconfigure here too.
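
To spell out the intended flow (the function names are the ones in this patch; the sequencing is how I understand it, so treat it as a sketch rather than a spec):

    /* Guest enables queue N:
     *   PMD raises RTE_ETH_EVENT_QUEUE_STATE
     *     -> vring_state_changed_callback():
     *            dev->tx_q[N].map = N;
     *            dev->vhost_qp_nb++;
     *            dev->requested_n_rxq = dev->requested_n_txq = dev->vhost_qp_nb;
     *            netdev_request_reconfigure(&dev->up);
     *     -> the datapath later calls netdev_dpdk_vhost_reconfigure(), which
     *        applies requested_n_rxq/n_txq and redistributes the rxqs among
     *        the PMD threads.
     * Updating only the map, as on master, would leave the datapath unaware
     * that the number of usable queues has changed. */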

> 

> +            ovs_mutex_unlock(&dev->mutex);

> +            break;

> +        }

> +    }

> +    ovs_mutex_unlock(&dpdk_mutex);

> +

> +    return;

> +}

> +

>  static int

>  netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,

>                   enum dpdk_dev_type type)

> @@ -724,6 +832,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,

>      int sid;

>      int err = 0;

>      uint32_t buf_size;

> +    unsigned int nr_q = 0;

> 

>      ovs_mutex_init(&dev->mutex);

>      ovs_mutex_lock(&dev->mutex);

> @@ -733,11 +842,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,

>      /* If the 'sid' is negative, it means that the kernel fails

>       * to obtain the pci numa info.  In that situation, always

>       * use 'SOCKET0'. */

> -    if (type == DPDK_DEV_ETH) {

> -        sid = rte_eth_dev_socket_id(port_no);

> -    } else {

> -        sid = rte_lcore_to_socket_id(rte_get_master_lcore());

> -    }

> +    sid = rte_eth_dev_socket_id(port_no);

> 

>      dev->socket_id = sid < 0 ? SOCKET0 : sid;

>      dev->requested_socket_id = dev->socket_id;

> @@ -767,19 +872,23 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,

>      netdev->n_txq = NR_QUEUE;

>      dev->requested_n_rxq = netdev->n_rxq;

>      dev->requested_n_txq = netdev->n_txq;

> +    dev->vhost_qp_nb = 0;

> 

> -    if (type == DPDK_DEV_ETH) {

> -        err = dpdk_eth_dev_init(dev);

> -        if (err) {

> -            goto unlock;

> -        }

> -        netdev_dpdk_alloc_txq(dev, netdev->n_txq);

> -        dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;

> -    } else {

> -        netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);

> -        dev->txq_needs_locking = true;

> -        /* Enable DPDK_DEV_VHOST device and set promiscuous mode flag. */

> -        dev->flags = NETDEV_UP | NETDEV_PROMISC;

> +    err = dpdk_eth_dev_init(dev);

> +    if (err) {

> +        goto unlock;

> +    }

> +    nr_q = (type == DPDK_DEV_ETH ? 1 : RTE_MAX_QUEUES_PER_PORT);

> +    netdev_dpdk_alloc_txq(dev, nr_q);

> +    dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;

> +

> +    if (type == DPDK_DEV_VHOST) {

> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_QUEUE_STATE,

> +                                     (void*)vring_state_changed_callback,

> 

> Can we avoid the cast to "void *"? It will cause API changes to go undetected

Sure
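
For reference, something along these lines should work without the cast (sketch only; it assumes the DPDK 16.04 callback type is 'void (*rte_eth_dev_cb_fn)(uint8_t port_id, enum rte_eth_event_type event, void *cb_arg)' - worth double-checking against rte_ethdev.h), since the handlers above already have that signature:

    /* Register the handlers directly so the compiler checks them against
     * rte_eth_dev_cb_fn instead of hiding mismatches behind a cast. */
    rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_QUEUE_STATE,
                                  vring_state_changed_callback, NULL);
    rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,
                                  link_status_changed_callback, NULL);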

> 

> +                                      NULL);

> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,

> +                                      (void*)link_status_changed_callback,

> 

> same

> 

> +                                      NULL);

>      }

> 

>      ovs_list_push_back(&dpdk_list, &dev->list_node);

> @@ -810,17 +919,48 @@ dpdk_dev_parse_name(const char dev_name[], const char prefix[],

>      }

>  }

> 

> +/* When attaching a vhost device to DPDK, a unique name of the format

> + * 'eth_vhostX' is expected, where X is a unique identifier.

> + * get_vhost_drv_id returns a valid X value to provide to DPDK.

> + */

> +static int

> +get_vhost_drv_id(void)

> +{

> +    int i = 0;

> +

> +    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {

> +        if (vhost_drv_ids[i] == 0) {

> +            return i;

> +        }

> +    }

> +

> +    return -1;

> +}

> +

> +static void

> +set_vhost_drv_id(int id, int val)

> +{

> +    vhost_drv_ids[id] = val;

> +}

> +

>  static int

>  netdev_dpdk_vhost_construct(struct netdev *netdev)

>  {

>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

>      const char *name = netdev->name;

>      int err;

> +    uint8_t port_no = 0;

> +    char devargs[DEVARGS_MAX];

> 

> If we use xasprintf() we don't need to provide a fixed sized buffer.

Ok, changed in v2
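
A minimal sketch of the xasprintf() approach (illustrative only, not the actual v2 code):

    /* Build the devargs string on the heap so no DEVARGS_MAX sizing is
     * needed; free it once the attach has completed. */
    char *devargs = xasprintf("eth_vhost%u,iface=%s,queues=%i",
                              driver_id, dev->vhost_id,
                              RTE_MAX_QUEUES_PER_PORT);
    err = rte_eth_dev_attach(devargs, &port_no);
    free(devargs);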

> 

> +    int driver_id = 0;

> +

> +    if (rte_eal_init_ret) {

> +        return rte_eal_init_ret;

> +    }

> 

>      /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in

>       * the file system. '/' or '\' would traverse directories, so they're not

>       * acceptable in 'name'. */

> -    if (strchr(name, '/') || strchr(name, '\\')) {

> +    if (strchr(name, '/') || strchr(name, '\\') || strchr(name, ',')) {

>          VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "

>                   "A valid name must not include '/' or '\\'",

>                   name);

> @@ -837,18 +977,32 @@ netdev_dpdk_vhost_construct(struct netdev *netdev)

>       */

>      snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s",

>               vhost_sock_dir, name);

> +    driver_id = get_vhost_drv_id();

> +    if (driver_id == -1) {

> +        VLOG_ERR("Unable to create vhost-user device %s - too many vhost-

> user"

> +                 "devices registered with PMD", dev->vhost_id);

> +        err = ENODEV;

> +        goto out;

> +

> +    } else {

> +        snprintf(devargs, sizeof(devargs), "eth_vhost%u,iface=%s,queues=%i",

> +                 driver_id, dev->vhost_id, RTE_MAX_QUEUES_PER_PORT);

> +        err = rte_eth_dev_attach(devargs, &port_no);

> +    }

> 

> -    err = rte_vhost_driver_register(dev->vhost_id);

>      if (err) {

> -        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",

> +        VLOG_ERR("Failed to attach vhost-user device %s to DPDK",

>                   dev->vhost_id);

>      } else {

>          fatal_signal_add_file_to_unlink(dev->vhost_id);

>          VLOG_INFO("Socket %s created for vhost-user port %s\n",

>                    dev->vhost_id, name);

> -        err = netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);

> +        dev->vhost_pmd_id = driver_id;

> +        set_vhost_drv_id(driver_id, 1);

> +        err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_VHOST);

>      }

> 

> +out:

>      ovs_mutex_unlock(&dpdk_mutex);

>      return err;

>  }

> @@ -876,15 +1030,11 @@ netdev_dpdk_construct(struct netdev *netdev)

>  }

> 

>  static void

> -netdev_dpdk_destruct(struct netdev *netdev)

> +dpdk_destruct_helper(struct netdev_dpdk *dev)

>  {

> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> -

> -    ovs_mutex_lock(&dev->mutex);

>      rte_eth_dev_stop(dev->port_id);

>      free(ovsrcu_get_protected(struct ingress_policer *,

>                                &dev->ingress_policer));

> -    ovs_mutex_unlock(&dev->mutex);

> 

>      ovs_mutex_lock(&dpdk_mutex);

>      rte_free(dev->tx_q);

> @@ -894,35 +1044,31 @@ netdev_dpdk_destruct(struct netdev *netdev)

>  }

> 

>  static void

> +netdev_dpdk_destruct(struct netdev *netdev)

> +{

> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> +

> +    ovs_mutex_lock(&dev->mutex);

> +    dpdk_destruct_helper(dev);

> +    ovs_mutex_unlock(&dev->mutex);

> +}

> +

> +static void

>  netdev_dpdk_vhost_destruct(struct netdev *netdev)

>  {

>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> 

> -    /* Guest becomes an orphan if still attached. */

> -    if (netdev_dpdk_get_virtio(dev) != NULL) {

> -        VLOG_ERR("Removing port '%s' while vhost device still attached.",

> -                 netdev->name);

> -        VLOG_ERR("To restore connectivity after re-adding of port, VM on

> socket"

> -                 " '%s' must be restarted.",

> -                 dev->vhost_id);

> -    }

> +    ovs_mutex_lock(&dev->mutex);

> 

> -    if (rte_vhost_driver_unregister(dev->vhost_id)) {

> -        VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);

> +    if (rte_eth_dev_detach(dev->port_id, dev->vhost_id)) {

> +        VLOG_ERR("Error removing vhost device %s", dev->vhost_id);

>      } else {

>          fatal_signal_remove_file_to_unlink(dev->vhost_id);

>      }

> +    set_vhost_drv_id(dev->vhost_pmd_id, 0);

> 

> -    ovs_mutex_lock(&dev->mutex);

> -    free(ovsrcu_get_protected(struct ingress_policer *,

> -                              &dev->ingress_policer));

> +    dpdk_destruct_helper(dev);

>      ovs_mutex_unlock(&dev->mutex);

> -

> -    ovs_mutex_lock(&dpdk_mutex);

> -    rte_free(dev->tx_q);

> -    ovs_list_remove(&dev->list_node);

> -    dpdk_mp_put(dev->dpdk_mp);

> -    ovs_mutex_unlock(&dpdk_mutex);

>  }

> 

> With this refactoring 'dpdk_mutex' is nested inside 'dev->mutex'.  In the rest

> of the code 'dev->mutex' nests inside 'dpdk_mutex'.

Rearranged in next version
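
Roughly, the rearranged destruct path will keep the established order, taking 'dpdk_mutex' before 'dev->mutex' (sketch only, not the final code):

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);
    /* ... stop the port, free dev->tx_q, remove dev from dpdk_list,
     *     release the mempool ... */
    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);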

> 

> 

>  static void

> @@ -1113,114 +1259,6 @@ ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,

>      return cnt;

>  }

> 

> -static bool

> -is_vhost_running(struct virtio_net *virtio_dev)

> -{

> -    return (virtio_dev != NULL && (virtio_dev->flags & VIRTIO_DEV_RUNNING));

> -}

> -

> -static inline void

> -netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,

> -                                          unsigned int packet_size)

> -{

> -    /* Hard-coded search for the size bucket. */

> -    if (packet_size < 256) {

> -        if (packet_size >= 128) {

> -            stats->rx_128_to_255_packets++;

> -        } else if (packet_size <= 64) {

> -            stats->rx_1_to_64_packets++;

> -        } else {

> -            stats->rx_65_to_127_packets++;

> -        }

> -    } else {

> -        if (packet_size >= 1523) {

> -            stats->rx_1523_to_max_packets++;

> -        } else if (packet_size >= 1024) {

> -            stats->rx_1024_to_1522_packets++;

> -        } else if (packet_size < 512) {

> -            stats->rx_256_to_511_packets++;

> -        } else {

> -            stats->rx_512_to_1023_packets++;

> -        }

> -    }

> -}

> -

> -static inline void

> -netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,

> -                                     struct dp_packet **packets, int count,

> -                                     int dropped)

> -{

> -    int i;

> -    unsigned int packet_size;

> -    struct dp_packet *packet;

> -

> -    stats->rx_packets += count;

> -    stats->rx_dropped += dropped;

> -    for (i = 0; i < count; i++) {

> -        packet = packets[i];

> -        packet_size = dp_packet_size(packet);

> -

> -        if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {

> -            /* This only protects the following multicast counting from

> -             * too short packets, but it does not stop the packet from

> -             * further processing. */

> -            stats->rx_errors++;

> -            stats->rx_length_errors++;

> -            continue;

> -        }

> -

> -        netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);

> -

> -        struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);

> -        if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {

> -            stats->multicast++;

> -        }

> -

> -        stats->rx_bytes += packet_size;

> -    }

> -}

> -

> -/*

> - * The receive path for the vhost port is the TX path out from guest.

> - */

> -static int

> -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,

> -                           struct dp_packet **packets, int *c)

> -{

> -    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

> -    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

> -    int qid = rxq->queue_id;

> -    struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);

> -    uint16_t nb_rx = 0;

> -    uint16_t dropped = 0;

> -

> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev)

> -                     || !(dev->flags & NETDEV_UP))) {

> -        return EAGAIN;

> -    }

> -

> -    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM + VIRTIO_TXQ,

> -                                    dev->dpdk_mp->mp,

> -                                    (struct rte_mbuf **)packets,

> -                                    NETDEV_MAX_BURST);

> -    if (!nb_rx) {

> -        return EAGAIN;

> -    }

> -

> -    if (policer) {

> -        dropped = nb_rx;

> -        nb_rx = ingress_policer_run(policer, (struct rte_mbuf **)packets, nb_rx);

> -        dropped -= nb_rx;

> -    }

> -

> -    rte_spinlock_lock(&dev->stats_lock);

> -    netdev_dpdk_vhost_update_rx_counters(&dev->stats, packets, nb_rx, dropped);

> -    rte_spinlock_unlock(&dev->stats_lock);

> -

> -    *c = (int) nb_rx;

> -    return 0;

> -}

> -

>  static int

>  netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet **packets,

>                       int *c)

> @@ -1273,85 +1311,6 @@ netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts,

>      return cnt;

>  }

> 

> -static inline void

> -netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,

> -                                     struct dp_packet **packets,

> -                                     int attempted,

> -                                     int dropped)

> -{

> -    int i;

> -    int sent = attempted - dropped;

> -

> -    stats->tx_packets += sent;

> -    stats->tx_dropped += dropped;

> -

> -    for (i = 0; i < sent; i++) {

> -        stats->tx_bytes += dp_packet_size(packets[i]);

> -    }

> -}

> -

> -static void

> -__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,

> -                         struct dp_packet **pkts, int cnt,

> -                         bool may_steal)

> -{

> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> -    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

> -    struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;

> -    unsigned int total_pkts = cnt;

> -    unsigned int qos_pkts = cnt;

> -    int retries = 0;

> -

> -    qid = dev->tx_q[qid % netdev->n_txq].map;

> -

> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0

> -                     || !(dev->flags & NETDEV_UP))) {

> -        rte_spinlock_lock(&dev->stats_lock);

> -        dev->stats.tx_dropped+= cnt;

> -        rte_spinlock_unlock(&dev->stats_lock);

> -        goto out;

> -    }

> -

> -    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> -

> -    /* Check has QoS has been configured for the netdev */

> -    cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt);

> -    qos_pkts -= cnt;

> -

> -    do {

> -        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;

> -        unsigned int tx_pkts;

> -

> -        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid,

> -                                          cur_pkts, cnt);

> -        if (OVS_LIKELY(tx_pkts)) {

> -            /* Packets have been sent.*/

> -            cnt -= tx_pkts;

> -            /* Prepare for possible retry.*/

> -            cur_pkts = &cur_pkts[tx_pkts];

> -        } else {

> -            /* No packets sent - do not retry.*/

> -            break;

> -        }

> -    } while (cnt && (retries++ < VHOST_ENQ_RETRY_NUM));

> -

> -    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

> -

> -    rte_spinlock_lock(&dev->stats_lock);

> -    cnt += qos_pkts;

> -    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, cnt);

> -    rte_spinlock_unlock(&dev->stats_lock);

> -

> -out:

> -    if (may_steal) {

> -        int i;

> -

> -        for (i = 0; i < total_pkts; i++) {

> -            dp_packet_delete(pkts[i]);

> -        }

> -    }

> -}

> -

>  /* Tx function. Transmit packets indefinitely */

>  static void

>  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,

> @@ -1408,17 +1367,13 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,

>          newcnt++;

>      }

> 

> -    if (dev->type == DPDK_DEV_VHOST) {

> -        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs, newcnt, true);

> -    } else {

> -        unsigned int qos_pkts = newcnt;

> +    unsigned int qos_pkts = newcnt;

> 

> -        /* Check if QoS has been configured for this netdev. */

> -        newcnt = netdev_dpdk_qos_run__(dev, mbufs, newcnt);

> +    /* Check if QoS has been configured for this netdev. */

> +    newcnt = netdev_dpdk_qos_run__(dev, mbufs, newcnt);

> 

> -        dropped += qos_pkts - newcnt;

> -        netdev_dpdk_eth_tx_burst(dev, qid, mbufs, newcnt);

> -    }

> +    dropped += qos_pkts - newcnt;

> +    netdev_dpdk_eth_tx_burst(dev, qid, mbufs, newcnt);

> 

>      if (OVS_UNLIKELY(dropped)) {

>          rte_spinlock_lock(&dev->stats_lock);

> @@ -1431,44 +1386,12 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,

>      }

>  }

> 

> -static int

> -netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,

> -                 int cnt, bool may_steal)

> -{

> -    if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {

> -        int i;

> -

> -        dpdk_do_tx_copy(netdev, qid, pkts, cnt);

> -        if (may_steal) {

> -            for (i = 0; i < cnt; i++) {

> -                dp_packet_delete(pkts[i]);

> -            }

> -        }

> -    } else {

> -        int i;

> -

> -        for (i = 0; i < cnt; i++) {

> -            int cutlen = dp_packet_get_cutlen(pkts[i]);

> -

> -            dp_packet_set_size(pkts[i], dp_packet_size(pkts[i]) - cutlen);

> -            dp_packet_reset_cutlen(pkts[i]);

> -        }

> -        __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);

> -    }

> -    return 0;

> -}

> -

>  static inline void

>  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,

>                     struct dp_packet **pkts, int cnt, bool may_steal)

>  {

>      int i;

> 

> -    if (OVS_UNLIKELY(dev->txq_needs_locking)) {

> -        qid = qid % dev->up.n_txq;

> -        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> -    }

> -

> 

> Did you forget to remove the rte_spinlock_unlock() from this function?

Yes :) Thanks for catching that.

> 

>      if (OVS_UNLIKELY(!may_steal ||

>                       pkts[0]->source != DPBUF_DPDK)) {

>          struct netdev *netdev = &dev->up;

> @@ -1543,7 +1466,44 @@ netdev_dpdk_eth_send(struct netdev *netdev, int qid,

>  {

>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> 

> +    if (OVS_UNLIKELY(dev->txq_needs_locking)) {

> +        qid = qid % dev->up.n_txq;

> +        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> +    }

> +

>      netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);

> +

> +    if (OVS_UNLIKELY(dev->txq_needs_locking)) {

> +        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

> +    }

> +

> +    return 0;

> +}

> +

> +static int

> +netdev_dpdk_vhost_send(struct netdev *netdev, int qid,

> +                     struct dp_packet **pkts, int cnt, bool may_steal)

> +{

> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> +

> +    qid = dev->tx_q[qid % netdev->n_txq].map;

> +    if (qid == -1) {

> +        rte_spinlock_lock(&dev->stats_lock);

> +        dev->stats.tx_dropped+= cnt;

> +        rte_spinlock_unlock(&dev->stats_lock);

> +        if (may_steal) {

> +            int i;

> +

> +            for (i = 0; i < cnt; i++) {

> +                dp_packet_delete(pkts[i]);

> +            }

> +        }

> 

> It wasn't on master when this patch was sent, but now we can use

> dp_packet_delete_batch().

Ok

> 

> +    } else {

> +        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> +        netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);

> +        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

> +    }

> +

>      return 0;

>  }

> 

> @@ -1640,41 +1600,6 @@ out:

>  static int

>  netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);

> 

> -static int

> -netdev_dpdk_vhost_get_stats(const struct netdev *netdev,

> -                            struct netdev_stats *stats)

> -{

> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> -

> -    ovs_mutex_lock(&dev->mutex);

> -

> -    rte_spinlock_lock(&dev->stats_lock);

> -    /* Supported Stats */

> -    stats->rx_packets += dev->stats.rx_packets;

> -    stats->tx_packets += dev->stats.tx_packets;

> -    stats->rx_dropped = dev->stats.rx_dropped;

> -    stats->tx_dropped += dev->stats.tx_dropped;

> -    stats->multicast = dev->stats.multicast;

> -    stats->rx_bytes = dev->stats.rx_bytes;

> -    stats->tx_bytes = dev->stats.tx_bytes;

> -    stats->rx_errors = dev->stats.rx_errors;

> -    stats->rx_length_errors = dev->stats.rx_length_errors;

> -

> -    stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;

> -    stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;

> -    stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;

> -    stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;

> -    stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;

> -    stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;

> -    stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;

> -

> -    rte_spinlock_unlock(&dev->stats_lock);

> -

> -    ovs_mutex_unlock(&dev->mutex);

> -

> -    return 0;

> -}

> -

>  static void

>  netdev_dpdk_convert_xstats(struct netdev_stats *stats,

>                             const struct rte_eth_xstats *xstats,

> @@ -1755,28 +1680,40 @@ netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)

>          return EPROTO;

>      }

> 

> -    rte_xstats_len = rte_eth_xstats_get(dev->port_id, NULL, 0);

> -    if (rte_xstats_len > 0) {

> -        rte_xstats = dpdk_rte_mzalloc(sizeof(*rte_xstats) * rte_xstats_len);

> -        memset(rte_xstats, 0xff, sizeof(*rte_xstats) * rte_xstats_len);

> -        rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,

> -                                            rte_xstats_len);

> -        if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {

> -            netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_ret);

> +    /* Extended statistics are not yet available for vHost User PMD */

> +    if (dev->type == DPDK_DEV_ETH) {

> +        rte_xstats_len = rte_eth_xstats_get(dev->port_id, NULL, 0);

> +        if (rte_xstats_len > 0) {

> +            rte_xstats = dpdk_rte_mzalloc(sizeof(*rte_xstats)

> +                                          * rte_xstats_len);

> +            memset(rte_xstats, 0xff, sizeof(*rte_xstats) * rte_xstats_len);

> +            rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,

> +                                                rte_xstats_len);

> +            if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {

> +                netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_ret);

> +            }

> +            rte_free(rte_xstats);

> +        } else {

> +            VLOG_WARN("Can't get XSTATS counters for port: %i.", dev-

> >port_id);

>          }

> -        rte_free(rte_xstats);

> -    } else {

> -        VLOG_WARN("Can't get XSTATS counters for port: %i.", dev->port_id);

>      }

> 

>      stats->rx_packets = rte_stats.ipackets;

>      stats->tx_packets = rte_stats.opackets;

>      stats->rx_bytes = rte_stats.ibytes;

>      stats->tx_bytes = rte_stats.obytes;

> -    /* DPDK counts imissed as errors, but count them here as dropped instead */

> -    stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;

> -    stats->tx_errors = rte_stats.oerrors;

> -    stats->multicast = rte_stats.imcasts;

> +

> +    if (dev->type == DPDK_DEV_ETH) {

> +        /* DPDK counts imissed as errors, but count them here as dropped

> +         * instead */

> +        stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;

> +        stats->tx_errors = rte_stats.oerrors;

> +        stats->multicast = rte_stats.imcasts;

> +    } else {

> +        stats->rx_errors = UINT64_MAX;

> +        stats->tx_errors = UINT64_MAX;

> +        stats->multicast = UINT64_MAX;

> +    }

> 

>      rte_spinlock_lock(&dev->stats_lock);

>      stats->tx_dropped = dev->stats.tx_dropped;

> @@ -1939,25 +1876,6 @@ netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)

>      return 0;

>  }

> 

> -static int

> -netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool

> *carrier)

> -{

> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> -    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

> -

> -    ovs_mutex_lock(&dev->mutex);

> -

> -    if (is_vhost_running(virtio_dev)) {

> -        *carrier = 1;

> -    } else {

> -        *carrier = 0;

> -    }

> -

> -    ovs_mutex_unlock(&dev->mutex);

> -

> -    return 0;

> -}

> -

>  static long long int

>  netdev_dpdk_get_carrier_resets(const struct netdev *netdev)

>  {

> @@ -1993,6 +1911,7 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev,

>      dev->flags |= on;

>      dev->flags &= ~off;

> 

> +

> 

> Minor: extra newline

> 

>      if (dev->flags == *old_flagsp) {

>          return 0;

>      }

> @@ -2012,13 +1931,10 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev,

>              rte_eth_dev_stop(dev->port_id);

>          }

>      } else {

> -        /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is

> -         * running then change netdev's change_seq to trigger link state

> -         * update. */

> -        struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

> +        /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed then change

> +         * netdev's change_seq to trigger link state update. */

> 

> -        if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))

> -            && is_vhost_running(virtio_dev)) {

> +        if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))) {

>              netdev_change_seq_changed(&dev->up);

> 

>              /* Clear statistics if device is getting up. */

> @@ -2115,7 +2031,7 @@ netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,

> 

>      if (argc > 2) {

>          struct netdev *netdev = netdev_from_name(argv[1]);

> -        if (netdev && is_dpdk_class(netdev->netdev_class)) {

> +        if (netdev && is_dpdk_eth_class(netdev->netdev_class)) {

> 

> I think this will leave out ring devices.

I think it always has.

> 

>              struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);

> 

>              ovs_mutex_lock(&dpdk_dev->mutex);

> @@ -2143,22 +2059,6 @@ netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,

>  }

> 

>  /*

> - * Set virtqueue flags so that we do not receive interrupts.

> - */

> -static void

> -set_irq_status(struct virtio_net *virtio_dev)

> -{

> -    uint32_t i;

> -    uint64_t idx;

> -

> -    for (i = 0; i < virtio_dev->virt_qp_nb; i++) {

> -        idx = i * VIRTIO_QNUM;

> -        rte_vhost_enable_guest_notification(virtio_dev, idx + VIRTIO_RXQ, 0);

> -        rte_vhost_enable_guest_notification(virtio_dev, idx + VIRTIO_TXQ, 0);

> -    }

> -}

> -

> -/*

>   * Fixes mapping for vhost-user tx queues. Must be called after each

>   * enabling/disabling of queues and n_txq modifications.

>   */

> @@ -2199,62 +2099,6 @@ netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)

>      rte_free(enabled_queues);

>  }

> 

> -/*

> - * A new virtio-net device is added to a vhost port.

> - */

> -static int

> -new_device(struct virtio_net *virtio_dev)

> -{

> -    struct netdev_dpdk *dev;

> -    bool exists = false;

> -    int newnode = 0;

> -    long err = 0;

> -

> -    ovs_mutex_lock(&dpdk_mutex);

> -    /* Add device to the vhost port with the same name as that passed down. */

> -    LIST_FOR_EACH(dev, list_node, &dpdk_list) {

> -        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {

> -            uint32_t qp_num = virtio_dev->virt_qp_nb;

> -

> -            ovs_mutex_lock(&dev->mutex);

> -            /* Get NUMA information */

> -            err = get_mempolicy(&newnode, NULL, 0, virtio_dev,

> -                                MPOL_F_NODE | MPOL_F_ADDR);

> -            if (err) {

> -                VLOG_INFO("Error getting NUMA info for vHost Device '%s'",

> -                        virtio_dev->ifname);

> -                newnode = dev->socket_id;

> -            }

> -

> -            dev->requested_socket_id = newnode;

> -            dev->requested_n_rxq = qp_num;

> -            dev->requested_n_txq = qp_num;

> -            netdev_request_reconfigure(&dev->up);

> -

> -            ovsrcu_set(&dev->virtio_dev, virtio_dev);

> -            exists = true;

> -

> -            /* Disable notifications. */

> -            set_irq_status(virtio_dev);

> -            netdev_change_seq_changed(&dev->up);

> -            ovs_mutex_unlock(&dev->mutex);

> -            break;

> -        }

> -    }

> -    ovs_mutex_unlock(&dpdk_mutex);

> -

> -    if (!exists) {

> -        VLOG_INFO("vHost Device '%s' %"PRIu64" can't be added - name not "

> -                  "found", virtio_dev->ifname, virtio_dev->device_fh);

> -

> -        return -1;

> -    }

> -

> -    VLOG_INFO("vHost Device '%s' %"PRIu64" has been added on numa node

> %i",

> -              virtio_dev->ifname, virtio_dev->device_fh, newnode);

> -    return 0;

> -}

> -

>  /* Clears mapping for all available queues of vhost interface. */

>  static void

>  netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)

> @@ -2267,144 +2111,18 @@ netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)

>      }

>  }

> 

> -/*

> - * Remove a virtio-net device from the specific vhost port.  Use dev->remove

> - * flag to stop any more packets from being sent or received to/from a VM and

> - * ensure all currently queued packets have been sent/received before removing

> - *  the device.

> - */

> -static void

> -destroy_device(volatile struct virtio_net *virtio_dev)

> -{

> -    struct netdev_dpdk *dev;

> -    bool exists = false;

> -

> -    ovs_mutex_lock(&dpdk_mutex);

> -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {

> -        if (netdev_dpdk_get_virtio(dev) == virtio_dev) {

> -

> -            ovs_mutex_lock(&dev->mutex);

> -            virtio_dev->flags &= ~VIRTIO_DEV_RUNNING;

> -            ovsrcu_set(&dev->virtio_dev, NULL);

> -            /* Clear tx/rx queue settings. */

> -            netdev_dpdk_txq_map_clear(dev);

> -            dev->requested_n_rxq = NR_QUEUE;

> -            dev->requested_n_txq = NR_QUEUE;

> -            netdev_request_reconfigure(&dev->up);

> -

> -            netdev_change_seq_changed(&dev->up);

> -            ovs_mutex_unlock(&dev->mutex);

> -            exists = true;

> -            break;

> -        }

> -    }

> -

> -    ovs_mutex_unlock(&dpdk_mutex);

> -

> -    if (exists == true) {

> -        /*

> -         * Wait for other threads to quiesce after setting the 'virtio_dev'

> -         * to NULL, before returning.

> -         */

> -        ovsrcu_synchronize();

> -        /*

> -         * As call to ovsrcu_synchronize() will end the quiescent state,

> -         * put thread back into quiescent state before returning.

> -         */

> -        ovsrcu_quiesce_start();

> -        VLOG_INFO("vHost Device '%s' %"PRIu64" has been removed",

> -                  virtio_dev->ifname, virtio_dev->device_fh);

> -    } else {

> -        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev-

> >ifname,

> -                  virtio_dev->device_fh);

> -    }

> -}

> -

> -static int

> -vring_state_changed(struct virtio_net *virtio_dev, uint16_t queue_id,

> -                    int enable)

> -{

> -    struct netdev_dpdk *dev;

> -    bool exists = false;

> -    int qid = queue_id / VIRTIO_QNUM;

> -

> -    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {

> -        return 0;

> -    }

> -

> -    ovs_mutex_lock(&dpdk_mutex);

> -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {

> -        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {

> -            ovs_mutex_lock(&dev->mutex);

> -            if (enable) {

> -                dev->tx_q[qid].map = qid;

> -            } else {

> -                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;

> -            }

> -            netdev_dpdk_remap_txqs(dev);

> -            exists = true;

> -            ovs_mutex_unlock(&dev->mutex);

> -            break;

> -        }

> -    }

> -    ovs_mutex_unlock(&dpdk_mutex);

> -

> -    if (exists) {

> -        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"

> -                  PRIu64" changed to \'%s\'", queue_id, qid,

> -                  virtio_dev->ifname, virtio_dev->device_fh,

> -                  (enable == 1) ? "enabled" : "disabled");

> -    } else {

> -        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev-

> >ifname,

> -                  virtio_dev->device_fh);

> -        return -1;

> -    }

> -

> -    return 0;

> -}

> -

> -struct virtio_net *

> -netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)

> -{

> -    return ovsrcu_get(struct virtio_net *, &dev->virtio_dev);

> -}

> -

>  struct ingress_policer *

>  netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)

>  {

>      return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);

>  }

> 

> -/*

> - * These callbacks allow virtio-net devices to be added to vhost ports when

> - * configuration has been fully complete.

> - */

> -static const struct virtio_net_device_ops virtio_net_device_ops =

> -{

> -    .new_device =  new_device,

> -    .destroy_device = destroy_device,

> -    .vring_state_changed = vring_state_changed

> -};

> -

> -static void *

> -start_vhost_loop(void *dummy OVS_UNUSED)

> -{

> -     pthread_detach(pthread_self());

> -     /* Put the vhost thread into quiescent state. */

> -     ovsrcu_quiesce_start();

> -     rte_vhost_driver_session_start();

> -     return NULL;

> -}

> -

>  static int

>  dpdk_vhost_class_init(void)

>  {

> -    rte_vhost_driver_callback_register(&virtio_net_device_ops);

> -    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4

> -                            | 1ULL << VIRTIO_NET_F_HOST_TSO6

> -                            | 1ULL << VIRTIO_NET_F_CSUM);

> -

> -    ovs_thread_create("vhost_thread", start_vhost_loop, NULL);

> +    rte_eth_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4

> +                                | 1ULL << VIRTIO_NET_F_HOST_TSO6

> +                                | 1ULL << VIRTIO_NET_F_CSUM);

>      return 0;

>  }

> 

> @@ -2515,7 +2233,16 @@ netdev_dpdk_ring_send(struct netdev *netdev, int qid,

>          dp_packet_rss_invalidate(pkts[i]);

>      }

> 

> +    if (OVS_UNLIKELY(dev->txq_needs_locking)) {

> +        qid = qid % dev->up.n_txq;

> +        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> +    }

> +

>      netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);

> +

> +    if (OVS_UNLIKELY(dev->txq_needs_locking)) {

> +        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

> +    }

>      return 0;

>  }

> 

> @@ -2806,7 +2533,6 @@ static int

>  netdev_dpdk_vhost_reconfigure(struct netdev *netdev)

>  {

>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> -    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

>      int err = 0;

> 

>      ovs_mutex_lock(&dpdk_mutex);

> @@ -2822,6 +2548,8 @@ netdev_dpdk_vhost_reconfigure(struct netdev *netdev)

> 

>      netdev_dpdk_remap_txqs(dev);

> 

> +    dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;

> +

>      if (dev->requested_socket_id != dev->socket_id) {

>          dev->socket_id = dev->requested_socket_id;

>          /* Change mempool to new NUMA Node */

> @@ -2832,10 +2560,6 @@ netdev_dpdk_vhost_reconfigure(struct netdev *netdev)

>          }

>      }

> 

> -    if (virtio_dev) {

> -        virtio_dev->flags |= VIRTIO_DEV_RUNNING;

> -    }

> -

>      ovs_mutex_unlock(&dev->mutex);

>      ovs_mutex_unlock(&dpdk_mutex);

> 

> @@ -3329,12 +3053,12 @@ static const struct netdev_class OVS_UNUSED dpdk_vhost_class =

>          NULL,

>          NULL,

>          netdev_dpdk_vhost_send,

> -        netdev_dpdk_vhost_get_carrier,

> -        netdev_dpdk_vhost_get_stats,

> +        netdev_dpdk_get_carrier,

> +        netdev_dpdk_get_stats,

>          NULL,

>          NULL,

>          netdev_dpdk_vhost_reconfigure,

> -        netdev_dpdk_vhost_rxq_recv);

> +        netdev_dpdk_rxq_recv);

> 

>  void

>  netdev_dpdk_register(void)

> --

> 2.4.3

> 


Patch

diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
index 5407794..29b6f91 100644
--- a/INSTALL.DPDK.md
+++ b/INSTALL.DPDK.md
@@ -561,6 +561,16 @@  can be found in [Vhost Walkthrough].
 
     http://dpdk.org/doc/guides/rel_notes/release_16_04.html
 
+  - dpdk, dpdkr and dpdkvhostuser ports are 'eth' type ports in the context of
+    DPDK as they are all managed by the rte_ether API. This means that they
+    adhere to the DPDK configuration option CONFIG_RTE_MAX_ETHPORTS which by
+    default is set to 32. This means by default the combined total number of
+    dpdk, dpdkr and dpdkvhostuser ports allowable in OVS with DPDK is 32. This
+    value can be changed if desired by modifying the configuration file in
+    DPDK, or by overriding the default value on the command line when building
+    DPDK. eg.
+
+        `make install CONFIG_RTE_MAX_ETHPORTS=64`
 
 Bug Reporting:
 --------------
diff --git a/NEWS b/NEWS
index aa1b915..b3791ed 100644
--- a/NEWS
+++ b/NEWS
@@ -59,6 +59,8 @@  Post-v2.5.0
        node that device memory is located on if CONFIG_RTE_LIBRTE_VHOST_NUMA
        is enabled in DPDK.
      * Remove dpdkvhostcuse port type.
+     * vHost PMD integration brings vhost-user ports under control of the
+       rte_ether DPDK API.
    - Increase number of registers to 16.
    - ovs-benchmark: This utility has been removed due to lack of use and
      bitrot.
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index b4f82af..5de806a 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -56,6 +56,7 @@ 
 #include "unixctl.h"
 
 #include "rte_config.h"
+#include "rte_eth_vhost.h"
 #include "rte_mbuf.h"
 #include "rte_meter.h"
 #include "rte_virtio_net.h"
@@ -141,6 +142,11 @@  static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */
 
 #define VHOST_ENQ_RETRY_NUM 8
 
+/* Array that tracks the used & unused vHost user driver IDs */
+static unsigned int vhost_drv_ids[RTE_MAX_ETHPORTS];
+/* Maximum string length allowed to provide to rte_eth_attach function */
+#define DEVARGS_MAX (RTE_ETH_NAME_MAX_LEN + PATH_MAX + 18)
+
 static const struct rte_eth_conf port_conf = {
     .rxmode = {
         .mq_mode = ETH_MQ_RX_RSS,
@@ -353,12 +359,15 @@  struct netdev_dpdk {
      * always true.  */
     bool txq_needs_locking;
 
-    /* virtio-net structure for vhost device */
-    OVSRCU_TYPE(struct virtio_net *) virtio_dev;
+    /* Number of virtqueue pairs reported by the guest */
+    uint32_t vhost_qp_nb;
 
     /* Identifier used to distinguish vhost devices from each other */
     char vhost_id[PATH_MAX];
 
+    /* ID of vhost user port given to the PMD driver */
+    unsigned int vhost_pmd_id;
+
     /* In dpdk_list. */
     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
 
@@ -389,16 +398,25 @@  struct netdev_rxq_dpdk {
 static bool dpdk_thread_is_pmd(void);
 
 static int netdev_dpdk_construct(struct netdev *);
+static int netdev_dpdk_vhost_construct(struct netdev *);
 
 struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);
 
 struct ingress_policer *
 netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
 
+void link_status_changed_callback(uint8_t port_id,
+        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
+void vring_state_changed_callback(uint8_t port_id,
+        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
+static void netdev_dpdk_remap_txqs(struct netdev_dpdk *dev);
+static void netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev);
+
 static bool
-is_dpdk_class(const struct netdev_class *class)
+is_dpdk_eth_class(const struct netdev_class *class)
 {
-    return class->construct == netdev_dpdk_construct;
+    return ((class->construct == netdev_dpdk_construct) ||
+            (class->construct == netdev_dpdk_vhost_construct));
 }
 
 /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
@@ -622,8 +640,13 @@  dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
             continue;
         }
 
-        dev->up.n_rxq = n_rxq;
-        dev->up.n_txq = n_txq;
+        /* Only set n_*xq for physical devices. vHost User devices will set
+         * this value correctly using info from the virtio backend.
+         */
+        if (dev->type == DPDK_DEV_ETH) {
+            dev->up.n_rxq = n_rxq;
+            dev->up.n_txq = n_txq;
+        }
 
         return 0;
     }
@@ -647,8 +670,14 @@  dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
 
     rte_eth_dev_info_get(dev->port_id, &info);
 
-    n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
-    n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
+    if (dev->type == DPDK_DEV_VHOST) {
+        /* We don't know how many queues QEMU will use so set up the max */
+        n_rxq = OVS_VHOST_MAX_QUEUE_NUM;
+        n_txq = OVS_VHOST_MAX_QUEUE_NUM;
+    } else {
+        n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
+        n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
+    }
 
     diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq);
     if (diag) {
@@ -715,6 +744,85 @@  netdev_dpdk_alloc_txq(struct netdev_dpdk *dev, unsigned int n_txqs)
     }
 }
 
+void
+link_status_changed_callback(uint8_t port_id,
+                             enum rte_eth_event_type type OVS_UNUSED,
+                             void *param OVS_UNUSED)
+{
+    struct netdev_dpdk *dev;
+    int socket_id = -1;
+
+    ovs_mutex_lock(&dpdk_mutex);
+    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
+        if (port_id == dev->port_id) {
+            ovs_mutex_lock(&dev->mutex);
+            check_link_status(dev);
+            if (dev->link.link_status == ETH_LINK_UP) {
+                /* new device */
+                /* Get NUMA information */
+                socket_id = rte_eth_dev_socket_id(dev->port_id);
+                if (socket_id != -1 && socket_id != dev->socket_id) {
+                    dev->requested_socket_id = socket_id;
+                }
+                netdev_request_reconfigure(&dev->up);
+                netdev_change_seq_changed(&dev->up);
+                VLOG_INFO("vHost Device '%s' has been added on numa node %i",
+                          dev->vhost_id, socket_id);
+            } else {
+                /* destroy device */
+                /* Clear tx/rx queue settings. */
+                netdev_dpdk_txq_map_clear(dev);
+                netdev_request_reconfigure(&dev->up);
+                netdev_change_seq_changed(&dev->up);
+                VLOG_INFO("vHost Device '%s' has been removed", dev->vhost_id);
+            }
+            ovs_mutex_unlock(&dev->mutex);
+            break;
+        }
+    }
+
+    ovs_mutex_unlock(&dpdk_mutex);
+
+    return;
+}
+
+void
+vring_state_changed_callback(uint8_t port_id,
+                             enum rte_eth_event_type type OVS_UNUSED,
+                             void *param OVS_UNUSED)
+{
+    struct netdev_dpdk *dev;
+    struct rte_eth_vhost_queue_event event;
+    int err = 0;
+
+    err = rte_eth_vhost_get_queue_event(port_id, &event);
+    if (err || (event.rx)) {
+        return;
+    }
+
+    ovs_mutex_lock(&dpdk_mutex);
+    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
+        if (port_id == dev->port_id) {
+            ovs_mutex_lock(&dev->mutex);
+            if (event.enable) {
+                dev->tx_q[event.queue_id].map = event.queue_id;
+                dev->vhost_qp_nb++;
+            } else {
+                dev->tx_q[event.queue_id].map = OVS_VHOST_QUEUE_DISABLED;
+                dev->vhost_qp_nb--;
+            }
+            dev->requested_n_rxq = dev->vhost_qp_nb;
+            dev->requested_n_txq = dev->vhost_qp_nb;
+            netdev_request_reconfigure(&dev->up);
+            ovs_mutex_unlock(&dev->mutex);
+            break;
+        }
+    }
+    ovs_mutex_unlock(&dpdk_mutex);
+
+    return;
+}
+
 static int
 netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
                  enum dpdk_dev_type type)
@@ -724,6 +832,7 @@  netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
     int sid;
     int err = 0;
     uint32_t buf_size;
+    unsigned int nr_q = 0;
 
     ovs_mutex_init(&dev->mutex);
     ovs_mutex_lock(&dev->mutex);
@@ -733,11 +842,7 @@  netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
     /* If the 'sid' is negative, it means that the kernel fails
      * to obtain the pci numa info.  In that situation, always
      * use 'SOCKET0'. */
-    if (type == DPDK_DEV_ETH) {
-        sid = rte_eth_dev_socket_id(port_no);
-    } else {
-        sid = rte_lcore_to_socket_id(rte_get_master_lcore());
-    }
+    sid = rte_eth_dev_socket_id(port_no);
 
     dev->socket_id = sid < 0 ? SOCKET0 : sid;
     dev->requested_socket_id = dev->socket_id;
@@ -767,19 +872,23 @@  netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
     netdev->n_txq = NR_QUEUE;
     dev->requested_n_rxq = netdev->n_rxq;
     dev->requested_n_txq = netdev->n_txq;
+    dev->vhost_qp_nb = 0;
 
-    if (type == DPDK_DEV_ETH) {
-        err = dpdk_eth_dev_init(dev);
-        if (err) {
-            goto unlock;
-        }
-        netdev_dpdk_alloc_txq(dev, netdev->n_txq);
-        dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;
-    } else {
-        netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);
-        dev->txq_needs_locking = true;
-        /* Enable DPDK_DEV_VHOST device and set promiscuous mode flag. */
-        dev->flags = NETDEV_UP | NETDEV_PROMISC;
+    err = dpdk_eth_dev_init(dev);
+    if (err) {
+        goto unlock;
+    }
+    nr_q = (type == DPDK_DEV_ETH ? 1 : RTE_MAX_QUEUES_PER_PORT);
+    netdev_dpdk_alloc_txq(dev, nr_q);
+    dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;
+
+    if (type == DPDK_DEV_VHOST) {
+        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_QUEUE_STATE,
+                                     (void*)vring_state_changed_callback,
+                                      NULL);
+        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,
+                                      (void*)link_status_changed_callback,
+                                      NULL);
     }
 
     ovs_list_push_back(&dpdk_list, &dev->list_node);
@@ -810,17 +919,48 @@  dpdk_dev_parse_name(const char dev_name[], const char prefix[],
     }
 }
 
+/* When attaching a vhost device to DPDK, a unique name of the format
+ * 'eth_vhostX' is expected, where X is a unique identifier.
+ * get_vhost_drv_id returns a valid X value to provide to DPDK.
+ */
+static int
+get_vhost_drv_id(void)
+{
+    int i = 0;
+
+    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
+        if (vhost_drv_ids[i] == 0) {
+            return i;
+        }
+    }
+
+    return -1;
+}
+
+static void
+set_vhost_drv_id(int id, int val)
+{
+    vhost_drv_ids[id] = val;
+}
+
 static int
 netdev_dpdk_vhost_construct(struct netdev *netdev)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     const char *name = netdev->name;
     int err;
+    uint8_t port_no = 0;
+    char devargs[DEVARGS_MAX];
+    int driver_id = 0;
+
+    if (rte_eal_init_ret) {
+        return rte_eal_init_ret;
+    }
 
     /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
      * the file system. '/' or '\' would traverse directories, so they're not
      * acceptable in 'name'. */
-    if (strchr(name, '/') || strchr(name, '\\')) {
+    if (strchr(name, '/') || strchr(name, '\\') || strchr(name, ',')) {
         VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
                  "A valid name must not include '/' or '\\'",
                  name);
@@ -837,18 +977,32 @@  netdev_dpdk_vhost_construct(struct netdev *netdev)
      */
     snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s",
              vhost_sock_dir, name);
+    driver_id = get_vhost_drv_id();
+    if (driver_id == -1) {
+        VLOG_ERR("Unable to create vhost-user device %s - too many vhost-user"
+                 "devices registered with PMD", dev->vhost_id);
+        err = ENODEV;
+        goto out;
+
+    } else {
+        snprintf(devargs, sizeof(devargs), "eth_vhost%u,iface=%s,queues=%i",
+                 driver_id, dev->vhost_id, RTE_MAX_QUEUES_PER_PORT);
+        err = rte_eth_dev_attach(devargs, &port_no);
+    }
 
-    err = rte_vhost_driver_register(dev->vhost_id);
     if (err) {
-        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
+        VLOG_ERR("Failed to attach vhost-user device %s to DPDK",
                  dev->vhost_id);
     } else {
         fatal_signal_add_file_to_unlink(dev->vhost_id);
         VLOG_INFO("Socket %s created for vhost-user port %s\n",
                   dev->vhost_id, name);
-        err = netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);
+        dev->vhost_pmd_id = driver_id;
+        set_vhost_drv_id(driver_id, 1);
+        err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_VHOST);
     }
 
+out:
     ovs_mutex_unlock(&dpdk_mutex);
     return err;
 }
@@ -876,15 +1030,11 @@  netdev_dpdk_construct(struct netdev *netdev)
 }
 
 static void
-netdev_dpdk_destruct(struct netdev *netdev)
+dpdk_destruct_helper(struct netdev_dpdk *dev)
 {
-    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
-
-    ovs_mutex_lock(&dev->mutex);
     rte_eth_dev_stop(dev->port_id);
     free(ovsrcu_get_protected(struct ingress_policer *,
                               &dev->ingress_policer));
-    ovs_mutex_unlock(&dev->mutex);
 
     ovs_mutex_lock(&dpdk_mutex);
     rte_free(dev->tx_q);
@@ -894,35 +1044,31 @@  netdev_dpdk_destruct(struct netdev *netdev)
 }
 
 static void
+netdev_dpdk_destruct(struct netdev *netdev)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+
+    ovs_mutex_lock(&dev->mutex);
+    dpdk_destruct_helper(dev);
+    ovs_mutex_unlock(&dev->mutex);
+}
+
+static void
 netdev_dpdk_vhost_destruct(struct netdev *netdev)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 
-    /* Guest becomes an orphan if still attached. */
-    if (netdev_dpdk_get_virtio(dev) != NULL) {
-        VLOG_ERR("Removing port '%s' while vhost device still attached.",
-                 netdev->name);
-        VLOG_ERR("To restore connectivity after re-adding of port, VM on socket"
-                 " '%s' must be restarted.",
-                 dev->vhost_id);
-    }
+    ovs_mutex_lock(&dev->mutex);
 
-    if (rte_vhost_driver_unregister(dev->vhost_id)) {
-        VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);
+    if (rte_eth_dev_detach(dev->port_id, dev->vhost_id)) {
+        VLOG_ERR("Error removing vhost device %s", dev->vhost_id);
     } else {
         fatal_signal_remove_file_to_unlink(dev->vhost_id);
     }
+    set_vhost_drv_id(dev->vhost_pmd_id, 0);
 
-    ovs_mutex_lock(&dev->mutex);
-    free(ovsrcu_get_protected(struct ingress_policer *,
-                              &dev->ingress_policer));
+    dpdk_destruct_helper(dev);
     ovs_mutex_unlock(&dev->mutex);
-
-    ovs_mutex_lock(&dpdk_mutex);
-    rte_free(dev->tx_q);
-    ovs_list_remove(&dev->list_node);
-    dpdk_mp_put(dev->dpdk_mp);
-    ovs_mutex_unlock(&dpdk_mutex);
 }
 
 static void
@@ -1113,114 +1259,6 @@  ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
     return cnt;
 }
 
-static bool
-is_vhost_running(struct virtio_net *virtio_dev)
-{
-    return (virtio_dev != NULL && (virtio_dev->flags & VIRTIO_DEV_RUNNING));
-}
-
-static inline void
-netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
-                                          unsigned int packet_size)
-{
-    /* Hard-coded search for the size bucket. */
-    if (packet_size < 256) {
-        if (packet_size >= 128) {
-            stats->rx_128_to_255_packets++;
-        } else if (packet_size <= 64) {
-            stats->rx_1_to_64_packets++;
-        } else {
-            stats->rx_65_to_127_packets++;
-        }
-    } else {
-        if (packet_size >= 1523) {
-            stats->rx_1523_to_max_packets++;
-        } else if (packet_size >= 1024) {
-            stats->rx_1024_to_1522_packets++;
-        } else if (packet_size < 512) {
-            stats->rx_256_to_511_packets++;
-        } else {
-            stats->rx_512_to_1023_packets++;
-        }
-    }
-}
-
-static inline void
-netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
-                                     struct dp_packet **packets, int count,
-                                     int dropped)
-{
-    int i;
-    unsigned int packet_size;
-    struct dp_packet *packet;
-
-    stats->rx_packets += count;
-    stats->rx_dropped += dropped;
-    for (i = 0; i < count; i++) {
-        packet = packets[i];
-        packet_size = dp_packet_size(packet);
-
-        if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
-            /* This only protects the following multicast counting from
-             * too short packets, but it does not stop the packet from
-             * further processing. */
-            stats->rx_errors++;
-            stats->rx_length_errors++;
-            continue;
-        }
-
-        netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);
-
-        struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
-        if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
-            stats->multicast++;
-        }
-
-        stats->rx_bytes += packet_size;
-    }
-}
-
-/*
- * The receive path for the vhost port is the TX path out from guest.
- */
-static int
-netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
-                           struct dp_packet **packets, int *c)
-{
-    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
-    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
-    int qid = rxq->queue_id;
-    struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
-    uint16_t nb_rx = 0;
-    uint16_t dropped = 0;
-
-    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev)
-                     || !(dev->flags & NETDEV_UP))) {
-        return EAGAIN;
-    }
-
-    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM + VIRTIO_TXQ,
-                                    dev->dpdk_mp->mp,
-                                    (struct rte_mbuf **)packets,
-                                    NETDEV_MAX_BURST);
-    if (!nb_rx) {
-        return EAGAIN;
-    }
-
-    if (policer) {
-        dropped = nb_rx;
-        nb_rx = ingress_policer_run(policer, (struct rte_mbuf **)packets, nb_rx);
-        dropped -= nb_rx;
-    }
-
-    rte_spinlock_lock(&dev->stats_lock);
-    netdev_dpdk_vhost_update_rx_counters(&dev->stats, packets, nb_rx, dropped);
-    rte_spinlock_unlock(&dev->stats_lock);
-
-    *c = (int) nb_rx;
-    return 0;
-}
-
 static int
 netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet **packets,
                      int *c)
@@ -1273,85 +1311,6 @@  netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
     return cnt;
 }
 
-static inline void
-netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
-                                     struct dp_packet **packets,
-                                     int attempted,
-                                     int dropped)
-{
-    int i;
-    int sent = attempted - dropped;
-
-    stats->tx_packets += sent;
-    stats->tx_dropped += dropped;
-
-    for (i = 0; i < sent; i++) {
-        stats->tx_bytes += dp_packet_size(packets[i]);
-    }
-}
-
-static void
-__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
-                         struct dp_packet **pkts, int cnt,
-                         bool may_steal)
-{
-    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
-    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
-    struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
-    unsigned int total_pkts = cnt;
-    unsigned int qos_pkts = cnt;
-    int retries = 0;
-
-    qid = dev->tx_q[qid % netdev->n_txq].map;
-
-    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0
-                     || !(dev->flags & NETDEV_UP))) {
-        rte_spinlock_lock(&dev->stats_lock);
-        dev->stats.tx_dropped+= cnt;
-        rte_spinlock_unlock(&dev->stats_lock);
-        goto out;
-    }
-
-    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
-
-    /* Check has QoS has been configured for the netdev */
-    cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt);
-    qos_pkts -= cnt;
-
-    do {
-        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
-        unsigned int tx_pkts;
-
-        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid,
-                                          cur_pkts, cnt);
-        if (OVS_LIKELY(tx_pkts)) {
-            /* Packets have been sent.*/
-            cnt -= tx_pkts;
-            /* Prepare for possible retry.*/
-            cur_pkts = &cur_pkts[tx_pkts];
-        } else {
-            /* No packets sent - do not retry.*/
-            break;
-        }
-    } while (cnt && (retries++ < VHOST_ENQ_RETRY_NUM));
-
-    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
-
-    rte_spinlock_lock(&dev->stats_lock);
-    cnt += qos_pkts;
-    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, cnt);
-    rte_spinlock_unlock(&dev->stats_lock);
-
-out:
-    if (may_steal) {
-        int i;
-
-        for (i = 0; i < total_pkts; i++) {
-            dp_packet_delete(pkts[i]);
-        }
-    }
-}
-
 /* Tx function. Transmit packets indefinitely */
 static void
 dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
@@ -1408,17 +1367,13 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
         newcnt++;
     }
 
-    if (dev->type == DPDK_DEV_VHOST) {
-        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs, newcnt, true);
-    } else {
-        unsigned int qos_pkts = newcnt;
+    unsigned int qos_pkts = newcnt;
 
-        /* Check if QoS has been configured for this netdev. */
-        newcnt = netdev_dpdk_qos_run__(dev, mbufs, newcnt);
+    /* Check if QoS has been configured for this netdev. */
+    newcnt = netdev_dpdk_qos_run__(dev, mbufs, newcnt);
 
-        dropped += qos_pkts - newcnt;
-        netdev_dpdk_eth_tx_burst(dev, qid, mbufs, newcnt);
-    }
+    dropped += qos_pkts - newcnt;
+    netdev_dpdk_eth_tx_burst(dev, qid, mbufs, newcnt);
 
     if (OVS_UNLIKELY(dropped)) {
         rte_spinlock_lock(&dev->stats_lock);
@@ -1431,44 +1386,12 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
     }
 }
 
-static int
-netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,
-                 int cnt, bool may_steal)
-{
-    if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {
-        int i;
-
-        dpdk_do_tx_copy(netdev, qid, pkts, cnt);
-        if (may_steal) {
-            for (i = 0; i < cnt; i++) {
-                dp_packet_delete(pkts[i]);
-            }
-        }
-    } else {
-        int i;
-
-        for (i = 0; i < cnt; i++) {
-            int cutlen = dp_packet_get_cutlen(pkts[i]);
-
-            dp_packet_set_size(pkts[i], dp_packet_size(pkts[i]) - cutlen);
-            dp_packet_reset_cutlen(pkts[i]);
-        }
-        __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);
-    }
-    return 0;
-}
-
 static inline void
 netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
                    struct dp_packet **pkts, int cnt, bool may_steal)
 {
     int i;
 
-    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
-        qid = qid % dev->up.n_txq;
-        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
-    }
-
     if (OVS_UNLIKELY(!may_steal ||
                      pkts[0]->source != DPBUF_DPDK)) {
         struct netdev *netdev = &dev->up;
@@ -1543,7 +1466,44 @@  netdev_dpdk_eth_send(struct netdev *netdev, int qid,
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 
+    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
+        qid = qid % dev->up.n_txq;
+        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
+    }
+
     netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
+
+    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
+        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
+    }
+
+    return 0;
+}
+
+static int
+netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
+                     struct dp_packet **pkts, int cnt, bool may_steal)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+
+    qid = dev->tx_q[qid % netdev->n_txq].map;
+    if (qid < 0) {
+        rte_spinlock_lock(&dev->stats_lock);
+        dev->stats.tx_dropped += cnt;
+        rte_spinlock_unlock(&dev->stats_lock);
+        if (may_steal) {
+            int i;
+
+            for (i = 0; i < cnt; i++) {
+                dp_packet_delete(pkts[i]);
+            }
+        }
+    } else {
+        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
+        netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
+        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
+    }
+
     return 0;
 }
 
@@ -1640,41 +1600,6 @@  out:
 static int
 netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
 
-static int
-netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
-                            struct netdev_stats *stats)
-{
-    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
-
-    ovs_mutex_lock(&dev->mutex);
-
-    rte_spinlock_lock(&dev->stats_lock);
-    /* Supported Stats */
-    stats->rx_packets += dev->stats.rx_packets;
-    stats->tx_packets += dev->stats.tx_packets;
-    stats->rx_dropped = dev->stats.rx_dropped;
-    stats->tx_dropped += dev->stats.tx_dropped;
-    stats->multicast = dev->stats.multicast;
-    stats->rx_bytes = dev->stats.rx_bytes;
-    stats->tx_bytes = dev->stats.tx_bytes;
-    stats->rx_errors = dev->stats.rx_errors;
-    stats->rx_length_errors = dev->stats.rx_length_errors;
-
-    stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
-    stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
-    stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
-    stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
-    stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
-    stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
-    stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;
-
-    rte_spinlock_unlock(&dev->stats_lock);
-
-    ovs_mutex_unlock(&dev->mutex);
-
-    return 0;
-}
-
 static void
 netdev_dpdk_convert_xstats(struct netdev_stats *stats,
                            const struct rte_eth_xstats *xstats,
@@ -1755,28 +1680,40 @@  netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
         return EPROTO;
     }
 
-    rte_xstats_len = rte_eth_xstats_get(dev->port_id, NULL, 0);
-    if (rte_xstats_len > 0) {
-        rte_xstats = dpdk_rte_mzalloc(sizeof(*rte_xstats) * rte_xstats_len);
-        memset(rte_xstats, 0xff, sizeof(*rte_xstats) * rte_xstats_len);
-        rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
-                                            rte_xstats_len);
-        if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
-            netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_ret);
+    /* Extended statistics are not yet available for the vHost User PMD. */
+    if (dev->type == DPDK_DEV_ETH) {
+        rte_xstats_len = rte_eth_xstats_get(dev->port_id, NULL, 0);
+        if (rte_xstats_len > 0) {
+            rte_xstats = dpdk_rte_mzalloc(sizeof(*rte_xstats)
+                                          * rte_xstats_len);
+            memset(rte_xstats, 0xff, sizeof(*rte_xstats) * rte_xstats_len);
+            rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
+                                                rte_xstats_len);
+            if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
+                netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_ret);
+            }
+            rte_free(rte_xstats);
+        } else {
+            VLOG_WARN("Can't get XSTATS counters for port: %i.", dev->port_id);
         }
-        rte_free(rte_xstats);
-    } else {
-        VLOG_WARN("Can't get XSTATS counters for port: %i.", dev->port_id);
     }
 
     stats->rx_packets = rte_stats.ipackets;
     stats->tx_packets = rte_stats.opackets;
     stats->rx_bytes = rte_stats.ibytes;
     stats->tx_bytes = rte_stats.obytes;
-    /* DPDK counts imissed as errors, but count them here as dropped instead */
-    stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;
-    stats->tx_errors = rte_stats.oerrors;
-    stats->multicast = rte_stats.imcasts;
+
+    if (dev->type == DPDK_DEV_ETH) {
+        /* DPDK counts imissed as errors, but count them here as dropped
+         * instead. */
+        stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;
+        stats->tx_errors = rte_stats.oerrors;
+        stats->multicast = rte_stats.imcasts;
+    } else {
+        stats->rx_errors = UINT64_MAX;
+        stats->tx_errors = UINT64_MAX;
+        stats->multicast = UINT64_MAX;
+    }
 
     rte_spinlock_lock(&dev->stats_lock);
     stats->tx_dropped = dev->stats.tx_dropped;
@@ -1939,25 +1876,6 @@  netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
     return 0;
 }
 
-static int
-netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
-{
-    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
-    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
-
-    ovs_mutex_lock(&dev->mutex);
-
-    if (is_vhost_running(virtio_dev)) {
-        *carrier = 1;
-    } else {
-        *carrier = 0;
-    }
-
-    ovs_mutex_unlock(&dev->mutex);
-
-    return 0;
-}
-
 static long long int
 netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
 {
@@ -1993,6 +1911,7 @@  netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
     dev->flags |= on;
     dev->flags &= ~off;
 
+
     if (dev->flags == *old_flagsp) {
         return 0;
     }
@@ -2012,13 +1931,10 @@  netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
             rte_eth_dev_stop(dev->port_id);
         }
     } else {
-        /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
-         * running then change netdev's change_seq to trigger link state
-         * update. */
-        struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
+        /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed then change
+         * netdev's change_seq to trigger link state update. */
 
-        if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
-            && is_vhost_running(virtio_dev)) {
+        if (NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off))) {
             netdev_change_seq_changed(&dev->up);
 
             /* Clear statistics if device is getting up. */
@@ -2115,7 +2031,7 @@  netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
 
     if (argc > 2) {
         struct netdev *netdev = netdev_from_name(argv[1]);
-        if (netdev && is_dpdk_class(netdev->netdev_class)) {
+        if (netdev && is_dpdk_eth_class(netdev->netdev_class)) {
             struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
 
             ovs_mutex_lock(&dpdk_dev->mutex);
@@ -2143,22 +2059,6 @@  netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
 }
 
 /*
- * Set virtqueue flags so that we do not receive interrupts.
- */
-static void
-set_irq_status(struct virtio_net *virtio_dev)
-{
-    uint32_t i;
-    uint64_t idx;
-
-    for (i = 0; i < virtio_dev->virt_qp_nb; i++) {
-        idx = i * VIRTIO_QNUM;
-        rte_vhost_enable_guest_notification(virtio_dev, idx + VIRTIO_RXQ, 0);
-        rte_vhost_enable_guest_notification(virtio_dev, idx + VIRTIO_TXQ, 0);
-    }
-}
-
-/*
  * Fixes mapping for vhost-user tx queues. Must be called after each
  * enabling/disabling of queues and n_txq modifications.
  */
@@ -2199,62 +2099,6 @@  netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
     rte_free(enabled_queues);
 }
 
-/*
- * A new virtio-net device is added to a vhost port.
- */
-static int
-new_device(struct virtio_net *virtio_dev)
-{
-    struct netdev_dpdk *dev;
-    bool exists = false;
-    int newnode = 0;
-    long err = 0;
-
-    ovs_mutex_lock(&dpdk_mutex);
-    /* Add device to the vhost port with the same name as that passed down. */
-    LIST_FOR_EACH(dev, list_node, &dpdk_list) {
-        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
-            uint32_t qp_num = virtio_dev->virt_qp_nb;
-
-            ovs_mutex_lock(&dev->mutex);
-            /* Get NUMA information */
-            err = get_mempolicy(&newnode, NULL, 0, virtio_dev,
-                                MPOL_F_NODE | MPOL_F_ADDR);
-            if (err) {
-                VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
-                        virtio_dev->ifname);
-                newnode = dev->socket_id;
-            }
-
-            dev->requested_socket_id = newnode;
-            dev->requested_n_rxq = qp_num;
-            dev->requested_n_txq = qp_num;
-            netdev_request_reconfigure(&dev->up);
-
-            ovsrcu_set(&dev->virtio_dev, virtio_dev);
-            exists = true;
-
-            /* Disable notifications. */
-            set_irq_status(virtio_dev);
-            netdev_change_seq_changed(&dev->up);
-            ovs_mutex_unlock(&dev->mutex);
-            break;
-        }
-    }
-    ovs_mutex_unlock(&dpdk_mutex);
-
-    if (!exists) {
-        VLOG_INFO("vHost Device '%s' %"PRIu64" can't be added - name not "
-                  "found", virtio_dev->ifname, virtio_dev->device_fh);
-
-        return -1;
-    }
-
-    VLOG_INFO("vHost Device '%s' %"PRIu64" has been added on numa node %i",
-              virtio_dev->ifname, virtio_dev->device_fh, newnode);
-    return 0;
-}
-
 /* Clears mapping for all available queues of vhost interface. */
 static void
 netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
@@ -2267,144 +2111,18 @@  netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
     }
 }
 
-/*
- * Remove a virtio-net device from the specific vhost port.  Use dev->remove
- * flag to stop any more packets from being sent or received to/from a VM and
- * ensure all currently queued packets have been sent/received before removing
- *  the device.
- */
-static void
-destroy_device(volatile struct virtio_net *virtio_dev)
-{
-    struct netdev_dpdk *dev;
-    bool exists = false;
-
-    ovs_mutex_lock(&dpdk_mutex);
-    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
-        if (netdev_dpdk_get_virtio(dev) == virtio_dev) {
-
-            ovs_mutex_lock(&dev->mutex);
-            virtio_dev->flags &= ~VIRTIO_DEV_RUNNING;
-            ovsrcu_set(&dev->virtio_dev, NULL);
-            /* Clear tx/rx queue settings. */
-            netdev_dpdk_txq_map_clear(dev);
-            dev->requested_n_rxq = NR_QUEUE;
-            dev->requested_n_txq = NR_QUEUE;
-            netdev_request_reconfigure(&dev->up);
-
-            netdev_change_seq_changed(&dev->up);
-            ovs_mutex_unlock(&dev->mutex);
-            exists = true;
-            break;
-        }
-    }
-
-    ovs_mutex_unlock(&dpdk_mutex);
-
-    if (exists == true) {
-        /*
-         * Wait for other threads to quiesce after setting the 'virtio_dev'
-         * to NULL, before returning.
-         */
-        ovsrcu_synchronize();
-        /*
-         * As call to ovsrcu_synchronize() will end the quiescent state,
-         * put thread back into quiescent state before returning.
-         */
-        ovsrcu_quiesce_start();
-        VLOG_INFO("vHost Device '%s' %"PRIu64" has been removed",
-                  virtio_dev->ifname, virtio_dev->device_fh);
-    } else {
-        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev->ifname,
-                  virtio_dev->device_fh);
-    }
-}
-
-static int
-vring_state_changed(struct virtio_net *virtio_dev, uint16_t queue_id,
-                    int enable)
-{
-    struct netdev_dpdk *dev;
-    bool exists = false;
-    int qid = queue_id / VIRTIO_QNUM;
-
-    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
-        return 0;
-    }
-
-    ovs_mutex_lock(&dpdk_mutex);
-    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
-        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
-            ovs_mutex_lock(&dev->mutex);
-            if (enable) {
-                dev->tx_q[qid].map = qid;
-            } else {
-                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
-            }
-            netdev_dpdk_remap_txqs(dev);
-            exists = true;
-            ovs_mutex_unlock(&dev->mutex);
-            break;
-        }
-    }
-    ovs_mutex_unlock(&dpdk_mutex);
-
-    if (exists) {
-        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"
-                  PRIu64" changed to \'%s\'", queue_id, qid,
-                  virtio_dev->ifname, virtio_dev->device_fh,
-                  (enable == 1) ? "enabled" : "disabled");
-    } else {
-        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev->ifname,
-                  virtio_dev->device_fh);
-        return -1;
-    }
-
-    return 0;
-}
-
-struct virtio_net *
-netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
-{
-    return ovsrcu_get(struct virtio_net *, &dev->virtio_dev);
-}
-
 struct ingress_policer *
 netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
 {
     return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
 }
 
-/*
- * These callbacks allow virtio-net devices to be added to vhost ports when
- * configuration has been fully complete.
- */
-static const struct virtio_net_device_ops virtio_net_device_ops =
-{
-    .new_device =  new_device,
-    .destroy_device = destroy_device,
-    .vring_state_changed = vring_state_changed
-};
-
-static void *
-start_vhost_loop(void *dummy OVS_UNUSED)
-{
-     pthread_detach(pthread_self());
-     /* Put the vhost thread into quiescent state. */
-     ovsrcu_quiesce_start();
-     rte_vhost_driver_session_start();
-     return NULL;
-}
-
 static int
 dpdk_vhost_class_init(void)
 {
-    rte_vhost_driver_callback_register(&virtio_net_device_ops);
-    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
-                            | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                            | 1ULL << VIRTIO_NET_F_CSUM);
-
-    ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
+    rte_eth_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
+                                | 1ULL << VIRTIO_NET_F_HOST_TSO6
+                                | 1ULL << VIRTIO_NET_F_CSUM);
     return 0;
 }
 
@@ -2515,7 +2233,16 @@  netdev_dpdk_ring_send(struct netdev *netdev, int qid,
         dp_packet_rss_invalidate(pkts[i]);
     }
 
+    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
+        qid = qid % dev->up.n_txq;
+        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
+    }
+
     netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
+
+    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
+        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
+    }
     return 0;
 }
 
@@ -2806,7 +2533,6 @@  static int
 netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
-    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
     int err = 0;
 
     ovs_mutex_lock(&dpdk_mutex);
@@ -2822,6 +2548,8 @@  netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
 
     netdev_dpdk_remap_txqs(dev);
 
+    dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;
+
     if (dev->requested_socket_id != dev->socket_id) {
         dev->socket_id = dev->requested_socket_id;
         /* Change mempool to new NUMA Node */
@@ -2832,10 +2560,6 @@  netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
         }
     }
 
-    if (virtio_dev) {
-        virtio_dev->flags |= VIRTIO_DEV_RUNNING;
-    }
-
     ovs_mutex_unlock(&dev->mutex);
     ovs_mutex_unlock(&dpdk_mutex);
 
@@ -3329,12 +3053,12 @@  static const struct netdev_class OVS_UNUSED dpdk_vhost_class =
         NULL,
         NULL,
         netdev_dpdk_vhost_send,
-        netdev_dpdk_vhost_get_carrier,
-        netdev_dpdk_vhost_get_stats,
+        netdev_dpdk_get_carrier,
+        netdev_dpdk_get_stats,
         NULL,
         NULL,
         netdev_dpdk_vhost_reconfigure,
-        netdev_dpdk_vhost_rxq_recv);
+        netdev_dpdk_rxq_recv);
 
 void
 netdev_dpdk_register(void)