diff mbox

[ovs-dev] netdev-dpdk: Add vHost User PMD

Message ID 1461241230-5323-1-git-send-email-ciara.loftus@intel.com
State Changes Requested
Headers show

Commit Message

Ciara Loftus April 21, 2016, 12:20 p.m. UTC
DPDK 16.04 introduces the vHost PMD which allows 'dpdkvhostuser' ports
to be controlled by the librte_ether API, like physical 'dpdk' ports.
The commit integrates this functionality into OVS, and refactors some
of the existing vhost code such that it is vhost-cuse specific.
Similarly, there is now some overlap between dpdk and vhost-user port
code.

Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
---
 INSTALL.DPDK.md   |  12 ++
 NEWS              |   2 +
 lib/netdev-dpdk.c | 515 +++++++++++++++++++++++++-----------------------------
 3 files changed, 254 insertions(+), 275 deletions(-)
 mode change 100644 => 100755 lib/netdev-dpdk.c

Comments

Kevin Traynor April 28, 2016, 2:16 p.m. UTC | #1
On 21/04/2016 13:20, Ciara Loftus wrote:
> DPDK 16.04 introduces the vHost PMD which allows 'dpdkvhostuser' ports
> to be controlled by the librte_ether API, like physical 'dpdk' ports.
> The commit integrates this functionality into OVS, and refactors some
> of the existing vhost code such that it is vhost-cuse specific.
> Similarly, there is now some overlap between dpdk and vhost-user port
> code.
>
> Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
> ---
>   INSTALL.DPDK.md   |  12 ++
>   NEWS              |   2 +
>   lib/netdev-dpdk.c | 515 +++++++++++++++++++++++++-----------------------------

Hi Ciara, there's a lot of churn in this file. It might be worth 
considering to see if it could be split into a few commits to 
help reviewers. e.g. new features like adding get_features, get_status 
for vhost could be a separate patch at least.

>   3 files changed, 254 insertions(+), 275 deletions(-)
>   mode change 100644 => 100755 lib/netdev-dpdk.c

file permission change.

>
> diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
> index 7f76df8..5006812 100644
> --- a/INSTALL.DPDK.md
> +++ b/INSTALL.DPDK.md
> @@ -945,6 +945,18 @@ Restrictions:
>       increased to the desired number of queues. Both DPDK and OVS must be
>       recompiled for this change to take effect.
>
> +  DPDK 'eth' type ports:
> +  - dpdk, dpdkr and dpdkvhostuser ports are 'eth' type ports in the context of
> +    DPDK as they are all managed by the rte_ether API. This means that they
> +    adhere to the DPDK configuration option CONFIG_RTE_MAX_ETHPORTS which by
> +    default is set to 32. This means by default the combined total number of
> +    dpdk, dpdkr and dpdkvhostuser ports allowable in OVS with DPDK is 32. This
> +    value can be changed if desired by modifying the configuration file in
> +    DPDK, or by overriding the default value on the command line when building
> +    DPDK. eg.
> +
> +        `make install CONFIG_RTE_MAX_ETHPORTS=64`

format is not registering right for this in my md viewer.

> +
>   Bug Reporting:
>   --------------
>
> diff --git a/NEWS b/NEWS
> index ea7f3a1..4dc0201 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -26,6 +26,8 @@ Post-v2.5.0
>          assignment.
>        * Type of log messages from PMD threads changed from INFO to DBG.
>        * QoS functionality with sample egress-policer implementation.
> +     * vHost PMD integration brings vhost-user ports under control of the
> +       rte_ether DPDK API.
>      - ovs-benchmark: This utility has been removed due to lack of use and
>        bitrot.
>      - ovs-appctl:
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> old mode 100644
> new mode 100755
> index 208c5f5..4fccd63
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -56,6 +56,7 @@
>   #include "rte_mbuf.h"
>   #include "rte_meter.h"
>   #include "rte_virtio_net.h"
> +#include "rte_eth_vhost.h"

nit: generally these go in alphabetical order.

>
>   VLOG_DEFINE_THIS_MODULE(dpdk);
>   static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
> @@ -109,6 +110,8 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
>
>   static char *cuse_dev_name = NULL;    /* Character device cuse_dev_name. */
>   static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */
> +/* Array that tracks the used & unused vHost user driver IDs */
> +static unsigned int vhost_user_drv_ids[RTE_MAX_ETHPORTS];
>
>   /*
>    * Maximum amount of time in micro seconds to try and enqueue to vhost.
> @@ -143,7 +146,8 @@ enum { DRAIN_TSC = 200000ULL };
>
>   enum dpdk_dev_type {
>       DPDK_DEV_ETH = 0,
> -    DPDK_DEV_VHOST = 1,
> +    DPDK_DEV_VHOST_USER = 1,
> +    DPDK_DEV_VHOST_CUSE = 2,
>   };
>
>   static int rte_eal_init_ret = ENODEV;
> @@ -275,8 +279,6 @@ struct dpdk_tx_queue {
>                                       * from concurrent access.  It is used only
>                                       * if the queue is shared among different
>                                       * pmd threads (see 'txq_needs_locking'). */
> -    int map;                       /* Mapping of configured vhost-user queues
> -                                    * to enabled by guest. */
>       uint64_t tsc;
>       struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
>   };
> @@ -329,12 +331,22 @@ struct netdev_dpdk {
>       int real_n_rxq;
>       bool txq_needs_locking;
>
> -    /* virtio-net structure for vhost device */
> +    /* Spinlock for vhost cuse transmission. Other DPDK devices use spinlocks
> +     * in dpdk_tx_queue */
> +    rte_spinlock_t vhost_cuse_tx_lock;

Why can't we continue to use the lock in dpdk_tx_queue? rather than 
adding a cuse specific lock.

> +
> +    /* virtio-net structure for vhost cuse device */
>       OVSRCU_TYPE(struct virtio_net *) virtio_dev;
>
>       /* Identifier used to distinguish vhost devices from each other */
>       char vhost_id[PATH_MAX];
>
> +    /* ID of vhost user port given to the PMD driver */
> +    unsigned int vhost_pmd_id;
> +
> +    /* Number of virtqueue pairs reported by the guest */
> +    uint32_t reported_queues;

Is this useful? we could just use the real_n_*xq's directly.

> +
>       /* In dpdk_list. */
>       struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
>
> @@ -352,13 +364,20 @@ struct netdev_rxq_dpdk {
>   static bool dpdk_thread_is_pmd(void);
>
>   static int netdev_dpdk_construct(struct netdev *);
> +static int netdev_dpdk_vhost_user_construct(struct netdev *);
>
>   struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);
>
> +void vring_state_changed_callback(uint8_t port_id,
> +        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
> +void device_state_changed_callback(uint8_t port_id,
> +        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
> +
>   static bool
>   is_dpdk_class(const struct netdev_class *class)
>   {
> -    return class->construct == netdev_dpdk_construct;
> +    return ((class->construct == netdev_dpdk_construct) ||
> +            (class->construct == netdev_dpdk_vhost_user_construct));

You should probably change the name of the function to something more 
intuitive now that it's not just for a "dpdk" netdev class.

>   }
>
>   /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
> @@ -581,7 +600,9 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>           }
>
>           dev->up.n_rxq = n_rxq;
> -        dev->real_n_txq = n_txq;
> +        if (dev->type == DPDK_DEV_ETH) {
> +            dev->real_n_txq = n_txq;
> +        }

This needs a comment to explain why you've added this check.

>
>           return 0;
>       }
> @@ -672,11 +693,72 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *dev, unsigned int n_txqs)
>               /* Queues are shared among CPUs. Always flush */
>               dev->tx_q[i].flush_tx = true;
>           }
> +    }
> +}
>
> -        /* Initialize map for vhost devices. */
> -        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
> -        rte_spinlock_init(&dev->tx_q[i].tx_lock);
> +void
> +vring_state_changed_callback(uint8_t port_id,
> +                             enum rte_eth_event_type type OVS_UNUSED,
> +                             void *param OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev;
> +    struct rte_eth_vhost_queue_event event;
> +    int err = 0;
> +
> +    err = rte_eth_vhost_get_queue_event(port_id, &event);
> +    if (err || (event.rx == 1)) {
> +        return;
>       }
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> +        if (port_id == dev->port_id) {
> +            ovs_mutex_lock(&dev->mutex);
> +            if (event.enable) {
> +                dev->reported_queues++;
> +            } else {
> +                dev->reported_queues--;
> +            }
> +            dev->real_n_rxq = dev->reported_queues;
> +            dev->real_n_txq = dev->reported_queues;
> +            dev->txq_needs_locking = true;

How do you know locking is needed at this point? Oh I just realised this 
is in the existing code. Seems like we should put in a check against 
netdev->n_txq like elsewhere in the code.

> +            netdev_dpdk_alloc_txq(dev, dev->real_n_txq);
> +            ovs_mutex_unlock(&dev->mutex);
> +            break;
> +        }
> +    }
> +    ovs_mutex_unlock(&dpdk_mutex);
> +
> +    return;
> +}
> +
> +void
> +device_state_changed_callback(uint8_t port_id,
> +                              enum rte_eth_event_type type OVS_UNUSED,
> +                              void *param OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev;
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> +        if (port_id == dev->port_id) {
> +            ovs_mutex_lock(&dev->mutex);
> +            check_link_status(dev);
> +            if (dev->link.link_status == ETH_LINK_UP) {
> +                /* new device */
> +                VLOG_INFO("vHost Device '%s' has been added", dev->vhost_id);
> +
> +            } else {
> +                /* destroy device */
> +                VLOG_INFO("vHost Device '%s' has been removed", dev->vhost_id);
> +            }
> +            ovs_mutex_unlock(&dev->mutex);
> +            break;
> +        }
> +    }
> +    ovs_mutex_unlock(&dpdk_mutex);
> +
> +    return;
>   }
>
>   static int
> @@ -688,6 +770,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>       int sid;
>       int err = 0;
>       uint32_t buf_size;
> +    unsigned int n_queues = 0;
>
>       ovs_mutex_init(&dev->mutex);
>       ovs_mutex_lock(&dev->mutex);
> @@ -697,7 +780,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>       /* If the 'sid' is negative, it means that the kernel fails
>        * to obtain the pci numa info.  In that situation, always
>        * use 'SOCKET0'. */
> -    if (type == DPDK_DEV_ETH) {
> +    if (type != DPDK_DEV_VHOST_CUSE) {
>           sid = rte_eth_dev_socket_id(port_no);

Are you trying to sneak in your NUMA changes for vhostuser ;-)

>       } else {
>           sid = rte_lcore_to_socket_id(rte_get_master_lcore());
> @@ -709,6 +792,8 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>       dev->flags = 0;
>       dev->mtu = ETHER_MTU;
>       dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
> +    dev->vhost_pmd_id = 0;

0 is a valid vhost_pmd_id number. Seen as it's used in the destruct it 
might be better to set to an invalid value like MAX and check for that 
in destruct, otherwise you're relying on higher layer to ensure that 
destruct is never called before construct. edit: actually i've another 
comment wrt vhost_pmd_id below.

> +    dev->reported_queues = 0;
>
>       buf_size = dpdk_buf_size(dev->mtu);
>       dev->dpdk_mp = dpdk_mp_get(dev->socket_id, FRAME_LEN_TO_MTU(buf_size));
> @@ -726,14 +811,23 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>       netdev->requested_n_rxq = NR_QUEUE;
>       dev->real_n_txq = NR_QUEUE;
>
> -    if (type == DPDK_DEV_ETH) {
> -        netdev_dpdk_alloc_txq(dev, NR_QUEUE);
> +    n_queues = (type == DPDK_DEV_VHOST_CUSE) ? OVS_VHOST_MAX_QUEUE_NUM :
> +                                               NR_QUEUE;

No need for special cuse case above.

> +    netdev_dpdk_alloc_txq(dev, n_queues);
> +    if (type != DPDK_DEV_VHOST_CUSE) {
>           err = dpdk_eth_dev_init(dev);
>           if (err) {
>               goto unlock;
>           }
> -    } else {
> -        netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);
> +    }
> +
> +    if (type == DPDK_DEV_VHOST_USER) {
> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_QUEUE_STATE,
> +                                     (void*)vring_state_changed_callback,
> +                                      NULL);
> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,
> +                                     (void*)device_state_changed_callback,

Seems like this using this callback is a different way to do the same 
thing as the watchdog timer - could we consolidate them and simplify?

> +                                      NULL);
>       }
>
>       ovs_list_push_back(&dpdk_list, &dev->list_node);
> @@ -768,26 +862,37 @@ dpdk_dev_parse_name(const char dev_name[], const char prefix[],
>   }
>
>   static int
> -vhost_construct_helper(struct netdev *netdev) OVS_REQUIRES(dpdk_mutex)
> +netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
>   {
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    int err;
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));
>       if (rte_eal_init_ret) {
>           return rte_eal_init_ret;
>       }
>
> -    return netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);
> +    rte_spinlock_init(&dev->vhost_cuse_tx_lock);
> +
> +    err = netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST_CUSE);
> +
> +    ovs_mutex_unlock(&dpdk_mutex);
> +    return err;
>   }
>
>   static int
> -netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
> +get_vhost_user_drv_id(void)
>   {
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    int err;
> +    int i = 0;
>
> -    ovs_mutex_lock(&dpdk_mutex);
> -    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));
> -    err = vhost_construct_helper(netdev);
> -    ovs_mutex_unlock(&dpdk_mutex);
> -    return err;
> +    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
> +        if (vhost_user_drv_ids[i] == 0) {

This is treating RTE_MAX_ETHPORTS as the max vhost ports. I didn't check 
but it sounds like it's for all DPDK ports and not just vhost. If true 
it means the code would allow >RTE_MAX_ETHPORTS.

> +            return i;
> +        }
> +    }
> +
> +    return -1;
>   }
>
>   static int
> @@ -796,6 +901,9 @@ netdev_dpdk_vhost_user_construct(struct netdev *netdev)
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>       const char *name = netdev->name;
>       int err;
> +    uint8_t port_no = 0;
> +    char devargs[4096];

magic number - any define we can use?

> +    int driver_id = 0;
>
>       /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
>        * the file system. '/' or '\' would traverse directories, so they're not
> @@ -807,22 +915,35 @@ netdev_dpdk_vhost_user_construct(struct netdev *netdev)
>           return EINVAL;
>       }
>
> +    if (rte_eal_init_ret) {
> +        return rte_eal_init_ret;
> +    }
> +

it's more consistent to check this before the name.

>       ovs_mutex_lock(&dpdk_mutex);
>       /* Take the name of the vhost-user port and append it to the location where
>        * the socket is to be created, then register the socket.
>        */
>       snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s",
>                vhost_sock_dir, name);
> +    driver_id = get_vhost_user_drv_id();
> +    if (driver_id == -1) {
> +        VLOG_ERR("Too many vhost-user devices registered with PMD");
> +        err = ENODEV;
> +    } else {
> +        snprintf(devargs, sizeof(devargs), "eth_vhost%u,iface=%s,queues=%i",
> +                 driver_id, dev->vhost_id, RTE_MAX_QUEUES_PER_PORT);
> +        err = rte_eth_dev_attach(devargs, &port_no);
> +    }
>
> -    err = rte_vhost_driver_register(dev->vhost_id);
>       if (err) {
> -        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
> -                 dev->vhost_id);
> +        VLOG_ERR("Failed to attach vhost-user device to DPDK");

It might be a bit confusing getting this error message in some cases 
where you have tried to attach and failed and in other cases where it 
has not been tried because of too many vhost-user devices. Also the 
socket name would be handy.

>       } else {
>           fatal_signal_add_file_to_unlink(dev->vhost_id);
>           VLOG_INFO("Socket %s created for vhost-user port %s\n",
>                     dev->vhost_id, name);
> -        err = vhost_construct_helper(netdev);
> +        dev->vhost_pmd_id = driver_id;

You're setting this but then it gets reset when you call 
netdev_dpdk_init(). So when you destruct you'll reset the wrong 
index in vhost_user_drv_ids[] ?

> +        vhost_user_drv_ids[dev->vhost_pmd_id] = 1;
> +        err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_VHOST_USER);
>       }
>
>       ovs_mutex_unlock(&dpdk_mutex);
> @@ -857,6 +978,12 @@ netdev_dpdk_destruct(struct netdev *netdev)
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
>       ovs_mutex_lock(&dev->mutex);
> +
> +    if (dev->type == DPDK_DEV_VHOST_USER) {
> +        rte_eth_dev_detach(dev->port_id, dev->vhost_id);
> +        vhost_user_drv_ids[dev->vhost_pmd_id] = 0;
> +    }
> +
>       rte_eth_dev_stop(dev->port_id);
>       ovs_mutex_unlock(&dev->mutex);
>
> @@ -868,7 +995,7 @@ netdev_dpdk_destruct(struct netdev *netdev)
>   }
>
>   static void
> -netdev_dpdk_vhost_destruct(struct netdev *netdev)
> +netdev_dpdk_vhost_cuse_destruct(struct netdev *netdev)
>   {
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
> @@ -876,15 +1003,8 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev)
>       if (netdev_dpdk_get_virtio(dev) != NULL) {
>           VLOG_ERR("Removing port '%s' while vhost device still attached.",
>                    netdev->name);
> -        VLOG_ERR("To restore connectivity after re-adding of port, VM on socket"
> -                 " '%s' must be restarted.",
> -                 dev->vhost_id);
> -    }
> -
> -    if (rte_vhost_driver_unregister(dev->vhost_id)) {
> -        VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);
> -    } else {
> -        fatal_signal_remove_file_to_unlink(dev->vhost_id);
> +        VLOG_ERR("To restore connectivity after re-adding of port, VM with"
> +                 "port '%s' must be restarted.", dev->vhost_id);
>       }
>
>       ovs_mutex_lock(&dpdk_mutex);
> @@ -1000,30 +1120,6 @@ netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned int n_txq,
>       netdev->n_txq = n_txq;
>       dev->real_n_txq = 1;
>       netdev->n_rxq = 1;
> -    dev->txq_needs_locking = dev->real_n_txq != netdev->n_txq;
> -
> -    ovs_mutex_unlock(&dev->mutex);
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    return err;
> -}
> -
> -static int
> -netdev_dpdk_vhost_set_multiq(struct netdev *netdev, unsigned int n_txq,
> -                             unsigned int n_rxq)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    int err = 0;
> -
> -    if (netdev->n_txq == n_txq && netdev->n_rxq == n_rxq) {
> -        return err;
> -    }
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    ovs_mutex_lock(&dev->mutex);
> -
> -    netdev->n_txq = n_txq;
> -    netdev->n_rxq = n_rxq;
>
>       ovs_mutex_unlock(&dev->mutex);
>       ovs_mutex_unlock(&dpdk_mutex);
> @@ -1118,13 +1214,13 @@ dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
>   }
>
>   static bool
> -is_vhost_running(struct virtio_net *virtio_dev)
> +is_vhost_cuse_running(struct virtio_net *virtio_dev)
>   {
>       return (virtio_dev != NULL && (virtio_dev->flags & VIRTIO_DEV_RUNNING));
>   }
>
>   static inline void
> -netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
> +netdev_dpdk_vhost_cuse_update_rx_counters(struct netdev_stats *stats,
>                                        struct dp_packet **packets, int count)
>   {
>       int i;
> @@ -1153,26 +1249,22 @@ netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
>   }
>
>   /*
> - * The receive path for the vhost port is the TX path out from guest.
> + * The receive path for the vhost cuse port is the TX path out from guest.
>    */
>   static int
> -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
> +netdev_dpdk_vhost_cuse_rxq_recv(struct netdev_rxq *rxq,
>                              struct dp_packet **packets, int *c)
>   {
>       struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
>       struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
> -    int qid = rxq->queue_id;
> +    int qid = 1;
>       uint16_t nb_rx = 0;
>
> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {
> +    if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) {
>           return EAGAIN;
>       }
>
> -    if (rxq->queue_id >= dev->real_n_rxq) {
> -        return EOPNOTSUPP;
> -    }
> -
> -    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM + VIRTIO_TXQ,
> +    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid,
>                                       dev->dpdk_mp->mp,
>                                       (struct rte_mbuf **)packets,
>                                       NETDEV_MAX_BURST);
> @@ -1181,7 +1273,7 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
>       }
>
>       rte_spinlock_lock(&dev->stats_lock);
> -    netdev_dpdk_vhost_update_rx_counters(&dev->stats, packets, nb_rx);
> +    netdev_dpdk_vhost_cuse_update_rx_counters(&dev->stats, packets, nb_rx);
>       rte_spinlock_unlock(&dev->stats_lock);
>
>       *c = (int) nb_rx;
> @@ -1217,6 +1309,18 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet **packets,
>       return 0;
>   }
>
> +static int
> +netdev_dpdk_vhost_user_rxq_recv(struct netdev_rxq *rxq,
> +                                struct dp_packet **packets, int *c)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> +
> +    if (rxq->queue_id >= dev->real_n_rxq) {
> +        return EOPNOTSUPP;
> +    }
> +
> +    return netdev_dpdk_rxq_recv(rxq, packets, c);
> +}
>   static inline int
>   netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>                           int cnt)
> @@ -1235,7 +1339,7 @@ netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>   }
>
>   static inline void
> -netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
> +netdev_dpdk_vhost_cuse_update_tx_counters(struct netdev_stats *stats,
>                                        struct dp_packet **packets,
>                                        int attempted,
>                                        int dropped)
> @@ -1252,9 +1356,8 @@ netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
>   }
>
>   static void
> -__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> -                         struct dp_packet **pkts, int cnt,
> -                         bool may_steal)
> +__netdev_dpdk_vhost_cuse_send(struct netdev *netdev, struct dp_packet **pkts,
> +                         int cnt, bool may_steal)
>   {
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>       struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
> @@ -1263,26 +1366,24 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>       unsigned int qos_pkts = cnt;
>       uint64_t start = 0;
>
> -    qid = dev->tx_q[qid % dev->real_n_txq].map;
> -
> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0)) {
> +    if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) {
>           rte_spinlock_lock(&dev->stats_lock);
>           dev->stats.tx_dropped+= cnt;
>           rte_spinlock_unlock(&dev->stats_lock);
>           goto out;
>       }
>
> -    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +    /* There is a single vhost-cuse TX queue, So we need to lock it for TX. */
> +    rte_spinlock_lock(&dev->vhost_cuse_tx_lock);
>
>       /* Check has QoS has been configured for the netdev */
>       cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt);
>       qos_pkts -= cnt;
>
>       do {
> -        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
>           unsigned int tx_pkts;
>
> -        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid,
> +        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, VIRTIO_RXQ,
>                                             cur_pkts, cnt);
>           if (OVS_LIKELY(tx_pkts)) {
>               /* Packets have been sent.*/
> @@ -1301,7 +1402,7 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>                * Unable to enqueue packets to vhost interface.
>                * Check available entries before retrying.
>                */
> -            while (!rte_vring_available_entries(virtio_dev, vhost_qid)) {
> +            while (!rte_vring_available_entries(virtio_dev, VIRTIO_RXQ)) {
>                   if (OVS_UNLIKELY((rte_get_timer_cycles() - start) > timeout)) {
>                       expired = 1;
>                       break;
> @@ -1313,12 +1414,12 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>               }
>           }
>       } while (cnt);
> -
> -    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> +    rte_spinlock_unlock(&dev->vhost_cuse_tx_lock);
>
>       rte_spinlock_lock(&dev->stats_lock);
>       cnt += qos_pkts;
> -    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, cnt);
> +    netdev_dpdk_vhost_cuse_update_tx_counters(&dev->stats, pkts, total_pkts,
> +                                              cnt);
>       rte_spinlock_unlock(&dev->stats_lock);
>
>   out:
> @@ -1412,8 +1513,9 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
>           newcnt++;
>       }
>
> -    if (dev->type == DPDK_DEV_VHOST) {
> -        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs, newcnt, true);
> +    if (dev->type == DPDK_DEV_VHOST_CUSE) {
> +        __netdev_dpdk_vhost_cuse_send(netdev, (struct dp_packet **) mbufs,
> +                                      newcnt, true);
>       } else {
>           unsigned int qos_pkts = newcnt;
>
> @@ -1437,8 +1539,8 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
>   }
>
>   static int
> -netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,
> -                 int cnt, bool may_steal)
> +netdev_dpdk_vhost_cuse_send(struct netdev *netdev, int qid OVS_UNUSED,
> +                            struct dp_packet **pkts, int cnt, bool may_steal)
>   {
>       if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {
>           int i;
> @@ -1450,7 +1552,7 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,
>               }
>           }
>       } else {
> -        __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);
> +        __netdev_dpdk_vhost_cuse_send(netdev, pkts, cnt, may_steal);
>       }
>       return 0;
>   }
> @@ -1461,11 +1563,6 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>   {

Now that this handles buffers for vhost, how is may_steal on/off catered 
for?

>       int i;
>
> -    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
> -        qid = qid % dev->real_n_txq;
> -        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> -    }
> -

why is the possible locking removed?

>       if (OVS_UNLIKELY(!may_steal ||
>                        pkts[0]->source != DPBUF_DPDK)) {
>           struct netdev *netdev = &dev->up;
> @@ -1541,6 +1638,18 @@ netdev_dpdk_eth_send(struct netdev *netdev, int qid,
>   }
>
>   static int
> +netdev_dpdk_vhost_user_send(struct netdev *netdev, int qid,
> +        struct dp_packet **pkts, int cnt, bool may_steal)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    qid = qid % dev->real_n_txq;

Due to the modulo, 2 cores can be trying to send on the same queue which 
makes for lock contention. The code was changed to avoid that but now 
the code to avoid and the locking seems to be gone so it could cause 
corruption.

> +
> +    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
> +    return 0;
> +}
> +
> +static int
>   netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
>   {
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> @@ -1634,7 +1743,7 @@ static int
>   netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
>
>   static int
> -netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
> +netdev_dpdk_vhost_cuse_get_stats(const struct netdev *netdev,
>                               struct netdev_stats *stats)
>   {
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> @@ -1796,14 +1905,14 @@ netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
>   }
>
>   static int
> -netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
> +netdev_dpdk_vhost_cuse_get_carrier(const struct netdev *netdev, bool *carrier)
>   {
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>       struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
>
>       ovs_mutex_lock(&dev->mutex);
>
> -    if (is_vhost_running(virtio_dev)) {
> +    if (is_vhost_cuse_running(virtio_dev)) {
>           *carrier = 1;
>       } else {
>           *carrier = 0;
> @@ -1853,7 +1962,7 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
>           return 0;
>       }
>
> -    if (dev->type == DPDK_DEV_ETH) {
> +    if (dev->type != DPDK_DEV_VHOST_CUSE) {
>           if (dev->flags & NETDEV_UP) {
>               err = rte_eth_dev_start(dev->port_id);
>               if (err)
> @@ -1998,75 +2107,7 @@ set_irq_status(struct virtio_net *virtio_dev)
>   }
>
>   /*
> - * Fixes mapping for vhost-user tx queues. Must be called after each
> - * enabling/disabling of queues and real_n_txq modifications.
> - */
> -static void
> -netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
> -    OVS_REQUIRES(dev->mutex)
> -{
> -    int *enabled_queues, n_enabled = 0;
> -    int i, k, total_txqs = dev->real_n_txq;
> -
> -    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);
> -
> -    for (i = 0; i < total_txqs; i++) {
> -        /* Enabled queues always mapped to themselves. */
> -        if (dev->tx_q[i].map == i) {
> -            enabled_queues[n_enabled++] = i;
> -        }
> -    }
> -
> -    if (n_enabled == 0 && total_txqs != 0) {
> -        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
> -        n_enabled = 1;
> -    }
> -
> -    k = 0;
> -    for (i = 0; i < total_txqs; i++) {
> -        if (dev->tx_q[i].map != i) {
> -            dev->tx_q[i].map = enabled_queues[k];
> -            k = (k + 1) % n_enabled;
> -        }
> -    }
> -
> -    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
> -    for (i = 0; i < total_txqs; i++) {
> -        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
> -    }
> -
> -    rte_free(enabled_queues);
> -}
> -
> -static int
> -netdev_dpdk_vhost_set_queues(struct netdev_dpdk *dev, struct virtio_net *virtio_dev)
> -    OVS_REQUIRES(dev->mutex)
> -{
> -    uint32_t qp_num;
> -
> -    qp_num = virtio_dev->virt_qp_nb;
> -    if (qp_num > dev->up.n_rxq) {
> -        VLOG_ERR("vHost Device '%s' %"PRIu64" can't be added - "
> -                 "too many queues %d > %d", virtio_dev->ifname, virtio_dev->device_fh,
> -                 qp_num, dev->up.n_rxq);
> -        return -1;
> -    }
> -
> -    dev->real_n_rxq = qp_num;
> -    dev->real_n_txq = qp_num;
> -    dev->txq_needs_locking = true;
> -    /* Enable TX queue 0 by default if it wasn't disabled. */
> -    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
> -        dev->tx_q[0].map = 0;
> -    }
> -
> -    netdev_dpdk_remap_txqs(dev);
> -
> -    return 0;
> -}
> -
> -/*
> - * A new virtio-net device is added to a vhost port.
> + * A new virtio-net device is added to a vhost cuse port.
>    */
>   static int
>   new_device(struct virtio_net *virtio_dev)
> @@ -2079,11 +2120,6 @@ new_device(struct virtio_net *virtio_dev)
>       LIST_FOR_EACH(dev, list_node, &dpdk_list) {
>           if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
>               ovs_mutex_lock(&dev->mutex);
> -            if (netdev_dpdk_vhost_set_queues(dev, virtio_dev)) {
> -                ovs_mutex_unlock(&dev->mutex);
> -                ovs_mutex_unlock(&dpdk_mutex);
> -                return -1;
> -            }
>               ovsrcu_set(&dev->virtio_dev, virtio_dev);
>               exists = true;
>               virtio_dev->flags |= VIRTIO_DEV_RUNNING;
> @@ -2107,23 +2143,11 @@ new_device(struct virtio_net *virtio_dev)
>       return 0;
>   }
>
> -/* Clears mapping for all available queues of vhost interface. */
> -static void
> -netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
> -    OVS_REQUIRES(dev->mutex)
> -{
> -    int i;
> -
> -    for (i = 0; i < dev->real_n_txq; i++) {
> -        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
> -    }
> -}
> -
>   /*
> - * Remove a virtio-net device from the specific vhost port.  Use dev->remove
> - * flag to stop any more packets from being sent or received to/from a VM and
> - * ensure all currently queued packets have been sent/received before removing
> - *  the device.
> + * Remove a virtio-net device from the specific vhost cuse port. Use
> + * dev->remove flag to stop any more packets from being sent or received
> + * to/from a VM and ensure all currently queued packets have been sent/received
> + * before removing the device.
>    */
>   static void
>   destroy_device(volatile struct virtio_net *virtio_dev)
> @@ -2138,7 +2162,6 @@ destroy_device(volatile struct virtio_net *virtio_dev)
>               ovs_mutex_lock(&dev->mutex);
>               virtio_dev->flags &= ~VIRTIO_DEV_RUNNING;
>               ovsrcu_set(&dev->virtio_dev, NULL);
> -            netdev_dpdk_txq_map_clear(dev);
>               exists = true;
>               ovs_mutex_unlock(&dev->mutex);
>               break;
> @@ -2166,49 +2189,6 @@ destroy_device(volatile struct virtio_net *virtio_dev)
>       }
>   }
>
> -static int
> -vring_state_changed(struct virtio_net *virtio_dev, uint16_t queue_id,
> -                    int enable)
> -{
> -    struct netdev_dpdk *dev;
> -    bool exists = false;
> -    int qid = queue_id / VIRTIO_QNUM;
> -
> -    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
> -        return 0;
> -    }
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> -        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
> -            ovs_mutex_lock(&dev->mutex);
> -            if (enable) {
> -                dev->tx_q[qid].map = qid;
> -            } else {
> -                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
> -            }
> -            netdev_dpdk_remap_txqs(dev);
> -            exists = true;
> -            ovs_mutex_unlock(&dev->mutex);
> -            break;
> -        }
> -    }
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    if (exists) {
> -        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"
> -                  PRIu64" changed to \'%s\'", queue_id, qid,
> -                  virtio_dev->ifname, virtio_dev->device_fh,
> -                  (enable == 1) ? "enabled" : "disabled");
> -    } else {
> -        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev->ifname,
> -                  virtio_dev->device_fh);
> -        return -1;
> -    }
> -
> -    return 0;
> -}
> -
>   struct virtio_net *
>   netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
>   {
> @@ -2216,18 +2196,17 @@ netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
>   }
>
>   /*
> - * These callbacks allow virtio-net devices to be added to vhost ports when
> - * configuration has been fully complete.
> + * These callbacks allow virtio-net devices to be added to vhost cuse ports
> + * when configuration has been fully complete.
>    */
>   static const struct virtio_net_device_ops virtio_net_device_ops =
>   {
>       .new_device =  new_device,
>       .destroy_device = destroy_device,
> -    .vring_state_changed = vring_state_changed
>   };
>
>   static void *
> -start_vhost_loop(void *dummy OVS_UNUSED)
> +start_vhost_cuse_loop(void *dummy OVS_UNUSED)
>   {
>        pthread_detach(pthread_self());
>        /* Put the cuse thread into quiescent state. */
> @@ -2237,19 +2216,7 @@ start_vhost_loop(void *dummy OVS_UNUSED)
>   }
>
>   static int
> -dpdk_vhost_class_init(void)
> -{
> -    rte_vhost_driver_callback_register(&virtio_net_device_ops);
> -    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> -                            | 1ULL << VIRTIO_NET_F_HOST_TSO6
> -                            | 1ULL << VIRTIO_NET_F_CSUM);

I think you'll still want to set these for vhostuser.

> -
> -    ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
> -    return 0;
> -}
> -
> -static int
> -dpdk_vhost_cuse_class_init(void)
> +netdev_dpdk_vhost_cuse_class_init(void)
>   {
>       int err = -1;
>
> @@ -2265,14 +2232,12 @@ dpdk_vhost_cuse_class_init(void)
>           return -1;
>       }
>
> -    dpdk_vhost_class_init();
> -    return 0;
> -}
> +    rte_vhost_driver_callback_register(&virtio_net_device_ops);
> +    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> +                            | 1ULL << VIRTIO_NET_F_HOST_TSO6
> +                            | 1ULL << VIRTIO_NET_F_CSUM);
> +    ovs_thread_create("vhost_cuse_thread", start_vhost_cuse_loop, NULL);
>
> -static int
> -dpdk_vhost_user_class_init(void)
> -{
> -    dpdk_vhost_class_init();
>       return 0;
>   }
>
> @@ -2859,30 +2824,30 @@ static const struct netdev_class dpdk_ring_class =
>   static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =
>       NETDEV_DPDK_CLASS(
>           "dpdkvhostcuse",
> -        dpdk_vhost_cuse_class_init,
> +        netdev_dpdk_vhost_cuse_class_init,
>           netdev_dpdk_vhost_cuse_construct,
> -        netdev_dpdk_vhost_destruct,
> +        netdev_dpdk_vhost_cuse_destruct,
>           netdev_dpdk_vhost_cuse_set_multiq,
> -        netdev_dpdk_vhost_send,
> -        netdev_dpdk_vhost_get_carrier,
> -        netdev_dpdk_vhost_get_stats,
> +        netdev_dpdk_vhost_cuse_send,
> +        netdev_dpdk_vhost_cuse_get_carrier,
> +        netdev_dpdk_vhost_cuse_get_stats,
>           NULL,
>           NULL,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_vhost_cuse_rxq_recv);
>
>   static const struct netdev_class OVS_UNUSED dpdk_vhost_user_class =
>       NETDEV_DPDK_CLASS(
>           "dpdkvhostuser",
> -        dpdk_vhost_user_class_init,
> -        netdev_dpdk_vhost_user_construct,
> -        netdev_dpdk_vhost_destruct,
> -        netdev_dpdk_vhost_set_multiq,
> -        netdev_dpdk_vhost_send,
> -        netdev_dpdk_vhost_get_carrier,
> -        netdev_dpdk_vhost_get_stats,
> -        NULL,
>           NULL,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_vhost_user_construct,
> +        netdev_dpdk_destruct,
> +        netdev_dpdk_set_multiq,
> +        netdev_dpdk_vhost_user_send,
> +        netdev_dpdk_get_carrier,
> +        netdev_dpdk_get_stats,

Are all the phy type stats available for vhost? or do you need to check 
the netdev type in that function and report specific stats. I didn't 
think DPDK would report things like rx_dropped for vhost but I'll be 
glad to be wrong.

> +        netdev_dpdk_get_features,
> +        netdev_dpdk_get_status,
> +        netdev_dpdk_vhost_user_rxq_recv);
>
>   void
>   netdev_dpdk_register(void)
>
Daniele Di Proietto April 30, 2016, 12:23 a.m. UTC | #2
Hi Ciara,

thanks for doing this.  I really think this has the potential to clean up
the netdev-dpdk code.

The clang thread safety analyzer reports some warnings:

  CC       lib/netdev-dpdk.lo
../lib/netdev-dpdk.c:882:1: error: mutex 'dpdk_mutex' is not held on every
path through here [-Werror,-Wthread-safety-analysis]
}
^
../lib/netdev-dpdk.c:870:5: note: mutex acquired here
    ovs_mutex_lock(&dpdk_mutex);
    ^
../include/openvswitch/thread.h:60:9: note: expanded from macro
'ovs_mutex_lock'
        ovs_mutex_lock_at(mutex, OVS_SOURCE_LOCATOR)
        ^
1 error generated.

I see that this patch removes the transmission locks for the txqs. I think
those are still needed by physical NICs.

This patch seem to remove a lot of txq remapping functions (like
netdev_dpdk_remap_txqs).  How does it handle the case of a disabled txq in
the guest kernel?

I see that vhost-cuse is still handled separately. Is it possible to use
the vhost pmd also for vhost-cuse? Otherwise we still basically have to
handle three cases differently: NIC PMD, vhost user pmd, vhost cuse.  Maybe
it's time to remove vhost-cuse (I understand this is a separate issue,
though)?

I get an error when I try this:

ovs-vsctl add-port br0 p1 -- set Interface p1 type="dpdkvhostuser"
ovs-vsctl del-port br0 p1
ovs-vsctl add-port br0 p1 -- set Interface p1 type="dpdkvhostuser"

More comments inline

Thanks!

2016-04-21 5:20 GMT-07:00 Ciara Loftus <ciara.loftus@intel.com>:

> DPDK 16.04 introduces the vHost PMD which allows 'dpdkvhostuser' ports
> to be controlled by the librte_ether API, like physical 'dpdk' ports.
> The commit integrates this functionality into OVS, and refactors some
> of the existing vhost code such that it is vhost-cuse specific.
> Similarly, there is now some overlap between dpdk and vhost-user port
> code.
>
> Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
> ---
>  INSTALL.DPDK.md   |  12 ++
>  NEWS              |   2 +
>  lib/netdev-dpdk.c | 515
> +++++++++++++++++++++++++-----------------------------
>  3 files changed, 254 insertions(+), 275 deletions(-)
>  mode change 100644 => 100755 lib/netdev-dpdk.c
>
> diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
> index 7f76df8..5006812 100644
> --- a/INSTALL.DPDK.md
> +++ b/INSTALL.DPDK.md
> @@ -945,6 +945,18 @@ Restrictions:
>      increased to the desired number of queues. Both DPDK and OVS must be
>      recompiled for this change to take effect.
>
> +  DPDK 'eth' type ports:
> +  - dpdk, dpdkr and dpdkvhostuser ports are 'eth' type ports in the
> context of
> +    DPDK as they are all managed by the rte_ether API. This means that
> they
> +    adhere to the DPDK configuration option CONFIG_RTE_MAX_ETHPORTS which
> by
> +    default is set to 32. This means by default the combined total number
> of
> +    dpdk, dpdkr and dpdkvhostuser ports allowable in OVS with DPDK is 32.
> This
> +    value can be changed if desired by modifying the configuration file in
> +    DPDK, or by overriding the default value on the command line when
> building
> +    DPDK. eg.
> +
> +        `make install CONFIG_RTE_MAX_ETHPORTS=64`
> +
>

This seems a heavy limitation compared to the previous librte_vhost
approach. Are there any plans to increase this in DPDK upstream?


>  Bug Reporting:
>  --------------
>
> diff --git a/NEWS b/NEWS
> index ea7f3a1..4dc0201 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -26,6 +26,8 @@ Post-v2.5.0
>         assignment.
>       * Type of log messages from PMD threads changed from INFO to DBG.
>       * QoS functionality with sample egress-policer implementation.
> +     * vHost PMD integration brings vhost-user ports under control of the
> +       rte_ether DPDK API.
>     - ovs-benchmark: This utility has been removed due to lack of use and
>       bitrot.
>     - ovs-appctl:
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> old mode 100644
> new mode 100755
> index 208c5f5..4fccd63
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -56,6 +56,7 @@
>  #include "rte_mbuf.h"
>  #include "rte_meter.h"
>  #include "rte_virtio_net.h"
> +#include "rte_eth_vhost.h"
>
>  VLOG_DEFINE_THIS_MODULE(dpdk);
>  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
> @@ -109,6 +110,8 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF /
> ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
>
>  static char *cuse_dev_name = NULL;    /* Character device cuse_dev_name.
> */
>  static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */
> +/* Array that tracks the used & unused vHost user driver IDs */
> +static unsigned int vhost_user_drv_ids[RTE_MAX_ETHPORTS];
>
>  /*
>   * Maximum amount of time in micro seconds to try and enqueue to vhost.
> @@ -143,7 +146,8 @@ enum { DRAIN_TSC = 200000ULL };
>
>  enum dpdk_dev_type {
>      DPDK_DEV_ETH = 0,
> -    DPDK_DEV_VHOST = 1,
> +    DPDK_DEV_VHOST_USER = 1,
> +    DPDK_DEV_VHOST_CUSE = 2,
>  };
>
>  static int rte_eal_init_ret = ENODEV;
> @@ -275,8 +279,6 @@ struct dpdk_tx_queue {
>                                      * from concurrent access.  It is used
> only
>                                      * if the queue is shared among
> different
>                                      * pmd threads (see
> 'txq_needs_locking'). */
> -    int map;                       /* Mapping of configured vhost-user
> queues
> -                                    * to enabled by guest. */
>      uint64_t tsc;
>      struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
>  };
> @@ -329,12 +331,22 @@ struct netdev_dpdk {
>      int real_n_rxq;
>      bool txq_needs_locking;
>
> -    /* virtio-net structure for vhost device */
> +    /* Spinlock for vhost cuse transmission. Other DPDK devices use
> spinlocks
> +     * in dpdk_tx_queue */
> +    rte_spinlock_t vhost_cuse_tx_lock;
> +
> +    /* virtio-net structure for vhost cuse device */
>      OVSRCU_TYPE(struct virtio_net *) virtio_dev;
>
>      /* Identifier used to distinguish vhost devices from each other */
>      char vhost_id[PATH_MAX];
>
> +    /* ID of vhost user port given to the PMD driver */
> +    unsigned int vhost_pmd_id;
> +
> +    /* Number of virtqueue pairs reported by the guest */
> +    uint32_t reported_queues;
> +
>      /* In dpdk_list. */
>      struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
>
> @@ -352,13 +364,20 @@ struct netdev_rxq_dpdk {
>  static bool dpdk_thread_is_pmd(void);
>
>  static int netdev_dpdk_construct(struct netdev *);
> +static int netdev_dpdk_vhost_user_construct(struct netdev *);
>
>  struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);
>
> +void vring_state_changed_callback(uint8_t port_id,
> +        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
> +void device_state_changed_callback(uint8_t port_id,
> +        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
> +
>  static bool
>  is_dpdk_class(const struct netdev_class *class)
>  {
> -    return class->construct == netdev_dpdk_construct;
> +    return ((class->construct == netdev_dpdk_construct) ||
> +            (class->construct == netdev_dpdk_vhost_user_construct));
>  }
>
>  /* DPDK NIC drivers allocate RX buffers at a particular granularity,
> typically
> @@ -581,7 +600,9 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int
> n_rxq, int n_txq)
>          }
>
>          dev->up.n_rxq = n_rxq;
> -        dev->real_n_txq = n_txq;
> +        if (dev->type == DPDK_DEV_ETH) {
> +            dev->real_n_txq = n_txq;
> +        }
>
>          return 0;
>      }
> @@ -672,11 +693,72 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *dev,
> unsigned int n_txqs)
>              /* Queues are shared among CPUs. Always flush */
>              dev->tx_q[i].flush_tx = true;
>          }
> +    }
> +}
>
> -        /* Initialize map for vhost devices. */
> -        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
> -        rte_spinlock_init(&dev->tx_q[i].tx_lock);
> +void
> +vring_state_changed_callback(uint8_t port_id,
> +                             enum rte_eth_event_type type OVS_UNUSED,
> +                             void *param OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev;
> +    struct rte_eth_vhost_queue_event event;
> +    int err = 0;
> +
> +    err = rte_eth_vhost_get_queue_event(port_id, &event);
> +    if (err || (event.rx == 1)) {
> +        return;
>      }
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> +        if (port_id == dev->port_id) {
> +            ovs_mutex_lock(&dev->mutex);
> +            if (event.enable) {
> +                dev->reported_queues++;
> +            } else {
> +                dev->reported_queues--;
> +            }
> +            dev->real_n_rxq = dev->reported_queues;
> +            dev->real_n_txq = dev->reported_queues;
> +            dev->txq_needs_locking = true;
> +            netdev_dpdk_alloc_txq(dev, dev->real_n_txq);
> +            ovs_mutex_unlock(&dev->mutex);
> +            break;
> +        }
> +    }
> +    ovs_mutex_unlock(&dpdk_mutex);
> +
> +    return;
> +}
> +
> +void
> +device_state_changed_callback(uint8_t port_id,
> +                              enum rte_eth_event_type type OVS_UNUSED,
> +                              void *param OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev;
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> +        if (port_id == dev->port_id) {
> +            ovs_mutex_lock(&dev->mutex);
> +            check_link_status(dev);
> +            if (dev->link.link_status == ETH_LINK_UP) {
> +                /* new device */
> +                VLOG_INFO("vHost Device '%s' has been added",
> dev->vhost_id);
> +
> +            } else {
> +                /* destroy device */
> +                VLOG_INFO("vHost Device '%s' has been removed",
> dev->vhost_id);
> +            }
> +            ovs_mutex_unlock(&dev->mutex);
> +            break;
> +        }
> +    }
> +    ovs_mutex_unlock(&dpdk_mutex);
> +
> +    return;
>  }
>
>  static int
> @@ -688,6 +770,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int
> port_no,
>      int sid;
>      int err = 0;
>      uint32_t buf_size;
> +    unsigned int n_queues = 0;
>
>      ovs_mutex_init(&dev->mutex);
>      ovs_mutex_lock(&dev->mutex);
> @@ -697,7 +780,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int
> port_no,
>      /* If the 'sid' is negative, it means that the kernel fails
>       * to obtain the pci numa info.  In that situation, always
>       * use 'SOCKET0'. */
> -    if (type == DPDK_DEV_ETH) {
> +    if (type != DPDK_DEV_VHOST_CUSE) {
>          sid = rte_eth_dev_socket_id(port_no);
>      } else {
>          sid = rte_lcore_to_socket_id(rte_get_master_lcore());
> @@ -709,6 +792,8 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int
> port_no,
>      dev->flags = 0;
>      dev->mtu = ETHER_MTU;
>      dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
> +    dev->vhost_pmd_id = 0;
> +    dev->reported_queues = 0;
>
>      buf_size = dpdk_buf_size(dev->mtu);
>      dev->dpdk_mp = dpdk_mp_get(dev->socket_id,
> FRAME_LEN_TO_MTU(buf_size));
> @@ -726,14 +811,23 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int
> port_no,
>      netdev->requested_n_rxq = NR_QUEUE;
>      dev->real_n_txq = NR_QUEUE;
>
> -    if (type == DPDK_DEV_ETH) {
> -        netdev_dpdk_alloc_txq(dev, NR_QUEUE);
> +    n_queues = (type == DPDK_DEV_VHOST_CUSE) ? OVS_VHOST_MAX_QUEUE_NUM :
> +                                               NR_QUEUE;
> +    netdev_dpdk_alloc_txq(dev, n_queues);
> +    if (type != DPDK_DEV_VHOST_CUSE) {
>          err = dpdk_eth_dev_init(dev);
>          if (err) {
>              goto unlock;
>          }
> -    } else {
> -        netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);
> +    }
> +
> +    if (type == DPDK_DEV_VHOST_USER) {
> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_QUEUE_STATE,
> +                                     (void*)vring_state_changed_callback,
> +                                      NULL);
> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,
> +                                     (void*)device_state_changed_callback,
> +                                      NULL);
>      }
>
>      ovs_list_push_back(&dpdk_list, &dev->list_node);
> @@ -768,26 +862,37 @@ dpdk_dev_parse_name(const char dev_name[], const
> char prefix[],
>  }
>
>  static int
> -vhost_construct_helper(struct netdev *netdev) OVS_REQUIRES(dpdk_mutex)
> +netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
>  {
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    int err;
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));
>      if (rte_eal_init_ret) {
>          return rte_eal_init_ret;
>      }
>
> -    return netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);
> +    rte_spinlock_init(&dev->vhost_cuse_tx_lock);
> +
> +    err = netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST_CUSE);
> +
> +    ovs_mutex_unlock(&dpdk_mutex);
> +    return err;
>  }
>
>  static int
> -netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
> +get_vhost_user_drv_id(void)
>  {
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    int err;
> +    int i = 0;
>
> -    ovs_mutex_lock(&dpdk_mutex);
> -    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));
> -    err = vhost_construct_helper(netdev);
> -    ovs_mutex_unlock(&dpdk_mutex);
> -    return err;
> +    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
> +        if (vhost_user_drv_ids[i] == 0) {
> +            return i;
> +        }
> +    }
> +
> +    return -1;
>  }
>
>  static int
> @@ -796,6 +901,9 @@ netdev_dpdk_vhost_user_construct(struct netdev *netdev)
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      const char *name = netdev->name;
>      int err;
> +    uint8_t port_no = 0;
> +    char devargs[4096];
> +    int driver_id = 0;
>
>      /* 'name' is appended to 'vhost_sock_dir' and used to create a socket
> in
>       * the file system. '/' or '\' would traverse directories, so they're
> not
> @@ -807,22 +915,35 @@ netdev_dpdk_vhost_user_construct(struct netdev
> *netdev)
>          return EINVAL;
>      }
>
> +    if (rte_eal_init_ret) {
> +        return rte_eal_init_ret;
> +    }
> +
>      ovs_mutex_lock(&dpdk_mutex);
>      /* Take the name of the vhost-user port and append it to the location
> where
>       * the socket is to be created, then register the socket.
>       */
>      snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s",
>               vhost_sock_dir, name);
> +    driver_id = get_vhost_user_drv_id();
>

I'm not sure I understand the logic behind this code.  Are we trying to
create a unique name with driver_id? What are the requirements for the
device name?

Also, since the OVS Interface name is part of the devargs string, it seems
to me that we should protect against commas (",")



> +    if (driver_id == -1) {
> +        VLOG_ERR("Too many vhost-user devices registered with PMD");
> +        err = ENODEV;
> +    } else {
> +        snprintf(devargs, sizeof(devargs),
> "eth_vhost%u,iface=%s,queues=%i",
> +                 driver_id, dev->vhost_id, RTE_MAX_QUEUES_PER_PORT);
> +        err = rte_eth_dev_attach(devargs, &port_no);
> +    }
>
> -    err = rte_vhost_driver_register(dev->vhost_id);
>      if (err) {
> -        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
> -                 dev->vhost_id);
> +        VLOG_ERR("Failed to attach vhost-user device to DPDK");
>      } else {
>          fatal_signal_add_file_to_unlink(dev->vhost_id);
>          VLOG_INFO("Socket %s created for vhost-user port %s\n",
>                    dev->vhost_id, name);
> -        err = vhost_construct_helper(netdev);
> +        dev->vhost_pmd_id = driver_id;
> +        vhost_user_drv_ids[dev->vhost_pmd_id] = 1;
> +        err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_VHOST_USER);
>      }
>
>      ovs_mutex_unlock(&dpdk_mutex);
> @@ -857,6 +978,12 @@ netdev_dpdk_destruct(struct netdev *netdev)
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
>      ovs_mutex_lock(&dev->mutex);
> +
> +    if (dev->type == DPDK_DEV_VHOST_USER) {
> +        rte_eth_dev_detach(dev->port_id, dev->vhost_id);
> +        vhost_user_drv_ids[dev->vhost_pmd_id] = 0;
> +    }
> +
>      rte_eth_dev_stop(dev->port_id);
>      ovs_mutex_unlock(&dev->mutex);
>
> @@ -868,7 +995,7 @@ netdev_dpdk_destruct(struct netdev *netdev)
>  }
>
>  static void
> -netdev_dpdk_vhost_destruct(struct netdev *netdev)
> +netdev_dpdk_vhost_cuse_destruct(struct netdev *netdev)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
> @@ -876,15 +1003,8 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev)
>      if (netdev_dpdk_get_virtio(dev) != NULL) {
>          VLOG_ERR("Removing port '%s' while vhost device still attached.",
>                   netdev->name);
> -        VLOG_ERR("To restore connectivity after re-adding of port, VM on
> socket"
> -                 " '%s' must be restarted.",
> -                 dev->vhost_id);
> -    }
> -
> -    if (rte_vhost_driver_unregister(dev->vhost_id)) {
> -        VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);
> -    } else {
> -        fatal_signal_remove_file_to_unlink(dev->vhost_id);
> +        VLOG_ERR("To restore connectivity after re-adding of port, VM
> with"
> +                 "port '%s' must be restarted.", dev->vhost_id);
>      }
>
>      ovs_mutex_lock(&dpdk_mutex);
> @@ -1000,30 +1120,6 @@ netdev_dpdk_vhost_cuse_set_multiq(struct netdev
> *netdev, unsigned int n_txq,
>      netdev->n_txq = n_txq;
>      dev->real_n_txq = 1;
>      netdev->n_rxq = 1;
> -    dev->txq_needs_locking = dev->real_n_txq != netdev->n_txq;
> -
> -    ovs_mutex_unlock(&dev->mutex);
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    return err;
> -}
> -
> -static int
> -netdev_dpdk_vhost_set_multiq(struct netdev *netdev, unsigned int n_txq,
> -                             unsigned int n_rxq)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    int err = 0;
> -
> -    if (netdev->n_txq == n_txq && netdev->n_rxq == n_rxq) {
> -        return err;
> -    }
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    ovs_mutex_lock(&dev->mutex);
> -
> -    netdev->n_txq = n_txq;
> -    netdev->n_rxq = n_rxq;
>
>      ovs_mutex_unlock(&dev->mutex);
>      ovs_mutex_unlock(&dpdk_mutex);
> @@ -1118,13 +1214,13 @@ dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
>  }
>
>  static bool
> -is_vhost_running(struct virtio_net *virtio_dev)
> +is_vhost_cuse_running(struct virtio_net *virtio_dev)
>  {
>      return (virtio_dev != NULL && (virtio_dev->flags &
> VIRTIO_DEV_RUNNING));
>  }
>
>  static inline void
> -netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
> +netdev_dpdk_vhost_cuse_update_rx_counters(struct netdev_stats *stats,
>                                       struct dp_packet **packets, int
> count)
>  {
>      int i;
> @@ -1153,26 +1249,22 @@ netdev_dpdk_vhost_update_rx_counters(struct
> netdev_stats *stats,
>  }
>
>  /*
> - * The receive path for the vhost port is the TX path out from guest.
> + * The receive path for the vhost cuse port is the TX path out from guest.
>   */
>  static int
> -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
> +netdev_dpdk_vhost_cuse_rxq_recv(struct netdev_rxq *rxq,
>                             struct dp_packet **packets, int *c)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
>      struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
> -    int qid = rxq->queue_id;
> +    int qid = 1;
>      uint16_t nb_rx = 0;
>
> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {
> +    if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) {
>          return EAGAIN;
>      }
>
> -    if (rxq->queue_id >= dev->real_n_rxq) {
> -        return EOPNOTSUPP;
> -    }
> -
> -    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM +
> VIRTIO_TXQ,
> +    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid,
>                                      dev->dpdk_mp->mp,
>                                      (struct rte_mbuf **)packets,
>                                      NETDEV_MAX_BURST);
> @@ -1181,7 +1273,7 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
>      }
>
>      rte_spinlock_lock(&dev->stats_lock);
> -    netdev_dpdk_vhost_update_rx_counters(&dev->stats, packets, nb_rx);
> +    netdev_dpdk_vhost_cuse_update_rx_counters(&dev->stats, packets,
> nb_rx);
>      rte_spinlock_unlock(&dev->stats_lock);
>
>      *c = (int) nb_rx;
> @@ -1217,6 +1309,18 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct
> dp_packet **packets,
>      return 0;
>  }
>
> +static int
> +netdev_dpdk_vhost_user_rxq_recv(struct netdev_rxq *rxq,
> +                                struct dp_packet **packets, int *c)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> +
> +    if (rxq->queue_id >= dev->real_n_rxq) {
> +        return EOPNOTSUPP;
> +    }
> +
> +    return netdev_dpdk_rxq_recv(rxq, packets, c);
> +}
>  static inline int
>  netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>                          int cnt)
> @@ -1235,7 +1339,7 @@ netdev_dpdk_qos_run__(struct netdev_dpdk *dev,
> struct rte_mbuf **pkts,
>  }
>
>  static inline void
> -netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
> +netdev_dpdk_vhost_cuse_update_tx_counters(struct netdev_stats *stats,
>                                       struct dp_packet **packets,
>                                       int attempted,
>                                       int dropped)
> @@ -1252,9 +1356,8 @@ netdev_dpdk_vhost_update_tx_counters(struct
> netdev_stats *stats,
>  }
>
>  static void
> -__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> -                         struct dp_packet **pkts, int cnt,
> -                         bool may_steal)
> +__netdev_dpdk_vhost_cuse_send(struct netdev *netdev, struct dp_packet
> **pkts,
> +                         int cnt, bool may_steal)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
> @@ -1263,26 +1366,24 @@ __netdev_dpdk_vhost_send(struct netdev *netdev,
> int qid,
>      unsigned int qos_pkts = cnt;
>      uint64_t start = 0;
>
> -    qid = dev->tx_q[qid % dev->real_n_txq].map;
> -
> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0)) {
> +    if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) {
>          rte_spinlock_lock(&dev->stats_lock);
>          dev->stats.tx_dropped+= cnt;
>          rte_spinlock_unlock(&dev->stats_lock);
>          goto out;
>      }
>
> -    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +    /* There is a single vhost-cuse TX queue, So we need to lock it for
> TX. */
> +    rte_spinlock_lock(&dev->vhost_cuse_tx_lock);
>
>      /* Check has QoS has been configured for the netdev */
>      cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt);
>      qos_pkts -= cnt;
>
>      do {
> -        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
>          unsigned int tx_pkts;
>
> -        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid,
> +        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, VIRTIO_RXQ,
>                                            cur_pkts, cnt);
>          if (OVS_LIKELY(tx_pkts)) {
>              /* Packets have been sent.*/
> @@ -1301,7 +1402,7 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int
> qid,
>               * Unable to enqueue packets to vhost interface.
>               * Check available entries before retrying.
>               */
> -            while (!rte_vring_available_entries(virtio_dev, vhost_qid)) {
> +            while (!rte_vring_available_entries(virtio_dev, VIRTIO_RXQ)) {
>                  if (OVS_UNLIKELY((rte_get_timer_cycles() - start) >
> timeout)) {
>                      expired = 1;
>                      break;
> @@ -1313,12 +1414,12 @@ __netdev_dpdk_vhost_send(struct netdev *netdev,
> int qid,
>              }
>          }
>      } while (cnt);
> -
> -    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> +    rte_spinlock_unlock(&dev->vhost_cuse_tx_lock);
>
>      rte_spinlock_lock(&dev->stats_lock);
>      cnt += qos_pkts;
> -    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts,
> cnt);
> +    netdev_dpdk_vhost_cuse_update_tx_counters(&dev->stats, pkts,
> total_pkts,
> +                                              cnt);
>      rte_spinlock_unlock(&dev->stats_lock);
>
>  out:
> @@ -1412,8 +1513,9 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid,
> struct dp_packet **pkts,
>          newcnt++;
>      }
>
> -    if (dev->type == DPDK_DEV_VHOST) {
> -        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **)
> mbufs, newcnt, true);
> +    if (dev->type == DPDK_DEV_VHOST_CUSE) {
> +        __netdev_dpdk_vhost_cuse_send(netdev, (struct dp_packet **) mbufs,
> +                                      newcnt, true);
>      } else {
>          unsigned int qos_pkts = newcnt;
>
> @@ -1437,8 +1539,8 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid,
> struct dp_packet **pkts,
>  }
>
>  static int
> -netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet
> **pkts,
> -                 int cnt, bool may_steal)
> +netdev_dpdk_vhost_cuse_send(struct netdev *netdev, int qid OVS_UNUSED,
> +                            struct dp_packet **pkts, int cnt, bool
> may_steal)
>  {
>      if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {
>          int i;
> @@ -1450,7 +1552,7 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int
> qid, struct dp_packet **pkts,
>              }
>          }
>      } else {
> -        __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);
> +        __netdev_dpdk_vhost_cuse_send(netdev, pkts, cnt, may_steal);
>      }
>      return 0;
>  }
> @@ -1461,11 +1563,6 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>  {
>      int i;
>
> -    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
> -        qid = qid % dev->real_n_txq;
> -        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> -    }
> -
>      if (OVS_UNLIKELY(!may_steal ||
>                       pkts[0]->source != DPBUF_DPDK)) {
>          struct netdev *netdev = &dev->up;
> @@ -1541,6 +1638,18 @@ netdev_dpdk_eth_send(struct netdev *netdev, int qid,
>  }
>
>  static int
> +netdev_dpdk_vhost_user_send(struct netdev *netdev, int qid,
> +        struct dp_packet **pkts, int cnt, bool may_steal)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    qid = qid % dev->real_n_txq;
> +
> +    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
> +    return 0;
> +}
> +
> +static int
>  netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr
> mac)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> @@ -1634,7 +1743,7 @@ static int
>  netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
>
>  static int
> -netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
> +netdev_dpdk_vhost_cuse_get_stats(const struct netdev *netdev,
>                              struct netdev_stats *stats)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> @@ -1796,14 +1905,14 @@ netdev_dpdk_get_carrier(const struct netdev
> *netdev, bool *carrier)
>  }
>
>  static int
> -netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
> +netdev_dpdk_vhost_cuse_get_carrier(const struct netdev *netdev, bool
> *carrier)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
>
>      ovs_mutex_lock(&dev->mutex);
>
> -    if (is_vhost_running(virtio_dev)) {
> +    if (is_vhost_cuse_running(virtio_dev)) {
>          *carrier = 1;
>      } else {
>          *carrier = 0;
> @@ -1853,7 +1962,7 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
>          return 0;
>      }
>
> -    if (dev->type == DPDK_DEV_ETH) {
> +    if (dev->type != DPDK_DEV_VHOST_CUSE) {
>          if (dev->flags & NETDEV_UP) {
>              err = rte_eth_dev_start(dev->port_id);
>              if (err)
> @@ -1998,75 +2107,7 @@ set_irq_status(struct virtio_net *virtio_dev)
>  }
>
>  /*
> - * Fixes mapping for vhost-user tx queues. Must be called after each
> - * enabling/disabling of queues and real_n_txq modifications.
> - */
> -static void
> -netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
> -    OVS_REQUIRES(dev->mutex)
> -{
> -    int *enabled_queues, n_enabled = 0;
> -    int i, k, total_txqs = dev->real_n_txq;
> -
> -    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof
> *enabled_queues);
> -
> -    for (i = 0; i < total_txqs; i++) {
> -        /* Enabled queues always mapped to themselves. */
> -        if (dev->tx_q[i].map == i) {
> -            enabled_queues[n_enabled++] = i;
> -        }
> -    }
> -
> -    if (n_enabled == 0 && total_txqs != 0) {
> -        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
> -        n_enabled = 1;
> -    }
> -
> -    k = 0;
> -    for (i = 0; i < total_txqs; i++) {
> -        if (dev->tx_q[i].map != i) {
> -            dev->tx_q[i].map = enabled_queues[k];
> -            k = (k + 1) % n_enabled;
> -        }
> -    }
> -
> -    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
> -    for (i = 0; i < total_txqs; i++) {
> -        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
> -    }
> -
> -    rte_free(enabled_queues);
> -}
> -
> -static int
> -netdev_dpdk_vhost_set_queues(struct netdev_dpdk *dev, struct virtio_net
> *virtio_dev)
> -    OVS_REQUIRES(dev->mutex)
> -{
> -    uint32_t qp_num;
> -
> -    qp_num = virtio_dev->virt_qp_nb;
> -    if (qp_num > dev->up.n_rxq) {
> -        VLOG_ERR("vHost Device '%s' %"PRIu64" can't be added - "
> -                 "too many queues %d > %d", virtio_dev->ifname,
> virtio_dev->device_fh,
> -                 qp_num, dev->up.n_rxq);
> -        return -1;
> -    }
> -
> -    dev->real_n_rxq = qp_num;
> -    dev->real_n_txq = qp_num;
> -    dev->txq_needs_locking = true;
> -    /* Enable TX queue 0 by default if it wasn't disabled. */
> -    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
> -        dev->tx_q[0].map = 0;
> -    }
> -
> -    netdev_dpdk_remap_txqs(dev);
> -
> -    return 0;
> -}
> -
> -/*
> - * A new virtio-net device is added to a vhost port.
> + * A new virtio-net device is added to a vhost cuse port.
>   */
>  static int
>  new_device(struct virtio_net *virtio_dev)
> @@ -2079,11 +2120,6 @@ new_device(struct virtio_net *virtio_dev)
>      LIST_FOR_EACH(dev, list_node, &dpdk_list) {
>          if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
>              ovs_mutex_lock(&dev->mutex);
> -            if (netdev_dpdk_vhost_set_queues(dev, virtio_dev)) {
> -                ovs_mutex_unlock(&dev->mutex);
> -                ovs_mutex_unlock(&dpdk_mutex);
> -                return -1;
> -            }
>              ovsrcu_set(&dev->virtio_dev, virtio_dev);
>              exists = true;
>              virtio_dev->flags |= VIRTIO_DEV_RUNNING;
> @@ -2107,23 +2143,11 @@ new_device(struct virtio_net *virtio_dev)
>      return 0;
>  }
>
> -/* Clears mapping for all available queues of vhost interface. */
> -static void
> -netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
> -    OVS_REQUIRES(dev->mutex)
> -{
> -    int i;
> -
> -    for (i = 0; i < dev->real_n_txq; i++) {
> -        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
> -    }
> -}
> -
>  /*
> - * Remove a virtio-net device from the specific vhost port.  Use
> dev->remove
> - * flag to stop any more packets from being sent or received to/from a VM
> and
> - * ensure all currently queued packets have been sent/received before
> removing
> - *  the device.
> + * Remove a virtio-net device from the specific vhost cuse port. Use
> + * dev->remove flag to stop any more packets from being sent or received
> + * to/from a VM and ensure all currently queued packets have been
> sent/received
> + * before removing the device.
>   */
>  static void
>  destroy_device(volatile struct virtio_net *virtio_dev)
> @@ -2138,7 +2162,6 @@ destroy_device(volatile struct virtio_net
> *virtio_dev)
>              ovs_mutex_lock(&dev->mutex);
>              virtio_dev->flags &= ~VIRTIO_DEV_RUNNING;
>              ovsrcu_set(&dev->virtio_dev, NULL);
> -            netdev_dpdk_txq_map_clear(dev);
>              exists = true;
>              ovs_mutex_unlock(&dev->mutex);
>              break;
> @@ -2166,49 +2189,6 @@ destroy_device(volatile struct virtio_net
> *virtio_dev)
>      }
>  }
>
> -static int
> -vring_state_changed(struct virtio_net *virtio_dev, uint16_t queue_id,
> -                    int enable)
> -{
> -    struct netdev_dpdk *dev;
> -    bool exists = false;
> -    int qid = queue_id / VIRTIO_QNUM;
> -
> -    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
> -        return 0;
> -    }
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> -        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
> -            ovs_mutex_lock(&dev->mutex);
> -            if (enable) {
> -                dev->tx_q[qid].map = qid;
> -            } else {
> -                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
> -            }
> -            netdev_dpdk_remap_txqs(dev);
> -            exists = true;
> -            ovs_mutex_unlock(&dev->mutex);
> -            break;
> -        }
> -    }
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    if (exists) {
> -        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"
> -                  PRIu64" changed to \'%s\'", queue_id, qid,
> -                  virtio_dev->ifname, virtio_dev->device_fh,
> -                  (enable == 1) ? "enabled" : "disabled");
> -    } else {
> -        VLOG_INFO("vHost Device '%s' %"PRIu64" not found",
> virtio_dev->ifname,
> -                  virtio_dev->device_fh);
> -        return -1;
> -    }
> -
> -    return 0;
> -}
> -
>  struct virtio_net *
>  netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
>  {
> @@ -2216,18 +2196,17 @@ netdev_dpdk_get_virtio(const struct netdev_dpdk
> *dev)
>  }
>
>  /*
> - * These callbacks allow virtio-net devices to be added to vhost ports
> when
> - * configuration has been fully complete.
> + * These callbacks allow virtio-net devices to be added to vhost cuse
> ports
> + * when configuration has been fully complete.
>   */
>  static const struct virtio_net_device_ops virtio_net_device_ops =
>  {
>      .new_device =  new_device,
>      .destroy_device = destroy_device,
> -    .vring_state_changed = vring_state_changed
>  };
>
>  static void *
> -start_vhost_loop(void *dummy OVS_UNUSED)
> +start_vhost_cuse_loop(void *dummy OVS_UNUSED)
>  {
>       pthread_detach(pthread_self());
>       /* Put the cuse thread into quiescent state. */
> @@ -2237,19 +2216,7 @@ start_vhost_loop(void *dummy OVS_UNUSED)
>  }
>
>  static int
> -dpdk_vhost_class_init(void)
> -{
> -    rte_vhost_driver_callback_register(&virtio_net_device_ops);
> -    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> -                            | 1ULL << VIRTIO_NET_F_HOST_TSO6
> -                            | 1ULL << VIRTIO_NET_F_CSUM);
> -
> -    ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
> -    return 0;
> -}
> -
> -static int
> -dpdk_vhost_cuse_class_init(void)
> +netdev_dpdk_vhost_cuse_class_init(void)
>  {
>      int err = -1;
>
> @@ -2265,14 +2232,12 @@ dpdk_vhost_cuse_class_init(void)
>          return -1;
>      }
>
> -    dpdk_vhost_class_init();
> -    return 0;
> -}
> +    rte_vhost_driver_callback_register(&virtio_net_device_ops);
> +    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> +                            | 1ULL << VIRTIO_NET_F_HOST_TSO6
> +                            | 1ULL << VIRTIO_NET_F_CSUM);
> +    ovs_thread_create("vhost_cuse_thread", start_vhost_cuse_loop, NULL);
>
> -static int
> -dpdk_vhost_user_class_init(void)
> -{
> -    dpdk_vhost_class_init();
>      return 0;
>  }
>
> @@ -2859,30 +2824,30 @@ static const struct netdev_class dpdk_ring_class =
>  static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =
>      NETDEV_DPDK_CLASS(
>          "dpdkvhostcuse",
> -        dpdk_vhost_cuse_class_init,
> +        netdev_dpdk_vhost_cuse_class_init,
>          netdev_dpdk_vhost_cuse_construct,
> -        netdev_dpdk_vhost_destruct,
> +        netdev_dpdk_vhost_cuse_destruct,
>          netdev_dpdk_vhost_cuse_set_multiq,
> -        netdev_dpdk_vhost_send,
> -        netdev_dpdk_vhost_get_carrier,
> -        netdev_dpdk_vhost_get_stats,
> +        netdev_dpdk_vhost_cuse_send,
> +        netdev_dpdk_vhost_cuse_get_carrier,
> +        netdev_dpdk_vhost_cuse_get_stats,
>          NULL,
>          NULL,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_vhost_cuse_rxq_recv);
>
>  static const struct netdev_class OVS_UNUSED dpdk_vhost_user_class =
>      NETDEV_DPDK_CLASS(
>          "dpdkvhostuser",
> -        dpdk_vhost_user_class_init,
> -        netdev_dpdk_vhost_user_construct,
> -        netdev_dpdk_vhost_destruct,
> -        netdev_dpdk_vhost_set_multiq,
> -        netdev_dpdk_vhost_send,
> -        netdev_dpdk_vhost_get_carrier,
> -        netdev_dpdk_vhost_get_stats,
> -        NULL,
>          NULL,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_vhost_user_construct,
> +        netdev_dpdk_destruct,
> +        netdev_dpdk_set_multiq,
> +        netdev_dpdk_vhost_user_send,
> +        netdev_dpdk_get_carrier,
> +        netdev_dpdk_get_stats,
> +        netdev_dpdk_get_features,
> +        netdev_dpdk_get_status,
> +        netdev_dpdk_vhost_user_rxq_recv);
>
>  void
>  netdev_dpdk_register(void)
> --
> 2.4.3
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
>
Ciara Loftus May 3, 2016, 11:28 a.m. UTC | #3
> 

> Hi Ciara,

> thanks for doing this.  I really think this has the potential to clean up the

> netdev-dpdk code.


Thanks for your feedback Daniele. Comments inline, v2 soon.

> The clang thread safety analyzer reports some warnings:

> 

>   CC       lib/netdev-dpdk.lo

> ../lib/netdev-dpdk.c:882:1: error: mutex 'dpdk_mutex' is not held on every

> path through here [-Werror,-Wthread-safety-analysis]

> }

> ^

> ../lib/netdev-dpdk.c:870:5: note: mutex acquired here

>     ovs_mutex_lock(&dpdk_mutex);

>     ^

> ../include/openvswitch/thread.h:60:9: note: expanded from macro

> 'ovs_mutex_lock'

>         ovs_mutex_lock_at(mutex, OVS_SOURCE_LOCATOR)

>         ^

> 1 error generated.

Thanks. Will be fixed in v2.

> I see that this patch removes the transmission locks for the txqs. I think those

> are still needed by physical NICs.

Correct - they managed to disappear with the churn in v1! My mistake.

> This patch seem to remove a lot of txq remapping functions (like

> netdev_dpdk_remap_txqs).  How does it handle the case of a disabled txq in

> the guest kernel?

There is a difference in the amount of information we can get about vrings in OVS now. With the PMD, we no longer have direct access to the virtio_net structure. We used to use virtio_net->virt_qp_nb to determine the number of vrings (enabled and disabled) in the guest kernel, and we could map disabled onto enabled accordingly. Now with the PMD, we only get vring information as their states change.
E.g. for a VM with 2 vrings enabled, we assume there are only 2 vrings, even though there may be many more that are disabled. We don't need to remap because we aren't aware of the disabled queues.

> I see that vhost-cuse is still handled separately. Is it possible to use the vhost

> pmd also for vhost-cuse? Otherwise we still basically have to handle

It's not possible to use the PMD for vhost-cuse, just vhost-user.

> differently three cases: NIC PMD, vhost user pmd, vhost cuse.  Maybe it's

> time to remove vhost-cuse (I understand this is a separate issue, though)?

I guess it's as good a time as any to discuss this. Would be interested to hear opinions from the community.

> I get an error when I try this:

> 

> ovs-vsctl add-port br0 p1 -- set Interface p1 type="dpdkvhostuser"

> ovs-vsctl del-port br0 p1

> ovs-vsctl add-port br0 p1 -- set Interface p1 type="dpdkvhostuser"

Will be fixed in the v2, too. I was resetting vhost_pmd_id in netdev_dpdk_init when I shouldn't have been. Removing that fixes problems with add/del combinations.

> 

> More comments inline

> Thanks!

> 

> 2016-04-21 5:20 GMT-07:00 Ciara Loftus <ciara.loftus@intel.com>:

> DPDK 16.04 introduces the vHost PMD which allows 'dpdkvhostuser' ports

> to be controlled by the librte_ether API, like physical 'dpdk' ports.

> The commit integrates this functionality into OVS, and refactors some

> of the existing vhost code such that it is vhost-cuse specific.

> Similarly, there is now some overlap between dpdk and vhost-user port

> code.

> 

> Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>

> ---

>  INSTALL.DPDK.md   |  12 ++

>  NEWS              |   2 +

>  lib/netdev-dpdk.c | 515 +++++++++++++++++++++++++------------------------

> -----

>  3 files changed, 254 insertions(+), 275 deletions(-)

>  mode change 100644 => 100755 lib/netdev-dpdk.c

> 

> diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md

> index 7f76df8..5006812 100644

> --- a/INSTALL.DPDK.md

> +++ b/INSTALL.DPDK.md

> @@ -945,6 +945,18 @@ Restrictions:

>      increased to the desired number of queues. Both DPDK and OVS must be

>      recompiled for this change to take effect.

> 

> +  DPDK 'eth' type ports:

> +  - dpdk, dpdkr and dpdkvhostuser ports are 'eth' type ports in the context

> of

> +    DPDK as they are all managed by the rte_ether API. This means that they

> +    adhere to the DPDK configuration option CONFIG_RTE_MAX_ETHPORTS

> which by

> +    default is set to 32. This means by default the combined total number of

> +    dpdk, dpdkr and dpdkvhostuser ports allowable in OVS with DPDK is 32.

> This

> +    value can be changed if desired by modifying the configuration file in

> +    DPDK, or by overriding the default value on the command line when

> building

> +    DPDK. eg.

> +

> +        `make install CONFIG_RTE_MAX_ETHPORTS=64`

> +

> 

> This seems a heavy limitation compared to the previous librte_vhost

> approach. Are there any plans to increase this in DPDK upstream?

Not that I'm aware of.

> 

>  Bug Reporting:

>  --------------

> 

> diff --git a/NEWS b/NEWS

> index ea7f3a1..4dc0201 100644

> --- a/NEWS

> +++ b/NEWS

> @@ -26,6 +26,8 @@ Post-v2.5.0

>         assignment.

>       * Type of log messages from PMD threads changed from INFO to DBG.

>       * QoS functionality with sample egress-policer implementation.

> +     * vHost PMD integration brings vhost-user ports under control of the

> +       rte_ether DPDK API.

>     - ovs-benchmark: This utility has been removed due to lack of use and

>       bitrot.

>     - ovs-appctl:

> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c

> old mode 100644

> new mode 100755

> index 208c5f5..4fccd63

> --- a/lib/netdev-dpdk.c

> +++ b/lib/netdev-dpdk.c

> @@ -56,6 +56,7 @@

>  #include "rte_mbuf.h"

>  #include "rte_meter.h"

>  #include "rte_virtio_net.h"

> +#include "rte_eth_vhost.h"

> 

>  VLOG_DEFINE_THIS_MODULE(dpdk);

>  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

> @@ -109,6 +110,8 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF /

> ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))

> 

>  static char *cuse_dev_name = NULL;    /* Character device cuse_dev_name.

> */

>  static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */

> +/* Array that tracks the used & unused vHost user driver IDs */

> +static unsigned int vhost_user_drv_ids[RTE_MAX_ETHPORTS];

> 

>  /*

>   * Maximum amount of time in micro seconds to try and enqueue to vhost.

> @@ -143,7 +146,8 @@ enum { DRAIN_TSC = 200000ULL };

> 

>  enum dpdk_dev_type {

>      DPDK_DEV_ETH = 0,

> -    DPDK_DEV_VHOST = 1,

> +    DPDK_DEV_VHOST_USER = 1,

> +    DPDK_DEV_VHOST_CUSE = 2,

>  };

> 

>  static int rte_eal_init_ret = ENODEV;

> @@ -275,8 +279,6 @@ struct dpdk_tx_queue {

>                                      * from concurrent access.  It is used only

>                                      * if the queue is shared among different

>                                      * pmd threads (see 'txq_needs_locking'). */

> -    int map;                       /* Mapping of configured vhost-user queues

> -                                    * to enabled by guest. */

>      uint64_t tsc;

>      struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];

>  };

> @@ -329,12 +331,22 @@ struct netdev_dpdk {

>      int real_n_rxq;

>      bool txq_needs_locking;

> 

> -    /* virtio-net structure for vhost device */

> +    /* Spinlock for vhost cuse transmission. Other DPDK devices use spinlocks

> +     * in dpdk_tx_queue */

> +    rte_spinlock_t vhost_cuse_tx_lock;

> +

> +    /* virtio-net structure for vhost cuse device */

>      OVSRCU_TYPE(struct virtio_net *) virtio_dev;

> 

>      /* Identifier used to distinguish vhost devices from each other */

>      char vhost_id[PATH_MAX];

> 

> +    /* ID of vhost user port given to the PMD driver */

> +    unsigned int vhost_pmd_id;

> +

> +    /* Number of virtqueue pairs reported by the guest */

> +    uint32_t reported_queues;

> +

>      /* In dpdk_list. */

>      struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);

> 

> @@ -352,13 +364,20 @@ struct netdev_rxq_dpdk {

>  static bool dpdk_thread_is_pmd(void);

> 

>  static int netdev_dpdk_construct(struct netdev *);

> +static int netdev_dpdk_vhost_user_construct(struct netdev *);

> 

>  struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);

> 

> +void vring_state_changed_callback(uint8_t port_id,

> +        enum rte_eth_event_type type OVS_UNUSED, void *param

> OVS_UNUSED);

> +void device_state_changed_callback(uint8_t port_id,

> +        enum rte_eth_event_type type OVS_UNUSED, void *param

> OVS_UNUSED);

> +

>  static bool

>  is_dpdk_class(const struct netdev_class *class)

>  {

> -    return class->construct == netdev_dpdk_construct;

> +    return ((class->construct == netdev_dpdk_construct) ||

> +            (class->construct == netdev_dpdk_vhost_user_construct));

>  }

> 

>  /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically

> @@ -581,7 +600,9 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk

> *dev, int n_rxq, int n_txq)

>          }

> 

>          dev->up.n_rxq = n_rxq;

> -        dev->real_n_txq = n_txq;

> +        if (dev->type == DPDK_DEV_ETH) {

> +            dev->real_n_txq = n_txq;

> +        }

> 

>          return 0;

>      }

> @@ -672,11 +693,72 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *dev,

> unsigned int n_txqs)

>              /* Queues are shared among CPUs. Always flush */

>              dev->tx_q[i].flush_tx = true;

>          }

> +    }

> +}

> 

> -        /* Initialize map for vhost devices. */

> -        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;

> -        rte_spinlock_init(&dev->tx_q[i].tx_lock);

> +void

> +vring_state_changed_callback(uint8_t port_id,

> +                             enum rte_eth_event_type type OVS_UNUSED,

> +                             void *param OVS_UNUSED)

> +{

> +    struct netdev_dpdk *dev;

> +    struct rte_eth_vhost_queue_event event;

> +    int err = 0;

> +

> +    err = rte_eth_vhost_get_queue_event(port_id, &event);

> +    if (err || (event.rx == 1)) {

> +        return;

>      }

> +

> +    ovs_mutex_lock(&dpdk_mutex);

> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {

> +        if (port_id == dev->port_id) {

> +            ovs_mutex_lock(&dev->mutex);

> +            if (event.enable) {

> +                dev->reported_queues++;

> +            } else {

> +                dev->reported_queues--;

> +            }

> +            dev->real_n_rxq = dev->reported_queues;

> +            dev->real_n_txq = dev->reported_queues;

> +            dev->txq_needs_locking = true;

> +            netdev_dpdk_alloc_txq(dev, dev->real_n_txq);

> +            ovs_mutex_unlock(&dev->mutex);

> +            break;

> +        }

> +    }

> +    ovs_mutex_unlock(&dpdk_mutex);

> +

> +    return;

> +}

> +

> +void

> +device_state_changed_callback(uint8_t port_id,

> +                              enum rte_eth_event_type type OVS_UNUSED,

> +                              void *param OVS_UNUSED)

> +{

> +    struct netdev_dpdk *dev;

> +

> +    ovs_mutex_lock(&dpdk_mutex);

> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {

> +        if (port_id == dev->port_id) {

> +            ovs_mutex_lock(&dev->mutex);

> +            check_link_status(dev);

> +            if (dev->link.link_status == ETH_LINK_UP) {

> +                /* new device */

> +                VLOG_INFO("vHost Device '%s' has been added", dev->vhost_id);

> +

> +            } else {

> +                /* destroy device */

> +                VLOG_INFO("vHost Device '%s' has been removed", dev-

> >vhost_id);

> +            }

> +            ovs_mutex_unlock(&dev->mutex);

> +            break;

> +        }

> +    }

> +    ovs_mutex_unlock(&dpdk_mutex);

> +

> +    return;

>  }

> 

>  static int

> @@ -688,6 +770,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned

> int port_no,

>      int sid;

>      int err = 0;

>      uint32_t buf_size;

> +    unsigned int n_queues = 0;

> 

>      ovs_mutex_init(&dev->mutex);

>      ovs_mutex_lock(&dev->mutex);

> @@ -697,7 +780,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned

> int port_no,

>      /* If the 'sid' is negative, it means that the kernel fails

>       * to obtain the pci numa info.  In that situation, always

>       * use 'SOCKET0'. */

> -    if (type == DPDK_DEV_ETH) {

> +    if (type != DPDK_DEV_VHOST_CUSE) {

>          sid = rte_eth_dev_socket_id(port_no);

>      } else {

>          sid = rte_lcore_to_socket_id(rte_get_master_lcore());

> @@ -709,6 +792,8 @@ netdev_dpdk_init(struct netdev *netdev, unsigned

> int port_no,

>      dev->flags = 0;

>      dev->mtu = ETHER_MTU;

>      dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);

> +    dev->vhost_pmd_id = 0;

> +    dev->reported_queues = 0;

> 

>      buf_size = dpdk_buf_size(dev->mtu);

>      dev->dpdk_mp = dpdk_mp_get(dev->socket_id,

> FRAME_LEN_TO_MTU(buf_size));

> @@ -726,14 +811,23 @@ netdev_dpdk_init(struct netdev *netdev, unsigned

> int port_no,

>      netdev->requested_n_rxq = NR_QUEUE;

>      dev->real_n_txq = NR_QUEUE;

> 

> -    if (type == DPDK_DEV_ETH) {

> -        netdev_dpdk_alloc_txq(dev, NR_QUEUE);

> +    n_queues = (type == DPDK_DEV_VHOST_CUSE) ?

> OVS_VHOST_MAX_QUEUE_NUM :

> +                                               NR_QUEUE;

> +    netdev_dpdk_alloc_txq(dev, n_queues);

> +    if (type != DPDK_DEV_VHOST_CUSE) {

>          err = dpdk_eth_dev_init(dev);

>          if (err) {

>              goto unlock;

>          }

> -    } else {

> -        netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);

> +    }

> +

> +    if (type == DPDK_DEV_VHOST_USER) {

> +        rte_eth_dev_callback_register(port_no,

> RTE_ETH_EVENT_QUEUE_STATE,

> +                                     (void*)vring_state_changed_callback,

> +                                      NULL);

> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,

> +                                     (void*)device_state_changed_callback,

> +                                      NULL);

>      }

> 

>      ovs_list_push_back(&dpdk_list, &dev->list_node);

> @@ -768,26 +862,37 @@ dpdk_dev_parse_name(const char dev_name[],

> const char prefix[],

>  }

> 

>  static int

> -vhost_construct_helper(struct netdev *netdev)

> OVS_REQUIRES(dpdk_mutex)

> +netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)

>  {

> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> +    int err;

> +

> +    ovs_mutex_lock(&dpdk_mutex);

> +    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));

>      if (rte_eal_init_ret) {

>          return rte_eal_init_ret;

>      }

> 

> -    return netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);

> +    rte_spinlock_init(&dev->vhost_cuse_tx_lock);

> +

> +    err = netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST_CUSE);

> +

> +    ovs_mutex_unlock(&dpdk_mutex);

> +    return err;

>  }

> 

>  static int

> -netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)

> +get_vhost_user_drv_id(void)

>  {

> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> -    int err;

> +    int i = 0;

> 

> -    ovs_mutex_lock(&dpdk_mutex);

> -    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));

> -    err = vhost_construct_helper(netdev);

> -    ovs_mutex_unlock(&dpdk_mutex);

> -    return err;

> +    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {

> +        if (vhost_user_drv_ids[i] == 0) {

> +            return i;

> +        }

> +    }

> +

> +    return -1;

>  }

> 

>  static int

> @@ -796,6 +901,9 @@ netdev_dpdk_vhost_user_construct(struct netdev

> *netdev)

>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

>      const char *name = netdev->name;

>      int err;

> +    uint8_t port_no = 0;

> +    char devargs[4096];

> +    int driver_id = 0;

> 

>      /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in

>       * the file system. '/' or '\' would traverse directories, so they're not

> @@ -807,22 +915,35 @@ netdev_dpdk_vhost_user_construct(struct netdev

> *netdev)

>          return EINVAL;

>      }

> 

> +    if (rte_eal_init_ret) {

> +        return rte_eal_init_ret;

> +    }

> +

>      ovs_mutex_lock(&dpdk_mutex);

>      /* Take the name of the vhost-user port and append it to the location

> where

>       * the socket is to be created, then register the socket.

>       */

>      snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s",

>               vhost_sock_dir, name);

> +    driver_id = get_vhost_user_drv_id();

> 

> I'm not sure I understand the logic behing this code.  Are we trying to create

> a unique name with driver_id? What are the requirements for the device

> name?

Yes. rte_eth_dev_attach() expects the driver_name (eth_vhost) along with a unique ID as its first argument. This is essentially the name of the port from a DPDK perspective. The ID part of the name could be a string, and we could use the name of the socket, but the max length of the DPDK port name = RTE_ETH_NAME_MAX_LEN = 32 - this would limit the length of the vhost port name to 23 (eth_vhost using the first 9 chars).
So instead we use an int, generated by get_vhost_user_drv_id(), to append onto "eth_vhost". We track those that are used and unused, and the max it can be = RTE_MAX_ETHPORTS as we will never have more vhost ports than that number.

> Also, since the OVS Interface name is part of the devargs string, it seems to

> me that we should protect agains commas (",")

Yes, good point. Will implement this in v2.

> 

> 

> +    if (driver_id == -1) {

> +        VLOG_ERR("Too many vhost-user devices registered with PMD");

> +        err = ENODEV;

> +    } else {

> +        snprintf(devargs, sizeof(devargs), "eth_vhost%u,iface=%s,queues=%i",

> +                 driver_id, dev->vhost_id, RTE_MAX_QUEUES_PER_PORT);

> +        err = rte_eth_dev_attach(devargs, &port_no);

> +    }

> 

> -    err = rte_vhost_driver_register(dev->vhost_id);

>      if (err) {

> -        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",

> -                 dev->vhost_id);

> +        VLOG_ERR("Failed to attach vhost-user device to DPDK");

>      } else {

>          fatal_signal_add_file_to_unlink(dev->vhost_id);

>          VLOG_INFO("Socket %s created for vhost-user port %s\n",

>                    dev->vhost_id, name);

> -        err = vhost_construct_helper(netdev);

> +        dev->vhost_pmd_id = driver_id;

> +        vhost_user_drv_ids[dev->vhost_pmd_id] = 1;

> +        err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_VHOST_USER);

>      }

> 

>      ovs_mutex_unlock(&dpdk_mutex);

> @@ -857,6 +978,12 @@ netdev_dpdk_destruct(struct netdev *netdev)

>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> 

>      ovs_mutex_lock(&dev->mutex);

> +

> +    if (dev->type == DPDK_DEV_VHOST_USER) {

> +        rte_eth_dev_detach(dev->port_id, dev->vhost_id);

> +        vhost_user_drv_ids[dev->vhost_pmd_id] = 0;

> +    }

> +

>      rte_eth_dev_stop(dev->port_id);

>      ovs_mutex_unlock(&dev->mutex);

> 

> @@ -868,7 +995,7 @@ netdev_dpdk_destruct(struct netdev *netdev)

>  }

> 

>  static void

> -netdev_dpdk_vhost_destruct(struct netdev *netdev)

> +netdev_dpdk_vhost_cuse_destruct(struct netdev *netdev)

>  {

>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> 

> @@ -876,15 +1003,8 @@ netdev_dpdk_vhost_destruct(struct netdev

> *netdev)

>      if (netdev_dpdk_get_virtio(dev) != NULL) {

>          VLOG_ERR("Removing port '%s' while vhost device still attached.",

>                   netdev->name);

> -        VLOG_ERR("To restore connectivity after re-adding of port, VM on

> socket"

> -                 " '%s' must be restarted.",

> -                 dev->vhost_id);

> -    }

> -

> -    if (rte_vhost_driver_unregister(dev->vhost_id)) {

> -        VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);

> -    } else {

> -        fatal_signal_remove_file_to_unlink(dev->vhost_id);

> +        VLOG_ERR("To restore connectivity after re-adding of port, VM with"

> +                 "port '%s' must be restarted.", dev->vhost_id);

>      }

> 

>      ovs_mutex_lock(&dpdk_mutex);

> @@ -1000,30 +1120,6 @@ netdev_dpdk_vhost_cuse_set_multiq(struct

> netdev *netdev, unsigned int n_txq,

>      netdev->n_txq = n_txq;

>      dev->real_n_txq = 1;

>      netdev->n_rxq = 1;

> -    dev->txq_needs_locking = dev->real_n_txq != netdev->n_txq;

> -

> -    ovs_mutex_unlock(&dev->mutex);

> -    ovs_mutex_unlock(&dpdk_mutex);

> -

> -    return err;

> -}

> -

> -static int

> -netdev_dpdk_vhost_set_multiq(struct netdev *netdev, unsigned int

> n_txq,

> -                             unsigned int n_rxq)

> -{

> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> -    int err = 0;

> -

> -    if (netdev->n_txq == n_txq && netdev->n_rxq == n_rxq) {

> -        return err;

> -    }

> -

> -    ovs_mutex_lock(&dpdk_mutex);

> -    ovs_mutex_lock(&dev->mutex);

> -

> -    netdev->n_txq = n_txq;

> -    netdev->n_rxq = n_rxq;

> 

>      ovs_mutex_unlock(&dev->mutex);

>      ovs_mutex_unlock(&dpdk_mutex);

> @@ -1118,13 +1214,13 @@ dpdk_queue_flush(struct netdev_dpdk *dev, int

> qid)

>  }

> 

>  static bool

> -is_vhost_running(struct virtio_net *virtio_dev)

> +is_vhost_cuse_running(struct virtio_net *virtio_dev)

>  {

>      return (virtio_dev != NULL && (virtio_dev->flags &

> VIRTIO_DEV_RUNNING));

>  }

> 

>  static inline void

> -netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,

> +netdev_dpdk_vhost_cuse_update_rx_counters(struct netdev_stats

> *stats,

>                                       struct dp_packet **packets, int count)

>  {

>      int i;

> @@ -1153,26 +1249,22 @@ netdev_dpdk_vhost_update_rx_counters(struct

> netdev_stats *stats,

>  }

> 

>  /*

> - * The receive path for the vhost port is the TX path out from guest.

> + * The receive path for the vhost cuse port is the TX path out from guest.

>   */

>  static int

> -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,

> +netdev_dpdk_vhost_cuse_rxq_recv(struct netdev_rxq *rxq,

>                             struct dp_packet **packets, int *c)

>  {

>      struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

>      struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

> -    int qid = rxq->queue_id;

> +    int qid = 1;

>      uint16_t nb_rx = 0;

> 

> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {

> +    if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) {

>          return EAGAIN;

>      }

> 

> -    if (rxq->queue_id >= dev->real_n_rxq) {

> -        return EOPNOTSUPP;

> -    }

> -

> -    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM +

> VIRTIO_TXQ,

> +    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid,

>                                      dev->dpdk_mp->mp,

>                                      (struct rte_mbuf **)packets,

>                                      NETDEV_MAX_BURST);

> @@ -1181,7 +1273,7 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq

> *rxq,

>      }

> 

>      rte_spinlock_lock(&dev->stats_lock);

> -    netdev_dpdk_vhost_update_rx_counters(&dev->stats, packets, nb_rx);

> +    netdev_dpdk_vhost_cuse_update_rx_counters(&dev->stats, packets,

> nb_rx);

>      rte_spinlock_unlock(&dev->stats_lock);

> 

>      *c = (int) nb_rx;

> @@ -1217,6 +1309,18 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq,

> struct dp_packet **packets,

>      return 0;

>  }

> 

> +static int

> +netdev_dpdk_vhost_user_rxq_recv(struct netdev_rxq *rxq,

> +                                struct dp_packet **packets, int *c)

> +{

> +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

> +

> +    if (rxq->queue_id >= dev->real_n_rxq) {

> +        return EOPNOTSUPP;

> +    }

> +

> +    return netdev_dpdk_rxq_recv(rxq, packets, c);

> +}

>  static inline int

>  netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf

> **pkts,

>                          int cnt)

> @@ -1235,7 +1339,7 @@ netdev_dpdk_qos_run__(struct netdev_dpdk

> *dev, struct rte_mbuf **pkts,

>  }

> 

>  static inline void

> -netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,

> +netdev_dpdk_vhost_cuse_update_tx_counters(struct netdev_stats

> *stats,

>                                       struct dp_packet **packets,

>                                       int attempted,

>                                       int dropped)

> @@ -1252,9 +1356,8 @@ netdev_dpdk_vhost_update_tx_counters(struct

> netdev_stats *stats,

>  }

> 

>  static void

> -__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,

> -                         struct dp_packet **pkts, int cnt,

> -                         bool may_steal)

> +__netdev_dpdk_vhost_cuse_send(struct netdev *netdev, struct

> dp_packet **pkts,

> +                         int cnt, bool may_steal)

>  {

>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

>      struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

> @@ -1263,26 +1366,24 @@ __netdev_dpdk_vhost_send(struct netdev

> *netdev, int qid,

>      unsigned int qos_pkts = cnt;

>      uint64_t start = 0;

> 

> -    qid = dev->tx_q[qid % dev->real_n_txq].map;

> -

> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0)) {

> +    if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) {

>          rte_spinlock_lock(&dev->stats_lock);

>          dev->stats.tx_dropped+= cnt;

>          rte_spinlock_unlock(&dev->stats_lock);

>          goto out;

>      }

> 

> -    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> +    /* There is a single vhost-cuse TX queue, So we need to lock it for TX. */

> +    rte_spinlock_lock(&dev->vhost_cuse_tx_lock);

> 

>      /* Check has QoS has been configured for the netdev */

>      cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt);

>      qos_pkts -= cnt;

> 

>      do {

> -        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;

>          unsigned int tx_pkts;

> 

> -        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid,

> +        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, VIRTIO_RXQ,

>                                            cur_pkts, cnt);

>          if (OVS_LIKELY(tx_pkts)) {

>              /* Packets have been sent.*/

> @@ -1301,7 +1402,7 @@ __netdev_dpdk_vhost_send(struct netdev

> *netdev, int qid,

>               * Unable to enqueue packets to vhost interface.

>               * Check available entries before retrying.

>               */

> -            while (!rte_vring_available_entries(virtio_dev, vhost_qid)) {

> +            while (!rte_vring_available_entries(virtio_dev, VIRTIO_RXQ)) {

>                  if (OVS_UNLIKELY((rte_get_timer_cycles() - start) > timeout)) {

>                      expired = 1;

>                      break;

> @@ -1313,12 +1414,12 @@ __netdev_dpdk_vhost_send(struct netdev

> *netdev, int qid,

>              }

>          }

>      } while (cnt);

> -

> -    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

> +    rte_spinlock_unlock(&dev->vhost_cuse_tx_lock);

> 

>      rte_spinlock_lock(&dev->stats_lock);

>      cnt += qos_pkts;

> -    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts,

> cnt);

> +    netdev_dpdk_vhost_cuse_update_tx_counters(&dev->stats, pkts,

> total_pkts,

> +                                              cnt);

>      rte_spinlock_unlock(&dev->stats_lock);

> 

>  out:

> @@ -1412,8 +1513,9 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid,

> struct dp_packet **pkts,

>          newcnt++;

>      }

> 

> -    if (dev->type == DPDK_DEV_VHOST) {

> -        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs,

> newcnt, true);

> +    if (dev->type == DPDK_DEV_VHOST_CUSE) {

> +        __netdev_dpdk_vhost_cuse_send(netdev, (struct dp_packet **)

> mbufs,

> +                                      newcnt, true);

>      } else {

>          unsigned int qos_pkts = newcnt;

> 

> @@ -1437,8 +1539,8 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid,

> struct dp_packet **pkts,

>  }

> 

>  static int

> -netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet

> **pkts,

> -                 int cnt, bool may_steal)

> +netdev_dpdk_vhost_cuse_send(struct netdev *netdev, int qid

> OVS_UNUSED,

> +                            struct dp_packet **pkts, int cnt, bool may_steal)

>  {

>      if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {

>          int i;

> @@ -1450,7 +1552,7 @@ netdev_dpdk_vhost_send(struct netdev *netdev,

> int qid, struct dp_packet **pkts,

>              }

>          }

>      } else {

> -        __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);

> +        __netdev_dpdk_vhost_cuse_send(netdev, pkts, cnt, may_steal);

>      }

>      return 0;

>  }

> @@ -1461,11 +1563,6 @@ netdev_dpdk_send__(struct netdev_dpdk *dev,

> int qid,

>  {

>      int i;

> 

> -    if (OVS_UNLIKELY(dev->txq_needs_locking)) {

> -        qid = qid % dev->real_n_txq;

> -        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> -    }

> -

>      if (OVS_UNLIKELY(!may_steal ||

>                       pkts[0]->source != DPBUF_DPDK)) {

>          struct netdev *netdev = &dev->up;

> @@ -1541,6 +1638,18 @@ netdev_dpdk_eth_send(struct netdev *netdev,

> int qid,

>  }

> 

>  static int

> +netdev_dpdk_vhost_user_send(struct netdev *netdev, int qid,

> +        struct dp_packet **pkts, int cnt, bool may_steal)

> +{

> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> +

> +    qid = qid % dev->real_n_txq;

> +

> +    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);

> +    return 0;

> +}

> +

> +static int

>  netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr

> mac)

>  {

>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> @@ -1634,7 +1743,7 @@ static int

>  netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);

> 

>  static int

> -netdev_dpdk_vhost_get_stats(const struct netdev *netdev,

> +netdev_dpdk_vhost_cuse_get_stats(const struct netdev *netdev,

>                              struct netdev_stats *stats)

>  {

>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> @@ -1796,14 +1905,14 @@ netdev_dpdk_get_carrier(const struct netdev

> *netdev, bool *carrier)

>  }

> 

>  static int

> -netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool

> *carrier)

> +netdev_dpdk_vhost_cuse_get_carrier(const struct netdev *netdev, bool

> *carrier)

>  {

>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

>      struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

> 

>      ovs_mutex_lock(&dev->mutex);

> 

> -    if (is_vhost_running(virtio_dev)) {

> +    if (is_vhost_cuse_running(virtio_dev)) {

>          *carrier = 1;

>      } else {

>          *carrier = 0;

> @@ -1853,7 +1962,7 @@ netdev_dpdk_update_flags__(struct netdev_dpdk

> *dev,

>          return 0;

>      }

> 

> -    if (dev->type == DPDK_DEV_ETH) {

> +    if (dev->type != DPDK_DEV_VHOST_CUSE) {

>          if (dev->flags & NETDEV_UP) {

>              err = rte_eth_dev_start(dev->port_id);

>              if (err)

> @@ -1998,75 +2107,7 @@ set_irq_status(struct virtio_net *virtio_dev)

>  }

> 

>  /*

> - * Fixes mapping for vhost-user tx queues. Must be called after each

> - * enabling/disabling of queues and real_n_txq modifications.

> - */

> -static void

> -netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)

> -    OVS_REQUIRES(dev->mutex)

> -{

> -    int *enabled_queues, n_enabled = 0;

> -    int i, k, total_txqs = dev->real_n_txq;

> -

> -    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof

> *enabled_queues);

> -

> -    for (i = 0; i < total_txqs; i++) {

> -        /* Enabled queues always mapped to themselves. */

> -        if (dev->tx_q[i].map == i) {

> -            enabled_queues[n_enabled++] = i;

> -        }

> -    }

> -

> -    if (n_enabled == 0 && total_txqs != 0) {

> -        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;

> -        n_enabled = 1;

> -    }

> -

> -    k = 0;

> -    for (i = 0; i < total_txqs; i++) {

> -        if (dev->tx_q[i].map != i) {

> -            dev->tx_q[i].map = enabled_queues[k];

> -            k = (k + 1) % n_enabled;

> -        }

> -    }

> -

> -    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);

> -    for (i = 0; i < total_txqs; i++) {

> -        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);

> -    }

> -

> -    rte_free(enabled_queues);

> -}

> -

> -static int

> -netdev_dpdk_vhost_set_queues(struct netdev_dpdk *dev, struct

> virtio_net *virtio_dev)

> -    OVS_REQUIRES(dev->mutex)

> -{

> -    uint32_t qp_num;

> -

> -    qp_num = virtio_dev->virt_qp_nb;

> -    if (qp_num > dev->up.n_rxq) {

> -        VLOG_ERR("vHost Device '%s' %"PRIu64" can't be added - "

> -                 "too many queues %d > %d", virtio_dev->ifname, virtio_dev-

> >device_fh,

> -                 qp_num, dev->up.n_rxq);

> -        return -1;

> -    }

> -

> -    dev->real_n_rxq = qp_num;

> -    dev->real_n_txq = qp_num;

> -    dev->txq_needs_locking = true;

> -    /* Enable TX queue 0 by default if it wasn't disabled. */

> -    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {

> -        dev->tx_q[0].map = 0;

> -    }

> -

> -    netdev_dpdk_remap_txqs(dev);

> -

> -    return 0;

> -}

> -

> -/*

> - * A new virtio-net device is added to a vhost port.

> + * A new virtio-net device is added to a vhost cuse port.

>   */

>  static int

>  new_device(struct virtio_net *virtio_dev)

> @@ -2079,11 +2120,6 @@ new_device(struct virtio_net *virtio_dev)

>      LIST_FOR_EACH(dev, list_node, &dpdk_list) {

>          if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {

>              ovs_mutex_lock(&dev->mutex);

> -            if (netdev_dpdk_vhost_set_queues(dev, virtio_dev)) {

> -                ovs_mutex_unlock(&dev->mutex);

> -                ovs_mutex_unlock(&dpdk_mutex);

> -                return -1;

> -            }

>              ovsrcu_set(&dev->virtio_dev, virtio_dev);

>              exists = true;

>              virtio_dev->flags |= VIRTIO_DEV_RUNNING;

> @@ -2107,23 +2143,11 @@ new_device(struct virtio_net *virtio_dev)

>      return 0;

>  }

> 

> -/* Clears mapping for all available queues of vhost interface. */

> -static void

> -netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)

> -    OVS_REQUIRES(dev->mutex)

> -{

> -    int i;

> -

> -    for (i = 0; i < dev->real_n_txq; i++) {

> -        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;

> -    }

> -}

> -

>  /*

> - * Remove a virtio-net device from the specific vhost port.  Use dev-

> >remove

> - * flag to stop any more packets from being sent or received to/from a VM

> and

> - * ensure all currently queued packets have been sent/received before

> removing

> - *  the device.

> + * Remove a virtio-net device from the specific vhost cuse port. Use

> + * dev->remove flag to stop any more packets from being sent or received

> + * to/from a VM and ensure all currently queued packets have been

> sent/received

> + * before removing the device.

>   */

>  static void

>  destroy_device(volatile struct virtio_net *virtio_dev)

> @@ -2138,7 +2162,6 @@ destroy_device(volatile struct virtio_net

> *virtio_dev)

>              ovs_mutex_lock(&dev->mutex);

>              virtio_dev->flags &= ~VIRTIO_DEV_RUNNING;

>              ovsrcu_set(&dev->virtio_dev, NULL);

> -            netdev_dpdk_txq_map_clear(dev);

>              exists = true;

>              ovs_mutex_unlock(&dev->mutex);

>              break;

> @@ -2166,49 +2189,6 @@ destroy_device(volatile struct virtio_net

> *virtio_dev)

>      }

>  }

> 

> -static int

> -vring_state_changed(struct virtio_net *virtio_dev, uint16_t queue_id,

> -                    int enable)

> -{

> -    struct netdev_dpdk *dev;

> -    bool exists = false;

> -    int qid = queue_id / VIRTIO_QNUM;

> -

> -    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {

> -        return 0;

> -    }

> -

> -    ovs_mutex_lock(&dpdk_mutex);

> -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {

> -        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {

> -            ovs_mutex_lock(&dev->mutex);

> -            if (enable) {

> -                dev->tx_q[qid].map = qid;

> -            } else {

> -                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;

> -            }

> -            netdev_dpdk_remap_txqs(dev);

> -            exists = true;

> -            ovs_mutex_unlock(&dev->mutex);

> -            break;

> -        }

> -    }

> -    ovs_mutex_unlock(&dpdk_mutex);

> -

> -    if (exists) {

> -        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"

> -                  PRIu64" changed to \'%s\'", queue_id, qid,

> -                  virtio_dev->ifname, virtio_dev->device_fh,

> -                  (enable == 1) ? "enabled" : "disabled");

> -    } else {

> -        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev-

> >ifname,

> -                  virtio_dev->device_fh);

> -        return -1;

> -    }

> -

> -    return 0;

> -}

> -

>  struct virtio_net *

>  netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)

>  {

> @@ -2216,18 +2196,17 @@ netdev_dpdk_get_virtio(const struct

> netdev_dpdk *dev)

>  }

> 

>  /*

> - * These callbacks allow virtio-net devices to be added to vhost ports when

> - * configuration has been fully complete.

> + * These callbacks allow virtio-net devices to be added to vhost cuse ports

> + * when configuration has been fully complete.

>   */

>  static const struct virtio_net_device_ops virtio_net_device_ops =

>  {

>      .new_device =  new_device,

>      .destroy_device = destroy_device,

> -    .vring_state_changed = vring_state_changed

>  };

> 

>  static void *

> -start_vhost_loop(void *dummy OVS_UNUSED)

> +start_vhost_cuse_loop(void *dummy OVS_UNUSED)

>  {

>       pthread_detach(pthread_self());

>       /* Put the cuse thread into quiescent state. */

> @@ -2237,19 +2216,7 @@ start_vhost_loop(void *dummy OVS_UNUSED)

>  }

> 

>  static int

> -dpdk_vhost_class_init(void)

> -{

> -    rte_vhost_driver_callback_register(&virtio_net_device_ops);

> -    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4

> -                            | 1ULL << VIRTIO_NET_F_HOST_TSO6

> -                            | 1ULL << VIRTIO_NET_F_CSUM);

> -

> -    ovs_thread_create("vhost_thread", start_vhost_loop, NULL);

> -    return 0;

> -}

> -

> -static int

> -dpdk_vhost_cuse_class_init(void)

> +netdev_dpdk_vhost_cuse_class_init(void)

>  {

>      int err = -1;

> 

> @@ -2265,14 +2232,12 @@ dpdk_vhost_cuse_class_init(void)

>          return -1;

>      }

> 

> -    dpdk_vhost_class_init();

> -    return 0;

> -}

> +    rte_vhost_driver_callback_register(&virtio_net_device_ops);

> +    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4

> +                            | 1ULL << VIRTIO_NET_F_HOST_TSO6

> +                            | 1ULL << VIRTIO_NET_F_CSUM);

> +    ovs_thread_create("vhost_cuse_thread", start_vhost_cuse_loop, NULL);

> 

> -static int

> -dpdk_vhost_user_class_init(void)

> -{

> -    dpdk_vhost_class_init();

>      return 0;

>  }

> 

> @@ -2859,30 +2824,30 @@ static const struct netdev_class dpdk_ring_class =

>  static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =

>      NETDEV_DPDK_CLASS(

>          "dpdkvhostcuse",

> -        dpdk_vhost_cuse_class_init,

> +        netdev_dpdk_vhost_cuse_class_init,

>          netdev_dpdk_vhost_cuse_construct,

> -        netdev_dpdk_vhost_destruct,

> +        netdev_dpdk_vhost_cuse_destruct,

>          netdev_dpdk_vhost_cuse_set_multiq,

> -        netdev_dpdk_vhost_send,

> -        netdev_dpdk_vhost_get_carrier,

> -        netdev_dpdk_vhost_get_stats,

> +        netdev_dpdk_vhost_cuse_send,

> +        netdev_dpdk_vhost_cuse_get_carrier,

> +        netdev_dpdk_vhost_cuse_get_stats,

>          NULL,

>          NULL,

> -        netdev_dpdk_vhost_rxq_recv);

> +        netdev_dpdk_vhost_cuse_rxq_recv);

>  static const struct netdev_class OVS_UNUSED dpdk_vhost_user_class =

>      NETDEV_DPDK_CLASS(

>          "dpdkvhostuser",

> -        dpdk_vhost_user_class_init,

> -        netdev_dpdk_vhost_user_construct,

> -        netdev_dpdk_vhost_destruct,

> -        netdev_dpdk_vhost_set_multiq,

> -        netdev_dpdk_vhost_send,

> -        netdev_dpdk_vhost_get_carrier,

> -        netdev_dpdk_vhost_get_stats,

> -        NULL,

>          NULL,

> -        netdev_dpdk_vhost_rxq_recv);

> +        netdev_dpdk_vhost_user_construct,

> +        netdev_dpdk_destruct,

> +        netdev_dpdk_set_multiq,

> +        netdev_dpdk_vhost_user_send,

> +        netdev_dpdk_get_carrier,

> +        netdev_dpdk_get_stats,

> +        netdev_dpdk_get_features,

> +        netdev_dpdk_get_status,

> +        netdev_dpdk_vhost_user_rxq_recv);

> 

>  void

>  netdev_dpdk_register(void)

> --

> 2.4.3

> 

> _______________________________________________

> dev mailing list

> dev@openvswitch.org

> http://openvswitch.org/mailman/listinfo/dev
Ilya Maximets May 10, 2016, 9:05 a.m. UTC | #4
CC:mail-list.
Sorry.

On 10.05.2016 11:31, Ilya Maximets wrote:
> On 03.05.2016 14:28, ciara.loftus at intel.com (Loftus, Ciara) wrote:
>>> This patch seem to remove a lot of txq remapping functions (like
>>> netdev_dpdk_remap_txqs).  How does it handle the case of a disabled txq in
>>> the guest kernel?
>> There is a difference in the amount of information we can get about vrings
>> in OVS now. With the PMD, we no longer have direct access to the virtio_net
>> structure. We used to use virto_net->virt_qp_nb to determine the number of
>> vrings (enabled and disabled) in the guest kernel, and we could map disabled
>> onto enabled accordingly. Now with the PMD, we only get vring information as
>> their state changes. eg. VM with 2 vrings enabled -> we assume there are only
>> 2 vrings, even though there may be many more that are disabled. We don't need
>> to map because we aren't aware of the disabled queues.
> 
> virtio protocol still allows to disable random queue in guest. This patch will
> work only with linux kernel virtio driver on guest side and just because linux
> kernel driver always enables/disables queues sequentially. For example, you may
> write your own application with virtio-user with 2 rx/tx queues in guest and
> disable rx queue #0. This scenario will lead to broken connection while
> queue #1 still enabled.
> 
> Best regards, Ilya Maximets.
>
Ciara Loftus May 10, 2016, 9:22 a.m. UTC | #5
> On 21/04/2016 13:20, Ciara Loftus wrote:

> > DPDK 16.04 introduces the vHost PMD which allows 'dpdkvhostuser' ports

> > to be controlled by the librte_ether API, like physical 'dpdk' ports.

> > The commit integrates this functionality into OVS, and refactors some

> > of the existing vhost code such that it is vhost-cuse specific.

> > Similarly, there is now some overlap between dpdk and vhost-user port

> > code.

> >

> > Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>

> > ---

> >   INSTALL.DPDK.md   |  12 ++

> >   NEWS              |   2 +

> >   lib/netdev-dpdk.c | 515 +++++++++++++++++++++++++---------------------

> --------

> 

> Hi Ciara, there's a lot of churn in this file. It might be worth

> considering to see if it could be split through a few commits commits to

> help reviewers. e.g. new features like adding get_features, get_status

> for vhost could be a separate patch at least.


I've split into 3:
- remove watchdog
- add pmd
- add get_stats & get_features

Couldn't quite find a way to split it up more.

> 

> >   3 files changed, 254 insertions(+), 275 deletions(-)

> >   mode change 100644 => 100755 lib/netdev-dpdk.c

> 

> file permission change.


Woops. Fixed in v2.

> 

> >

> > diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md

> > index 7f76df8..5006812 100644

> > --- a/INSTALL.DPDK.md

> > +++ b/INSTALL.DPDK.md

> > @@ -945,6 +945,18 @@ Restrictions:

> >       increased to the desired number of queues. Both DPDK and OVS must

> be

> >       recompiled for this change to take effect.

> >

> > +  DPDK 'eth' type ports:

> > +  - dpdk, dpdkr and dpdkvhostuser ports are 'eth' type ports in the context

> of

> > +    DPDK as they are all managed by the rte_ether API. This means that

> they

> > +    adhere to the DPDK configuration option CONFIG_RTE_MAX_ETHPORTS

> which by

> > +    default is set to 32. This means by default the combined total number of

> > +    dpdk, dpdkr and dpdkvhostuser ports allowable in OVS with DPDK is 32.

> This

> > +    value can be changed if desired by modifying the configuration file in

> > +    DPDK, or by overriding the default value on the command line when

> building

> > +    DPDK. eg.

> > +

> > +        `make install CONFIG_RTE_MAX_ETHPORTS=64`

> 

> format is not registering right for this in my md viewer.


It's looking ok on mine. What doesn't look right?

> 

> > +

> >   Bug Reporting:

> >   --------------

> >

> > diff --git a/NEWS b/NEWS

> > index ea7f3a1..4dc0201 100644

> > --- a/NEWS

> > +++ b/NEWS

> > @@ -26,6 +26,8 @@ Post-v2.5.0

> >          assignment.

> >        * Type of log messages from PMD threads changed from INFO to DBG.

> >        * QoS functionality with sample egress-policer implementation.

> > +     * vHost PMD integration brings vhost-user ports under control of the

> > +       rte_ether DPDK API.

> >      - ovs-benchmark: This utility has been removed due to lack of use and

> >        bitrot.

> >      - ovs-appctl:

> > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c

> > old mode 100644

> > new mode 100755

> > index 208c5f5..4fccd63

> > --- a/lib/netdev-dpdk.c

> > +++ b/lib/netdev-dpdk.c

> > @@ -56,6 +56,7 @@

> >   #include "rte_mbuf.h"

> >   #include "rte_meter.h"

> >   #include "rte_virtio_net.h"

> > +#include "rte_eth_vhost.h"

> 

> nit: generally these go in alphabetical order.


Ok

> 

> >

> >   VLOG_DEFINE_THIS_MODULE(dpdk);

> >   static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

> > @@ -109,6 +110,8 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF /

> ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))

> >

> >   static char *cuse_dev_name = NULL;    /* Character device

> cuse_dev_name. */

> >   static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */

> > +/* Array that tracks the used & unused vHost user driver IDs */

> > +static unsigned int vhost_user_drv_ids[RTE_MAX_ETHPORTS];

> >

> >   /*

> >    * Maximum amount of time in micro seconds to try and enqueue to

> vhost.

> > @@ -143,7 +146,8 @@ enum { DRAIN_TSC = 200000ULL };

> >

> >   enum dpdk_dev_type {

> >       DPDK_DEV_ETH = 0,

> > -    DPDK_DEV_VHOST = 1,

> > +    DPDK_DEV_VHOST_USER = 1,

> > +    DPDK_DEV_VHOST_CUSE = 2,

> >   };

> >

> >   static int rte_eal_init_ret = ENODEV;

> > @@ -275,8 +279,6 @@ struct dpdk_tx_queue {

> >                                       * from concurrent access.  It is used only

> >                                       * if the queue is shared among different

> >                                       * pmd threads (see 'txq_needs_locking'). */

> > -    int map;                       /* Mapping of configured vhost-user queues

> > -                                    * to enabled by guest. */

> >       uint64_t tsc;

> >       struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];

> >   };

> > @@ -329,12 +331,22 @@ struct netdev_dpdk {

> >       int real_n_rxq;

> >       bool txq_needs_locking;

> >

> > -    /* virtio-net structure for vhost device */

> > +    /* Spinlock for vhost cuse transmission. Other DPDK devices use

> spinlocks

> > +     * in dpdk_tx_queue */

> > +    rte_spinlock_t vhost_cuse_tx_lock;

> 

> Why can't we continue to use the lock in dpdk_tx_queue? rather than

> adding a cuse specific lock.


This logic is taken from netdev-dpdk pre-multiqueue. I assumed it was the safest way to go since it's tried and tested.

> 

> > +

> > +    /* virtio-net structure for vhost cuse device */

> >       OVSRCU_TYPE(struct virtio_net *) virtio_dev;

> >

> >       /* Identifier used to distinguish vhost devices from each other */

> >       char vhost_id[PATH_MAX];

> >

> > +    /* ID of vhost user port given to the PMD driver */

> > +    unsigned int vhost_pmd_id;

> > +

> > +    /* Number of virtqueue pairs reported by the guest */

> > +    uint32_t reported_queues;

> 

> Is this useful? we could just use the real_n_*xq's directly.

I had a look at doing that, but it has some undesired knock-on effects. We need to initialise the variable that tracks virtqueue numbers to zero, but the above are initialised to 1 and used by the phy ports. I think it's safer to maintain a separate variable. I've renamed this vhost_qp_nb to make it a bit clearer.

> 

> > +

> >       /* In dpdk_list. */

> >       struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);

> >

> > @@ -352,13 +364,20 @@ struct netdev_rxq_dpdk {

> >   static bool dpdk_thread_is_pmd(void);

> >

> >   static int netdev_dpdk_construct(struct netdev *);

> > +static int netdev_dpdk_vhost_user_construct(struct netdev *);

> >

> >   struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk

> *dev);

> >

> > +void vring_state_changed_callback(uint8_t port_id,

> > +        enum rte_eth_event_type type OVS_UNUSED, void *param

> OVS_UNUSED);

> > +void device_state_changed_callback(uint8_t port_id,

> > +        enum rte_eth_event_type type OVS_UNUSED, void *param

> OVS_UNUSED);

> > +

> >   static bool

> >   is_dpdk_class(const struct netdev_class *class)

> >   {

> > -    return class->construct == netdev_dpdk_construct;

> > +    return ((class->construct == netdev_dpdk_construct) ||

> > +            (class->construct == netdev_dpdk_vhost_user_construct));

> 

> You should probably change the name of the function to something more

> intuitive now that it's not just for a "dpdk" netdev class.


Done

> 

> >   }

> >

> >   /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically

> > @@ -581,7 +600,9 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk

> *dev, int n_rxq, int n_txq)

> >           }

> >

> >           dev->up.n_rxq = n_rxq;

> > -        dev->real_n_txq = n_txq;

> > +        if (dev->type == DPDK_DEV_ETH) {

> > +            dev->real_n_txq = n_txq;

> > +        }

> 

> This needs a comment to explain why you've added this check.


Done

> 

> >

> >           return 0;

> >       }

> > @@ -672,11 +693,72 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk

> *dev, unsigned int n_txqs)

> >               /* Queues are shared among CPUs. Always flush */

> >               dev->tx_q[i].flush_tx = true;

> >           }

> > +    }

> > +}

> >

> > -        /* Initialize map for vhost devices. */

> > -        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;

> > -        rte_spinlock_init(&dev->tx_q[i].tx_lock);

> > +void

> > +vring_state_changed_callback(uint8_t port_id,

> > +                             enum rte_eth_event_type type OVS_UNUSED,

> > +                             void *param OVS_UNUSED)

> > +{

> > +    struct netdev_dpdk *dev;

> > +    struct rte_eth_vhost_queue_event event;

> > +    int err = 0;

> > +

> > +    err = rte_eth_vhost_get_queue_event(port_id, &event);

> > +    if (err || (event.rx == 1)) {

> > +        return;

> >       }

> > +

> > +    ovs_mutex_lock(&dpdk_mutex);

> > +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {

> > +        if (port_id == dev->port_id) {

> > +            ovs_mutex_lock(&dev->mutex);

> > +            if (event.enable) {

> > +                dev->reported_queues++;

> > +            } else {

> > +                dev->reported_queues--;

> > +            }

> > +            dev->real_n_rxq = dev->reported_queues;

> > +            dev->real_n_txq = dev->reported_queues;

> > +            dev->txq_needs_locking = true;

> 

> How do you know locking is needed at this point? Oh I just realised this

> is in the existing code. Seems like we should put in a check against

> netdev->n_txq like elsewhere in the code.


Done

> 

> > +            netdev_dpdk_alloc_txq(dev, dev->real_n_txq);

> > +            ovs_mutex_unlock(&dev->mutex);

> > +            break;

> > +        }

> > +    }

> > +    ovs_mutex_unlock(&dpdk_mutex);

> > +

> > +    return;

> > +}

> > +

> > +void

> > +device_state_changed_callback(uint8_t port_id,

> > +                              enum rte_eth_event_type type OVS_UNUSED,

> > +                              void *param OVS_UNUSED)

> > +{

> > +    struct netdev_dpdk *dev;

> > +

> > +    ovs_mutex_lock(&dpdk_mutex);

> > +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {

> > +        if (port_id == dev->port_id) {

> > +            ovs_mutex_lock(&dev->mutex);

> > +            check_link_status(dev);

> > +            if (dev->link.link_status == ETH_LINK_UP) {

> > +                /* new device */

> > +                VLOG_INFO("vHost Device '%s' has been added", dev->vhost_id);

> > +

> > +            } else {

> > +                /* destroy device */

> > +                VLOG_INFO("vHost Device '%s' has been removed", dev-

> >vhost_id);

> > +            }

> > +            ovs_mutex_unlock(&dev->mutex);

> > +            break;

> > +        }

> > +    }

> > +    ovs_mutex_unlock(&dpdk_mutex);

> > +

> > +    return;

> >   }

> >

> >   static int

> > @@ -688,6 +770,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned

> int port_no,

> >       int sid;

> >       int err = 0;

> >       uint32_t buf_size;

> > +    unsigned int n_queues = 0;

> >

> >       ovs_mutex_init(&dev->mutex);

> >       ovs_mutex_lock(&dev->mutex);

> > @@ -697,7 +780,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned

> int port_no,

> >       /* If the 'sid' is negative, it means that the kernel fails

> >        * to obtain the pci numa info.  In that situation, always

> >        * use 'SOCKET0'. */

> > -    if (type == DPDK_DEV_ETH) {

> > +    if (type != DPDK_DEV_VHOST_CUSE) {

> >           sid = rte_eth_dev_socket_id(port_no);

> 

> Are you trying to sneak in your NUMA changes for vhostuser ;-)


:-) Stay tuned!

> 

> >       } else {

> >           sid = rte_lcore_to_socket_id(rte_get_master_lcore());

> > @@ -709,6 +792,8 @@ netdev_dpdk_init(struct netdev *netdev, unsigned

> int port_no,

> >       dev->flags = 0;

> >       dev->mtu = ETHER_MTU;

> >       dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);

> > +    dev->vhost_pmd_id = 0;

> 

> 0 is a valid vhost_pmd_id number. Seen as it's used in the destruct it

> might be better to set to an invalid value like MAX and check for that

> in destruct, otherwise you're relying on higher layer to ensure that

> destruct is never called before construct. edit: actually i've another

> comment wrt vhost_pmd_id below.

> 

> > +    dev->reported_queues = 0;

> >

> >       buf_size = dpdk_buf_size(dev->mtu);

> >       dev->dpdk_mp = dpdk_mp_get(dev->socket_id,

> FRAME_LEN_TO_MTU(buf_size));

> > @@ -726,14 +811,23 @@ netdev_dpdk_init(struct netdev *netdev,

> unsigned int port_no,

> >       netdev->requested_n_rxq = NR_QUEUE;

> >       dev->real_n_txq = NR_QUEUE;

> >

> > -    if (type == DPDK_DEV_ETH) {

> > -        netdev_dpdk_alloc_txq(dev, NR_QUEUE);

> > +    n_queues = (type == DPDK_DEV_VHOST_CUSE) ?

> OVS_VHOST_MAX_QUEUE_NUM :

> > +                                               NR_QUEUE;

> 

> No need for special cuse case above.

Ok

> 

> > +    netdev_dpdk_alloc_txq(dev, n_queues);

> > +    if (type != DPDK_DEV_VHOST_CUSE) {

> >           err = dpdk_eth_dev_init(dev);

> >           if (err) {

> >               goto unlock;

> >           }

> > -    } else {

> > -        netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);

> > +    }

> > +

> > +    if (type == DPDK_DEV_VHOST_USER) {

> > +        rte_eth_dev_callback_register(port_no,

> RTE_ETH_EVENT_QUEUE_STATE,

> > +                                     (void*)vring_state_changed_callback,

> > +                                      NULL);

> > +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,

> > +                                     (void*)device_state_changed_callback,

> 

> Seems like this using this callback is a different way to do the same

> thing as the watchdog timer - could we consolidate them and simplify?


Good suggestion. This is part one of the v2 patchset.

> 

> > +                                      NULL);

> >       }

> >

> >       ovs_list_push_back(&dpdk_list, &dev->list_node);

> > @@ -768,26 +862,37 @@ dpdk_dev_parse_name(const char dev_name[],

> const char prefix[],

> >   }

> >

> >   static int

> > -vhost_construct_helper(struct netdev *netdev)

> OVS_REQUIRES(dpdk_mutex)

> > +netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)

> >   {

> > +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> > +    int err;

> > +

> > +    ovs_mutex_lock(&dpdk_mutex);

> > +    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));

> >       if (rte_eal_init_ret) {

> >           return rte_eal_init_ret;

> >       }

> >

> > -    return netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);

> > +    rte_spinlock_init(&dev->vhost_cuse_tx_lock);

> > +

> > +    err = netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST_CUSE);

> > +

> > +    ovs_mutex_unlock(&dpdk_mutex);

> > +    return err;

> >   }

> >

> >   static int

> > -netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)

> > +get_vhost_user_drv_id(void)

> >   {

> > -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> > -    int err;

> > +    int i = 0;

> >

> > -    ovs_mutex_lock(&dpdk_mutex);

> > -    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));

> > -    err = vhost_construct_helper(netdev);

> > -    ovs_mutex_unlock(&dpdk_mutex);

> > -    return err;

> > +    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {

> > +        if (vhost_user_drv_ids[i] == 0) {

> 

> This is treating RTE_MAX_ETHPORTS as the max vhost ports. I didn't check

> but it sounds like it's for all DPDK ports and not just vhost. If true

> it means the code would allow >RTE_MAX_ETHPORTS.

It's for all DPDK ports under control of the ether API yes.
The theoretical max vhost ports == RTE_MAX_ETHPORTS though.
If we try to register too many vhosts we will fail anyway on rte_eth_dev_attach.

> 

> > +            return i;

> > +        }

> > +    }

> > +

> > +    return -1;

> >   }

> >

> >   static int

> > @@ -796,6 +901,9 @@ netdev_dpdk_vhost_user_construct(struct netdev

> *netdev)

> >       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> >       const char *name = netdev->name;

> >       int err;

> > +    uint8_t port_no = 0;

> > +    char devargs[4096];

> 

> magic number - any define we can use?


It's not pretty but I tried to tidy it into a #define:
#define DEVARGS_MAX (RTE_ETH_NAME_MAX_LEN + PATH_MAX + 18)

> 

> > +    int driver_id = 0;

> >

> >       /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in

> >        * the file system. '/' or '\' would traverse directories, so they're not

> > @@ -807,22 +915,35 @@ netdev_dpdk_vhost_user_construct(struct

> netdev *netdev)

> >           return EINVAL;

> >       }

> >

> > +    if (rte_eal_init_ret) {

> > +        return rte_eal_init_ret;

> > +    }

> > +

> 

> it's more consistent to check this before the name.

Ok

> 

> >       ovs_mutex_lock(&dpdk_mutex);

> >       /* Take the name of the vhost-user port and append it to the location

> where

> >        * the socket is to be created, then register the socket.

> >        */

> >       snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s",

> >                vhost_sock_dir, name);

> > +    driver_id = get_vhost_user_drv_id();

> > +    if (driver_id == -1) {

> > +        VLOG_ERR("Too many vhost-user devices registered with PMD");

> > +        err = ENODEV;

> > +    } else {

> > +        snprintf(devargs, sizeof(devargs),

> "eth_vhost%u,iface=%s,queues=%i",

> > +                 driver_id, dev->vhost_id, RTE_MAX_QUEUES_PER_PORT);

> > +        err = rte_eth_dev_attach(devargs, &port_no);

> > +    }

> >

> > -    err = rte_vhost_driver_register(dev->vhost_id);

> >       if (err) {

> > -        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",

> > -                 dev->vhost_id);

> > +        VLOG_ERR("Failed to attach vhost-user device to DPDK");

> 

> It might be a bit confusing getting this error message in some cases

> where you have tried to attach and failed and in other cases where it

> has not been tried because of too many vhost-user devices. Also the

> socket name would be handy.


I've tidied this up in the v2.

> 

> >       } else {

> >           fatal_signal_add_file_to_unlink(dev->vhost_id);

> >           VLOG_INFO("Socket %s created for vhost-user port %s\n",

> >                     dev->vhost_id, name);

> > -        err = vhost_construct_helper(netdev);

> > +        dev->vhost_pmd_id = driver_id;

> 

> You're setting this but then it gets reset when you call

> netdev_dpdk_init(). So when you destruct you'll set the reset the wrong

> index in vhost_user_drv_ids[] ?


You're right. The re-init in netdev_dpdk_init is removed in v2.

> 

> > +        vhost_user_drv_ids[dev->vhost_pmd_id] = 1;

> > +        err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_VHOST_USER);

> >       }

> >

> >       ovs_mutex_unlock(&dpdk_mutex);

> > @@ -857,6 +978,12 @@ netdev_dpdk_destruct(struct netdev *netdev)

> >       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> >

> >       ovs_mutex_lock(&dev->mutex);

> > +

> > +    if (dev->type == DPDK_DEV_VHOST_USER) {

> > +        rte_eth_dev_detach(dev->port_id, dev->vhost_id);

> > +        vhost_user_drv_ids[dev->vhost_pmd_id] = 0;

> > +    }

> > +

> >       rte_eth_dev_stop(dev->port_id);

> >       ovs_mutex_unlock(&dev->mutex);

> >

> > @@ -868,7 +995,7 @@ netdev_dpdk_destruct(struct netdev *netdev)

> >   }

> >

> >   static void

> > -netdev_dpdk_vhost_destruct(struct netdev *netdev)

> > +netdev_dpdk_vhost_cuse_destruct(struct netdev *netdev)

> >   {

> >       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> >

> > @@ -876,15 +1003,8 @@ netdev_dpdk_vhost_destruct(struct netdev

> *netdev)

> >       if (netdev_dpdk_get_virtio(dev) != NULL) {

> >           VLOG_ERR("Removing port '%s' while vhost device still attached.",

> >                    netdev->name);

> > -        VLOG_ERR("To restore connectivity after re-adding of port, VM on

> socket"

> > -                 " '%s' must be restarted.",

> > -                 dev->vhost_id);

> > -    }

> > -

> > -    if (rte_vhost_driver_unregister(dev->vhost_id)) {

> > -        VLOG_ERR("Unable to remove vhost-user socket %s", dev-

> >vhost_id);

> > -    } else {

> > -        fatal_signal_remove_file_to_unlink(dev->vhost_id);

> > +        VLOG_ERR("To restore connectivity after re-adding of port, VM with"

> > +                 "port '%s' must be restarted.", dev->vhost_id);

> >       }

> >

> >       ovs_mutex_lock(&dpdk_mutex);

> > @@ -1000,30 +1120,6 @@ netdev_dpdk_vhost_cuse_set_multiq(struct

> netdev *netdev, unsigned int n_txq,

> >       netdev->n_txq = n_txq;

> >       dev->real_n_txq = 1;

> >       netdev->n_rxq = 1;

> > -    dev->txq_needs_locking = dev->real_n_txq != netdev->n_txq;

> > -

> > -    ovs_mutex_unlock(&dev->mutex);

> > -    ovs_mutex_unlock(&dpdk_mutex);

> > -

> > -    return err;

> > -}

> > -

> > -static int

> > -netdev_dpdk_vhost_set_multiq(struct netdev *netdev, unsigned int

> n_txq,

> > -                             unsigned int n_rxq)

> > -{

> > -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> > -    int err = 0;

> > -

> > -    if (netdev->n_txq == n_txq && netdev->n_rxq == n_rxq) {

> > -        return err;

> > -    }

> > -

> > -    ovs_mutex_lock(&dpdk_mutex);

> > -    ovs_mutex_lock(&dev->mutex);

> > -

> > -    netdev->n_txq = n_txq;

> > -    netdev->n_rxq = n_rxq;

> >

> >       ovs_mutex_unlock(&dev->mutex);

> >       ovs_mutex_unlock(&dpdk_mutex);

> > @@ -1118,13 +1214,13 @@ dpdk_queue_flush(struct netdev_dpdk *dev,

> int qid)

> >   }

> >

> >   static bool

> > -is_vhost_running(struct virtio_net *virtio_dev)

> > +is_vhost_cuse_running(struct virtio_net *virtio_dev)

> >   {

> >       return (virtio_dev != NULL && (virtio_dev->flags &

> VIRTIO_DEV_RUNNING));

> >   }

> >

> >   static inline void

> > -netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,

> > +netdev_dpdk_vhost_cuse_update_rx_counters(struct netdev_stats

> *stats,

> >                                        struct dp_packet **packets, int count)

> >   {

> >       int i;

> > @@ -1153,26 +1249,22 @@

> netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,

> >   }

> >

> >   /*

> > - * The receive path for the vhost port is the TX path out from guest.

> > + * The receive path for the vhost cuse port is the TX path out from guest.

> >    */

> >   static int

> > -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,

> > +netdev_dpdk_vhost_cuse_rxq_recv(struct netdev_rxq *rxq,

> >                              struct dp_packet **packets, int *c)

> >   {

> >       struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

> >       struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

> > -    int qid = rxq->queue_id;

> > +    int qid = 1;

> >       uint16_t nb_rx = 0;

> >

> > -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {

> > +    if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) {

> >           return EAGAIN;

> >       }

> >

> > -    if (rxq->queue_id >= dev->real_n_rxq) {

> > -        return EOPNOTSUPP;

> > -    }

> > -

> > -    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM +

> VIRTIO_TXQ,

> > +    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid,

> >                                       dev->dpdk_mp->mp,

> >                                       (struct rte_mbuf **)packets,

> >                                       NETDEV_MAX_BURST);

> > @@ -1181,7 +1273,7 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq

> *rxq,

> >       }

> >

> >       rte_spinlock_lock(&dev->stats_lock);

> > -    netdev_dpdk_vhost_update_rx_counters(&dev->stats, packets,

> nb_rx);

> > +    netdev_dpdk_vhost_cuse_update_rx_counters(&dev->stats, packets,

> nb_rx);

> >       rte_spinlock_unlock(&dev->stats_lock);

> >

> >       *c = (int) nb_rx;

> > @@ -1217,6 +1309,18 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq,

> struct dp_packet **packets,

> >       return 0;

> >   }

> >

> > +static int

> > +netdev_dpdk_vhost_user_rxq_recv(struct netdev_rxq *rxq,

> > +                                struct dp_packet **packets, int *c)

> > +{

> > +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

> > +

> > +    if (rxq->queue_id >= dev->real_n_rxq) {

> > +        return EOPNOTSUPP;

> > +    }

> > +

> > +    return netdev_dpdk_rxq_recv(rxq, packets, c);

> > +}

> >   static inline int

> >   netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf

> **pkts,

> >                           int cnt)

> > @@ -1235,7 +1339,7 @@ netdev_dpdk_qos_run__(struct netdev_dpdk

> *dev, struct rte_mbuf **pkts,

> >   }

> >

> >   static inline void

> > -netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,

> > +netdev_dpdk_vhost_cuse_update_tx_counters(struct netdev_stats

> *stats,

> >                                        struct dp_packet **packets,

> >                                        int attempted,

> >                                        int dropped)

> > @@ -1252,9 +1356,8 @@ netdev_dpdk_vhost_update_tx_counters(struct

> netdev_stats *stats,

> >   }

> >

> >   static void

> > -__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,

> > -                         struct dp_packet **pkts, int cnt,

> > -                         bool may_steal)

> > +__netdev_dpdk_vhost_cuse_send(struct netdev *netdev, struct

> dp_packet **pkts,

> > +                         int cnt, bool may_steal)

> >   {

> >       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> >       struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

> > @@ -1263,26 +1366,24 @@ __netdev_dpdk_vhost_send(struct netdev

> *netdev, int qid,

> >       unsigned int qos_pkts = cnt;

> >       uint64_t start = 0;

> >

> > -    qid = dev->tx_q[qid % dev->real_n_txq].map;

> > -

> > -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0)) {

> > +    if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) {

> >           rte_spinlock_lock(&dev->stats_lock);

> >           dev->stats.tx_dropped+= cnt;

> >           rte_spinlock_unlock(&dev->stats_lock);

> >           goto out;

> >       }

> >

> > -    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> > +    /* There is a single vhost-cuse TX queue, So we need to lock it for TX. */

> > +    rte_spinlock_lock(&dev->vhost_cuse_tx_lock);

> >

> >       /* Check has QoS has been configured for the netdev */

> >       cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt);

> >       qos_pkts -= cnt;

> >

> >       do {

> > -        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;

> >           unsigned int tx_pkts;

> >

> > -        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid,

> > +        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, VIRTIO_RXQ,

> >                                             cur_pkts, cnt);

> >           if (OVS_LIKELY(tx_pkts)) {

> >               /* Packets have been sent.*/

> > @@ -1301,7 +1402,7 @@ __netdev_dpdk_vhost_send(struct netdev

> *netdev, int qid,

> >                * Unable to enqueue packets to vhost interface.

> >                * Check available entries before retrying.

> >                */

> > -            while (!rte_vring_available_entries(virtio_dev, vhost_qid)) {

> > +            while (!rte_vring_available_entries(virtio_dev, VIRTIO_RXQ)) {

> >                   if (OVS_UNLIKELY((rte_get_timer_cycles() - start) > timeout)) {

> >                       expired = 1;

> >                       break;

> > @@ -1313,12 +1414,12 @@ __netdev_dpdk_vhost_send(struct netdev

> *netdev, int qid,

> >               }

> >           }

> >       } while (cnt);

> > -

> > -    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

> > +    rte_spinlock_unlock(&dev->vhost_cuse_tx_lock);

> >

> >       rte_spinlock_lock(&dev->stats_lock);

> >       cnt += qos_pkts;

> > -    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts,

> total_pkts, cnt);

> > +    netdev_dpdk_vhost_cuse_update_tx_counters(&dev->stats, pkts,

> total_pkts,

> > +                                              cnt);

> >       rte_spinlock_unlock(&dev->stats_lock);

> >

> >   out:

> > @@ -1412,8 +1513,9 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid,

> struct dp_packet **pkts,

> >           newcnt++;

> >       }

> >

> > -    if (dev->type == DPDK_DEV_VHOST) {

> > -        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **)

> mbufs, newcnt, true);

> > +    if (dev->type == DPDK_DEV_VHOST_CUSE) {

> > +        __netdev_dpdk_vhost_cuse_send(netdev, (struct dp_packet **)

> mbufs,

> > +                                      newcnt, true);

> >       } else {

> >           unsigned int qos_pkts = newcnt;

> >

> > @@ -1437,8 +1539,8 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid,

> struct dp_packet **pkts,

> >   }

> >

> >   static int

> > -netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct

> dp_packet **pkts,

> > -                 int cnt, bool may_steal)

> > +netdev_dpdk_vhost_cuse_send(struct netdev *netdev, int qid

> OVS_UNUSED,

> > +                            struct dp_packet **pkts, int cnt, bool may_steal)

> >   {

> >       if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {

> >           int i;

> > @@ -1450,7 +1552,7 @@ netdev_dpdk_vhost_send(struct netdev

> *netdev, int qid, struct dp_packet **pkts,

> >               }

> >           }

> >       } else {

> > -        __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);

> > +        __netdev_dpdk_vhost_cuse_send(netdev, pkts, cnt, may_steal);

> >       }

> >       return 0;

> >   }

> > @@ -1461,11 +1563,6 @@ netdev_dpdk_send__(struct netdev_dpdk

> *dev, int qid,

> >   {

> 

> Now that this handles buffers for vhost, how is may_steal on/off catered

> for?

Looking into this.

> 

> >       int i;

> >

> > -    if (OVS_UNLIKELY(dev->txq_needs_locking)) {

> > -        qid = qid % dev->real_n_txq;

> > -        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

> > -    }

> > -

> 

> why is the possible locking removed?

That was an error - the locking is reinstated in the v2.

> 

> >       if (OVS_UNLIKELY(!may_steal ||

> >                        pkts[0]->source != DPBUF_DPDK)) {

> >           struct netdev *netdev = &dev->up;

> > @@ -1541,6 +1638,18 @@ netdev_dpdk_eth_send(struct netdev *netdev,

> int qid,

> >   }

> >

> >   static int

> > +netdev_dpdk_vhost_user_send(struct netdev *netdev, int qid,

> > +        struct dp_packet **pkts, int cnt, bool may_steal)

> > +{

> > +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> > +

> > +    qid = qid % dev->real_n_txq;

> 

> Due to the modulo, 2 cores can be trying to send on the same queue which

> makes for lock contention. The code was changed to avoid that but now

> the code to avoid and the locking seems to be gone so it could cause

> corruption.

Reintroduced the locking in v2.
> 

> > +

> > +    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);

> > +    return 0;

> > +}

> > +

> > +static int

> >   netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct

> eth_addr mac)

> >   {

> >       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> > @@ -1634,7 +1743,7 @@ static int

> >   netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);

> >

> >   static int

> > -netdev_dpdk_vhost_get_stats(const struct netdev *netdev,

> > +netdev_dpdk_vhost_cuse_get_stats(const struct netdev *netdev,

> >                               struct netdev_stats *stats)

> >   {

> >       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> > @@ -1796,14 +1905,14 @@ netdev_dpdk_get_carrier(const struct netdev

> *netdev, bool *carrier)

> >   }

> >

> >   static int

> > -netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool

> *carrier)

> > +netdev_dpdk_vhost_cuse_get_carrier(const struct netdev *netdev, bool

> *carrier)

> >   {

> >       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

> >       struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

> >

> >       ovs_mutex_lock(&dev->mutex);

> >

> > -    if (is_vhost_running(virtio_dev)) {

> > +    if (is_vhost_cuse_running(virtio_dev)) {

> >           *carrier = 1;

> >       } else {

> >           *carrier = 0;

> > @@ -1853,7 +1962,7 @@ netdev_dpdk_update_flags__(struct

> netdev_dpdk *dev,

> >           return 0;

> >       }

> >

> > -    if (dev->type == DPDK_DEV_ETH) {

> > +    if (dev->type != DPDK_DEV_VHOST_CUSE) {

> >           if (dev->flags & NETDEV_UP) {

> >               err = rte_eth_dev_start(dev->port_id);

> >               if (err)

> > @@ -1998,75 +2107,7 @@ set_irq_status(struct virtio_net *virtio_dev)

> >   }

> >

> >   /*

> > - * Fixes mapping for vhost-user tx queues. Must be called after each

> > - * enabling/disabling of queues and real_n_txq modifications.

> > - */

> > -static void

> > -netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)

> > -    OVS_REQUIRES(dev->mutex)

> > -{

> > -    int *enabled_queues, n_enabled = 0;

> > -    int i, k, total_txqs = dev->real_n_txq;

> > -

> > -    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof

> *enabled_queues);

> > -

> > -    for (i = 0; i < total_txqs; i++) {

> > -        /* Enabled queues always mapped to themselves. */

> > -        if (dev->tx_q[i].map == i) {

> > -            enabled_queues[n_enabled++] = i;

> > -        }

> > -    }

> > -

> > -    if (n_enabled == 0 && total_txqs != 0) {

> > -        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;

> > -        n_enabled = 1;

> > -    }

> > -

> > -    k = 0;

> > -    for (i = 0; i < total_txqs; i++) {

> > -        if (dev->tx_q[i].map != i) {

> > -            dev->tx_q[i].map = enabled_queues[k];

> > -            k = (k + 1) % n_enabled;

> > -        }

> > -    }

> > -

> > -    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);

> > -    for (i = 0; i < total_txqs; i++) {

> > -        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);

> > -    }

> > -

> > -    rte_free(enabled_queues);

> > -}

> > -

> > -static int

> > -netdev_dpdk_vhost_set_queues(struct netdev_dpdk *dev, struct

> virtio_net *virtio_dev)

> > -    OVS_REQUIRES(dev->mutex)

> > -{

> > -    uint32_t qp_num;

> > -

> > -    qp_num = virtio_dev->virt_qp_nb;

> > -    if (qp_num > dev->up.n_rxq) {

> > -        VLOG_ERR("vHost Device '%s' %"PRIu64" can't be added - "

> > -                 "too many queues %d > %d", virtio_dev->ifname, virtio_dev-

> >device_fh,

> > -                 qp_num, dev->up.n_rxq);

> > -        return -1;

> > -    }

> > -

> > -    dev->real_n_rxq = qp_num;

> > -    dev->real_n_txq = qp_num;

> > -    dev->txq_needs_locking = true;

> > -    /* Enable TX queue 0 by default if it wasn't disabled. */

> > -    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {

> > -        dev->tx_q[0].map = 0;

> > -    }

> > -

> > -    netdev_dpdk_remap_txqs(dev);

> > -

> > -    return 0;

> > -}

> > -

> > -/*

> > - * A new virtio-net device is added to a vhost port.

> > + * A new virtio-net device is added to a vhost cuse port.

> >    */

> >   static int

> >   new_device(struct virtio_net *virtio_dev)

> > @@ -2079,11 +2120,6 @@ new_device(struct virtio_net *virtio_dev)

> >       LIST_FOR_EACH(dev, list_node, &dpdk_list) {

> >           if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {

> >               ovs_mutex_lock(&dev->mutex);

> > -            if (netdev_dpdk_vhost_set_queues(dev, virtio_dev)) {

> > -                ovs_mutex_unlock(&dev->mutex);

> > -                ovs_mutex_unlock(&dpdk_mutex);

> > -                return -1;

> > -            }

> >               ovsrcu_set(&dev->virtio_dev, virtio_dev);

> >               exists = true;

> >               virtio_dev->flags |= VIRTIO_DEV_RUNNING;

> > @@ -2107,23 +2143,11 @@ new_device(struct virtio_net *virtio_dev)

> >       return 0;

> >   }

> >

> > -/* Clears mapping for all available queues of vhost interface. */

> > -static void

> > -netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)

> > -    OVS_REQUIRES(dev->mutex)

> > -{

> > -    int i;

> > -

> > -    for (i = 0; i < dev->real_n_txq; i++) {

> > -        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;

> > -    }

> > -}

> > -

> >   /*

> > - * Remove a virtio-net device from the specific vhost port.  Use dev-

> >remove

> > - * flag to stop any more packets from being sent or received to/from a VM

> and

> > - * ensure all currently queued packets have been sent/received before

> removing

> > - *  the device.

> > + * Remove a virtio-net device from the specific vhost cuse port. Use

> > + * dev->remove flag to stop any more packets from being sent or

> received

> > + * to/from a VM and ensure all currently queued packets have been

> sent/received

> > + * before removing the device.

> >    */

> >   static void

> >   destroy_device(volatile struct virtio_net *virtio_dev)

> > @@ -2138,7 +2162,6 @@ destroy_device(volatile struct virtio_net

> *virtio_dev)

> >               ovs_mutex_lock(&dev->mutex);

> >               virtio_dev->flags &= ~VIRTIO_DEV_RUNNING;

> >               ovsrcu_set(&dev->virtio_dev, NULL);

> > -            netdev_dpdk_txq_map_clear(dev);

> >               exists = true;

> >               ovs_mutex_unlock(&dev->mutex);

> >               break;

> > @@ -2166,49 +2189,6 @@ destroy_device(volatile struct virtio_net

> *virtio_dev)

> >       }

> >   }

> >

> > -static int

> > -vring_state_changed(struct virtio_net *virtio_dev, uint16_t queue_id,

> > -                    int enable)

> > -{

> > -    struct netdev_dpdk *dev;

> > -    bool exists = false;

> > -    int qid = queue_id / VIRTIO_QNUM;

> > -

> > -    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {

> > -        return 0;

> > -    }

> > -

> > -    ovs_mutex_lock(&dpdk_mutex);

> > -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {

> > -        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {

> > -            ovs_mutex_lock(&dev->mutex);

> > -            if (enable) {

> > -                dev->tx_q[qid].map = qid;

> > -            } else {

> > -                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;

> > -            }

> > -            netdev_dpdk_remap_txqs(dev);

> > -            exists = true;

> > -            ovs_mutex_unlock(&dev->mutex);

> > -            break;

> > -        }

> > -    }

> > -    ovs_mutex_unlock(&dpdk_mutex);

> > -

> > -    if (exists) {

> > -        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"

> > -                  PRIu64" changed to \'%s\'", queue_id, qid,

> > -                  virtio_dev->ifname, virtio_dev->device_fh,

> > -                  (enable == 1) ? "enabled" : "disabled");

> > -    } else {

> > -        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev-

> >ifname,

> > -                  virtio_dev->device_fh);

> > -        return -1;

> > -    }

> > -

> > -    return 0;

> > -}

> > -

> >   struct virtio_net *

> >   netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)

> >   {

> > @@ -2216,18 +2196,17 @@ netdev_dpdk_get_virtio(const struct

> netdev_dpdk *dev)

> >   }

> >

> >   /*

> > - * These callbacks allow virtio-net devices to be added to vhost ports when

> > - * configuration has been fully complete.

> > + * These callbacks allow virtio-net devices to be added to vhost cuse ports

> > + * when configuration has been fully complete.

> >    */

> >   static const struct virtio_net_device_ops virtio_net_device_ops =

> >   {

> >       .new_device =  new_device,

> >       .destroy_device = destroy_device,

> > -    .vring_state_changed = vring_state_changed

> >   };

> >

> >   static void *

> > -start_vhost_loop(void *dummy OVS_UNUSED)

> > +start_vhost_cuse_loop(void *dummy OVS_UNUSED)

> >   {

> >        pthread_detach(pthread_self());

> >        /* Put the cuse thread into quiescent state. */

> > @@ -2237,19 +2216,7 @@ start_vhost_loop(void *dummy OVS_UNUSED)

> >   }

> >

> >   static int

> > -dpdk_vhost_class_init(void)

> > -{

> > -    rte_vhost_driver_callback_register(&virtio_net_device_ops);

> > -    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4

> > -                            | 1ULL << VIRTIO_NET_F_HOST_TSO6

> > -                            | 1ULL << VIRTIO_NET_F_CSUM);

> 

> I think you'll still want to set these for vhostuser.

I didn't think there was an equivalent function to expose this in the PMD but there is! Used in the v2.

> 

> > -

> > -    ovs_thread_create("vhost_thread", start_vhost_loop, NULL);

> > -    return 0;

> > -}

> > -

> > -static int

> > -dpdk_vhost_cuse_class_init(void)

> > +netdev_dpdk_vhost_cuse_class_init(void)

> >   {

> >       int err = -1;

> >

> > @@ -2265,14 +2232,12 @@ dpdk_vhost_cuse_class_init(void)

> >           return -1;

> >       }

> >

> > -    dpdk_vhost_class_init();

> > -    return 0;

> > -}

> > +    rte_vhost_driver_callback_register(&virtio_net_device_ops);

> > +    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4

> > +                            | 1ULL << VIRTIO_NET_F_HOST_TSO6

> > +                            | 1ULL << VIRTIO_NET_F_CSUM);

> > +    ovs_thread_create("vhost_cuse_thread", start_vhost_cuse_loop,

> NULL);

> >

> > -static int

> > -dpdk_vhost_user_class_init(void)

> > -{

> > -    dpdk_vhost_class_init();

> >       return 0;

> >   }

> >

> > @@ -2859,30 +2824,30 @@ static const struct netdev_class dpdk_ring_class

> =

> >   static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =

> >       NETDEV_DPDK_CLASS(

> >           "dpdkvhostcuse",

> > -        dpdk_vhost_cuse_class_init,

> > +        netdev_dpdk_vhost_cuse_class_init,

> >           netdev_dpdk_vhost_cuse_construct,

> > -        netdev_dpdk_vhost_destruct,

> > +        netdev_dpdk_vhost_cuse_destruct,

> >           netdev_dpdk_vhost_cuse_set_multiq,

> > -        netdev_dpdk_vhost_send,

> > -        netdev_dpdk_vhost_get_carrier,

> > -        netdev_dpdk_vhost_get_stats,

> > +        netdev_dpdk_vhost_cuse_send,

> > +        netdev_dpdk_vhost_cuse_get_carrier,

> > +        netdev_dpdk_vhost_cuse_get_stats,

> >           NULL,

> >           NULL,

> > -        netdev_dpdk_vhost_rxq_recv);

> > +        netdev_dpdk_vhost_cuse_rxq_recv);

> >

> >   static const struct netdev_class OVS_UNUSED dpdk_vhost_user_class =

> >       NETDEV_DPDK_CLASS(

> >           "dpdkvhostuser",

> > -        dpdk_vhost_user_class_init,

> > -        netdev_dpdk_vhost_user_construct,

> > -        netdev_dpdk_vhost_destruct,

> > -        netdev_dpdk_vhost_set_multiq,

> > -        netdev_dpdk_vhost_send,

> > -        netdev_dpdk_vhost_get_carrier,

> > -        netdev_dpdk_vhost_get_stats,

> > -        NULL,

> >           NULL,

> > -        netdev_dpdk_vhost_rxq_recv);

> > +        netdev_dpdk_vhost_user_construct,

> > +        netdev_dpdk_destruct,

> > +        netdev_dpdk_set_multiq,

> > +        netdev_dpdk_vhost_user_send,

> > +        netdev_dpdk_get_carrier,

> > +        netdev_dpdk_get_stats,

> 

> Are all the phy type stats available for vhost? or do you need to check

> the netdev type in that function and report specific stats. I didn't

> think DPDK would report things like rx_dropped for vhost but I'll be

> glad to be wrong.

Fixed in v2.

> 

> > +        netdev_dpdk_get_features,

> > +        netdev_dpdk_get_status,

> > +        netdev_dpdk_vhost_user_rxq_recv);

> >

> >   void

> >   netdev_dpdk_register(void)

> >


Thanks for the thorough review Kevin. Hopefully I've addressed your comments. The v2 will be posted shortly.

Thanks,
Ciara
Ciara Loftus May 10, 2016, 9:38 a.m. UTC | #6
> 
> On 10.05.2016 11:31, Ilya Maximets wrote:
> > On 03.05.2016 14:28, ciara.loftus at intel.com (Loftus, Ciara) wrote:
> >>> This patch seem to remove a lot of txq remapping functions (like
> >>> netdev_dpdk_remap_txqs).Ã,  How does it handle the case of a
> disabled txq in
> >>> the guest kernel?
> >> There is a difference in the amount of information we can get about
> vrings
> >> in OVS now. With the PMD, we no longer have direct access to the
> virtio_net
> >> structure. We used to use virto_net->virt_qp_nb to determine the
> number of
> >> vrings (enabled and disabled) in the guest kernel, and we could map
> disabled
> >> onto enabled accordingly. Now with the PMD, we only get vring
> information as
> >> their state changes. eg. VM with 2 vrings enabled -> we assume there are
> only
> >> 2 vrings, even though there may be many more that are disabled. We
> don't need
> >> to map because we aren't aware of the disabled queues.
> >
> > virtio protocol still allows to disable random queue in guest. This patch will
> > work only with linux kernel virtio driver on guest side and just because linux
> > kernel driver always enables/disables queues sequentially. For example,
> you may
> > write your own application with virtio-user with 2 rx/tx queues in guest and
> > disable rx queue #0. This scenario will lead to broken connection while
> > queue #1 still enabled.
> >
> > Best regards, Ilya Maximets.
> >

Hi Ilya,

Apologies I didn't see your mail before I sent a v2 of the patch.
Thanks for the information. My testing involved the kernel driver and DPDK driver in the guest so it did not expose this type of issue.

With the PMD we now receive the following data struct during vring_state_changed_callback:

struct rte_eth_vhost_queue_event {
	uint16_t queue_id;
	bool rx;
	bool enable;
};

Since we have queue_id information we should be able correctly handle the case you mentioned above. I will look to implement a solution to this in the v3.

Thanks,
Ciara
Ilya Maximets May 10, 2016, 9:51 a.m. UTC | #7
On 10.05.2016 12:38, Loftus, Ciara wrote:
>>
>> On 10.05.2016 11:31, Ilya Maximets wrote:
>>> On 03.05.2016 14:28, ciara.loftus at intel.com (Loftus, Ciara) wrote:
>>>>> This patch seems to remove a lot of txq remapping functions (like
>>>>> netdev_dpdk_remap_txqs).  How does it handle the case of a
>> disabled txq in
>>>>> the guest kernel?
>>>> There is a difference in the amount of information we can get about
>> vrings
>>>> in OVS now. With the PMD, we no longer have direct access to the
>> virtio_net
>>>> structure. We used to use virto_net->virt_qp_nb to determine the
>> number of
>>>> vrings (enabled and disabled) in the guest kernel, and we could map
>> disabled
>>>> onto enabled accordingly. Now with the PMD, we only get vring
>> information as
>>>> their state changes. eg. VM with 2 vrings enabled -> we assume there are
>> only
>>>> 2 vrings, even though there may be many more that are disabled. We
>> don't need
>>>> to map because we aren't aware of the disabled queues.
>>>
>>> virtio protocol still allows to disable random queue in guest. This patch will
>>> work only with linux kernel virtio driver on guest side and just because linux
>>> kernel driver always enables/disables queues sequentially. For example,
>> you may
>>> write your own application with virtio-user with 2 rx/tx queues in guest and
>>> disable rx queue #0. This scenario will lead to broken connection while
>>> queue #1 still enabled.
>>>
>>> Best regards, Ilya Maximets.
>>>
> 
> Hi Ilya,
> 
> Apologies I didn't see your mail before I sent a v2 of the patch.
> Thanks for the information. My testing involved the kernel driver and DPDK driver in the guest so it did not expose this type of issue.
> 
> With the PMD we now receive the following data struct during vring_state_changed_callback:
> 
> struct rte_eth_vhost_queue_event {
> 	uint16_t queue_id;
> 	bool rx;
> 	bool enable;
> };
> 
> Since we have queue_id information we should be able correctly handle the case you mentioned above. I will look to implement a solution to this in the v3.

OK. But solution already exists. You may just preserve mechanism of remapping.
It was implemented exactly for this situation.

Best regards, Ilya Maximets.
Panu Matilainen May 10, 2016, 1:51 p.m. UTC | #8
On 04/30/2016 03:23 AM, Daniele Di Proietto wrote:

> I see that vhost-cuse is still handled separately. Is it possible to use
> the vhost pmd also for vhost-cuse? Otherwise we still basically have to
> handle differently three cases: NIC PMD, vhost user pmd, vhost cuse.  Maybe
> it's time to remove vhost-cuse (I understand this is a separate issue,
> though)?

It's a separate issue but sufficiently related that its worth rehashing 
now - getting rid of it first would presumably simplify the remaining 
code quite a bit.

At least my personal experience with it has been that vhost-cuse only 
ever worked once in a full moon with stars aligned just so, and with 
just the right magic kernel versions in host and guest.

My 5c? Just put the poor thing out of its misery already :)

	- Panu -
Kevin Traynor May 13, 2016, 9:53 a.m. UTC | #9
> -----Original Message-----

> From: dev [mailto:dev-bounces@openvswitch.org] On Behalf Of Panu

> Matilainen

> Sent: Tuesday, May 10, 2016 2:51 PM

> To: Daniele Di Proietto <diproiettod@ovn.org>; Loftus, Ciara

> <ciara.loftus@intel.com>

> Cc: dev@openvswitch.org

> Subject: Re: [ovs-dev] [PATCH] netdev-dpdk: Add vHost User PMD

> 

> On 04/30/2016 03:23 AM, Daniele Di Proietto wrote:

> 

> > I see that vhost-cuse is still handled separately. Is it possible to

> use

> > the vhost pmd also for vhost-cuse? Otherwise we still basically have

> to

> > handle differently three cases: NIC PMD, vhost user pmd, vhost cuse.

> Maybe

> > it's time to remove vhost-cuse (I understand this is a separate

> issue,

> > though)?

> 

> It's a separate issue but sufficiently related that its worth

> rehashing

> now - getting rid of it first would presumably simplify the remaining

> code quite a bit.

> 

> At least my personal experience with it has been that vhost-cuse only

> ever worked once in a full moon with stars aligned just so, and with

> just the right magic kernel versions in host and guest.

> 

> My 5c? Just put the poor thing out of its misery already :)

> 

> 	- Panu -


Hi Panu,

I didn't share the pain in running vhost-cuse (although it's a long time
since I ran it) but I agree with removing it. vhost-user is clearly the
way to go now, with better support in QEMU/DPDK and no out of tree kernel
module.

QEMU/DPDK support - vhost-user has been available since QEMU 2.1/2.
Anyone willing to upgrade to a new OVS release without vhost-cuse would
surely be ok to use a version of QEMU that supports vhost-user, multiqueue
etc. 

Distro's - the latest Fedora, Ubuntu all come shipped with versions of QEMU
that support vhost-user. Centos has QEMU 2.0 but that supports neither
vhost-cuse or vhost-user.

Code - now that we have a vhost-user pmd in DPDK, removing vhost-cuse will
be a significant clean up. 

+1 from me.

Kevin.


> _______________________________________________

> dev mailing list

> dev@openvswitch.org

> http://openvswitch.org/mailman/listinfo/dev
Kevin Traynor May 13, 2016, 1:13 p.m. UTC | #10
> -----Original Message-----

> From: Loftus, Ciara

> Sent: Tuesday, May 10, 2016 10:22 AM

> To: Traynor, Kevin <kevin.traynor@intel.com>

> Cc: dev@openvswitch.org

> Subject: RE: [ovs-dev] [PATCH] netdev-dpdk: Add vHost User PMD

> 

> > On 21/04/2016 13:20, Ciara Loftus wrote:

> > > DPDK 16.04 introduces the vHost PMD which allows 'dpdkvhostuser'

> ports

> > > to be controlled by the librte_ether API, like physical 'dpdk'

> ports.

> > > The commit integrates this functionality into OVS, and refactors

> some

> > > of the existing vhost code such that it is vhost-cuse specific.

> > > Similarly, there is now some overlap between dpdk and vhost-user

> port

> > > code.

> > >

> > > Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>

> > > ---

> > >   INSTALL.DPDK.md   |  12 ++

> > >   NEWS              |   2 +

> > >   lib/netdev-dpdk.c | 515 +++++++++++++++++++++++++---------------

> ------

> > --------

> >

> > Hi Ciara, there's a lot of churn in this file. It might be worth

> > considering to see if it could be split through a few commits

> commits to

> > help reviewers. e.g. new features like adding get_features,

> get_status

> > for vhost could be a separate patch at least.

> 

> I've split into 3:

> - remove watchdog

> - add pmd

> - add get_stats & get_features


Great - this helps review a lot

> 

> Couldn't quite find a way to split it up more.

> 

> >

> > >   3 files changed, 254 insertions(+), 275 deletions(-)

> > >   mode change 100644 => 100755 lib/netdev-dpdk.c

> >

> > file permission change.

> 

> Woops. Fixed in v2.

> 

> >

> > >

> > > diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md

> > > index 7f76df8..5006812 100644

> > > --- a/INSTALL.DPDK.md

> > > +++ b/INSTALL.DPDK.md

> > > @@ -945,6 +945,18 @@ Restrictions:

> > >       increased to the desired number of queues. Both DPDK and OVS

> must

> > be

> > >       recompiled for this change to take effect.

> > >

> > > +  DPDK 'eth' type ports:

> > > +  - dpdk, dpdkr and dpdkvhostuser ports are 'eth' type ports in

> the context

> > of

> > > +    DPDK as they are all managed by the rte_ether API. This means

> that

> > they

> > > +    adhere to the DPDK configuration option

> CONFIG_RTE_MAX_ETHPORTS

> > which by

> > > +    default is set to 32. This means by default the combined

> total number of

> > > +    dpdk, dpdkr and dpdkvhostuser ports allowable in OVS with

> DPDK is 32.

> > This

> > > +    value can be changed if desired by modifying the

> configuration file in

> > > +    DPDK, or by overriding the default value on the command line

> when

> > building

> > > +    DPDK. eg.

> > > +

> > > +        `make install CONFIG_RTE_MAX_ETHPORTS=64`

> >

> > format is not registering right for this in my md viewer.

> 

> It's looking ok on mine. What doesn't look right?


The ``'s are showing in atom.io - could be just atom or schema related.

> 

> >

> > > +

> > >   Bug Reporting:

> > >   --------------

> > >

> > > diff --git a/NEWS b/NEWS

> > > index ea7f3a1..4dc0201 100644

> > > --- a/NEWS

> > > +++ b/NEWS

> > > @@ -26,6 +26,8 @@ Post-v2.5.0

> > >          assignment.

> > >        * Type of log messages from PMD threads changed from INFO

> to DBG.

> > >        * QoS functionality with sample egress-policer

> implementation.

> > > +     * vHost PMD integration brings vhost-user ports under

> control of the

> > > +       rte_ether DPDK API.

> > >      - ovs-benchmark: This utility has been removed due to lack of

> use and

> > >        bitrot.

> > >      - ovs-appctl:

> > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c

> > > old mode 100644

> > > new mode 100755

> > > index 208c5f5..4fccd63

> > > --- a/lib/netdev-dpdk.c

> > > +++ b/lib/netdev-dpdk.c

> > > @@ -56,6 +56,7 @@

> > >   #include "rte_mbuf.h"

> > >   #include "rte_meter.h"

> > >   #include "rte_virtio_net.h"

> > > +#include "rte_eth_vhost.h"

> >

> > nit: generally these go in alphabetical order.

> 

> Ok

> 

> >

> > >

> > >   VLOG_DEFINE_THIS_MODULE(dpdk);

> > >   static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

> > > @@ -109,6 +110,8 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF /

> > ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))

> > >

> > >   static char *cuse_dev_name = NULL;    /* Character device

> > cuse_dev_name. */

> > >   static char *vhost_sock_dir = NULL;   /* Location of vhost-user

> sockets */

> > > +/* Array that tracks the used & unused vHost user driver IDs */

> > > +static unsigned int vhost_user_drv_ids[RTE_MAX_ETHPORTS];

> > >

> > >   /*

> > >    * Maximum amount of time in micro seconds to try and enqueue to

> > vhost.

> > > @@ -143,7 +146,8 @@ enum { DRAIN_TSC = 200000ULL };

> > >

> > >   enum dpdk_dev_type {

> > >       DPDK_DEV_ETH = 0,

> > > -    DPDK_DEV_VHOST = 1,

> > > +    DPDK_DEV_VHOST_USER = 1,

> > > +    DPDK_DEV_VHOST_CUSE = 2,

> > >   };

> > >

> > >   static int rte_eal_init_ret = ENODEV;

> > > @@ -275,8 +279,6 @@ struct dpdk_tx_queue {

> > >                                       * from concurrent access.

> It is used only

> > >                                       * if the queue is shared

> among different

> > >                                       * pmd threads (see

> 'txq_needs_locking'). */

> > > -    int map;                       /* Mapping of configured

> vhost-user queues

> > > -                                    * to enabled by guest. */

> > >       uint64_t tsc;

> > >       struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];

> > >   };

> > > @@ -329,12 +331,22 @@ struct netdev_dpdk {

> > >       int real_n_rxq;

> > >       bool txq_needs_locking;

> > >

> > > -    /* virtio-net structure for vhost device */

> > > +    /* Spinlock for vhost cuse transmission. Other DPDK devices

> use

> > spinlocks

> > > +     * in dpdk_tx_queue */

> > > +    rte_spinlock_t vhost_cuse_tx_lock;

> >

> > Why can't we continue to use the lock in dpdk_tx_queue? rather than

> > adding a cuse specific lock.

> 

> This logic is taken from netdev-dpdk pre-multiqueue. I assumed it was

> the safest way to go since it's tried and tested.


The queue already has a lock, so I don't see the need to introduce
a new one. Anyway, it could be a moot point if people think cuse can be
removed.
diff mbox

Patch

diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
index 7f76df8..5006812 100644
--- a/INSTALL.DPDK.md
+++ b/INSTALL.DPDK.md
@@ -945,6 +945,18 @@  Restrictions:
     increased to the desired number of queues. Both DPDK and OVS must be
     recompiled for this change to take effect.
 
+  DPDK 'eth' type ports:
+  - dpdk, dpdkr and dpdkvhostuser ports are 'eth' type ports in the context of
+    DPDK as they are all managed by the rte_ether API. This means that they
+    adhere to the DPDK configuration option CONFIG_RTE_MAX_ETHPORTS which by
+    default is set to 32. This means by default the combined total number of
+    dpdk, dpdkr and dpdkvhostuser ports allowable in OVS with DPDK is 32. This
+    value can be changed if desired by modifying the configuration file in
+    DPDK, or by overriding the default value on the command line when building
+    DPDK. eg.
+
+        `make install CONFIG_RTE_MAX_ETHPORTS=64`
+
 Bug Reporting:
 --------------
 
diff --git a/NEWS b/NEWS
index ea7f3a1..4dc0201 100644
--- a/NEWS
+++ b/NEWS
@@ -26,6 +26,8 @@  Post-v2.5.0
        assignment.
      * Type of log messages from PMD threads changed from INFO to DBG.
      * QoS functionality with sample egress-policer implementation.
+     * vHost PMD integration brings vhost-user ports under control of the
+       rte_ether DPDK API.
    - ovs-benchmark: This utility has been removed due to lack of use and
      bitrot.
    - ovs-appctl:
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
old mode 100644
new mode 100755
index 208c5f5..4fccd63
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -56,6 +56,7 @@ 
 #include "rte_mbuf.h"
 #include "rte_meter.h"
 #include "rte_virtio_net.h"
+#include "rte_eth_vhost.h"
 
 VLOG_DEFINE_THIS_MODULE(dpdk);
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
@@ -109,6 +110,8 @@  BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
 
 static char *cuse_dev_name = NULL;    /* Character device cuse_dev_name. */
 static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */
+/* Array that tracks the used & unused vHost user driver IDs */
+static unsigned int vhost_user_drv_ids[RTE_MAX_ETHPORTS];
 
 /*
  * Maximum amount of time in micro seconds to try and enqueue to vhost.
@@ -143,7 +146,8 @@  enum { DRAIN_TSC = 200000ULL };
 
 enum dpdk_dev_type {
     DPDK_DEV_ETH = 0,
-    DPDK_DEV_VHOST = 1,
+    DPDK_DEV_VHOST_USER = 1,
+    DPDK_DEV_VHOST_CUSE = 2,
 };
 
 static int rte_eal_init_ret = ENODEV;
@@ -275,8 +279,6 @@  struct dpdk_tx_queue {
                                     * from concurrent access.  It is used only
                                     * if the queue is shared among different
                                     * pmd threads (see 'txq_needs_locking'). */
-    int map;                       /* Mapping of configured vhost-user queues
-                                    * to enabled by guest. */
     uint64_t tsc;
     struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
 };
@@ -329,12 +331,22 @@  struct netdev_dpdk {
     int real_n_rxq;
     bool txq_needs_locking;
 
-    /* virtio-net structure for vhost device */
+    /* Spinlock for vhost cuse transmission. Other DPDK devices use spinlocks
+     * in dpdk_tx_queue */
+    rte_spinlock_t vhost_cuse_tx_lock;
+
+    /* virtio-net structure for vhost cuse device */
     OVSRCU_TYPE(struct virtio_net *) virtio_dev;
 
     /* Identifier used to distinguish vhost devices from each other */
     char vhost_id[PATH_MAX];
 
+    /* ID of vhost user port given to the PMD driver */
+    unsigned int vhost_pmd_id;
+
+    /* Number of virtqueue pairs reported by the guest */
+    uint32_t reported_queues;
+
     /* In dpdk_list. */
     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
 
@@ -352,13 +364,20 @@  struct netdev_rxq_dpdk {
 static bool dpdk_thread_is_pmd(void);
 
 static int netdev_dpdk_construct(struct netdev *);
+static int netdev_dpdk_vhost_user_construct(struct netdev *);
 
 struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);
 
+void vring_state_changed_callback(uint8_t port_id,
+        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
+void device_state_changed_callback(uint8_t port_id,
+        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
+
 static bool
 is_dpdk_class(const struct netdev_class *class)
 {
-    return class->construct == netdev_dpdk_construct;
+    return ((class->construct == netdev_dpdk_construct) ||
+            (class->construct == netdev_dpdk_vhost_user_construct));
 }
 
 /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
@@ -581,7 +600,9 @@  dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
         }
 
         dev->up.n_rxq = n_rxq;
-        dev->real_n_txq = n_txq;
+        if (dev->type == DPDK_DEV_ETH) {
+            dev->real_n_txq = n_txq;
+        }
 
         return 0;
     }
@@ -672,11 +693,72 @@  netdev_dpdk_alloc_txq(struct netdev_dpdk *dev, unsigned int n_txqs)
             /* Queues are shared among CPUs. Always flush */
             dev->tx_q[i].flush_tx = true;
         }
+    }
+}
 
-        /* Initialize map for vhost devices. */
-        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
-        rte_spinlock_init(&dev->tx_q[i].tx_lock);
+void
+vring_state_changed_callback(uint8_t port_id,
+                             enum rte_eth_event_type type OVS_UNUSED,
+                             void *param OVS_UNUSED)
+{
+    struct netdev_dpdk *dev;
+    struct rte_eth_vhost_queue_event event;
+    int err = 0;
+
+    err = rte_eth_vhost_get_queue_event(port_id, &event);
+    if (err || (event.rx == 1)) {
+        return;
     }
+
+    ovs_mutex_lock(&dpdk_mutex);
+    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
+        if (port_id == dev->port_id) {
+            ovs_mutex_lock(&dev->mutex);
+            if (event.enable) {
+                dev->reported_queues++;
+            } else {
+                dev->reported_queues--;
+            }
+            dev->real_n_rxq = dev->reported_queues;
+            dev->real_n_txq = dev->reported_queues;
+            dev->txq_needs_locking = true;
+            netdev_dpdk_alloc_txq(dev, dev->real_n_txq);
+            ovs_mutex_unlock(&dev->mutex);
+            break;
+        }
+    }
+    ovs_mutex_unlock(&dpdk_mutex);
+
+    return;
+}
+
+void
+device_state_changed_callback(uint8_t port_id,
+                              enum rte_eth_event_type type OVS_UNUSED,
+                              void *param OVS_UNUSED)
+{
+    struct netdev_dpdk *dev;
+
+    ovs_mutex_lock(&dpdk_mutex);
+    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
+        if (port_id == dev->port_id) {
+            ovs_mutex_lock(&dev->mutex);
+            check_link_status(dev);
+            if (dev->link.link_status == ETH_LINK_UP) {
+                /* new device */
+                VLOG_INFO("vHost Device '%s' has been added", dev->vhost_id);
+
+            } else {
+                /* destroy device */
+                VLOG_INFO("vHost Device '%s' has been removed", dev->vhost_id);
+            }
+            ovs_mutex_unlock(&dev->mutex);
+            break;
+        }
+    }
+    ovs_mutex_unlock(&dpdk_mutex);
+
+    return;
 }
 
 static int
@@ -688,6 +770,7 @@  netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
     int sid;
     int err = 0;
     uint32_t buf_size;
+    unsigned int n_queues = 0;
 
     ovs_mutex_init(&dev->mutex);
     ovs_mutex_lock(&dev->mutex);
@@ -697,7 +780,7 @@  netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
     /* If the 'sid' is negative, it means that the kernel fails
      * to obtain the pci numa info.  In that situation, always
      * use 'SOCKET0'. */
-    if (type == DPDK_DEV_ETH) {
+    if (type != DPDK_DEV_VHOST_CUSE) {
         sid = rte_eth_dev_socket_id(port_no);
     } else {
         sid = rte_lcore_to_socket_id(rte_get_master_lcore());
@@ -709,6 +792,8 @@  netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
     dev->flags = 0;
     dev->mtu = ETHER_MTU;
     dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
+    dev->vhost_pmd_id = 0;
+    dev->reported_queues = 0;
 
     buf_size = dpdk_buf_size(dev->mtu);
     dev->dpdk_mp = dpdk_mp_get(dev->socket_id, FRAME_LEN_TO_MTU(buf_size));
@@ -726,14 +811,23 @@  netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
     netdev->requested_n_rxq = NR_QUEUE;
     dev->real_n_txq = NR_QUEUE;
 
-    if (type == DPDK_DEV_ETH) {
-        netdev_dpdk_alloc_txq(dev, NR_QUEUE);
+    n_queues = (type == DPDK_DEV_VHOST_CUSE) ? OVS_VHOST_MAX_QUEUE_NUM :
+                                               NR_QUEUE;
+    netdev_dpdk_alloc_txq(dev, n_queues);
+    if (type != DPDK_DEV_VHOST_CUSE) {
         err = dpdk_eth_dev_init(dev);
         if (err) {
             goto unlock;
         }
-    } else {
-        netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);
+    }
+
+    if (type == DPDK_DEV_VHOST_USER) {
+        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_QUEUE_STATE,
+                                     (void*)vring_state_changed_callback,
+                                      NULL);
+        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,
+                                     (void*)device_state_changed_callback,
+                                      NULL);
     }
 
     ovs_list_push_back(&dpdk_list, &dev->list_node);
@@ -768,26 +862,37 @@  dpdk_dev_parse_name(const char dev_name[], const char prefix[],
 }
 
 static int
-vhost_construct_helper(struct netdev *netdev) OVS_REQUIRES(dpdk_mutex)
+netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
 {
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+    int err;
+
+    ovs_mutex_lock(&dpdk_mutex);
+    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));
     if (rte_eal_init_ret) {
         return rte_eal_init_ret;
     }
 
-    return netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);
+    rte_spinlock_init(&dev->vhost_cuse_tx_lock);
+
+    err = netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST_CUSE);
+
+    ovs_mutex_unlock(&dpdk_mutex);
+    return err;
 }
 
 static int
-netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
+get_vhost_user_drv_id(void)
 {
-    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
-    int err;
+    int i = 0;
 
-    ovs_mutex_lock(&dpdk_mutex);
-    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));
-    err = vhost_construct_helper(netdev);
-    ovs_mutex_unlock(&dpdk_mutex);
-    return err;
+    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
+        if (vhost_user_drv_ids[i] == 0) {
+            return i;
+        }
+    }
+
+    return -1;
 }
 
 static int
@@ -796,6 +901,9 @@  netdev_dpdk_vhost_user_construct(struct netdev *netdev)
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     const char *name = netdev->name;
     int err;
+    uint8_t port_no = 0;
+    char devargs[4096];
+    int driver_id = 0;
 
     /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
      * the file system. '/' or '\' would traverse directories, so they're not
@@ -807,22 +915,35 @@  netdev_dpdk_vhost_user_construct(struct netdev *netdev)
         return EINVAL;
     }
 
+    if (rte_eal_init_ret) {
+        return rte_eal_init_ret;
+    }
+
     ovs_mutex_lock(&dpdk_mutex);
     /* Take the name of the vhost-user port and append it to the location where
      * the socket is to be created, then register the socket.
      */
     snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s",
              vhost_sock_dir, name);
+    driver_id = get_vhost_user_drv_id();
+    if (driver_id == -1) {
+        VLOG_ERR("Too many vhost-user devices registered with PMD");
+        err = ENODEV;
+    } else {
+        snprintf(devargs, sizeof(devargs), "eth_vhost%u,iface=%s,queues=%i",
+                 driver_id, dev->vhost_id, RTE_MAX_QUEUES_PER_PORT);
+        err = rte_eth_dev_attach(devargs, &port_no);
+    }
 
-    err = rte_vhost_driver_register(dev->vhost_id);
     if (err) {
-        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
-                 dev->vhost_id);
+        VLOG_ERR("Failed to attach vhost-user device to DPDK");
     } else {
         fatal_signal_add_file_to_unlink(dev->vhost_id);
         VLOG_INFO("Socket %s created for vhost-user port %s\n",
                   dev->vhost_id, name);
-        err = vhost_construct_helper(netdev);
+        dev->vhost_pmd_id = driver_id;
+        vhost_user_drv_ids[dev->vhost_pmd_id] = 1;
+        err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_VHOST_USER);
     }
 
     ovs_mutex_unlock(&dpdk_mutex);
@@ -857,6 +978,12 @@  netdev_dpdk_destruct(struct netdev *netdev)
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 
     ovs_mutex_lock(&dev->mutex);
+
+    if (dev->type == DPDK_DEV_VHOST_USER) {
+        rte_eth_dev_detach(dev->port_id, dev->vhost_id);
+        vhost_user_drv_ids[dev->vhost_pmd_id] = 0;
+    }
+
     rte_eth_dev_stop(dev->port_id);
     ovs_mutex_unlock(&dev->mutex);
 
@@ -868,7 +995,7 @@  netdev_dpdk_destruct(struct netdev *netdev)
 }
 
 static void
-netdev_dpdk_vhost_destruct(struct netdev *netdev)
+netdev_dpdk_vhost_cuse_destruct(struct netdev *netdev)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 
@@ -876,15 +1003,8 @@  netdev_dpdk_vhost_destruct(struct netdev *netdev)
     if (netdev_dpdk_get_virtio(dev) != NULL) {
         VLOG_ERR("Removing port '%s' while vhost device still attached.",
                  netdev->name);
-        VLOG_ERR("To restore connectivity after re-adding of port, VM on socket"
-                 " '%s' must be restarted.",
-                 dev->vhost_id);
-    }
-
-    if (rte_vhost_driver_unregister(dev->vhost_id)) {
-        VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);
-    } else {
-        fatal_signal_remove_file_to_unlink(dev->vhost_id);
+        VLOG_ERR("To restore connectivity after re-adding of port, VM with"
+                 "port '%s' must be restarted.", dev->vhost_id);
     }
 
     ovs_mutex_lock(&dpdk_mutex);
@@ -1000,30 +1120,6 @@  netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned int n_txq,
     netdev->n_txq = n_txq;
     dev->real_n_txq = 1;
     netdev->n_rxq = 1;
-    dev->txq_needs_locking = dev->real_n_txq != netdev->n_txq;
-
-    ovs_mutex_unlock(&dev->mutex);
-    ovs_mutex_unlock(&dpdk_mutex);
-
-    return err;
-}
-
-static int
-netdev_dpdk_vhost_set_multiq(struct netdev *netdev, unsigned int n_txq,
-                             unsigned int n_rxq)
-{
-    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
-    int err = 0;
-
-    if (netdev->n_txq == n_txq && netdev->n_rxq == n_rxq) {
-        return err;
-    }
-
-    ovs_mutex_lock(&dpdk_mutex);
-    ovs_mutex_lock(&dev->mutex);
-
-    netdev->n_txq = n_txq;
-    netdev->n_rxq = n_rxq;
 
     ovs_mutex_unlock(&dev->mutex);
     ovs_mutex_unlock(&dpdk_mutex);
@@ -1118,13 +1214,13 @@  dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
 }
 
 static bool
-is_vhost_running(struct virtio_net *virtio_dev)
+is_vhost_cuse_running(struct virtio_net *virtio_dev)
 {
     return (virtio_dev != NULL && (virtio_dev->flags & VIRTIO_DEV_RUNNING));
 }
 
 static inline void
-netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
+netdev_dpdk_vhost_cuse_update_rx_counters(struct netdev_stats *stats,
                                      struct dp_packet **packets, int count)
 {
     int i;
@@ -1153,26 +1249,22 @@  netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
 }
 
 /*
- * The receive path for the vhost port is the TX path out from guest.
+ * The receive path for the vhost cuse port is the TX path out from guest.
  */
 static int
-netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
+netdev_dpdk_vhost_cuse_rxq_recv(struct netdev_rxq *rxq,
                            struct dp_packet **packets, int *c)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
     struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
-    int qid = rxq->queue_id;
+    int qid = VIRTIO_TXQ;
     uint16_t nb_rx = 0;
 
-    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {
+    if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) {
         return EAGAIN;
     }
 
-    if (rxq->queue_id >= dev->real_n_rxq) {
-        return EOPNOTSUPP;
-    }
-
-    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM + VIRTIO_TXQ,
+    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid,
                                     dev->dpdk_mp->mp,
                                     (struct rte_mbuf **)packets,
                                     NETDEV_MAX_BURST);
@@ -1181,7 +1273,7 @@  netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
     }
 
     rte_spinlock_lock(&dev->stats_lock);
-    netdev_dpdk_vhost_update_rx_counters(&dev->stats, packets, nb_rx);
+    netdev_dpdk_vhost_cuse_update_rx_counters(&dev->stats, packets, nb_rx);
     rte_spinlock_unlock(&dev->stats_lock);
 
     *c = (int) nb_rx;
@@ -1217,6 +1309,19 @@  netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet **packets,
     return 0;
 }
 
+static int
+netdev_dpdk_vhost_user_rxq_recv(struct netdev_rxq *rxq,
+                                struct dp_packet **packets, int *c)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
+
+    if (rxq->queue_id >= dev->real_n_rxq) {
+        return EOPNOTSUPP;
+    }
+
+    return netdev_dpdk_rxq_recv(rxq, packets, c);
+}
+
 static inline int
 netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                         int cnt)
@@ -1235,7 +1339,7 @@  netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
 }
 
 static inline void
-netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
+netdev_dpdk_vhost_cuse_update_tx_counters(struct netdev_stats *stats,
                                      struct dp_packet **packets,
                                      int attempted,
                                      int dropped)
@@ -1252,9 +1356,8 @@  netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
 }
 
 static void
-__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
-                         struct dp_packet **pkts, int cnt,
-                         bool may_steal)
+__netdev_dpdk_vhost_cuse_send(struct netdev *netdev, struct dp_packet **pkts,
+                              int cnt, bool may_steal)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
@@ -1263,26 +1366,24 @@  __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
     unsigned int qos_pkts = cnt;
     uint64_t start = 0;
 
-    qid = dev->tx_q[qid % dev->real_n_txq].map;
-
-    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0)) {
+    if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) {
         rte_spinlock_lock(&dev->stats_lock);
         dev->stats.tx_dropped+= cnt;
         rte_spinlock_unlock(&dev->stats_lock);
         goto out;
     }
 
-    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
+    /* There is a single vhost-cuse TX queue, so we need to lock it for TX. */
+    rte_spinlock_lock(&dev->vhost_cuse_tx_lock);
 
     /* Check has QoS has been configured for the netdev */
     cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt);
     qos_pkts -= cnt;
 
     do {
-        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
         unsigned int tx_pkts;
 
-        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid,
+        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, VIRTIO_RXQ,
                                           cur_pkts, cnt);
         if (OVS_LIKELY(tx_pkts)) {
             /* Packets have been sent.*/
@@ -1301,7 +1402,7 @@  __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
              * Unable to enqueue packets to vhost interface.
              * Check available entries before retrying.
              */
-            while (!rte_vring_available_entries(virtio_dev, vhost_qid)) {
+            while (!rte_vring_available_entries(virtio_dev, VIRTIO_RXQ)) {
                 if (OVS_UNLIKELY((rte_get_timer_cycles() - start) > timeout)) {
                     expired = 1;
                     break;
@@ -1313,12 +1414,13 @@  __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
             }
         }
     } while (cnt);
 
-    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
+    rte_spinlock_unlock(&dev->vhost_cuse_tx_lock);
 
     rte_spinlock_lock(&dev->stats_lock);
     cnt += qos_pkts;
-    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, cnt);
+    netdev_dpdk_vhost_cuse_update_tx_counters(&dev->stats, pkts, total_pkts,
+                                              cnt);
     rte_spinlock_unlock(&dev->stats_lock);
 
 out:
@@ -1412,8 +1513,9 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
         newcnt++;
     }
 
-    if (dev->type == DPDK_DEV_VHOST) {
-        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs, newcnt, true);
+    if (dev->type == DPDK_DEV_VHOST_CUSE) {
+        __netdev_dpdk_vhost_cuse_send(netdev, (struct dp_packet **) mbufs,
+                                      newcnt, true);
     } else {
         unsigned int qos_pkts = newcnt;
 
@@ -1437,8 +1539,8 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
 }
 
 static int
-netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,
-                 int cnt, bool may_steal)
+netdev_dpdk_vhost_cuse_send(struct netdev *netdev, int qid OVS_UNUSED,
+                            struct dp_packet **pkts, int cnt, bool may_steal)
 {
     if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {
         int i;
@@ -1450,7 +1552,7 @@  netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,
             }
         }
     } else {
-        __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);
+        __netdev_dpdk_vhost_cuse_send(netdev, pkts, cnt, may_steal);
     }
     return 0;
 }
@@ -1461,11 +1563,6 @@  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
 {
     int i;
 
-    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
-        qid = qid % dev->real_n_txq;
-        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
-    }
-
     if (OVS_UNLIKELY(!may_steal ||
                      pkts[0]->source != DPBUF_DPDK)) {
         struct netdev *netdev = &dev->up;
@@ -1541,6 +1638,18 @@  netdev_dpdk_eth_send(struct netdev *netdev, int qid,
 }
 
 static int
+netdev_dpdk_vhost_user_send(struct netdev *netdev, int qid,
+                            struct dp_packet **pkts, int cnt, bool may_steal)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+
+    qid = qid % dev->real_n_txq;
+
+    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
+    return 0;
+}
+
+static int
 netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
@@ -1634,7 +1743,7 @@  static int
 netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
 
 static int
-netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
+netdev_dpdk_vhost_cuse_get_stats(const struct netdev *netdev,
                             struct netdev_stats *stats)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
@@ -1796,14 +1905,14 @@  netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
 }
 
 static int
-netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
+netdev_dpdk_vhost_cuse_get_carrier(const struct netdev *netdev, bool *carrier)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
 
     ovs_mutex_lock(&dev->mutex);
 
-    if (is_vhost_running(virtio_dev)) {
+    if (is_vhost_cuse_running(virtio_dev)) {
         *carrier = 1;
     } else {
         *carrier = 0;
@@ -1853,7 +1962,7 @@  netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
         return 0;
     }
 
-    if (dev->type == DPDK_DEV_ETH) {
+    if (dev->type != DPDK_DEV_VHOST_CUSE) {
         if (dev->flags & NETDEV_UP) {
             err = rte_eth_dev_start(dev->port_id);
             if (err)
@@ -1998,75 +2107,7 @@  set_irq_status(struct virtio_net *virtio_dev)
 }
 
 /*
- * Fixes mapping for vhost-user tx queues. Must be called after each
- * enabling/disabling of queues and real_n_txq modifications.
- */
-static void
-netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
-    OVS_REQUIRES(dev->mutex)
-{
-    int *enabled_queues, n_enabled = 0;
-    int i, k, total_txqs = dev->real_n_txq;
-
-    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);
-
-    for (i = 0; i < total_txqs; i++) {
-        /* Enabled queues always mapped to themselves. */
-        if (dev->tx_q[i].map == i) {
-            enabled_queues[n_enabled++] = i;
-        }
-    }
-
-    if (n_enabled == 0 && total_txqs != 0) {
-        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
-        n_enabled = 1;
-    }
-
-    k = 0;
-    for (i = 0; i < total_txqs; i++) {
-        if (dev->tx_q[i].map != i) {
-            dev->tx_q[i].map = enabled_queues[k];
-            k = (k + 1) % n_enabled;
-        }
-    }
-
-    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
-    for (i = 0; i < total_txqs; i++) {
-        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
-    }
-
-    rte_free(enabled_queues);
-}
-
-static int
-netdev_dpdk_vhost_set_queues(struct netdev_dpdk *dev, struct virtio_net *virtio_dev)
-    OVS_REQUIRES(dev->mutex)
-{
-    uint32_t qp_num;
-
-    qp_num = virtio_dev->virt_qp_nb;
-    if (qp_num > dev->up.n_rxq) {
-        VLOG_ERR("vHost Device '%s' %"PRIu64" can't be added - "
-                 "too many queues %d > %d", virtio_dev->ifname, virtio_dev->device_fh,
-                 qp_num, dev->up.n_rxq);
-        return -1;
-    }
-
-    dev->real_n_rxq = qp_num;
-    dev->real_n_txq = qp_num;
-    dev->txq_needs_locking = true;
-    /* Enable TX queue 0 by default if it wasn't disabled. */
-    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
-        dev->tx_q[0].map = 0;
-    }
-
-    netdev_dpdk_remap_txqs(dev);
-
-    return 0;
-}
-
-/*
- * A new virtio-net device is added to a vhost port.
+ * A new virtio-net device is added to a vhost cuse port.
  */
 static int
 new_device(struct virtio_net *virtio_dev)
@@ -2079,11 +2120,6 @@  new_device(struct virtio_net *virtio_dev)
     LIST_FOR_EACH(dev, list_node, &dpdk_list) {
         if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
             ovs_mutex_lock(&dev->mutex);
-            if (netdev_dpdk_vhost_set_queues(dev, virtio_dev)) {
-                ovs_mutex_unlock(&dev->mutex);
-                ovs_mutex_unlock(&dpdk_mutex);
-                return -1;
-            }
             ovsrcu_set(&dev->virtio_dev, virtio_dev);
             exists = true;
             virtio_dev->flags |= VIRTIO_DEV_RUNNING;
@@ -2107,23 +2143,11 @@  new_device(struct virtio_net *virtio_dev)
     return 0;
 }
 
-/* Clears mapping for all available queues of vhost interface. */
-static void
-netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
-    OVS_REQUIRES(dev->mutex)
-{
-    int i;
-
-    for (i = 0; i < dev->real_n_txq; i++) {
-        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
-    }
-}
-
 /*
- * Remove a virtio-net device from the specific vhost port.  Use dev->remove
- * flag to stop any more packets from being sent or received to/from a VM and
- * ensure all currently queued packets have been sent/received before removing
- *  the device.
+ * Remove a virtio-net device from the specific vhost cuse port. Use
+ * dev->remove flag to stop any more packets from being sent or received
+ * to/from a VM and ensure all currently queued packets have been sent/received
+ * before removing the device.
  */
 static void
 destroy_device(volatile struct virtio_net *virtio_dev)
@@ -2138,7 +2162,6 @@  destroy_device(volatile struct virtio_net *virtio_dev)
             ovs_mutex_lock(&dev->mutex);
             virtio_dev->flags &= ~VIRTIO_DEV_RUNNING;
             ovsrcu_set(&dev->virtio_dev, NULL);
-            netdev_dpdk_txq_map_clear(dev);
             exists = true;
             ovs_mutex_unlock(&dev->mutex);
             break;
@@ -2166,49 +2189,6 @@  destroy_device(volatile struct virtio_net *virtio_dev)
     }
 }
 
-static int
-vring_state_changed(struct virtio_net *virtio_dev, uint16_t queue_id,
-                    int enable)
-{
-    struct netdev_dpdk *dev;
-    bool exists = false;
-    int qid = queue_id / VIRTIO_QNUM;
-
-    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
-        return 0;
-    }
-
-    ovs_mutex_lock(&dpdk_mutex);
-    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
-        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
-            ovs_mutex_lock(&dev->mutex);
-            if (enable) {
-                dev->tx_q[qid].map = qid;
-            } else {
-                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
-            }
-            netdev_dpdk_remap_txqs(dev);
-            exists = true;
-            ovs_mutex_unlock(&dev->mutex);
-            break;
-        }
-    }
-    ovs_mutex_unlock(&dpdk_mutex);
-
-    if (exists) {
-        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"
-                  PRIu64" changed to \'%s\'", queue_id, qid,
-                  virtio_dev->ifname, virtio_dev->device_fh,
-                  (enable == 1) ? "enabled" : "disabled");
-    } else {
-        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev->ifname,
-                  virtio_dev->device_fh);
-        return -1;
-    }
-
-    return 0;
-}
-
 struct virtio_net *
 netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
 {
@@ -2216,18 +2196,17 @@  netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
 }
 
 /*
- * These callbacks allow virtio-net devices to be added to vhost ports when
- * configuration has been fully complete.
+ * These callbacks allow virtio-net devices to be added to vhost cuse ports
+ * when configuration has been fully complete.
  */
 static const struct virtio_net_device_ops virtio_net_device_ops =
 {
     .new_device =  new_device,
     .destroy_device = destroy_device,
-    .vring_state_changed = vring_state_changed
 };
 
 static void *
-start_vhost_loop(void *dummy OVS_UNUSED)
+start_vhost_cuse_loop(void *dummy OVS_UNUSED)
 {
      pthread_detach(pthread_self());
      /* Put the cuse thread into quiescent state. */
@@ -2237,19 +2216,7 @@  start_vhost_loop(void *dummy OVS_UNUSED)
 }
 
 static int
-dpdk_vhost_class_init(void)
-{
-    rte_vhost_driver_callback_register(&virtio_net_device_ops);
-    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
-                            | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                            | 1ULL << VIRTIO_NET_F_CSUM);
-
-    ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
-    return 0;
-}
-
-static int
-dpdk_vhost_cuse_class_init(void)
+netdev_dpdk_vhost_cuse_class_init(void)
 {
     int err = -1;
 
@@ -2265,14 +2232,12 @@  dpdk_vhost_cuse_class_init(void)
         return -1;
     }
 
-    dpdk_vhost_class_init();
-    return 0;
-}
+    rte_vhost_driver_callback_register(&virtio_net_device_ops);
+    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
+                            | 1ULL << VIRTIO_NET_F_HOST_TSO6
+                            | 1ULL << VIRTIO_NET_F_CSUM);
+    ovs_thread_create("vhost_cuse_thread", start_vhost_cuse_loop, NULL);
 
-static int
-dpdk_vhost_user_class_init(void)
-{
-    dpdk_vhost_class_init();
     return 0;
 }
 
@@ -2859,30 +2824,30 @@  static const struct netdev_class dpdk_ring_class =
 static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =
     NETDEV_DPDK_CLASS(
         "dpdkvhostcuse",
-        dpdk_vhost_cuse_class_init,
+        netdev_dpdk_vhost_cuse_class_init,
         netdev_dpdk_vhost_cuse_construct,
-        netdev_dpdk_vhost_destruct,
+        netdev_dpdk_vhost_cuse_destruct,
         netdev_dpdk_vhost_cuse_set_multiq,
-        netdev_dpdk_vhost_send,
-        netdev_dpdk_vhost_get_carrier,
-        netdev_dpdk_vhost_get_stats,
+        netdev_dpdk_vhost_cuse_send,
+        netdev_dpdk_vhost_cuse_get_carrier,
+        netdev_dpdk_vhost_cuse_get_stats,
         NULL,
         NULL,
-        netdev_dpdk_vhost_rxq_recv);
+        netdev_dpdk_vhost_cuse_rxq_recv);
 
 static const struct netdev_class OVS_UNUSED dpdk_vhost_user_class =
     NETDEV_DPDK_CLASS(
         "dpdkvhostuser",
-        dpdk_vhost_user_class_init,
-        netdev_dpdk_vhost_user_construct,
-        netdev_dpdk_vhost_destruct,
-        netdev_dpdk_vhost_set_multiq,
-        netdev_dpdk_vhost_send,
-        netdev_dpdk_vhost_get_carrier,
-        netdev_dpdk_vhost_get_stats,
-        NULL,
         NULL,
-        netdev_dpdk_vhost_rxq_recv);
+        netdev_dpdk_vhost_user_construct,
+        netdev_dpdk_destruct,
+        netdev_dpdk_set_multiq,
+        netdev_dpdk_vhost_user_send,
+        netdev_dpdk_get_carrier,
+        netdev_dpdk_get_stats,
+        netdev_dpdk_get_features,
+        netdev_dpdk_get_status,
+        netdev_dpdk_vhost_user_rxq_recv);
 
 void
 netdev_dpdk_register(void)