[ovs-dev] netdev-dpdk: Use instant sending instead of queueing of packets.

Message ID 1477335700-31122-1-git-send-email-cascardo@redhat.com
State Changes Requested
Delegated to: Daniele Di Proietto
Headers show

Commit Message

Thadeu Lima de Souza Cascardo Oct. 24, 2016, 7:01 p.m. UTC
Backport commit b59cc14e032da370021794bfd1bdd3d67e88a9a3 from master.

This improves latency compared to the current Open vSwitch 2.5 code.

Without the patch:

Device 0->1:
  Throughput: 6.8683 Mpps
  Min. Latency: 39.9780 usec
  Avg. Latency: 61.1226 usec
  Max. Latency: 89.1110 usec

Device 1->0:
  Throughput: 6.8683 Mpps
  Min. Latency: 41.0660 usec
  Avg. Latency: 58.7778 usec
  Max. Latency: 89.4720 usec

With the patch:

Device 0->1:
  Throughput: 6.3941 Mpps
  Min. Latency: 10.5410 usec
  Avg. Latency: 14.1309 usec
  Max. Latency: 28.9880 usec

Device 1->0:
  Throughput: 6.3941 Mpps
  Min. Latency: 11.9780 usec
  Avg. Latency: 18.0692 usec
  Max. Latency: 29.5200 usec

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Cc: Ilya Maximets <i.maximets@samsung.com>
Cc: Daniele Di Proietto <diproiettod@vmware.com>
---
 lib/netdev-dpdk.c | 102 ++++++++----------------------------------------------
 1 file changed, 14 insertions(+), 88 deletions(-)

Comments

Daniele Di Proietto Nov. 14, 2016, 6:15 p.m. UTC | #1
Hi Cascardo,

I agree, the tx queueing mechanism in 2.5 never really worked: it made
wrong assumptions about the rxq distribution and, as you point out,
introduced latency (at the very least).
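
For context, below is a simplified sketch of the buffering loop this patch
removes (the old dpdk_queue_pkts()/dpdk_queue_flush__() pair, condensed;
the 2 GHz TSC figure is an assumption used only for the arithmetic):

    /* Packets accumulate in txq->burst_pkts and only reach the NIC when
     * the buffer fills (MAX_TX_QUEUE_LEN = 384) or DRAIN_TSC = 200000
     * timer cycles have passed since the last flush.  At an assumed
     * 2 GHz TSC, the drain window alone is ~100 usec of worst-case
     * added latency at low rates, which lines up with the ~89 usec
     * maxima measured above. */
    while (i < cnt) {
        int freeslots = MAX_TX_QUEUE_LEN - txq->count;
        int tocopy = MIN(freeslots, cnt - i);

        memcpy(&txq->burst_pkts[txq->count], &pkts[i],
               tocopy * sizeof(struct rte_mbuf *));
        txq->count += tocopy;
        i += tocopy;

        /* Flush only when full, when the queue is shared ('flush_tx'),
         * or when the drain timer expires; otherwise packets just wait. */
        if (txq->count == MAX_TX_QUEUE_LEN || txq->flush_tx
            || rte_get_timer_cycles() - txq->tsc >= DRAIN_TSC) {
            dpdk_queue_flush__(dev, qid);
        }
    }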

Thanks for doing the backport, since it was non-trivial.

Usually when we do a backport we use the commit message (including the
tags) from the original commit.  Would you mind doing that?  You can append
the numbers you found as well.

The patch looks good to me, but I still see a compiler warning below.

This might introduce some throughput degradation in certain use cases (the
numbers above show throughput going from ~6.87 to ~6.39 Mpps), but I think
it's more important to fix the broken logic.

Would you mind sending a v2?  I'll be happy to apply it.

Thanks,

Daniele

2016-10-24 12:01 GMT-07:00 Thadeu Lima de Souza Cascardo <cascardo@redhat.com>:

> [...]
>
> @@ -969,32 +948,18 @@ dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
>          nb_tx += ret;
>      }
>
> -    if (OVS_UNLIKELY(nb_tx != txq->count)) {
> +    if (OVS_UNLIKELY(nb_tx != cnt)) {
>          /* free buffers, which we couldn't transmit, one at a time (each
>           * packet could come from a different mempool) */
>          int i;
>
> -        for (i = nb_tx; i < txq->count; i++) {
> -            rte_pktmbuf_free(txq->burst_pkts[i]);
> +        for (i = nb_tx; i < cnt; i++) {
> +            rte_pktmbuf_free(pkts[i]);
>          }
>          rte_spinlock_lock(&dev->stats_lock);
> -        dev->stats.tx_dropped += txq->count-nb_tx;
> +        dev->stats.tx_dropped += cnt - nb_tx;
>          rte_spinlock_unlock(&dev->stats_lock);
>      }
> -
> -    txq->count = 0;
> -    txq->tsc = rte_get_timer_cycles();
> -}
> -
> -static inline void
> -dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
> -{
> -    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
> -
> -    if (txq->count == 0) {
> -        return;
> -    }
> -    dpdk_queue_flush__(dev, qid);
>  }
>
>  static bool
> @@ -1080,15 +1045,6 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>

'dev' and 'netdev' are unused now, we can remove the above lines.
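
Something like this incremental change on top of the patch would do for v2
(the exact declaration of 'netdev' here is from memory of the 2.5 tree, so
treat it as a sketch):

    @@ netdev_dpdk_rxq_recv
    -    struct netdev *netdev = rx->up.netdev;
    -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
         int nb_rx;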




>      int nb_rx;
>
> -    /* There is only one tx queue for this core.  Do not flush other
> -     * queues.
> -     * Do not flush tx queue which is shared among CPUs
> -     * since it is always flushed */
> -    if (rxq_->queue_id == rte_lcore_id() &&
> -        OVS_LIKELY(!dev->txq_needs_locking)) {
> -        dpdk_queue_flush(dev, rxq_->queue_id);
> -    }
> -
>      nb_rx = rte_eth_rx_burst(rx->port_id, rxq_->queue_id,
>                               (struct rte_mbuf **) packets,
>                               NETDEV_MAX_BURST);
>

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 6e68c07..cbe361a 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -131,7 +131,6 @@  static const struct rte_eth_conf port_conf = {
     },
 };
 
-enum { MAX_TX_QUEUE_LEN = 384 };
 enum { DPDK_RING_SIZE = 256 };
 BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
 enum { DRAIN_TSC = 200000ULL };
@@ -153,8 +152,7 @@  static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
     = OVS_LIST_INITIALIZER(&dpdk_mp_list);
 
 /* This mutex must be used by non pmd threads when allocating or freeing
- * mbufs through mempools. Since dpdk_queue_pkts() and dpdk_queue_flush() may
- * use mempools, a non pmd thread should hold this mutex while calling them */
+ * mbufs through mempools. */
 static struct ovs_mutex nonpmd_mempool_mutex = OVS_MUTEX_INITIALIZER;
 
 struct dpdk_mp {
@@ -168,17 +166,12 @@  struct dpdk_mp {
 /* There should be one 'struct dpdk_tx_queue' created for
  * each cpu core. */
 struct dpdk_tx_queue {
-    bool flush_tx;                 /* Set to true to flush queue everytime */
-                                   /* pkts are queued. */
-    int count;
     rte_spinlock_t tx_lock;        /* Protects the members and the NIC queue
                                     * from concurrent access.  It is used only
                                     * if the queue is shared among different
                                     * pmd threads (see 'txq_needs_locking'). */
     int map;                       /* Mapping of configured vhost-user queues
                                     * to enabled by guest. */
-    uint64_t tsc;
-    struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
 };
 
 /* dpdk has no way to remove dpdk ring ethernet devices
@@ -566,19 +559,6 @@  netdev_dpdk_alloc_txq(struct netdev_dpdk *netdev, unsigned int n_txqs)
 
     netdev->tx_q = dpdk_rte_mzalloc(n_txqs * sizeof *netdev->tx_q);
     for (i = 0; i < n_txqs; i++) {
-        int numa_id = ovs_numa_get_numa_id(i);
-
-        if (!netdev->txq_needs_locking) {
-            /* Each index is considered as a cpu core id, since there should
-             * be one tx queue for each cpu core.  If the corresponding core
-             * is not on the same numa node as 'netdev', flags the
-             * 'flush_tx'. */
-            netdev->tx_q[i].flush_tx = netdev->socket_id == numa_id;
-        } else {
-            /* Queues are shared among CPUs. Always flush */
-            netdev->tx_q[i].flush_tx = true;
-        }
-
         /* Initialize map for vhost devices. */
         netdev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
         rte_spinlock_init(&netdev->tx_q[i].tx_lock);
@@ -952,16 +932,15 @@  netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq_)
 }
 
 static inline void
-dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
+netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
+                             struct rte_mbuf **pkts, int cnt)
 {
-    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
     uint32_t nb_tx = 0;
 
-    while (nb_tx != txq->count) {
+    while (nb_tx != cnt) {
         uint32_t ret;
 
-        ret = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts + nb_tx,
-                               txq->count - nb_tx);
+        ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
         if (!ret) {
             break;
         }
@@ -969,32 +948,18 @@  dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
         nb_tx += ret;
     }
 
-    if (OVS_UNLIKELY(nb_tx != txq->count)) {
+    if (OVS_UNLIKELY(nb_tx != cnt)) {
         /* free buffers, which we couldn't transmit, one at a time (each
          * packet could come from a different mempool) */
         int i;
 
-        for (i = nb_tx; i < txq->count; i++) {
-            rte_pktmbuf_free(txq->burst_pkts[i]);
+        for (i = nb_tx; i < cnt; i++) {
+            rte_pktmbuf_free(pkts[i]);
         }
         rte_spinlock_lock(&dev->stats_lock);
-        dev->stats.tx_dropped += txq->count-nb_tx;
+        dev->stats.tx_dropped += cnt - nb_tx;
         rte_spinlock_unlock(&dev->stats_lock);
     }
-
-    txq->count = 0;
-    txq->tsc = rte_get_timer_cycles();
-}
-
-static inline void
-dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
-{
-    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
-
-    if (txq->count == 0) {
-        return;
-    }
-    dpdk_queue_flush__(dev, qid);
 }
 
 static bool
@@ -1080,15 +1045,6 @@  netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     int nb_rx;
 
-    /* There is only one tx queue for this core.  Do not flush other
-     * queues.
-     * Do not flush tx queue which is shared among CPUs
-     * since it is always flushed */
-    if (rxq_->queue_id == rte_lcore_id() &&
-        OVS_LIKELY(!dev->txq_needs_locking)) {
-        dpdk_queue_flush(dev, rxq_->queue_id);
-    }
-
     nb_rx = rte_eth_rx_burst(rx->port_id, rxq_->queue_id,
                              (struct rte_mbuf **) packets,
                              NETDEV_MAX_BURST);
@@ -1175,35 +1131,6 @@  out:
     }
 }
 
-inline static void
-dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
-               struct rte_mbuf **pkts, int cnt)
-{
-    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
-    uint64_t diff_tsc;
-
-    int i = 0;
-
-    while (i < cnt) {
-        int freeslots = MAX_TX_QUEUE_LEN - txq->count;
-        int tocopy = MIN(freeslots, cnt-i);
-
-        memcpy(&txq->burst_pkts[txq->count], &pkts[i],
-               tocopy * sizeof (struct rte_mbuf *));
-
-        txq->count += tocopy;
-        i += tocopy;
-
-        if (txq->count == MAX_TX_QUEUE_LEN || txq->flush_tx) {
-            dpdk_queue_flush__(dev, qid);
-        }
-        diff_tsc = rte_get_timer_cycles() - txq->tsc;
-        if (diff_tsc >= DRAIN_TSC) {
-            dpdk_queue_flush__(dev, qid);
-        }
-    }
-}
-
 /* Tx function. Transmit packets indefinitely */
 static void
 dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
@@ -1265,8 +1192,7 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
     if (dev->type == DPDK_DEV_VHOST) {
         __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs, newcnt, true);
     } else {
-        dpdk_queue_pkts(dev, qid, mbufs, newcnt);
-        dpdk_queue_flush(dev, qid);
+        netdev_dpdk_eth_tx_burst(dev, qid, mbufs, newcnt);
     }
 
     if (!dpdk_thread_is_pmd()) {
@@ -1324,7 +1250,7 @@  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
 
             if (OVS_UNLIKELY(size > dev->max_packet_len)) {
                 if (next_tx_idx != i) {
-                    dpdk_queue_pkts(dev, qid,
+                    netdev_dpdk_eth_tx_burst(dev, qid,
                                     (struct rte_mbuf **)&pkts[next_tx_idx],
                                     i-next_tx_idx);
                 }
@@ -1338,9 +1264,9 @@  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
             }
         }
         if (next_tx_idx != cnt) {
-           dpdk_queue_pkts(dev, qid,
-                            (struct rte_mbuf **)&pkts[next_tx_idx],
-                            cnt-next_tx_idx);
+            netdev_dpdk_eth_tx_burst(dev, qid,
+                                     (struct rte_mbuf **)&pkts[next_tx_idx],
+                                     cnt-next_tx_idx);
         }
 
         if (OVS_UNLIKELY(dropped)) {