diff mbox

[ovs-dev,v2,2.5] netdev-dpdk: Use instant sending instead of queueing of packets.

Message ID 1479375798-10968-1-git-send-email-cascardo@redhat.com
State Accepted
Delegated to: Daniele Di Proietto
Headers show

Commit Message

Thadeu Lima de Souza Cascardo Nov. 17, 2016, 9:43 a.m. UTC
From: Ilya Maximets <i.maximets@samsung.com>

Current implementation of TX packet queueing is broken in several ways:

	* TX queue flushing implemented on receive assumes that all
	  core_id-s are sequential and start from zero. This may lead
	  to a situation where packets will be stuck in the queue forever
	  and, also, this increases latency.

	* For a long time the flushing logic has depended on the
	  uninitialized 'txq_needs_locking', because it is usually
	  calculated after 'netdev_dpdk_alloc_txq' but used inside of
	  that function for the initialization of 'flush_tx'.

Testing shows no performance difference with and without queueing.
Let's remove queueing entirely because it doesn't work properly now and
also does not increase performance.

This improves latency compared to current openvswitch 2.5.

Without the patch:

Device 0->1:
  Throughput: 6.8683 Mpps
  Min. Latency: 39.9780 usec
  Avg. Latency: 61.1226 usec
  Max. Latency: 89.1110 usec

Device 1->0:
  Throughput: 6.8683 Mpps
  Min. Latency: 41.0660 usec
  Avg. Latency: 58.7778 usec
  Max. Latency: 89.4720 usec

With the patch:

Device 0->1:
  Throughput: 6.3941 Mpps
  Min. Latency: 10.5410 usec
  Avg. Latency: 14.1309 usec
  Max. Latency: 28.9880 usec

Device 1->0:
  Throughput: 6.3941 Mpps
  Min. Latency: 11.9780 usec
  Avg. Latency: 18.0692 usec
  Max. Latency: 29.5200 usec

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
---
 lib/netdev-dpdk.c | 104 ++++++++----------------------------------------------
 1 file changed, 14 insertions(+), 90 deletions(-)

Comments

Daniele Di Proietto Dec. 9, 2016, 11:30 p.m. UTC | #1
On 17/11/2016 01:43, "Thadeu Lima de Souza Cascardo" <cascardo@redhat.com> wrote:

>From: Ilya Maximets <i.maximets@samsung.com>
>
>Current implementarion of TX packet's queueing is broken in several ways:
>
>	* TX queue flushing implemented on receive assumes that all
>	  core_id-s are sequential and starts from zero. This may lead
>	  to situation when packets will stuck in queue forever and,
>	  also, this influences on latency.
>
>	* For a long time flushing logic depends on uninitialized
>	  'txq_needs_locking', because it usually calculated after
>	  'netdev_dpdk_alloc_txq' but used inside of this function
>	  for initialization of 'flush_tx'.
>
>Testing shows no performance difference with and without queueing.
>Lets remove queueing at all because it doesn't work properly now and
>also does not increase performance.
>
>This improves latency compared to current openvswitch 2.5.
>
>Without the patch:
>
>Device 0->1:
>  Throughput: 6.8683 Mpps
>  Min. Latency: 39.9780 usec
>  Avg. Latency: 61.1226 usec
>  Max. Latency: 89.1110 usec
>
>Device 1->0:
>  Throughput: 6.8683 Mpps
>  Min. Latency: 41.0660 usec
>  Avg. Latency: 58.7778 usec
>  Max. Latency: 89.4720 usec
>
>With the patch:
>
>Device 0->1:
>  Throughput: 6.3941 Mpps
>  Min. Latency: 10.5410 usec
>  Avg. Latency: 14.1309 usec
>  Max. Latency: 28.9880 usec
>
>Device 1->0:
>  Throughput: 6.3941 Mpps
>  Min. Latency: 11.9780 usec
>  Avg. Latency: 18.0692 usec
>  Max. Latency: 29.5200
>
>Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
>Acked-by: Daniele Di Proietto <diproiettod@vmware.com>
>Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>

Applied to branch-2.5, thanks

>---
> lib/netdev-dpdk.c | 104 ++++++++----------------------------------------------
> 1 file changed, 14 insertions(+), 90 deletions(-)
>
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index 6e68c07..58d2f8e 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -131,7 +131,6 @@ static const struct rte_eth_conf port_conf = {
>     },
> };
> 
>-enum { MAX_TX_QUEUE_LEN = 384 };
> enum { DPDK_RING_SIZE = 256 };
> BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
> enum { DRAIN_TSC = 200000ULL };
>@@ -153,8 +152,7 @@ static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
>     = OVS_LIST_INITIALIZER(&dpdk_mp_list);
> 
> /* This mutex must be used by non pmd threads when allocating or freeing
>- * mbufs through mempools. Since dpdk_queue_pkts() and dpdk_queue_flush() may
>- * use mempools, a non pmd thread should hold this mutex while calling them */
>+ * mbufs through mempools. */
> static struct ovs_mutex nonpmd_mempool_mutex = OVS_MUTEX_INITIALIZER;
> 
> struct dpdk_mp {
>@@ -168,17 +166,12 @@ struct dpdk_mp {
> /* There should be one 'struct dpdk_tx_queue' created for
>  * each cpu core. */
> struct dpdk_tx_queue {
>-    bool flush_tx;                 /* Set to true to flush queue everytime */
>-                                   /* pkts are queued. */
>-    int count;
>     rte_spinlock_t tx_lock;        /* Protects the members and the NIC queue
>                                     * from concurrent access.  It is used only
>                                     * if the queue is shared among different
>                                     * pmd threads (see 'txq_needs_locking'). */
>     int map;                       /* Mapping of configured vhost-user queues
>                                     * to enabled by guest. */
>-    uint64_t tsc;
>-    struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
> };
> 
> /* dpdk has no way to remove dpdk ring ethernet devices
>@@ -566,19 +559,6 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *netdev, unsigned int n_txqs)
> 
>     netdev->tx_q = dpdk_rte_mzalloc(n_txqs * sizeof *netdev->tx_q);
>     for (i = 0; i < n_txqs; i++) {
>-        int numa_id = ovs_numa_get_numa_id(i);
>-
>-        if (!netdev->txq_needs_locking) {
>-            /* Each index is considered as a cpu core id, since there should
>-             * be one tx queue for each cpu core.  If the corresponding core
>-             * is not on the same numa node as 'netdev', flags the
>-             * 'flush_tx'. */
>-            netdev->tx_q[i].flush_tx = netdev->socket_id == numa_id;
>-        } else {
>-            /* Queues are shared among CPUs. Always flush */
>-            netdev->tx_q[i].flush_tx = true;
>-        }
>-
>         /* Initialize map for vhost devices. */
>         netdev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
>         rte_spinlock_init(&netdev->tx_q[i].tx_lock);
>@@ -952,16 +932,15 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq_)
> }
> 
> static inline void
>-dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
>+netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
>+                             struct rte_mbuf **pkts, int cnt)
> {
>-    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>     uint32_t nb_tx = 0;
> 
>-    while (nb_tx != txq->count) {
>+    while (nb_tx != cnt) {
>         uint32_t ret;
> 
>-        ret = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts + nb_tx,
>-                               txq->count - nb_tx);
>+        ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
>         if (!ret) {
>             break;
>         }
>@@ -969,32 +948,18 @@ dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
>         nb_tx += ret;
>     }
> 
>-    if (OVS_UNLIKELY(nb_tx != txq->count)) {
>+    if (OVS_UNLIKELY(nb_tx != cnt)) {
>         /* free buffers, which we couldn't transmit, one at a time (each
>          * packet could come from a different mempool) */
>         int i;
> 
>-        for (i = nb_tx; i < txq->count; i++) {
>-            rte_pktmbuf_free(txq->burst_pkts[i]);
>+        for (i = nb_tx; i < cnt; i++) {
>+            rte_pktmbuf_free(pkts[i]);
>         }
>         rte_spinlock_lock(&dev->stats_lock);
>-        dev->stats.tx_dropped += txq->count-nb_tx;
>+        dev->stats.tx_dropped += cnt - nb_tx;
>         rte_spinlock_unlock(&dev->stats_lock);
>     }
>-
>-    txq->count = 0;
>-    txq->tsc = rte_get_timer_cycles();
>-}
>-
>-static inline void
>-dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
>-{
>-    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>-
>-    if (txq->count == 0) {
>-        return;
>-    }
>-    dpdk_queue_flush__(dev, qid);
> }
> 
> static bool
>@@ -1076,19 +1041,8 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
>                      int *c)
> {
>     struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);
>-    struct netdev *netdev = rx->up.netdev;
>-    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>     int nb_rx;
> 
>-    /* There is only one tx queue for this core.  Do not flush other
>-     * queues.
>-     * Do not flush tx queue which is shared among CPUs
>-     * since it is always flushed */
>-    if (rxq_->queue_id == rte_lcore_id() &&
>-        OVS_LIKELY(!dev->txq_needs_locking)) {
>-        dpdk_queue_flush(dev, rxq_->queue_id);
>-    }
>-
>     nb_rx = rte_eth_rx_burst(rx->port_id, rxq_->queue_id,
>                              (struct rte_mbuf **) packets,
>                              NETDEV_MAX_BURST);
>@@ -1175,35 +1129,6 @@ out:
>     }
> }
> 
>-inline static void
>-dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
>-               struct rte_mbuf **pkts, int cnt)
>-{
>-    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>-    uint64_t diff_tsc;
>-
>-    int i = 0;
>-
>-    while (i < cnt) {
>-        int freeslots = MAX_TX_QUEUE_LEN - txq->count;
>-        int tocopy = MIN(freeslots, cnt-i);
>-
>-        memcpy(&txq->burst_pkts[txq->count], &pkts[i],
>-               tocopy * sizeof (struct rte_mbuf *));
>-
>-        txq->count += tocopy;
>-        i += tocopy;
>-
>-        if (txq->count == MAX_TX_QUEUE_LEN || txq->flush_tx) {
>-            dpdk_queue_flush__(dev, qid);
>-        }
>-        diff_tsc = rte_get_timer_cycles() - txq->tsc;
>-        if (diff_tsc >= DRAIN_TSC) {
>-            dpdk_queue_flush__(dev, qid);
>-        }
>-    }
>-}
>-
> /* Tx function. Transmit packets indefinitely */
> static void
> dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
>@@ -1265,8 +1190,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
>     if (dev->type == DPDK_DEV_VHOST) {
>         __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs, newcnt, true);
>     } else {
>-        dpdk_queue_pkts(dev, qid, mbufs, newcnt);
>-        dpdk_queue_flush(dev, qid);
>+        netdev_dpdk_eth_tx_burst(dev, qid, mbufs, newcnt);
>     }
> 
>     if (!dpdk_thread_is_pmd()) {
>@@ -1324,7 +1248,7 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
> 
>             if (OVS_UNLIKELY(size > dev->max_packet_len)) {
>                 if (next_tx_idx != i) {
>-                    dpdk_queue_pkts(dev, qid,
>+                    netdev_dpdk_eth_tx_burst(dev, qid,
>                                     (struct rte_mbuf **)&pkts[next_tx_idx],
>                                     i-next_tx_idx);
>                 }
>@@ -1338,9 +1262,9 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>             }
>         }
>         if (next_tx_idx != cnt) {
>-           dpdk_queue_pkts(dev, qid,
>-                            (struct rte_mbuf **)&pkts[next_tx_idx],
>-                            cnt-next_tx_idx);
>+            netdev_dpdk_eth_tx_burst(dev, qid,
>+                                     (struct rte_mbuf **)&pkts[next_tx_idx],
>+                                     cnt-next_tx_idx);
>         }
> 
>         if (OVS_UNLIKELY(dropped)) {
>-- 
>2.7.4
>
diff mbox

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 6e68c07..58d2f8e 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -131,7 +131,6 @@  static const struct rte_eth_conf port_conf = {
     },
 };
 
-enum { MAX_TX_QUEUE_LEN = 384 };
 enum { DPDK_RING_SIZE = 256 };
 BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
 enum { DRAIN_TSC = 200000ULL };
@@ -153,8 +152,7 @@  static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
     = OVS_LIST_INITIALIZER(&dpdk_mp_list);
 
 /* This mutex must be used by non pmd threads when allocating or freeing
- * mbufs through mempools. Since dpdk_queue_pkts() and dpdk_queue_flush() may
- * use mempools, a non pmd thread should hold this mutex while calling them */
+ * mbufs through mempools. */
 static struct ovs_mutex nonpmd_mempool_mutex = OVS_MUTEX_INITIALIZER;
 
 struct dpdk_mp {
@@ -168,17 +166,12 @@  struct dpdk_mp {
 /* There should be one 'struct dpdk_tx_queue' created for
  * each cpu core. */
 struct dpdk_tx_queue {
-    bool flush_tx;                 /* Set to true to flush queue everytime */
-                                   /* pkts are queued. */
-    int count;
     rte_spinlock_t tx_lock;        /* Protects the members and the NIC queue
                                     * from concurrent access.  It is used only
                                     * if the queue is shared among different
                                     * pmd threads (see 'txq_needs_locking'). */
     int map;                       /* Mapping of configured vhost-user queues
                                     * to enabled by guest. */
-    uint64_t tsc;
-    struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
 };
 
 /* dpdk has no way to remove dpdk ring ethernet devices
@@ -566,19 +559,6 @@  netdev_dpdk_alloc_txq(struct netdev_dpdk *netdev, unsigned int n_txqs)
 
     netdev->tx_q = dpdk_rte_mzalloc(n_txqs * sizeof *netdev->tx_q);
     for (i = 0; i < n_txqs; i++) {
-        int numa_id = ovs_numa_get_numa_id(i);
-
-        if (!netdev->txq_needs_locking) {
-            /* Each index is considered as a cpu core id, since there should
-             * be one tx queue for each cpu core.  If the corresponding core
-             * is not on the same numa node as 'netdev', flags the
-             * 'flush_tx'. */
-            netdev->tx_q[i].flush_tx = netdev->socket_id == numa_id;
-        } else {
-            /* Queues are shared among CPUs. Always flush */
-            netdev->tx_q[i].flush_tx = true;
-        }
-
         /* Initialize map for vhost devices. */
         netdev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
         rte_spinlock_init(&netdev->tx_q[i].tx_lock);
@@ -952,16 +932,15 @@  netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq_)
 }
 
 static inline void
-dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
+netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
+                             struct rte_mbuf **pkts, int cnt)
 {
-    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
     uint32_t nb_tx = 0;
 
-    while (nb_tx != txq->count) {
+    while (nb_tx != cnt) {
         uint32_t ret;
 
-        ret = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts + nb_tx,
-                               txq->count - nb_tx);
+        ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
         if (!ret) {
             break;
         }
@@ -969,32 +948,18 @@  dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
         nb_tx += ret;
     }
 
-    if (OVS_UNLIKELY(nb_tx != txq->count)) {
+    if (OVS_UNLIKELY(nb_tx != cnt)) {
         /* free buffers, which we couldn't transmit, one at a time (each
          * packet could come from a different mempool) */
         int i;
 
-        for (i = nb_tx; i < txq->count; i++) {
-            rte_pktmbuf_free(txq->burst_pkts[i]);
+        for (i = nb_tx; i < cnt; i++) {
+            rte_pktmbuf_free(pkts[i]);
         }
         rte_spinlock_lock(&dev->stats_lock);
-        dev->stats.tx_dropped += txq->count-nb_tx;
+        dev->stats.tx_dropped += cnt - nb_tx;
         rte_spinlock_unlock(&dev->stats_lock);
     }
-
-    txq->count = 0;
-    txq->tsc = rte_get_timer_cycles();
-}
-
-static inline void
-dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
-{
-    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
-
-    if (txq->count == 0) {
-        return;
-    }
-    dpdk_queue_flush__(dev, qid);
 }
 
 static bool
@@ -1076,19 +1041,8 @@  netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
                      int *c)
 {
     struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);
-    struct netdev *netdev = rx->up.netdev;
-    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     int nb_rx;
 
-    /* There is only one tx queue for this core.  Do not flush other
-     * queues.
-     * Do not flush tx queue which is shared among CPUs
-     * since it is always flushed */
-    if (rxq_->queue_id == rte_lcore_id() &&
-        OVS_LIKELY(!dev->txq_needs_locking)) {
-        dpdk_queue_flush(dev, rxq_->queue_id);
-    }
-
     nb_rx = rte_eth_rx_burst(rx->port_id, rxq_->queue_id,
                              (struct rte_mbuf **) packets,
                              NETDEV_MAX_BURST);
@@ -1175,35 +1129,6 @@  out:
     }
 }
 
-inline static void
-dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
-               struct rte_mbuf **pkts, int cnt)
-{
-    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
-    uint64_t diff_tsc;
-
-    int i = 0;
-
-    while (i < cnt) {
-        int freeslots = MAX_TX_QUEUE_LEN - txq->count;
-        int tocopy = MIN(freeslots, cnt-i);
-
-        memcpy(&txq->burst_pkts[txq->count], &pkts[i],
-               tocopy * sizeof (struct rte_mbuf *));
-
-        txq->count += tocopy;
-        i += tocopy;
-
-        if (txq->count == MAX_TX_QUEUE_LEN || txq->flush_tx) {
-            dpdk_queue_flush__(dev, qid);
-        }
-        diff_tsc = rte_get_timer_cycles() - txq->tsc;
-        if (diff_tsc >= DRAIN_TSC) {
-            dpdk_queue_flush__(dev, qid);
-        }
-    }
-}
-
 /* Tx function. Transmit packets indefinitely */
 static void
 dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
@@ -1265,8 +1190,7 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
     if (dev->type == DPDK_DEV_VHOST) {
         __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs, newcnt, true);
     } else {
-        dpdk_queue_pkts(dev, qid, mbufs, newcnt);
-        dpdk_queue_flush(dev, qid);
+        netdev_dpdk_eth_tx_burst(dev, qid, mbufs, newcnt);
     }
 
     if (!dpdk_thread_is_pmd()) {
@@ -1324,7 +1248,7 @@  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
 
             if (OVS_UNLIKELY(size > dev->max_packet_len)) {
                 if (next_tx_idx != i) {
-                    dpdk_queue_pkts(dev, qid,
+                    netdev_dpdk_eth_tx_burst(dev, qid,
                                     (struct rte_mbuf **)&pkts[next_tx_idx],
                                     i-next_tx_idx);
                 }
@@ -1338,9 +1262,9 @@  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
             }
         }
         if (next_tx_idx != cnt) {
-           dpdk_queue_pkts(dev, qid,
-                            (struct rte_mbuf **)&pkts[next_tx_idx],
-                            cnt-next_tx_idx);
+            netdev_dpdk_eth_tx_burst(dev, qid,
+                                     (struct rte_mbuf **)&pkts[next_tx_idx],
+                                     cnt-next_tx_idx);
         }
 
         if (OVS_UNLIKELY(dropped)) {