
[ovs-dev] dpif-netdev: Optimize flushing of output packet buffers

Message ID 20220922125208.4769-1-dheeraj.k@acldigital.com
State Superseded
Series [ovs-dev] dpif-netdev: Optimize flushing of output packet buffers

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

dheeraj Sept. 22, 2022, 12:52 p.m. UTC
Problem Statement:
Before OVS 2.12 the OVS-DPDK datapath transmitted processed rx packet batches
directly to the desired tx queues. In OVS 2.12 each PMD stores the processed
packets in an intermediate buffer per output port and flushes these output
buffers in a separate step. This buffering was introduced to allow better
batching of packets for transmit.

The current implementation of the function that flushes the output buffers
performs a full scan over all output ports, even if only a single packet
was buffered. In systems with hundreds of ports this can take a long time and
degrades OVS-DPDK performance significantly.
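
For illustration, the flush function in question, as it stands before the
patch below (lightly condensed from the diff), walks every port in the PMD's
send_port_cache regardless of how many ports actually hold buffered packets:

    static int
    dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
                                       bool force)
    {
        struct tx_port *p;
        int output_cnt = 0;

        if (!pmd->n_output_batches) {
            return 0;
        }

        /* Pre-patch: O(n) walk over all cached tx ports, even when only
         * a single port holds a non-empty output batch. */
        HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
            if (!dp_packet_batch_is_empty(&p->output_pkts)
                && (force || pmd->ctx.now >= p->flush_time)) {
                output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
            }
        }
        return output_cnt;
    }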

Solution:
Maintain a list of output ports with buffered packets for each PMD thread and
only iterate over that list when flushing output buffers. Furthermore, defer
the flushing of packet buffers to a single invocation at the end of each PMD
loop when output packet batching with a tx-flush-interval has been enabled
and the increased jitter (in the microsecond range) is not of concern.
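
A minimal sketch of the resulting flush path, using the names introduced by
the patch below (an ovs_list 'pending_tx' embedded in each tx_port, plus a
'pending_tx_ports' list head per PMD thread):

    static int
    dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
                                       bool force)
    {
        struct tx_port *p, *next;
        int output_cnt = 0;

        /* Post-patch: visit only the ports that actually buffered
         * packets; dp_netdev_pmd_flush_output_on_port() unlinks 'p'
         * from the pending list again. */
        LIST_FOR_EACH_SAFE (p, next, pending_tx, &pmd->pending_tx_ports) {
            if (force || pmd->ctx.now >= p->flush_time) {
                output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
            }
        }
        return output_cnt;
    }

This turns the flush cost from O(all ports) into O(ports with pending
packets), which is where the gain on systems with hundreds of mostly idle
ports comes from.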

Signed-off-by: Dheeraj Kumar <dheeraj.k@acldigital.com>
---
 lib/dpif-netdev-private-thread.h |  7 +++---
 lib/dpif-netdev.c                | 39 ++++++++++++++++++++------------
 2 files changed, 29 insertions(+), 17 deletions(-)

Comments

Ilya Maximets Oct. 25, 2022, 8:47 p.m. UTC | #1
On 9/22/22 14:52, Dheeraj Kumar via dev wrote:
> Problem Statement:
> Before OVS 2.12 the OVS-DPDK datapath transmitted processed rx packet batches
> directly to the desired tx queues. In OVS 2.12 each PMD stores the processed
> packets in an intermediate buffer per output port and flushes these output
> buffers in a separate step. This buffering was introduced to allow better
> batching of packets for transmit.
> 
> The current implementation of the function that flushes the output buffers
> performs a full scan over all output ports, even if only a single packet
> was buffered. In systems with hundreds of ports this can take a long time and
> degrades OVS-DPDK performance significantly.

Hi.  Thanks for the patch!

Do you have some more detailed performance data for this change?

Also, IIUC, you're trying to optimize a use case with a few
high-traffic ports and many low-traffic ports on the same thread.
If so, auto load-balancing may also help in this case
by moving low-traffic ports away to other threads.  Did you try
something like that?
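
(For reference, PMD auto load balancing is the other_config:pmd-auto-lb
knob; a minimal example, assuming OVS 2.11 or later:

    $ ovs-vsctl set Open_vSwitch . other_config:pmd-auto-lb="true"

When enabled, it periodically reassigns rx queues across PMD threads based
on their measured load.)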


> 
> Solution:
> Maintain a list of output ports with buffered packets for each PMD thread and
> only iterate over that list when flushing output buffers. Furthermore, defer
> the flushing of packet buffers to a single invocation at the end of each PMD
> loop when output packet batching with a tx-flush-interval has been enabled
> and the increased jitter (in the microsecond range) is not of concern.

I'm not sure about the last part.  The tx-flush-interval is enabled
by default and users should not generally touch it unless they really
know what they are doing.  So that change will affect the majority
of OVS users.

Best regards, Ilya Maximets.
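
For context, the batching interval that the patch keys off is configured in
microseconds via other_config, e.g.:

    $ ovs-vsctl set Open_vSwitch . other_config:tx-flush-interval=50

A value of 0 requests immediate flushing after each input batch, which is
the case the patch preserves with its 'tx_flush_interval == 0' check.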

Patch

diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
index 4472b199d..2775e1a2b 100644
--- a/lib/dpif-netdev-private-thread.h
+++ b/lib/dpif-netdev-private-thread.h
@@ -185,9 +185,6 @@  struct dp_netdev_pmd_thread {
      * than 'cmap_count(dp->poll_threads)'. */
     uint32_t static_tx_qid;
 
-    /* Number of filled output batches. */
-    int n_output_batches;
-
     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
     /* List of rx queues to poll. */
     struct hmap poll_list OVS_GUARDED;
@@ -213,6 +210,10 @@  struct dp_netdev_pmd_thread {
     struct hmap tnl_port_cache;
     struct hmap send_port_cache;
 
+    /* Keep track of the ports with buffered output packets in
+     * send_port_cache. */
+    struct ovs_list pending_tx_ports;
+
     /* Keep track of detailed PMD performance statistics. */
     struct pmd_perf_stats perf_stats;
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index a45b46014..3c2cd6cbc 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -500,6 +500,7 @@  struct tx_port {
     int qid;
     long long last_used;
     struct hmap_node node;
+    struct ovs_list pending_tx;     /* Only used in send_port_cache. */
     long long flush_time;
     struct dp_packet_batch output_pkts;
     struct dp_packet_batch *txq_pkts; /* Only for hash mode. */
@@ -5241,8 +5242,9 @@  dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
     atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
     p->flush_time = pmd->ctx.now + tx_flush_interval;
 
-    ovs_assert(pmd->n_output_batches > 0);
-    pmd->n_output_batches--;
+    /* Remove the send port from the pending port list. */
+    ovs_assert(!ovs_list_is_empty(&p->pending_tx));
+    ovs_list_remove(&p->pending_tx);
 
     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
@@ -5264,16 +5266,11 @@  static int
 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
                                    bool force)
 {
-    struct tx_port *p;
+    struct tx_port *p, *next;
     int output_cnt = 0;
 
-    if (!pmd->n_output_batches) {
-        return 0;
-    }
-
-    HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
-        if (!dp_packet_batch_is_empty(&p->output_pkts)
-            && (force || pmd->ctx.now >= p->flush_time)) {
+    LIST_FOR_EACH_SAFE (p, next, pending_tx, &pmd->pending_tx_ports) {
+        if (force || pmd->ctx.now >= p->flush_time) {
             output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
         }
     }
@@ -5292,6 +5289,7 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
     int batch_cnt = 0;
     int rem_qlen = 0, *qlen_p = NULL;
     uint64_t cycles;
+    uint64_t tx_flush_interval;
 
     /* Measure duration for polling and processing rx burst. */
     cycle_timer_start(&pmd->perf_stats, &timer);
@@ -5333,7 +5331,12 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
         cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
         dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
 
-        dp_netdev_pmd_flush_output_packets(pmd, false);
+        /* Defer flushing of tx buffers if tx packet batching is enabled. */
+        atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
+        if (tx_flush_interval == 0) {
+            dp_netdev_pmd_flush_output_packets(pmd, false);
+        }
+
     } else {
         /* Discard cycles. */
         cycle_timer_stop(&pmd->perf_stats, &timer);
@@ -6803,6 +6806,7 @@  pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
                                           n_txq * sizeof *tx_port->txq_pkts);
                 tx_port_cached->txq_pkts = txq_pkts_cached;
             }
+            ovs_list_init(&tx_port_cached->pending_tx);
             hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
                         hash_port_no(tx_port_cached->port->port_no));
         }
@@ -6877,6 +6881,7 @@  pmd_thread_main(void *f_)
     int poll_cnt;
     int i;
     int process_packets = 0;
+    uint32_t tx_flush_interval;
 
     poll_list = NULL;
 
@@ -6890,6 +6895,7 @@  pmd_thread_main(void *f_)
 
 reload:
     atomic_count_init(&pmd->pmd_overloaded, 0);
+    atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
 
     if (!dpdk_attached) {
         dpdk_attached = dpdk_attach_thread(pmd->core_id);
@@ -6962,7 +6968,6 @@  reload:
 
         if (!rx_packets) {
             /* We didn't receive anything in the process loop.
-             * Check if we need to send something.
              * There was no time updates on current iteration. */
             pmd_thread_ctx_time_update(pmd);
             tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
@@ -6978,6 +6983,11 @@  reload:
             }
         }
 
+        if (tx_flush_interval > 0) {
+            /* Try to flush port output buffers. */
+            tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
+        }
+
         if (lc++ > 1024) {
             lc = 0;
 
@@ -7447,7 +7457,6 @@  dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     pmd->core_id = core_id;
     pmd->numa_id = numa_id;
     pmd->need_reload = false;
-    pmd->n_output_batches = 0;
 
     ovs_refcount_init(&pmd->ref_cnt);
     atomic_init(&pmd->exit, false);
@@ -7474,6 +7483,7 @@  dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     hmap_init(&pmd->tnl_port_cache);
     hmap_init(&pmd->send_port_cache);
     cmap_init(&pmd->tx_bonds);
+    ovs_list_init(&pmd->pending_tx_ports);
 
     /* Initialize DPIF function pointer to the default configured version. */
     atomic_init(&pmd->netdev_input_func, dp_netdev_impl_get_default());
@@ -7498,6 +7508,7 @@  dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
     struct dpcls *cls;
 
     dp_netdev_pmd_flow_flush(pmd);
+    ovs_list_poison(&pmd->pending_tx_ports);
     hmap_destroy(&pmd->send_port_cache);
     hmap_destroy(&pmd->tnl_port_cache);
     hmap_destroy(&pmd->tx_ports);
@@ -8714,7 +8725,7 @@  dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
         dp_netdev_pmd_flush_output_on_port(pmd, p);
     }
     if (dp_packet_batch_is_empty(&p->output_pkts)) {
-        pmd->n_output_batches++;
+        ovs_list_push_front(&pmd->pending_tx_ports, &p->pending_tx);
     }
 
     struct dp_packet *packet;