diff mbox series

[ovs-dev,v10,3/5] dpif-netdev: Time based output batching.

Message ID 1515755828-1848-4-git-send-email-i.maximets@samsung.com
State Superseded
Headers show
Series Output packet batching (Time-based). | expand

Commit Message

Ilya Maximets Jan. 12, 2018, 11:17 a.m. UTC
This allows collecting packets from more than one RX burst
and sending them together at a configurable interval.

'other_config:tx-flush-interval' can be used to configure the
maximum time that a packet can wait in an output batch before sending.

'tx-flush-interval' has microsecond resolution.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
---
 lib/dpif-netdev.c    | 108 ++++++++++++++++++++++++++++++++++++++++-----------
 vswitchd/vswitch.xml |  16 ++++++++
 2 files changed, 102 insertions(+), 22 deletions(-)

Comments

Jan Scheurich Jan. 12, 2018, 4:42 p.m. UTC | #1
Hi Ilya,

I understand that for the purpose of the present patch this is good enough to count rx and tx packets together, but the PMD performance metrics patch will depend on counting them separately. It would have been nice if you had simply included the changes I proposed. Now I will have to revert some of your changes to make the tx_packets available to pmd_perf_end_iteration(). But so be it. 

Tested-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Jan Scheurich <jan.scheurich@ericsson.com>

Regards, Jan

> -----Original Message-----
> From: Ilya Maximets [mailto:i.maximets@samsung.com]
> Sent: Friday, 12 January, 2018 12:17
> To: ovs-dev@openvswitch.org
> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>; Antonio Fischetti
> <antonio.fischetti@intel.com>; Eelco Chaudron <echaudro@redhat.com>; Ciara Loftus <ciara.loftus@intel.com>; Kevin Traynor
> <ktraynor@redhat.com>; Jan Scheurich <jan.scheurich@ericsson.com>; Billy O'Mahony <billy.o.mahony@intel.com>; Ian Stokes
> <ian.stokes@intel.com>; Ilya Maximets <i.maximets@samsung.com>
> Subject: [PATCH v10 3/5] dpif-netdev: Time based output batching.
> 
> This allows to collect packets from more than one RX burst
> and send them together with a configurable intervals.
> 
> 'other_config:tx-flush-interval' can be used to configure
> time that a packet can wait in output batch for sending.
> 
> 'tx-flush-interval' has microsecond resolution.
> 
> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
> ---
>  lib/dpif-netdev.c    | 108 ++++++++++++++++++++++++++++++++++++++++-----------
>  vswitchd/vswitch.xml |  16 ++++++++
>  2 files changed, 102 insertions(+), 22 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 6909a03..f0ba2b2 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -87,6 +87,9 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
>  #define MAX_RECIRC_DEPTH 6
>  DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
> 
> +/* Use instant packet send by default. */
> +#define DEFAULT_TX_FLUSH_INTERVAL 0
> +
>  /* Configuration parameters. */
>  enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
>  enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
> @@ -273,6 +276,9 @@ struct dp_netdev {
>      struct hmap ports;
>      struct seq *port_seq;       /* Incremented whenever a port changes. */
> 
> +    /* The time that a packet can wait in output batch for sending. */
> +    atomic_uint32_t tx_flush_interval;
> +
>      /* Meters. */
>      struct ovs_mutex meter_locks[N_METER_LOCKS];
>      struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
> @@ -500,6 +506,7 @@ struct tx_port {
>      int qid;
>      long long last_used;
>      struct hmap_node node;
> +    long long flush_time;
>      struct dp_packet_batch output_pkts;
>      struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
>  };
> @@ -581,6 +588,9 @@ struct dp_netdev_pmd_thread {
>       * than 'cmap_count(dp->poll_threads)'. */
>      uint32_t static_tx_qid;
> 
> +    /* Number of filled output batches. */
> +    int n_output_batches;
> +
>      struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
>      /* List of rx queues to poll. */
>      struct hmap poll_list OVS_GUARDED;
> @@ -670,8 +680,9 @@ static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
>  static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
>                                         struct rxq_poll *poll)
>      OVS_REQUIRES(pmd->port_mutex);
> -static void
> -dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd);
> +static int
> +dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
> +                                   bool force);
> 
>  static void reconfigure_datapath(struct dp_netdev *dp)
>      OVS_REQUIRES(dp->port_mutex);
> @@ -1245,6 +1256,7 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
>      conntrack_init(&dp->conntrack);
> 
>      atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
> +    atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
> 
>      cmap_init(&dp->poll_threads);
> 
> @@ -2911,7 +2923,7 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
>      dp_packet_batch_init_packet(&pp, execute->packet);
>      dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
>                                execute->actions, execute->actions_len);
> -    dp_netdev_pmd_flush_output_packets(pmd);
> +    dp_netdev_pmd_flush_output_packets(pmd, true);
> 
>      if (pmd->core_id == NON_PMD_CORE_ID) {
>          ovs_mutex_unlock(&dp->non_pmd_mutex);
> @@ -2960,6 +2972,16 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
>          smap_get_ullong(other_config, "emc-insert-inv-prob",
>                          DEFAULT_EM_FLOW_INSERT_INV_PROB);
>      uint32_t insert_min, cur_min;
> +    uint32_t tx_flush_interval, cur_tx_flush_interval;
> +
> +    tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
> +                                     DEFAULT_TX_FLUSH_INTERVAL);
> +    atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
> +    if (tx_flush_interval != cur_tx_flush_interval) {
> +        atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
> +        VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
> +                  tx_flush_interval);
> +    }
> 
>      if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
>          free(dp->pmd_cmask);
> @@ -3154,7 +3176,7 @@ dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
>      return processing_cycles;
>  }
> 
> -static void
> +static int
>  dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>                                     struct tx_port *p)
>  {
> @@ -3164,6 +3186,7 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>      bool dynamic_txqs;
>      struct cycle_timer timer;
>      uint64_t cycles;
> +    uint32_t tx_flush_interval;
> 
>      cycle_timer_start(&pmd->perf_stats, &timer);
> 
> @@ -3180,6 +3203,13 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>      netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
>      dp_packet_batch_init(&p->output_pkts);
> 
> +    /* Update time of the next flush. */
> +    atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
> +    p->flush_time = pmd->ctx.now + tx_flush_interval;
> +
> +    ovs_assert(pmd->n_output_batches > 0);
> +    pmd->n_output_batches--;
> +
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
> 
> @@ -3191,18 +3221,28 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>                                       RXQ_CYCLES_PROC_CURR, cycles);
>          }
>      }
> +
> +    return output_cnt;
>  }
> 
> -static void
> -dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd)
> +static int
> +dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
> +                                   bool force)
>  {
>      struct tx_port *p;
> +    int output_cnt = 0;
> +
> +    if (!pmd->n_output_batches) {
> +        return 0;
> +    }
> 
>      HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
> -        if (!dp_packet_batch_is_empty(&p->output_pkts)) {
> -            dp_netdev_pmd_flush_output_on_port(pmd, p);
> +        if (!dp_packet_batch_is_empty(&p->output_pkts)
> +            && (force || pmd->ctx.now >= p->flush_time)) {
> +            output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
>          }
>      }
> +    return output_cnt;
>  }
> 
>  static int
> @@ -3213,7 +3253,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>      struct dp_packet_batch batch;
>      struct cycle_timer timer;
>      int error;
> -    int batch_cnt = 0;
> +    int batch_cnt = 0, output_cnt = 0;
>      uint64_t cycles;
> 
>      /* Measure duration for polling and processing rx burst. */
> @@ -3235,7 +3275,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>          cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
>          dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
> 
> -        dp_netdev_pmd_flush_output_packets(pmd);
> +        output_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
>      } else {
>          /* Discard cycles. */
>          cycle_timer_stop(&pmd->perf_stats, &timer);
> @@ -3249,7 +3289,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> 
>      pmd->ctx.last_rxq = NULL;
> 
> -    return batch_cnt;
> +    return batch_cnt + output_cnt;
>  }
> 
>  static struct tx_port *
> @@ -3872,6 +3912,7 @@ dpif_netdev_run(struct dpif *dpif)
>      struct dp_netdev *dp = get_dp_netdev(dpif);
>      struct dp_netdev_pmd_thread *non_pmd;
>      uint64_t new_tnl_seq;
> +    bool need_to_flush = true;
> 
>      ovs_mutex_lock(&dp->port_mutex);
>      non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
> @@ -3882,13 +3923,22 @@ dpif_netdev_run(struct dpif *dpif)
>                  int i;
> 
>                  for (i = 0; i < port->n_rxq; i++) {
> -                    dp_netdev_process_rxq_port(non_pmd,
> -                                               &port->rxqs[i],
> -                                               port->port_no);
> +                    if (dp_netdev_process_rxq_port(non_pmd,
> +                                                   &port->rxqs[i],
> +                                                   port->port_no)) {
> +                        need_to_flush = false;
> +                    }
>                  }
>              }
>          }
> -        pmd_thread_ctx_time_update(non_pmd);
> +        if (need_to_flush) {
> +            /* We didn't receive anything in the process loop.
> +             * Check if we need to send something.
> +             * There was no time updates on current iteration. */
> +            pmd_thread_ctx_time_update(non_pmd);
> +            dp_netdev_pmd_flush_output_packets(non_pmd, false);
> +        }
> +
>          dpif_netdev_xps_revalidate_pmd(non_pmd, false);
>          ovs_mutex_unlock(&dp->non_pmd_mutex);
> 
> @@ -3939,6 +3989,8 @@ pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
>  {
>      struct tx_port *tx_port_cached;
> 
> +    /* Flush all the queued packets. */
> +    dp_netdev_pmd_flush_output_packets(pmd, true);
>      /* Free all used tx queue ids. */
>      dpif_netdev_xps_revalidate_pmd(pmd, true);
> 
> @@ -4069,6 +4121,7 @@ reload:
>      cycles_counter_update(s);
>      for (;;) {
>          uint64_t iter_packets = 0;
> +
>          pmd_perf_start_iteration(s);
>          for (i = 0; i < poll_cnt; i++) {
>              process_packets =
> @@ -4077,15 +4130,20 @@ reload:
>              iter_packets += process_packets;
>          }
> 
> +        if (!iter_packets) {
> +            /* We didn't receive anything in the process loop.
> +             * Check if we need to send something.
> +             * There was no time updates on current iteration. */
> +            pmd_thread_ctx_time_update(pmd);
> +            iter_packets += dp_netdev_pmd_flush_output_packets(pmd, false);
> +        }
> +
>          if (lc++ > 1024) {
>              bool reload;
> 
>              lc = 0;
> 
>              coverage_try_clear();
> -            /* It's possible that the time was not updated on current
> -             * iteration, if there were no received packets. */
> -            pmd_thread_ctx_time_update(pmd);
>              dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
>              if (!ovsrcu_try_quiesce()) {
>                  emc_cache_slow_sweep(&pmd->flow_cache);
> @@ -4524,6 +4582,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
>      pmd->core_id = core_id;
>      pmd->numa_id = numa_id;
>      pmd->need_reload = false;
> +    pmd->n_output_batches = 0;
> 
>      ovs_refcount_init(&pmd->ref_cnt);
>      latch_init(&pmd->exit_latch);
> @@ -4713,6 +4772,7 @@ dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
> 
>      tx->port = port;
>      tx->qid = -1;
> +    tx->flush_time = 0LL;
>      dp_packet_batch_init(&tx->output_pkts);
> 
>      hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
> @@ -5407,12 +5467,16 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
>                  dp_netdev_pmd_flush_output_on_port(pmd, p);
>              }
>  #endif
> -            if (OVS_UNLIKELY(dp_packet_batch_size(&p->output_pkts)
> -                       + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST)) {
> -                /* Some packets was generated while input batch processing.
> -                 * Flush here to avoid overflow. */
> +            if (dp_packet_batch_size(&p->output_pkts)
> +                + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
> +                /* Flush here to avoid overflow. */
>                  dp_netdev_pmd_flush_output_on_port(pmd, p);
>              }
> +
> +            if (dp_packet_batch_is_empty(&p->output_pkts)) {
> +                pmd->n_output_batches++;
> +            }
> +
>              DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
>                  p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
>                                                               pmd->ctx.last_rxq;
> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index 58c0ebd..61fb7b1 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -359,6 +359,22 @@
>          </p>
>        </column>
> 
> +      <column name="other_config" key="tx-flush-interval"
> +              type='{"type": "integer",
> +                     "minInteger": 0, "maxInteger": 1000000}'>
> +        <p>
> +          Specifies the time in microseconds that a packet can wait in output
> +          batch for sending i.e. amount of time that packet can spend in an
> +          intermediate output queue before sending to netdev.
> +          This option can be used to configure balance between throughput
> +          and latency. Lower values decreases latency while higher values
> +          may be useful to achieve higher performance.
> +        </p>
> +        <p>
> +          Defaults to 0 i.e. instant packet sending (latency optimized).
> +        </p>
> +      </column>
> +
>        <column name="other_config" key="n-handler-threads"
>                type='{"type": "integer", "minInteger": 1}'>
>          <p>
> --
> 2.7.4
Ilya Maximets Jan. 15, 2018, 7:22 a.m. UTC | #2
On 12.01.2018 19:42, Jan Scheurich wrote:
> Hi Ilya,
> 
> I understand that for the purpose of the present patch this is good enough to count rx and tx packets together, but the PMD performance metrics patch will depend on counting them separately. It would have been nice if you had simply included the changes I proposed. Now I will have to revert some of your changes to make the tx_packets available to pmd_perf_end_iteration(). But so be it.


Hello, Jan.
I tried to find anything about tx_packets in your v5 of the supervision patch-set
and the rebased RFC series on the mailing list, but failed. At first glance, these series
just ignore the tx packets and the cycles spent sending them. Am I missing
something? Or did you not send these changes to the mailing list?

In general, I think it's better if you make these changes yourself, because
you know your code better.

Best regards, Ilya Maximets.
 
> 
> Tested-by: Jan Scheurich <jan.scheurich@ericsson.com>
> Acked-by: Jan Scheurich <jan.scheurich@ericsson.com>
> 
> Regards, Jan
> 
>> -----Original Message-----
>> From: Ilya Maximets [mailto:i.maximets@samsung.com]
>> Sent: Friday, 12 January, 2018 12:17
>> To: ovs-dev@openvswitch.org
>> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>; Antonio Fischetti
>> <antonio.fischetti@intel.com>; Eelco Chaudron <echaudro@redhat.com>; Ciara Loftus <ciara.loftus@intel.com>; Kevin Traynor
>> <ktraynor@redhat.com>; Jan Scheurich <jan.scheurich@ericsson.com>; Billy O'Mahony <billy.o.mahony@intel.com>; Ian Stokes
>> <ian.stokes@intel.com>; Ilya Maximets <i.maximets@samsung.com>
>> Subject: [PATCH v10 3/5] dpif-netdev: Time based output batching.
>>
>> This allows to collect packets from more than one RX burst
>> and send them together with a configurable intervals.
>>
>> 'other_config:tx-flush-interval' can be used to configure
>> time that a packet can wait in output batch for sending.
>>
>> 'tx-flush-interval' has microsecond resolution.
>>
>> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
>> ---
>>  lib/dpif-netdev.c    | 108 ++++++++++++++++++++++++++++++++++++++++-----------
>>  vswitchd/vswitch.xml |  16 ++++++++
>>  2 files changed, 102 insertions(+), 22 deletions(-)
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index 6909a03..f0ba2b2 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -87,6 +87,9 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
>>  #define MAX_RECIRC_DEPTH 6
>>  DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
>>
>> +/* Use instant packet send by default. */
>> +#define DEFAULT_TX_FLUSH_INTERVAL 0
>> +
>>  /* Configuration parameters. */
>>  enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
>>  enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
>> @@ -273,6 +276,9 @@ struct dp_netdev {
>>      struct hmap ports;
>>      struct seq *port_seq;       /* Incremented whenever a port changes. */
>>
>> +    /* The time that a packet can wait in output batch for sending. */
>> +    atomic_uint32_t tx_flush_interval;
>> +
>>      /* Meters. */
>>      struct ovs_mutex meter_locks[N_METER_LOCKS];
>>      struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
>> @@ -500,6 +506,7 @@ struct tx_port {
>>      int qid;
>>      long long last_used;
>>      struct hmap_node node;
>> +    long long flush_time;
>>      struct dp_packet_batch output_pkts;
>>      struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
>>  };
>> @@ -581,6 +588,9 @@ struct dp_netdev_pmd_thread {
>>       * than 'cmap_count(dp->poll_threads)'. */
>>      uint32_t static_tx_qid;
>>
>> +    /* Number of filled output batches. */
>> +    int n_output_batches;
>> +
>>      struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
>>      /* List of rx queues to poll. */
>>      struct hmap poll_list OVS_GUARDED;
>> @@ -670,8 +680,9 @@ static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
>>  static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
>>                                         struct rxq_poll *poll)
>>      OVS_REQUIRES(pmd->port_mutex);
>> -static void
>> -dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd);
>> +static int
>> +dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
>> +                                   bool force);
>>
>>  static void reconfigure_datapath(struct dp_netdev *dp)
>>      OVS_REQUIRES(dp->port_mutex);
>> @@ -1245,6 +1256,7 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
>>      conntrack_init(&dp->conntrack);
>>
>>      atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
>> +    atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
>>
>>      cmap_init(&dp->poll_threads);
>>
>> @@ -2911,7 +2923,7 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
>>      dp_packet_batch_init_packet(&pp, execute->packet);
>>      dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
>>                                execute->actions, execute->actions_len);
>> -    dp_netdev_pmd_flush_output_packets(pmd);
>> +    dp_netdev_pmd_flush_output_packets(pmd, true);
>>
>>      if (pmd->core_id == NON_PMD_CORE_ID) {
>>          ovs_mutex_unlock(&dp->non_pmd_mutex);
>> @@ -2960,6 +2972,16 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
>>          smap_get_ullong(other_config, "emc-insert-inv-prob",
>>                          DEFAULT_EM_FLOW_INSERT_INV_PROB);
>>      uint32_t insert_min, cur_min;
>> +    uint32_t tx_flush_interval, cur_tx_flush_interval;
>> +
>> +    tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
>> +                                     DEFAULT_TX_FLUSH_INTERVAL);
>> +    atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
>> +    if (tx_flush_interval != cur_tx_flush_interval) {
>> +        atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
>> +        VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
>> +                  tx_flush_interval);
>> +    }
>>
>>      if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
>>          free(dp->pmd_cmask);
>> @@ -3154,7 +3176,7 @@ dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
>>      return processing_cycles;
>>  }
>>
>> -static void
>> +static int
>>  dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>>                                     struct tx_port *p)
>>  {
>> @@ -3164,6 +3186,7 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>>      bool dynamic_txqs;
>>      struct cycle_timer timer;
>>      uint64_t cycles;
>> +    uint32_t tx_flush_interval;
>>
>>      cycle_timer_start(&pmd->perf_stats, &timer);
>>
>> @@ -3180,6 +3203,13 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>>      netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
>>      dp_packet_batch_init(&p->output_pkts);
>>
>> +    /* Update time of the next flush. */
>> +    atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
>> +    p->flush_time = pmd->ctx.now + tx_flush_interval;
>> +
>> +    ovs_assert(pmd->n_output_batches > 0);
>> +    pmd->n_output_batches--;
>> +
>>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
>>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
>>
>> @@ -3191,18 +3221,28 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>>                                       RXQ_CYCLES_PROC_CURR, cycles);
>>          }
>>      }
>> +
>> +    return output_cnt;
>>  }
>>
>> -static void
>> -dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd)
>> +static int
>> +dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
>> +                                   bool force)
>>  {
>>      struct tx_port *p;
>> +    int output_cnt = 0;
>> +
>> +    if (!pmd->n_output_batches) {
>> +        return 0;
>> +    }
>>
>>      HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
>> -        if (!dp_packet_batch_is_empty(&p->output_pkts)) {
>> -            dp_netdev_pmd_flush_output_on_port(pmd, p);
>> +        if (!dp_packet_batch_is_empty(&p->output_pkts)
>> +            && (force || pmd->ctx.now >= p->flush_time)) {
>> +            output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
>>          }
>>      }
>> +    return output_cnt;
>>  }
>>
>>  static int
>> @@ -3213,7 +3253,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>>      struct dp_packet_batch batch;
>>      struct cycle_timer timer;
>>      int error;
>> -    int batch_cnt = 0;
>> +    int batch_cnt = 0, output_cnt = 0;
>>      uint64_t cycles;
>>
>>      /* Measure duration for polling and processing rx burst. */
>> @@ -3235,7 +3275,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>>          cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
>>          dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
>>
>> -        dp_netdev_pmd_flush_output_packets(pmd);
>> +        output_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
>>      } else {
>>          /* Discard cycles. */
>>          cycle_timer_stop(&pmd->perf_stats, &timer);
>> @@ -3249,7 +3289,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>>
>>      pmd->ctx.last_rxq = NULL;
>>
>> -    return batch_cnt;
>> +    return batch_cnt + output_cnt;
>>  }
>>
>>  static struct tx_port *
>> @@ -3872,6 +3912,7 @@ dpif_netdev_run(struct dpif *dpif)
>>      struct dp_netdev *dp = get_dp_netdev(dpif);
>>      struct dp_netdev_pmd_thread *non_pmd;
>>      uint64_t new_tnl_seq;
>> +    bool need_to_flush = true;
>>
>>      ovs_mutex_lock(&dp->port_mutex);
>>      non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
>> @@ -3882,13 +3923,22 @@ dpif_netdev_run(struct dpif *dpif)
>>                  int i;
>>
>>                  for (i = 0; i < port->n_rxq; i++) {
>> -                    dp_netdev_process_rxq_port(non_pmd,
>> -                                               &port->rxqs[i],
>> -                                               port->port_no);
>> +                    if (dp_netdev_process_rxq_port(non_pmd,
>> +                                                   &port->rxqs[i],
>> +                                                   port->port_no)) {
>> +                        need_to_flush = false;
>> +                    }
>>                  }
>>              }
>>          }
>> -        pmd_thread_ctx_time_update(non_pmd);
>> +        if (need_to_flush) {
>> +            /* We didn't receive anything in the process loop.
>> +             * Check if we need to send something.
>> +             * There was no time updates on current iteration. */
>> +            pmd_thread_ctx_time_update(non_pmd);
>> +            dp_netdev_pmd_flush_output_packets(non_pmd, false);
>> +        }
>> +
>>          dpif_netdev_xps_revalidate_pmd(non_pmd, false);
>>          ovs_mutex_unlock(&dp->non_pmd_mutex);
>>
>> @@ -3939,6 +3989,8 @@ pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
>>  {
>>      struct tx_port *tx_port_cached;
>>
>> +    /* Flush all the queued packets. */
>> +    dp_netdev_pmd_flush_output_packets(pmd, true);
>>      /* Free all used tx queue ids. */
>>      dpif_netdev_xps_revalidate_pmd(pmd, true);
>>
>> @@ -4069,6 +4121,7 @@ reload:
>>      cycles_counter_update(s);
>>      for (;;) {
>>          uint64_t iter_packets = 0;
>> +
>>          pmd_perf_start_iteration(s);
>>          for (i = 0; i < poll_cnt; i++) {
>>              process_packets =
>> @@ -4077,15 +4130,20 @@ reload:
>>              iter_packets += process_packets;
>>          }
>>
>> +        if (!iter_packets) {
>> +            /* We didn't receive anything in the process loop.
>> +             * Check if we need to send something.
>> +             * There was no time updates on current iteration. */
>> +            pmd_thread_ctx_time_update(pmd);
>> +            iter_packets += dp_netdev_pmd_flush_output_packets(pmd, false);
>> +        }
>> +
>>          if (lc++ > 1024) {
>>              bool reload;
>>
>>              lc = 0;
>>
>>              coverage_try_clear();
>> -            /* It's possible that the time was not updated on current
>> -             * iteration, if there were no received packets. */
>> -            pmd_thread_ctx_time_update(pmd);
>>              dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
>>              if (!ovsrcu_try_quiesce()) {
>>                  emc_cache_slow_sweep(&pmd->flow_cache);
>> @@ -4524,6 +4582,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
>>      pmd->core_id = core_id;
>>      pmd->numa_id = numa_id;
>>      pmd->need_reload = false;
>> +    pmd->n_output_batches = 0;
>>
>>      ovs_refcount_init(&pmd->ref_cnt);
>>      latch_init(&pmd->exit_latch);
>> @@ -4713,6 +4772,7 @@ dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
>>
>>      tx->port = port;
>>      tx->qid = -1;
>> +    tx->flush_time = 0LL;
>>      dp_packet_batch_init(&tx->output_pkts);
>>
>>      hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
>> @@ -5407,12 +5467,16 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
>>                  dp_netdev_pmd_flush_output_on_port(pmd, p);
>>              }
>>  #endif
>> -            if (OVS_UNLIKELY(dp_packet_batch_size(&p->output_pkts)
>> -                       + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST)) {
>> -                /* Some packets was generated while input batch processing.
>> -                 * Flush here to avoid overflow. */
>> +            if (dp_packet_batch_size(&p->output_pkts)
>> +                + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
>> +                /* Flush here to avoid overflow. */
>>                  dp_netdev_pmd_flush_output_on_port(pmd, p);
>>              }
>> +
>> +            if (dp_packet_batch_is_empty(&p->output_pkts)) {
>> +                pmd->n_output_batches++;
>> +            }
>> +
>>              DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
>>                  p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
>>                                                               pmd->ctx.last_rxq;
>> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
>> index 58c0ebd..61fb7b1 100644
>> --- a/vswitchd/vswitch.xml
>> +++ b/vswitchd/vswitch.xml
>> @@ -359,6 +359,22 @@
>>          </p>
>>        </column>
>>
>> +      <column name="other_config" key="tx-flush-interval"
>> +              type='{"type": "integer",
>> +                     "minInteger": 0, "maxInteger": 1000000}'>
>> +        <p>
>> +          Specifies the time in microseconds that a packet can wait in output
>> +          batch for sending i.e. amount of time that packet can spend in an
>> +          intermediate output queue before sending to netdev.
>> +          This option can be used to configure balance between throughput
>> +          and latency. Lower values decreases latency while higher values
>> +          may be useful to achieve higher performance.
>> +        </p>
>> +        <p>
>> +          Defaults to 0 i.e. instant packet sending (latency optimized).
>> +        </p>
>> +      </column>
>> +
>>        <column name="other_config" key="n-handler-threads"
>>                type='{"type": "integer", "minInteger": 1}'>
>>          <p>
>> --
>> 2.7.4
> 
> 
> 
>
Jan Scheurich Jan. 15, 2018, 8:02 a.m. UTC | #3
> From: Ilya Maximets [mailto:i.maximets@samsung.com]
> Sent: Monday, 15 January, 2018 08:23
> To: Jan Scheurich <jan.scheurich@ericsson.com>; ovs-dev@openvswitch.org
> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>; Antonio Fischetti
> <antonio.fischetti@intel.com>; Eelco Chaudron <echaudro@redhat.com>; Ciara Loftus <ciara.loftus@intel.com>; Kevin Traynor
> <ktraynor@redhat.com>; Billy O'Mahony <billy.o.mahony@intel.com>; Ian Stokes <ian.stokes@intel.com>
> Subject: Re: [PATCH v10 3/5] dpif-netdev: Time based output batching.
> 
> On 12.01.2018 19:42, Jan Scheurich wrote:
> > Hi Ilya,
> >
> > I understand that for the purpose of the present patch this is good enough to count rx and tx packets together, but the PMD performance
> metrics patch will depend on counting them separately. It would have been nice if you had simply included the changes I proposed. Now I
> will have to revert some of your changes to make the tx_packets available to pmd_perf_end_iteration(). But so be it.
> 
> 
> Hello, Jan.
> I tried to find anything about tx_packets in your v5 of supervision patch-set
> and RFC rebased series on mail-list but failed. At the first glance these series
> just ignores the tx packets and the cycles spent for sending them. Am I missing
> something? Or you didn't send these changes to mail-list?

I guess that was difficult to see in the jungle of patches ;-).
Now that I look for it I can't find it myself any longer in those patches.

> 
> In general, I think, it's better if you'll made this changes yourself, because
> you know your code better.

Fine with me, I have done that now when rebasing my series on top of your v10. Will send out soon.

BR, Jan
diff mbox series

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 6909a03..f0ba2b2 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -87,6 +87,9 @@  VLOG_DEFINE_THIS_MODULE(dpif_netdev);
 #define MAX_RECIRC_DEPTH 6
 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
 
+/* Use instant packet send by default. */
+#define DEFAULT_TX_FLUSH_INTERVAL 0
+
 /* Configuration parameters. */
 enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
 enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
@@ -273,6 +276,9 @@  struct dp_netdev {
     struct hmap ports;
     struct seq *port_seq;       /* Incremented whenever a port changes. */
 
+    /* The time that a packet can wait in output batch for sending. */
+    atomic_uint32_t tx_flush_interval;
+
     /* Meters. */
     struct ovs_mutex meter_locks[N_METER_LOCKS];
     struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
@@ -500,6 +506,7 @@  struct tx_port {
     int qid;
     long long last_used;
     struct hmap_node node;
+    long long flush_time;
     struct dp_packet_batch output_pkts;
     struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
 };
@@ -581,6 +588,9 @@  struct dp_netdev_pmd_thread {
      * than 'cmap_count(dp->poll_threads)'. */
     uint32_t static_tx_qid;
 
+    /* Number of filled output batches. */
+    int n_output_batches;
+
     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
     /* List of rx queues to poll. */
     struct hmap poll_list OVS_GUARDED;
@@ -670,8 +680,9 @@  static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
                                        struct rxq_poll *poll)
     OVS_REQUIRES(pmd->port_mutex);
-static void
-dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd);
+static int
+dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
+                                   bool force);
 
 static void reconfigure_datapath(struct dp_netdev *dp)
     OVS_REQUIRES(dp->port_mutex);
@@ -1245,6 +1256,7 @@  create_dp_netdev(const char *name, const struct dpif_class *class,
     conntrack_init(&dp->conntrack);
 
     atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
+    atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
 
     cmap_init(&dp->poll_threads);
 
@@ -2911,7 +2923,7 @@  dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
     dp_packet_batch_init_packet(&pp, execute->packet);
     dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
                               execute->actions, execute->actions_len);
-    dp_netdev_pmd_flush_output_packets(pmd);
+    dp_netdev_pmd_flush_output_packets(pmd, true);
 
     if (pmd->core_id == NON_PMD_CORE_ID) {
         ovs_mutex_unlock(&dp->non_pmd_mutex);
@@ -2960,6 +2972,16 @@  dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
         smap_get_ullong(other_config, "emc-insert-inv-prob",
                         DEFAULT_EM_FLOW_INSERT_INV_PROB);
     uint32_t insert_min, cur_min;
+    uint32_t tx_flush_interval, cur_tx_flush_interval;
+
+    tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
+                                     DEFAULT_TX_FLUSH_INTERVAL);
+    atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
+    if (tx_flush_interval != cur_tx_flush_interval) {
+        atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
+        VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
+                  tx_flush_interval);
+    }
 
     if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
         free(dp->pmd_cmask);
@@ -3154,7 +3176,7 @@  dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
     return processing_cycles;
 }
 
-static void
+static int
 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
                                    struct tx_port *p)
 {
@@ -3164,6 +3186,7 @@  dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
     bool dynamic_txqs;
     struct cycle_timer timer;
     uint64_t cycles;
+    uint32_t tx_flush_interval;
 
     cycle_timer_start(&pmd->perf_stats, &timer);
 
@@ -3180,6 +3203,13 @@  dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
     netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
     dp_packet_batch_init(&p->output_pkts);
 
+    /* Update time of the next flush. */
+    atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
+    p->flush_time = pmd->ctx.now + tx_flush_interval;
+
+    ovs_assert(pmd->n_output_batches > 0);
+    pmd->n_output_batches--;
+
     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
 
@@ -3191,18 +3221,28 @@  dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
                                      RXQ_CYCLES_PROC_CURR, cycles);
         }
     }
+
+    return output_cnt;
 }
 
-static void
-dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd)
+static int
+dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
+                                   bool force)
 {
     struct tx_port *p;
+    int output_cnt = 0;
+
+    if (!pmd->n_output_batches) {
+        return 0;
+    }
 
     HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
-        if (!dp_packet_batch_is_empty(&p->output_pkts)) {
-            dp_netdev_pmd_flush_output_on_port(pmd, p);
+        if (!dp_packet_batch_is_empty(&p->output_pkts)
+            && (force || pmd->ctx.now >= p->flush_time)) {
+            output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
         }
     }
+    return output_cnt;
 }
 
 static int
@@ -3213,7 +3253,7 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
     struct dp_packet_batch batch;
     struct cycle_timer timer;
     int error;
-    int batch_cnt = 0;
+    int batch_cnt = 0, output_cnt = 0;
     uint64_t cycles;
 
     /* Measure duration for polling and processing rx burst. */
@@ -3235,7 +3275,7 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
         cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
         dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
 
-        dp_netdev_pmd_flush_output_packets(pmd);
+        output_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
     } else {
         /* Discard cycles. */
         cycle_timer_stop(&pmd->perf_stats, &timer);
@@ -3249,7 +3289,7 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
 
     pmd->ctx.last_rxq = NULL;
 
-    return batch_cnt;
+    return batch_cnt + output_cnt;
 }
 
 static struct tx_port *
@@ -3872,6 +3912,7 @@  dpif_netdev_run(struct dpif *dpif)
     struct dp_netdev *dp = get_dp_netdev(dpif);
     struct dp_netdev_pmd_thread *non_pmd;
     uint64_t new_tnl_seq;
+    bool need_to_flush = true;
 
     ovs_mutex_lock(&dp->port_mutex);
     non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
@@ -3882,13 +3923,22 @@  dpif_netdev_run(struct dpif *dpif)
                 int i;
 
                 for (i = 0; i < port->n_rxq; i++) {
-                    dp_netdev_process_rxq_port(non_pmd,
-                                               &port->rxqs[i],
-                                               port->port_no);
+                    if (dp_netdev_process_rxq_port(non_pmd,
+                                                   &port->rxqs[i],
+                                                   port->port_no)) {
+                        need_to_flush = false;
+                    }
                 }
             }
         }
-        pmd_thread_ctx_time_update(non_pmd);
+        if (need_to_flush) {
+            /* We didn't receive anything in the process loop.
+             * Check if we need to send something.
+             * There were no time updates on the current iteration. */
+            pmd_thread_ctx_time_update(non_pmd);
+            dp_netdev_pmd_flush_output_packets(non_pmd, false);
+        }
+
         dpif_netdev_xps_revalidate_pmd(non_pmd, false);
         ovs_mutex_unlock(&dp->non_pmd_mutex);
 
@@ -3939,6 +3989,8 @@  pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
 {
     struct tx_port *tx_port_cached;
 
+    /* Flush all the queued packets. */
+    dp_netdev_pmd_flush_output_packets(pmd, true);
     /* Free all used tx queue ids. */
     dpif_netdev_xps_revalidate_pmd(pmd, true);
 
@@ -4069,6 +4121,7 @@  reload:
     cycles_counter_update(s);
     for (;;) {
         uint64_t iter_packets = 0;
+
         pmd_perf_start_iteration(s);
         for (i = 0; i < poll_cnt; i++) {
             process_packets =
@@ -4077,15 +4130,20 @@  reload:
             iter_packets += process_packets;
         }
 
+        if (!iter_packets) {
+            /* We didn't receive anything in the process loop.
+             * Check if we need to send something.
+             * There were no time updates on the current iteration. */
+            pmd_thread_ctx_time_update(pmd);
+            iter_packets += dp_netdev_pmd_flush_output_packets(pmd, false);
+        }
+
         if (lc++ > 1024) {
             bool reload;
 
             lc = 0;
 
             coverage_try_clear();
-            /* It's possible that the time was not updated on current
-             * iteration, if there were no received packets. */
-            pmd_thread_ctx_time_update(pmd);
             dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
             if (!ovsrcu_try_quiesce()) {
                 emc_cache_slow_sweep(&pmd->flow_cache);
@@ -4524,6 +4582,7 @@  dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     pmd->core_id = core_id;
     pmd->numa_id = numa_id;
     pmd->need_reload = false;
+    pmd->n_output_batches = 0;
 
     ovs_refcount_init(&pmd->ref_cnt);
     latch_init(&pmd->exit_latch);
@@ -4713,6 +4772,7 @@  dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
 
     tx->port = port;
     tx->qid = -1;
+    tx->flush_time = 0LL;
     dp_packet_batch_init(&tx->output_pkts);
 
     hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
@@ -5407,12 +5467,16 @@  dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
                 dp_netdev_pmd_flush_output_on_port(pmd, p);
             }
 #endif
-            if (OVS_UNLIKELY(dp_packet_batch_size(&p->output_pkts)
-                       + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST)) {
-                /* Some packets was generated while input batch processing.
-                 * Flush here to avoid overflow. */
+            if (dp_packet_batch_size(&p->output_pkts)
+                + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
+                /* Flush here to avoid overflow. */
                 dp_netdev_pmd_flush_output_on_port(pmd, p);
             }
+
+            if (dp_packet_batch_is_empty(&p->output_pkts)) {
+                pmd->n_output_batches++;
+            }
+
             DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
                 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
                                                              pmd->ctx.last_rxq;
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 58c0ebd..61fb7b1 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -359,6 +359,22 @@ 
         </p>
       </column>
 
+      <column name="other_config" key="tx-flush-interval"
+              type='{"type": "integer",
+                     "minInteger": 0, "maxInteger": 1000000}'>
+        <p>
+          Specifies the time in microseconds that a packet can wait in the
+          output batch for sending, i.e. the amount of time that a packet can
+          spend in an intermediate output queue before being sent to the
+          netdev.  This option can be used to configure the balance between
+          throughput and latency.  Lower values decrease latency while higher
+          values may be useful to achieve higher performance.
+        </p>
+        <p>
+          Defaults to 0 i.e. instant packet sending (latency optimized).
+        </p>
+      </column>
+
       <column name="other_config" key="n-handler-threads"
               type='{"type": "integer", "minInteger": 1}'>
         <p>