[ovs-dev,v4,1/5] netdev: Add netdev_txq_flush function.

Message ID 1502211976-76937-2-git-send-email-bhanuprakash.bodireddy@intel.com
State Rejected
Delegated to: Darrell Ball
Headers show

Commit Message

Bodireddy, Bhanuprakash Aug. 8, 2017, 5:06 p.m.
Add netdev_txq_flush(), which flushes packets on a queue. This is needed
to transmit packets on the intermediate queue.

This commit also implements the netdev_dpdk_txq_flush() function. If there
are any packets waiting in the queue, they are transmitted instantly
using the rte_eth_tx_burst function. In the XPS enabled case, a lock is
taken on the tx queue before flushing the queue.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
Signed-off-by: Antonio Fischetti <antonio.fischetti@intel.com>
Co-authored-by: Antonio Fischetti <antonio.fischetti@intel.com>
Signed-off-by: Markus Magnusson <markus.magnusson@ericsson.com>
Co-authored-by: Markus Magnusson <markus.magnusson@ericsson.com>
Acked-by: Eelco Chaudron <echaudro@redhat.com>
---
 lib/netdev-bsd.c      |  1 +
 lib/netdev-dpdk.c     | 52 ++++++++++++++++++++++++++++++++++++++++++++++-----
 lib/netdev-dummy.c    |  1 +
 lib/netdev-linux.c    |  1 +
 lib/netdev-provider.h |  8 ++++++++
 lib/netdev-vport.c    |  2 +-
 lib/netdev.c          |  9 +++++++++
 lib/netdev.h          |  1 +
 8 files changed, 69 insertions(+), 6 deletions(-)

Comments

Ilya Maximets Aug. 9, 2017, 8:21 a.m. | #1
Not a full review.
Comments inline.

> Add netdev_txq_flush(), that flush packets on a queue. This is needed
> to transmit packets on the intermediate queue.
> 
> This commit also implements netdev_dpdk_txq_flush() function. If there
> are any packets waiting in the queue, they are transmitted instantly
> using the rte_eth_tx_burst function. In XPS enabled case, lock is
> taken on the tx queue before flushing the queue.
> 
> Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
> Signed-off-by: Antonio Fischetti <antonio.fischetti at intel.com>
> Co-authored-by: Antonio Fischetti <antonio.fischetti at intel.com>
> Signed-off-by: Markus Magnusson <markus.magnusson at ericsson.com>
> Co-authored-by: Markus Magnusson <markus.magnusson at ericsson.com>
> Acked-by: Eelco Chaudron <echaudro at redhat.com>
> ---
>  lib/netdev-bsd.c      |  1 +
>  lib/netdev-dpdk.c     | 52 ++++++++++++++++++++++++++++++++++++++++++++++-----
>  lib/netdev-dummy.c    |  1 +
>  lib/netdev-linux.c    |  1 +
>  lib/netdev-provider.h |  8 ++++++++
>  lib/netdev-vport.c    |  2 +-
>  lib/netdev.c          |  9 +++++++++
>  lib/netdev.h          |  1 +
>  8 files changed, 69 insertions(+), 6 deletions(-)
> 
> diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
> index 8a4cdb3..75483ad 100644
> --- a/lib/netdev-bsd.c
> +++ b/lib/netdev-bsd.c
> @@ -1546,6 +1546,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off,
>      netdev_bsd_rxq_recv,                             \
>      netdev_bsd_rxq_wait,                             \
>      netdev_bsd_rxq_drain,                            \
> +    NULL,                                            \
>                                                       \
>      NO_OFFLOAD_API                                   \
>  }
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 1d82bca..50d6b29 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -313,6 +313,11 @@ struct dpdk_mp {
>      struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
>  };
>  
> +/* Queue 'INTERIM_QUEUE_BURST_THRESHOLD' packets before transmitting.
> + * Defaults to 'NETDEV_MAX_BURST'(32) packets.
> + */
> +#define INTERIM_QUEUE_BURST_THRESHOLD NETDEV_MAX_BURST
> +
>  /* There should be one 'struct dpdk_tx_queue' created for
>   * each cpu core. */
>  struct dpdk_tx_queue {
> @@ -322,6 +327,12 @@ struct dpdk_tx_queue {
>                                      * pmd threads (see 'concurrent_txq'). */
>      int map;                       /* Mapping of configured vhost-user queues
>                                      * to enabled by guest. */
> +    int dpdk_pkt_cnt;              /* Number of buffered packets waiting to
> +                                      be sent on DPDK tx queue. */
> +    struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
> +                                   /* Intermediate queue where packets can
> +                                    * be buffered to amortize the cost of MMIO
> +                                    * writes. */
>  };
>  
>  /* dpdk has no way to remove dpdk ring ethernet devices
> @@ -1931,6 +1942,32 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>      }
>  }
>  
> +/* Flush tx queues.
> + * This is done periodically to empty the intermediate queue in case of
> + * fewer packets (< INTERIM_QUEUE_BURST_THRESHOLD) buffered in the queue.
> + */
> +static int
> +netdev_dpdk_txq_flush(struct netdev *netdev, int qid , bool concurrent_txq)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
> +
> +    if (OVS_LIKELY(txq->dpdk_pkt_cnt)) {
> +        if (OVS_UNLIKELY(concurrent_txq)) {
> +            qid = qid % dev->up.n_txq;
> +            rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +        }
> +
> +        netdev_dpdk_eth_tx_burst(dev, qid, txq->dpdk_burst_pkts,
> +                                 txq->dpdk_pkt_cnt);

The queue used for send and the locked one are different because you're
remapping the qid before taking the spinlock.

I suspect that we're always using right queue numbers in current
implementation of dpif-netdev, but I need to recheck to be sure.
Anyway, the logic of this function is completely broken. 

> +
> +        if (OVS_UNLIKELY(concurrent_txq)) {
> +            rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> +        }
> +    }
> +    return 0;
> +}
> +
>  static int
>  netdev_dpdk_eth_send(struct netdev *netdev, int qid,
>                       struct dp_packet_batch *batch, bool may_steal,
> @@ -3313,7 +3350,7 @@ unlock:
>                            SET_CONFIG, SET_TX_MULTIQ, SEND,    \
>                            GET_CARRIER, GET_STATS,             \
>                            GET_FEATURES, GET_STATUS,           \
> -                          RECONFIGURE, RXQ_RECV)              \
> +                          RECONFIGURE, RXQ_RECV, TXQ_FLUSH)   \
>  {                                                             \
>      NAME,                                                     \
>      true,                       /* is_pmd */                  \
> @@ -3381,6 +3418,7 @@ unlock:
>      RXQ_RECV,                                                 \
>      NULL,                       /* rx_wait */                 \
>      NULL,                       /* rxq_drain */               \
> +    TXQ_FLUSH,                  /* txq_flush */               \
>      NO_OFFLOAD_API                                            \
>  }
>  
> @@ -3398,7 +3436,8 @@ static const struct netdev_class dpdk_class =
>          netdev_dpdk_get_features,
>          netdev_dpdk_get_status,
>          netdev_dpdk_reconfigure,
> -        netdev_dpdk_rxq_recv);
> +        netdev_dpdk_rxq_recv,
> +        netdev_dpdk_txq_flush);
>  
>  static const struct netdev_class dpdk_ring_class =
>      NETDEV_DPDK_CLASS(
> @@ -3414,7 +3453,8 @@ static const struct netdev_class dpdk_ring_class =
>          netdev_dpdk_get_features,
>          netdev_dpdk_get_status,
>          netdev_dpdk_reconfigure,
> -        netdev_dpdk_rxq_recv);
> +        netdev_dpdk_rxq_recv,
> +        NULL);
>  
>  static const struct netdev_class dpdk_vhost_class =
>      NETDEV_DPDK_CLASS(
> @@ -3430,7 +3470,8 @@ static const struct netdev_class dpdk_vhost_class =
>          NULL,
>          NULL,
>          netdev_dpdk_vhost_reconfigure,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_vhost_rxq_recv,
> +        NULL);
>  static const struct netdev_class dpdk_vhost_client_class =
>      NETDEV_DPDK_CLASS(
>          "dpdkvhostuserclient",
> @@ -3445,7 +3486,8 @@ static const struct netdev_class dpdk_vhost_client_class =
>          NULL,
>          NULL,
>          netdev_dpdk_vhost_client_reconfigure,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_vhost_rxq_recv,
> +        NULL);
>  
>  void
>  netdev_dpdk_register(void)
> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
> index 752f157..86ec902 100644
> --- a/lib/netdev-dummy.c
> +++ b/lib/netdev-dummy.c
> @@ -1413,6 +1413,7 @@ netdev_dummy_update_flags(struct netdev *netdev_,
>      netdev_dummy_rxq_recv,                                      \
>      netdev_dummy_rxq_wait,                                      \
>      netdev_dummy_rxq_drain,                                     \
> +    NULL,                                                       \
>                                                                  \
>      NO_OFFLOAD_API                                              \
>  }
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 98820ed..992f887 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -2888,6 +2888,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
>      netdev_linux_rxq_recv,                                      \
>      netdev_linux_rxq_wait,                                      \
>      netdev_linux_rxq_drain,                                     \
> +    NULL,                                                       \
>                                                                  \
>      FLOW_OFFLOAD_API                                            \
>  }
> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
> index b3c57d5..9c47fdf 100644
> --- a/lib/netdev-provider.h
> +++ b/lib/netdev-provider.h
> @@ -347,6 +347,11 @@ struct netdev_class {
>       * If the function returns a non-zero value, some of the packets might have
>       * been sent anyway.
>       *
> +     * Some netdev provider - like in case of 'dpdk' - may buffer the batch
> +     * of packets into an intermediate queue.  Buffered packets shall be
> +     * transmitted when the packet count exceeds a threshold (or) by the
> +     * periodic call to the flush function.
> +     *
>       * If 'may_steal' is false, the caller retains ownership of all the
>       * packets.  If 'may_steal' is true, the caller transfers ownership of all
>       * the packets to the network device, regardless of success.
> @@ -788,6 +793,9 @@ struct netdev_class {
>      /* Discards all packets waiting to be received from 'rx'. */
>      int (*rxq_drain)(struct netdev_rxq *rx);
>  
> +    /* Flush all packets waiting to be sent on 'qid' queue. */
> +    int (*txq_flush)(struct netdev *netdev, int qid, bool concurrent_txq);
> +
>      /* ## -------------------------------- ## */
>      /* ## netdev flow offloading functions ## */
>      /* ## -------------------------------- ## */
> diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
> index 64a3ba3..3c5eacf 100644
> --- a/lib/netdev-vport.c
> +++ b/lib/netdev-vport.c
> @@ -951,10 +951,10 @@ netdev_vport_get_ifindex(const struct netdev *netdev_)
>      NULL,                   /* rx_recv */                   \
>      NULL,                   /* rx_wait */                   \
>      NULL,                   /* rx_drain */                  \
> +    NULL,                   /* tx_flush */                  \
>                                                              \
>      NETDEV_FLOW_OFFLOAD_API
>  
> -
>  #define TUNNEL_CLASS(NAME, DPIF_PORT, BUILD_HEADER, PUSH_HEADER, POP_HEADER,   \
>                       GET_IFINDEX)                                              \
>      { DPIF_PORT,                                                               \
> diff --git a/lib/netdev.c b/lib/netdev.c
> index 7e9896b..8250396 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -724,6 +724,15 @@ netdev_rxq_drain(struct netdev_rxq *rx)
>              : 0);
>  }
>  
> +/* Flush packets on the 'qid' queue. */
> +int
> +netdev_txq_flush(struct netdev *netdev, int qid, bool netdev_txq_flush)
> +{
> +    return (netdev->netdev_class->txq_flush
> +            ? netdev->netdev_class->txq_flush(netdev, qid, netdev_txq_flush)
> +            : EOPNOTSUPP);
> +}
> +
>  /* Configures the number of tx queues of 'netdev'. Returns 0 if successful,
>   * otherwise a positive errno value.
>   *
> diff --git a/lib/netdev.h b/lib/netdev.h
> index f8482f7..328a158 100644
> --- a/lib/netdev.h
> +++ b/lib/netdev.h
> @@ -183,6 +183,7 @@ int netdev_rxq_drain(struct netdev_rxq *);
>  int netdev_send(struct netdev *, int qid, struct dp_packet_batch *,
>                  bool may_steal, bool concurrent_txq);
>  void netdev_send_wait(struct netdev *, int qid);
> +int netdev_txq_flush(struct netdev *, int qid, bool concurrent_txq);
>  
>  /* Flow offloading. */
>  struct offload_info {
> -- 
> 2.4.11
Bodireddy, Bhanuprakash Aug. 9, 2017, 12:29 p.m. | #2
Hi Ilya,
>>
>> +/* Flush tx queues.
>> + * This is done periodically to empty the intermediate queue in case
>> +of
>> + * fewer packets (< INTERIM_QUEUE_BURST_THRESHOLD) buffered in the
>queue.
>> + */
>> +static int
>> +netdev_dpdk_txq_flush(struct netdev *netdev, int qid , bool
>> +concurrent_txq) {
>> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>> +    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>> +
>> +    if (OVS_LIKELY(txq->dpdk_pkt_cnt)) {
>> +        if (OVS_UNLIKELY(concurrent_txq)) {
>> +            qid = qid % dev->up.n_txq;
>> +            rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>> +        }
>> +
>> +        netdev_dpdk_eth_tx_burst(dev, qid, txq->dpdk_burst_pkts,
>> +                                 txq->dpdk_pkt_cnt);
>
>The queue used for send and the locked one are different because you're
>remapping the qid before taking the spinlock.

>I suspect that we're always using right queue numbers in current
>implementation of dpif-netdev, but I need to recheck to be sure.

I believe the case you are referring here is the XPS case ('dynamic_txqs' true).
When we have to flush the packets we retrieve the qid from the 'cached_tx_port->last_used_qid'
 that was initialized earlier by 'dpif_netdev_xps_get_tx_qid()'. The logic of remapping the qid and 
acquiring the spin lock in the above function is no different from current logic in master. Can you 
elaborate the specific case where this would break the functionality?

Please note that in 'dpif_netdev_xps_get_tx_qid' the qid can change, and so we did flush the queue.  

- Bhanuprakash. 

>Anyway, logic of this function completely broken.
>
Ilya Maximets Aug. 9, 2017, 12:45 p.m. | #3
On 09.08.2017 15:29, Bodireddy, Bhanuprakash wrote:
> Hi Ilya,
>>>
>>> +/* Flush tx queues.
>>> + * This is done periodically to empty the intermediate queue in case
>>> +of
>>> + * fewer packets (< INTERIM_QUEUE_BURST_THRESHOLD) buffered in the
>> queue.
>>> + */
>>> +static int
>>> +netdev_dpdk_txq_flush(struct netdev *netdev, int qid , bool
>>> +concurrent_txq) {
>>> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>> +    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>>> +
>>> +    if (OVS_LIKELY(txq->dpdk_pkt_cnt)) {
>>> +        if (OVS_UNLIKELY(concurrent_txq)) {
>>> +            qid = qid % dev->up.n_txq;
>>> +            rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>>> +        }
>>> +
>>> +        netdev_dpdk_eth_tx_burst(dev, qid, txq->dpdk_burst_pkts,
>>> +                                 txq->dpdk_pkt_cnt);
>>
>> The queue used for send and the locked one are different because you're
>> remapping the qid before taking the spinlock.
> 
>> I suspect that we're always using right queue numbers in current
>> implementation of dpif-netdev, but I need to recheck to be sure.
> 
> I believe the case you are referring here is the XPS case ('dynamic_txqs' true).
> When we have to flush the packets we retrieve the qid from the 'cached_tx_port->last_used_qid'
>  that was initialized earlier by 'dpif_netdev_xps_get_tx_qid()'. The logic of remapping the qid and 
> acquiring the spin lock in the above function is no different from current logic in master. Can you 
> elaborate the specific case where this would break the functionality?

Maybe my initial words are not fully correct, but below example shows what I tried to say.

1. dpif-netdev calls netdev_dpdk_txq_flush() with qid == 10;
2. txq = &dev->tx_q[10];  // Remember that 'txq' points to queue #10
3. if (txq->dpdk_pkt_cnt) ? true // Is there packets to send to queue #10?
4. qid = 10 % dev->up.n_txq; // Lets assume that dev->up.n_txq == 7
       ---> qid = 10 % 7 = 3
5. rte_spinlock_lock(&dev->tx_q[3].tx_lock); // Locking queue #3
6. netdev_dpdk_eth_tx_burst(dev, 3, txq->dpdk_burst_pkts, ..);
   --> sending to queue #3 packets enqueued for queue #10 ('txq' still points to queue #10)
   At this point queue #10 is not locked, so 'txq->dpdk_burst_pkts' is not protected
   from modifications, which could lead to wrong mempool refilling or driver crash.
   Also, you're trying to send not right packets to the queue.

   I mentioned that it looks like above scenario is impossible right now and
   qid will always be the same after truncating, but the logic is wrong anyway.

> 
> Please note that  in 'dpif_netdev_xps_get_tx_qid'  the qid can change and so we did flush the queue.  
> 
> - Bhanuprakash. 
> 
>> Anyway, logic of this function completely broken.
>>

Patch

diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
index 8a4cdb3..75483ad 100644
--- a/lib/netdev-bsd.c
+++ b/lib/netdev-bsd.c
@@ -1546,6 +1546,7 @@  netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off,
     netdev_bsd_rxq_recv,                             \
     netdev_bsd_rxq_wait,                             \
     netdev_bsd_rxq_drain,                            \
+    NULL,                                            \
                                                      \
     NO_OFFLOAD_API                                   \
 }
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1d82bca..50d6b29 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -313,6 +313,11 @@  struct dpdk_mp {
     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
 };
 
+/* Queue 'INTERIM_QUEUE_BURST_THRESHOLD' packets before transmitting.
+ * Defaults to 'NETDEV_MAX_BURST'(32) packets.
+ */
+#define INTERIM_QUEUE_BURST_THRESHOLD NETDEV_MAX_BURST
+
 /* There should be one 'struct dpdk_tx_queue' created for
  * each cpu core. */
 struct dpdk_tx_queue {
@@ -322,6 +327,12 @@  struct dpdk_tx_queue {
                                     * pmd threads (see 'concurrent_txq'). */
     int map;                       /* Mapping of configured vhost-user queues
                                     * to enabled by guest. */
+    int dpdk_pkt_cnt;              /* Number of buffered packets waiting to
+                                      be sent on DPDK tx queue. */
+    struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
+                                   /* Intermediate queue where packets can
+                                    * be buffered to amortize the cost of MMIO
+                                    * writes. */
 };
 
 /* dpdk has no way to remove dpdk ring ethernet devices
@@ -1931,6 +1942,32 @@  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
     }
 }
 
+/* Flush tx queues.
+ * This is done periodically to empty the intermediate queue in case of
+ * fewer packets (< INTERIM_QUEUE_BURST_THRESHOLD) buffered in the queue.
+ */
+static int
+netdev_dpdk_txq_flush(struct netdev *netdev, int qid , bool concurrent_txq)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
+
+    if (OVS_LIKELY(txq->dpdk_pkt_cnt)) {
+        if (OVS_UNLIKELY(concurrent_txq)) {
+            qid = qid % dev->up.n_txq;
+            rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
+        }
+
+        netdev_dpdk_eth_tx_burst(dev, qid, txq->dpdk_burst_pkts,
+                                 txq->dpdk_pkt_cnt);
+
+        if (OVS_UNLIKELY(concurrent_txq)) {
+            rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
+        }
+    }
+    return 0;
+}
+
 static int
 netdev_dpdk_eth_send(struct netdev *netdev, int qid,
                      struct dp_packet_batch *batch, bool may_steal,
@@ -3313,7 +3350,7 @@  unlock:
                           SET_CONFIG, SET_TX_MULTIQ, SEND,    \
                           GET_CARRIER, GET_STATS,             \
                           GET_FEATURES, GET_STATUS,           \
-                          RECONFIGURE, RXQ_RECV)              \
+                          RECONFIGURE, RXQ_RECV, TXQ_FLUSH)   \
 {                                                             \
     NAME,                                                     \
     true,                       /* is_pmd */                  \
@@ -3381,6 +3418,7 @@  unlock:
     RXQ_RECV,                                                 \
     NULL,                       /* rx_wait */                 \
     NULL,                       /* rxq_drain */               \
+    TXQ_FLUSH,                  /* txq_flush */               \
     NO_OFFLOAD_API                                            \
 }
 
@@ -3398,7 +3436,8 @@  static const struct netdev_class dpdk_class =
         netdev_dpdk_get_features,
         netdev_dpdk_get_status,
         netdev_dpdk_reconfigure,
-        netdev_dpdk_rxq_recv);
+        netdev_dpdk_rxq_recv,
+        netdev_dpdk_txq_flush);
 
 static const struct netdev_class dpdk_ring_class =
     NETDEV_DPDK_CLASS(
@@ -3414,7 +3453,8 @@  static const struct netdev_class dpdk_ring_class =
         netdev_dpdk_get_features,
         netdev_dpdk_get_status,
         netdev_dpdk_reconfigure,
-        netdev_dpdk_rxq_recv);
+        netdev_dpdk_rxq_recv,
+        NULL);
 
 static const struct netdev_class dpdk_vhost_class =
     NETDEV_DPDK_CLASS(
@@ -3430,7 +3470,8 @@  static const struct netdev_class dpdk_vhost_class =
         NULL,
         NULL,
         netdev_dpdk_vhost_reconfigure,
-        netdev_dpdk_vhost_rxq_recv);
+        netdev_dpdk_vhost_rxq_recv,
+        NULL);
 static const struct netdev_class dpdk_vhost_client_class =
     NETDEV_DPDK_CLASS(
         "dpdkvhostuserclient",
@@ -3445,7 +3486,8 @@  static const struct netdev_class dpdk_vhost_client_class =
         NULL,
         NULL,
         netdev_dpdk_vhost_client_reconfigure,
-        netdev_dpdk_vhost_rxq_recv);
+        netdev_dpdk_vhost_rxq_recv,
+        NULL);
 
 void
 netdev_dpdk_register(void)
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index 752f157..86ec902 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -1413,6 +1413,7 @@  netdev_dummy_update_flags(struct netdev *netdev_,
     netdev_dummy_rxq_recv,                                      \
     netdev_dummy_rxq_wait,                                      \
     netdev_dummy_rxq_drain,                                     \
+    NULL,                                                       \
                                                                 \
     NO_OFFLOAD_API                                              \
 }
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 98820ed..992f887 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -2888,6 +2888,7 @@  netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
     netdev_linux_rxq_recv,                                      \
     netdev_linux_rxq_wait,                                      \
     netdev_linux_rxq_drain,                                     \
+    NULL,                                                       \
                                                                 \
     FLOW_OFFLOAD_API                                            \
 }
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index b3c57d5..9c47fdf 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -347,6 +347,11 @@  struct netdev_class {
      * If the function returns a non-zero value, some of the packets might have
      * been sent anyway.
      *
+     * Some netdev provider - like in case of 'dpdk' - may buffer the batch
+     * of packets into an intermediate queue.  Buffered packets shall be
+     * transmitted when the packet count exceeds a threshold (or) by the
+     * periodic call to the flush function.
+     *
      * If 'may_steal' is false, the caller retains ownership of all the
      * packets.  If 'may_steal' is true, the caller transfers ownership of all
      * the packets to the network device, regardless of success.
@@ -788,6 +793,9 @@  struct netdev_class {
     /* Discards all packets waiting to be received from 'rx'. */
     int (*rxq_drain)(struct netdev_rxq *rx);
 
+    /* Flush all packets waiting to be sent on 'qid' queue. */
+    int (*txq_flush)(struct netdev *netdev, int qid, bool concurrent_txq);
+
     /* ## -------------------------------- ## */
     /* ## netdev flow offloading functions ## */
     /* ## -------------------------------- ## */
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index 64a3ba3..3c5eacf 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -951,10 +951,10 @@  netdev_vport_get_ifindex(const struct netdev *netdev_)
     NULL,                   /* rx_recv */                   \
     NULL,                   /* rx_wait */                   \
     NULL,                   /* rx_drain */                  \
+    NULL,                   /* tx_flush */                  \
                                                             \
     NETDEV_FLOW_OFFLOAD_API
 
-
 #define TUNNEL_CLASS(NAME, DPIF_PORT, BUILD_HEADER, PUSH_HEADER, POP_HEADER,   \
                      GET_IFINDEX)                                              \
     { DPIF_PORT,                                                               \
diff --git a/lib/netdev.c b/lib/netdev.c
index 7e9896b..8250396 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -724,6 +724,15 @@  netdev_rxq_drain(struct netdev_rxq *rx)
             : 0);
 }
 
+/* Flush packets on the 'qid' queue. */
+int
+netdev_txq_flush(struct netdev *netdev, int qid, bool netdev_txq_flush)
+{
+    return (netdev->netdev_class->txq_flush
+            ? netdev->netdev_class->txq_flush(netdev, qid, netdev_txq_flush)
+            : EOPNOTSUPP);
+}
+
 /* Configures the number of tx queues of 'netdev'. Returns 0 if successful,
  * otherwise a positive errno value.
  *
diff --git a/lib/netdev.h b/lib/netdev.h
index f8482f7..328a158 100644
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -183,6 +183,7 @@  int netdev_rxq_drain(struct netdev_rxq *);
 int netdev_send(struct netdev *, int qid, struct dp_packet_batch *,
                 bool may_steal, bool concurrent_txq);
 void netdev_send_wait(struct netdev *, int qid);
+int netdev_txq_flush(struct netdev *, int qid, bool concurrent_txq);
 
 /* Flow offloading. */
 struct offload_info {