[ovs-dev,RFC] dpif-netdev: Put idle PMD threads to sleep.

Message ID 20220304161132.22065-1-david.marchand@redhat.com
State Superseded
Series [ovs-dev,RFC] dpif-netdev: Put idle PMD threads to sleep.

Checks

Context                         Check   Description
ovsrobot/intel-ovs-compilation  fail    test: fail

Commit Message

David Marchand March 4, 2022, 4:11 p.m. UTC
This is an RFC about trying to save some power by putting the CPUs
running the PMD threads to sleep in some circumstances.
This is probably not a full solution, but I am sending it out to get
feedback.

DPDK ethdev ports support Rx interrupts, which are implemented by
providing a file descriptor per rxq on which an application might wait
until a packet is available.
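
For reference, here is a minimal sketch (not part of this patch, and only
the rte_eth_* calls are real DPDK API) of how such a file descriptor is
obtained and waited on, assuming the port was configured with
intr_conf.rxq = 1 and started:

    #include <poll.h>
    #include <rte_ethdev.h>

    /* Sketch only: block up to 'timeout_ms' until 'queue_id' of port
     * 'port_id' signals that packets may be pending. */
    static int
    wait_for_rx(uint16_t port_id, uint16_t queue_id, int timeout_ms)
    {
        int fd = rte_eth_dev_rx_intr_ctl_q_get_fd(port_id, queue_id);
        struct pollfd pfd = { .fd = fd, .events = POLLIN };
        int ret;

        if (fd < 0) {
            /* Rx interrupts are not usable for this queue. */
            return fd;
        }
        rte_eth_dev_rx_intr_enable(port_id, queue_id);
        ret = poll(&pfd, 1, timeout_ms);
        rte_eth_dev_rx_intr_disable(port_id, queue_id);
        return ret;
    }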

This patch uses those file descriptors and builds a naively simplified
NAPI-like mechanism into the PMD threads' main loop: a PMD thread starts
to sleep when no traffic has been detected for a period of time (in ms)
configured via a new dp parameter 'rx-idle-threshold' (default 0 ms,
i.e. never sleep).
PMD threads wake up for reloads, deferred Tx flushing and other
internal tasks.
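
For illustration only (the value below is just an example), the knob is
set like other datapath parameters, e.g.:

    $ ovs-vsctl set Open_vSwitch . other_config:rx-idle-threshold=50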

The DPDK ethdev API does not expose a capability reporting whether Rx
interrupts are available.
For this reason, port initialization and start are first attempted with
Rx interrupts enabled, with a fallback to the current behavior if this
fails.
A new .rxq_can_wait op is added so that the dpif layer can determine
whether a PMD thread may sleep.
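
Put together, the sleeping decision in the PMD main loop roughly looks
like the simplified sketch below (the real code in the patch also caps
the sleep with the next deferred flush / optimization deadline):

    /* Simplified sketch of what pmd_thread_main() ends up doing. */
    bool can_sleep = rx_idle_threshold != 0;

    for (i = 0; i < poll_cnt; i++) {
        if (poll_list[i].rxq_enabled
            && !netdev_rxq_can_wait(poll_list[i].rxq->rx)) {
            can_sleep = false;
        }
    }
    if (can_sleep && pmd->ctx.now > empty_polling_start
                     + 1000 * rx_idle_threshold) {
        for (i = 0; i < poll_cnt; i++) {
            if (poll_list[i].rxq_enabled) {
                netdev_rxq_wait(poll_list[i].rxq->rx);
            }
        }
        seq_wait(pmd->reload_seq, pmd->last_reload_seq);
        poll_block();
    }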

For vhost support, because queues are activated dynamically, a new seq
is added per device so that a PMD thread may sleep while waiting for
vrings to get enabled.
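
Schematically, the interaction between the vhost callback and the PMD
thread is (simplified from the patch below):

    /* vring_state_changed() (vhost library thread). */
    dev->vhost_rxq[qid].pending_fd = enable ? vring.kickfd : -1;
    seq_change(dev->vhost_state_seq);

    /* netdev_dpdk_vhost_rxq_wait() (PMD thread). */
    seq_wait(dev->vhost_state_seq, seq_read(dev->vhost_state_seq));
    if (vid >= 0 && vhost_rxq->pending_fd >= 0) {
        poll_fd_wait(vhost_rxq->pending_fd, POLLIN);
        rte_vhost_enable_guest_notification(vid, qid, true);
    }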

Signed-off-by: David Marchand <david.marchand@redhat.com>
---
 lib/dpif-netdev.c     |  86 +++++++++++++++++-
 lib/netdev-dpdk.c     | 199 +++++++++++++++++++++++++++++++++++-------
 lib/netdev-provider.h |   8 ++
 lib/netdev.c          |   9 ++
 lib/netdev.h          |   1 +
 5 files changed, 272 insertions(+), 31 deletions(-)

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 9f35713ef5..713d29aa69 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -268,6 +268,9 @@  struct dp_netdev {
     /* The time that a packet can wait in output batch for sending. */
     atomic_uint32_t tx_flush_interval;
 
+    /* The time before PMD threads sleep when receiving no traffic. */
+    atomic_uint32_t rx_idle_threshold;
+
     /* Meters. */
     struct ovs_mutex meters_lock;
     struct cmap meters OVS_GUARDED;
@@ -4924,6 +4927,22 @@  dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
     bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false);
 
     set_pmd_auto_lb(dp, autolb_state, log_autolb);
+
+    uint32_t rx_idle_threshold = smap_get_int(other_config,
+                                              "rx-idle-threshold", 0);
+    uint32_t cur_rx_idle_threshold;
+
+    atomic_read_relaxed(&dp->rx_idle_threshold, &cur_rx_idle_threshold);
+    if (cur_rx_idle_threshold != rx_idle_threshold) {
+        if (rx_idle_threshold != 0) {
+            VLOG_INFO("PMD will sleep after %"PRIu32" ms of empty polling.",
+                      rx_idle_threshold);
+        } else {
+            VLOG_INFO("PMD will not sleep.");
+        }
+        atomic_store_relaxed(&dp->rx_idle_threshold, rx_idle_threshold);
+    }
+
     return 0;
 }
 
@@ -5269,6 +5288,24 @@  dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
     return output_cnt;
 }
 
+static long long int
+dp_netdev_pmd_next_flush(struct dp_netdev_pmd_thread *pmd)
+{
+    long long int next_flush = LLONG_MAX;
+
+    if (pmd->n_output_batches) {
+        struct tx_port *p;
+
+        HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
+            if (!dp_packet_batch_is_empty(&p->output_pkts)
+                && next_flush > p->flush_time) {
+                next_flush = p->flush_time;
+            }
+        }
+    }
+    return next_flush;
+}
+
 static int
 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
                                    bool force)
@@ -6851,10 +6888,12 @@  pmd_thread_main(void *f_)
     struct dp_netdev_pmd_thread *pmd = f_;
     struct pmd_perf_stats *s = &pmd->perf_stats;
     unsigned int lc = 0;
+    long long int empty_polling_start;
     struct polled_queue *poll_list;
     bool wait_for_reload = false;
     bool dpdk_attached;
     bool reload_tx_qid;
+    bool rxq_can_wait;
     bool exiting;
     bool reload;
     int poll_cnt;
@@ -6873,6 +6912,8 @@  pmd_thread_main(void *f_)
 
 reload:
     atomic_count_init(&pmd->pmd_overloaded, 0);
+    rxq_can_wait = false;
+    empty_polling_start = LLONG_MAX;
 
     if (!dpdk_attached) {
         dpdk_attached = dpdk_attach_thread(pmd->core_id);
@@ -6944,11 +6985,50 @@  reload:
         }
 
         if (!rx_packets) {
+            uint32_t rx_idle_threshold;
+
             /* We didn't receive anything in the process loop.
              * Check if we need to send something.
              * There was no time updates on current iteration. */
             pmd_thread_ctx_time_update(pmd);
             tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
+            atomic_read_relaxed(&pmd->dp->rx_idle_threshold,
+                                &rx_idle_threshold);
+
+            if (rxq_can_wait && rx_idle_threshold) {
+                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+
+                if (pmd->ctx.now <= empty_polling_start) {
+                    empty_polling_start = pmd->ctx.now;
+                } else if (pmd->ctx.now > empty_polling_start
+                                 + 1000 * rx_idle_threshold) {
+                    long long int next_wakeup;
+
+                    VLOG_DBG_RL(&rl, "No packet in last %lld ms, sleeping.",
+                                (pmd->ctx.now - empty_polling_start) / 1000);
+                    for (i = 0; i < poll_cnt; i++) {
+                        if (!poll_list[i].rxq_enabled) {
+                            continue;
+                        }
+                        netdev_rxq_wait(poll_list[i].rxq->rx);
+                    }
+                    next_wakeup = dp_netdev_pmd_next_flush(pmd);
+                    next_wakeup = MIN(next_wakeup, pmd->next_cycle_store);
+                    next_wakeup = MIN(next_wakeup, pmd->next_optimization);
+                    /* ms granularity is not precise enough and the PMD might
+                     * end up on a busy loop of calls to poll() with a 0ms
+                     * timeout. Let's wait for one more ms. */
+                    next_wakeup = next_wakeup / 1000 + 1;
+                    poll_timer_wait_until(next_wakeup);
+                    seq_wait(pmd->reload_seq, pmd->last_reload_seq);
+                    poll_block();
+                    if (time_usec() >= next_wakeup) {
+                        lc = 1024;
+                    }
+                }
+            }
+        } else {
+            empty_polling_start = LLONG_MAX;
         }
 
         /* Do RCU synchronization at fixed interval.  This ensures that
@@ -6961,7 +7041,7 @@  reload:
             }
         }
 
-        if (lc++ > 1024) {
+        if (lc++ >= 1024) {
             lc = 0;
 
             coverage_try_clear();
@@ -6972,6 +7052,7 @@  reload:
                     pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
             }
 
+            rxq_can_wait = true;
             for (i = 0; i < poll_cnt; i++) {
                 uint64_t current_seq =
                          netdev_get_change_seq(poll_list[i].rxq->port->netdev);
@@ -6980,6 +7061,9 @@  reload:
                     poll_list[i].rxq_enabled =
                                  netdev_rxq_enabled(poll_list[i].rxq->rx);
                 }
+                if (poll_list[i].rxq_enabled) {
+                    rxq_can_wait &= netdev_rxq_can_wait(poll_list[i].rxq->rx);
+                }
             }
         }
 
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 667d758b23..8495f1a6c8 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -54,6 +54,7 @@ 
 #include "openvswitch/list.h"
 #include "openvswitch/match.h"
 #include "openvswitch/ofp-print.h"
+#include "openvswitch/poll-loop.h"
 #include "openvswitch/shash.h"
 #include "openvswitch/vlog.h"
 #include "ovs-numa.h"
@@ -473,8 +474,9 @@  struct netdev_dpdk {
         uint32_t policer_rate;
         uint32_t policer_burst;
 
-        /* Array of vhost rxq states, see vring_state_changed. */
-        bool *vhost_rxq_enabled;
+        /* See vring_state_changed. */
+        struct netdev_dpdk_vhost_rxq *vhost_rxq;
+        struct seq *vhost_state_seq;
     );
 
     PADDED_MEMBERS(CACHE_LINE_SIZE,
@@ -518,6 +520,9 @@  struct netdev_dpdk {
         bool requested_lsc_interrupt_mode;
         bool lsc_interrupt_mode;
 
+        /* Property for rx interrupt support. */
+        bool rxq_interrupt_mode;
+
         /* VF configuration. */
         struct eth_addr requested_hwaddr;
     );
@@ -534,6 +539,14 @@  struct netdev_dpdk {
 struct netdev_rxq_dpdk {
     struct netdev_rxq up;
     dpdk_port_t port_id;
+    bool waiting;
+    int fd;
+};
+
+struct netdev_dpdk_vhost_rxq {
+    bool enabled;
+    int pending_fd;
+    uint64_t last_change_seq;
 };
 
 static void netdev_dpdk_destruct(struct netdev *netdev);
@@ -974,6 +987,7 @@  dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
     }
 
     conf.intr_conf.lsc = dev->lsc_interrupt_mode;
+    conf.intr_conf.rxq = dev->rxq_interrupt_mode;
 
     if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
         conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_CHECKSUM;
@@ -1007,7 +1021,8 @@  dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
      * and request less queues */
     while (n_rxq && n_txq) {
         if (diag) {
-            VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
+            VLOG_INFO("Retrying setup with (rxq:%d txq:%d%s)", n_rxq, n_txq,
+                      conf.intr_conf.rxq != 0 ? ", rx interrupts" : "");
         }
 
         diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
@@ -1145,8 +1160,20 @@  dpdk_eth_dev_init(struct netdev_dpdk *dev)
     n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
     n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
 
+    dev->rxq_interrupt_mode = true;
+
+retry_without_rx_interrupt:
     diag = dpdk_eth_dev_port_config(dev, n_rxq, n_txq);
     if (diag) {
+        if (dev->rxq_interrupt_mode) {
+            VLOG_INFO("Interface %s(rxq:%d txq:%d lsc interrupt mode:%s) "
+                     "configure error with rx interrupts: %s",
+                     dev->up.name, n_rxq, n_txq,
+                     dev->lsc_interrupt_mode ? "true" : "false",
+                     rte_strerror(-diag));
+            dev->rxq_interrupt_mode = false;
+            goto retry_without_rx_interrupt;
+        }
         VLOG_ERR("Interface %s(rxq:%d txq:%d lsc interrupt mode:%s) "
                  "configure error: %s",
                  dev->up.name, n_rxq, n_txq,
@@ -1157,6 +1184,12 @@  dpdk_eth_dev_init(struct netdev_dpdk *dev)
 
     diag = rte_eth_dev_start(dev->port_id);
     if (diag) {
+        if (dev->rxq_interrupt_mode) {
+            VLOG_INFO("Interface %s start error with rx interrupts: %s",
+                     dev->up.name, rte_strerror(-diag));
+            dev->rxq_interrupt_mode = false;
+            goto retry_without_rx_interrupt;
+        }
         VLOG_ERR("Interface %s start error: %s", dev->up.name,
                  rte_strerror(-diag));
         return -diag;
@@ -1283,6 +1316,22 @@  common_construct(struct netdev *netdev, dpdk_port_t port_no,
     return 0;
 }
 
+static struct netdev_dpdk_vhost_rxq *
+netdev_dpdk_vhost_alloc_rxq(void)
+{
+    const unsigned int n_rxq = OVS_VHOST_MAX_QUEUE_NUM;
+    struct netdev_dpdk_vhost_rxq *rxqs;
+
+    rxqs = dpdk_rte_mzalloc(n_rxq * sizeof *rxqs);
+    if (rxqs) {
+        for (unsigned int i = 0; i < n_rxq; i++) {
+            rxqs[i].pending_fd = -1;
+        }
+    }
+
+    return rxqs;
+}
+
 static int
 vhost_common_construct(struct netdev *netdev)
     OVS_REQUIRES(dpdk_mutex)
@@ -1290,17 +1339,17 @@  vhost_common_construct(struct netdev *netdev)
     int socket_id = rte_lcore_to_socket_id(rte_get_main_lcore());
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 
-    dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
-                                              sizeof *dev->vhost_rxq_enabled);
-    if (!dev->vhost_rxq_enabled) {
+    dev->vhost_rxq = netdev_dpdk_vhost_alloc_rxq();
+    if (!dev->vhost_rxq) {
         return ENOMEM;
     }
     dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
     if (!dev->tx_q) {
-        rte_free(dev->vhost_rxq_enabled);
+        rte_free(dev->vhost_rxq);
+        dev->vhost_rxq = NULL;
         return ENOMEM;
     }
-
+    dev->vhost_state_seq = seq_create();
     atomic_init(&dev->vhost_tx_retries_max, VHOST_ENQ_RETRY_DEF);
 
     return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
@@ -1533,7 +1582,10 @@  netdev_dpdk_vhost_destruct(struct netdev *netdev)
 
     vhost_id = dev->vhost_id;
     dev->vhost_id = NULL;
-    rte_free(dev->vhost_rxq_enabled);
+    seq_destroy(dev->vhost_state_seq);
+    dev->vhost_state_seq = NULL;
+    rte_free(dev->vhost_rxq);
+    dev->vhost_rxq = NULL;
 
     common_destruct(dev);
 
@@ -1705,6 +1757,8 @@  netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
         }
         smap_add(args, "lsc_interrupt_mode",
                  dev->lsc_interrupt_mode ? "true" : "false");
+        smap_add(args, "rxq_interrupt_mode",
+                 dev->rxq_interrupt_mode ? "true" : "false");
 
         if (dpdk_port_is_representor(dev)) {
             smap_add_format(args, "dpdk-vf-mac", ETH_ADDR_FMT,
@@ -2120,6 +2174,15 @@  netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
 
     ovs_mutex_lock(&dev->mutex);
     rx->port_id = dev->port_id;
+    rx->waiting = false;
+    rx->fd = -1;
+    if (dev->type == DPDK_DEV_ETH && dev->rxq_interrupt_mode) {
+        rx->fd = rte_eth_dev_rx_intr_ctl_q_get_fd(rx->port_id, rxq->queue_id);
+        if (rx->fd < 0) {
+            VLOG_WARN("Rx interrupts are broken for device %s rxq %d.",
+                      dev->up.name, rxq->queue_id);
+        }
+    }
     ovs_mutex_unlock(&dev->mutex);
 
     return 0;
@@ -2379,6 +2442,7 @@  static int
 netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
                            struct dp_packet_batch *batch, int *qfill)
 {
+    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
     struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
     struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
     uint16_t nb_rx = 0;
@@ -2391,6 +2455,18 @@  netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
         return EAGAIN;
     }
 
+    if (OVS_UNLIKELY(rx->waiting)) {
+        ssize_t retval;
+
+        rx->waiting = false;
+        rte_vhost_enable_guest_notification(vid, qid, false);
+        do {
+            uint64_t buf;
+
+            retval = read(rx->fd, &buf, sizeof buf);
+        } while (retval < 0 && errno == EINTR);
+    }
+
     nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
                                     (struct rte_mbuf **) batch->packets,
                                     NETDEV_MAX_BURST);
@@ -2432,7 +2508,7 @@  netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
 
-    return dev->vhost_rxq_enabled[rxq->queue_id];
+    return dev->vhost_rxq[rxq->queue_id].enabled;
 }
 
 static int
@@ -2449,6 +2525,18 @@  netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
         return EAGAIN;
     }
 
+    if (OVS_UNLIKELY(rx->waiting)) {
+        ssize_t retval;
+
+        rx->waiting = false;
+        rte_eth_dev_rx_intr_disable(rx->port_id, rxq->queue_id);
+        do {
+            uint64_t buf;
+
+            retval = read(rx->fd, &buf, sizeof buf);
+        } while (retval < 0 && errno == EINTR);
+    }
+
     nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
                              (struct rte_mbuf **) batch->packets,
                              NETDEV_MAX_BURST);
@@ -2486,6 +2574,44 @@  netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
     return 0;
 }
 
+static bool
+netdev_dpdk_rxq_can_wait(struct netdev_rxq *rxq)
+{
+    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
+
+    return rx->fd >= 0;
+}
+
+static void
+netdev_dpdk_rxq_wait(struct netdev_rxq *rxq)
+{
+    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
+
+    poll_fd_wait(rx->fd, POLLIN);
+    rx->waiting = true;
+    rte_eth_dev_rx_intr_enable(rx->port_id, rxq->queue_id);
+}
+
+static void
+netdev_dpdk_vhost_rxq_wait(struct netdev_rxq *rxq)
+{
+    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
+    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
+    struct netdev_dpdk_vhost_rxq *vhost_rxq = &dev->vhost_rxq[rxq->queue_id];
+    int vid = netdev_dpdk_get_vid(dev);
+
+    vhost_rxq->last_change_seq = seq_read(dev->vhost_state_seq);
+    seq_wait(dev->vhost_state_seq, vhost_rxq->last_change_seq);
+    rx->fd = vhost_rxq->pending_fd;
+    if (vid >= 0 && rx->fd >= 0) {
+        int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
+
+        poll_fd_wait(rx->fd, POLLIN);
+        rx->waiting = true;
+        rte_vhost_enable_guest_notification(vid, qid, true);
+    }
+}
+
 static inline int
 netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                     int cnt, bool should_steal)
@@ -4045,8 +4171,9 @@  destroy_device(int vid)
             ovs_mutex_lock(&dev->mutex);
             dev->vhost_reconfigured = false;
             ovsrcu_index_set(&dev->vid, -1);
-            memset(dev->vhost_rxq_enabled, 0,
-                   dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
+            for (int i = 0; i < dev->up.n_rxq; i++) {
+                dev->vhost_rxq[i].enabled = false;
+            }
             netdev_dpdk_txq_map_clear(dev);
 
             netdev_change_seq_changed(&dev->up);
@@ -4091,10 +4218,21 @@  vring_state_changed(int vid, uint16_t queue_id, int enable)
         ovs_mutex_lock(&dev->mutex);
         if (nullable_string_is_equal(ifname, dev->vhost_id)) {
             if (is_rx) {
-                bool old_state = dev->vhost_rxq_enabled[qid];
+                bool old_state = dev->vhost_rxq[qid].enabled;
+                int old_fd = dev->vhost_rxq[qid].pending_fd;
+
+                dev->vhost_rxq[qid].enabled = enable != 0;
+                if (dev->vhost_rxq[qid].enabled) {
+                    struct rte_vhost_vring vring;
 
-                dev->vhost_rxq_enabled[qid] = enable != 0;
-                if (old_state != dev->vhost_rxq_enabled[qid]) {
+                    rte_vhost_get_vhost_vring(vid, queue_id, &vring);
+                    dev->vhost_rxq[qid].pending_fd = vring.kickfd;
+                } else {
+                    dev->vhost_rxq[qid].pending_fd = -1;
+                }
+                if (old_state != dev->vhost_rxq[qid].enabled
+                    || old_fd != dev->vhost_rxq[qid].pending_fd) {
+                    seq_change(dev->vhost_state_seq);
                     netdev_change_seq_changed(&dev->up);
                 }
             } else {
@@ -5030,7 +5168,7 @@  dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
 
     /* Always keep RX queue 0 enabled for implementations that won't
      * report vring states. */
-    dev->vhost_rxq_enabled[0] = true;
+    dev->vhost_rxq[0].enabled = true;
 
     /* Enable TX queue 0 by default if it wasn't disabled. */
     if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
@@ -5414,24 +5552,23 @@  netdev_dpdk_rte_flow_tunnel_item_release(struct netdev *netdev,
     .rxq_destruct = netdev_dpdk_rxq_destruct,               \
     .rxq_dealloc = netdev_dpdk_rxq_dealloc
 
-#define NETDEV_DPDK_CLASS_BASE                          \
-    NETDEV_DPDK_CLASS_COMMON,                           \
-    .init = netdev_dpdk_class_init,                     \
-    .destruct = netdev_dpdk_destruct,                   \
-    .set_tx_multiq = netdev_dpdk_set_tx_multiq,         \
-    .get_carrier = netdev_dpdk_get_carrier,             \
-    .get_stats = netdev_dpdk_get_stats,                 \
-    .get_custom_stats = netdev_dpdk_get_custom_stats,   \
-    .get_features = netdev_dpdk_get_features,           \
-    .get_status = netdev_dpdk_get_status,               \
-    .reconfigure = netdev_dpdk_reconfigure,             \
-    .rxq_recv = netdev_dpdk_rxq_recv
-
 static const struct netdev_class dpdk_class = {
     .type = "dpdk",
-    NETDEV_DPDK_CLASS_BASE,
+    .init = netdev_dpdk_class_init,
+    NETDEV_DPDK_CLASS_COMMON,
     .construct = netdev_dpdk_construct,
+    .destruct = netdev_dpdk_destruct,
     .set_config = netdev_dpdk_set_config,
+    .reconfigure = netdev_dpdk_reconfigure,
+    .set_tx_multiq = netdev_dpdk_set_tx_multiq,
+    .get_carrier = netdev_dpdk_get_carrier,
+    .get_stats = netdev_dpdk_get_stats,
+    .get_custom_stats = netdev_dpdk_get_custom_stats,
+    .get_features = netdev_dpdk_get_features,
+    .get_status = netdev_dpdk_get_status,
+    .rxq_recv = netdev_dpdk_rxq_recv,
+    .rxq_wait = netdev_dpdk_rxq_wait,
+    .rxq_can_wait = netdev_dpdk_rxq_can_wait,
     .send = netdev_dpdk_eth_send,
 };
 
@@ -5448,6 +5585,7 @@  static const struct netdev_class dpdk_vhost_class = {
     .reconfigure = netdev_dpdk_vhost_reconfigure,
     .rxq_recv = netdev_dpdk_vhost_rxq_recv,
     .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
+    .rxq_wait = netdev_dpdk_vhost_rxq_wait,
 };
 
 static const struct netdev_class dpdk_vhost_client_class = {
@@ -5464,6 +5602,7 @@  static const struct netdev_class dpdk_vhost_client_class = {
     .reconfigure = netdev_dpdk_vhost_client_reconfigure,
     .rxq_recv = netdev_dpdk_vhost_rxq_recv,
     .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
+    .rxq_wait = netdev_dpdk_vhost_rxq_wait,
 };
 
 void
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index b5420947d0..f9b913d5aa 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -831,6 +831,14 @@  struct netdev_class {
      * netdev_rxq_recv() on 'rx'. */
     void (*rxq_wait)(struct netdev_rxq *rx);
 
+    /* For some netdev (DPDK), marked with .is_pmd = true, there might be
+     * cases where a PMD thread being able to call netdev_rxq_wait() is a
+     * wanted feature.
+     * Yet, the netdev might lack a way to know this feature is available
+     * until runtime.
+     */
+    bool (*rxq_can_wait)(struct netdev_rxq *rx);
+
     /* Discards all packets waiting to be received from 'rx'. */
     int (*rxq_drain)(struct netdev_rxq *rx);
 
diff --git a/lib/netdev.c b/lib/netdev.c
index 8305f6c427..ce656f40f9 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -733,6 +733,15 @@  netdev_rxq_recv(struct netdev_rxq *rx, struct dp_packet_batch *batch,
     return retval;
 }
 
+bool
+netdev_rxq_can_wait(struct netdev_rxq *rx)
+{
+    return rx->netdev->netdev_class->rxq_wait &&
+        (rx->netdev->netdev_class->rxq_can_wait
+         ? rx->netdev->netdev_class->rxq_can_wait(rx)
+         : true);
+}
+
 /* Arranges for poll_block() to wake up when a packet is ready to be received
  * on 'rx'. */
 void
diff --git a/lib/netdev.h b/lib/netdev.h
index acf174927d..27d3d19846 100644
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -193,6 +193,7 @@  int netdev_rxq_get_queue_id(const struct netdev_rxq *);
 
 int netdev_rxq_recv(struct netdev_rxq *rx, struct dp_packet_batch *,
                     int *qfill);
+bool netdev_rxq_can_wait(struct netdev_rxq *);
 void netdev_rxq_wait(struct netdev_rxq *);
 int netdev_rxq_drain(struct netdev_rxq *);