[ovs-dev,v3,2/3] dpif-netdev: Detailed performance stats for PMDs

Message ID CFF8EF42F1132E4CBE2BF0AB6C21C58D8CECD207@ESESSMB107.ericsson.se
State New
Delegated to: Ian Stokes
Headers show
Series
  • Untitled series #14486
Related show

Commit Message

Jan Scheurich Nov. 21, 2017, 12:38 a.m.
This patch instruments the dpif-netdev datapath to record detailed
statistics of what is happening in every iteration of a PMD thread.

The covered metrics per iteration are:
  - cycles
  - packets
  - (rx) batches
  - packets/batch
  - max. vhostuser qlen
  - upcalls
  - cycles spent in upcalls

This raw recorded data is used threefold:

1. In histograms for each of the following metrics:
   - cycles/iteration (log.)
   - packets/iteration (log.)
   - cycles/packet
   - packets/batch
   - max. vhostuser qlen (log.)
   - upcalls
   - cycles/upcall (log)
   The histogram bins are divided either linearly or logarithmically.

2. A cyclic history of the above statistics for 1024 iterations

3. A cyclic history of the cumulative/average values per millisecond
   wall clock for the last 1024 milliseconds:
   - number of iterations
   - avg. cycles/iteration
   - packets (Kpps)
   - avg. packets/batch
   - avg. max vhost qlen
   - upcalls
   - avg. cycles/upcall

The gathered performance statistics can be printed at any time with the
new CLI command

ovs-appctl dpif-netdev/pmd-perf-show [-nh] [-it iter_len] [-ms ms_len]
	[-pmd core] [dp]

The options are

-nh:            Suppress the histograms
-it iter_len:   Display the last iter_len iteration stats
-ms ms_len:     Display the last ms_len millisecond stats
-pmd core:      Display only the stats of the specified pmd core

The performance statistics are reset with the existing
dpif-netdev/pmd-stats-clear command.

The output always contains the following global PMD statistics,
similar to the pmd-stats-show command:

Time: 15:24:55.270
Measurement duration: 1.008 s

pmd thread numa_id 0 core_id 1:

  Cycles:            2419034712  (2.40 GHz)
  Iterations:            572817  (1.76 us/it)
  - idle:                486808  (15.9 % cycles)
  - busy:                 86009  (84.1 % cycles)
  Packets:              2399607  (2381 Kpps, 848 cycles/pkt)
  Datapath passes:      3599415  (1.50 passes/pkt)
  - EMC hits:            336472  ( 9.3 %)
  - Megaflow hits:      3262943  (90.7 %, 1.00 subtbl lookups/hit)
  - Upcalls:                  0  ( 0.0 %, 0.0 us/upcall)
  - Lost upcalls:             0  ( 0.0 %)

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
---
 lib/dp-packet.h        |   2 +
 lib/dpif-netdev-perf.c | 306 ++++++++++++++++++++++++++++++++++++++++++++++++-
 lib/dpif-netdev-perf.h | 166 ++++++++++++++++++++++++++-
 lib/dpif-netdev.c      | 115 ++++++++++++++++++-
 lib/netdev-dpdk.c      |  28 ++++-
 lib/netdev-dpdk.h      |  14 +++
 ofproto/ofproto-dpif.c |   3 +-
 7 files changed, 623 insertions(+), 11 deletions(-)

Patch

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index b4b721c..7950247 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -695,8 +695,10 @@  enum { NETDEV_MAX_BURST = 32 }; /* Maximum number packets in a batch. */
 
 struct dp_packet_batch {
     size_t count;
+    size_t qfill; /* Number of packets in queue when reading rx burst. */
     bool trunc; /* true if the batch needs truncate. */
     struct dp_packet *packets[NETDEV_MAX_BURST];
+
 };
 
 static inline void
diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c
index 6a8f7c4..42d93a2 100644
--- a/lib/dpif-netdev-perf.c
+++ b/lib/dpif-netdev-perf.c
@@ -15,6 +15,7 @@ 
  */
 
 #include <config.h>
+#include <stdint.h>
 
 #ifdef DPDK_NETDEV
 #include <rte_cycles.h>
@@ -27,13 +28,304 @@ 
 
 VLOG_DEFINE_THIS_MODULE(pmd_perf);
 
+#ifdef DPDK_NETDEV
+static uint64_t
+get_tsc_hz(void)
+{
+    return rte_get_tsc_hz();
+}
+#else
+static uint64_t
+read_tsc(void)
+{
+    register uint64_t tsc asm("eax");
+    asm volatile (".byte 15, 49" : : : "eax", "edx");
+    return tsc;
+}
+
+static uint64_t
+get_tsc_hz(void)
+{
+    uint64_t tsc_start = read_tsc();
+    xsleep(1);
+    return read_tsc() - tsc_start;
+}
+#endif
+
+static void
+histogram_walls_set_lin(struct histogram *hist, uint32_t min, uint32_t max)
+{
+    int i;
+
+    ovs_assert(min < max);
+    for (i = 0; i < NUM_BINS-1; i++) {
+        hist->wall[i] = min + (i * (max - min)) / (NUM_BINS - 2);
+    }
+    hist->wall[NUM_BINS-1] = UINT32_MAX;
+}
+
+static void
+histogram_walls_set_log(struct histogram *hist, uint32_t min, uint32_t max)
+{
+    int i, start, bins, wall;
+    double log_min, log_max;
+
+    ovs_assert(min < max);
+    if (min > 0) {
+        log_min = log(min);
+        log_max = log(max);
+        start = 0;
+        bins = NUM_BINS - 1;
+    } else {
+        hist->wall[0] = 0;
+        log_min = log(1);
+        log_max = log(max);
+        start = 1;
+        bins = NUM_BINS - 2;
+    }
+    wall = start;
+    for (i = 0; i < bins; i++) {
+        /* Make sure each wall is monotonically increasing. */
+        wall = MAX(wall, exp(log_min + (i * (log_max - log_min)) / (bins-1)));
+        hist->wall[start + i] = wall++;
+    }
+    if (hist->wall[NUM_BINS-2] < max) {
+        hist->wall[NUM_BINS-2] = max;
+    }
+    hist->wall[NUM_BINS-1] = UINT32_MAX;
+}
+
+uint64_t
+histogram_samples(const struct histogram *hist)
+{
+    uint64_t samples = 0;
+
+    for (int i = 0; i < NUM_BINS; i++) {
+        samples += hist->bin[i];
+    }
+    return samples;
+}
+
+static void
+histogram_clear(struct histogram *hist)
+{
+    int i;
+
+    for (i = 0; i < NUM_BINS; i++) {
+        hist->bin[i] = 0;
+    }
+}
+
+static void
+history_init(struct history *h)
+{
+    memset(h, 0, sizeof(*h));
+}
+
 void
 pmd_perf_stats_init(struct pmd_perf_stats *s) {
-    memset(s, 0 , sizeof(*s));
+    /* The struct has been zeroed at allocation. */
+    histogram_walls_set_log(&s->cycles, 500, 24000000);
+    histogram_walls_set_log(&s->pkts, 0, 1000);
+    histogram_walls_set_lin(&s->cycles_per_pkt, 100, 30000);
+    histogram_walls_set_lin(&s->pkts_per_batch, 0, 32);
+    histogram_walls_set_lin(&s->upcalls, 0, 30);
+    histogram_walls_set_log(&s->cycles_per_upcall, 1000, 1000000);
+    histogram_walls_set_log(&s->max_vhost_qfill, 0, 512);
     s->start_ms = time_msec();
 }
 
 void
+pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
+                              double duration)
+{
+    uint64_t stats[PMD_N_STATS];
+    double us_per_cycle = 1000000.0 / get_tsc_hz();
+
+    if (duration == 0) {
+        return;
+    }
+
+    pmd_perf_read_counters(s, stats);
+    uint64_t tot_cycles = stats[PMD_CYCLES_ITER_IDLE] +
+                          stats[PMD_CYCLES_ITER_BUSY];
+    uint64_t packets = stats[PMD_STAT_RECV];
+    uint64_t passes = stats[PMD_STAT_RECV] +
+                      stats[PMD_STAT_RECIRC];
+    uint64_t upcalls = stats[PMD_STAT_MISS];
+    uint64_t upcall_cycles = stats[PMD_CYCLES_UPCALL];
+    uint64_t tot_iter = histogram_samples(&s->pkts);
+    uint64_t idle_iter = s->pkts.bin[0];
+
+    ds_put_format(str,
+            "  Cycles:          %12"PRIu64"  (%.2f GHz)\n"
+            "  Iterations:      %12"PRIu64"  (%.2f us/it)\n"
+            "  - idle:          %12"PRIu64"  (%4.1f %% cycles)\n"
+            "  - busy:          %12"PRIu64"  (%4.1f %% cycles)\n",
+            tot_cycles, (tot_cycles / duration) / 1E9,
+            tot_iter, tot_cycles * us_per_cycle / tot_iter,
+            idle_iter,
+            100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles,
+            tot_iter - idle_iter,
+            100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles);
+    if (packets > 0) {
+        ds_put_format(str,
+            "  Packets:         %12"PRIu64"  (%.0f Kpps, %.0f cycles/pkt)\n"
+            "  Datapath passes: %12"PRIu64"  (%.2f passes/pkt)\n"
+            "  - EMC hits:      %12"PRIu64"  (%4.1f %%)\n"
+            "  - Megaflow hits: %12"PRIu64"  (%4.1f %%, %.2f subtbl lookups/"
+                                                                     "hit)\n"
+            "  - Upcalls:       %12"PRIu64"  (%4.1f %%, %.1f us/upcall)\n"
+            "  - Lost upcalls:  %12"PRIu64"  (%4.1f %%)\n"
+            "\n",
+            packets, (packets / duration) / 1000,
+            1.0 * stats[PMD_CYCLES_ITER_BUSY] / packets,
+            passes, packets ? 1.0 * passes / packets : 0,
+            stats[PMD_STAT_EXACT_HIT],
+            100.0 * stats[PMD_STAT_EXACT_HIT] / passes,
+            stats[PMD_STAT_MASKED_HIT],
+            100.0 * stats[PMD_STAT_MASKED_HIT] / passes,
+            1.0 * stats[PMD_STAT_MASKED_LOOKUP] / stats[PMD_STAT_MASKED_HIT],
+            upcalls, 100.0 * upcalls / passes,
+            upcalls ? (upcall_cycles * us_per_cycle) / upcalls : 0,
+            stats[PMD_STAT_LOST],
+            100.0 * stats[PMD_STAT_LOST] / passes);
+    } else {
+        ds_put_format(str,
+                "  Packets:         %12"PRIu64"\n"
+                "\n",
+                0UL);
+    }
+}
+
+void
+pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s)
+{
+    int i;
+
+    ds_put_cstr(str, "Histograms\n");
+    ds_put_format(str,
+                  "   %-21s  %-21s  %-21s  %-21s  %-21s  %-21s  %-21s\n",
+                  "cycles/it", "packets/it", "cycles/pkt", "pkts/batch",
+                  "max vhost qlen", "upcalls/it", "cycles/upcall");
+    for (i = 0; i < NUM_BINS-1; i++) {
+        ds_put_format(str,
+            "   %-9d %-11"PRIu64"  %-9d %-11"PRIu64"  %-9d %-11"PRIu64
+            "  %-9d %-11"PRIu64"  %-9d %-11"PRIu64"  %-9d %-11"PRIu64
+            "  %-9d %-11"PRIu64"\n",
+            s->cycles.wall[i], s->cycles.bin[i],
+            s->pkts.wall[i],s->pkts.bin[i],
+            s->cycles_per_pkt.wall[i], s->cycles_per_pkt.bin[i],
+            s->pkts_per_batch.wall[i], s->pkts_per_batch.bin[i],
+            s->max_vhost_qfill.wall[i], s->max_vhost_qfill.bin[i],
+            s->upcalls.wall[i], s->upcalls.bin[i],
+            s->cycles_per_upcall.wall[i], s->cycles_per_upcall.bin[i]);
+    }
+    ds_put_format(str,
+                  "   %-9s %-11"PRIu64"  %-9s %-11"PRIu64"  %-9s %-11"PRIu64
+                  "  %-9s %-11"PRIu64"  %-9s %-11"PRIu64"  %-9s %-11"PRIu64
+                  "  %-9s %-11"PRIu64"\n",
+                  ">", s->cycles.bin[i],
+                  ">", s->pkts.bin[i],
+                  ">", s->cycles_per_pkt.bin[i],
+                  ">", s->pkts_per_batch.bin[i],
+                  ">", s->max_vhost_qfill.bin[i],
+                  ">", s->upcalls.bin[i],
+                  ">", s->cycles_per_upcall.bin[i]);
+    if (s->totals.iterations > 0) {
+        ds_put_cstr(str,
+                    "-----------------------------------------------------"
+                    "-----------------------------------------------------"
+                    "------------------------------------------------\n");
+        ds_put_format(str,
+                      "   %-21s  %-21s  %-21s  %-21s  %-21s  %-21s  %-21s\n",
+                      "cycles/it", "packets/it", "cycles/pkt", "pkts/batch",
+                      "vhost qlen", "upcalls/it", "cycles/upcall");
+        ds_put_format(str,
+                      "   %-21"PRIu64"  %-21.5f  %-21"PRIu64
+                      "  %-21.5f  %-21.5f  %-21.5f  %-21"PRIu32"\n",
+                      s->totals.cycles / s->totals.iterations,
+                      1.0 * s->totals.pkts / s->totals.iterations,
+                      s->totals.pkts
+                          ? s->totals.busy_cycles / s->totals.pkts : 0,
+                      s->totals.batches
+                          ? 1.0 * s->totals.pkts / s->totals.batches : 0,
+                      1.0 * s->totals.max_vhost_qfill / s->totals.iterations,
+                      1.0 * s->totals.upcalls / s->totals.iterations,
+                      s->totals.upcalls
+                          ? s->totals.upcall_cycles / s->totals.upcalls : 0);
+    }
+}
+
+void
+pmd_perf_format_iteration_history(struct ds *str, struct pmd_perf_stats *s,
+                                  int n_iter)
+{
+    struct iter_stats *is;
+    size_t index;
+    int i;
+
+    if (n_iter == 0) {
+        return;
+    }
+    ds_put_format(str, "   %-17s   %-10s   %-10s   %-10s   %-10s   "
+                  "%-10s   %-10s   %-10s\n",
+                  "tsc", "cycles", "packets", "cycles/pkt", "pkts/batch",
+                  "vhost qlen", "upcalls", "cycles/upcall");
+    for (i = 1; i <= n_iter; i++) {
+        index = (s->iterations.idx + HISTORY_LEN - i) % HISTORY_LEN;
+        is = &s->iterations.sample[index];
+        ds_put_format(str,
+                      "   %-17"PRIu64"   %-11"PRIu64"  %-11"PRIu32
+                      "  %-11"PRIu64"  %-11"PRIu32"  %-11"PRIu32
+                      "  %-11"PRIu32"  %-11"PRIu32"\n",
+                      is->timestamp,
+                      is->cycles,
+                      is->pkts,
+                      is->pkts ? is->cycles / is->pkts : 0,
+                      is->batches ? is->pkts / is->batches : 0,
+                      is->max_vhost_qfill,
+                      is->upcalls,
+                      is->upcalls ? is->upcall_cycles / is->upcalls : 0);
+    }
+}
+
+void
+pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s, int n_ms)
+{
+    struct iter_stats *is;
+    size_t index;
+    int i;
+
+    if (n_ms == 0) {
+        return;
+    }
+    ds_put_format(str,
+                  "   %-12s   %-10s   %-10s   %-10s   %-10s"
+                  "   %-10s   %-10s   %-10s   %-10s\n",
+                  "ms", "iterations", "cycles/it", "Kpps", "cycles/pkt",
+                  "pkts/batch", "vhost qlen", "upcalls", "cycles/upcall");
+    for (i = 1; i <= n_ms; i++) {
+        index = (s->milliseconds.idx + HISTORY_LEN - i) % HISTORY_LEN;
+        is = &s->milliseconds.sample[index];
+        ds_put_format(str,
+                      "   %-12"PRIu64"   %-11"PRIu32"  %-11"PRIu64
+                      "  %-11"PRIu32"  %-11"PRIu64"  %-11"PRIu32
+                      "  %-11"PRIu32"  %-11"PRIu32"  %-11"PRIu32"\n",
+                      is->timestamp,
+                      is->iterations,
+                      is->iterations ? is->cycles / is->iterations : 0,
+                      is->pkts,
+                      is->pkts ? is->busy_cycles / is->pkts : 0,
+                      is->batches ? is->pkts / is->batches : 0,
+                      is->iterations
+                          ? is->max_vhost_qfill / is->iterations : 0,
+                      is->upcalls,
+                      is->upcalls ? is->upcall_cycles / is->upcalls : 0);
+    }
+}
+
+void
 pmd_perf_read_counters(struct pmd_perf_stats *s,
                        uint64_t stats[PMD_N_STATS])
 {
@@ -59,9 +351,21 @@  void
 pmd_perf_stats_clear(struct pmd_perf_stats *s)
 {
     s->start_ms = 0; /* Marks start of clearing. */
+    memset(&s->current, 0 , sizeof(struct iter_stats));
+    memset(&s->totals, 0 , sizeof(struct iter_stats));
     for (int i = 0; i < PMD_N_STATS; i++) {
         atomic_read_relaxed(&s->counters.n[i], &s->counters.zero[i]);
     }
+    histogram_clear(&s->cycles);
+    histogram_clear(&s->pkts);
+    histogram_clear(&s->cycles_per_pkt);
+    histogram_clear(&s->upcalls);
+    histogram_clear(&s->cycles_per_upcall);
+    histogram_clear(&s->pkts_per_batch);
+    histogram_clear(&s->max_vhost_qfill);
+    history_init(&s->iterations);
+    history_init(&s->milliseconds);
     s->start_ms = time_msec(); /* Clearing finished. */
+    s->milliseconds.sample[0].timestamp = s->start_ms;
 }
 
diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h
index f55f7a2..01fde8d 100644
--- a/lib/dpif-netdev-perf.h
+++ b/lib/dpif-netdev-perf.h
@@ -33,6 +33,11 @@ 
 extern "C" {
 #endif
 
+#define NUM_BINS 32             /* Number of histogram bins. */
+#define HISTORY_LEN 1000        /* Length of recorded history
+                                   (iterations and ms). */
+#define DEF_HIST_SHOW 20        /* Default number of history samples. */
+
 enum pmd_stat_type {
     PMD_STAT_EXACT_HIT,     /* Packets that had an exact match (emc). */
     PMD_STAT_MASKED_HIT,    /* Packets that matched in the flow table. */
@@ -60,6 +65,28 @@  enum pmd_stat_type {
     PMD_N_STATS
 };
 
+struct histogram {
+    uint32_t wall[NUM_BINS];
+    uint64_t bin[NUM_BINS];
+};
+
+struct iter_stats {
+    uint64_t timestamp;
+    uint64_t cycles;
+    uint64_t busy_cycles;
+    uint32_t iterations;
+    uint32_t pkts;
+    uint32_t upcalls;
+    uint32_t upcall_cycles;
+    uint32_t batches;
+    uint32_t max_vhost_qfill;
+};
+
+struct history {
+    uint64_t idx;
+    struct iter_stats sample[HISTORY_LEN];
+};
+
 struct pmd_counters {
     atomic_uint64_t n[PMD_N_STATS];     /* Indexed by PMD_STAT_*. */
     uint64_t zero[PMD_N_STATS];
@@ -69,6 +96,17 @@  struct pmd_perf_stats {
     uint64_t start_ms;
     uint64_t last_tsc;
     struct pmd_counters counters;
+    struct iter_stats current;
+    struct iter_stats totals;
+    struct histogram cycles;
+    struct histogram pkts;
+    struct histogram cycles_per_pkt;
+    struct histogram upcalls;
+    struct histogram cycles_per_upcall;
+    struct histogram pkts_per_batch;
+    struct histogram max_vhost_qfill;
+    struct history iterations;
+    struct history milliseconds;
 };
 
 enum pmd_info_type;
@@ -96,6 +134,63 @@  pmd_perf_update_counter(struct pmd_perf_stats *s,
     atomic_store_relaxed(&s->counters.n[counter], tmp);
 }
 
+void pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
+                                   double duration);
+void pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s);
+void pmd_perf_format_iteration_history(struct ds *str,
+                                       struct pmd_perf_stats *s,
+                                       int n_iter);
+void pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s,
+                                int n_ms);
+
+void pmd_perf_show(struct unixctl_conn *conn, int argc,
+                   const char *argv[],
+                   void *aux OVS_UNUSED);
+static inline void
+histogram_add_sample(struct histogram *hist, uint32_t val)
+{
+    /* TODO: Can do better with binary search? */
+    for (int i = 0; i < NUM_BINS-1; i++) {
+        if (val <= hist->wall[i]) {
+            hist->bin[i]++;
+            return;
+        }
+    }
+    hist->bin[NUM_BINS-1]++;
+}
+
+uint64_t histogram_samples(const struct histogram *hist);
+
+static inline struct iter_stats *
+history_current(struct history *h)
+{
+    return &h->sample[h->idx];
+}
+
+static inline struct iter_stats *
+history_next(struct history *h)
+{
+    struct iter_stats *next;
+
+    h->idx++;
+    if (h->idx == HISTORY_LEN) {
+        h->idx = 0;
+    }
+    next = &h->sample[h->idx];
+    memset(next, 0, sizeof(*next));
+    return next;
+}
+
+static inline struct iter_stats *
+history_store(struct history *h, struct iter_stats *is)
+{
+    if (is) {
+        h->sample[h->idx] = *is;
+    }
+    /* Advance the history pointer */
+    return history_next(h);
+}
+
 static inline void
 pmd_perf_start_iteration(struct pmd_perf_stats *s, uint64_t now_tsc)
 {
@@ -103,14 +198,25 @@  pmd_perf_start_iteration(struct pmd_perf_stats *s, uint64_t now_tsc)
         /* Stats not yet fully initialised. */
         return;
     }
-    s->last_tsc = now_tsc;
+    memset(&s->current, 0, sizeof(struct iter_stats));
+    s->current.timestamp = now_tsc;
 }
 
 static inline void
 pmd_perf_end_iteration(struct pmd_perf_stats *s, uint64_t now_tsc,
                        int packets)
 {
-    uint64_t cycles = now_tsc - s->last_tsc;
+    struct iter_stats *cum_ms;
+    uint64_t cycles, cycles_per_pkt = 0;
+
+    if (OVS_UNLIKELY(s->current.timestamp == 0)) {
+        /* Stats were cleared during the ongoing iteration. */
+        return;
+    }
+
+    cycles = now_tsc - s->current.timestamp;
+    s->current.cycles = cycles;
+    s->current.pkts = packets;
 
     /* No need for atomic updates as only invoked within PMD thread. */
     if (packets > 0) {
@@ -118,6 +224,62 @@  pmd_perf_end_iteration(struct pmd_perf_stats *s, uint64_t now_tsc,
     } else {
         s->counters.n[PMD_CYCLES_ITER_IDLE] += cycles;
     }
+    s->counters.n[PMD_CYCLES_UPCALL] += s->current.upcall_cycles;
+
+    /* Add iteration sample to histograms. */
+    histogram_add_sample(&s->cycles, cycles);
+    histogram_add_sample(&s->pkts, packets);
+    if (packets > 0) {
+        cycles_per_pkt = cycles / packets;
+        histogram_add_sample(&s->cycles_per_pkt, cycles_per_pkt);
+    }
+    if (s->current.batches > 0) {
+        histogram_add_sample(&s->pkts_per_batch, packets / s->current.batches);
+    }
+    histogram_add_sample(&s->upcalls, s->current.upcalls);
+    if (s->current.upcalls > 0) {
+        histogram_add_sample(&s->cycles_per_upcall,
+                             s->current.upcall_cycles / s->current.upcalls);
+    }
+    histogram_add_sample(&s->max_vhost_qfill, s->current.max_vhost_qfill);
+
+    /* Add iteration samples to millisecond stats. */
+    cum_ms = history_current(&s->milliseconds);
+    cum_ms->iterations++;
+    cum_ms->cycles += cycles;
+    if (packets > 0) {
+        cum_ms->busy_cycles += cycles;
+    }
+    cum_ms->pkts += s->current.pkts;
+    cum_ms->upcalls += s->current.upcalls;
+    cum_ms->upcall_cycles += s->current.upcall_cycles;
+    cum_ms->batches += s->current.batches;
+    cum_ms->max_vhost_qfill += s->current.max_vhost_qfill;
+
+    /* Store in iteration history. */
+    history_store(&s->iterations, &s->current);
+    if (now_tsc - s->last_tsc > 10000) {
+        /* Check if ms is completed and store in milliseconds history. */
+        uint64_t now = time_msec();
+        if (now != cum_ms->timestamp) {
+            /* Add ms stats to totals. */
+            s->totals.iterations += cum_ms->iterations;
+            s->totals.cycles += cum_ms->cycles;
+            s->totals.busy_cycles += cum_ms->busy_cycles;
+            s->totals.pkts += cum_ms->pkts;
+            s->totals.upcalls += cum_ms->upcalls;
+            s->totals.upcall_cycles += cum_ms->upcall_cycles;
+            s->totals.batches += cum_ms->batches;
+            s->totals.max_vhost_qfill += cum_ms->max_vhost_qfill;
+            cum_ms = history_next(&s->milliseconds);
+            cum_ms->timestamp = now;
+        }
+        s->last_tsc = now_tsc;
+    }
 }
 
+#ifdef  __cplusplus
+}
+#endif
+
 #endif /* pmd-perf.h */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index bf99f81..63a4139 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -52,6 +52,7 @@ 
 #include "id-pool.h"
 #include "latch.h"
 #include "netdev.h"
+#include "netdev-provider.h"
 #include "netdev-vport.h"
 #include "netlink.h"
 #include "odp-execute.h"
@@ -766,7 +767,8 @@  get_dp_netdev(const struct dpif *dpif)
 enum pmd_info_type {
     PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
     PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
-    PMD_INFO_SHOW_RXQ     /* Show poll-lists of pmd threads. */
+    PMD_INFO_SHOW_RXQ,    /* Show poll-lists of pmd threads. */
+    PMD_INFO_PERF_SHOW,   /* Show pmd performance details. */
 };
 
 static void
@@ -845,6 +847,42 @@  pmd_info_show_stats(struct ds *reply,
                   stats[PMD_CYCLES_POLL_BUSY], total_packets);
 }
 
+static void
+pmd_info_show_perf(struct ds *reply,
+                   struct dp_netdev_pmd_thread *pmd,
+                   struct pmd_perf_params *par)
+{
+    if (pmd->core_id != NON_PMD_CORE_ID) {
+        char *time_str =
+                xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
+        long long now = time_msec();
+        double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
+
+        ds_put_cstr(reply, "\n");
+        ds_put_format(reply, "Time: %s\n", time_str);
+        ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
+        ds_put_cstr(reply, "\n");
+        format_pmd_thread(reply, pmd);
+        ds_put_cstr(reply, "\n");
+        pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
+        if (par->histograms) {
+            ds_put_cstr(reply, "\n");
+            pmd_perf_format_histograms(reply, &pmd->perf_stats);
+        }
+        if (par->iter_hist_len > 0) {
+            ds_put_cstr(reply, "\n");
+            pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
+                                              par->iter_hist_len);
+        }
+        if (par->ms_hist_len > 0) {
+            ds_put_cstr(reply, "\n");
+            pmd_perf_format_ms_history(reply, &pmd->perf_stats,
+                                       par->ms_hist_len);
+        }
+        free(time_str);
+    }
+}
+
 static int
 compare_poll_list(const void *a_, const void *b_)
 {
@@ -1047,6 +1085,8 @@  dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
             pmd_perf_stats_clear(&pmd->perf_stats);
         } else if (type == PMD_INFO_SHOW_STATS) {
             pmd_info_show_stats(&reply, pmd);
+        } else if (type == PMD_INFO_PERF_SHOW) {
+            pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
         }
     }
     free(pmd_list);
@@ -1056,6 +1096,48 @@  dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
     unixctl_command_reply(conn, ds_cstr(&reply));
     ds_destroy(&reply);
 }
+
+static void
+pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
+                          const char *argv[],
+                          void *aux OVS_UNUSED)
+{
+    struct pmd_perf_params par;
+    long int it_hist = 0, ms_hist = 0;
+    par.histograms = true;
+
+    while (argc > 1) {
+        if (!strcmp(argv[1], "-nh")) {
+            par.histograms = false;
+            argc -= 1;
+            argv += 1;
+        } else if (!strcmp(argv[1], "-it") && argc > 2) {
+            it_hist = strtol(argv[2], NULL, 10);
+            if (it_hist < 0) {
+                it_hist = 0;
+            } else if (it_hist > HISTORY_LEN) {
+                it_hist = HISTORY_LEN;
+            }
+            argc -= 2;
+            argv += 2;
+        } else if (!strcmp(argv[1], "-ms") && argc > 2) {
+            ms_hist = strtol(argv[2], NULL, 10);
+            if (ms_hist < 0) {
+                ms_hist = 0;
+            } else if (ms_hist > HISTORY_LEN) {
+                ms_hist = HISTORY_LEN;
+            }
+            argc -= 2;
+            argv += 2;
+        } else {
+            break;
+        }
+    }
+    par.iter_hist_len = it_hist;
+    par.ms_hist_len = ms_hist;
+    par.command_type = PMD_INFO_PERF_SHOW;
+    dpif_netdev_pmd_info(conn, argc, argv, &par);
+}
 

 static int
 dpif_netdev_init(void)
@@ -1073,6 +1155,12 @@  dpif_netdev_init(void)
     unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
                              0, 2, dpif_netdev_pmd_info,
                              (void *)&poll_aux);
+    unixctl_command_register("dpif-netdev/pmd-perf-show",
+                             "[-nh] [-it iter-history-len]"
+                             " [-ms ms-history-len]"
+                             " [-pmd core] [dp]",
+                             0, 7, pmd_perf_show_cmd,
+                             NULL);
     unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
                              0, 1, dpif_netdev_pmd_rebalance,
                              NULL);
@@ -3160,6 +3248,7 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
                            struct netdev_rxq *rx,
                            odp_port_t port_no)
 {
+    struct pmd_perf_stats *s = &pmd->perf_stats;
     struct dp_packet_batch batch;
     int error;
     int batch_cnt = 0;
@@ -3168,8 +3257,23 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
     error = netdev_rxq_recv(rx, &batch);
     if (!error) {
         *recirc_depth_get() = 0;
-
+        /* Update batch histogram. */
         batch_cnt = batch.count;
+        s->current.batches++;
+        histogram_add_sample(&s->pkts_per_batch, batch_cnt);
+        /* Update the maximum Rx queue fill level. */
+        uint32_t qfill = batch.qfill;
+        switch (netdev_dpdk_get_type(netdev_rxq_get_netdev(rx))) {
+        case DPDK_DEV_VHOST:
+            if (qfill > s->current.max_vhost_qfill) {
+                s->current.max_vhost_qfill = qfill;
+            }
+            break;
+        case DPDK_DEV_ETH:
+        default:
+            break;
+        }
+        /* Process packet batch. */
         dp_netdev_input(pmd, &batch, port_no);
     } else if (error != EAGAIN && error != EOPNOTSUPP) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
@@ -4917,6 +5021,7 @@  handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
     struct match match;
     ovs_u128 ufid;
     int error = 0;
+    uint64_t cycles = cycles_counter();
 
     match.tun_md.valid = false;
     miniflow_expand(&key->mf, &match.flow);
@@ -4970,6 +5075,12 @@  handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
         ovs_mutex_unlock(&pmd->flow_mutex);
         emc_probabilistic_insert(pmd, key, netdev_flow);
     }
+    /* Update upcall stats. */
+    cycles = cycles_counter() - cycles;
+    struct pmd_perf_stats *s = &pmd->perf_stats;
+    s->current.upcalls++;
+    s->current.upcall_cycles += cycles;
+    histogram_add_sample(&s->cycles_per_upcall, cycles);
     return error;
 }
 
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index faff842..94ba9ce 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -35,6 +35,7 @@ 
 #include <rte_mbuf.h>
 #include <rte_meter.h>
 #include <rte_pci.h>
+#include <rte_version.h>
 #include <rte_vhost.h>
 
 #include "dirs.h"
@@ -186,11 +187,6 @@  enum { DPDK_RING_SIZE = 256 };
 BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
 enum { DRAIN_TSC = 200000ULL };
 
-enum dpdk_dev_type {
-    DPDK_DEV_ETH = 0,
-    DPDK_DEV_VHOST = 1,
-};
-
 /* Quality of Service */
 
 /* An instance of a QoS configuration.  Always associated with a particular
@@ -816,6 +812,13 @@  netdev_dpdk_cast(const struct netdev *netdev)
     return CONTAINER_OF(netdev, struct netdev_dpdk, up);
 }
 
+enum dpdk_dev_type
+netdev_dpdk_get_type(const struct netdev *netdev)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+    return dev->type;
+}
+
 static struct netdev *
 netdev_dpdk_alloc(void)
 {
@@ -1626,6 +1629,19 @@  netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
         return EAGAIN;
     }
 
+    batch->qfill = nb_rx;
+
+    if (OVS_UNLIKELY(nb_rx == NETDEV_MAX_BURST)) {
+        /* This vhostuser API call is only implemented in DPDK 17.08.
+         * The need for this API call disappears when switching to the
+         * vhostuser PMD. */
+#if RTE_VERSION >= RTE_VERSION_NUM (17, 8, 0, RTE_VER_RELEASE)
+        batch->qfill += rte_vhost_rx_queue_count(netdev_dpdk_get_vid(dev),
+                                                 qid * VIRTIO_QNUM
+                                                     + VIRTIO_TXQ);
+#endif
+    }
+
     if (policer) {
         dropped = nb_rx;
         nb_rx = ingress_policer_run(policer,
@@ -1665,6 +1681,8 @@  netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch)
         return EAGAIN;
     }
 
+    batch->qfill = nb_rx;
+
     if (policer) {
         dropped = nb_rx;
         nb_rx = ingress_policer_run(policer,
diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
index b7d02a7..2b357db 100644
--- a/lib/netdev-dpdk.h
+++ b/lib/netdev-dpdk.h
@@ -22,11 +22,18 @@ 
 #include "openvswitch/compiler.h"
 
 struct dp_packet;
+struct netdev;
+
+enum dpdk_dev_type {
+    DPDK_DEV_ETH = 0,
+    DPDK_DEV_VHOST = 1,
+};
 
 #ifdef DPDK_NETDEV
 
 void netdev_dpdk_register(void);
 void free_dpdk_buf(struct dp_packet *);
+enum dpdk_dev_type netdev_dpdk_get_type(const struct netdev *netdev);
 
 #else
 
@@ -41,6 +48,13 @@  free_dpdk_buf(struct dp_packet *buf OVS_UNUSED)
     /* Nothing */
 }
 
+static inline enum dpdk_dev_type
+netdev_dpdk_get_type(const struct netdev *netdev OVS_UNUSED)
+{
+    /* Nothing to do. Return value zero to make compiler happy. */
+    return DPDK_DEV_ETH;
+}
+
 #endif
 
 #endif /* netdev-dpdk.h */
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index b14b339..b266535 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -5315,7 +5315,8 @@  dpif_show_backer(const struct dpif_backer *backer, struct ds *ds)
 
     dpif_get_dp_stats(backer->dpif, &dp_stats);
     ds_put_format(ds, "%s: hit:%"PRIu64" missed:%"PRIu64"\n",
-                  dpif_name(backer->dpif), dp_stats.n_hit, dp_stats.n_missed);
+                  dpif_name(backer->dpif),
+                  dp_stats.n_hit + dp_stats.n_mask_hit, dp_stats.n_missed);
 
     shash_init(&ofproto_shash);
     ofprotos = get_ofprotos(&ofproto_shash);