[ovs-dev,RFC,6/8] dpif-netdev: Refactor cycle counting

Message ID 1515096166-16257-7-git-send-email-jan.scheurich@ericsson.com
State Superseded
Delegated to: Ian Stokes
Headers show
Series
  • dpif-netdev: Refactor cycle count and rebased patches
Related show

Commit Message

Jan Scheurich Jan. 4, 2018, 8:02 p.m.
Simplify the historically grown TSC cycle counting in PMD threads.
Cycles are currently counted for the following purposes:

1. Measure PMD ustilization

PMD utilization is defined as ratio of cycles spent in busy iterations
(at least one packet received or sent) over the total number of cycles.

This is already done in pmd_perf_start_iteration() and
pmd_perf_end_iteration() based on a TSC timestamp saved in current
iteration at start_iteration() and the actual TSC at end_iteration().
No dependency on intermediate cycle accounting.

2. Measure the processing load per RX queue

This comprises cycles spend on polling and processing packets received
from the rx queue and the cycles spent on delayed sending of these packets
to tx queues (with time-based batching).

3. Measure the cycles spend on processing upcalls

These are part of the processing cycles of PMD and rxq but are also
measured separately for the purpose of supervising upcall performance
and load.

The previous scheme using cycles_count_start(), cycles_count_intermediate()
and cycles-count_end() originally introduced to simplify cycle counting
and saving calls to rte_get_tsc_cycles() was rather obscuring things.

Replaced this with dedicated pairs of cycles_count() around each task to
be measured and accounting the difference in cycles as appropriate for
the task.

Each call to cycles_count(pmd) will now store the read TSC counter in
pmd->ctx.last_cycles, so that users with lower accuracy requirements can
read that value instead of calling cycles_count().

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
---
 lib/dpif-netdev.c | 132 ++++++++++++++++--------------------------------------
 1 file changed, 39 insertions(+), 93 deletions(-)

Comments

Kevin Traynor Jan. 8, 2018, 4:46 p.m. | #1
On 01/04/2018 08:02 PM, Jan Scheurich wrote:
> Simplify the historically grown TSC cycle counting in PMD threads.
> Cycles are currently counted for the following purposes:
> 
> 1. Measure PMD ustilization
> 
> PMD utilization is defined as ratio of cycles spent in busy iterations
> (at least one packet received or sent) over the total number of cycles.
> 
> This is already done in pmd_perf_start_iteration() and
> pmd_perf_end_iteration() based on a TSC timestamp saved in current
> iteration at start_iteration() and the actual TSC at end_iteration().
> No dependency on intermediate cycle accounting.
> 
> 2. Measure the processing load per RX queue
> 
> This comprises cycles spend on polling and processing packets received
> from the rx queue and the cycles spent on delayed sending of these packets
> to tx queues (with time-based batching).
> 
> 3. Measure the cycles spend on processing upcalls
> 
> These are part of the processing cycles of PMD and rxq but are also
> measured separately for the purpose of supervising upcall performance
> and load.
> 
> The previous scheme using cycles_count_start(), cycles_count_intermediate()
> and cycles-count_end() originally introduced to simplify cycle counting
> and saving calls to rte_get_tsc_cycles() was rather obscuring things.
> 
> Replaced this with dedicated pairs of cycles_count() around each task to
> be measured and accounting the difference in cycles as appropriate for
> the task.
> 
> Each call to cycles_count(pmd) will now store the read TSC counter in
> pmd->ctx.last_cycles, so that users with lower accuracy requirements can
> read that value instead of calling cycles_count().
> 
> Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
> ---
>  lib/dpif-netdev.c | 132 ++++++++++++++++--------------------------------------
>  1 file changed, 39 insertions(+), 93 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index d16ba93..5d23128 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -522,11 +522,9 @@ struct dp_netdev_pmd_thread_ctx {
>      /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
>      long long now;
>      /* Used to count cycles. See 'cycles_count_end()' */
> -    unsigned long long last_cycles;
> +    uint64_t last_cycles;
>      /* RX queue from which last packet was received. */
>      struct dp_netdev_rxq *last_rxq;
> -    /* Indicates how should be treated last counted cycles. */
> -    enum pmd_cycles_counter_type current_pmd_cycles_type;
>  };
>  
>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
> @@ -3246,69 +3244,16 @@ dp_netdev_actions_free(struct dp_netdev_actions *actions)
>      free(actions);
>  }
>  
> -static inline unsigned long long
> -cycles_counter(void)
> +static inline uint64_t
> +cycles_counter(struct dp_netdev_pmd_thread *pmd)
>  {
>  #ifdef DPDK_NETDEV
> -    return rte_get_tsc_cycles();
> +    return pmd->ctx.last_cycles = rte_get_tsc_cycles();
>  #else
> -    return 0;
> +    return pmd->ctx.last_cycles = 0;
>  #endif
>  }
>  
> -/* Fake mutex to make sure that the calls to cycles_count_* are balanced */
> -extern struct ovs_mutex cycles_counter_fake_mutex;
> -
> -/* Start counting cycles.  Must be followed by 'cycles_count_end()'.
> - * Counting starts from the idle type state.  */
> -static inline void
> -cycles_count_start(struct dp_netdev_pmd_thread *pmd)
> -    OVS_ACQUIRES(&cycles_counter_fake_mutex)
> -    OVS_NO_THREAD_SAFETY_ANALYSIS
> -{
> -    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_IDLE;
> -    pmd->ctx.last_cycles = cycles_counter();
> -}
> -
> -/* Stop counting cycles and add them to the counter of the current type. */
> -static inline void
> -cycles_count_end(struct dp_netdev_pmd_thread *pmd)
> -    OVS_RELEASES(&cycles_counter_fake_mutex)
> -    OVS_NO_THREAD_SAFETY_ANALYSIS
> -{
> -    unsigned long long interval = cycles_counter() - pmd->ctx.last_cycles;
> -    enum pmd_cycles_counter_type type = pmd->ctx.current_pmd_cycles_type;
> -
> -    pmd_perf_update_counter(&pmd->perf_stats, type, interval);
> -}
> -
> -/* Calculate the intermediate cycle result and add to the counter of
> - * the current type */
> -static inline void
> -cycles_count_intermediate(struct dp_netdev_pmd_thread *pmd,
> -                          struct dp_netdev_rxq **rxqs, int n_rxqs)
> -    OVS_NO_THREAD_SAFETY_ANALYSIS
> -{
> -    unsigned long long new_cycles = cycles_counter();
> -    unsigned long long interval = new_cycles - pmd->ctx.last_cycles;
> -    enum pmd_cycles_counter_type type = pmd->ctx.current_pmd_cycles_type;
> -    int i;
> -
> -    pmd->ctx.last_cycles = new_cycles;
> -
> -    pmd_perf_update_counter(&pmd->perf_stats, type, interval);
> -    if (n_rxqs && (type == PMD_CYCLES_POLL_BUSY)) {
> -        /* Add to the amount of current processing cycles. */
> -        interval /= n_rxqs;
> -        for (i = 0; i < n_rxqs; i++) {
> -            if (rxqs[i]) {
> -                non_atomic_ullong_add(&rxqs[i]->cycles[RXQ_CYCLES_PROC_CURR],
> -                        interval);
> -            }
> -        }
> -    }
> -}
> -
>  static inline bool
>  pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
>  {
> @@ -3325,6 +3270,14 @@ dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
>     atomic_store_relaxed(&rx->cycles[type], cycles);
>  }
>  
> +static void
> +dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
> +                         enum rxq_cycles_counter_type type,
> +                         unsigned long long cycles)
> +{
> +    non_atomic_ullong_add(&rx->cycles[type], cycles);
> +}
> +
>  static uint64_t
>  dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
>                           enum rxq_cycles_counter_type type)
> @@ -3358,16 +3311,9 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>      int output_cnt;
>      bool dynamic_txqs;
>      uint32_t tx_flush_interval;
> -    enum pmd_cycles_counter_type save_pmd_cycles_type;
>  
> -    /* In case we're in PMD_CYCLES_PROCESSING state we need to count
> -     * cycles for rxq we're processing now. */
> -    cycles_count_intermediate(pmd, &pmd->ctx.last_rxq, 1);
> -
> -    /* Save current cycles counting state to restore after accounting
> -     * send cycles. */
> -    save_pmd_cycles_type = pmd->ctx.current_pmd_cycles_type;
> -    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_POLL_BUSY;
> +    /* Measure duration of batch transmission. */
> +    uint64_t cycles = cycles_counter(pmd);
>  
>      dynamic_txqs = p->port->dynamic_txqs;
>      if (dynamic_txqs) {
> @@ -3394,9 +3340,15 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>      pmd_perf_update_counter(&pmd->perf_stats,
>                              PMD_STAT_SENT_BATCHES, 1);
>  
> -    /* Update send cycles for all the rx queues and restore previous state. */
> -    cycles_count_intermediate(pmd, p->output_pkts_rxqs, output_cnt);
> -    pmd->ctx.current_pmd_cycles_type = save_pmd_cycles_type;
> +    /* Divide the batch tx cost evenly over the packets' rxqs. */
> +    cycles = (cycles_counter(pmd) - cycles) / output_cnt;
> +    struct dp_netdev_rxq **rxqs = p->output_pkts_rxqs;
> +    for (int i = 0; i < output_cnt; i++) {
> +        if (rxqs[i]) {
> +            dp_netdev_rxq_add_cycles(rxqs[i], RXQ_CYCLES_PROC_CURR,
> +                                     cycles);
> +        }
> +    }
>      return output_cnt;
>  }
>  
> @@ -3429,16 +3381,17 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>      struct pmd_perf_stats *s = &pmd->perf_stats;
>      struct dp_packet_batch batch;
>      int error;
> -    int batch_cnt = 0, output_cnt = 0;
> +    int batch_cnt = 0;
>  
>      dp_packet_batch_init(&batch);
> -
> -    cycles_count_intermediate(pmd, NULL, 0);
>      pmd->ctx.last_rxq = rxq;
> -    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_POLL_BUSY;
> +
> +    /* Measure duration for polling and processing rx burst. */
> +    uint64_t cycles = cycles_counter(pmd);

hi Jan - not a full review. Consider this function is called from
dp_netdev_input()...dp_execute_cb(), if the code below is hit, the above
measurement will make for double counting.

<snip>
#ifdef DPDK_NETDEV
            if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
                             && packets_->packets[0]->source
                                != p->output_pkts.packets[0]->source)) {
                /* XXX: netdev-dpdk assumes that all packets in a single
                 *      output batch has the same source. Flush here to
                 *      avoid memory access issues. */
                dp_netdev_pmd_flush_output_on_port(pmd, p);
            }
#endif
            if (dp_packet_batch_size(&p->output_pkts)
                + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
                /* Flush here to avoid overflow. */
                dp_netdev_pmd_flush_output_on_port(pmd, p);
            }
</snip>

I wouldn't be too concerned about the first one because it's unlikely. I
would be more concerned about the second one, as it is quite likely that
multiple rxqs are sending to a single port and the cycles for tx could
be significant. Take 2 rxq's sending packets to a vhost port, unless we
got batches of 32 on each rxq there would be double counting for one of
the queues everytime. There could be more cases like this also, as there
is flush from a lot of places. I didn't compare, but from memory I don't
think this would be an issues in Ilya's patches.

start/intermediate/stop type functions might be better than storing
cycles locally in functions, because you could stop and start the count
from any function you need. In this case, you could avoid the double
counting by something like stop/call
dp_netdev_pmd_flush_output_on_port()/start. That might start to make the
code more like Ilya's, so it's probably good to get his review.

>      error = netdev_rxq_recv(rxq->rx, &batch);
>  
>      if (!error) {
> +        /* At least one packet received. */
>          *recirc_depth_get() = 0;
>          pmd_thread_ctx_time_update(pmd);
>          batch_cnt = batch.count;
> @@ -3448,7 +3401,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>              histogram_add_sample(&s->pkts_per_batch, batch_cnt);
>              /* Update the maximum Rx queue fill level. */
>              uint32_t qfill = batch.qfill;
> -            switch (netdev_dpdk_get_type(netdev_rxq_get_netdev(rx))) {
> +            switch (netdev_dpdk_get_type(netdev_rxq_get_netdev(rxq->rx))) {

It looks like this is fixing an error from a previous patch

thanks,
Kevin.

>              case DPDK_DEV_VHOST:
>                  if (qfill > s->current.max_vhost_qfill) {
>                      s->current.max_vhost_qfill = qfill;
> @@ -3461,11 +3414,13 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>          }
>          /* Process packet batch. */
>          dp_netdev_input(pmd, &batch, port_no);
> -        cycles_count_intermediate(pmd, &rxq, 1);
> -
> -        output_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
> +        /* Add processing cycles to rxq stats. */
> +        cycles = cycles_counter(pmd) - cycles;
> +        dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
> +        /* Flush the send queues. */
> +        dp_netdev_pmd_flush_output_packets(pmd, false);
>          *flushed = true;
> -        
> +
>      } else if (error != EAGAIN && error != EOPNOTSUPP) {
>          static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>  
> @@ -3473,9 +3428,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>                      netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
>      }
>  
> -    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_POLL_IDLE;
>      pmd->ctx.last_rxq = NULL;
> -
>      return batch_cnt;
>  }
>  
> @@ -4105,7 +4058,6 @@ dpif_netdev_run(struct dpif *dpif)
>      non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
>      if (non_pmd) {
>          ovs_mutex_lock(&dp->non_pmd_mutex);
> -        cycles_count_start(non_pmd);
>          HMAP_FOR_EACH (port, node, &dp->ports) {
>              if (!netdev_is_pmd(port->netdev)) {
>                  int i;
> @@ -4126,7 +4078,6 @@ dpif_netdev_run(struct dpif *dpif)
>              dp_netdev_pmd_flush_output_packets(non_pmd, false);
>          }
>  
> -        cycles_count_end(non_pmd);
>          dpif_netdev_xps_revalidate_pmd(non_pmd, false);
>          ovs_mutex_unlock(&dp->non_pmd_mutex);
>  
> @@ -4305,7 +4256,6 @@ reload:
>          lc = UINT_MAX;
>      }
>  
> -    cycles_count_start(pmd);
>      for (;;) {
>          uint64_t iter_packets = 0;
>          bool flushed = false;
> @@ -4343,15 +4293,12 @@ reload:
>              if (reload) {
>                  break;
>              }
> -            cycles_count_intermediate(pmd, NULL, 0);
>          }
>  
> -        pmd_perf_end_iteration(s, pmd->ctx.last_cycles, iter_packets,
> +        pmd_perf_end_iteration(s, cycles_counter(pmd), iter_packets,
>                                 pmd_perf_metrics_enabled(pmd));
>      }
>  
> -    cycles_count_end(pmd);
> -
>      poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>      exiting = latch_is_set(&pmd->exit_latch);
>      /* Signal here to make sure the pmd finishes
> @@ -4791,7 +4738,6 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
>      cmap_init(&pmd->flow_table);
>      cmap_init(&pmd->classifiers);
>      pmd->ctx.last_rxq = NULL;
> -    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_IDLE;
>      pmd_thread_ctx_time_update(pmd);
>      pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
>      pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
> @@ -5247,7 +5193,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
>      struct match match;
>      ovs_u128 ufid;
>      int error;
> -    uint64_t cycles = cycles_counter();
> +    uint64_t cycles = cycles_counter(pmd);
>  
>      match.tun_md.valid = false;
>      miniflow_expand(&key->mf, &match.flow);
> @@ -5303,7 +5249,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
>      }
>      if (pmd_perf_metrics_enabled(pmd)) {
>          /* Update upcall stats. */
> -        cycles = cycles_counter() - cycles;
> +        cycles = cycles_counter(pmd) - cycles;
>          struct pmd_perf_stats *s = &pmd->perf_stats;
>          s->current.upcalls++;
>          s->current.upcall_cycles += cycles;
>
Jan Scheurich Jan. 8, 2018, 5:17 p.m. | #2
Hi Kevin,

Thanks for the feedback. See below.

> > @@ -3429,16 +3381,17 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> >      struct pmd_perf_stats *s = &pmd->perf_stats;
> >      struct dp_packet_batch batch;
> >      int error;
> > -    int batch_cnt = 0, output_cnt = 0;
> > +    int batch_cnt = 0;
> >
> >      dp_packet_batch_init(&batch);
> > -
> > -    cycles_count_intermediate(pmd, NULL, 0);
> >      pmd->ctx.last_rxq = rxq;
> > -    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_POLL_BUSY;
> > +
> > +    /* Measure duration for polling and processing rx burst. */
> > +    uint64_t cycles = cycles_counter(pmd);
> 
> hi Jan - not a full review. Consider this function is called from
> dp_netdev_input()...dp_execute_cb(), if the code below is hit, the above
> measurement will make for double counting.
> 
> <snip>
> #ifdef DPDK_NETDEV
>             if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
>                              && packets_->packets[0]->source
>                                 != p->output_pkts.packets[0]->source)) {
>                 /* XXX: netdev-dpdk assumes that all packets in a single
>                  *      output batch has the same source. Flush here to
>                  *      avoid memory access issues. */
>                 dp_netdev_pmd_flush_output_on_port(pmd, p);
>             }
> #endif
>             if (dp_packet_batch_size(&p->output_pkts)
>                 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
>                 /* Flush here to avoid overflow. */
>                 dp_netdev_pmd_flush_output_on_port(pmd, p);
>             }
> </snip>
> 
> I wouldn't be too concerned about the first one because it's unlikely. I
> would be more concerned about the second one, as it is quite likely that
> multiple rxqs are sending to a single port and the cycles for tx could
> be significant. Take 2 rxq's sending packets to a vhost port, unless we
> got batches of 32 on each rxq there would be double counting for one of
> the queues everytime. There could be more cases like this also, as there
> is flush from a lot of places. I didn't compare, but from memory I don't
> think this would be an issues in Ilya's patches.
> 
> start/intermediate/stop type functions might be better than storing
> cycles locally in functions, because you could stop and start the count
> from any function you need. In this case, you could avoid the double
> counting by something like stop/call
> dp_netdev_pmd_flush_output_on_port()/start. That might start to make the
> code more like Ilya's, so it's probably good to get his review.

You are right, I overlooked the possibility for a tx burst triggered immediately by
executing an output action. This needs fixing, and I agree that a scheme which
is able to "suspend" an ongoing measurement at any time will be needed for 
that. Let me have another look at the previous scheme to see if I can simplify
that. If not we can stick to the original solution.

> 
> >      error = netdev_rxq_recv(rxq->rx, &batch);
> >
> >      if (!error) {
> > +        /* At least one packet received. */
> >          *recirc_depth_get() = 0;
> >          pmd_thread_ctx_time_update(pmd);
> >          batch_cnt = batch.count;
> > @@ -3448,7 +3401,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> >              histogram_add_sample(&s->pkts_per_batch, batch_cnt);
> >              /* Update the maximum Rx queue fill level. */
> >              uint32_t qfill = batch.qfill;
> > -            switch (netdev_dpdk_get_type(netdev_rxq_get_netdev(rx))) {
> > +            switch (netdev_dpdk_get_type(netdev_rxq_get_netdev(rxq->rx))) {
> 
> It looks like this is fixing an error from a previous patch

Good spot.
Jan Scheurich Jan. 9, 2018, 12:51 a.m. | #3
> > > @@ -3429,16 +3381,17 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> > >      struct pmd_perf_stats *s = &pmd->perf_stats;
> > >      struct dp_packet_batch batch;
> > >      int error;
> > > -    int batch_cnt = 0, output_cnt = 0;
> > > +    int batch_cnt = 0;
> > >
> > >      dp_packet_batch_init(&batch);
> > > -
> > > -    cycles_count_intermediate(pmd, NULL, 0);
> > >      pmd->ctx.last_rxq = rxq;
> > > -    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_POLL_BUSY;
> > > +
> > > +    /* Measure duration for polling and processing rx burst. */
> > > +    uint64_t cycles = cycles_counter(pmd);
> >
> > hi Jan - not a full review. Consider this function is called from
> > dp_netdev_input()...dp_execute_cb(), if the code below is hit, the above
> > measurement will make for double counting.
> >
> > <snip>
> > #ifdef DPDK_NETDEV
> >             if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
> >                              && packets_->packets[0]->source
> >                                 != p->output_pkts.packets[0]->source)) {
> >                 /* XXX: netdev-dpdk assumes that all packets in a single
> >                  *      output batch has the same source. Flush here to
> >                  *      avoid memory access issues. */
> >                 dp_netdev_pmd_flush_output_on_port(pmd, p);
> >             }
> > #endif
> >             if (dp_packet_batch_size(&p->output_pkts)
> >                 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
> >                 /* Flush here to avoid overflow. */
> >                 dp_netdev_pmd_flush_output_on_port(pmd, p);
> >             }
> > </snip>
> >
> > I wouldn't be too concerned about the first one because it's unlikely. I
> > would be more concerned about the second one, as it is quite likely that
> > multiple rxqs are sending to a single port and the cycles for tx could
> > be significant. Take 2 rxq's sending packets to a vhost port, unless we
> > got batches of 32 on each rxq there would be double counting for one of
> > the queues everytime. There could be more cases like this also, as there
> > is flush from a lot of places. I didn't compare, but from memory I don't
> > think this would be an issues in Ilya's patches.
> >
> > start/intermediate/stop type functions might be better than storing
> > cycles locally in functions, because you could stop and start the count
> > from any function you need. In this case, you could avoid the double
> > counting by something like stop/call
> > dp_netdev_pmd_flush_output_on_port()/start. That might start to make the
> > code more like Ilya's, so it's probably good to get his review.
> 
> You are right, I overlooked the possibility for a tx burst triggered immediately by
> executing an output action. This needs fixing, and I agree that a scheme which
> is able to "suspend" an ongoing measurement at any time will be needed for
> that. Let me have another look at the previous scheme to see if I can simplify
> that. If not we can stick to the original solution.
> 

I have prepared a slightly more advanced version that allows arbitrary nesting of cycles measurements where the nested measurements are excluded from the outer measurement: Here's the outline. I am including this in the next version of my umbrella series. This version isn't tested yet:

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 4761d3b..e487828 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -516,6 +516,8 @@ struct tx_port {
     struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
 };
 
+struct cycle_timer;
+
 /* A set of properties for the current processing loop that is not directly
  * associated with the pmd thread itself, but with the packets being
  * processed or the short-term system configuration (for example, time).
@@ -527,6 +529,7 @@ struct dp_netdev_pmd_thread_ctx {
     uint64_t last_cycles;
     /* RX queue from which last packet was received. */
     struct dp_netdev_rxq *last_rxq;
+    struct cycle_timer *cur_timer;
 };
 
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
@@ -632,6 +635,71 @@ struct dpif_netdev {
     uint64_t last_port_seq;
 };
 
+/* Support for accurate timing of PMD execution on TSC clock cycle level.
+ * These functions are intended to be invoked in the context of pmd threads. */
+
+/* Read the TSC cycle register and cache it in the pmd context. Any function
+ * not requiring clock cycle accuracy should read the cached value from the
+ * context. */
+static inline uint64_t
+cycles_counter(struct dp_netdev_pmd_thread *pmd)
+{
+#ifdef DPDK_NETDEV
+    return pmd->ctx.last_cycles = rte_get_tsc_cycles();
+#else
+    return pmd->ctx.last_cycles = 0;
+#endif
+}
+
+/* A nestable timer for measuring execution time in TSC cycles.
+ *
+ * Usage:
+ * struct cycle_timer timer;
+ *
+ * cycle_timer_start(pmd, &timer);
+ * <Timed execution>
+ * uint64_t cycles = cycle_timer_stop(pmd, &timer);
+ *
+ * The caller must guarantee that a call to cycle_timer_start() is always
+ * paired with a call to cycle_stimer_stop().
+ *
+ * Is is possible to have nested cycles timers within the timed code. The
+ * execution time measured by the nested timers is excluded from the time
+ * measured by the embracing timer.
+ */
+
+struct cycle_timer {
+    uint64_t *offset;
+    struct cycle_timer *interrupted;
+};
+
+static void
+cycle_timer_start(struct dp_netdev_pmd_thread *pmd,
+                  struct cycle_timer *timer)
+{
+    /* Suspend current timer, if any. */
+    timer->interrupted = pmd->ctx.cur_timer;
+    timer->offset = cycles_counter(pmd);
+    pmd->ctx.cur_timer = timer;
+}
+
+static uint64_t
+cycle_timer_stop(struct dp_netdev_pmd_thread *pmd,
+                 struct cycle_timer *timer)
+{
+    /* Assert that this is the current cycle timer. */
+    ovs_assert(pmd->ctx.cur_timer == timer);
+    uint64_t cycles = cycles_counter(pmd) - timer->offset;
+    if (timer->interrupted) {
+        /* Adjust the start offset by the used cycles. */
+        timer->interrupted->offset += cycles;
+    }
+    /* Restore suspended timer, if any. */
+    pmd->ctx.cur_timer = timer->interrupted;
+    return cycles;
+}
+
 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
                               struct dp_netdev_port **portp)
     OVS_REQUIRES(dp->port_mutex);
@@ -3267,16 +3335,6 @@ dp_netdev_actions_free(struct dp_netdev_actions *actions)
     free(actions);
 }
Ilya Maximets Jan. 9, 2018, 1:43 p.m. | #4
On 09.01.2018 03:51, Jan Scheurich wrote:
> 
>>>> @@ -3429,16 +3381,17 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>>>>      struct pmd_perf_stats *s = &pmd->perf_stats;
>>>>      struct dp_packet_batch batch;
>>>>      int error;
>>>> -    int batch_cnt = 0, output_cnt = 0;
>>>> +    int batch_cnt = 0;
>>>>
>>>>      dp_packet_batch_init(&batch);
>>>> -
>>>> -    cycles_count_intermediate(pmd, NULL, 0);
>>>>      pmd->ctx.last_rxq = rxq;
>>>> -    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_POLL_BUSY;
>>>> +
>>>> +    /* Measure duration for polling and processing rx burst. */
>>>> +    uint64_t cycles = cycles_counter(pmd);
>>>
>>> hi Jan - not a full review. Consider this function is called from
>>> dp_netdev_input()...dp_execute_cb(), if the code below is hit, the above
>>> measurement will make for double counting.
>>>
>>> <snip>
>>> #ifdef DPDK_NETDEV
>>>             if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
>>>                              && packets_->packets[0]->source
>>>                                 != p->output_pkts.packets[0]->source)) {
>>>                 /* XXX: netdev-dpdk assumes that all packets in a single
>>>                  *      output batch has the same source. Flush here to
>>>                  *      avoid memory access issues. */
>>>                 dp_netdev_pmd_flush_output_on_port(pmd, p);
>>>             }
>>> #endif
>>>             if (dp_packet_batch_size(&p->output_pkts)
>>>                 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
>>>                 /* Flush here to avoid overflow. */
>>>                 dp_netdev_pmd_flush_output_on_port(pmd, p);
>>>             }
>>> </snip>
>>>
>>> I wouldn't be too concerned about the first one because it's unlikely. I
>>> would be more concerned about the second one, as it is quite likely that
>>> multiple rxqs are sending to a single port and the cycles for tx could
>>> be significant. Take 2 rxq's sending packets to a vhost port, unless we
>>> got batches of 32 on each rxq there would be double counting for one of
>>> the queues everytime. There could be more cases like this also, as there
>>> is flush from a lot of places. I didn't compare, but from memory I don't
>>> think this would be an issues in Ilya's patches.
>>>
>>> start/intermediate/stop type functions might be better than storing
>>> cycles locally in functions, because you could stop and start the count
>>> from any function you need. In this case, you could avoid the double
>>> counting by something like stop/call
>>> dp_netdev_pmd_flush_output_on_port()/start. That might start to make the
>>> code more like Ilya's, so it's probably good to get his review.
>>
>> You are right, I overlooked the possibility for a tx burst triggered immediately by
>> executing an output action. This needs fixing, and I agree that a scheme which
>> is able to "suspend" an ongoing measurement at any time will be needed for
>> that. Let me have another look at the previous scheme to see if I can simplify
>> that. If not we can stick to the original solution.
>>
> 
> I have prepared a slightly more advanced version that allows arbitrary nesting of cycles measurements where the nested measurements are excluded from the outer measurement: Here's the outline. I am including this in the next version of my umbrella series. This version isn't tested yet:
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 4761d3b..e487828 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -516,6 +516,8 @@ struct tx_port {
>      struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
>  };
>  
> +struct cycle_timer;
> +
>  /* A set of properties for the current processing loop that is not directly
>   * associated with the pmd thread itself, but with the packets being
>   * processed or the short-term system configuration (for example, time).
> @@ -527,6 +529,7 @@ struct dp_netdev_pmd_thread_ctx {
>      uint64_t last_cycles;
>      /* RX queue from which last packet was received. */
>      struct dp_netdev_rxq *last_rxq;
> +    struct cycle_timer *cur_timer;
>  };
>  
>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
> @@ -632,6 +635,71 @@ struct dpif_netdev {
>      uint64_t last_port_seq;
>  };
>  
> +/* Support for accurate timing of PMD execution on TSC clock cycle level.
> + * These functions are intended to be invoked in the context of pmd threads. */
> +
> +/* Read the TSC cycle register and cache it in the pmd context. Any function
> + * not requiring clock cycle accuracy should read the cached value from the
> + * context. */
> +static inline uint64_t
> +cycles_counter(struct dp_netdev_pmd_thread *pmd)
> +{
> +#ifdef DPDK_NETDEV
> +    return pmd->ctx.last_cycles = rte_get_tsc_cycles();
> +#else
> +    return pmd->ctx.last_cycles = 0;
> +#endif
> +}
> +
> +/* A nestable timer for measuring execution time in TSC cycles.
> + *
> + * Usage:
> + * struct cycle_timer timer;
> + *
> + * cycle_timer_start(pmd, &timer);
> + * <Timed execution>
> + * uint64_t cycles = cycle_timer_stop(pmd, &timer);
> + *
> + * The caller must guarantee that a call to cycle_timer_start() is always
> + * paired with a call to cycle_stimer_stop().
> + *
> + * Is is possible to have nested cycles timers within the timed code. The
> + * execution time measured by the nested timers is excluded from the time
> + * measured by the embracing timer.
> + */
> +
> +struct cycle_timer {
> +    uint64_t *offset;
> +    struct cycle_timer *interrupted;
> +};
> +
> +static void
> +cycle_timer_start(struct dp_netdev_pmd_thread *pmd,
> +                  struct cycle_timer *timer)
> +{
> +    /* Suspend current timer, if any. */
> +    timer->interrupted = pmd->ctx.cur_timer;
> +    timer->offset = cycles_counter(pmd);
> +    pmd->ctx.cur_timer = timer;
> +}
> +
> +static uint64_t
> +cycle_timer_stop(struct dp_netdev_pmd_thread *pmd,
> +                 struct cycle_timer *timer)
> +{
> +    /* Assert that this is the current cycle timer. */
> +    ovs_assert(pmd->ctx.cur_timer == timer);
> +    uint64_t cycles = cycles_counter(pmd) - timer->offset;
> +    if (timer->interrupted) {
> +        /* Adjust the start offset by the used cycles. */
> +        timer->interrupted->offset += cycles;

I think, this will not work for nesting more than 2 timers, because
you will adjust only by the time of the one level deeper timer.
You probably need one more field in the timer structure to accumulate
sleep time while unwinding the timer's stack and not modify original
offsets like:

	uint64_t cycles = cycles_counter(pmd) - timer->offset;
	if (timer->interrupted) {
		timer->interrupted->interrupted_time += cycles;
	}
	pmd->ctx.cur_timer = timer->interrupted;
	return cycles - timer->interrupted_time;

> +    }
> +    /* Restore suspended timer, if any. */
> +    pmd->ctx.cur_timer = timer->interrupted;
> +    return cycles;
> +}
> +
>  static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
>                                struct dp_netdev_port **portp)
>      OVS_REQUIRES(dp->port_mutex);
> @@ -3267,16 +3335,6 @@ dp_netdev_actions_free(struct dp_netdev_actions *actions)
>      free(actions);
>  }
>  
> 
> 
> 
> 
>
Jan Scheurich Jan. 9, 2018, 1:48 p.m. | #5
> > I have prepared a slightly more advanced version that allows arbitrary nesting of cycles measurements where the nested measurements
> are excluded from the outer measurement: Here's the outline. I am including this in the next version of my umbrella series. This version
> isn't tested yet:
> >
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index 4761d3b..e487828 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -516,6 +516,8 @@ struct tx_port {
> >      struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
> >  };
> >
> > +struct cycle_timer;
> > +
> >  /* A set of properties for the current processing loop that is not directly
> >   * associated with the pmd thread itself, but with the packets being
> >   * processed or the short-term system configuration (for example, time).
> > @@ -527,6 +529,7 @@ struct dp_netdev_pmd_thread_ctx {
> >      uint64_t last_cycles;
> >      /* RX queue from which last packet was received. */
> >      struct dp_netdev_rxq *last_rxq;
> > +    struct cycle_timer *cur_timer;
> >  };
> >
> >  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
> > @@ -632,6 +635,71 @@ struct dpif_netdev {
> >      uint64_t last_port_seq;
> >  };
> >
> > +/* Support for accurate timing of PMD execution on TSC clock cycle level.
> > + * These functions are intended to be invoked in the context of pmd threads. */
> > +
> > +/* Read the TSC cycle register and cache it in the pmd context. Any function
> > + * not requiring clock cycle accuracy should read the cached value from the
> > + * context. */
> > +static inline uint64_t
> > +cycles_counter(struct dp_netdev_pmd_thread *pmd)
> > +{
> > +#ifdef DPDK_NETDEV
> > +    return pmd->ctx.last_cycles = rte_get_tsc_cycles();
> > +#else
> > +    return pmd->ctx.last_cycles = 0;
> > +#endif
> > +}
> > +
> > +/* A nestable timer for measuring execution time in TSC cycles.
> > + *
> > + * Usage:
> > + * struct cycle_timer timer;
> > + *
> > + * cycle_timer_start(pmd, &timer);
> > + * <Timed execution>
> > + * uint64_t cycles = cycle_timer_stop(pmd, &timer);
> > + *
> > + * The caller must guarantee that a call to cycle_timer_start() is always
> > + * paired with a call to cycle_stimer_stop().
> > + *
> > + * Is is possible to have nested cycles timers within the timed code. The
> > + * execution time measured by the nested timers is excluded from the time
> > + * measured by the embracing timer.
> > + */
> > +
> > +struct cycle_timer {
> > +    uint64_t *offset;
> > +    struct cycle_timer *interrupted;
> > +};
> > +
> > +static void
> > +cycle_timer_start(struct dp_netdev_pmd_thread *pmd,
> > +                  struct cycle_timer *timer)
> > +{
> > +    /* Suspend current timer, if any. */
> > +    timer->interrupted = pmd->ctx.cur_timer;
> > +    timer->offset = cycles_counter(pmd);
> > +    pmd->ctx.cur_timer = timer;
> > +}
> > +
> > +static uint64_t
> > +cycle_timer_stop(struct dp_netdev_pmd_thread *pmd,
> > +                 struct cycle_timer *timer)
> > +{
> > +    /* Assert that this is the current cycle timer. */
> > +    ovs_assert(pmd->ctx.cur_timer == timer);
> > +    uint64_t cycles = cycles_counter(pmd) - timer->offset;
> > +    if (timer->interrupted) {
> > +        /* Adjust the start offset by the used cycles. */
> > +        timer->interrupted->offset += cycles;
> 
> I think, this will not work for nesting more than 2 timers, because
> you will adjust only by the time of the one level deeper timer.
> You probably need one more field in the timer structure to accumulate
> sleep time while unwinding the timer's stack and not modify original
> offsets like:
> 
> 	uint64_t cycles = cycles_counter(pmd) - timer->offset;
> 	if (timer->interrupted) {
> 		timer->interrupted->interrupted_time += cycles;
> 	}
> 	pmd->ctx.cur_timer = timer->interrupted;
> 	return cycles - timer->interrupted_time;
> 

Good spot. I will fix this in my #2 patch.

Thanks, Jan

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index d16ba93..5d23128 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -522,11 +522,9 @@  struct dp_netdev_pmd_thread_ctx {
     /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
     long long now;
     /* Used to count cycles. See 'cycles_count_end()' */
-    unsigned long long last_cycles;
+    uint64_t last_cycles;
     /* RX queue from which last packet was received. */
     struct dp_netdev_rxq *last_rxq;
-    /* Indicates how should be treated last counted cycles. */
-    enum pmd_cycles_counter_type current_pmd_cycles_type;
 };
 
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
@@ -3246,69 +3244,16 @@  dp_netdev_actions_free(struct dp_netdev_actions *actions)
     free(actions);
 }
 
-static inline unsigned long long
-cycles_counter(void)
+static inline uint64_t
+cycles_counter(struct dp_netdev_pmd_thread *pmd)
 {
 #ifdef DPDK_NETDEV
-    return rte_get_tsc_cycles();
+    return pmd->ctx.last_cycles = rte_get_tsc_cycles();
 #else
-    return 0;
+    return pmd->ctx.last_cycles = 0;
 #endif
 }
 
-/* Fake mutex to make sure that the calls to cycles_count_* are balanced */
-extern struct ovs_mutex cycles_counter_fake_mutex;
-
-/* Start counting cycles.  Must be followed by 'cycles_count_end()'.
- * Counting starts from the idle type state.  */
-static inline void
-cycles_count_start(struct dp_netdev_pmd_thread *pmd)
-    OVS_ACQUIRES(&cycles_counter_fake_mutex)
-    OVS_NO_THREAD_SAFETY_ANALYSIS
-{
-    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_IDLE;
-    pmd->ctx.last_cycles = cycles_counter();
-}
-
-/* Stop counting cycles and add them to the counter of the current type. */
-static inline void
-cycles_count_end(struct dp_netdev_pmd_thread *pmd)
-    OVS_RELEASES(&cycles_counter_fake_mutex)
-    OVS_NO_THREAD_SAFETY_ANALYSIS
-{
-    unsigned long long interval = cycles_counter() - pmd->ctx.last_cycles;
-    enum pmd_cycles_counter_type type = pmd->ctx.current_pmd_cycles_type;
-
-    pmd_perf_update_counter(&pmd->perf_stats, type, interval);
-}
-
-/* Calculate the intermediate cycle result and add to the counter of
- * the current type */
-static inline void
-cycles_count_intermediate(struct dp_netdev_pmd_thread *pmd,
-                          struct dp_netdev_rxq **rxqs, int n_rxqs)
-    OVS_NO_THREAD_SAFETY_ANALYSIS
-{
-    unsigned long long new_cycles = cycles_counter();
-    unsigned long long interval = new_cycles - pmd->ctx.last_cycles;
-    enum pmd_cycles_counter_type type = pmd->ctx.current_pmd_cycles_type;
-    int i;
-
-    pmd->ctx.last_cycles = new_cycles;
-
-    pmd_perf_update_counter(&pmd->perf_stats, type, interval);
-    if (n_rxqs && (type == PMD_CYCLES_POLL_BUSY)) {
-        /* Add to the amount of current processing cycles. */
-        interval /= n_rxqs;
-        for (i = 0; i < n_rxqs; i++) {
-            if (rxqs[i]) {
-                non_atomic_ullong_add(&rxqs[i]->cycles[RXQ_CYCLES_PROC_CURR],
-                        interval);
-            }
-        }
-    }
-}
-
 static inline bool
 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
 {
@@ -3325,6 +3270,14 @@  dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
    atomic_store_relaxed(&rx->cycles[type], cycles);
 }
 
+static void
+dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
+                         enum rxq_cycles_counter_type type,
+                         unsigned long long cycles)
+{
+    non_atomic_ullong_add(&rx->cycles[type], cycles);
+}
+
 static uint64_t
 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
                          enum rxq_cycles_counter_type type)
@@ -3358,16 +3311,9 @@  dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
     int output_cnt;
     bool dynamic_txqs;
     uint32_t tx_flush_interval;
-    enum pmd_cycles_counter_type save_pmd_cycles_type;
 
-    /* In case we're in PMD_CYCLES_PROCESSING state we need to count
-     * cycles for rxq we're processing now. */
-    cycles_count_intermediate(pmd, &pmd->ctx.last_rxq, 1);
-
-    /* Save current cycles counting state to restore after accounting
-     * send cycles. */
-    save_pmd_cycles_type = pmd->ctx.current_pmd_cycles_type;
-    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_POLL_BUSY;
+    /* Measure duration of batch transmission. */
+    uint64_t cycles = cycles_counter(pmd);
 
     dynamic_txqs = p->port->dynamic_txqs;
     if (dynamic_txqs) {
@@ -3394,9 +3340,15 @@  dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
     pmd_perf_update_counter(&pmd->perf_stats,
                             PMD_STAT_SENT_BATCHES, 1);
 
-    /* Update send cycles for all the rx queues and restore previous state. */
-    cycles_count_intermediate(pmd, p->output_pkts_rxqs, output_cnt);
-    pmd->ctx.current_pmd_cycles_type = save_pmd_cycles_type;
+    /* Divide the batch tx cost evenly over the packets' rxqs. */
+    cycles = (cycles_counter(pmd) - cycles) / output_cnt;
+    struct dp_netdev_rxq **rxqs = p->output_pkts_rxqs;
+    for (int i = 0; i < output_cnt; i++) {
+        if (rxqs[i]) {
+            dp_netdev_rxq_add_cycles(rxqs[i], RXQ_CYCLES_PROC_CURR,
+                                     cycles);
+        }
+    }
     return output_cnt;
 }
 
@@ -3429,16 +3381,17 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
     struct pmd_perf_stats *s = &pmd->perf_stats;
     struct dp_packet_batch batch;
     int error;
-    int batch_cnt = 0, output_cnt = 0;
+    int batch_cnt = 0;
 
     dp_packet_batch_init(&batch);
-
-    cycles_count_intermediate(pmd, NULL, 0);
     pmd->ctx.last_rxq = rxq;
-    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_POLL_BUSY;
+
+    /* Measure duration for polling and processing rx burst. */
+    uint64_t cycles = cycles_counter(pmd);
     error = netdev_rxq_recv(rxq->rx, &batch);
 
     if (!error) {
+        /* At least one packet received. */
         *recirc_depth_get() = 0;
         pmd_thread_ctx_time_update(pmd);
         batch_cnt = batch.count;
@@ -3448,7 +3401,7 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
             histogram_add_sample(&s->pkts_per_batch, batch_cnt);
             /* Update the maximum Rx queue fill level. */
             uint32_t qfill = batch.qfill;
-            switch (netdev_dpdk_get_type(netdev_rxq_get_netdev(rx))) {
+            switch (netdev_dpdk_get_type(netdev_rxq_get_netdev(rxq->rx))) {
             case DPDK_DEV_VHOST:
                 if (qfill > s->current.max_vhost_qfill) {
                     s->current.max_vhost_qfill = qfill;
@@ -3461,11 +3414,13 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
         }
         /* Process packet batch. */
         dp_netdev_input(pmd, &batch, port_no);
-        cycles_count_intermediate(pmd, &rxq, 1);
-
-        output_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
+        /* Add processing cycles to rxq stats. */
+        cycles = cycles_counter(pmd) - cycles;
+        dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
+        /* Flush the send queues. */
+        dp_netdev_pmd_flush_output_packets(pmd, false);
         *flushed = true;
-        
+
     } else if (error != EAGAIN && error != EOPNOTSUPP) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
 
@@ -3473,9 +3428,7 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
                     netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
     }
 
-    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_POLL_IDLE;
     pmd->ctx.last_rxq = NULL;
-
     return batch_cnt;
 }
 
@@ -4105,7 +4058,6 @@  dpif_netdev_run(struct dpif *dpif)
     non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
     if (non_pmd) {
         ovs_mutex_lock(&dp->non_pmd_mutex);
-        cycles_count_start(non_pmd);
         HMAP_FOR_EACH (port, node, &dp->ports) {
             if (!netdev_is_pmd(port->netdev)) {
                 int i;
@@ -4126,7 +4078,6 @@  dpif_netdev_run(struct dpif *dpif)
             dp_netdev_pmd_flush_output_packets(non_pmd, false);
         }
 
-        cycles_count_end(non_pmd);
         dpif_netdev_xps_revalidate_pmd(non_pmd, false);
         ovs_mutex_unlock(&dp->non_pmd_mutex);
 
@@ -4305,7 +4256,6 @@  reload:
         lc = UINT_MAX;
     }
 
-    cycles_count_start(pmd);
     for (;;) {
         uint64_t iter_packets = 0;
         bool flushed = false;
@@ -4343,15 +4293,12 @@  reload:
             if (reload) {
                 break;
             }
-            cycles_count_intermediate(pmd, NULL, 0);
         }
 
-        pmd_perf_end_iteration(s, pmd->ctx.last_cycles, iter_packets,
+        pmd_perf_end_iteration(s, cycles_counter(pmd), iter_packets,
                                pmd_perf_metrics_enabled(pmd));
     }
 
-    cycles_count_end(pmd);
-
     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
     exiting = latch_is_set(&pmd->exit_latch);
     /* Signal here to make sure the pmd finishes
@@ -4791,7 +4738,6 @@  dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     cmap_init(&pmd->flow_table);
     cmap_init(&pmd->classifiers);
     pmd->ctx.last_rxq = NULL;
-    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_IDLE;
     pmd_thread_ctx_time_update(pmd);
     pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
     pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
@@ -5247,7 +5193,7 @@  handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
     struct match match;
     ovs_u128 ufid;
     int error;
-    uint64_t cycles = cycles_counter();
+    uint64_t cycles = cycles_counter(pmd);
 
     match.tun_md.valid = false;
     miniflow_expand(&key->mf, &match.flow);
@@ -5303,7 +5249,7 @@  handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
     }
     if (pmd_perf_metrics_enabled(pmd)) {
         /* Update upcall stats. */
-        cycles = cycles_counter() - cycles;
+        cycles = cycles_counter(pmd) - cycles;
         struct pmd_perf_stats *s = &pmd->perf_stats;
         s->current.upcalls++;
         s->current.upcall_cycles += cycles;