[ovs-dev,v10,2/5] dpif-netdev: Count cycles on per-rxq basis.

Message ID 1515755828-1848-3-git-send-email-i.maximets@samsung.com
State Superseded
Headers show
Series
  • Output packet batching (Time-based).
Related show

Commit Message

Ilya Maximets Jan. 12, 2018, 11:17 a.m.
Upcoming time-based output batching will allow to collect in a single
output batch packets from different RX queues. Lets keep the list of
RX queues for each output packet and collect cycles for them on send.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
---
 lib/dpif-netdev.c | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

Comments

Jan Scheurich Jan. 12, 2018, 3:52 p.m. | #1
Tested-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Jan Scheurich <jan.scheurich@ericsson.com>

One minor comment inline.

/Jan

> -----Original Message-----
> From: Ilya Maximets [mailto:i.maximets@samsung.com]
> Sent: Friday, 12 January, 2018 12:17
> To: ovs-dev@openvswitch.org
> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>; Antonio Fischetti
> <antonio.fischetti@intel.com>; Eelco Chaudron <echaudro@redhat.com>; Ciara Loftus <ciara.loftus@intel.com>; Kevin Traynor
> <ktraynor@redhat.com>; Jan Scheurich <jan.scheurich@ericsson.com>; Billy O'Mahony <billy.o.mahony@intel.com>; Ian Stokes
> <ian.stokes@intel.com>; Ilya Maximets <i.maximets@samsung.com>
> Subject: [PATCH v10 2/5] dpif-netdev: Count cycles on per-rxq basis.
> 
> Upcoming time-based output batching will allow to collect in a single
> output batch packets from different RX queues. Lets keep the list of
> RX queues for each output packet and collect cycles for them on send.
> 
> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
> ---
>  lib/dpif-netdev.c | 31 +++++++++++++++++++++++++++++--
>  1 file changed, 29 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index b35700d..6909a03 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -501,6 +501,7 @@ struct tx_port {
>      long long last_used;
>      struct hmap_node node;
>      struct dp_packet_batch output_pkts;
> +    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
>  };
> 
>  /* A set of properties for the current processing loop that is not directly
> @@ -510,6 +511,8 @@ struct tx_port {
>  struct dp_netdev_pmd_thread_ctx {
>      /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
>      long long now;
> +    /* RX queue from which last packet was received. */
> +    struct dp_netdev_rxq *last_rxq;
>  };
> 
>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
> @@ -3155,9 +3158,14 @@ static void
>  dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>                                     struct tx_port *p)
>  {
> +    int i;
>      int tx_qid;
>      int output_cnt;
>      bool dynamic_txqs;
> +    struct cycle_timer timer;
> +    uint64_t cycles;
> +
> +    cycle_timer_start(&pmd->perf_stats, &timer);
> 
>      dynamic_txqs = p->port->dynamic_txqs;
>      if (dynamic_txqs) {
> @@ -3167,12 +3175,22 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>      }
> 
>      output_cnt = dp_packet_batch_size(&p->output_pkts);
> +    ovs_assert(output_cnt > 0);
> 
>      netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
>      dp_packet_batch_init(&p->output_pkts);
> 
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
> +
> +    /* Update send cycles for all the rx queues evenly. */
> +    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
> +    for (i = 0; i < output_cnt; i++) {
> +        if (p->output_pkts_rxqs[i]) {
> +            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
> +                                     RXQ_CYCLES_PROC_CURR, cycles);
> +        }
> +    }
>  }
> 
>  static void
> @@ -3196,10 +3214,14 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>      struct cycle_timer timer;
>      int error;
>      int batch_cnt = 0;
> +    uint64_t cycles;
> 
>      /* Measure duration for polling and processing rx burst. */
>      cycle_timer_start(&pmd->perf_stats, &timer);
> +
> +    pmd->ctx.last_rxq = rxq;
>      dp_packet_batch_init(&batch);
> +
>      error = netdev_rxq_recv(rxq->rx, &batch);
>      if (!error) {
>          /* At least one packet received. */
> @@ -3208,12 +3230,12 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> 
>          batch_cnt = batch.count;
>          dp_netdev_input(pmd, &batch, port_no);
> -        dp_netdev_pmd_flush_output_packets(pmd);
> 
>          /* Assign processing cycles to rx queue. */
> -        uint64_t cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
> +        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
>          dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
> 
> +        dp_netdev_pmd_flush_output_packets(pmd);
>      } else {
>          /* Discard cycles. */
>          cycle_timer_stop(&pmd->perf_stats, &timer);
> @@ -3225,6 +3247,8 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>          }
>      }
> 
> +    pmd->ctx.last_rxq = NULL;
> +
>      return batch_cnt;
>  }
> 
> @@ -4512,6 +4536,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
>      ovs_mutex_init(&pmd->port_mutex);
>      cmap_init(&pmd->flow_table);
>      cmap_init(&pmd->classifiers);
> +    pmd->ctx.last_rxq = NULL;
>      pmd_thread_ctx_time_update(pmd);
>      pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
>      pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
> @@ -5389,6 +5414,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
>                  dp_netdev_pmd_flush_output_on_port(pmd, p);
>              }
>              DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
> +                p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
> +                                                             pmd->ctx.last_rxq;

This works but seems unnecessarily cryptic. Can't you increment a simple int variable (i) in the loop and use that to index output_pkts_rxqs[i]?

>                  dp_packet_batch_add(&p->output_pkts, packet);
>              }
>              return;
> --
> 2.7.4
Ilya Maximets Jan. 12, 2018, 4:13 p.m. | #2
On 12.01.2018 18:52, Jan Scheurich wrote:
> Tested-by: Jan Scheurich <jan.scheurich@ericsson.com>
> Acked-by: Jan Scheurich <jan.scheurich@ericsson.com>
> 
> One minor comment inline.
> 
> /Jan
> 
>> -----Original Message-----
>> From: Ilya Maximets [mailto:i.maximets@samsung.com]
>> Sent: Friday, 12 January, 2018 12:17
>> To: ovs-dev@openvswitch.org
>> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>; Antonio Fischetti
>> <antonio.fischetti@intel.com>; Eelco Chaudron <echaudro@redhat.com>; Ciara Loftus <ciara.loftus@intel.com>; Kevin Traynor
>> <ktraynor@redhat.com>; Jan Scheurich <jan.scheurich@ericsson.com>; Billy O'Mahony <billy.o.mahony@intel.com>; Ian Stokes
>> <ian.stokes@intel.com>; Ilya Maximets <i.maximets@samsung.com>
>> Subject: [PATCH v10 2/5] dpif-netdev: Count cycles on per-rxq basis.
>>
>> Upcoming time-based output batching will allow to collect in a single
>> output batch packets from different RX queues. Lets keep the list of
>> RX queues for each output packet and collect cycles for them on send.
>>
>> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
>> ---
>>  lib/dpif-netdev.c | 31 +++++++++++++++++++++++++++++--
>>  1 file changed, 29 insertions(+), 2 deletions(-)
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index b35700d..6909a03 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -501,6 +501,7 @@ struct tx_port {
>>      long long last_used;
>>      struct hmap_node node;
>>      struct dp_packet_batch output_pkts;
>> +    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
>>  };
>>
>>  /* A set of properties for the current processing loop that is not directly
>> @@ -510,6 +511,8 @@ struct tx_port {
>>  struct dp_netdev_pmd_thread_ctx {
>>      /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
>>      long long now;
>> +    /* RX queue from which last packet was received. */
>> +    struct dp_netdev_rxq *last_rxq;
>>  };
>>
>>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
>> @@ -3155,9 +3158,14 @@ static void
>>  dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>>                                     struct tx_port *p)
>>  {
>> +    int i;
>>      int tx_qid;
>>      int output_cnt;
>>      bool dynamic_txqs;
>> +    struct cycle_timer timer;
>> +    uint64_t cycles;
>> +
>> +    cycle_timer_start(&pmd->perf_stats, &timer);
>>
>>      dynamic_txqs = p->port->dynamic_txqs;
>>      if (dynamic_txqs) {
>> @@ -3167,12 +3175,22 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>>      }
>>
>>      output_cnt = dp_packet_batch_size(&p->output_pkts);
>> +    ovs_assert(output_cnt > 0);
>>
>>      netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
>>      dp_packet_batch_init(&p->output_pkts);
>>
>>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
>>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
>> +
>> +    /* Update send cycles for all the rx queues evenly. */
>> +    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
>> +    for (i = 0; i < output_cnt; i++) {
>> +        if (p->output_pkts_rxqs[i]) {
>> +            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
>> +                                     RXQ_CYCLES_PROC_CURR, cycles);
>> +        }
>> +    }
>>  }
>>
>>  static void
>> @@ -3196,10 +3214,14 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>>      struct cycle_timer timer;
>>      int error;
>>      int batch_cnt = 0;
>> +    uint64_t cycles;
>>
>>      /* Measure duration for polling and processing rx burst. */
>>      cycle_timer_start(&pmd->perf_stats, &timer);
>> +
>> +    pmd->ctx.last_rxq = rxq;
>>      dp_packet_batch_init(&batch);
>> +
>>      error = netdev_rxq_recv(rxq->rx, &batch);
>>      if (!error) {
>>          /* At least one packet received. */
>> @@ -3208,12 +3230,12 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>>
>>          batch_cnt = batch.count;
>>          dp_netdev_input(pmd, &batch, port_no);
>> -        dp_netdev_pmd_flush_output_packets(pmd);
>>
>>          /* Assign processing cycles to rx queue. */
>> -        uint64_t cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
>> +        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
>>          dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
>>
>> +        dp_netdev_pmd_flush_output_packets(pmd);
>>      } else {
>>          /* Discard cycles. */
>>          cycle_timer_stop(&pmd->perf_stats, &timer);
>> @@ -3225,6 +3247,8 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>>          }
>>      }
>>
>> +    pmd->ctx.last_rxq = NULL;
>> +
>>      return batch_cnt;
>>  }
>>
>> @@ -4512,6 +4536,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
>>      ovs_mutex_init(&pmd->port_mutex);
>>      cmap_init(&pmd->flow_table);
>>      cmap_init(&pmd->classifiers);
>> +    pmd->ctx.last_rxq = NULL;
>>      pmd_thread_ctx_time_update(pmd);
>>      pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
>>      pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
>> @@ -5389,6 +5414,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
>>                  dp_netdev_pmd_flush_output_on_port(pmd, p);
>>              }
>>              DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
>> +                p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
>> +                                                             pmd->ctx.last_rxq;
> 
> This works but seems unnecessarily cryptic. Can't you increment a simple int variable (i) in the loop and use that to index output_pkts_rxqs[i]?

I'm using 'dp_packet_batch_size()' as index to make clear dependency between
filling of the batch itself and the rxqs array.
If you'll not insist I'd like to keep it as is.

> 
>>                  dp_packet_batch_add(&p->output_pkts, packet);
>>              }
>>              return;
>> --
>> 2.7.4
> 
> 
> 
>
Stokes, Ian Jan. 13, 2018, 12:16 p.m. | #3
> -----Original Message-----
> From: Ilya Maximets [mailto:i.maximets@samsung.com]
> Sent: Friday, January 12, 2018 11:17 AM
> To: ovs-dev@openvswitch.org
> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Bodireddy, Bhanuprakash
> <bhanuprakash.bodireddy@intel.com>; Fischetti, Antonio
> <antonio.fischetti@intel.com>; Eelco Chaudron <echaudro@redhat.com>;
> Loftus, Ciara <ciara.loftus@intel.com>; Kevin Traynor
> <ktraynor@redhat.com>; Jan Scheurich <jan.scheurich@ericsson.com>; O
> Mahony, Billy <billy.o.mahony@intel.com>; Stokes, Ian
> <ian.stokes@intel.com>; Ilya Maximets <i.maximets@samsung.com>
> Subject: [PATCH v10 2/5] dpif-netdev: Count cycles on per-rxq basis.
> 
> Upcoming time-based output batching will allow to collect in a single
> output batch packets from different RX queues. Lets keep the list of RX
> queues for each output packet and collect cycles for them on send.
> 
> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
> ---
>  lib/dpif-netdev.c | 31 +++++++++++++++++++++++++++++--
>  1 file changed, 29 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index b35700d..6909a03
> 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -501,6 +501,7 @@ struct tx_port {
>      long long last_used;
>      struct hmap_node node;
>      struct dp_packet_batch output_pkts;
> +    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
>  };
> 
>  /* A set of properties for the current processing loop that is not
> directly @@ -510,6 +511,8 @@ struct tx_port {  struct
> dp_netdev_pmd_thread_ctx {
>      /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
>      long long now;
> +    /* RX queue from which last packet was received. */
> +    struct dp_netdev_rxq *last_rxq;
>  };
> 
>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to
> eliminate @@ -3155,9 +3158,14 @@ static void
> dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>                                     struct tx_port *p)  {
> +    int i;
>      int tx_qid;
>      int output_cnt;
>      bool dynamic_txqs;
> +    struct cycle_timer timer;
> +    uint64_t cycles;
> +
> +    cycle_timer_start(&pmd->perf_stats, &timer);
> 
>      dynamic_txqs = p->port->dynamic_txqs;
>      if (dynamic_txqs) {
> @@ -3167,12 +3175,22 @@ dp_netdev_pmd_flush_output_on_port(struct
> dp_netdev_pmd_thread *pmd,
>      }
> 
>      output_cnt = dp_packet_batch_size(&p->output_pkts);
> +    ovs_assert(output_cnt > 0);
> 
>      netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
>      dp_packet_batch_init(&p->output_pkts);
> 
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS,
> output_cnt);
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
> +
> +    /* Update send cycles for all the rx queues evenly. */

Just a query, is it right that this is distributed evenly?
If there are more packets from one rx queue than another will it make a difference or will the cycles spent sending that batch be the same as a batch of a small or larger size?

> +    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
> +    for (i = 0; i < output_cnt; i++) {
> +        if (p->output_pkts_rxqs[i]) {
> +            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
> +                                     RXQ_CYCLES_PROC_CURR, cycles);
> +        }
> +    }
>  }
> 
>  static void
> @@ -3196,10 +3214,14 @@ dp_netdev_process_rxq_port(struct
> dp_netdev_pmd_thread *pmd,
>      struct cycle_timer timer;
>      int error;
>      int batch_cnt = 0;
> +    uint64_t cycles;
> 
>      /* Measure duration for polling and processing rx burst. */
>      cycle_timer_start(&pmd->perf_stats, &timer);
> +
> +    pmd->ctx.last_rxq = rxq;
>      dp_packet_batch_init(&batch);
> +
>      error = netdev_rxq_recv(rxq->rx, &batch);
>      if (!error) {
>          /* At least one packet received. */ @@ -3208,12 +3230,12 @@
> dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> 
>          batch_cnt = batch.count;
>          dp_netdev_input(pmd, &batch, port_no);
> -        dp_netdev_pmd_flush_output_packets(pmd);
> 
>          /* Assign processing cycles to rx queue. */
> -        uint64_t cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
> +        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
>          dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
> 
> +        dp_netdev_pmd_flush_output_packets(pmd);
>      } else {
>          /* Discard cycles. */
>          cycle_timer_stop(&pmd->perf_stats, &timer); @@ -3225,6 +3247,8 @@
> dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>          }
>      }
> 
> +    pmd->ctx.last_rxq = NULL;
> +
>      return batch_cnt;
>  }
> 
> @@ -4512,6 +4536,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread
> *pmd, struct dp_netdev *dp,
>      ovs_mutex_init(&pmd->port_mutex);
>      cmap_init(&pmd->flow_table);
>      cmap_init(&pmd->classifiers);
> +    pmd->ctx.last_rxq = NULL;
>      pmd_thread_ctx_time_update(pmd);
>      pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
>      pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN; @@ -
> 5389,6 +5414,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> *packets_,
>                  dp_netdev_pmd_flush_output_on_port(pmd, p);
>              }
>              DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
> +                p->output_pkts_rxqs[dp_packet_batch_size(&p-
> >output_pkts)] =
> +
> + pmd->ctx.last_rxq;
>                  dp_packet_batch_add(&p->output_pkts, packet);
>              }
>              return;
> --
> 2.7.4
Kevin Traynor Jan. 13, 2018, 2:42 p.m. | #4
On 01/13/2018 12:16 PM, Stokes, Ian wrote:
> 
> 
>> -----Original Message-----
>> From: Ilya Maximets [mailto:i.maximets@samsung.com]
>> Sent: Friday, January 12, 2018 11:17 AM
>> To: ovs-dev@openvswitch.org
>> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Bodireddy, Bhanuprakash
>> <bhanuprakash.bodireddy@intel.com>; Fischetti, Antonio
>> <antonio.fischetti@intel.com>; Eelco Chaudron <echaudro@redhat.com>;
>> Loftus, Ciara <ciara.loftus@intel.com>; Kevin Traynor
>> <ktraynor@redhat.com>; Jan Scheurich <jan.scheurich@ericsson.com>; O
>> Mahony, Billy <billy.o.mahony@intel.com>; Stokes, Ian
>> <ian.stokes@intel.com>; Ilya Maximets <i.maximets@samsung.com>
>> Subject: [PATCH v10 2/5] dpif-netdev: Count cycles on per-rxq basis.
>>
>> Upcoming time-based output batching will allow to collect in a single
>> output batch packets from different RX queues. Lets keep the list of RX
>> queues for each output packet and collect cycles for them on send.
>>
>> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
>> ---
>>  lib/dpif-netdev.c | 31 +++++++++++++++++++++++++++++--
>>  1 file changed, 29 insertions(+), 2 deletions(-)
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index b35700d..6909a03
>> 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -501,6 +501,7 @@ struct tx_port {
>>      long long last_used;
>>      struct hmap_node node;
>>      struct dp_packet_batch output_pkts;
>> +    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
>>  };
>>
>>  /* A set of properties for the current processing loop that is not
>> directly @@ -510,6 +511,8 @@ struct tx_port {  struct
>> dp_netdev_pmd_thread_ctx {
>>      /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
>>      long long now;
>> +    /* RX queue from which last packet was received. */
>> +    struct dp_netdev_rxq *last_rxq;
>>  };
>>
>>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to
>> eliminate @@ -3155,9 +3158,14 @@ static void
>> dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>>                                     struct tx_port *p)  {
>> +    int i;
>>      int tx_qid;
>>      int output_cnt;
>>      bool dynamic_txqs;
>> +    struct cycle_timer timer;
>> +    uint64_t cycles;
>> +
>> +    cycle_timer_start(&pmd->perf_stats, &timer);
>>
>>      dynamic_txqs = p->port->dynamic_txqs;
>>      if (dynamic_txqs) {
>> @@ -3167,12 +3175,22 @@ dp_netdev_pmd_flush_output_on_port(struct
>> dp_netdev_pmd_thread *pmd,
>>      }
>>
>>      output_cnt = dp_packet_batch_size(&p->output_pkts);
>> +    ovs_assert(output_cnt > 0);
>>
>>      netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
>>      dp_packet_batch_init(&p->output_pkts);
>>
>>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS,
>> output_cnt);
>>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
>> +
>> +    /* Update send cycles for all the rx queues evenly. */
> 
> Just a query, is it right that this is distributed evenly?
> If there are more packets from one rx queue than another will it make a difference or will the cycles spent sending that batch be the same as a batch of a small or larger size?
> 

Maybe the "evenly" comment is a misleading? The send cycles for each rxq
is updated proportionally for how many packets that rxq had in the
batch. I think this is about the best that can be done.

>> +    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
>> +    for (i = 0; i < output_cnt; i++) {
>> +        if (p->output_pkts_rxqs[i]) {
>> +            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
>> +                                     RXQ_CYCLES_PROC_CURR, cycles);
>> +        }
>> +    }
>>  }
>>
>>  static void
>> @@ -3196,10 +3214,14 @@ dp_netdev_process_rxq_port(struct
>> dp_netdev_pmd_thread *pmd,
>>      struct cycle_timer timer;
>>      int error;
>>      int batch_cnt = 0;
>> +    uint64_t cycles;
>>
>>      /* Measure duration for polling and processing rx burst. */
>>      cycle_timer_start(&pmd->perf_stats, &timer);
>> +
>> +    pmd->ctx.last_rxq = rxq;
>>      dp_packet_batch_init(&batch);
>> +
>>      error = netdev_rxq_recv(rxq->rx, &batch);
>>      if (!error) {
>>          /* At least one packet received. */ @@ -3208,12 +3230,12 @@
>> dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>>
>>          batch_cnt = batch.count;
>>          dp_netdev_input(pmd, &batch, port_no);
>> -        dp_netdev_pmd_flush_output_packets(pmd);
>>
>>          /* Assign processing cycles to rx queue. */
>> -        uint64_t cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
>> +        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
>>          dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
>>
>> +        dp_netdev_pmd_flush_output_packets(pmd);
>>      } else {
>>          /* Discard cycles. */
>>          cycle_timer_stop(&pmd->perf_stats, &timer); @@ -3225,6 +3247,8 @@
>> dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>>          }
>>      }
>>
>> +    pmd->ctx.last_rxq = NULL;
>> +
>>      return batch_cnt;
>>  }
>>
>> @@ -4512,6 +4536,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread
>> *pmd, struct dp_netdev *dp,
>>      ovs_mutex_init(&pmd->port_mutex);
>>      cmap_init(&pmd->flow_table);
>>      cmap_init(&pmd->classifiers);
>> +    pmd->ctx.last_rxq = NULL;
>>      pmd_thread_ctx_time_update(pmd);
>>      pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
>>      pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN; @@ -
>> 5389,6 +5414,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
>> *packets_,
>>                  dp_netdev_pmd_flush_output_on_port(pmd, p);
>>              }
>>              DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
>> +                p->output_pkts_rxqs[dp_packet_batch_size(&p-
>>> output_pkts)] =
>> +
>> + pmd->ctx.last_rxq;
>>                  dp_packet_batch_add(&p->output_pkts, packet);
>>              }
>>              return;
>> --
>> 2.7.4
>
Ilya Maximets Jan. 15, 2018, 7:11 a.m. | #5
On 13.01.2018 17:42, Kevin Traynor wrote:
> On 01/13/2018 12:16 PM, Stokes, Ian wrote:
>>
>>
>>> -----Original Message-----
>>> From: Ilya Maximets [mailto:i.maximets@samsung.com]
>>> Sent: Friday, January 12, 2018 11:17 AM
>>> To: ovs-dev@openvswitch.org
>>> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Bodireddy, Bhanuprakash
>>> <bhanuprakash.bodireddy@intel.com>; Fischetti, Antonio
>>> <antonio.fischetti@intel.com>; Eelco Chaudron <echaudro@redhat.com>;
>>> Loftus, Ciara <ciara.loftus@intel.com>; Kevin Traynor
>>> <ktraynor@redhat.com>; Jan Scheurich <jan.scheurich@ericsson.com>; O
>>> Mahony, Billy <billy.o.mahony@intel.com>; Stokes, Ian
>>> <ian.stokes@intel.com>; Ilya Maximets <i.maximets@samsung.com>
>>> Subject: [PATCH v10 2/5] dpif-netdev: Count cycles on per-rxq basis.
>>>
>>> Upcoming time-based output batching will allow to collect in a single
>>> output batch packets from different RX queues. Lets keep the list of RX
>>> queues for each output packet and collect cycles for them on send.
>>>
>>> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
>>> ---
>>>  lib/dpif-netdev.c | 31 +++++++++++++++++++++++++++++--
>>>  1 file changed, 29 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index b35700d..6909a03
>>> 100644
>>> --- a/lib/dpif-netdev.c
>>> +++ b/lib/dpif-netdev.c
>>> @@ -501,6 +501,7 @@ struct tx_port {
>>>      long long last_used;
>>>      struct hmap_node node;
>>>      struct dp_packet_batch output_pkts;
>>> +    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
>>>  };
>>>
>>>  /* A set of properties for the current processing loop that is not
>>> directly @@ -510,6 +511,8 @@ struct tx_port {  struct
>>> dp_netdev_pmd_thread_ctx {
>>>      /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
>>>      long long now;
>>> +    /* RX queue from which last packet was received. */
>>> +    struct dp_netdev_rxq *last_rxq;
>>>  };
>>>
>>>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to
>>> eliminate @@ -3155,9 +3158,14 @@ static void
>>> dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>>>                                     struct tx_port *p)  {
>>> +    int i;
>>>      int tx_qid;
>>>      int output_cnt;
>>>      bool dynamic_txqs;
>>> +    struct cycle_timer timer;
>>> +    uint64_t cycles;
>>> +
>>> +    cycle_timer_start(&pmd->perf_stats, &timer);
>>>
>>>      dynamic_txqs = p->port->dynamic_txqs;
>>>      if (dynamic_txqs) {
>>> @@ -3167,12 +3175,22 @@ dp_netdev_pmd_flush_output_on_port(struct
>>> dp_netdev_pmd_thread *pmd,
>>>      }
>>>
>>>      output_cnt = dp_packet_batch_size(&p->output_pkts);
>>> +    ovs_assert(output_cnt > 0);
>>>
>>>      netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
>>>      dp_packet_batch_init(&p->output_pkts);
>>>
>>>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS,
>>> output_cnt);
>>>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
>>> +
>>> +    /* Update send cycles for all the rx queues evenly. */
>>
>> Just a query, is it right that this is distributed evenly?
>> If there are more packets from one rx queue than another will it make a difference or will the cycles spent sending that batch be the same as a batch of a small or larger size?
>>
> 
> Maybe the "evenly" comment is a misleading? The send cycles for each rxq
> is updated proportionally for how many packets that rxq had in the
> batch. I think this is about the best that can be done.
> 

Yes, it means what Kevin said.
I'm not sure about this comment. Should I change it?
I'm also not sure if we need to describe distribution algorithm in comments.
It's quiet obvious for me, but I, as an author, can not be objective.

How about something neutral like:
/* Distribute the send cycles between the rx queues. */ ?


>>> +    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
>>> +    for (i = 0; i < output_cnt; i++) {
>>> +        if (p->output_pkts_rxqs[i]) {
>>> +            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
>>> +                                     RXQ_CYCLES_PROC_CURR, cycles);
>>> +        }
>>> +    }
>>>  }
>>>
>>>  static void
>>> @@ -3196,10 +3214,14 @@ dp_netdev_process_rxq_port(struct
>>> dp_netdev_pmd_thread *pmd,
>>>      struct cycle_timer timer;
>>>      int error;
>>>      int batch_cnt = 0;
>>> +    uint64_t cycles;
>>>
>>>      /* Measure duration for polling and processing rx burst. */
>>>      cycle_timer_start(&pmd->perf_stats, &timer);
>>> +
>>> +    pmd->ctx.last_rxq = rxq;
>>>      dp_packet_batch_init(&batch);
>>> +
>>>      error = netdev_rxq_recv(rxq->rx, &batch);
>>>      if (!error) {
>>>          /* At least one packet received. */ @@ -3208,12 +3230,12 @@
>>> dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>>>
>>>          batch_cnt = batch.count;
>>>          dp_netdev_input(pmd, &batch, port_no);
>>> -        dp_netdev_pmd_flush_output_packets(pmd);
>>>
>>>          /* Assign processing cycles to rx queue. */
>>> -        uint64_t cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
>>> +        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
>>>          dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
>>>
>>> +        dp_netdev_pmd_flush_output_packets(pmd);
>>>      } else {
>>>          /* Discard cycles. */
>>>          cycle_timer_stop(&pmd->perf_stats, &timer); @@ -3225,6 +3247,8 @@
>>> dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>>>          }
>>>      }
>>>
>>> +    pmd->ctx.last_rxq = NULL;
>>> +
>>>      return batch_cnt;
>>>  }
>>>
>>> @@ -4512,6 +4536,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread
>>> *pmd, struct dp_netdev *dp,
>>>      ovs_mutex_init(&pmd->port_mutex);
>>>      cmap_init(&pmd->flow_table);
>>>      cmap_init(&pmd->classifiers);
>>> +    pmd->ctx.last_rxq = NULL;
>>>      pmd_thread_ctx_time_update(pmd);
>>>      pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
>>>      pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN; @@ -
>>> 5389,6 +5414,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
>>> *packets_,
>>>                  dp_netdev_pmd_flush_output_on_port(pmd, p);
>>>              }
>>>              DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
>>> +                p->output_pkts_rxqs[dp_packet_batch_size(&p-
>>>> output_pkts)] =
>>> +
>>> + pmd->ctx.last_rxq;
>>>                  dp_packet_batch_add(&p->output_pkts, packet);
>>>              }
>>>              return;
>>> --
>>> 2.7.4
>>
> 
> 
> 
>
Jan Scheurich Jan. 15, 2018, 8:08 a.m. | #6
> >>> +    /* Update send cycles for all the rx queues evenly. */
> >>
> >> Just a query, is it right that this is distributed evenly?
> >> If there are more packets from one rx queue than another will it make a difference or will the cycles spent sending that batch be the
> same as a batch of a small or larger size?
> >>
> >
> > Maybe the "evenly" comment is a misleading? The send cycles for each rxq
> > is updated proportionally for how many packets that rxq had in the
> > batch. I think this is about the best that can be done.
> >
> 
> Yes, it means what Kevin said.
> I'm not sure about this comment. Should I change it?
> I'm also not sure if we need to describe distribution algorithm in comments.
> It's quiet obvious for me, but I, as an author, can not be objective.
> 
> How about something neutral like:
> /* Distribute the send cycles between the rx queues. */ ?

How about this:
/* Distribute send cycles evenly among transmitted packets and assign to their respective rx queues. */

This approximation is actually not too bad as all packets in the batch went to the same tx queue. The only error we make is to ignore the size of the packets (which may still be substantial, though, given that the packet copying cost for large packets is significant in real-world scenarios with lots of L3 cache misses and especially across the QPI bus).

BR, Jan
Stokes, Ian Jan. 15, 2018, 9:27 a.m. | #7
> > >>> +    /* Update send cycles for all the rx queues evenly. */
> > >>
> > >> Just a query, is it right that this is distributed evenly?
> > >> If there are more packets from one rx queue than another will it
> > >> make a difference or will the cycles spent sending that batch be
> > >> the
> > same as a batch of a small or larger size?
> > >>
> > >
> > > Maybe the "evenly" comment is a misleading? The send cycles for each
> > > rxq is updated proportionally for how many packets that rxq had in
> > > the batch. I think this is about the best that can be done.
> > >
> >
> > Yes, it means what Kevin said.
> > I'm not sure about this comment. Should I change it?
> > I'm also not sure if we need to describe distribution algorithm in
> comments.
> > It's quiet obvious for me, but I, as an author, can not be objective.
> >
> > How about something neutral like:
> > /* Distribute the send cycles between the rx queues. */ ?
> 
> How about this:
> /* Distribute send cycles evenly among transmitted packets and assign to
> their respective rx queues. */
> 

I think above looks ok to me, I won't block on this at this stage.

Thanks
Ian

> This approximation is actually not too bad as all packets in the batch
> went to the same tx queue. The only error we make is to ignore the size of
> the packets (which may still be substantial, though, given that the packet
> copying cost for large packets is significant in real-world scenarios with
> lots of L3 cache misses and especially across the QPI bus).
> 
> BR, Jan
Ilya Maximets Jan. 15, 2018, 10:22 a.m. | #8
On 15.01.2018 12:27, Stokes, Ian wrote:
>>>>>> +    /* Update send cycles for all the rx queues evenly. */
>>>>>
>>>>> Just a query, is it right that this is distributed evenly?
>>>>> If there are more packets from one rx queue than another will it
>>>>> make a difference or will the cycles spent sending that batch be
>>>>> the
>>> same as a batch of a small or larger size?
>>>>>
>>>>
>>>> Maybe the "evenly" comment is a misleading? The send cycles for each
>>>> rxq is updated proportionally for how many packets that rxq had in
>>>> the batch. I think this is about the best that can be done.
>>>>
>>>
>>> Yes, it means what Kevin said.
>>> I'm not sure about this comment. Should I change it?
>>> I'm also not sure if we need to describe distribution algorithm in
>> comments.
>>> It's quiet obvious for me, but I, as an author, can not be objective.
>>>
>>> How about something neutral like:
>>> /* Distribute the send cycles between the rx queues. */ ?
>>
>> How about this:
>> /* Distribute send cycles evenly among transmitted packets and assign to
>> their respective rx queues. */
>>
> 
> I think above looks ok to me, I won't block on this at this stage.
> 

OK. I've sent v11 with changed comment.

> Thanks
> Ian
> 
>> This approximation is actually not too bad as all packets in the batch
>> went to the same tx queue. The only error we make is to ignore the size of
>> the packets (which may still be substantial, though, given that the packet
>> copying cost for large packets is significant in real-world scenarios with
>> lots of L3 cache misses and especially across the QPI bus).
>>
>> BR, Jan

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index b35700d..6909a03 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -501,6 +501,7 @@  struct tx_port {
     long long last_used;
     struct hmap_node node;
     struct dp_packet_batch output_pkts;
+    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
 };
 
 /* A set of properties for the current processing loop that is not directly
@@ -510,6 +511,8 @@  struct tx_port {
 struct dp_netdev_pmd_thread_ctx {
     /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
     long long now;
+    /* RX queue from which last packet was received. */
+    struct dp_netdev_rxq *last_rxq;
 };
 
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
@@ -3155,9 +3158,14 @@  static void
 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
                                    struct tx_port *p)
 {
+    int i;
     int tx_qid;
     int output_cnt;
     bool dynamic_txqs;
+    struct cycle_timer timer;
+    uint64_t cycles;
+
+    cycle_timer_start(&pmd->perf_stats, &timer);
 
     dynamic_txqs = p->port->dynamic_txqs;
     if (dynamic_txqs) {
@@ -3167,12 +3175,22 @@  dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
     }
 
     output_cnt = dp_packet_batch_size(&p->output_pkts);
+    ovs_assert(output_cnt > 0);
 
     netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
     dp_packet_batch_init(&p->output_pkts);
 
     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
+
+    /* Update send cycles for all the rx queues evenly. */
+    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
+    for (i = 0; i < output_cnt; i++) {
+        if (p->output_pkts_rxqs[i]) {
+            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
+                                     RXQ_CYCLES_PROC_CURR, cycles);
+        }
+    }
 }
 
 static void
@@ -3196,10 +3214,14 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
     struct cycle_timer timer;
     int error;
     int batch_cnt = 0;
+    uint64_t cycles;
 
     /* Measure duration for polling and processing rx burst. */
     cycle_timer_start(&pmd->perf_stats, &timer);
+
+    pmd->ctx.last_rxq = rxq;
     dp_packet_batch_init(&batch);
+
     error = netdev_rxq_recv(rxq->rx, &batch);
     if (!error) {
         /* At least one packet received. */
@@ -3208,12 +3230,12 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
 
         batch_cnt = batch.count;
         dp_netdev_input(pmd, &batch, port_no);
-        dp_netdev_pmd_flush_output_packets(pmd);
 
         /* Assign processing cycles to rx queue. */
-        uint64_t cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
+        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
         dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
 
+        dp_netdev_pmd_flush_output_packets(pmd);
     } else {
         /* Discard cycles. */
         cycle_timer_stop(&pmd->perf_stats, &timer);
@@ -3225,6 +3247,8 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
         }
     }
 
+    pmd->ctx.last_rxq = NULL;
+
     return batch_cnt;
 }
 
@@ -4512,6 +4536,7 @@  dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     ovs_mutex_init(&pmd->port_mutex);
     cmap_init(&pmd->flow_table);
     cmap_init(&pmd->classifiers);
+    pmd->ctx.last_rxq = NULL;
     pmd_thread_ctx_time_update(pmd);
     pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
     pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
@@ -5389,6 +5414,8 @@  dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
                 dp_netdev_pmd_flush_output_on_port(pmd, p);
             }
             DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
+                p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
+                                                             pmd->ctx.last_rxq;
                 dp_packet_batch_add(&p->output_pkts, packet);
             }
             return;