diff mbox series

[ovs-dev,2/2] dpif-netdev: Introduce netdev array cache

Message ID 20210707150546.6546-3-elibr@nvidia.com
State Changes Requested
Headers show
Series [ovs-dev,1/2] dpif-netdev: Do not execute packet recovery without experimental support | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot fail github build: failed

Commit Message

Eli Britstein July 7, 2021, 3:05 p.m. UTC
Port numbers are usually small. Maintain an array of netdev handles indexed
by port numbers. It accelerates looking them up for
netdev_hw_miss_packet_recover().

Reported-by: Cian Ferriter <cian.ferriter@intel.com>
Signed-off-by: Eli Britstein <elibr@nvidia.com>
Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
---
 lib/dpif-netdev.c | 41 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

Comments

Gaetan Rivet July 7, 2021, 4:35 p.m. UTC | #1
On Wed, Jul 7, 2021, at 17:05, Eli Britstein wrote:
> Port numbers are usually small. Maintain an array of netdev handles indexed
> by port numbers. It accelerates looking up for them for
> netdev_hw_miss_packet_recover().
> 
> Reported-by: Cian Ferriter <cian.ferriter@intel.com>
> Signed-off-by: Eli Britstein <elibr@nvidia.com>
> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
> ---
>  lib/dpif-netdev.c | 41 +++++++++++++++++++++++++++++++++++++----
>  1 file changed, 37 insertions(+), 4 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 2e654426e..accb23a1a 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -650,6 +650,9 @@ struct dp_netdev_pmd_thread_ctx {
>      uint32_t emc_insert_min;
>  };
>  
> +/* Size of netdev's cache. */
> +#define DP_PMD_NETDEV_CACHE_SIZE 1024
> +
>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
>   * the performance overhead of interrupt processing.  Therefore netdev can
>   * not implement rx-wait for these devices.  dpif-netdev needs to poll
> @@ -786,6 +789,7 @@ struct dp_netdev_pmd_thread {
>       * other instance will only be accessed by its own pmd thread. */
>      struct hmap tnl_port_cache;
>      struct hmap send_port_cache;
> +    struct netdev *send_netdev_cache[DP_PMD_NETDEV_CACHE_SIZE];
>  
>      /* Keep track of detailed PMD performance statistics. */
>      struct pmd_perf_stats perf_stats;
> @@ -5910,6 +5914,10 @@ pmd_free_cached_ports(struct 
> dp_netdev_pmd_thread *pmd)
>          free(tx_port_cached);
>      }
>      HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
> +        if (tx_port_cached->port->port_no <
> +            ARRAY_SIZE(pmd->send_netdev_cache)) {
> +            pmd->send_netdev_cache[tx_port_cached->port->port_no] = 
> NULL;
> +        }
>          free(tx_port_cached);
>      }
>  }
> @@ -5939,6 +5947,11 @@ pmd_load_cached_ports(struct 
> dp_netdev_pmd_thread *pmd)
>              tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
>              hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
>                          hash_port_no(tx_port_cached->port->port_no));
> +            if (tx_port_cached->port->port_no <
> +                ARRAY_SIZE(pmd->send_netdev_cache)) {
> +                pmd->send_netdev_cache[tx_port_cached->port->port_no] =
> +                    tx_port_cached->port->netdev;
> +            }
>          }
>      }
>  }
> @@ -6585,6 +6598,7 @@ dp_netdev_configure_pmd(struct 
> dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
>      hmap_init(&pmd->tx_ports);
>      hmap_init(&pmd->tnl_port_cache);
>      hmap_init(&pmd->send_port_cache);
> +    memset(pmd->send_netdev_cache, 0, sizeof pmd->send_netdev_cache);
>      cmap_init(&pmd->tx_bonds);
>      /* init the 'flow_cache' since there is no
>       * actual thread created for NON_PMD_CORE_ID. */
> @@ -6603,6 +6617,7 @@ dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread 
> *pmd)
>      struct dpcls *cls;
>  
>      dp_netdev_pmd_flow_flush(pmd);
> +    memset(pmd->send_netdev_cache, 0, sizeof pmd->send_netdev_cache);
>      hmap_destroy(&pmd->send_port_cache);
>      hmap_destroy(&pmd->tnl_port_cache);
>      hmap_destroy(&pmd->tx_ports);
> @@ -7090,20 +7105,38 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
>  static struct tx_port * pmd_send_port_cache_lookup(
>      const struct dp_netdev_pmd_thread *pmd, odp_port_t port_no);
>  
> +OVS_UNUSED
> +static inline struct netdev *
> +pmd_netdev_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
> +                        odp_port_t port_no)
> +{
> +    struct tx_port *p;
> +
> +    if (port_no < ARRAY_SIZE(pmd->send_netdev_cache)) {
> +        return pmd->send_netdev_cache[port_no];
> +    }
> +
> +    p = pmd_send_port_cache_lookup(pmd, port_no);
> +    if (p) {
> +        return p->port->netdev;
> +    }
> +    return NULL;
> +}
> +
>  static inline int
>  dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
>                    odp_port_t port_no OVS_UNUSED,
>                    struct dp_packet *packet,
>                    struct dp_netdev_flow **flow)
>  {
> -    struct tx_port *p OVS_UNUSED;
> +    struct netdev *netdev OVS_UNUSED;
>      uint32_t mark;
>  
>  #ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
>      /* Restore the packet if HW processing was terminated before completion. */
> -    p = pmd_send_port_cache_lookup(pmd, port_no);
> -    if (OVS_LIKELY(p)) {
> -        int err = netdev_hw_miss_packet_recover(p->port->netdev, packet);
> +    netdev = pmd_netdev_cache_lookup(pmd, port_no);
> +    if (OVS_LIKELY(netdev)) {
> +        int err = netdev_hw_miss_packet_recover(netdev, packet);
>  
>          if (err && err != EOPNOTSUPP) {
>             COVERAGE_INC(datapath_drop_hw_miss_recover);
> -- 
> 2.28.0.2311.g225365fb51
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> 

Hello,

I tested the performance impact of this patch with a partial offload setup.
As reported by pmd-stats-show, in average cycles per packet:

Before vxlan-decap: 525 c/p
After vxlan-decap: 542 c/p
After this fix: 530 c/p

Without those fixes, vxlan-decap has a 3.2% negative impact on cycles,
with the fixes, the impact is reduced to 0.95%.

As I had to force partial offloads for our hardware, it would be better
with an outside confirmation on a proper setup.

Kind regards,
Eli Britstein July 7, 2021, 5:11 p.m. UTC | #2
On 7/7/2021 7:35 PM, Gaëtan Rivet wrote:
> External email: Use caution opening links or attachments
>
>
> On Wed, Jul 7, 2021, at 17:05, Eli Britstein wrote:
>> Port numbers are usually small. Maintain an array of netdev handles indexed
>> by port numbers. It accelerates looking up for them for
>> netdev_hw_miss_packet_recover().
>>
>> Reported-by: Cian Ferriter <cian.ferriter@intel.com>
>> Signed-off-by: Eli Britstein <elibr@nvidia.com>
>> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
>> ---
>>   lib/dpif-netdev.c | 41 +++++++++++++++++++++++++++++++++++++----
>>   1 file changed, 37 insertions(+), 4 deletions(-)
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index 2e654426e..accb23a1a 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -650,6 +650,9 @@ struct dp_netdev_pmd_thread_ctx {
>>       uint32_t emc_insert_min;
>>   };
>>
>> +/* Size of netdev's cache. */
>> +#define DP_PMD_NETDEV_CACHE_SIZE 1024
>> +
>>   /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
>>    * the performance overhead of interrupt processing.  Therefore netdev can
>>    * not implement rx-wait for these devices.  dpif-netdev needs to poll
>> @@ -786,6 +789,7 @@ struct dp_netdev_pmd_thread {
>>        * other instance will only be accessed by its own pmd thread. */
>>       struct hmap tnl_port_cache;
>>       struct hmap send_port_cache;
>> +    struct netdev *send_netdev_cache[DP_PMD_NETDEV_CACHE_SIZE];
>>
>>       /* Keep track of detailed PMD performance statistics. */
>>       struct pmd_perf_stats perf_stats;
>> @@ -5910,6 +5914,10 @@ pmd_free_cached_ports(struct
>> dp_netdev_pmd_thread *pmd)
>>           free(tx_port_cached);
>>       }
>>       HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
>> +        if (tx_port_cached->port->port_no <
It has some issues in github actions. I'll fix and post v2.
>> +            ARRAY_SIZE(pmd->send_netdev_cache)) {
>> +            pmd->send_netdev_cache[tx_port_cached->port->port_no] =
>> NULL;
>> +        }
>>           free(tx_port_cached);
>>       }
>>   }
>> @@ -5939,6 +5947,11 @@ pmd_load_cached_ports(struct
>> dp_netdev_pmd_thread *pmd)
>>               tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
>>               hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
>>                           hash_port_no(tx_port_cached->port->port_no));
>> +            if (tx_port_cached->port->port_no <
>> +                ARRAY_SIZE(pmd->send_netdev_cache)) {
>> +                pmd->send_netdev_cache[tx_port_cached->port->port_no] =
>> +                    tx_port_cached->port->netdev;
>> +            }
>>           }
>>       }
>>   }
>> @@ -6585,6 +6598,7 @@ dp_netdev_configure_pmd(struct
>> dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
>>       hmap_init(&pmd->tx_ports);
>>       hmap_init(&pmd->tnl_port_cache);
>>       hmap_init(&pmd->send_port_cache);
>> +    memset(pmd->send_netdev_cache, 0, sizeof pmd->send_netdev_cache);
>>       cmap_init(&pmd->tx_bonds);
>>       /* init the 'flow_cache' since there is no
>>        * actual thread created for NON_PMD_CORE_ID. */
>> @@ -6603,6 +6617,7 @@ dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread
>> *pmd)
>>       struct dpcls *cls;
>>
>>       dp_netdev_pmd_flow_flush(pmd);
>> +    memset(pmd->send_netdev_cache, 0, sizeof pmd->send_netdev_cache);
>>       hmap_destroy(&pmd->send_port_cache);
>>       hmap_destroy(&pmd->tnl_port_cache);
>>       hmap_destroy(&pmd->tx_ports);
>> @@ -7090,20 +7105,38 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
>>   static struct tx_port * pmd_send_port_cache_lookup(
>>       const struct dp_netdev_pmd_thread *pmd, odp_port_t port_no);
>>
>> +OVS_UNUSED
>> +static inline struct netdev *
>> +pmd_netdev_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
>> +                        odp_port_t port_no)
>> +{
>> +    struct tx_port *p;
>> +
>> +    if (port_no < ARRAY_SIZE(pmd->send_netdev_cache)) {
>> +        return pmd->send_netdev_cache[port_no];
>> +    }
>> +
>> +    p = pmd_send_port_cache_lookup(pmd, port_no);
>> +    if (p) {
>> +        return p->port->netdev;
>> +    }
>> +    return NULL;
>> +}
>> +
>>   static inline int
>>   dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
>>                     odp_port_t port_no OVS_UNUSED,
>>                     struct dp_packet *packet,
>>                     struct dp_netdev_flow **flow)
>>   {
>> -    struct tx_port *p OVS_UNUSED;
>> +    struct netdev *netdev OVS_UNUSED;
>>       uint32_t mark;
>>
>>   #ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
>>       /* Restore the packet if HW processing was terminated before completion. */
>> -    p = pmd_send_port_cache_lookup(pmd, port_no);
>> -    if (OVS_LIKELY(p)) {
>> -        int err = netdev_hw_miss_packet_recover(p->port->netdev, packet);
>> +    netdev = pmd_netdev_cache_lookup(pmd, port_no);
>> +    if (OVS_LIKELY(netdev)) {
>> +        int err = netdev_hw_miss_packet_recover(netdev, packet);
>>
>>           if (err && err != EOPNOTSUPP) {
>>             COVERAGE_INC(datapath_drop_hw_miss_recover);
>> --
>> 2.28.0.2311.g225365fb51
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
> Hello,
>
> I tested the performance impact of this patch with a partial offload setup.
> As reported by pmd-stats-show, in average cycles per packet:
>
> Before vxlan-decap: 525 c/p
> After vxlan-decap: 542 c/p
> After this fix: 530 c/p
>
> Without those fixes, vxlan-decap has a 3.2% negative impact on cycles,
> with the fixes, the impact is reduced to 0.95%.
>
> As I had to force partial offloads for our hardware, it would be better
> with an outside confirmation on a proper setup.
>
> Kind regards,
> --
> Gaetan Rivet
Ferriter, Cian July 8, 2021, 4:43 p.m. UTC | #3
Hi Gaetan, Eli and all,

Thanks for the patch and the info on how it affects performance in your case. I just wanted to post the performance we are seeing.

I've posted the numbers inline. Please note, I'll be away on leave till Tuesday.
Thanks,
Cian

> -----Original Message-----
> From: Gaëtan Rivet <grive@u256.net>
> Sent: Wednesday 7 July 2021 17:36
> To: Eli Britstein <elibr@nvidia.com>; <dev@openvswitch.org> <dev@openvswitch.org>; Van Haaren, Harry
> <harry.van.haaren@intel.com>; Ferriter, Cian <cian.ferriter@intel.com>
> Cc: Majd Dibbiny <majd@nvidia.com>; Ilya Maximets <i.maximets@ovn.org>
> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
> 
> On Wed, Jul 7, 2021, at 17:05, Eli Britstein wrote:
> > Port numbers are usually small. Maintain an array of netdev handles indexed
> > by port numbers. It accelerates looking up for them for
> > netdev_hw_miss_packet_recover().
> >
> > Reported-by: Cian Ferriter <cian.ferriter@intel.com>
> > Signed-off-by: Eli Britstein <elibr@nvidia.com>
> > Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
> > ---

<snipped patch contents>

> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >
> 
> Hello,
> 
> I tested the performance impact of this patch with a partial offload setup.
> As reported by pmd-stats-show, in average cycles per packet:
> 
> Before vxlan-decap: 525 c/p
> After vxlan-decap: 542 c/p
> After this fix: 530 c/p
> 
> Without those fixes, vxlan-decap has a 3.2% negative impact on cycles,
> with the fixes, the impact is reduced to 0.95%.
> 
> As I had to force partial offloads for our hardware, it would be better
> with an outside confirmation on a proper setup.
> 
> Kind regards,
> --
> Gaetan Rivet

I'm showing the performance relative to what we measured on OVS master directly before the VXLAN HWOL changes went in. All of the below results are using the scalar DPIF and partial HWOL.

Link to "Fixup patches": http://patchwork.ozlabs.org/project/openvswitch/list/?series=252356

Master before VXLAN HWOL changes (f0e4a73)
1.000x

Latest master after VXLAN HWOL changes (b780911)
0.918x (-8.2%)

After fixup patches on OVS ML are applied (with ALLOW_EXPERIMENTAL_API=off)
0.973x (-2.7%)

After fixup patches on OVS ML are applied and after ALLOW_EXPERIMENTAL_API is removed.
0.938x (-6.2%)

I ran the last set of results by applying the below diff. I did this because I'm assuming the plan is to remove the ALLOW_EXPERIMENTAL_API '#ifdef's at some point?
Diff:
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index accb23a1a..0e29c609f 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -7132,7 +7132,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
     struct netdev *netdev OVS_UNUSED;
     uint32_t mark;

-#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
     /* Restore the packet if HW processing was terminated before completion. */
     netdev = pmd_netdev_cache_lookup(pmd, port_no);
     if (OVS_LIKELY(netdev)) {
@@ -7143,7 +7142,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
             return -1;
         }
     }
-#endif

     /* If no mark, no flow to find. */
     if (!dp_packet_has_flow_mark(packet, &mark)) {
Ilya Maximets July 9, 2021, 8:52 p.m. UTC | #4
On 7/8/21 6:43 PM, Ferriter, Cian wrote:
> Hi Gaetan, Eli and all,
> 
> Thanks for the patch and the info on how it affects performance in your case. I just wanted to post the performance we are seeing.
> 
> I've posted the numbers inline. Please note, I'll be away on leave till Tuesday.
> Thanks,
> Cian
> 
>> -----Original Message-----
>> From: Gaëtan Rivet <grive@u256.net>
>> Sent: Wednesday 7 July 2021 17:36
>> To: Eli Britstein <elibr@nvidia.com>; <dev@openvswitch.org> <dev@openvswitch.org>; Van Haaren, Harry
>> <harry.van.haaren@intel.com>; Ferriter, Cian <cian.ferriter@intel.com>
>> Cc: Majd Dibbiny <majd@nvidia.com>; Ilya Maximets <i.maximets@ovn.org>
>> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
>>
>> On Wed, Jul 7, 2021, at 17:05, Eli Britstein wrote:
>>> Port numbers are usually small. Maintain an array of netdev handles indexed
>>> by port numbers. It accelerates looking up for them for
>>> netdev_hw_miss_packet_recover().
>>>
>>> Reported-by: Cian Ferriter <cian.ferriter@intel.com>
>>> Signed-off-by: Eli Britstein <elibr@nvidia.com>
>>> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
>>> ---
> 
> <snipped patch contents>
> 
>>> _______________________________________________
>>> dev mailing list
>>> dev@openvswitch.org
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>
>>
>> Hello,
>>
>> I tested the performance impact of this patch with a partial offload setup.
>> As reported by pmd-stats-show, in average cycles per packet:
>>
>> Before vxlan-decap: 525 c/p
>> After vxlan-decap: 542 c/p
>> After this fix: 530 c/p
>>
>> Without those fixes, vxlan-decap has a 3.2% negative impact on cycles,
>> with the fixes, the impact is reduced to 0.95%.
>>
>> As I had to force partial offloads for our hardware, it would be better
>> with an outside confirmation on a proper setup.
>>
>> Kind regards,
>> --
>> Gaetan Rivet
> 
> I'm showing the performance relative to what we measured on OVS master directly before the VXLAN HWOL changes went in. All of the below results are using the scalar DPIF and partial HWOL.
> 
> Link to "Fixup patches": http://patchwork.ozlabs.org/project/openvswitch/list/?series=252356
> 
> Master before VXLAN HWOL changes (f0e4a73)
> 1.000x
> 
> Latest master after VXLAN HWOL changes (b780911)
> 0.918x (-8.2%)
> 
> After fixup patches on OVS ML are applied (with ALLOW_EXPERIMENTAL_API=off)
> 0.973x (-2.7%)
> 
> After fixup patches on OVS ML are applied and after ALLOW_EXPERIMENTAL_API is removed.
> 0.938x (-6.2%)
> 
> I ran the last set of results by applying the below diff. I did this because I'm assuming the plan is to remove the ALLOW_EXPERIMENTAL_API '#ifdef's at some point?

Yes, that is the plan.

And thanks for testing, Gaetan and Cian!

Could you also provide more details on your test environment,
so someone else can reproduce?

What is important to know:
- Test configuration: P2P, V2V, PVP, etc.
- Test type: max. throughput, zero packet loss.
- OVS config: EMC, SMC, HWOL, AVX512 - on/off/type
- Installed OF rules.
- Traffic pattern: Packet size, number of flows, packet type.

This tests also didn't include the fix from Balazs, IIUC, because
they were performed a bit before that patch got accepted.

And Flavio reported what seems to be noticeable performance
drop due to just accepted AVX512 DPIF implementation for the
non-HWOL non-AVX512 setup:
  https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385448.html

So, it seems that everything will need to be re-tested anyway
in order to understand what is the current situation.

Best regards, Ilya Maximets.

> Diff:
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index accb23a1a..0e29c609f 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -7132,7 +7132,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
>      struct netdev *netdev OVS_UNUSED;
>      uint32_t mark;
> 
> -#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
>      /* Restore the packet if HW processing was terminated before completion. */
>      netdev = pmd_netdev_cache_lookup(pmd, port_no);
>      if (OVS_LIKELY(netdev)) {
> @@ -7143,7 +7142,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
>              return -1;
>          }
>      }
> -#endif
> 
>      /* If no mark, no flow to find. */
>      if (!dp_packet_has_flow_mark(packet, &mark)) {
>
Ferriter, Cian July 14, 2021, 2:58 p.m. UTC | #5
> -----Original Message-----
> From: Ilya Maximets <i.maximets@ovn.org>
> Sent: Friday 9 July 2021 21:53
> To: Ferriter, Cian <cian.ferriter@intel.com>; Gaëtan Rivet <grive@u256.net>; Eli Britstein
> <elibr@nvidia.com>; dev@openvswitch.org; Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: Majd Dibbiny <majd@nvidia.com>; Ilya Maximets <i.maximets@ovn.org>; Stokes, Ian
> <ian.stokes@intel.com>; Flavio Leitner <fbl@sysclose.org>
> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
> 
> On 7/8/21 6:43 PM, Ferriter, Cian wrote:
> > Hi Gaetan, Eli and all,
> >
> > Thanks for the patch and the info on how it affects performance in your case. I just wanted to post
> the performance we are seeing.
> >
> > I've posted the numbers inline. Please note, I'll be away on leave till Tuesday.
> > Thanks,
> > Cian
> >
> >> -----Original Message-----
> >> From: Gaëtan Rivet <grive@u256.net>
> >> Sent: Wednesday 7 July 2021 17:36
> >> To: Eli Britstein <elibr@nvidia.com>; <dev@openvswitch.org> <dev@openvswitch.org>; Van Haaren,
> Harry
> >> <harry.van.haaren@intel.com>; Ferriter, Cian <cian.ferriter@intel.com>
> >> Cc: Majd Dibbiny <majd@nvidia.com>; Ilya Maximets <i.maximets@ovn.org>
> >> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
> >>
> >> On Wed, Jul 7, 2021, at 17:05, Eli Britstein wrote:
> >>> Port numbers are usually small. Maintain an array of netdev handles indexed
> >>> by port numbers. It accelerates looking up for them for
> >>> netdev_hw_miss_packet_recover().
> >>>
> >>> Reported-by: Cian Ferriter <cian.ferriter@intel.com>
> >>> Signed-off-by: Eli Britstein <elibr@nvidia.com>
> >>> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
> >>> ---
> >
> > <snipped patch contents>
> >
> >>> _______________________________________________
> >>> dev mailing list
> >>> dev@openvswitch.org
> >>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >>>
> >>
> >> Hello,
> >>
> >> I tested the performance impact of this patch with a partial offload setup.
> >> As reported by pmd-stats-show, in average cycles per packet:
> >>
> >> Before vxlan-decap: 525 c/p
> >> After vxlan-decap: 542 c/p
> >> After this fix: 530 c/p
> >>
> >> Without those fixes, vxlan-decap has a 3.2% negative impact on cycles,
> >> with the fixes, the impact is reduced to 0.95%.
> >>
> >> As I had to force partial offloads for our hardware, it would be better
> >> with an outside confirmation on a proper setup.
> >>
> >> Kind regards,
> >> --
> >> Gaetan Rivet
> >
> > I'm showing the performance relative to what we measured on OVS master directly before the VXLAN
> HWOL changes went in. All of the below results are using the scalar DPIF and partial HWOL.
> >
> > Link to "Fixup patches": http://patchwork.ozlabs.org/project/openvswitch/list/?series=252356
> >
> > Master before VXLAN HWOL changes (f0e4a73)
> > 1.000x
> >
> > Latest master after VXLAN HWOL changes (b780911)
> > 0.918x (-8.2%)
> >
> > After fixup patches on OVS ML are applied (with ALLOW_EXPERIMENTAL_API=off)
> > 0.973x (-2.7%)
> >
> > After fixup patches on OVS ML are applied and after ALLOW_EXPERIMENTAL_API is removed.
> > 0.938x (-6.2%)
> >
> > I ran the last set of results by applying the below diff. I did this because I'm assuming the plan
> is to remove the ALLOW_EXPERIMENTAL_API '#ifdef's at some point?
> 
> Yes, that is the plan.
> 

Thanks for confirming this.

> And thanks for testing, Gaetan and Cian!
> 
> Could you also provide more details on your test environment,
> so someone else can reproduce?
> 

Good idea, I'll add the details inline below. These details apply to the performance measured previously by me, and the performance in this mail.

> What is important to know:
> - Test configuration: P2P, V2V, PVP, etc.


P2P
1 PHY port
1 RXQ

> - Test type: max. throughput, zero packet loss.

Max throughput.

> - OVS config: EMC, SMC, HWOL, AVX512 - on/off/type

In all tests, all packets hit a single datapath flow with "offloaded:partial". So all packets are partially offloaded, skipping miniflow_extract() and EMC/SMC/DPCLS lookups.

AVX512 is off.

> - Installed OF rules.

$ $OVS_DIR/utilities/ovs-ofctl dump-flows br0
 cookie=0x0, duration=253.691s, table=0, n_packets=2993867136, n_bytes=179632028160, in_port=phy0 actions=IN_PORT

> - Traffic pattern: Packet size, number of flows, packet type.

64B, 1 flow, ETH/IP packets.

> 
> This tests also didn't include the fix from Balazs, IIUC, because
> they were performed a bit before that patch got accepted.
> 

Correct, the above tests didn't include the optimization from Balazs.

> And Flavio reported what seems to be noticeable performance
> drop due to just accepted AVX512 DPIF implementation for the
> non-HWOL non-AVX512 setup:
>   https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385448.html
> 

We are testing partial HWOL setups, so I don't think Flavio's mail is relevant to this. 

> So, it seems that everything will need to be re-tested anyway
> in order to understand what is the current situation.
> 

Agreed, let's retest the performance. I've included the new numbers below:

I'm showing the performance relative to what we measured on OVS master directly before the VXLAN HWOL changes went in. All of the below results are using the scalar DPIF and partial HWOL.
Link to "Fixup patches" (v2): http://patchwork.ozlabs.org/project/openvswitch/list/?series=253488

Master before VXLAN HWOL changes. (f0e4a73)
1.000x

Master after VXLAN HWOL changes. (d53ea18)
0.964x (-3.6%)

After rebased fixup patches on OVS ML are applied. (with ALLOW_EXPERIMENTAL_API=off)
0.993x (-0.7%)

After rebased fixup patches on OVS ML are applied and after ALLOW_EXPERIMENTAL_API is removed.
0.961x (-3.9%)

So the performance is looking better now because of the optimization from Balazs. This together with #ifdefing out the code brings the performance almost to where it was before.

I'm worried that the #ifdef is a temporary solution for the partial HWOL case, since eventually that will be removed. This leaves us with a -3.9% degradation.

Thanks,
Cian

> Best regards, Ilya Maximets.
> 
> > Diff:
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index accb23a1a..0e29c609f 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -7132,7 +7132,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
> >      struct netdev *netdev OVS_UNUSED;
> >      uint32_t mark;
> >
> > -#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
> >      /* Restore the packet if HW processing was terminated before completion. */
> >      netdev = pmd_netdev_cache_lookup(pmd, port_no);
> >      if (OVS_LIKELY(netdev)) {
> > @@ -7143,7 +7142,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
> >              return -1;
> >          }
> >      }
> > -#endif
> >
> >      /* If no mark, no flow to find. */
> >      if (!dp_packet_has_flow_mark(packet, &mark)) {
> >
Eli Britstein July 14, 2021, 3:20 p.m. UTC | #6
On 7/14/2021 5:58 PM, Ferriter, Cian wrote:
> External email: Use caution opening links or attachments
>
>
>> -----Original Message-----
>> From: Ilya Maximets <i.maximets@ovn.org>
>> Sent: Friday 9 July 2021 21:53
>> To: Ferriter, Cian <cian.ferriter@intel.com>; Gaëtan Rivet <grive@u256.net>; Eli Britstein
>> <elibr@nvidia.com>; dev@openvswitch.org; Van Haaren, Harry <harry.van.haaren@intel.com>
>> Cc: Majd Dibbiny <majd@nvidia.com>; Ilya Maximets <i.maximets@ovn.org>; Stokes, Ian
>> <ian.stokes@intel.com>; Flavio Leitner <fbl@sysclose.org>
>> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
>>
>> On 7/8/21 6:43 PM, Ferriter, Cian wrote:
>>> Hi Gaetan, Eli and all,
>>>
>>> Thanks for the patch and the info on how it affects performance in your case. I just wanted to post
>> the performance we are seeing.
>>> I've posted the numbers inline. Please note, I'll be away on leave till Tuesday.
>>> Thanks,
>>> Cian
>>>
>>>> -----Original Message-----
>>>> From: Gaëtan Rivet <grive@u256.net>
>>>> Sent: Wednesday 7 July 2021 17:36
>>>> To: Eli Britstein <elibr@nvidia.com>; <dev@openvswitch.org> <dev@openvswitch.org>; Van Haaren,
>> Harry
>>>> <harry.van.haaren@intel.com>; Ferriter, Cian <cian.ferriter@intel.com>
>>>> Cc: Majd Dibbiny <majd@nvidia.com>; Ilya Maximets <i.maximets@ovn.org>
>>>> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
>>>>
>>>> On Wed, Jul 7, 2021, at 17:05, Eli Britstein wrote:
>>>>> Port numbers are usually small. Maintain an array of netdev handles indexed
>>>>> by port numbers. It accelerates looking up for them for
>>>>> netdev_hw_miss_packet_recover().
>>>>>
>>>>> Reported-by: Cian Ferriter <cian.ferriter@intel.com>
>>>>> Signed-off-by: Eli Britstein <elibr@nvidia.com>
>>>>> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
>>>>> ---
>>> <snipped patch contents>
>>>
>>>>> _______________________________________________
>>>>> dev mailing list
>>>>> dev@openvswitch.org
>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmail.openvswitch.org%2Fmailman%2Flistinfo%2Fovs-dev&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946d7cf2f%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=rv%2FdenANxrcTGxBBbRvhhlNioyswL7ieFr8AGcGtCs8%3D&amp;reserved=0
>>>>>
>>>> Hello,
>>>>
>>>> I tested the performance impact of this patch with a partial offload setup.
>>>> As reported by pmd-stats-show, in average cycles per packet:
>>>>
>>>> Before vxlan-decap: 525 c/p
>>>> After vxlan-decap: 542 c/p
>>>> After this fix: 530 c/p
>>>>
>>>> Without those fixes, vxlan-decap has a 3.2% negative impact on cycles,
>>>> with the fixes, the impact is reduced to 0.95%.
>>>>
>>>> As I had to force partial offloads for our hardware, it would be better
>>>> with an outside confirmation on a proper setup.
>>>>
>>>> Kind regards,
>>>> --
>>>> Gaetan Rivet
>>> I'm showing the performance relative to what we measured on OVS master directly before the VXLAN
>> HWOL changes went in. All of the below results are using the scalar DPIF and partial HWOL.
>>> Link to "Fixup patches": https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatchwork.ozlabs.org%2Fproject%2Fopenvswitch%2Flist%2F%3Fseries%3D252356&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946d7cf2f%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=Y62OrCRyS00vJHHPQvAHyhG5C4eO%2FSfWMCSPtszn3Is%3D&amp;reserved=0
>>>
>>> Master before VXLAN HWOL changes (f0e4a73)
>>> 1.000x
>>>
>>> Latest master after VXLAN HWOL changes (b780911)
>>> 0.918x (-8.2%)
>>>
>>> After fixup patches on OVS ML are applied (with ALLOW_EXPERIMENTAL_API=off)
>>> 0.973x (-2.7%)
>>>
>>> After fixup patches on OVS ML are applied and after ALLOW_EXPERIMENTAL_API is removed.
>>> 0.938x (-6.2%)
>>>
>>> I ran the last set of results by applying the below diff. I did this because I'm assuming the plan
>> is to remove the ALLOW_EXPERIMENTAL_API '#ifdef's at some point?
>>
>> Yes, that is the plan.
>>
> Thanks for confirming this.
>
>> And thanks for testing, Gaetan and Cian!
>>
>> Could you also provide more details on your test environment,
>> so someone else can reproduce?
>>
> Good idea, I'll add the details inline below. These details apply to the performance measured previously by me, and the performance in this mail.
>
>> What is important to know:
>> - Test configuration: P2P, V2V, PVP, etc.
>
> P2P
> 1 PHY port
> 1 RXQ
>
>> - Test type: max. throughput, zero packet loss.
> Max throughput.
>
>> - OVS config: EMC, SMC, HWOL, AVX512 - on/off/type
> In all tests, all packets hit a single datapath flow with "offloaded:partial". So all packets are partially offloaded, skipping miniflow_extract() and EMC/SMC/DPCLS lookups.
>
> AVX512 is off.
>
>> - Installed OF rules.
> $ $OVS_DIR/utilities/ovs-ofctl dump-flows br0
>   cookie=0x0, duration=253.691s, table=0, n_packets=2993867136, n_bytes=179632028160, in_port=phy0 actions=IN_PORT
>
>> - Traffic pattern: Packet size, number of flows, packet type.
> 64B, 1 flow, ETH/IP packets.
>
>> This tests also didn't include the fix from Balazs, IIUC, because
>> they were performed a bit before that patch got accepted.
>>
> Correct, the above tests didn't include the optimization from Balazs.
>
>> And Flavio reported what seems to be noticeable performance
>> drop due to just accepted AVX512 DPIF implementation for the
>> non-HWOL non-AVX512 setup:
>>    https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmail.openvswitch.org%2Fpipermail%2Fovs-dev%2F2021-July%2F385448.html&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946d7cf2f%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=Co%2FugazVORXH43uWlB2UUAoP%2BGdHpdwisKs10BPFW7c%3D&amp;reserved=0
>>
> We are testing partial HWOL setups, so I don't think Flavio's mail is relevant to this.
>
>> So, it seems that everything will need to be re-tested anyway
>> in order to understand what is the current situation.
>>
> Agreed, let's retest the performance. I've included the new numbers below:
>
> I'm showing the performance relative to what we measured on OVS master directly before the VXLAN HWOL changes went in. All of the below results are using the scalar DPIF and partial HWOL.
> Link to "Fixup patches" (v2): https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatchwork.ozlabs.org%2Fproject%2Fopenvswitch%2Flist%2F%3Fseries%3D253488&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946d7cf2f%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=ejj3KdwvnZwYfiYM%2BEcpL5tnb3JIbtdhMmaMMhxJkSo%3D&amp;reserved=0
>
> Master before VXLAN HWOL changes. (f0e4a73)
> 1.000x
>
> Master after VXLAN HWOL changes. (d53ea18)
> 0.964x (-3.6%)
>
> After rebased fixup patches on OVS ML are applied. (with ALLOW_EXPERIMENTAL_API=off)
> 0.993x (-0.7%)
>
> After rebased fixup patches on OVS ML are applied and after ALLOW_EXPERIMENTAL_API is removed.
> 0.961x (-3.9%)

According to this, we are better off without the array cache commit, at 
least in this test (-3.6% vs -3.9%). Isn't that so?

In our internal tests we saw it actually improves, though not gaining 
back all the degradation.

What do you think?

>
> So the performance is looking better now because of the optimization from Balazs. This together with #ifdefing out the code brings the performance almost to where it was before.
>
> I'm worried that the #ifdef is a temporary solution for the partial HWOL case, since eventually that will be removed. This leaves us with a -3.9% degradation.
>
> Thanks,
> Cian
>
>> Best regards, Ilya Maximets.
>>
>>> Diff:
>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>>> index accb23a1a..0e29c609f 100644
>>> --- a/lib/dpif-netdev.c
>>> +++ b/lib/dpif-netdev.c
>>> @@ -7132,7 +7132,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
>>>       struct netdev *netdev OVS_UNUSED;
>>>       uint32_t mark;
>>>
>>> -#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
>>>       /* Restore the packet if HW processing was terminated before completion. */
>>>       netdev = pmd_netdev_cache_lookup(pmd, port_no);
>>>       if (OVS_LIKELY(netdev)) {
>>> @@ -7143,7 +7142,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
>>>               return -1;
>>>           }
>>>       }
>>> -#endif
>>>
>>>       /* If no mark, no flow to find. */
>>>       if (!dp_packet_has_flow_mark(packet, &mark)) {
>>>
Ferriter, Cian July 15, 2021, 1:35 p.m. UTC | #7
> -----Original Message-----
> From: Eli Britstein <elibr@nvidia.com>
> Sent: Wednesday 14 July 2021 16:21
> To: Ferriter, Cian <cian.ferriter@intel.com>; Ilya Maximets <i.maximets@ovn.org>; Gaëtan Rivet
> <grive@u256.net>; dev@openvswitch.org; Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: Majd Dibbiny <majd@nvidia.com>; Stokes, Ian <ian.stokes@intel.com>; Flavio Leitner
> <fbl@sysclose.org>
> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
> 
> 
> On 7/14/2021 5:58 PM, Ferriter, Cian wrote:
> > External email: Use caution opening links or attachments
> >
> >
> >> -----Original Message-----
> >> From: Ilya Maximets <i.maximets@ovn.org>
> >> Sent: Friday 9 July 2021 21:53
> >> To: Ferriter, Cian <cian.ferriter@intel.com>; Gaëtan Rivet <grive@u256.net>; Eli Britstein
> >> <elibr@nvidia.com>; dev@openvswitch.org; Van Haaren, Harry <harry.van.haaren@intel.com>
> >> Cc: Majd Dibbiny <majd@nvidia.com>; Ilya Maximets <i.maximets@ovn.org>; Stokes, Ian
> >> <ian.stokes@intel.com>; Flavio Leitner <fbl@sysclose.org>
> >> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
> >>
> >> On 7/8/21 6:43 PM, Ferriter, Cian wrote:
> >>> Hi Gaetan, Eli and all,
> >>>
> >>> Thanks for the patch and the info on how it affects performance in your case. I just wanted to
> post
> >> the performance we are seeing.
> >>> I've posted the numbers inline. Please note, I'll be away on leave till Tuesday.
> >>> Thanks,
> >>> Cian
> >>>
> >>>> -----Original Message-----
> >>>> From: Gaëtan Rivet <grive@u256.net>
> >>>> Sent: Wednesday 7 July 2021 17:36
> >>>> To: Eli Britstein <elibr@nvidia.com>; <dev@openvswitch.org> <dev@openvswitch.org>; Van Haaren,
> >> Harry
> >>>> <harry.van.haaren@intel.com>; Ferriter, Cian <cian.ferriter@intel.com>
> >>>> Cc: Majd Dibbiny <majd@nvidia.com>; Ilya Maximets <i.maximets@ovn.org>
> >>>> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
> >>>>
> >>>> On Wed, Jul 7, 2021, at 17:05, Eli Britstein wrote:
> >>>>> Port numbers are usually small. Maintain an array of netdev handles indexed
> >>>>> by port numbers. It accelerates looking up for them for
> >>>>> netdev_hw_miss_packet_recover().
> >>>>>
> >>>>> Reported-by: Cian Ferriter <cian.ferriter@intel.com>
> >>>>> Signed-off-by: Eli Britstein <elibr@nvidia.com>
> >>>>> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
> >>>>> ---
> >>> <snipped patch contents>
> >>>
> >>>>> _______________________________________________
> >>>>> dev mailing list
> >>>>> dev@openvswitch.org
> >>>>>
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmail.openvswitch.org%2Fmailman%2Flis
> tinfo%2Fovs-
> dev&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946d7cf2f%7C43083d15727340c1b7db39e
> fd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJ
> BTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=rv%2FdenANxrcTGxBBbRvhhlNioyswL7ieFr8AGcGtCs8%3D&amp;rese
> rved=0
> >>>>>
> >>>> Hello,
> >>>>
> >>>> I tested the performance impact of this patch with a partial offload setup.
> >>>> As reported by pmd-stats-show, in average cycles per packet:
> >>>>
> >>>> Before vxlan-decap: 525 c/p
> >>>> After vxlan-decap: 542 c/p
> >>>> After this fix: 530 c/p
> >>>>
> >>>> Without those fixes, vxlan-decap has a 3.2% negative impact on cycles,
> >>>> with the fixes, the impact is reduced to 0.95%.
> >>>>
> >>>> As I had to force partial offloads for our hardware, it would be better
> >>>> with an outside confirmation on a proper setup.
> >>>>
> >>>> Kind regards,
> >>>> --
> >>>> Gaetan Rivet
> >>> I'm showing the performance relative to what we measured on OVS master directly before the VXLAN
> >> HWOL changes went in. All of the below results are using the scalar DPIF and partial HWOL.
> >>> Link to "Fixup patches":
> https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatchwork.ozlabs.org%2Fproject%2Fopen
> vswitch%2Flist%2F%3Fseries%3D252356&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946
> d7cf2f%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjo
> iMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=Y62OrCRyS00vJHHPQvAHyhG5C
> 4eO%2FSfWMCSPtszn3Is%3D&amp;reserved=0
> >>>
> >>> Master before VXLAN HWOL changes (f0e4a73)
> >>> 1.000x
> >>>
> >>> Latest master after VXLAN HWOL changes (b780911)
> >>> 0.918x (-8.2%)
> >>>
> >>> After fixup patches on OVS ML are applied (with ALLOW_EXPERIMENTAL_API=off)
> >>> 0.973x (-2.7%)
> >>>
> >>> After fixup patches on OVS ML are applied and after ALLOW_EXPERIMENTAL_API is removed.
> >>> 0.938x (-6.2%)
> >>>
> >>> I ran the last set of results by applying the below diff. I did this because I'm assuming the plan
> >> is to remove the ALLOW_EXPERIMENTAL_API '#ifdef's at some point?
> >>
> >> Yes, that is the plan.
> >>
> > Thanks for confirming this.
> >
> >> And thanks for testing, Gaetan and Cian!
> >>
> >> Could you also provide more details on your test environment,
> >> so someone else can reproduce?
> >>
> > Good idea, I'll add the details inline below. These details apply to the performance measured
> previously by me, and the performance in this mail.
> >
> >> What is important to know:
> >> - Test configuration: P2P, V2V, PVP, etc.
> >
> > P2P
> > 1 PHY port
> > 1 RXQ
> >
> >> - Test type: max. throughput, zero packet loss.
> > Max throughput.
> >
> >> - OVS config: EMC, SMC, HWOL, AVX512 - on/off/type
> > In all tests, all packets hit a single datapath flow with "offloaded:partial". So all packets are
> partially offloaded, skipping miniflow_extract() and EMC/SMC/DPCLS lookups.
> >
> > AVX512 is off.
> >
> >> - Installed OF rules.
> > $ $OVS_DIR/utilities/ovs-ofctl dump-flows br0
> >   cookie=0x0, duration=253.691s, table=0, n_packets=2993867136, n_bytes=179632028160, in_port=phy0
> actions=IN_PORT
> >
> >> - Traffic pattern: Packet size, number of flows, packet type.
> > 64B, 1 flow, ETH/IP packets.
> >
> >> This tests also didn't include the fix from Balazs, IIUC, because
> >> they were performed a bit before that patch got accepted.
> >>
> > Correct, the above tests didn't include the optimization from Balazs.
> >
> >> And Flavio reported what seems to be noticeable performance
> >> drop due to just accepted AVX512 DPIF implementation for the
> >> non-HWOL non-AVX512 setup:
> >>
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmail.openvswitch.org%2Fpipermail%2Fo
> vs-dev%2F2021-
> July%2F385448.html&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946d7cf2f%7C43083d15
> 727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQ
> IjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=Co%2FugazVORXH43uWlB2UUAoP%2BGdHpdwisKs10B
> PFW7c%3D&amp;reserved=0
> >>
> > We are testing partial HWOL setups, so I don't think Flavio's mail is relevant to this.
> >
> >> So, it seems that everything will need to be re-tested anyway
> >> in order to understand what is the current situation.
> >>
> > Agreed, let's retest the performance. I've included the new numbers below:
> >
> > I'm showing the performance relative to what we measured on OVS master directly before the VXLAN
> HWOL changes went in. All of the below results are using the scalar DPIF and partial HWOL.
> > Link to "Fixup patches" (v2):
> https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatchwork.ozlabs.org%2Fproject%2Fopen
> vswitch%2Flist%2F%3Fseries%3D253488&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946
> d7cf2f%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjo
> iMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=ejj3KdwvnZwYfiYM%2BEcpL5t
> nb3JIbtdhMmaMMhxJkSo%3D&amp;reserved=0
> >
> > Master before VXLAN HWOL changes. (f0e4a73)
> > 1.000x
> >
> > Master after VXLAN HWOL changes. (d53ea18)
> > 0.964x (-3.6%)
> >
> > After rebased fixup patches on OVS ML are applied. (with ALLOW_EXPERIMENTAL_API=off)
> > 0.993x (-0.7%)
> >
> > After rebased fixup patches on OVS ML are applied and after ALLOW_EXPERIMENTAL_API is removed.
> > 0.961x (-3.9%)
> 
> According to this, we are better off without the array cache commit, at
> least in this test (-3.6% vs -3.9%). Isn't it?
> 
> In our internal tests we saw it actually improves, though not gaining
> back all the degradation.
> 
> What do you think?
> 

I think we are better off keeping the array cache commit. It's a good idea and replaces an HMAP_FOR_EACH_IN_BUCKET() with a direct port_no to netdev lookup, which is what we need for the netdev_hw_miss_packet_recover() call.

Yes, I show a slight degradation with that, but it's small, and my test scenario only has one TX port. I think that one TX port hides the benefit of the array cache commit. When we have a more realistic number of TX ports, the direct lookup should be quicker than the HMAP_FOR_EACH.

My remaining concern is with PATCH 1/2 of this series.
In that patch, we are hiding the overhead of the packet restoration API under ALLOW_EXPERIMENTAL_API, but this is a temporary fix. In the future when ALLOW_EXPERIMENTAL_API is removed from the packet restoration API, that overhead will once again be present.

> >
> > So the performance is looking better now because of the optimization from Balazs. This together with
> #ifdefing out the code brings the performance almost to where it was before.
> >
> > I'm worried that the #ifdef is a temporary solution for the partial HWOL case, since eventually that
> will be removed. This leaves us with a -3.9% degradation.
> >
> > Thanks,
> > Cian
> >
> >> Best regards, Ilya Maximets.
> >>
> >>> Diff:
> >>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> >>> index accb23a1a..0e29c609f 100644
> >>> --- a/lib/dpif-netdev.c
> >>> +++ b/lib/dpif-netdev.c
> >>> @@ -7132,7 +7132,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
> >>>       struct netdev *netdev OVS_UNUSED;
> >>>       uint32_t mark;
> >>>
> >>> -#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
> >>>       /* Restore the packet if HW processing was terminated before completion. */
> >>>       netdev = pmd_netdev_cache_lookup(pmd, port_no);
> >>>       if (OVS_LIKELY(netdev)) {
> >>> @@ -7143,7 +7142,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
> >>>               return -1;
> >>>           }
> >>>       }
> >>> -#endif
> >>>
> >>>       /* If no mark, no flow to find. */
> >>>       if (!dp_packet_has_flow_mark(packet, &mark)) {
> >>>
Eli Britstein July 15, 2021, 2:10 p.m. UTC | #8
On 7/15/2021 4:35 PM, Ferriter, Cian wrote:
> External email: Use caution opening links or attachments
>
>
>> -----Original Message-----
>> From: Eli Britstein <elibr@nvidia.com>
>> Sent: Wednesday 14 July 2021 16:21
>> To: Ferriter, Cian <cian.ferriter@intel.com>; Ilya Maximets <i.maximets@ovn.org>; Gaëtan Rivet
>> <grive@u256.net>; dev@openvswitch.org; Van Haaren, Harry <harry.van.haaren@intel.com>
>> Cc: Majd Dibbiny <majd@nvidia.com>; Stokes, Ian <ian.stokes@intel.com>; Flavio Leitner
>> <fbl@sysclose.org>
>> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
>>
>>
>> On 7/14/2021 5:58 PM, Ferriter, Cian wrote:
>>> External email: Use caution opening links or attachments
>>>
>>>
>>>> -----Original Message-----
>>>> From: Ilya Maximets <i.maximets@ovn.org>
>>>> Sent: Friday 9 July 2021 21:53
>>>> To: Ferriter, Cian <cian.ferriter@intel.com>; Gaëtan Rivet <grive@u256.net>; Eli Britstein
>>>> <elibr@nvidia.com>; dev@openvswitch.org; Van Haaren, Harry <harry.van.haaren@intel.com>
>>>> Cc: Majd Dibbiny <majd@nvidia.com>; Ilya Maximets <i.maximets@ovn.org>; Stokes, Ian
>>>> <ian.stokes@intel.com>; Flavio Leitner <fbl@sysclose.org>
>>>> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
>>>>
>>>> On 7/8/21 6:43 PM, Ferriter, Cian wrote:
>>>>> Hi Gaetan, Eli and all,
>>>>>
>>>>> Thanks for the patch and the info on how it affects performance in your case. I just wanted to
>> post
>>>> the performance we are seeing.
>>>>> I've posted the numbers inline. Please note, I'll be away on leave till Tuesday.
>>>>> Thanks,
>>>>> Cian
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Gaëtan Rivet <grive@u256.net>
>>>>>> Sent: Wednesday 7 July 2021 17:36
>>>>>> To: Eli Britstein <elibr@nvidia.com>; <dev@openvswitch.org> <dev@openvswitch.org>; Van Haaren,
>>>> Harry
>>>>>> <harry.van.haaren@intel.com>; Ferriter, Cian <cian.ferriter@intel.com>
>>>>>> Cc: Majd Dibbiny <majd@nvidia.com>; Ilya Maximets <i.maximets@ovn.org>
>>>>>> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
>>>>>>
>>>>>> On Wed, Jul 7, 2021, at 17:05, Eli Britstein wrote:
>>>>>>> Port numbers are usually small. Maintain an array of netdev handles indexed
>>>>>>> by port numbers. It accelerates looking up for them for
>>>>>>> netdev_hw_miss_packet_recover().
>>>>>>>
>>>>>>> Reported-by: Cian Ferriter <cian.ferriter@intel.com>
>>>>>>> Signed-off-by: Eli Britstein <elibr@nvidia.com>
>>>>>>> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
>>>>>>> ---
>>>>> <snipped patch contents>
>>>>>
>>>>>>> _______________________________________________
>>>>>>> dev mailing list
>>>>>>> dev@openvswitch.org
>>>>>>>
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmail.openvswitch.org%2Fmailman%2Flis
>> tinfo%2Fovs-
>> dev&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946d7cf2f%7C43083d15727340c1b7db39e
>> fd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJ
>> BTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=rv%2FdenANxrcTGxBBbRvhhlNioyswL7ieFr8AGcGtCs8%3D&amp;rese
>> rved=0
>>>>>> Hello,
>>>>>>
>>>>>> I tested the performance impact of this patch with a partial offload setup.
>>>>>> As reported by pmd-stats-show, in average cycles per packet:
>>>>>>
>>>>>> Before vxlan-decap: 525 c/p
>>>>>> After vxlan-decap: 542 c/p
>>>>>> After this fix: 530 c/p
>>>>>>
>>>>>> Without those fixes, vxlan-decap has a 3.2% negative impact on cycles,
>>>>>> with the fixes, the impact is reduced to 0.95%.
>>>>>>
>>>>>> As I had to force partial offloads for our hardware, it would be better
>>>>>> with an outside confirmation on a proper setup.
>>>>>>
>>>>>> Kind regards,
>>>>>> --
>>>>>> Gaetan Rivet
>>>>> I'm showing the performance relative to what we measured on OVS master directly before the VXLAN
>>>> HWOL changes went in. All of the below results are using the scalar DPIF and partial HWOL.
>>>>> Link to "Fixup patches":
>> https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatchwork.ozlabs.org%2Fproject%2Fopen
>> vswitch%2Flist%2F%3Fseries%3D252356&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946
>> d7cf2f%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjo
>> iMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=Y62OrCRyS00vJHHPQvAHyhG5C
>> 4eO%2FSfWMCSPtszn3Is%3D&amp;reserved=0
>>>>> Master before VXLAN HWOL changes (f0e4a73)
>>>>> 1.000x
>>>>>
>>>>> Latest master after VXLAN HWOL changes (b780911)
>>>>> 0.918x (-8.2%)
>>>>>
>>>>> After fixup patches on OVS ML are applied (with ALLOW_EXPERIMENTAL_API=off)
>>>>> 0.973x (-2.7%)
>>>>>
>>>>> After fixup patches on OVS ML are applied and after ALLOW_EXPERIMENTAL_API is removed.
>>>>> 0.938x (-6.2%)
>>>>>
>>>>> I ran the last set of results by applying the below diff. I did this because I'm assuming the plan
>>>> is to remove the ALLOW_EXPERIMENTAL_API '#ifdef's at some point?
>>>>
>>>> Yes, that is the plan.
>>>>
>>> Thanks for confirming this.
>>>
>>>> And thanks for testing, Gaetan and Cian!
>>>>
>>>> Could you also provide more details on your test environment,
>>>> so someone else can reproduce?
>>>>
>>> Good idea, I'll add the details inline below. These details apply to the performance measured
>> previously by me, and the performance in this mail.
>>>> What is important to know:
>>>> - Test configuration: P2P, V2V, PVP, etc.
>>> P2P
>>> 1 PHY port
>>> 1 RXQ
>>>
>>>> - Test type: max. throughput, zero packet loss.
>>> Max throughput.
>>>
>>>> - OVS config: EMC, SMC, HWOL, AVX512 - on/off/type
>>> In all tests, all packets hit a single datapath flow with "offloaded:partial". So all packets are
>> partially offloaded, skipping miniflow_extract() and EMC/SMC/DPCLS lookups.
>>> AVX512 is off.
>>>
>>>> - Installed OF rules.
>>> $ $OVS_DIR/utilities/ovs-ofctl dump-flows br0
>>>    cookie=0x0, duration=253.691s, table=0, n_packets=2993867136, n_bytes=179632028160, in_port=phy0
>> actions=IN_PORT
>>>> - Traffic pattern: Packet size, number of flows, packet type.
>>> 64B, 1 flow, ETH/IP packets.
>>>
>>>> This tests also didn't include the fix from Balazs, IIUC, because
>>>> they were performed a bit before that patch got accepted.
>>>>
>>> Correct, the above tests didn't include the optimization from Balazs.
>>>
>>>> And Flavio reported what seems to be noticeable performance
>>>> drop due to just accepted AVX512 DPIF implementation for the
>>>> non-HWOL non-AVX512 setup:
>>>>
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmail.openvswitch.org%2Fpipermail%2Fo
>> vs-dev%2F2021-
>> July%2F385448.html&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946d7cf2f%7C43083d15
>> 727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQ
>> IjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=Co%2FugazVORXH43uWlB2UUAoP%2BGdHpdwisKs10B
>> PFW7c%3D&amp;reserved=0
>>> We are testing partial HWOL setups, so I don't think Flavio's mail is relevant to this.
>>>
>>>> So, it seems that everything will need to be re-tested anyway
>>>> in order to understand what is the current situation.
>>>>
>>> Agreed, let's retest the performance. I've included the new numbers below:
>>>
>>> I'm showing the performance relative to what we measured on OVS master directly before the VXLAN
>> HWOL changes went in. All of the below results are using the scalar DPIF and partial HWOL.
>>> Link to "Fixup patches" (v2):
>> https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatchwork.ozlabs.org%2Fproject%2Fopen
>> vswitch%2Flist%2F%3Fseries%3D253488&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946
>> d7cf2f%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjo
>> iMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=ejj3KdwvnZwYfiYM%2BEcpL5t
>> nb3JIbtdhMmaMMhxJkSo%3D&amp;reserved=0
>>> Master before VXLAN HWOL changes. (f0e4a73)
>>> 1.000x
>>>
>>> Master after VXLAN HWOL changes. (d53ea18)
>>> 0.964x (-3.6%)
>>>
>>> After rebased fixup patches on OVS ML are applied. (with ALLOW_EXPERIMENTAL_API=off)
>>> 0.993x (-0.7%)
>>>
>>> After rebased fixup patches on OVS ML are applied and after ALLOW_EXPERIMENTAL_API is removed.
>>> 0.961x (-3.9%)
>> According to this, we are better off without the array cache commit, at
>> least in this test (-3.6% vs -3.9%). Isn't it?
>>
>> In our internal tests we saw it actually improves, though not gaining
>> back all the degradation.
>>
>> What do you think?
>>
> I think we are better off keeping the array cache commit. It's a good idea and replaces a HMAP_FOR_EACH_IN_BUCKET() with a direct port_no to netdev lookup which is what we need for the netdev_hw_miss_packet_recover() call.
>
> Yes, I show a slight degradation with that, but it's small, and my test scenario only has one TX port. I think that one TX port hides the benefit of the array cache commit. When we have a more realistic number of TX ports, the direct lookup should be quicker than the HMAP_FOR_EACH.

I thought about making the array hold struct tx_port * entries, not only 
netdevs. In that case, all calls to pmd_send_port_cache_lookup() are 
optimized, and the new function pmd_netdev_cache_lookup(), along with the 
changes in dp_netdev_hw_flow(), can be omitted.

Our tests (which are also not very realistic, in the same sense) showed 
very slight improvement/degradation, depending on vxlan or non-vxlan setups 
(I don't have the precise results).

As it was very slight (either way), I thought to focus only on the usage 
for netdev_hw_miss_packet_recover().

Following your comment, maybe indeed in a more realistic setup we will 
benefit from this even more.

What do you think?

>
> My remaining concern is with PATCH 1/2 of this series.
> In that patch, we are hiding the overhead of the packet restoration API under ALLOW_EXPERIMENTAL_API, but this is a temporary fix. In the future when ALLOW_EXPERIMENTAL_API is removed from the packet restoration API, that overhead will once again be present.

That's right.

I think that until it is removed in the future it is better for setups 
without experimental support, and also gives us the opportunity to 
optimize more meanwhile, maybe taking advantage of future patches. For 
example this:

https://patchwork.ozlabs.org/project/openvswitch/list/?series=253593

For example, if we move netdev_hw_miss_packet_recover() to be 
dpif-offload function, we won't need to provide the netdev at all.

>
>>> So the performance is looking better now because of the optimization from Balazs. This together with
>> #ifdefing out the code brings the performance almost to where it was before.
>>> I'm worried that the #ifdef is a temporary solution for the partial HWOL case, since eventually that
>> will be removed. This leaves us with a -3.9% degradation.
>>> Thanks,
>>> Cian
>>>
>>>> Best regards, Ilya Maximets.
>>>>
>>>>> Diff:
>>>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>>>>> index accb23a1a..0e29c609f 100644
>>>>> --- a/lib/dpif-netdev.c
>>>>> +++ b/lib/dpif-netdev.c
>>>>> @@ -7132,7 +7132,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
>>>>>        struct netdev *netdev OVS_UNUSED;
>>>>>        uint32_t mark;
>>>>>
>>>>> -#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
>>>>>        /* Restore the packet if HW processing was terminated before completion. */
>>>>>        netdev = pmd_netdev_cache_lookup(pmd, port_no);
>>>>>        if (OVS_LIKELY(netdev)) {
>>>>> @@ -7143,7 +7142,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
>>>>>                return -1;
>>>>>            }
>>>>>        }
>>>>> -#endif
>>>>>
>>>>>        /* If no mark, no flow to find. */
>>>>>        if (!dp_packet_has_flow_mark(packet, &mark)) {
>>>>>
Ferriter, Cian July 16, 2021, 9:30 a.m. UTC | #9
> -----Original Message-----
> From: Eli Britstein <elibr@nvidia.com>
> Sent: Thursday 15 July 2021 15:10
> To: Ferriter, Cian <cian.ferriter@intel.com>; Ilya Maximets <i.maximets@ovn.org>; Gaëtan Rivet
> <grive@u256.net>; dev@openvswitch.org; Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: Majd Dibbiny <majd@nvidia.com>; Stokes, Ian <ian.stokes@intel.com>; Flavio Leitner
> <fbl@sysclose.org>
> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
> 
> 
> On 7/15/2021 4:35 PM, Ferriter, Cian wrote:
> > External email: Use caution opening links or attachments
> >
> >
> >> -----Original Message-----
> >> From: Eli Britstein <elibr@nvidia.com>
> >> Sent: Wednesday 14 July 2021 16:21
> >> To: Ferriter, Cian <cian.ferriter@intel.com>; Ilya Maximets <i.maximets@ovn.org>; Gaëtan Rivet
> >> <grive@u256.net>; dev@openvswitch.org; Van Haaren, Harry <harry.van.haaren@intel.com>
> >> Cc: Majd Dibbiny <majd@nvidia.com>; Stokes, Ian <ian.stokes@intel.com>; Flavio Leitner
> >> <fbl@sysclose.org>
> >> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
> >>
> >>
> >> On 7/14/2021 5:58 PM, Ferriter, Cian wrote:
> >>> External email: Use caution opening links or attachments
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: Ilya Maximets <i.maximets@ovn.org>
> >>>> Sent: Friday 9 July 2021 21:53
> >>>> To: Ferriter, Cian <cian.ferriter@intel.com>; Gaëtan Rivet <grive@u256.net>; Eli Britstein
> >>>> <elibr@nvidia.com>; dev@openvswitch.org; Van Haaren, Harry <harry.van.haaren@intel.com>
> >>>> Cc: Majd Dibbiny <majd@nvidia.com>; Ilya Maximets <i.maximets@ovn.org>; Stokes, Ian
> >>>> <ian.stokes@intel.com>; Flavio Leitner <fbl@sysclose.org>
> >>>> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
> >>>>
> >>>> On 7/8/21 6:43 PM, Ferriter, Cian wrote:
> >>>>> Hi Gaetan, Eli and all,
> >>>>>
> >>>>> Thanks for the patch and the info on how it affects performance in your case. I just wanted to
> >> post
> >>>> the performance we are seeing.
> >>>>> I've posted the numbers inline. Please note, I'll be away on leave till Tuesday.
> >>>>> Thanks,
> >>>>> Cian
> >>>>>
> >>>>>> -----Original Message-----
> >>>>>> From: Gaëtan Rivet <grive@u256.net>
> >>>>>> Sent: Wednesday 7 July 2021 17:36
> >>>>>> To: Eli Britstein <elibr@nvidia.com>; <dev@openvswitch.org> <dev@openvswitch.org>; Van Haaren,
> >>>> Harry
> >>>>>> <harry.van.haaren@intel.com>; Ferriter, Cian <cian.ferriter@intel.com>
> >>>>>> Cc: Majd Dibbiny <majd@nvidia.com>; Ilya Maximets <i.maximets@ovn.org>
> >>>>>> Subject: Re: [ovs-dev] [PATCH 2/2] dpif-netdev: Introduce netdev array cache
> >>>>>>
> >>>>>> On Wed, Jul 7, 2021, at 17:05, Eli Britstein wrote:
> >>>>>>> Port numbers are usually small. Maintain an array of netdev handles indexed
> >>>>>>> by port numbers. It accelerates looking up for them for
> >>>>>>> netdev_hw_miss_packet_recover().
> >>>>>>>
> >>>>>>> Reported-by: Cian Ferriter <cian.ferriter@intel.com>
> >>>>>>> Signed-off-by: Eli Britstein <elibr@nvidia.com>
> >>>>>>> Reviewed-by: Gaetan Rivet <gaetanr@nvidia.com>
> >>>>>>> ---
> >>>>> <snipped patch contents>
> >>>>>
> >>>>>>> _______________________________________________
> >>>>>>> dev mailing list
> >>>>>>> dev@openvswitch.org
> >>>>>>>
> >>
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmail.openvswitch.org%2Fmailman%2Flis
> >> tinfo%2Fovs-
> >>
> dev&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946d7cf2f%7C43083d15727340c1b7db39e
> >>
> fd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJ
> >>
> BTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=rv%2FdenANxrcTGxBBbRvhhlNioyswL7ieFr8AGcGtCs8%3D&amp;rese
> >> rved=0
> >>>>>> Hello,
> >>>>>>
> >>>>>> I tested the performance impact of this patch with a partial offload setup.
> >>>>>> As reported by pmd-stats-show, in average cycles per packet:
> >>>>>>
> >>>>>> Before vxlan-decap: 525 c/p
> >>>>>> After vxlan-decap: 542 c/p
> >>>>>> After this fix: 530 c/p
> >>>>>>
> >>>>>> Without those fixes, vxlan-decap has a 3.2% negative impact on cycles,
> >>>>>> with the fixes, the impact is reduced to 0.95%.
> >>>>>>
> >>>>>> As I had to force partial offloads for our hardware, it would be better
> >>>>>> with an outside confirmation on a proper setup.
> >>>>>>
> >>>>>> Kind regards,
> >>>>>> --
> >>>>>> Gaetan Rivet
> >>>>> I'm showing the performance relative to what we measured on OVS master directly before the VXLAN
> >>>> HWOL changes went in. All of the below results are using the scalar DPIF and partial HWOL.
> >>>>> Link to "Fixup patches":
> >>
> https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatchwork.ozlabs.org%2Fproject%2Fopen
> >>
> vswitch%2Flist%2F%3Fseries%3D252356&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946
> >>
> d7cf2f%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjo
> >>
> iMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=Y62OrCRyS00vJHHPQvAHyhG5C
> >> 4eO%2FSfWMCSPtszn3Is%3D&amp;reserved=0
> >>>>> Master before VXLAN HWOL changes (f0e4a73)
> >>>>> 1.000x
> >>>>>
> >>>>> Latest master after VXLAN HWOL changes (b780911)
> >>>>> 0.918x (-8.2%)
> >>>>>
> >>>>> After fixup patches on OVS ML are applied (with ALLOW_EXPERIMENTAL_API=off)
> >>>>> 0.973x (-2.7%)
> >>>>>
> >>>>> After fixup patches on OVS ML are applied and after ALLOW_EXPERIMENTAL_API is removed.
> >>>>> 0.938x (-6.2%)
> >>>>>
> >>>>> I ran the last set of results by applying the below diff. I did this because I'm assuming the
> plan
> >>>> is to remove the ALLOW_EXPERIMENTAL_API '#ifdef's at some point?
> >>>>
> >>>> Yes, that is the plan.
> >>>>
> >>> Thanks for confirming this.
> >>>
> >>>> And thanks for testing, Gaetan and Cian!
> >>>>
> >>>> Could you also provide more details on your test environment,
> >>>> so someone else can reproduce?
> >>>>
> >>> Good idea, I'll add the details inline below. These details apply to the performance measured
> >> previously by me, and the performance in this mail.
> >>>> What is important to know:
> >>>> - Test configuration: P2P, V2V, PVP, etc.
> >>> P2P
> >>> 1 PHY port
> >>> 1 RXQ
> >>>
> >>>> - Test type: max. throughput, zero packet loss.
> >>> Max throughput.
> >>>
> >>>> - OVS config: EMC, SMC, HWOL, AVX512 - on/off/type
> >>> In all tests, all packets hit a single datapath flow with "offloaded:partial". So all packets are
> >> partially offloaded, skipping miniflow_extract() and EMC/SMC/DPCLS lookups.
> >>> AVX512 is off.
> >>>
> >>>> - Installed OF rules.
> >>> $ $OVS_DIR/utilities/ovs-ofctl dump-flows br0
> >>>    cookie=0x0, duration=253.691s, table=0, n_packets=2993867136, n_bytes=179632028160,
> in_port=phy0
> >> actions=IN_PORT
> >>>> - Traffic pattern: Packet size, number of flows, packet type.
> >>> 64B, 1 flow, ETH/IP packets.
> >>>
> >>>> This tests also didn't include the fix from Balazs, IIUC, because
> >>>> they were performed a bit before that patch got accepted.
> >>>>
> >>> Correct, the above tests didn't include the optimization from Balazs.
> >>>
> >>>> And Flavio reported what seems to be noticeable performance
> >>>> drop due to just accepted AVX512 DPIF implementation for the
> >>>> non-HWOL non-AVX512 setup:
> >>>>
> >>
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmail.openvswitch.org%2Fpipermail%2Fo
> >> vs-dev%2F2021-
> >>
> July%2F385448.html&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946d7cf2f%7C43083d15
> >>
> 727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQ
> >>
> IjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=Co%2FugazVORXH43uWlB2UUAoP%2BGdHpdwisKs10B
> >> PFW7c%3D&amp;reserved=0
> >>> We are testing partial HWOL setups, so I don't think Flavio's mail is relevant to this.
> >>>
> >>>> So, it seems that everything will need to be re-tested anyway
> >>>> in order to understand what is the current situation.
> >>>>
> >>> Agreed, let's retest the performance. I've included the new numbers below:
> >>>
> >>> I'm showing the performance relative to what we measured on OVS master directly before the VXLAN
> >> HWOL changes went in. All of the below results are using the scalar DPIF and partial HWOL.
> >>> Link to "Fixup patches" (v2):
> >>
> https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatchwork.ozlabs.org%2Fproject%2Fopen
> >>
> vswitch%2Flist%2F%3Fseries%3D253488&amp;data=04%7C01%7Celibr%40nvidia.com%7C7ca0caf9434e429e4ffd08d946
> >>
> d7cf2f%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C637618715041410254%7CUnknown%7CTWFpbGZsb3d8eyJWIjo
> >>
> iMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=ejj3KdwvnZwYfiYM%2BEcpL5t
> >> nb3JIbtdhMmaMMhxJkSo%3D&amp;reserved=0
> >>> Master before VXLAN HWOL changes. (f0e4a73)
> >>> 1.000x
> >>>
> >>> Master after VXLAN HWOL changes. (d53ea18)
> >>> 0.964x (-3.6%)
> >>>
> >>> After rebased fixup patches on OVS ML are applied. (with ALLOW_EXPERIMENTAL_API=off)
> >>> 0.993x (-0.7%)
> >>>
> >>> After rebased fixup patches on OVS ML are applied and after ALLOW_EXPERIMENTAL_API is removed.
> >>> 0.961x (-3.9%)
> >> According to this, we are better off without the array cache commit, at
> >> least in this test (-3.6% vs -3.9%). Isn't it?
> >>
> >> In our internal tests we saw it actually improves, though not gaining
> >> back all the degradation.
> >>
> >> What do you think?
> >>
> > I think we are better off keeping the array cache commit. It's a good idea and replaces a
> HMAP_FOR_EACH_IN_BUCKET() with a direct port_no to netdev lookup which is what we need for the
> netdev_hw_miss_packet_recover() call.
> >
> > Yes, I show a slight degradation with that, but it's small, and my test scenario only has one TX
> port. I think that one TX port hides the benefit of the array cache commit. When we have a more
> realistic number of TX ports, the direct lookup should be quicker than the HMAP_FOR_EACH.
> 
> I thought about doing the array of struct tx_port *, not only netdevs.
> In this case all calls to pmd_send_port_cache_lookup() are optimized.
> The new function pmd_netdev_cache_lookup() along with the changes in
> dp_netdev_hw_flow() are omitted.
> 

This sounds quite nice actually, and optimizes more use cases, since pmd_send_port_cache_lookup() is used for the output action.

> Our tests (which are also not very realistic, in the same sense) showed
> very slight improvement/degradation, depending vxlan or non-vxlan setups
> (I don't have the precise results).
> 

Agreed, it sounds tricky to demonstrate the benefit with a simple setup, since the pmd_send_port_cache_lookup() shouldn't take long in the first place.

> As it was very slight (either way), I thought to focus only in the usage
> for netdev_hw_miss_packet_recover().
> 

Makes sense, and that's fine with me too.

> Following your comment, maybe indeed in a more realistic setup we will
> benefit from this even more.
> 
> What do you think?
> 

I think that both approaches sound good, so whichever is your preference is OK with me.

> >
> > My remaining concern is with PATCH 1/2 of this series.
> > In that patch, we are hiding the overhead of the packet restoration API under
> ALLOW_EXPERIMENTAL_API, but this is a temporary fix. In the future when ALLOW_EXPERIMENTAL_API is
> removed from the packet restoration API, that overhead will once again be present.
> 
> That's right.
> 
> I think that until it is removed in the future it is better for setups
> without experimental support, and also gives us the opportunity to
> optimize more meanwhile, maybe taking advantage of future patches. For
> example this:
> 
> https://patchwork.ozlabs.org/project/openvswitch/list/?series=253593
> 
> For example, if we move netdev_hw_miss_packet_recover() to be
> dpif-offload function, we won't need to provide the netdev at all.
> 

Future optimizations to minimize the impact that the packet restoration APIs have on the performance of other HWOL use cases are good. Having these optimizations in place before the ALLOW_EXPERIMENTAL_API ifdefs are removed makes the most sense.

This should hopefully mean no performance degradation for a user who doesn't use the packet restoration API.

> >
> >>> So the performance is looking better now because of the optimization from Balazs. This together
> with
> >> #ifdefing out the code brings the performance almost to where it was before.
> >>> I'm worried that the #ifdef is a temporary solution for the partial HWOL case, since eventually
> that
> >> will be removed. This leaves us with a -3.9% degradation.
> >>> Thanks,
> >>> Cian
> >>>
> >>>> Best regards, Ilya Maximets.
> >>>>
> >>>>> Diff:
> >>>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> >>>>> index accb23a1a..0e29c609f 100644
> >>>>> --- a/lib/dpif-netdev.c
> >>>>> +++ b/lib/dpif-netdev.c
> >>>>> @@ -7132,7 +7132,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
> >>>>>        struct netdev *netdev OVS_UNUSED;
> >>>>>        uint32_t mark;
> >>>>>
> >>>>> -#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
> >>>>>        /* Restore the packet if HW processing was terminated before completion. */
> >>>>>        netdev = pmd_netdev_cache_lookup(pmd, port_no);
> >>>>>        if (OVS_LIKELY(netdev)) {
> >>>>> @@ -7143,7 +7142,6 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
> >>>>>                return -1;
> >>>>>            }
> >>>>>        }
> >>>>> -#endif
> >>>>>
> >>>>>        /* If no mark, no flow to find. */
> >>>>>        if (!dp_packet_has_flow_mark(packet, &mark)) {
> >>>>>
diff mbox series

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 2e654426e..accb23a1a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -650,6 +650,9 @@  struct dp_netdev_pmd_thread_ctx {
     uint32_t emc_insert_min;
 };
 
+/* Size of netdev's cache. */
+#define DP_PMD_NETDEV_CACHE_SIZE 1024
+
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
  * the performance overhead of interrupt processing.  Therefore netdev can
  * not implement rx-wait for these devices.  dpif-netdev needs to poll
@@ -786,6 +789,7 @@  struct dp_netdev_pmd_thread {
      * other instance will only be accessed by its own pmd thread. */
     struct hmap tnl_port_cache;
     struct hmap send_port_cache;
+    struct netdev *send_netdev_cache[DP_PMD_NETDEV_CACHE_SIZE];
 
     /* Keep track of detailed PMD performance statistics. */
     struct pmd_perf_stats perf_stats;
@@ -5910,6 +5914,10 @@  pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
         free(tx_port_cached);
     }
     HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
+        if (tx_port_cached->port->port_no <
+            ARRAY_SIZE(pmd->send_netdev_cache)) {
+            pmd->send_netdev_cache[tx_port_cached->port->port_no] = NULL;
+        }
         free(tx_port_cached);
     }
 }
@@ -5939,6 +5947,11 @@  pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
             tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
             hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
                         hash_port_no(tx_port_cached->port->port_no));
+            if (tx_port_cached->port->port_no <
+                ARRAY_SIZE(pmd->send_netdev_cache)) {
+                pmd->send_netdev_cache[tx_port_cached->port->port_no] =
+                    tx_port_cached->port->netdev;
+            }
         }
     }
 }
@@ -6585,6 +6598,7 @@  dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     hmap_init(&pmd->tx_ports);
     hmap_init(&pmd->tnl_port_cache);
     hmap_init(&pmd->send_port_cache);
+    memset(pmd->send_netdev_cache, 0, sizeof pmd->send_netdev_cache);
     cmap_init(&pmd->tx_bonds);
     /* init the 'flow_cache' since there is no
      * actual thread created for NON_PMD_CORE_ID. */
@@ -6603,6 +6617,7 @@  dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
     struct dpcls *cls;
 
     dp_netdev_pmd_flow_flush(pmd);
+    memset(pmd->send_netdev_cache, 0, sizeof pmd->send_netdev_cache);
     hmap_destroy(&pmd->send_port_cache);
     hmap_destroy(&pmd->tnl_port_cache);
     hmap_destroy(&pmd->tx_ports);
@@ -7090,20 +7105,38 @@  smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
 static struct tx_port * pmd_send_port_cache_lookup(
     const struct dp_netdev_pmd_thread *pmd, odp_port_t port_no);
 
+OVS_UNUSED
+static inline struct netdev *
+pmd_netdev_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
+                        odp_port_t port_no)
+{
+    struct tx_port *p;
+
+    if (port_no < ARRAY_SIZE(pmd->send_netdev_cache)) {
+        return pmd->send_netdev_cache[port_no];
+    }
+
+    p = pmd_send_port_cache_lookup(pmd, port_no);
+    if (p) {
+        return p->port->netdev;
+    }
+    return NULL;
+}
+
 static inline int
 dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
                   odp_port_t port_no OVS_UNUSED,
                   struct dp_packet *packet,
                   struct dp_netdev_flow **flow)
 {
-    struct tx_port *p OVS_UNUSED;
+    struct netdev *netdev OVS_UNUSED;
     uint32_t mark;
 
 #ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
     /* Restore the packet if HW processing was terminated before completion. */
-    p = pmd_send_port_cache_lookup(pmd, port_no);
-    if (OVS_LIKELY(p)) {
-        int err = netdev_hw_miss_packet_recover(p->port->netdev, packet);
+    netdev = pmd_netdev_cache_lookup(pmd, port_no);
+    if (OVS_LIKELY(netdev)) {
+        int err = netdev_hw_miss_packet_recover(netdev, packet);
 
         if (err && err != EOPNOTSUPP) {
             COVERAGE_INC(datapath_drop_hw_miss_recover);