Message ID | 1552454868-183848-1-git-send-email-Yanqin.Wei@arm.com |
---|---|
State | Superseded |
Delegated to: | Ian Stokes |
Headers | show |
Series | [ovs-dev,v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. | expand |
Hi , OVS Maintainers, Could you help to have a look at this patch? Thanks a lot. Best Regards, Wei Yanqin -----Original Message----- From: Yanqin Wei <Yanqin.Wei@arm.com> Sent: Wednesday, March 13, 2019 1:28 PM To: dev@openvswitch.org Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com> Subject: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. It is observed that the throughput of multi-flow is worse than single-flow in the EMC NIC to NIC cases. It is because CPU cache-miss increasing in EMC lookup. Each flow need load at least one EMC entry to CPU cache(several cache lines) and compare it with packet miniflow. This patch improve it by prefetching EMC entry in advance. Hash value can be obtained from dpdk rss hash, so this step can be advanced ahead of miniflow_extract() and prefetch EMC entry there. The prefetching size is defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority traffic including TCP/UDP protocol and need 2 cache lines in most modern CPU. Performance test was run in some arm platform. 1000/10000 flows NIC2NIC test achieved around 10% throughput improvement in thunderX2(aarch64 platform). Signed-off-by: Yanqin Wei <Yanqin.Wei@arm.com> Reviewed-by: Gavin Hu <Gavin.Hu@arm.com> --- lib/dpif-netdev.c | 80 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 28 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4d6d0c3..982082c 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -189,6 +189,10 @@ struct netdev_flow_key { #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ DEFAULT_EM_FLOW_INSERT_INV_PROB) +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including +TCP/UDP + * protocol. */ +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) + struct emc_entry { struct dp_netdev_flow *flow; struct netdev_flow_key key; /* key.hash used for emc hash value. */ @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, } static inline uint32_t -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, - const struct miniflow *mf) +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, + bool md_is_valid) { - uint32_t hash; + uint32_t hash,recirc_depth; - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { - hash = dp_packet_get_rss_hash(packet); - } else { - hash = miniflow_hash_5tuple(mf, 0); + hash = dp_packet_get_rss_hash(packet); + + if (md_is_valid) { + /* The RSS hash must account for the recirculation depth to avoid + * collisions in the exact match cache */ + recirc_depth = *recirc_depth_get_unsafe(); + if (OVS_UNLIKELY(recirc_depth)) { + hash = hash_finish(hash, recirc_depth); + } dp_packet_set_rss_hash(packet, hash); } @@ -6182,24 +6191,23 @@ dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, } static inline uint32_t -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, - const struct miniflow *mf) +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, + const struct miniflow *mf, + bool md_is_valid) { - uint32_t hash, recirc_depth; + uint32_t hash,recirc_depth; - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { - hash = dp_packet_get_rss_hash(packet); - } else { - hash = miniflow_hash_5tuple(mf, 0); - dp_packet_set_rss_hash(packet, hash); - } + hash = miniflow_hash_5tuple(mf, 0); + dp_packet_set_rss_hash(packet, hash); - /* The RSS hash must account for the recirculation depth to avoid - * collisions in the exact match cache */ - recirc_depth = *recirc_depth_get_unsafe(); - if (OVS_UNLIKELY(recirc_depth)) { - hash = hash_finish(hash, recirc_depth); - dp_packet_set_rss_hash(packet, hash); + if (md_is_valid) { + /* The RSS hash must account for the recirculation depth to avoid + * collisions in the exact match cache */ + recirc_depth = *recirc_depth_get_unsafe(); + if (OVS_UNLIKELY(recirc_depth)) { + hash = hash_finish(hash, recirc_depth); + dp_packet_set_rss_hash(packet, hash); + } } return hash; } @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, bool smc_enable_db; size_t map_cnt = 0; bool batch_enable = true; + bool is_5tuple_hash_needed; atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); pmd_perf_update_counter(&pmd->perf_stats, @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, } } - miniflow_extract(packet, &key->mf); - key->len = 0; /* Not computed yet. */ /* If EMC and SMC disabled skip hash computation */ if (smc_enable_db == true || cur_min != 0) { - if (!md_is_valid) { - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, - &key->mf); + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { + is_5tuple_hash_needed = false; + key->hash = + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); + if (cur_min) { + ovs_prefetch_range( + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], + DEFAULT_EMC_PREFETCH_SIZE); + } } else { - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); + is_5tuple_hash_needed = true; } + } else { + is_5tuple_hash_needed = false; + } + + miniflow_extract(packet, &key->mf); + key->len = 0; /* Not computed yet. */ + + /* If 5tuple hash is needed */ + if (is_5tuple_hash_needed) { + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, &key->mf, + + md_is_valid); } if (cur_min) { flow = emc_lookup(&cache->emc_cache, key); -- 2.7.4
On 22.03.2019 11:44, Yanqin Wei (Arm Technology China) wrote: > Hi , OVS Maintainers, > > Could you help to have a look at this patch? Thanks a lot. Hi. Thanks for improving performance and sorry for delay. Review process here in OVS is a bit slow due to lack of reviewers. I have a plan to test this patch a bit on a next week. Want to check the performance impact on PVP cases on x86. BTW, I have a patch that affects same code. Maybe it'll be interesting to you: https://patchwork.ozlabs.org/patch/1054571/ Best regards, Ilya Maximets. > > Best Regards, > Wei Yanqin > > -----Original Message----- > From: Yanqin Wei <Yanqin.Wei@arm.com> > Sent: Wednesday, March 13, 2019 1:28 PM > To: dev@openvswitch.org > Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com> > Subject: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. > > It is observed that the throughput of multi-flow is worse than single-flow in the EMC NIC to NIC cases. It is because CPU cache-miss increasing in EMC lookup. Each flow need load at least one EMC entry to CPU cache(several cache lines) and compare it with packet miniflow. > This patch improve it by prefetching EMC entry in advance. Hash value can be obtained from dpdk rss hash, so this step can be advanced ahead of > miniflow_extract() and prefetch EMC entry there. The prefetching size is defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority traffic including TCP/UDP protocol and need 2 cache lines in most modern CPU. > Performance test was run in some arm platform. 1000/10000 flows NIC2NIC test achieved around 10% throughput improvement in thunderX2(aarch64 platform). > > Signed-off-by: Yanqin Wei <Yanqin.Wei@arm.com> > Reviewed-by: Gavin Hu <Gavin.Hu@arm.com> > --- > lib/dpif-netdev.c | 80 ++++++++++++++++++++++++++++++++++++------------------- > 1 file changed, 52 insertions(+), 28 deletions(-) > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4d6d0c3..982082c 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -189,6 +189,10 @@ struct netdev_flow_key { > #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ > DEFAULT_EM_FLOW_INSERT_INV_PROB) > > +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including > +TCP/UDP > + * protocol. */ > +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) > + > struct emc_entry { > struct dp_netdev_flow *flow; > struct netdev_flow_key key; /* key.hash used for emc hash value. */ > @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, > + bool md_is_valid) > { > - uint32_t hash; > + uint32_t hash,recirc_depth; > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > + hash = dp_packet_get_rss_hash(packet); > + > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { > + hash = hash_finish(hash, recirc_depth); > + } > dp_packet_set_rss_hash(packet, hash); > } > > @@ -6182,24 +6191,23 @@ dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, > + const struct miniflow *mf, > + bool md_is_valid) > { > - uint32_t hash, recirc_depth; > + uint32_t hash,recirc_depth; > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > - dp_packet_set_rss_hash(packet, hash); > - } > + hash = miniflow_hash_5tuple(mf, 0); > + dp_packet_set_rss_hash(packet, hash); > > - /* The RSS hash must account for the recirculation depth to avoid > - * collisions in the exact match cache */ > - recirc_depth = *recirc_depth_get_unsafe(); > - if (OVS_UNLIKELY(recirc_depth)) { > - hash = hash_finish(hash, recirc_depth); > - dp_packet_set_rss_hash(packet, hash); > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { > + hash = hash_finish(hash, recirc_depth); > + dp_packet_set_rss_hash(packet, hash); > + } > } > return hash; > } > @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > bool smc_enable_db; > size_t map_cnt = 0; > bool batch_enable = true; > + bool is_5tuple_hash_needed; > > atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); > pmd_perf_update_counter(&pmd->perf_stats, > @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > } > } > > - miniflow_extract(packet, &key->mf); > - key->len = 0; /* Not computed yet. */ > /* If EMC and SMC disabled skip hash computation */ > if (smc_enable_db == true || cur_min != 0) { > - if (!md_is_valid) { > - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, > - &key->mf); > + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > + is_5tuple_hash_needed = false; > + key->hash = > + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); > + if (cur_min) { > + ovs_prefetch_range( > + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], > + DEFAULT_EMC_PREFETCH_SIZE); > + } > } else { > - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); > + is_5tuple_hash_needed = true; > } > + } else { > + is_5tuple_hash_needed = false; > + } > + > + miniflow_extract(packet, &key->mf); > + key->len = 0; /* Not computed yet. */ > + > + /* If 5tuple hash is needed */ > + if (is_5tuple_hash_needed) { > + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, &key->mf, > + > + md_is_valid); > } > if (cur_min) { > flow = emc_lookup(&cache->emc_cache, key); > -- > 2.7.4 > > >
On 3/22/2019 8:44 AM, Yanqin Wei (Arm Technology China) wrote: > Hi , OVS Maintainers, > > Could you help to have a look at this patch? Thanks a lot. > Hi Wei, I'll have some time this afternoon to look at this. Thanks Ian > Best Regards, > Wei Yanqin > > -----Original Message----- > From: Yanqin Wei <Yanqin.Wei@arm.com> > Sent: Wednesday, March 13, 2019 1:28 PM > To: dev@openvswitch.org > Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com> > Subject: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. > > It is observed that the throughput of multi-flow is worse than single-flow in the EMC NIC to NIC cases. It is because CPU cache-miss increasing in EMC lookup. Each flow need load at least one EMC entry to CPU cache(several cache lines) and compare it with packet miniflow. > This patch improve it by prefetching EMC entry in advance. Hash value can be obtained from dpdk rss hash, so this step can be advanced ahead of > miniflow_extract() and prefetch EMC entry there. The prefetching size is defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority traffic including TCP/UDP protocol and need 2 cache lines in most modern CPU. > Performance test was run in some arm platform. 1000/10000 flows NIC2NIC test achieved around 10% throughput improvement in thunderX2(aarch64 platform). > > Signed-off-by: Yanqin Wei <Yanqin.Wei@arm.com> > Reviewed-by: Gavin Hu <Gavin.Hu@arm.com> > --- > lib/dpif-netdev.c | 80 ++++++++++++++++++++++++++++++++++++------------------- > 1 file changed, 52 insertions(+), 28 deletions(-) > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4d6d0c3..982082c 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -189,6 +189,10 @@ struct netdev_flow_key { > #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ > DEFAULT_EM_FLOW_INSERT_INV_PROB) > > +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including > +TCP/UDP > + * protocol. */ > +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) > + > struct emc_entry { > struct dp_netdev_flow *flow; > struct netdev_flow_key key; /* key.hash used for emc hash value. */ > @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, > + bool md_is_valid) > { > - uint32_t hash; > + uint32_t hash,recirc_depth; > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > + hash = dp_packet_get_rss_hash(packet); > + > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { > + hash = hash_finish(hash, recirc_depth); > + } > dp_packet_set_rss_hash(packet, hash); > } > > @@ -6182,24 +6191,23 @@ dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, > + const struct miniflow *mf, > + bool md_is_valid) > { > - uint32_t hash, recirc_depth; > + uint32_t hash,recirc_depth; > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > - dp_packet_set_rss_hash(packet, hash); > - } > + hash = miniflow_hash_5tuple(mf, 0); > + dp_packet_set_rss_hash(packet, hash); > > - /* The RSS hash must account for the recirculation depth to avoid > - * collisions in the exact match cache */ > - recirc_depth = *recirc_depth_get_unsafe(); > - if (OVS_UNLIKELY(recirc_depth)) { > - hash = hash_finish(hash, recirc_depth); > - dp_packet_set_rss_hash(packet, hash); > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { > + hash = hash_finish(hash, recirc_depth); > + dp_packet_set_rss_hash(packet, hash); > + } > } > return hash; > } > @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > bool smc_enable_db; > size_t map_cnt = 0; > bool batch_enable = true; > + bool is_5tuple_hash_needed; > > atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); > pmd_perf_update_counter(&pmd->perf_stats, > @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > } > } > > - miniflow_extract(packet, &key->mf); > - key->len = 0; /* Not computed yet. */ > /* If EMC and SMC disabled skip hash computation */ > if (smc_enable_db == true || cur_min != 0) { > - if (!md_is_valid) { > - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, > - &key->mf); > + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > + is_5tuple_hash_needed = false; > + key->hash = > + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); > + if (cur_min) { > + ovs_prefetch_range( > + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], > + DEFAULT_EMC_PREFETCH_SIZE); > + } > } else { > - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); > + is_5tuple_hash_needed = true; > } > + } else { > + is_5tuple_hash_needed = false; > + } > + > + miniflow_extract(packet, &key->mf); > + key->len = 0; /* Not computed yet. */ > + > + /* If 5tuple hash is needed */ > + if (is_5tuple_hash_needed) { > + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, &key->mf, > + > + md_is_valid); > } > if (cur_min) { > flow = emc_lookup(&cache->emc_cache, key); > -- > 2.7.4 >
On 3/13/2019 5:27 AM, Yanqin Wei wrote: > It is observed that the throughput of multi-flow is worse than single-flow > in the EMC NIC to NIC cases. It is because CPU cache-miss increasing in EMC > lookup. Each flow need load at least one EMC entry to CPU cache(several > cache lines) and compare it with packet miniflow. > This patch improve it by prefetching EMC entry in advance. Hash value can > be obtained from dpdk rss hash, so this step can be advanced ahead of > miniflow_extract() and prefetch EMC entry there. The prefetching size is > defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority traffic > including TCP/UDP protocol and need 2 cache lines in most modern CPU. > Performance test was run in some arm platform. 1000/10000 flows NIC2NIC > test achieved around 10% throughput improvement in thunderX2(aarch64 > platform). > Thanks for this Wei, not a few review, please see some minor comments below WRT style issues. I've also run some benchmarks on this. I was seeing typically a ~3% drop on x86 with single flows with RFC2544. However once or twice, I saw a drop of up to 25% on achievable lossless packet rate but I suspect it could be an anomaly in my setup. Ilya, if you are testing this week on x86, it would be great you confirm if you see something similar in your benchmarks? For vsperf phy2phy_scalability flow tests on x86 I saw an improvement of +3% after applying the patch for zero loss tests and +5% in the case of phy2phy_scalability_cont so this looks promising. As an FYI I'll be I'm out of office this coming week so will not have an opportunity to investigate further until I'm back in office. I'll be able to review and benchmark further then. > Signed-off-by: Yanqin Wei <Yanqin.Wei@arm.com> > Reviewed-by: Gavin Hu <Gavin.Hu@arm.com> Although it doesn't appear here or in patchwork, after downloading the patch the sign off and review tags above appear duplicated after being applied. Examining the mbox I can confirm they are duplicated, can you check this on your side also? > --- > lib/dpif-netdev.c | 80 ++++++++++++++++++++++++++++++++++++------------------- > 1 file changed, 52 insertions(+), 28 deletions(-) > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index 4d6d0c3..982082c 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -189,6 +189,10 @@ struct netdev_flow_key { > #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ > DEFAULT_EM_FLOW_INSERT_INV_PROB) > > +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including TCP/UDP > + * protocol. */ > +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) > + > struct emc_entry { > struct dp_netdev_flow *flow; > struct netdev_flow_key key; /* key.hash used for emc hash value. */ > @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, > } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, > + bool md_is_valid) > { > - uint32_t hash; > + uint32_t hash,recirc_depth; > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > + hash = dp_packet_get_rss_hash(packet); > + > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ Minor, comment style, missing period at end of comment. > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { > + hash = hash_finish(hash, recirc_depth); > + } > dp_packet_set_rss_hash(packet, hash); > } > > @@ -6182,24 +6191,23 @@ dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, > } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, > + const struct miniflow *mf, > + bool md_is_valid) > { > - uint32_t hash, recirc_depth; > + uint32_t hash,recirc_depth; Coding style, missing space between , and recirc_depth. > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > - dp_packet_set_rss_hash(packet, hash); > - } > + hash = miniflow_hash_5tuple(mf, 0); > + dp_packet_set_rss_hash(packet, hash); > > - /* The RSS hash must account for the recirculation depth to avoid > - * collisions in the exact match cache */ > - recirc_depth = *recirc_depth_get_unsafe(); > - if (OVS_UNLIKELY(recirc_depth)) { > - hash = hash_finish(hash, recirc_depth); > - dp_packet_set_rss_hash(packet, hash); > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ Minor, comment style, missing period at end of comment. > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { > + hash = hash_finish(hash, recirc_depth); > + dp_packet_set_rss_hash(packet, hash); > + } > } > return hash; > } > @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > bool smc_enable_db; > size_t map_cnt = 0; > bool batch_enable = true; > + bool is_5tuple_hash_needed; > > atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); > pmd_perf_update_counter(&pmd->perf_stats, > @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > } > } > > - miniflow_extract(packet, &key->mf); > - key->len = 0; /* Not computed yet. */ > /* If EMC and SMC disabled skip hash computation */ > if (smc_enable_db == true || cur_min != 0) { > - if (!md_is_valid) { > - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, > - &key->mf); > + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > + is_5tuple_hash_needed = false; > + key->hash = > + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); Coding style, missing space between , and md_is_valid. > + if (cur_min) { > + ovs_prefetch_range( > + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], > + DEFAULT_EMC_PREFETCH_SIZE); > + } > } else { > - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); > + is_5tuple_hash_needed = true; > } > + } else { > + is_5tuple_hash_needed = false; > + } > + > + miniflow_extract(packet, &key->mf); > + key->len = 0; /* Not computed yet. */ > + > + /* If 5tuple hash is needed */ Minor, comment style, missing period at end of comment. Ian > + if (is_5tuple_hash_needed) { > + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, &key->mf, > + md_is_valid); > } > if (cur_min) { > flow = emc_lookup(&cache->emc_cache, key); >
Hi Ian, I also observed a minor throughput drop (around 1%) with single flow in arm platform, but not 25% drop. Maybe the additional prefetch operation cause it. Anyway, when you come back next week, let's discuss this patch again. Best Regards, Wei Yanqin -----Original Message----- From: Ian Stokes <ian.stokes@intel.com> Sent: Monday, March 25, 2019 6:16 AM To: Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com>; dev@openvswitch.org Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Ilya Maximets <i.maximets@samsung.com> Subject: Re: [ovs-dev] [PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. On 3/13/2019 5:27 AM, Yanqin Wei wrote: > It is observed that the throughput of multi-flow is worse than > single-flow in the EMC NIC to NIC cases. It is because CPU cache-miss > increasing in EMC lookup. Each flow need load at least one EMC entry > to CPU cache(several cache lines) and compare it with packet miniflow. > This patch improve it by prefetching EMC entry in advance. Hash value > can be obtained from dpdk rss hash, so this step can be advanced ahead > of > miniflow_extract() and prefetch EMC entry there. The prefetching size > is defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority > traffic including TCP/UDP protocol and need 2 cache lines in most modern CPU. > Performance test was run in some arm platform. 1000/10000 flows > NIC2NIC test achieved around 10% throughput improvement in > thunderX2(aarch64 platform). > Thanks for this Wei, not a few review, please see some minor comments below WRT style issues. I've also run some benchmarks on this. I was seeing typically a ~3% drop on x86 with single flows with RFC2544. However once or twice, I saw a drop of up to 25% on achievable lossless packet rate but I suspect it could be an anomaly in my setup. Ilya, if you are testing this week on x86, it would be great you confirm if you see something similar in your benchmarks? For vsperf phy2phy_scalability flow tests on x86 I saw an improvement of +3% after applying the patch for zero loss tests and +5% in the case of phy2phy_scalability_cont so this looks promising. As an FYI I'll be I'm out of office this coming week so will not have an opportunity to investigate further until I'm back in office. I'll be able to review and benchmark further then. > Signed-off-by: Yanqin Wei <Yanqin.Wei@arm.com> > Reviewed-by: Gavin Hu <Gavin.Hu@arm.com> Although it doesn't appear here or in patchwork, after downloading the patch the sign off and review tags above appear duplicated after being applied. Examining the mbox I can confirm they are duplicated, can you check this on your side also? > --- > lib/dpif-netdev.c | 80 ++++++++++++++++++++++++++++++++++++------------------- > 1 file changed, 52 insertions(+), 28 deletions(-) > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index 4d6d0c3..982082c 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -189,6 +189,10 @@ struct netdev_flow_key { > #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ > DEFAULT_EM_FLOW_INSERT_INV_PROB) > > +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including TCP/UDP > + * protocol. */ > +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) > + > struct emc_entry { > struct dp_netdev_flow *flow; > struct netdev_flow_key key; /* key.hash used for emc hash value. */ > @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, > } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, > + bool md_is_valid) > { > - uint32_t hash; > + uint32_t hash,recirc_depth; > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > + hash = dp_packet_get_rss_hash(packet); > + > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ Minor, comment style, missing period at end of comment. > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { > + hash = hash_finish(hash, recirc_depth); > + } > dp_packet_set_rss_hash(packet, hash); > } > > @@ -6182,24 +6191,23 @@ dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, > } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, > + const struct miniflow *mf, > + bool md_is_valid) > { > - uint32_t hash, recirc_depth; > + uint32_t hash,recirc_depth; Coding style, missing space between , and recirc_depth. > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > - dp_packet_set_rss_hash(packet, hash); > - } > + hash = miniflow_hash_5tuple(mf, 0); > + dp_packet_set_rss_hash(packet, hash); > > - /* The RSS hash must account for the recirculation depth to avoid > - * collisions in the exact match cache */ > - recirc_depth = *recirc_depth_get_unsafe(); > - if (OVS_UNLIKELY(recirc_depth)) { > - hash = hash_finish(hash, recirc_depth); > - dp_packet_set_rss_hash(packet, hash); > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ Minor, comment style, missing period at end of comment. > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { > + hash = hash_finish(hash, recirc_depth); > + dp_packet_set_rss_hash(packet, hash); > + } > } > return hash; > } > @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > bool smc_enable_db; > size_t map_cnt = 0; > bool batch_enable = true; > + bool is_5tuple_hash_needed; > > atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); > pmd_perf_update_counter(&pmd->perf_stats, > @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > } > } > > - miniflow_extract(packet, &key->mf); > - key->len = 0; /* Not computed yet. */ > /* If EMC and SMC disabled skip hash computation */ > if (smc_enable_db == true || cur_min != 0) { > - if (!md_is_valid) { > - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, > - &key->mf); > + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > + is_5tuple_hash_needed = false; > + key->hash = > + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); Coding style, missing space between , and md_is_valid. > + if (cur_min) { > + ovs_prefetch_range( > + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], > + DEFAULT_EMC_PREFETCH_SIZE); > + } > } else { > - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); > + is_5tuple_hash_needed = true; > } > + } else { > + is_5tuple_hash_needed = false; > + } > + > + miniflow_extract(packet, &key->mf); > + key->len = 0; /* Not computed yet. */ > + > + /* If 5tuple hash is needed */ Minor, comment style, missing period at end of comment. Ian > + if (is_5tuple_hash_needed) { > + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, &key->mf, > + md_is_valid); > } > if (cur_min) { > flow = emc_lookup(&cache->emc_cache, key); >
Hi Ilya, Thanks for your reply. We could have a look at test results in x86 by then. I can understand patch 1054571. If both patches are apply to master, I could rebase prefetch EMC patch for it. Mandatory hash computing in the ingress makes logic simple a lot, and it only costs a small price even in worst case(EMC/SMC disable & no hash-feature/load balance enable). Best Regards, Wei Yanqin -----Original Message----- From: Ilya Maximets <i.maximets@samsung.com> Sent: Friday, March 22, 2019 9:12 PM To: Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com>; dev@openvswitch.org; ian.stokes@intel.com Cc: nd <nd@arm.com> Subject: Re: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. On 22.03.2019 11:44, Yanqin Wei (Arm Technology China) wrote: > Hi , OVS Maintainers, > > Could you help to have a look at this patch? Thanks a lot. Hi. Thanks for improving performance and sorry for delay. Review process here in OVS is a bit slow due to lack of reviewers. I have a plan to test this patch a bit on a next week. Want to check the performance impact on PVP cases on x86. BTW, I have a patch that affects same code. Maybe it'll be interesting to you: https://patchwork.ozlabs.org/patch/1054571/ Best regards, Ilya Maximets. > > Best Regards, > Wei Yanqin > > -----Original Message----- > From: Yanqin Wei <Yanqin.Wei@arm.com> > Sent: Wednesday, March 13, 2019 1:28 PM > To: dev@openvswitch.org > Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) > <Gavin.Hu@arm.com>; Yanqin Wei (Arm Technology China) > <Yanqin.Wei@arm.com> > Subject: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. > > It is observed that the throughput of multi-flow is worse than single-flow in the EMC NIC to NIC cases. It is because CPU cache-miss increasing in EMC lookup. Each flow need load at least one EMC entry to CPU cache(several cache lines) and compare it with packet miniflow. > This patch improve it by prefetching EMC entry in advance. Hash value > can be obtained from dpdk rss hash, so this step can be advanced ahead > of > miniflow_extract() and prefetch EMC entry there. The prefetching size is defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority traffic including TCP/UDP protocol and need 2 cache lines in most modern CPU. > Performance test was run in some arm platform. 1000/10000 flows NIC2NIC test achieved around 10% throughput improvement in thunderX2(aarch64 platform). > > Signed-off-by: Yanqin Wei <Yanqin.Wei@arm.com> > Reviewed-by: Gavin Hu <Gavin.Hu@arm.com> > --- > lib/dpif-netdev.c | 80 > ++++++++++++++++++++++++++++++++++++------------------- > 1 file changed, 52 insertions(+), 28 deletions(-) > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index > 4d6d0c3..982082c 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -189,6 +189,10 @@ struct netdev_flow_key { > #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ > DEFAULT_EM_FLOW_INSERT_INV_PROB) > > +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including > +TCP/UDP > + * protocol. */ > +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) > + > struct emc_entry { > struct dp_netdev_flow *flow; > struct netdev_flow_key key; /* key.hash used for emc hash value. */ > @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread > *pmd, struct dp_packet *packet_, } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, > + bool md_is_valid) > { > - uint32_t hash; > + uint32_t hash,recirc_depth; > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > + hash = dp_packet_get_rss_hash(packet); > + > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { > + hash = hash_finish(hash, recirc_depth); > + } > dp_packet_set_rss_hash(packet, hash); > } > > @@ -6182,24 +6191,23 @@ > dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, > + const struct miniflow *mf, > + bool md_is_valid) > { > - uint32_t hash, recirc_depth; > + uint32_t hash,recirc_depth; > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > - dp_packet_set_rss_hash(packet, hash); > - } > + hash = miniflow_hash_5tuple(mf, 0); > + dp_packet_set_rss_hash(packet, hash); > > - /* The RSS hash must account for the recirculation depth to avoid > - * collisions in the exact match cache */ > - recirc_depth = *recirc_depth_get_unsafe(); > - if (OVS_UNLIKELY(recirc_depth)) { > - hash = hash_finish(hash, recirc_depth); > - dp_packet_set_rss_hash(packet, hash); > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { > + hash = hash_finish(hash, recirc_depth); > + dp_packet_set_rss_hash(packet, hash); > + } > } > return hash; > } > @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > bool smc_enable_db; > size_t map_cnt = 0; > bool batch_enable = true; > + bool is_5tuple_hash_needed; > > atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); > pmd_perf_update_counter(&pmd->perf_stats, > @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > } > } > > - miniflow_extract(packet, &key->mf); > - key->len = 0; /* Not computed yet. */ > /* If EMC and SMC disabled skip hash computation */ > if (smc_enable_db == true || cur_min != 0) { > - if (!md_is_valid) { > - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, > - &key->mf); > + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > + is_5tuple_hash_needed = false; > + key->hash = > + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); > + if (cur_min) { > + ovs_prefetch_range( > + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], > + DEFAULT_EMC_PREFETCH_SIZE); > + } > } else { > - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); > + is_5tuple_hash_needed = true; > } > + } else { > + is_5tuple_hash_needed = false; > + } > + > + miniflow_extract(packet, &key->mf); > + key->len = 0; /* Not computed yet. */ > + > + /* If 5tuple hash is needed */ > + if (is_5tuple_hash_needed) { > + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, > + &key->mf, > + > + md_is_valid); > } > if (cur_min) { > flow = emc_lookup(&cache->emc_cache, key); > -- > 2.7.4 > > >
Hi. I made few tests on PVP with bonded PHY setup and found no significant difference in maximum performance with low and medium number of flows (8 - 8192). In case of big number of flows (64K - 256K) I see performance drop in about 2-3%. I think that because of prefetching of a second cacheline, which is unneeded because current_entry->key.hash likely != key->hash and we don't need to compare miniflows while emc_lookup. Changing the code to prefetch the first cacheline only decreases the drop to ~1% (However, this increases the consumed processing cycles for medium numbers of flows described below). OTOH, I see a slight decrease (~1%) of consumed cycles per packet for the thread that polls HW NICs and send packets to VM, which is good. This improvement observed for a medium small number of flows: 512 - 8192. For a low (1 - 256) and high (64K - 256K) numbers of flows value of consumed processing cycles per packet fro this thread was not affected by the patch. Tests made with average 512B packets, EMC enabled, SMC disabled, TX flushing interval 50us. Note that the bottleneck in this case is the VM --> bonded PHY part which is the case of 5tuple hash calculation. See review comments inline. Best regards, Ilya Maximets. On 22.03.2019 11:44, Yanqin Wei (Arm Technology China) wrote: > Hi , OVS Maintainers, > > Could you help to have a look at this patch? Thanks a lot. > > Best Regards, > Wei Yanqin > > -----Original Message----- > From: Yanqin Wei <Yanqin.Wei@arm.com> > Sent: Wednesday, March 13, 2019 1:28 PM > To: dev@openvswitch.org > Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com> > Subject: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. > > It is observed that the throughput of multi-flow is worse than single-flow in the EMC NIC to NIC cases. It is because CPU cache-miss increasing in EMC lookup. Each flow need load at least one EMC entry to CPU cache(several cache lines) and compare it with packet miniflow. > This patch improve it by prefetching EMC entry in advance. Hash value can be obtained from dpdk rss hash, so this step can be advanced ahead of > miniflow_extract() and prefetch EMC entry there. The prefetching size is defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority traffic including TCP/UDP protocol and need 2 cache lines in most modern CPU. > Performance test was run in some arm platform. 1000/10000 flows NIC2NIC test achieved around 10% throughput improvement in thunderX2(aarch64 platform). > > Signed-off-by: Yanqin Wei <Yanqin.Wei@arm.com> > Reviewed-by: Gavin Hu <Gavin.Hu@arm.com> > --- > lib/dpif-netdev.c | 80 ++++++++++++++++++++++++++++++++++++------------------- > 1 file changed, 52 insertions(+), 28 deletions(-) > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4d6d0c3..982082c 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -189,6 +189,10 @@ struct netdev_flow_key { > #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ > DEFAULT_EM_FLOW_INSERT_INV_PROB) > > +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including > +TCP/UDP > + * protocol. */ > +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) Space after the ',' needed. > + > struct emc_entry { > struct dp_netdev_flow *flow; > struct netdev_flow_key key; /* key.hash used for emc hash value. */ > @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, > + bool md_is_valid) 1. 'packet' word appears twice in the function name. How about 'dpif_netdev_packet_get_rss_hash' and 'dpif_netdev_packet_get_5tuple_hash' for the second function? 2. Please align the second line to the level of next char after '('. 3. 'md_is_valid' is meaningless name inside the function. What about renaming to 'bool account_recirc_id'? > { > - uint32_t hash; > + uint32_t hash,recirc_depth; missing space. > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > + hash = dp_packet_get_rss_hash(packet); > + > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { We're in recirculation if md is valid. So, 'OVS_UNLIKELY' is wrong here. I think that we don't need to check the value at all. 'OVS_UNLIKELY' should be moved to the upper 'if'. > + hash = hash_finish(hash, recirc_depth); > + } > dp_packet_set_rss_hash(packet, hash); This could be moved inside the 'if', because we need to update rss hash only if it was changed. > } > > @@ -6182,24 +6191,23 @@ dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, > + const struct miniflow *mf, > + bool md_is_valid) Alignments. > { > - uint32_t hash, recirc_depth; > + uint32_t hash,recirc_depth; Space. > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > - dp_packet_set_rss_hash(packet, hash); > - } > + hash = miniflow_hash_5tuple(mf, 0); > + dp_packet_set_rss_hash(packet, hash); > > - /* The RSS hash must account for the recirculation depth to avoid > - * collisions in the exact match cache */ > - recirc_depth = *recirc_depth_get_unsafe(); > - if (OVS_UNLIKELY(recirc_depth)) { > - hash = hash_finish(hash, recirc_depth); > - dp_packet_set_rss_hash(packet, hash); > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { Same as previous. > + hash = hash_finish(hash, recirc_depth); > + dp_packet_set_rss_hash(packet, hash); This function sets the rss to dp_packet twice. You could avoid that by moving the 'dp_packet_set_rss_hash' call to the end of this function. > + } > } > return hash; > } > @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > bool smc_enable_db; > size_t map_cnt = 0; > bool batch_enable = true; > + bool is_5tuple_hash_needed; > > atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); > pmd_perf_update_counter(&pmd->perf_stats, > @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > } > } > > - miniflow_extract(packet, &key->mf); > - key->len = 0; /* Not computed yet. */ > /* If EMC and SMC disabled skip hash computation */ > if (smc_enable_db == true || cur_min != 0) { > - if (!md_is_valid) { > - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, > - &key->mf); > + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { I'd not put the 'LIKELY' macro here, because all the packets from virtual ports has no RSS and this is the common case. > + is_5tuple_hash_needed = false; > + key->hash = > + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); Space. > + if (cur_min) { > + ovs_prefetch_range( > + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], > + DEFAULT_EMC_PREFETCH_SIZE); > + } > } else { > - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); > + is_5tuple_hash_needed = true; > } > + } else { > + is_5tuple_hash_needed = false; > + } > + > + miniflow_extract(packet, &key->mf); > + key->len = 0; /* Not computed yet. */ > + > + /* If 5tuple hash is needed */ This comment is not that useful. IMHO, the variable is self-documenting. > + if (is_5tuple_hash_needed) { > + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, &key->mf, > + > + md_is_valid); > } > if (cur_min) { > flow = emc_lookup(&cache->emc_cache, key); > -- > 2.7.4 > > >
On 29.03.2019 20:33, Ilya Maximets wrote: > Hi. > I made few tests on PVP with bonded PHY setup and found no significant > difference in maximum performance with low and medium number of flows > (8 - 8192). Correction: I see the a slight performance drop (~1%) for the cases with very low number of flows (1-8), and a performance increase (~ 2-3%) for the medium low number of flows (64 - 4096). In this scenario packets from VM has already calculated hash on recirculation, so the prefetching optimization doesn't work for the first pass through the datapath, but works for the second. Degradation on single or very low number of flows probably caused by additional checks and instructions for prefetching the memory that is in cache already. > In case of big number of flows (64K - 256K) I see performance > drop in about 2-3%. I think that because of prefetching of a second cacheline, > which is unneeded because current_entry->key.hash likely != key->hash > and we don't need to compare miniflows while emc_lookup. Changing the code > to prefetch the first cacheline only decreases the drop to ~1% (However, > this increases the consumed processing cycles for medium numbers of flows > described below) In general, this patch decreases the performance for cases where EMC is not efficient and improves it for cases where EMC is efficient, except the cases with very low numbers of flows, which could be the main concern. > > OTOH, I see a slight decrease (~1%) of consumed cycles per packet for the > thread that polls HW NICs and send packets to VM, which is good. > This improvement observed for a medium small number of flows: 512 - 8192. > For a low (1 - 256) and high (64K - 256K) numbers of flows value of consumed > processing cycles per packet fro this thread was not affected by the patch. > > Tests made with average 512B packets, EMC enabled, SMC disabled, TX flushing > interval 50us. Note that the bottleneck in this case is the VM --> bonded PHY > part which is the case of 5tuple hash calculation. > > See review comments inline. > > Best regards, Ilya Maximets. > > On 22.03.2019 11:44, Yanqin Wei (Arm Technology China) wrote: >> Hi , OVS Maintainers, >> >> Could you help to have a look at this patch? Thanks a lot. >> >> Best Regards, >> Wei Yanqin >> >> -----Original Message----- >> From: Yanqin Wei <Yanqin.Wei@arm.com> >> Sent: Wednesday, March 13, 2019 1:28 PM >> To: dev@openvswitch.org >> Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com> >> Subject: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. >> >> It is observed that the throughput of multi-flow is worse than single-flow in the EMC NIC to NIC cases. It is because CPU cache-miss increasing in EMC lookup. Each flow need load at least one EMC entry to CPU cache(several cache lines) and compare it with packet miniflow. >> This patch improve it by prefetching EMC entry in advance. Hash value can be obtained from dpdk rss hash, so this step can be advanced ahead of >> miniflow_extract() and prefetch EMC entry there. The prefetching size is defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority traffic including TCP/UDP protocol and need 2 cache lines in most modern CPU. >> Performance test was run in some arm platform. 1000/10000 flows NIC2NIC test achieved around 10% throughput improvement in thunderX2(aarch64 platform). >> >> Signed-off-by: Yanqin Wei <Yanqin.Wei@arm.com> >> Reviewed-by: Gavin Hu <Gavin.Hu@arm.com> >> --- >> lib/dpif-netdev.c | 80 ++++++++++++++++++++++++++++++++++++------------------- >> 1 file changed, 52 insertions(+), 28 deletions(-) >> >> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4d6d0c3..982082c 100644 >> --- a/lib/dpif-netdev.c >> +++ b/lib/dpif-netdev.c >> @@ -189,6 +189,10 @@ struct netdev_flow_key { >> #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ >> DEFAULT_EM_FLOW_INSERT_INV_PROB) >> >> +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including >> +TCP/UDP >> + * protocol. */ >> +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) > > Space after the ',' needed. > >> + >> struct emc_entry { >> struct dp_netdev_flow *flow; >> struct netdev_flow_key key; /* key.hash used for emc hash value. */ >> @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, } >> >> static inline uint32_t >> -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, >> - const struct miniflow *mf) >> +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, >> + bool md_is_valid) > > 1. 'packet' word appears twice in the function name. > How about 'dpif_netdev_packet_get_rss_hash' and > 'dpif_netdev_packet_get_5tuple_hash' for the second function? > > 2. Please align the second line to the level of next char after '('. > > 3. 'md_is_valid' is meaningless name inside the function. > What about renaming to 'bool account_recirc_id'? > >> { >> - uint32_t hash; >> + uint32_t hash,recirc_depth; > > missing space. > >> >> - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >> - hash = dp_packet_get_rss_hash(packet); >> - } else { >> - hash = miniflow_hash_5tuple(mf, 0); >> + hash = dp_packet_get_rss_hash(packet); >> + >> + if (md_is_valid) { >> + /* The RSS hash must account for the recirculation depth to avoid >> + * collisions in the exact match cache */ >> + recirc_depth = *recirc_depth_get_unsafe(); >> + if (OVS_UNLIKELY(recirc_depth)) { > > We're in recirculation if md is valid. So, 'OVS_UNLIKELY' is wrong here. > I think that we don't need to check the value at all. > 'OVS_UNLIKELY' should be moved to the upper 'if'. > >> + hash = hash_finish(hash, recirc_depth); >> + } >> dp_packet_set_rss_hash(packet, hash); > > This could be moved inside the 'if', because we need to update rss hash > only if it was changed. > >> } >> >> @@ -6182,24 +6191,23 @@ dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, } >> >> static inline uint32_t >> -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, >> - const struct miniflow *mf) >> +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, >> + const struct miniflow *mf, >> + bool md_is_valid) > > Alignments. > >> { >> - uint32_t hash, recirc_depth; >> + uint32_t hash,recirc_depth; > > Space. > >> >> - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >> - hash = dp_packet_get_rss_hash(packet); >> - } else { >> - hash = miniflow_hash_5tuple(mf, 0); >> - dp_packet_set_rss_hash(packet, hash); >> - } >> + hash = miniflow_hash_5tuple(mf, 0); >> + dp_packet_set_rss_hash(packet, hash); >> >> - /* The RSS hash must account for the recirculation depth to avoid >> - * collisions in the exact match cache */ >> - recirc_depth = *recirc_depth_get_unsafe(); >> - if (OVS_UNLIKELY(recirc_depth)) { >> - hash = hash_finish(hash, recirc_depth); >> - dp_packet_set_rss_hash(packet, hash); >> + if (md_is_valid) { >> + /* The RSS hash must account for the recirculation depth to avoid >> + * collisions in the exact match cache */ >> + recirc_depth = *recirc_depth_get_unsafe(); >> + if (OVS_UNLIKELY(recirc_depth)) { > > Same as previous. > >> + hash = hash_finish(hash, recirc_depth); >> + dp_packet_set_rss_hash(packet, hash); > > This function sets the rss to dp_packet twice. You could avoid that > by moving the 'dp_packet_set_rss_hash' call to the end of this function. > >> + } >> } >> return hash; >> } >> @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, >> bool smc_enable_db; >> size_t map_cnt = 0; >> bool batch_enable = true; >> + bool is_5tuple_hash_needed; >> >> atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); >> pmd_perf_update_counter(&pmd->perf_stats, >> @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, >> } >> } >> >> - miniflow_extract(packet, &key->mf); >> - key->len = 0; /* Not computed yet. */ >> /* If EMC and SMC disabled skip hash computation */ >> if (smc_enable_db == true || cur_min != 0) { >> - if (!md_is_valid) { >> - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, >> - &key->mf); >> + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > > I'd not put the 'LIKELY' macro here, because all the packets from virtual > ports has no RSS and this is the common case. > >> + is_5tuple_hash_needed = false; >> + key->hash = >> + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); > > Space. > >> + if (cur_min) { >> + ovs_prefetch_range( >> + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], >> + DEFAULT_EMC_PREFETCH_SIZE); >> + } >> } else { >> - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); >> + is_5tuple_hash_needed = true; >> } >> + } else { >> + is_5tuple_hash_needed = false; >> + } >> + >> + miniflow_extract(packet, &key->mf); >> + key->len = 0; /* Not computed yet. */ >> + >> + /* If 5tuple hash is needed */ > > This comment is not that useful. IMHO, the variable is self-documenting. > >> + if (is_5tuple_hash_needed) { >> + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, &key->mf, >> + >> + md_is_valid); >> } >> if (cur_min) { >> flow = emc_lookup(&cache->emc_cache, key); >> -- >> 2.7.4 >> >> >> > >
Hi Ilya, Really appreciate your time in doing benchmarking for this patch. And many thanks for your valuable comments. It is quite possible that the second unneeded cache line prefetching causes performance drop in case of big number of flows (64K - 256K). This impact the real world use case, I will try to improve it . For 1-8 flows cases, it should be not very important because of the lack of actual deployment. Best Regards, Wei Yanqin -----Original Message----- From: Ilya Maximets <i.maximets@samsung.com> Sent: Monday, April 1, 2019 2:16 PM To: Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com>; dev@openvswitch.org; ian.stokes@intel.com Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com> Subject: Re: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. On 29.03.2019 20:33, Ilya Maximets wrote: > Hi. > I made few tests on PVP with bonded PHY setup and found no significant > difference in maximum performance with low and medium number of flows > (8 - 8192). Correction: I see the a slight performance drop (~1%) for the cases with very low number of flows (1-8), and a performance increase (~ 2-3%) for the medium low number of flows (64 - 4096). In this scenario packets from VM has already calculated hash on recirculation, so the prefetching optimization doesn't work for the first pass through the datapath, but works for the second. Degradation on single or very low number of flows probably caused by additional checks and instructions for prefetching the memory that is in cache already. > In case of big number of flows (64K - 256K) I see performance drop in > about 2-3%. I think that because of prefetching of a second cacheline, > which is unneeded because current_entry->key.hash likely != key->hash > and we don't need to compare miniflows while emc_lookup. Changing the > code to prefetch the first cacheline only decreases the drop to ~1% > (However, this increases the consumed processing cycles for medium > numbers of flows described below) In general, this patch decreases the performance for cases where EMC is not efficient and improves it for cases where EMC is efficient, except the cases with very low numbers of flows, which could be the main concern. > > OTOH, I see a slight decrease (~1%) of consumed cycles per packet for > the thread that polls HW NICs and send packets to VM, which is good. > This improvement observed for a medium small number of flows: 512 - 8192. > For a low (1 - 256) and high (64K - 256K) numbers of flows value of > consumed processing cycles per packet fro this thread was not affected by the patch. > > Tests made with average 512B packets, EMC enabled, SMC disabled, TX > flushing interval 50us. Note that the bottleneck in this case is the > VM --> bonded PHY part which is the case of 5tuple hash calculation. > > See review comments inline. > > Best regards, Ilya Maximets. > > On 22.03.2019 11:44, Yanqin Wei (Arm Technology China) wrote: >> Hi , OVS Maintainers, >> >> Could you help to have a look at this patch? Thanks a lot. >> >> Best Regards, >> Wei Yanqin >> >> -----Original Message----- >> From: Yanqin Wei <Yanqin.Wei@arm.com> >> Sent: Wednesday, March 13, 2019 1:28 PM >> To: dev@openvswitch.org >> Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) >> <Gavin.Hu@arm.com>; Yanqin Wei (Arm Technology China) >> <Yanqin.Wei@arm.com> >> Subject: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. >> >> It is observed that the throughput of multi-flow is worse than single-flow in the EMC NIC to NIC cases. It is because CPU cache-miss increasing in EMC lookup. Each flow need load at least one EMC entry to CPU cache(several cache lines) and compare it with packet miniflow. >> This patch improve it by prefetching EMC entry in advance. Hash value >> can be obtained from dpdk rss hash, so this step can be advanced >> ahead of >> miniflow_extract() and prefetch EMC entry there. The prefetching size is defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority traffic including TCP/UDP protocol and need 2 cache lines in most modern CPU. >> Performance test was run in some arm platform. 1000/10000 flows NIC2NIC test achieved around 10% throughput improvement in thunderX2(aarch64 platform). >> >> Signed-off-by: Yanqin Wei <Yanqin.Wei@arm.com> >> Reviewed-by: Gavin Hu <Gavin.Hu@arm.com> >> --- >> lib/dpif-netdev.c | 80 >> ++++++++++++++++++++++++++++++++++++------------------- >> 1 file changed, 52 insertions(+), 28 deletions(-) >> >> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index >> 4d6d0c3..982082c 100644 >> --- a/lib/dpif-netdev.c >> +++ b/lib/dpif-netdev.c >> @@ -189,6 +189,10 @@ struct netdev_flow_key { >> #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ >> DEFAULT_EM_FLOW_INSERT_INV_PROB) >> >> +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including >> +TCP/UDP >> + * protocol. */ >> +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) > > Space after the ',' needed. > >> + >> struct emc_entry { >> struct dp_netdev_flow *flow; >> struct netdev_flow_key key; /* key.hash used for emc hash value. */ >> @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread >> *pmd, struct dp_packet *packet_, } >> >> static inline uint32_t >> -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, >> - const struct miniflow *mf) >> +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, >> + bool md_is_valid) > > 1. 'packet' word appears twice in the function name. > How about 'dpif_netdev_packet_get_rss_hash' and > 'dpif_netdev_packet_get_5tuple_hash' for the second function? > > 2. Please align the second line to the level of next char after '('. > > 3. 'md_is_valid' is meaningless name inside the function. > What about renaming to 'bool account_recirc_id'? > >> { >> - uint32_t hash; >> + uint32_t hash,recirc_depth; > > missing space. > >> >> - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >> - hash = dp_packet_get_rss_hash(packet); >> - } else { >> - hash = miniflow_hash_5tuple(mf, 0); >> + hash = dp_packet_get_rss_hash(packet); >> + >> + if (md_is_valid) { >> + /* The RSS hash must account for the recirculation depth to avoid >> + * collisions in the exact match cache */ >> + recirc_depth = *recirc_depth_get_unsafe(); >> + if (OVS_UNLIKELY(recirc_depth)) { > > We're in recirculation if md is valid. So, 'OVS_UNLIKELY' is wrong here. > I think that we don't need to check the value at all. > 'OVS_UNLIKELY' should be moved to the upper 'if'. > >> + hash = hash_finish(hash, recirc_depth); >> + } >> dp_packet_set_rss_hash(packet, hash); > > This could be moved inside the 'if', because we need to update rss > hash only if it was changed. > >> } >> >> @@ -6182,24 +6191,23 @@ >> dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, } >> >> static inline uint32_t >> -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, >> - const struct miniflow *mf) >> +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, >> + const struct miniflow *mf, >> + bool md_is_valid) > > Alignments. > >> { >> - uint32_t hash, recirc_depth; >> + uint32_t hash,recirc_depth; > > Space. > >> >> - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >> - hash = dp_packet_get_rss_hash(packet); >> - } else { >> - hash = miniflow_hash_5tuple(mf, 0); >> - dp_packet_set_rss_hash(packet, hash); >> - } >> + hash = miniflow_hash_5tuple(mf, 0); >> + dp_packet_set_rss_hash(packet, hash); >> >> - /* The RSS hash must account for the recirculation depth to avoid >> - * collisions in the exact match cache */ >> - recirc_depth = *recirc_depth_get_unsafe(); >> - if (OVS_UNLIKELY(recirc_depth)) { >> - hash = hash_finish(hash, recirc_depth); >> - dp_packet_set_rss_hash(packet, hash); >> + if (md_is_valid) { >> + /* The RSS hash must account for the recirculation depth to avoid >> + * collisions in the exact match cache */ >> + recirc_depth = *recirc_depth_get_unsafe(); >> + if (OVS_UNLIKELY(recirc_depth)) { > > Same as previous. > >> + hash = hash_finish(hash, recirc_depth); >> + dp_packet_set_rss_hash(packet, hash); > > This function sets the rss to dp_packet twice. You could avoid that by > moving the 'dp_packet_set_rss_hash' call to the end of this function. > >> + } >> } >> return hash; >> } >> @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, >> bool smc_enable_db; >> size_t map_cnt = 0; >> bool batch_enable = true; >> + bool is_5tuple_hash_needed; >> >> atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); >> pmd_perf_update_counter(&pmd->perf_stats, >> @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, >> } >> } >> >> - miniflow_extract(packet, &key->mf); >> - key->len = 0; /* Not computed yet. */ >> /* If EMC and SMC disabled skip hash computation */ >> if (smc_enable_db == true || cur_min != 0) { >> - if (!md_is_valid) { >> - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, >> - &key->mf); >> + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > > I'd not put the 'LIKELY' macro here, because all the packets from > virtual ports has no RSS and this is the common case. > >> + is_5tuple_hash_needed = false; >> + key->hash = >> + >> + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); > > Space. > >> + if (cur_min) { >> + ovs_prefetch_range( >> + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], >> + DEFAULT_EMC_PREFETCH_SIZE); >> + } >> } else { >> - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); >> + is_5tuple_hash_needed = true; >> } >> + } else { >> + is_5tuple_hash_needed = false; >> + } >> + >> + miniflow_extract(packet, &key->mf); >> + key->len = 0; /* Not computed yet. */ >> + >> + /* If 5tuple hash is needed */ > > This comment is not that useful. IMHO, the variable is self-documenting. > >> + if (is_5tuple_hash_needed) { >> + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, >> + &key->mf, >> + >> + md_is_valid); >> } >> if (cur_min) { >> flow = emc_lookup(&cache->emc_cache, key); >> -- >> 2.7.4 >> >> >> > >
On 01.04.2019 9:52, Yanqin Wei (Arm Technology China) wrote: > Hi Ilya, > > Really appreciate your time in doing benchmarking for this patch. And many thanks for your valuable comments. > It is quite possible that the second unneeded cache line prefetching causes performance drop in case of big number of flows (64K - 256K). > This impact the real world use case, I will try to improve it . For 1-8 flows cases, it should be not very important because of the lack of actual deployment. I'm not actually sure which of these cases is more real. We're suggesting to disable EMC for cases where it's not effective. From the other side, if VM encapsulates all the incoming traffic and sends it for further processing, there will be only few outcoming flows. > > Best Regards, > Wei Yanqin > > -----Original Message----- > From: Ilya Maximets <i.maximets@samsung.com> > Sent: Monday, April 1, 2019 2:16 PM > To: Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com>; dev@openvswitch.org; ian.stokes@intel.com > Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com> > Subject: Re: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. > > On 29.03.2019 20:33, Ilya Maximets wrote: >> Hi. >> I made few tests on PVP with bonded PHY setup and found no significant >> difference in maximum performance with low and medium number of flows >> (8 - 8192). > > Correction: I see the a slight performance drop (~1%) for the cases with very low number of flows (1-8), and a performance increase (~ 2-3%) for the medium low number of flows (64 - 4096). > In this scenario packets from VM has already calculated hash on recirculation, so the prefetching optimization doesn't work for the first pass through the datapath, but works for the second. > Degradation on single or very low number of flows probably caused by additional checks and instructions for prefetching the memory that is in cache already. > >> In case of big number of flows (64K - 256K) I see performance drop in >> about 2-3%. I think that because of prefetching of a second cacheline, >> which is unneeded because current_entry->key.hash likely != key->hash >> and we don't need to compare miniflows while emc_lookup. Changing the >> code to prefetch the first cacheline only decreases the drop to ~1% >> (However, this increases the consumed processing cycles for medium >> numbers of flows described below) > > In general, this patch decreases the performance for cases where EMC is not efficient and improves it for cases where EMC is efficient, except the cases with very low numbers of flows, which could be the main concern. > >> >> OTOH, I see a slight decrease (~1%) of consumed cycles per packet for >> the thread that polls HW NICs and send packets to VM, which is good. >> This improvement observed for a medium small number of flows: 512 - 8192. >> For a low (1 - 256) and high (64K - 256K) numbers of flows value of >> consumed processing cycles per packet fro this thread was not affected by the patch. >> >> Tests made with average 512B packets, EMC enabled, SMC disabled, TX >> flushing interval 50us. Note that the bottleneck in this case is the >> VM --> bonded PHY part which is the case of 5tuple hash calculation. >> >> See review comments inline. >> >> Best regards, Ilya Maximets. >> >> On 22.03.2019 11:44, Yanqin Wei (Arm Technology China) wrote: >>> Hi , OVS Maintainers, >>> >>> Could you help to have a look at this patch? Thanks a lot. >>> >>> Best Regards, >>> Wei Yanqin >>> >>> -----Original Message----- >>> From: Yanqin Wei <Yanqin.Wei@arm.com> >>> Sent: Wednesday, March 13, 2019 1:28 PM >>> To: dev@openvswitch.org >>> Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) >>> <Gavin.Hu@arm.com>; Yanqin Wei (Arm Technology China) >>> <Yanqin.Wei@arm.com> >>> Subject: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. >>> >>> It is observed that the throughput of multi-flow is worse than single-flow in the EMC NIC to NIC cases. It is because CPU cache-miss increasing in EMC lookup. Each flow need load at least one EMC entry to CPU cache(several cache lines) and compare it with packet miniflow. >>> This patch improve it by prefetching EMC entry in advance. Hash value >>> can be obtained from dpdk rss hash, so this step can be advanced >>> ahead of >>> miniflow_extract() and prefetch EMC entry there. The prefetching size is defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority traffic including TCP/UDP protocol and need 2 cache lines in most modern CPU. >>> Performance test was run in some arm platform. 1000/10000 flows NIC2NIC test achieved around 10% throughput improvement in thunderX2(aarch64 platform). >>> >>> Signed-off-by: Yanqin Wei <Yanqin.Wei@arm.com> >>> Reviewed-by: Gavin Hu <Gavin.Hu@arm.com> >>> --- >>> lib/dpif-netdev.c | 80 >>> ++++++++++++++++++++++++++++++++++++------------------- >>> 1 file changed, 52 insertions(+), 28 deletions(-) >>> >>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index >>> 4d6d0c3..982082c 100644 >>> --- a/lib/dpif-netdev.c >>> +++ b/lib/dpif-netdev.c >>> @@ -189,6 +189,10 @@ struct netdev_flow_key { >>> #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ >>> DEFAULT_EM_FLOW_INSERT_INV_PROB) >>> >>> +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including >>> +TCP/UDP >>> + * protocol. */ >>> +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) >> >> Space after the ',' needed. >> >>> + >>> struct emc_entry { >>> struct dp_netdev_flow *flow; >>> struct netdev_flow_key key; /* key.hash used for emc hash value. */ >>> @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread >>> *pmd, struct dp_packet *packet_, } >>> >>> static inline uint32_t >>> -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, >>> - const struct miniflow *mf) >>> +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, >>> + bool md_is_valid) >> >> 1. 'packet' word appears twice in the function name. >> How about 'dpif_netdev_packet_get_rss_hash' and >> 'dpif_netdev_packet_get_5tuple_hash' for the second function? >> >> 2. Please align the second line to the level of next char after '('. >> >> 3. 'md_is_valid' is meaningless name inside the function. >> What about renaming to 'bool account_recirc_id'? >> >>> { >>> - uint32_t hash; >>> + uint32_t hash,recirc_depth; >> >> missing space. >> >>> >>> - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >>> - hash = dp_packet_get_rss_hash(packet); >>> - } else { >>> - hash = miniflow_hash_5tuple(mf, 0); >>> + hash = dp_packet_get_rss_hash(packet); >>> + >>> + if (md_is_valid) { >>> + /* The RSS hash must account for the recirculation depth to avoid >>> + * collisions in the exact match cache */ >>> + recirc_depth = *recirc_depth_get_unsafe(); >>> + if (OVS_UNLIKELY(recirc_depth)) { >> >> We're in recirculation if md is valid. So, 'OVS_UNLIKELY' is wrong here. >> I think that we don't need to check the value at all. >> 'OVS_UNLIKELY' should be moved to the upper 'if'. >> >>> + hash = hash_finish(hash, recirc_depth); >>> + } >>> dp_packet_set_rss_hash(packet, hash); >> >> This could be moved inside the 'if', because we need to update rss >> hash only if it was changed. >> >>> } >>> >>> @@ -6182,24 +6191,23 @@ >>> dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, } >>> >>> static inline uint32_t >>> -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, >>> - const struct miniflow *mf) >>> +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, >>> + const struct miniflow *mf, >>> + bool md_is_valid) >> >> Alignments. >> >>> { >>> - uint32_t hash, recirc_depth; >>> + uint32_t hash,recirc_depth; >> >> Space. >> >>> >>> - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >>> - hash = dp_packet_get_rss_hash(packet); >>> - } else { >>> - hash = miniflow_hash_5tuple(mf, 0); >>> - dp_packet_set_rss_hash(packet, hash); >>> - } >>> + hash = miniflow_hash_5tuple(mf, 0); >>> + dp_packet_set_rss_hash(packet, hash); >>> >>> - /* The RSS hash must account for the recirculation depth to avoid >>> - * collisions in the exact match cache */ >>> - recirc_depth = *recirc_depth_get_unsafe(); >>> - if (OVS_UNLIKELY(recirc_depth)) { >>> - hash = hash_finish(hash, recirc_depth); >>> - dp_packet_set_rss_hash(packet, hash); >>> + if (md_is_valid) { >>> + /* The RSS hash must account for the recirculation depth to avoid >>> + * collisions in the exact match cache */ >>> + recirc_depth = *recirc_depth_get_unsafe(); >>> + if (OVS_UNLIKELY(recirc_depth)) { >> >> Same as previous. >> >>> + hash = hash_finish(hash, recirc_depth); >>> + dp_packet_set_rss_hash(packet, hash); >> >> This function sets the rss to dp_packet twice. You could avoid that by >> moving the 'dp_packet_set_rss_hash' call to the end of this function. >> >>> + } >>> } >>> return hash; >>> } >>> @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, >>> bool smc_enable_db; >>> size_t map_cnt = 0; >>> bool batch_enable = true; >>> + bool is_5tuple_hash_needed; >>> >>> atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); >>> pmd_perf_update_counter(&pmd->perf_stats, >>> @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, >>> } >>> } >>> >>> - miniflow_extract(packet, &key->mf); >>> - key->len = 0; /* Not computed yet. */ >>> /* If EMC and SMC disabled skip hash computation */ >>> if (smc_enable_db == true || cur_min != 0) { >>> - if (!md_is_valid) { >>> - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, >>> - &key->mf); >>> + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >> >> I'd not put the 'LIKELY' macro here, because all the packets from >> virtual ports has no RSS and this is the common case. >> >>> + is_5tuple_hash_needed = false; >>> + key->hash = >>> + >>> + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); >> >> Space. >> >>> + if (cur_min) { >>> + ovs_prefetch_range( >>> + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], >>> + DEFAULT_EMC_PREFETCH_SIZE); >>> + } >>> } else { >>> - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); >>> + is_5tuple_hash_needed = true; >>> } >>> + } else { >>> + is_5tuple_hash_needed = false; >>> + } >>> + >>> + miniflow_extract(packet, &key->mf); >>> + key->len = 0; /* Not computed yet. */ >>> + >>> + /* If 5tuple hash is needed */ >> >> This comment is not that useful. IMHO, the variable is self-documenting. >> >>> + if (is_5tuple_hash_needed) { >>> + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, >>> + &key->mf, >>> + >>> + md_is_valid); >>> } >>> if (cur_min) { >>> flow = emc_lookup(&cache->emc_cache, key); >>> -- >>> 2.7.4 >>> >>> >>> >> >>
So the main concern is performance drop for 1~8 flow, because EMC is effective here and would be enable. On the other hand, EMC prefetching does not take effect when EMC is disable, and in the case of large number of flows, the EMC is likely to be disabled. So the performance drop here can be avoided by configuration. Is my understanding correct? Best Regards, Wei Yanqin -----Original Message----- From: Ilya Maximets <i.maximets@samsung.com> Sent: Monday, April 1, 2019 3:15 PM To: Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com>; dev@openvswitch.org; ian.stokes@intel.com Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com> Subject: Re: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. On 01.04.2019 9:52, Yanqin Wei (Arm Technology China) wrote: > Hi Ilya, > > Really appreciate your time in doing benchmarking for this patch. And many thanks for your valuable comments. > It is quite possible that the second unneeded cache line prefetching causes performance drop in case of big number of flows (64K - 256K). > This impact the real world use case, I will try to improve it . For 1-8 flows cases, it should be not very important because of the lack of actual deployment. I'm not actually sure which of these cases is more real. We're suggesting to disable EMC for cases where it's not effective. From the other side, if VM encapsulates all the incoming traffic and sends it for further processing, there will be only few outcoming flows. > > Best Regards, > Wei Yanqin > > -----Original Message----- > From: Ilya Maximets <i.maximets@samsung.com> > Sent: Monday, April 1, 2019 2:16 PM > To: Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com>; > dev@openvswitch.org; ian.stokes@intel.com > Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) > <Gavin.Hu@arm.com> > Subject: Re: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. > > On 29.03.2019 20:33, Ilya Maximets wrote: >> Hi. >> I made few tests on PVP with bonded PHY setup and found no >> significant difference in maximum performance with low and medium >> number of flows >> (8 - 8192). > > Correction: I see the a slight performance drop (~1%) for the cases with very low number of flows (1-8), and a performance increase (~ 2-3%) for the medium low number of flows (64 - 4096). > In this scenario packets from VM has already calculated hash on recirculation, so the prefetching optimization doesn't work for the first pass through the datapath, but works for the second. > Degradation on single or very low number of flows probably caused by additional checks and instructions for prefetching the memory that is in cache already. > >> In case of big number of flows (64K - 256K) I see performance drop in >> about 2-3%. I think that because of prefetching of a second >> cacheline, which is unneeded because current_entry->key.hash likely >> != key->hash and we don't need to compare miniflows while emc_lookup. >> Changing the code to prefetch the first cacheline only decreases the >> drop to ~1% (However, this increases the consumed processing cycles >> for medium numbers of flows described below) > > In general, this patch decreases the performance for cases where EMC is not efficient and improves it for cases where EMC is efficient, except the cases with very low numbers of flows, which could be the main concern. > >> >> OTOH, I see a slight decrease (~1%) of consumed cycles per packet for >> the thread that polls HW NICs and send packets to VM, which is good. >> This improvement observed for a medium small number of flows: 512 - 8192. >> For a low (1 - 256) and high (64K - 256K) numbers of flows value of >> consumed processing cycles per packet fro this thread was not affected by the patch. >> >> Tests made with average 512B packets, EMC enabled, SMC disabled, TX >> flushing interval 50us. Note that the bottleneck in this case is the >> VM --> bonded PHY part which is the case of 5tuple hash calculation. >> >> See review comments inline. >> >> Best regards, Ilya Maximets. >> >> On 22.03.2019 11:44, Yanqin Wei (Arm Technology China) wrote: >>> Hi , OVS Maintainers, >>> >>> Could you help to have a look at this patch? Thanks a lot. >>> >>> Best Regards, >>> Wei Yanqin >>> >>> -----Original Message----- >>> From: Yanqin Wei <Yanqin.Wei@arm.com> >>> Sent: Wednesday, March 13, 2019 1:28 PM >>> To: dev@openvswitch.org >>> Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) >>> <Gavin.Hu@arm.com>; Yanqin Wei (Arm Technology China) >>> <Yanqin.Wei@arm.com> >>> Subject: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. >>> >>> It is observed that the throughput of multi-flow is worse than single-flow in the EMC NIC to NIC cases. It is because CPU cache-miss increasing in EMC lookup. Each flow need load at least one EMC entry to CPU cache(several cache lines) and compare it with packet miniflow. >>> This patch improve it by prefetching EMC entry in advance. Hash >>> value can be obtained from dpdk rss hash, so this step can be >>> advanced ahead of >>> miniflow_extract() and prefetch EMC entry there. The prefetching size is defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority traffic including TCP/UDP protocol and need 2 cache lines in most modern CPU. >>> Performance test was run in some arm platform. 1000/10000 flows NIC2NIC test achieved around 10% throughput improvement in thunderX2(aarch64 platform). >>> >>> Signed-off-by: Yanqin Wei <Yanqin.Wei@arm.com> >>> Reviewed-by: Gavin Hu <Gavin.Hu@arm.com> >>> --- >>> lib/dpif-netdev.c | 80 >>> ++++++++++++++++++++++++++++++++++++------------------- >>> 1 file changed, 52 insertions(+), 28 deletions(-) >>> >>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index >>> 4d6d0c3..982082c 100644 >>> --- a/lib/dpif-netdev.c >>> +++ b/lib/dpif-netdev.c >>> @@ -189,6 +189,10 @@ struct netdev_flow_key { >>> #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ >>> >>> DEFAULT_EM_FLOW_INSERT_INV_PROB) >>> >>> +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including >>> +TCP/UDP >>> + * protocol. */ >>> +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) >> >> Space after the ',' needed. >> >>> + >>> struct emc_entry { >>> struct dp_netdev_flow *flow; >>> struct netdev_flow_key key; /* key.hash used for emc hash value. */ >>> @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread >>> *pmd, struct dp_packet *packet_, } >>> >>> static inline uint32_t >>> -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, >>> - const struct miniflow *mf) >>> +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, >>> + bool md_is_valid) >> >> 1. 'packet' word appears twice in the function name. >> How about 'dpif_netdev_packet_get_rss_hash' and >> 'dpif_netdev_packet_get_5tuple_hash' for the second function? >> >> 2. Please align the second line to the level of next char after '('. >> >> 3. 'md_is_valid' is meaningless name inside the function. >> What about renaming to 'bool account_recirc_id'? >> >>> { >>> - uint32_t hash; >>> + uint32_t hash,recirc_depth; >> >> missing space. >> >>> >>> - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >>> - hash = dp_packet_get_rss_hash(packet); >>> - } else { >>> - hash = miniflow_hash_5tuple(mf, 0); >>> + hash = dp_packet_get_rss_hash(packet); >>> + >>> + if (md_is_valid) { >>> + /* The RSS hash must account for the recirculation depth to avoid >>> + * collisions in the exact match cache */ >>> + recirc_depth = *recirc_depth_get_unsafe(); >>> + if (OVS_UNLIKELY(recirc_depth)) { >> >> We're in recirculation if md is valid. So, 'OVS_UNLIKELY' is wrong here. >> I think that we don't need to check the value at all. >> 'OVS_UNLIKELY' should be moved to the upper 'if'. >> >>> + hash = hash_finish(hash, recirc_depth); >>> + } >>> dp_packet_set_rss_hash(packet, hash); >> >> This could be moved inside the 'if', because we need to update rss >> hash only if it was changed. >> >>> } >>> >>> @@ -6182,24 +6191,23 @@ >>> dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, >>> } >>> >>> static inline uint32_t >>> -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, >>> - const struct miniflow *mf) >>> +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, >>> + const struct miniflow *mf, >>> + bool md_is_valid) >> >> Alignments. >> >>> { >>> - uint32_t hash, recirc_depth; >>> + uint32_t hash,recirc_depth; >> >> Space. >> >>> >>> - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >>> - hash = dp_packet_get_rss_hash(packet); >>> - } else { >>> - hash = miniflow_hash_5tuple(mf, 0); >>> - dp_packet_set_rss_hash(packet, hash); >>> - } >>> + hash = miniflow_hash_5tuple(mf, 0); >>> + dp_packet_set_rss_hash(packet, hash); >>> >>> - /* The RSS hash must account for the recirculation depth to avoid >>> - * collisions in the exact match cache */ >>> - recirc_depth = *recirc_depth_get_unsafe(); >>> - if (OVS_UNLIKELY(recirc_depth)) { >>> - hash = hash_finish(hash, recirc_depth); >>> - dp_packet_set_rss_hash(packet, hash); >>> + if (md_is_valid) { >>> + /* The RSS hash must account for the recirculation depth to avoid >>> + * collisions in the exact match cache */ >>> + recirc_depth = *recirc_depth_get_unsafe(); >>> + if (OVS_UNLIKELY(recirc_depth)) { >> >> Same as previous. >> >>> + hash = hash_finish(hash, recirc_depth); >>> + dp_packet_set_rss_hash(packet, hash); >> >> This function sets the rss to dp_packet twice. You could avoid that >> by moving the 'dp_packet_set_rss_hash' call to the end of this function. >> >>> + } >>> } >>> return hash; >>> } >>> @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, >>> bool smc_enable_db; >>> size_t map_cnt = 0; >>> bool batch_enable = true; >>> + bool is_5tuple_hash_needed; >>> >>> atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); >>> pmd_perf_update_counter(&pmd->perf_stats, >>> @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, >>> } >>> } >>> >>> - miniflow_extract(packet, &key->mf); >>> - key->len = 0; /* Not computed yet. */ >>> /* If EMC and SMC disabled skip hash computation */ >>> if (smc_enable_db == true || cur_min != 0) { >>> - if (!md_is_valid) { >>> - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, >>> - &key->mf); >>> + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >> >> I'd not put the 'LIKELY' macro here, because all the packets from >> virtual ports has no RSS and this is the common case. >> >>> + is_5tuple_hash_needed = false; >>> + key->hash = >>> + >>> + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); >> >> Space. >> >>> + if (cur_min) { >>> + ovs_prefetch_range( >>> + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], >>> + DEFAULT_EMC_PREFETCH_SIZE); >>> + } >>> } else { >>> - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); >>> + is_5tuple_hash_needed = true; >>> } >>> + } else { >>> + is_5tuple_hash_needed = false; >>> + } >>> + >>> + miniflow_extract(packet, &key->mf); >>> + key->len = 0; /* Not computed yet. */ >>> + >>> + /* If 5tuple hash is needed */ >> >> This comment is not that useful. IMHO, the variable is self-documenting. >> >>> + if (is_5tuple_hash_needed) { >>> + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, >>> + &key->mf, >>> + >>> + md_is_valid); >>> } >>> if (cur_min) { >>> flow = emc_lookup(&cache->emc_cache, key); >>> -- >>> 2.7.4 >>> >>> >>> >> >>
On 01.04.2019 10:35, Yanqin Wei (Arm Technology China) wrote: > So the main concern is performance drop for 1~8 flow, because EMC is effective here and would be enable. > On the other hand, EMC prefetching does not take effect when EMC is disable, and in the case of large number of flows, the EMC is likely to be disabled. So the performance drop here can be avoided by configuration. > Is my understanding correct? I think so. Ian, what is your opinion here? > > Best Regards, > Wei Yanqin > > -----Original Message----- > From: Ilya Maximets <i.maximets@samsung.com> > Sent: Monday, April 1, 2019 3:15 PM > To: Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com>; dev@openvswitch.org; ian.stokes@intel.com > Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com> > Subject: Re: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. > > On 01.04.2019 9:52, Yanqin Wei (Arm Technology China) wrote: >> Hi Ilya, >> >> Really appreciate your time in doing benchmarking for this patch. And many thanks for your valuable comments. >> It is quite possible that the second unneeded cache line prefetching causes performance drop in case of big number of flows (64K - 256K). >> This impact the real world use case, I will try to improve it . For 1-8 flows cases, it should be not very important because of the lack of actual deployment. > > I'm not actually sure which of these cases is more real. > We're suggesting to disable EMC for cases where it's not effective. > From the other side, if VM encapsulates all the incoming traffic and sends it for further processing, there will be only few outcoming flows. > >> >> Best Regards, >> Wei Yanqin >> >> -----Original Message----- >> From: Ilya Maximets <i.maximets@samsung.com> >> Sent: Monday, April 1, 2019 2:16 PM >> To: Yanqin Wei (Arm Technology China) <Yanqin.Wei@arm.com>; >> dev@openvswitch.org; ian.stokes@intel.com >> Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) >> <Gavin.Hu@arm.com> >> Subject: Re: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. >> >> On 29.03.2019 20:33, Ilya Maximets wrote: >>> Hi. >>> I made few tests on PVP with bonded PHY setup and found no >>> significant difference in maximum performance with low and medium >>> number of flows >>> (8 - 8192). >> >> Correction: I see the a slight performance drop (~1%) for the cases with very low number of flows (1-8), and a performance increase (~ 2-3%) for the medium low number of flows (64 - 4096). >> In this scenario packets from VM has already calculated hash on recirculation, so the prefetching optimization doesn't work for the first pass through the datapath, but works for the second. >> Degradation on single or very low number of flows probably caused by additional checks and instructions for prefetching the memory that is in cache already. >> >>> In case of big number of flows (64K - 256K) I see performance drop in >>> about 2-3%. I think that because of prefetching of a second >>> cacheline, which is unneeded because current_entry->key.hash likely >>> != key->hash and we don't need to compare miniflows while emc_lookup. >>> Changing the code to prefetch the first cacheline only decreases the >>> drop to ~1% (However, this increases the consumed processing cycles >>> for medium numbers of flows described below) >> >> In general, this patch decreases the performance for cases where EMC is not efficient and improves it for cases where EMC is efficient, except the cases with very low numbers of flows, which could be the main concern. >> >>> >>> OTOH, I see a slight decrease (~1%) of consumed cycles per packet for >>> the thread that polls HW NICs and send packets to VM, which is good. >>> This improvement observed for a medium small number of flows: 512 - 8192. >>> For a low (1 - 256) and high (64K - 256K) numbers of flows value of >>> consumed processing cycles per packet fro this thread was not affected by the patch. >>> >>> Tests made with average 512B packets, EMC enabled, SMC disabled, TX >>> flushing interval 50us. Note that the bottleneck in this case is the >>> VM --> bonded PHY part which is the case of 5tuple hash calculation. >>> >>> See review comments inline. >>> >>> Best regards, Ilya Maximets. >>> >>> On 22.03.2019 11:44, Yanqin Wei (Arm Technology China) wrote: >>>> Hi , OVS Maintainers, >>>> >>>> Could you help to have a look at this patch? Thanks a lot. >>>> >>>> Best Regards, >>>> Wei Yanqin >>>> >>>> -----Original Message----- >>>> From: Yanqin Wei <Yanqin.Wei@arm.com> >>>> Sent: Wednesday, March 13, 2019 1:28 PM >>>> To: dev@openvswitch.org >>>> Cc: nd <nd@arm.com>; Gavin Hu (Arm Technology China) >>>> <Gavin.Hu@arm.com>; Yanqin Wei (Arm Technology China) >>>> <Yanqin.Wei@arm.com> >>>> Subject: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. >>>> >>>> It is observed that the throughput of multi-flow is worse than single-flow in the EMC NIC to NIC cases. It is because CPU cache-miss increasing in EMC lookup. Each flow need load at least one EMC entry to CPU cache(several cache lines) and compare it with packet miniflow. >>>> This patch improve it by prefetching EMC entry in advance. Hash >>>> value can be obtained from dpdk rss hash, so this step can be >>>> advanced ahead of >>>> miniflow_extract() and prefetch EMC entry there. The prefetching size is defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority traffic including TCP/UDP protocol and need 2 cache lines in most modern CPU. >>>> Performance test was run in some arm platform. 1000/10000 flows NIC2NIC test achieved around 10% throughput improvement in thunderX2(aarch64 platform). >>>> >>>> Signed-off-by: Yanqin Wei <Yanqin.Wei@arm.com> >>>> Reviewed-by: Gavin Hu <Gavin.Hu@arm.com> >>>> --- >>>> lib/dpif-netdev.c | 80 >>>> ++++++++++++++++++++++++++++++++++++------------------- >>>> 1 file changed, 52 insertions(+), 28 deletions(-) >>>> >>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index >>>> 4d6d0c3..982082c 100644 >>>> --- a/lib/dpif-netdev.c >>>> +++ b/lib/dpif-netdev.c >>>> @@ -189,6 +189,10 @@ struct netdev_flow_key { >>>> #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ >>>> >>>> DEFAULT_EM_FLOW_INSERT_INV_PROB) >>>> >>>> +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including >>>> +TCP/UDP >>>> + * protocol. */ >>>> +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) >>> >>> Space after the ',' needed. >>> >>>> + >>>> struct emc_entry { >>>> struct dp_netdev_flow *flow; >>>> struct netdev_flow_key key; /* key.hash used for emc hash value. */ >>>> @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread >>>> *pmd, struct dp_packet *packet_, } >>>> >>>> static inline uint32_t >>>> -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, >>>> - const struct miniflow *mf) >>>> +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, >>>> + bool md_is_valid) >>> >>> 1. 'packet' word appears twice in the function name. >>> How about 'dpif_netdev_packet_get_rss_hash' and >>> 'dpif_netdev_packet_get_5tuple_hash' for the second function? >>> >>> 2. Please align the second line to the level of next char after '('. >>> >>> 3. 'md_is_valid' is meaningless name inside the function. >>> What about renaming to 'bool account_recirc_id'? >>> >>>> { >>>> - uint32_t hash; >>>> + uint32_t hash,recirc_depth; >>> >>> missing space. >>> >>>> >>>> - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >>>> - hash = dp_packet_get_rss_hash(packet); >>>> - } else { >>>> - hash = miniflow_hash_5tuple(mf, 0); >>>> + hash = dp_packet_get_rss_hash(packet); >>>> + >>>> + if (md_is_valid) { >>>> + /* The RSS hash must account for the recirculation depth to avoid >>>> + * collisions in the exact match cache */ >>>> + recirc_depth = *recirc_depth_get_unsafe(); >>>> + if (OVS_UNLIKELY(recirc_depth)) { >>> >>> We're in recirculation if md is valid. So, 'OVS_UNLIKELY' is wrong here. >>> I think that we don't need to check the value at all. >>> 'OVS_UNLIKELY' should be moved to the upper 'if'. >>> >>>> + hash = hash_finish(hash, recirc_depth); >>>> + } >>>> dp_packet_set_rss_hash(packet, hash); >>> >>> This could be moved inside the 'if', because we need to update rss >>> hash only if it was changed. >>> >>>> } >>>> >>>> @@ -6182,24 +6191,23 @@ >>>> dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, >>>> } >>>> >>>> static inline uint32_t >>>> -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, >>>> - const struct miniflow *mf) >>>> +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, >>>> + const struct miniflow *mf, >>>> + bool md_is_valid) >>> >>> Alignments. >>> >>>> { >>>> - uint32_t hash, recirc_depth; >>>> + uint32_t hash,recirc_depth; >>> >>> Space. >>> >>>> >>>> - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >>>> - hash = dp_packet_get_rss_hash(packet); >>>> - } else { >>>> - hash = miniflow_hash_5tuple(mf, 0); >>>> - dp_packet_set_rss_hash(packet, hash); >>>> - } >>>> + hash = miniflow_hash_5tuple(mf, 0); >>>> + dp_packet_set_rss_hash(packet, hash); >>>> >>>> - /* The RSS hash must account for the recirculation depth to avoid >>>> - * collisions in the exact match cache */ >>>> - recirc_depth = *recirc_depth_get_unsafe(); >>>> - if (OVS_UNLIKELY(recirc_depth)) { >>>> - hash = hash_finish(hash, recirc_depth); >>>> - dp_packet_set_rss_hash(packet, hash); >>>> + if (md_is_valid) { >>>> + /* The RSS hash must account for the recirculation depth to avoid >>>> + * collisions in the exact match cache */ >>>> + recirc_depth = *recirc_depth_get_unsafe(); >>>> + if (OVS_UNLIKELY(recirc_depth)) { >>> >>> Same as previous. >>> >>>> + hash = hash_finish(hash, recirc_depth); >>>> + dp_packet_set_rss_hash(packet, hash); >>> >>> This function sets the rss to dp_packet twice. You could avoid that >>> by moving the 'dp_packet_set_rss_hash' call to the end of this function. >>> >>>> + } >>>> } >>>> return hash; >>>> } >>>> @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, >>>> bool smc_enable_db; >>>> size_t map_cnt = 0; >>>> bool batch_enable = true; >>>> + bool is_5tuple_hash_needed; >>>> >>>> atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); >>>> pmd_perf_update_counter(&pmd->perf_stats, >>>> @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, >>>> } >>>> } >>>> >>>> - miniflow_extract(packet, &key->mf); >>>> - key->len = 0; /* Not computed yet. */ >>>> /* If EMC and SMC disabled skip hash computation */ >>>> if (smc_enable_db == true || cur_min != 0) { >>>> - if (!md_is_valid) { >>>> - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, >>>> - &key->mf); >>>> + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { >>> >>> I'd not put the 'LIKELY' macro here, because all the packets from >>> virtual ports has no RSS and this is the common case. >>> >>>> + is_5tuple_hash_needed = false; >>>> + key->hash = >>>> + >>>> + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); >>> >>> Space. >>> >>>> + if (cur_min) { >>>> + ovs_prefetch_range( >>>> + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], >>>> + DEFAULT_EMC_PREFETCH_SIZE); >>>> + } >>>> } else { >>>> - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); >>>> + is_5tuple_hash_needed = true; >>>> } >>>> + } else { >>>> + is_5tuple_hash_needed = false; >>>> + } >>>> + >>>> + miniflow_extract(packet, &key->mf); >>>> + key->len = 0; /* Not computed yet. */ >>>> + >>>> + /* If 5tuple hash is needed */ >>> >>> This comment is not that useful. IMHO, the variable is self-documenting. >>> >>>> + if (is_5tuple_hash_needed) { >>>> + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, >>>> + &key->mf, >>>> + >>>> + md_is_valid); >>>> } >>>> if (cur_min) { >>>> flow = emc_lookup(&cache->emc_cache, key); >>>> -- >>>> 2.7.4 >>>> >>>> >>>> >>> >>>
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4d6d0c3..982082c 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -189,6 +189,10 @@ struct netdev_flow_key { #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ DEFAULT_EM_FLOW_INSERT_INV_PROB) +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including TCP/UDP + * protocol. */ +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) + struct emc_entry { struct dp_netdev_flow *flow; struct netdev_flow_key key; /* key.hash used for emc hash value. */ @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, } static inline uint32_t -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, - const struct miniflow *mf) +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, + bool md_is_valid) { - uint32_t hash; + uint32_t hash,recirc_depth; - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { - hash = dp_packet_get_rss_hash(packet); - } else { - hash = miniflow_hash_5tuple(mf, 0); + hash = dp_packet_get_rss_hash(packet); + + if (md_is_valid) { + /* The RSS hash must account for the recirculation depth to avoid + * collisions in the exact match cache */ + recirc_depth = *recirc_depth_get_unsafe(); + if (OVS_UNLIKELY(recirc_depth)) { + hash = hash_finish(hash, recirc_depth); + } dp_packet_set_rss_hash(packet, hash); } @@ -6182,24 +6191,23 @@ dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, } static inline uint32_t -dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, - const struct miniflow *mf) +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, + const struct miniflow *mf, + bool md_is_valid) { - uint32_t hash, recirc_depth; + uint32_t hash,recirc_depth; - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { - hash = dp_packet_get_rss_hash(packet); - } else { - hash = miniflow_hash_5tuple(mf, 0); - dp_packet_set_rss_hash(packet, hash); - } + hash = miniflow_hash_5tuple(mf, 0); + dp_packet_set_rss_hash(packet, hash); - /* The RSS hash must account for the recirculation depth to avoid - * collisions in the exact match cache */ - recirc_depth = *recirc_depth_get_unsafe(); - if (OVS_UNLIKELY(recirc_depth)) { - hash = hash_finish(hash, recirc_depth); - dp_packet_set_rss_hash(packet, hash); + if (md_is_valid) { + /* The RSS hash must account for the recirculation depth to avoid + * collisions in the exact match cache */ + recirc_depth = *recirc_depth_get_unsafe(); + if (OVS_UNLIKELY(recirc_depth)) { + hash = hash_finish(hash, recirc_depth); + dp_packet_set_rss_hash(packet, hash); + } } return hash; } @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, bool smc_enable_db; size_t map_cnt = 0; bool batch_enable = true; + bool is_5tuple_hash_needed; atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); pmd_perf_update_counter(&pmd->perf_stats, @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, } } - miniflow_extract(packet, &key->mf); - key->len = 0; /* Not computed yet. */ /* If EMC and SMC disabled skip hash computation */ if (smc_enable_db == true || cur_min != 0) { - if (!md_is_valid) { - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, - &key->mf); + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { + is_5tuple_hash_needed = false; + key->hash = + dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); + if (cur_min) { + ovs_prefetch_range( + &cache->emc_cache.entries[key->hash & EM_FLOW_HASH_MASK], + DEFAULT_EMC_PREFETCH_SIZE); + } } else { - key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf); + is_5tuple_hash_needed = true; } + } else { + is_5tuple_hash_needed = false; + } + + miniflow_extract(packet, &key->mf); + key->len = 0; /* Not computed yet. */ + + /* If 5tuple hash is needed */ + if (is_5tuple_hash_needed) { + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, &key->mf, + md_is_valid); } if (cur_min) { flow = emc_lookup(&cache->emc_cache, key);