Message ID | 20191016225028.2100206-2-jonathan.lemon@gmail.com |
---|---|
State | Changes Requested |
Delegated to: | David Miller |
Headers | show |
Series | page_pool cleanups | expand |
On Wed, 16 Oct 2019 15:50:19 -0700, Jonathan Lemon wrote: > From: Tariq Toukan <tariqt@mellanox.com> > > Obsolete the RX page-cache layer, pages are now recycled > in page_pool. > > This patch introduce a temporary degradation as recycling > does not keep the pages DMA-mapped. That is fixed in a > downstream patch. > > Issue: 1487631 Please drop these Issue identifiers. > Signed-off-by: Tariq Toukan <tariqt@mellanox.com> > And no empty lines between tags. > Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
On 16 Oct 2019, at 18:30, Jakub Kicinski wrote: > On Wed, 16 Oct 2019 15:50:19 -0700, Jonathan Lemon wrote: >> From: Tariq Toukan <tariqt@mellanox.com> >> >> Obsolete the RX page-cache layer, pages are now recycled >> in page_pool. >> >> This patch introduce a temporary degradation as recycling >> does not keep the pages DMA-mapped. That is fixed in a >> downstream patch. >> >> Issue: 1487631 > > Please drop these Issue identifiers. > >> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> >> > > And no empty lines between tags. > >> Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com> Will fix.
On Wed, 2019-10-16 at 15:50 -0700, Jonathan Lemon wrote: > From: Tariq Toukan <tariqt@mellanox.com> > > Obsolete the RX page-cache layer, pages are now recycled > in page_pool. > > This patch introduce a temporary degradation as recycling > does not keep the pages DMA-mapped. That is fixed in a > downstream patch. Tariq, i think we need to have performance numbers here to show degradation. i am sure that non XDP traffic TCP/UDP performance will be hit. > > Issue: 1487631 > Signed-off-by: Tariq Toukan <tariqt@mellanox.com> > > Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com> > --- > drivers/net/ethernet/mellanox/mlx5/core/en.h | 13 ---- > .../net/ethernet/mellanox/mlx5/core/en_main.c | 16 ----- > .../net/ethernet/mellanox/mlx5/core/en_rx.c | 67 ++--------------- > -- > 3 files changed, 4 insertions(+), 92 deletions(-) > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h > b/drivers/net/ethernet/mellanox/mlx5/core/en.h > index 8d76452cacdc..0595cdcff594 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h > @@ -583,18 +583,6 @@ struct mlx5e_mpw_info { > > #define MLX5E_MAX_RX_FRAGS 4 > > -/* a single cache unit is capable to serve one napi call (for non- > striding rq) > - * or a MPWQE (for striding rq). > - */ > -#define MLX5E_CACHE_UNIT (MLX5_MPWRQ_PAGES_PER_WQE > > NAPI_POLL_WEIGHT ? 
\ > - MLX5_MPWRQ_PAGES_PER_WQE : > NAPI_POLL_WEIGHT) > -#define MLX5E_CACHE_SIZE (4 * > roundup_pow_of_two(MLX5E_CACHE_UNIT)) > -struct mlx5e_page_cache { > - u32 head; > - u32 tail; > - struct mlx5e_dma_info page_cache[MLX5E_CACHE_SIZE]; > -}; > - > struct mlx5e_rq; > typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct > mlx5_cqe64*); > typedef struct sk_buff * > @@ -658,7 +646,6 @@ struct mlx5e_rq { > struct mlx5e_rq_stats *stats; > struct mlx5e_cq cq; > struct mlx5e_cq_decomp cqd; > - struct mlx5e_page_cache page_cache; > struct hwtstamp_config *tstamp; > struct mlx5_clock *clock; > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c > b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c > index 7569287f8f3c..168be1f800a3 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c > @@ -612,9 +612,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel > *c, > rq->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; > } > > - rq->page_cache.head = 0; > - rq->page_cache.tail = 0; > - > return 0; > > err_free: > @@ -640,8 +637,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel > *c, > > static void mlx5e_free_rq(struct mlx5e_rq *rq) > { > - int i; > - > if (rq->xdp_prog) > bpf_prog_put(rq->xdp_prog); > > @@ -655,17 +650,6 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq) > mlx5e_free_di_list(rq); > } > > - for (i = rq->page_cache.head; i != rq->page_cache.tail; > - i = (i + 1) & (MLX5E_CACHE_SIZE - 1)) { > - struct mlx5e_dma_info *dma_info = &rq- > >page_cache.page_cache[i]; > - > - /* With AF_XDP, page_cache is not used, so this loop is > not > - * entered, and it's safe to call > mlx5e_page_release_dynamic > - * directly. 
> - */ > - mlx5e_page_release_dynamic(rq, dma_info, false); > - } > - > xdp_rxq_info_unreg(&rq->xdp_rxq); > page_pool_destroy(rq->page_pool); > mlx5_wq_destroy(&rq->wq_ctrl); > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c > b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c > index d6a547238de0..a3773f8a4931 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c > @@ -184,65 +184,9 @@ static inline u32 > mlx5e_decompress_cqes_start(struct mlx5e_rq *rq, > return mlx5e_decompress_cqes_cont(rq, wq, 1, budget_rem) - 1; > } > > -static inline bool mlx5e_page_is_reserved(struct page *page) > -{ > - return page_is_pfmemalloc(page) || page_to_nid(page) != > numa_mem_id(); > -} > - > -static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq, > - struct mlx5e_dma_info *dma_info) > -{ > - struct mlx5e_page_cache *cache = &rq->page_cache; > - u32 tail_next = (cache->tail + 1) & (MLX5E_CACHE_SIZE - 1); > - struct mlx5e_rq_stats *stats = rq->stats; > - > - if (tail_next == cache->head) { > - stats->cache_full++; > - return false; > - } > - > - if (unlikely(mlx5e_page_is_reserved(dma_info->page))) { > - stats->cache_waive++; > - return false; > - } > - > - cache->page_cache[cache->tail] = *dma_info; > - cache->tail = tail_next; > - return true; > -} > - > -static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq, > - struct mlx5e_dma_info *dma_info) > -{ > - struct mlx5e_page_cache *cache = &rq->page_cache; > - struct mlx5e_rq_stats *stats = rq->stats; > - > - if (unlikely(cache->head == cache->tail)) { > - stats->cache_empty++; > - return false; > - } > - > - if (page_ref_count(cache->page_cache[cache->head].page) != 1) { > - stats->cache_busy++; > - return false; > - } > - > - *dma_info = cache->page_cache[cache->head]; > - cache->head = (cache->head + 1) & (MLX5E_CACHE_SIZE - 1); > - stats->cache_reuse++; > - > - dma_sync_single_for_device(rq->pdev, dma_info->addr, > - PAGE_SIZE, > - 
DMA_FROM_DEVICE); > - return true; > -} > - > static inline int mlx5e_page_alloc_pool(struct mlx5e_rq *rq, > struct mlx5e_dma_info > *dma_info) > { > - if (mlx5e_rx_cache_get(rq, dma_info)) > - return 0; > - > dma_info->page = page_pool_dev_alloc_pages(rq->page_pool); > if (unlikely(!dma_info->page)) > return -ENOMEM; > @@ -276,14 +220,11 @@ void mlx5e_page_release_dynamic(struct mlx5e_rq > *rq, > struct mlx5e_dma_info *dma_info, > bool recycle) > { > - if (likely(recycle)) { > - if (mlx5e_rx_cache_put(rq, dma_info)) > - return; > + mlx5e_page_dma_unmap(rq, dma_info); > > - mlx5e_page_dma_unmap(rq, dma_info); > + if (likely(recycle)) { > page_pool_recycle_direct(rq->page_pool, dma_info- > >page); > } else { > - mlx5e_page_dma_unmap(rq, dma_info); > page_pool_release_page(rq->page_pool, dma_info->page); > put_page(dma_info->page); > } > @@ -1167,7 +1108,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, > struct mlx5_cqe64 *cqe) > if (!skb) { > /* probably for XDP */ > if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq- > >flags)) { > - /* do not return page to cache, > + /* do not return page to pool, > * it will be returned on XDP_TX completion. > */ > goto wq_cyc_pop; > @@ -1210,7 +1151,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq > *rq, struct mlx5_cqe64 *cqe) > if (!skb) { > /* probably for XDP */ > if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq- > >flags)) { > - /* do not return page to cache, > + /* do not return page to pool, > * it will be returned on XDP_TX completion. > */ > goto wq_cyc_pop;
I was running the updated patches on machines with various workloads, and have a bunch of different results. For the following numbers, Effective = hit / (hit + empty + stall) * 100 In other words, show the hit rate for every trip to the cache, and the cache full stat is ignored. On a webserver: [web] # ./eff ('rx_pool_cache_hit:', '360127643') ('rx_pool_cache_full:', '0') ('rx_pool_cache_empty:', '6455735977') ('rx_pool_ring_produce:', '474958') ('rx_pool_ring_consume:', '0') ('rx_pool_ring_return:', '474958') ('rx_pool_flush:', '144') ('rx_pool_node_change:', '0') cache effectiveness: 5.28 On a proxygen: # ethtool -S eth0 | grep rx_pool rx_pool_cache_hit: 1646798 rx_pool_cache_full: 0 rx_pool_cache_empty: 15723566 rx_pool_ring_produce: 474958 rx_pool_ring_consume: 0 rx_pool_ring_return: 474958 rx_pool_flush: 144 rx_pool_node_change: 0 cache effectiveness: 9.48 On both of these, only pages with refcount = 1 are being kept. I changed things around in the page pool so: 1) the cache behaves like a ring instead of a stack, this sacrifices temporal locality. 2) it caches all pages returned regardless of refcount, but only returns pages with refcount=1. This is the same behavior as the mlx5 cache. Some gains would come about if the sojourn time through the cache is greater than the lifetime of the page usage by the networking stack, as it provides a fixed working set of mapped pages. 
On the web server, this is a net loss: [web] # ./eff ('rx_pool_cache_hit:', '6052662') ('rx_pool_cache_full:', '156355415') ('rx_pool_cache_empty:', '409600') ('rx_pool_cache_stall:', '302787473') ('rx_pool_ring_produce:', '156633847') ('rx_pool_ring_consume:', '9925520') ('rx_pool_ring_return:', '278788') ('rx_pool_flush:', '96') ('rx_pool_node_change:', '0') cache effectiveness: 1.95720846778 For proxygen on the other hand, it's a win: [proxy] # ./eff ('rx_pool_cache_hit:', '69235177') ('rx_pool_cache_full:', '35404387') ('rx_pool_cache_empty:', '460800') ('rx_pool_cache_stall:', '42932530') ('rx_pool_ring_produce:', '35717618') ('rx_pool_ring_consume:', '27879469') ('rx_pool_ring_return:', '404800') ('rx_pool_flush:', '108') ('rx_pool_node_change:', '0') cache effectiveness: 61.4721608624 So the correct behavior isn't quite clear cut here - caching a working set of mapped pages is beneficial in spite of the HOL blocking stalls for some workloads, but I'm sure that it wouldn't be too difficult to exceed the WS size. Thoughts?
On 10/19/2019 3:10 AM, Jonathan Lemon wrote: > I was running the updated patches on machines with various workloads, and > have a bunch of different results. > > For the following numbers, > Effective = hit / (hit + empty + stall) * 100 > > In other words, show the hit rate for for every trip to the cache, > and the cache full stat is ignored. > > On a webserver: > > [web] # ./eff > ('rx_pool_cache_hit:', '360127643') > ('rx_pool_cache_full:', '0') > ('rx_pool_cache_empty:', '6455735977') > ('rx_pool_ring_produce:', '474958') > ('rx_pool_ring_consume:', '0') > ('rx_pool_ring_return:', '474958') > ('rx_pool_flush:', '144') > ('rx_pool_node_change:', '0') > cache effectiveness: 5.28 > > On a proxygen: > # ethtool -S eth0 | grep rx_pool > rx_pool_cache_hit: 1646798 > rx_pool_cache_full: 0 > rx_pool_cache_empty: 15723566 > rx_pool_ring_produce: 474958 > rx_pool_ring_consume: 0 > rx_pool_ring_return: 474958 > rx_pool_flush: 144 > rx_pool_node_change: 0 > cache effectiveness: 9.48 > > On both of these, only pages with refcount = 1 are being kept. > > > I changed things around in the page pool so: > > 1) the cache behaves like a ring instead of a stack, this > sacrifices temporal locality. > > 2) it caches all pages returned regardless of refcount, but > only returns pages with refcount=1. > > This is the same behavior as the mlx5 cache. Some gains > would come about if the sojourn time though the cache is > greater than the lifetime of the page usage by the networking > stack, as it provides a fixed working set of mapped pages. 
> > On the web server, this is a net loss: > [web] # ./eff > ('rx_pool_cache_hit:', '6052662') > ('rx_pool_cache_full:', '156355415') > ('rx_pool_cache_empty:', '409600') > ('rx_pool_cache_stall:', '302787473') > ('rx_pool_ring_produce:', '156633847') > ('rx_pool_ring_consume:', '9925520') > ('rx_pool_ring_return:', '278788') > ('rx_pool_flush:', '96') > ('rx_pool_node_change:', '0') > cache effectiveness: 1.95720846778 > > For proxygen on the other hand, it's a win: > [proxy] # ./eff > ('rx_pool_cache_hit:', '69235177') > ('rx_pool_cache_full:', '35404387') > ('rx_pool_cache_empty:', '460800') > ('rx_pool_cache_stall:', '42932530') > ('rx_pool_ring_produce:', '35717618') > ('rx_pool_ring_consume:', '27879469') > ('rx_pool_ring_return:', '404800') > ('rx_pool_flush:', '108') > ('rx_pool_node_change:', '0') > cache effectiveness: 61.4721608624 > > So the correct behavior isn't quite clear cut here - caching a > working set of mapped pages is beneficial in spite of the HOL > blocking stalls for some workloads, but I'm sure that it wouldn't > be too difficult to exceed the WS size. > > Thoughts? > We have a WIP in which we avoid the HOL block, by having pages returned to the available-queue only when their refcnt reaches back to 1. This requires catching this case in the page/skb release path. See: https://github.com/xdp-project/xdp-project/tree/master/areas/mem
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 8d76452cacdc..0595cdcff594 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -583,18 +583,6 @@ struct mlx5e_mpw_info { #define MLX5E_MAX_RX_FRAGS 4 -/* a single cache unit is capable to serve one napi call (for non-striding rq) - * or a MPWQE (for striding rq). - */ -#define MLX5E_CACHE_UNIT (MLX5_MPWRQ_PAGES_PER_WQE > NAPI_POLL_WEIGHT ? \ - MLX5_MPWRQ_PAGES_PER_WQE : NAPI_POLL_WEIGHT) -#define MLX5E_CACHE_SIZE (4 * roundup_pow_of_two(MLX5E_CACHE_UNIT)) -struct mlx5e_page_cache { - u32 head; - u32 tail; - struct mlx5e_dma_info page_cache[MLX5E_CACHE_SIZE]; -}; - struct mlx5e_rq; typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*); typedef struct sk_buff * @@ -658,7 +646,6 @@ struct mlx5e_rq { struct mlx5e_rq_stats *stats; struct mlx5e_cq cq; struct mlx5e_cq_decomp cqd; - struct mlx5e_page_cache page_cache; struct hwtstamp_config *tstamp; struct mlx5_clock *clock; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 7569287f8f3c..168be1f800a3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -612,9 +612,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; } - rq->page_cache.head = 0; - rq->page_cache.tail = 0; - return 0; err_free: @@ -640,8 +637,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, static void mlx5e_free_rq(struct mlx5e_rq *rq) { - int i; - if (rq->xdp_prog) bpf_prog_put(rq->xdp_prog); @@ -655,17 +650,6 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq) mlx5e_free_di_list(rq); } - for (i = rq->page_cache.head; i != rq->page_cache.tail; - i = (i + 1) & (MLX5E_CACHE_SIZE - 1)) { - struct mlx5e_dma_info *dma_info = &rq->page_cache.page_cache[i]; - - /* With AF_XDP, 
page_cache is not used, so this loop is not - * entered, and it's safe to call mlx5e_page_release_dynamic - * directly. - */ - mlx5e_page_release_dynamic(rq, dma_info, false); - } - xdp_rxq_info_unreg(&rq->xdp_rxq); page_pool_destroy(rq->page_pool); mlx5_wq_destroy(&rq->wq_ctrl); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index d6a547238de0..a3773f8a4931 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -184,65 +184,9 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq, return mlx5e_decompress_cqes_cont(rq, wq, 1, budget_rem) - 1; } -static inline bool mlx5e_page_is_reserved(struct page *page) -{ - return page_is_pfmemalloc(page) || page_to_nid(page) != numa_mem_id(); -} - -static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq, - struct mlx5e_dma_info *dma_info) -{ - struct mlx5e_page_cache *cache = &rq->page_cache; - u32 tail_next = (cache->tail + 1) & (MLX5E_CACHE_SIZE - 1); - struct mlx5e_rq_stats *stats = rq->stats; - - if (tail_next == cache->head) { - stats->cache_full++; - return false; - } - - if (unlikely(mlx5e_page_is_reserved(dma_info->page))) { - stats->cache_waive++; - return false; - } - - cache->page_cache[cache->tail] = *dma_info; - cache->tail = tail_next; - return true; -} - -static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq, - struct mlx5e_dma_info *dma_info) -{ - struct mlx5e_page_cache *cache = &rq->page_cache; - struct mlx5e_rq_stats *stats = rq->stats; - - if (unlikely(cache->head == cache->tail)) { - stats->cache_empty++; - return false; - } - - if (page_ref_count(cache->page_cache[cache->head].page) != 1) { - stats->cache_busy++; - return false; - } - - *dma_info = cache->page_cache[cache->head]; - cache->head = (cache->head + 1) & (MLX5E_CACHE_SIZE - 1); - stats->cache_reuse++; - - dma_sync_single_for_device(rq->pdev, dma_info->addr, - PAGE_SIZE, - DMA_FROM_DEVICE); - return 
true; -} - static inline int mlx5e_page_alloc_pool(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info) { - if (mlx5e_rx_cache_get(rq, dma_info)) - return 0; - dma_info->page = page_pool_dev_alloc_pages(rq->page_pool); if (unlikely(!dma_info->page)) return -ENOMEM; @@ -276,14 +220,11 @@ void mlx5e_page_release_dynamic(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info, bool recycle) { - if (likely(recycle)) { - if (mlx5e_rx_cache_put(rq, dma_info)) - return; + mlx5e_page_dma_unmap(rq, dma_info); - mlx5e_page_dma_unmap(rq, dma_info); + if (likely(recycle)) { page_pool_recycle_direct(rq->page_pool, dma_info->page); } else { - mlx5e_page_dma_unmap(rq, dma_info); page_pool_release_page(rq->page_pool, dma_info->page); put_page(dma_info->page); } @@ -1167,7 +1108,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) if (!skb) { /* probably for XDP */ if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { - /* do not return page to cache, + /* do not return page to pool, * it will be returned on XDP_TX completion. */ goto wq_cyc_pop; @@ -1210,7 +1151,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) if (!skb) { /* probably for XDP */ if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { - /* do not return page to cache, + /* do not return page to pool, * it will be returned on XDP_TX completion. */ goto wq_cyc_pop;