
[01/10,net-next] net/mlx5e: RX, Remove RX page-cache

Message ID: 20191016225028.2100206-2-jonathan.lemon@gmail.com
State: Changes Requested
Delegated to: David Miller
Series: page_pool cleanups

Commit Message

Jonathan Lemon Oct. 16, 2019, 10:50 p.m. UTC
From: Tariq Toukan <tariqt@mellanox.com>

Obsolete the RX page-cache layer; pages are now recycled
in page_pool.

This patch introduces a temporary degradation, as recycling
does not keep the pages DMA-mapped. That is fixed in a
downstream patch.

Issue: 1487631
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  | 13 ----
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 16 -----
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 67 ++-----------------
 3 files changed, 4 insertions(+), 92 deletions(-)

Comments

Jakub Kicinski Oct. 17, 2019, 1:30 a.m. UTC | #1
On Wed, 16 Oct 2019 15:50:19 -0700, Jonathan Lemon wrote:
> From: Tariq Toukan <tariqt@mellanox.com>
> 
> Obsolete the RX page-cache layer; pages are now recycled
> in page_pool.
> 
> This patch introduces a temporary degradation, as recycling
> does not keep the pages DMA-mapped. That is fixed in a
> downstream patch.
> 
> Issue: 1487631

Please drop these Issue identifiers.

> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
> 

And no empty lines between tags.

> Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Jonathan Lemon Oct. 18, 2019, 8:03 p.m. UTC | #2
On 16 Oct 2019, at 18:30, Jakub Kicinski wrote:

> On Wed, 16 Oct 2019 15:50:19 -0700, Jonathan Lemon wrote:
>> From: Tariq Toukan <tariqt@mellanox.com>
>>
>> Obsolete the RX page-cache layer; pages are now recycled
>> in page_pool.
>>
>> This patch introduces a temporary degradation, as recycling
>> does not keep the pages DMA-mapped. That is fixed in a
>> downstream patch.
>>
>> Issue: 1487631
>
> Please drop these Issue identifiers.
>
>> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
>>
>
> And no empty lines between tags.
>
>> Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>

Will fix.
Saeed Mahameed Oct. 18, 2019, 8:53 p.m. UTC | #3
On Wed, 2019-10-16 at 15:50 -0700, Jonathan Lemon wrote:
> From: Tariq Toukan <tariqt@mellanox.com>
> 
> Obsolete the RX page-cache layer; pages are now recycled
> in page_pool.
> 
> This patch introduces a temporary degradation, as recycling
> does not keep the pages DMA-mapped. That is fixed in a
> downstream patch.
Tariq,

I think we need performance numbers here to show the degradation.
I am sure that non-XDP TCP/UDP traffic performance will be hit.

> 
> Issue: 1487631
> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
> 
> Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/en.h  | 13 ----
>  .../net/ethernet/mellanox/mlx5/core/en_main.c | 16 -----
>  .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 67 ++---------------
> --
>  3 files changed, 4 insertions(+), 92 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h
> b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> index 8d76452cacdc..0595cdcff594 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> @@ -583,18 +583,6 @@ struct mlx5e_mpw_info {
>  
>  #define MLX5E_MAX_RX_FRAGS 4
>  
> -/* a single cache unit is capable to serve one napi call (for non-
> striding rq)
> - * or a MPWQE (for striding rq).
> - */
> -#define MLX5E_CACHE_UNIT	(MLX5_MPWRQ_PAGES_PER_WQE >
> NAPI_POLL_WEIGHT ? \
> -				 MLX5_MPWRQ_PAGES_PER_WQE :
> NAPI_POLL_WEIGHT)
> -#define MLX5E_CACHE_SIZE	(4 *
> roundup_pow_of_two(MLX5E_CACHE_UNIT))
> -struct mlx5e_page_cache {
> -	u32 head;
> -	u32 tail;
> -	struct mlx5e_dma_info page_cache[MLX5E_CACHE_SIZE];
> -};
> -
>  struct mlx5e_rq;
>  typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct
> mlx5_cqe64*);
>  typedef struct sk_buff *
> @@ -658,7 +646,6 @@ struct mlx5e_rq {
>  	struct mlx5e_rq_stats *stats;
>  	struct mlx5e_cq        cq;
>  	struct mlx5e_cq_decomp cqd;
> -	struct mlx5e_page_cache page_cache;
>  	struct hwtstamp_config *tstamp;
>  	struct mlx5_clock      *clock;
>  
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index 7569287f8f3c..168be1f800a3 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -612,9 +612,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel
> *c,
>  		rq->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
>  	}
>  
> -	rq->page_cache.head = 0;
> -	rq->page_cache.tail = 0;
> -
>  	return 0;
>  
>  err_free:
> @@ -640,8 +637,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel
> *c,
>  
>  static void mlx5e_free_rq(struct mlx5e_rq *rq)
>  {
> -	int i;
> -
>  	if (rq->xdp_prog)
>  		bpf_prog_put(rq->xdp_prog);
>  
> @@ -655,17 +650,6 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
>  		mlx5e_free_di_list(rq);
>  	}
>  
> -	for (i = rq->page_cache.head; i != rq->page_cache.tail;
> -	     i = (i + 1) & (MLX5E_CACHE_SIZE - 1)) {
> -		struct mlx5e_dma_info *dma_info = &rq-
> >page_cache.page_cache[i];
> -
> -		/* With AF_XDP, page_cache is not used, so this loop is
> not
> -		 * entered, and it's safe to call
> mlx5e_page_release_dynamic
> -		 * directly.
> -		 */
> -		mlx5e_page_release_dynamic(rq, dma_info, false);
> -	}
> -
>  	xdp_rxq_info_unreg(&rq->xdp_rxq);
>  	page_pool_destroy(rq->page_pool);
>  	mlx5_wq_destroy(&rq->wq_ctrl);
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> index d6a547238de0..a3773f8a4931 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> @@ -184,65 +184,9 @@ static inline u32
> mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
>  	return mlx5e_decompress_cqes_cont(rq, wq, 1, budget_rem) - 1;
>  }
>  
> -static inline bool mlx5e_page_is_reserved(struct page *page)
> -{
> -	return page_is_pfmemalloc(page) || page_to_nid(page) !=
> numa_mem_id();
> -}
> -
> -static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq,
> -				      struct mlx5e_dma_info *dma_info)
> -{
> -	struct mlx5e_page_cache *cache = &rq->page_cache;
> -	u32 tail_next = (cache->tail + 1) & (MLX5E_CACHE_SIZE - 1);
> -	struct mlx5e_rq_stats *stats = rq->stats;
> -
> -	if (tail_next == cache->head) {
> -		stats->cache_full++;
> -		return false;
> -	}
> -
> -	if (unlikely(mlx5e_page_is_reserved(dma_info->page))) {
> -		stats->cache_waive++;
> -		return false;
> -	}
> -
> -	cache->page_cache[cache->tail] = *dma_info;
> -	cache->tail = tail_next;
> -	return true;
> -}
> -
> -static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
> -				      struct mlx5e_dma_info *dma_info)
> -{
> -	struct mlx5e_page_cache *cache = &rq->page_cache;
> -	struct mlx5e_rq_stats *stats = rq->stats;
> -
> -	if (unlikely(cache->head == cache->tail)) {
> -		stats->cache_empty++;
> -		return false;
> -	}
> -
> -	if (page_ref_count(cache->page_cache[cache->head].page) != 1) {
> -		stats->cache_busy++;
> -		return false;
> -	}
> -
> -	*dma_info = cache->page_cache[cache->head];
> -	cache->head = (cache->head + 1) & (MLX5E_CACHE_SIZE - 1);
> -	stats->cache_reuse++;
> -
> -	dma_sync_single_for_device(rq->pdev, dma_info->addr,
> -				   PAGE_SIZE,
> -				   DMA_FROM_DEVICE);
> -	return true;
> -}
> -
>  static inline int mlx5e_page_alloc_pool(struct mlx5e_rq *rq,
>  					struct mlx5e_dma_info
> *dma_info)
>  {
> -	if (mlx5e_rx_cache_get(rq, dma_info))
> -		return 0;
> -
>  	dma_info->page = page_pool_dev_alloc_pages(rq->page_pool);
>  	if (unlikely(!dma_info->page))
>  		return -ENOMEM;
> @@ -276,14 +220,11 @@ void mlx5e_page_release_dynamic(struct mlx5e_rq
> *rq,
>  				struct mlx5e_dma_info *dma_info,
>  				bool recycle)
>  {
> -	if (likely(recycle)) {
> -		if (mlx5e_rx_cache_put(rq, dma_info))
> -			return;
> +	mlx5e_page_dma_unmap(rq, dma_info);
>  
> -		mlx5e_page_dma_unmap(rq, dma_info);
> +	if (likely(recycle)) {
>  		page_pool_recycle_direct(rq->page_pool, dma_info-
> >page);
>  	} else {
> -		mlx5e_page_dma_unmap(rq, dma_info);
>  		page_pool_release_page(rq->page_pool, dma_info->page);
>  		put_page(dma_info->page);
>  	}
> @@ -1167,7 +1108,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq,
> struct mlx5_cqe64 *cqe)
>  	if (!skb) {
>  		/* probably for XDP */
>  		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq-
> >flags)) {
> -			/* do not return page to cache,
> +			/* do not return page to pool,
>  			 * it will be returned on XDP_TX completion.
>  			 */
>  			goto wq_cyc_pop;
> @@ -1210,7 +1151,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq
> *rq, struct mlx5_cqe64 *cqe)
>  	if (!skb) {
>  		/* probably for XDP */
>  		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq-
> >flags)) {
> -			/* do not return page to cache,
> +			/* do not return page to pool,
>  			 * it will be returned on XDP_TX completion.
>  			 */
>  			goto wq_cyc_pop;
Jonathan Lemon Oct. 19, 2019, 12:10 a.m. UTC | #4
I was running the updated patches on machines with various workloads, and
have a bunch of different results.

For the following numbers,
  Effective = hit / (hit + empty + stall) * 100

In other words, this is the hit rate for every trip to the cache,
and the cache-full stat is ignored.

On a webserver:

[web] # ./eff
('rx_pool_cache_hit:', '360127643')
('rx_pool_cache_full:', '0')
('rx_pool_cache_empty:', '6455735977')
('rx_pool_ring_produce:', '474958')
('rx_pool_ring_consume:', '0')
('rx_pool_ring_return:', '474958')
('rx_pool_flush:', '144')
('rx_pool_node_change:', '0')
cache effectiveness:  5.28

On a proxygen:
# ethtool -S eth0 | grep rx_pool
     rx_pool_cache_hit: 1646798
     rx_pool_cache_full: 0
     rx_pool_cache_empty: 15723566
     rx_pool_ring_produce: 474958
     rx_pool_ring_consume: 0
     rx_pool_ring_return: 474958
     rx_pool_flush: 144
     rx_pool_node_change: 0
cache effectiveness:  9.48

On both of these, only pages with refcount = 1 are being kept.
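
For reference, the ./eff helper itself is not shown in the thread. A
minimal sketch of the calculation, assuming it simply parses
ethtool -S output for the rx_pool_* counters listed above (the
parsing details are an assumption), could look like:

#!/usr/bin/env python3
# Minimal sketch of the effectiveness calculation above; not the actual
# ./eff helper.  Parses `ethtool -S <dev>` for the rx_pool_* counters
# and applies hit / (hit + empty + stall) * 100.
import subprocess
import sys


def read_rx_pool_stats(dev):
    out = subprocess.run(["ethtool", "-S", dev],
                         capture_output=True, text=True, check=True).stdout
    stats = {}
    for line in out.splitlines():
        name, sep, value = line.partition(":")
        name, value = name.strip(), value.strip()
        if sep and name.startswith("rx_pool_") and value.isdigit():
            stats[name] = int(value)
    return stats


def effectiveness(stats):
    hit = stats.get("rx_pool_cache_hit", 0)
    empty = stats.get("rx_pool_cache_empty", 0)
    stall = stats.get("rx_pool_cache_stall", 0)  # absent in the runs above
    trips = hit + empty + stall
    return 100.0 * hit / trips if trips else 0.0


if __name__ == "__main__":
    dev = sys.argv[1] if len(sys.argv) > 1 else "eth0"
    stats = read_rx_pool_stats(dev)
    for name, value in sorted(stats.items()):
        print(f"{name}: {value}")
    print(f"cache effectiveness: {effectiveness(stats):.2f}")

For the webserver numbers above this gives
360127643 / (360127643 + 6455735977) * 100 ~= 5.28, matching the
reported figure.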


I changed things around in the page pool so:

1) the cache behaves like a ring instead of a stack, which
   sacrifices temporal locality.

2) it caches all pages returned regardless of refcount, but
   only returns pages with refcount=1.

This is the same behavior as the mlx5 cache.  Some gains
would come about if the sojourn time through the cache is
greater than the lifetime of the page usage by the networking
stack, as it provides a fixed working set of mapped pages.
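
As a rough illustration of that modified behavior, a toy Python model
(simulated refcounts; this is not the page_pool or mlx5 code) might
look like:

from collections import deque


class Page:
    """Stand-in for a driver page; the refcount is simulated."""
    def __init__(self, refcount=1):
        self.refcount = refcount


class RingPageCache:
    """Toy model of the ring-style cache described above: put() accepts
    every returned page regardless of refcount, while get() only hands
    out the page at the head of the ring, and only once its refcount is
    back to 1, so a busy head page stalls everything behind it.
    """
    def __init__(self, size):
        self.size = size
        self.ring = deque()
        self.stats = {"hit": 0, "empty": 0, "stall": 0, "full": 0}

    def put(self, page):
        if len(self.ring) == self.size:
            self.stats["full"] += 1
            return False
        self.ring.append(page)            # cached even if still referenced
        return True

    def get(self):
        if not self.ring:
            self.stats["empty"] += 1
            return None
        if self.ring[0].refcount != 1:    # head page still in use
            self.stats["stall"] += 1      # HOL blocking: nothing is served
            return None
        self.stats["hit"] += 1
        return self.ring.popleft()

In this model the refcount gate sits only on the consumer side, which
is what produces the cache_stall counter in the numbers below.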

On the web server, this is a net loss:
[web] # ./eff
('rx_pool_cache_hit:', '6052662')
('rx_pool_cache_full:', '156355415')
('rx_pool_cache_empty:', '409600')
('rx_pool_cache_stall:', '302787473')
('rx_pool_ring_produce:', '156633847')
('rx_pool_ring_consume:', '9925520')
('rx_pool_ring_return:', '278788')
('rx_pool_flush:', '96')
('rx_pool_node_change:', '0')
cache effectiveness:  1.95720846778

For proxygen on the other hand, it's a win:
[proxy] # ./eff
('rx_pool_cache_hit:', '69235177')
('rx_pool_cache_full:', '35404387')
('rx_pool_cache_empty:', '460800')
('rx_pool_cache_stall:', '42932530')
('rx_pool_ring_produce:', '35717618')
('rx_pool_ring_consume:', '27879469')
('rx_pool_ring_return:', '404800')
('rx_pool_flush:', '108')
('rx_pool_node_change:', '0')
cache effectiveness:  61.4721608624

So the correct behavior isn't quite clear cut here - caching a
working set of mapped pages is beneficial in spite of the HOL
blocking stalls for some workloads, but I'm sure that it wouldn't
be too difficult to exceed the WS size.

Thoughts?
Tariq Toukan Oct. 20, 2019, 7:29 a.m. UTC | #5
On 10/19/2019 3:10 AM, Jonathan Lemon wrote:
> I was running the updated patches on machines with various workloads, and
> have a bunch of different results.
> 
> For the following numbers,
>    Effective = hit / (hit + empty + stall) * 100
> 
> In other words, this is the hit rate for every trip to the cache,
> and the cache-full stat is ignored.
> 
> On a webserver:
> 
> [web] # ./eff
> ('rx_pool_cache_hit:', '360127643')
> ('rx_pool_cache_full:', '0')
> ('rx_pool_cache_empty:', '6455735977')
> ('rx_pool_ring_produce:', '474958')
> ('rx_pool_ring_consume:', '0')
> ('rx_pool_ring_return:', '474958')
> ('rx_pool_flush:', '144')
> ('rx_pool_node_change:', '0')
> cache effectiveness:  5.28
> 
> On a proxygen:
> # ethtool -S eth0 | grep rx_pool
>       rx_pool_cache_hit: 1646798
>       rx_pool_cache_full: 0
>       rx_pool_cache_empty: 15723566
>       rx_pool_ring_produce: 474958
>       rx_pool_ring_consume: 0
>       rx_pool_ring_return: 474958
>       rx_pool_flush: 144
>       rx_pool_node_change: 0
> cache effectiveness:  9.48
> 
> On both of these, only pages with refcount = 1 are being kept.
> 
> 
> I changed things around in the page pool so:
> 
> 1) the cache behaves like a ring instead of a stack, which
>     sacrifices temporal locality.
> 
> 2) it caches all pages returned regardless of refcount, but
>     only returns pages with refcount=1.
> 
> This is the same behavior as the mlx5 cache.  Some gains
> would come about if the sojourn time through the cache is
> greater than the lifetime of the page usage by the networking
> stack, as it provides a fixed working set of mapped pages.
> 
> On the web server, this is a net loss:
> [web] # ./eff
> ('rx_pool_cache_hit:', '6052662')
> ('rx_pool_cache_full:', '156355415')
> ('rx_pool_cache_empty:', '409600')
> ('rx_pool_cache_stall:', '302787473')
> ('rx_pool_ring_produce:', '156633847')
> ('rx_pool_ring_consume:', '9925520')
> ('rx_pool_ring_return:', '278788')
> ('rx_pool_flush:', '96')
> ('rx_pool_node_change:', '0')
> cache effectiveness:  1.95720846778
> 
> For proxygen on the other hand, it's a win:
> [proxy] # ./eff
> ('rx_pool_cache_hit:', '69235177')
> ('rx_pool_cache_full:', '35404387')
> ('rx_pool_cache_empty:', '460800')
> ('rx_pool_cache_stall:', '42932530')
> ('rx_pool_ring_produce:', '35717618')
> ('rx_pool_ring_consume:', '27879469')
> ('rx_pool_ring_return:', '404800')
> ('rx_pool_flush:', '108')
> ('rx_pool_node_change:', '0')
> cache effectiveness:  61.4721608624
> 
> So the correct behavior isn't quite clear cut here - caching a
> working set of mapped pages is beneficial in spite of the HOL
> blocking stalls for some workloads, but I'm sure that it wouldn't
> be too difficult to exceed the WS size.
> 
> Thoughts?
> 

We have a WIP in which we avoid the HOL blocking by having pages returned
to the available-queue only when their refcnt drops back to 1. This
requires catching this case in the page/skb release path.

See:
https://github.com/xdp-project/xdp-project/tree/master/areas/mem
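
As a purely conceptual sketch of that deferred-return idea
(illustrative Python with simulated refcounts; this is not the WIP
code at the link above):

class Page:
    """Stand-in for a driver page; the refcount is simulated."""
    def __init__(self, refcount=1):
        self.refcount = refcount


class DeferredReturnPool:
    """Conceptual model only: a page re-enters the available-queue the
    moment its refcount drops back to 1, so a still-referenced page
    never blocks the pages recycled after it (no HOL stall on get()).
    """
    def __init__(self):
        self.available = []       # pages ready for reuse (refcount == 1)
        self.in_flight = set()    # pages still held by skbs / the stack

    def release(self, page):
        # Driver-side release: defer pages that are still referenced.
        if page.refcount == 1:
            self.available.append(page)
        else:
            self.in_flight.add(page)

    def refcount_hit_one(self, page):
        # Hook in the page/skb release path: catch the moment the
        # refcount drops back to 1 and move the page to available.
        if page in self.in_flight:
            self.in_flight.discard(page)
            self.available.append(page)

    def get(self):
        return self.available.pop() if self.available else None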

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 8d76452cacdc..0595cdcff594 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -583,18 +583,6 @@  struct mlx5e_mpw_info {
 
 #define MLX5E_MAX_RX_FRAGS 4
 
-/* a single cache unit is capable to serve one napi call (for non-striding rq)
- * or a MPWQE (for striding rq).
- */
-#define MLX5E_CACHE_UNIT	(MLX5_MPWRQ_PAGES_PER_WQE > NAPI_POLL_WEIGHT ? \
-				 MLX5_MPWRQ_PAGES_PER_WQE : NAPI_POLL_WEIGHT)
-#define MLX5E_CACHE_SIZE	(4 * roundup_pow_of_two(MLX5E_CACHE_UNIT))
-struct mlx5e_page_cache {
-	u32 head;
-	u32 tail;
-	struct mlx5e_dma_info page_cache[MLX5E_CACHE_SIZE];
-};
-
 struct mlx5e_rq;
 typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*);
 typedef struct sk_buff *
@@ -658,7 +646,6 @@  struct mlx5e_rq {
 	struct mlx5e_rq_stats *stats;
 	struct mlx5e_cq        cq;
 	struct mlx5e_cq_decomp cqd;
-	struct mlx5e_page_cache page_cache;
 	struct hwtstamp_config *tstamp;
 	struct mlx5_clock      *clock;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 7569287f8f3c..168be1f800a3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -612,9 +612,6 @@  static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 	}
 
-	rq->page_cache.head = 0;
-	rq->page_cache.tail = 0;
-
 	return 0;
 
 err_free:
@@ -640,8 +637,6 @@  static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 static void mlx5e_free_rq(struct mlx5e_rq *rq)
 {
-	int i;
-
 	if (rq->xdp_prog)
 		bpf_prog_put(rq->xdp_prog);
 
@@ -655,17 +650,6 @@  static void mlx5e_free_rq(struct mlx5e_rq *rq)
 		mlx5e_free_di_list(rq);
 	}
 
-	for (i = rq->page_cache.head; i != rq->page_cache.tail;
-	     i = (i + 1) & (MLX5E_CACHE_SIZE - 1)) {
-		struct mlx5e_dma_info *dma_info = &rq->page_cache.page_cache[i];
-
-		/* With AF_XDP, page_cache is not used, so this loop is not
-		 * entered, and it's safe to call mlx5e_page_release_dynamic
-		 * directly.
-		 */
-		mlx5e_page_release_dynamic(rq, dma_info, false);
-	}
-
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
 	page_pool_destroy(rq->page_pool);
 	mlx5_wq_destroy(&rq->wq_ctrl);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index d6a547238de0..a3773f8a4931 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -184,65 +184,9 @@  static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
 	return mlx5e_decompress_cqes_cont(rq, wq, 1, budget_rem) - 1;
 }
 
-static inline bool mlx5e_page_is_reserved(struct page *page)
-{
-	return page_is_pfmemalloc(page) || page_to_nid(page) != numa_mem_id();
-}
-
-static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq,
-				      struct mlx5e_dma_info *dma_info)
-{
-	struct mlx5e_page_cache *cache = &rq->page_cache;
-	u32 tail_next = (cache->tail + 1) & (MLX5E_CACHE_SIZE - 1);
-	struct mlx5e_rq_stats *stats = rq->stats;
-
-	if (tail_next == cache->head) {
-		stats->cache_full++;
-		return false;
-	}
-
-	if (unlikely(mlx5e_page_is_reserved(dma_info->page))) {
-		stats->cache_waive++;
-		return false;
-	}
-
-	cache->page_cache[cache->tail] = *dma_info;
-	cache->tail = tail_next;
-	return true;
-}
-
-static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
-				      struct mlx5e_dma_info *dma_info)
-{
-	struct mlx5e_page_cache *cache = &rq->page_cache;
-	struct mlx5e_rq_stats *stats = rq->stats;
-
-	if (unlikely(cache->head == cache->tail)) {
-		stats->cache_empty++;
-		return false;
-	}
-
-	if (page_ref_count(cache->page_cache[cache->head].page) != 1) {
-		stats->cache_busy++;
-		return false;
-	}
-
-	*dma_info = cache->page_cache[cache->head];
-	cache->head = (cache->head + 1) & (MLX5E_CACHE_SIZE - 1);
-	stats->cache_reuse++;
-
-	dma_sync_single_for_device(rq->pdev, dma_info->addr,
-				   PAGE_SIZE,
-				   DMA_FROM_DEVICE);
-	return true;
-}
-
 static inline int mlx5e_page_alloc_pool(struct mlx5e_rq *rq,
 					struct mlx5e_dma_info *dma_info)
 {
-	if (mlx5e_rx_cache_get(rq, dma_info))
-		return 0;
-
 	dma_info->page = page_pool_dev_alloc_pages(rq->page_pool);
 	if (unlikely(!dma_info->page))
 		return -ENOMEM;
@@ -276,14 +220,11 @@  void mlx5e_page_release_dynamic(struct mlx5e_rq *rq,
 				struct mlx5e_dma_info *dma_info,
 				bool recycle)
 {
-	if (likely(recycle)) {
-		if (mlx5e_rx_cache_put(rq, dma_info))
-			return;
+	mlx5e_page_dma_unmap(rq, dma_info);
 
-		mlx5e_page_dma_unmap(rq, dma_info);
+	if (likely(recycle)) {
 		page_pool_recycle_direct(rq->page_pool, dma_info->page);
 	} else {
-		mlx5e_page_dma_unmap(rq, dma_info);
 		page_pool_release_page(rq->page_pool, dma_info->page);
 		put_page(dma_info->page);
 	}
@@ -1167,7 +1108,7 @@  void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	if (!skb) {
 		/* probably for XDP */
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
-			/* do not return page to cache,
+			/* do not return page to pool,
 			 * it will be returned on XDP_TX completion.
 			 */
 			goto wq_cyc_pop;
@@ -1210,7 +1151,7 @@  void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	if (!skb) {
 		/* probably for XDP */
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
-			/* do not return page to cache,
+			/* do not return page to pool,
 			 * it will be returned on XDP_TX completion.
 			 */
 			goto wq_cyc_pop;