diff mbox series

[bpf-next,10/14] mlx5, xsk: migrate to new MEM_TYPE_XSK_BUFF_POOL

Message ID 20200507104252.544114-11-bjorn.topel@gmail.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series Introduce AF_XDP buffer allocation API | expand

Commit Message

Björn Töpel May 7, 2020, 10:42 a.m. UTC
From: Björn Töpel <bjorn.topel@intel.com>

Use the new MEM_TYPE_XSK_BUFF_POOL API in lieu of MEM_TYPE_ZERO_COPY in
mlx5e. This allows dropping a lot of code from the driver (code that is
now common in the AF_XDP core and was related to XSK RX frame allocation,
DMA mapping, etc.) and slightly improves performance.

rfc->v1: Put back the sanity check for XSK params, use XSK API to get
         the total headroom size. (Maxim)

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  11 +-
 .../ethernet/mellanox/mlx5/core/en/params.c   |  13 +-
 .../net/ethernet/mellanox/mlx5/core/en/xdp.c  |  30 ++---
 .../net/ethernet/mellanox/mlx5/core/en/xdp.h  |   2 +-
 .../ethernet/mellanox/mlx5/core/en/xsk/rx.c   | 111 +++---------------
 .../ethernet/mellanox/mlx5/core/en/xsk/rx.h   |   6 -
 .../ethernet/mellanox/mlx5/core/en/xsk/tx.c   |   6 +-
 .../ethernet/mellanox/mlx5/core/en/xsk/umem.c |  49 +-------
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  15 +--
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   |  43 +++++--
 10 files changed, 84 insertions(+), 202 deletions(-)

Comments

Maxim Mikityanskiy May 8, 2020, 11:55 a.m. UTC | #1
On 2020-05-07 13:42, Björn Töpel wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
> 
> Use the new MEM_TYPE_XSK_BUFF_POOL API in lieu of MEM_TYPE_ZERO_COPY in
> mlx5e. It allows to drop a lot of code from the driver (which is now
> common in AF_XDP core and was related to XSK RX frame allocation, DMA
> mapping, etc.) and slightly improve performance.
> 
> rfc->v1: Put back the sanity check for XSK params, use XSK API to get
>           the total headroom size. (Maxim)
> 
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
> Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>

I did some functional and performance tests.

Unfortunately, something is wrong with the traffic: I get zeros in
XDP_TX, XDP_PASS and XSK instead of packet data. I set DEBUG_HEXDUMP in
xdpsock, and it shows packets of the correct length, but all bytes
are 0 after these patches. It might be caused by wrong xdp_buff pointers;
however, I still have to investigate it. Björn, does it also affect Intel
drivers, or is it Mellanox-specific?

For performance, I got +1.0..+1.2 Mpps on RX. TX performance got better 
after Björn inlined the relevant UMEM functions, however, there is still 
a slight decrease compared to the old code. I'll try to find the 
possible reason, but the good thing is that it's not significant anymore.

> ---
>   drivers/net/ethernet/mellanox/mlx5/core/en.h  |  11 +-
>   .../ethernet/mellanox/mlx5/core/en/params.c   |  13 +-
>   .../net/ethernet/mellanox/mlx5/core/en/xdp.c  |  30 ++---
>   .../net/ethernet/mellanox/mlx5/core/en/xdp.h  |   2 +-
>   .../ethernet/mellanox/mlx5/core/en/xsk/rx.c   | 111 +++---------------
>   .../ethernet/mellanox/mlx5/core/en/xsk/rx.h   |   6 -
>   .../ethernet/mellanox/mlx5/core/en/xsk/tx.c   |   6 +-
>   .../ethernet/mellanox/mlx5/core/en/xsk/umem.c |  49 +-------
>   .../net/ethernet/mellanox/mlx5/core/en_main.c |  15 +--
>   .../net/ethernet/mellanox/mlx5/core/en_rx.c   |  43 +++++--
>   10 files changed, 84 insertions(+), 202 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> index 0864b76ca2c0..368563ff8efb 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> @@ -426,13 +426,12 @@ struct mlx5e_txqsq {
>   } ____cacheline_aligned_in_smp;
>   
>   struct mlx5e_dma_info {
> -	dma_addr_t addr;
>   	union {
> -		struct page *page;
>   		struct {
> -			u64 handle;
> -			void *data;
> -		} xsk;
> +			dma_addr_t addr;
> +			struct page *page;
> +		};
> +		struct xdp_buff *xsk;
>   	};
>   };
>   
> @@ -650,7 +649,6 @@ struct mlx5e_rq {
>   		} mpwqe;
>   	};
>   	struct {
> -		u16            umem_headroom;
>   		u16            headroom;
>   		u8             map_dir;   /* dma map direction */
>   	} buff;
> @@ -682,7 +680,6 @@ struct mlx5e_rq {
>   	struct page_pool      *page_pool;
>   
>   	/* AF_XDP zero-copy */
> -	struct zero_copy_allocator zca;
>   	struct xdp_umem       *umem;
>   
>   	struct work_struct     recover_work;
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
> index eb2e1f2138e4..38e4f19d69f8 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
> @@ -12,15 +12,16 @@ static inline bool mlx5e_rx_is_xdp(struct mlx5e_params *params,
>   u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
>   				 struct mlx5e_xsk_param *xsk)
>   {
> -	u16 headroom = NET_IP_ALIGN;
> +	u16 headroom;
>   
> -	if (mlx5e_rx_is_xdp(params, xsk)) {
> +	if (xsk)
> +		return xsk->headroom;
> +
> +	headroom = NET_IP_ALIGN;
> +	if (mlx5e_rx_is_xdp(params, xsk))
>   		headroom += XDP_PACKET_HEADROOM;
> -		if (xsk)
> -			headroom += xsk->headroom;
> -	} else {
> +	else
>   		headroom += MLX5_RX_HEADROOM;
> -	}
>   
>   	return headroom;
>   }
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> index b04b99396f65..a2a194525b15 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> @@ -71,7 +71,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
>   	xdptxd.data = xdpf->data;
>   	xdptxd.len  = xdpf->len;
>   
> -	if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) {
> +	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
>   		/* The xdp_buff was in the UMEM and was copied into a newly
>   		 * allocated page. The UMEM page was returned via the ZCA, and
>   		 * this new page has to be mapped at this point and has to be
> @@ -119,49 +119,33 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
>   
>   /* returns true if packet was consumed by xdp */
>   bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
> -		      void *va, u16 *rx_headroom, u32 *len, bool xsk)
> +		      u32 *len, struct xdp_buff *xdp)
>   {
>   	struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
> -	struct xdp_umem *umem = rq->umem;
> -	struct xdp_buff xdp;
>   	u32 act;
>   	int err;
>   
>   	if (!prog)
>   		return false;
>   
> -	xdp.data = va + *rx_headroom;
> -	xdp_set_data_meta_invalid(&xdp);
> -	xdp.data_end = xdp.data + *len;
> -	xdp.data_hard_start = va;
> -	if (xsk)
> -		xdp.handle = di->xsk.handle;
> -	xdp.rxq = &rq->xdp_rxq;
> -
> -	act = bpf_prog_run_xdp(prog, &xdp);
> -	if (xsk) {
> -		u64 off = xdp.data - xdp.data_hard_start;
> -
> -		xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off);
> -	}
> +	act = bpf_prog_run_xdp(prog, xdp);
>   	switch (act) {
>   	case XDP_PASS:
> -		*rx_headroom = xdp.data - xdp.data_hard_start;
> -		*len = xdp.data_end - xdp.data;
> +		*len = xdp->data_end - xdp->data;
>   		return false;
>   	case XDP_TX:
> -		if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, &xdp)))
> +		if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, xdp)))
>   			goto xdp_abort;
>   		__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
>   		return true;
>   	case XDP_REDIRECT:
>   		/* When XDP enabled then page-refcnt==1 here */
> -		err = xdp_do_redirect(rq->netdev, &xdp, prog);
> +		err = xdp_do_redirect(rq->netdev, xdp, prog);
>   		if (unlikely(err))
>   			goto xdp_abort;
>   		__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
>   		__set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags);
> -		if (!xsk)
> +		if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL)
>   			mlx5e_page_dma_unmap(rq, di);
>   		rq->stats->xdp_redirect++;
>   		return true;
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
> index ed6f045febeb..54bad625267f 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
> @@ -63,7 +63,7 @@
>   struct mlx5e_xsk_param;
>   int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk);
>   bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
> -		      void *va, u16 *rx_headroom, u32 *len, bool xsk);
> +		      u32 *len, struct xdp_buff *xdp);
>   void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq);
>   bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq);
>   void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq);
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
> index 62fc8a128a8d..59c40983d428 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
> @@ -3,71 +3,10 @@
>   
>   #include "rx.h"
>   #include "en/xdp.h"
> -#include <net/xdp_sock.h>
> +#include <net/xdp_sock_drv.h>
>   
>   /* RX data path */
>   
> -bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count)
> -{
> -	/* Check in advance that we have enough frames, instead of allocating
> -	 * one-by-one, failing and moving frames to the Reuse Ring.
> -	 */
> -	return xsk_umem_has_addrs_rq(rq->umem, count);
> -}
> -
> -int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
> -			      struct mlx5e_dma_info *dma_info)
> -{
> -	struct xdp_umem *umem = rq->umem;
> -	u64 handle;
> -
> -	if (!xsk_umem_peek_addr_rq(umem, &handle))
> -		return -ENOMEM;
> -
> -	dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle,
> -						      rq->buff.umem_headroom);
> -	dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle);
> -
> -	/* No need to add headroom to the DMA address. In striding RQ case, we
> -	 * just provide pages for UMR, and headroom is counted at the setup
> -	 * stage when creating a WQE. In non-striding RQ case, headroom is
> -	 * accounted in mlx5e_alloc_rx_wqe.
> -	 */
> -	dma_info->addr = xdp_umem_get_dma(umem, handle);
> -
> -	xsk_umem_release_addr_rq(umem);
> -
> -	dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE,
> -				   DMA_BIDIRECTIONAL);
> -
> -	return 0;
> -}
> -
> -static inline void mlx5e_xsk_recycle_frame(struct mlx5e_rq *rq, u64 handle)
> -{
> -	xsk_umem_fq_reuse(rq->umem, handle & rq->umem->chunk_mask);
> -}
> -
> -/* XSKRQ uses pages from UMEM, they must not be released. They are returned to
> - * the userspace if possible, and if not, this function is called to reuse them
> - * in the driver.
> - */
> -void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
> -			    struct mlx5e_dma_info *dma_info)
> -{
> -	mlx5e_xsk_recycle_frame(rq, dma_info->xsk.handle);
> -}
> -
> -/* Return a frame back to the hardware to fill in again. It is used by XDP when
> - * the XDP program returns XDP_TX or XDP_REDIRECT not to an XSKMAP.
> - */
> -void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle)
> -{
> -	struct mlx5e_rq *rq = container_of(zca, struct mlx5e_rq, zca);
> -
> -	mlx5e_xsk_recycle_frame(rq, handle);
> -}
> -
>   static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data,
>   					       u32 cqe_bcnt)
>   {
> @@ -90,11 +29,8 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
>   						    u32 head_offset,
>   						    u32 page_idx)
>   {
> -	struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
> -	u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
> +	struct xdp_buff *xdp = wi->umr.dma_info[page_idx].xsk;
>   	u32 cqe_bcnt32 = cqe_bcnt;
> -	void *va, *data;
> -	u32 frag_size;
>   	bool consumed;
>   
>   	/* Check packet size. Note LRO doesn't use linear SKB */
> @@ -103,22 +39,19 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
>   		return NULL;
>   	}
>   
> -	/* head_offset is not used in this function, because di->xsk.data and
> -	 * di->addr point directly to the necessary place. Furthermore, in the
> -	 * current implementation, UMR pages are mapped to XSK frames, so
> +	/* head_offset is not used in this function, because xdp->data and the
> +	 * DMA address point directly to the necessary place. Furthermore, in
> +	 * the current implementation, UMR pages are mapped to XSK frames, so
>   	 * head_offset should always be 0.
>   	 */
>   	WARN_ON_ONCE(head_offset);
>   
> -	va             = di->xsk.data;
> -	data           = va + rx_headroom;
> -	frag_size      = rq->buff.headroom + cqe_bcnt32;
> -
> -	dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
> -	prefetch(data);
> +	xdp->data_end = xdp->data + cqe_bcnt32;
> +	xsk_buff_dma_sync_for_cpu(xdp);
> +	prefetch(xdp->data);
>   
>   	rcu_read_lock();
> -	consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, true);
> +	consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt32, xdp);
>   	rcu_read_unlock();
>   
>   	/* Possible flows:
> @@ -145,7 +78,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
>   	/* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the
>   	 * frame. On SKB allocation failure, NULL is returned.
>   	 */
> -	return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt32);
> +	return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt32);
>   }
>   
>   struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
> @@ -153,25 +86,19 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
>   					      struct mlx5e_wqe_frag_info *wi,
>   					      u32 cqe_bcnt)
>   {
> -	struct mlx5e_dma_info *di = wi->di;
> -	u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
> -	void *va, *data;
> +	struct xdp_buff *xdp = wi->di->xsk;
>   	bool consumed;
> -	u32 frag_size;
>   
> -	/* wi->offset is not used in this function, because di->xsk.data and
> -	 * di->addr point directly to the necessary place. Furthermore, in the
> -	 * current implementation, one page = one packet = one frame, so
> +	/* wi->offset is not used in this function, because xdp->data and the
> +	 * DMA address point directly to the necessary place. Furthermore, the
> +	 * XSK allocator allocates frames per packet, instead of pages, so
>   	 * wi->offset should always be 0.
>   	 */
>   	WARN_ON_ONCE(wi->offset);
>   
> -	va             = di->xsk.data;
> -	data           = va + rx_headroom;
> -	frag_size      = rq->buff.headroom + cqe_bcnt;
> -
> -	dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
> -	prefetch(data);
> +	xdp->data_end = xdp->data + cqe_bcnt;
> +	xsk_buff_dma_sync_for_cpu(xdp);
> +	prefetch(xdp->data);
>   
>   	if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) {
>   		rq->stats->wqe_err++;
> @@ -179,7 +106,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
>   	}
>   
>   	rcu_read_lock();
> -	consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, true);
> +	consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt, xdp);
>   	rcu_read_unlock();
>   
>   	if (likely(consumed))
> @@ -189,5 +116,5 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
>   	 * will be handled by mlx5e_put_rx_frag.
>   	 * On SKB allocation failure, NULL is returned.
>   	 */
> -	return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt);
> +	return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt);
>   }
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
> index a8e11adbf426..0062652f37b3 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
> @@ -9,12 +9,6 @@
>   
>   /* RX data path */
>   
> -bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count);
> -int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
> -			      struct mlx5e_dma_info *dma_info);
> -void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
> -			    struct mlx5e_dma_info *dma_info);
> -void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle);
>   struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
>   						    struct mlx5e_mpw_info *wi,
>   						    u16 cqe_bcnt,
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
> index 3bcdb5b2fc20..470f3b6317c3 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
> @@ -5,7 +5,7 @@
>   #include "umem.h"
>   #include "en/xdp.h"
>   #include "en/params.h"
> -#include <net/xdp_sock.h>
> +#include <net/xdp_sock_drv.h>
>   
>   int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
>   {
> @@ -92,8 +92,8 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
>   			break;
>   		}
>   
> -		xdptxd.dma_addr = xdp_umem_get_dma(umem, desc.addr);
> -		xdptxd.data = xdp_umem_get_data(umem, desc.addr);
> +		xdptxd.dma_addr = xsk_buff_raw_get_dma(umem, desc.addr);
> +		xdptxd.data = xsk_buff_raw_get_data(umem, desc.addr);
>   		xdptxd.len = desc.len;
>   
>   		dma_sync_single_for_device(sq->pdev, xdptxd.dma_addr,
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c
> index 5e49fdb564b3..7b17fcd0a56d 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c
> @@ -10,40 +10,14 @@ static int mlx5e_xsk_map_umem(struct mlx5e_priv *priv,
>   			      struct xdp_umem *umem)
>   {
>   	struct device *dev = priv->mdev->device;
> -	u32 i;
>   
> -	for (i = 0; i < umem->npgs; i++) {
> -		dma_addr_t dma = dma_map_page(dev, umem->pgs[i], 0, PAGE_SIZE,
> -					      DMA_BIDIRECTIONAL);
> -
> -		if (unlikely(dma_mapping_error(dev, dma)))
> -			goto err_unmap;
> -		umem->pages[i].dma = dma;
> -	}
> -
> -	return 0;
> -
> -err_unmap:
> -	while (i--) {
> -		dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
> -			       DMA_BIDIRECTIONAL);
> -		umem->pages[i].dma = 0;
> -	}
> -
> -	return -ENOMEM;
> +	return xsk_buff_dma_map(umem, dev, 0);
>   }
>   
>   static void mlx5e_xsk_unmap_umem(struct mlx5e_priv *priv,
>   				 struct xdp_umem *umem)
>   {
> -	struct device *dev = priv->mdev->device;
> -	u32 i;
> -
> -	for (i = 0; i < umem->npgs; i++) {
> -		dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
> -			       DMA_BIDIRECTIONAL);
> -		umem->pages[i].dma = 0;
> -	}
> +	return xsk_buff_dma_unmap(umem, 0);
>   }
>   
>   static int mlx5e_xsk_get_umems(struct mlx5e_xsk *xsk)
> @@ -90,13 +64,14 @@ static void mlx5e_xsk_remove_umem(struct mlx5e_xsk *xsk, u16 ix)
>   
>   static bool mlx5e_xsk_is_umem_sane(struct xdp_umem *umem)
>   {
> -	return umem->headroom <= 0xffff && umem->chunk_size_nohr <= 0xffff;
> +	return xsk_umem_get_headroom(umem) <= 0xffff &&
> +		xsk_umem_get_chunk_size(umem) <= 0xffff;
>   }
>   
>   void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk)
>   {
> -	xsk->headroom = umem->headroom;
> -	xsk->chunk_size = umem->chunk_size_nohr + umem->headroom;
> +	xsk->headroom = xsk_umem_get_headroom(umem);
> +	xsk->chunk_size = xsk_umem_get_chunk_size(umem);
>   }
>   
>   static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
> @@ -241,18 +216,6 @@ int mlx5e_xsk_setup_umem(struct net_device *dev, struct xdp_umem *umem, u16 qid)
>   		      mlx5e_xsk_disable_umem(priv, ix);
>   }
>   
> -int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries)
> -{
> -	struct xdp_umem_fq_reuse *reuseq;
> -
> -	reuseq = xsk_reuseq_prepare(nentries);
> -	if (unlikely(!reuseq))
> -		return -ENOMEM;
> -	xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
> -
> -	return 0;
> -}
> -
>   u16 mlx5e_xsk_first_unused_channel(struct mlx5e_params *params, struct mlx5e_xsk *xsk)
>   {
>   	u16 res = xsk->refcnt ? params->num_channels : 0;
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index 048a4f8601a8..5ce73931eff4 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -38,7 +38,7 @@
>   #include <linux/bpf.h>
>   #include <linux/if_bridge.h>
>   #include <net/page_pool.h>
> -#include <net/xdp_sock.h>
> +#include <net/xdp_sock_drv.h>
>   #include "eswitch.h"
>   #include "en.h"
>   #include "en/txrx.h"
> @@ -414,7 +414,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
>   
>   	rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
>   	rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, xsk);
> -	rq->buff.umem_headroom = xsk ? xsk->headroom : 0;
>   	pool_size = 1 << params->log_rq_mtu_frames;
>   
>   	switch (rq->wq_type) {
> @@ -522,17 +521,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
>   	}
>   
>   	if (xsk) {
> -		err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames);
> -		if (unlikely(err)) {
> -			mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n",
> -				      num_xsk_frames);
> -			goto err_free;
> -		}
> -
> -		rq->zca.free = mlx5e_xsk_zca_free;
>   		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
> -						 MEM_TYPE_ZERO_COPY,
> -						 &rq->zca);
> +						 MEM_TYPE_XSK_BUFF_POOL, NULL);
> +		xsk_buff_set_rxq_info(rq->umem, &rq->xdp_rxq);
>   	} else {
>   		/* Create a page_pool and register it with rxq */
>   		pp_params.order     = 0;
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> index d9a5a669b84d..662bc73f6dae 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> @@ -263,10 +263,12 @@ static inline int mlx5e_page_alloc_pool(struct mlx5e_rq *rq,
>   static inline int mlx5e_page_alloc(struct mlx5e_rq *rq,
>   				   struct mlx5e_dma_info *dma_info)
>   {
> -	if (rq->umem)
> -		return mlx5e_xsk_page_alloc_umem(rq, dma_info);
> -	else
> -		return mlx5e_page_alloc_pool(rq, dma_info);
> +	if (rq->umem) {
> +		dma_info->xsk = xsk_buff_alloc(rq->umem);
> +		return dma_info->xsk ? 0 : -ENOMEM;
> +	}
> +
> +	return mlx5e_page_alloc_pool(rq, dma_info);
>   }
>   
>   void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info)
> @@ -300,7 +302,7 @@ static inline void mlx5e_page_release(struct mlx5e_rq *rq,
>   		 * put into the Reuse Ring, because there is no way to return
>   		 * the page to the userspace when the interface goes down.
>   		 */
> -		mlx5e_xsk_page_release(rq, dma_info);
> +		xsk_buff_free(dma_info->xsk);
>   	else
>   		mlx5e_page_release_dynamic(rq, dma_info, recycle);
>   }
> @@ -385,7 +387,11 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
>   	if (rq->umem) {
>   		int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags;
>   
> -		if (unlikely(!mlx5e_xsk_pages_enough_umem(rq, pages_desired)))
> +		/* Check in advance that we have enough frames, instead of
> +		 * allocating one-by-one, failing and moving frames to the
> +		 * Reuse Ring.
> +		 */
> +		if (unlikely(!xsk_buff_can_alloc(rq->umem, pages_desired)))
>   			return -ENOMEM;
>   	}
>   
> @@ -480,8 +486,11 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
>   	int err;
>   	int i;
>   
> +	/* Check in advance that we have enough frames, instead of allocating
> +	 * one-by-one, failing and moving frames to the Reuse Ring.
> +	 */
>   	if (rq->umem &&
> -	    unlikely(!mlx5e_xsk_pages_enough_umem(rq, MLX5_MPWRQ_PAGES_PER_WQE))) {
> +	    unlikely(!xsk_buff_can_alloc(rq->umem, MLX5_MPWRQ_PAGES_PER_WQE))) {
>   		err = -ENOMEM;
>   		goto err;
>   	}
> @@ -1038,12 +1047,23 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
>   	return skb;
>   }
>   
> +static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom,
> +				u32 len, struct xdp_buff *xdp)
> +{
> +	xdp->data_hard_start = va;
> +	xdp_set_data_meta_invalid(xdp);
> +	xdp->data = va + headroom;
> +	xdp->data_end = xdp->data + len;
> +	xdp->rxq = &rq->xdp_rxq;
> +}
> +
>   struct sk_buff *
>   mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
>   			  struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
>   {
>   	struct mlx5e_dma_info *di = wi->di;
>   	u16 rx_headroom = rq->buff.headroom;
> +	struct xdp_buff xdp;
>   	struct sk_buff *skb;
>   	void *va, *data;
>   	bool consumed;
> @@ -1059,11 +1079,13 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
>   	prefetch(data);
>   
>   	rcu_read_lock();
> -	consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, false);
> +	mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
> +	consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt, &xdp);
>   	rcu_read_unlock();
>   	if (consumed)
>   		return NULL; /* page/packet was consumed by XDP */
>   
> +	rx_headroom = xdp.data - xdp.data_hard_start;
>   	skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt);
>   	if (unlikely(!skb))
>   		return NULL;
> @@ -1336,6 +1358,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
>   	struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
>   	u16 rx_headroom = rq->buff.headroom;
>   	u32 cqe_bcnt32 = cqe_bcnt;
> +	struct xdp_buff xdp;
>   	struct sk_buff *skb;
>   	void *va, *data;
>   	u32 frag_size;
> @@ -1357,7 +1380,8 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
>   	prefetch(data);
>   
>   	rcu_read_lock();
> -	consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, false);
> +	mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt32, &xdp);
> +	consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt32, &xdp);
>   	rcu_read_unlock();
>   	if (consumed) {
>   		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
> @@ -1365,6 +1389,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
>   		return NULL; /* page/packet was consumed by XDP */
>   	}
>   
> +	rx_headroom = xdp.data - xdp.data_hard_start;
>   	skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32);
>   	if (unlikely(!skb))
>   		return NULL;
>
Björn Töpel May 8, 2020, 12:27 p.m. UTC | #2
On 2020-05-08 13:55, Maxim Mikityanskiy wrote:
> On 2020-05-07 13:42, Björn Töpel wrote:
>> From: Björn Töpel <bjorn.topel@intel.com>
>>
>> Use the new MEM_TYPE_XSK_BUFF_POOL API in lieu of MEM_TYPE_ZERO_COPY in
>> mlx5e. It allows to drop a lot of code from the driver (which is now
>> common in AF_XDP core and was related to XSK RX frame allocation, DMA
>> mapping, etc.) and slightly improve performance.
>>
>> rfc->v1: Put back the sanity check for XSK params, use XSK API to get
>>           the total headroom size. (Maxim)
>>
>> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
>> Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
> 
> I did some functional and performance tests.
> 
> Unfortunately, something is wrong with the traffic: I get zeros in 
> XDP_TX, XDP_PASS and XSK instead of packet data. I set DEBUG_HEXDUMP in 
> xdpsock, and it shows the packets of the correct length, but all bytes 
> are 0 after these patches. It might be wrong xdp_buff pointers, however, 
> I still have to investigate it. Björn, does it also affect Intel 
> drivers, or is it Mellanox-specific?
>

Are you getting zeros for TX, PASS *and* in xdpsock (REDIRECT:ed 
packets), or just TX and PASS?

No, I get correct packet data for AF_XDP zero-copy XDP_REDIRECT,
XDP_PASS, and XDP_TX for Intel.

> For performance, I got +1.0..+1.2 Mpps on RX. TX performance got better 
> after Björn inlined the relevant UMEM functions, however, there is still 
> a slight decrease compared to the old code. I'll try to find the 
> possible reason, but the good thing is that it's not significant anymore.
> 

Ok, so for Rx mlx5 it's the same as for i40e. Good! :-)

How much decrease on Tx?


Björn
Maxim Mikityanskiy May 8, 2020, 1 p.m. UTC | #3
On 2020-05-08 15:27, Björn Töpel wrote:
> On 2020-05-08 13:55, Maxim Mikityanskiy wrote:
>> On 2020-05-07 13:42, Björn Töpel wrote:
>>> From: Björn Töpel <bjorn.topel@intel.com>
>>>
>>> Use the new MEM_TYPE_XSK_BUFF_POOL API in lieu of MEM_TYPE_ZERO_COPY in
>>> mlx5e. It allows to drop a lot of code from the driver (which is now
>>> common in AF_XDP core and was related to XSK RX frame allocation, DMA
>>> mapping, etc.) and slightly improve performance.
>>>
>>> rfc->v1: Put back the sanity check for XSK params, use XSK API to get
>>>           the total headroom size. (Maxim)
>>>
>>> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
>>> Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
>>
>> I did some functional and performance tests.
>>
>> Unfortunately, something is wrong with the traffic: I get zeros in 
>> XDP_TX, XDP_PASS and XSK instead of packet data. I set DEBUG_HEXDUMP 
>> in xdpsock, and it shows the packets of the correct length, but all 
>> bytes are 0 after these patches. It might be wrong xdp_buff pointers, 
>> however, I still have to investigate it. Björn, does it also affect 
>> Intel drivers, or is it Mellanox-specific?
>>
> 
> Are you getting zeros for TX, PASS *and* in xdpsock (REDIRECT:ed 
> packets), or just TX and PASS?

Yes, in all modes: XDP_TX, XDP_PASS and XDP_REDIRECT to XSK (xdpsock).

> No, I get correct packet data for AF_XDP zero-copy XDP_REDIRECT,
> XDP_PASS, and XDP_TX for Intel.

Hmm, weird - with the new API I expected the same behavior on all
drivers. Thanks for the information; now I know that I need to look in
the mlx5 code to find the issue.

>> For performance, I got +1.0..+1.2 Mpps on RX. TX performance got 
>> better after Björn inlined the relevant UMEM functions, however, there 
>> is still a slight decrease compared to the old code. I'll try to find 
>> the possible reason, but the good thing is that it's not significant 
>> anymore.
>>
> 
> Ok, so for Rx mlx5 it's the same as for i40e. Good! :-)
> 
> How much decrease on Tx?

~0.8 Mpps (was 3.1 before you inlined the functions).

> 
> Björn
Björn Töpel May 8, 2020, 1:08 p.m. UTC | #4
On Fri, 8 May 2020 at 15:01, Maxim Mikityanskiy <maximmi@mellanox.com> wrote:
>
> On 2020-05-08 15:27, Björn Töpel wrote:
> > On 2020-05-08 13:55, Maxim Mikityanskiy wrote:
> >> On 2020-05-07 13:42, Björn Töpel wrote:
> >>> From: Björn Töpel <bjorn.topel@intel.com>
> >>>
> >>> Use the new MEM_TYPE_XSK_BUFF_POOL API in lieu of MEM_TYPE_ZERO_COPY in
> >>> mlx5e. It allows to drop a lot of code from the driver (which is now
> >>> common in AF_XDP core and was related to XSK RX frame allocation, DMA
> >>> mapping, etc.) and slightly improve performance.
> >>>
> >>> rfc->v1: Put back the sanity check for XSK params, use XSK API to get
> >>>           the total headroom size. (Maxim)
> >>>
> >>> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
> >>> Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
> >>
> >> I did some functional and performance tests.
> >>
> >> Unfortunately, something is wrong with the traffic: I get zeros in
> >> XDP_TX, XDP_PASS and XSK instead of packet data. I set DEBUG_HEXDUMP
> >> in xdpsock, and it shows the packets of the correct length, but all
> >> bytes are 0 after these patches. It might be wrong xdp_buff pointers,
> >> however, I still have to investigate it. Björn, does it also affect
> >> Intel drivers, or is it Mellanox-specific?
> >>
> >
> > Are you getting zeros for TX, PASS *and* in xdpsock (REDIRECT:ed
> > packets), or just TX and PASS?
>
> Yes, in all modes: XDP_TX, XDP_PASS and XDP_REDIRECT to XSK (xdpsock).
>
> > No, I get correct packet data for AF_XDP zero-copy XDP_REDIRECT,
> > XDP_PASS, and XDP_TX for Intel.
>
> Hmm, weird - with the new API I expected the same behavior on all
> drivers. Thanks for the information, I'll know that I need to look in
> mlx5 code to find the issue.
>

All zeros hints that you're probably putting in the wrong DMA address somewhere.

> >> For performance, I got +1.0..+1.2 Mpps on RX. TX performance got
> >> better after Björn inlined the relevant UMEM functions, however, there
> >> is still a slight decrease compared to the old code. I'll try to find
> >> the possible reason, but the good thing is that it's not significant
> >> anymore.
> >>
> >
> > Ok, so for Rx mlx5 it's the same as for i40e. Good! :-)
> >
> > How much decrease on Tx?
>
> ~0.8 Mpps (was 3.1 before you inlined the functions).
>

Thanks. Still a bit much. What does perf say?


Björn
Björn Töpel May 8, 2020, 1:14 p.m. UTC | #5
On Fri, 8 May 2020 at 15:08, Björn Töpel <bjorn.topel@gmail.com> wrote:
>
> On Fri, 8 May 2020 at 15:01, Maxim Mikityanskiy <maximmi@mellanox.com> wrote:
> >
[]
>
> All zeros hints that you're probably putting in the wrong DMA address somewhere.
>

Hmm, I can't see that you're using xsk_buff_xdp_get_dma() anywhere in
the code. That's probably it?

Björn
Maxim Mikityanskiy May 8, 2020, 5:34 p.m. UTC | #6
On 2020-05-08 16:14, Björn Töpel wrote:
> On Fri, 8 May 2020 at 15:08, Björn Töpel <bjorn.topel@gmail.com> wrote:
>>
>> On Fri, 8 May 2020 at 15:01, Maxim Mikityanskiy <maximmi@mellanox.com> wrote:
>>>
> []
>>
>> All zeros hints that you're probably putting in the wrong DMA address somewhere.
>>
> 
> Hmm, I can't see that you're using xsk_buff_xdp_get_dma() anywhere in
> the code. Probably it?

You are right, thanks, it was indeed missing. However, adding it was not
enough: I still get zeros, and I will continue investigating on Monday.

> 
> Björn
>
Björn Töpel May 8, 2020, 5:57 p.m. UTC | #7
On 2020-05-08 19:34, Maxim Mikityanskiy wrote:
[]
> 
> You are right, thanks, it was indeed missing. However, adding it was not 
> enough, I still get zeros, will continue investigating on Monday.
>

Thanks Max! Let me know if I can help out.


Have a nice weekend,
Björn
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 0864b76ca2c0..368563ff8efb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -426,13 +426,12 @@  struct mlx5e_txqsq {
 } ____cacheline_aligned_in_smp;
 
 struct mlx5e_dma_info {
-	dma_addr_t addr;
 	union {
-		struct page *page;
 		struct {
-			u64 handle;
-			void *data;
-		} xsk;
+			dma_addr_t addr;
+			struct page *page;
+		};
+		struct xdp_buff *xsk;
 	};
 };
 
@@ -650,7 +649,6 @@  struct mlx5e_rq {
 		} mpwqe;
 	};
 	struct {
-		u16            umem_headroom;
 		u16            headroom;
 		u8             map_dir;   /* dma map direction */
 	} buff;
@@ -682,7 +680,6 @@  struct mlx5e_rq {
 	struct page_pool      *page_pool;
 
 	/* AF_XDP zero-copy */
-	struct zero_copy_allocator zca;
 	struct xdp_umem       *umem;
 
 	struct work_struct     recover_work;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index eb2e1f2138e4..38e4f19d69f8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -12,15 +12,16 @@  static inline bool mlx5e_rx_is_xdp(struct mlx5e_params *params,
 u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
 				 struct mlx5e_xsk_param *xsk)
 {
-	u16 headroom = NET_IP_ALIGN;
+	u16 headroom;
 
-	if (mlx5e_rx_is_xdp(params, xsk)) {
+	if (xsk)
+		return xsk->headroom;
+
+	headroom = NET_IP_ALIGN;
+	if (mlx5e_rx_is_xdp(params, xsk))
 		headroom += XDP_PACKET_HEADROOM;
-		if (xsk)
-			headroom += xsk->headroom;
-	} else {
+	else
 		headroom += MLX5_RX_HEADROOM;
-	}
 
 	return headroom;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index b04b99396f65..a2a194525b15 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -71,7 +71,7 @@  mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
 	xdptxd.data = xdpf->data;
 	xdptxd.len  = xdpf->len;
 
-	if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) {
+	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
 		/* The xdp_buff was in the UMEM and was copied into a newly
 		 * allocated page. The UMEM page was returned via the ZCA, and
 		 * this new page has to be mapped at this point and has to be
@@ -119,49 +119,33 @@  mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
 
 /* returns true if packet was consumed by xdp */
 bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
-		      void *va, u16 *rx_headroom, u32 *len, bool xsk)
+		      u32 *len, struct xdp_buff *xdp)
 {
 	struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
-	struct xdp_umem *umem = rq->umem;
-	struct xdp_buff xdp;
 	u32 act;
 	int err;
 
 	if (!prog)
 		return false;
 
-	xdp.data = va + *rx_headroom;
-	xdp_set_data_meta_invalid(&xdp);
-	xdp.data_end = xdp.data + *len;
-	xdp.data_hard_start = va;
-	if (xsk)
-		xdp.handle = di->xsk.handle;
-	xdp.rxq = &rq->xdp_rxq;
-
-	act = bpf_prog_run_xdp(prog, &xdp);
-	if (xsk) {
-		u64 off = xdp.data - xdp.data_hard_start;
-
-		xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off);
-	}
+	act = bpf_prog_run_xdp(prog, xdp);
 	switch (act) {
 	case XDP_PASS:
-		*rx_headroom = xdp.data - xdp.data_hard_start;
-		*len = xdp.data_end - xdp.data;
+		*len = xdp->data_end - xdp->data;
 		return false;
 	case XDP_TX:
-		if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, &xdp)))
+		if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, xdp)))
 			goto xdp_abort;
 		__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
 		return true;
 	case XDP_REDIRECT:
 		/* When XDP enabled then page-refcnt==1 here */
-		err = xdp_do_redirect(rq->netdev, &xdp, prog);
+		err = xdp_do_redirect(rq->netdev, xdp, prog);
 		if (unlikely(err))
 			goto xdp_abort;
 		__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
 		__set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags);
-		if (!xsk)
+		if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL)
 			mlx5e_page_dma_unmap(rq, di);
 		rq->stats->xdp_redirect++;
 		return true;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
index ed6f045febeb..54bad625267f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
@@ -63,7 +63,7 @@ 
 struct mlx5e_xsk_param;
 int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk);
 bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
-		      void *va, u16 *rx_headroom, u32 *len, bool xsk);
+		      u32 *len, struct xdp_buff *xdp);
 void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq);
 bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq);
 void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 62fc8a128a8d..59c40983d428 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -3,71 +3,10 @@ 
 
 #include "rx.h"
 #include "en/xdp.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 
 /* RX data path */
 
-bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count)
-{
-	/* Check in advance that we have enough frames, instead of allocating
-	 * one-by-one, failing and moving frames to the Reuse Ring.
-	 */
-	return xsk_umem_has_addrs_rq(rq->umem, count);
-}
-
-int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
-			      struct mlx5e_dma_info *dma_info)
-{
-	struct xdp_umem *umem = rq->umem;
-	u64 handle;
-
-	if (!xsk_umem_peek_addr_rq(umem, &handle))
-		return -ENOMEM;
-
-	dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle,
-						      rq->buff.umem_headroom);
-	dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle);
-
-	/* No need to add headroom to the DMA address. In striding RQ case, we
-	 * just provide pages for UMR, and headroom is counted at the setup
-	 * stage when creating a WQE. In non-striding RQ case, headroom is
-	 * accounted in mlx5e_alloc_rx_wqe.
-	 */
-	dma_info->addr = xdp_umem_get_dma(umem, handle);
-
-	xsk_umem_release_addr_rq(umem);
-
-	dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE,
-				   DMA_BIDIRECTIONAL);
-
-	return 0;
-}
-
-static inline void mlx5e_xsk_recycle_frame(struct mlx5e_rq *rq, u64 handle)
-{
-	xsk_umem_fq_reuse(rq->umem, handle & rq->umem->chunk_mask);
-}
-
-/* XSKRQ uses pages from UMEM, they must not be released. They are returned to
- * the userspace if possible, and if not, this function is called to reuse them
- * in the driver.
- */
-void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
-			    struct mlx5e_dma_info *dma_info)
-{
-	mlx5e_xsk_recycle_frame(rq, dma_info->xsk.handle);
-}
-
-/* Return a frame back to the hardware to fill in again. It is used by XDP when
- * the XDP program returns XDP_TX or XDP_REDIRECT not to an XSKMAP.
- */
-void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle)
-{
-	struct mlx5e_rq *rq = container_of(zca, struct mlx5e_rq, zca);
-
-	mlx5e_xsk_recycle_frame(rq, handle);
-}
-
 static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data,
 					       u32 cqe_bcnt)
 {
@@ -90,11 +29,8 @@  struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 						    u32 head_offset,
 						    u32 page_idx)
 {
-	struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
-	u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
+	struct xdp_buff *xdp = wi->umr.dma_info[page_idx].xsk;
 	u32 cqe_bcnt32 = cqe_bcnt;
-	void *va, *data;
-	u32 frag_size;
 	bool consumed;
 
 	/* Check packet size. Note LRO doesn't use linear SKB */
@@ -103,22 +39,19 @@  struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 		return NULL;
 	}
 
-	/* head_offset is not used in this function, because di->xsk.data and
-	 * di->addr point directly to the necessary place. Furthermore, in the
-	 * current implementation, UMR pages are mapped to XSK frames, so
+	/* head_offset is not used in this function, because xdp->data and the
+	 * DMA address point directly to the necessary place. Furthermore, in
+	 * the current implementation, UMR pages are mapped to XSK frames, so
 	 * head_offset should always be 0.
 	 */
 	WARN_ON_ONCE(head_offset);
 
-	va             = di->xsk.data;
-	data           = va + rx_headroom;
-	frag_size      = rq->buff.headroom + cqe_bcnt32;
-
-	dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
-	prefetch(data);
+	xdp->data_end = xdp->data + cqe_bcnt32;
+	xsk_buff_dma_sync_for_cpu(xdp);
+	prefetch(xdp->data);
 
 	rcu_read_lock();
-	consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, true);
+	consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt32, xdp);
 	rcu_read_unlock();
 
 	/* Possible flows:
@@ -145,7 +78,7 @@  struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 	/* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the
 	 * frame. On SKB allocation failure, NULL is returned.
 	 */
-	return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt32);
+	return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt32);
 }
 
 struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
@@ -153,25 +86,19 @@  struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 					      struct mlx5e_wqe_frag_info *wi,
 					      u32 cqe_bcnt)
 {
-	struct mlx5e_dma_info *di = wi->di;
-	u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
-	void *va, *data;
+	struct xdp_buff *xdp = wi->di->xsk;
 	bool consumed;
-	u32 frag_size;
 
-	/* wi->offset is not used in this function, because di->xsk.data and
-	 * di->addr point directly to the necessary place. Furthermore, in the
-	 * current implementation, one page = one packet = one frame, so
+	/* wi->offset is not used in this function, because xdp->data and the
+	 * DMA address point directly to the necessary place. Furthermore, the
+	 * XSK allocator allocates frames per packet, instead of pages, so
 	 * wi->offset should always be 0.
 	 */
 	WARN_ON_ONCE(wi->offset);
 
-	va             = di->xsk.data;
-	data           = va + rx_headroom;
-	frag_size      = rq->buff.headroom + cqe_bcnt;
-
-	dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
-	prefetch(data);
+	xdp->data_end = xdp->data + cqe_bcnt;
+	xsk_buff_dma_sync_for_cpu(xdp);
+	prefetch(xdp->data);
 
 	if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) {
 		rq->stats->wqe_err++;
@@ -179,7 +106,7 @@  struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 	}
 
 	rcu_read_lock();
-	consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, true);
+	consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt, xdp);
 	rcu_read_unlock();
 
 	if (likely(consumed))
@@ -189,5 +116,5 @@  struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 	 * will be handled by mlx5e_put_rx_frag.
 	 * On SKB allocation failure, NULL is returned.
 	 */
-	return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt);
+	return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
index a8e11adbf426..0062652f37b3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -9,12 +9,6 @@ 
 
 /* RX data path */
 
-bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count);
-int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
-			      struct mlx5e_dma_info *dma_info);
-void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
-			    struct mlx5e_dma_info *dma_info);
-void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle);
 struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 						    struct mlx5e_mpw_info *wi,
 						    u16 cqe_bcnt,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
index 3bcdb5b2fc20..470f3b6317c3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
@@ -5,7 +5,7 @@ 
 #include "umem.h"
 #include "en/xdp.h"
 #include "en/params.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 
 int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
 {
@@ -92,8 +92,8 @@  bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
 			break;
 		}
 
-		xdptxd.dma_addr = xdp_umem_get_dma(umem, desc.addr);
-		xdptxd.data = xdp_umem_get_data(umem, desc.addr);
+		xdptxd.dma_addr = xsk_buff_raw_get_dma(umem, desc.addr);
+		xdptxd.data = xsk_buff_raw_get_data(umem, desc.addr);
 		xdptxd.len = desc.len;
 
 		dma_sync_single_for_device(sq->pdev, xdptxd.dma_addr,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c
index 5e49fdb564b3..7b17fcd0a56d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c
@@ -10,40 +10,14 @@  static int mlx5e_xsk_map_umem(struct mlx5e_priv *priv,
 			      struct xdp_umem *umem)
 {
 	struct device *dev = priv->mdev->device;
-	u32 i;
 
-	for (i = 0; i < umem->npgs; i++) {
-		dma_addr_t dma = dma_map_page(dev, umem->pgs[i], 0, PAGE_SIZE,
-					      DMA_BIDIRECTIONAL);
-
-		if (unlikely(dma_mapping_error(dev, dma)))
-			goto err_unmap;
-		umem->pages[i].dma = dma;
-	}
-
-	return 0;
-
-err_unmap:
-	while (i--) {
-		dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
-			       DMA_BIDIRECTIONAL);
-		umem->pages[i].dma = 0;
-	}
-
-	return -ENOMEM;
+	return xsk_buff_dma_map(umem, dev, 0);
 }
 
 static void mlx5e_xsk_unmap_umem(struct mlx5e_priv *priv,
 				 struct xdp_umem *umem)
 {
-	struct device *dev = priv->mdev->device;
-	u32 i;
-
-	for (i = 0; i < umem->npgs; i++) {
-		dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
-			       DMA_BIDIRECTIONAL);
-		umem->pages[i].dma = 0;
-	}
+	xsk_buff_dma_unmap(umem, 0);
 }
 
 static int mlx5e_xsk_get_umems(struct mlx5e_xsk *xsk)
@@ -90,13 +64,14 @@  static void mlx5e_xsk_remove_umem(struct mlx5e_xsk *xsk, u16 ix)
 
 static bool mlx5e_xsk_is_umem_sane(struct xdp_umem *umem)
 {
-	return umem->headroom <= 0xffff && umem->chunk_size_nohr <= 0xffff;
+	return xsk_umem_get_headroom(umem) <= 0xffff &&
+		xsk_umem_get_chunk_size(umem) <= 0xffff;
 }
 
 void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk)
 {
-	xsk->headroom = umem->headroom;
-	xsk->chunk_size = umem->chunk_size_nohr + umem->headroom;
+	xsk->headroom = xsk_umem_get_headroom(umem);
+	xsk->chunk_size = xsk_umem_get_chunk_size(umem);
 }
 
 static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
@@ -241,18 +216,6 @@  int mlx5e_xsk_setup_umem(struct net_device *dev, struct xdp_umem *umem, u16 qid)
 		      mlx5e_xsk_disable_umem(priv, ix);
 }
 
-int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries)
-{
-	struct xdp_umem_fq_reuse *reuseq;
-
-	reuseq = xsk_reuseq_prepare(nentries);
-	if (unlikely(!reuseq))
-		return -ENOMEM;
-	xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
-	return 0;
-}
-
 u16 mlx5e_xsk_first_unused_channel(struct mlx5e_params *params, struct mlx5e_xsk *xsk)
 {
 	u16 res = xsk->refcnt ? params->num_channels : 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 048a4f8601a8..5ce73931eff4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -38,7 +38,7 @@ 
 #include <linux/bpf.h>
 #include <linux/if_bridge.h>
 #include <net/page_pool.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 #include "eswitch.h"
 #include "en.h"
 #include "en/txrx.h"
@@ -414,7 +414,6 @@  static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
 	rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, xsk);
-	rq->buff.umem_headroom = xsk ? xsk->headroom : 0;
 	pool_size = 1 << params->log_rq_mtu_frames;
 
 	switch (rq->wq_type) {
@@ -522,17 +521,9 @@  static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	}
 
 	if (xsk) {
-		err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames);
-		if (unlikely(err)) {
-			mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n",
-				      num_xsk_frames);
-			goto err_free;
-		}
-
-		rq->zca.free = mlx5e_xsk_zca_free;
 		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
-						 MEM_TYPE_ZERO_COPY,
-						 &rq->zca);
+						 MEM_TYPE_XSK_BUFF_POOL, NULL);
+		xsk_buff_set_rxq_info(rq->umem, &rq->xdp_rxq);
 	} else {
 		/* Create a page_pool and register it with rxq */
 		pp_params.order     = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index d9a5a669b84d..662bc73f6dae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -263,10 +263,12 @@  static inline int mlx5e_page_alloc_pool(struct mlx5e_rq *rq,
 static inline int mlx5e_page_alloc(struct mlx5e_rq *rq,
 				   struct mlx5e_dma_info *dma_info)
 {
-	if (rq->umem)
-		return mlx5e_xsk_page_alloc_umem(rq, dma_info);
-	else
-		return mlx5e_page_alloc_pool(rq, dma_info);
+	if (rq->umem) {
+		dma_info->xsk = xsk_buff_alloc(rq->umem);
+		return dma_info->xsk ? 0 : -ENOMEM;
+	}
+
+	return mlx5e_page_alloc_pool(rq, dma_info);
 }
 
 void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info)
@@ -300,7 +302,7 @@  static inline void mlx5e_page_release(struct mlx5e_rq *rq,
 		 * put into the Reuse Ring, because there is no way to return
 		 * the page to the userspace when the interface goes down.
 		 */
-		mlx5e_xsk_page_release(rq, dma_info);
+		xsk_buff_free(dma_info->xsk);
 	else
 		mlx5e_page_release_dynamic(rq, dma_info, recycle);
 }
@@ -385,7 +387,11 @@  static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
 	if (rq->umem) {
 		int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags;
 
-		if (unlikely(!mlx5e_xsk_pages_enough_umem(rq, pages_desired)))
+		/* Check in advance that we have enough frames, instead of
+		 * allocating one-by-one, failing and moving frames to the
+		 * Reuse Ring.
+		 */
+		if (unlikely(!xsk_buff_can_alloc(rq->umem, pages_desired)))
 			return -ENOMEM;
 	}
 
@@ -480,8 +486,11 @@  static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	int err;
 	int i;
 
+	/* Check in advance that we have enough frames, instead of allocating
+	 * one-by-one, failing and moving frames to the Reuse Ring.
+	 */
 	if (rq->umem &&
-	    unlikely(!mlx5e_xsk_pages_enough_umem(rq, MLX5_MPWRQ_PAGES_PER_WQE))) {
+	    unlikely(!xsk_buff_can_alloc(rq->umem, MLX5_MPWRQ_PAGES_PER_WQE))) {
 		err = -ENOMEM;
 		goto err;
 	}
@@ -1038,12 +1047,23 @@  struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
 	return skb;
 }
 
+static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom,
+				u32 len, struct xdp_buff *xdp)
+{
+	xdp->data_hard_start = va;
+	xdp_set_data_meta_invalid(xdp);
+	xdp->data = va + headroom;
+	xdp->data_end = xdp->data + len;
+	xdp->rxq = &rq->xdp_rxq;
+}
+
 struct sk_buff *
 mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 			  struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
 {
 	struct mlx5e_dma_info *di = wi->di;
 	u16 rx_headroom = rq->buff.headroom;
+	struct xdp_buff xdp;
 	struct sk_buff *skb;
 	void *va, *data;
 	bool consumed;
@@ -1059,11 +1079,13 @@  mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 	prefetch(data);
 
 	rcu_read_lock();
-	consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, false);
+	mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
+	consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt, &xdp);
 	rcu_read_unlock();
 	if (consumed)
 		return NULL; /* page/packet was consumed by XDP */
 
+	rx_headroom = xdp.data - xdp.data_hard_start;
 	skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt);
 	if (unlikely(!skb))
 		return NULL;
@@ -1336,6 +1358,7 @@  mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 	struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
 	u16 rx_headroom = rq->buff.headroom;
 	u32 cqe_bcnt32 = cqe_bcnt;
+	struct xdp_buff xdp;
 	struct sk_buff *skb;
 	void *va, *data;
 	u32 frag_size;
@@ -1357,7 +1380,8 @@  mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 	prefetch(data);
 
 	rcu_read_lock();
-	consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, false);
+	mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt32, &xdp);
+	consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt32, &xdp);
 	rcu_read_unlock();
 	if (consumed) {
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
@@ -1365,6 +1389,7 @@  mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 		return NULL; /* page/packet was consumed by XDP */
 	}
 
+	rx_headroom = xdp.data - xdp.data_hard_start;
 	skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32);
 	if (unlikely(!skb))
 		return NULL;