diff mbox series

[next-queue,v7] igb: add XDP support

Message ID 20200902203222.185141-1-anthony.l.nguyen@intel.com
State Accepted
Delegated to: Anthony Nguyen
Headers show
Series [next-queue,v7] igb: add XDP support | expand

Commit Message

Tony Nguyen Sept. 2, 2020, 8:32 p.m. UTC
From: Sven Auhagen <Sven.Auhagen@voleatech.de>

Add XDP support to the IGB driver.
The implementation follows the IXGBE XDP implementation
closely, and I used the following patches as a basis:

1. commit 924708081629 ("ixgbe: add XDP support for pass and drop actions")
2. commit 33fdc82f0883 ("ixgbe: add support for XDP_TX action")
3. commit ed93a3987128 ("ixgbe: tweak page counting for XDP_REDIRECT")

Due to the hardware constraints of the devices using the
IGB driver we must share the TX queues with XDP which
means locking the TX queue for XDP.

I ran tests on an older device to get better numbers.
Test machine:

Intel(R) Atom(TM) CPU C2338 @ 1.74GHz (2 Cores)
2x Intel I211

Routing Original Driver Network Stack: 382 Kpps

Routing XDP Redirect (xdp_fwd_kern): 1.48 Mpps
XDP Drop: 1.48 Mpps

Using XDP we can achieve line rate forwarding even on
an older Intel Atom CPU.

Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>
---
v7:
 * Fix issue with applying to dev-queue branch; utilize net_prefetch()
 * Fix build issue; remove XDP_QUERY_PROG
 * Replace fallthrough comment with fallthrough macro to resolve
checkpatch warning
 * Fix reverse Christmas tree

v6:
* igb_xdp_ring_update_tail changed to static
* bump to 5.8

v5: resubmission with function names in patch

v4:
* use HARD_TX_LOCK in XDP xmit
* do not pass adapter to igb_setup_rx_resources
* account for timestamp in frame size

v3: igb_xdp_ring_update_tail should be static

v2: original did not apply to my dev-queue branch, so fixed the
    conflicts in the patch

 drivers/net/ethernet/intel/igb/igb.h         |  81 +++-
 drivers/net/ethernet/intel/igb/igb_ethtool.c |   4 +
 drivers/net/ethernet/intel/igb/igb_main.c    | 433 +++++++++++++++++--
 3 files changed, 482 insertions(+), 36 deletions(-)

Comments

Sven Auhagen Sept. 21, 2020, 9:58 a.m. UTC | #1
On Wed, Sep 02, 2020 at 01:32:22PM -0700, Tony Nguyen wrote:
> From: Sven Auhagen <Sven.Auhagen@voleatech.de>
> 
> Add XDP support to the IGB driver.
> The implementation follows the IXGBE XDP implementation
> closely and I used the following patches as basis:
> 
> 1. commit 924708081629 ("ixgbe: add XDP support for pass and drop actions")
> 2. commit 33fdc82f0883 ("ixgbe: add support for XDP_TX action")
> 3. commit ed93a3987128 ("ixgbe: tweak page counting for XDP_REDIRECT")
> 
> Due to the hardware constraints of the devices using the
> IGB driver we must share the TX queues with XDP which
> means locking the TX queue for XDP.
> 
> I ran tests on an older device to get better numbers.
> Test machine:
> 
> Intel(R) Atom(TM) CPU C2338 @ 1.74GHz (2 Cores)
> 2x Intel I211
> 
> Routing Original Driver Network Stack: 382 Kpps
> 
> Routing XDP Redirect (xdp_fwd_kern): 1.48 Mpps
> XDP Drop: 1.48 Mpps
> 
> Using XDP we can achieve line rate forwarding even on
> an older Intel Atom CPU.
> 
> Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>

Hello Tony,

thanks for the patch update.
What is the process to get this accepted?

Best
Sven

> ---
> v7:
>  * Fix issue with applying to dev-queue branch; utilize net_prefetch()
>  * Fix build issue; remove XDP_QUERY_PROG
>  * Replace fallthrough comment with fallthrough macro to resolve
> checkpatch warning
>  * Fix reverse Christmas tree
> 
> v6:
> * igb_xdp_ring_update_tail changed to static
> * bump to 5.8
> 
> v5: resubmission with function names in patch
> 
> v4:
> * use HARD_TX_LOCK in XDP xmit
> * do not pass adapter to igb_setup_rx_resources
> * account for timestamp in frame size
> 
> v3: igb_xdp_ring_update_tail should be static
> 
> v2: original did not apply to my dev-queue branch, so fixed the
>     conflicts in the patch
> 
>  drivers/net/ethernet/intel/igb/igb.h         |  81 +++-
>  drivers/net/ethernet/intel/igb/igb_ethtool.c |   4 +
>  drivers/net/ethernet/intel/igb/igb_main.c    | 433 +++++++++++++++++--
>  3 files changed, 482 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
> index 2f015b60a995..fa6ff1a64fc0 100644
> --- a/drivers/net/ethernet/intel/igb/igb.h
> +++ b/drivers/net/ethernet/intel/igb/igb.h
> @@ -19,6 +19,8 @@
>  #include <linux/pci.h>
>  #include <linux/mdio.h>
>  
> +#include <net/xdp.h>
> +
>  struct igb_adapter;
>  
>  #define E1000_PCS_CFG_IGN_SD	1
> @@ -79,6 +81,12 @@ struct igb_adapter;
>  #define IGB_I210_RX_LATENCY_100		2213
>  #define IGB_I210_RX_LATENCY_1000	448
>  
> +/* XDP */
> +#define IGB_XDP_PASS		0
> +#define IGB_XDP_CONSUMED	BIT(0)
> +#define IGB_XDP_TX		BIT(1)
> +#define IGB_XDP_REDIR		BIT(2)
> +
>  struct vf_data_storage {
>  	unsigned char vf_mac_addresses[ETH_ALEN];
>  	u16 vf_mc_hashes[IGB_MAX_VF_MC_ENTRIES];
> @@ -132,17 +140,63 @@ struct vf_mac_filter {
>  
>  /* Supported Rx Buffer Sizes */
>  #define IGB_RXBUFFER_256	256
> +#define IGB_RXBUFFER_1536	1536
>  #define IGB_RXBUFFER_2048	2048
>  #define IGB_RXBUFFER_3072	3072
>  #define IGB_RX_HDR_LEN		IGB_RXBUFFER_256
>  #define IGB_TS_HDR_LEN		16
>  
> -#define IGB_SKB_PAD		(NET_SKB_PAD + NET_IP_ALIGN)
> +/* Attempt to maximize the headroom available for incoming frames.  We
> + * use a 2K buffer for receives and need 1536/1534 to store the data for
> + * the frame.  This leaves us with 512 bytes of room.  From that we need
> + * to deduct the space needed for the shared info and the padding needed
> + * to IP align the frame.
> + *
> + * Note: For cache line sizes 256 or larger this value is going to end
> + *	 up negative.  In these cases we should fall back to the 3K
> + *	 buffers.
> + */
>  #if (PAGE_SIZE < 8192)
> -#define IGB_MAX_FRAME_BUILD_SKB \
> -	(SKB_WITH_OVERHEAD(IGB_RXBUFFER_2048) - IGB_SKB_PAD - IGB_TS_HDR_LEN)
> +#define IGB_MAX_FRAME_BUILD_SKB (IGB_RXBUFFER_1536 - NET_IP_ALIGN)
> +#define IGB_2K_TOO_SMALL_WITH_PADDING \
> +((NET_SKB_PAD + IGB_TS_HDR_LEN + IGB_RXBUFFER_1536) > \
> +SKB_WITH_OVERHEAD(IGB_RXBUFFER_2048))
> +
> +static inline int igb_compute_pad(int rx_buf_len)
> +{
> +	int page_size, pad_size;
> +
> +	page_size = ALIGN(rx_buf_len, PAGE_SIZE / 2);
> +	pad_size = SKB_WITH_OVERHEAD(page_size) - rx_buf_len;
> +
> +	return pad_size;
> +}
> +
> +static inline int igb_skb_pad(void)
> +{
> +	int rx_buf_len;
> +
> +	/* If a 2K buffer cannot handle a standard Ethernet frame then
> +	 * optimize padding for a 3K buffer instead of a 1.5K buffer.
> +	 *
> +	 * For a 3K buffer we need to add enough padding to allow for
> +	 * tailroom due to NET_IP_ALIGN possibly shifting us out of
> +	 * cache-line alignment.
> +	 */
> +	if (IGB_2K_TOO_SMALL_WITH_PADDING)
> +		rx_buf_len = IGB_RXBUFFER_3072 + SKB_DATA_ALIGN(NET_IP_ALIGN);
> +	else
> +		rx_buf_len = IGB_RXBUFFER_1536;
> +
> +	/* if needed make room for NET_IP_ALIGN */
> +	rx_buf_len -= NET_IP_ALIGN;
> +
> +	return igb_compute_pad(rx_buf_len);
> +}
> +
> +#define IGB_SKB_PAD	igb_skb_pad()
>  #else
> -#define IGB_MAX_FRAME_BUILD_SKB (IGB_RXBUFFER_2048 - IGB_TS_HDR_LEN)
> +#define IGB_SKB_PAD	(NET_SKB_PAD + NET_IP_ALIGN)
>  #endif
>  
>  /* How many Rx Buffers do we bundle into one write to the hardware ? */
> @@ -194,13 +248,22 @@ enum igb_tx_flags {
>  #define IGB_SFF_ADDRESSING_MODE		0x4
>  #define IGB_SFF_8472_UNSUP		0x00
>  
> +enum igb_tx_buf_type {
> +	IGB_TYPE_SKB = 0,
> +	IGB_TYPE_XDP,
> +};
> +
>  /* wrapper around a pointer to a socket buffer,
>   * so a DMA handle can be stored along with the buffer
>   */
>  struct igb_tx_buffer {
>  	union e1000_adv_tx_desc *next_to_watch;
>  	unsigned long time_stamp;
> -	struct sk_buff *skb;
> +	enum igb_tx_buf_type type;
> +	union {
> +		struct sk_buff *skb;
> +		struct xdp_frame *xdpf;
> +	};
>  	unsigned int bytecount;
>  	u16 gso_segs;
>  	__be16 protocol;
> @@ -248,6 +311,7 @@ struct igb_ring_container {
>  struct igb_ring {
>  	struct igb_q_vector *q_vector;	/* backlink to q_vector */
>  	struct net_device *netdev;	/* back pointer to net_device */
> +	struct bpf_prog *xdp_prog;
>  	struct device *dev;		/* device pointer for dma mapping */
>  	union {				/* array of buffer info structs */
>  		struct igb_tx_buffer *tx_buffer_info;
> @@ -288,6 +352,7 @@ struct igb_ring {
>  			struct u64_stats_sync rx_syncp;
>  		};
>  	};
> +	struct xdp_rxq_info xdp_rxq;
>  } ____cacheline_internodealigned_in_smp;
>  
>  struct igb_q_vector {
> @@ -339,7 +404,7 @@ static inline unsigned int igb_rx_bufsz(struct igb_ring *ring)
>  		return IGB_RXBUFFER_3072;
>  
>  	if (ring_uses_build_skb(ring))
> -		return IGB_MAX_FRAME_BUILD_SKB + IGB_TS_HDR_LEN;
> +		return IGB_MAX_FRAME_BUILD_SKB;
>  #endif
>  	return IGB_RXBUFFER_2048;
>  }
> @@ -467,6 +532,7 @@ struct igb_adapter {
>  	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
>  
>  	struct net_device *netdev;
> +	struct bpf_prog *xdp_prog;
>  
>  	unsigned long state;
>  	unsigned int flags;
> @@ -643,6 +709,9 @@ enum igb_boards {
>  
>  extern char igb_driver_name[];
>  
> +int igb_xmit_xdp_ring(struct igb_adapter *adapter,
> +		      struct igb_ring *ring,
> +		      struct xdp_frame *xdpf);
>  int igb_open(struct net_device *netdev);
>  int igb_close(struct net_device *netdev);
>  int igb_up(struct igb_adapter *);
> diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
> index 6e8231c1ddf0..28baf203459a 100644
> --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
> +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
> @@ -961,6 +961,10 @@ static int igb_set_ringparam(struct net_device *netdev,
>  			memcpy(&temp_ring[i], adapter->rx_ring[i],
>  			       sizeof(struct igb_ring));
>  
> +			/* Clear copied XDP RX-queue info */
> +			memset(&temp_ring[i].xdp_rxq, 0,
> +			       sizeof(temp_ring[i].xdp_rxq));
> +
>  			temp_ring[i].count = new_rx_count;
>  			err = igb_setup_rx_resources(&temp_ring[i]);
>  			if (err) {
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
> index 698bb6a4b088..73635a012f4a 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -30,6 +30,8 @@
>  #include <linux/if_ether.h>
>  #include <linux/aer.h>
>  #include <linux/prefetch.h>
> +#include <linux/bpf.h>
> +#include <linux/bpf_trace.h>
>  #include <linux/pm_runtime.h>
>  #include <linux/etherdevice.h>
>  #ifdef CONFIG_IGB_DCA
> @@ -2825,6 +2827,147 @@ static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
>  	}
>  }
>  
> +static int igb_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
> +{
> +	int i, frame_size = dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
> +	struct igb_adapter *adapter = netdev_priv(dev);
> +	bool running = netif_running(dev);
> +	struct bpf_prog *old_prog;
> +	bool need_reset;
> +
> +	/* verify igb ring attributes are sufficient for XDP */
> +	for (i = 0; i < adapter->num_rx_queues; i++) {
> +		struct igb_ring *ring = adapter->rx_ring[i];
> +
> +		if (frame_size > igb_rx_bufsz(ring))
> +			return -EINVAL;
> +	}
> +
> +	old_prog = xchg(&adapter->xdp_prog, prog);
> +	need_reset = (!!prog != !!old_prog);
> +
> +	/* device is up and bpf is added/removed, must setup the RX queues */
> +	if (need_reset && running) {
> +		igb_close(dev);
> +	} else {
> +		for (i = 0; i < adapter->num_rx_queues; i++)
> +			(void)xchg(&adapter->rx_ring[i]->xdp_prog,
> +			    adapter->xdp_prog);
> +	}
> +
> +	if (old_prog)
> +		bpf_prog_put(old_prog);
> +
> +	/* bpf is just replaced, RXQ and MTU are already setup */
> +	if (!need_reset)
> +		return 0;
> +
> +	if (running)
> +		igb_open(dev);
> +
> +	return 0;
> +}
> +
> +static int igb_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> +{
> +	switch (xdp->command) {
> +	case XDP_SETUP_PROG:
> +		return igb_xdp_setup(dev, xdp->prog);
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static void igb_xdp_ring_update_tail(struct igb_ring *ring)
> +{
> +	/* Force memory writes to complete before letting h/w know there
> +	 * are new descriptors to fetch.
> +	 */
> +	wmb();
> +	writel(ring->next_to_use, ring->tail);
> +}
> +
> +static inline struct igb_ring *igb_xdp_tx_queue_mapping(struct igb_adapter *adapter)
> +{
> +	unsigned int r_idx = smp_processor_id();
> +
> +	if (r_idx >= adapter->num_tx_queues)
> +		r_idx = r_idx % adapter->num_tx_queues;
> +
> +	return adapter->tx_ring[r_idx];
> +}
> +
> +static int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp)
> +{
> +	struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp);
> +	int cpu = smp_processor_id();
> +	struct igb_ring *tx_ring;
> +	struct netdev_queue *nq;
> +	u32 ret;
> +
> +	if (unlikely(!xdpf))
> +		return IGB_XDP_CONSUMED;
> +
> +	/* During program transitions its possible adapter->xdp_prog is assigned
> +	 * but ring has not been configured yet. In this case simply abort xmit.
> +	 */
> +	tx_ring = adapter->xdp_prog ? igb_xdp_tx_queue_mapping(adapter) : NULL;
> +	if (unlikely(!tx_ring))
> +		return -ENXIO;
> +
> +	nq = txring_txq(tx_ring);
> +	__netif_tx_lock(nq, cpu);
> +	ret = igb_xmit_xdp_ring(adapter, tx_ring, xdpf);
> +	__netif_tx_unlock(nq);
> +
> +	return ret;
> +}
> +
> +static int igb_xdp_xmit(struct net_device *dev, int n,
> +			struct xdp_frame **frames, u32 flags)
> +{
> +	struct igb_adapter *adapter = netdev_priv(dev);
> +	int cpu = smp_processor_id();
> +	struct igb_ring *tx_ring;
> +	struct netdev_queue *nq;
> +	int drops = 0;
> +	int i;
> +
> +	if (unlikely(test_bit(__IGB_DOWN, &adapter->state)))
> +		return -ENETDOWN;
> +
> +	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
> +		return -EINVAL;
> +
> +	/* During program transitions its possible adapter->xdp_prog is assigned
> +	 * but ring has not been configured yet. In this case simply abort xmit.
> +	 */
> +	tx_ring = adapter->xdp_prog ? igb_xdp_tx_queue_mapping(adapter) : NULL;
> +	if (unlikely(!tx_ring))
> +		return -ENXIO;
> +
> +	nq = txring_txq(tx_ring);
> +	__netif_tx_lock(nq, cpu);
> +
> +	for (i = 0; i < n; i++) {
> +		struct xdp_frame *xdpf = frames[i];
> +		int err;
> +
> +		err = igb_xmit_xdp_ring(adapter, tx_ring, xdpf);
> +		if (err != IGB_XDP_TX) {
> +			xdp_return_frame_rx_napi(xdpf);
> +			drops++;
> +		}
> +	}
> +
> +	__netif_tx_unlock(nq);
> +
> +	if (unlikely(flags & XDP_XMIT_FLUSH))
> +		igb_xdp_ring_update_tail(tx_ring);
> +
> +	return n - drops;
> +}
> +
>  static const struct net_device_ops igb_netdev_ops = {
>  	.ndo_open		= igb_open,
>  	.ndo_stop		= igb_close,
> @@ -2849,6 +2992,8 @@ static const struct net_device_ops igb_netdev_ops = {
>  	.ndo_fdb_add		= igb_ndo_fdb_add,
>  	.ndo_features_check	= igb_features_check,
>  	.ndo_setup_tc		= igb_setup_tc,
> +	.ndo_bpf		= igb_xdp,
> +	.ndo_xdp_xmit		= igb_xdp_xmit,
>  };
>  
>  /**
> @@ -4179,6 +4324,7 @@ static void igb_configure_tx(struct igb_adapter *adapter)
>   **/
>  int igb_setup_rx_resources(struct igb_ring *rx_ring)
>  {
> +	struct igb_adapter *adapter = netdev_priv(rx_ring->netdev);
>  	struct device *dev = rx_ring->dev;
>  	int size;
>  
> @@ -4201,6 +4347,13 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
>  	rx_ring->next_to_clean = 0;
>  	rx_ring->next_to_use = 0;
>  
> +	rx_ring->xdp_prog = adapter->xdp_prog;
> +
> +	/* XDP RX-queue info */
> +	if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
> +			     rx_ring->queue_index) < 0)
> +		goto err;
> +
>  	return 0;
>  
>  err:
> @@ -4505,6 +4658,10 @@ void igb_configure_rx_ring(struct igb_adapter *adapter,
>  	int reg_idx = ring->reg_idx;
>  	u32 rxdctl = 0;
>  
> +	xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
> +	WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
> +					   MEM_TYPE_PAGE_SHARED, NULL));
> +
>  	/* disable the queue */
>  	wr32(E1000_RXDCTL(reg_idx), 0);
>  
> @@ -4709,6 +4866,8 @@ void igb_free_rx_resources(struct igb_ring *rx_ring)
>  {
>  	igb_clean_rx_ring(rx_ring);
>  
> +	rx_ring->xdp_prog = NULL;
> +	xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
>  	vfree(rx_ring->rx_buffer_info);
>  	rx_ring->rx_buffer_info = NULL;
>  
> @@ -6078,6 +6237,80 @@ static int igb_tx_map(struct igb_ring *tx_ring,
>  	return -1;
>  }
>  
> +int igb_xmit_xdp_ring(struct igb_adapter *adapter,
> +		      struct igb_ring *tx_ring,
> +		      struct xdp_frame *xdpf)
> +{
> +	union e1000_adv_tx_desc *tx_desc;
> +	u32 len, cmd_type, olinfo_status;
> +	struct igb_tx_buffer *tx_buffer;
> +	dma_addr_t dma;
> +	u16 i;
> +
> +	len = xdpf->len;
> +
> +	if (unlikely(!igb_desc_unused(tx_ring)))
> +		return IGB_XDP_CONSUMED;
> +
> +	dma = dma_map_single(tx_ring->dev, xdpf->data, len, DMA_TO_DEVICE);
> +	if (dma_mapping_error(tx_ring->dev, dma))
> +		return IGB_XDP_CONSUMED;
> +
> +	/* record the location of the first descriptor for this packet */
> +	tx_buffer = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
> +	tx_buffer->bytecount = len;
> +	tx_buffer->gso_segs = 1;
> +	tx_buffer->protocol = 0;
> +
> +	i = tx_ring->next_to_use;
> +	tx_desc = IGB_TX_DESC(tx_ring, i);
> +
> +	dma_unmap_len_set(tx_buffer, len, len);
> +	dma_unmap_addr_set(tx_buffer, dma, dma);
> +	tx_buffer->type = IGB_TYPE_XDP;
> +	tx_buffer->xdpf = xdpf;
> +
> +	tx_desc->read.buffer_addr = cpu_to_le64(dma);
> +
> +	/* put descriptor type bits */
> +	cmd_type = E1000_ADVTXD_DTYP_DATA |
> +		       E1000_ADVTXD_DCMD_DEXT |
> +		       E1000_ADVTXD_DCMD_IFCS;
> +	cmd_type |= len | IGB_TXD_DCMD;
> +	tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
> +
> +	olinfo_status = cpu_to_le32(len << E1000_ADVTXD_PAYLEN_SHIFT);
> +	/* 82575 requires a unique index per ring */
> +	if (test_bit(IGB_RING_FLAG_TX_CTX_IDX, &tx_ring->flags))
> +		olinfo_status |= tx_ring->reg_idx << 4;
> +
> +	tx_desc->read.olinfo_status = olinfo_status;
> +
> +	netdev_tx_sent_queue(txring_txq(tx_ring), tx_buffer->bytecount);
> +
> +	/* set the timestamp */
> +	tx_buffer->time_stamp = jiffies;
> +
> +	/* Avoid any potential race with xdp_xmit and cleanup */
> +	smp_wmb();
> +
> +	/* set next_to_watch value indicating a packet is present */
> +	i++;
> +	if (i == tx_ring->count)
> +		i = 0;
> +
> +	tx_buffer->next_to_watch = tx_desc;
> +	tx_ring->next_to_use = i;
> +
> +	/* Make sure there is space in the ring for the next send. */
> +	igb_maybe_stop_tx(tx_ring, DESC_NEEDED);
> +
> +	if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more())
> +		writel(i, tx_ring->tail);
> +
> +	return IGB_XDP_TX;
> +}
> +
>  netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
>  				struct igb_ring *tx_ring)
>  {
> @@ -6106,6 +6339,7 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
>  
>  	/* record the location of the first descriptor for this packet */
>  	first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
> +	first->type = IGB_TYPE_SKB;
>  	first->skb = skb;
>  	first->bytecount = skb->len;
>  	first->gso_segs = 1;
> @@ -6257,6 +6491,19 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu)
>  	struct igb_adapter *adapter = netdev_priv(netdev);
>  	int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
>  
> +	if (adapter->xdp_prog) {
> +		int i;
> +
> +		for (i = 0; i < adapter->num_rx_queues; i++) {
> +			struct igb_ring *ring = adapter->rx_ring[i];
> +
> +			if (max_frame > igb_rx_bufsz(ring)) {
> +				netdev_warn(adapter->netdev, "Requested MTU size is not supported with XDP\n");
> +				return -EINVAL;
> +			}
> +		}
> +	}
> +
>  	/* adjust max frame to be at least the size of a standard frame */
>  	if (max_frame < (ETH_FRAME_LEN + ETH_FCS_LEN))
>  		max_frame = ETH_FRAME_LEN + ETH_FCS_LEN;
> @@ -7810,7 +8057,10 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
>  		total_packets += tx_buffer->gso_segs;
>  
>  		/* free the skb */
> -		napi_consume_skb(tx_buffer->skb, napi_budget);
> +		if (tx_buffer->type == IGB_TYPE_SKB)
> +			napi_consume_skb(tx_buffer->skb, napi_budget);
> +		else
> +			xdp_return_frame(tx_buffer->xdpf);
>  
>  		/* unmap skb header data */
>  		dma_unmap_single(tx_ring->dev,
> @@ -7994,8 +8244,8 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
>  	 * the pagecnt_bias and page count so that we fully restock the
>  	 * number of references the driver holds.
>  	 */
> -	if (unlikely(!pagecnt_bias)) {
> -		page_ref_add(page, USHRT_MAX);
> +	if (unlikely(pagecnt_bias == 1)) {
> +		page_ref_add(page, USHRT_MAX - 1);
>  		rx_buffer->pagecnt_bias = USHRT_MAX;
>  	}
>  
> @@ -8034,20 +8284,21 @@ static void igb_add_rx_frag(struct igb_ring *rx_ring,
>  
>  static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
>  					 struct igb_rx_buffer *rx_buffer,
> -					 union e1000_adv_rx_desc *rx_desc,
> -					 unsigned int size)
> +					 struct xdp_buff *xdp,
> +					 union e1000_adv_rx_desc *rx_desc)
>  {
> -	void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
>  #if (PAGE_SIZE < 8192)
>  	unsigned int truesize = igb_rx_pg_size(rx_ring) / 2;
>  #else
> -	unsigned int truesize = SKB_DATA_ALIGN(size);
> +	unsigned int truesize = SKB_DATA_ALIGN(xdp->data_end -
> +					       xdp->data_hard_start);
>  #endif
> +	unsigned int size = xdp->data_end - xdp->data;
>  	unsigned int headlen;
>  	struct sk_buff *skb;
>  
>  	/* prefetch first cache line of first page */
> -	net_prefetch(va);
> +	net_prefetch(xdp->data);
>  
>  	/* allocate a skb to store the frags */
>  	skb = napi_alloc_skb(&rx_ring->q_vector->napi, IGB_RX_HDR_LEN);
> @@ -8055,24 +8306,24 @@ static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
>  		return NULL;
>  
>  	if (unlikely(igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP))) {
> -		igb_ptp_rx_pktstamp(rx_ring->q_vector, va, skb);
> -		va += IGB_TS_HDR_LEN;
> +		igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb);
> +		xdp->data += IGB_TS_HDR_LEN;
>  		size -= IGB_TS_HDR_LEN;
>  	}
>  
>  	/* Determine available headroom for copy */
>  	headlen = size;
>  	if (headlen > IGB_RX_HDR_LEN)
> -		headlen = eth_get_headlen(skb->dev, va, IGB_RX_HDR_LEN);
> +		headlen = eth_get_headlen(skb->dev, xdp->data, IGB_RX_HDR_LEN);
>  
>  	/* align pull length to size of long to optimize memcpy performance */
> -	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
> +	memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen, sizeof(long)));
>  
>  	/* update all of the pointers */
>  	size -= headlen;
>  	if (size) {
>  		skb_add_rx_frag(skb, 0, rx_buffer->page,
> -				(va + headlen) - page_address(rx_buffer->page),
> +				(xdp->data + headlen) - page_address(rx_buffer->page),
>  				size, truesize);
>  #if (PAGE_SIZE < 8192)
>  		rx_buffer->page_offset ^= truesize;
> @@ -8088,29 +8339,29 @@ static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
>  
>  static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring,
>  				     struct igb_rx_buffer *rx_buffer,
> -				     union e1000_adv_rx_desc *rx_desc,
> -				     unsigned int size)
> +				     struct xdp_buff *xdp,
> +				     union e1000_adv_rx_desc *rx_desc)
>  {
> -	void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
>  #if (PAGE_SIZE < 8192)
>  	unsigned int truesize = igb_rx_pg_size(rx_ring) / 2;
>  #else
>  	unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
> -				SKB_DATA_ALIGN(IGB_SKB_PAD + size);
> +				SKB_DATA_ALIGN(xdp->data_end -
> +					       xdp->data_hard_start);
>  #endif
>  	struct sk_buff *skb;
>  
>  	/* prefetch first cache line of first page */
> -	net_prefetch(va);
> +	net_prefetch(xdp->data_meta);
>  
>  	/* build an skb around the page buffer */
> -	skb = build_skb(va - IGB_SKB_PAD, truesize);
> +	skb = build_skb(xdp->data_hard_start, truesize);
>  	if (unlikely(!skb))
>  		return NULL;
>  
>  	/* update pointers within the skb to store the data */
> -	skb_reserve(skb, IGB_SKB_PAD);
> -	__skb_put(skb, size);
> +	skb_reserve(skb, xdp->data - xdp->data_hard_start);
> +	__skb_put(skb, xdp->data_end - xdp->data);
>  
>  	/* pull timestamp out of packet data */
>  	if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
> @@ -8128,6 +8379,79 @@ static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring,
>  	return skb;
>  }
>  
> +static struct sk_buff *igb_run_xdp(struct igb_adapter *adapter,
> +				   struct igb_ring *rx_ring,
> +				   struct xdp_buff *xdp)
> +{
> +	int err, result = IGB_XDP_PASS;
> +	struct bpf_prog *xdp_prog;
> +	u32 act;
> +
> +	rcu_read_lock();
> +	xdp_prog = READ_ONCE(rx_ring->xdp_prog);
> +
> +	if (!xdp_prog)
> +		goto xdp_out;
> +
> +	prefetchw(xdp->data_hard_start); /* xdp_frame write */
> +
> +	act = bpf_prog_run_xdp(xdp_prog, xdp);
> +	switch (act) {
> +	case XDP_PASS:
> +		break;
> +	case XDP_TX:
> +		result = igb_xdp_xmit_back(adapter, xdp);
> +		break;
> +	case XDP_REDIRECT:
> +		err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog);
> +		if (!err)
> +			result = IGB_XDP_REDIR;
> +		else
> +			result = IGB_XDP_CONSUMED;
> +		break;
> +	default:
> +		bpf_warn_invalid_xdp_action(act);
> +		fallthrough;
> +	case XDP_ABORTED:
> +		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
> +		fallthrough;
> +	case XDP_DROP:
> +		result = IGB_XDP_CONSUMED;
> +		break;
> +	}
> +xdp_out:
> +	rcu_read_unlock();
> +	return ERR_PTR(-result);
> +}
> +
> +static unsigned int igb_rx_frame_truesize(struct igb_ring *rx_ring,
> +					  unsigned int size)
> +{
> +	unsigned int truesize;
> +
> +#if (PAGE_SIZE < 8192)
> +	truesize = igb_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */
> +#else
> +	truesize = ring_uses_build_skb(rx_ring) ?
> +		SKB_DATA_ALIGN(IGB_SKB_PAD + size) +
> +		SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
> +		SKB_DATA_ALIGN(size);
> +#endif
> +	return truesize;
> +}
> +
> +static void igb_rx_buffer_flip(struct igb_ring *rx_ring,
> +			       struct igb_rx_buffer *rx_buffer,
> +			       unsigned int size)
> +{
> +	unsigned int truesize = igb_rx_frame_truesize(rx_ring, size);
> +#if (PAGE_SIZE < 8192)
> +	rx_buffer->page_offset ^= truesize;
> +#else
> +	rx_buffer->page_offset += truesize;
> +#endif
> +}
> +
>  static inline void igb_rx_checksum(struct igb_ring *ring,
>  				   union e1000_adv_rx_desc *rx_desc,
>  				   struct sk_buff *skb)
> @@ -8224,6 +8548,10 @@ static bool igb_cleanup_headers(struct igb_ring *rx_ring,
>  				union e1000_adv_rx_desc *rx_desc,
>  				struct sk_buff *skb)
>  {
> +	/* XDP packets use error pointer so abort at this point */
> +	if (IS_ERR(skb))
> +		return true;
> +
>  	if (unlikely((igb_test_staterr(rx_desc,
>  				       E1000_RXDEXT_ERR_FRAME_ERR_MASK)))) {
>  		struct net_device *netdev = rx_ring->netdev;
> @@ -8282,6 +8610,11 @@ static void igb_process_skb_fields(struct igb_ring *rx_ring,
>  	skb->protocol = eth_type_trans(skb, rx_ring->netdev);
>  }
>  
> +static inline unsigned int igb_rx_offset(struct igb_ring *rx_ring)
> +{
> +	return ring_uses_build_skb(rx_ring) ? IGB_SKB_PAD : 0;
> +}
> +
>  static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
>  					       const unsigned int size)
>  {
> @@ -8325,10 +8658,20 @@ static void igb_put_rx_buffer(struct igb_ring *rx_ring,
>  
>  static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>  {
> +	struct igb_adapter *adapter = q_vector->adapter;
>  	struct igb_ring *rx_ring = q_vector->rx.ring;
>  	struct sk_buff *skb = rx_ring->skb;
>  	unsigned int total_bytes = 0, total_packets = 0;
>  	u16 cleaned_count = igb_desc_unused(rx_ring);
> +	unsigned int xdp_xmit = 0;
> +	struct xdp_buff xdp;
> +
> +	xdp.rxq = &rx_ring->xdp_rxq;
> +
> +	/* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
> +#if (PAGE_SIZE < 8192)
> +	xdp.frame_sz = igb_rx_frame_truesize(rx_ring, 0);
> +#endif
>  
>  	while (likely(total_packets < budget)) {
>  		union e1000_adv_rx_desc *rx_desc;
> @@ -8355,13 +8698,38 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>  		rx_buffer = igb_get_rx_buffer(rx_ring, size);
>  
>  		/* retrieve a buffer from the ring */
> -		if (skb)
> +		if (!skb) {
> +			xdp.data = page_address(rx_buffer->page) +
> +				   rx_buffer->page_offset;
> +			xdp.data_meta = xdp.data;
> +			xdp.data_hard_start = xdp.data -
> +					      igb_rx_offset(rx_ring);
> +			xdp.data_end = xdp.data + size;
> +#if (PAGE_SIZE > 4096)
> +			/* At larger PAGE_SIZE, frame_sz depend on len size */
> +			xdp.frame_sz = igb_rx_frame_truesize(rx_ring, size);
> +#endif
> +			skb = igb_run_xdp(adapter, rx_ring, &xdp);
> +		}
> +
> +		if (IS_ERR(skb)) {
> +			unsigned int xdp_res = -PTR_ERR(skb);
> +
> +			if (xdp_res & (IGB_XDP_TX | IGB_XDP_REDIR)) {
> +				xdp_xmit |= xdp_res;
> +				igb_rx_buffer_flip(rx_ring, rx_buffer, size);
> +			} else {
> +				rx_buffer->pagecnt_bias++;
> +			}
> +			total_packets++;
> +			total_bytes += size;
> +		} else if (skb)
>  			igb_add_rx_frag(rx_ring, rx_buffer, skb, size);
>  		else if (ring_uses_build_skb(rx_ring))
> -			skb = igb_build_skb(rx_ring, rx_buffer, rx_desc, size);
> +			skb = igb_build_skb(rx_ring, rx_buffer, &xdp, rx_desc);
>  		else
>  			skb = igb_construct_skb(rx_ring, rx_buffer,
> -						rx_desc, size);
> +						&xdp, rx_desc);
>  
>  		/* exit if we failed to retrieve a buffer */
>  		if (!skb) {
> @@ -8401,6 +8769,15 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>  	/* place incomplete frames back on ring for completion */
>  	rx_ring->skb = skb;
>  
> +	if (xdp_xmit & IGB_XDP_REDIR)
> +		xdp_do_flush_map();
> +
> +	if (xdp_xmit & IGB_XDP_TX) {
> +		struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter);
> +
> +		igb_xdp_ring_update_tail(tx_ring);
> +	}
> +
>  	u64_stats_update_begin(&rx_ring->rx_syncp);
>  	rx_ring->rx_stats.packets += total_packets;
>  	rx_ring->rx_stats.bytes += total_bytes;
> @@ -8414,11 +8791,6 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>  	return total_packets;
>  }
>  
> -static inline unsigned int igb_rx_offset(struct igb_ring *rx_ring)
> -{
> -	return ring_uses_build_skb(rx_ring) ? IGB_SKB_PAD : 0;
> -}
> -
>  static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
>  				  struct igb_rx_buffer *bi)
>  {
> @@ -8455,7 +8827,8 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
>  	bi->dma = dma;
>  	bi->page = page;
>  	bi->page_offset = igb_rx_offset(rx_ring);
> -	bi->pagecnt_bias = 1;
> +	page_ref_add(page, USHRT_MAX - 1);
> +	bi->pagecnt_bias = USHRT_MAX;
>  
>  	return true;
>  }
> -- 
> 2.26.2
>
Tony Nguyen Sept. 21, 2020, 10:21 p.m. UTC | #2
On Mon, 2020-09-21 at 11:58 +0200, Sven Auhagen wrote:
> On Wed, Sep 02, 2020 at 01:32:22PM -0700, Tony Nguyen wrote:
> > From: Sven Auhagen <Sven.Auhagen@voleatech.de>
> > 
> > Add XDP support to the IGB driver.
> > The implementation follows the IXGBE XDP implementation
> > closely and I used the following patches as basis:
> > 
> > 1. commit 924708081629 ("ixgbe: add XDP support for pass and drop
> > actions")
> > 2. commit 33fdc82f0883 ("ixgbe: add support for XDP_TX action")
> > 3. commit ed93a3987128 ("ixgbe: tweak page counting for
> > XDP_REDIRECT")
> > 
> > Due to the hardware constraints of the devices using the
> > IGB driver we must share the TX queues with XDP which
> > means locking the TX queue for XDP.
> > 
> > I ran tests on an older device to get better numbers.
> > Test machine:
> > 
> > Intel(R) Atom(TM) CPU C2338 @ 1.74GHz (2 Cores)
> > 2x Intel I211
> > 
> > Routing Original Driver Network Stack: 382 Kpps
> > 
> > Routing XDP Redirect (xdp_fwd_kern): 1.48 Mpps
> > XDP Drop: 1.48 Mpps
> > 
> > Using XDP we can achieve line rate forwarding even on
> > an older Intel Atom CPU.
> > 
> > Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>
> 
> Hello Tony,
> 
> thanks for the patch update.
> How is the process to get this accepted?
> 
Hi Sven,

I've been waiting for validation to test this. When I get word that
everything is ok, I'll send the patch to net-next along with other 1Gb
changes.

Thanks,
Tony
Penigalapati, Sandeep Sept. 22, 2020, 5:52 a.m. UTC | #3
From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf Of Sven Auhagen
Sent: Monday, September 21, 2020 3:28 PM
To: Nguyen, Anthony L <anthony.l.nguyen@intel.com>
Cc: intel-wired-lan@lists.osuosl.org
Subject: Re: [Intel-wired-lan] [next-queue v7] igb: add XDP support

On Wed, Sep 02, 2020 at 01:32:22PM -0700, Tony Nguyen wrote:
> From: Sven Auhagen <Sven.Auhagen@voleatech.de>
> 
> Add XDP support to the IGB driver.
> The implementation follows the IXGBE XDP implementation closely and I 
> used the following patches as basis:
> 
> 1. commit 924708081629 ("ixgbe: add XDP support for pass and drop 
> actions") 2. commit 33fdc82f0883 ("ixgbe: add support for XDP_TX 
> action") 3. commit ed93a3987128 ("ixgbe: tweak page counting for 
> XDP_REDIRECT")
> 
> Due to the hardware constraints of the devices using the IGB driver we 
> must share the TX queues with XDP which means locking the TX queue for 
> XDP.
> 
> I ran tests on an older device to get better numbers.
> Test machine:
> 
> Intel(R) Atom(TM) CPU C2338 @ 1.74GHz (2 Cores) 2x Intel I211
> 
> Routing Original Driver Network Stack: 382 Kpps
> 
> Routing XDP Redirect (xdp_fwd_kern): 1.48 Mpps XDP Drop: 1.48 Mpps
> 
> Using XDP we can achieve line rate forwarding even on an older Intel 
> Atom CPU.
> 
> Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>

Hello Tony,

thanks for the patch update.
How is the process to get this accepted?

Best
Sven

> ---
> v7:
>  * Fix issue with applying to dev-queue branch; utilize net_prefetch()
>  * Fix build issue; remove XDP_QUERY_PROG
>  * Replace fallthrough comment with fallthrough macro to resolve 
> checkpatch warning
>  * Fix reverse Christmas tree
> 
> v6:
> * igb_xdp_ring_update_tail changed to static
> * bump to 5.8
> 
> v5: resubmission with function names in patch
> 
> v4:
> * use HARD_TX_LOCK in XDP xmit
> * do not pass adapter to igb_setup_rx_resources
> * account for timestamp in frame size
> 
> v3: igb_xdp_ring_update_tail should be static
> 
> v2: original did not apply to my dev-queue branch, so fixed the
>     conflicts in the patch
> 
>  drivers/net/ethernet/intel/igb/igb.h         |  81 +++-
>  drivers/net/ethernet/intel/igb/igb_ethtool.c |   4 +
>  drivers/net/ethernet/intel/igb/igb_main.c    | 433 +++++++++++++++++--
>  3 files changed, 482 insertions(+), 36 deletions(-)
> 
Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 2f015b60a995..fa6ff1a64fc0 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -19,6 +19,8 @@ 
 #include <linux/pci.h>
 #include <linux/mdio.h>
 
+#include <net/xdp.h>
+
 struct igb_adapter;
 
 #define E1000_PCS_CFG_IGN_SD	1
@@ -79,6 +81,12 @@  struct igb_adapter;
 #define IGB_I210_RX_LATENCY_100		2213
 #define IGB_I210_RX_LATENCY_1000	448
 
+/* XDP */
+#define IGB_XDP_PASS		0
+#define IGB_XDP_CONSUMED	BIT(0)
+#define IGB_XDP_TX		BIT(1)
+#define IGB_XDP_REDIR		BIT(2)
+
 struct vf_data_storage {
 	unsigned char vf_mac_addresses[ETH_ALEN];
 	u16 vf_mc_hashes[IGB_MAX_VF_MC_ENTRIES];
@@ -132,17 +140,63 @@  struct vf_mac_filter {
 
 /* Supported Rx Buffer Sizes */
 #define IGB_RXBUFFER_256	256
+#define IGB_RXBUFFER_1536	1536
 #define IGB_RXBUFFER_2048	2048
 #define IGB_RXBUFFER_3072	3072
 #define IGB_RX_HDR_LEN		IGB_RXBUFFER_256
 #define IGB_TS_HDR_LEN		16
 
-#define IGB_SKB_PAD		(NET_SKB_PAD + NET_IP_ALIGN)
+/* Attempt to maximize the headroom available for incoming frames.  We
+ * use a 2K buffer for receives and need 1536/1534 to store the data for
+ * the frame.  This leaves us with 512 bytes of room.  From that we need
+ * to deduct the space needed for the shared info and the padding needed
+ * to IP align the frame.
+ *
+ * Note: For cache line sizes 256 or larger this value is going to end
+ *	 up negative.  In these cases we should fall back to the 3K
+ *	 buffers.
+ */
 #if (PAGE_SIZE < 8192)
-#define IGB_MAX_FRAME_BUILD_SKB \
-	(SKB_WITH_OVERHEAD(IGB_RXBUFFER_2048) - IGB_SKB_PAD - IGB_TS_HDR_LEN)
+#define IGB_MAX_FRAME_BUILD_SKB (IGB_RXBUFFER_1536 - NET_IP_ALIGN)
+#define IGB_2K_TOO_SMALL_WITH_PADDING \
+((NET_SKB_PAD + IGB_TS_HDR_LEN + IGB_RXBUFFER_1536) > \
+SKB_WITH_OVERHEAD(IGB_RXBUFFER_2048))
+
+static inline int igb_compute_pad(int rx_buf_len)
+{
+	int page_size, pad_size;
+
+	page_size = ALIGN(rx_buf_len, PAGE_SIZE / 2);
+	pad_size = SKB_WITH_OVERHEAD(page_size) - rx_buf_len;
+
+	return pad_size;
+}
+
+static inline int igb_skb_pad(void)
+{
+	int rx_buf_len;
+
+	/* If a 2K buffer cannot handle a standard Ethernet frame then
+	 * optimize padding for a 3K buffer instead of a 1.5K buffer.
+	 *
+	 * For a 3K buffer we need to add enough padding to allow for
+	 * tailroom due to NET_IP_ALIGN possibly shifting us out of
+	 * cache-line alignment.
+	 */
+	if (IGB_2K_TOO_SMALL_WITH_PADDING)
+		rx_buf_len = IGB_RXBUFFER_3072 + SKB_DATA_ALIGN(NET_IP_ALIGN);
+	else
+		rx_buf_len = IGB_RXBUFFER_1536;
+
+	/* if needed make room for NET_IP_ALIGN */
+	rx_buf_len -= NET_IP_ALIGN;
+
+	return igb_compute_pad(rx_buf_len);
+}
+
+#define IGB_SKB_PAD	igb_skb_pad()
 #else
-#define IGB_MAX_FRAME_BUILD_SKB (IGB_RXBUFFER_2048 - IGB_TS_HDR_LEN)
+#define IGB_SKB_PAD	(NET_SKB_PAD + NET_IP_ALIGN)
 #endif
 
 /* How many Rx Buffers do we bundle into one write to the hardware ? */
@@ -194,13 +248,22 @@  enum igb_tx_flags {
 #define IGB_SFF_ADDRESSING_MODE		0x4
 #define IGB_SFF_8472_UNSUP		0x00
 
+enum igb_tx_buf_type {
+	IGB_TYPE_SKB = 0,
+	IGB_TYPE_XDP,
+};
+
 /* wrapper around a pointer to a socket buffer,
  * so a DMA handle can be stored along with the buffer
  */
 struct igb_tx_buffer {
 	union e1000_adv_tx_desc *next_to_watch;
 	unsigned long time_stamp;
-	struct sk_buff *skb;
+	enum igb_tx_buf_type type;
+	union {
+		struct sk_buff *skb;
+		struct xdp_frame *xdpf;
+	};
 	unsigned int bytecount;
 	u16 gso_segs;
 	__be16 protocol;
@@ -248,6 +311,7 @@  struct igb_ring_container {
 struct igb_ring {
 	struct igb_q_vector *q_vector;	/* backlink to q_vector */
 	struct net_device *netdev;	/* back pointer to net_device */
+	struct bpf_prog *xdp_prog;
 	struct device *dev;		/* device pointer for dma mapping */
 	union {				/* array of buffer info structs */
 		struct igb_tx_buffer *tx_buffer_info;
@@ -288,6 +352,7 @@  struct igb_ring {
 			struct u64_stats_sync rx_syncp;
 		};
 	};
+	struct xdp_rxq_info xdp_rxq;
 } ____cacheline_internodealigned_in_smp;
 
 struct igb_q_vector {
@@ -339,7 +404,7 @@  static inline unsigned int igb_rx_bufsz(struct igb_ring *ring)
 		return IGB_RXBUFFER_3072;
 
 	if (ring_uses_build_skb(ring))
-		return IGB_MAX_FRAME_BUILD_SKB + IGB_TS_HDR_LEN;
+		return IGB_MAX_FRAME_BUILD_SKB;
 #endif
 	return IGB_RXBUFFER_2048;
 }
@@ -467,6 +532,7 @@  struct igb_adapter {
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
 
 	struct net_device *netdev;
+	struct bpf_prog *xdp_prog;
 
 	unsigned long state;
 	unsigned int flags;
@@ -643,6 +709,9 @@  enum igb_boards {
 
 extern char igb_driver_name[];
 
+int igb_xmit_xdp_ring(struct igb_adapter *adapter,
+		      struct igb_ring *ring,
+		      struct xdp_frame *xdpf);
 int igb_open(struct net_device *netdev);
 int igb_close(struct net_device *netdev);
 int igb_up(struct igb_adapter *);
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 6e8231c1ddf0..28baf203459a 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -961,6 +961,10 @@  static int igb_set_ringparam(struct net_device *netdev,
 			memcpy(&temp_ring[i], adapter->rx_ring[i],
 			       sizeof(struct igb_ring));
 
+			/* Clear copied XDP RX-queue info */
+			memset(&temp_ring[i].xdp_rxq, 0,
+			       sizeof(temp_ring[i].xdp_rxq));
+
 			temp_ring[i].count = new_rx_count;
 			err = igb_setup_rx_resources(&temp_ring[i]);
 			if (err) {
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 698bb6a4b088..73635a012f4a 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -30,6 +30,8 @@ 
 #include <linux/if_ether.h>
 #include <linux/aer.h>
 #include <linux/prefetch.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 #include <linux/pm_runtime.h>
 #include <linux/etherdevice.h>
 #ifdef CONFIG_IGB_DCA
@@ -2825,6 +2827,147 @@  static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 	}
 }
 
+static int igb_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
+{
+	int i, frame_size = dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+	struct igb_adapter *adapter = netdev_priv(dev);
+	bool running = netif_running(dev);
+	struct bpf_prog *old_prog;
+	bool need_reset;
+
+	/* verify igb ring attributes are sufficient for XDP */
+	for (i = 0; i < adapter->num_rx_queues; i++) {
+		struct igb_ring *ring = adapter->rx_ring[i];
+
+		if (frame_size > igb_rx_bufsz(ring))
+			return -EINVAL;
+	}
+
+	old_prog = xchg(&adapter->xdp_prog, prog);
+	need_reset = (!!prog != !!old_prog);
+
+	/* device is up and bpf is added/removed, must setup the RX queues */
+	if (need_reset && running) {
+		igb_close(dev);
+	} else {
+		for (i = 0; i < adapter->num_rx_queues; i++)
+			(void)xchg(&adapter->rx_ring[i]->xdp_prog,
+			    adapter->xdp_prog);
+	}
+
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	/* bpf is just replaced, RXQ and MTU are already setup */
+	if (!need_reset)
+		return 0;
+
+	if (running)
+		igb_open(dev);
+
+	return 0;
+}
+
+static int igb_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return igb_xdp_setup(dev, xdp->prog);
+	default:
+		return -EINVAL;
+	}
+}
+
+static void igb_xdp_ring_update_tail(struct igb_ring *ring)
+{
+	/* Force memory writes to complete before letting h/w know there
+	 * are new descriptors to fetch.
+	 */
+	wmb();
+	writel(ring->next_to_use, ring->tail);
+}
+
+static inline struct igb_ring *igb_xdp_tx_queue_mapping(struct igb_adapter *adapter)
+{
+	unsigned int r_idx = smp_processor_id();
+
+	if (r_idx >= adapter->num_tx_queues)
+		r_idx = r_idx % adapter->num_tx_queues;
+
+	return adapter->tx_ring[r_idx];
+}
+
+static int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp)
+{
+	struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp);
+	int cpu = smp_processor_id();
+	struct igb_ring *tx_ring;
+	struct netdev_queue *nq;
+	u32 ret;
+
+	if (unlikely(!xdpf))
+		return IGB_XDP_CONSUMED;
+
+	/* xdp_prog may be set before the ring is configured: drop the frame.
+	 * Caller does ERR_PTR(-result), so return an IGB_XDP_* code, not errno.
+	 */
+	tx_ring = adapter->xdp_prog ? igb_xdp_tx_queue_mapping(adapter) : NULL;
+	if (unlikely(!tx_ring))
+		return IGB_XDP_CONSUMED;
+
+	nq = txring_txq(tx_ring);
+	__netif_tx_lock(nq, cpu);
+	ret = igb_xmit_xdp_ring(adapter, tx_ring, xdpf);
+	__netif_tx_unlock(nq);
+
+	return ret;
+}
+
+static int igb_xdp_xmit(struct net_device *dev, int n,
+			struct xdp_frame **frames, u32 flags)
+{
+	struct igb_adapter *adapter = netdev_priv(dev);
+	int cpu = smp_processor_id();
+	struct igb_ring *tx_ring;
+	struct netdev_queue *nq;
+	int drops = 0;
+	int i;
+
+	if (unlikely(test_bit(__IGB_DOWN, &adapter->state)))
+		return -ENETDOWN;
+
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+		return -EINVAL;
+
+	/* During program transitions adapter->xdp_prog may be assigned before
+	 * the ring has been configured. In this case simply abort xmit.
+	 */
+	tx_ring = adapter->xdp_prog ? igb_xdp_tx_queue_mapping(adapter) : NULL;
+	if (unlikely(!tx_ring))
+		return -ENXIO;
+
+	nq = txring_txq(tx_ring);
+	__netif_tx_lock(nq, cpu);
+
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		int err;
+
+		err = igb_xmit_xdp_ring(adapter, tx_ring, xdpf);
+		if (err != IGB_XDP_TX) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		}
+	}
+
+	/* bump the tail under the Tx lock: the ring is shared with the stack */
+	if (unlikely(flags & XDP_XMIT_FLUSH))
+		igb_xdp_ring_update_tail(tx_ring);
+	__netif_tx_unlock(nq);
+
+	return n - drops;
+}
+
 static const struct net_device_ops igb_netdev_ops = {
 	.ndo_open		= igb_open,
 	.ndo_stop		= igb_close,
@@ -2849,6 +2992,8 @@  static const struct net_device_ops igb_netdev_ops = {
 	.ndo_fdb_add		= igb_ndo_fdb_add,
 	.ndo_features_check	= igb_features_check,
 	.ndo_setup_tc		= igb_setup_tc,
+	.ndo_bpf		= igb_xdp,
+	.ndo_xdp_xmit		= igb_xdp_xmit,
 };
 
 /**
@@ -4179,6 +4324,7 @@  static void igb_configure_tx(struct igb_adapter *adapter)
  **/
 int igb_setup_rx_resources(struct igb_ring *rx_ring)
 {
+	struct igb_adapter *adapter = netdev_priv(rx_ring->netdev);
 	struct device *dev = rx_ring->dev;
 	int size;
 
@@ -4201,6 +4347,13 @@  int igb_setup_rx_resources(struct igb_ring *rx_ring)
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
 
+	rx_ring->xdp_prog = adapter->xdp_prog;
+
+	/* XDP RX-queue info */
+	if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
+			     rx_ring->queue_index) < 0)
+		goto err;
+
 	return 0;
 
 err:
@@ -4505,6 +4658,10 @@  void igb_configure_rx_ring(struct igb_adapter *adapter,
 	int reg_idx = ring->reg_idx;
 	u32 rxdctl = 0;
 
+	xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
+	WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+					   MEM_TYPE_PAGE_SHARED, NULL));
+
 	/* disable the queue */
 	wr32(E1000_RXDCTL(reg_idx), 0);
 
@@ -4709,6 +4866,8 @@  void igb_free_rx_resources(struct igb_ring *rx_ring)
 {
 	igb_clean_rx_ring(rx_ring);
 
+	rx_ring->xdp_prog = NULL;
+	xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
 	vfree(rx_ring->rx_buffer_info);
 	rx_ring->rx_buffer_info = NULL;
 
@@ -6078,6 +6237,80 @@  static int igb_tx_map(struct igb_ring *tx_ring,
 	return -1;
 }
 
+int igb_xmit_xdp_ring(struct igb_adapter *adapter,
+		      struct igb_ring *tx_ring,
+		      struct xdp_frame *xdpf)
+{
+	union e1000_adv_tx_desc *tx_desc;
+	u32 len, cmd_type, olinfo_status;
+	struct igb_tx_buffer *tx_buffer;
+	dma_addr_t dma;
+	u16 i;
+
+	len = xdpf->len;
+
+	if (unlikely(!igb_desc_unused(tx_ring)))
+		return IGB_XDP_CONSUMED;
+
+	dma = dma_map_single(tx_ring->dev, xdpf->data, len, DMA_TO_DEVICE);
+	if (dma_mapping_error(tx_ring->dev, dma))
+		return IGB_XDP_CONSUMED;
+
+	/* record the location of the first descriptor for this packet */
+	tx_buffer = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
+	tx_buffer->bytecount = len;
+	tx_buffer->gso_segs = 1;
+	tx_buffer->protocol = 0;
+
+	i = tx_ring->next_to_use;
+	tx_desc = IGB_TX_DESC(tx_ring, i);
+
+	dma_unmap_len_set(tx_buffer, len, len);
+	dma_unmap_addr_set(tx_buffer, dma, dma);
+	tx_buffer->type = IGB_TYPE_XDP;
+	tx_buffer->xdpf = xdpf;
+
+	tx_desc->read.buffer_addr = cpu_to_le64(dma);
+
+	/* put descriptor type bits */
+	cmd_type = E1000_ADVTXD_DTYP_DATA |
+		       E1000_ADVTXD_DCMD_DEXT |
+		       E1000_ADVTXD_DCMD_IFCS;
+	cmd_type |= len | IGB_TXD_DCMD;
+	tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
+
+	olinfo_status = len << E1000_ADVTXD_PAYLEN_SHIFT;
+	/* 82575 requires a unique index per ring */
+	if (test_bit(IGB_RING_FLAG_TX_CTX_IDX, &tx_ring->flags))
+		olinfo_status |= tx_ring->reg_idx << 4;
+	/* build in CPU order, convert to little-endian once when storing */
+	tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status);
+
+	netdev_tx_sent_queue(txring_txq(tx_ring), tx_buffer->bytecount);
+
+	/* set the timestamp */
+	tx_buffer->time_stamp = jiffies;
+
+	/* Avoid any potential race with xdp_xmit and cleanup */
+	smp_wmb();
+
+	/* set next_to_watch value indicating a packet is present */
+	i++;
+	if (i == tx_ring->count)
+		i = 0;
+
+	tx_buffer->next_to_watch = tx_desc;
+	tx_ring->next_to_use = i;
+
+	/* Make sure there is space in the ring for the next send. */
+	igb_maybe_stop_tx(tx_ring, DESC_NEEDED);
+
+	if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more())
+		writel(i, tx_ring->tail);
+
+	return IGB_XDP_TX;
+}
+
 netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 				struct igb_ring *tx_ring)
 {
@@ -6106,6 +6339,7 @@  netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 
 	/* record the location of the first descriptor for this packet */
 	first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
+	first->type = IGB_TYPE_SKB;
 	first->skb = skb;
 	first->bytecount = skb->len;
 	first->gso_segs = 1;
@@ -6257,6 +6491,19 @@  static int igb_change_mtu(struct net_device *netdev, int new_mtu)
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
 
+	if (adapter->xdp_prog) {
+		int i;
+
+		for (i = 0; i < adapter->num_rx_queues; i++) {
+			struct igb_ring *ring = adapter->rx_ring[i];
+
+			if (max_frame > igb_rx_bufsz(ring)) {
+				netdev_warn(adapter->netdev, "Requested MTU size is not supported with XDP\n");
+				return -EINVAL;
+			}
+		}
+	}
+
 	/* adjust max frame to be at least the size of a standard frame */
 	if (max_frame < (ETH_FRAME_LEN + ETH_FCS_LEN))
 		max_frame = ETH_FRAME_LEN + ETH_FCS_LEN;
@@ -7810,7 +8057,10 @@  static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
 		total_packets += tx_buffer->gso_segs;
 
 		/* free the skb */
-		napi_consume_skb(tx_buffer->skb, napi_budget);
+		if (tx_buffer->type == IGB_TYPE_SKB)
+			napi_consume_skb(tx_buffer->skb, napi_budget);
+		else
+			xdp_return_frame(tx_buffer->xdpf);
 
 		/* unmap skb header data */
 		dma_unmap_single(tx_ring->dev,
@@ -7994,8 +8244,8 @@  static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
 	 * the pagecnt_bias and page count so that we fully restock the
 	 * number of references the driver holds.
 	 */
-	if (unlikely(!pagecnt_bias)) {
-		page_ref_add(page, USHRT_MAX);
+	if (unlikely(pagecnt_bias == 1)) {
+		page_ref_add(page, USHRT_MAX - 1);
 		rx_buffer->pagecnt_bias = USHRT_MAX;
 	}
 
@@ -8034,20 +8284,21 @@  static void igb_add_rx_frag(struct igb_ring *rx_ring,
 
 static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
 					 struct igb_rx_buffer *rx_buffer,
-					 union e1000_adv_rx_desc *rx_desc,
-					 unsigned int size)
+					 struct xdp_buff *xdp,
+					 union e1000_adv_rx_desc *rx_desc)
 {
-	void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = igb_rx_pg_size(rx_ring) / 2;
 #else
-	unsigned int truesize = SKB_DATA_ALIGN(size);
+	unsigned int truesize = SKB_DATA_ALIGN(xdp->data_end -
+					       xdp->data_hard_start);
 #endif
+	unsigned int size = xdp->data_end - xdp->data;
 	unsigned int headlen;
 	struct sk_buff *skb;
 
 	/* prefetch first cache line of first page */
-	net_prefetch(va);
+	net_prefetch(xdp->data);
 
 	/* allocate a skb to store the frags */
 	skb = napi_alloc_skb(&rx_ring->q_vector->napi, IGB_RX_HDR_LEN);
@@ -8055,24 +8306,24 @@  static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
 		return NULL;
 
 	if (unlikely(igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP))) {
-		igb_ptp_rx_pktstamp(rx_ring->q_vector, va, skb);
-		va += IGB_TS_HDR_LEN;
+		igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb);
+		xdp->data += IGB_TS_HDR_LEN;
 		size -= IGB_TS_HDR_LEN;
 	}
 
 	/* Determine available headroom for copy */
 	headlen = size;
 	if (headlen > IGB_RX_HDR_LEN)
-		headlen = eth_get_headlen(skb->dev, va, IGB_RX_HDR_LEN);
+		headlen = eth_get_headlen(skb->dev, xdp->data, IGB_RX_HDR_LEN);
 
 	/* align pull length to size of long to optimize memcpy performance */
-	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
+	memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen, sizeof(long)));
 
 	/* update all of the pointers */
 	size -= headlen;
 	if (size) {
 		skb_add_rx_frag(skb, 0, rx_buffer->page,
-				(va + headlen) - page_address(rx_buffer->page),
+				(xdp->data + headlen) - page_address(rx_buffer->page),
 				size, truesize);
 #if (PAGE_SIZE < 8192)
 		rx_buffer->page_offset ^= truesize;
@@ -8088,29 +8339,29 @@  static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
 
 static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring,
 				     struct igb_rx_buffer *rx_buffer,
-				     union e1000_adv_rx_desc *rx_desc,
-				     unsigned int size)
+				     struct xdp_buff *xdp,
+				     union e1000_adv_rx_desc *rx_desc)
 {
-	void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = igb_rx_pg_size(rx_ring) / 2;
 #else
 	unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
-				SKB_DATA_ALIGN(IGB_SKB_PAD + size);
+				SKB_DATA_ALIGN(xdp->data_end -
+					       xdp->data_hard_start);
 #endif
 	struct sk_buff *skb;
 
 	/* prefetch first cache line of first page */
-	net_prefetch(va);
+	net_prefetch(xdp->data_meta);
 
 	/* build an skb around the page buffer */
-	skb = build_skb(va - IGB_SKB_PAD, truesize);
+	skb = build_skb(xdp->data_hard_start, truesize);
 	if (unlikely(!skb))
 		return NULL;
 
 	/* update pointers within the skb to store the data */
-	skb_reserve(skb, IGB_SKB_PAD);
-	__skb_put(skb, size);
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	__skb_put(skb, xdp->data_end - xdp->data);
 
 	/* pull timestamp out of packet data */
 	if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
@@ -8128,6 +8379,79 @@  static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring,
 	return skb;
 }
 
+static struct sk_buff *igb_run_xdp(struct igb_adapter *adapter,
+				   struct igb_ring *rx_ring,
+				   struct xdp_buff *xdp)
+{
+	int err, result = IGB_XDP_PASS;
+	struct bpf_prog *xdp_prog;
+	u32 act;
+
+	rcu_read_lock();
+	xdp_prog = READ_ONCE(rx_ring->xdp_prog);
+
+	if (!xdp_prog)
+		goto xdp_out;
+
+	prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
+	act = bpf_prog_run_xdp(xdp_prog, xdp);
+	switch (act) {
+	case XDP_PASS:
+		break;
+	case XDP_TX:
+		result = igb_xdp_xmit_back(adapter, xdp);
+		break;
+	case XDP_REDIRECT:
+		err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog);
+		if (!err)
+			result = IGB_XDP_REDIR;
+		else
+			result = IGB_XDP_CONSUMED;
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+		fallthrough;
+	case XDP_ABORTED:
+		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
+		fallthrough;
+	case XDP_DROP:
+		result = IGB_XDP_CONSUMED;
+		break;
+	}
+xdp_out:
+	rcu_read_unlock();
+	return ERR_PTR(-result);
+}
+
+static unsigned int igb_rx_frame_truesize(struct igb_ring *rx_ring,
+					  unsigned int size)
+{
+	unsigned int truesize;
+
+#if (PAGE_SIZE < 8192)
+	truesize = igb_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */
+#else
+	truesize = ring_uses_build_skb(rx_ring) ?
+		SKB_DATA_ALIGN(IGB_SKB_PAD + size) +
+		SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
+		SKB_DATA_ALIGN(size);
+#endif
+	return truesize;
+}
+
+static void igb_rx_buffer_flip(struct igb_ring *rx_ring,
+			       struct igb_rx_buffer *rx_buffer,
+			       unsigned int size)
+{
+	unsigned int truesize = igb_rx_frame_truesize(rx_ring, size);
+#if (PAGE_SIZE < 8192)
+	rx_buffer->page_offset ^= truesize;
+#else
+	rx_buffer->page_offset += truesize;
+#endif
+}
+
 static inline void igb_rx_checksum(struct igb_ring *ring,
 				   union e1000_adv_rx_desc *rx_desc,
 				   struct sk_buff *skb)
@@ -8224,6 +8548,10 @@  static bool igb_cleanup_headers(struct igb_ring *rx_ring,
 				union e1000_adv_rx_desc *rx_desc,
 				struct sk_buff *skb)
 {
+	/* XDP packets use error pointer so abort at this point */
+	if (IS_ERR(skb))
+		return true;
+
 	if (unlikely((igb_test_staterr(rx_desc,
 				       E1000_RXDEXT_ERR_FRAME_ERR_MASK)))) {
 		struct net_device *netdev = rx_ring->netdev;
@@ -8282,6 +8610,11 @@  static void igb_process_skb_fields(struct igb_ring *rx_ring,
 	skb->protocol = eth_type_trans(skb, rx_ring->netdev);
 }
 
+static inline unsigned int igb_rx_offset(struct igb_ring *rx_ring)
+{
+	return ring_uses_build_skb(rx_ring) ? IGB_SKB_PAD : 0;
+}
+
 static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
 					       const unsigned int size)
 {
@@ -8325,10 +8658,20 @@  static void igb_put_rx_buffer(struct igb_ring *rx_ring,
 
 static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 {
+	struct igb_adapter *adapter = q_vector->adapter;
 	struct igb_ring *rx_ring = q_vector->rx.ring;
 	struct sk_buff *skb = rx_ring->skb;
 	unsigned int total_bytes = 0, total_packets = 0;
 	u16 cleaned_count = igb_desc_unused(rx_ring);
+	unsigned int xdp_xmit = 0;
+	struct xdp_buff xdp;
+
+	xdp.rxq = &rx_ring->xdp_rxq;
+
+	/* Frame size depends on rx_ring setup when PAGE_SIZE=4K */
+#if (PAGE_SIZE < 8192)
+	xdp.frame_sz = igb_rx_frame_truesize(rx_ring, 0);
+#endif
 
 	while (likely(total_packets < budget)) {
 		union e1000_adv_rx_desc *rx_desc;
@@ -8355,13 +8698,38 @@  static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 		rx_buffer = igb_get_rx_buffer(rx_ring, size);
 
 		/* retrieve a buffer from the ring */
-		if (skb)
+		if (!skb) {
+			xdp.data = page_address(rx_buffer->page) +
+				   rx_buffer->page_offset;
+			xdp.data_meta = xdp.data;
+			xdp.data_hard_start = xdp.data -
+					      igb_rx_offset(rx_ring);
+			xdp.data_end = xdp.data + size;
+#if (PAGE_SIZE > 4096)
+			/* At larger PAGE_SIZE, frame_sz depends on len size */
+			xdp.frame_sz = igb_rx_frame_truesize(rx_ring, size);
+#endif
+			skb = igb_run_xdp(adapter, rx_ring, &xdp);
+		}
+
+		if (IS_ERR(skb)) {
+			unsigned int xdp_res = -PTR_ERR(skb);
+
+			if (xdp_res & (IGB_XDP_TX | IGB_XDP_REDIR)) {
+				xdp_xmit |= xdp_res;
+				igb_rx_buffer_flip(rx_ring, rx_buffer, size);
+			} else {
+				rx_buffer->pagecnt_bias++;
+			}
+			total_packets++;
+			total_bytes += size;
+		} else if (skb)
 			igb_add_rx_frag(rx_ring, rx_buffer, skb, size);
 		else if (ring_uses_build_skb(rx_ring))
-			skb = igb_build_skb(rx_ring, rx_buffer, rx_desc, size);
+			skb = igb_build_skb(rx_ring, rx_buffer, &xdp, rx_desc);
 		else
 			skb = igb_construct_skb(rx_ring, rx_buffer,
-						rx_desc, size);
+						&xdp, rx_desc);
 
 		/* exit if we failed to retrieve a buffer */
 		if (!skb) {
@@ -8401,6 +8769,15 @@  static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 	/* place incomplete frames back on ring for completion */
 	rx_ring->skb = skb;
 
+	if (xdp_xmit & IGB_XDP_REDIR)
+		xdp_do_flush_map();
+
+	if (xdp_xmit & IGB_XDP_TX) {
+		struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter);
+
+		igb_xdp_ring_update_tail(tx_ring);
+	}
+
 	u64_stats_update_begin(&rx_ring->rx_syncp);
 	rx_ring->rx_stats.packets += total_packets;
 	rx_ring->rx_stats.bytes += total_bytes;
@@ -8414,11 +8791,6 @@  static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 	return total_packets;
 }
 
-static inline unsigned int igb_rx_offset(struct igb_ring *rx_ring)
-{
-	return ring_uses_build_skb(rx_ring) ? IGB_SKB_PAD : 0;
-}
-
 static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
 				  struct igb_rx_buffer *bi)
 {
@@ -8455,7 +8827,8 @@  static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
 	bi->dma = dma;
 	bi->page = page;
 	bi->page_offset = igb_rx_offset(rx_ring);
-	bi->pagecnt_bias = 1;
+	page_ref_add(page, USHRT_MAX - 1);
+	bi->pagecnt_bias = USHRT_MAX;
 
 	return true;
 }