[v10,05/12] net/mlx4_en: add support for fast rx drop bpf program

Message ID 1468955817-10604-6-git-send-email-bblanco@plumgrid.com
State Accepted, archived
Delegated to: David Miller

Commit Message

Brenden Blanco July 19, 2016, 7:16 p.m. UTC
Add support for the BPF_PROG_TYPE_XDP hook in mlx4 driver.

In tc/socket bpf programs, helpers linearize skb fragments as needed
when the program touches the packet data. However, in the pursuit of
speed, XDP programs will not be allowed to use these slower functions,
especially if it involves allocating an skb.

Therefore, disallow MTU settings that would produce a multi-fragment
packet that XDP programs would fail to access. Future enhancements could
be done to increase the allowable MTU.

The xdp program is present as a per-ring data structure, but as of yet
it is not possible to set at that granularity through any ndo.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 60 ++++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/en_rx.c     | 40 +++++++++++++++--
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |  6 +++
 3 files changed, 102 insertions(+), 4 deletions(-)
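
For context, a minimal sketch of the kind of program this hook runs: an XDP
program of type BPF_PROG_TYPE_XDP that unconditionally drops. The section
name, __section helper, and loader convention are illustrative assumptions,
not part of this patch; a program like this would be built with
clang -target bpf.

    #include <linux/bpf.h>

    #ifndef __section
    # define __section(NAME) __attribute__((section(NAME), used))
    #endif

    /* Runs once per received packet, before any skb has been allocated. */
    __section("prog")
    int xdp_drop_all(struct xdp_md *ctx)
    {
            return XDP_DROP;  /* driver takes its drop path for this packet */
    }

    char __license[] __section("license") = "GPL";

Because such a program reads packet bytes straight out of the first frag, the
set only allows single-frag configurations: with the default 1500-byte MTU,
MLX4_EN_EFF_MTU(1500) = 1500 + ETH_HLEN (14) + 2 * VLAN_HLEN (2 * 4) = 1522
bytes, which still fits in the driver's first frag size.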

Comments

Alexei Starovoitov July 19, 2016, 9:41 p.m. UTC | #1
On Tue, Jul 19, 2016 at 12:16:50PM -0700, Brenden Blanco wrote:
> Add support for the BPF_PROG_TYPE_XDP hook in mlx4 driver.
> 
> In tc/socket bpf programs, helpers linearize skb fragments as needed
> when the program touches the packet data. However, in the pursuit of
> speed, XDP programs will not be allowed to use these slower functions,
> especially if it involves allocating an skb.
> 
> Therefore, disallow MTU settings that would produce a multi-fragment
> packet that XDP programs would fail to access. Future enhancements could
> be done to increase the allowable MTU.
> 
> The xdp program is present as a per-ring data structure, but as of yet
> it is not possible to set at that granularity through any ndo.
> 
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
...
> +static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
> +{
> +	struct mlx4_en_priv *priv = netdev_priv(dev);
> +	struct bpf_prog *old_prog;
> +	int xdp_ring_num;
> +	int i;
> +
> +	xdp_ring_num = prog ? ALIGN(priv->rx_ring_num, MLX4_EN_NUM_UP) : 0;
> +
> +	if (priv->num_frags > 1) {
> +		en_err(priv, "Cannot set XDP if MTU requires multiple frags\n");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (prog) {
> +		prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
> +		if (IS_ERR(prog))
> +			return PTR_ERR(prog);
> +	}
> +
> +	priv->xdp_ring_num = xdp_ring_num;
> +
> +	/* This xchg is paired with READ_ONCE in the fast path */
> +	for (i = 0; i < priv->rx_ring_num; i++) {
> +		old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog);
> +		if (old_prog)
> +			bpf_prog_put(old_prog);
> +	}

priv->xdp_ring_num looks similar to priv->rx_ring_num, so at first glance it
seemed that the per-ring refactoring broke the detach logic, but no, it's good.
Acked-by: Alexei Starovoitov <ast@kernel.org>
Daniel Borkmann July 20, 2016, 9:07 a.m. UTC | #2
On 07/19/2016 09:16 PM, Brenden Blanco wrote:
> Add support for the BPF_PROG_TYPE_XDP hook in mlx4 driver.
>
> In tc/socket bpf programs, helpers linearize skb fragments as needed
> when the program touches the packet data. However, in the pursuit of
> speed, XDP programs will not be allowed to use these slower functions,
> especially if it involves allocating an skb.
>
> Therefore, disallow MTU settings that would produce a multi-fragment
> packet that XDP programs would fail to access. Future enhancements could
> be done to increase the allowable MTU.
>
> The xdp program is present as a per-ring data structure, but as of yet
> it is not possible to set at that granularity through any ndo.
>
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
[...]
>   struct mlx4_en_bond {
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> index c1b3a9c..6729545 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> @@ -32,6 +32,7 @@
>    */
>
>   #include <net/busy_poll.h>
> +#include <linux/bpf.h>
>   #include <linux/mlx4/cq.h>
>   #include <linux/slab.h>
>   #include <linux/mlx4/qp.h>
> @@ -509,6 +510,8 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
>   	struct mlx4_en_dev *mdev = priv->mdev;
>   	struct mlx4_en_rx_ring *ring = *pring;
>
> +	if (ring->xdp_prog)
> +		bpf_prog_put(ring->xdp_prog);

Would be good if you also make this a READ_ONCE() here. I believe this is the
only other spot in your set that has this 'direct' access (besides xchg() and
READ_ONCE() from mlx4_en_process_rx_cq()). It would be mostly for consistency
and to indicate that there's a more complex synchronization behind it. I'm mostly
worried that if it's not consistently used, people might copy this and not use
the READ_ONCE() also in other spots where it matters, and thus add hard-to-find
bugs.
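
A minimal sketch of the change being suggested for mlx4_en_destroy_rx_ring(),
with the local variable added purely for illustration:

    struct bpf_prog *old_prog;

    old_prog = READ_ONCE(ring->xdp_prog);
    if (old_prog)
            bpf_prog_put(old_prog);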

>   	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
>   	vfree(ring->rx_info);
>   	ring->rx_info = NULL;
> @@ -743,6 +746,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>   	struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring];
>   	struct mlx4_en_rx_alloc *frags;
>   	struct mlx4_en_rx_desc *rx_desc;
> +	struct bpf_prog *xdp_prog;
>   	struct sk_buff *skb;
>   	int index;
>   	int nr;
> @@ -759,6 +763,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>   	if (budget <= 0)
>   		return polled;
Brenden Blanco July 20, 2016, 5:33 p.m. UTC | #3
On Wed, Jul 20, 2016 at 11:07:57AM +0200, Daniel Borkmann wrote:
> On 07/19/2016 09:16 PM, Brenden Blanco wrote:
[...]
> >+	if (ring->xdp_prog)
> >+		bpf_prog_put(ring->xdp_prog);
> 
> Would be good if you also make this a READ_ONCE() here. I believe this is the
> only other spot in your set that has this 'direct' access (besides xchg() and
> READ_ONCE() from mlx4_en_process_rx_cq()). It would be mostly for consistency
> and to indicate that there's a more complex synchronization behind it. I'm mostly
> worried that if it's not consistently used, people might copy this and not use
> the READ_ONCE() also in other spots where it matters, and thus add hard-to-find
> bugs.
I can do that. My thinking was just that this is the cleanup path so the
code would have been superfluous. I think there were a few nits so I'll
collect those and clean them up.
> 
[...]
Jesper Dangaard Brouer July 24, 2016, 11:56 a.m. UTC | #4
On Tue, 19 Jul 2016 12:16:50 -0700
Brenden Blanco <bblanco@plumgrid.com> wrote:

> The xdp program is present as a per-ring data structure, but as of yet
> it is not possible to set at that granularity through any ndo.

Thank you for doing this! :-)
Tom Herbert July 24, 2016, 4:57 p.m. UTC | #5
On Tue, Jul 19, 2016 at 2:16 PM, Brenden Blanco <bblanco@plumgrid.com> wrote:
> Add support for the BPF_PROG_TYPE_XDP hook in mlx4 driver.
>
> In tc/socket bpf programs, helpers linearize skb fragments as needed
> when the program touches the packet data. However, in the pursuit of
> speed, XDP programs will not be allowed to use these slower functions,
> especially if it involves allocating an skb.
>
> Therefore, disallow MTU settings that would produce a multi-fragment
> packet that XDP programs would fail to access. Future enhancements could
> be done to increase the allowable MTU.
>
> The xdp program is present as a per-ring data structure, but as of yet
> it is not possible to set at that granularity through any ndo.
>
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
> ---
>  drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 60 ++++++++++++++++++++++++++
>  drivers/net/ethernet/mellanox/mlx4/en_rx.c     | 40 +++++++++++++++--
>  drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |  6 +++
>  3 files changed, 102 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> index 6083775..c34a33d 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> @@ -31,6 +31,7 @@
>   *
>   */
>
> +#include <linux/bpf.h>
>  #include <linux/etherdevice.h>
>  #include <linux/tcp.h>
>  #include <linux/if_vlan.h>
> @@ -2112,6 +2113,11 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
>                 en_err(priv, "Bad MTU size:%d.\n", new_mtu);
>                 return -EPERM;
>         }
> +       if (priv->xdp_ring_num && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) {
> +               en_err(priv, "MTU size:%d requires frags but XDP running\n",
> +                      new_mtu);
> +               return -EOPNOTSUPP;
> +       }
>         dev->mtu = new_mtu;
>
>         if (netif_running(dev)) {
> @@ -2520,6 +2526,58 @@ static int mlx4_en_set_tx_maxrate(struct net_device *dev, int queue_index, u32 m
>         return err;
>  }
>
> +static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
> +{
> +       struct mlx4_en_priv *priv = netdev_priv(dev);
> +       struct bpf_prog *old_prog;
> +       int xdp_ring_num;
> +       int i;
> +
> +       xdp_ring_num = prog ? ALIGN(priv->rx_ring_num, MLX4_EN_NUM_UP) : 0;
> +
> +       if (priv->num_frags > 1) {
> +               en_err(priv, "Cannot set XDP if MTU requires multiple frags\n");
> +               return -EOPNOTSUPP;
> +       }
> +
> +       if (prog) {
> +               prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
> +               if (IS_ERR(prog))
> +                       return PTR_ERR(prog);
> +       }
> +
> +       priv->xdp_ring_num = xdp_ring_num;
> +
> +       /* This xchg is paired with READ_ONCE in the fast path */
> +       for (i = 0; i < priv->rx_ring_num; i++) {
> +               old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog);

This can be done under a lock instead of relying on xchg.

> +               if (old_prog)
> +                       bpf_prog_put(old_prog);

I don't see how this can work. Even after setting the new program, the
old program might still be run (pointer dereferenced before xchg).
> Either rcu needs to be used or the queue should be stopped and synced
before setting the new program.
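
A sketch of the rcu-based alternative being described here, assuming updates
are serialized by the device state mutex; the lock choice and an __rcu
annotation on xdp_prog are assumptions for illustration, not the patch's code:

    /* Writer side, under a lock that serializes program updates. */
    old_prog = rcu_dereference_protected(ring->xdp_prog,
                                         lockdep_is_held(&priv->mdev->state_lock));
    rcu_assign_pointer(ring->xdp_prog, prog);
    if (old_prog) {
            synchronize_rcu();      /* wait out readers still running old_prog */
            bpf_prog_put(old_prog);
    }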

> +       }
> +
> +       return 0;
> +}
> +
> +static bool mlx4_xdp_attached(struct net_device *dev)
> +{
> +       struct mlx4_en_priv *priv = netdev_priv(dev);
> +
> +       return !!priv->xdp_ring_num;
> +}
> +
> +static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp)
> +{
> +       switch (xdp->command) {
> +       case XDP_SETUP_PROG:
> +               return mlx4_xdp_set(dev, xdp->prog);
> +       case XDP_QUERY_PROG:
> +               xdp->prog_attached = mlx4_xdp_attached(dev);
> +               return 0;
> +       default:
> +               return -EINVAL;
> +       }
> +}
> +
>  static const struct net_device_ops mlx4_netdev_ops = {
>         .ndo_open               = mlx4_en_open,
>         .ndo_stop               = mlx4_en_close,
> @@ -2548,6 +2606,7 @@ static const struct net_device_ops mlx4_netdev_ops = {
>         .ndo_udp_tunnel_del     = mlx4_en_del_vxlan_port,
>         .ndo_features_check     = mlx4_en_features_check,
>         .ndo_set_tx_maxrate     = mlx4_en_set_tx_maxrate,
> +       .ndo_xdp                = mlx4_xdp,
>  };
>
>  static const struct net_device_ops mlx4_netdev_ops_master = {
> @@ -2584,6 +2643,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
>         .ndo_udp_tunnel_del     = mlx4_en_del_vxlan_port,
>         .ndo_features_check     = mlx4_en_features_check,
>         .ndo_set_tx_maxrate     = mlx4_en_set_tx_maxrate,
> +       .ndo_xdp                = mlx4_xdp,
>  };
>
>  struct mlx4_en_bond {
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> index c1b3a9c..6729545 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> @@ -32,6 +32,7 @@
>   */
>
>  #include <net/busy_poll.h>
> +#include <linux/bpf.h>
>  #include <linux/mlx4/cq.h>
>  #include <linux/slab.h>
>  #include <linux/mlx4/qp.h>
> @@ -509,6 +510,8 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
>         struct mlx4_en_dev *mdev = priv->mdev;
>         struct mlx4_en_rx_ring *ring = *pring;
>
> +       if (ring->xdp_prog)
> +               bpf_prog_put(ring->xdp_prog);
>         mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
>         vfree(ring->rx_info);
>         ring->rx_info = NULL;
> @@ -743,6 +746,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>         struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring];
>         struct mlx4_en_rx_alloc *frags;
>         struct mlx4_en_rx_desc *rx_desc;
> +       struct bpf_prog *xdp_prog;
>         struct sk_buff *skb;
>         int index;
>         int nr;
> @@ -759,6 +763,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>         if (budget <= 0)
>                 return polled;
>
> +       xdp_prog = READ_ONCE(ring->xdp_prog);
> +
>         /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
>          * descriptor offset can be deduced from the CQE index instead of
>          * reading 'cqe->index' */
> @@ -835,6 +841,35 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>                 l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
>                         (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
>
> +               /* A bpf program gets first chance to drop the packet. It may
> +                * read bytes but not past the end of the frag.
> +                */
> +               if (xdp_prog) {
> +                       struct xdp_buff xdp;
> +                       dma_addr_t dma;
> +                       u32 act;
> +
> +                       dma = be64_to_cpu(rx_desc->data[0].addr);
> +                       dma_sync_single_for_cpu(priv->ddev, dma,
> +                                               priv->frag_info[0].frag_size,
> +                                               DMA_FROM_DEVICE);
> +
> +                       xdp.data = page_address(frags[0].page) +
> +                                                       frags[0].page_offset;
> +                       xdp.data_end = xdp.data + length;
> +
> +                       act = bpf_prog_run_xdp(xdp_prog, &xdp);
> +                       switch (act) {
> +                       case XDP_PASS:
> +                               break;
> +                       default:
> +                               bpf_warn_invalid_xdp_action(act);
> +                       case XDP_ABORTED:
> +                       case XDP_DROP:
> +                               goto next;
> +                       }
> +               }
> +
>                 if (likely(dev->features & NETIF_F_RXCSUM)) {
>                         if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
>                                                       MLX4_CQE_STATUS_UDP)) {
> @@ -1062,10 +1097,7 @@ static const int frag_sizes[] = {
>  void mlx4_en_calc_rx_buf(struct net_device *dev)
>  {
>         struct mlx4_en_priv *priv = netdev_priv(dev);
> -       /* VLAN_HLEN is added twice,to support skb vlan tagged with multiple
> -        * headers. (For example: ETH_P_8021Q and ETH_P_8021AD).
> -        */
> -       int eff_mtu = dev->mtu + ETH_HLEN + (2 * VLAN_HLEN);
> +       int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
>         int buf_size = 0;
>         int i = 0;
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> index d39bf59..eb1238d 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> @@ -164,6 +164,10 @@ enum {
>  #define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN)
>
>  #define MLX4_EN_MIN_MTU                46
> +/* VLAN_HLEN is added twice, to support skb vlan tagged with multiple
> + * headers. (For example: ETH_P_8021Q and ETH_P_8021AD).
> + */
> +#define MLX4_EN_EFF_MTU(mtu)   ((mtu) + ETH_HLEN + (2 * VLAN_HLEN))
>  #define ETH_BCAST              0xffffffffffffULL
>
>  #define MLX4_EN_LOOPBACK_RETRIES       5
> @@ -319,6 +323,7 @@ struct mlx4_en_rx_ring {
>         u8  fcs_del;
>         void *buf;
>         void *rx_info;
> +       struct bpf_prog *xdp_prog;
>         unsigned long bytes;
>         unsigned long packets;
>         unsigned long csum_ok;
> @@ -558,6 +563,7 @@ struct mlx4_en_priv {
>         struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS];
>         u16 num_frags;
>         u16 log_rx_info;
> +       int xdp_ring_num;
>
>         struct mlx4_en_tx_ring **tx_ring;
>         struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS];
> --
> 2.8.2
>
Daniel Borkmann July 24, 2016, 8:34 p.m. UTC | #6
On 07/24/2016 06:57 PM, Tom Herbert wrote:
> On Tue, Jul 19, 2016 at 2:16 PM, Brenden Blanco <bblanco@plumgrid.com> wrote:
>> Add support for the BPF_PROG_TYPE_XDP hook in mlx4 driver.
>>
>> In tc/socket bpf programs, helpers linearize skb fragments as needed
>> when the program touches the packet data. However, in the pursuit of
>> speed, XDP programs will not be allowed to use these slower functions,
>> especially if it involves allocating an skb.
>>
>> Therefore, disallow MTU settings that would produce a multi-fragment
>> packet that XDP programs would fail to access. Future enhancements could
>> be done to increase the allowable MTU.
>>
>> The xdp program is present as a per-ring data structure, but as of yet
>> it is not possible to set at that granularity through any ndo.
>>
>> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
[...]
>> +       if (prog) {
>> +               prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
>> +               if (IS_ERR(prog))
>> +                       return PTR_ERR(prog);
>> +       }
>> +
>> +       priv->xdp_ring_num = xdp_ring_num;
>> +
>> +       /* This xchg is paired with READ_ONCE in the fast path */
>> +       for (i = 0; i < priv->rx_ring_num; i++) {
>> +               old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog);
>
> This can be done under a lock instead of relying on xchg.
>
>> +               if (old_prog)
>> +                       bpf_prog_put(old_prog);
>
> I don't see how this can work. Even after setting the new program, the
> old program might still be run (pointer dereferenced before xchg).
> Either rcu needs to be used or the queue should be stopped and synced
> before setting the new program.

It's a strict requirement that all BPF programs must run under RCU.
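
Under that model the fast path reads consistently: the RX poll loop runs in
NAPI softirq context, which serves as the read-side critical section, the
intent being that a program observed once per poll is not freed out from under
the packets it is processing. Annotated excerpt from the patch below, with
editorial comments:

    /* NAPI poll: effectively an RCU read-side critical section here, so
     * the pointer fetched once per poll stays valid for the whole budget.
     */
    xdp_prog = READ_ONCE(ring->xdp_prog);
    ...
    act = bpf_prog_run_xdp(xdp_prog, &xdp);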

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 6083775..c34a33d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -31,6 +31,7 @@ 
  *
  */
 
+#include <linux/bpf.h>
 #include <linux/etherdevice.h>
 #include <linux/tcp.h>
 #include <linux/if_vlan.h>
@@ -2112,6 +2113,11 @@  static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
 		en_err(priv, "Bad MTU size:%d.\n", new_mtu);
 		return -EPERM;
 	}
+	if (priv->xdp_ring_num && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) {
+		en_err(priv, "MTU size:%d requires frags but XDP running\n",
+		       new_mtu);
+		return -EOPNOTSUPP;
+	}
 	dev->mtu = new_mtu;
 
 	if (netif_running(dev)) {
@@ -2520,6 +2526,58 @@  static int mlx4_en_set_tx_maxrate(struct net_device *dev, int queue_index, u32 m
 	return err;
 }
 
+static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+	int xdp_ring_num;
+	int i;
+
+	xdp_ring_num = prog ? ALIGN(priv->rx_ring_num, MLX4_EN_NUM_UP) : 0;
+
+	if (priv->num_frags > 1) {
+		en_err(priv, "Cannot set XDP if MTU requires multiple frags\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (prog) {
+		prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+	}
+
+	priv->xdp_ring_num = xdp_ring_num;
+
+	/* This xchg is paired with READ_ONCE in the fast path */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog);
+		if (old_prog)
+			bpf_prog_put(old_prog);
+	}
+
+	return 0;
+}
+
+static bool mlx4_xdp_attached(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	return !!priv->xdp_ring_num;
+}
+
+static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return mlx4_xdp_set(dev, xdp->prog);
+	case XDP_QUERY_PROG:
+		xdp->prog_attached = mlx4_xdp_attached(dev);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops mlx4_netdev_ops = {
 	.ndo_open		= mlx4_en_open,
 	.ndo_stop		= mlx4_en_close,
@@ -2548,6 +2606,7 @@  static const struct net_device_ops mlx4_netdev_ops = {
 	.ndo_udp_tunnel_del	= mlx4_en_del_vxlan_port,
 	.ndo_features_check	= mlx4_en_features_check,
 	.ndo_set_tx_maxrate	= mlx4_en_set_tx_maxrate,
+	.ndo_xdp		= mlx4_xdp,
 };
 
 static const struct net_device_ops mlx4_netdev_ops_master = {
@@ -2584,6 +2643,7 @@  static const struct net_device_ops mlx4_netdev_ops_master = {
 	.ndo_udp_tunnel_del	= mlx4_en_del_vxlan_port,
 	.ndo_features_check	= mlx4_en_features_check,
 	.ndo_set_tx_maxrate	= mlx4_en_set_tx_maxrate,
+	.ndo_xdp		= mlx4_xdp,
 };
 
 struct mlx4_en_bond {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index c1b3a9c..6729545 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -32,6 +32,7 @@ 
  */
 
 #include <net/busy_poll.h>
+#include <linux/bpf.h>
 #include <linux/mlx4/cq.h>
 #include <linux/slab.h>
 #include <linux/mlx4/qp.h>
@@ -509,6 +510,8 @@  void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_rx_ring *ring = *pring;
 
+	if (ring->xdp_prog)
+		bpf_prog_put(ring->xdp_prog);
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
 	vfree(ring->rx_info);
 	ring->rx_info = NULL;
@@ -743,6 +746,7 @@  int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 	struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring];
 	struct mlx4_en_rx_alloc *frags;
 	struct mlx4_en_rx_desc *rx_desc;
+	struct bpf_prog *xdp_prog;
 	struct sk_buff *skb;
 	int index;
 	int nr;
@@ -759,6 +763,8 @@  int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 	if (budget <= 0)
 		return polled;
 
+	xdp_prog = READ_ONCE(ring->xdp_prog);
+
 	/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
 	 * descriptor offset can be deduced from the CQE index instead of
 	 * reading 'cqe->index' */
@@ -835,6 +841,35 @@  int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 		l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
 			(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
 
+		/* A bpf program gets first chance to drop the packet. It may
+		 * read bytes but not past the end of the frag.
+		 */
+		if (xdp_prog) {
+			struct xdp_buff xdp;
+			dma_addr_t dma;
+			u32 act;
+
+			dma = be64_to_cpu(rx_desc->data[0].addr);
+			dma_sync_single_for_cpu(priv->ddev, dma,
+						priv->frag_info[0].frag_size,
+						DMA_FROM_DEVICE);
+
+			xdp.data = page_address(frags[0].page) +
+							frags[0].page_offset;
+			xdp.data_end = xdp.data + length;
+
+			act = bpf_prog_run_xdp(xdp_prog, &xdp);
+			switch (act) {
+			case XDP_PASS:
+				break;
+			default:
+				bpf_warn_invalid_xdp_action(act);
+			case XDP_ABORTED:
+			case XDP_DROP:
+				goto next;
+			}
+		}
+
 		if (likely(dev->features & NETIF_F_RXCSUM)) {
 			if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
 						      MLX4_CQE_STATUS_UDP)) {
@@ -1062,10 +1097,7 @@  static const int frag_sizes[] = {
 void mlx4_en_calc_rx_buf(struct net_device *dev)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
-	/* VLAN_HLEN is added twice,to support skb vlan tagged with multiple
-	 * headers. (For example: ETH_P_8021Q and ETH_P_8021AD).
-	 */
-	int eff_mtu = dev->mtu + ETH_HLEN + (2 * VLAN_HLEN);
+	int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
 	int buf_size = 0;
 	int i = 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index d39bf59..eb1238d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -164,6 +164,10 @@  enum {
 #define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN)
 
 #define MLX4_EN_MIN_MTU		46
+/* VLAN_HLEN is added twice, to support skb vlan tagged with multiple
+ * headers. (For example: ETH_P_8021Q and ETH_P_8021AD).
+ */
+#define MLX4_EN_EFF_MTU(mtu)	((mtu) + ETH_HLEN + (2 * VLAN_HLEN))
 #define ETH_BCAST		0xffffffffffffULL
 
 #define MLX4_EN_LOOPBACK_RETRIES	5
@@ -319,6 +323,7 @@  struct mlx4_en_rx_ring {
 	u8  fcs_del;
 	void *buf;
 	void *rx_info;
+	struct bpf_prog *xdp_prog;
 	unsigned long bytes;
 	unsigned long packets;
 	unsigned long csum_ok;
@@ -558,6 +563,7 @@  struct mlx4_en_priv {
 	struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS];
 	u16 num_frags;
 	u16 log_rx_info;
+	int xdp_ring_num;
 
 	struct mlx4_en_tx_ring **tx_ring;
 	struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS];