[RFC,2/9] veth: Add driver XDP

Message ID 20180424143923.26519-3-toshiaki.makita1@gmail.com
State RFC, archived
Delegated to: David Miller
Series: veth: Driver XDP

Commit Message

Toshiaki Makita April 24, 2018, 2:39 p.m. UTC
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

This is a basic implementation of veth driver XDP.

Incoming packets are sent from the peer veth device in the form of an
skb, so this generally does the same thing as generic XDP.

This by itself is not very useful, but it is a starting point for
implementing other useful veth XDP features like TX and REDIRECT.
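
For illustration, the receive path with this patch applied looks roughly
like the following (a simplified sketch of the code below, with error
handling and stats omitted):

	veth_xmit(skb, dev)
	  -> veth_forward_skb(rcv, skb)
	       -> __dev_forward_skb(rcv, skb)    /* scrub skb, check length */
	       -> veth_xdp_rx(rcv, skb)          /* if forward checks pass */
	            -> veth_xdp_rcv_skb(rcv, skb)  /* run the XDP program */
	            -> netif_rx(skb)               /* on XDP_PASS */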

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 drivers/net/veth.c | 210 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 205 insertions(+), 5 deletions(-)

Comments

Jesper Dangaard Brouer April 25, 2018, 8:39 p.m. UTC | #1
On Tue, 24 Apr 2018 23:39:16 +0900
Toshiaki Makita <toshiaki.makita1@gmail.com> wrote:

> This is a basic implementation of veth driver XDP.
> 
> Incoming packets are sent from the peer veth device in the form of an
> skb, so this generally does the same thing as generic XDP.

I'm unsure what context you are calling veth_xdp_rcv_skb() from.  The
XDP RX side depends heavily on the protection provided by the NAPI
context.  It looks like you are adding the NAPI handler later.
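
For reference, a conventional driver runs its XDP program from inside
its NAPI poll callback, so the program executes in softirq context with
the usual per-CPU guarantees.  Roughly like this (an illustrative
sketch; the "example_*" names are made up, not from any real driver):

	struct example_priv {
		struct napi_struct napi;
		struct bpf_prog __rcu *xdp_prog;
	};

	static int example_poll(struct napi_struct *napi, int budget)
	{
		struct example_priv *priv =
			container_of(napi, struct example_priv, napi);
		struct bpf_prog *xdp_prog;
		int done = 0;

		rcu_read_lock();
		xdp_prog = rcu_dereference(priv->xdp_prog);
		while (done < budget) {
			struct xdp_buff xdp;
			u32 act;

			/* ... fill &xdp from the next RX descriptor ... */
			act = bpf_prog_run_xdp(xdp_prog, &xdp);
			/* ... handle XDP_PASS/XDP_TX/XDP_DROP/... */
			done++;
		}
		rcu_read_unlock();

		if (done < budget)
			napi_complete_done(napi, done);
		return done;
	}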
Toshiaki Makita April 26, 2018, 10:46 a.m. UTC | #2
Hi Jesper,

Thanks for taking a look!

On 2018/04/26 5:39, Jesper Dangaard Brouer wrote:
> On Tue, 24 Apr 2018 23:39:16 +0900
> Toshiaki Makita <toshiaki.makita1@gmail.com> wrote:
> 
>> This is a basic implementation of veth driver XDP.
>>
>> Incoming packets are sent from the peer veth device in the form of an
>> skb, so this generally does the same thing as generic XDP.
> 
> I'm unsure what context you are calling veth_xdp_rcv_skb() from.  The
> XDP RX side depends heavily on the protection provided by the NAPI
> context.  It looks like you are adding the NAPI handler later.

This is called from softirq context, or from a context with bh
disabled.  I can see that XDP_REDIRECT depends on NAPI, since it uses
per-cpu temporary storage that is flushed from ndo_xdp_flush, but I
thought XDP_DROP and XDP_PASS were safe here.  Also, this is basically
the same context as generic XDP, which is called from
netif_rx_internal.

Anyway, this is a temporary state and is not needed.  Since it does not
seem to help review, I'll squash this patch and patch 4 (the NAPI
patch).
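
To illustrate the parallel: generic XDP runs its program from
netif_rx_internal() before the skb is enqueued to the backlog, roughly
like this (paraphrased from net/core/dev.c of this era, trimmed):

	/* in netif_rx_internal() */
	preempt_disable();
	rcu_read_lock();
	ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
	rcu_read_unlock();
	preempt_enable();

veth_xdp_rcv_skb() is invoked from the same kind of bh-disabled context
via veth_xmit() -> veth_forward_skb().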

Patch

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a69ad39ee57e..9c4197306716 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -19,10 +19,15 @@ 
 #include <net/xfrm.h>
 #include <linux/veth.h>
 #include <linux/module.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/bpf_trace.h>
 
 #define DRV_NAME	"veth"
 #define DRV_VERSION	"1.0"
 
+#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)
+
 struct pcpu_vstats {
 	u64			packets;
 	u64			bytes;
@@ -30,9 +35,11 @@  struct pcpu_vstats {
 };
 
 struct veth_priv {
+	struct bpf_prog __rcu	*xdp_prog;
 	struct net_device __rcu	*peer;
 	atomic64_t		dropped;
 	unsigned		requested_headroom;
+	struct xdp_rxq_info	xdp_rxq;
 };
 
 /*
@@ -98,6 +105,25 @@  static const struct ethtool_ops veth_ethtool_ops = {
 	.get_link_ksettings	= veth_get_link_ksettings,
 };
 
+/* general routines */
+
+static struct sk_buff *veth_xdp_rcv_skb(struct net_device *dev,
+					struct sk_buff *skb);
+
+static int veth_xdp_rx(struct net_device *dev, struct sk_buff *skb)
+{
+	skb = veth_xdp_rcv_skb(dev, skb);
+	if (!skb)
+		return NET_RX_DROP;
+
+	return netif_rx(skb);
+}
+
+static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+	return __dev_forward_skb(dev, skb) ?: veth_xdp_rx(dev, skb);
+}
+
 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
@@ -111,7 +137,7 @@  static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto drop;
 	}
 
-	if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
+	if (likely(veth_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
 		struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
 
 		u64_stats_update_begin(&stats->syncp);
@@ -126,10 +152,6 @@  static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
-/*
- * general routines
- */
-
 static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
@@ -179,19 +201,152 @@  static void veth_set_multicast_list(struct net_device *dev)
 {
 }
 
+static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
+				      int buflen)
+{
+	struct sk_buff *skb;
+
+	if (!buflen) {
+		buflen = SKB_DATA_ALIGN(headroom + len) +
+			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	}
+	skb = build_skb(head, buflen);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, headroom);
+	skb_put(skb, len);
+
+	return skb;
+}
+
+static struct sk_buff *veth_xdp_rcv_skb(struct net_device *dev,
+					struct sk_buff *skb)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	u32 pktlen, headroom, act, metalen;
+	int size, mac_len, delta, off;
+	struct bpf_prog *xdp_prog;
+	struct xdp_buff xdp;
+	void *orig_data;
+
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(priv->xdp_prog);
+	if (!xdp_prog) {
+		rcu_read_unlock();
+		goto out;
+	}
+
+	mac_len = skb->data - skb_mac_header(skb);
+	pktlen = skb->len + mac_len;
+	size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
+	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	if (size > PAGE_SIZE)
+		goto drop;
+
+	headroom = skb_headroom(skb) - mac_len;
+	if (skb_shared(skb) || skb_head_is_locked(skb) ||
+	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
+		struct sk_buff *nskb;
+		void *head, *start;
+		struct page *page;
+		int head_off;
+
+		page = alloc_page(GFP_ATOMIC);
+		if (!page)
+			goto drop;
+
+		head = page_address(page);
+		start = head + VETH_XDP_HEADROOM;
+		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
+			page_frag_free(head);
+			goto drop;
+		}
+
+		nskb = veth_build_skb(head,
+				      VETH_XDP_HEADROOM + mac_len, skb->len,
+				      PAGE_SIZE);
+		if (!nskb) {
+			page_frag_free(head);
+			goto drop;
+		}
+
+		skb_copy_header(nskb, skb);
+		head_off = skb_headroom(nskb) - skb_headroom(skb);
+		skb_headers_offset_update(nskb, head_off);
+		dev_consume_skb_any(skb);
+		skb = nskb;
+	}
+
+	xdp.data_hard_start = skb->head;
+	xdp.data = skb_mac_header(skb);
+	xdp.data_end = xdp.data + pktlen;
+	xdp.data_meta = xdp.data;
+	xdp.rxq = &priv->xdp_rxq;
+	orig_data = xdp.data;
+
+	act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+	switch (act) {
+	case XDP_PASS:
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+	case XDP_ABORTED:
+		trace_xdp_exception(dev, xdp_prog, act);
+	case XDP_DROP:
+		goto drop;
+	}
+	rcu_read_unlock();
+
+	delta = orig_data - xdp.data;
+	off = mac_len + delta;
+	if (off > 0)
+		__skb_push(skb, off);
+	else if (off < 0)
+		__skb_pull(skb, -off);
+	skb->mac_header -= delta;
+	skb->protocol = eth_type_trans(skb, dev);
+
+	metalen = xdp.data - xdp.data_meta;
+	if (metalen)
+		skb_metadata_set(skb, metalen);
+out:
+	return skb;
+drop:
+	rcu_read_unlock();
+	dev_kfree_skb_any(skb);
+	return NULL;
+}
+
 static int veth_open(struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
 	struct net_device *peer = rtnl_dereference(priv->peer);
+	int err;
 
 	if (!peer)
 		return -ENOTCONN;
 
+	err = xdp_rxq_info_reg(&priv->xdp_rxq, dev, 0);
+	if (err < 0)
+		return err;
+
+	err = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq,
+					 MEM_TYPE_PAGE_SHARED, NULL);
+	if (err < 0)
+		goto err_reg_mem;
+
 	if (peer->flags & IFF_UP) {
 		netif_carrier_on(dev);
 		netif_carrier_on(peer);
 	}
+
 	return 0;
+err_reg_mem:
+	xdp_rxq_info_unreg(&priv->xdp_rxq);
+
+	return err;
 }
 
 static int veth_close(struct net_device *dev)
@@ -203,6 +358,8 @@  static int veth_close(struct net_device *dev)
 	if (peer)
 		netif_carrier_off(peer);
 
+	xdp_rxq_info_unreg(&priv->xdp_rxq);
+
 	return 0;
 }
 
@@ -276,6 +433,48 @@  static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
 	rcu_read_unlock();
 }
 
+static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+			struct netlink_ext_ack *extack)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+
+	old_prog = rtnl_dereference(priv->xdp_prog);
+
+	rcu_assign_pointer(priv->xdp_prog, prog);
+
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	return 0;
+}
+
+static u32 veth_xdp_query(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	const struct bpf_prog *xdp_prog;
+
+	xdp_prog = rtnl_dereference(priv->xdp_prog);
+	if (xdp_prog)
+		return xdp_prog->aux->id;
+
+	return 0;
+}
+
+static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return veth_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_QUERY_PROG:
+		xdp->prog_id = veth_xdp_query(dev);
+		xdp->prog_attached = !!xdp->prog_id;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops veth_netdev_ops = {
 	.ndo_init            = veth_dev_init,
 	.ndo_open            = veth_open,
@@ -290,6 +489,7 @@  static const struct net_device_ops veth_netdev_ops = {
 	.ndo_get_iflink		= veth_get_iflink,
 	.ndo_features_check	= passthru_features_check,
 	.ndo_set_rx_headroom	= veth_set_rx_headroom,
+	.ndo_bpf		= veth_xdp,
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
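
For testing, a minimal XDP program such as the following could be
attached to a veth device once this patch is applied (a hypothetical
example, not part of the patch; built with something like
"clang -O2 -target bpf -c xdp_pass.c -o xdp_pass.o"):

	/* xdp_pass.c - pass every packet up the stack. */
	#include <linux/bpf.h>

	#ifndef SEC
	#define SEC(name) __attribute__((section(name), used))
	#endif

	SEC("xdp")
	int xdp_pass_prog(struct xdp_md *ctx)
	{
		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";

It could then be attached with iproute2, e.g.
"ip link set dev veth0 xdp obj xdp_pass.o sec xdp", exercising the
XDP_SETUP_PROG path in veth_xdp() above.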