diff mbox series

[bpf-next,v5,3/5] bpf: add handling of BPF_LWT_REROUTE to lwt_bpf.c

Message ID 20190130235136.136527-4-posk@google.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap | expand

Commit Message

Peter Oskolkov Jan. 30, 2019, 11:51 p.m. UTC
This patch builds on top of the previous patch in the patchset,
which added BPF_LWT_ENCAP_IP mode to bpf_lwt_push_encap. As the
encapping can result in the skb needing to go via a different
interface/route/dst, bpf programs can indicate this by returning
BPF_LWT_REROUTE, which triggers a new route lookup for the skb.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 net/core/lwt_bpf.c | 125 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
diff mbox series

Patch

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 6a6e9acab73d..20581567f84a 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -16,6 +16,7 @@ 
 #include <linux/types.h>
 #include <linux/bpf.h>
 #include <net/lwtunnel.h>
+#include <net/ip6_route.h>
 
 struct bpf_lwt_prog {
 	struct bpf_prog *prog;
@@ -55,6 +56,7 @@  static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 
 	switch (ret) {
 	case BPF_OK:
+	case BPF_LWT_REROUTE:
 		break;
 
 	case BPF_REDIRECT:
@@ -87,6 +89,32 @@  static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 	return ret;
 }
 
+static int bpf_lwt_input_reroute(struct sk_buff *skb)
+{
+	int err = -EINVAL;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct iphdr *iph = ip_hdr(skb);
+
+		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					   iph->tos, skb_dst(skb)->dev);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ip6_route_input(skb);
+		err = skb_dst(skb)->error;
+	} else {
+		pr_warn_once("BPF_LWT_REROUTE input: unsupported proto %d\n",
+			     skb->protocol);
+	}
+
+	if (err)
+		goto err;
+	return dst_input(skb);
+
+err:
+	kfree_skb(skb);
+	return err;
+}
+
 static int bpf_input(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -98,6 +126,8 @@  static int bpf_input(struct sk_buff *skb)
 		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
 		if (ret < 0)
 			return ret;
+		if (ret == BPF_LWT_REROUTE)
+			return bpf_lwt_input_reroute(skb);
 	}
 
 	if (unlikely(!dst->lwtstate->orig_input)) {
@@ -147,6 +177,90 @@  static int xmit_check_hhlen(struct sk_buff *skb)
 	return 0;
 }
 
+static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
+{
+	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+	int oif = l3mdev ? l3mdev->ifindex : 0;
+	struct dst_entry *dst = NULL;
+	struct sock *sk;
+	struct net *net;
+	bool ipv4;
+	int err;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		ipv4 = true;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ipv4 = false;
+	} else {
+		pr_warn_once("BPF_LWT_REROUTE xmit: unsupported proto %d\n",
+			     skb->protocol);
+		return -EINVAL;
+	}
+
+	sk = sk_to_full_sk(skb->sk);
+	if (sk) {
+		if (sk->sk_bound_dev_if)
+			oif = sk->sk_bound_dev_if;
+		net = sock_net(sk);
+	} else {
+		net = dev_net(skb_dst(skb)->dev);
+	}
+
+	if (ipv4) {
+		struct iphdr *iph = ip_hdr(skb);
+		struct flowi4 fl4 = {0};
+		struct rtable *rt;
+
+		fl4.flowi4_oif = oif;
+		fl4.flowi4_mark = skb->mark;
+		fl4.flowi4_uid = sock_net_uid(net, sk);
+		fl4.flowi4_tos = RT_TOS(iph->tos);
+		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+		fl4.flowi4_proto = iph->protocol;
+		fl4.daddr = iph->daddr;
+		fl4.saddr = iph->saddr;
+
+		rt = ip_route_output_key(net, &fl4);
+		if (IS_ERR(rt) || rt->dst.error)
+			return -EINVAL;
+		dst = &rt->dst;
+	} else {
+		struct ipv6hdr *iph6 = ipv6_hdr(skb);
+		struct flowi6 fl6 = {0};
+
+		fl6.flowi6_oif = oif;
+		fl6.flowi6_mark = skb->mark;
+		fl6.flowi6_uid = sock_net_uid(net, sk);
+		fl6.flowlabel = ip6_flowinfo(iph6);
+		fl6.flowi6_proto = iph6->nexthdr;
+		fl6.daddr = iph6->daddr;
+		fl6.saddr = iph6->saddr;
+
+		dst = ip6_route_output(net, skb->sk, &fl6);
+		if (IS_ERR(dst) || dst->error)
+			return -EINVAL;
+	}
+
+	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
+	 * was done for the previous dst, so we are doing it here again, in
+	 * case the new dst needs much more space. The call below is a noop
+	 * if there is enough header space in skb.
+	 */
+	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+	if (unlikely(err))
+		return err;
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+	if (unlikely(err))
+		return err;
+
+	/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
+	return LWTUNNEL_XMIT_DONE;
+}
+
 static int bpf_xmit(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -154,11 +268,20 @@  static int bpf_xmit(struct sk_buff *skb)
 
 	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
 	if (bpf->xmit.prog) {
+		__be16 proto = skb->protocol;
 		int ret;
 
 		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
 		switch (ret) {
 		case BPF_OK:
+			/* If the header changed, e.g. via bpf_lwt_push_encap,
+			 * BPF_LWT_REROUTE below should have been used if the
+			 * protocol was also changed.
+			 */
+			if (skb->protocol != proto) {
+				kfree_skb(skb);
+				return -EINVAL;
+			}
 			/* If the header was expanded, headroom might be too
 			 * small for L2 header to come, expand as needed.
 			 */
@@ -169,6 +292,8 @@  static int bpf_xmit(struct sk_buff *skb)
 			return LWTUNNEL_XMIT_CONTINUE;
 		case BPF_REDIRECT:
 			return LWTUNNEL_XMIT_DONE;
+		case BPF_LWT_REROUTE:
+			return bpf_lwt_xmit_reroute(skb);
 		default:
 			return ret;
 		}