diff mbox series

[bpf-next,v2,2/3] bpf: implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap

Message ID 20190124193418.81674-3-posk@google.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap | expand

Commit Message

Peter Oskolkov Jan. 24, 2019, 7:34 p.m. UTC
This patch implements BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN
and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
to packets (e.g. IP/GRE, GUE, IPIP).

This is useful when thousands of different short-lived flows should be
encapped, each with different and dynamically determined destination.
Although lwtunnels can be used in some of these scenarios, the ability
to dynamically generate encap headers adds more flexibility, e.g.
when routing depends on the state of the host (reflected in global bpf
maps).

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 include/net/lwtunnel.h |   3 +
 net/core/filter.c      |   3 +-
 net/core/lwt_bpf.c     | 142 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 147 insertions(+), 1 deletion(-)

Comments

David Ahern Jan. 28, 2019, 8:31 p.m. UTC | #1
On 1/24/19 12:34 PM, Peter Oskolkov wrote:
> This patch implements BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
> BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN
> and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
> to packets (e.g. IP/GRE, GUE, IPIP).
> 
> This is useful when thousands of different short-lived flows should be
> encapped, each with different and dynamically determined destination.
> Although lwtunnels can be used in some of these scenarios, the ability
> to dynamically generate encap headers adds more flexibility, e.g.
> when routing depends on the state of the host (reflected in global bpf
> maps).
> 
> Signed-off-by: Peter Oskolkov <posk@google.com>
> ---
>  include/net/lwtunnel.h |   3 +
>  net/core/filter.c      |   3 +-
>  net/core/lwt_bpf.c     | 142 +++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 147 insertions(+), 1 deletion(-)
> 
> diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
> index 33fd9ba7e0e5..f0973eca8036 100644
> --- a/include/net/lwtunnel.h
> +++ b/include/net/lwtunnel.h
> @@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
>  int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
>  int lwtunnel_input(struct sk_buff *skb);
>  int lwtunnel_xmit(struct sk_buff *skb);
> +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
> +			  bool ingress);
>  
>  static inline void lwtunnel_set_redirect(struct dst_entry *dst)
>  {
> @@ -138,6 +140,7 @@ static inline void lwtunnel_set_redirect(struct dst_entry *dst)
>  		dst->input = lwtunnel_input;
>  	}
>  }
> +
>  #else
>  
>  static inline void lwtstate_free(struct lwtunnel_state *lws)
> diff --git a/net/core/filter.c b/net/core/filter.c
> index fd3ae092d3d7..81d18660c38b 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -73,6 +73,7 @@
>  #include <linux/seg6_local.h>
>  #include <net/seg6.h>
>  #include <net/seg6_local.h>
> +#include <net/lwtunnel.h>
>  
>  /**
>   *	sk_filter_trim_cap - run a packet through a socket filter
> @@ -4796,7 +4797,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
>  static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
>  			     bool ingress)
>  {
> -	return -EINVAL;  /* Implemented in the next patch. */
> +	return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
>  }
>  
>  BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
> diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
> index 3e85437f7106..a3f79bff3776 100644
> --- a/net/core/lwt_bpf.c
> +++ b/net/core/lwt_bpf.c
> @@ -16,6 +16,7 @@
>  #include <linux/types.h>
>  #include <linux/bpf.h>
>  #include <net/lwtunnel.h>
> +#include <net/ip6_route.h>
>  
>  struct bpf_lwt_prog {
>  	struct bpf_prog *prog;
> @@ -55,6 +56,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
>  
>  	switch (ret) {
>  	case BPF_OK:
> +	case BPF_LWT_REROUTE:
>  		break;
>  
>  	case BPF_REDIRECT:
> @@ -97,6 +99,8 @@ static int bpf_input(struct sk_buff *skb)
>  		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
>  		if (ret < 0)
>  			return ret;
> +		if (ret == BPF_LWT_REROUTE)
> +			return dst_input(skb);
>  	}
>  
>  	if (unlikely(!dst->lwtstate->orig_input)) {
> @@ -168,6 +172,13 @@ static int bpf_xmit(struct sk_buff *skb)
>  			return LWTUNNEL_XMIT_CONTINUE;
>  		case BPF_REDIRECT:
>  			return LWTUNNEL_XMIT_DONE;
> +		case BPF_LWT_REROUTE:
> +			ret = dst_output(dev_net(skb_dst(skb)->dev),
> +					 skb->sk, skb);
> +			if (unlikely(ret))
> +				return ret;
> +			/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
> +			return LWTUNNEL_XMIT_DONE;
>  		default:
>  			return ret;
>  		}
> @@ -389,6 +400,137 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
>  	.owner		= THIS_MODULE,
>  };
>  
> +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
> +{
> +	struct dst_entry *dst = NULL;
> +	struct iphdr *iph;
> +	bool ipv4;
> +	int err;
> +
> +	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
> +		return -EINVAL;
> +
> +	/* validate protocol and length */
> +	iph = (struct iphdr *)hdr;
> +	if (iph->version == 4) {
> +		ipv4 = true;
> +		if (iph->ihl * 4 > len)
> +			return -EINVAL;
> +	} else if (iph->version == 6) {
> +		ipv4 = false;
> +		if (unlikely(len < sizeof(struct ipv6hdr)))
> +			return -EINVAL;
> +	} else {
> +		return -EINVAL;
> +	}
> +
> +	/* allocate enough space for the encap headers + L2 hdr */
> +	if (ingress) {
> +		err = skb_cow_head(skb, len + skb->mac_len);
> +		if (unlikely(err))
> +			return err;
> +	} else {
> +		/* ip_route_input_noref below does route lookup and dst
> +		 * drop/set for ingress. There is no similar function for
> +		 * egress, so we need to do route lookup and replace skb's
> +		 * dst in this function.
> +		 */
> +		struct sock *sk;
> +		struct net *net;
> +
> +		sk = sk_to_full_sk(skb->sk);
> +		if (sk)
> +			net = sock_net(sk);
> +		else
> +			net = dev_net(skb_dst(skb)->dev);

This delta gets VRF tests to pass too. Also, you should be able to
always get net from the device.

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 526b7cfc6d52..79feebd6da34 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -436,20 +436,24 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
                 * egress, so we need to do route lookup and replace skb's
                 * dst in this function.
                 */
+               struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+               int oif = l3mdev ? l3mdev->ifindex : 0;
                struct sock *sk;
                struct net *net;

                sk = sk_to_full_sk(skb->sk);
-               if (sk)
+               if (sk) {
+                       if (sk->sk_bound_dev_if)
+                               oif = sk->sk_bound_dev_if;
                        net = sock_net(sk);
-               else
+               } else
                        net = dev_net(skb_dst(skb)->dev);

                if (ipv4) {
                        struct flowi4 fl4 = {0};
                        struct rtable *rt;

-                       fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
+                       fl4.flowi4_oif = oif;
                        fl4.flowi4_mark = skb->mark;
                        fl4.flowi4_uid = sock_net_uid(net, sk);
                        fl4.flowi4_tos = RT_TOS(iph->tos);
@@ -466,7 +470,7 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
                        struct ipv6hdr *iph6 = (struct ipv6hdr *)hdr;
                        struct flowi6 fl6 = {0};

-                       fl6.flowi6_oif = sk ? sk->sk_bound_dev_if : 0;
+                       fl6.flowi6_oif = oif;
                        fl6.flowi6_mark = skb->mark;
                        fl6.flowi6_uid = sock_net_uid(net, sk);
                        fl6.flowlabel = ip6_flowinfo(iph6);

> +
> +		if (ipv4) {
> +			struct flowi4 fl4 = {0};
> +			struct rtable *rt;
> +
> +			fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
> +			fl4.flowi4_mark = skb->mark;
> +			fl4.flowi4_uid = sock_net_uid(net, sk);
> +			fl4.flowi4_tos = RT_TOS(iph->tos);
> +			fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
> +			fl4.flowi4_proto = iph->protocol;
> +			fl4.daddr = iph->daddr;
> +			fl4.saddr = iph->saddr;
> +
> +			rt = ip_route_output_key(net, &fl4);
> +			if (IS_ERR(rt) || rt->dst.error)
> +				return -EINVAL;
> +			dst = &rt->dst;
> +		} else {
> +			struct ipv6hdr *iph6 = (struct ipv6hdr *)hdr;
> +			struct flowi6 fl6 = {0};
> +
> +			fl6.flowi6_oif = sk ? sk->sk_bound_dev_if : 0;
> +			fl6.flowi6_mark = skb->mark;
> +			fl6.flowi6_uid = sock_net_uid(net, sk);
> +			fl6.flowlabel = ip6_flowinfo(iph6);
> +			fl6.flowi6_proto = iph6->nexthdr;
> +			fl6.daddr = iph6->daddr;
> +			fl6.saddr = iph6->saddr;
> +
> +			dst = ip6_route_output(net, skb->sk, &fl6);
> +			if (IS_ERR(dst) || dst->error)
> +				return -EINVAL;
> +		}
> +
> +		err = skb_cow_head(skb, len + LL_RESERVED_SPACE(dst->dev));
> +		if (unlikely(err))
> +			return err;
> +	}
> +
> +	/* push the encap headers and fix pointers */
> +	skb_reset_inner_headers(skb);
> +	skb->encapsulation = 1;
> +	skb_push(skb, len);
> +	if (ingress)
> +		skb_postpush_rcsum(skb, iph, len);
> +	skb_reset_network_header(skb);
> +	iph = ip_hdr(skb);
> +	memcpy(iph, hdr, len);

Calling it iph and using ip_hdr seems wrong given that hdr can also be
IPv6. Why not just use skb_network_header?

> +	bpf_compute_data_pointers(skb);
> +
> +	/* final skb touches + routing */
> +	if (ipv4) {
> +		skb->protocol = htons(ETH_P_IP);
> +		if (iph->ihl * 4 < len)
> +			skb_set_transport_header(skb, iph->ihl * 4);
> +
> +		if (!iph->check)
> +			iph->check = ip_fast_csum((unsigned char *)iph,
> +						  iph->ihl);
> +
> +		if (ingress) {
> +			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
> +						   iph->tos, skb_dst(skb)->dev);
> +			if (err)
> +				return err;
> +		} else {
> +			skb_dst_drop(skb);
> +			skb_dst_set(skb, dst);
> +		}
> +	} else {
> +		skb->protocol = htons(ETH_P_IPV6);
> +		if (sizeof(struct ipv6hdr) < len)
> +			skb_set_transport_header(skb, sizeof(struct ipv6hdr));
> +
> +		if (ingress) {
> +			ip6_route_input(skb);
> +			if (skb_dst(skb)->error)
> +				return skb_dst(skb)->error;
> +		} else {
> +			skb_dst_drop(skb);
> +			skb_dst_set(skb, dst);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int __init bpf_lwt_init(void)
>  {
>  	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
>
diff mbox series

Patch

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 33fd9ba7e0e5..f0973eca8036 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -126,6 +126,8 @@  int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
 int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int lwtunnel_input(struct sk_buff *skb);
 int lwtunnel_xmit(struct sk_buff *skb);
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
+			  bool ingress);
 
 static inline void lwtunnel_set_redirect(struct dst_entry *dst)
 {
@@ -138,6 +140,7 @@  static inline void lwtunnel_set_redirect(struct dst_entry *dst)
 		dst->input = lwtunnel_input;
 	}
 }
+
 #else
 
 static inline void lwtstate_free(struct lwtunnel_state *lws)
diff --git a/net/core/filter.c b/net/core/filter.c
index fd3ae092d3d7..81d18660c38b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -73,6 +73,7 @@ 
 #include <linux/seg6_local.h>
 #include <net/seg6.h>
 #include <net/seg6_local.h>
+#include <net/lwtunnel.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -4796,7 +4797,7 @@  static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
 static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
 			     bool ingress)
 {
-	return -EINVAL;  /* Implemented in the next patch. */
+	return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
 }
 
 BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 3e85437f7106..a3f79bff3776 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -16,6 +16,7 @@ 
 #include <linux/types.h>
 #include <linux/bpf.h>
 #include <net/lwtunnel.h>
+#include <net/ip6_route.h>
 
 struct bpf_lwt_prog {
 	struct bpf_prog *prog;
@@ -55,6 +56,7 @@  static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 
 	switch (ret) {
 	case BPF_OK:
+	case BPF_LWT_REROUTE:
 		break;
 
 	case BPF_REDIRECT:
@@ -97,6 +99,8 @@  static int bpf_input(struct sk_buff *skb)
 		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
 		if (ret < 0)
 			return ret;
+		if (ret == BPF_LWT_REROUTE)
+			return dst_input(skb);
 	}
 
 	if (unlikely(!dst->lwtstate->orig_input)) {
@@ -168,6 +172,13 @@  static int bpf_xmit(struct sk_buff *skb)
 			return LWTUNNEL_XMIT_CONTINUE;
 		case BPF_REDIRECT:
 			return LWTUNNEL_XMIT_DONE;
+		case BPF_LWT_REROUTE:
+			ret = dst_output(dev_net(skb_dst(skb)->dev),
+					 skb->sk, skb);
+			if (unlikely(ret))
+				return ret;
+			/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
+			return LWTUNNEL_XMIT_DONE;
 		default:
 			return ret;
 		}
@@ -389,6 +400,137 @@  static const struct lwtunnel_encap_ops bpf_encap_ops = {
 	.owner		= THIS_MODULE,
 };
 
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
+{
+	struct dst_entry *dst = NULL;
+	struct iphdr *iph;
+	bool ipv4;
+	int err;
+
+	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
+		return -EINVAL;
+
+	/* validate protocol and length */
+	iph = (struct iphdr *)hdr;
+	if (iph->version == 4) {
+		ipv4 = true;
+		if (iph->ihl * 4 > len)
+			return -EINVAL;
+	} else if (iph->version == 6) {
+		ipv4 = false;
+		if (unlikely(len < sizeof(struct ipv6hdr)))
+			return -EINVAL;
+	} else {
+		return -EINVAL;
+	}
+
+	/* allocate enough space for the encap headers + L2 hdr */
+	if (ingress) {
+		err = skb_cow_head(skb, len + skb->mac_len);
+		if (unlikely(err))
+			return err;
+	} else {
+		/* ip_route_input_noref below does route lookup and dst
+		 * drop/set for ingress. There is no similar function for
+		 * egress, so we need to do route lookup and replace skb's
+		 * dst in this function.
+		 */
+		struct sock *sk;
+		struct net *net;
+
+		sk = sk_to_full_sk(skb->sk);
+		if (sk)
+			net = sock_net(sk);
+		else
+			net = dev_net(skb_dst(skb)->dev);
+
+		if (ipv4) {
+			struct flowi4 fl4 = {0};
+			struct rtable *rt;
+
+			fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
+			fl4.flowi4_mark = skb->mark;
+			fl4.flowi4_uid = sock_net_uid(net, sk);
+			fl4.flowi4_tos = RT_TOS(iph->tos);
+			fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+			fl4.flowi4_proto = iph->protocol;
+			fl4.daddr = iph->daddr;
+			fl4.saddr = iph->saddr;
+
+			rt = ip_route_output_key(net, &fl4);
+			if (IS_ERR(rt) || rt->dst.error)
+				return -EINVAL;
+			dst = &rt->dst;
+		} else {
+			struct ipv6hdr *iph6 = (struct ipv6hdr *)hdr;
+			struct flowi6 fl6 = {0};
+
+			fl6.flowi6_oif = sk ? sk->sk_bound_dev_if : 0;
+			fl6.flowi6_mark = skb->mark;
+			fl6.flowi6_uid = sock_net_uid(net, sk);
+			fl6.flowlabel = ip6_flowinfo(iph6);
+			fl6.flowi6_proto = iph6->nexthdr;
+			fl6.daddr = iph6->daddr;
+			fl6.saddr = iph6->saddr;
+
+			dst = ip6_route_output(net, skb->sk, &fl6);
+			if (IS_ERR(dst) || dst->error)
+				return -EINVAL;
+		}
+
+		err = skb_cow_head(skb, len + LL_RESERVED_SPACE(dst->dev));
+		if (unlikely(err))
+			return err;
+	}
+
+	/* push the encap headers and fix pointers */
+	skb_reset_inner_headers(skb);
+	skb->encapsulation = 1;
+	skb_push(skb, len);
+	if (ingress)
+		skb_postpush_rcsum(skb, iph, len);
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	memcpy(iph, hdr, len);
+	bpf_compute_data_pointers(skb);
+
+	/* final skb touches + routing */
+	if (ipv4) {
+		skb->protocol = htons(ETH_P_IP);
+		if (iph->ihl * 4 < len)
+			skb_set_transport_header(skb, iph->ihl * 4);
+
+		if (!iph->check)
+			iph->check = ip_fast_csum((unsigned char *)iph,
+						  iph->ihl);
+
+		if (ingress) {
+			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+						   iph->tos, skb_dst(skb)->dev);
+			if (err)
+				return err;
+		} else {
+			skb_dst_drop(skb);
+			skb_dst_set(skb, dst);
+		}
+	} else {
+		skb->protocol = htons(ETH_P_IPV6);
+		if (sizeof(struct ipv6hdr) < len)
+			skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+		if (ingress) {
+			ip6_route_input(skb);
+			if (skb_dst(skb)->error)
+				return skb_dst(skb)->error;
+		} else {
+			skb_dst_drop(skb);
+			skb_dst_set(skb, dst);
+		}
+	}
+
+	return 0;
+}
+
 static int __init bpf_lwt_init(void)
 {
 	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);