diff mbox

[net-next,v2,3/5] bpf: BPF for lightweight tunnel encapsulation

Message ID d8c7e07ac7d289a2846af8d659287f3512a4edec.1477959702.git.tgraf@suug.ch
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Thomas Graf Nov. 1, 2016, 12:37 a.m. UTC
Register two new BPF prog types BPF_PROG_TYPE_LWT_IN and
BPF_PROG_TYPE_LWT_OUT which are invoked if a route contains a
LWT redirection of type LWTUNNEL_ENCAP_BPF.

The separate program types are required because manipulation of
packet data is only allowed on the output and transmit path as
the subsequent dst_input() call path assumes an IP header
validated by ip_rcv(). The BPF programs will be handed an skb
with the L3 header attached and may return one of the following
return codes:

 BPF_OK - Continue routing as per nexthop
 BPF_DROP - Drop skb and return EPERM
 BPF_REDIRECT - Redirect skb to device as per redirect() helper.
                (Only valid on lwtunnel_xmit() hook)

The return codes are binary compatible with their TC_ACT_
relatives to ease compatibility.

A new helper bpf_skb_push() is added which allows to preprend an
L2 header in front of the skb, extend the existing L3 header, or
both. This allows to address a wide range of issues:
 - Optimize L2 header construction when L2 information is always
   static to avoid ARP/NDisc lookup.
 - Extend IP header to add additional IP options.
 - Perform simple encapsulation where offload is of no concern.
   (The existing funtionality to attach a tunnel key to the skb
    and redirect to a tunnel net_device to allow for offload
    continues to work obviously).

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 include/linux/filter.h        |   2 +-
 include/uapi/linux/bpf.h      |  37 +++-
 include/uapi/linux/lwtunnel.h |  21 ++
 kernel/bpf/verifier.c         |  16 +-
 net/Kconfig                   |   1 +
 net/core/Makefile             |   2 +-
 net/core/filter.c             | 148 ++++++++++++-
 net/core/lwt_bpf.c            | 504 ++++++++++++++++++++++++++++++++++++++++++
 net/core/lwtunnel.c           |   1 +
 9 files changed, 725 insertions(+), 7 deletions(-)
 create mode 100644 net/core/lwt_bpf.c

Comments

kernel test robot Nov. 1, 2016, 1:10 a.m. UTC | #1
Hi Thomas,

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Thomas-Graf/bpf-BPF-for-lightweight-tunnel-encapsulation/20161101-084038
config: m68k-sun3_defconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 4.9.0
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=m68k 

All errors (new ones prefixed by >>):

   net/built-in.o: In function `bpf_output':
>> lwt_bpf.c:(.text+0x2cff4): undefined reference to `ip6_route_output_flags'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot Nov. 1, 2016, 2:39 a.m. UTC | #2
Hi Thomas,

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Thomas-Graf/bpf-BPF-for-lightweight-tunnel-encapsulation/20161101-084038
config: x86_64-randconfig-s0-11010954 (attached as .config)
compiler: gcc-4.4 (Debian 4.4.7-8) 4.4.7
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All warnings (new ones prefixed by >>):

   net/core/lwt_bpf.c: In function 'bpf_lwt_lookup6':
>> net/core/lwt_bpf.c:132: warning: initialized field with side-effects overwritten
   net/core/lwt_bpf.c:132: warning: (near initialization for 'fl6')

vim +132 net/core/lwt_bpf.c

   116		}
   117	
   118		return dst->lwtstate->orig_input(skb);
   119	}
   120	
   121	#if IS_ENABLED(CONFIG_IPV6)
   122	static struct dst_entry *bpf_lwt_lookup6(struct net *net, struct sk_buff *skb,
   123						 struct bpf_lwt *bpf)
   124	{
   125		struct ipv6hdr *ip6h = ipv6_hdr(skb);
   126		struct dst_entry *dst;
   127		struct flowi6 fl6 = {
   128			.daddr = ip6h->daddr,
   129			.saddr = ip6h->saddr,
   130			.flowlabel = ip6_flowinfo(ip6h),
   131			.flowi6_mark = skb->mark,
 > 132			.flowi6_proto = ip6h->nexthdr,
   133			.flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
   134		};
   135	
   136		dst = ip6_route_output(net, skb->sk, &fl6);
   137		if (unlikely(dst->error)) {
   138			int err = dst->error;
   139			dst_release(dst);
   140			return ERR_PTR(err);

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
David Ahern Nov. 1, 2016, 8:11 p.m. UTC | #3
On 10/31/16 6:37 PM, Thomas Graf wrote:
> Register two new BPF prog types BPF_PROG_TYPE_LWT_IN and
> BPF_PROG_TYPE_LWT_OUT which are invoked if a route contains a
> LWT redirection of type LWTUNNEL_ENCAP_BPF.
> 
> The separate program types are required because manipulation of
> packet data is only allowed on the output and transmit path as
> the subsequent dst_input() call path assumes an IP header
> validated by ip_rcv(). The BPF programs will be handed an skb
> with the L3 header attached and may return one of the following
> return codes:
> 
>  BPF_OK - Continue routing as per nexthop
>  BPF_DROP - Drop skb and return EPERM
>  BPF_REDIRECT - Redirect skb to device as per redirect() helper.
>                 (Only valid on lwtunnel_xmit() hook)
> 
> The return codes are binary compatible with their TC_ACT_
> relatives to ease compatibility.
> 
> A new helper bpf_skb_push() is added which allows to preprend an
> L2 header in front of the skb, extend the existing L3 header, or
> both. This allows to address a wide range of issues:
>  - Optimize L2 header construction when L2 information is always
>    static to avoid ARP/NDisc lookup.
>  - Extend IP header to add additional IP options.
>  - Perform simple encapsulation where offload is of no concern.
>    (The existing funtionality to attach a tunnel key to the skb
>     and redirect to a tunnel net_device to allow for offload
>     continues to work obviously).

have you tested the adding of headers with large packets hitting gso and fragmentation? Wondering if mtu is used properly when headers are added and the lwt->headroom is not accounted. See 14972cbd34ff668c390cbd2e6497323484c9e812
Thomas Graf Nov. 1, 2016, 9:03 p.m. UTC | #4
On 1 November 2016 at 13:11, David Ahern <dsa@cumulusnetworks.com> wrote:
> On 10/31/16 6:37 PM, Thomas Graf wrote:
>>  - Perform simple encapsulation where offload is of no concern.
>>    (The existing funtionality to attach a tunnel key to the skb
>>     and redirect to a tunnel net_device to allow for offload
>>     continues to work obviously).
>
> have you tested the adding of headers with large packets hitting gso and fragmentation? Wondering if mtu is used properly when headers are added and the lwt->headroom is not accounted. See 14972cbd34ff668c390cbd2e6497323484c9e812

Thanks for the pointer David, I will look into this.

The packet size is currently limited to this when adding l2 headers:
skb->dev->mtu + skb->dev->hard_header_len
Roopa Prabhu Nov. 2, 2016, 1:39 p.m. UTC | #5
On 10/31/16, 5:37 PM, Thomas Graf wrote:
> Register two new BPF prog types BPF_PROG_TYPE_LWT_IN and
> BPF_PROG_TYPE_LWT_OUT which are invoked if a route contains a
> LWT redirection of type LWTUNNEL_ENCAP_BPF.
>
> The separate program types are required because manipulation of
> packet data is only allowed on the output and transmit path as
> the subsequent dst_input() call path assumes an IP header
> validated by ip_rcv(). The BPF programs will be handed an skb
> with the L3 header attached and may return one of the following
> return codes:
>
>  BPF_OK - Continue routing as per nexthop
>  BPF_DROP - Drop skb and return EPERM
>  BPF_REDIRECT - Redirect skb to device as per redirect() helper.
>                 (Only valid on lwtunnel_xmit() hook)
>
> The return codes are binary compatible with their TC_ACT_
> relatives to ease compatibility.
>
> A new helper bpf_skb_push() is added which allows to preprend an
> L2 header in front of the skb, extend the existing L3 header, or
> both. This allows to address a wide range of issues:
>  - Optimize L2 header construction when L2 information is always
>    static to avoid ARP/NDisc lookup.
>  - Extend IP header to add additional IP options.
>  - Perform simple encapsulation where offload is of no concern.
>    (The existing funtionality to attach a tunnel key to the skb
>     and redirect to a tunnel net_device to allow for offload
>     continues to work obviously).
>
> Signed-off-by: Thomas Graf <tgraf@suug.ch>
> ---
>  
[snip]
> diff --git a/net/Kconfig b/net/Kconfig
> index 7b6cd34..7554f12 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -396,6 +396,7 @@ source "net/nfc/Kconfig"
>  
>  config LWTUNNEL
>  	bool "Network light weight tunnels"
> +	depends on IPV6 || IPV6=n
>  	---help---
>  	  This feature provides an infrastructure to support light weight
>  	  tunnels like mpls. There is no netdevice associated with a light
> diff --git a/net/core/Makefile b/net/core/Makefile
> index d6508c2..a675fd3 100644
> --- a/net/core/Makefile
> +++ b/net/core/Makefile
> @@ -23,7 +23,7 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
>  obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
>  obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
>  obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
> -obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
> +obj-$(CONFIG_LWTUNNEL) += lwtunnel.o lwt_bpf.o

Any reason you want to keep lwt bpf under the main CONFIG_LWTUNNEL infra config ?.
since it is defined as yet another plug-gable encap function, seems like it will be better under a separate
CONFIG_LWTUNNEL_BPF or CONFIG_LWT_BPF that depends on CONFIG_LWTUNNEL
Thomas Graf Nov. 2, 2016, 10:58 p.m. UTC | #6
On 2 November 2016 at 07:39, Roopa Prabhu <roopa@cumulusnetworks.com> wrote:
>> diff --git a/net/core/Makefile b/net/core/Makefile
>> index d6508c2..a675fd3 100644
>> --- a/net/core/Makefile
>> +++ b/net/core/Makefile
>> @@ -23,7 +23,7 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
>>  obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
>>  obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
>>  obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
>> -obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
>> +obj-$(CONFIG_LWTUNNEL) += lwtunnel.o lwt_bpf.o
>
> Any reason you want to keep lwt bpf under the main CONFIG_LWTUNNEL infra config ?.
> since it is defined as yet another plug-gable encap function, seems like it will be better under a separate
> CONFIG_LWTUNNEL_BPF or CONFIG_LWT_BPF that depends on CONFIG_LWTUNNEL

The code was so minimal with no additional dependencies that I didn't
see a need for a separate Kconfig. I'm fine adding that in the next
iteration though. No objections.
diff mbox

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1f09c52..aad7f81 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -438,7 +438,7 @@  struct xdp_buff {
 };
 
 /* compute the linear packet data range [data, data_end) which
- * will be accessed by cls_bpf and act_bpf programs
+ * will be accessed by cls_bpf, act_bpf and lwt programs
  */
 static inline void bpf_compute_data_end(struct sk_buff *skb)
 {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e2f38e0..c034a2d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -96,6 +96,9 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_TRACEPOINT,
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
+	BPF_PROG_TYPE_LWT_IN,
+	BPF_PROG_TYPE_LWT_OUT,
+	BPF_PROG_TYPE_LWT_XMIT,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
@@ -383,6 +386,16 @@  union bpf_attr {
  *
  * int bpf_get_numa_node_id()
  *     Return: Id of current NUMA node.
+ *
+ * int bpf_skb_push()
+ *     Add room to beginning of skb and adjusts MAC header offset accordingly.
+ *     Extends/reallocaes for needed skb headeroom automatically.
+ *     May change skb data pointer and will thus invalidate any check done
+ *     for direct packet access.
+ *     @skb: pointer to skb
+ *     @len: length of header to be pushed in front
+ *     @flags: Flags (unused for now)
+ *     Return: 0 on success or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -427,7 +440,8 @@  union bpf_attr {
 	FN(skb_pull_data),		\
 	FN(csum_update),		\
 	FN(set_hash_invalid),		\
-	FN(get_numa_node_id),
+	FN(get_numa_node_id),		\
+	FN(skb_push),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -511,6 +525,27 @@  struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+/* Generic BPF return codes which all BPF program types may support.
+ * The values are binary compatible with their TC_ACT_* counter-part to
+ * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
+ * programs.
+ *
+ * XDP is handled seprately, see XDP_*.
+ */
+enum bpf_ret_code {
+	BPF_OK = 0,
+	/* 1 reserved */
+	BPF_DROP = 2,
+	/* 3-6 reserved */
+	BPF_REDIRECT = 7,
+	/* >127 are reserved for prog type specific return codes */
+};
+
+/* LWT specific return codes */
+enum bpf_lwt_ret_code {
+	BPF_LWT_REROUTE = 128,
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index a478fe8..9354d997 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -9,6 +9,7 @@  enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_IP,
 	LWTUNNEL_ENCAP_ILA,
 	LWTUNNEL_ENCAP_IP6,
+	LWTUNNEL_ENCAP_BPF,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
@@ -42,4 +43,24 @@  enum lwtunnel_ip6_t {
 
 #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1)
 
+enum {
+	LWT_BPF_PROG_UNSPEC,
+	LWT_BPF_PROG_FD,
+	LWT_BPF_PROG_NAME,
+	__LWT_BPF_PROG_MAX,
+};
+
+#define LWT_BPF_PROG_MAX (__LWT_BPF_PROG_MAX - 1)
+
+enum {
+	LWT_BPF_UNSPEC,
+	LWT_BPF_IN,
+	LWT_BPF_OUT,
+	LWT_BPF_XMIT,
+	__LWT_BPF_MAX,
+};
+
+#define LWT_BPF_MAX (__LWT_BPF_MAX - 1)
+
+
 #endif /* _UAPI_LWTUNNEL_H_ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9002575..519b58e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -633,12 +633,21 @@  static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 #define MAX_PACKET_OFF 0xffff
 
 static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
-				       const struct bpf_call_arg_meta *meta)
+				       const struct bpf_call_arg_meta *meta,
+				       enum bpf_access_type t)
 {
 	switch (env->prog->type) {
+	case BPF_PROG_TYPE_LWT_IN:
+		/* dst_input() can't write for now, orig_input may depend on
+		 * IP header parsed by ip_rcv().
+		 */
+		if (t == BPF_WRITE)
+			return false;
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
 	case BPF_PROG_TYPE_XDP:
+	case BPF_PROG_TYPE_LWT_OUT:
+	case BPF_PROG_TYPE_LWT_XMIT:
 		if (meta)
 			return meta->pkt_access;
 
@@ -837,7 +846,7 @@  static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			err = check_stack_read(state, off, size, value_regno);
 		}
 	} else if (state->regs[regno].type == PTR_TO_PACKET) {
-		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
+		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
 		}
@@ -970,7 +979,8 @@  static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
+	if (type == PTR_TO_PACKET &&
+	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
 		verbose("helper access to the packet is not allowed\n");
 		return -EACCES;
 	}
diff --git a/net/Kconfig b/net/Kconfig
index 7b6cd34..7554f12 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -396,6 +396,7 @@  source "net/nfc/Kconfig"
 
 config LWTUNNEL
 	bool "Network light weight tunnels"
+	depends on IPV6 || IPV6=n
 	---help---
 	  This feature provides an infrastructure to support light weight
 	  tunnels like mpls. There is no netdevice associated with a light
diff --git a/net/core/Makefile b/net/core/Makefile
index d6508c2..a675fd3 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -23,7 +23,7 @@  obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
 obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
 obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
-obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL) += lwtunnel.o lwt_bpf.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/filter.c b/net/core/filter.c
index cd9e2ba..325a9d8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2138,6 +2138,43 @@  static const struct bpf_func_proto bpf_skb_change_tail_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_skb_push, struct sk_buff *, skb, __u32, len, u64, flags)
+{
+	u32 new_len = skb->len + len;
+
+	/* restrict max skb size and check for overflow */
+	if (new_len > __bpf_skb_max_len(skb) || new_len < skb->len)
+		return -ERANGE;
+
+	if (flags)
+		return -EINVAL;
+
+	if (len > 0) {
+		int ret;
+
+		ret = skb_cow(skb, len);
+		if (unlikely(ret < 0))
+			return ret;
+
+		__skb_push(skb, len);
+		memset(skb->data, 0, len);
+	}
+
+	skb_reset_mac_header(skb);
+
+	bpf_compute_data_end(skb);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_push_proto = {
+	.func		= bpf_skb_push,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 bool bpf_helper_changes_skb_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
@@ -2147,7 +2184,8 @@  bool bpf_helper_changes_skb_data(void *func)
 	    func == bpf_skb_change_tail ||
 	    func == bpf_skb_pull_data ||
 	    func == bpf_l3_csum_replace ||
-	    func == bpf_l4_csum_replace)
+	    func == bpf_l4_csum_replace ||
+	    func == bpf_skb_push)
 		return true;
 
 	return false;
@@ -2578,6 +2616,75 @@  xdp_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+lwt_in_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_pull_data:
+		return &bpf_skb_pull_data_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
+	case BPF_FUNC_get_cgroup_classid:
+		return &bpf_get_cgroup_classid_proto;
+	case BPF_FUNC_get_route_realm:
+		return &bpf_get_route_realm_proto;
+	case BPF_FUNC_get_hash_recalc:
+		return &bpf_get_hash_recalc_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_skb_event_output_proto;
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_skb_under_cgroup:
+		return &bpf_skb_under_cgroup_proto;
+	default:
+		return sk_filter_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
+lwt_out_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_store_bytes:
+		return &bpf_skb_store_bytes_proto;
+	case BPF_FUNC_csum_update:
+		return &bpf_csum_update_proto;
+	case BPF_FUNC_l3_csum_replace:
+		return &bpf_l3_csum_replace_proto;
+	case BPF_FUNC_l4_csum_replace:
+		return &bpf_l4_csum_replace_proto;
+	case BPF_FUNC_set_hash_invalid:
+		return &bpf_set_hash_invalid_proto;
+	default:
+		return lwt_in_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_get_tunnel_key:
+		return &bpf_skb_get_tunnel_key_proto;
+	case BPF_FUNC_skb_set_tunnel_key:
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_skb_get_tunnel_opt:
+		return &bpf_skb_get_tunnel_opt_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_redirect:
+		return &bpf_redirect_proto;
+	case BPF_FUNC_skb_change_tail:
+		return &bpf_skb_change_tail_proto;
+	case BPF_FUNC_skb_push:
+		return &bpf_skb_push_proto;
+	default:
+		return lwt_out_func_proto(func_id);
+	}
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -2940,6 +3047,27 @@  static const struct bpf_verifier_ops xdp_ops = {
 	.convert_ctx_access	= xdp_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops lwt_in_ops = {
+	.get_func_proto		= lwt_in_func_proto,
+	.is_valid_access	= tc_cls_act_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
+};
+
+static const struct bpf_verifier_ops lwt_out_ops = {
+	.get_func_proto		= lwt_out_func_proto,
+	.is_valid_access	= tc_cls_act_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
+};
+
+static const struct bpf_verifier_ops lwt_xmit_ops = {
+	.get_func_proto		= lwt_xmit_func_proto,
+	.is_valid_access	= tc_cls_act_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
@@ -2960,12 +3088,30 @@  static struct bpf_prog_type_list xdp_type __read_mostly = {
 	.type	= BPF_PROG_TYPE_XDP,
 };
 
+static struct bpf_prog_type_list lwt_in_type __read_mostly = {
+	.ops	= &lwt_in_ops,
+	.type	= BPF_PROG_TYPE_LWT_IN,
+};
+
+static struct bpf_prog_type_list lwt_out_type __read_mostly = {
+	.ops	= &lwt_out_ops,
+	.type	= BPF_PROG_TYPE_LWT_OUT,
+};
+
+static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
+	.ops	= &lwt_xmit_ops,
+	.type	= BPF_PROG_TYPE_LWT_XMIT,
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
 	bpf_register_prog_type(&sched_cls_type);
 	bpf_register_prog_type(&sched_act_type);
 	bpf_register_prog_type(&xdp_type);
+	bpf_register_prog_type(&lwt_in_type);
+	bpf_register_prog_type(&lwt_out_type);
+	bpf_register_prog_type(&lwt_xmit_type);
 
 	return 0;
 }
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 0000000..c9b0494
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,504 @@ 
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <net/lwtunnel.h>
+#include <net/dst_cache.h>
+#include <net/ip6_route.h>
+
+struct bpf_lwt_prog {
+	struct bpf_prog *prog;
+	char *name;
+};
+
+struct bpf_lwt {
+	struct bpf_lwt_prog in;
+	struct bpf_lwt_prog out;
+	struct bpf_lwt_prog xmit;
+	struct dst_cache dst_cache;
+	int family;
+};
+
+#define MAX_PROG_NAME 256
+
+static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+	return (struct bpf_lwt *)lwt->data;
+}
+
+#define NO_REDIRECT false
+#define CAN_REDIRECT true
+
+static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
+		       struct dst_entry *dst, bool can_redirect)
+{
+	int ret;
+
+	/* Preempt disable is needed to protect per-cpu redirect_info between
+	 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
+	 * access to maps strictly require a rcu_read_lock() for protection,
+	 * mixing with BH RCU lock doesn't work.
+	 */
+	preempt_disable();
+	rcu_read_lock();
+	bpf_compute_data_end(skb);
+	ret = BPF_PROG_RUN(lwt->prog, skb);
+	rcu_read_unlock();
+
+	switch (ret) {
+	case BPF_OK:
+		break;
+
+	case BPF_REDIRECT:
+		if (!can_redirect) {
+			WARN_ONCE(1, "Illegal redirect return code in prog %s\n",
+				  lwt->name ? : "<unknown>");
+			ret = BPF_OK;
+		} else {
+			ret = skb_do_redirect(skb);
+			if (ret == 0)
+				ret = BPF_REDIRECT;
+		}
+		break;
+
+	case BPF_DROP:
+		kfree_skb(skb);
+		ret = -EPERM;
+		break;
+
+	case BPF_LWT_REROUTE:
+		break;
+
+	default:
+		WARN_ONCE(1, "Illegal LWT BPF return value %u, expect packet loss\n",
+			  ret);
+		kfree_skb(skb);
+		ret = -EINVAL;
+		break;
+	}
+
+	preempt_enable();
+
+	return ret;
+}
+
+static int bpf_input(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf;
+	int ret;
+
+	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	if (bpf->in.prog) {
+		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (unlikely(!dst->lwtstate->orig_input)) {
+		WARN_ONCE(1, "orig_input not set on dst for prog %s\n",
+			  bpf->out.name);
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return dst->lwtstate->orig_input(skb);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct dst_entry *bpf_lwt_lookup6(struct net *net, struct sk_buff *skb,
+					 struct bpf_lwt *bpf)
+{
+	struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct dst_entry *dst;
+	struct flowi6 fl6 = {
+		.daddr = ip6h->daddr,
+		.saddr = ip6h->saddr,
+		.flowlabel = ip6_flowinfo(ip6h),
+		.flowi6_mark = skb->mark,
+		.flowi6_proto = ip6h->nexthdr,
+		.flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
+	};
+
+	dst = ip6_route_output(net, skb->sk, &fl6);
+	if (unlikely(dst->error)) {
+		int err = dst->error;
+		dst_release(dst);
+		return ERR_PTR(err);
+	}
+
+	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+	if (IS_ERR(dst))
+		return dst;
+
+	dst_cache_set_ip6(&bpf->dst_cache, dst, &fl6.saddr);
+
+	return dst;
+}
+#endif
+
+static struct dst_entry *bpf_lwt_lookup4(struct net *net, struct sk_buff *skb,
+					 struct bpf_lwt *bpf)
+{
+	struct iphdr *ip4 = ip_hdr(skb);
+	struct dst_entry *dst;
+	struct rtable *rt;
+	struct flowi4 fl4 = {
+		.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
+		.flowi4_mark = skb->mark,
+		.flowi4_proto = ip4->protocol,
+		.flowi4_tos = RT_TOS(ip4->tos),
+		.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0,
+		.saddr = ip4->saddr,
+		.daddr = ip4->daddr,
+	};
+
+	rt = ip_route_output_key(net, &fl4);
+	if (IS_ERR(rt))
+		return ERR_CAST(rt);
+
+	dst = &rt->dst;
+	if (dst->error) {
+		int err = dst->error;
+		dst_release(dst);
+		return ERR_PTR(err);
+	}
+
+	dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), NULL, 0);
+	if (IS_ERR(dst))
+		return dst;
+
+	dst_cache_set_ip4(&bpf->dst_cache, dst, fl4.saddr);
+
+	return dst;
+}
+
+static int bpf_lwt_reroute(struct net *net, struct sk_buff *skb,
+			   struct bpf_lwt *bpf)
+{
+	struct dst_entry *dst;
+
+	dst = dst_cache_get(&bpf->dst_cache);
+	if (unlikely(!dst)) {
+		switch (bpf->family) {
+		case AF_INET:
+			dst = bpf_lwt_lookup4(net, skb, bpf);
+			break;
+#if IS_ENABLED(CONFIG_IPV6)
+		case AF_INET6:
+			dst = bpf_lwt_lookup6(net, skb, bpf);
+			break;
+#endif
+		default:
+			return -EAFNOSUPPORT;
+		}
+
+		if (IS_ERR(dst))
+			return PTR_ERR(dst);
+	}
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	return 0;
+}
+
+static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf;
+	int ret;
+
+	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	if (bpf->out.prog) {
+		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
+		if (ret < 0)
+			return ret;
+
+		if (ret == BPF_LWT_REROUTE) {
+			ret = bpf_lwt_reroute(net, skb, bpf);
+			if (ret < 0) {
+				kfree_skb(skb);
+				return ret;
+			}
+
+			return dst_output(net, sk, skb);
+		}
+	}
+
+	if (unlikely(!dst->lwtstate->orig_output)) {
+		WARN_ONCE(1, "orig_output not set on dst for prog %s\n",
+			  bpf->out.name);
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+static int xmit_check_hhlen(struct sk_buff *skb)
+{
+	int hh_len = skb_dst(skb)->dev->hard_header_len;
+
+	if (skb_headroom(skb) < hh_len) {
+		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int bpf_xmit(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf;
+
+	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	if (bpf->xmit.prog) {
+		int ret;
+
+		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
+		switch (ret) {
+		case BPF_OK:
+			/* If the L3 header was expanded, headroom might be too
+			 * small for L2 header now, expand as needed.
+			 */
+			ret = xmit_check_hhlen(skb);
+			if (unlikely(ret))
+				return ret;
+
+			return LWTUNNEL_XMIT_CONTINUE;
+		case BPF_REDIRECT:
+			return LWTUNNEL_XMIT_DONE;
+		default:
+			return ret;
+		}
+	}
+
+	return LWTUNNEL_XMIT_CONTINUE;
+}
+
+static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
+{
+	if (prog->prog)
+		bpf_prog_put(prog->prog);
+
+	kfree(prog->name);
+}
+
+static void bpf_destroy_state(struct lwtunnel_state *lwt)
+{
+	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+	dst_cache_destroy(&bpf->dst_cache);
+	bpf_lwt_prog_destroy(&bpf->in);
+	bpf_lwt_prog_destroy(&bpf->out);
+	bpf_lwt_prog_destroy(&bpf->xmit);
+}
+
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+	[LWT_BPF_PROG_FD] = { .type = NLA_U32, },
+	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+				.len = MAX_PROG_NAME },
+};
+
+static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
+			  enum bpf_prog_type type)
+{
+	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+	struct bpf_prog *p;
+	int ret;
+	u32 fd;
+
+	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+		return -EINVAL;
+
+	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
+	if (!prog->name)
+		return -ENOMEM;
+
+	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+	p = bpf_prog_get_type(fd, type);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	prog->prog = p;
+
+	return 0;
+}
+
+static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
+	[LWT_BPF_IN]   = { .type = NLA_NESTED, },
+	[LWT_BPF_OUT]  = { .type = NLA_NESTED, },
+	[LWT_BPF_XMIT] = { .type = NLA_NESTED, },
+};
+
+static int bpf_build_state(struct net_device *dev, struct nlattr *nla,
+			   unsigned int family, const void *cfg,
+			   struct lwtunnel_state **ts)
+{
+	struct nlattr *tb[LWT_BPF_MAX + 1];
+	struct lwtunnel_state *newts;
+	struct bpf_lwt *bpf;
+	int ret;
+
+	if (family != AF_INET && family != AF_INET6)
+		return -EAFNOSUPPORT;
+
+	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
+		return -EINVAL;
+
+	newts = lwtunnel_state_alloc(sizeof(*bpf));
+	if (!newts)
+		return -ENOMEM;
+
+	newts->type = LWTUNNEL_ENCAP_BPF;
+	bpf = bpf_lwt_lwtunnel(newts);
+
+	if (tb[LWT_BPF_IN]) {
+		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
+				     BPF_PROG_TYPE_LWT_IN);
+		if (ret  < 0)
+			goto errout;
+	}
+
+	if (tb[LWT_BPF_OUT]) {
+		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
+				     BPF_PROG_TYPE_LWT_OUT);
+		if (ret < 0)
+			goto errout;
+	}
+
+	if (tb[LWT_BPF_XMIT]) {
+		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
+				     BPF_PROG_TYPE_LWT_XMIT);
+		if (ret < 0)
+			goto errout;
+	}
+
+	ret = dst_cache_init(&bpf->dst_cache, GFP_KERNEL);
+	if (ret)
+		goto errout;
+
+	bpf->family = family;
+	*ts = newts;
+
+	return 0;
+
+errout:
+	bpf_destroy_state(newts);
+	kfree(newts);
+	return ret;
+}
+
+static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
+			     struct bpf_lwt_prog *prog)
+{
+	struct nlattr *nest;
+
+	if (!prog->prog)
+		return 0;
+
+	nest = nla_nest_start(skb, attr);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (prog->name &&
+	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
+		return -EMSGSIZE;
+
+	return nla_nest_end(skb, nest);
+}
+
+static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
+{
+	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
+	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
+	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	int nest_len = nla_total_size(sizeof(struct nlattr)) +
+		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
+		       0;
+
+	return nest_len + /* LWT_BPF_IN */
+	       nest_len + /* LWT_BPF_OUT */
+	       nest_len + /* LWT_BPF_XMIT */
+	       0;
+}
+
+int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+{
+	/* FIXME:
+	 * The LWT state is currently rebuilt for delete requests which
+	 * results in a new bpf_prog instance. Comparing names for now.
+	 */
+	if (!a->name && !b->name)
+		return 0;
+
+	if (!a->name || !b->name)
+		return 1;
+
+	return strcmp(a->name, b->name);
+}
+
+static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
+	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
+
+	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
+	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
+	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
+}
+
+static const struct lwtunnel_encap_ops bpf_encap_ops = {
+	.build_state	= bpf_build_state,
+	.destroy_state	= bpf_destroy_state,
+	.input		= bpf_input,
+	.output		= bpf_output,
+	.xmit		= bpf_xmit,
+	.fill_encap	= bpf_fill_encap_info,
+	.get_encap_size = bpf_encap_nlsize,
+	.cmp_encap	= bpf_encap_cmp,
+};
+
+static int __init bpf_lwt_init(void)
+{
+	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
+}
+
+subsys_initcall(bpf_lwt_init)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 88fd642..554d901 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -39,6 +39,7 @@  static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
 		return "MPLS";
 	case LWTUNNEL_ENCAP_ILA:
 		return "ILA";
+	case LWTUNNEL_ENCAP_BPF:
 	case LWTUNNEL_ENCAP_IP6:
 	case LWTUNNEL_ENCAP_IP:
 	case LWTUNNEL_ENCAP_NONE: