diff mbox series

[bpf-next,RFC,2/3] flow_dissector: implements eBPF parser

Message ID 20180816164423.14368-3-peterpenkov96@gmail.com
State RFC, archived
Delegated to: BPF Maintainers
Headers show
Series Introduce eBPF flow dissector | expand

Commit Message

Petar Penkov Aug. 16, 2018, 4:44 p.m. UTC
From: Petar Penkov <ppenkov@google.com>

This eBPF program extracts basic/control/ip address/ports keys from
incoming packets. It supports recursive parsing for IP
encapsulation, MPLS, GUE, and VLAN, along with IPv4/IPv6 and extension
headers. This program is meant to show how flow dissection and key
extraction can be done in eBPF.

It is initially meant to be used for demonstration rather than as a
complete replacement of the existing flow dissector.

This includes parsing of GUE and MPLS payloads, which generally cannot
be done in production, as GUE tunnels and MPLS payloads cannot be
detected unambiguously.

In closed environments, however, it can be enabled. This is another
example where the programmability of BPF aids flow dissection.

Link: http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf
Signed-off-by: Petar Penkov <ppenkov@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 tools/testing/selftests/bpf/Makefile   |   2 +-
 tools/testing/selftests/bpf/bpf_flow.c | 542 +++++++++++++++++++++++++
 2 files changed, 543 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/bpf_flow.c

Comments

Tom Herbert Aug. 18, 2018, 3:50 p.m. UTC | #1
On Thu, Aug 16, 2018 at 9:44 AM, Petar Penkov <peterpenkov96@gmail.com> wrote:
> From: Petar Penkov <ppenkov@google.com>
>
> This eBPF program extracts basic/control/ip address/ports keys from
> incoming packets. It supports recursive parsing for IP
> encapsulation, MPLS, GUE, and VLAN, along with IPv4/IPv6 and extension
> headers. This program is meant to show how flow dissection and key
> extraction can be done in eBPF.
>
> It is initially meant to be used for demonstration rather than as a
> complete replacement of the existing flow dissector.
>
> This includes parsing of GUE and MPLS payload, which cannot be done
> in production in general, as GUE tunnels and MPLS payloads cannot
> unambiguously be detected in general.
>
> In closed environments, however, it can be enabled. Another example
> where the programmability of BPF aids flow dissection.
>
> Link: http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf
> Signed-off-by: Petar Penkov <ppenkov@google.com>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
>  tools/testing/selftests/bpf/Makefile   |   2 +-
>  tools/testing/selftests/bpf/bpf_flow.c | 542 +++++++++++++++++++++++++
>  2 files changed, 543 insertions(+), 1 deletion(-)
>  create mode 100644 tools/testing/selftests/bpf/bpf_flow.c
>
> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> index fff7fb1285fc..e65f50f9185e 100644
> --- a/tools/testing/selftests/bpf/Makefile
> +++ b/tools/testing/selftests/bpf/Makefile
> @@ -35,7 +35,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
>         test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
>         test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
>         get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
> -       test_skb_cgroup_id_kern.o
> +       test_skb_cgroup_id_kern.o bpf_flow.o
>
>  # Order correspond to 'make run_tests' order
>  TEST_PROGS := test_kmod.sh \
> diff --git a/tools/testing/selftests/bpf/bpf_flow.c b/tools/testing/selftests/bpf/bpf_flow.c
> new file mode 100644
> index 000000000000..9c11c644b713
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/bpf_flow.c
> @@ -0,0 +1,542 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <stddef.h>
> +#include <stdbool.h>
> +#include <string.h>
> +#include <linux/pkt_cls.h>
> +#include <linux/bpf.h>
> +#include <linux/in.h>
> +#include <linux/if_ether.h>
> +#include <linux/icmp.h>
> +#include <linux/ip.h>
> +#include <linux/ipv6.h>
> +#include <linux/tcp.h>
> +#include <linux/udp.h>
> +#include <linux/if_packet.h>
> +#include <sys/socket.h>
> +#include <linux/if_tunnel.h>
> +#include <linux/mpls.h>
> +#include "bpf_helpers.h"
> +#include "bpf_endian.h"
> +
> +int _version SEC("version") = 1;
> +#define PROG(F) SEC(#F) int bpf_func_##F
> +
> +/* These are the identifiers of the BPF programs that will be used in tail
> + * calls. Name is limited to 16 characters, with the terminating character and
> + * bpf_func_ above, we have only 6 to work with, anything after will be cropped.
> + *
> + * Each identifier is used as the index argument of bpf_tail_call() on
> + * jmp_table and as the argument of the PROG() macro that defines the program.
> + */
> +enum {
> +       IP,
> +       IPV6,
> +       IPV6OP, /* Destination/Hop-by-Hop Options IPv6 Extension header */
> +       IPV6FR, /* Fragmentation IPv6 Extension Header */
> +       MPLS,
> +       VLAN,
> +       GUE,
> +};
> +
> +#define IP_MF          0x2000
> +#define IP_OFFSET      0x1FFF
> +#define IP6_MF         0x0001
> +#define IP6_OFFSET     0xFFF8
> +/* VLAN tag as loaded by the VLAN program: h_vlan_TCI followed by the
> + * EtherType of the encapsulated payload, both big-endian (__be16).
> + */
> +struct vlan_hdr {
> +       __be16 h_vlan_TCI;
> +       __be16 h_vlan_encapsulated_proto;
> +};
> +/* First four bytes of a GRE header as loaded by parse_ip_proto(): the flags
> + * word (tested with GRE_VERSION and GRE_IS_CSUM/KEY/SEQ) and the payload
> + * protocol, which is compared against EtherType values.
> + */
> +struct gre_hdr {
> +       __be16 flags;
> +       __be16 proto;
> +};
> +
> +#define GUE_PORT 6080
> +/* Taken from include/net/gue.h. Move that to uapi, instead?
> + *
> + * First 32-bit word of a GUE header. The bitfield order of the first byte
> + * depends on the endianness macros; word aliases all four bytes. The GUE
> + * program in this file reads hlen (shifted left by 2 to get a byte count) and
> + * proto_ctype (passed on as an IPPROTO_* value); control, version and flags
> + * are not examined there.
> + */
> +struct guehdr {
> +       union {
> +               struct {
> +#if defined(__LITTLE_ENDIAN_BITFIELD)
> +                       __u8    hlen:5,
> +                               control:1,
> +                               version:2;
> +#elif defined (__BIG_ENDIAN_BITFIELD)
> +                       __u8    version:2,
> +                               control:1,
> +                               hlen:5;
> +#else
> +#error  "Please fix <asm/byteorder.h>"
> +#endif
> +                       __u8    proto_ctype;
> +                       __be16  flags;
> +               };
> +               __be32  word;
> +       };
> +};
> +/* Key identifiers passed as the last argument of
> + * bpf_flow_dissector_write_keys(). The programs in this file only write
> + * CONTROL, BASIC, IPV4_ADDRS, IPV6_ADDRS and PORTS.
> + * NOTE(review): presumably a copy of the kernel-internal enum; the numeric
> + * values must stay in sync with it - confirm against the kernel header.
> + */
> +enum flow_dissector_key_id {
> +       FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
> +       FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
> +       FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
> +       FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
> +       FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
> +       FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */
> +       FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
> +       FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */
> +       FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */
> +       FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */
> +       FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */
> +       FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
> +       FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */
> +       FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */
> +       FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
> +       FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
> +       FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */
> +       FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */
> +       FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */
> +       FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */
> +       FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */
> +       FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_flow_vlan */
> +
> +       FLOW_DISSECTOR_KEY_MAX,
> +};
> +/* Payload written for FLOW_DISSECTOR_KEY_CONTROL. In this file thoff is
> + * filled from cb->nhoff, addr_type with FLOW_DISSECTOR_KEY_IPV4_ADDRS or
> + * FLOW_DISSECTOR_KEY_IPV6_ADDRS, and flags with the FLOW_DIS_* bits in
> + * cb->flags.
> + */
> +struct flow_dissector_key_control {
> +       __u16   thoff;
> +       __u16   addr_type;
> +       __u32   flags;
> +};
> +
> +#define FLOW_DIS_IS_FRAGMENT   (1 << 0)
> +#define FLOW_DIS_FIRST_FRAG    (1 << 1)
> +#define FLOW_DIS_ENCAPSULATION (1 << 2)
> +/* Payload written for FLOW_DISSECTOR_KEY_BASIC: the EtherType (big-endian)
> + * and the IP protocol number. Writers memset() the struct first, so padding
> + * is zero.
> + */
> +struct flow_dissector_key_basic {
> +       __be16  n_proto;
> +       __u8    ip_proto;
> +       __u8    padding;
> +};
> +/* Payload written for FLOW_DISSECTOR_KEY_IPV4_ADDRS; copied unchanged from
> + * iphdr.saddr and iphdr.daddr by the IP program.
> + */
> +struct flow_dissector_key_ipv4_addrs {
> +       __be32 src;
> +       __be32 dst;
> +};
> +/* Payload written for FLOW_DISSECTOR_KEY_IPV6_ADDRS; copied unchanged from
> + * ipv6hdr.saddr and ipv6hdr.daddr by the IPV6 program.
> + */
> +struct flow_dissector_key_ipv6_addrs {
> +       struct in6_addr src;
> +       struct in6_addr dst;
> +};
> +/* Scratch storage shared by the IP and IPV6 programs. Only the member that
> + * matches the address family is filled in, and only sizeof() of that member
> + * is passed to bpf_flow_dissector_write_keys().
> + */
> +struct flow_dissector_key_addrs {
> +       union {
> +               struct flow_dissector_key_ipv4_addrs v4addrs;
> +               struct flow_dissector_key_ipv6_addrs v6addrs;
> +       };
> +};
> +/* Payload written for FLOW_DISSECTOR_KEY_PORTS. The ports member aliases
> + * the src/dst pair; write_ports() fills the whole struct with one
> + * sizeof(ports)-byte load from cb->nhoff and then inspects dst.
> + */
> +struct flow_dissector_key_ports {
> +       union {
> +               __be32 ports;
> +               struct {
> +                       __be16 src;
> +                       __be16 dst;
> +               };
> +       };
> +};
> +/* Program array used as the target of every bpf_tail_call() in this file.
> + * It is indexed with the tail-call identifiers (IP ... GUE); max_entries is
> + * 8 while seven identifiers are defined.
> + * NOTE(review): nothing in this file fills the map - presumably the loader
> + * installs the PROG() sections at their enum index; confirm.
> + */
> +struct bpf_map_def SEC("maps") jmp_table = {
> +       .type = BPF_MAP_TYPE_PROG_ARRAY,
> +       .key_size = sizeof(__u32),
> +       .value_size = sizeof(__u32),
> +       .max_entries = 8
> +};
> +/* Parser state overlaid on skb->cb so it survives tail calls: nhoff is the
> + * offset of the next header to parse and flags accumulates FLOW_DIS_* bits
> + * that are later copied into the control key.
> + */
> +struct bpf_dissect_cb {
> +       __u16 nhoff;
> +       __u16 flags;
> +};
> +
> +/* Dispatches on ETHERTYPE.
> + *
> + * proto is compared against bpf_htons() constants, so it is expected in
> + * network byte order. Supported values tail-call the IP, IPV6, MPLS or VLAN
> + * program through jmp_table. Returns BPF_DROP for any other value, and also
> + * when a bpf_tail_call() returns instead of transferring control.
> + */
> +static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto)
> +{
> +       switch (proto) {
> +       case bpf_htons(ETH_P_IP):
> +               bpf_tail_call(skb, &jmp_table, IP);
> +               break;
> +       case bpf_htons(ETH_P_IPV6):
> +               bpf_tail_call(skb, &jmp_table, IPV6);
> +               break;
> +       case bpf_htons(ETH_P_MPLS_MC):
> +       case bpf_htons(ETH_P_MPLS_UC):
> +               bpf_tail_call(skb, &jmp_table, MPLS);
> +               break;
> +       case bpf_htons(ETH_P_8021Q):
> +       case bpf_htons(ETH_P_8021AD):
> +               bpf_tail_call(skb, &jmp_table, VLAN);
> +               break;
> +       default:
> +               /* Protocol not supported */
> +               return BPF_DROP;
> +       }
> +
> +       return BPF_DROP;
> +}
> +
> +static __always_inline int write_ports(struct __sk_buff *skb, __u8 proto)
> +{
> +       struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
> +       struct flow_dissector_key_ports ports;
> +
> +       /* The supported protocols always start with the ports */
> +       if (bpf_skb_load_bytes(skb, cb->nhoff, &ports, sizeof(ports)))
> +               return BPF_DROP;
> +
> +       if (proto == IPPROTO_UDP && ports.dst == bpf_htons(GUE_PORT)) {
> +               /* GUE encapsulation */
> +               cb->nhoff += sizeof(struct udphdr);
> +               bpf_tail_call(skb, &jmp_table, GUE);
> +               return BPF_DROP;

It's a nice sentiment to support GUE, but this really isn't the right
way to do it. What would be much better is a means to generically
support all the various UDP encapsulations like GUE, VXLAN, Geneve,
GRE/UDP, MPLS/UDP, etc. I think there are two ways to do that:

1) A UDP socket lookup that returns an encapsulation socket containing
a flow dissector function that can be called. This is the safest
method because of the UDP reserved-port-numbers problem. I have
implemented this in the kernel flow dissector, though it is not upstreamed.
2) Create a lookup table based on destination port that returns the
flow dissector function to call. This doesn't have the socket lookup
so it isn't quite as robust as the socket lookup. But, at least it's a
generic interface and programmable so it might be appropriate in the
BPF flow dissector case.

Tom

> +       }
> +
> +       if (bpf_flow_dissector_write_keys(skb, &ports, sizeof(ports),
> +                                         FLOW_DISSECTOR_KEY_PORTS))
> +               return BPF_DROP;
> +
> +       return BPF_OK;
> +}
> +/* Entry point of the dissector (section dissect). Picks the EtherType to
> + * dispatch on: skb->protocol when skb->vlan_present is zero, otherwise
> + * skb->vlan_proto. The result of parse_eth_proto() is returned as is.
> + */
> +SEC("dissect")
> +int dissect(struct __sk_buff *skb)
> +{
> +       if (!skb->vlan_present)
> +               return parse_eth_proto(skb, skb->protocol);
> +       else
> +               return parse_eth_proto(skb, skb->vlan_proto);
> +}
> +
> +/* Parses on IPPROTO_*.
> + *
> + * Handles the header found at cb->nhoff for the given IP protocol number:
> + *  - ICMP: only checks that an icmphdr fits, then returns BPF_OK.
> + *  - IPIP / IPV6: marks the flow as encapsulated and tail-calls IP / IPV6.
> + *  - GRE: steps over the base header and the optional checksum, key and
> + *    sequence words, marks the flow as encapsulated and dispatches on the
> + *    inner EtherType (after an inner Ethernet header for ETH_P_TEB).
> + *  - TCP / UDP / UDPLITE: length checks, then write_ports().
> + * Any other protocol, a failed load or a returning tail call yields BPF_DROP.
> + */
> +static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
> +{
> +       struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
> +       __u8 *data_end = (__u8 *)(long)skb->data_end;
> +       __u8 *data = (__u8 *)(long)skb->data;
> +       __u32 data_len = data_end - data;
> +       struct gre_hdr gre;
> +       struct ethhdr eth;
> +       struct tcphdr tcp;
> +
> +       switch (proto) {
> +       case IPPROTO_ICMP:
> +               if (cb->nhoff + sizeof(struct icmphdr) > data_len)
> +                       return BPF_DROP;
> +               return BPF_OK;
> +       case IPPROTO_IPIP:
> +               cb->flags |= FLOW_DIS_ENCAPSULATION;
> +               bpf_tail_call(skb, &jmp_table, IP);
> +               break;
> +       case IPPROTO_IPV6:
> +               cb->flags |= FLOW_DIS_ENCAPSULATION;
> +               bpf_tail_call(skb, &jmp_table, IPV6);
> +               break;
> +       case IPPROTO_GRE:
> +               if (bpf_skb_load_bytes(skb, cb->nhoff, &gre, sizeof(gre)))
> +                       return BPF_DROP;
> +
> +               if (bpf_htons(gre.flags & GRE_VERSION))
> +                       /* Only inspect standard GRE packets with version 0.
> +                        * NOTE(review): the bpf_htons() can only reorder bytes,
> +                        * so it does not change whether the masked value is
> +                        * zero - confirm it is intentional.
> +                        */
> +                       return BPF_OK;
> +
> +               cb->nhoff += sizeof(gre); /* Step over GRE Flags and Protocol */
> +               if (GRE_IS_CSUM(gre.flags))
> +                       cb->nhoff += 4; /* Step over chksum and Padding */
> +               if (GRE_IS_KEY(gre.flags))
> +                       cb->nhoff += 4; /* Step over key */
> +               if (GRE_IS_SEQ(gre.flags))
> +                       cb->nhoff += 4; /* Step over sequence number */
> +
> +               cb->flags |= FLOW_DIS_ENCAPSULATION;
> +
> +               if (gre.proto == bpf_htons(ETH_P_TEB)) {
> +                       if (bpf_skb_load_bytes(skb, cb->nhoff, &eth,
> +                                              sizeof(eth)))
> +                               return BPF_DROP;
> +
> +                       cb->nhoff += sizeof(eth);
> +
> +                       return parse_eth_proto(skb, eth.h_proto);
> +               } else {
> +                       return parse_eth_proto(skb, gre.proto);
> +               }
> +
> +       case IPPROTO_TCP:
> +               if (cb->nhoff + sizeof(struct tcphdr) > data_len)
> +                       return BPF_DROP;
> +
> +               if (bpf_skb_load_bytes(skb, cb->nhoff, &tcp, sizeof(tcp)))
> +                       return BPF_DROP;
> +
> +               if (tcp.doff < 5) /* data offset below the 20-byte minimum */
> +                       return BPF_DROP;
> +
> +               if (cb->nhoff + (tcp.doff << 2) > data_len)
> +                       return BPF_DROP;
> +
> +               return write_ports(skb, proto);
> +       case IPPROTO_UDP:
> +       case IPPROTO_UDPLITE:
> +               if (cb->nhoff + sizeof(struct udphdr) > data_len)
> +                       return BPF_DROP;
> +
> +               return write_ports(skb, proto);
> +       default:
> +               return BPF_DROP;
> +       }
> +
> +       return BPF_DROP;
> +}
> +/* Dispatches on an IPv6 next-header value.
> + *
> + * Hop-by-hop and destination options tail-call IPV6OP, a fragment header
> + * tail-calls IPV6FR. Any other value is treated as the upper-layer protocol:
> + * the control key (thoff = cb->nhoff, IPv6 address type, cb->flags) and the
> + * basic key are written and parsing continues in parse_ip_proto().
> + * Returns BPF_DROP if a key write fails or a tail call returns.
> + */
> +static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
> +{
> +       struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
> +       struct flow_dissector_key_control control;
> +       struct flow_dissector_key_basic basic;
> +
> +       switch (nexthdr) {
> +       case IPPROTO_HOPOPTS:
> +       case IPPROTO_DSTOPTS:
> +               bpf_tail_call(skb, &jmp_table, IPV6OP);
> +               break;
> +       case IPPROTO_FRAGMENT:
> +               bpf_tail_call(skb, &jmp_table, IPV6FR);
> +               break;
> +       default:
> +               control.thoff = cb->nhoff;
> +               control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
> +               control.flags = cb->flags;
> +               if (bpf_flow_dissector_write_keys(skb, &control,
> +                                                 sizeof(control),
> +                                                 FLOW_DISSECTOR_KEY_CONTROL))
> +                       return BPF_DROP;
> +
> +               memset(&basic, 0, sizeof(basic));
> +               basic.n_proto = bpf_htons(ETH_P_IPV6);
> +               basic.ip_proto = nexthdr;
> +               if (bpf_flow_dissector_write_keys(skb, &basic, sizeof(basic),
> +                                             FLOW_DISSECTOR_KEY_BASIC))
> +                       return BPF_DROP;
> +
> +               return parse_ip_proto(skb, nexthdr);
> +       }
> +
> +       return BPF_DROP;
> +}
> +/* Parses the IPv4 header at cb->nhoff.
> + *
> + * Rejects headers with ihl < 5, writes the IPv4 address key, advances
> + * cb->nhoff by ihl * 4, records fragment state in cb->flags, then writes the
> + * control and basic keys. Non-first fragments stop here with BPF_OK; all
> + * other packets continue in parse_ip_proto() with iph.protocol.
> + */
> +PROG(IP)(struct __sk_buff *skb)
> +{
> +       struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
> +       __u8 *data_end = (__u8 *)(long)skb->data_end;
> +       struct flow_dissector_key_control control;
> +       struct flow_dissector_key_addrs addrs;
> +       struct flow_dissector_key_basic basic;
> +       __u8 *data = (__u8 *)(long)skb->data;
> +       __u32 data_len = data_end - data;
> +       bool done = false;
> +       struct iphdr iph;
> +
> +       if (bpf_skb_load_bytes(skb, cb->nhoff, &iph, sizeof(iph)))
> +               return BPF_DROP;
> +
> +       /* IP header cannot be smaller than 20 bytes */
> +       if (iph.ihl < 5)
> +               return BPF_DROP;
> +
> +       addrs.v4addrs.src = iph.saddr;
> +       addrs.v4addrs.dst = iph.daddr;
> +       if (bpf_flow_dissector_write_keys(skb, &addrs, sizeof(addrs.v4addrs),
> +                                     FLOW_DISSECTOR_KEY_IPV4_ADDRS))
> +               return BPF_DROP;
> +
> +       cb->nhoff += iph.ihl << 2; /* ihl counts 32-bit words */
> +       if (cb->nhoff > data_len)
> +               return BPF_DROP;
> +
> +       if (iph.frag_off & bpf_htons(IP_MF | IP_OFFSET)) {
> +               cb->flags |= FLOW_DIS_IS_FRAGMENT;
> +               if (iph.frag_off & bpf_htons(IP_OFFSET))
> +                       /* From second fragment on, packets do not have headers
> +                        * we can parse.
> +                        */
> +                       done = true;
> +               else
> +                       cb->flags |= FLOW_DIS_FIRST_FRAG;
> +       }
> +
> +
> +       control.thoff = cb->nhoff;
> +       control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
> +       control.flags = cb->flags;
> +       if (bpf_flow_dissector_write_keys(skb, &control, sizeof(control),
> +                                         FLOW_DISSECTOR_KEY_CONTROL))
> +               return BPF_DROP;
> +
> +       memset(&basic, 0, sizeof(basic));
> +       basic.n_proto = bpf_htons(ETH_P_IP);
> +       basic.ip_proto = iph.protocol;
> +       if (bpf_flow_dissector_write_keys(skb, &basic, sizeof(basic),
> +                                     FLOW_DISSECTOR_KEY_BASIC))
> +               return BPF_DROP;
> +
> +       if (done)
> +               return BPF_OK;
> +
> +       return parse_ip_proto(skb, iph.protocol);
> +}
> +/* Parses the fixed IPv6 header at cb->nhoff: writes the IPv6 address key,
> + * steps over sizeof(struct ipv6hdr) and continues in parse_ipv6_proto() with
> + * the nexthdr field. Returns BPF_DROP if the load or the key write fails.
> + */
> +PROG(IPV6)(struct __sk_buff *skb)
> +{
> +       struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
> +       struct flow_dissector_key_addrs addrs;
> +       struct ipv6hdr ip6h;
> +
> +       if (bpf_skb_load_bytes(skb, cb->nhoff, &ip6h, sizeof(ip6h)))
> +               return BPF_DROP;
> +
> +       addrs.v6addrs.src = ip6h.saddr;
> +       addrs.v6addrs.dst = ip6h.daddr;
> +       if (bpf_flow_dissector_write_keys(skb, &addrs, sizeof(addrs.v6addrs),
> +                                     FLOW_DISSECTOR_KEY_IPV6_ADDRS))
> +               return BPF_DROP;
> +
> +       cb->nhoff += sizeof(struct ipv6hdr);
> +
> +       return parse_ipv6_proto(skb, ip6h.nexthdr);
> +}
> +/* Steps over one IPv6 hop-by-hop or destination options header. The first
> + * byte at cb->nhoff is read as the next-header value and the second byte as
> + * the header length; parsing then continues in parse_ipv6_proto().
> + */
> +PROG(IPV6OP)(struct __sk_buff *skb)
> +{
> +       struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
> +       __u8 proto;
> +       __u8 hlen;
> +
> +       if (bpf_skb_load_bytes(skb, cb->nhoff, &proto, sizeof(proto)))
> +               return BPF_DROP;
> +
> +       if (bpf_skb_load_bytes(skb, cb->nhoff + sizeof(proto), &hlen,
> +                              sizeof(hlen)))
> +               return BPF_DROP;
> +       /* hlen is in 8-octet units and does not include the first 8 bytes
> +        * of the header
> +        */
> +       cb->nhoff += (1 + hlen) << 3;
> +
> +       return parse_ipv6_proto(skb, proto);
> +}
> +/* Steps over the 8-byte IPv6 fragment header at cb->nhoff. The flow is
> + * always flagged FLOW_DIS_IS_FRAGMENT, and FLOW_DIS_FIRST_FRAG as well when
> + * the offset bits of frag_off are zero.
> + * NOTE(review): unlike the IPv4 program, parsing continues into the next
> + * header even for non-first fragments - confirm this is intended.
> + */
> +PROG(IPV6FR)(struct __sk_buff *skb)
> +{
> +       struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
> +       __be16 frag_off;
> +       __u8 proto;
> +
> +       /* Byte 0 of the fragment header is the next-header value */
> +       if (bpf_skb_load_bytes(skb, cb->nhoff, &proto, sizeof(proto)))
> +               return BPF_DROP;
> +
> +       /* Bytes 2-3 hold the fragment offset and flag bits */
> +       if (bpf_skb_load_bytes(skb, cb->nhoff + 2, &frag_off, sizeof(frag_off)))
> +               return BPF_DROP;
> +
> +       cb->nhoff += 8;
> +       cb->flags |= FLOW_DIS_IS_FRAGMENT;
> +       if (!(frag_off & bpf_htons(IP6_OFFSET)))
> +               cb->flags |= FLOW_DIS_FIRST_FRAG;
> +
> +       return parse_ipv6_proto(skb, proto);
> +}
> +
> +/* Walks an MPLS label stack, one entry per invocation.
> + *
> + * Entries without the bottom-of-stack bit tail-call back into this program.
> + * After the last entry the payload is assumed to be IP and is dispatched on
> + * the version nibble of its first byte. Returns BPF_DROP if a load fails,
> + * the payload is neither IPv4 nor IPv6, or a tail call returns.
> + */
> +PROG(MPLS)(struct __sk_buff *skb)
> +{
> +       struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
> +       struct mpls_label mpls;
> +
> +       if (bpf_skb_load_bytes(skb, cb->nhoff, &mpls, sizeof(mpls)))
> +               return BPF_DROP;
> +
> +       cb->nhoff += sizeof(mpls);
> +
> +       /* The entry is in network byte order while MPLS_LS_S_MASK is a
> +        * host-order mask, so convert before testing the bit.
> +        */
> +       if (bpf_ntohl(mpls.entry) & MPLS_LS_S_MASK) {
> +               /* This is the last MPLS header. The network layer packet always
> +                * follows the MPLS header. Peek forward and dispatch based on
> +                * that.
> +                */
> +               __u8 version;
> +
> +               if (bpf_skb_load_bytes(skb, cb->nhoff, &version,
> +                                      sizeof(version)))
> +                       return BPF_DROP;
> +
> +               /* IP version is the high nibble of the first header byte;
> +                * shift it down so that it compares equal to 4 or 6.
> +                */
> +               switch (version >> 4) {
> +               case 4:
> +                       bpf_tail_call(skb, &jmp_table, IP);
> +                       break;
> +               case 6:
> +                       bpf_tail_call(skb, &jmp_table, IPV6);
> +                       break;
> +               default:
> +                       return BPF_DROP;
> +               }
> +       } else {
> +               bpf_tail_call(skb, &jmp_table, MPLS);
> +       }
> +
> +       return BPF_DROP;
> +}
> +/* Steps over one VLAN tag, or an 802.1AD + 802.1Q pair, at cb->nhoff and
> + * dispatches on the encapsulated EtherType. Returns BPF_DROP if a load
> + * fails, if an 802.1AD tag is not followed by 802.1Q, or if yet another VLAN
> + * EtherType follows the tag(s).
> + */
> +PROG(VLAN)(struct __sk_buff *skb)
> +{
> +       struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
> +       struct vlan_hdr vlan;
> +       __be16 proto;
> +
> +       /* Peek back to see if single or double-tagging.
> +        * NOTE(review): this reads the two bytes just before cb->nhoff; what
> +        * they contain when dissect() dispatched on skb->vlan_proto is not
> +        * visible in this file - confirm.
> +        */
> +       if (bpf_skb_load_bytes(skb, cb->nhoff - sizeof(proto), &proto,
> +                              sizeof(proto)))
> +               return BPF_DROP;
> +
> +       /* Account for double-tagging */
> +       if (proto == bpf_htons(ETH_P_8021AD)) {
> +               if (bpf_skb_load_bytes(skb, cb->nhoff, &vlan, sizeof(vlan)))
> +                       return BPF_DROP;
> +
> +               if (vlan.h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q))
> +                       return BPF_DROP;
> +
> +               cb->nhoff += sizeof(vlan);
> +       }
> +
> +       if (bpf_skb_load_bytes(skb, cb->nhoff, &vlan, sizeof(vlan)))
> +               return BPF_DROP;
> +
> +       cb->nhoff += sizeof(vlan);
> +       /* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/
> +       if (vlan.h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) ||
> +           vlan.h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q))
> +               return BPF_DROP;
> +
> +       return parse_eth_proto(skb, vlan.h_vlan_encapsulated_proto);
> +}
> +/* Steps over a GUE header at cb->nhoff: the fixed 4-byte word plus hlen
> + * 32-bit words of optional data. The flow is marked as encapsulated and
> + * parsing continues in parse_ip_proto() with proto_ctype as the protocol.
> + * NOTE(review): gue.version and gue.control are not examined, so control
> + * messages are treated like data - confirm this is acceptable.
> + */
> +PROG(GUE)(struct __sk_buff *skb)
> +{
> +       struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
> +       struct guehdr gue;
> +
> +       if (bpf_skb_load_bytes(skb, cb->nhoff, &gue, sizeof(gue)))
> +               return BPF_DROP;
> +
> +       cb->nhoff += sizeof(gue);
> +       cb->nhoff += gue.hlen << 2;
> +
> +       cb->flags |= FLOW_DIS_ENCAPSULATION;
> +       return parse_ip_proto(skb, gue.proto_ctype);
> +}
> +
> +char __license[] SEC("license") = "GPL";
> --
> 2.18.0.865.gffc8e1a3cd6-goog
>
Willem de Bruijn Aug. 18, 2018, 7:49 p.m. UTC | #2
On Sat, Aug 18, 2018 at 11:56 AM Tom Herbert <tom@herbertland.com> wrote:
>
> On Thu, Aug 16, 2018 at 9:44 AM, Petar Penkov <peterpenkov96@gmail.com> wrote:
> > From: Petar Penkov <ppenkov@google.com>
> >
> > This eBPF program extracts basic/control/ip address/ports keys from
> > incoming packets. It supports recursive parsing for IP
> > encapsulation, MPLS, GUE, and VLAN, along with IPv4/IPv6 and extension
> > headers. This program is meant to show how flow dissection and key
> > extraction can be done in eBPF.
> >
> > It is initially meant to be used for demonstration rather than as a
> > complete replacement of the existing flow dissector.
> >
> > This includes parsing of GUE and MPLS payload, which cannot be done
> > in production in general, as GUE tunnels and MPLS payloads cannot
> > unambiguously be detected in general.
> >
> > In closed environments, however, it can be enabled. Another example
> > where the programmability of BPF aids flow dissection.

> > +static __always_inline int write_ports(struct __sk_buff *skb, __u8 proto)
> > +{
> > +       struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
> > +       struct flow_dissector_key_ports ports;
> > +
> > +       /* The supported protocols always start with the ports */
> > +       if (bpf_skb_load_bytes(skb, cb->nhoff, &ports, sizeof(ports)))
> > +               return BPF_DROP;
> > +
> > +       if (proto == IPPROTO_UDP && ports.dst == bpf_htons(GUE_PORT)) {
> > +               /* GUE encapsulation */
> > +               cb->nhoff += sizeof(struct udphdr);
> > +               bpf_tail_call(skb, &jmp_table, GUE);
> > +               return BPF_DROP;
>
> It's a nice sentiment to support GUE, but this really isn't the right
> way to do it.

Yes, this was just for demonstration purposes. The same for
unconditionally parsing MPLS payload as IP.

Though note the point in the commit message that within a closed
network with fixed reserved GUE ports, a custom BPF program
like this could be sufficient. That's true not only for UDP tunnels.

> What would be much better is a means to generically
> support all the various UDP encapsulations like GUE, VXLAN, Geneve,
> GRE/UDP, MPLS/UDP, etc. I think there's two ways to do that:
>
> 1) A UDP socket lookup that returns an encapsulation socket containing
> a flow dissector function that can be called. This is the safest
> method because of the UDP are reserved numbers problem. I implement
> this in kernel flow dissector, not upstreamed though.

Yes, similar to udp_gro_receive. Socket lookup is not free, however,
and this is a relatively rarely used feature.

I want to move the one in udp_gro_receive behind a static key.
udp_encap_needed_key is the likely target. Then the same can
eventually be done for flow dissection inside UDP tunnels.

> 2) Create a lookup table based on destination port that returns the
> flow dissector function to call. This doesn't have the socket lookup
> so it isn't quite as robust as the socket lookup. But, at least it's a
> generic interface and programmable so it might be appropriate in the
> BPF flow dissector case.

Option 1 sounds preferable to me.
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index fff7fb1285fc..e65f50f9185e 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -35,7 +35,7 @@  TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
 	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
 	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
-	test_skb_cgroup_id_kern.o
+	test_skb_cgroup_id_kern.o bpf_flow.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/bpf_flow.c b/tools/testing/selftests/bpf/bpf_flow.c
new file mode 100644
index 000000000000..9c11c644b713
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_flow.c
@@ -0,0 +1,542 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_packet.h>
+#include <sys/socket.h>
+#include <linux/if_tunnel.h>
+#include <linux/mpls.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+/* _version is tagged SEC("version"). PROG(F) opens the definition of a
+ * function named bpf_func_F tagged SEC("F"); every tail-call target in this
+ * file is declared through it.
+ * NOTE(review): SEC() comes from bpf_helpers.h and presumably places the
+ * symbol in the named ELF section -- confirm there.
+ */
+int _version SEC("version") = 1;
+#define PROG(F) SEC(#F) int bpf_func_##F
+
+/* Indices into jmp_table, one per parser program reached through
+ * bpf_tail_call(). The names double as the PROG() section names. A name is
+ * limited to 16 characters; after the bpf_func_ prefix added by PROG() and
+ * the terminating character only 6 remain, and anything longer is cropped.
+ */
+enum {
+	IP,
+	IPV6,
+	IPV6OP,	/* Destination/Hop-by-Hop Options IPv6 Extension header */
+	IPV6FR,	/* Fragmentation IPv6 Extension Header */
+	MPLS,
+	VLAN,
+	GUE,
+};
+
+#define IP_MF		0x2000	/* IPv4 frag_off: more-fragments flag (host order) */
+#define IP_OFFSET	0x1FFF	/* IPv4 frag_off: fragment offset bits (host order) */
+#define IP6_MF		0x0001	/* IPv6 fragment header flag; not referenced in this file */
+#define IP6_OFFSET	0xFFF8	/* IPv6 fragment header: fragment offset bits (host order) */
+
+struct vlan_hdr {	/* one VLAN tag as loaded from the packet */
+	__be16 h_vlan_TCI;	/* tag control information; not examined here */
+	__be16 h_vlan_encapsulated_proto;	/* EtherType following the tag */
+};
+
+struct gre_hdr {	/* fixed leading part of a GRE header */
+	__be16 flags;	/* tested with GRE_VERSION / GRE_IS_* in parse_ip_proto() */
+	__be16 proto;	/* EtherType of the encapsulated payload */
+};
+
+#define GUE_PORT 6080	/* UDP destination port that write_ports() treats as GUE */
+/* Taken from include/net/gue.h. Move that to uapi, instead?
+ *
+ * Base GUE header. The GUE parser in this file uses hlen as a count of
+ * 32-bit words to skip after this 4-byte header and hands proto_ctype to
+ * parse_ip_proto() as the IP protocol of the payload; version, control and
+ * flags are not examined here.
+ */
+struct guehdr {
+	union {
+		struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+			__u8	hlen:5,
+				control:1,
+				version:2;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+			__u8	version:2,
+				control:1,
+				hlen:5;
+#else
+#error  "Please fix <asm/byteorder.h>"
+#endif
+			__u8	proto_ctype;
+			__be16	flags;
+		};
+		__be32	word;
+	};
+};
+/* Key identifiers passed as the last argument of
+ * bpf_flow_dissector_write_keys(). The programs in this file only write
+ * CONTROL, BASIC, IPV4_ADDRS, IPV6_ADDRS and PORTS.
+ * NOTE(review): presumably a copy of the kernel's enum of the same name;
+ * confirm the two stay in sync.
+ */
+enum flow_dissector_key_id {
+	FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
+	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
+	FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
+	FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
+	FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
+	FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */
+	FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
+	FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */
+	FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */
+	FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */
+	FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */
+	FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
+	FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */
+	FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */
+	FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
+	FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
+	FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */
+	FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */
+	FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */
+	FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */
+	FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */
+	FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_flow_vlan */
+
+	FLOW_DISSECTOR_KEY_MAX,
+};
+
+struct flow_dissector_key_control {	/* value for FLOW_DISSECTOR_KEY_CONTROL */
+	__u16	thoff;	/* filled from cb->nhoff when the key is written */
+	__u16	addr_type;	/* FLOW_DISSECTOR_KEY_IPV4_ADDRS or _IPV6_ADDRS */
+	__u32	flags;	/* FLOW_DIS_* bits copied from cb->flags */
+};
+
+#define FLOW_DIS_IS_FRAGMENT	(1 << 0)	/* set for any IPv4/IPv6 fragment */
+#define FLOW_DIS_FIRST_FRAG	(1 << 1)	/* set when the fragment offset is zero */
+#define FLOW_DIS_ENCAPSULATION	(1 << 2)	/* set on IPIP, IPv6-in-IP, GRE and GUE */
+
+struct flow_dissector_key_basic {	/* value for FLOW_DISSECTOR_KEY_BASIC */
+	__be16	n_proto;	/* EtherType of the network layer, network order */
+	__u8	ip_proto;	/* IPv4 protocol / final IPv6 next-header value */
+	__u8	padding;	/* left zero by the memset() in the writers */
+};
+
+struct flow_dissector_key_ipv4_addrs {	/* value for FLOW_DISSECTOR_KEY_IPV4_ADDRS */
+	__be32 src;	/* copied from iphdr.saddr */
+	__be32 dst;	/* copied from iphdr.daddr */
+};
+
+struct flow_dissector_key_ipv6_addrs {	/* value for FLOW_DISSECTOR_KEY_IPV6_ADDRS */
+	struct in6_addr src;	/* copied from ipv6hdr.saddr */
+	struct in6_addr dst;	/* copied from ipv6hdr.daddr */
+};
+
+struct flow_dissector_key_addrs {	/* scratch storage for either address family */
+	union {
+		struct flow_dissector_key_ipv4_addrs v4addrs;	/* used by the IP parser */
+		struct flow_dissector_key_ipv6_addrs v6addrs;	/* used by the IPV6 parser */
+	};
+};
+
+struct flow_dissector_key_ports {	/* value for FLOW_DISSECTOR_KEY_PORTS */
+	union {
+		__be32 ports;	/* both ports viewed as one 32-bit value */
+		struct {
+			__be16 src;	/* first two bytes of the transport header */
+			__be16 dst;	/* next two bytes; compared against GUE_PORT */
+		};
+	};
+};
+
+struct bpf_map_def SEC("maps") jmp_table = {
+	.type = BPF_MAP_TYPE_PROG_ARRAY,	/* target table for bpf_tail_call() */
+	.key_size = sizeof(__u32),	/* keys are the IP..GUE enum values */
+	.value_size = sizeof(__u32),
+	.max_entries = 8	/* the enum above defines 7 indices */
+};
+
+struct bpf_dissect_cb {	/* parser state; every parser casts skb->cb to this */
+	__u16 nhoff;	/* offset of the header to be parsed next */
+	__u16 flags;	/* FLOW_DIS_* bits accumulated while parsing */
+};
+
+/* Hand the packet to the parser program registered for EtherType @proto
+ * (network byte order): IP, IPV6, MPLS (unicast and multicast) or VLAN
+ * (802.1Q and 802.1ad). A successful bpf_tail_call() does not come back, so
+ * reaching the final return means either the protocol is not supported or
+ * the tail call did not take place; both end in BPF_DROP.
+ */
+static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto)
+{
+	if (proto == bpf_htons(ETH_P_IP))
+		bpf_tail_call(skb, &jmp_table, IP);
+	else if (proto == bpf_htons(ETH_P_IPV6))
+		bpf_tail_call(skb, &jmp_table, IPV6);
+	else if (proto == bpf_htons(ETH_P_MPLS_MC) ||
+		 proto == bpf_htons(ETH_P_MPLS_UC))
+		bpf_tail_call(skb, &jmp_table, MPLS);
+	else if (proto == bpf_htons(ETH_P_8021Q) ||
+		 proto == bpf_htons(ETH_P_8021AD))
+		bpf_tail_call(skb, &jmp_table, VLAN);
+
+	/* Protocol not supported, or the tail call fell through */
+	return BPF_DROP;
+}
+
+/* Read the four bytes at cb->nhoff as the source/destination port pair of
+ * transport protocol @proto. UDP addressed to GUE_PORT is passed on to the
+ * GUE parser and its ports are not recorded; in every other case the pair
+ * is stored under FLOW_DISSECTOR_KEY_PORTS.
+ *
+ * Returns BPF_OK once the key has been written, BPF_DROP if a helper fails
+ * or the GUE tail call falls through.
+ */
+static __always_inline int write_ports(struct __sk_buff *skb, __u8 proto)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	struct flow_dissector_key_ports key;
+	bool is_gue;
+
+	/* Every protocol routed here begins with the two port fields */
+	if (bpf_skb_load_bytes(skb, cb->nhoff, &key, sizeof(key)) != 0)
+		return BPF_DROP;
+
+	is_gue = proto == IPPROTO_UDP && key.dst == bpf_htons(GUE_PORT);
+	if (!is_gue) {
+		if (bpf_flow_dissector_write_keys(skb, &key, sizeof(key),
+						  FLOW_DISSECTOR_KEY_PORTS) != 0)
+			return BPF_DROP;
+
+		return BPF_OK;
+	}
+
+	/* Step over the outer UDP header; the GUE header follows it */
+	cb->nhoff += sizeof(struct udphdr);
+	bpf_tail_call(skb, &jmp_table, GUE);
+
+	return BPF_DROP;
+}
+
+/* Entry point of the dissector. Dispatches on skb->vlan_proto when the skb
+ * reports a VLAN tag through skb->vlan_present, and on skb->protocol
+ * otherwise.
+ */
+SEC("dissect")
+int dissect(struct __sk_buff *skb)
+{
+	if (skb->vlan_present)
+		return parse_eth_proto(skb, skb->vlan_proto);
+
+	return parse_eth_proto(skb, skb->protocol);
+}
+
+/* Parses on IPPROTO_*
+ *
+ * Continues dissection at cb->nhoff for IP protocol @proto:
+ *  - ICMP: only checks that an icmphdr fits, then returns BPF_OK.
+ *  - IPIP / IPv6-in-IP: marks encapsulation and tail-calls the IP or IPV6
+ *    parser.
+ *  - GRE: a non-zero version returns BPF_OK untouched; otherwise the base
+ *    header and the optional checksum, key and sequence words are skipped,
+ *    encapsulation is marked and the inner EtherType is dispatched (after
+ *    stepping over an inner Ethernet header for ETH_P_TEB).
+ *  - TCP / UDP / UDP-Lite: checks the header length against the data
+ *    between skb->data and skb->data_end and records the ports through
+ *    write_ports().
+ * Any other protocol, and any tail call that falls through, yields
+ * BPF_DROP.
+ */
+static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	__u8 *data_end = (__u8 *)(long)skb->data_end;
+	__u8 *data = (__u8 *)(long)skb->data;
+	__u32 data_len = data_end - data;	/* bytes between data and data_end */
+	struct gre_hdr gre;
+	struct ethhdr eth;
+	struct tcphdr tcp;
+
+	switch (proto) {
+	case IPPROTO_ICMP:
+		if (cb->nhoff + sizeof(struct icmphdr) > data_len)
+			return BPF_DROP;
+		return BPF_OK;
+	case IPPROTO_IPIP:
+		cb->flags |= FLOW_DIS_ENCAPSULATION;
+		bpf_tail_call(skb, &jmp_table, IP);
+		break;	/* tail call fell through: drop below */
+	case IPPROTO_IPV6:
+		cb->flags |= FLOW_DIS_ENCAPSULATION;
+		bpf_tail_call(skb, &jmp_table, IPV6);
+		break;	/* tail call fell through: drop below */
+	case IPPROTO_GRE:
+		if (bpf_skb_load_bytes(skb, cb->nhoff, &gre, sizeof(gre)))
+			return BPF_DROP;
+
+		if (bpf_htons(gre.flags & GRE_VERSION))
+			/* Only inspect standard GRE packets with version 0 */
+			return BPF_OK;
+
+		cb->nhoff += sizeof(gre); /* Step over GRE Flags and Protocol */
+		if (GRE_IS_CSUM(gre.flags))
+			cb->nhoff += 4; /* Step over chksum and Padding */
+		if (GRE_IS_KEY(gre.flags))
+			cb->nhoff += 4; /* Step over key */
+		if (GRE_IS_SEQ(gre.flags))
+			cb->nhoff += 4; /* Step over sequence number */
+
+		cb->flags |= FLOW_DIS_ENCAPSULATION;
+
+		if (gre.proto == bpf_htons(ETH_P_TEB)) {	/* inner Ethernet frame */
+			if (bpf_skb_load_bytes(skb, cb->nhoff, &eth,
+					       sizeof(eth)))
+				return BPF_DROP;
+
+			cb->nhoff += sizeof(eth);
+
+			return parse_eth_proto(skb, eth.h_proto);
+		} else {
+			return parse_eth_proto(skb, gre.proto);
+		}
+
+	case IPPROTO_TCP:
+		if (cb->nhoff + sizeof(struct tcphdr) > data_len)
+			return BPF_DROP;
+
+		if (bpf_skb_load_bytes(skb, cb->nhoff, &tcp, sizeof(tcp)))
+			return BPF_DROP;
+
+		if (tcp.doff < 5)	/* doff is in 32-bit words; below 5 is under sizeof(tcphdr) */
+			return BPF_DROP;
+
+		if (cb->nhoff + (tcp.doff << 2) > data_len)	/* header incl. options must fit */
+			return BPF_DROP;
+
+		return write_ports(skb, proto);
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE:
+		if (cb->nhoff + sizeof(struct udphdr) > data_len)
+			return BPF_DROP;
+
+		return write_ports(skb, proto);
+	default:
+		return BPF_DROP;
+	}
+
+	return BPF_DROP;
+}
+
+/* Continue IPv6 dissection with next-header value @nexthdr. Hop-by-hop and
+ * destination options headers go to the IPV6OP program, fragment headers to
+ * IPV6FR. Any other value is taken as the upper-layer protocol: the control
+ * key (with cb->nhoff as transport offset) and the basic key are written
+ * and parse_ip_proto() takes over.
+ *
+ * Returns whatever parse_ip_proto() returns, or BPF_DROP if a key cannot be
+ * written or a tail call falls through.
+ */
+static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	struct flow_dissector_key_control ctrl;
+	struct flow_dissector_key_basic l3;
+
+	if (nexthdr == IPPROTO_HOPOPTS || nexthdr == IPPROTO_DSTOPTS) {
+		bpf_tail_call(skb, &jmp_table, IPV6OP);
+		return BPF_DROP;
+	}
+
+	if (nexthdr == IPPROTO_FRAGMENT) {
+		bpf_tail_call(skb, &jmp_table, IPV6FR);
+		return BPF_DROP;
+	}
+
+	/* No extension header handled here is left */
+	ctrl.thoff = cb->nhoff;
+	ctrl.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+	ctrl.flags = cb->flags;
+	if (bpf_flow_dissector_write_keys(skb, &ctrl, sizeof(ctrl),
+					  FLOW_DISSECTOR_KEY_CONTROL) != 0)
+		return BPF_DROP;
+
+	memset(&l3, 0, sizeof(l3));
+	l3.n_proto = bpf_htons(ETH_P_IPV6);
+	l3.ip_proto = nexthdr;
+	if (bpf_flow_dissector_write_keys(skb, &l3, sizeof(l3),
+					  FLOW_DISSECTOR_KEY_BASIC) != 0)
+		return BPF_DROP;
+
+	return parse_ip_proto(skb, nexthdr);
+}
+/* Tail-call target for IPv4. Loads the iphdr at cb->nhoff, rejects headers
+ * shorter than 20 bytes, then writes the IPv4 address, control and basic
+ * keys. Dissection continues with parse_ip_proto() on iph.protocol, except
+ * for fragments with a non-zero offset, which return BPF_OK once the keys
+ * are written. Any helper failure returns BPF_DROP.
+ */
+PROG(IP)(struct __sk_buff *skb)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	__u8 *data_end = (__u8 *)(long)skb->data_end;
+	struct flow_dissector_key_control control;
+	struct flow_dissector_key_addrs addrs;
+	struct flow_dissector_key_basic basic;
+	__u8 *data = (__u8 *)(long)skb->data;
+	__u32 data_len = data_end - data;	/* bytes between data and data_end */
+	bool done = false;	/* set when no transport header can be parsed */
+	struct iphdr iph;
+
+	if (bpf_skb_load_bytes(skb, cb->nhoff, &iph, sizeof(iph)))
+		return BPF_DROP;
+
+	/* IP header cannot be smaller than 20 bytes */
+	if (iph.ihl < 5)
+		return BPF_DROP;
+
+	addrs.v4addrs.src = iph.saddr;
+	addrs.v4addrs.dst = iph.daddr;
+	if (bpf_flow_dissector_write_keys(skb, &addrs, sizeof(addrs.v4addrs),	/* only the v4 member is set and written */
+				      FLOW_DISSECTOR_KEY_IPV4_ADDRS))
+		return BPF_DROP;
+
+	cb->nhoff += iph.ihl << 2;	/* ihl counts 32-bit words, options included */
+	if (cb->nhoff > data_len)
+		return BPF_DROP;
+
+	if (iph.frag_off & bpf_htons(IP_MF | IP_OFFSET)) {	/* MF set or offset non-zero */
+		cb->flags |= FLOW_DIS_IS_FRAGMENT;
+		if (iph.frag_off & bpf_htons(IP_OFFSET))
+			/* From second fragment on, packets do not have headers
+			 * we can parse.
+			 */
+			done = true;
+		else
+			cb->flags |= FLOW_DIS_FIRST_FRAG;
+	}
+
+
+	control.thoff = cb->nhoff;
+	control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+	control.flags = cb->flags;
+	if (bpf_flow_dissector_write_keys(skb, &control, sizeof(control),
+					  FLOW_DISSECTOR_KEY_CONTROL))
+		return BPF_DROP;
+
+	memset(&basic, 0, sizeof(basic));	/* also clears the padding byte */
+	basic.n_proto = bpf_htons(ETH_P_IP);
+	basic.ip_proto = iph.protocol;
+	if (bpf_flow_dissector_write_keys(skb, &basic, sizeof(basic),
+				      FLOW_DISSECTOR_KEY_BASIC))
+		return BPF_DROP;
+
+	if (done)
+		return BPF_OK;
+
+	return parse_ip_proto(skb, iph.protocol);
+}
+
+/* Tail-call target for IPv6. Records the address pair of the fixed header
+ * found at cb->nhoff under FLOW_DISSECTOR_KEY_IPV6_ADDRS, steps over that
+ * header and continues with its next-header field. Returns BPF_DROP if the
+ * header cannot be loaded or the key cannot be written.
+ */
+PROG(IPV6)(struct __sk_buff *skb)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	struct flow_dissector_key_addrs key;
+	struct ipv6hdr hdr;
+
+	if (bpf_skb_load_bytes(skb, cb->nhoff, &hdr, sizeof(hdr)) != 0)
+		return BPF_DROP;
+
+	/* Only the v6 member of the union is filled in and written out */
+	key.v6addrs.src = hdr.saddr;
+	key.v6addrs.dst = hdr.daddr;
+	if (bpf_flow_dissector_write_keys(skb, &key, sizeof(key.v6addrs),
+					  FLOW_DISSECTOR_KEY_IPV6_ADDRS) != 0)
+		return BPF_DROP;
+
+	cb->nhoff += sizeof(hdr);
+
+	return parse_ipv6_proto(skb, hdr.nexthdr);
+}
+
+/* Tail-call target for IPv6 hop-by-hop and destination options headers.
+ * Reads the next-header byte and the length byte at cb->nhoff, skips the
+ * whole extension header and continues with the header that follows.
+ * Returns BPF_DROP if either byte cannot be loaded.
+ */
+PROG(IPV6OP)(struct __sk_buff *skb)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	__u8 next_hdr;
+	__u8 ext_len;
+
+	if (bpf_skb_load_bytes(skb, cb->nhoff, &next_hdr,
+			       sizeof(next_hdr)) != 0)
+		return BPF_DROP;
+
+	if (bpf_skb_load_bytes(skb, cb->nhoff + sizeof(next_hdr), &ext_len,
+			       sizeof(ext_len)) != 0)
+		return BPF_DROP;
+
+	/* The length byte counts 8-octet units and leaves out the first
+	 * 8 octets of the header.
+	 */
+	cb->nhoff += (ext_len + 1) * 8;
+
+	return parse_ipv6_proto(skb, next_hdr);
+}
+
+/* Tail-call target for the IPv6 fragment extension header. Reads the
+ * next-header byte and the 16-bit fragment offset field, steps over the
+ * 8-byte header, marks the packet as a fragment (and as first fragment when
+ * the offset bits are all zero) and continues with the next header.
+ *
+ * NOTE(review): unlike the IPv4 parser, fragments with a non-zero offset
+ * are still passed on for transport header parsing -- confirm this is
+ * intended.
+ */
+PROG(IPV6FR)(struct __sk_buff *skb)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	__be16 offset_field;
+	__u8 next_hdr;
+
+	if (bpf_skb_load_bytes(skb, cb->nhoff, &next_hdr,
+			       sizeof(next_hdr)) != 0)
+		return BPF_DROP;
+
+	/* The offset field sits two bytes into the fragment header */
+	if (bpf_skb_load_bytes(skb, cb->nhoff + 2, &offset_field,
+			       sizeof(offset_field)) != 0)
+		return BPF_DROP;
+
+	cb->nhoff += 8;
+
+	if ((offset_field & bpf_htons(IP6_OFFSET)) == 0)
+		cb->flags |= FLOW_DIS_IS_FRAGMENT | FLOW_DIS_FIRST_FRAG;
+	else
+		cb->flags |= FLOW_DIS_IS_FRAGMENT;
+
+	return parse_ipv6_proto(skb, next_hdr);
+}
+
+/* Tail-call target for MPLS. Consumes one label stack entry per invocation
+ * and tail-calls itself until the bottom-of-stack entry is reached. The
+ * payload after the last entry is assumed to be IPv4 or IPv6 and is
+ * identified by the version nibble of its first byte.
+ *
+ * Returns BPF_DROP if an entry or the first payload byte cannot be loaded,
+ * if the payload is neither IPv4 nor IPv6, or if a tail call falls through.
+ */
+PROG(MPLS)(struct __sk_buff *skb)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	struct mpls_label mpls;
+
+	if (bpf_skb_load_bytes(skb, cb->nhoff, &mpls, sizeof(mpls)))
+		return BPF_DROP;
+
+	cb->nhoff += sizeof(mpls);
+
+	/* mpls.entry is big-endian as read from the wire, MPLS_LS_S_MASK is a
+	 * host-order constant: convert the mask before testing the
+	 * bottom-of-stack bit.
+	 */
+	if (mpls.entry & bpf_htonl(MPLS_LS_S_MASK)) {
+		/* This is the last MPLS header. The network layer packet always
+		 * follows the MPLS header. Peek forward and dispatch based on
+		 * that.
+		 */
+		__u8 version;
+
+		if (bpf_skb_load_bytes(skb, cb->nhoff, &version,
+				       sizeof(version)))
+			return BPF_DROP;
+
+		/* IP version is the high nibble of the first header byte, so
+		 * shift it down before comparing against 4 and 6.
+		 */
+		switch (version >> 4) {
+		case 4:
+			bpf_tail_call(skb, &jmp_table, IP);
+			break;
+		case 6:
+			bpf_tail_call(skb, &jmp_table, IPV6);
+			break;
+		default:
+			return BPF_DROP;
+		}
+	} else {
+		bpf_tail_call(skb, &jmp_table, MPLS);
+	}
+
+	/* Only reached when a tail call fell through */
+	return BPF_DROP;
+}
+/* Tail-call target for 802.1Q / 802.1ad tags. The EtherType stored in the
+ * two bytes just before cb->nhoff tells an 802.1ad outer tag from a single
+ * 802.1Q tag. An 802.1ad tag must be followed by an 802.1Q tag; a further
+ * VLAN tag after the (last) 802.1Q tag is rejected. On success the tags are
+ * stepped over and the encapsulated EtherType is dispatched through
+ * parse_eth_proto(); every failure returns BPF_DROP.
+ */
+PROG(VLAN)(struct __sk_buff *skb)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	struct vlan_hdr vlan;
+	__be16 proto;
+
+	/* Peek back to see if single or double-tagging */
+	if (bpf_skb_load_bytes(skb, cb->nhoff - sizeof(proto), &proto,
+			       sizeof(proto)))
+		return BPF_DROP;
+
+	/* Account for double-tagging */
+	if (proto == bpf_htons(ETH_P_8021AD)) {
+		if (bpf_skb_load_bytes(skb, cb->nhoff, &vlan, sizeof(vlan)))
+			return BPF_DROP;
+
+		if (vlan.h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q))
+			return BPF_DROP;
+
+		cb->nhoff += sizeof(vlan);	/* step over the outer 802.1ad tag */
+	}
+
+	if (bpf_skb_load_bytes(skb, cb->nhoff, &vlan, sizeof(vlan)))
+		return BPF_DROP;
+
+	cb->nhoff += sizeof(vlan);	/* step over the 802.1Q tag */
+	/* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/
+	if (vlan.h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) ||
+	    vlan.h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q))
+		return BPF_DROP;
+
+	return parse_eth_proto(skb, vlan.h_vlan_encapsulated_proto);
+}
+
+/* Tail-call target for GUE, reached from write_ports() for UDP packets
+ * addressed to GUE_PORT. Skips the 4-byte base header plus hlen 32-bit
+ * words, marks the flow as encapsulated and continues with proto_ctype as
+ * the IP protocol of the payload.
+ *
+ * NOTE(review): the version and control bits of the header are not
+ * inspected -- confirm that treating proto_ctype as an IP protocol is valid
+ * for every packet that reaches this point.
+ */
+PROG(GUE)(struct __sk_buff *skb)
+{
+	struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+	struct guehdr hdr;
+
+	if (bpf_skb_load_bytes(skb, cb->nhoff, &hdr, sizeof(hdr)) != 0)
+		return BPF_DROP;
+
+	/* Base header followed by hlen words of optional data */
+	cb->nhoff += sizeof(hdr) + (hdr.hlen << 2);
+	cb->flags |= FLOW_DIS_ENCAPSULATION;
+
+	return parse_ip_proto(skb, hdr.proto_ctype);
+}
+
+char __license[] SEC("license") = "GPL";	/* license string, tagged SEC("license") */