[ovs-dev,RFC,v2,4/4] bpf: Add reference XDP program implementation for netdev-offload-xdp

Message ID 20200421144704.263235-5-toshiaki.makita1@gmail.com
State Changes Requested
Series XDP offload using flow API provider

Commit Message

Toshiaki Makita April 21, 2020, 2:47 p.m. UTC
This adds a reference program, flowtable_afxdp.o, which can be used to
offload flows to XDP through netdev-offload-xdp.
The program will be compiled when the --enable-bpf switch is used.

Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
---
 Makefile.am           |   9 +-
 acinclude.m4          |  56 +++++
 bpf/.gitignore        |   4 +
 bpf/Makefile.am       |  59 +++++
 bpf/bpf_miniflow.h    | 179 +++++++++++++++
 bpf/bpf_netlink.h     |  34 +++
 bpf/bpf_workaround.h  |  28 +++
 bpf/flowtable_afxdp.c | 515 ++++++++++++++++++++++++++++++++++++++++++
 configure.ac          |   2 +
 9 files changed, 884 insertions(+), 2 deletions(-)
 create mode 100644 bpf/.gitignore
 create mode 100644 bpf/Makefile.am
 create mode 100644 bpf/bpf_miniflow.h
 create mode 100644 bpf/bpf_netlink.h
 create mode 100644 bpf/bpf_workaround.h
 create mode 100644 bpf/flowtable_afxdp.c

Comments

William Tu May 3, 2020, 4:22 p.m. UTC | #1
On Tue, Apr 21, 2020 at 11:47:04PM +0900, Toshiaki Makita wrote:
> This adds a reference program, flowtable_afxdp.o, which can be used to
> offload flows to XDP through netdev-offload-xdp.
> The program will be compiled when using --enable-bpf switch.
> 
> Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>

Hi Toshiaki,

Thanks for your patch, I haven't tried to run it but I have
questions about miniflow.

> ---
>  Makefile.am           |   9 +-
>  acinclude.m4          |  56 +++++
>  bpf/.gitignore        |   4 +
>  bpf/Makefile.am       |  59 +++++
>  bpf/bpf_miniflow.h    | 179 +++++++++++++++
>  bpf/bpf_netlink.h     |  34 +++
>  bpf/bpf_workaround.h  |  28 +++
>  bpf/flowtable_afxdp.c | 515 ++++++++++++++++++++++++++++++++++++++++++
>  configure.ac          |   2 +
>  9 files changed, 884 insertions(+), 2 deletions(-)
>  create mode 100644 bpf/.gitignore
>  create mode 100644 bpf/Makefile.am
>  create mode 100644 bpf/bpf_miniflow.h
>  create mode 100644 bpf/bpf_netlink.h
>  create mode 100644 bpf/bpf_workaround.h
>  create mode 100644 bpf/flowtable_afxdp.c
> 
> diff --git a/Makefile.am b/Makefile.am
> index b279303d1..f18bfefde 100644
> --- a/Makefile.am
> +++ b/Makefile.am
> @@ -8,6 +8,9 @@
>  AUTOMAKE_OPTIONS = foreign subdir-objects
>  ACLOCAL_AMFLAGS = -I m4
>  SUBDIRS = datapath
> +if HAVE_BPF
> +SUBDIRS += bpf
> +endif
>  
>  AM_CPPFLAGS = $(SSL_CFLAGS)
>  AM_LDFLAGS = $(SSL_LDFLAGS)
> @@ -198,7 +201,9 @@ ALL_LOCAL += dist-hook-git
>  dist-hook-git: distfiles
>  	@if test -e $(srcdir)/.git && (git --version) >/dev/null 2>&1; then \
>  	  (cd datapath && $(MAKE) distfiles); \
> -	  (cat distfiles; sed 's|^|datapath/|' datapath/distfiles) | \
> +	  (cd bpf && $(MAKE) distfiles); \
> +	  (cat distfiles; sed 's|^|datapath/|' datapath/distfiles; \
> +	   sed 's|^|bpf/|' bpf/distfiles) | \
>  	    LC_ALL=C sort -u > all-distfiles; \
>  	  (cd $(srcdir) && git ls-files) | grep -v '\.gitignore$$' | \
>  	    grep -v '\.gitattributes$$' | \
> @@ -234,7 +239,7 @@ config-h-check:
>  	@cd $(srcdir); \
>  	if test -e .git && (git --version) >/dev/null 2>&1 && \
>  	  git --no-pager grep -L '#include <config\.h>' `git ls-files | grep '\.c$$' | \
> -	    grep -vE '^datapath|^lib/sflow|^third-party|^datapath-windows|^python'`; \
> +	    grep -vE '^datapath|^lib/sflow|^third-party|^datapath-windows|^python|^bpf'`; \
>  	then \
>  	  echo "See above for list of violations of the rule that"; \
>  	  echo "every C source file must #include <config.h>."; \
> diff --git a/acinclude.m4 b/acinclude.m4
> index 0901f2870..2fb2f385f 100644
> --- a/acinclude.m4
> +++ b/acinclude.m4
> @@ -301,6 +301,62 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
>    AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true)
>  ])
>  
> +dnl OVS_CHECK_LINUX_BPF
> +dnl
> +dnl Check both llvm and libbpf support
> +AC_DEFUN([OVS_CHECK_LINUX_BPF], [
> +  AC_ARG_ENABLE([bpf],
> +                [AC_HELP_STRING([--enable-bpf],
> +                                [Compile reference eBPF programs for XDP])],
> +                [], [enable_bpf=no])
> +  AC_MSG_CHECKING([whether BPF is enabled])
> +  if test "$enable_bpf" != yes; then
> +    AC_MSG_RESULT([no])
> +    BPF_ENABLE=false
> +  else
> +    AC_MSG_RESULT([yes])
> +    BPF_ENABLE=true
> +
> +    AC_CHECK_PROG(CLANG_CHECK, clang, yes)
> +    AS_IF([test X"$CLANG_CHECK" != X"yes"],
> +      [AC_MSG_ERROR([unable to find clang to compile BPF program])])
> +
> +    AC_CHECK_PROG(LLC_CHECK, llc, yes)
> +    AS_IF([test X"$LLC_CHECK" != X"yes"],
> +      [AC_MSG_ERROR([unable to find llc to compile BPF program])])
> +
> +    AC_CHECK_HEADER([bpf/bpf_helpers.h], [],
> +      [AC_MSG_ERROR([unable to find bpf/bpf_helpers.h to compile BPF program])])
> +
> +    AC_CHECK_HEADER([linux/bpf.h], [],
> +      [AC_MSG_ERROR([unable to find linux/bpf.h to compile BPF program])])
> +
> +    AC_MSG_CHECKING([for LLVM bpf target support])
> +    if llc -march=bpf -mattr=help >/dev/null 2>&1; then
> +      AC_MSG_RESULT([yes])
> +    else
> +      AC_MSG_RESULT([no])
> +      AC_MSG_ERROR([LLVM does not support bpf target])
> +    fi
> +
> +    AC_MSG_CHECKING([for BTF DATASEC support])
> +    AC_LANG_CONFTEST(
> +      [AC_LANG_SOURCE([__attribute__((section("_x"), used)) int x;])])
> +    if clang -g -O2 -S -target bpf -emit-llvm -c conftest.c -o conftest.ll && \
> +       llc -march=bpf -filetype=obj -o conftest.o conftest.ll && \
> +       readelf -p.BTF conftest.o 2>/dev/null | grep -q -w _x; then
> +      AC_MSG_RESULT([yes])
> +    else
> +      AC_MSG_RESULT([no])
> +      AC_MSG_ERROR([LLVM does not support BTF DATASEC])
> +    fi
> +
> +    AC_DEFINE([HAVE_BPF], [1],
> +              [Define to 1 if BPF compilation is available and enabled.])
> +  fi
> +  AM_CONDITIONAL([HAVE_BPF], test "$BPF_ENABLE" = true)
> +])
> +
>  dnl OVS_CHECK_DPDK
>  dnl
>  dnl Configure DPDK source tree
> diff --git a/bpf/.gitignore b/bpf/.gitignore
> new file mode 100644
> index 000000000..ab0f2d9e4
> --- /dev/null
> +++ b/bpf/.gitignore
> @@ -0,0 +1,4 @@
> +*.ll
> +/distfiles
> +/Makefile
> +/Makefile.in
> diff --git a/bpf/Makefile.am b/bpf/Makefile.am
> new file mode 100644
> index 000000000..f485b17f0
> --- /dev/null
> +++ b/bpf/Makefile.am
> @@ -0,0 +1,59 @@
> +AUTOMAKE_OPTIONS = foreign
> +
> +EXTRA_DIST = flowtable_afxdp.c bpf_miniflow.h bpf_netlink.h bpf_workaround.h
> +
> +# The following is based on commands for the Automake "distdir" target.
> +distfiles: Makefile
> +	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
> +	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
> +	list='$(DISTFILES)'; \
> +	for file in $$list; do echo $$file; done | \
> +	  sed -e "s|^$$srcdirstrip/||;t" \
> +	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t" | \
> +	  LC_ALL=C sort -u > $@
> +CLEANFILES = distfiles
> +
> +CLANG ?= clang
> +LLC ?= llc
> +
> +AM_CPPFLAGS = -I $(top_srcdir)/include
> +AM_CPPFLAGS += -I $(top_builddir)/include
> +AM_CPPFLAGS += -I $(top_srcdir)/lib
> +AM_CPPFLAGS += -I $(top_builddir)/lib
> +
> +AM_CFLAGS = -Wall
> +AM_CFLAGS += -Wextra
> +AM_CFLAGS += -Wno-sign-compare
> +AM_CFLAGS += -Wformat -Wformat-security
> +AM_CFLAGS += -Wswitch-enum
> +AM_CFLAGS += -Wunused-parameter
> +AM_CFLAGS += -Wbad-function-cast
> +AM_CFLAGS += -Wcast-align
> +AM_CFLAGS += -Wstrict-prototypes
> +AM_CFLAGS += -Wold-style-definition
> +AM_CFLAGS += -Wmissing-field-initializers
> +AM_CFLAGS += -fno-strict-aliasing
> +AM_CFLAGS += -Wswitch-bool
> +AM_CFLAGS += -Wlogical-not-parentheses
> +AM_CFLAGS += -Wsizeof-array-argument
> +AM_CFLAGS += -Wshift-negative-value
> +AM_CFLAGS += -Wshadow
> +AM_CFLAGS += -Wcast-align
> +AM_CFLAGS += -Wno-unused-value
> +AM_CFLAGS += -Wno-compare-distinct-pointer-types
> +AM_CFLAGS += -g
> +AM_CFLAGS += -O2
> +
> +SUFFIXES = .ll
> +%.ll: %.c
> +	$(CLANG) $(AM_CPPFLAGS) $(AM_CFLAGS) -S \
> +		-target bpf -emit-llvm -c $< -o $@
> +CLEANFILES += *.ll
> +
> +%.o: %.ll
> +	$(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ $<
> +CLEANFILES += *.o
> +
> +flowtable_afxdp.o: flowtable_afxdp.c bpf_miniflow.h bpf_netlink.h bpf_workaround.h
> +
> +all-local: flowtable_afxdp.o
> diff --git a/bpf/bpf_miniflow.h b/bpf/bpf_miniflow.h
> new file mode 100644
> index 000000000..090a9c7be
> --- /dev/null
> +++ b/bpf/bpf_miniflow.h
> @@ -0,0 +1,179 @@
> +/*
> + * Copyright (c) 2020 NTT Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef BPF_MINIFLOW_H
> +#define BPF_MINIFLOW_H 1
> +
> +#include "flow.h"
> +
> +struct bpf_mf_ctx {
> +    struct flowmap map;
> +    uint64_t *data;
> +};
> +
> +static inline void
> +miniflow_set_maps(struct bpf_mf_ctx *ctx, size_t ofs, size_t n_words)
> +{
> +    flowmap_set(&ctx->map, ofs, n_words);
> +}
> +
> +static inline void
> +miniflow_set_map(struct bpf_mf_ctx *ctx, size_t ofs)
> +{
> +    flowmap_set(&ctx->map, ofs, 1);
> +}
> +
> +static inline void
> +miniflow_push_uint8_(struct bpf_mf_ctx *ctx, size_t ofs, uint8_t value)
> +{
> +    size_t ofs8 = ofs % 8;
> +
> +    if (ofs8 == 0) {
> +        miniflow_set_map(ctx, ofs / 8);
> +    }
> +    *((uint8_t *)ctx->data + ofs8) = value;
> +    if (ofs8 == 7) {
> +        ctx->data++;
> +    }
> +}
> +
> +static inline void
> +miniflow_push_uint16_(struct bpf_mf_ctx *ctx, size_t ofs, uint16_t value)
> +{
> +    size_t ofs8 = ofs % 8;
> +
> +    if (ofs8 == 0) {
> +        miniflow_set_map(ctx, ofs / 8);
> +        *(uint16_t *)ctx->data = value;
> +    } else if (ofs8 == 2) {
> +        *((uint16_t *)ctx->data + 1) = value;
> +    } else if (ofs8 == 4) {
> +        *((uint16_t *)ctx->data + 2) = value;
> +    } else if (ofs8 == 6) {
> +        *((uint16_t *)ctx->data + 3) = value;
> +        ctx->data++;
> +    }
> +}
> +
> +static inline void
> +miniflow_push_uint32_(struct bpf_mf_ctx *ctx, size_t ofs, uint32_t value)
> +{
> +    size_t ofs8 = ofs % 8;
> +
> +    if (ofs8 == 0) {
> +        miniflow_set_map(ctx, ofs / 8);
> +        *(uint32_t *)ctx->data = value;
> +    } else if (ofs8 == 4) {
> +        *((uint32_t *)ctx->data + 1) = value;
> +        ctx->data++;
> +    }
> +}
> +
> +static inline void
> +ether_addr_copy(struct eth_addr *dst, const struct eth_addr *src)
> +{
> +    ovs_be16 *a = dst->be16;
> +    const ovs_be16 *b = src->be16;
> +
> +    a[0] = b[0];
> +    a[1] = b[1];
> +    a[2] = b[2];
> +}
> +
> +/* 'valuep' is 16-aligned */
> +/* data must start 64-aligned and must be followed by other data or padding */
> +static inline void
> +miniflow_push_macs_(struct bpf_mf_ctx *ctx, size_t ofs,
> +                    const struct eth_addr *valuep)
> +{
> +    miniflow_set_maps(ctx, ofs / 8, 2);
> +    ether_addr_copy((struct eth_addr *)ctx->data, valuep);
> +    ether_addr_copy((struct eth_addr *)ctx->data + 1, valuep + 1);
> +    ctx->data++; /* First word only. */
> +}
> +
> +/* data must start 64-aligned and must be followed by other data */
> +static inline void
> +miniflow_pad_from_64_(struct bpf_mf_ctx *ctx, size_t ofs)
> +{
> +    size_t ofs8 = ofs % 8;
> +    size_t ofs4 = ofs % 4;
> +    size_t ofs2 = ofs % 2;
> +    void *cdata = ctx->data;
> +
> +    miniflow_set_map(ctx, ofs / 8);
> +
> +    if (ofs8 >= 4) {
> +        *(uint32_t *)cdata = 0;
> +        cdata += 4;
> +    }
> +    if (ofs4 >= 2) {
> +        *(uint16_t *)cdata = 0;
> +        cdata += 2;
> +    }
> +    if (ofs2 == 1) {
> +        *(uint8_t *)cdata = 0;
> +    }
> +}
> +
> +static inline void
> +miniflow_pad_to_64_(struct bpf_mf_ctx *ctx, size_t ofs)
> +{
> +    size_t ofs8 = ofs % 8;
> +    size_t ofs4 = ofs % 4;
> +    size_t ofs2 = ofs % 2;
> +    void *cdata = ctx->data;
> +
> +    cdata += ofs8;
> +    if (ofs2 == 1) {
> +        *(uint8_t *)cdata = 0;
> +        cdata++;
> +    }
> +    if (ofs4 <= 2) {
> +        *(uint16_t *)cdata = 0;
> +        cdata += 2;
> +    }
> +    if (ofs8 <= 4) {
> +        *(uint32_t *)cdata = 0;
> +    }
> +    ctx->data++;
> +}
> +
> +#define miniflow_push_uint8(CTX, FIELD, VALUE)                       \
> +    miniflow_push_uint8_(CTX, offsetof(struct flow, FIELD), VALUE)
> +
> +#define miniflow_push_be16_(CTX, OFS, VALUE)                         \
> +    miniflow_push_uint16_(CTX, OFS, (OVS_FORCE uint16_t)VALUE)
> +
> +#define miniflow_push_be16(CTX, FIELD, VALUE)                        \
> +    miniflow_push_be16_(CTX, offsetof(struct flow, FIELD), VALUE)    \
> +
> +#define miniflow_push_be32_(CTX, OFS, VALUE)                         \
> +    miniflow_push_uint32_(CTX, OFS, (OVS_FORCE uint32_t)VALUE)
> +
> +#define miniflow_push_be32(CTX, FIELD, VALUE)                        \
> +    miniflow_push_be32_(CTX, offsetof(struct flow, FIELD), VALUE)    \
> +
> +#define miniflow_push_macs(CTX, FIELD, VALUEP)                       \
> +    miniflow_push_macs_(CTX, offsetof(struct flow, FIELD), VALUEP)
> +
> +#define miniflow_pad_from_64(CTX, FIELD)                             \
> +    miniflow_pad_from_64_(CTX, offsetof(struct flow, FIELD))
> +
> +#define miniflow_pad_to_64(CTX, FIELD)                               \
> +    miniflow_pad_to_64_(CTX, OFFSETOFEND(struct flow, FIELD))
> +
> +#endif /* bpf_miniflow.h */
> diff --git a/bpf/bpf_netlink.h b/bpf/bpf_netlink.h
> new file mode 100644
> index 000000000..091926c61
> --- /dev/null
> +++ b/bpf/bpf_netlink.h
> @@ -0,0 +1,34 @@
> +/*
> + * Copyright (c) 2020 NTT Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef BPF_NETLINK_H
> +#define BPF_NETLINK_H 1
> +
> +#include "netlink.h"
> +
> +static inline int
> +bpf_nl_attr_type(const struct nlattr *nla)
> +{
> +    return nla->nla_type & NLA_TYPE_MASK;
> +}
> +
> +static inline const void *
> +bpf_nl_attr_get(const struct nlattr *nla)
> +{
> +    return nla + 1;
> +}
> +
> +#endif /* bpf_netlink.h */
> diff --git a/bpf/bpf_workaround.h b/bpf/bpf_workaround.h
> new file mode 100644
> index 000000000..cb072a7e6
> --- /dev/null
> +++ b/bpf/bpf_workaround.h
> @@ -0,0 +1,28 @@
> +/*
> + * Copyright (c) 2020 NTT Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef BPF_WORKAROUND_H
> +#define BPF_WORKAROUND_H
> +
> +/* On Linux x86/x64 systems bits/wordsize.h included from stdint.h cannot
> + * correctly determine __WORDSIZE for bpf, which causes incorrect UINTPTR_MAX
> + */
> +#if __UINTPTR_MAX__ == __UINT64_MAX__ && defined(UINTPTR_MAX)
> +#undef UINTPTR_MAX
> +#define UINTPTR_MAX UINT64_MAX
> +#endif
> +
> +#endif /* bpf_workaround.h */
> diff --git a/bpf/flowtable_afxdp.c b/bpf/flowtable_afxdp.c
> new file mode 100644
> index 000000000..7a4767333
> --- /dev/null
> +++ b/bpf/flowtable_afxdp.c
> @@ -0,0 +1,515 @@
> +/*
> + * Copyright (c) 2020 NTT Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +/* linux/types.h is necessary for bpf_helpers.h as it's not self-contained */
> +#include <linux/types.h>
> +#include <bpf/bpf_helpers.h>
> +#include <linux/bpf.h>
> +
> +/* Workaround for incorrect macros for bpf in stdint.h */
> +#include <stdint.h>
> +#include "bpf_workaround.h"
> +
> +#include "bpf_miniflow.h"
> +#include "bpf_netlink.h"
> +#include "netdev-offload-xdp.h"
> +#include "packets.h"
> +#include "util.h"
> +
> +/* Supported keys. Need to keep same 64-align as struct flow for miniflow */
> +struct xdp_flow {
> +    struct eth_addr dl_dst;
> +    struct eth_addr dl_src;
> +    ovs_be16 dl_type;
> +    uint8_t pad1[2];
> +
> +    union flow_vlan_hdr vlans[1];
> +    uint8_t pad2[4];
> +
> +    ovs_be32 nw_src;
> +    ovs_be32 nw_dst;
> +
> +    uint8_t pad3[4];
> +    uint8_t nw_frag;
> +    uint8_t nw_tos;
> +    uint8_t nw_ttl;
> +    uint8_t nw_proto;
> +
> +    ovs_be16 tp_src;
> +    ovs_be16 tp_dst;
> +    uint8_t pad4[4];
> +};
> +
> +/* Size of xdp_flow must be 64-aligned for key comparison */
> +BUILD_ASSERT_DECL(sizeof(struct xdp_flow) % sizeof(uint64_t) == 0);
> +
> +#define XDP_FLOW_U64S (sizeof(struct xdp_flow) / sizeof(uint64_t))
> +
> +#define XDP_MAX_SUBTABLE_FLOWS 1024
> +#define XDP_MAX_ACTIONS_LEN 256
> +
> +/* Actual key in each subtable. miniflow map is omitted as it's identical to
> + * mask map */
> +struct xdp_flow_key {
> +    union {
> +        uint64_t miniflow_buf[XDP_FLOW_U64S];
> +        struct xdp_flow _flow; /* Need this to keep xdp_flow in BTF */
> +    };
> +};
> +
> +/* Value for subtable mask array */
> +struct xdp_subtable_mask {
> +    struct xdp_subtable_mask_header header;
> +    uint64_t buf[XDP_FLOW_U64S];
> +};
> +
> +/* miniflow for packet */
> +struct xdp_miniflow {
> +    struct miniflow mf;
> +    struct xdp_flow_key value;
> +};
> +
> +/* Used when the action only modifies the packet */
> +#define _XDP_ACTION_CONTINUE -1
> +
> +/* Supported actions */
> +/* XXX: This size should be uint16_t but needs to be int as kernel has a bug
> + * in btf_enum_check_member() that assumes enum size is sizeof(int), which
> + * causes an error when loading BTF if we use uint16_t here */
> +enum action_attrs : uint32_t {
> +    XDP_ACTION_OUTPUT = OVS_ACTION_ATTR_OUTPUT,
> +    XDP_ACTION_PUSH_VLAN = OVS_ACTION_ATTR_PUSH_VLAN,
> +    XDP_ACTION_POP_VLAN = OVS_ACTION_ATTR_POP_VLAN,
> +};
> +
> +/* Identical to struct nlattr. Need this to keep enum action_attrs in BTF */
> +struct xdp_action_nlattr {
> +    uint16_t nla_len;
> +    enum action_attrs action_type;
> +};
> +
> +struct xdp_flow_actions {
> +    struct xdp_flow_actions_header header;
> +    uint8_t data[XDP_MAX_ACTIONS_LEN - sizeof(struct xdp_flow_actions_header)];
> +    /* Dummy. Need xdp_action_nlattr to keep enum action_attrs in BTF */
> +    struct xdp_action_nlattr _xdp_actions[];
> +};
> +
> +
> +/* Map definitions */
> +
> +struct {
> +    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +    __uint(max_entries, 256);
> +    __type(key, uint32_t);
> +    __type(value, long);
> +} debug_stats SEC(".maps");
> +
> +/* Temporary storage for packet miniflow. Need this because verifier does not
> + * allow access to array variable in stack with variable index. Such access
> + * happens in mask_key() */
> +struct {
> +    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +    __uint(max_entries, 1);
> +    __type(key, uint32_t);
> +    __type(value, struct xdp_miniflow);
> +} pkt_mf_tbl SEC(".maps");
> +
> +struct {
> +    __uint(type, BPF_MAP_TYPE_XSKMAP);
> +    __uint(max_entries, 1); /* This should be redefined by userspace */
> +    __uint(key_size, sizeof(int));
> +    __uint(value_size, sizeof(int));
> +} xsks_map SEC(".maps");
> +
> +struct {
> +    __uint(type, BPF_MAP_TYPE_DEVMAP);
> +    __uint(max_entries, XDP_MAX_PORTS);
> +    __uint(key_size, sizeof(int));
> +    __uint(value_size, sizeof(int));
> +} output_map SEC(".maps");
> +
> +/* Head index for subtbl_masks list */
> +/* TODO: Use global variable */
> +struct {
> +    __uint(type, BPF_MAP_TYPE_ARRAY);
> +    __uint(max_entries, 1);
> +    __type(key, uint32_t);
> +    __type(value, int);
> +} subtbl_masks_hd SEC(".maps");
> +
> +/* Information about subtable mask. A list implemented using array */
> +struct {
> +    __uint(type, BPF_MAP_TYPE_ARRAY);
> +    __uint(max_entries, XDP_MAX_SUBTABLES);
> +    __type(key, uint32_t);
> +    __type(value, struct xdp_subtable_mask);
> +} subtbl_masks SEC(".maps");
> +
> +/* Template for subtable hash-map. This will be used in userspace to create
> + * flow_table array-of-maps. */
> +struct {
> +    __uint(type, BPF_MAP_TYPE_HASH);
> +    __uint(max_entries, XDP_MAX_SUBTABLE_FLOWS);
> +    __type(key, struct xdp_flow_key);
> +    __type(value, struct xdp_flow_actions);
> +} subtbl_template SEC(".maps");
> +
> +/* Array-of-maps whose entry contains subtable hash-map. */
> +struct {
> +    __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
> +    __uint(max_entries, XDP_MAX_SUBTABLES);
> +    __uint(key_size, sizeof(uint32_t));
> +    __uint(value_size, sizeof(uint32_t));
> +} flow_table SEC(".maps");
> +
> +
> +static inline void
> +account_debug(int idx)
> +{
> +    long *cnt;
> +
> +    cnt = bpf_map_lookup_elem(&debug_stats, &idx);
> +    if (cnt) {
> +        *cnt += 1;
> +    }
> +}
> +
> +static inline void
> +account_action(enum action_attrs act)
> +{
> +    account_debug(act + 1);
> +}
> +
> +/* Derived from xsk_load_xdp_prog() in libbpf */
> +static inline int
> +upcall(struct xdp_md *ctx)
> +{
> +    int ret, index = ctx->rx_queue_index;
> +
> +    ret = bpf_redirect_map(&xsks_map, index, XDP_ABORTED);
> +    if (ret > 0) {
> +        return ret;
> +    }
> +
> +    /* Fallback for kernel <= 5.3 not supporting default action in flags */
> +    if (bpf_map_lookup_elem(&xsks_map, &index)) {
> +        return bpf_redirect_map(&xsks_map, index, 0);
> +    }
> +
> +    return XDP_ABORTED;
> +}
> +
> +static inline int
> +action_output(int tx_port)
> +{
> +    account_action(XDP_ACTION_OUTPUT);
> +
> +    return bpf_redirect_map(&output_map, tx_port, 0);
> +}
> +
> +static inline int
> +action_vlan_push(struct xdp_md *ctx OVS_UNUSED,
> +                 const struct ovs_action_push_vlan *vlan OVS_UNUSED)
> +{
> +    account_action(XDP_ACTION_PUSH_VLAN);
> +
> +    /* TODO: implement this */
> +    return XDP_ABORTED;
> +}
> +
> +static inline int
> +action_vlan_pop(struct xdp_md *ctx OVS_UNUSED)
> +{
> +    account_action(XDP_ACTION_POP_VLAN);
> +
> +    /* TODO: implement this */
> +    return XDP_ABORTED;
> +}
> +
> +/* TODO: Add more actions */
> +
> +
> +struct nw_params {
> +    union {
> +        ovs_be32 params;
> +        struct {
> +            uint8_t nw_frag;
> +            uint8_t nw_tos;
> +            uint8_t nw_ttl;
> +            uint8_t nw_proto;
> +        };
> +    };
> +};
> +
> +static inline int
> +parse_ipv4(void *data, uint64_t *nh_off, void *data_end,
> +           struct bpf_mf_ctx *mf_ctx, struct nw_params *nw_params)
> +{
> +    struct ip_header *ip = data + *nh_off;
> +
> +    if (ip + 1 > data_end) {
> +        return 1;
> +    }
> +
> +    /* Linux network drivers ensure that IP header is 4-byte aligned or
> +     * the platform can handle unaligned access */
> +    miniflow_push_be32(mf_ctx, nw_src, *(ovs_be32 *)(void *)&ip->ip_src);
> +    miniflow_push_be32(mf_ctx, nw_dst, *(ovs_be32 *)(void *)&ip->ip_dst);
> +
> +    if (OVS_UNLIKELY(IP_IS_FRAGMENT(ip->ip_frag_off))) {
> +        nw_params->nw_frag = FLOW_NW_FRAG_ANY;
> +        if (ip->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
> +            nw_params->nw_frag |= FLOW_NW_FRAG_LATER;
> +        }
> +    } else {
> +        nw_params->nw_frag = 0;
> +    }
> +    nw_params->nw_tos = ip->ip_tos;
> +    nw_params->nw_ttl = ip->ip_ttl;
> +    nw_params->nw_proto = ip->ip_proto;
> +
> +    *nh_off += IP_IHL(ip->ip_ihl_ver) * 4;
> +
> +    return 0;
> +}
> +
> +static inline int
> +xdp_miniflow_extract(struct xdp_md *ctx, struct xdp_miniflow *pkt_mf)
> +{
> +    void *data = (void *)(long)ctx->data;
> +    void *data_end = (void *)(long)ctx->data_end;
> +    struct eth_header *eth = data;
> +    struct vlan_header *vlan = NULL;
> +    ovs_be16 dl_type;
> +    uint64_t nh_off;
> +    struct nw_params nw_params;
> +    struct bpf_mf_ctx mf_ctx = { {{ 0 }}, (uint64_t *)&pkt_mf->value };
> +
> +    nh_off = sizeof *eth;
> +    if (data + nh_off > data_end) {
> +        return 1;
> +    }
> +
> +    miniflow_push_macs(&mf_ctx, dl_dst, &eth->eth_dst);
> +
> +    if (eth_type_vlan(eth->eth_type)) {
> +        vlan = data + nh_off;
> +        nh_off += sizeof(*vlan);
> +        if (data + nh_off > data_end) {
> +            return 1;
> +        }
> +        dl_type = vlan->vlan_next_type;
> +    } else {
> +        dl_type = eth->eth_type;
> +    }
> +    miniflow_push_be16(&mf_ctx, dl_type, dl_type);
> +    miniflow_pad_to_64(&mf_ctx, dl_type);
> +
> +    if (vlan) {
> +        const ovs_16aligned_be32 *qp;
> +        union flow_vlan_hdr vlan_hdr;
> +
> +        qp = (ovs_16aligned_be32 *)&eth->eth_type;
> +        vlan_hdr.qtag = get_16aligned_be32(qp);
> +        vlan_hdr.tci |= htons(VLAN_CFI);
> +        miniflow_push_be32(&mf_ctx, vlans, vlan_hdr.qtag);
> +        miniflow_push_be32_(&mf_ctx,
> +                            offsetof(struct flow, vlans) + sizeof(ovs_be32),
> +                            0);
> +    }
> +
> +    if (dl_type == htons(ETH_TYPE_IP)) {
> +        if (parse_ipv4(data, &nh_off, data_end, &mf_ctx, &nw_params)) {
> +            return 1;
> +        }
> +    } else {
> +        goto out;
> +    }
> +    miniflow_pad_from_64(&mf_ctx, nw_frag);
> +    miniflow_push_be32(&mf_ctx, nw_frag, &nw_params.params);
> +
> +    if (nw_params.nw_proto == IPPROTO_TCP) {
> +        struct tcp_header *tcp = data + nh_off;
> +
> +        if (tcp + 1 > data_end) {
> +            return 1;
> +        }
> +
> +        miniflow_push_be16(&mf_ctx, tp_src, tcp->tcp_src);
> +        miniflow_push_be16(&mf_ctx, tp_dst, tcp->tcp_dst);
> +    } else if (nw_params.nw_proto == IPPROTO_UDP) {
> +        struct udp_header *udp = data + nh_off;
> +
> +        if (udp + 1 > data_end) {
> +            return 1;
> +        }
> +
> +        miniflow_push_be16(&mf_ctx, tp_src, udp->udp_src);
> +        miniflow_push_be16(&mf_ctx, tp_dst, udp->udp_dst);
> +    }
> +out:
> +    pkt_mf->mf.map = mf_ctx.map;
> +    return 0;
> +}
> +
> +#define for_each_subtable_mask(subtable_mask, head, idx, cnt) \
> +    for (subtable_mask = bpf_map_lookup_elem(&subtbl_masks, (head)), \
> +         idx = *(head), cnt = 0; \
> +         subtable_mask != NULL && cnt < XDP_MAX_SUBTABLES; \
> +         idx = subtable_mask->header.next, \
> +         subtable_mask = bpf_map_lookup_elem(&subtbl_masks, &idx), cnt++)
> +
> +/* Returns false if an error happens */
> +static inline int
> +mask_key(uint64_t *mkey, const struct miniflow *pkt_mf,
> +         const struct xdp_subtable_mask_header *tbl_mask)
> +{
> +    const struct miniflow *tbl_mf = &tbl_mask->mask.masks;
> +    const uint64_t *tbl_blocks = miniflow_get_values(tbl_mf);
> +    const uint64_t *pkt_blocks = miniflow_get_values(pkt_mf);
> +    uint64_t tbl_mf_bits = tbl_mf->map.bits[0];
> +    uint64_t pkt_mf_bits = pkt_mf->map.bits[0];
> +    uint8_t tbl_mf_bits_u0 = tbl_mask->mf_bits_u0;
> +    uint8_t tbl_mf_bits_u1 = tbl_mask->mf_bits_u1;
> +    unsigned int pkt_ofs = 0;
> +    int i = 0;
> +
> +    /* This sensitive loop easily exceeds verifier limit 1M insns so
> +     * need to be careful when modifying.
> +     * E.g. increasing XDP_FLOW_U64S by adding keys to struct xdp_flow
> +     * increases verifier cost and may be rejected due to 1M insns exceeds */
> +    for (; i < tbl_mf_bits_u0 + tbl_mf_bits_u1 && i < XDP_FLOW_U64S; i++) {
> +        uint64_t mf_mask;
> +        uint64_t idx_bits;
> +        unsigned int pkt_idx;
> +        uint64_t lowest_bit;
> +
> +        if (i == tbl_mf_bits_u0) {
> +            tbl_mf_bits = tbl_mf->map.bits[1];
> +            pkt_mf_bits = pkt_mf->map.bits[1];
> +            pkt_ofs = count_1bits(pkt_mf->map.bits[0]);
> +        }
> +
> +        lowest_bit = tbl_mf_bits & -tbl_mf_bits;
> +        tbl_mf_bits &= ~lowest_bit;
> +        if (!(lowest_bit & pkt_mf_bits)) {
> +            mkey[i] = 0;
> +            continue;
> +        }
> +        mf_mask = lowest_bit - 1;
> +        idx_bits = mf_mask & pkt_mf_bits;
> +        pkt_idx = count_1bits(idx_bits) + pkt_ofs;
> +        if (pkt_idx >= XDP_FLOW_U64S) {
> +            /* xdp flow api provider (userspace) BUG */
> +            return false;
> +        }
> +
> +        mkey[i] = pkt_blocks[pkt_idx] & tbl_blocks[i];
> +    }
> +
> +    return true;
> +}
> +
> +SEC("xdp") int
> +flowtable_afxdp(struct xdp_md *ctx)
> +{
> +    struct xdp_miniflow *pkt_mf;
> +    struct xdp_subtable_mask *subtable_mask;
> +    int *head;
> +    struct xdp_flow_actions *xdp_actions = NULL;
> +    struct nlattr *a;
> +    unsigned int left;
> +    int cnt, idx, zero = 0;
> +
> +    account_debug(0);
> +
> +    head = bpf_map_lookup_elem(&subtbl_masks_hd, &zero);
> +    if (!head) {
> +        return XDP_ABORTED;
> +    }
> +    if (*head == XDP_SUBTABLES_TAIL) {
> +        /* Offload not enabled */
> +        goto upcall;
> +    }
> +
> +    /* Get temporary storage for storing packet miniflow */
> +    pkt_mf = bpf_map_lookup_elem(&pkt_mf_tbl, &zero);
> +    if (!pkt_mf) {
> +        return XDP_ABORTED;
> +    }
> +

I'm starting to wonder what the benefit of using miniflow in XDP is.
A miniflow compresses the large flow key into less memory, with a
flowmap indicating which bit offsets are valid. And when adding a new
subtable at flow_put, the subtable's key size can be smaller,
containing only the needed fields.
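
For reference, the miniflow being discussed is roughly the following
shape (a simplified sketch with approximate names; the real
definitions are in OVS lib/flow.h):

    #include <stdint.h>

    /* Simplified sketch, not the real OVS definitions. */
    struct flowmap_sketch {
        uint64_t bits[2];   /* One bit per 64-bit unit of struct flow. */
    };

    struct miniflow_sketch {
        struct flowmap_sketch map;
        /* Followed by the populated 64-bit units of struct flow, packed
         * back to back; units whose bit is clear in 'map' are omitted. */
        uint64_t values[];
    };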

But in the case of XDP/eBPF we have to statically allocate a fixed
key size (subtbl_template has struct xdp_flow_key as its key), so
each subtable always has a key with the full set of fields (no memory
is saved). Moreover, the miniflow makes the 'mask_key' function below
pretty complicated.

What if we don't compress the flow key?
Unlike OVS userspace, this would be more like the kernel's
implementation of OVS megaflows (e.g. masked_flow_lookup() in
net/openvswitch/flow_table.c), where there is a list of masks (like
your 'subtbl_masks' map) but only one hash table (so we don't need
map-in-map). We look up the hash table multiple times, each time
applying a different mask from the mask list until finding a match.
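
As a rough BPF-side sketch of that approach (untested; 'mask_list',
'single_flow_table' and 'flow_key_apply_mask' are made-up names used
only for illustration):

    struct xdp_flow_key pkt_key, masked_key;
    struct xdp_flow_actions *acts = NULL;
    uint32_t i;

    /* ... extract pkt_key from the packet first ... */

    /* Bounded loop over the mask list; needs a verifier that accepts
     * bounded loops (kernel >= 5.3). */
    for (i = 0; i < XDP_MAX_SUBTABLES; i++) {
        struct xdp_flow_key *mask = bpf_map_lookup_elem(&mask_list, &i);

        if (!mask) {
            break;
        }
        flow_key_apply_mask(&masked_key, &pkt_key, mask); /* hypothetical */
        acts = bpf_map_lookup_elem(&single_flow_table, &masked_key);
        if (acts) {
            break;              /* First matching mask wins. */
        }
    }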

However, I'm not sure whether this will hit some limitations of the verifier.

Please correct me if I'm misunderstanding something here.
Thanks
William

> +    /* Extract miniflow from packet */
> +    if (xdp_miniflow_extract(ctx, pkt_mf)) {
> +        return XDP_DROP;
> +    }
> +
> +    /* Lookup each subtable */
> +    for_each_subtable_mask(subtable_mask, head, idx, cnt) {
> +        struct xdp_flow_key mkey = { 0 };
> +        void *subtable;
> +
> +        subtable = bpf_map_lookup_elem(&flow_table, &idx);
> +        if (!subtable) {
> +            return XDP_ABORTED;
> +        }
> +
> +        if (!mask_key(mkey.miniflow_buf, &pkt_mf->mf,
> +                      &subtable_mask->header)) {
> +            continue;
> +        }
> +
> +        xdp_actions = bpf_map_lookup_elem(subtable, &mkey);
> +        if (xdp_actions) {
> +            break;
> +        }
> +    }
> +
> +    if (!xdp_actions) {
> +        /* Flow entry miss */
> +upcall:
> +        return upcall(ctx);
> +    }
> +
> +    /* Execute actions */
> +    NL_ATTR_FOR_EACH_UNSAFE(a, left, xdp_flow_actions(&xdp_actions->header),
> +                            xdp_actions->header.actions_len) {
> +        uint16_t type = bpf_nl_attr_type(a);
> +        int act;
> +
> +        switch ((enum action_attrs)type) {
> +        case XDP_ACTION_OUTPUT:
> +            /* Note: userspace ensures there is no multiple output in actions */
> +            return action_output(*(int *)bpf_nl_attr_get(a));
> +        case XDP_ACTION_PUSH_VLAN:
> +            act = action_vlan_push(ctx, bpf_nl_attr_get(a));
> +            break;
> +        case XDP_ACTION_POP_VLAN:
> +            act = action_vlan_pop(ctx);
> +            break;
> +        default:
> +            return XDP_ABORTED;
> +        }
> +        if (act != _XDP_ACTION_CONTINUE) {
> +            return act;
> +        }
> +    }
> +
> +    account_debug(1);
> +    return XDP_DROP;
> +}
> +
> +char _license[] SEC("license") = "Apache-2.0";
> diff --git a/configure.ac b/configure.ac
> index 1877aae56..99a93ce00 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -99,6 +99,7 @@ OVS_CHECK_DOT
>  OVS_CHECK_IF_DL
>  OVS_CHECK_STRTOK_R
>  OVS_CHECK_LINUX_AF_XDP
> +OVS_CHECK_LINUX_BPF
>  AC_CHECK_DECLS([sys_siglist], [], [], [[#include <signal.h>]])
>  AC_CHECK_MEMBERS([struct stat.st_mtim.tv_nsec, struct stat.st_mtimensec],
>    [], [], [[#include <sys/stat.h>]])
> @@ -198,6 +199,7 @@ AC_CONFIG_FILES(datapath/Makefile)
>  AC_CONFIG_FILES(datapath/linux/Kbuild)
>  AC_CONFIG_FILES(datapath/linux/Makefile)
>  AC_CONFIG_FILES(datapath/linux/Makefile.main)
> +AC_CONFIG_FILES(bpf/Makefile)
>  AC_CONFIG_FILES(tests/atlocal)
>  AC_CONFIG_FILES(lib/libopenvswitch.pc)
>  AC_CONFIG_FILES(lib/libsflow.pc)
> -- 
> 2.25.1
>
Toshiaki Makita May 5, 2020, 2:43 a.m. UTC | #2
On 2020/05/04 1:22, William Tu wrote:
> On Tue, Apr 21, 2020 at 11:47:04PM +0900, Toshiaki Makita wrote:
>> This adds a reference program, flowtable_afxdp.o, which can be used to
>> offload flows to XDP through netdev-offload-xdp.
>> The program will be compiled when using --enable-bpf switch.
>>
>> Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
> 
> Hi Toshiaki,
> 
> Thanks for your patch, I haven't tried to run it but I have
> questions about miniflow.
> 
>> ---
...
>> +SEC("xdp") int
>> +flowtable_afxdp(struct xdp_md *ctx)
>> +{
>> +    struct xdp_miniflow *pkt_mf;
>> +    struct xdp_subtable_mask *subtable_mask;
>> +    int *head;
>> +    struct xdp_flow_actions *xdp_actions = NULL;
>> +    struct nlattr *a;
>> +    unsigned int left;
>> +    int cnt, idx, zero = 0;
>> +
>> +    account_debug(0);
>> +
>> +    head = bpf_map_lookup_elem(&subtbl_masks_hd, &zero);
>> +    if (!head) {
>> +        return XDP_ABORTED;
>> +    }
>> +    if (*head == XDP_SUBTABLES_TAIL) {
>> +        /* Offload not enabled */
>> +        goto upcall;
>> +    }
>> +
>> +    /* Get temporary storage for storing packet miniflow */
>> +    pkt_mf = bpf_map_lookup_elem(&pkt_mf_tbl, &zero);
>> +    if (!pkt_mf) {
>> +        return XDP_ABORTED;
>> +    }
>> +
> 
> I start to wonder what's the benefit of using miniflow in XDP?
> miniflow tries to compress the large flow key into smaller memory,
> and with a flowmap indicating which offset in bits are valid.
> And when adding a new subtable at flow_put, the subtable's key
> size can be smaller with only the needed fields.
> 
> But in the case of XDP/eBPF, we have to statically allocated fixed
> key size (the subtbl_template has key as struct xdp_flow_key), so
> each subtable is always having key with full key fields. (not saving
> memory space) Moreover with miniflow, it makes the 'mask_key' function
> below pretty complicated.

The fixed-size subtable is a restriction of map-in-map.
The benefits of using miniflow that I envisioned initially were
- compressing the key
- reusing existing functions to convert flows for XDP

The first one is actually not doable due to map-in-map. I hope someday
the restriction gets loosened...
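
For reference, the restriction is that every inner map inserted into
the array-of-maps has to match the declared template's shape. A rough
userspace-side sketch, assuming a newer libbpf (the helpers shown and
the add_subtable() wrapper are not part of this patch):

    #include <bpf/bpf.h>

    /* All inner maps must have the same key_size/value_size/max_entries
     * as the template, so a subtable's key cannot be shrunk to just the
     * fields its mask actually uses. */
    static int add_subtable(int flow_table_fd, uint32_t idx)
    {
        int inner_fd = bpf_map_create(BPF_MAP_TYPE_HASH, "subtbl",
                                      sizeof(struct xdp_flow_key),
                                      sizeof(struct xdp_flow_actions),
                                      XDP_MAX_SUBTABLE_FLOWS, NULL);

        if (inner_fd < 0) {
            return inner_fd;
        }
        return bpf_map_update_elem(flow_table_fd, &idx, &inner_fd, BPF_ANY);
    }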

The second one is doable, and it's good for the userspace offload
driver. But I ended up rewriting most of the miniflow helper functions
for BPF due to some restrictions.

> What if we don't compress the flow key?
> unlike to OVS userspace, more like the kernel's implementation of OVS
> megaflow (ex: masked_flow_lookup in net/openvswitch/flow_table.c)

Without miniflow the BPF program will be simple, like the original
xdp_flow patch for the Linux kernel.
I can try it.

> Where it has a list of masks (like your 'subtbl_masks' map) but
> only use 1 hash table (so we don't need map-in-map).
> We lookup the hash table multiple times, each
> time applied different masks in the mask list until finding a match.

For HW offload one big hash table may help. For software it will be
slower, as the key size and number of entries are larger.
Maybe we can support both types, one big hash table and multiple
subtables, in the future?

As I don't have BPF-offloadable NICs, I prefer to keep the current
multiple subtables for now.

Toshiaki Makita

> 
> However, I'm not sure whether this will hit some limitations of verifier.
> 
> Please correct me if I understand wrong here.
> Thanks
> William
William Tu May 5, 2020, 3:18 p.m. UTC | #3
On Tue, May 05, 2020 at 11:43:58AM +0900, Toshiaki Makita wrote:
> On 2020/05/04 1:22, William Tu wrote:
> >On Tue, Apr 21, 2020 at 11:47:04PM +0900, Toshiaki Makita wrote:
> >>This adds a reference program, flowtable_afxdp.o, which can be used to
> >>offload flows to XDP through netdev-offload-xdp.
> >>The program will be compiled when using --enable-bpf switch.
> >>
> >>Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
> >
> >Hi Toshiaki,
> >
> >Thanks for your patch, I haven't tried to run it but I have
> >questions about miniflow.
> >
> >>---
> ...
> >>+SEC("xdp") int
> >>+flowtable_afxdp(struct xdp_md *ctx)
> >>+{
> >>+    struct xdp_miniflow *pkt_mf;
> >>+    struct xdp_subtable_mask *subtable_mask;
> >>+    int *head;
> >>+    struct xdp_flow_actions *xdp_actions = NULL;
> >>+    struct nlattr *a;
> >>+    unsigned int left;
> >>+    int cnt, idx, zero = 0;
> >>+
> >>+    account_debug(0);
> >>+
> >>+    head = bpf_map_lookup_elem(&subtbl_masks_hd, &zero);
> >>+    if (!head) {
> >>+        return XDP_ABORTED;
> >>+    }
> >>+    if (*head == XDP_SUBTABLES_TAIL) {
> >>+        /* Offload not enabled */
> >>+        goto upcall;
> >>+    }
> >>+
> >>+    /* Get temporary storage for storing packet miniflow */
> >>+    pkt_mf = bpf_map_lookup_elem(&pkt_mf_tbl, &zero);
> >>+    if (!pkt_mf) {
> >>+        return XDP_ABORTED;
> >>+    }
> >>+
> >
> >I start to wonder what's the benefit of using miniflow in XDP?
> >miniflow tries to compress the large flow key into smaller memory,
> >and with a flowmap indicating which offset in bits are valid.
> >And when adding a new subtable at flow_put, the subtable's key
> >size can be smaller with only the needed fields.
> >
> >But in the case of XDP/eBPF, we have to statically allocated fixed
> >key size (the subtbl_template has key as struct xdp_flow_key), so
> >each subtable is always having key with full key fields. (not saving
> >memory space) Moreover with miniflow, it makes the 'mask_key' function
> >below pretty complicated.
> 
> Fixed sized subtable is restriction of map-in-map.
> The benefit to use miniflow I envisioned initially was
> - compress the key
> - use existing function to convert flows for XDP
> 
> The first one is actually not doable due to map-in-map. I hope someday the
> restriction gets loosen...

On second thought, this might be a good idea.

One problem I hit in my previous prototype is that the flow key is
too big, e.g. struct sw_flow_key in the OVS kernel module is more than
500B, and moving it to BPF hit some BPF limitations. If we continue
adding more fields to struct xdp_flow, e.g. IPv6 or VXLAN tunnel keys,
then with miniflow extraction the actual memory used for the key stays
smaller: the key size in subtbl_template can be a value smaller than
the size of struct xdp_flow.

But when adding more fields, I'm not sure whether we will hit some
BPF limitation in xdp_miniflow_extract.
So miniflow can save key size but complicates the key extraction.
Without miniflow, the key size is large but key extraction and
matching are simpler.
What do you think?

> 
> The second one is doable and it's good for userspace offload driver.
> But I ended up with rewriting most helper functions of miniflow for BPF due
> to some restriction.
> 
> >What if we don't compress the flow key?
> >unlike to OVS userspace, more like the kernel's implementation of OVS
> >megaflow (ex: masked_flow_lookup in net/openvswitch/flow_table.c)
> 
> Without miniflow the BPF program will be simple like the original xdp_flow
> patch for Linux kernel.
> I can try it.
> 
> >Where it has a list of masks (like your 'subtbl_masks' map) but
> >only use 1 hash table (so we don't need map-in-map).
> >We lookup the hash table multiple times, each
> >time applied different masks in the mask list until finding a match.
> 
> For HW offload one big hash table may help. Fow software it will be slower
> as key size and entries are larger.
> Maybe we can support both types, one big hash table and multiple subtables,
> in the future?
> 

For one big hash table, do you mean adding another cache before megaflow
like EMC (exact match cache)?

William
> As I don't have BPF offloadable NICs I prefer to keep current multiple
> subtables for now.
> 
> Toshiaki Makita
> 
> >
> >However, I'm not sure whether this will hit some limitations of verifier.
> >
> >Please correct me if I understand wrong here.
> >Thanks
> >William
Toshiaki Makita May 7, 2020, 2:56 p.m. UTC | #4
On 2020/05/06 0:18, William Tu wrote:
> On Tue, May 05, 2020 at 11:43:58AM +0900, Toshiaki Makita wrote:
>> On 2020/05/04 1:22, William Tu wrote:
>>> On Tue, Apr 21, 2020 at 11:47:04PM +0900, Toshiaki Makita wrote:
>>>> This adds a reference program, flowtable_afxdp.o, which can be used to
>>>> offload flows to XDP through netdev-offload-xdp.
>>>> The program will be compiled when using --enable-bpf switch.
>>>>
>>>> Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
>>>
>>> Hi Toshiaki,
>>>
>>> Thanks for your patch, I haven't tried to run it but I have
>>> questions about miniflow.
>>>
>>>> ---
>> ...
>>>> +SEC("xdp") int
>>>> +flowtable_afxdp(struct xdp_md *ctx)
>>>> +{
>>>> +    struct xdp_miniflow *pkt_mf;
>>>> +    struct xdp_subtable_mask *subtable_mask;
>>>> +    int *head;
>>>> +    struct xdp_flow_actions *xdp_actions = NULL;
>>>> +    struct nlattr *a;
>>>> +    unsigned int left;
>>>> +    int cnt, idx, zero = 0;
>>>> +
>>>> +    account_debug(0);
>>>> +
>>>> +    head = bpf_map_lookup_elem(&subtbl_masks_hd, &zero);
>>>> +    if (!head) {
>>>> +        return XDP_ABORTED;
>>>> +    }
>>>> +    if (*head == XDP_SUBTABLES_TAIL) {
>>>> +        /* Offload not enabled */
>>>> +        goto upcall;
>>>> +    }
>>>> +
>>>> +    /* Get temporary storage for storing packet miniflow */
>>>> +    pkt_mf = bpf_map_lookup_elem(&pkt_mf_tbl, &zero);
>>>> +    if (!pkt_mf) {
>>>> +        return XDP_ABORTED;
>>>> +    }
>>>> +
>>>
>>> I start to wonder what's the benefit of using miniflow in XDP?
>>> miniflow tries to compress the large flow key into smaller memory,
>>> and with a flowmap indicating which offset in bits are valid.
>>> And when adding a new subtable at flow_put, the subtable's key
>>> size can be smaller with only the needed fields.
>>>
>>> But in the case of XDP/eBPF, we have to statically allocated fixed
>>> key size (the subtbl_template has key as struct xdp_flow_key), so
>>> each subtable is always having key with full key fields. (not saving
>>> memory space) Moreover with miniflow, it makes the 'mask_key' function
>>> below pretty complicated.
>>
>> Fixed sized subtable is restriction of map-in-map.
>> The benefit to use miniflow I envisioned initially was
>> - compress the key
>> - use existing function to convert flows for XDP
>>
>> The first one is actually not doable due to map-in-map. I hope someday the
>> restriction gets loosen...
> 
> On my second thought this might be a good idea.
> 
> One problem I hit in my previous prototype is that the flow key is too big.
> ex: struct sw_flow_key in ovs kernel is more than 500B, and moving it to bpf
> some BPF limitation. If we continue adding more fields to struct xdp_flow,
> ex: ipv6, vxlan tunnel, then with miniflow extract, the actually memory usage
> in key is smaller. The key size in subtbl_template can be a value smaller the
> the size of struct xdp_flow.

Nice idea!
So you mean we can have a smaller subtbl_template key because, even
when adding more and more keys, we probably don't use *all* of the
keys at the same time?

> But when adding more fields, I'm not sure whether we will hit some limitation
> of BPF at xdp_miniflow_extract.
> So miniflow can save key size but complicate the key extraction.
> Without miniflow, key size is large but key extract and match are simpler.
> What do you think?

I cannot think of any limitation xdp_miniflow_extract will hit. Do
you have the insns limit in mind?
My main concern for miniflow is the insns limit in mask_key(), but if
we can have a smaller key size for subtbl_template then the insns
count will be lower too.
Although I should check how large a key is acceptable for mask_key, I
guess miniflow is better considering we will have more keys in the
future.

>> The second one is doable and it's good for userspace offload driver.
>> But I ended up with rewriting most helper functions of miniflow for BPF due
>> to some restriction.
>>
>>> What if we don't compress the flow key?
>>> unlike to OVS userspace, more like the kernel's implementation of OVS
>>> megaflow (ex: masked_flow_lookup in net/openvswitch/flow_table.c)
>>
>> Without miniflow the BPF program will be simple like the original xdp_flow
>> patch for Linux kernel.
>> I can try it.
>>
>>> Where it has a list of masks (like your 'subtbl_masks' map) but
>>> only use 1 hash table (so we don't need map-in-map).
>>> We lookup the hash table multiple times, each
>>> time applied different masks in the mask list until finding a match.
>>
>> For HW offload one big hash table may help. Fow software it will be slower
>> as key size and entries are larger.
>> Maybe we can support both types, one big hash table and multiple subtables,
>> in the future?
>>
> 
> For one big hash table, do you mean adding another cache before megaflow
> like EMC (exact match cache)?

No, I meant replacing combination of current flow_table and multiple subtbls with 
one big hash table, like ovs kernel module.

Toshiaki Makita
William Tu May 8, 2020, 3:05 p.m. UTC | #5
On Thu, May 7, 2020 at 7:56 AM Toshiaki Makita
<toshiaki.makita1@gmail.com> wrote:
>
> On 2020/05/06 0:18, William Tu wrote:
> > On Tue, May 05, 2020 at 11:43:58AM +0900, Toshiaki Makita wrote:
> >> On 2020/05/04 1:22, William Tu wrote:
> >>> On Tue, Apr 21, 2020 at 11:47:04PM +0900, Toshiaki Makita wrote:
> >>>> This adds a reference program, flowtable_afxdp.o, which can be used to
> >>>> offload flows to XDP through netdev-offload-xdp.
> >>>> The program will be compiled when using --enable-bpf switch.
> >>>>
> >>>> Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
> >>>
> >>> Hi Toshiaki,
> >>>
> >>> Thanks for your patch, I haven't tried to run it but I have
> >>> questions about miniflow.
> >>>
> >>>> ---
> >> ...
> >>>> +SEC("xdp") int
> >>>> +flowtable_afxdp(struct xdp_md *ctx)
> >>>> +{
> >>>> +    struct xdp_miniflow *pkt_mf;
> >>>> +    struct xdp_subtable_mask *subtable_mask;
> >>>> +    int *head;
> >>>> +    struct xdp_flow_actions *xdp_actions = NULL;
> >>>> +    struct nlattr *a;
> >>>> +    unsigned int left;
> >>>> +    int cnt, idx, zero = 0;
> >>>> +
> >>>> +    account_debug(0);
> >>>> +
> >>>> +    head = bpf_map_lookup_elem(&subtbl_masks_hd, &zero);
> >>>> +    if (!head) {
> >>>> +        return XDP_ABORTED;
> >>>> +    }
> >>>> +    if (*head == XDP_SUBTABLES_TAIL) {
> >>>> +        /* Offload not enabled */
> >>>> +        goto upcall;
> >>>> +    }
> >>>> +
> >>>> +    /* Get temporary storage for storing packet miniflow */
> >>>> +    pkt_mf = bpf_map_lookup_elem(&pkt_mf_tbl, &zero);
> >>>> +    if (!pkt_mf) {
> >>>> +        return XDP_ABORTED;
> >>>> +    }
> >>>> +
> >>>
> >>> I start to wonder what's the benefit of using miniflow in XDP?
> >>> miniflow tries to compress the large flow key into smaller memory,
> >>> and with a flowmap indicating which offset in bits are valid.
> >>> And when adding a new subtable at flow_put, the subtable's key
> >>> size can be smaller with only the needed fields.
> >>>
> >>> But in the case of XDP/eBPF, we have to statically allocated fixed
> >>> key size (the subtbl_template has key as struct xdp_flow_key), so
> >>> each subtable is always having key with full key fields. (not saving
> >>> memory space) Moreover with miniflow, it makes the 'mask_key' function
> >>> below pretty complicated.
> >>
> >> Fixed sized subtable is restriction of map-in-map.
> >> The benefit to use miniflow I envisioned initially was
> >> - compress the key
> >> - use existing function to convert flows for XDP
> >>
> >> The first one is actually not doable due to map-in-map. I hope someday the
> >> restriction gets loosen...
> >
> > On my second thought this might be a good idea.
> >
> > One problem I hit in my previous prototype is that the flow key is too big.
> > ex: struct sw_flow_key in ovs kernel is more than 500B, and moving it to bpf
> > some BPF limitation. If we continue adding more fields to struct xdp_flow,
> > ex: ipv6, vxlan tunnel, then with miniflow extract, the actually memory usage
> > in key is smaller. The key size in subtbl_template can be a value smaller the
> > the size of struct xdp_flow.
>
> Nice idea!
> So you mean we can have less sized subtbl_template key because when adding more and
> more keys, we probably don't use *all* of keys at the same time?

Now I'm a little confused.
If you look at 'struct flow' in OVS, every field has its own space
(e.g. no union is used for IPv4 and IPv6), so miniflow extraction
saves space by storing only the fields present in the packet headers.
But if you look at 'struct sw_flow_key' in the OVS kernel module,
protocols of the same layer share a union (e.g. IPv4 and IPv6 are in
a union), so no extra space is wasted.
So how we define struct xdp_flow_key makes a difference.
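
Roughly, the layout difference is the following (both structs are
simplified sketches, not the real definitions):

    #include <stdint.h>

    /* OVS userspace 'struct flow' style: every protocol has its own
     * space. */
    struct flow_style {
        uint32_t nw_src, nw_dst;              /* IPv4 addresses. */
        uint8_t  ipv6_src[16], ipv6_dst[16];  /* IPv6, separate space. */
    };

    /* Kernel 'struct sw_flow_key' style: mutually exclusive protocols
     * overlap, so the unused one costs nothing. */
    struct sw_flow_key_style {
        union {
            struct { uint32_t src, dst; } ipv4;
            struct { uint8_t src[16], dst[16]; } ipv6;
        } addrs;
    };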

Or here is another idea: we could also do on-demand parsing/key
extraction here (rough sketch below):
1) keep a set of key_fields = {empty}
2) at xdp_flow_put, record which fields need to be extracted,
    e.g. key_fields = {src_mac, dst_mac}
3) then xdp_miniflow_extract can skip/return early after
    the L2 fields are parsed.
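
A rough BPF-side sketch of what 2) and 3) could look like (the names
are made up; userspace would rewrite 'key_fields' at flow_put time):

    enum key_field_bits {
        KEY_FIELDS_L2 = 1 << 0,
        KEY_FIELDS_L3 = 1 << 1,
        KEY_FIELDS_L4 = 1 << 2,
    };

    /* One-entry array holding the OR of the field groups any installed
     * flow actually matches on. */
    struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, uint32_t);
        __type(value, uint32_t);
    } key_fields SEC(".maps");

    static inline uint32_t
    needed_key_fields(void)
    {
        uint32_t zero = 0;
        uint32_t *fields = bpf_map_lookup_elem(&key_fields, &zero);

        return fields ? *fields : 0;
    }

    /* xdp_miniflow_extract() could then return right after the Ethernet
     * header when !(needed_key_fields() &
     *               (KEY_FIELDS_L3 | KEY_FIELDS_L4)). */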

>
> > But when adding more fields, I'm not sure whether we will hit some limitation
> > of BPF at xdp_miniflow_extract.
> > So miniflow can save key size but complicate the key extraction.
> > Without miniflow, key size is large but key extract and match are simpler.
> > What do you think?
>
> I cannot think of any limitation xdp_miniflow_extract will hit. Do you have insns
> limit in mind?
yes.

> My main concern for miniflow is insns limit in mask_key(), but if we can have
> smaller key size for subtbl_template then insns will be less too.
> Although I should check how large key is acceptable for mask_key, I guess miniflow
> is better considering we have more keys in the future.
>
> >> The second one is doable and it's good for userspace offload driver.
> >> But I ended up with rewriting most helper functions of miniflow for BPF due
> >> to some restriction.
> >>
> >>> What if we don't compress the flow key?
> >>> unlike to OVS userspace, more like the kernel's implementation of OVS
> >>> megaflow (ex: masked_flow_lookup in net/openvswitch/flow_table.c)
> >>
> >> Without miniflow the BPF program will be simple like the original xdp_flow
> >> patch for Linux kernel.
> >> I can try it.
> >>
> >>> Where it has a list of masks (like your 'subtbl_masks' map) but
> >>> only use 1 hash table (so we don't need map-in-map).
> >>> We lookup the hash table multiple times, each
> >>> time applied different masks in the mask list until finding a match.
> >>
> >> For HW offload one big hash table may help. Fow software it will be slower
> >> as key size and entries are larger.
> >> Maybe we can support both types, one big hash table and multiple subtables,
> >> in the future?
> >>
> >
> > For one big hash table, do you mean adding another cache before megaflow
> > like EMC (exact match cache)?
>
> No, I meant replacing combination of current flow_table and multiple subtbls with
> one big hash table, like ovs kernel module.

I see, thanks
William
Toshiaki Makita May 11, 2020, 2:53 p.m. UTC | #6
On 2020/05/09 0:05, William Tu wrote:
> On Thu, May 7, 2020 at 7:56 AM Toshiaki Makita
> <toshiaki.makita1@gmail.com> wrote:
>>
>> On 2020/05/06 0:18, William Tu wrote:
>>> On Tue, May 05, 2020 at 11:43:58AM +0900, Toshiaki Makita wrote:
>>>> On 2020/05/04 1:22, William Tu wrote:
>>>>> On Tue, Apr 21, 2020 at 11:47:04PM +0900, Toshiaki Makita wrote:
>>>>>> This adds a reference program, flowtable_afxdp.o, which can be used to
>>>>>> offload flows to XDP through netdev-offload-xdp.
>>>>>> The program will be compiled when using --enable-bpf switch.
>>>>>>
>>>>>> Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
>>>>>
>>>>> Hi Toshiaki,
>>>>>
>>>>> Thanks for your patch, I haven't tried to run it but I have
>>>>> questions about miniflow.
>>>>>
>>>>>> ---
>>>> ...
>>>>>> +SEC("xdp") int
>>>>>> +flowtable_afxdp(struct xdp_md *ctx)
>>>>>> +{
>>>>>> +    struct xdp_miniflow *pkt_mf;
>>>>>> +    struct xdp_subtable_mask *subtable_mask;
>>>>>> +    int *head;
>>>>>> +    struct xdp_flow_actions *xdp_actions = NULL;
>>>>>> +    struct nlattr *a;
>>>>>> +    unsigned int left;
>>>>>> +    int cnt, idx, zero = 0;
>>>>>> +
>>>>>> +    account_debug(0);
>>>>>> +
>>>>>> +    head = bpf_map_lookup_elem(&subtbl_masks_hd, &zero);
>>>>>> +    if (!head) {
>>>>>> +        return XDP_ABORTED;
>>>>>> +    }
>>>>>> +    if (*head == XDP_SUBTABLES_TAIL) {
>>>>>> +        /* Offload not enabled */
>>>>>> +        goto upcall;
>>>>>> +    }
>>>>>> +
>>>>>> +    /* Get temporary storage for storing packet miniflow */
>>>>>> +    pkt_mf = bpf_map_lookup_elem(&pkt_mf_tbl, &zero);
>>>>>> +    if (!pkt_mf) {
>>>>>> +        return XDP_ABORTED;
>>>>>> +    }
>>>>>> +
>>>>>
>>>>> I start to wonder what's the benefit of using miniflow in XDP?
>>>>> miniflow tries to compress the large flow key into smaller memory,
>>>>> and with a flowmap indicating which offset in bits are valid.
>>>>> And when adding a new subtable at flow_put, the subtable's key
>>>>> size can be smaller with only the needed fields.
>>>>>
>>>>> But in the case of XDP/eBPF, we have to statically allocated fixed
>>>>> key size (the subtbl_template has key as struct xdp_flow_key), so
>>>>> each subtable is always having key with full key fields. (not saving
>>>>> memory space) Moreover with miniflow, it makes the 'mask_key' function
>>>>> below pretty complicated.
>>>>
>>>> Fixed sized subtable is restriction of map-in-map.
>>>> The benefit to use miniflow I envisioned initially was
>>>> - compress the key
>>>> - use existing function to convert flows for XDP
>>>>
>>>> The first one is actually not doable due to map-in-map. I hope someday the
>>>> restriction gets loosen...
>>>
>>> On my second thought this might be a good idea.
>>>
>>> One problem I hit in my previous prototype is that the flow key is too big.
>>> ex: struct sw_flow_key in ovs kernel is more than 500B, and moving it to bpf
>>> some BPF limitation. If we continue adding more fields to struct xdp_flow,
>>> ex: ipv6, vxlan tunnel, then with miniflow extract, the actually memory usage
>>> in key is smaller. The key size in subtbl_template can be a value smaller the
>>> the size of struct xdp_flow.
>>
>> Nice idea!
>> So you mean we can have less sized subtbl_template key because when adding more and
>> more keys, we probably don't use *all* of keys at the same time?
> 
> Now I'm a little confused.
> If you see 'struct flow' in ovs, all the fields have their own space
> (ex: no union is used for
> ipv4 and ipv6). So using miniflow extract saves space by only using the
> fields present in the packet header.
> But if you see 'struct sw_flow_key' in ovs kernel, protocols at the same
> layer share a union
> (ex: ipv4 and ipv6 are in a union), so no extra space is wasted.
> So how we define the struct xdp_flow_key makes a difference.

Thanks, now I see what you mean.
Actually I was thinking of a more aggressive way to minimize subtable keys.
As we don't necessarily need to support all key combinations, we can cap
the key size at a certain value.
For example, one flow may need dl_* and ip_* and another flow may need
tunneling keys etc., but requiring all of them, i.e. dl, ip, tp, ct, tun, ...
and every other key, is unlikely. So just cap the key size at a reasonable value,
and in case some flow requires more space, we can just return an error like
EOPNOTSUPP for such a flow.

What do you think?
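
For illustration only (not part of the patch), a minimal sketch of what such a
cap could look like on the flow_put path of the userspace offload driver.
XDP_FLOW_KEY_U64S_CAP and check_key_fits() are made-up names; miniflow_n_values()
is the existing OVS helper that counts the 64-bit words a mask actually uses.

#include <errno.h>
#include "flow.h"   /* struct minimask, miniflow_n_values() */

/* Illustrative cap, not a value taken from the patch. */
#define XDP_FLOW_KEY_U64S_CAP 8

/* Reject flows whose mask needs more key words than the fixed-size
 * subtable key can hold, instead of growing struct xdp_flow. */
static int
check_key_fits(const struct minimask *mask)
{
    if (miniflow_n_values(&mask->masks) > XDP_FLOW_KEY_U64S_CAP) {
        return EOPNOTSUPP;
    }
    return 0;
}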

> 
> Or here is another idea.
> We can also do on-demand parsing/key_extract here.
> 1) keep a list of key_fields = {empty}
> 2) when xdp_flow_put, we know which fields are needed to extract
>      ex: key_fields = {src_mac, dst_mac}
> 3) then at the xdp_miniflow_extract we can skip/return early after
>      the L2 fields are parsed.

So in this case we need to keep track of all the keys used in any subtable, right?
I'm a bit concerned about the complexity introduced by this mechanism.
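
Illustration only, not from the patch: a rough sketch of the kind of bookkeeping
the on-demand idea above would add to the userspace driver. All names here are
hypothetical, and the mask struct is deliberately uncompressed to keep the idea
visible.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical uncompressed mask, just to show the idea. */
struct example_mask {
    uint32_t nw_src, nw_dst;
    uint16_t tp_src, tp_dst;
    uint8_t nw_proto;
};

struct needed_layers {
    bool l3;   /* some subtable matches an IP field */
    bool l4;   /* some subtable matches a TCP/UDP port */
};

/* Recomputed on every flow_put/flow_del; the result would then be written
 * to a one-entry BPF array map that xdp_miniflow_extract could check so it
 * can return early after parsing L2. */
static void
recompute_needed_layers(struct needed_layers *need,
                        const struct example_mask *masks, size_t n)
{
    memset(need, 0, sizeof *need);
    for (size_t i = 0; i < n; i++) {
        need->l3 |= masks[i].nw_src || masks[i].nw_dst || masks[i].nw_proto;
        need->l4 |= masks[i].tp_src || masks[i].tp_dst;
    }
}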

>>> But when adding more fields, I'm not sure whether we will hit some limitation
>>> of BPF at xdp_miniflow_extract.
>>> So miniflow can save key size but complicate the key extraction.
>>> Without miniflow, key size is large but key extract and match are simpler.
>>> What do you think?
>>
>> I cannot think of any limitation xdp_miniflow_extract will hit. Do you have insns
>> limit in mind?
> yes.

As xdp_miniflow_extract does not contain any loops, I'm not so concerned.

Toshiaki Makita

> 
>> My main concern for miniflow is the insns limit in mask_key(), but if we can have
>> a smaller key size for subtbl_template then the insn count will be lower too.
>> Although I should check how large a key is acceptable for mask_key, I guess miniflow
>> is better considering we will have more keys in the future.
>>
>>>> The second one is doable and it's good for userspace offload driver.
>>>> But I ended up with rewriting most helper functions of miniflow for BPF due
>>>> to some restriction.
>>>>
>>>>> What if we don't compress the flow key?
>>>>> unlike OVS userspace, more like the kernel's implementation of OVS
>>>>> megaflow (ex: masked_flow_lookup in net/openvswitch/flow_table.c)
>>>>
>>>> Without miniflow the BPF program will be simple like the original xdp_flow
>>>> patch for Linux kernel.
>>>> I can try it.
>>>>
>>>>> Where it has a list of masks (like your 'subtbl_masks' map) but
>>>>> only uses 1 hash table (so we don't need map-in-map).
>>>>> We look up the hash table multiple times, each
>>>>> time applying a different mask from the mask list until finding a match.
>>>>
>>>> For HW offload one big hash table may help. For software it will be slower
>>>> as key size and entries are larger.
>>>> Maybe we can support both types, one big hash table and multiple subtables,
>>>> in the future?
>>>>
>>>
>>> For one big hash table, do you mean adding another cache before megaflow
>>> like EMC (exact match cache)?
>>
>> No, I meant replacing the combination of the current flow_table and multiple subtbls with
>> one big hash table, like the ovs kernel module does.
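
Illustration only: a rough sketch of that alternative, in the style of
masked_flow_lookup() in net/openvswitch/flow_table.c. It reuses the
subtbl_masks layout from this patch, but single_flow_table and pkt_key
(a full, uncompressed key) are assumptions, not maps or variables defined
in the patch.

    struct xdp_flow_actions *acts = NULL;
    int idx = *head;   /* head from subtbl_masks_hd, as in the patch */

    for (int cnt = 0; cnt < XDP_MAX_SUBTABLES; cnt++) {
        struct xdp_subtable_mask *mask;
        struct xdp_flow_key mkey = { 0 };

        mask = bpf_map_lookup_elem(&subtbl_masks, &idx);
        if (!mask) {
            break;
        }
        for (int i = 0; i < XDP_FLOW_U64S; i++) {
            mkey.miniflow_buf[i] = pkt_key.miniflow_buf[i] & mask->buf[i];
        }
        /* One hash map keyed by the masked key; first matching mask wins. */
        acts = bpf_map_lookup_elem(&single_flow_table, &mkey);
        if (acts) {
            break;
        }
        if (mask->header.next == XDP_SUBTABLES_TAIL) {
            break;
        }
        idx = mask->header.next;
    }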
> 
> I see, thanks
> William
>
William Tu May 13, 2020, 3:31 p.m. UTC | #7
On Mon, May 11, 2020 at 7:53 AM Toshiaki Makita
<toshiaki.makita1@gmail.com> wrote:
>
> On 2020/05/09 0:05, William Tu wrote:
> > On Thu, May 7, 2020 at 7:56 AM Toshiaki Makita
> > <toshiaki.makita1@gmail.com> wrote:
> >>
> >> On 2020/05/06 0:18, William Tu wrote:
> >>> On Tue, May 05, 2020 at 11:43:58AM +0900, Toshiaki Makita wrote:
> >>>> On 2020/05/04 1:22, William Tu wrote:
> >>>>> On Tue, Apr 21, 2020 at 11:47:04PM +0900, Toshiaki Makita wrote:
> >>>>>> This adds a reference program, flowtable_afxdp.o, which can be used to
> >>>>>> offload flows to XDP through netdev-offload-xdp.
> >>>>>> The program will be compiled when using --enable-bpf switch.
> >>>>>>
> >>>>>> Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
> >>>>>
> >>>>> Hi Toshiaki,
> >>>>>
> >>>>> Thanks for your patch, I haven't tried to run it but I have
> >>>>> questions about miniflow.
> >>>>>
> >>>>>> ---
> >>>> ...
> >>>>>> +SEC("xdp") int
> >>>>>> +flowtable_afxdp(struct xdp_md *ctx)
> >>>>>> +{
> >>>>>> +    struct xdp_miniflow *pkt_mf;
> >>>>>> +    struct xdp_subtable_mask *subtable_mask;
> >>>>>> +    int *head;
> >>>>>> +    struct xdp_flow_actions *xdp_actions = NULL;
> >>>>>> +    struct nlattr *a;
> >>>>>> +    unsigned int left;
> >>>>>> +    int cnt, idx, zero = 0;
> >>>>>> +
> >>>>>> +    account_debug(0);
> >>>>>> +
> >>>>>> +    head = bpf_map_lookup_elem(&subtbl_masks_hd, &zero);
> >>>>>> +    if (!head) {
> >>>>>> +        return XDP_ABORTED;
> >>>>>> +    }
> >>>>>> +    if (*head == XDP_SUBTABLES_TAIL) {
> >>>>>> +        /* Offload not enabled */
> >>>>>> +        goto upcall;
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    /* Get temporary storage for storing packet miniflow */
> >>>>>> +    pkt_mf = bpf_map_lookup_elem(&pkt_mf_tbl, &zero);
> >>>>>> +    if (!pkt_mf) {
> >>>>>> +        return XDP_ABORTED;
> >>>>>> +    }
> >>>>>> +
> >>>>>
> >>>>> I start to wonder what's the benefit of using miniflow in XDP?
> >>>>> miniflow tries to compress the large flow key into smaller memory,
> >>>>> and with a flowmap indicating which offset in bits are valid.
> >>>>> And when adding a new subtable at flow_put, the subtable's key
> >>>>> size can be smaller with only the needed fields.
> >>>>>
> >>>>> But in the case of XDP/eBPF, we have to statically allocate a fixed
> >>>>> key size (the subtbl_template has its key as struct xdp_flow), so
> >>>>> each subtable always has a key with the full set of key fields (not
> >>>>> saving memory space). Moreover, with miniflow, it makes the 'mask_key' function
> >>>>> below pretty complicated.
> >>>>
> >>>> Fixed-size subtables are a restriction of map-in-map.
> >>>> The benefits of using miniflow I envisioned initially were
> >>>> - compress the key
> >>>> - use existing functions to convert flows for XDP
> >>>>
> >>>> The first one is actually not doable due to map-in-map. I hope someday the
> >>>> restriction gets loosened...
> >>>
> >>> On my second thought this might be a good idea.
> >>>
> >>> One problem I hit in my previous prototype is that the flow key is too big.
> >>> ex: struct sw_flow_key in ovs kernel is more than 500B, and moving it to bpf
> >>> hits some BPF limitations. If we continue adding more fields to struct xdp_flow,
> >>> ex: ipv6, vxlan tunnel, then with miniflow extract, the actual memory usage
> >>> in the key is smaller. The key size in subtbl_template can be a value smaller
> >>> than the size of struct xdp_flow.
> >>
> >> Nice idea!
> >> So you mean we can have a smaller subtbl_template key because, when adding more and
> >> more keys, we probably don't use *all* of them at the same time?
> >
> > Now I'm a little confused.
> > If you see 'struct flow' in ovs, all the fields have their own space
> > (ex: no union is used for
> > ipv4 and ipv6). So using miniflow extract saves space by only using the
> > fields present in the packet header.
> > But if you see 'struct sw_flow_key' in ovs kernel, protocols at the same
> > layer share a union
> > (ex: ipv4 and ipv6 are in a union), so no extra space is wasted.
> > So how we define the struct xdp_flow_key makes a difference.
>
> Thanks, now I see what you mean.
> Actually I was thinking of a more aggressive way to minimize subtable keys.
> As we don't necessarily need to support all key combinations, we can cap
> the key size at a certain value.
> For example, one flow may need dl_* and ip_* and another flow may need
> tunneling keys etc., but requiring all of them, i.e. dl, ip, tp, ct, tun, ...
> and every other key, is unlikely. So just cap the key size at a reasonable value,
> and in case some flow requires more space, we can just return an error like
> EOPNOTSUPP for such a flow.
>
> What do you think?

Yes, I think that's a good idea.
>
> >
> > Or here is another idea.
> > We can also do on-demand parsing/key_extract here.
> > 1) keep a list of key_fields = {empty}
> > 2) when xdp_flow_put, we know which fields are needed to extract
> >      ex: key_fields = {src_mac, dst_mac}
> > 3) then at the xdp_miniflow_extract we can skip/return early after
> >      the L2 fields are parsed.
>
> So in this case we need to keep track of all the keys used in any subtable, right?
> I'm a bit concerned about the complexity introduced by this mechanism.

right, that might be too complicated for an initial prototype.
>
> >>> But when adding more fields, I'm not sure whether we will hit some limitation
> >>> of BPF at xdp_miniflow_extract.
> >>> So miniflow can save key size but complicate the key extraction.
> >>> Without miniflow, key size is large but key extract and match are simpler.
> >>> What do you think?
> >>
> >> I cannot think of any limitation xdp_miniflow_extract will hit. Do you have insns
> >> limit in mind?
> > yes.
>
> As xdp_miniflow_extract does not contain any loops, I'm not so concerned.

I see, thanks!
William
Tonghao Zhang May 14, 2020, 4:08 a.m. UTC | #8
On Tue, Apr 21, 2020 at 10:48 PM Toshiaki Makita
<toshiaki.makita1@gmail.com> wrote:
>
> This adds a reference program, flowtable_afxdp.o, which can be used to
> offload flows to XDP through netdev-offload-xdp.
> The program will be compiled when using --enable-bpf switch.
Hi Toshiaki
Good! One question: did we test the performance with this patch?
> Signed-off-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
> ---
>  Makefile.am           |   9 +-
>  acinclude.m4          |  56 +++++
>  bpf/.gitignore        |   4 +
>  bpf/Makefile.am       |  59 +++++
>  bpf/bpf_miniflow.h    | 179 +++++++++++++++
>  bpf/bpf_netlink.h     |  34 +++
>  bpf/bpf_workaround.h  |  28 +++
>  bpf/flowtable_afxdp.c | 515 ++++++++++++++++++++++++++++++++++++++++++
>  configure.ac          |   2 +
>  9 files changed, 884 insertions(+), 2 deletions(-)
>  create mode 100644 bpf/.gitignore
>  create mode 100644 bpf/Makefile.am
>  create mode 100644 bpf/bpf_miniflow.h
>  create mode 100644 bpf/bpf_netlink.h
>  create mode 100644 bpf/bpf_workaround.h
>  create mode 100644 bpf/flowtable_afxdp.c
>
> diff --git a/Makefile.am b/Makefile.am
> index b279303d1..f18bfefde 100644
> --- a/Makefile.am
> +++ b/Makefile.am
> @@ -8,6 +8,9 @@
>  AUTOMAKE_OPTIONS = foreign subdir-objects
>  ACLOCAL_AMFLAGS = -I m4
>  SUBDIRS = datapath
> +if HAVE_BPF
> +SUBDIRS += bpf
> +endif
>
>  AM_CPPFLAGS = $(SSL_CFLAGS)
>  AM_LDFLAGS = $(SSL_LDFLAGS)
> @@ -198,7 +201,9 @@ ALL_LOCAL += dist-hook-git
>  dist-hook-git: distfiles
>         @if test -e $(srcdir)/.git && (git --version) >/dev/null 2>&1; then \
>           (cd datapath && $(MAKE) distfiles); \
> -         (cat distfiles; sed 's|^|datapath/|' datapath/distfiles) | \
> +         (cd bpf && $(MAKE) distfiles); \
> +         (cat distfiles; sed 's|^|datapath/|' datapath/distfiles; \
> +          sed 's|^|bpf/|' bpf/distfiles) | \
>             LC_ALL=C sort -u > all-distfiles; \
>           (cd $(srcdir) && git ls-files) | grep -v '\.gitignore$$' | \
>             grep -v '\.gitattributes$$' | \
> @@ -234,7 +239,7 @@ config-h-check:
>         @cd $(srcdir); \
>         if test -e .git && (git --version) >/dev/null 2>&1 && \
>           git --no-pager grep -L '#include <config\.h>' `git ls-files | grep '\.c$$' | \
> -           grep -vE '^datapath|^lib/sflow|^third-party|^datapath-windows|^python'`; \
> +           grep -vE '^datapath|^lib/sflow|^third-party|^datapath-windows|^python|^bpf'`; \
>         then \
>           echo "See above for list of violations of the rule that"; \
>           echo "every C source file must #include <config.h>."; \
> diff --git a/acinclude.m4 b/acinclude.m4
> index 0901f2870..2fb2f385f 100644
> --- a/acinclude.m4
> +++ b/acinclude.m4
> @@ -301,6 +301,62 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
>    AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true)
>  ])
>
> +dnl OVS_CHECK_LINUX_BPF
> +dnl
> +dnl Check both llvm and libbpf support
> +AC_DEFUN([OVS_CHECK_LINUX_BPF], [
> +  AC_ARG_ENABLE([bpf],
> +                [AC_HELP_STRING([--enable-bpf],
> +                                [Compile reference eBPF programs for XDP])],
> +                [], [enable_bpf=no])
> +  AC_MSG_CHECKING([whether BPF is enabled])
> +  if test "$enable_bpf" != yes; then
> +    AC_MSG_RESULT([no])
> +    BPF_ENABLE=false
> +  else
> +    AC_MSG_RESULT([yes])
> +    BPF_ENABLE=true
> +
> +    AC_CHECK_PROG(CLANG_CHECK, clang, yes)
> +    AS_IF([test X"$CLANG_CHECK" != X"yes"],
> +      [AC_MSG_ERROR([unable to find clang to compile BPF program])])
> +
> +    AC_CHECK_PROG(LLC_CHECK, llc, yes)
> +    AS_IF([test X"$LLC_CHECK" != X"yes"],
> +      [AC_MSG_ERROR([unable to find llc to compile BPF program])])
> +
> +    AC_CHECK_HEADER([bpf/bpf_helpers.h], [],
> +      [AC_MSG_ERROR([unable to find bpf/bpf_helpers.h to compile BPF program])])
> +
> +    AC_CHECK_HEADER([linux/bpf.h], [],
> +      [AC_MSG_ERROR([unable to find linux/bpf.h to compile BPF program])])
> +
> +    AC_MSG_CHECKING([for LLVM bpf target support])
> +    if llc -march=bpf -mattr=help >/dev/null 2>&1; then
> +      AC_MSG_RESULT([yes])
> +    else
> +      AC_MSG_RESULT([no])
> +      AC_MSG_ERROR([LLVM does not support bpf target])
> +    fi
> +
> +    AC_MSG_CHECKING([for BTF DATASEC support])
> +    AC_LANG_CONFTEST(
> +      [AC_LANG_SOURCE([__attribute__((section("_x"), used)) int x;])])
> +    if clang -g -O2 -S -target bpf -emit-llvm -c conftest.c -o conftest.ll && \
> +       llc -march=bpf -filetype=obj -o conftest.o conftest.ll && \
> +       readelf -p.BTF conftest.o 2>/dev/null | grep -q -w _x; then
> +      AC_MSG_RESULT([yes])
> +    else
> +      AC_MSG_RESULT([no])
> +      AC_MSG_ERROR([LLVM does not support BTF DATASEC])
> +    fi
> +
> +    AC_DEFINE([HAVE_BPF], [1],
> +              [Define to 1 if BPF compilation is available and enabled.])
> +  fi
> +  AM_CONDITIONAL([HAVE_BPF], test "$BPF_ENABLE" = true)
> +])
> +
>  dnl OVS_CHECK_DPDK
>  dnl
>  dnl Configure DPDK source tree
> diff --git a/bpf/.gitignore b/bpf/.gitignore
> new file mode 100644
> index 000000000..ab0f2d9e4
> --- /dev/null
> +++ b/bpf/.gitignore
> @@ -0,0 +1,4 @@
> +*.ll
> +/distfiles
> +/Makefile
> +/Makefile.in
> diff --git a/bpf/Makefile.am b/bpf/Makefile.am
> new file mode 100644
> index 000000000..f485b17f0
> --- /dev/null
> +++ b/bpf/Makefile.am
> @@ -0,0 +1,59 @@
> +AUTOMAKE_OPTIONS = foreign
> +
> +EXTRA_DIST = flowtable_afxdp.c bpf_miniflow.h bpf_netlink.h bpf_workaround.h
> +
> +# The following is based on commands for the Automake "distdir" target.
> +distfiles: Makefile
> +       @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
> +       topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
> +       list='$(DISTFILES)'; \
> +       for file in $$list; do echo $$file; done | \
> +         sed -e "s|^$$srcdirstrip/||;t" \
> +             -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t" | \
> +         LC_ALL=C sort -u > $@
> +CLEANFILES = distfiles
> +
> +CLANG ?= clang
> +LLC ?= llc
> +
> +AM_CPPFLAGS = -I $(top_srcdir)/include
> +AM_CPPFLAGS += -I $(top_builddir)/include
> +AM_CPPFLAGS += -I $(top_srcdir)/lib
> +AM_CPPFLAGS += -I $(top_builddir)/lib
> +
> +AM_CFLAGS = -Wall
> +AM_CFLAGS += -Wextra
> +AM_CFLAGS += -Wno-sign-compare
> +AM_CFLAGS += -Wformat -Wformat-security
> +AM_CFLAGS += -Wswitch-enum
> +AM_CFLAGS += -Wunused-parameter
> +AM_CFLAGS += -Wbad-function-cast
> +AM_CFLAGS += -Wcast-align
> +AM_CFLAGS += -Wstrict-prototypes
> +AM_CFLAGS += -Wold-style-definition
> +AM_CFLAGS += -Wmissing-field-initializers
> +AM_CFLAGS += -fno-strict-aliasing
> +AM_CFLAGS += -Wswitch-bool
> +AM_CFLAGS += -Wlogical-not-parentheses
> +AM_CFLAGS += -Wsizeof-array-argument
> +AM_CFLAGS += -Wshift-negative-value
> +AM_CFLAGS += -Wshadow
> +AM_CFLAGS += -Wcast-align
> +AM_CFLAGS += -Wno-unused-value
> +AM_CFLAGS += -Wno-compare-distinct-pointer-types
> +AM_CFLAGS += -g
> +AM_CFLAGS += -O2
> +
> +SUFFIXES = .ll
> +%.ll: %.c
> +       $(CLANG) $(AM_CPPFLAGS) $(AM_CFLAGS) -S \
> +               -target bpf -emit-llvm -c $< -o $@
> +CLEANFILES += *.ll
> +
> +%.o: %.ll
> +       $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ $<
> +CLEANFILES += *.o
> +
> +flowtable_afxdp.o: flowtable_afxdp.c bpf_miniflow.h bpf_netlink.h bpf_workaround.h
> +
> +all-local: flowtable_afxdp.o
> diff --git a/bpf/bpf_miniflow.h b/bpf/bpf_miniflow.h
> new file mode 100644
> index 000000000..090a9c7be
> --- /dev/null
> +++ b/bpf/bpf_miniflow.h
> @@ -0,0 +1,179 @@
> +/*
> + * Copyright (c) 2020 NTT Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef BPF_MINIFLOW_H
> +#define BPF_MINIFLOW_H 1
> +
> +#include "flow.h"
> +
> +struct bpf_mf_ctx {
> +    struct flowmap map;
> +    uint64_t *data;
> +};
> +
> +static inline void
> +miniflow_set_maps(struct bpf_mf_ctx *ctx, size_t ofs, size_t n_words)
> +{
> +    flowmap_set(&ctx->map, ofs, n_words);
> +}
> +
> +static inline void
> +miniflow_set_map(struct bpf_mf_ctx *ctx, size_t ofs)
> +{
> +    flowmap_set(&ctx->map, ofs, 1);
> +}
> +
> +static inline void
> +miniflow_push_uint8_(struct bpf_mf_ctx *ctx, size_t ofs, uint8_t value)
> +{
> +    size_t ofs8 = ofs % 8;
> +
> +    if (ofs8 == 0) {
> +        miniflow_set_map(ctx, ofs / 8);
> +    }
> +    *((uint8_t *)ctx->data + ofs8) = value;
> +    if (ofs8 == 7) {
> +        ctx->data++;
> +    }
> +}
> +
> +static inline void
> +miniflow_push_uint16_(struct bpf_mf_ctx *ctx, size_t ofs, uint16_t value)
> +{
> +    size_t ofs8 = ofs % 8;
> +
> +    if (ofs8 == 0) {
> +        miniflow_set_map(ctx, ofs / 8);
> +        *(uint16_t *)ctx->data = value;
> +    } else if (ofs8 == 2) {
> +        *((uint16_t *)ctx->data + 1) = value;
> +    } else if (ofs8 == 4) {
> +        *((uint16_t *)ctx->data + 2) = value;
> +    } else if (ofs8 == 6) {
> +        *((uint16_t *)ctx->data + 3) = value;
> +        ctx->data++;
> +    }
> +}
> +
> +static inline void
> +miniflow_push_uint32_(struct bpf_mf_ctx *ctx, size_t ofs, uint32_t value)
> +{
> +    size_t ofs8 = ofs % 8;
> +
> +    if (ofs8 == 0) {
> +        miniflow_set_map(ctx, ofs / 8);
> +        *(uint32_t *)ctx->data = value;
> +    } else if (ofs8 == 4) {
> +        *((uint32_t *)ctx->data + 1) = value;
> +        ctx->data++;
> +    }
> +}
> +
> +static inline void
> +ether_addr_copy(struct eth_addr *dst, const struct eth_addr *src)
> +{
> +    ovs_be16 *a = dst->be16;
> +    const ovs_be16 *b = src->be16;
> +
> +    a[0] = b[0];
> +    a[1] = b[1];
> +    a[2] = b[2];
> +}
> +
> +/* 'valuep' is 16-aligned */
> +/* data must start 64-aligned and must be followed by other data or padding */
> +static inline void
> +miniflow_push_macs_(struct bpf_mf_ctx *ctx, size_t ofs,
> +                    const struct eth_addr *valuep)
> +{
> +    miniflow_set_maps(ctx, ofs / 8, 2);
> +    ether_addr_copy((struct eth_addr *)ctx->data, valuep);
> +    ether_addr_copy((struct eth_addr *)ctx->data + 1, valuep + 1);
> +    ctx->data++; /* First word only. */
> +}
> +
> +/* data must start 64-aligned and must be followed by other data */
> +static inline void
> +miniflow_pad_from_64_(struct bpf_mf_ctx *ctx, size_t ofs)
> +{
> +    size_t ofs8 = ofs % 8;
> +    size_t ofs4 = ofs % 4;
> +    size_t ofs2 = ofs % 2;
> +    void *cdata = ctx->data;
> +
> +    miniflow_set_map(ctx, ofs / 8);
> +
> +    if (ofs8 >= 4) {
> +        *(uint32_t *)cdata = 0;
> +        cdata += 4;
> +    }
> +    if (ofs4 >= 2) {
> +        *(uint16_t *)cdata = 0;
> +        cdata += 2;
> +    }
> +    if (ofs2 == 1) {
> +        *(uint8_t *)cdata = 0;
> +    }
> +}
> +
> +static inline void
> +miniflow_pad_to_64_(struct bpf_mf_ctx *ctx, size_t ofs)
> +{
> +    size_t ofs8 = ofs % 8;
> +    size_t ofs4 = ofs % 4;
> +    size_t ofs2 = ofs % 2;
> +    void *cdata = ctx->data;
> +
> +    cdata += ofs8;
> +    if (ofs2 == 1) {
> +        *(uint8_t *)cdata = 0;
> +        cdata++;
> +    }
> +    if (ofs4 <= 2) {
> +        *(uint16_t *)cdata = 0;
> +        cdata += 2;
> +    }
> +    if (ofs8 <= 4) {
> +        *(uint32_t *)cdata = 0;
> +    }
> +    ctx->data++;
> +}
> +
> +#define miniflow_push_uint8(CTX, FIELD, VALUE)                       \
> +    miniflow_push_uint8_(CTX, offsetof(struct flow, FIELD), VALUE)
> +
> +#define miniflow_push_be16_(CTX, OFS, VALUE)                         \
> +    miniflow_push_uint16_(CTX, OFS, (OVS_FORCE uint16_t)VALUE)
> +
> +#define miniflow_push_be16(CTX, FIELD, VALUE)                        \
> +    miniflow_push_be16_(CTX, offsetof(struct flow, FIELD), VALUE)    \
> +
> +#define miniflow_push_be32_(CTX, OFS, VALUE)                         \
> +    miniflow_push_uint32_(CTX, OFS, (OVS_FORCE uint32_t)VALUE)
> +
> +#define miniflow_push_be32(CTX, FIELD, VALUE)                        \
> +    miniflow_push_be32_(CTX, offsetof(struct flow, FIELD), VALUE)    \
> +
> +#define miniflow_push_macs(CTX, FIELD, VALUEP)                       \
> +    miniflow_push_macs_(CTX, offsetof(struct flow, FIELD), VALUEP)
> +
> +#define miniflow_pad_from_64(CTX, FIELD)                             \
> +    miniflow_pad_from_64_(CTX, offsetof(struct flow, FIELD))
> +
> +#define miniflow_pad_to_64(CTX, FIELD)                               \
> +    miniflow_pad_to_64_(CTX, OFFSETOFEND(struct flow, FIELD))
> +
> +#endif /* bpf_miniflow.h */
> diff --git a/bpf/bpf_netlink.h b/bpf/bpf_netlink.h
> new file mode 100644
> index 000000000..091926c61
> --- /dev/null
> +++ b/bpf/bpf_netlink.h
> @@ -0,0 +1,34 @@
> +/*
> + * Copyright (c) 2020 NTT Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef BPF_NETLINK_H
> +#define BPF_NETLINK_H 1
> +
> +#include "netlink.h"
> +
> +static inline int
> +bpf_nl_attr_type(const struct nlattr *nla)
> +{
> +    return nla->nla_type & NLA_TYPE_MASK;
> +}
> +
> +static inline const void *
> +bpf_nl_attr_get(const struct nlattr *nla)
> +{
> +    return nla + 1;
> +}
> +
> +#endif /* bpf_netlink.h */
> diff --git a/bpf/bpf_workaround.h b/bpf/bpf_workaround.h
> new file mode 100644
> index 000000000..cb072a7e6
> --- /dev/null
> +++ b/bpf/bpf_workaround.h
> @@ -0,0 +1,28 @@
> +/*
> + * Copyright (c) 2020 NTT Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef BPF_WORKAROUND_H
> +#define BPF_WORKAROUND_H
> +
> +/* On Linux x86/x64 systems bits/wordsize.h included from stdint.h cannot
> + * correctly determine __WORDSIZE for bpf, which causes incorrect UINTPTR_MAX
> + */
> +#if __UINTPTR_MAX__ == __UINT64_MAX__ && defined(UINTPTR_MAX)
> +#undef UINTPTR_MAX
> +#define UINTPTR_MAX UINT64_MAX
> +#endif
> +
> +#endif /* bpf_workaround.h */
> diff --git a/bpf/flowtable_afxdp.c b/bpf/flowtable_afxdp.c
> new file mode 100644
> index 000000000..7a4767333
> --- /dev/null
> +++ b/bpf/flowtable_afxdp.c
> @@ -0,0 +1,515 @@
> +/*
> + * Copyright (c) 2020 NTT Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +/* linux/types.h is necessary for bpf_helpers.h as it's not self-contained */
> +#include <linux/types.h>
> +#include <bpf/bpf_helpers.h>
> +#include <linux/bpf.h>
> +
> +/* Workaround for incorrect macros for bpf in stdint.h */
> +#include <stdint.h>
> +#include "bpf_workaround.h"
> +
> +#include "bpf_miniflow.h"
> +#include "bpf_netlink.h"
> +#include "netdev-offload-xdp.h"
> +#include "packets.h"
> +#include "util.h"
> +
> +/* Supported keys. Need to keep same 64-align as struct flow for miniflow */
> +struct xdp_flow {
> +    struct eth_addr dl_dst;
> +    struct eth_addr dl_src;
> +    ovs_be16 dl_type;
> +    uint8_t pad1[2];
> +
> +    union flow_vlan_hdr vlans[1];
> +    uint8_t pad2[4];
> +
> +    ovs_be32 nw_src;
> +    ovs_be32 nw_dst;
> +
> +    uint8_t pad3[4];
> +    uint8_t nw_frag;
> +    uint8_t nw_tos;
> +    uint8_t nw_ttl;
> +    uint8_t nw_proto;
> +
> +    ovs_be16 tp_src;
> +    ovs_be16 tp_dst;
> +    uint8_t pad4[4];
> +};
> +
> +/* Size of xdp_flow must be 64-aligned for key comparison */
> +BUILD_ASSERT_DECL(sizeof(struct xdp_flow) % sizeof(uint64_t) == 0);
> +
> +#define XDP_FLOW_U64S (sizeof(struct xdp_flow) / sizeof(uint64_t))
> +
> +#define XDP_MAX_SUBTABLE_FLOWS 1024
> +#define XDP_MAX_ACTIONS_LEN 256
> +
> +/* Actual key in each subtable. miniflow map is omitted as it's identical to
> + * mask map */
> +struct xdp_flow_key {
> +    union {
> +        uint64_t miniflow_buf[XDP_FLOW_U64S];
> +        struct xdp_flow _flow; /* Need this to keep xdp_flow in BTF */
> +    };
> +};
> +
> +/* Value for subtable mask array */
> +struct xdp_subtable_mask {
> +    struct xdp_subtable_mask_header header;
> +    uint64_t buf[XDP_FLOW_U64S];
> +};
> +
> +/* miniflow for packet */
> +struct xdp_miniflow {
> +    struct miniflow mf;
> +    struct xdp_flow_key value;
> +};
> +
> +/* Used when the action only modifies the packet */
> +#define _XDP_ACTION_CONTINUE -1
> +
> +/* Supported actions */
> +/* XXX: This size should be uint16_t but needs to be int as kernel has a bug
> + * in btf_enum_check_member() that assumes enum size is sizeof(int), which
> + * causes an error when loading BTF if we use uint16_t here */
> +enum action_attrs : uint32_t {
> +    XDP_ACTION_OUTPUT = OVS_ACTION_ATTR_OUTPUT,
> +    XDP_ACTION_PUSH_VLAN = OVS_ACTION_ATTR_PUSH_VLAN,
> +    XDP_ACTION_POP_VLAN = OVS_ACTION_ATTR_POP_VLAN,
> +};
> +
> +/* Identical to struct nlattr. Need this to keep enum action_attrs in BTF */
> +struct xdp_action_nlattr {
> +    uint16_t nla_len;
> +    enum action_attrs action_type;
> +};
> +
> +struct xdp_flow_actions {
> +    struct xdp_flow_actions_header header;
> +    uint8_t data[XDP_MAX_ACTIONS_LEN - sizeof(struct xdp_flow_actions_header)];
> +    /* Dummy. Need xdp_action_nlattr to keep enum action_attrs in BTF */
> +    struct xdp_action_nlattr _xdp_actions[];
> +};
> +
> +
> +/* Map definitions */
> +
> +struct {
> +    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +    __uint(max_entries, 256);
> +    __type(key, uint32_t);
> +    __type(value, long);
> +} debug_stats SEC(".maps");
> +
> +/* Temporary storage for packet miniflow. Need this because verifier does not
> + * allow access to array variable in stack with variable index. Such access
> + * happens in mask_key() */
> +struct {
> +    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +    __uint(max_entries, 1);
> +    __type(key, uint32_t);
> +    __type(value, struct xdp_miniflow);
> +} pkt_mf_tbl SEC(".maps");
> +
> +struct {
> +    __uint(type, BPF_MAP_TYPE_XSKMAP);
> +    __uint(max_entries, 1); /* This should be redefined by userspace */
> +    __uint(key_size, sizeof(int));
> +    __uint(value_size, sizeof(int));
> +} xsks_map SEC(".maps");
> +
> +struct {
> +    __uint(type, BPF_MAP_TYPE_DEVMAP);
> +    __uint(max_entries, XDP_MAX_PORTS);
> +    __uint(key_size, sizeof(int));
> +    __uint(value_size, sizeof(int));
> +} output_map SEC(".maps");
> +
> +/* Head index for subtbl_masks list */
> +/* TODO: Use global variable */
> +struct {
> +    __uint(type, BPF_MAP_TYPE_ARRAY);
> +    __uint(max_entries, 1);
> +    __type(key, uint32_t);
> +    __type(value, int);
> +} subtbl_masks_hd SEC(".maps");
> +
> +/* Information about subtable mask. A list implemented using array */
> +struct {
> +    __uint(type, BPF_MAP_TYPE_ARRAY);
> +    __uint(max_entries, XDP_MAX_SUBTABLES);
> +    __type(key, uint32_t);
> +    __type(value, struct xdp_subtable_mask);
> +} subtbl_masks SEC(".maps");
> +
> +/* Template for subtable hash-map. This will be used in userspace to create
> + * flow_table array-of-maps. */
> +struct {
> +    __uint(type, BPF_MAP_TYPE_HASH);
> +    __uint(max_entries, XDP_MAX_SUBTABLE_FLOWS);
> +    __type(key, struct xdp_flow_key);
> +    __type(value, struct xdp_flow_actions);
> +} subtbl_template SEC(".maps");
> +
> +/* Array-of-maps whose entry contains subtable hash-map. */
> +struct {
> +    __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
> +    __uint(max_entries, XDP_MAX_SUBTABLES);
> +    __uint(key_size, sizeof(uint32_t));
> +    __uint(value_size, sizeof(uint32_t));
> +} flow_table SEC(".maps");
> +
> +
> +static inline void
> +account_debug(int idx)
> +{
> +    long *cnt;
> +
> +    cnt = bpf_map_lookup_elem(&debug_stats, &idx);
> +    if (cnt) {
> +        *cnt += 1;
> +    }
> +}
> +
> +static inline void
> +account_action(enum action_attrs act)
> +{
> +    account_debug(act + 1);
> +}
> +
> +/* Derived from xsk_load_xdp_prog() in libbpf */
> +static inline int
> +upcall(struct xdp_md *ctx)
> +{
> +    int ret, index = ctx->rx_queue_index;
> +
> +    ret = bpf_redirect_map(&xsks_map, index, XDP_ABORTED);
> +    if (ret > 0) {
> +        return ret;
> +    }
> +
> +    /* Fallback for kernel <= 5.3 not supporting default action in flags */
> +    if (bpf_map_lookup_elem(&xsks_map, &index)) {
> +        return bpf_redirect_map(&xsks_map, index, 0);
> +    }
> +
> +    return XDP_ABORTED;
> +}
> +
> +static inline int
> +action_output(int tx_port)
> +{
> +    account_action(XDP_ACTION_OUTPUT);
> +
> +    return bpf_redirect_map(&output_map, tx_port, 0);
> +}
> +
> +static inline int
> +action_vlan_push(struct xdp_md *ctx OVS_UNUSED,
> +                 const struct ovs_action_push_vlan *vlan OVS_UNUSED)
> +{
> +    account_action(XDP_ACTION_PUSH_VLAN);
> +
> +    /* TODO: implement this */
> +    return XDP_ABORTED;
> +}
> +
> +static inline int
> +action_vlan_pop(struct xdp_md *ctx OVS_UNUSED)
> +{
> +    account_action(XDP_ACTION_POP_VLAN);
> +
> +    /* TODO: implement this */
> +    return XDP_ABORTED;
> +}
> +
> +/* TODO: Add more actions */
> +
> +
> +struct nw_params {
> +    union {
> +        ovs_be32 params;
> +        struct {
> +            uint8_t nw_frag;
> +            uint8_t nw_tos;
> +            uint8_t nw_ttl;
> +            uint8_t nw_proto;
> +        };
> +    };
> +};
> +
> +static inline int
> +parse_ipv4(void *data, uint64_t *nh_off, void *data_end,
> +           struct bpf_mf_ctx *mf_ctx, struct nw_params *nw_params)
> +{
> +    struct ip_header *ip = data + *nh_off;
> +
> +    if (ip + 1 > data_end) {
> +        return 1;
> +    }
> +
> +    /* Linux network drivers ensure that IP header is 4-byte aligned or
> +     * the platform can handle unaligned access */
> +    miniflow_push_be32(mf_ctx, nw_src, *(ovs_be32 *)(void *)&ip->ip_src);
> +    miniflow_push_be32(mf_ctx, nw_dst, *(ovs_be32 *)(void *)&ip->ip_dst);
> +
> +    if (OVS_UNLIKELY(IP_IS_FRAGMENT(ip->ip_frag_off))) {
> +        nw_params->nw_frag = FLOW_NW_FRAG_ANY;
> +        if (ip->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
> +            nw_params->nw_frag |= FLOW_NW_FRAG_LATER;
> +        }
> +    } else {
> +        nw_params->nw_frag = 0;
> +    }
> +    nw_params->nw_tos = ip->ip_tos;
> +    nw_params->nw_ttl = ip->ip_ttl;
> +    nw_params->nw_proto = ip->ip_proto;
> +
> +    *nh_off += IP_IHL(ip->ip_ihl_ver) * 4;
> +
> +    return 0;
> +}
> +
> +static inline int
> +xdp_miniflow_extract(struct xdp_md *ctx, struct xdp_miniflow *pkt_mf)
> +{
> +    void *data = (void *)(long)ctx->data;
> +    void *data_end = (void *)(long)ctx->data_end;
> +    struct eth_header *eth = data;
> +    struct vlan_header *vlan = NULL;
> +    ovs_be16 dl_type;
> +    uint64_t nh_off;
> +    struct nw_params nw_params;
> +    struct bpf_mf_ctx mf_ctx = { {{ 0 }}, (uint64_t *)&pkt_mf->value };
> +
> +    nh_off = sizeof *eth;
> +    if (data + nh_off > data_end) {
> +        return 1;
> +    }
> +
> +    miniflow_push_macs(&mf_ctx, dl_dst, &eth->eth_dst);
> +
> +    if (eth_type_vlan(eth->eth_type)) {
> +        vlan = data + nh_off;
> +        nh_off += sizeof(*vlan);
> +        if (data + nh_off > data_end) {
> +            return 1;
> +        }
> +        dl_type = vlan->vlan_next_type;
> +    } else {
> +        dl_type = eth->eth_type;
> +    }
> +    miniflow_push_be16(&mf_ctx, dl_type, dl_type);
> +    miniflow_pad_to_64(&mf_ctx, dl_type);
> +
> +    if (vlan) {
> +        const ovs_16aligned_be32 *qp;
> +        union flow_vlan_hdr vlan_hdr;
> +
> +        qp = (ovs_16aligned_be32 *)&eth->eth_type;
> +        vlan_hdr.qtag = get_16aligned_be32(qp);
> +        vlan_hdr.tci |= htons(VLAN_CFI);
> +        miniflow_push_be32(&mf_ctx, vlans, vlan_hdr.qtag);
> +        miniflow_push_be32_(&mf_ctx,
> +                            offsetof(struct flow, vlans) + sizeof(ovs_be32),
> +                            0);
> +    }
> +
> +    if (dl_type == htons(ETH_TYPE_IP)) {
> +        if (parse_ipv4(data, &nh_off, data_end, &mf_ctx, &nw_params)) {
> +            return 1;
> +        }
> +    } else {
> +        goto out;
> +    }
> +    miniflow_pad_from_64(&mf_ctx, nw_frag);
> +    miniflow_push_be32(&mf_ctx, nw_frag, &nw_params.params);
> +
> +    if (nw_params.nw_proto == IPPROTO_TCP) {
> +        struct tcp_header *tcp = data + nh_off;
> +
> +        if (tcp + 1 > data_end) {
> +            return 1;
> +        }
> +
> +        miniflow_push_be16(&mf_ctx, tp_src, tcp->tcp_src);
> +        miniflow_push_be16(&mf_ctx, tp_dst, tcp->tcp_dst);
> +    } else if (nw_params.nw_proto == IPPROTO_UDP) {
> +        struct udp_header *udp = data + nh_off;
> +
> +        if (udp + 1 > data_end) {
> +            return 1;
> +        }
> +
> +        miniflow_push_be16(&mf_ctx, tp_src, udp->udp_src);
> +        miniflow_push_be16(&mf_ctx, tp_dst, udp->udp_dst);
> +    }
> +out:
> +    pkt_mf->mf.map = mf_ctx.map;
> +    return 0;
> +}
> +
> +#define for_each_subtable_mask(subtable_mask, head, idx, cnt) \
> +    for (subtable_mask = bpf_map_lookup_elem(&subtbl_masks, (head)), \
> +         idx = *(head), cnt = 0; \
> +         subtable_mask != NULL && cnt < XDP_MAX_SUBTABLES; \
> +         idx = subtable_mask->header.next, \
> +         subtable_mask = bpf_map_lookup_elem(&subtbl_masks, &idx), cnt++)
> +
> +/* Returns false if an error happens */
> +static inline int
> +mask_key(uint64_t *mkey, const struct miniflow *pkt_mf,
> +         const struct xdp_subtable_mask_header *tbl_mask)
> +{
> +    const struct miniflow *tbl_mf = &tbl_mask->mask.masks;
> +    const uint64_t *tbl_blocks = miniflow_get_values(tbl_mf);
> +    const uint64_t *pkt_blocks = miniflow_get_values(pkt_mf);
> +    uint64_t tbl_mf_bits = tbl_mf->map.bits[0];
> +    uint64_t pkt_mf_bits = pkt_mf->map.bits[0];
> +    uint8_t tbl_mf_bits_u0 = tbl_mask->mf_bits_u0;
> +    uint8_t tbl_mf_bits_u1 = tbl_mask->mf_bits_u1;
> +    unsigned int pkt_ofs = 0;
> +    int i = 0;
> +
> +    /* This sensitive loop easily exceeds verifier limit 1M insns so
> +     * need to be careful when modifying.
> +     * E.g. increasing XDP_FLOW_U64S by adding keys to struct xdp_flow
> +     * increases verifier cost and may be rejected due to 1M insns exceeds */
> +    for (; i < tbl_mf_bits_u0 + tbl_mf_bits_u1 && i < XDP_FLOW_U64S; i++) {
> +        uint64_t mf_mask;
> +        uint64_t idx_bits;
> +        unsigned int pkt_idx;
> +        uint64_t lowest_bit;
> +
> +        if (i == tbl_mf_bits_u0) {
> +            tbl_mf_bits = tbl_mf->map.bits[1];
> +            pkt_mf_bits = pkt_mf->map.bits[1];
> +            pkt_ofs = count_1bits(pkt_mf->map.bits[0]);
> +        }
> +
> +        lowest_bit = tbl_mf_bits & -tbl_mf_bits;
> +        tbl_mf_bits &= ~lowest_bit;
> +        if (!(lowest_bit & pkt_mf_bits)) {
> +            mkey[i] = 0;
> +            continue;
> +        }
> +        mf_mask = lowest_bit - 1;
> +        idx_bits = mf_mask & pkt_mf_bits;
> +        pkt_idx = count_1bits(idx_bits) + pkt_ofs;
> +        if (pkt_idx >= XDP_FLOW_U64S) {
> +            /* xdp flow api provider (userspace) BUG */
> +            return false;
> +        }
> +
> +        mkey[i] = pkt_blocks[pkt_idx] & tbl_blocks[i];
> +    }
> +
> +    return true;
> +}
> +
> +SEC("xdp") int
> +flowtable_afxdp(struct xdp_md *ctx)
> +{
> +    struct xdp_miniflow *pkt_mf;
> +    struct xdp_subtable_mask *subtable_mask;
> +    int *head;
> +    struct xdp_flow_actions *xdp_actions = NULL;
> +    struct nlattr *a;
> +    unsigned int left;
> +    int cnt, idx, zero = 0;
> +
> +    account_debug(0);
> +
> +    head = bpf_map_lookup_elem(&subtbl_masks_hd, &zero);
> +    if (!head) {
> +        return XDP_ABORTED;
> +    }
> +    if (*head == XDP_SUBTABLES_TAIL) {
> +        /* Offload not enabled */
> +        goto upcall;
> +    }
> +
> +    /* Get temporary storage for storing packet miniflow */
> +    pkt_mf = bpf_map_lookup_elem(&pkt_mf_tbl, &zero);
> +    if (!pkt_mf) {
> +        return XDP_ABORTED;
> +    }
> +
> +    /* Extract miniflow from packet */
> +    if (xdp_miniflow_extract(ctx, pkt_mf)) {
> +        return XDP_DROP;
> +    }
> +
> +    /* Lookup each subtable */
> +    for_each_subtable_mask(subtable_mask, head, idx, cnt) {
> +        struct xdp_flow_key mkey = { 0 };
> +        void *subtable;
> +
> +        subtable = bpf_map_lookup_elem(&flow_table, &idx);
> +        if (!subtable) {
> +            return XDP_ABORTED;
> +        }
> +
> +        if (!mask_key(mkey.miniflow_buf, &pkt_mf->mf,
> +                      &subtable_mask->header)) {
> +            continue;
> +        }
> +
> +        xdp_actions = bpf_map_lookup_elem(subtable, &mkey);
> +        if (xdp_actions) {
> +            break;
> +        }
> +    }
> +
> +    if (!xdp_actions) {
> +        /* Flow entry miss */
> +upcall:
> +        return upcall(ctx);
> +    }
> +
> +    /* Execute actions */
> +    NL_ATTR_FOR_EACH_UNSAFE(a, left, xdp_flow_actions(&xdp_actions->header),
> +                            xdp_actions->header.actions_len) {
> +        uint16_t type = bpf_nl_attr_type(a);
> +        int act;
> +
> +        switch ((enum action_attrs)type) {
> +        case XDP_ACTION_OUTPUT:
> +            /* Note: userspace ensures there is no multiple output in actions */
> +            return action_output(*(int *)bpf_nl_attr_get(a));
> +        case XDP_ACTION_PUSH_VLAN:
> +            act = action_vlan_push(ctx, bpf_nl_attr_get(a));
> +            break;
> +        case XDP_ACTION_POP_VLAN:
> +            act = action_vlan_pop(ctx);
> +            break;
> +        default:
> +            return XDP_ABORTED;
> +        }
> +        if (act != _XDP_ACTION_CONTINUE) {
> +            return act;
> +        }
> +    }
> +
> +    account_debug(1);
> +    return XDP_DROP;
> +}
> +
> +char _license[] SEC("license") = "Apache-2.0";
> diff --git a/configure.ac b/configure.ac
> index 1877aae56..99a93ce00 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -99,6 +99,7 @@ OVS_CHECK_DOT
>  OVS_CHECK_IF_DL
>  OVS_CHECK_STRTOK_R
>  OVS_CHECK_LINUX_AF_XDP
> +OVS_CHECK_LINUX_BPF
>  AC_CHECK_DECLS([sys_siglist], [], [], [[#include <signal.h>]])
>  AC_CHECK_MEMBERS([struct stat.st_mtim.tv_nsec, struct stat.st_mtimensec],
>    [], [], [[#include <sys/stat.h>]])
> @@ -198,6 +199,7 @@ AC_CONFIG_FILES(datapath/Makefile)
>  AC_CONFIG_FILES(datapath/linux/Kbuild)
>  AC_CONFIG_FILES(datapath/linux/Makefile)
>  AC_CONFIG_FILES(datapath/linux/Makefile.main)
> +AC_CONFIG_FILES(bpf/Makefile)
>  AC_CONFIG_FILES(tests/atlocal)
>  AC_CONFIG_FILES(lib/libopenvswitch.pc)
>  AC_CONFIG_FILES(lib/libsflow.pc)
> --
> 2.25.1
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Toshiaki Makita May 14, 2020, 10:30 a.m. UTC | #9
Hi Tonghao,

On 2020/05/14 13:08, Tonghao Zhang wrote:
> On Tue, Apr 21, 2020 at 10:48 PM Toshiaki Makita
> <toshiaki.makita1@gmail.com> wrote:
>>
>> This adds a reference program, flowtable_afxdp.o, which can be used to
>> offload flows to XDP through netdev-offload-xdp.
>> The program will be compiled when using --enable-bpf switch.
> Hi Toshiaki
> Good! One question, did we test the performance with this patch ?

Yes. Please check the cover letter.

Thanks!
Toshiaki Makita
diff mbox series

Patch

diff --git a/Makefile.am b/Makefile.am
index b279303d1..f18bfefde 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -8,6 +8,9 @@ 
 AUTOMAKE_OPTIONS = foreign subdir-objects
 ACLOCAL_AMFLAGS = -I m4
 SUBDIRS = datapath
+if HAVE_BPF
+SUBDIRS += bpf
+endif
 
 AM_CPPFLAGS = $(SSL_CFLAGS)
 AM_LDFLAGS = $(SSL_LDFLAGS)
@@ -198,7 +201,9 @@  ALL_LOCAL += dist-hook-git
 dist-hook-git: distfiles
 	@if test -e $(srcdir)/.git && (git --version) >/dev/null 2>&1; then \
 	  (cd datapath && $(MAKE) distfiles); \
-	  (cat distfiles; sed 's|^|datapath/|' datapath/distfiles) | \
+	  (cd bpf && $(MAKE) distfiles); \
+	  (cat distfiles; sed 's|^|datapath/|' datapath/distfiles; \
+	   sed 's|^|bpf/|' bpf/distfiles) | \
 	    LC_ALL=C sort -u > all-distfiles; \
 	  (cd $(srcdir) && git ls-files) | grep -v '\.gitignore$$' | \
 	    grep -v '\.gitattributes$$' | \
@@ -234,7 +239,7 @@  config-h-check:
 	@cd $(srcdir); \
 	if test -e .git && (git --version) >/dev/null 2>&1 && \
 	  git --no-pager grep -L '#include <config\.h>' `git ls-files | grep '\.c$$' | \
-	    grep -vE '^datapath|^lib/sflow|^third-party|^datapath-windows|^python'`; \
+	    grep -vE '^datapath|^lib/sflow|^third-party|^datapath-windows|^python|^bpf'`; \
 	then \
 	  echo "See above for list of violations of the rule that"; \
 	  echo "every C source file must #include <config.h>."; \
diff --git a/acinclude.m4 b/acinclude.m4
index 0901f2870..2fb2f385f 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -301,6 +301,62 @@  AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
   AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true)
 ])
 
+dnl OVS_CHECK_LINUX_BPF
+dnl
+dnl Check both llvm and libbpf support
+AC_DEFUN([OVS_CHECK_LINUX_BPF], [
+  AC_ARG_ENABLE([bpf],
+                [AC_HELP_STRING([--enable-bpf],
+                                [Compile reference eBPF programs for XDP])],
+                [], [enable_bpf=no])
+  AC_MSG_CHECKING([whether BPF is enabled])
+  if test "$enable_bpf" != yes; then
+    AC_MSG_RESULT([no])
+    BPF_ENABLE=false
+  else
+    AC_MSG_RESULT([yes])
+    BPF_ENABLE=true
+
+    AC_CHECK_PROG(CLANG_CHECK, clang, yes)
+    AS_IF([test X"$CLANG_CHECK" != X"yes"],
+      [AC_MSG_ERROR([unable to find clang to compile BPF program])])
+
+    AC_CHECK_PROG(LLC_CHECK, llc, yes)
+    AS_IF([test X"$LLC_CHECK" != X"yes"],
+      [AC_MSG_ERROR([unable to find llc to compile BPF program])])
+
+    AC_CHECK_HEADER([bpf/bpf_helpers.h], [],
+      [AC_MSG_ERROR([unable to find bpf/bpf_helpers.h to compile BPF program])])
+
+    AC_CHECK_HEADER([linux/bpf.h], [],
+      [AC_MSG_ERROR([unable to find linux/bpf.h to compile BPF program])])
+
+    AC_MSG_CHECKING([for LLVM bpf target support])
+    if llc -march=bpf -mattr=help >/dev/null 2>&1; then
+      AC_MSG_RESULT([yes])
+    else
+      AC_MSG_RESULT([no])
+      AC_MSG_ERROR([LLVM does not support bpf target])
+    fi
+
+    AC_MSG_CHECKING([for BTF DATASEC support])
+    AC_LANG_CONFTEST(
+      [AC_LANG_SOURCE([__attribute__((section("_x"), used)) int x;])])
+    if clang -g -O2 -S -target bpf -emit-llvm -c conftest.c -o conftest.ll && \
+       llc -march=bpf -filetype=obj -o conftest.o conftest.ll && \
+       readelf -p.BTF conftest.o 2>/dev/null | grep -q -w _x; then
+      AC_MSG_RESULT([yes])
+    else
+      AC_MSG_RESULT([no])
+      AC_MSG_ERROR([LLVM does not support BTF DATASEC])
+    fi
+
+    AC_DEFINE([HAVE_BPF], [1],
+              [Define to 1 if BPF compilation is available and enabled.])
+  fi
+  AM_CONDITIONAL([HAVE_BPF], test "$BPF_ENABLE" = true)
+])
+
 dnl OVS_CHECK_DPDK
 dnl
 dnl Configure DPDK source tree
diff --git a/bpf/.gitignore b/bpf/.gitignore
new file mode 100644
index 000000000..ab0f2d9e4
--- /dev/null
+++ b/bpf/.gitignore
@@ -0,0 +1,4 @@ 
+*.ll
+/distfiles
+/Makefile
+/Makefile.in
diff --git a/bpf/Makefile.am b/bpf/Makefile.am
new file mode 100644
index 000000000..f485b17f0
--- /dev/null
+++ b/bpf/Makefile.am
@@ -0,0 +1,59 @@ 
+AUTOMAKE_OPTIONS = foreign
+
+EXTRA_DIST = flowtable_afxdp.c bpf_miniflow.h bpf_netlink.h bpf_workaround.h
+
+# The following is based on commands for the Automake "distdir" target.
+distfiles: Makefile
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t" | \
+	  LC_ALL=C sort -u > $@
+CLEANFILES = distfiles
+
+CLANG ?= clang
+LLC ?= llc
+
+AM_CPPFLAGS = -I $(top_srcdir)/include
+AM_CPPFLAGS += -I $(top_builddir)/include
+AM_CPPFLAGS += -I $(top_srcdir)/lib
+AM_CPPFLAGS += -I $(top_builddir)/lib
+
+AM_CFLAGS = -Wall
+AM_CFLAGS += -Wextra
+AM_CFLAGS += -Wno-sign-compare
+AM_CFLAGS += -Wformat -Wformat-security
+AM_CFLAGS += -Wswitch-enum
+AM_CFLAGS += -Wunused-parameter
+AM_CFLAGS += -Wbad-function-cast
+AM_CFLAGS += -Wcast-align
+AM_CFLAGS += -Wstrict-prototypes
+AM_CFLAGS += -Wold-style-definition
+AM_CFLAGS += -Wmissing-field-initializers
+AM_CFLAGS += -fno-strict-aliasing
+AM_CFLAGS += -Wswitch-bool
+AM_CFLAGS += -Wlogical-not-parentheses
+AM_CFLAGS += -Wsizeof-array-argument
+AM_CFLAGS += -Wshift-negative-value
+AM_CFLAGS += -Wshadow
+AM_CFLAGS += -Wcast-align
+AM_CFLAGS += -Wno-unused-value
+AM_CFLAGS += -Wno-compare-distinct-pointer-types
+AM_CFLAGS += -g
+AM_CFLAGS += -O2
+
+SUFFIXES = .ll
+%.ll: %.c
+	$(CLANG) $(AM_CPPFLAGS) $(AM_CFLAGS) -S \
+		-target bpf -emit-llvm -c $< -o $@
+CLEANFILES += *.ll
+
+%.o: %.ll
+	$(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ $<
+CLEANFILES += *.o
+
+flowtable_afxdp.o: flowtable_afxdp.c bpf_miniflow.h bpf_netlink.h bpf_workaround.h
+
+all-local: flowtable_afxdp.o
diff --git a/bpf/bpf_miniflow.h b/bpf/bpf_miniflow.h
new file mode 100644
index 000000000..090a9c7be
--- /dev/null
+++ b/bpf/bpf_miniflow.h
@@ -0,0 +1,179 @@ 
+/*
+ * Copyright (c) 2020 NTT Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef BPF_MINIFLOW_H
+#define BPF_MINIFLOW_H 1
+
+#include "flow.h"
+
+struct bpf_mf_ctx {
+    struct flowmap map;
+    uint64_t *data;
+};
+
+static inline void
+miniflow_set_maps(struct bpf_mf_ctx *ctx, size_t ofs, size_t n_words)
+{
+    flowmap_set(&ctx->map, ofs, n_words);
+}
+
+static inline void
+miniflow_set_map(struct bpf_mf_ctx *ctx, size_t ofs)
+{
+    flowmap_set(&ctx->map, ofs, 1);
+}
+
+static inline void
+miniflow_push_uint8_(struct bpf_mf_ctx *ctx, size_t ofs, uint8_t value)
+{
+    size_t ofs8 = ofs % 8;
+
+    if (ofs8 == 0) {
+        miniflow_set_map(ctx, ofs / 8);
+    }
+    *((uint8_t *)ctx->data + ofs8) = value;
+    if (ofs8 == 7) {
+        ctx->data++;
+    }
+}
+
+static inline void
+miniflow_push_uint16_(struct bpf_mf_ctx *ctx, size_t ofs, uint16_t value)
+{
+    size_t ofs8 = ofs % 8;
+
+    if (ofs8 == 0) {
+        miniflow_set_map(ctx, ofs / 8);
+        *(uint16_t *)ctx->data = value;
+    } else if (ofs8 == 2) {
+        *((uint16_t *)ctx->data + 1) = value;
+    } else if (ofs8 == 4) {
+        *((uint16_t *)ctx->data + 2) = value;
+    } else if (ofs8 == 6) {
+        *((uint16_t *)ctx->data + 3) = value;
+        ctx->data++;
+    }
+}
+
+static inline void
+miniflow_push_uint32_(struct bpf_mf_ctx *ctx, size_t ofs, uint32_t value)
+{
+    size_t ofs8 = ofs % 8;
+
+    if (ofs8 == 0) {
+        miniflow_set_map(ctx, ofs / 8);
+        *(uint32_t *)ctx->data = value;
+    } else if (ofs8 == 4) {
+        *((uint32_t *)ctx->data + 1) = value;
+        ctx->data++;
+    }
+}
+
+static inline void
+ether_addr_copy(struct eth_addr *dst, const struct eth_addr *src)
+{
+    ovs_be16 *a = dst->be16;
+    const ovs_be16 *b = src->be16;
+
+    a[0] = b[0];
+    a[1] = b[1];
+    a[2] = b[2];
+}
+
+/* 'valuep' is 16-aligned */
+/* data must start 64-aligned and must be followed by other data or padding */
+static inline void
+miniflow_push_macs_(struct bpf_mf_ctx *ctx, size_t ofs,
+                    const struct eth_addr *valuep)
+{
+    miniflow_set_maps(ctx, ofs / 8, 2);
+    ether_addr_copy((struct eth_addr *)ctx->data, valuep);
+    ether_addr_copy((struct eth_addr *)ctx->data + 1, valuep + 1);
+    ctx->data++; /* First word only. */
+}
+
+/* data must start 64-aligned and must be followed by other data */
+static inline void
+miniflow_pad_from_64_(struct bpf_mf_ctx *ctx, size_t ofs)
+{
+    size_t ofs8 = ofs % 8;
+    size_t ofs4 = ofs % 4;
+    size_t ofs2 = ofs % 2;
+    void *cdata = ctx->data;
+
+    miniflow_set_map(ctx, ofs / 8);
+
+    if (ofs8 >= 4) {
+        *(uint32_t *)cdata = 0;
+        cdata += 4;
+    }
+    if (ofs4 >= 2) {
+        *(uint16_t *)cdata = 0;
+        cdata += 2;
+    }
+    if (ofs2 == 1) {
+        *(uint8_t *)cdata = 0;
+    }
+}
+
+static inline void
+miniflow_pad_to_64_(struct bpf_mf_ctx *ctx, size_t ofs)
+{
+    size_t ofs8 = ofs % 8;
+    size_t ofs4 = ofs % 4;
+    size_t ofs2 = ofs % 2;
+    void *cdata = ctx->data;
+
+    cdata += ofs8;
+    if (ofs2 == 1) {
+        *(uint8_t *)cdata = 0;
+        cdata++;
+    }
+    if (ofs4 <= 2) {
+        *(uint16_t *)cdata = 0;
+        cdata += 2;
+    }
+    if (ofs8 <= 4) {
+        *(uint32_t *)cdata = 0;
+    }
+    ctx->data++;
+}
+
+#define miniflow_push_uint8(CTX, FIELD, VALUE)                       \
+    miniflow_push_uint8_(CTX, offsetof(struct flow, FIELD), VALUE)
+
+#define miniflow_push_be16_(CTX, OFS, VALUE)                         \
+    miniflow_push_uint16_(CTX, OFS, (OVS_FORCE uint16_t)VALUE)
+
+#define miniflow_push_be16(CTX, FIELD, VALUE)                        \
+    miniflow_push_be16_(CTX, offsetof(struct flow, FIELD), VALUE)    \
+
+#define miniflow_push_be32_(CTX, OFS, VALUE)                         \
+    miniflow_push_uint32_(CTX, OFS, (OVS_FORCE uint32_t)VALUE)
+
+#define miniflow_push_be32(CTX, FIELD, VALUE)                        \
+    miniflow_push_be32_(CTX, offsetof(struct flow, FIELD), VALUE)    \
+
+#define miniflow_push_macs(CTX, FIELD, VALUEP)                       \
+    miniflow_push_macs_(CTX, offsetof(struct flow, FIELD), VALUEP)
+
+#define miniflow_pad_from_64(CTX, FIELD)                             \
+    miniflow_pad_from_64_(CTX, offsetof(struct flow, FIELD))
+
+#define miniflow_pad_to_64(CTX, FIELD)                               \
+    miniflow_pad_to_64_(CTX, OFFSETOFEND(struct flow, FIELD))
+
+#endif /* bpf_miniflow.h */
diff --git a/bpf/bpf_netlink.h b/bpf/bpf_netlink.h
new file mode 100644
index 000000000..091926c61
--- /dev/null
+++ b/bpf/bpf_netlink.h
@@ -0,0 +1,34 @@ 
+/*
+ * Copyright (c) 2020 NTT Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef BPF_NETLINK_H
+#define BPF_NETLINK_H 1
+
+#include "netlink.h"
+
+static inline int
+bpf_nl_attr_type(const struct nlattr *nla)
+{
+    return nla->nla_type & NLA_TYPE_MASK;
+}
+
+static inline const void *
+bpf_nl_attr_get(const struct nlattr *nla)
+{
+    return nla + 1;
+}
+
+#endif /* bpf_netlink.h */
diff --git a/bpf/bpf_workaround.h b/bpf/bpf_workaround.h
new file mode 100644
index 000000000..cb072a7e6
--- /dev/null
+++ b/bpf/bpf_workaround.h
@@ -0,0 +1,28 @@ 
+/*
+ * Copyright (c) 2020 NTT Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef BPF_WORKAROUND_H
+#define BPF_WORKAROUND_H
+
+/* On Linux x86/x64 systems bits/wordsize.h included from stdint.h cannot
+ * correctly determine __WORDSIZE for bpf, which causes incorrect UINTPTR_MAX
+ */
+#if __UINTPTR_MAX__ == __UINT64_MAX__ && defined(UINTPTR_MAX)
+#undef UINTPTR_MAX
+#define UINTPTR_MAX UINT64_MAX
+#endif
+
+#endif /* bpf_workaround.h */
diff --git a/bpf/flowtable_afxdp.c b/bpf/flowtable_afxdp.c
new file mode 100644
index 000000000..7a4767333
--- /dev/null
+++ b/bpf/flowtable_afxdp.c
@@ -0,0 +1,515 @@ 
+/*
+ * Copyright (c) 2020 NTT Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* linux/types.h is necessary for bpf_helpers.h as it's not self-contained */
+#include <linux/types.h>
+#include <bpf/bpf_helpers.h>
+#include <linux/bpf.h>
+
+/* Workaround for incorrect macros for bpf in stdint.h */
+#include <stdint.h>
+#include "bpf_workaround.h"
+
+#include "bpf_miniflow.h"
+#include "bpf_netlink.h"
+#include "netdev-offload-xdp.h"
+#include "packets.h"
+#include "util.h"
+
+/* Supported keys. Need to keep same 64-align as struct flow for miniflow */
+struct xdp_flow {
+    struct eth_addr dl_dst;
+    struct eth_addr dl_src;
+    ovs_be16 dl_type;
+    uint8_t pad1[2];
+
+    union flow_vlan_hdr vlans[1];
+    uint8_t pad2[4];
+
+    ovs_be32 nw_src;
+    ovs_be32 nw_dst;
+
+    uint8_t pad3[4];
+    uint8_t nw_frag;
+    uint8_t nw_tos;
+    uint8_t nw_ttl;
+    uint8_t nw_proto;
+
+    ovs_be16 tp_src;
+    ovs_be16 tp_dst;
+    uint8_t pad4[4];
+};
+
+/* Size of xdp_flow must be 64-aligned for key comparison */
+BUILD_ASSERT_DECL(sizeof(struct xdp_flow) % sizeof(uint64_t) == 0);
+
+#define XDP_FLOW_U64S (sizeof(struct xdp_flow) / sizeof(uint64_t))
+
+#define XDP_MAX_SUBTABLE_FLOWS 1024
+#define XDP_MAX_ACTIONS_LEN 256
+
+/* Actual key stored in each subtable. The miniflow map is omitted as it is
+ * identical to the mask map */
+struct xdp_flow_key {
+    union {
+        uint64_t miniflow_buf[XDP_FLOW_U64S];
+        struct xdp_flow _flow; /* Need this to keep xdp_flow in BTF */
+    };
+};
+
+/* Value for subtable mask array */
+struct xdp_subtable_mask {
+    struct xdp_subtable_mask_header header;
+    uint64_t buf[XDP_FLOW_U64S];
+};
+
+/* miniflow for packet */
+struct xdp_miniflow {
+    struct miniflow mf;
+    struct xdp_flow_key value;
+};
+
+/* Used when the action only modifies the packet */
+#define _XDP_ACTION_CONTINUE -1
+
+/* Supported actions */
+/* XXX: This size should be uint16_t but needs to be int, as the kernel has a
+ * bug in btf_enum_check_member() that assumes the enum size is sizeof(int),
+ * which causes an error when loading BTF if uint16_t is used here */
+enum action_attrs {
+    XDP_ACTION_OUTPUT = OVS_ACTION_ATTR_OUTPUT,
+    XDP_ACTION_PUSH_VLAN = OVS_ACTION_ATTR_PUSH_VLAN,
+    XDP_ACTION_POP_VLAN = OVS_ACTION_ATTR_POP_VLAN,
+};
+
+/* Identical to struct nlattr. Need this to keep enum action_attrs in BTF */
+struct xdp_action_nlattr {
+    uint16_t nla_len;
+    enum action_attrs action_type;
+};
+
+struct xdp_flow_actions {
+    struct xdp_flow_actions_header header;
+    uint8_t data[XDP_MAX_ACTIONS_LEN - sizeof(struct xdp_flow_actions_header)];
+    /* Dummy. Need xdp_action_nlattr to keep enum action_attrs in BTF */
+    struct xdp_action_nlattr _xdp_actions[];
+};
+
+
+/* Map definitions */
+
+struct {
+    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+    __uint(max_entries, 256);
+    __type(key, uint32_t);
+    __type(value, long);
+} debug_stats SEC(".maps");
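+
+/* The debug_stats counters can be inspected from userspace for debugging,
+ * e.g. with "bpftool map dump name debug_stats" (assuming a bpftool recent
+ * enough to look up BTF-defined map names); values are per-CPU. */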
+
+/* Temporary storage for the packet miniflow. This is needed because the
+ * verifier does not allow variable-index access to an array on the stack;
+ * such access happens in mask_key() */
+struct {
+    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+    __uint(max_entries, 1);
+    __type(key, uint32_t);
+    __type(value, struct xdp_miniflow);
+} pkt_mf_tbl SEC(".maps");
+
+struct {
+    __uint(type, BPF_MAP_TYPE_XSKMAP);
+    __uint(max_entries, 1); /* Expected to be resized by userspace */
+    __uint(key_size, sizeof(int));
+    __uint(value_size, sizeof(int));
+} xsks_map SEC(".maps");
+
+struct {
+    __uint(type, BPF_MAP_TYPE_DEVMAP);
+    __uint(max_entries, XDP_MAX_PORTS);
+    __uint(key_size, sizeof(int));
+    __uint(value_size, sizeof(int));
+} output_map SEC(".maps");
+
+/* Head index for subtbl_masks list */
+/* TODO: Use global variable */
+struct {
+    __uint(type, BPF_MAP_TYPE_ARRAY);
+    __uint(max_entries, 1);
+    __type(key, uint32_t);
+    __type(value, int);
+} subtbl_masks_hd SEC(".maps");
+
+/* Information about subtable masks: a linked list implemented with an array */
+struct {
+    __uint(type, BPF_MAP_TYPE_ARRAY);
+    __uint(max_entries, XDP_MAX_SUBTABLES);
+    __type(key, uint32_t);
+    __type(value, struct xdp_subtable_mask);
+} subtbl_masks SEC(".maps");
+
+/* Template for subtable hash-map. This will be used in userspace to create
+ * flow_table array-of-maps. */
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(max_entries, XDP_MAX_SUBTABLE_FLOWS);
+    __type(key, struct xdp_flow_key);
+    __type(value, struct xdp_flow_actions);
+} subtbl_template SEC(".maps");
+
+/* Array-of-maps whose entries contain the subtable hash maps. */
+struct {
+    __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+    __uint(max_entries, XDP_MAX_SUBTABLES);
+    __uint(key_size, sizeof(uint32_t));
+    __uint(value_size, sizeof(uint32_t));
+} flow_table SEC(".maps");
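+
+/* Userspace is expected to instantiate one hash map per subtable, reusing
+ * subtbl_template's key/value sizes, and to store its fd in flow_table.
+ * Roughly (illustration only; the real logic belongs to the
+ * netdev-offload-xdp flow API provider and libbpf API names may differ by
+ * version):
+ *
+ *   int fd = bpf_map_create(BPF_MAP_TYPE_HASH, "subtbl",
+ *                           sizeof(struct xdp_flow_key),
+ *                           sizeof(struct xdp_flow_actions),
+ *                           XDP_MAX_SUBTABLE_FLOWS, NULL);
+ *   bpf_map_update_elem(flow_table_fd, &subtable_idx, &fd, 0);
+ */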
+
+
+static inline void
+account_debug(int idx)
+{
+    long *cnt;
+
+    cnt = bpf_map_lookup_elem(&debug_stats, &idx);
+    if (cnt) {
+        *cnt += 1;
+    }
+}
+
+static inline void
+account_action(enum action_attrs act)
+{
+    account_debug(act + 1);
+}
+
+/* Derived from xsk_load_xdp_prog() in libbpf */
+static inline int
+upcall(struct xdp_md *ctx)
+{
+    int ret, index = ctx->rx_queue_index;
+
+    ret = bpf_redirect_map(&xsks_map, index, XDP_ABORTED);
+    if (ret > 0) {
+        return ret;
+    }
+
+    /* Fallback for kernels <= 5.3 that do not support a default action in
+     * flags */
+    if (bpf_map_lookup_elem(&xsks_map, &index)) {
+        return bpf_redirect_map(&xsks_map, index, 0);
+    }
+
+    return XDP_ABORTED;
+}
+
+static inline int
+action_output(int tx_port)
+{
+    account_action(XDP_ACTION_OUTPUT);
+
+    return bpf_redirect_map(&output_map, tx_port, 0);
+}
+
+static inline int
+action_vlan_push(struct xdp_md *ctx OVS_UNUSED,
+                 const struct ovs_action_push_vlan *vlan OVS_UNUSED)
+{
+    account_action(XDP_ACTION_PUSH_VLAN);
+
+    /* TODO: implement this */
+    return XDP_ABORTED;
+}
+
+static inline int
+action_vlan_pop(struct xdp_md *ctx OVS_UNUSED)
+{
+    account_action(XDP_ACTION_POP_VLAN);
+
+    /* TODO: implement this */
+    return XDP_ABORTED;
+}
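+
+/* A minimal sketch of how action_vlan_pop() could be implemented with
+ * bpf_xdp_adjust_head(); it is not wired up above and ignores details such
+ * as priority-tagged or double-tagged frames. Header layouts and
+ * VLAN_HEADER_LEN follow lib/packets.h. */
+static inline int
+vlan_pop_sketch(struct xdp_md *ctx)
+{
+    void *data = (void *)(long)ctx->data;
+    void *data_end = (void *)(long)ctx->data_end;
+    struct eth_header *eth = data;
+    struct eth_header eth_copy;
+
+    if (data + sizeof *eth + VLAN_HEADER_LEN > data_end) {
+        return XDP_ABORTED;
+    }
+
+    /* Save the Ethernet header with the inner ethertype, strip the tag by
+     * moving the packet start forward, then rewrite the saved header. */
+    eth_copy = *eth;
+    eth_copy.eth_type = ((struct vlan_header *)(eth + 1))->vlan_next_type;
+    if (bpf_xdp_adjust_head(ctx, VLAN_HEADER_LEN)) {
+        return XDP_ABORTED;
+    }
+
+    data = (void *)(long)ctx->data;
+    data_end = (void *)(long)ctx->data_end;
+    if (data + sizeof *eth > data_end) {
+        return XDP_ABORTED;
+    }
+    *(struct eth_header *)data = eth_copy;
+
+    return _XDP_ACTION_CONTINUE;
+}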
+
+/* TODO: Add more actions */
+
+
+struct nw_params {
+    union {
+        ovs_be32 params;
+        struct {
+            uint8_t nw_frag;
+            uint8_t nw_tos;
+            uint8_t nw_ttl;
+            uint8_t nw_proto;
+        };
+    };
+};
+
+static inline int
+parse_ipv4(void *data, uint64_t *nh_off, void *data_end,
+           struct bpf_mf_ctx *mf_ctx, struct nw_params *nw_params)
+{
+    struct ip_header *ip = data + *nh_off;
+
+    if (ip + 1 > data_end) {
+        return 1;
+    }
+
+    /* Linux network drivers ensure that the IP header is 4-byte aligned, or
+     * that the platform can handle unaligned access */
+    miniflow_push_be32(mf_ctx, nw_src, *(ovs_be32 *)(void *)&ip->ip_src);
+    miniflow_push_be32(mf_ctx, nw_dst, *(ovs_be32 *)(void *)&ip->ip_dst);
+
+    if (OVS_UNLIKELY(IP_IS_FRAGMENT(ip->ip_frag_off))) {
+        nw_params->nw_frag = FLOW_NW_FRAG_ANY;
+        if (ip->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
+            nw_params->nw_frag |= FLOW_NW_FRAG_LATER;
+        }
+    } else {
+        nw_params->nw_frag = 0;
+    }
+    nw_params->nw_tos = ip->ip_tos;
+    nw_params->nw_ttl = ip->ip_ttl;
+    nw_params->nw_proto = ip->ip_proto;
+
+    *nh_off += IP_IHL(ip->ip_ihl_ver) * 4;
+
+    return 0;
+}
+
+static inline int
+xdp_miniflow_extract(struct xdp_md *ctx, struct xdp_miniflow *pkt_mf)
+{
+    void *data = (void *)(long)ctx->data;
+    void *data_end = (void *)(long)ctx->data_end;
+    struct eth_header *eth = data;
+    struct vlan_header *vlan = NULL;
+    ovs_be16 dl_type;
+    uint64_t nh_off;
+    struct nw_params nw_params;
+    struct bpf_mf_ctx mf_ctx = { {{ 0 }}, (uint64_t *)&pkt_mf->value };
+
+    nh_off = sizeof *eth;
+    if (data + nh_off > data_end) {
+        return 1;
+    }
+
+    miniflow_push_macs(&mf_ctx, dl_dst, &eth->eth_dst);
+
+    if (eth_type_vlan(eth->eth_type)) {
+        vlan = data + nh_off;
+        nh_off += sizeof(*vlan);
+        if (data + nh_off > data_end) {
+            return 1;
+        }
+        dl_type = vlan->vlan_next_type;
+    } else {
+        dl_type = eth->eth_type;
+    }
+    miniflow_push_be16(&mf_ctx, dl_type, dl_type);
+    miniflow_pad_to_64(&mf_ctx, dl_type);
+
+    if (vlan) {
+        const ovs_16aligned_be32 *qp;
+        union flow_vlan_hdr vlan_hdr;
+
+        qp = (ovs_16aligned_be32 *)&eth->eth_type;
+        vlan_hdr.qtag = get_16aligned_be32(qp);
+        vlan_hdr.tci |= htons(VLAN_CFI);
+        miniflow_push_be32(&mf_ctx, vlans, vlan_hdr.qtag);
+        miniflow_push_be32_(&mf_ctx,
+                            offsetof(struct flow, vlans) + sizeof(ovs_be32),
+                            0);
+    }
+
+    if (dl_type == htons(ETH_TYPE_IP)) {
+        if (parse_ipv4(data, &nh_off, data_end, &mf_ctx, &nw_params)) {
+            return 1;
+        }
+    } else {
+        goto out;
+    }
+    miniflow_pad_from_64(&mf_ctx, nw_frag);
+    miniflow_push_be32(&mf_ctx, nw_frag, nw_params.params);
+
+    if (nw_params.nw_proto == IPPROTO_TCP) {
+        struct tcp_header *tcp = data + nh_off;
+
+        if (tcp + 1 > data_end) {
+            return 1;
+        }
+
+        miniflow_push_be16(&mf_ctx, tp_src, tcp->tcp_src);
+        miniflow_push_be16(&mf_ctx, tp_dst, tcp->tcp_dst);
+    } else if (nw_params.nw_proto == IPPROTO_UDP) {
+        struct udp_header *udp = data + nh_off;
+
+        if (udp + 1 > data_end) {
+            return 1;
+        }
+
+        miniflow_push_be16(&mf_ctx, tp_src, udp->udp_src);
+        miniflow_push_be16(&mf_ctx, tp_dst, udp->udp_dst);
+    }
+out:
+    pkt_mf->mf.map = mf_ctx.map;
+    return 0;
+}
+
+#define for_each_subtable_mask(subtable_mask, head, idx, cnt) \
+    for (subtable_mask = bpf_map_lookup_elem(&subtbl_masks, (head)), \
+         idx = *(head), cnt = 0; \
+         subtable_mask != NULL && cnt < XDP_MAX_SUBTABLES; \
+         idx = subtable_mask->header.next, \
+         subtable_mask = bpf_map_lookup_elem(&subtbl_masks, &idx), cnt++)
+
+/* Returns false if an error happens */
+static inline int
+mask_key(uint64_t *mkey, const struct miniflow *pkt_mf,
+         const struct xdp_subtable_mask_header *tbl_mask)
+{
+    const struct miniflow *tbl_mf = &tbl_mask->mask.masks;
+    const uint64_t *tbl_blocks = miniflow_get_values(tbl_mf);
+    const uint64_t *pkt_blocks = miniflow_get_values(pkt_mf);
+    uint64_t tbl_mf_bits = tbl_mf->map.bits[0];
+    uint64_t pkt_mf_bits = pkt_mf->map.bits[0];
+    uint8_t tbl_mf_bits_u0 = tbl_mask->mf_bits_u0;
+    uint8_t tbl_mf_bits_u1 = tbl_mask->mf_bits_u1;
+    unsigned int pkt_ofs = 0;
+    int i = 0;
+
+    /* This sensitive loop can easily exceed the verifier's 1M-instruction
+     * limit, so be careful when modifying it.
+     * E.g. increasing XDP_FLOW_U64S by adding keys to struct xdp_flow
+     * increases the verifier cost and the program may be rejected for
+     * exceeding the 1M-instruction limit */
+    for (; i < tbl_mf_bits_u0 + tbl_mf_bits_u1 && i < XDP_FLOW_U64S; i++) {
+        uint64_t mf_mask;
+        uint64_t idx_bits;
+        unsigned int pkt_idx;
+        uint64_t lowest_bit;
+
+        if (i == tbl_mf_bits_u0) {
+            tbl_mf_bits = tbl_mf->map.bits[1];
+            pkt_mf_bits = pkt_mf->map.bits[1];
+            pkt_ofs = count_1bits(pkt_mf->map.bits[0]);
+        }
+
+        lowest_bit = tbl_mf_bits & -tbl_mf_bits;
+        tbl_mf_bits &= ~lowest_bit;
+        if (!(lowest_bit & pkt_mf_bits)) {
+            mkey[i] = 0;
+            continue;
+        }
+        mf_mask = lowest_bit - 1;
+        idx_bits = mf_mask & pkt_mf_bits;
+        pkt_idx = count_1bits(idx_bits) + pkt_ofs;
+        if (pkt_idx >= XDP_FLOW_U64S) {
+            /* xdp flow api provider (userspace) BUG */
+            return false;
+        }
+
+        mkey[i] = pkt_blocks[pkt_idx] & tbl_blocks[i];
+    }
+
+    return true;
+}
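+
+/* Worked example of the masking above (illustration only): suppose the
+ * subtable mask has map.bits[0] == 0b01100 and the packet miniflow has
+ * map.bits[0] == 0b01110, i.e. one extra unit below the masked ones.
+ * i == 0: lowest_bit == 0b00100, mf_mask == 0b00011, idx_bits == 0b00010,
+ *         so pkt_idx == 1 and mkey[0] = pkt_blocks[1] & tbl_blocks[0].
+ * i == 1: lowest_bit == 0b01000, mf_mask == 0b00111, idx_bits == 0b00110,
+ *         so pkt_idx == 2 and mkey[1] = pkt_blocks[2] & tbl_blocks[1].
+ * A mask unit absent from the packet yields mkey[i] == 0. */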
+
+SEC("xdp") int
+flowtable_afxdp(struct xdp_md *ctx)
+{
+    struct xdp_miniflow *pkt_mf;
+    struct xdp_subtable_mask *subtable_mask;
+    int *head;
+    struct xdp_flow_actions *xdp_actions = NULL;
+    struct nlattr *a;
+    unsigned int left;
+    int cnt, idx, zero = 0;
+
+    account_debug(0);
+
+    head = bpf_map_lookup_elem(&subtbl_masks_hd, &zero);
+    if (!head) {
+        return XDP_ABORTED;
+    }
+    if (*head == XDP_SUBTABLES_TAIL) {
+        /* Offload not enabled */
+        goto upcall;
+    }
+
+    /* Get temporary storage for the packet miniflow */
+    pkt_mf = bpf_map_lookup_elem(&pkt_mf_tbl, &zero);
+    if (!pkt_mf) {
+        return XDP_ABORTED;
+    }
+
+    /* Extract miniflow from packet */
+    if (xdp_miniflow_extract(ctx, pkt_mf)) {
+        return XDP_DROP;
+    }
+
+    /* Look up each subtable */
+    for_each_subtable_mask(subtable_mask, head, idx, cnt) {
+        struct xdp_flow_key mkey = { 0 };
+        void *subtable;
+
+        subtable = bpf_map_lookup_elem(&flow_table, &idx);
+        if (!subtable) {
+            return XDP_ABORTED;
+        }
+
+        if (!mask_key(mkey.miniflow_buf, &pkt_mf->mf,
+                      &subtable_mask->header)) {
+            continue;
+        }
+
+        xdp_actions = bpf_map_lookup_elem(subtable, &mkey);
+        if (xdp_actions) {
+            break;
+        }
+    }
+
+    if (!xdp_actions) {
+        /* Flow entry miss */
+upcall:
+        return upcall(ctx);
+    }
+
+    /* Execute actions */
+    NL_ATTR_FOR_EACH_UNSAFE(a, left, xdp_flow_actions(&xdp_actions->header),
+                            xdp_actions->header.actions_len) {
+        uint16_t type = bpf_nl_attr_type(a);
+        int act;
+
+        switch ((enum action_attrs)type) {
+        case XDP_ACTION_OUTPUT:
+            /* Note: userspace ensures actions contain at most one output */
+            return action_output(*(int *)bpf_nl_attr_get(a));
+        case XDP_ACTION_PUSH_VLAN:
+            act = action_vlan_push(ctx, bpf_nl_attr_get(a));
+            break;
+        case XDP_ACTION_POP_VLAN:
+            act = action_vlan_pop(ctx);
+            break;
+        default:
+            return XDP_ABORTED;
+        }
+        if (act != _XDP_ACTION_CONTINUE) {
+            return act;
+        }
+    }
+
+    account_debug(1);
+    return XDP_DROP;
+}
+
+char _license[] SEC("license") = "Apache-2.0";
diff --git a/configure.ac b/configure.ac
index 1877aae56..99a93ce00 100644
--- a/configure.ac
+++ b/configure.ac
@@ -99,6 +99,7 @@  OVS_CHECK_DOT
 OVS_CHECK_IF_DL
 OVS_CHECK_STRTOK_R
 OVS_CHECK_LINUX_AF_XDP
+OVS_CHECK_LINUX_BPF
 AC_CHECK_DECLS([sys_siglist], [], [], [[#include <signal.h>]])
 AC_CHECK_MEMBERS([struct stat.st_mtim.tv_nsec, struct stat.st_mtimensec],
   [], [], [[#include <sys/stat.h>]])
@@ -198,6 +199,7 @@  AC_CONFIG_FILES(datapath/Makefile)
 AC_CONFIG_FILES(datapath/linux/Kbuild)
 AC_CONFIG_FILES(datapath/linux/Makefile)
 AC_CONFIG_FILES(datapath/linux/Makefile.main)
+AC_CONFIG_FILES(bpf/Makefile)
 AC_CONFIG_FILES(tests/atlocal)
 AC_CONFIG_FILES(lib/libopenvswitch.pc)
 AC_CONFIG_FILES(lib/libsflow.pc)