diff mbox series

[v2,net-next,2/4] net: add skeleton of bpfilter kernel module

Message ID 20180503043604.1604587-3-ast@kernel.org
State Changes Requested, archived
Delegated to: David Miller
Headers show
Series bpfilter | expand

Commit Message

Alexei Starovoitov May 3, 2018, 4:36 a.m. UTC
bpfilter.ko consists of bpfilter_kern.c (normal kernel module code)
and user mode helper code that is embedded into bpfilter.ko

The steps to build bpfilter.ko are the following:
- main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
- with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file
  is converted into bpfilter_umh.o object file
  with _binary_net_bpfilter_bpfilter_umh_start and _end symbols
  Example:
  $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
  0000000000004cf8 T _binary_net_bpfilter_bpfilter_umh_end
  0000000000004cf8 A _binary_net_bpfilter_bpfilter_umh_size
  0000000000000000 T _binary_net_bpfilter_bpfilter_umh_start
- bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko

bpfilter_kern.c is a normal kernel module code that calls
the fork_usermode_blob() helper to execute part of its own data
as a user mode process.

Notice that _binary_net_bpfilter_bpfilter_umh_start - end
is placed into .init.rodata section, so it's freed as soon as __init
function of bpfilter.ko is finished.
As part of __init the bpfilter.ko does first request/reply action
via two unix pipe provided by fork_usermode_blob() helper to
make sure that umh is healthy. If not it will kill it via pid.

Later bpfilter_process_sockopt() will be called from bpfilter hooks
in get/setsockopt() to pass iptable commands into umh via bpfilter.ko

If admin does 'rmmod bpfilter' the __exit code bpfilter.ko will
kill umh as well.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpfilter.h      | 15 +++++++
 include/uapi/linux/bpfilter.h | 21 ++++++++++
 net/Kconfig                   |  2 +
 net/Makefile                  |  1 +
 net/bpfilter/Kconfig          | 17 ++++++++
 net/bpfilter/Makefile         | 24 +++++++++++
 net/bpfilter/bpfilter_kern.c  | 93 +++++++++++++++++++++++++++++++++++++++++++
 net/bpfilter/main.c           | 63 +++++++++++++++++++++++++++++
 net/bpfilter/msgfmt.h         | 17 ++++++++
 net/ipv4/Makefile             |  2 +
 net/ipv4/bpfilter/Makefile    |  2 +
 net/ipv4/bpfilter/sockopt.c   | 42 +++++++++++++++++++
 net/ipv4/ip_sockglue.c        | 17 ++++++++
 13 files changed, 316 insertions(+)
 create mode 100644 include/linux/bpfilter.h
 create mode 100644 include/uapi/linux/bpfilter.h
 create mode 100644 net/bpfilter/Kconfig
 create mode 100644 net/bpfilter/Makefile
 create mode 100644 net/bpfilter/bpfilter_kern.c
 create mode 100644 net/bpfilter/main.c
 create mode 100644 net/bpfilter/msgfmt.h
 create mode 100644 net/ipv4/bpfilter/Makefile
 create mode 100644 net/ipv4/bpfilter/sockopt.c

Comments

Edward Cree May 3, 2018, 2:23 p.m. UTC | #1
On 03/05/18 05:36, Alexei Starovoitov wrote:
> bpfilter.ko consists of bpfilter_kern.c (normal kernel module code)
> and user mode helper code that is embedded into bpfilter.ko
>
> The steps to build bpfilter.ko are the following:
> - main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
> - with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file
>   is converted into bpfilter_umh.o object file
>   with _binary_net_bpfilter_bpfilter_umh_start and _end symbols
>   Example:
>   $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
>   0000000000004cf8 T _binary_net_bpfilter_bpfilter_umh_end
>   0000000000004cf8 A _binary_net_bpfilter_bpfilter_umh_size
>   0000000000000000 T _binary_net_bpfilter_bpfilter_umh_start
> - bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko
>
> bpfilter_kern.c is a normal kernel module code that calls
> the fork_usermode_blob() helper to execute part of its own data
> as a user mode process.
>
> Notice that _binary_net_bpfilter_bpfilter_umh_start - end
> is placed into .init.rodata section, so it's freed as soon as __init
> function of bpfilter.ko is finished.
> As part of __init the bpfilter.ko does first request/reply action
> via two unix pipe provided by fork_usermode_blob() helper to
> make sure that umh is healthy. If not it will kill it via pid.
>
> Later bpfilter_process_sockopt() will be called from bpfilter hooks
> in get/setsockopt() to pass iptable commands into umh via bpfilter.ko
>
> If admin does 'rmmod bpfilter' the __exit code bpfilter.ko will
> kill umh as well.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpfilter.h      | 15 +++++++
>  include/uapi/linux/bpfilter.h | 21 ++++++++++
>  net/Kconfig                   |  2 +
>  net/Makefile                  |  1 +
>  net/bpfilter/Kconfig          | 17 ++++++++
>  net/bpfilter/Makefile         | 24 +++++++++++
>  net/bpfilter/bpfilter_kern.c  | 93 +++++++++++++++++++++++++++++++++++++++++++
>  net/bpfilter/main.c           | 63 +++++++++++++++++++++++++++++
>  net/bpfilter/msgfmt.h         | 17 ++++++++
>  net/ipv4/Makefile             |  2 +
>  net/ipv4/bpfilter/Makefile    |  2 +
>  net/ipv4/bpfilter/sockopt.c   | 42 +++++++++++++++++++
>  net/ipv4/ip_sockglue.c        | 17 ++++++++
>  13 files changed, 316 insertions(+)
>  create mode 100644 include/linux/bpfilter.h
>  create mode 100644 include/uapi/linux/bpfilter.h
>  create mode 100644 net/bpfilter/Kconfig
>  create mode 100644 net/bpfilter/Makefile
>  create mode 100644 net/bpfilter/bpfilter_kern.c
>  create mode 100644 net/bpfilter/main.c
>  create mode 100644 net/bpfilter/msgfmt.h
>  create mode 100644 net/ipv4/bpfilter/Makefile
>  create mode 100644 net/ipv4/bpfilter/sockopt.c
>
> diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
> new file mode 100644
> index 000000000000..687b1760bb9f
> --- /dev/null
> +++ b/include/linux/bpfilter.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_BPFILTER_H
> +#define _LINUX_BPFILTER_H
> +
> +#include <uapi/linux/bpfilter.h>
> +
> +struct sock;
> +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval,
> +			    unsigned int optlen);
> +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval,
> +			    int *optlen);
> +extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
> +				       char __user *optval,
> +				       unsigned int optlen, bool is_set);
> +#endif
> diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
> new file mode 100644
> index 000000000000..2ec3cc99ea4c
> --- /dev/null
> +++ b/include/uapi/linux/bpfilter.h
> @@ -0,0 +1,21 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _UAPI_LINUX_BPFILTER_H
> +#define _UAPI_LINUX_BPFILTER_H
> +
> +#include <linux/if.h>
> +
> +enum {
> +	BPFILTER_IPT_SO_SET_REPLACE = 64,
> +	BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
> +	BPFILTER_IPT_SET_MAX,
> +};
> +
> +enum {
> +	BPFILTER_IPT_SO_GET_INFO = 64,
> +	BPFILTER_IPT_SO_GET_ENTRIES = 65,
> +	BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
> +	BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
> +	BPFILTER_IPT_GET_MAX,
> +};
> +
> +#endif /* _UAPI_LINUX_BPFILTER_H */
> diff --git a/net/Kconfig b/net/Kconfig
> index b62089fb1332..ed6368b306fa 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -201,6 +201,8 @@ source "net/bridge/netfilter/Kconfig"
>  
>  endif
>  
> +source "net/bpfilter/Kconfig"
> +
>  source "net/dccp/Kconfig"
>  source "net/sctp/Kconfig"
>  source "net/rds/Kconfig"
> diff --git a/net/Makefile b/net/Makefile
> index a6147c61b174..7f982b7682bd 100644
> --- a/net/Makefile
> +++ b/net/Makefile
> @@ -20,6 +20,7 @@ obj-$(CONFIG_TLS)		+= tls/
>  obj-$(CONFIG_XFRM)		+= xfrm/
>  obj-$(CONFIG_UNIX)		+= unix/
>  obj-$(CONFIG_NET)		+= ipv6/
> +obj-$(CONFIG_BPFILTER)		+= bpfilter/
>  obj-$(CONFIG_PACKET)		+= packet/
>  obj-$(CONFIG_NET_KEY)		+= key/
>  obj-$(CONFIG_BRIDGE)		+= bridge/
> diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
> new file mode 100644
> index 000000000000..782a732b9a5c
> --- /dev/null
> +++ b/net/bpfilter/Kconfig
> @@ -0,0 +1,17 @@
> +menuconfig BPFILTER
> +	bool "BPF based packet filtering framework (BPFILTER)"
> +	default n
> +	depends on NET && BPF
> +	help
> +	  This builds experimental bpfilter framework that is aiming to
> +	  provide netfilter compatible functionality via BPF
> +
> +if BPFILTER
> +config BPFILTER_UMH
> +	tristate "bpftiler kernel module with user mode helper"
sp. "bpftiler" -> "bpfilter"
> +	default m
> +	depends on m
> +	help
> +	  This builds bpfilter kernel module with embedded user mode helper
> +endif
> +
> diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
> new file mode 100644
> index 000000000000..897eedae523e
> --- /dev/null
> +++ b/net/bpfilter/Makefile
> @@ -0,0 +1,24 @@
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# Makefile for the Linux BPFILTER layer.
> +#
> +
> +hostprogs-y := bpfilter_umh
> +bpfilter_umh-objs := main.o
> +HOSTCFLAGS += -I. -Itools/include/
> +
> +# a bit of elf magic to convert bpfilter_umh binary into a binary blob
> +# inside bpfilter_umh.o elf file referenced by
> +# _binary_net_bpfilter_bpfilter_umh_start symbol
> +# which bpfilter_kern.c passes further into umh blob loader at run-time
> +quiet_cmd_copy_umh = GEN $@
> +      cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
> +      $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
> +      -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
> +      --rename-section .data=.init.rodata $< $@
> +
> +$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
> +	$(call cmd,copy_umh)
> +
> +obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
> +bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
> diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
> new file mode 100644
> index 000000000000..e0a6fdd5842b
> --- /dev/null
> +++ b/net/bpfilter/bpfilter_kern.c
> @@ -0,0 +1,93 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/umh.h>
> +#include <linux/bpfilter.h>
> +#include <linux/sched.h>
> +#include <linux/sched/signal.h>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include "msgfmt.h"
> +
> +#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
> +#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
> +
> +extern char UMH_start;
> +extern char UMH_end;
> +
> +static struct umh_info info;
> +
> +static void shutdown_umh(struct umh_info *info)
> +{
> +	struct task_struct *tsk;
> +
> +	tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
> +	if (tsk)
> +		force_sig(SIGKILL, tsk);
> +	fput(info->pipe_to_umh);
> +	fput(info->pipe_from_umh);
> +}
> +
> +static void stop_umh(void)
> +{
> +	if (bpfilter_process_sockopt) {
I worry about locking here.  Is it possible for two calls to
 bpfilter_process_sockopt() to run in parallel, both fail, and thus both
 call stop_umh()?  And if both end up calling shutdown_umh(), we double
 fput().
> +		bpfilter_process_sockopt = NULL;
> +		shutdown_umh(&info);
> +	}
> +}
> +
> +static int __bpfilter_process_sockopt(struct sock *sk, int optname,
> +				      char __user *optval,
> +				      unsigned int optlen, bool is_set)
> +{
> +	struct mbox_request req;
> +	struct mbox_reply reply;
> +	loff_t pos;
> +	ssize_t n;
> +
> +	req.is_set = is_set;
> +	req.pid = current->pid;
> +	req.cmd = optname;
> +	req.addr = (long)optval;
> +	req.len = optlen;
> +	n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
> +	if (n != sizeof(req)) {
> +		pr_err("write fail %zd\n", n);
> +		stop_umh();
> +		return -EFAULT;
> +	}
> +	pos = 0;
> +	n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
> +	if (n != sizeof(reply)) {
> +		pr_err("read fail %zd\n", n);
> +		stop_umh();
> +		return -EFAULT;
> +	}
> +	return reply.status;
> +}
> +
> +static int __init load_umh(void)
> +{
> +	int err;
> +
> +	err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
> +	if (err)
> +		return err;
> +	pr_info("Loaded umh pid %d\n", info.pid);
> +	bpfilter_process_sockopt = &__bpfilter_process_sockopt;
> +
> +	if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {
> +		stop_umh();
> +		return -EFAULT;
> +	}
> +	return 0;
> +}
> +
> +static void __exit fini_umh(void)
> +{
> +	stop_umh();
> +}
> +module_init(load_umh);
> +module_exit(fini_umh);
> +MODULE_LICENSE("GPL");
> diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
> new file mode 100644
> index 000000000000..81bbc1684896
> --- /dev/null
> +++ b/net/bpfilter/main.c
> @@ -0,0 +1,63 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define _GNU_SOURCE
> +#include <sys/uio.h>
> +#include <errno.h>
> +#include <stdio.h>
> +#include <sys/socket.h>
> +#include <fcntl.h>
> +#include <unistd.h>
> +#include "include/uapi/linux/bpf.h"
> +#include <asm/unistd.h>
> +#include "msgfmt.h"
> +
> +int debug_fd;
> +
> +static int handle_get_cmd(struct mbox_request *cmd)
> +{
> +	switch (cmd->cmd) {
> +	case 0:
> +		return 0;
> +	default:
> +		break;
> +	}
> +	return -ENOPROTOOPT;
> +}
> +
> +static int handle_set_cmd(struct mbox_request *cmd)
> +{
> +	return -ENOPROTOOPT;
> +}
> +
> +static void loop(void)
> +{
> +	while (1) {
> +		struct mbox_request req;
> +		struct mbox_reply reply;
> +		int n;
> +
> +		n = read(0, &req, sizeof(req));
> +		if (n != sizeof(req)) {
> +			dprintf(debug_fd, "invalid request %d\n", n);
> +			return;
> +		}
> +
> +		reply.status = req.is_set ?
> +			handle_set_cmd(&req) :
> +			handle_get_cmd(&req);
> +
> +		n = write(1, &reply, sizeof(reply));
> +		if (n != sizeof(reply)) {
> +			dprintf(debug_fd, "reply failed %d\n", n);
> +			return;
> +		}
> +	}
> +}
> +
> +int main(void)
> +{
> +	debug_fd = open("/dev/console", 00000002 | 00000100);
Should probably handle failure of this open() call.
> +	dprintf(debug_fd, "Started bpfilter\n");
> +	loop();
> +	close(debug_fd);
> +	return 0;
> +}
> diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
> new file mode 100644
> index 000000000000..94b9ac9e5114
> --- /dev/null
> +++ b/net/bpfilter/msgfmt.h
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _NET_BPFILTER_MSGFMT_H
> +#define _NET_BPFTILER_MSGFMT_H
Another bpftiler here, should be
+#define _NET_BPFILTER_MSGFMT_H

-Ed
> +
> +struct mbox_request {
> +	__u64 addr;
> +	__u32 len;
> +	__u32 is_set;
> +	__u32 cmd;
> +	__u32 pid;
> +};
> +
> +struct mbox_reply {
> +	__u32 status;
> +};
> +
> +#endif
> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
> index b379520f9133..7018f91c5a39 100644
> --- a/net/ipv4/Makefile
> +++ b/net/ipv4/Makefile
> @@ -16,6 +16,8 @@ obj-y     := route.o inetpeer.o protocol.o \
>  	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
>  	     metrics.o
>  
> +obj-$(CONFIG_BPFILTER) += bpfilter/
> +
>  obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
>  obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
>  obj-$(CONFIG_PROC_FS) += proc.o
> diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
> new file mode 100644
> index 000000000000..ce262d76cc48
> --- /dev/null
> +++ b/net/ipv4/bpfilter/Makefile
> @@ -0,0 +1,2 @@
> +obj-$(CONFIG_BPFILTER) += sockopt.o
> +
> diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
> new file mode 100644
> index 000000000000..42a96d2d8d05
> --- /dev/null
> +++ b/net/ipv4/bpfilter/sockopt.c
> @@ -0,0 +1,42 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/uaccess.h>
> +#include <linux/bpfilter.h>
> +#include <uapi/linux/bpf.h>
> +#include <linux/wait.h>
> +#include <linux/kmod.h>
> +
> +int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
> +				char __user *optval,
> +				unsigned int optlen, bool is_set);
> +EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
> +
> +int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval,
> +			  unsigned int optlen, bool is_set)
> +{
> +	if (!bpfilter_process_sockopt) {
> +		int err = request_module("bpfilter");
> +
> +		if (err)
> +			return err;
> +		if (!bpfilter_process_sockopt)
> +			return -ECHILD;
> +	}
> +	return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
> +}
> +
> +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
> +			    unsigned int optlen)
> +{
> +	return bpfilter_mbox_request(sk, optname, optval, optlen, true);
> +}
> +
> +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
> +			    int __user *optlen)
> +{
> +	int len;
> +
> +	if (get_user(len, optlen))
> +		return -EFAULT;
> +
> +	return bpfilter_mbox_request(sk, optname, optval, len, false);
> +}
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 5ad2d8ed3a3f..e0791faacb24 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -47,6 +47,8 @@
>  #include <linux/errqueue.h>
>  #include <linux/uaccess.h>
>  
> +#include <linux/bpfilter.h>
> +
>  /*
>   *	SOL_IP control messages.
>   */
> @@ -1244,6 +1246,11 @@ int ip_setsockopt(struct sock *sk, int level,
>  		return -ENOPROTOOPT;
>  
>  	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
> +#ifdef CONFIG_BPFILTER
> +	if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
> +	    optname < BPFILTER_IPT_SET_MAX)
> +		err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
> +#endif
>  #ifdef CONFIG_NETFILTER
>  	/* we need to exclude all possible ENOPROTOOPTs except default case */
>  	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
> @@ -1552,6 +1559,11 @@ int ip_getsockopt(struct sock *sk, int level,
>  	int err;
>  
>  	err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
> +#ifdef CONFIG_BPFILTER
> +	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
> +	    optname < BPFILTER_IPT_GET_MAX)
> +		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
> +#endif
>  #ifdef CONFIG_NETFILTER
>  	/* we need to exclude all possible ENOPROTOOPTs except default case */
>  	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
> @@ -1584,6 +1596,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
>  	err = do_ip_getsockopt(sk, level, optname, optval, optlen,
>  		MSG_CMSG_COMPAT);
>  
> +#ifdef CONFIG_BPFILTER
> +	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
> +	    optname < BPFILTER_IPT_GET_MAX)
> +		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
> +#endif
>  #ifdef CONFIG_NETFILTER
>  	/* we need to exclude all possible ENOPROTOOPTs except default case */
>  	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
Alexei Starovoitov May 5, 2018, 1 a.m. UTC | #2
On Thu, May 03, 2018 at 03:23:55PM +0100, Edward Cree wrote:
> On 03/05/18 05:36, Alexei Starovoitov wrote:
> > bpfilter.ko consists of bpfilter_kern.c (normal kernel module code)
> > and user mode helper code that is embedded into bpfilter.ko
> >
> > The steps to build bpfilter.ko are the following:
> > - main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
> > - with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file
> >   is converted into bpfilter_umh.o object file
> >   with _binary_net_bpfilter_bpfilter_umh_start and _end symbols
> >   Example:
> >   $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
> >   0000000000004cf8 T _binary_net_bpfilter_bpfilter_umh_end
> >   0000000000004cf8 A _binary_net_bpfilter_bpfilter_umh_size
> >   0000000000000000 T _binary_net_bpfilter_bpfilter_umh_start
> > - bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko
> >
> > bpfilter_kern.c is a normal kernel module code that calls
> > the fork_usermode_blob() helper to execute part of its own data
> > as a user mode process.
> >
> > Notice that _binary_net_bpfilter_bpfilter_umh_start - end
> > is placed into .init.rodata section, so it's freed as soon as __init
> > function of bpfilter.ko is finished.
> > As part of __init the bpfilter.ko does first request/reply action
> > via two unix pipe provided by fork_usermode_blob() helper to
> > make sure that umh is healthy. If not it will kill it via pid.
> >
> > Later bpfilter_process_sockopt() will be called from bpfilter hooks
> > in get/setsockopt() to pass iptable commands into umh via bpfilter.ko
> >
> > If admin does 'rmmod bpfilter' the __exit code bpfilter.ko will
> > kill umh as well.
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
...
> > +static void stop_umh(void)
> > +{
> > +	if (bpfilter_process_sockopt) {
> I worry about locking here.  Is it possible for two calls to
>  bpfilter_process_sockopt() to run in parallel, both fail, and thus both
>  call stop_umh()?  And if both end up calling shutdown_umh(), we double
>  fput().

I thought iptables sockopt is serialized earlier. Nope.
We need to grab the mutex to access these pipes.
Will fix.

Thanks for spelling nits. Will fix as well.
Harald Welte May 7, 2018, 3:24 p.m. UTC | #3
Hi Alexei + netdev list,

On Wed, May 02, 2018 at 09:36:02PM -0700, Alexei Starovoitov wrote:
> Later bpfilter_process_sockopt() will be called from bpfilter hooks
> in get/setsockopt() to pass iptable commands into umh via bpfilter.ko

This is a part I'm quite heavily opposed to - at least at this point.
Unless bpfilter offered something that is semantically compatible to
what netfilter/iptables is currently implementing, I don't think
bpfilter should be [allowed to] overriding the iptables
{get,set}sockopt() calls.

I appreciate that people are working on a different architecture packet
filter than what we used to.   I also understand that there is a need
for backwards compatibility.  I still think it's wrong to offer that
compatibility on the {set,get}sockopt level, rather than on the
"iptables command line utility replacement" level.  But nevermind, you
guys have a different opinion on that, on which we can agree to
disagree.

However, no matter what you do, the most important part from the user
point of view is to make sure you don't break semantics.

netfilter/iptables semantics have an intricate notion abut when which
chain of which table is executed, in which order, at what particular
point of the packet traversal during the network stack.  The packet
filtering rulesets that people have created over more than 18 years
are based on those semantics.  If you offer the same interface, but not
that very same semantics, the packet filtering policies can an will
break - and they will break so in a hidden way.  To the user, it appears
as if the ruleset is loaded with the assumed semantics, but in reality
it isn't.

Unless you can replicate those semantics 1:1, I think it is not only
wrong to override the iptables sockopt interface, but it's outright
dangerous.

Having less matches/targets implemented than original iptables is
something that I believe is acceptable (and inevitable, at least in the
beginning).  If somebody tries to load a related ruleset with bpfilter
active, it will fail gracefully and the user can chose to not use that
match/target in his ruleset, or to not use bpfilter.

But if the ruleset loads but behaves different than before (because e.g.
it's executed from a completely different place in the stack), that's
IMHO an absolute no-go that must be avoided at all cost.  If that's the
case, you are actively breaking network security, rather than creating
it.

So I think there's only two ways to go:

a) replicate the exact semantics/order of the filter/mangle/raw/...
   tables and their chains, both among themselves as well as in terms of
   ordering with other parts of the network stack, or

b) not use the existing tables/chains with their pre-defined semantics
   but rather start new 'tables' which can then have different semantics
   as defined at the time of their implementation.

My apologies if I misunderstood something about bpfilter.  Feel free to
correct me where I'm wrong.  Thanks.

Regards,
	Harald
David Miller May 7, 2018, 3:50 p.m. UTC | #4
From: Harald Welte <laforge@gnumonks.org>
Date: Mon, 7 May 2018 17:24:35 +0200

> But if the ruleset loads but behaves different than before (because e.g.
> it's executed from a completely different place in the stack), that's
> IMHO an absolute no-go that must be avoided at all cost.

That's not what we are doing nor proposing.  I'm sorry if you are
confused on this matter.

The base implementation we strive for will execute the BPF programs
from the existing netfilter hook points.

However, if semantically the effect is equal if we execute the BPF
program from XDP, we will allow that to happen as an optimization.

The BPF exection is where it is in these patches for the purposes of
bootstrapping the bpfilter project and easy testing/benchmarking/hacking.

I hope this clears up your confusion.

If you would like to become involved in hacking on bpfilter to help us
ensure more accurate compatability between existing iptables and what
bpfilter will execute for the same rule sets, we very much look
forward to your contributions and expertiece.

Thank you.
Luis Chamberlain May 7, 2018, 6:51 p.m. UTC | #5
On Wed, May 02, 2018 at 09:36:02PM -0700, Alexei Starovoitov wrote:
> bpfilter.ko consists of bpfilter_kern.c (normal kernel module code)
> and user mode helper code that is embedded into bpfilter.ko
> 
> The steps to build bpfilter.ko are the following:
> - main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
> - with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file
>   is converted into bpfilter_umh.o object file
>   with _binary_net_bpfilter_bpfilter_umh_start and _end symbols
>   Example:
>   $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
>   0000000000004cf8 T _binary_net_bpfilter_bpfilter_umh_end
>   0000000000004cf8 A _binary_net_bpfilter_bpfilter_umh_size
>   0000000000000000 T _binary_net_bpfilter_bpfilter_umh_start
> - bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko
> 
> bpfilter_kern.c is a normal kernel module code that calls
> the fork_usermode_blob() helper to execute part of its own data
> as a user mode process.
> 
> Notice that _binary_net_bpfilter_bpfilter_umh_start - end
> is placed into .init.rodata section, so it's freed as soon as __init
> function of bpfilter.ko is finished.
> As part of __init the bpfilter.ko does first request/reply action
> via two unix pipe provided by fork_usermode_blob() helper to
> make sure that umh is healthy. If not it will kill it via pid.

It does this very fast, right away. On a really slow system how are you sure
that this won't race and the execution of the check happens early on prior to
letting the actual setup trigger? After all, we're calling the userpsace
process in async mode. We could preempt it now.

> Later bpfilter_process_sockopt() will be called from bpfilter hooks
> in get/setsockopt() to pass iptable commands into umh via bpfilter.ko
> 
> If admin does 'rmmod bpfilter' the __exit code bpfilter.ko will
> kill umh as well.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpfilter.h      | 15 +++++++
>  include/uapi/linux/bpfilter.h | 21 ++++++++++
>  net/Kconfig                   |  2 +
>  net/Makefile                  |  1 +
>  net/bpfilter/Kconfig          | 17 ++++++++
>  net/bpfilter/Makefile         | 24 +++++++++++
>  net/bpfilter/bpfilter_kern.c  | 93 +++++++++++++++++++++++++++++++++++++++++++
>  net/bpfilter/main.c           | 63 +++++++++++++++++++++++++++++
>  net/bpfilter/msgfmt.h         | 17 ++++++++
>  net/ipv4/Makefile             |  2 +
>  net/ipv4/bpfilter/Makefile    |  2 +
>  net/ipv4/bpfilter/sockopt.c   | 42 +++++++++++++++++++
>  net/ipv4/ip_sockglue.c        | 17 ++++++++
>  13 files changed, 316 insertions(+)
>  create mode 100644 include/linux/bpfilter.h
>  create mode 100644 include/uapi/linux/bpfilter.h
>  create mode 100644 net/bpfilter/Kconfig
>  create mode 100644 net/bpfilter/Makefile
>  create mode 100644 net/bpfilter/bpfilter_kern.c
>  create mode 100644 net/bpfilter/main.c
>  create mode 100644 net/bpfilter/msgfmt.h
>  create mode 100644 net/ipv4/bpfilter/Makefile
>  create mode 100644 net/ipv4/bpfilter/sockopt.c
> 
> diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
> new file mode 100644
> index 000000000000..687b1760bb9f
> --- /dev/null
> +++ b/include/linux/bpfilter.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_BPFILTER_H
> +#define _LINUX_BPFILTER_H
> +
> +#include <uapi/linux/bpfilter.h>
> +
> +struct sock;
> +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval,
> +			    unsigned int optlen);
> +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval,
> +			    int *optlen);
> +extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
> +				       char __user *optval,
> +				       unsigned int optlen, bool is_set);
> +#endif
> diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
> new file mode 100644
> index 000000000000..2ec3cc99ea4c
> --- /dev/null
> +++ b/include/uapi/linux/bpfilter.h
> @@ -0,0 +1,21 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _UAPI_LINUX_BPFILTER_H
> +#define _UAPI_LINUX_BPFILTER_H
> +
> +#include <linux/if.h>
> +
> +enum {
> +	BPFILTER_IPT_SO_SET_REPLACE = 64,
> +	BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
> +	BPFILTER_IPT_SET_MAX,
> +};
> +
> +enum {
> +	BPFILTER_IPT_SO_GET_INFO = 64,
> +	BPFILTER_IPT_SO_GET_ENTRIES = 65,
> +	BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
> +	BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
> +	BPFILTER_IPT_GET_MAX,
> +};
> +
> +#endif /* _UAPI_LINUX_BPFILTER_H */
> diff --git a/net/Kconfig b/net/Kconfig
> index b62089fb1332..ed6368b306fa 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -201,6 +201,8 @@ source "net/bridge/netfilter/Kconfig"
>  
>  endif
>  
> +source "net/bpfilter/Kconfig"
> +
>  source "net/dccp/Kconfig"
>  source "net/sctp/Kconfig"
>  source "net/rds/Kconfig"
> diff --git a/net/Makefile b/net/Makefile
> index a6147c61b174..7f982b7682bd 100644
> --- a/net/Makefile
> +++ b/net/Makefile
> @@ -20,6 +20,7 @@ obj-$(CONFIG_TLS)		+= tls/
>  obj-$(CONFIG_XFRM)		+= xfrm/
>  obj-$(CONFIG_UNIX)		+= unix/
>  obj-$(CONFIG_NET)		+= ipv6/
> +obj-$(CONFIG_BPFILTER)		+= bpfilter/
>  obj-$(CONFIG_PACKET)		+= packet/
>  obj-$(CONFIG_NET_KEY)		+= key/
>  obj-$(CONFIG_BRIDGE)		+= bridge/
> diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
> new file mode 100644
> index 000000000000..782a732b9a5c
> --- /dev/null
> +++ b/net/bpfilter/Kconfig
> @@ -0,0 +1,17 @@
> +menuconfig BPFILTER
> +	bool "BPF based packet filtering framework (BPFILTER)"
> +	default n
> +	depends on NET && BPF
> +	help
> +	  This builds experimental bpfilter framework that is aiming to
> +	  provide netfilter compatible functionality via BPF
> +
> +if BPFILTER
> +config BPFILTER_UMH
> +	tristate "bpftiler kernel module with user mode helper"
> +	default m
> +	depends on m
> +	help
> +	  This builds bpfilter kernel module with embedded user mode helper
> +endif
> +
> diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
> new file mode 100644
> index 000000000000..897eedae523e
> --- /dev/null
> +++ b/net/bpfilter/Makefile
> @@ -0,0 +1,24 @@
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# Makefile for the Linux BPFILTER layer.
> +#
> +
> +hostprogs-y := bpfilter_umh
> +bpfilter_umh-objs := main.o
> +HOSTCFLAGS += -I. -Itools/include/
> +
> +# a bit of elf magic to convert bpfilter_umh binary into a binary blob
> +# inside bpfilter_umh.o elf file referenced by
> +# _binary_net_bpfilter_bpfilter_umh_start symbol
> +# which bpfilter_kern.c passes further into umh blob loader at run-time
> +quiet_cmd_copy_umh = GEN $@
> +      cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
> +      $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
> +      -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
> +      --rename-section .data=.init.rodata $< $@

Cool, but so our expectation is that the compiler sets this symbol, how
are we sure it will always be set?

> +
> +$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
> +	$(call cmd,copy_umh)
> +
> +obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
> +bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
> diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
> new file mode 100644
> index 000000000000..e0a6fdd5842b
> --- /dev/null
> +++ b/net/bpfilter/bpfilter_kern.c
> @@ -0,0 +1,93 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/umh.h>
> +#include <linux/bpfilter.h>
> +#include <linux/sched.h>
> +#include <linux/sched/signal.h>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include "msgfmt.h"
> +
> +#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
> +#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
> +
> +extern char UMH_start;
> +extern char UMH_end;
> +
> +static struct umh_info info;
> +
> +static void shutdown_umh(struct umh_info *info)
> +{
> +	struct task_struct *tsk;
> +
> +	tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
> +	if (tsk)
> +		force_sig(SIGKILL, tsk);
> +	fput(info->pipe_to_umh);
> +	fput(info->pipe_from_umh);
> +}
> +
> +static void stop_umh(void)
> +{
> +	if (bpfilter_process_sockopt) {
> +		bpfilter_process_sockopt = NULL;
> +		shutdown_umh(&info);
> +	}
> +}
> +
> +static int __bpfilter_process_sockopt(struct sock *sk, int optname,
> +				      char __user *optval,
> +				      unsigned int optlen, bool is_set)
> +{
> +	struct mbox_request req;
> +	struct mbox_reply reply;
> +	loff_t pos;
> +	ssize_t n;
> +
> +	req.is_set = is_set;
> +	req.pid = current->pid;
> +	req.cmd = optname;
> +	req.addr = (long)optval;
> +	req.len = optlen;
> +	n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
> +	if (n != sizeof(req)) {
> +		pr_err("write fail %zd\n", n);
> +		stop_umh();
> +		return -EFAULT;
> +	}
> +	pos = 0;
> +	n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
> +	if (n != sizeof(reply)) {
> +		pr_err("read fail %zd\n", n);
> +		stop_umh();
> +		return -EFAULT;
> +	}
> +	return reply.status;
> +}
> +
> +static int __init load_umh(void)
> +{
> +	int err;
> +
> +	err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
> +	if (err)
> +		return err;
> +	pr_info("Loaded umh pid %d\n", info.pid);
> +	bpfilter_process_sockopt = &__bpfilter_process_sockopt;
> +
> +	if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {

See, here, what if the userspace process gets preemtped and we run this
check afterwards? Is that possible?

  Luis

> +		stop_umh();
> +		return -EFAULT;
> +	}
> +	return 0;
> +}
> +
> +static void __exit fini_umh(void)
> +{
> +	stop_umh();
> +}
> +module_init(load_umh);
> +module_exit(fini_umh);
> +MODULE_LICENSE("GPL");
> diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
> new file mode 100644
> index 000000000000..81bbc1684896
> --- /dev/null
> +++ b/net/bpfilter/main.c
> @@ -0,0 +1,63 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define _GNU_SOURCE
> +#include <sys/uio.h>
> +#include <errno.h>
> +#include <stdio.h>
> +#include <sys/socket.h>
> +#include <fcntl.h>
> +#include <unistd.h>
> +#include "include/uapi/linux/bpf.h"
> +#include <asm/unistd.h>
> +#include "msgfmt.h"
> +
> +int debug_fd;
> +
> +static int handle_get_cmd(struct mbox_request *cmd)
> +{
> +	switch (cmd->cmd) {
> +	case 0:
> +		return 0;
> +	default:
> +		break;
> +	}
> +	return -ENOPROTOOPT;
> +}
> +
> +static int handle_set_cmd(struct mbox_request *cmd)
> +{
> +	return -ENOPROTOOPT;
> +}
> +
> +static void loop(void)
> +{
> +	while (1) {
> +		struct mbox_request req;
> +		struct mbox_reply reply;
> +		int n;
> +
> +		n = read(0, &req, sizeof(req));
> +		if (n != sizeof(req)) {
> +			dprintf(debug_fd, "invalid request %d\n", n);
> +			return;
> +		}
> +
> +		reply.status = req.is_set ?
> +			handle_set_cmd(&req) :
> +			handle_get_cmd(&req);
> +
> +		n = write(1, &reply, sizeof(reply));
> +		if (n != sizeof(reply)) {
> +			dprintf(debug_fd, "reply failed %d\n", n);
> +			return;
> +		}
> +	}
> +}
> +
> +int main(void)
> +{
> +	debug_fd = open("/dev/console", 00000002 | 00000100);
> +	dprintf(debug_fd, "Started bpfilter\n");
> +	loop();
> +	close(debug_fd);
> +	return 0;
> +}
> diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
> new file mode 100644
> index 000000000000..94b9ac9e5114
> --- /dev/null
> +++ b/net/bpfilter/msgfmt.h
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _NET_BPFILTER_MSGFMT_H
> +#define _NET_BPFTILER_MSGFMT_H
> +
> +struct mbox_request {
> +	__u64 addr;
> +	__u32 len;
> +	__u32 is_set;
> +	__u32 cmd;
> +	__u32 pid;
> +};
> +
> +struct mbox_reply {
> +	__u32 status;
> +};
> +
> +#endif
> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
> index b379520f9133..7018f91c5a39 100644
> --- a/net/ipv4/Makefile
> +++ b/net/ipv4/Makefile
> @@ -16,6 +16,8 @@ obj-y     := route.o inetpeer.o protocol.o \
>  	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
>  	     metrics.o
>  
> +obj-$(CONFIG_BPFILTER) += bpfilter/
> +
>  obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
>  obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
>  obj-$(CONFIG_PROC_FS) += proc.o
> diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
> new file mode 100644
> index 000000000000..ce262d76cc48
> --- /dev/null
> +++ b/net/ipv4/bpfilter/Makefile
> @@ -0,0 +1,2 @@
> +obj-$(CONFIG_BPFILTER) += sockopt.o
> +
> diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
> new file mode 100644
> index 000000000000..42a96d2d8d05
> --- /dev/null
> +++ b/net/ipv4/bpfilter/sockopt.c
> @@ -0,0 +1,42 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/uaccess.h>
> +#include <linux/bpfilter.h>
> +#include <uapi/linux/bpf.h>
> +#include <linux/wait.h>
> +#include <linux/kmod.h>
> +
> +int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
> +				char __user *optval,
> +				unsigned int optlen, bool is_set);
> +EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
> +
> +int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval,
> +			  unsigned int optlen, bool is_set)
> +{
> +	if (!bpfilter_process_sockopt) {
> +		int err = request_module("bpfilter");
> +
> +		if (err)
> +			return err;
> +		if (!bpfilter_process_sockopt)
> +			return -ECHILD;
> +	}
> +	return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
> +}
> +
> +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
> +			    unsigned int optlen)
> +{
> +	return bpfilter_mbox_request(sk, optname, optval, optlen, true);
> +}
> +
> +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
> +			    int __user *optlen)
> +{
> +	int len;
> +
> +	if (get_user(len, optlen))
> +		return -EFAULT;
> +
> +	return bpfilter_mbox_request(sk, optname, optval, len, false);
> +}
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 5ad2d8ed3a3f..e0791faacb24 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -47,6 +47,8 @@
>  #include <linux/errqueue.h>
>  #include <linux/uaccess.h>
>  
> +#include <linux/bpfilter.h>
> +
>  /*
>   *	SOL_IP control messages.
>   */
> @@ -1244,6 +1246,11 @@ int ip_setsockopt(struct sock *sk, int level,
>  		return -ENOPROTOOPT;
>  
>  	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
> +#ifdef CONFIG_BPFILTER
> +	if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
> +	    optname < BPFILTER_IPT_SET_MAX)
> +		err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
> +#endif
>  #ifdef CONFIG_NETFILTER
>  	/* we need to exclude all possible ENOPROTOOPTs except default case */
>  	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
> @@ -1552,6 +1559,11 @@ int ip_getsockopt(struct sock *sk, int level,
>  	int err;
>  
>  	err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
> +#ifdef CONFIG_BPFILTER
> +	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
> +	    optname < BPFILTER_IPT_GET_MAX)
> +		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
> +#endif
>  #ifdef CONFIG_NETFILTER
>  	/* we need to exclude all possible ENOPROTOOPTs except default case */
>  	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
> @@ -1584,6 +1596,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
>  	err = do_ip_getsockopt(sk, level, optname, optval, optlen,
>  		MSG_CMSG_COMPAT);
>  
> +#ifdef CONFIG_BPFILTER
> +	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
> +	    optname < BPFILTER_IPT_GET_MAX)
> +		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
> +#endif
>  #ifdef CONFIG_NETFILTER
>  	/* we need to exclude all possible ENOPROTOOPTs except default case */
>  	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
> -- 
> 2.9.5
Alexei Starovoitov May 9, 2018, 2:29 a.m. UTC | #6
On Mon, May 07, 2018 at 06:51:24PM +0000, Luis R. Rodriguez wrote:
> > Notice that _binary_net_bpfilter_bpfilter_umh_start - end
> > is placed into .init.rodata section, so it's freed as soon as __init
> > function of bpfilter.ko is finished.
> > As part of __init the bpfilter.ko does first request/reply action
> > via two unix pipe provided by fork_usermode_blob() helper to
> > make sure that umh is healthy. If not it will kill it via pid.
> 
> It does this very fast, right away. On a really slow system how are you sure
> that this won't race and the execution of the check happens early on prior to
> letting the actual setup trigger? After all, we're calling the userpsace
> process in async mode. We could preempt it now.

I don't see an issue.
the kernel synchronously writes into a pipe. User space process reads.
Exactly the same as coredump logic with pipes.

> > +# a bit of elf magic to convert bpfilter_umh binary into a binary blob
> > +# inside bpfilter_umh.o elf file referenced by
> > +# _binary_net_bpfilter_bpfilter_umh_start symbol
> > +# which bpfilter_kern.c passes further into umh blob loader at run-time
> > +quiet_cmd_copy_umh = GEN $@
> > +      cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
> > +      $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
> > +      -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
> > +      --rename-section .data=.init.rodata $< $@
> 
> Cool, but so our expectation is that the compiler sets this symbol, how
> are we sure it will always be set?

Compiler doesn't set it. objcopy does.

> > +
> > +	if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {
> 
> See, here, what if the userspace process gets preemtped and we run this
> check afterwards? Is that possible?

User space is a normal task. It can sleep and can be single stepped with GDB.
diff mbox series

Patch

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
new file mode 100644
index 000000000000..687b1760bb9f
--- /dev/null
+++ b/include/linux/bpfilter.h
@@ -0,0 +1,15 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BPFILTER_H
+#define _LINUX_BPFILTER_H
+
+#include <uapi/linux/bpfilter.h>
+
+struct sock;
+int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval,
+			    unsigned int optlen);
+int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval,
+			    int *optlen);
+extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
+				       char __user *optval,
+				       unsigned int optlen, bool is_set);
+#endif
diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
new file mode 100644
index 000000000000..2ec3cc99ea4c
--- /dev/null
+++ b/include/uapi/linux/bpfilter.h
@@ -0,0 +1,21 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _UAPI_LINUX_BPFILTER_H
+#define _UAPI_LINUX_BPFILTER_H
+
+#include <linux/if.h>
+
+enum {
+	BPFILTER_IPT_SO_SET_REPLACE = 64,
+	BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
+	BPFILTER_IPT_SET_MAX,
+};
+
+enum {
+	BPFILTER_IPT_SO_GET_INFO = 64,
+	BPFILTER_IPT_SO_GET_ENTRIES = 65,
+	BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
+	BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
+	BPFILTER_IPT_GET_MAX,
+};
+
+#endif /* _UAPI_LINUX_BPFILTER_H */
diff --git a/net/Kconfig b/net/Kconfig
index b62089fb1332..ed6368b306fa 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -201,6 +201,8 @@  source "net/bridge/netfilter/Kconfig"
 
 endif
 
+source "net/bpfilter/Kconfig"
+
 source "net/dccp/Kconfig"
 source "net/sctp/Kconfig"
 source "net/rds/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index a6147c61b174..7f982b7682bd 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,6 +20,7 @@  obj-$(CONFIG_TLS)		+= tls/
 obj-$(CONFIG_XFRM)		+= xfrm/
 obj-$(CONFIG_UNIX)		+= unix/
 obj-$(CONFIG_NET)		+= ipv6/
+obj-$(CONFIG_BPFILTER)		+= bpfilter/
 obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
 obj-$(CONFIG_BRIDGE)		+= bridge/
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
new file mode 100644
index 000000000000..782a732b9a5c
--- /dev/null
+++ b/net/bpfilter/Kconfig
@@ -0,0 +1,17 @@ 
+menuconfig BPFILTER
+	bool "BPF based packet filtering framework (BPFILTER)"
+	default n
+	depends on NET && BPF
+	help
+	  This builds experimental bpfilter framework that is aiming to
+	  provide netfilter compatible functionality via BPF
+
+if BPFILTER
+config BPFILTER_UMH
+	tristate "bpftiler kernel module with user mode helper"
+	default m
+	depends on m
+	help
+	  This builds bpfilter kernel module with embedded user mode helper
+endif
+
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
new file mode 100644
index 000000000000..897eedae523e
--- /dev/null
+++ b/net/bpfilter/Makefile
@@ -0,0 +1,24 @@ 
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux BPFILTER layer.
+#
+
+hostprogs-y := bpfilter_umh
+bpfilter_umh-objs := main.o
+HOSTCFLAGS += -I. -Itools/include/
+
+# a bit of elf magic to convert bpfilter_umh binary into a binary blob
+# inside bpfilter_umh.o elf file referenced by
+# _binary_net_bpfilter_bpfilter_umh_start symbol
+# which bpfilter_kern.c passes further into umh blob loader at run-time
+quiet_cmd_copy_umh = GEN $@
+      cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
+      $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
+      -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
+      --rename-section .data=.init.rodata $< $@
+
+$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
+	$(call cmd,copy_umh)
+
+obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
+bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
new file mode 100644
index 000000000000..e0a6fdd5842b
--- /dev/null
+++ b/net/bpfilter/bpfilter_kern.c
@@ -0,0 +1,93 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/umh.h>
+#include <linux/bpfilter.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include "msgfmt.h"
+
+#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
+#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
+
+extern char UMH_start;
+extern char UMH_end;
+
+static struct umh_info info;
+
+static void shutdown_umh(struct umh_info *info)
+{
+	struct task_struct *tsk;
+
+	tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
+	if (tsk)
+		force_sig(SIGKILL, tsk);
+	fput(info->pipe_to_umh);
+	fput(info->pipe_from_umh);
+}
+
+static void stop_umh(void)
+{
+	if (bpfilter_process_sockopt) {
+		bpfilter_process_sockopt = NULL;
+		shutdown_umh(&info);
+	}
+}
+
+static int __bpfilter_process_sockopt(struct sock *sk, int optname,
+				      char __user *optval,
+				      unsigned int optlen, bool is_set)
+{
+	struct mbox_request req;
+	struct mbox_reply reply;
+	loff_t pos;
+	ssize_t n;
+
+	req.is_set = is_set;
+	req.pid = current->pid;
+	req.cmd = optname;
+	req.addr = (long)optval;
+	req.len = optlen;
+	n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
+	if (n != sizeof(req)) {
+		pr_err("write fail %zd\n", n);
+		stop_umh();
+		return -EFAULT;
+	}
+	pos = 0;
+	n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
+	if (n != sizeof(reply)) {
+		pr_err("read fail %zd\n", n);
+		stop_umh();
+		return -EFAULT;
+	}
+	return reply.status;
+}
+
+static int __init load_umh(void)
+{
+	int err;
+
+	err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
+	if (err)
+		return err;
+	pr_info("Loaded umh pid %d\n", info.pid);
+	bpfilter_process_sockopt = &__bpfilter_process_sockopt;
+
+	if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {
+		stop_umh();
+		return -EFAULT;
+	}
+	return 0;
+}
+
+static void __exit fini_umh(void)
+{
+	stop_umh();
+}
+module_init(load_umh);
+module_exit(fini_umh);
+MODULE_LICENSE("GPL");
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
new file mode 100644
index 000000000000..81bbc1684896
--- /dev/null
+++ b/net/bpfilter/main.c
@@ -0,0 +1,63 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/uio.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "include/uapi/linux/bpf.h"
+#include <asm/unistd.h>
+#include "msgfmt.h"
+
+int debug_fd;
+
+static int handle_get_cmd(struct mbox_request *cmd)
+{
+	switch (cmd->cmd) {
+	case 0:
+		return 0;
+	default:
+		break;
+	}
+	return -ENOPROTOOPT;
+}
+
+static int handle_set_cmd(struct mbox_request *cmd)
+{
+	return -ENOPROTOOPT;
+}
+
+static void loop(void)
+{
+	while (1) {
+		struct mbox_request req;
+		struct mbox_reply reply;
+		int n;
+
+		n = read(0, &req, sizeof(req));
+		if (n != sizeof(req)) {
+			dprintf(debug_fd, "invalid request %d\n", n);
+			return;
+		}
+
+		reply.status = req.is_set ?
+			handle_set_cmd(&req) :
+			handle_get_cmd(&req);
+
+		n = write(1, &reply, sizeof(reply));
+		if (n != sizeof(reply)) {
+			dprintf(debug_fd, "reply failed %d\n", n);
+			return;
+		}
+	}
+}
+
+int main(void)
+{
+	debug_fd = open("/dev/console", 00000002 | 00000100);
+	dprintf(debug_fd, "Started bpfilter\n");
+	loop();
+	close(debug_fd);
+	return 0;
+}
diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
new file mode 100644
index 000000000000..94b9ac9e5114
--- /dev/null
+++ b/net/bpfilter/msgfmt.h
@@ -0,0 +1,17 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NET_BPFILTER_MSGFMT_H
+#define _NET_BPFTILER_MSGFMT_H
+
+struct mbox_request {
+	__u64 addr;
+	__u32 len;
+	__u32 is_set;
+	__u32 cmd;
+	__u32 pid;
+};
+
+struct mbox_reply {
+	__u32 status;
+};
+
+#endif
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index b379520f9133..7018f91c5a39 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -16,6 +16,8 @@  obj-y     := route.o inetpeer.o protocol.o \
 	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
 	     metrics.o
 
+obj-$(CONFIG_BPFILTER) += bpfilter/
+
 obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
 obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
new file mode 100644
index 000000000000..ce262d76cc48
--- /dev/null
+++ b/net/ipv4/bpfilter/Makefile
@@ -0,0 +1,2 @@ 
+obj-$(CONFIG_BPFILTER) += sockopt.o
+
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
new file mode 100644
index 000000000000..42a96d2d8d05
--- /dev/null
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -0,0 +1,42 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/uaccess.h>
+#include <linux/bpfilter.h>
+#include <uapi/linux/bpf.h>
+#include <linux/wait.h>
+#include <linux/kmod.h>
+
+int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
+				char __user *optval,
+				unsigned int optlen, bool is_set);
+EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
+
+int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval,
+			  unsigned int optlen, bool is_set)
+{
+	if (!bpfilter_process_sockopt) {
+		int err = request_module("bpfilter");
+
+		if (err)
+			return err;
+		if (!bpfilter_process_sockopt)
+			return -ECHILD;
+	}
+	return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
+}
+
+int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
+			    unsigned int optlen)
+{
+	return bpfilter_mbox_request(sk, optname, optval, optlen, true);
+}
+
+int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
+			    int __user *optlen)
+{
+	int len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	return bpfilter_mbox_request(sk, optname, optval, len, false);
+}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 5ad2d8ed3a3f..e0791faacb24 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -47,6 +47,8 @@ 
 #include <linux/errqueue.h>
 #include <linux/uaccess.h>
 
+#include <linux/bpfilter.h>
+
 /*
  *	SOL_IP control messages.
  */
@@ -1244,6 +1246,11 @@  int ip_setsockopt(struct sock *sk, int level,
 		return -ENOPROTOOPT;
 
 	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_BPFILTER
+	if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
+	    optname < BPFILTER_IPT_SET_MAX)
+		err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
+#endif
 #ifdef CONFIG_NETFILTER
 	/* we need to exclude all possible ENOPROTOOPTs except default case */
 	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
@@ -1552,6 +1559,11 @@  int ip_getsockopt(struct sock *sk, int level,
 	int err;
 
 	err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
+#ifdef CONFIG_BPFILTER
+	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
+	    optname < BPFILTER_IPT_GET_MAX)
+		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
+#endif
 #ifdef CONFIG_NETFILTER
 	/* we need to exclude all possible ENOPROTOOPTs except default case */
 	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
@@ -1584,6 +1596,11 @@  int compat_ip_getsockopt(struct sock *sk, int level, int optname,
 	err = do_ip_getsockopt(sk, level, optname, optval, optlen,
 		MSG_CMSG_COMPAT);
 
+#ifdef CONFIG_BPFILTER
+	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
+	    optname < BPFILTER_IPT_GET_MAX)
+		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
+#endif
 #ifdef CONFIG_NETFILTER
 	/* we need to exclude all possible ENOPROTOOPTs except default case */
 	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&