[bpf-next,v3,2/6] bpf: introduce BPF dispatcher

Message ID: 20191209135522.16576-3-bjorn.topel@gmail.com
State: Changes Requested
Delegated to: BPF Maintainers
Series: Introduce the BPF dispatcher

Commit Message

Björn Töpel Dec. 9, 2019, 1:55 p.m. UTC
From: Björn Töpel <bjorn.topel@intel.com>

The BPF dispatcher is a multi-way branch code generator, mainly
targeted for XDP programs. When an XDP program is executed via
bpf_prog_run_xdp(), it is invoked via an indirect call. The indirect
call has a substantial performance impact when retpolines are
enabled. The dispatcher transforms indirect calls into direct calls,
and therefore avoids the retpoline. The dispatcher is generated using
the BPF JIT, and relies on the text poking provided by
bpf_arch_text_poke().

The dispatcher hijacks a trampoline function via the __fentry__ nop
of the trampoline. One dispatcher instance currently supports up to
64 dispatch points, and only one instance of the dispatcher is
supported. This can be extended in the future.
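
Conceptually, the generated dispatch code is equivalent to the C
sketch below (illustrative only; prog_a_func/prog_b_func stand in for
the bpf_func pointers of registered programs, and the real code is
emitted by the JIT as a binary search over the sorted addresses):

unsigned int dispatcher(const void *ctx, const struct bpf_insn *insnsi,
			unsigned int (*bpf_func)(const void *,
						 const struct bpf_insn *))
{
	if (bpf_func == prog_a_func)
		return prog_a_func(ctx, insnsi);	/* direct call */
	if (bpf_func == prog_b_func)
		return prog_b_func(ctx, insnsi);	/* direct call */
	return bpf_func(ctx, insnsi);	/* fallback: indirect call */
}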

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 arch/x86/net/bpf_jit_comp.c | 122 +++++++++++++++++++++
 kernel/bpf/Makefile         |   1 +
 kernel/bpf/dispatcher.c     | 207 ++++++++++++++++++++++++++++++++++++
 3 files changed, 330 insertions(+)
 create mode 100644 kernel/bpf/dispatcher.c

Comments

Alexei Starovoitov Dec. 10, 2019, 5:50 a.m. UTC | #1
On Mon, Dec 09, 2019 at 02:55:18PM +0100, Björn Töpel wrote:
> +
> +struct bpf_disp_prog {
> +	struct bpf_prog *prog;
> +	refcount_t users;
> +};
> +
> +struct bpf_dispatcher {
> +	void *func;
> +	struct bpf_disp_prog progs[BPF_DISPATCHER_MAX];
> +	int num_progs;
> +	void *image;
> +	u32 image_off;
> +};
> +
> +static struct bpf_dispatcher *bpf_disp;
> +
> +static DEFINE_MUTEX(dispatcher_mutex);
> +
> +static struct bpf_dispatcher *bpf_dispatcher_lookup(void *func)
> +{
> +	struct bpf_dispatcher *d;
> +	void *image;
> +
> +	if (bpf_disp) {
> +		if (bpf_disp->func != func)
> +			return NULL;
> +		return bpf_disp;
> +	}
> +
> +	d = kzalloc(sizeof(*d), GFP_KERNEL);
> +	if (!d)
> +		return NULL;

The bpf_dispatcher_lookup() above makes this dispatch logic a bit difficult to
extend, since it works for only one bpf_disp, and additional dispatchers would
need a hash table. Yet your numbers show that even with retpoline=off there is a
performance benefit, so the dispatcher can probably be reused almost as-is to
accelerate sched_cls programs.
What I was trying to say in my previous feedback on this subject is that
lookup() doesn't need to exist. The 'void *func' doesn't need to be a function
that the dispatcher uses; it can be a 'struct bpf_dispatcher *' instead,
and lookup() becomes init().
Then bpf_dispatcher_change_prog() would be passed &bpf_dispatcher_xdp,
where bpf_dispatcher_xdp is defined via a macro that supplies the
'struct bpf_dispatcher' above and is instantiated in the particular .c file
that uses that macro. Then the dispatcher can be used in more than one place.
No need for a hash table; multiple dispatchers are instantiated, via the macro,
in the places that need them.
The code will look like:
bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
{
   bpf_dispatcher_change_prog(&bpf_dispatcher_xdp, prev_prog, prog);
}
Similarly sched_cls dispatcher for skb progs will do:
   bpf_dispatcher_change_prog(&bpf_dispatcher_tc, prev_prog, prog);
wdyt?
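
As an illustrative sketch (the identifiers below are placeholders and
not part of this patch), such a macro-based setup might look roughly
like:

#define DEFINE_BPF_DISPATCHER(name)	\
	struct bpf_dispatcher bpf_dispatcher_##name;

DEFINE_BPF_DISPATCHER(xdp)	/* e.g. in net/core/filter.c */

void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
{
	bpf_dispatcher_change_prog(&bpf_dispatcher_xdp, prev_prog, prog);
}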
Björn Töpel Dec. 10, 2019, 5:54 a.m. UTC | #2
On Tue, 10 Dec 2019 at 06:50, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Dec 09, 2019 at 02:55:18PM +0100, Björn Töpel wrote:
> > +
> > +struct bpf_disp_prog {
> > +     struct bpf_prog *prog;
> > +     refcount_t users;
> > +};
> > +
> > +struct bpf_dispatcher {
> > +     void *func;
> > +     struct bpf_disp_prog progs[BPF_DISPATCHER_MAX];
> > +     int num_progs;
> > +     void *image;
> > +     u32 image_off;
> > +};
> > +
> > +static struct bpf_dispatcher *bpf_disp;
> > +
> > +static DEFINE_MUTEX(dispatcher_mutex);
> > +
> > +static struct bpf_dispatcher *bpf_dispatcher_lookup(void *func)
> > +{
> > +     struct bpf_dispatcher *d;
> > +     void *image;
> > +
> > +     if (bpf_disp) {
> > +             if (bpf_disp->func != func)
> > +                     return NULL;
> > +             return bpf_disp;
> > +     }
> > +
> > +     d = kzalloc(sizeof(*d), GFP_KERNEL);
> > +     if (!d)
> > +             return NULL;
>
> The bpf_dispatcher_lookup() above makes this dispatch logic a bit difficult to
> extend, since it works for only one bpf_disp, and additional dispatchers would
> need a hash table. Yet your numbers show that even with retpoline=off there is a
> performance benefit, so the dispatcher can probably be reused almost as-is to
> accelerate sched_cls programs.
> What I was trying to say in my previous feedback on this subject is that
> lookup() doesn't need to exist. The 'void *func' doesn't need to be a function
> that the dispatcher uses; it can be a 'struct bpf_dispatcher *' instead,
> and lookup() becomes init().
> Then bpf_dispatcher_change_prog() would be passed &bpf_dispatcher_xdp,
> where bpf_dispatcher_xdp is defined via a macro that supplies the
> 'struct bpf_dispatcher' above and is instantiated in the particular .c file
> that uses that macro. Then the dispatcher can be used in more than one place.
> No need for a hash table; multiple dispatchers are instantiated, via the macro,
> in the places that need them.
> The code will look like:
> bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
> {
>    bpf_dispatcher_change_prog(&bpf_dispatcher_xdp, prev_prog, prog);
> }
> Similarly sched_cls dispatcher for skb progs will do:
>    bpf_dispatcher_change_prog(&bpf_dispatcher_tc, prev_prog, prog);
> wdyt?
>

Yes, much cleaner. I'll respin!

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b8be18427277..3ce7ad41bd6f 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -10,10 +10,12 @@ 
 #include <linux/if_vlan.h>
 #include <linux/bpf.h>
 #include <linux/memory.h>
+#include <linux/sort.h>
 #include <asm/extable.h>
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
 #include <asm/text-patching.h>
+#include <asm/asm-prototypes.h>
 
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
@@ -1530,6 +1532,126 @@  int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags
 	return 0;
 }
 
+static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	s64 offset;
+
+	offset = func - (ip + 2 + 4);
+	if (!is_simm32(offset)) {
+		pr_err("Target %p is out of range\n", func);
+		return -EINVAL;
+	}
+	EMIT2_off32(0x0F, jmp_cond + 0x10, offset);
+	*pprog = prog;
+	return 0;
+}
+
+static int emit_fallback_jump(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int err = 0;
+
+#ifdef CONFIG_RETPOLINE
+	/* Note that this assumes the compiler uses external
+	 * thunks for indirect calls. Both clang and GCC use the same
+	 * naming convention for external thunks.
+	 */
+	err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog);
+#else
+	int cnt = 0;
+
+	EMIT2(0xFF, 0xE2);	/* jmp rdx */
+#endif
+	*pprog = prog;
+	return err;
+}
+
+static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
+{
+	int pivot, err, jg_bytes = 1, cnt = 0;
+	u8 *jg_reloc, *prog = *pprog;
+	s64 jg_offset;
+
+	if (a == b) {
+		/* Leaf node of recursion, i.e. not a range of indices
+		 * anymore.
+		 */
+		EMIT1(add_1mod(0x48, BPF_REG_3));	/* cmp rdx,func */
+		if (!is_simm32(progs[a]))
+			return -1;
+		EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3),
+			    progs[a]);
+		err = emit_cond_near_jump(&prog,	/* je func */
+					  (void *)progs[a], prog,
+					  X86_JE);
+		if (err)
+			return err;
+
+		err = emit_fallback_jump(&prog);	/* jmp thunk/indirect */
+		if (err)
+			return err;
+
+		*pprog = prog;
+		return 0;
+	}
+
+	/* Not a leaf node, so we pivot, and recursively descend into
+	 * the lower and upper ranges.
+	 */
+	pivot = (b - a) / 2;
+	EMIT1(add_1mod(0x48, BPF_REG_3));		/* cmp rdx,func */
+	if (!is_simm32(progs[a + pivot]))
+		return -1;
+	EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a + pivot]);
+
+	if (pivot > 2) {				/* jg upper_part */
+		/* Require near jump. */
+		jg_bytes = 4;
+		EMIT2_off32(0x0F, X86_JG + 0x10, 0);
+	} else {
+		EMIT2(X86_JG, 0);
+	}
+	jg_reloc = prog;
+
+	err = emit_bpf_dispatcher(&prog, a, a + pivot,	/* emit lower_part */
+				  progs);
+	if (err)
+		return err;
+
+	jg_offset = prog - jg_reloc;
+	emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes);
+
+	err = emit_bpf_dispatcher(&prog, a + pivot + 1,	/* emit upper_part */
+				  b, progs);
+	if (err)
+		return err;
+
+	*pprog = prog;
+	return 0;
+}
+
+static int cmp_ips(const void *a, const void *b)
+{
+	const s64 *ipa = a;
+	const s64 *ipb = b;
+
+	if (*ipa > *ipb)
+		return 1;
+	if (*ipa < *ipb)
+		return -1;
+	return 0;
+}
+
+int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+{
+	u8 *prog = image;
+
+	sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL);
+	return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs);
+}
+
 struct x64_jit_data {
 	struct bpf_binary_header *header;
 	int *addrs;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3f671bf617e8..d4f330351f87 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,6 +8,7 @@  obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_JIT) += dispatcher.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
new file mode 100644
index 000000000000..de6b1f20b920
--- /dev/null
+++ b/kernel/bpf/dispatcher.c
@@ -0,0 +1,207 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2019 Intel Corporation. */
+
+#include <linux/hash.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+/* The BPF dispatcher is a multiway branch code generator. The
+ * dispatcher is a mechanism to avoid the performance penalty of an
+ * indirect call, which is expensive when retpolines are enabled. A
+ * dispatch client registers a BPF program into the dispatcher, and if
+ * there is available room in the dispatcher a direct call to the BPF
+ * program will be generated. All calls to the BPF programs called via
+ * the dispatcher will then be direct calls, instead of indirect
+ * ones. The dispatcher hijacks a trampoline function via the
+ * __fentry__ nop of the trampoline. The trampoline function has the
+ * following signature:
+ *
+ * unsigned int trampoline(const void *xdp_ctx,
+ *                         const struct bpf_insn *insnsi,
+ *                         unsigned int (*bpf_func)(const void *,
+ *                                                  const struct bpf_insn *));
+ *
+ * Currently only one, global, dispatcher instance is supported. It's
+ * allocated on first use, and never freed.
+ */
+
+#define BPF_DISPATCHER_MAX 64 /* Fits in 2048B */
+
+struct bpf_disp_prog {
+	struct bpf_prog *prog;
+	refcount_t users;
+};
+
+struct bpf_dispatcher {
+	void *func;
+	struct bpf_disp_prog progs[BPF_DISPATCHER_MAX];
+	int num_progs;
+	void *image;
+	u32 image_off;
+};
+
+static struct bpf_dispatcher *bpf_disp;
+
+static DEFINE_MUTEX(dispatcher_mutex);
+
+static struct bpf_dispatcher *bpf_dispatcher_lookup(void *func)
+{
+	struct bpf_dispatcher *d;
+	void *image;
+
+	if (bpf_disp) {
+		if (bpf_disp->func != func)
+			return NULL;
+		return bpf_disp;
+	}
+
+	d = kzalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return NULL;
+
+	image = bpf_jit_alloc_exec_page();
+	if (!image) {
+		kfree(d);
+		return NULL;
+	}
+
+	d->func = func;
+	d->image = image;
+	bpf_disp = d;
+	return d;
+}
+
+static struct bpf_disp_prog *bpf_dispatcher_find_prog(struct bpf_dispatcher *d,
+						      struct bpf_prog *prog)
+{
+	int i;
+
+	for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+		if (prog == d->progs[i].prog)
+			return &d->progs[i];
+	}
+	return NULL;
+}
+
+static struct bpf_disp_prog *bpf_dispatcher_find_free(struct bpf_dispatcher *d)
+{
+	return bpf_dispatcher_find_prog(d, NULL);
+}
+
+static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d,
+				    struct bpf_prog *prog)
+{
+	struct bpf_disp_prog *entry;
+
+	if (!prog)
+		return false;
+
+	entry = bpf_dispatcher_find_prog(d, prog);
+	if (entry) {
+		refcount_inc(&entry->users);
+		return false;
+	}
+
+	entry = bpf_dispatcher_find_free(d);
+	if (!entry)
+		return false;
+
+	bpf_prog_inc(prog);
+	entry->prog = prog;
+	refcount_set(&entry->users, 1);
+	d->num_progs++;
+	return true;
+}
+
+static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
+				       struct bpf_prog *prog)
+{
+	struct bpf_disp_prog *entry;
+
+	if (!prog)
+		return false;
+
+	entry = bpf_dispatcher_find_prog(d, prog);
+	if (!entry)
+		return false;
+
+	if (refcount_dec_and_test(&entry->users)) {
+		entry->prog = NULL;
+		bpf_prog_put(prog);
+		d->num_progs--;
+		return true;
+	}
+	return false;
+}
+
+int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs,
+				       int num_funcs)
+{
+	return -ENOTSUPP;
+}
+
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+{
+	s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
+	int i;
+
+	for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+		if (d->progs[i].prog)
+			*ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
+	}
+	return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
+}
+
+static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
+{
+	void *old, *new;
+	u32 noff;
+	int err;
+
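+	/* The image page is used as two halves: the new dispatch code
+	 * is written into the currently unused half, and the __fentry__
+	 * jump is then flipped over to it, so the live half is never
+	 * modified in place.
+	 */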
+	if (!prev_num_progs) {
+		old = NULL;
+		noff = 0;
+	} else {
+		old = d->image + d->image_off;
+		noff = d->image_off ^ (PAGE_SIZE / 2);
+	}
+
+	new = d->num_progs ? d->image + noff : NULL;
+	if (new) {
+		if (bpf_dispatcher_prepare(d, new))
+			return;
+	}
+
+	err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new);
+	if (err || !new)
+		return;
+
+	d->image_off = noff;
+}
+
+void bpf_dispatcher_change_prog(void *func, struct bpf_prog *from,
+				struct bpf_prog *to)
+{
+	struct bpf_dispatcher *d;
+	bool changed = false;
+	int prev_num_progs;
+
+	if (from == to)
+		return;
+
+	mutex_lock(&dispatcher_mutex);
+	d = bpf_dispatcher_lookup(func);
+	if (!d)
+		goto out;
+
+	prev_num_progs = d->num_progs;
+	changed |= bpf_dispatcher_remove_prog(d, from);
+	changed |= bpf_dispatcher_add_prog(d, to);
+
+	if (!changed)
+		goto out;
+
+	bpf_dispatcher_update(d, prev_num_progs);
+out:
+	mutex_unlock(&dispatcher_mutex);
+}