diff mbox series

[v2,bpf-next,12/12] selftests/bpf: add kfree_skb raw_tp test

Message ID 20191010041503.2526303-13-ast@kernel.org
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: revolutionize bpf tracing | expand

Commit Message

Alexei Starovoitov Oct. 10, 2019, 4:15 a.m. UTC
Load basic cls_bpf program.
Load raw_tracepoint program and attach to kfree_skb raw tracepoint.
Trigger cls_bpf via prog_test_run.
At the end of test_run kernel will call kfree_skb
which will trigger trace_kfree_skb tracepoint.
Which will call our raw_tracepoint program.
Which will take that skb and will dump it into perf ring buffer.
Check that user space received correct packet.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
---
 .../selftests/bpf/prog_tests/kfree_skb.c      | 90 +++++++++++++++++++
 tools/testing/selftests/bpf/progs/kfree_skb.c | 74 +++++++++++++++
 2 files changed, 164 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/kfree_skb.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfree_skb.c

Comments

Ido Schimmel Oct. 10, 2019, 11:07 a.m. UTC | #1
On Wed, Oct 09, 2019 at 09:15:03PM -0700, Alexei Starovoitov wrote:
> Load basic cls_bpf program.
> Load raw_tracepoint program and attach to kfree_skb raw tracepoint.
> Trigger cls_bpf via prog_test_run.
> At the end of test_run kernel will call kfree_skb
> which will trigger trace_kfree_skb tracepoint.
> Which will call our raw_tracepoint program.
> Which will take that skb and will dump it into perf ring buffer.
> Check that user space received correct packet.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> ---
>  .../selftests/bpf/prog_tests/kfree_skb.c      | 90 +++++++++++++++++++
>  tools/testing/selftests/bpf/progs/kfree_skb.c | 74 +++++++++++++++
>  2 files changed, 164 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/kfree_skb.c
>  create mode 100644 tools/testing/selftests/bpf/progs/kfree_skb.c
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
> new file mode 100644
> index 000000000000..0cf91b6bf276
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
> @@ -0,0 +1,90 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <test_progs.h>
> +
> +static void on_sample(void *ctx, int cpu, void *data, __u32 size)
> +{
> +	int ifindex = *(int *)data, duration = 0;
> +	struct ipv6_packet *pkt_v6 = data + 4;
> +
> +	if (ifindex != 1)
> +		/* spurious kfree_skb not on loopback device */
> +		return;
> +	if (CHECK(size != 76, "check_size", "size %u != 76\n", size))
> +		return;
> +	if (CHECK(pkt_v6->eth.h_proto != 0xdd86, "check_eth",
> +		  "h_proto %x\n", pkt_v6->eth.h_proto))
> +		return;
> +	if (CHECK(pkt_v6->iph.nexthdr != 6, "check_ip",
> +		  "iph.nexthdr %x\n", pkt_v6->iph.nexthdr))
> +		return;
> +	if (CHECK(pkt_v6->tcp.doff != 5, "check_tcp",
> +		  "tcp.doff %x\n", pkt_v6->tcp.doff))
> +		return;
> +
> +	*(bool *)ctx = true;
> +}
> +
> +void test_kfree_skb(void)
> +{
> +	struct bpf_prog_load_attr attr = {
> +		.file = "./kfree_skb.o",
> +		.log_level = 2,
> +	};
> +
> +	struct bpf_object *obj, *obj2 = NULL;
> +	struct perf_buffer_opts pb_opts = {};
> +	struct perf_buffer *pb = NULL;
> +	struct bpf_link *link = NULL;
> +	struct bpf_map *perf_buf_map;
> +	struct bpf_program *prog;
> +	__u32 duration, retval;
> +	int err, pkt_fd, kfree_skb_fd;
> +	bool passed = false;
> +
> +	err = bpf_prog_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS, &obj, &pkt_fd);
> +	if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno))
> +		return;
> +
> +	err = bpf_prog_load_xattr(&attr, &obj2, &kfree_skb_fd);
> +	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
> +		goto close_prog;
> +
> +	prog = bpf_object__find_program_by_title(obj2, "raw_tracepoint/kfree_skb");
> +	if (CHECK(!prog, "find_prog", "prog kfree_skb not found\n"))
> +		goto close_prog;
> +	link = bpf_program__attach_raw_tracepoint(prog, NULL);
> +	if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link)))
> +		goto close_prog;
> +
> +	perf_buf_map = bpf_object__find_map_by_name(obj2, "perf_buf_map");
> +	if (CHECK(!perf_buf_map, "find_perf_buf_map", "not found\n"))
> +		goto close_prog;
> +
> +	/* set up perf buffer */
> +	pb_opts.sample_cb = on_sample;
> +	pb_opts.ctx = &passed;
> +	pb = perf_buffer__new(bpf_map__fd(perf_buf_map), 1, &pb_opts);
> +	if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
> +		goto close_prog;
> +
> +	err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6),
> +				NULL, NULL, &retval, &duration);
> +	CHECK(err || retval, "ipv6",
> +	      "err %d errno %d retval %d duration %d\n",
> +	      err, errno, retval, duration);
> +
> +	/* read perf buffer */
> +	err = perf_buffer__poll(pb, 100);
> +	if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
> +		goto close_prog;
> +	/* make sure kfree_skb program was triggered
> +	 * and it sent expected skb into ring buffer
> +	 */
> +	CHECK_FAIL(!passed);
> +close_prog:
> +	perf_buffer__free(pb);
> +	if (!IS_ERR_OR_NULL(link))
> +		bpf_link__destroy(link);
> +	bpf_object__close(obj);
> +	bpf_object__close(obj2);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/kfree_skb.c b/tools/testing/selftests/bpf/progs/kfree_skb.c
> new file mode 100644
> index 000000000000..fc25797cc64d
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/kfree_skb.c
> @@ -0,0 +1,74 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2019 Facebook
> +#include <linux/bpf.h>
> +#include "bpf_helpers.h"
> +
> +char _license[] SEC("license") = "GPL";
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
> +	__uint(key_size, sizeof(int));
> +	__uint(value_size, sizeof(int));
> +} perf_buf_map SEC(".maps");
> +
> +#define _(P) (__builtin_preserve_access_index(P))
> +
> +/* define few struct-s that bpf program needs to access */
> +struct callback_head {
> +	struct callback_head *next;
> +	void (*func)(struct callback_head *head);
> +};
> +struct dev_ifalias {
> +	struct callback_head rcuhead;
> +};
> +
> +struct net_device /* same as kernel's struct net_device */ {
> +	int ifindex;
> +	struct dev_ifalias *ifalias;
> +};
> +
> +struct sk_buff {
> +	/* field names and sizes should match to those in the kernel */
> +	unsigned int len, data_len;
> +	__u16 mac_len, hdr_len, queue_mapping;
> +	struct net_device *dev;
> +	/* order of the fields doesn't matter */
> +};
> +
> +/* copy arguments from
> + * include/trace/events/skb.h:
> + * TRACE_EVENT(kfree_skb,
> + *         TP_PROTO(struct sk_buff *skb, void *location),
> + *
> + * into struct below:
> + */
> +struct trace_kfree_skb {
> +	struct sk_buff *skb;
> +	void *location;
> +};
> +
> +SEC("raw_tracepoint/kfree_skb")
> +int trace_kfree_skb(struct trace_kfree_skb *ctx)
> +{
> +	struct sk_buff *skb = ctx->skb;
> +	struct net_device *dev;
> +	int ifindex;
> +	struct callback_head *ptr;
> +	void *func;
> +
> +	__builtin_preserve_access_index(({
> +		dev = skb->dev;
> +		ifindex = dev->ifindex;

Hi Alexei,

The patchset looks very useful. One question: Is it always safe to
access 'skb->dev->ifindex' here? I'm asking because 'dev' is inside a
union with 'dev_scratch' which is 'unsigned long' and therefore might
not always be a valid memory address. Consider for example the following
code path:

...
__udp_queue_rcv_skb(sk, skb)
	__udp_enqueue_schedule_skb(sk, skb)
		udp_set_dev_scratch(skb)
		// returns error
	...
	kfree_skb(skb) // ebpf program is invoked

How is this handled by eBPF?

Thanks

> +		ptr = dev->ifalias->rcuhead.next;
> +		func = ptr->func;
> +	}));
> +
> +	bpf_printk("rcuhead.next %llx func %llx\n", ptr, func);
> +	bpf_printk("skb->len %d\n", _(skb->len));
> +	bpf_printk("skb->queue_mapping %d\n", _(skb->queue_mapping));
> +	bpf_printk("dev->ifindex %d\n", ifindex);
> +
> +	/* send first 72 byte of the packet to user space */
> +	bpf_skb_output(skb, &perf_buf_map, (72ull << 32) | BPF_F_CURRENT_CPU,
> +		       &ifindex, sizeof(ifindex));
> +	return 0;
> +}
> -- 
> 2.23.0
>
Alexei Starovoitov Oct. 10, 2019, 7:07 p.m. UTC | #2
On Thu, Oct 10, 2019 at 02:07:29PM +0300, Ido Schimmel wrote:
> On Wed, Oct 09, 2019 at 09:15:03PM -0700, Alexei Starovoitov wrote:
> > +SEC("raw_tracepoint/kfree_skb")
> > +int trace_kfree_skb(struct trace_kfree_skb *ctx)
> > +{
> > +	struct sk_buff *skb = ctx->skb;
> > +	struct net_device *dev;
> > +	int ifindex;
> > +	struct callback_head *ptr;
> > +	void *func;
> > +
> > +	__builtin_preserve_access_index(({
> > +		dev = skb->dev;
> > +		ifindex = dev->ifindex;
> 
> Hi Alexei,
> 
> The patchset looks very useful. One question: Is it always safe to
> access 'skb->dev->ifindex' here? I'm asking because 'dev' is inside a
> union with 'dev_scratch' which is 'unsigned long' and therefore might
> not always be a valid memory address. Consider for example the following
> code path:
> 
> ...
> __udp_queue_rcv_skb(sk, skb)
> 	__udp_enqueue_schedule_skb(sk, skb)
> 		udp_set_dev_scratch(skb)
> 		// returns error
> 	...
> 	kfree_skb(skb) // ebpf program is invoked
> 
> How is this handled by eBPF?

Excellent question. There are cases like this where the verifier cannot possibly
track semantics of the kernel code and union of pointer with scratch area
like this. That's why every access through btf pointer is a hidden probe_read.
Comparing to old school tracing all memory accesses were probe_read
and bpf prog was free to read anything and page fault everywhere.
Now bpf prog will almost always access correct data. Yet corner cases like
this are inevitable. I'm working on few ideas how to improve it further
with btf-tagged slab allocations and kasan-like memory shadowing.

Your question made me thinking whether we have a long standing issue
with dev_scratch, since even classic bpf has SKF_AD_IFINDEX hack
which is implemented as:
    *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                          BPF_REG_TMP, BPF_REG_CTX,
                          offsetof(struct sk_buff, dev));
    /* if (tmp != 0) goto pc + 1 */
    *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
    *insn++ = BPF_EXIT_INSN();
    if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
            *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
                                offsetof(struct net_device, ifindex));

That means for long time [c|e]BPF code was checking skb->dev for NULL only.
I've analyzed the code where socket filters can be called and I think it's good
there. dev_scratch is used after sk_filter has run.
But there are other hooks: lwt, various cgroups.
I've checked lwt and cgroup inet/egress. I think dev_scratch should not be
used in these paths. So should be good there as well.
But I think the whole idea of aliasing scratch into 'dev' pointer is dangerous.
There are plenty of tracepoints that do skb->dev->foo. Hard to track where
everything is called.
I think udp code need to move this dev_scratch into some other place in skb.
Andrii Nakryiko Oct. 11, 2019, 7:05 p.m. UTC | #3
On Wed, Oct 9, 2019 at 9:15 PM Alexei Starovoitov <ast@kernel.org> wrote:
>
> Load basic cls_bpf program.
> Load raw_tracepoint program and attach to kfree_skb raw tracepoint.
> Trigger cls_bpf via prog_test_run.
> At the end of test_run kernel will call kfree_skb
> which will trigger trace_kfree_skb tracepoint.
> Which will call our raw_tracepoint program.
> Which will take that skb and will dump it into perf ring buffer.
> Check that user space received correct packet.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> Acked-by: Andrii Nakryiko <andriin@fb.com>
> ---
>  .../selftests/bpf/prog_tests/kfree_skb.c      | 90 +++++++++++++++++++
>  tools/testing/selftests/bpf/progs/kfree_skb.c | 74 +++++++++++++++
>  2 files changed, 164 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/kfree_skb.c
>  create mode 100644 tools/testing/selftests/bpf/progs/kfree_skb.c
>

[...]

> +
> +void test_kfree_skb(void)
> +{
> +       struct bpf_prog_load_attr attr = {
> +               .file = "./kfree_skb.o",
> +               .log_level = 2,

This is rather verbose and memory-consuming. Do you really want to
leave it at 2?


> +       };
> +
> +       struct bpf_object *obj, *obj2 = NULL;
> +       struct perf_buffer_opts pb_opts = {};
> +       struct perf_buffer *pb = NULL;
> +       struct bpf_link *link = NULL;
> +       struct bpf_map *perf_buf_map;
> +       struct bpf_program *prog;
> +       __u32 duration, retval;
> +       int err, pkt_fd, kfree_skb_fd;
> +       bool passed = false;
> +

[...]
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
new file mode 100644
index 000000000000..0cf91b6bf276
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
@@ -0,0 +1,90 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+static void on_sample(void *ctx, int cpu, void *data, __u32 size)
+{
+	int ifindex = *(int *)data, duration = 0;
+	struct ipv6_packet *pkt_v6 = data + 4;
+
+	if (ifindex != 1)
+		/* spurious kfree_skb not on loopback device */
+		return;
+	if (CHECK(size != 76, "check_size", "size %u != 76\n", size))
+		return;
+	if (CHECK(pkt_v6->eth.h_proto != 0xdd86, "check_eth",
+		  "h_proto %x\n", pkt_v6->eth.h_proto))
+		return;
+	if (CHECK(pkt_v6->iph.nexthdr != 6, "check_ip",
+		  "iph.nexthdr %x\n", pkt_v6->iph.nexthdr))
+		return;
+	if (CHECK(pkt_v6->tcp.doff != 5, "check_tcp",
+		  "tcp.doff %x\n", pkt_v6->tcp.doff))
+		return;
+
+	*(bool *)ctx = true;
+}
+
+void test_kfree_skb(void)
+{
+	struct bpf_prog_load_attr attr = {
+		.file = "./kfree_skb.o",
+		.log_level = 2,
+	};
+
+	struct bpf_object *obj, *obj2 = NULL;
+	struct perf_buffer_opts pb_opts = {};
+	struct perf_buffer *pb = NULL;
+	struct bpf_link *link = NULL;
+	struct bpf_map *perf_buf_map;
+	struct bpf_program *prog;
+	__u32 duration, retval;
+	int err, pkt_fd, kfree_skb_fd;
+	bool passed = false;
+
+	err = bpf_prog_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS, &obj, &pkt_fd);
+	if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno))
+		return;
+
+	err = bpf_prog_load_xattr(&attr, &obj2, &kfree_skb_fd);
+	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	prog = bpf_object__find_program_by_title(obj2, "raw_tracepoint/kfree_skb");
+	if (CHECK(!prog, "find_prog", "prog kfree_skb not found\n"))
+		goto close_prog;
+	link = bpf_program__attach_raw_tracepoint(prog, NULL);
+	if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link)))
+		goto close_prog;
+
+	perf_buf_map = bpf_object__find_map_by_name(obj2, "perf_buf_map");
+	if (CHECK(!perf_buf_map, "find_perf_buf_map", "not found\n"))
+		goto close_prog;
+
+	/* set up perf buffer */
+	pb_opts.sample_cb = on_sample;
+	pb_opts.ctx = &passed;
+	pb = perf_buffer__new(bpf_map__fd(perf_buf_map), 1, &pb_opts);
+	if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
+		goto close_prog;
+
+	err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6),
+				NULL, NULL, &retval, &duration);
+	CHECK(err || retval, "ipv6",
+	      "err %d errno %d retval %d duration %d\n",
+	      err, errno, retval, duration);
+
+	/* read perf buffer */
+	err = perf_buffer__poll(pb, 100);
+	if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
+		goto close_prog;
+	/* make sure kfree_skb program was triggered
+	 * and it sent expected skb into ring buffer
+	 */
+	CHECK_FAIL(!passed);
+close_prog:
+	perf_buffer__free(pb);
+	if (!IS_ERR_OR_NULL(link))
+		bpf_link__destroy(link);
+	bpf_object__close(obj);
+	bpf_object__close(obj2);
+}
diff --git a/tools/testing/selftests/bpf/progs/kfree_skb.c b/tools/testing/selftests/bpf/progs/kfree_skb.c
new file mode 100644
index 000000000000..fc25797cc64d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kfree_skb.c
@@ -0,0 +1,74 @@ 
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(int));
+} perf_buf_map SEC(".maps");
+
+#define _(P) (__builtin_preserve_access_index(P))
+
+/* define few struct-s that bpf program needs to access */
+struct callback_head {
+	struct callback_head *next;
+	void (*func)(struct callback_head *head);
+};
+struct dev_ifalias {
+	struct callback_head rcuhead;
+};
+
+struct net_device /* same as kernel's struct net_device */ {
+	int ifindex;
+	struct dev_ifalias *ifalias;
+};
+
+struct sk_buff {
+	/* field names and sizes should match to those in the kernel */
+	unsigned int len, data_len;
+	__u16 mac_len, hdr_len, queue_mapping;
+	struct net_device *dev;
+	/* order of the fields doesn't matter */
+};
+
+/* copy arguments from
+ * include/trace/events/skb.h:
+ * TRACE_EVENT(kfree_skb,
+ *         TP_PROTO(struct sk_buff *skb, void *location),
+ *
+ * into struct below:
+ */
+struct trace_kfree_skb {
+	struct sk_buff *skb;
+	void *location;
+};
+
+SEC("raw_tracepoint/kfree_skb")
+int trace_kfree_skb(struct trace_kfree_skb *ctx)
+{
+	struct sk_buff *skb = ctx->skb;
+	struct net_device *dev;
+	int ifindex;
+	struct callback_head *ptr;
+	void *func;
+
+	__builtin_preserve_access_index(({
+		dev = skb->dev;
+		ifindex = dev->ifindex;
+		ptr = dev->ifalias->rcuhead.next;
+		func = ptr->func;
+	}));
+
+	bpf_printk("rcuhead.next %llx func %llx\n", ptr, func);
+	bpf_printk("skb->len %d\n", _(skb->len));
+	bpf_printk("skb->queue_mapping %d\n", _(skb->queue_mapping));
+	bpf_printk("dev->ifindex %d\n", ifindex);
+
+	/* send first 72 byte of the packet to user space */
+	bpf_skb_output(skb, &perf_buf_map, (72ull << 32) | BPF_F_CURRENT_CPU,
+		       &ifindex, sizeof(ifindex));
+	return 0;
+}