[net-next] tc: bpf: generalize pedit action

Message ID 1427424837-7757-1-git-send-email-ast@plumgrid.com
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Alexei Starovoitov March 27, 2015, 2:53 a.m. UTC
existing TC action 'pedit' can munge any bits of the packet.
Generalize it for use in bpf programs attached as cls_bpf and act_bpf via
bpf_skb_store_bytes() helper function.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---

pedit is limited to 32-bit masked rewrites. Here let it be flexible.

ptr = skb_header_pointer(skb, offset, len, buf);
memcpy(ptr, from, len);
if (ptr == buf)
  skb_store_bits(skb, offset, ptr, len);

^^ logic is the same as in pedit.
shifts, mask, invert style of rewrite is easily done by the program.
Just like arbitrary parsing of the packet and applying rewrites on demand.

 include/linux/bpf.h      |    1 +
 include/uapi/linux/bpf.h |    1 +
 kernel/bpf/verifier.c    |    2 ++
 net/core/filter.c        |   71 ++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 73 insertions(+), 2 deletions(-)

Comments

Jiri Pirko March 27, 2015, 6:38 a.m. UTC | #1
Fri, Mar 27, 2015 at 03:53:57AM CET, ast@plumgrid.com wrote:
>existing TC action 'pedit' can munge any bits of the packet.
>Generalize it for use in bpf programs attached as cls_bpf and act_bpf via
>bpf_skb_store_bytes() helper function.
>
>Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>

This looks fine to me. Great stuff.

Reviewed-by: Jiri Pirko <jiri@resnulli.us>
Daniel Borkmann March 27, 2015, 10:42 a.m. UTC | #2
On 03/27/2015 03:53 AM, Alexei Starovoitov wrote:
> existing TC action 'pedit' can munge any bits of the packet.
> Generalize it for use in bpf programs attached as cls_bpf and act_bpf via
> bpf_skb_store_bytes() helper function.
>
> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>

I like it.

> pedit is limited to 32-bit masked rewrites. Here let it be flexible.
>
> ptr = skb_header_pointer(skb, offset, len, buf);
> memcpy(ptr, from, len);
> if (ptr == buf)
>    skb_store_bits(skb, offset, ptr, len);
>
> ^^ logic is the same as in pedit.
> shifts, mask, invert style of rewrite is easily done by the program.
> Just like arbitrary parsing of the packet and applying rewrites on demand.
...
> +static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
> +{
> +	struct sk_buff *skb = (struct sk_buff *) (long) r1;
> +	unsigned int offset = (unsigned int) r2;
> +	void *from = (void *) (long) r3;
> +	unsigned int len = (unsigned int) r4;
> +	char buf[16];
> +	void *ptr;
> +
> +	/* bpf verifier guarantees that:
> +	 * 'from' pointer points to bpf program stack
> +	 * 'len' bytes of it were initialized
> +	 * 'len' > 0
> +	 * 'skb' is a valid pointer to 'struct sk_buff'
> +	 *
> +	 * so check for invalid 'offset' and too large 'len'
> +	 */
> +	if (offset > 0xffff || len > sizeof(buf))
> +		return -EFAULT;

Could you elaborate on the hard-coded 0xffff? Hm, perhaps a u16 would be
better, or do you see any issues with wrong widening?

This check should probably also be unlikely().

Ok, the sizeof(buf) could still be increased in future if truly necessary.

> +	if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len))
> +		return -EFAULT;
> +
> +	ptr = skb_header_pointer(skb, offset, len, buf);
> +	if (unlikely(!ptr))
> +		return -EFAULT;
> +
> +	skb_postpull_rcsum(skb, ptr, len);
> +
> +	memcpy(ptr, from, len);
> +
> +	if (ptr == buf)
> +		/* skb_store_bits cannot return -EFAULT here */
> +		skb_store_bits(skb, offset, ptr, len);
> +
> +	if (skb->ip_summed == CHECKSUM_COMPLETE)
> +		skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0));

For egress, I think that CHECKSUM_PARTIAL does not need to be dealt
with since the skb length doesn't change. Do you see an issue when
cls_bpf/act_bpf is attached to the ingress qdisc?

I was also wondering whether it's worth splitting off the csum correction
into a separate function, if the performance implications are not too big?

That way, an action could also be used to intentionally test corruption of
a part of the skb data together with the recent prandom function.

> +	return 0;
> +}
> +
> +const struct bpf_func_proto bpf_skb_store_bytes_proto = {
> +	.func		= bpf_skb_store_bytes,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_ANYTHING,
> +	.arg3_type	= ARG_PTR_TO_STACK,
> +	.arg4_type	= ARG_CONST_STACK_SIZE,
> +};
Alexei Starovoitov March 27, 2015, 4:01 p.m. UTC | #3
On 3/27/15 3:42 AM, Daniel Borkmann wrote:
> On 03/27/2015 03:53 AM, Alexei Starovoitov wrote:
>> existing TC action 'pedit' can munge any bits of the packet.
>> Generalize it for use in bpf programs attached as cls_bpf and act_bpf via
>> bpf_skb_store_bytes() helper function.
>>
>> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
>
> I like it.
>
>> pedit is limited to 32-bit masked rewrites. Here let it be flexible.
>>
>> ptr = skb_header_pointer(skb, offset, len, buf);
>> memcpy(ptr, from, len);
>> if (ptr == buf)
>>    skb_store_bits(skb, offset, ptr, len);
>>
>> ^^ logic is the same as in pedit.
>> shifts, mask, invert style of rewrite is easily done by the program.
>> Just like arbitrary parsing of the packet and applying rewrites on
>> demand.
> ...
>> +static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
>> +{
>> +    struct sk_buff *skb = (struct sk_buff *) (long) r1;
>> +    unsigned int offset = (unsigned int) r2;
>> +    void *from = (void *) (long) r3;
>> +    unsigned int len = (unsigned int) r4;
>> +    char buf[16];
>> +    void *ptr;
>> +
>> +    /* bpf verifier guarantees that:
>> +     * 'from' pointer points to bpf program stack
>> +     * 'len' bytes of it were initialized
>> +     * 'len' > 0
>> +     * 'skb' is a valid pointer to 'struct sk_buff'
>> +     *
>> +     * so check for invalid 'offset' and too large 'len'
>> +     */
>> +    if (offset > 0xffff || len > sizeof(buf))
>> +        return -EFAULT;
>
> Could you elaborate on the hard-coded 0xffff? Hm, perhaps a u16 would be
> better, or do you see any issues with wrong widening?

0xffff is the maximum packet size, of course.
Beyond basic sanity, the two conditions above also rule out overflow
of offset+len automatically.
u16 won't work, since all the following functions take 'int' or
'unsigned int'. These checks are done first to make sure there are no
wraparounds or other subtleties, especially since skb_copy_bits is quite
complex inside.

> This check should probably also be unlikely().

I thought about it as well, but decided against it, since we don't
use likely/unlikely in skb_header_pointer, skb_copy_bits and others.
Better to be consistent.

> Ok, the sizeof(buf) could still be increased in future if truly necessary.

yes. correct.
I've decided to go small first and extend if necessary.

>> +    if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len))
>> +        return -EFAULT;
>> +
>> +    ptr = skb_header_pointer(skb, offset, len, buf);
>> +    if (unlikely(!ptr))
>> +        return -EFAULT;
>> +
>> +    skb_postpull_rcsum(skb, ptr, len);
>> +
>> +    memcpy(ptr, from, len);
>> +
>> +    if (ptr == buf)
>> +        /* skb_store_bits cannot return -EFAULT here */
>> +        skb_store_bits(skb, offset, ptr, len);
>> +
>> +    if (skb->ip_summed == CHECKSUM_COMPLETE)
>> +        skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0));
>
> For egress, I think that CHECKSUM_PARTIAL does not need to be dealt
> with since the skb length doesn't change. Do you see an issue when
> cls_bpf/act_bpf is attached to the ingress qdisc?

Well, this patch is the packet writer only.
The checksum helpers and support for CHECKSUM_PARTIAL (similar to
TP_STATUS_CSUMNOTREADY) are coming in future patches.
They should be independent; otherwise this simple writer function
would need to special-case different offsets and do tons of other
checks. Keep-it-simple principle.

> I was also wondering whether it's worth splitting off the csum correction
> into a separate function, if the performance implications are not too big?

Yep, performance will suffer if we split it. Better to leave it as-is.

> That way, an action could also be used to intentionally test corruption of
> a part of the skb data together with the recent prandom function.

This writer can do that already, but it keeps skb->csum correct.
If your suggestion is to test corruption of skb->csum, then I don't
see why we would want that.

Daniel Borkmann March 28, 2015, 12:14 a.m. UTC | #4
On 03/27/2015 03:53 AM, Alexei Starovoitov wrote:
> existing TC action 'pedit' can munge any bits of the packet.
> Generalize it for use in bpf programs attached as cls_bpf and act_bpf via
> bpf_skb_store_bytes() helper function.
>
> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
David Miller March 29, 2015, 8:27 p.m. UTC | #5
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 26 Mar 2015 19:53:57 -0700

> existing TC action 'pedit' can munge any bits of the packet.
> Generalize it for use in bpf programs attached as cls_bpf and act_bpf via
> bpf_skb_store_bytes() helper function.
> 
> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>

Applied, thanks Alexei.
Jamal Hadi Salim March 30, 2015, 12:52 a.m. UTC | #6
On 03/26/15 22:53, Alexei Starovoitov wrote:
> existing TC action 'pedit' can munge any bits of the packet.
> Generalize it for use in bpf programs attached as cls_bpf and act_bpf via
> bpf_skb_store_bytes() helper function.
>
> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
> ---
>
> pedit is limited to 32-bit masked rewrites. Here let it be flexible.
>
> ptr = skb_header_pointer(skb, offset, len, buf);
> memcpy(ptr, from, len);
> if (ptr == buf)
>    skb_store_bits(skb, offset, ptr, len);
>
> ^^ logic is the same as in pedit.
> shifts, mask, invert style of rewrite is easily done by the program.
> Just like arbitrary parsing of the packet and applying rewrites on demand.
>

Alexei/Daniel - I am backlogged on email; however, I didn't quite follow:
Is there another patch to pedit that helps achieve the above?

cheers,
jamal


Alexei Starovoitov March 30, 2015, 1:18 a.m. UTC | #7
On 3/29/15 5:52 PM, Jamal Hadi Salim wrote:
> On 03/26/15 22:53, Alexei Starovoitov wrote:
>> existing TC action 'pedit' can munge any bits of the packet.
>> Generalize it for use in bpf programs attached as cls_bpf and act_bpf via
>> bpf_skb_store_bytes() helper function.
>>
>> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
>> ---
>>
>> pedit is limited to 32-bit masked rewrites. Here let it be flexible.
>>
>> ptr = skb_header_pointer(skb, offset, len, buf);
>> memcpy(ptr, from, len);
>> if (ptr == buf)
>>    skb_store_bits(skb, offset, ptr, len);
>>
>> ^^ logic is the same as in pedit.
>> shifts, mask, invert style of rewrite is easily done by the program.
>> Just like arbitrary parsing of the packet and applying rewrites on
>> demand.
>>
>
> Alexei/Daniel - I am backlogged on email; however, I didn't quite follow:
> Is there another patch to pedit that helps achieve the above?

Not really. If you meant adding an 'array of bytes' attribute to pedit,
it won't be sufficient from the program's point of view. Programs decide
what and when to write, whereas pedit is static.
I'm working on a csum helper patch and a set of examples that will
demonstrate the possibilities. I believe Daniel is preparing examples
for different use cases as well.

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 280a315de8d6..d5cda067115a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -59,6 +59,7 @@  enum bpf_arg_type {
 	ARG_PTR_TO_STACK,	/* any pointer to eBPF program stack */
 	ARG_CONST_STACK_SIZE,	/* number of bytes accessed from stack */
 
+	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 27dc4ec58840..74aab6e0d964 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -168,6 +168,7 @@  enum bpf_func_id {
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */
 	BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */
+	BPF_FUNC_skb_store_bytes, /* int skb_store_bytes(skb, offset, from, len) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0e714f799ec0..630a7bac1e51 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -773,6 +773,8 @@  static int check_func_arg(struct verifier_env *env, u32 regno,
 		expected_type = CONST_IMM;
 	} else if (arg_type == ARG_CONST_MAP_PTR) {
 		expected_type = CONST_PTR_TO_MAP;
+	} else if (arg_type == ARG_PTR_TO_CTX) {
+		expected_type = PTR_TO_CTX;
 	} else {
 		verbose("unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
diff --git a/net/core/filter.c b/net/core/filter.c
index 32f43c59908c..444a07e4f68d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1175,6 +1175,56 @@  int sk_attach_bpf(u32 ufd, struct sock *sk)
 	return 0;
 }
 
+static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	unsigned int offset = (unsigned int) r2;
+	void *from = (void *) (long) r3;
+	unsigned int len = (unsigned int) r4;
+	char buf[16];
+	void *ptr;
+
+	/* bpf verifier guarantees that:
+	 * 'from' pointer points to bpf program stack
+	 * 'len' bytes of it were initialized
+	 * 'len' > 0
+	 * 'skb' is a valid pointer to 'struct sk_buff'
+	 *
+	 * so check for invalid 'offset' and too large 'len'
+	 */
+	if (offset > 0xffff || len > sizeof(buf))
+		return -EFAULT;
+
+	if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len))
+		return -EFAULT;
+
+	ptr = skb_header_pointer(skb, offset, len, buf);
+	if (unlikely(!ptr))
+		return -EFAULT;
+
+	skb_postpull_rcsum(skb, ptr, len);
+
+	memcpy(ptr, from, len);
+
+	if (ptr == buf)
+		/* skb_store_bits cannot return -EFAULT here */
+		skb_store_bits(skb, offset, ptr, len);
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0));
+	return 0;
+}
+
+const struct bpf_func_proto bpf_skb_store_bytes_proto = {
+	.func		= bpf_skb_store_bytes,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_STACK,
+	.arg4_type	= ARG_CONST_STACK_SIZE,
+};
+
 static const struct bpf_func_proto *
 sk_filter_func_proto(enum bpf_func_id func_id)
 {
@@ -1194,6 +1244,17 @@  sk_filter_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+tc_cls_act_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_store_bytes:
+		return &bpf_skb_store_bytes_proto;
+	default:
+		return sk_filter_func_proto(func_id);
+	}
+}
+
 static bool sk_filter_is_valid_access(int off, int size,
 				      enum bpf_access_type type)
 {
@@ -1270,18 +1331,24 @@  static const struct bpf_verifier_ops sk_filter_ops = {
 	.convert_ctx_access = sk_filter_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops tc_cls_act_ops = {
+	.get_func_proto = tc_cls_act_func_proto,
+	.is_valid_access = sk_filter_is_valid_access,
+	.convert_ctx_access = sk_filter_convert_ctx_access,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops = &sk_filter_ops,
 	.type = BPF_PROG_TYPE_SOCKET_FILTER,
 };
 
 static struct bpf_prog_type_list sched_cls_type __read_mostly = {
-	.ops = &sk_filter_ops,
+	.ops = &tc_cls_act_ops,
 	.type = BPF_PROG_TYPE_SCHED_CLS,
 };
 
 static struct bpf_prog_type_list sched_act_type __read_mostly = {
-	.ops = &sk_filter_ops,
+	.ops = &tc_cls_act_ops,
 	.type = BPF_PROG_TYPE_SCHED_ACT,
 };