[v3,net-next,1/3] net: flow_dissector: avoid multiple calls in BPF

Message ID: 1400093480-498-1-git-send-email-chema@google.com
State: Changes Requested, archived
Delegated to: David Miller

Commit Message

Chema Gonzalez May 14, 2014, 6:51 p.m. UTC
We want multiple calls to __skb_get_poff() in the same filter to cause
only one invocation of the flow dissector. In order to reuse the result
of the flow dissector invocation (skb_flow_dissect()), we add a flow_keys
variable on the eBPF runner stack (the __sk_run_filter() function), and
pass it as an argument to __skb_get_poff(). __skb_get_poff() initializes
the variable the very first time it is called and reuses the result in
any further invocation.

We also add handy helpers to reset a flow_keys struct and to check
whether it has been initialized.

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/skbuff.h    |  4 +++-
 net/core/filter.c         | 14 ++++++++++++--
 net/core/flow_dissector.c | 28 ++++++++++++++++++++++------
 3 files changed, 37 insertions(+), 9 deletions(-)

Comments

Alexei Starovoitov May 14, 2014, 8:05 p.m. UTC | #1
On Wed, May 14, 2014 at 11:51 AM, Chema Gonzalez <chema@google.com> wrote:
> We want multiple calls to __skb_get_poff() in the same filter to cause
> only one invocation of the flow dissector. In order to reuse the result
> of the flow dissector invocation (skb_flow_dissect()), we add a flow_keys
> variable on the eBPF runner stack (the __sk_run_filter() function), and
> pass it as an argument to __skb_get_poff(). __skb_get_poff() initializes
> the variable the very first time it is called and reuses the result in
> any further invocation.
>
> We also add handy helpers to reset a flow_keys struct and to check
> whether it has been initialized.
>
> Signed-off-by: Chema Gonzalez <chema@google.com>
> ---
>  include/linux/skbuff.h    |  4 +++-
>  net/core/filter.c         | 14 ++++++++++++--
>  net/core/flow_dissector.c | 28 ++++++++++++++++++++++------
>  3 files changed, 37 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 7a9beeb..858d7af 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3065,7 +3065,9 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
>
>  int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
>
> -u32 __skb_get_poff(const struct sk_buff *skb);
> +void __flow_keys_reset(struct flow_keys *flow);
> +bool __flow_keys_inited(struct flow_keys *flow);
> +u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow);
>
>  /**
>   * skb_head_is_locked - Determine if the skb->head is locked down
> diff --git a/net/core/filter.c b/net/core/filter.c
> index c442a0d..b71948b 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -66,6 +66,11 @@
>  #define CTX    regs[BPF_REG_CTX]
>  #define K      insn->imm
>
> +struct sk_run_filter_ctx {
> +       struct sk_buff *skb;
> +       struct flow_keys *flow;
> +};
> +
>  /* No hurry in this branch
>   *
>   * Exported for the bpf jit load helper.
> @@ -252,12 +257,15 @@ unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
>         };
>         void *ptr;
>         int off;
> +       struct flow_keys flow;
> +       struct sk_run_filter_ctx context = { ctx, &flow };
>
>  #define CONT    ({ insn++; goto select_insn; })
>  #define CONT_JMP ({ insn++; goto select_insn; })
>
> +       memset(&flow, 0, sizeof(flow));
>         FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
> -       ARG1 = (u64) (unsigned long) ctx;
> +       ARG1 = (u64) (unsigned long) &context;

I have a feeling that you didn't test this. Even basic "tcpdump port 22"
will be broken, let alone the BPF testsuite. It also breaks seccomp, the
new JIT, and the tracing filter patches I posted yesterday.
sk_run_filter() is a generic interpreter that should be suitable for
sockets, seccomp, tracing, etc. Adding a use-case-specific data
structure to the interpreter is not desirable.
Doing memset(&flow, 0, ...) on every invocation is not performance
friendly either. The majority of filters will suffer.

You can do the same without touching sk_run_filter().
For example, you can tweak sk_convert_filter() to pass
R10 (the frame pointer) as arg4 to the helper functions
(__skb_get_pay_offset() and others),
then inside those functions cast 'arg4 - appropriate offset'
to struct flow_keys and use it.
BPF was extended from 16 spill/fill-only slots to generic
stack space exactly for this type of use case.
Key point: you can add pretty much any feature to classic BPF
by tweaking the converter from classic to internal, without changing
the interpreter and causing a trainwreck for non-socket use cases.
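
Roughly, the helper side of that could look like the sketch below (a
sketch only: the stack offset and the "r4 carries the frame pointer"
convention are assumptions made up to show the shape of the idea, and the
sk_convert_filter() side is left out):

/* Hypothetical: sk_convert_filter() reserves sizeof(struct flow_keys)
 * bytes of BPF stack and copies R10 (FP) into R4 before emitting the
 * call, so the helper can find the cached dissection from the frame.
 */
#define BPF_FLOW_KEYS_OFF	sizeof(struct flow_keys) /* assumed offset below FP */

static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 fp, u64 r5)
{
	/* fp (arg4) would carry the eBPF frame pointer in this scheme */
	struct flow_keys *flow = (struct flow_keys *)
			(unsigned long) (fp - BPF_FLOW_KEYS_OFF);

	return __skb_get_poff((struct sk_buff *)(unsigned long) ctx, flow);
}

The converter would also need some way to mark that slot as not yet
dissected on program entry; that is the work the per-run memset does in
this patch.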

The idea itself, I think, is quite interesting. I only dislike its
implementation.

>         /* Register for user BPF programs need to be reset first. */
>         regs[BPF_REG_A] = 0;
> @@ -602,7 +610,9 @@ static unsigned int pkt_type_offset(void)
>
>  static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
>  {
> -       return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
> +       struct sk_run_filter_ctx *context = (struct sk_run_filter_ctx *)
> +                       (unsigned long) ctx;
> +       return __skb_get_poff(context->skb, context->flow);
>  }
>
>  static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index 107ed12..4d431f0 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -270,21 +270,37 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
>  }
>  EXPORT_SYMBOL(__skb_tx_hash);
>
> +/* __flow_keys_reset() zeroes a flow_keys struct */
> +void __flow_keys_reset(struct flow_keys *flow)
> +{
> +       memset(flow, 0, sizeof(struct flow_keys));
> +}
> +
> +/* __flow_keys_inited() checks whether a flow_keys struct is initialized */
> +bool __flow_keys_inited(struct flow_keys *flow)
> +{
> +       struct flow_keys zero_flow;
> +
> +       __flow_keys_reset(&zero_flow);
> +       return memcmp(flow, &zero_flow, sizeof(struct flow_keys));
> +}
> +
>  /* __skb_get_poff() returns the offset to the payload as far as it could
>   * be dissected. The main user is currently BPF, so that we can dynamically
>   * truncate packets without needing to push actual payload to the user
>   * space and can analyze headers only, instead.
>   */
> -u32 __skb_get_poff(const struct sk_buff *skb)
> +u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow)
>  {
> -       struct flow_keys keys;
>         u32 poff = 0;
>
> -       if (!skb_flow_dissect(skb, &keys))
> -               return 0;
> +       /* check whether the flow dissector has already been run */
> +       if (!__flow_keys_inited(flow))
> +               if (!skb_flow_dissect(skb, flow))
> +                       return 0;
>
> -       poff += keys.thoff;
> -       switch (keys.ip_proto) {
> +       poff += flow->thoff;
> +       switch (flow->ip_proto) {
>         case IPPROTO_TCP: {
>                 const struct tcphdr *tcph;
>                 struct tcphdr _tcph;
> --
> 1.9.1.423.g4596e3a
>
Chema Gonzalez May 14, 2014, 9:51 p.m. UTC | #2
Hi,

On Wed, May 14, 2014 at 1:05 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
> I have a feeling that you didn't test this. Even basic "tcpdump port 22"
> will be broken,
Not sure whether you understood the intended use. I'm not trying to
change how tcpdump expressions are compiled. The only way to access the
new features is through Daniel's bpf_asm tool.

I did test the code:

$ cat tools/net/ipv4_tcp_poff2.bpf
ldh [12]
jne #0x800, drop
ldb [23]
jneq #6, drop
ld poff
ld poff
ld poff
ld poff
ld toff
ld toff
ld toff
ld tproto
ld tproto
ld tproto
ret #-1
drop: ret #0
$ ./tools/net/bpf_asm tools/net/ipv4_tcp_poff2.bpf
16,40 0 0 12,21 0 13 2048,48 0 0 23,21 0 11 6,32 0 0 4294963252,32 0 0
4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963260,32 0
0 4294963260,32 0 0 4294963260,32 0 0 4294963264,32 0 0 4294963264,32
0 0 4294963264,6 0 0 4294967295,6 0 0 0,

And then, in a VM, I ran:

$ tcpdump -n -i eth0 -f "16,40 0 0 12,21 0 13 2048,48 0 0 23,21 0 11
6,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0
4294963252,32 0 0 4294963260,32 0 0 4294963260,32 0 0 4294963260,32 0
0 4294963264,32 0 0 4294963264,32 0 0 4294963264,6 0 0 4294967295,6 0
0 0,"

This tcpdump is github's tcpdump HEAD with
https://github.com/the-tcpdump-group/libpcap/pull/353.

Adding some debug printks let me see that the flow dissector was only
called for the first "ld poff":

[    7.003362] --------__skb_get_poff(): checking flow dissector: {0, 0, 0, 0, 0} inited returns 0
[    7.005618] --------__skb_get_poff(): after calling flow dissector: {-26957632, 23374016, 369113781, 34, 6}
[    7.006821] --------__skb_get_poff(): checking flow dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.008156] --------__skb_get_poff(): checking flow dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.009485] --------__skb_get_poff(): checking flow dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.010827] --------__skb_get_tra_offset(): checking flow dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.013630] --------__skb_get_tra_offset(): checking flow dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.015969] --------__skb_get_tra_offset(): checking flow dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.017357] --------__skb_get_tra_protocol(): checking flow dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.018749] --------__skb_get_tra_protocol(): checking flow dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.020137] --------__skb_get_tra_protocol(): checking flow dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1

> let alone the BPF testsuite. It also breaks seccomp, the new JIT,
> and the tracing filter patches I posted yesterday.
Note that the flow_keys field is only used by those functions that
actually call the flow dissector (originally __skb_get_poff(), and after
this set of patches also __skb_get_tra_offset() and
__skb_get_tra_protocol()). If a non-packet user is already calling any
of these functions, that is a problem in itself.

Sorry if I broke your patches from yesterday. I didn't see them. I'll
check them.

> sk_run_filter() is a generic interpreter that should be suitable for
> sockets, seccomp, tracing, etc. Adding a use-case-specific data
> structure to the interpreter is not desirable.
That's an interesting point, though you'd agree it already applies to
"ld poff" right now. There's a tradeoff here between making the BPF VM
as generic as possible and having to reimplement, in BPF VM code,
functions that already exist in the kernel (the flow dissector itself).
I think not having two flow dissectors in the kernel (one in C, one in
BPF VM code) is a win.

> Doing memset(&flow, 0, ...) on every invocation is not performance
> friendly either. The majority of filters will suffer.
Will fix. In retrospect, I should have just added a flow_keys_is_init
field instead of mapping {0, 0, {0}, 0, 0} to "not inited."
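
Something like this, roughly (illustrative only; the wrapper struct and
field names are made up here and are not part of the patch):

/* Hypothetical cache wrapper: an explicit flag avoids both the per-run
 * memset and the memcmp against an all-zero struct.
 */
struct flow_keys_cache {
	struct flow_keys keys;
	bool inited;		/* set after the first skb_flow_dissect() */
};

static inline bool flow_keys_cache_dissect(const struct sk_buff *skb,
					   struct flow_keys_cache *cache)
{
	/* run the flow dissector only once per filter invocation */
	if (!cache->inited) {
		if (!skb_flow_dissect(skb, &cache->keys))
			return false;
		cache->inited = true;
	}
	return true;
}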

> You can do the same without touching sk_run_filter().
> For example, you can tweak sk_convert_filter() to pass
> R10 (the frame pointer) as arg4 to the helper functions
> (__skb_get_pay_offset() and others),
> then inside those functions cast 'arg4 - appropriate offset'
> to struct flow_keys and use it.
> BPF was extended from 16 spill/fill-only slots to generic
> stack space exactly for this type of use case.
I tried several implementations, including the one you propose:

1. Add flow_keys to the skb CB (packet_skb_cb).
The problem here is that struct flow_keys (15 bytes) does not fit in
the space AF_PACKET leaves free in the 48-byte skb CB. I also tried to
create a small_flow_keys struct that only contains L3 info (nhoff/proto,
which is the minimum output you need from the flow dissector to be able
to get the rest without having to rewalk the packet), or maybe L3 and L4
(thoff/ip_proto). The problem is that the packet CB is completely full
already: packet_skb_cb is 4 bytes (origlen) + 20 bytes (sockaddr_ll),
and MAX_ADDR_LEN is 32, so the net/packet/af_packet.c:1822 check allows
4+20+32-8 = 48 bytes, which is already the full size of skb->cb (see the
size sketch after this list).

2. Add flow_keys using the stack in the 3 BPF filter runners
(original, eBPF, JIT).
  - (-) there are 6x BPF engines (1x non-JIT and 5x JIT)
  - (+) we can just do the non-JIT version, and let the JITs implement
it when they want
  - (+) very simple

3. Add FLOW, BPF_REG_FLOW, map it to a register, and then pass it as
R4. This is what you're proposing, IIUC. I found the implementation
way more complicated (in fact I gave up trying to make it not crash
;), but I guess this is my own taste. What really convinced me to
extend the context is that, if you think about it, both the skb and
the flow_keys are part of the context that the bpf_calls use to run.
If we pass flow_keys in R4 and we ever want to add more context
parameters to a call, that means using R5. And then what?
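
For reference, this is the option-1 arithmetic spelled out (the struct
below is a reconstruction from the sizes cited above, with illustrative
field names; the real definition and check live in net/packet/af_packet.c):

/* Roughly what AF_PACKET keeps in skb->cb[] (sizes as cited above) */
struct packet_skb_cb_like {
	unsigned int origlen;		/*  4 bytes */
	struct sockaddr_ll ll;		/* 20 bytes, address tail truncated */
};

/* The af_packet.c check reserves sizeof(packet_skb_cb) + MAX_ADDR_LEN - 8
 *   = 4 + 20 + 32 - 8 = 48 bytes,
 * i.e. the whole 48-byte skb->cb[], leaving no spare room for a
 * struct flow_keys (15 bytes) or even a reduced {thoff, ip_proto} cache.
 */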

> Key point: you can add pretty much any feature to classic BPF
> by tweaking the converter from classic to internal, without changing
> the interpreter and causing a trainwreck for non-socket use cases.
Again, the patch does not break existing functionality. We already
provide "ld poff" in recognition that packet processing is a first-class
citizen of the BPF VM. If you run a seccomp filter containing "ld poff"
right now, you're probably causing issues anyway.

> The idea itself, I think, is quite interesting. I only dislike its
> implementation.
Thanks a lot for the comments.

-Chema

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7a9beeb..858d7af 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3065,7 +3065,9 @@  bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
 
 int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
 
-u32 __skb_get_poff(const struct sk_buff *skb);
+void __flow_keys_reset(struct flow_keys *flow);
+bool __flow_keys_inited(struct flow_keys *flow);
+u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow);
 
 /**
  * skb_head_is_locked - Determine if the skb->head is locked down
diff --git a/net/core/filter.c b/net/core/filter.c
index c442a0d..b71948b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -66,6 +66,11 @@ 
 #define CTX	regs[BPF_REG_CTX]
 #define K	insn->imm
 
+struct sk_run_filter_ctx {
+	struct sk_buff *skb;
+	struct flow_keys *flow;
+};
+
 /* No hurry in this branch
  *
  * Exported for the bpf jit load helper.
@@ -252,12 +257,15 @@  unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
 	};
 	void *ptr;
 	int off;
+	struct flow_keys flow;
+	struct sk_run_filter_ctx context = { ctx, &flow };
 
 #define CONT	 ({ insn++; goto select_insn; })
 #define CONT_JMP ({ insn++; goto select_insn; })
 
+	memset(&flow, 0, sizeof(flow));
 	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
-	ARG1 = (u64) (unsigned long) ctx;
+	ARG1 = (u64) (unsigned long) &context;
 
 	/* Register for user BPF programs need to be reset first. */
 	regs[BPF_REG_A] = 0;
@@ -602,7 +610,9 @@  static unsigned int pkt_type_offset(void)
 
 static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
-	return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+	struct sk_run_filter_ctx *context = (struct sk_run_filter_ctx *)
+			(unsigned long) ctx;
+	return __skb_get_poff(context->skb, context->flow);
 }
 
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 107ed12..4d431f0 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -270,21 +270,37 @@  u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
 }
 EXPORT_SYMBOL(__skb_tx_hash);
 
+/* __flow_keys_reset() zeroes a flow_keys struct */
+void __flow_keys_reset(struct flow_keys *flow)
+{
+	memset(flow, 0, sizeof(struct flow_keys));
+}
+
+/* __flow_keys_inited() checks whether a flow_keys struct is initialized */
+bool __flow_keys_inited(struct flow_keys *flow)
+{
+	struct flow_keys zero_flow;
+
+	__flow_keys_reset(&zero_flow);
+	return memcmp(flow, &zero_flow, sizeof(struct flow_keys));
+}
+
 /* __skb_get_poff() returns the offset to the payload as far as it could
  * be dissected. The main user is currently BPF, so that we can dynamically
  * truncate packets without needing to push actual payload to the user
  * space and can analyze headers only, instead.
  */
-u32 __skb_get_poff(const struct sk_buff *skb)
+u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow)
 {
-	struct flow_keys keys;
 	u32 poff = 0;
 
-	if (!skb_flow_dissect(skb, &keys))
-		return 0;
+	/* check whether the flow dissector has already been run */
+	if (!__flow_keys_inited(flow))
+		if (!skb_flow_dissect(skb, flow))
+			return 0;
 
-	poff += keys.thoff;
-	switch (keys.ip_proto) {
+	poff += flow->thoff;
+	switch (flow->ip_proto) {
 	case IPPROTO_TCP: {
 		const struct tcphdr *tcph;
 		struct tcphdr _tcph;