
[net-next] bpf: fix cb access in socket filter programs

Message ID: 1444184292-17500-1-git-send-email-ast@plumgrid.com
State: Changes Requested, archived
Delegated to: David Miller

Commit Message

Alexei Starovoitov Oct. 7, 2015, 2:18 a.m. UTC
eBPF socket filter programs may see junk in the 'u32 cb[5]' area,
since it could have been used by protocol layers earlier.

On the receive path, af_packet sees a clean skb->cb.
On the xmit path, dev_queue_xmit_nit() delivers a cloned skb, so we can
conditionally clean the 20 bytes of skb->cb that could be used by the program.
For programs attached to TCP/UDP sockets we need to save/restore
these 20 bytes, since they are used by the protocol layers.

Long term we may move this bpf cb area to a per-cpu scratch, but that
requires adding new 'per-cpu load/store' instructions,
so it is not suitable as a short-term fix.

Fixes: d691f9e8d440 ("bpf: allow programs to write to certain skb fields")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
Since eBPF is root-only, the impact of the bug is low, and
the fix can stay in net-next.

As an improvement for the TX part of the fix, we could introduce a flag
for skb_clone()->__skb_clone()->__copy_skb_header() that
zeroes the 48 bytes of cb instead of copying them, and use it in
dev_queue_xmit_nit() instead?

For those interested in the assembler code: gcc 4.7 generates
ugly code for this memcpy/memset, but gcc 4.9 looks great.
Thankfully, both move the unlikely paths out of the way.

 include/linux/bpf.h    |    6 +++---
 include/linux/filter.h |   39 +++++++++++++++++++++++++++++++++++----
 kernel/bpf/verifier.c  |    2 +-
 net/core/filter.c      |   12 +++++++-----
 net/packet/af_packet.c |   10 +++++-----
 5 files changed, 51 insertions(+), 18 deletions(-)
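
For readers less familiar with the program side, the kind of filter whose
cb[] loads and stores set the new cb_access bit could look roughly like the
hypothetical sketch below (illustration only, not part of this patch; the
SEC() macro and the clang -O2 -target bpf build follow the samples/bpf
conventions):

/* Hypothetical eBPF socket filter using the __sk_buff cb[] scratch area. */
#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

SEC("socket")
int cb_user(struct __sk_buff *skb)
{
	/* Without the fix, cb[0] may hold leftover protocol-layer state
	 * here instead of a well-defined value.
	 */
	if (skb->cb[0] != 0)
		return 0;		/* drop */

	skb->cb[0] = skb->len;		/* private scratch for this program */
	return skb->len;		/* accept the whole packet */
}

char _license[] SEC("license") = "GPL";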

Comments

Daniel Borkmann Oct. 7, 2015, 9:39 a.m. UTC | #1
On 10/07/2015 04:18 AM, Alexei Starovoitov wrote:
> eBPF socket filter programs may see junk in the 'u32 cb[5]' area,
> since it could have been used by protocol layers earlier.
>
> On the receive path, af_packet sees a clean skb->cb.
> On the xmit path, dev_queue_xmit_nit() delivers a cloned skb, so we can
> conditionally clean the 20 bytes of skb->cb that could be used by the program.

Having slept on this for a night, I think this assumption is not
always correct :/, more below ...

> For programs attached to TCP/UDP sockets we need to save/restore
> these 20 bytes, since they are used by the protocol layers.
...
> +static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
> +				       struct sk_buff *skb)
> +{
> +	u8 *cb_data = qdisc_skb_cb(skb)->data;
> +	u8 saved_cb[QDISC_CB_PRIV_LEN];
> +	u32 res;
> +
> +	BUILD_BUG_ON(FIELD_SIZEOF(struct __sk_buff, cb) !=
> +		     QDISC_CB_PRIV_LEN);
> +
> +	if (unlikely(prog->cb_access)) {
> +		memcpy(saved_cb, cb_data, sizeof(saved_cb));
> +		memset(cb_data, 0, sizeof(saved_cb));
> +	}
> +
> +	res = BPF_PROG_RUN(prog, skb);
> +
> +	if (unlikely(prog->cb_access))
> +		memcpy(cb_data, saved_cb, sizeof(saved_cb));
> +
> +	return res;
> +}
> +
> +static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
> +					struct sk_buff *skb)
> +{
> +	u8 *cb_data = qdisc_skb_cb(skb)->data;
> +
> +	if (unlikely(prog->cb_access) && skb->pkt_type == PACKET_OUTGOING)
> +		memset(cb_data, 0, QDISC_CB_PRIV_LEN);
> +	return BPF_PROG_RUN(prog, skb);
> +}
> +
>   static inline unsigned int bpf_prog_size(unsigned int proglen)
>   {
>   	return max(sizeof(struct bpf_prog),

bpf_prog_run_clear_cb() wouldn't work for dev_forward_skb(), as
skb->pkt_type is then scrubbed to PACKET_HOST, so on the
receive path AF_PACKET might not always see a clean skb->cb[]
as assumed ... I think the skb->pkt_type part needs to be
dropped, no?

Thanks,
Daniel
Daniel Borkmann Oct. 7, 2015, 1:09 p.m. UTC | #2
On 10/07/2015 11:39 AM, Daniel Borkmann wrote:
> On 10/07/2015 04:18 AM, Alexei Starovoitov wrote:
>> eBPF socket filter programs may see junk in the 'u32 cb[5]' area,
>> since it could have been used by protocol layers earlier.
>>
>> On the receive path, af_packet sees a clean skb->cb.
>> On the xmit path, dev_queue_xmit_nit() delivers a cloned skb, so we can
>> conditionally clean the 20 bytes of skb->cb that could be used by the program.
>
> Having slept on this for a night, I think this assumption is not
> always correct :/, more below ...
>
>> For programs attached to TCP/UDP sockets we need to save/restore
>> these 20 bytes, since they are used by the protocol layers.
> ...
>> +static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
>> +                       struct sk_buff *skb)
>> +{
>> +    u8 *cb_data = qdisc_skb_cb(skb)->data;
>> +    u8 saved_cb[QDISC_CB_PRIV_LEN];
>> +    u32 res;
>> +
>> +    BUILD_BUG_ON(FIELD_SIZEOF(struct __sk_buff, cb) !=
>> +             QDISC_CB_PRIV_LEN);
>> +
>> +    if (unlikely(prog->cb_access)) {
>> +        memcpy(saved_cb, cb_data, sizeof(saved_cb));
>> +        memset(cb_data, 0, sizeof(saved_cb));
>> +    }
>> +
>> +    res = BPF_PROG_RUN(prog, skb);
>> +
>> +    if (unlikely(prog->cb_access))
>> +        memcpy(cb_data, saved_cb, sizeof(saved_cb));
>> +
>> +    return res;
>> +}
>> +
>> +static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
>> +                    struct sk_buff *skb)
>> +{
>> +    u8 *cb_data = qdisc_skb_cb(skb)->data;
>> +
>> +    if (unlikely(prog->cb_access) && skb->pkt_type == PACKET_OUTGOING)
>> +        memset(cb_data, 0, QDISC_CB_PRIV_LEN);
>> +    return BPF_PROG_RUN(prog, skb);
>> +}
>> +
>>   static inline unsigned int bpf_prog_size(unsigned int proglen)
>>   {
>>       return max(sizeof(struct bpf_prog),
>
> bpf_prog_run_clear_cb() wouldn't work for dev_forward_skb(), as
> skb->pkt_type is then scrubbed to PACKET_HOST, so on the
> receive path AF_PACKET might not always see a clean skb->cb[]
> as assumed ... I think the skb->pkt_type part needs to be
> dropped, no?

Thinking a bit more about this part, which only covers
fanout_demux_bpf() and run_filter(), i.e. AF_PACKET only, this
logic still needs to be slightly different:

You can currently have eBPF on the packet fanout as a demux and, behind
that, eBPF on the actual packet socket. So, for whatever reason, the fanout
program could transfer some state to the socket program along the way,
which would break when cb[] is cleared as-is via bpf_prog_run_clear_cb().

So we'd need to make sure to clear this only once: either in front
of the fanout, or, when none is present, in front of the socket filter.

Thanks,
Daniel
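
As an aside, the fanout-plus-socket-filter stacking Daniel describes could be
set up from user space roughly as in the hypothetical sketch below (not from
this thread; load_ebpf_prog() stands in for a bpf(BPF_PROG_LOAD, ...) wrapper,
and error handling is omitted):

/* Hypothetical userspace sketch of stacking an eBPF fanout demux with an
 * eBPF per-socket filter.  load_ebpf_prog() is an assumed helper that
 * returns a program fd.
 */
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

#ifndef SO_ATTACH_BPF
#define SO_ATTACH_BPF		50	/* since Linux 3.19 */
#endif
#ifndef PACKET_FANOUT_EBPF
#define PACKET_FANOUT_EBPF	7
#define PACKET_FANOUT_DATA	22
#endif

extern int load_ebpf_prog(const char *name);	/* assumed helper */

int setup_fanout_plus_filter(void)
{
	int sock = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int demux_fd = load_ebpf_prog("fanout_demux");	/* program #1 */
	int filter_fd = load_ebpf_prog("sock_filter");	/* program #2 */
	int fanout = (PACKET_FANOUT_EBPF << 16) | 1;	/* type | group id */

	/* Program #1 steers packets across the fanout group ... */
	setsockopt(sock, SOL_PACKET, PACKET_FANOUT, &fanout, sizeof(fanout));
	setsockopt(sock, SOL_PACKET, PACKET_FANOUT_DATA, &demux_fd,
		   sizeof(demux_fd));

	/* ... and program #2 then runs as the per-socket filter.  Anything
	 * program #1 left in skb->cb[] would be visible here if cb[] were
	 * cleared only in front of the fanout.
	 */
	setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &filter_fd,
		   sizeof(filter_fd));

	return sock;
}
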
Alexei Starovoitov Oct. 7, 2015, 4:53 p.m. UTC | #3
On 10/7/15 6:09 AM, Daniel Borkmann wrote:
>> bpf_prog_run_clear_cb() wouldn't work for dev_forward_skb(), as
>> skb->pkt_type is then scrubbed to PACKET_HOST, so on the
>> receive path AF_PACKET might not always see a clean skb->cb[]
>> as assumed ... I think the skb->pkt_type part needs to be
>> dropped, no?

Right, will respin.

> Thinking a bit more about this part, which only covers
> fanout_demux_bpf() and run_filter(), i.e. AF_PACKET only, this
> logic still needs to be slightly different:
>
> You can currently have eBPF on the packet fanout as a demux and, behind
> that, eBPF on the actual packet socket. So, for whatever reason, the fanout
> program could transfer some state to the socket program along the way,
> which would break when cb[] is cleared as-is via bpf_prog_run_clear_cb().
>
> So we'd need to make sure to clear this only once: either in front
> of the fanout, or, when none is present, in front of the socket filter.

No, that would be an anti-feature. User space shouldn't rely on such
things. If somebody wants to pass data between two disjoint
programs (not called via tail_call), they should be using maps or
some future per-cpu scratch area that guarantees such semantics.
cb, as part of the skb, is not suitable for that, since it would limit
what the kernel can do in between.
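
For completeness, the map-based alternative suggested here could look roughly
like the hypothetical sketch below (not from this thread; the map and helper
declarations follow the samples/bpf conventions of the time):

/* Hypothetical sketch: two eBPF programs sharing state through a BPF
 * array map instead of smuggling it through skb->cb[].
 */
#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

static void *(*bpf_map_lookup_elem)(void *map, void *key) =
	(void *) BPF_FUNC_map_lookup_elem;

struct bpf_map_def {
	unsigned int type;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
};

struct bpf_map_def SEC("maps") shared_state = {
	.type        = BPF_MAP_TYPE_ARRAY,
	.key_size    = sizeof(int),
	.value_size  = sizeof(long),
	.max_entries = 1,
};

SEC("socket/producer")
int prog_a(struct __sk_buff *skb)
{
	int key = 0;
	long *val = bpf_map_lookup_elem(&shared_state, &key);

	if (val)
		*val = skb->len;	/* state for the second program */
	return skb->len;
}

SEC("socket/consumer")
int prog_b(struct __sk_buff *skb)
{
	int key = 0;
	long *val = bpf_map_lookup_elem(&shared_state, &key);

	/* Drop if the producer has not recorded anything yet. */
	return (val && *val) ? skb->len : 0;
}

char _license[] SEC("license") = "GPL";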


Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c915a6b54570..19b8a2081f88 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -100,6 +100,8 @@  enum bpf_access_type {
 	BPF_WRITE = 2
 };
 
+struct bpf_prog;
+
 struct bpf_verifier_ops {
 	/* return eBPF function prototype for verification */
 	const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
@@ -111,7 +113,7 @@  struct bpf_verifier_ops {
 
 	u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg,
 				  int src_reg, int ctx_off,
-				  struct bpf_insn *insn);
+				  struct bpf_insn *insn, struct bpf_prog *prog);
 };
 
 struct bpf_prog_type_list {
@@ -120,8 +122,6 @@  struct bpf_prog_type_list {
 	enum bpf_prog_type type;
 };
 
-struct bpf_prog;
-
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1bbce14bcf17..33db7b801c90 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -13,6 +13,7 @@ 
 #include <linux/printk.h>
 #include <linux/workqueue.h>
 #include <linux/sched.h>
+#include <net/sch_generic.h>
 
 #include <asm/cacheflush.h>
 
@@ -302,10 +303,6 @@  struct bpf_prog_aux;
 	bpf_size;						\
 })
 
-/* Macro to invoke filter function. */
-#define SK_RUN_FILTER(filter, ctx) \
-	(*filter->prog->bpf_func)(ctx, filter->prog->insnsi)
-
 #ifdef CONFIG_COMPAT
 /* A struct sock_filter is architecture independent. */
 struct compat_sock_fprog {
@@ -329,6 +326,7 @@  struct bpf_prog {
 	kmemcheck_bitfield_begin(meta);
 	u16			jited:1,	/* Is our filter JIT'ed? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
+				cb_access:1,	/* Is control block accessed? */
 				dst_needed:1;	/* Do we need dst entry? */
 	kmemcheck_bitfield_end(meta);
 	u32			len;		/* Number of filter blocks */
@@ -352,6 +350,39 @@  struct sk_filter {
 
 #define BPF_PROG_RUN(filter, ctx)  (*filter->bpf_func)(ctx, filter->insnsi)
 
+static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
+				       struct sk_buff *skb)
+{
+	u8 *cb_data = qdisc_skb_cb(skb)->data;
+	u8 saved_cb[QDISC_CB_PRIV_LEN];
+	u32 res;
+
+	BUILD_BUG_ON(FIELD_SIZEOF(struct __sk_buff, cb) !=
+		     QDISC_CB_PRIV_LEN);
+
+	if (unlikely(prog->cb_access)) {
+		memcpy(saved_cb, cb_data, sizeof(saved_cb));
+		memset(cb_data, 0, sizeof(saved_cb));
+	}
+
+	res = BPF_PROG_RUN(prog, skb);
+
+	if (unlikely(prog->cb_access))
+		memcpy(cb_data, saved_cb, sizeof(saved_cb));
+
+	return res;
+}
+
+static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
+					struct sk_buff *skb)
+{
+	u8 *cb_data = qdisc_skb_cb(skb)->data;
+
+	if (unlikely(prog->cb_access) && skb->pkt_type == PACKET_OUTGOING)
+		memset(cb_data, 0, QDISC_CB_PRIV_LEN);
+	return BPF_PROG_RUN(prog, skb);
+}
+
 static inline unsigned int bpf_prog_size(unsigned int proglen)
 {
 	return max(sizeof(struct bpf_prog),
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b074b23000d6..f8da034c2258 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2024,7 +2024,7 @@  static int convert_ctx_accesses(struct verifier_env *env)
 
 		cnt = env->prog->aux->ops->
 			convert_ctx_access(type, insn->dst_reg, insn->src_reg,
-					   insn->off, insn_buf);
+					   insn->off, insn_buf, env->prog);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
diff --git a/net/core/filter.c b/net/core/filter.c
index da3e5357f138..cbaa23c3fb30 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -56,10 +56,10 @@ 
  *	@sk: sock associated with &sk_buff
  *	@skb: buffer to filter
  *
- * Run the filter code and then cut skb->data to correct size returned by
- * SK_RUN_FILTER. If pkt_len is 0 we toss packet. If skb->len is smaller
+ * Run the eBPF program and then cut skb->data to correct size returned by
+ * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
  * than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to SK_RUN_FILTER. It returns 0 if the packet should
+ * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
  * be accepted or -EPERM if the packet should be tossed.
  *
  */
@@ -83,7 +83,7 @@  int sk_filter(struct sock *sk, struct sk_buff *skb)
 	rcu_read_lock();
 	filter = rcu_dereference(sk->sk_filter);
 	if (filter) {
-		unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
+		unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
 
 		err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
 	}
@@ -1740,7 +1740,8 @@  static bool tc_cls_act_is_valid_access(int off, int size,
 
 static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 				      int src_reg, int ctx_off,
-				      struct bpf_insn *insn_buf)
+				      struct bpf_insn *insn_buf,
+				      struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
@@ -1831,6 +1832,7 @@  static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 		offsetof(struct __sk_buff, cb[4]):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
 
+		prog->cb_access = 1;
 		ctx_off -= offsetof(struct __sk_buff, cb[0]);
 		ctx_off += offsetof(struct sk_buff, cb);
 		ctx_off += offsetof(struct qdisc_skb_cb, data);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 81c900fbc4a4..104910f7d1fb 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1423,7 +1423,7 @@  static unsigned int fanout_demux_bpf(struct packet_fanout *f,
 	rcu_read_lock();
 	prog = rcu_dereference(f->bpf_prog);
 	if (prog)
-		ret = BPF_PROG_RUN(prog, skb) % num;
+		ret = bpf_prog_run_clear_cb(prog, skb) % num;
 	rcu_read_unlock();
 
 	return ret;
@@ -1939,16 +1939,16 @@  out_free:
 	return err;
 }
 
-static unsigned int run_filter(const struct sk_buff *skb,
-				      const struct sock *sk,
-				      unsigned int res)
+static unsigned int run_filter(struct sk_buff *skb,
+			       const struct sock *sk,
+			       unsigned int res)
 {
 	struct sk_filter *filter;
 
 	rcu_read_lock();
 	filter = rcu_dereference(sk->sk_filter);
 	if (filter != NULL)
-		res = SK_RUN_FILTER(filter, skb);
+		res = bpf_prog_run_clear_cb(filter->prog, skb);
 	rcu_read_unlock();
 
 	return res;