diff mbox series

[bpf,1/3] bpf: sock_ops ctx access may stomp registers in corner case

Message ID 159595102637.30613.6373296730696919300.stgit@john-Precision-5820-Tower
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series Fix sock_ops field read splat | expand

Commit Message

John Fastabend July 28, 2020, 3:43 p.m. UTC
I had a sockmap program that after doing some refactoring started spewing
this splat at me:

[18610.807284] BUG: unable to handle kernel NULL pointer dereference at 0000000000000001
[...]
[18610.807359] Call Trace:
[18610.807370]  ? 0xffffffffc114d0d5
[18610.807382]  __cgroup_bpf_run_filter_sock_ops+0x7d/0xb0
[18610.807391]  tcp_connect+0x895/0xd50
[18610.807400]  tcp_v4_connect+0x465/0x4e0
[18610.807407]  __inet_stream_connect+0xd6/0x3a0
[18610.807412]  ? __inet_stream_connect+0x5/0x3a0
[18610.807417]  inet_stream_connect+0x3b/0x60
[18610.807425]  __sys_connect+0xed/0x120

After some debugging I was able to build this simple reproducer,

 __section("sockops/reproducer_bad")
 int bpf_reproducer_bad(struct bpf_sock_ops *skops)
 {
        volatile __maybe_unused __u32 i = skops->snd_ssthresh;
        return 0;
 }

And along the way noticed that below program ran without splat,

__section("sockops/reproducer_good")
int bpf_reproducer_good(struct bpf_sock_ops *skops)
{
        volatile __maybe_unused __u32 i = skops->snd_ssthresh;
        volatile __maybe_unused __u32 family;

        compiler_barrier();

        family = skops->family;
        return 0;
}

So I decided to check out the code we generate for the above two
programs and noticed each generates the BPF code you would expect,

0000000000000000 <bpf_reproducer_bad>:
;       volatile __maybe_unused __u32 i = skops->snd_ssthresh;
       0:       r1 = *(u32 *)(r1 + 96)
       1:       *(u32 *)(r10 - 4) = r1
;       return 0;
       2:       r0 = 0
       3:       exit

0000000000000000 <bpf_reproducer_good>:
;       volatile __maybe_unused __u32 i = skops->snd_ssthresh;
       0:       r2 = *(u32 *)(r1 + 96)
       1:       *(u32 *)(r10 - 4) = r2
;       family = skops->family;
       2:       r1 = *(u32 *)(r1 + 20)
       3:       *(u32 *)(r10 - 8) = r1
;       return 0;
       4:       r0 = 0
       5:       exit

So we get reasonable assembly, but still something was causing the null
pointer dereference. So, we load the programs and dump the xlated version
observing that line 0 above 'r* = *(u32 *)(r1 +96)' is going to be
translated by the skops access helpers.

int bpf_reproducer_bad(struct bpf_sock_ops * skops):
; volatile __maybe_unused __u32 i = skops->snd_ssthresh;
   0: (61) r1 = *(u32 *)(r1 +28)
   1: (15) if r1 == 0x0 goto pc+2
   2: (79) r1 = *(u64 *)(r1 +0)
   3: (61) r1 = *(u32 *)(r1 +2340)
; volatile __maybe_unused __u32 i = skops->snd_ssthresh;
   4: (63) *(u32 *)(r10 -4) = r1
; return 0;
   5: (b7) r0 = 0
   6: (95) exit

int bpf_reproducer_good(struct bpf_sock_ops * skops):
; volatile __maybe_unused __u32 i = skops->snd_ssthresh;
   0: (61) r2 = *(u32 *)(r1 +28)
   1: (15) if r2 == 0x0 goto pc+2
   2: (79) r2 = *(u64 *)(r1 +0)
   3: (61) r2 = *(u32 *)(r2 +2340)
; volatile __maybe_unused __u32 i = skops->snd_ssthresh;
   4: (63) *(u32 *)(r10 -4) = r2
; family = skops->family;
   5: (79) r1 = *(u64 *)(r1 +0)
   6: (69) r1 = *(u16 *)(r1 +16)
; family = skops->family;
   7: (63) *(u32 *)(r10 -8) = r1
; return 0;
   8: (b7) r0 = 0
   9: (95) exit

Then we look at lines 0 and 2 above. In the good case we do the zero
check in r2 and then load 'r1 + 0' at line 2. Do a quick cross-check
into the bpf_sock_ops check and we can confirm that is the 'struct
sock *sk' pointer field. But, in the bad case,

   0: (61) r1 = *(u32 *)(r1 +28)
   1: (15) if r1 == 0x0 goto pc+2
   2: (79) r1 = *(u64 *)(r1 +0)

Oh no, we read 'r1 +28' into r1, this is skops->fullsock and then in
line 2 we read the 'r1 +0' as a pointer. Now jumping back to our spat,

[18610.807284] BUG: unable to handle kernel NULL pointer dereference at 0000000000000001

The 0x01 makes sense because that is exactly the fullsock value. And
its not a valid dereference so we splat.

To fix we need to guard the case when a program is doing a sock_ops field
access with src_reg == dst_reg. This is already handled in the load case
where the ctx_access handler uses a tmp register being careful to
store the old value and restore it. To fix the get case test if
src_reg == dst_reg and in this case do the is_fullsock test in the
temporary register. Remembering to restore the temporary register before
writing to either dst_reg or src_reg to avoid smashing the pointer into
the struct holding the tmp variable.

Adding this inline code to test_tcpbpf_kern will now be generated
correctly from,

  9: r2 = *(u32 *)(r2 + 96)

to xlated code,

  13: (61) r9 = *(u32 *)(r2 +28)
  14: (15) if r9 == 0x0 goto pc+4
  15: (79) r9 = *(u64 *)(r2 +32)
  16: (79) r2 = *(u64 *)(r2 +0)
  17: (61) r2 = *(u32 *)(r2 +2348)
  18: (05) goto pc+1
  19: (79) r9 = *(u64 *)(r2 +32)

And in the normal case we keep the original code, because really this
is an edge case. From this,

  9: r2 = *(u32 *)(r6 + 96)

to xlated code,

  22: (61) r2 = *(u32 *)(r6 +28)
  23: (15) if r2 == 0x0 goto pc+2
  24: (79) r2 = *(u64 *)(r6 +0)
  25: (61) r2 = *(u32 *)(r2 +2348)

So three additional instructions if dst == src register, but I scanned
my current code base and did not see this pattern anywhere so should
not be a big deal. Further, it seems no one else has hit this or at
least reported it so it must a fairly rare pattern.

Fixes: 9b1f3d6e5af29 ("bpf: Refactor sock_ops_convert_ctx_access")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 net/core/filter.c |   26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

Comments

Martin KaFai Lau July 28, 2020, 5:23 p.m. UTC | #1
On Tue, Jul 28, 2020 at 08:43:46AM -0700, John Fastabend wrote:
> I had a sockmap program that after doing some refactoring started spewing
> this splat at me:
> 
> [18610.807284] BUG: unable to handle kernel NULL pointer dereference at 0000000000000001
> [...]
> [18610.807359] Call Trace:
> [18610.807370]  ? 0xffffffffc114d0d5
> [18610.807382]  __cgroup_bpf_run_filter_sock_ops+0x7d/0xb0
> [18610.807391]  tcp_connect+0x895/0xd50
> [18610.807400]  tcp_v4_connect+0x465/0x4e0
> [18610.807407]  __inet_stream_connect+0xd6/0x3a0
> [18610.807412]  ? __inet_stream_connect+0x5/0x3a0
> [18610.807417]  inet_stream_connect+0x3b/0x60
> [18610.807425]  __sys_connect+0xed/0x120
> 
> After some debugging I was able to build this simple reproducer,
> 
>  __section("sockops/reproducer_bad")
>  int bpf_reproducer_bad(struct bpf_sock_ops *skops)
>  {
>         volatile __maybe_unused __u32 i = skops->snd_ssthresh;
>         return 0;
>  }
> 
> And along the way noticed that below program ran without splat,
> 
> __section("sockops/reproducer_good")
> int bpf_reproducer_good(struct bpf_sock_ops *skops)
> {
>         volatile __maybe_unused __u32 i = skops->snd_ssthresh;
>         volatile __maybe_unused __u32 family;
> 
>         compiler_barrier();
> 
>         family = skops->family;
>         return 0;
> }
> 
> So I decided to check out the code we generate for the above two
> programs and noticed each generates the BPF code you would expect,
> 
> 0000000000000000 <bpf_reproducer_bad>:
> ;       volatile __maybe_unused __u32 i = skops->snd_ssthresh;
>        0:       r1 = *(u32 *)(r1 + 96)
>        1:       *(u32 *)(r10 - 4) = r1
> ;       return 0;
>        2:       r0 = 0
>        3:       exit
> 
> 0000000000000000 <bpf_reproducer_good>:
> ;       volatile __maybe_unused __u32 i = skops->snd_ssthresh;
>        0:       r2 = *(u32 *)(r1 + 96)
>        1:       *(u32 *)(r10 - 4) = r2
> ;       family = skops->family;
>        2:       r1 = *(u32 *)(r1 + 20)
>        3:       *(u32 *)(r10 - 8) = r1
> ;       return 0;
>        4:       r0 = 0
>        5:       exit
> 
> So we get reasonable assembly, but still something was causing the null
> pointer dereference. So, we load the programs and dump the xlated version
> observing that line 0 above 'r* = *(u32 *)(r1 +96)' is going to be
> translated by the skops access helpers.
> 
> int bpf_reproducer_bad(struct bpf_sock_ops * skops):
> ; volatile __maybe_unused __u32 i = skops->snd_ssthresh;
>    0: (61) r1 = *(u32 *)(r1 +28)
>    1: (15) if r1 == 0x0 goto pc+2
>    2: (79) r1 = *(u64 *)(r1 +0)
>    3: (61) r1 = *(u32 *)(r1 +2340)
> ; volatile __maybe_unused __u32 i = skops->snd_ssthresh;
>    4: (63) *(u32 *)(r10 -4) = r1
> ; return 0;
>    5: (b7) r0 = 0
>    6: (95) exit
> 
> int bpf_reproducer_good(struct bpf_sock_ops * skops):
> ; volatile __maybe_unused __u32 i = skops->snd_ssthresh;
>    0: (61) r2 = *(u32 *)(r1 +28)
>    1: (15) if r2 == 0x0 goto pc+2
>    2: (79) r2 = *(u64 *)(r1 +0)
>    3: (61) r2 = *(u32 *)(r2 +2340)
> ; volatile __maybe_unused __u32 i = skops->snd_ssthresh;
>    4: (63) *(u32 *)(r10 -4) = r2
> ; family = skops->family;
>    5: (79) r1 = *(u64 *)(r1 +0)
>    6: (69) r1 = *(u16 *)(r1 +16)
> ; family = skops->family;
>    7: (63) *(u32 *)(r10 -8) = r1
> ; return 0;
>    8: (b7) r0 = 0
>    9: (95) exit
> 
> Then we look at lines 0 and 2 above. In the good case we do the zero
> check in r2 and then load 'r1 + 0' at line 2. Do a quick cross-check
> into the bpf_sock_ops check and we can confirm that is the 'struct
> sock *sk' pointer field. But, in the bad case,
> 
>    0: (61) r1 = *(u32 *)(r1 +28)
>    1: (15) if r1 == 0x0 goto pc+2
>    2: (79) r1 = *(u64 *)(r1 +0)
> 
> Oh no, we read 'r1 +28' into r1, this is skops->fullsock and then in
> line 2 we read the 'r1 +0' as a pointer. Now jumping back to our spat,
> 
> [18610.807284] BUG: unable to handle kernel NULL pointer dereference at 0000000000000001
> 
> The 0x01 makes sense because that is exactly the fullsock value. And
> its not a valid dereference so we splat.
Great debugging!  Thanks for the details explanation.

> 
> To fix we need to guard the case when a program is doing a sock_ops field
> access with src_reg == dst_reg. This is already handled in the load case
> where the ctx_access handler uses a tmp register being careful to
> store the old value and restore it. To fix the get case test if
> src_reg == dst_reg and in this case do the is_fullsock test in the
> temporary register. Remembering to restore the temporary register before
> writing to either dst_reg or src_reg to avoid smashing the pointer into
> the struct holding the tmp variable.
> 
> Adding this inline code to test_tcpbpf_kern will now be generated
> correctly from,
> 
>   9: r2 = *(u32 *)(r2 + 96)
> 
> to xlated code,
> 
>   13: (61) r9 = *(u32 *)(r2 +28)
>   14: (15) if r9 == 0x0 goto pc+4
>   15: (79) r9 = *(u64 *)(r2 +32)
>   16: (79) r2 = *(u64 *)(r2 +0)
>   17: (61) r2 = *(u32 *)(r2 +2348)
>   18: (05) goto pc+1
>   19: (79) r9 = *(u64 *)(r2 +32)
> 
> And in the normal case we keep the original code, because really this
> is an edge case. From this,
> 
>   9: r2 = *(u32 *)(r6 + 96)
> 
> to xlated code,
> 
>   22: (61) r2 = *(u32 *)(r6 +28)
>   23: (15) if r2 == 0x0 goto pc+2
>   24: (79) r2 = *(u64 *)(r6 +0)
>   25: (61) r2 = *(u32 *)(r2 +2348)
> 
> So three additional instructions if dst == src register, but I scanned
> my current code base and did not see this pattern anywhere so should
> not be a big deal. Further, it seems no one else has hit this or at
> least reported it so it must a fairly rare pattern.
> 
> Fixes: 9b1f3d6e5af29 ("bpf: Refactor sock_ops_convert_ctx_access")
I think this issue dated at least back from
commit 34d367c59233 ("bpf: Make SOCK_OPS_GET_TCP struct independent")
There are a few refactoring since then, so fixing in much older
code may not worth it since it is rare?

> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---
>  net/core/filter.c |   26 ++++++++++++++++++++++++--
>  1 file changed, 24 insertions(+), 2 deletions(-)
> 
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 29e34551..c50cb80 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -8314,15 +8314,31 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
>  /* Helper macro for adding read access to tcp_sock or sock fields. */
>  #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
>  	do {								      \
> +		int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
>  		BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >		      \
>  			     sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
> +		if (si->dst_reg == reg || si->src_reg == reg)		      \
> +			reg--;						      \
> +		if (si->dst_reg == reg || si->src_reg == reg)		      \
> +			reg--;						      \
> +		if (si->dst_reg == si->src_reg) {			      \
> +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,	      \
> +					  offsetof(struct bpf_sock_ops_kern,  \
> +				          temp));			      \
Instead of sock_ops->temp, can BPF_REG_AX be used here as a temp?
e.g. bpf_convert_shinfo_access() has already used it as a temp also.

Also, it seems the "sk" access in sock_ops_convert_ctx_access() suffers
a similar issue.

> +			fullsock_reg = reg;				      \
> +			jmp+=2;						      \
> +		}							      \
>  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
>  						struct bpf_sock_ops_kern,     \
>  						is_fullsock),		      \
> -				      si->dst_reg, si->src_reg,		      \
> +				      fullsock_reg, si->src_reg,	      \
>  				      offsetof(struct bpf_sock_ops_kern,      \
>  					       is_fullsock));		      \
> -		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \
> +		*insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);	      \
> +		if (si->dst_reg == si->src_reg)				      \
> +			*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,	      \
> +				      offsetof(struct bpf_sock_ops_kern,      \
> +				      temp));				      \
>  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
>  						struct bpf_sock_ops_kern, sk),\
>  				      si->dst_reg, si->src_reg,		      \
> @@ -8331,6 +8347,12 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
>  						       OBJ_FIELD),	      \
>  				      si->dst_reg, si->dst_reg,		      \
>  				      offsetof(OBJ, OBJ_FIELD));	      \
> +		if (si->dst_reg == si->src_reg)	{			      \
> +			*insn++ = BPF_JMP_A(1);				      \
> +			*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,	      \
> +				      offsetof(struct bpf_sock_ops_kern,      \
> +				      temp));				      \
> +		}							      \
>  	} while (0)
>  
>  #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
>
John Fastabend July 28, 2020, 8:55 p.m. UTC | #2
Martin KaFai Lau wrote:
> On Tue, Jul 28, 2020 at 08:43:46AM -0700, John Fastabend wrote:
> > I had a sockmap program that after doing some refactoring started spewing
> > this splat at me:
> > 
> > [18610.807284] BUG: unable to handle kernel NULL pointer dereference at 0000000000000001
> > [...]
> > [18610.807359] Call Trace:
> > [18610.807370]  ? 0xffffffffc114d0d5
> > [18610.807382]  __cgroup_bpf_run_filter_sock_ops+0x7d/0xb0
> > [18610.807391]  tcp_connect+0x895/0xd50
> > [18610.807400]  tcp_v4_connect+0x465/0x4e0
> > [18610.807407]  __inet_stream_connect+0xd6/0x3a0
> > [18610.807412]  ? __inet_stream_connect+0x5/0x3a0
> > [18610.807417]  inet_stream_connect+0x3b/0x60
> > [18610.807425]  __sys_connect+0xed/0x120
> > 

[...]

> > So three additional instructions if dst == src register, but I scanned
> > my current code base and did not see this pattern anywhere so should
> > not be a big deal. Further, it seems no one else has hit this or at
> > least reported it so it must a fairly rare pattern.
> > 
> > Fixes: 9b1f3d6e5af29 ("bpf: Refactor sock_ops_convert_ctx_access")
> I think this issue dated at least back from
> commit 34d367c59233 ("bpf: Make SOCK_OPS_GET_TCP struct independent")
> There are a few refactoring since then, so fixing in much older
> code may not worth it since it is rare?

OK I just did a quick git annotate and pulled out the last patch
there. I didn't go any farther back. The failure is rare and has
the nice property that it crashes hard always. For example I found
it by simply running some of our go tests after doing the refactor.
I guess if it was in some path that doesn't get tested like an
error case or something you might have an ugly surprise in production.
I can imagine a case where tracking this down might be difficult.

OTOH the backport wont be automatic past some of those reworks.

> 
> > Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> > ---
> >  net/core/filter.c |   26 ++++++++++++++++++++++++--
> >  1 file changed, 24 insertions(+), 2 deletions(-)
> > 
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 29e34551..c50cb80 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -8314,15 +8314,31 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> >  /* Helper macro for adding read access to tcp_sock or sock fields. */
> >  #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
> >  	do {								      \
> > +		int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
> >  		BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >		      \
> >  			     sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
> > +		if (si->dst_reg == reg || si->src_reg == reg)		      \
> > +			reg--;						      \
> > +		if (si->dst_reg == reg || si->src_reg == reg)		      \
> > +			reg--;						      \
> > +		if (si->dst_reg == si->src_reg) {			      \
> > +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,	      \
> > +					  offsetof(struct bpf_sock_ops_kern,  \
> > +				          temp));			      \
> Instead of sock_ops->temp, can BPF_REG_AX be used here as a temp?
> e.g. bpf_convert_shinfo_access() has already used it as a temp also.

Sure I will roll a v2 I agree that rax is a bit nicer. I guess for
bpf-next we can roll the load over to use rax as well? Once the
fix is in place I'll take a look it would be nice for consistency.

> 
> Also, it seems the "sk" access in sock_ops_convert_ctx_access() suffers
> a similar issue.

Good catch. I'll fix it up as well. Maybe with a second patch and test.
Patches might be a bit verbose but makes it easier to track the bugs
I think.

> 
> > +			fullsock_reg = reg;				      \
> > +			jmp+=2;						      \
> > +		}							      \
> >  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
> >  						struct bpf_sock_ops_kern,     \
> >  						is_fullsock),		      \
> > -				      si->dst_reg, si->src_reg,		      \
> > +				      fullsock_reg, si->src_reg,	      \
> >  				      offsetof(struct bpf_sock_ops_kern,      \
> >  					       is_fullsock));		      \
> > -		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \
> > +		*insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);	      \
> > +		if (si->dst_reg == si->src_reg)				      \
> > +			*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,	      \
> > +				      offsetof(struct bpf_sock_ops_kern,      \
> > +				      temp));				      \
> >  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
> >  						struct bpf_sock_ops_kern, sk),\
> >  				      si->dst_reg, si->src_reg,		      \
> > @@ -8331,6 +8347,12 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> >  						       OBJ_FIELD),	      \
> >  				      si->dst_reg, si->dst_reg,		      \
> >  				      offsetof(OBJ, OBJ_FIELD));	      \
> > +		if (si->dst_reg == si->src_reg)	{			      \
> > +			*insn++ = BPF_JMP_A(1);				      \
> > +			*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,	      \
> > +				      offsetof(struct bpf_sock_ops_kern,      \
> > +				      temp));				      \
> > +		}							      \
> >  	} while (0)
> >  
> >  #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
> >
Daniel Borkmann July 28, 2020, 9:09 p.m. UTC | #3
On 7/28/20 10:55 PM, John Fastabend wrote:
> Martin KaFai Lau wrote:
>> On Tue, Jul 28, 2020 at 08:43:46AM -0700, John Fastabend wrote:
>>> I had a sockmap program that after doing some refactoring started spewing
>>> this splat at me:
>>>
>>> [18610.807284] BUG: unable to handle kernel NULL pointer dereference at 0000000000000001
>>> [...]
>>> [18610.807359] Call Trace:
>>> [18610.807370]  ? 0xffffffffc114d0d5
>>> [18610.807382]  __cgroup_bpf_run_filter_sock_ops+0x7d/0xb0
>>> [18610.807391]  tcp_connect+0x895/0xd50
>>> [18610.807400]  tcp_v4_connect+0x465/0x4e0
>>> [18610.807407]  __inet_stream_connect+0xd6/0x3a0
>>> [18610.807412]  ? __inet_stream_connect+0x5/0x3a0
>>> [18610.807417]  inet_stream_connect+0x3b/0x60
>>> [18610.807425]  __sys_connect+0xed/0x120
>>>
> 
> [...]
> 
>>> So three additional instructions if dst == src register, but I scanned
>>> my current code base and did not see this pattern anywhere so should
>>> not be a big deal. Further, it seems no one else has hit this or at
>>> least reported it so it must a fairly rare pattern.
>>>
>>> Fixes: 9b1f3d6e5af29 ("bpf: Refactor sock_ops_convert_ctx_access")
>> I think this issue dated at least back from
>> commit 34d367c59233 ("bpf: Make SOCK_OPS_GET_TCP struct independent")
>> There are a few refactoring since then, so fixing in much older
>> code may not worth it since it is rare?
> 
> OK I just did a quick git annotate and pulled out the last patch
> there. I didn't go any farther back. The failure is rare and has
> the nice property that it crashes hard always. For example I found
> it by simply running some of our go tests after doing the refactor.
> I guess if it was in some path that doesn't get tested like an
> error case or something you might have an ugly surprise in production.
> I can imagine a case where tracking this down might be difficult.
> 
> OTOH the backport wont be automatic past some of those reworks.
> 
>>
>>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>>> ---
>>>   net/core/filter.c |   26 ++++++++++++++++++++++++--
>>>   1 file changed, 24 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>> index 29e34551..c50cb80 100644
>>> --- a/net/core/filter.c
>>> +++ b/net/core/filter.c
>>> @@ -8314,15 +8314,31 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
>>>   /* Helper macro for adding read access to tcp_sock or sock fields. */
>>>   #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
>>>   	do {								      \
>>> +		int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
>>>   		BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >		      \
>>>   			     sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
>>> +		if (si->dst_reg == reg || si->src_reg == reg)		      \
>>> +			reg--;						      \
>>> +		if (si->dst_reg == reg || si->src_reg == reg)		      \
>>> +			reg--;						      \
>>> +		if (si->dst_reg == si->src_reg) {			      \
>>> +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,	      \
>>> +					  offsetof(struct bpf_sock_ops_kern,  \
>>> +				          temp));			      \
>> Instead of sock_ops->temp, can BPF_REG_AX be used here as a temp?
>> e.g. bpf_convert_shinfo_access() has already used it as a temp also.
> 
> Sure I will roll a v2 I agree that rax is a bit nicer. I guess for
> bpf-next we can roll the load over to use rax as well? Once the
> fix is in place I'll take a look it would be nice for consistency.

Keep in mind that REG_AX is also used from constant blinding side, so use with care.
Please make sure to add necessary selftest cases to test_verifier and run them under
constant blinding enabled for root so we have these rewrites exercised.

>> Also, it seems the "sk" access in sock_ops_convert_ctx_access() suffers
>> a similar issue.
> 
> Good catch. I'll fix it up as well. Maybe with a second patch and test.
> Patches might be a bit verbose but makes it easier to track the bugs
> I think.
> 
>>
>>> +			fullsock_reg = reg;				      \
>>> +			jmp+=2;						      \
>>> +		}							      \
>>>   		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
>>>   						struct bpf_sock_ops_kern,     \
>>>   						is_fullsock),		      \
>>> -				      si->dst_reg, si->src_reg,		      \
>>> +				      fullsock_reg, si->src_reg,	      \
>>>   				      offsetof(struct bpf_sock_ops_kern,      \
>>>   					       is_fullsock));		      \
>>> -		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \
>>> +		*insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);	      \
>>> +		if (si->dst_reg == si->src_reg)				      \
>>> +			*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,	      \
>>> +				      offsetof(struct bpf_sock_ops_kern,      \
>>> +				      temp));				      \
>>>   		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
>>>   						struct bpf_sock_ops_kern, sk),\
>>>   				      si->dst_reg, si->src_reg,		      \
>>> @@ -8331,6 +8347,12 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
>>>   						       OBJ_FIELD),	      \
>>>   				      si->dst_reg, si->dst_reg,		      \
>>>   				      offsetof(OBJ, OBJ_FIELD));	      \
>>> +		if (si->dst_reg == si->src_reg)	{			      \
>>> +			*insn++ = BPF_JMP_A(1);				      \
>>> +			*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,	      \
>>> +				      offsetof(struct bpf_sock_ops_kern,      \
>>> +				      temp));				      \
>>> +		}							      \
>>>   	} while (0)
>>>   
>>>   #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
>>>
Martin KaFai Lau July 28, 2020, 9:15 p.m. UTC | #4
On Tue, Jul 28, 2020 at 01:55:22PM -0700, John Fastabend wrote:
> Martin KaFai Lau wrote:
> > On Tue, Jul 28, 2020 at 08:43:46AM -0700, John Fastabend wrote:
> > > I had a sockmap program that after doing some refactoring started spewing
> > > this splat at me:
> > > 
> > > [18610.807284] BUG: unable to handle kernel NULL pointer dereference at 0000000000000001
> > > [...]
> > > [18610.807359] Call Trace:
> > > [18610.807370]  ? 0xffffffffc114d0d5
> > > [18610.807382]  __cgroup_bpf_run_filter_sock_ops+0x7d/0xb0
> > > [18610.807391]  tcp_connect+0x895/0xd50
> > > [18610.807400]  tcp_v4_connect+0x465/0x4e0
> > > [18610.807407]  __inet_stream_connect+0xd6/0x3a0
> > > [18610.807412]  ? __inet_stream_connect+0x5/0x3a0
> > > [18610.807417]  inet_stream_connect+0x3b/0x60
> > > [18610.807425]  __sys_connect+0xed/0x120
> > > 
> 
> [...]
> 
> > > So three additional instructions if dst == src register, but I scanned
> > > my current code base and did not see this pattern anywhere so should
> > > not be a big deal. Further, it seems no one else has hit this or at
> > > least reported it so it must a fairly rare pattern.
> > > 
> > > Fixes: 9b1f3d6e5af29 ("bpf: Refactor sock_ops_convert_ctx_access")
> > I think this issue dated at least back from
> > commit 34d367c59233 ("bpf: Make SOCK_OPS_GET_TCP struct independent")
> > There are a few refactoring since then, so fixing in much older
> > code may not worth it since it is rare?
> 
> OK I just did a quick git annotate and pulled out the last patch
> there. I didn't go any farther back. The failure is rare and has
> the nice property that it crashes hard always. For example I found
> it by simply running some of our go tests after doing the refactor.
> I guess if it was in some path that doesn't get tested like an
> error case or something you might have an ugly surprise in production.
> I can imagine a case where tracking this down might be difficult.
> 
> OTOH the backport wont be automatic past some of those reworks.
> 
> > 
> > > Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> > > ---
> > >  net/core/filter.c |   26 ++++++++++++++++++++++++--
> > >  1 file changed, 24 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/net/core/filter.c b/net/core/filter.c
> > > index 29e34551..c50cb80 100644
> > > --- a/net/core/filter.c
> > > +++ b/net/core/filter.c
> > > @@ -8314,15 +8314,31 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > >  /* Helper macro for adding read access to tcp_sock or sock fields. */
> > >  #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
> > >  	do {								      \
> > > +		int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
> > >  		BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >		      \
> > >  			     sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
> > > +		if (si->dst_reg == reg || si->src_reg == reg)		      \
> > > +			reg--;						      \
> > > +		if (si->dst_reg == reg || si->src_reg == reg)		      \
> > > +			reg--;						      \
> > > +		if (si->dst_reg == si->src_reg) {			      \
> > > +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,	      \
> > > +					  offsetof(struct bpf_sock_ops_kern,  \
> > > +				          temp));			      \
> > Instead of sock_ops->temp, can BPF_REG_AX be used here as a temp?
> > e.g. bpf_convert_shinfo_access() has already used it as a temp also.
> 
> Sure I will roll a v2 I agree that rax is a bit nicer. I guess for
> bpf-next we can roll the load over to use rax as well? Once the
> fix is in place I'll take a look it would be nice for consistency.
Agree that it would be nice to do the same in SOCK_OPS_SET_FIELD() also
and this improvement could be done in bpf-next.

> 
> > 
> > Also, it seems the "sk" access in sock_ops_convert_ctx_access() suffers
> > a similar issue.
> 
> Good catch. I'll fix it up as well. Maybe with a second patch and test.
> Patches might be a bit verbose but makes it easier to track the bugs
> I think.
Thanks for taking care of it!
John Fastabend July 29, 2020, 12:44 a.m. UTC | #5
Martin KaFai Lau wrote:
> On Tue, Jul 28, 2020 at 01:55:22PM -0700, John Fastabend wrote:
> > Martin KaFai Lau wrote:
> > > On Tue, Jul 28, 2020 at 08:43:46AM -0700, John Fastabend wrote:
> > > > I had a sockmap program that after doing some refactoring started spewing
> > > > this splat at me:
> > > > 
> > > > [18610.807284] BUG: unable to handle kernel NULL pointer dereference at 0000000000000001
> > > > [...]
> > > > [18610.807359] Call Trace:
> > > > [18610.807370]  ? 0xffffffffc114d0d5
> > > > [18610.807382]  __cgroup_bpf_run_filter_sock_ops+0x7d/0xb0
> > > > [18610.807391]  tcp_connect+0x895/0xd50
> > > > [18610.807400]  tcp_v4_connect+0x465/0x4e0
> > > > [18610.807407]  __inet_stream_connect+0xd6/0x3a0
> > > > [18610.807412]  ? __inet_stream_connect+0x5/0x3a0
> > > > [18610.807417]  inet_stream_connect+0x3b/0x60
> > > > [18610.807425]  __sys_connect+0xed/0x120
> > > > 
> > 
> > [...]
> > 
> > > > So three additional instructions if dst == src register, but I scanned
> > > > my current code base and did not see this pattern anywhere so should
> > > > not be a big deal. Further, it seems no one else has hit this or at
> > > > least reported it so it must a fairly rare pattern.
> > > > 
> > > > Fixes: 9b1f3d6e5af29 ("bpf: Refactor sock_ops_convert_ctx_access")
> > > I think this issue dated at least back from
> > > commit 34d367c59233 ("bpf: Make SOCK_OPS_GET_TCP struct independent")
> > > There are a few refactoring since then, so fixing in much older
> > > code may not worth it since it is rare?
> > 
> > OK I just did a quick git annotate and pulled out the last patch
> > there. I didn't go any farther back. The failure is rare and has
> > the nice property that it crashes hard always. For example I found
> > it by simply running some of our go tests after doing the refactor.
> > I guess if it was in some path that doesn't get tested like an
> > error case or something you might have an ugly surprise in production.
> > I can imagine a case where tracking this down might be difficult.
> > 
> > OTOH the backport wont be automatic past some of those reworks.
> > 
> > > 
> > > > Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> > > > ---
> > > >  net/core/filter.c |   26 ++++++++++++++++++++++++--
> > > >  1 file changed, 24 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/net/core/filter.c b/net/core/filter.c
> > > > index 29e34551..c50cb80 100644
> > > > --- a/net/core/filter.c
> > > > +++ b/net/core/filter.c
> > > > @@ -8314,15 +8314,31 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > > >  /* Helper macro for adding read access to tcp_sock or sock fields. */
> > > >  #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
> > > >  	do {								      \
> > > > +		int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
> > > >  		BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >		      \
> > > >  			     sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
> > > > +		if (si->dst_reg == reg || si->src_reg == reg)		      \
> > > > +			reg--;						      \
> > > > +		if (si->dst_reg == reg || si->src_reg == reg)		      \
> > > > +			reg--;						      \
> > > > +		if (si->dst_reg == si->src_reg) {			      \
> > > > +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,	      \
> > > > +					  offsetof(struct bpf_sock_ops_kern,  \
> > > > +				          temp));			      \
> > > Instead of sock_ops->temp, can BPF_REG_AX be used here as a temp?
> > > e.g. bpf_convert_shinfo_access() has already used it as a temp also.

OTOH it looks like we will cause the bpf_jit_blind_insn() to abort on those
instructions.

I'm not sure it matters for performance see'ing we are in a bit of an
edge case. iirc Daniel wrote that code so maybe its best to see if he has
any opinions.

@Daniel, Do you have a preference? If we use REG_RAX it seems the insns
will be skipped over by bpf_jit_blind_insn otoh its slightly faster I guess
to skip the load/store.

> > 
> > Sure I will roll a v2 I agree that rax is a bit nicer. I guess for
> > bpf-next we can roll the load over to use rax as well? Once the
> > fix is in place I'll take a look it would be nice for consistency.
> Agree that it would be nice to do the same in SOCK_OPS_SET_FIELD() also
> and this improvement could be done in bpf-next.
> 
> > 
> > > 
> > > Also, it seems the "sk" access in sock_ops_convert_ctx_access() suffers
> > > a similar issue.
> > 
> > Good catch. I'll fix it up as well. Maybe with a second patch and test.
> > Patches might be a bit verbose but makes it easier to track the bugs
> > I think.
> Thanks for taking care of it!
Martin KaFai Lau July 29, 2020, 1:14 a.m. UTC | #6
On Tue, Jul 28, 2020 at 05:44:19PM -0700, John Fastabend wrote:
> Martin KaFai Lau wrote:
> > On Tue, Jul 28, 2020 at 01:55:22PM -0700, John Fastabend wrote:
> > > Martin KaFai Lau wrote:
> > > > On Tue, Jul 28, 2020 at 08:43:46AM -0700, John Fastabend wrote:
> > > > > I had a sockmap program that after doing some refactoring started spewing
> > > > > this splat at me:
> > > > > 
> > > > > [18610.807284] BUG: unable to handle kernel NULL pointer dereference at 0000000000000001
> > > > > [...]
> > > > > [18610.807359] Call Trace:
> > > > > [18610.807370]  ? 0xffffffffc114d0d5
> > > > > [18610.807382]  __cgroup_bpf_run_filter_sock_ops+0x7d/0xb0
> > > > > [18610.807391]  tcp_connect+0x895/0xd50
> > > > > [18610.807400]  tcp_v4_connect+0x465/0x4e0
> > > > > [18610.807407]  __inet_stream_connect+0xd6/0x3a0
> > > > > [18610.807412]  ? __inet_stream_connect+0x5/0x3a0
> > > > > [18610.807417]  inet_stream_connect+0x3b/0x60
> > > > > [18610.807425]  __sys_connect+0xed/0x120
> > > > > 
> > > 
> > > [...]
> > > 
> > > > > So three additional instructions if dst == src register, but I scanned
> > > > > my current code base and did not see this pattern anywhere so should
> > > > > not be a big deal. Further, it seems no one else has hit this or at
> > > > > least reported it so it must a fairly rare pattern.
> > > > > 
> > > > > Fixes: 9b1f3d6e5af29 ("bpf: Refactor sock_ops_convert_ctx_access")
> > > > I think this issue dated at least back from
> > > > commit 34d367c59233 ("bpf: Make SOCK_OPS_GET_TCP struct independent")
> > > > There are a few refactoring since then, so fixing in much older
> > > > code may not worth it since it is rare?
> > > 
> > > OK I just did a quick git annotate and pulled out the last patch
> > > there. I didn't go any farther back. The failure is rare and has
> > > the nice property that it crashes hard always. For example I found
> > > it by simply running some of our go tests after doing the refactor.
> > > I guess if it was in some path that doesn't get tested like an
> > > error case or something you might have an ugly surprise in production.
> > > I can imagine a case where tracking this down might be difficult.
> > > 
> > > OTOH the backport wont be automatic past some of those reworks.
> > > 
> > > > 
> > > > > Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> > > > > ---
> > > > >  net/core/filter.c |   26 ++++++++++++++++++++++++--
> > > > >  1 file changed, 24 insertions(+), 2 deletions(-)
> > > > > 
> > > > > diff --git a/net/core/filter.c b/net/core/filter.c
> > > > > index 29e34551..c50cb80 100644
> > > > > --- a/net/core/filter.c
> > > > > +++ b/net/core/filter.c
> > > > > @@ -8314,15 +8314,31 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > > > >  /* Helper macro for adding read access to tcp_sock or sock fields. */
> > > > >  #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
> > > > >  	do {								      \
> > > > > +		int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
> > > > >  		BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >		      \
> > > > >  			     sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
> > > > > +		if (si->dst_reg == reg || si->src_reg == reg)		      \
> > > > > +			reg--;						      \
> > > > > +		if (si->dst_reg == reg || si->src_reg == reg)		      \
> > > > > +			reg--;						      \
> > > > > +		if (si->dst_reg == si->src_reg) {			      \
> > > > > +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,	      \
> > > > > +					  offsetof(struct bpf_sock_ops_kern,  \
> > > > > +				          temp));			      \
> > > > Instead of sock_ops->temp, can BPF_REG_AX be used here as a temp?
> > > > e.g. bpf_convert_shinfo_access() has already used it as a temp also.
> 
> OTOH it looks like we will cause the bpf_jit_blind_insn() to abort on those
> instructions.
You are right.  I think eventually "BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, ...);"
has to be done here and it is working on an IMM 0.  BPF_REG_AX has the
is_fullsock.

From the bpf_jit_blind_insn(), the "case BPF_JMP | BPF_JEQ  | BPF_K"
will break the above BPF_JMP_IMM.

I think the "temp" approach has to be used as in this patch.

> 
> I'm not sure it matters for performance see'ing we are in a bit of an
> edge case. iirc Daniel wrote that code so maybe its best to see if he has
> any opinions.
> 
> @Daniel, Do you have a preference? If we use REG_RAX it seems the insns
> will be skipped over by bpf_jit_blind_insn otoh its slightly faster I guess
> to skip the load/store.
> 
> > > 
> > > Sure I will roll a v2 I agree that rax is a bit nicer. I guess for
> > > bpf-next we can roll the load over to use rax as well? Once the
> > > fix is in place I'll take a look it would be nice for consistency.
> > Agree that it would be nice to do the same in SOCK_OPS_SET_FIELD() also
> > and this improvement could be done in bpf-next.
> > 
> > > 
> > > > 
> > > > Also, it seems the "sk" access in sock_ops_convert_ctx_access() suffers
> > > > a similar issue.
> > > 
> > > Good catch. I'll fix it up as well. Maybe with a second patch and test.
> > > Patches might be a bit verbose but makes it easier to track the bugs
> > > I think.
> > Thanks for taking care of it!
> 
>
diff mbox series

Patch

diff --git a/net/core/filter.c b/net/core/filter.c
index 29e34551..c50cb80 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8314,15 +8314,31 @@  static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 /* Helper macro for adding read access to tcp_sock or sock fields. */
 #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
 	do {								      \
+		int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
 		BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >		      \
 			     sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
+		if (si->dst_reg == reg || si->src_reg == reg)		      \
+			reg--;						      \
+		if (si->dst_reg == reg || si->src_reg == reg)		      \
+			reg--;						      \
+		if (si->dst_reg == si->src_reg) {			      \
+			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,	      \
+					  offsetof(struct bpf_sock_ops_kern,  \
+				          temp));			      \
+			fullsock_reg = reg;				      \
+			jmp+=2;						      \
+		}							      \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern,     \
 						is_fullsock),		      \
-				      si->dst_reg, si->src_reg,		      \
+				      fullsock_reg, si->src_reg,	      \
 				      offsetof(struct bpf_sock_ops_kern,      \
 					       is_fullsock));		      \
-		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);	      \
+		if (si->dst_reg == si->src_reg)				      \
+			*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,	      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+				      temp));				      \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern, sk),\
 				      si->dst_reg, si->src_reg,		      \
@@ -8331,6 +8347,12 @@  static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 						       OBJ_FIELD),	      \
 				      si->dst_reg, si->dst_reg,		      \
 				      offsetof(OBJ, OBJ_FIELD));	      \
+		if (si->dst_reg == si->src_reg)	{			      \
+			*insn++ = BPF_JMP_A(1);				      \
+			*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,	      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+				      temp));				      \
+		}							      \
 	} while (0)
 
 #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \