[bpf,v2] bpf: fix accessing bpf_sysctl.file_pos on s390
diff mbox series

Message ID 20190816105300.49035-1-iii@linux.ibm.com
State Accepted
Delegated to: BPF Maintainers
Headers show
Series
  • [bpf,v2] bpf: fix accessing bpf_sysctl.file_pos on s390
Related show

Commit Message

Ilya Leoshkevich Aug. 16, 2019, 10:53 a.m. UTC
"ctx:file_pos sysctl:read write ok" fails on s390 with "Read value  !=
nux". This is because verifier rewrites a complete 32-bit
bpf_sysctl.file_pos update to a partial update of the first 32 bits of
64-bit *bpf_sysctl_kern.ppos, which is not correct on big-endian
systems.

Fix by using an offset on big-endian systems.

Ditto for bpf_sysctl.file_pos reads. Currently the test does not detect
a problem there, since it expects to see 0, which it gets with high
probability in error cases, so change it to seek to offset 3 and expect
3 in bpf_sysctl.file_pos.

Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
---
v1->v2: Merge bpf_ctx_narrow_load_shift and
bpf_ctx_narrow_access_offset.

 include/linux/filter.h                    |  8 ++++----
 kernel/bpf/cgroup.c                       | 10 ++++++++--
 kernel/bpf/verifier.c                     |  4 ++--
 tools/testing/selftests/bpf/test_sysctl.c |  9 ++++++++-
 4 files changed, 22 insertions(+), 9 deletions(-)

Comments

Yonghong Song Aug. 16, 2019, 3:43 p.m. UTC | #1
On 8/16/19 3:53 AM, Ilya Leoshkevich wrote:
> "ctx:file_pos sysctl:read write ok" fails on s390 with "Read value  !=
> nux". This is because verifier rewrites a complete 32-bit
> bpf_sysctl.file_pos update to a partial update of the first 32 bits of
> 64-bit *bpf_sysctl_kern.ppos, which is not correct on big-endian
> systems.
> 
> Fix by using an offset on big-endian systems.
> 
> Ditto for bpf_sysctl.file_pos reads. Currently the test does not detect
> a problem there, since it expects to see 0, which it gets with high
> probability in error cases, so change it to seek to offset 3 and expect
> 3 in bpf_sysctl.file_pos.
> 
> Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx")
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>

Acked-by: Yonghong Song <yhs@fb.com>
Daniel Borkmann Sept. 3, 2019, 10:13 p.m. UTC | #2
On 8/16/19 12:53 PM, Ilya Leoshkevich wrote:
> "ctx:file_pos sysctl:read write ok" fails on s390 with "Read value  !=
> nux". This is because verifier rewrites a complete 32-bit
> bpf_sysctl.file_pos update to a partial update of the first 32 bits of
> 64-bit *bpf_sysctl_kern.ppos, which is not correct on big-endian
> systems.
> 
> Fix by using an offset on big-endian systems.
> 
> Ditto for bpf_sysctl.file_pos reads. Currently the test does not detect
> a problem there, since it expects to see 0, which it gets with high
> probability in error cases, so change it to seek to offset 3 and expect
> 3 in bpf_sysctl.file_pos.
> 
> Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx")
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> ---
> v1->v2: Merge bpf_ctx_narrow_load_shift and
> bpf_ctx_narrow_access_offset.
> 
>   include/linux/filter.h                    |  8 ++++----
>   kernel/bpf/cgroup.c                       | 10 ++++++++--
>   kernel/bpf/verifier.c                     |  4 ++--
>   tools/testing/selftests/bpf/test_sysctl.c |  9 ++++++++-
>   4 files changed, 22 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 92c6e31fb008..2ce57645f3cd 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -749,14 +749,14 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
>   }
>   
>   static inline u8
> -bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
> +bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
>   {
> -	u8 load_off = off & (size_default - 1);
> +	u8 access_off = off & (size_default - 1);
>   
>   #ifdef __LITTLE_ENDIAN
> -	return load_off * 8;
> +	return access_off;
>   #else
> -	return (size_default - (load_off + size)) * 8;
> +	return size_default - (access_off + size);
>   #endif
>   }
>   
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 0a00eaca6fae..00c4647ce92a 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -1325,6 +1325,7 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
>   				     struct bpf_prog *prog, u32 *target_size)
>   {
>   	struct bpf_insn *insn = insn_buf;
> +	u32 read_size;
>   
>   	switch (si->off) {
>   	case offsetof(struct bpf_sysctl, write):
> @@ -1356,7 +1357,9 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
>   				treg, si->dst_reg,
>   				offsetof(struct bpf_sysctl_kern, ppos));
>   			*insn++ = BPF_STX_MEM(
> -				BPF_SIZEOF(u32), treg, si->src_reg, 0);
> +				BPF_SIZEOF(u32), treg, si->src_reg,
> +				bpf_ctx_narrow_access_offset(
> +					0, sizeof(u32), sizeof(loff_t)));
>   			*insn++ = BPF_LDX_MEM(
>   				BPF_DW, treg, si->dst_reg,
>   				offsetof(struct bpf_sysctl_kern, tmp_reg));
> @@ -1365,8 +1368,11 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
>   				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
>   				si->dst_reg, si->src_reg,
>   				offsetof(struct bpf_sysctl_kern, ppos));
> +			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
>   			*insn++ = BPF_LDX_MEM(
> -				BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
> +				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
> +				bpf_ctx_narrow_access_offset(
> +					0, read_size, sizeof(loff_t)));

I see what you're doing, but generally I'm a bit puzzled on why we need these
partial store/loads and cannot access the full loff_t value internally with the
rewrite. Why was BPF_SIZEOF(u32) chosen in the first place? Looks like git history
doesn't have any useful insight here ... Andrey mind to put some clarifications
on this? Thx

>   		}
>   		*target_size = sizeof(u32);
>   		break;
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index c84d83f86141..d1d4c995a9eb 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -8616,8 +8616,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
>   		}
>   
>   		if (is_narrower_load && size < target_size) {
> -			u8 shift = bpf_ctx_narrow_load_shift(off, size,
> -							     size_default);
> +			u8 shift = bpf_ctx_narrow_access_offset(
> +				off, size, size_default) * 8;
>   			if (ctx_field_size <= 4) {
>   				if (shift)
>   					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
> diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
> index a3bebd7c68dd..abc26248a7f1 100644
> --- a/tools/testing/selftests/bpf/test_sysctl.c
> +++ b/tools/testing/selftests/bpf/test_sysctl.c
> @@ -31,6 +31,7 @@ struct sysctl_test {
>   	enum bpf_attach_type attach_type;
>   	const char *sysctl;
>   	int open_flags;
> +	int seek;
>   	const char *newval;
>   	const char *oldval;
>   	enum {
> @@ -139,7 +140,7 @@ static struct sysctl_test tests[] = {
>   			/* If (file_pos == X) */
>   			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
>   				    offsetof(struct bpf_sysctl, file_pos)),
> -			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 2),
> +			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 3, 2),
>   
>   			/* return ALLOW; */
>   			BPF_MOV64_IMM(BPF_REG_0, 1),
> @@ -152,6 +153,7 @@ static struct sysctl_test tests[] = {
>   		.attach_type = BPF_CGROUP_SYSCTL,
>   		.sysctl = "kernel/ostype",
>   		.open_flags = O_RDONLY,
> +		.seek = 3,
>   		.result = SUCCESS,
>   	},
>   	{
> @@ -1442,6 +1444,11 @@ static int access_sysctl(const char *sysctl_path,
>   	if (fd < 0)
>   		return fd;
>   
> +	if (test->seek && lseek(fd, test->seek, SEEK_SET) == -1) {
> +		log_err("lseek(%d) failed", test->seek);
> +		goto err;
> +	}
> +
>   	if (test->open_flags == O_RDONLY) {
>   		char buf[128];
>   
>
Ilya Leoshkevich Sept. 4, 2019, 1:53 p.m. UTC | #3
> Am 04.09.2019 um 00:13 schrieb Daniel Borkmann <daniel@iogearbox.net>:
> 
> On 8/16/19 12:53 PM, Ilya Leoshkevich wrote:
>> "ctx:file_pos sysctl:read write ok" fails on s390 with "Read value  !=
>> nux". This is because verifier rewrites a complete 32-bit
>> bpf_sysctl.file_pos update to a partial update of the first 32 bits of
>> 64-bit *bpf_sysctl_kern.ppos, which is not correct on big-endian
>> systems.
>> Fix by using an offset on big-endian systems.
>> Ditto for bpf_sysctl.file_pos reads. Currently the test does not detect
>> a problem there, since it expects to see 0, which it gets with high
>> probability in error cases, so change it to seek to offset 3 and expect
>> 3 in bpf_sysctl.file_pos.
>> Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx")
>> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
>> ---
>> v1->v2: Merge bpf_ctx_narrow_load_shift and
>> bpf_ctx_narrow_access_offset.
>>  include/linux/filter.h                    |  8 ++++----
>>  kernel/bpf/cgroup.c                       | 10 ++++++++--
>>  kernel/bpf/verifier.c                     |  4 ++--
>>  tools/testing/selftests/bpf/test_sysctl.c |  9 ++++++++-
>>  4 files changed, 22 insertions(+), 9 deletions(-)
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 92c6e31fb008..2ce57645f3cd 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -749,14 +749,14 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
>>  }
>>    static inline u8
>> -bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
>> +bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
>>  {
>> -	u8 load_off = off & (size_default - 1);
>> +	u8 access_off = off & (size_default - 1);
>>    #ifdef __LITTLE_ENDIAN
>> -	return load_off * 8;
>> +	return access_off;
>>  #else
>> -	return (size_default - (load_off + size)) * 8;
>> +	return size_default - (access_off + size);
>>  #endif
>>  }
>>  diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
>> index 0a00eaca6fae..00c4647ce92a 100644
>> --- a/kernel/bpf/cgroup.c
>> +++ b/kernel/bpf/cgroup.c
>> @@ -1325,6 +1325,7 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
>>  				     struct bpf_prog *prog, u32 *target_size)
>>  {
>>  	struct bpf_insn *insn = insn_buf;
>> +	u32 read_size;
>>    	switch (si->off) {
>>  	case offsetof(struct bpf_sysctl, write):
>> @@ -1356,7 +1357,9 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
>>  				treg, si->dst_reg,
>>  				offsetof(struct bpf_sysctl_kern, ppos));
>>  			*insn++ = BPF_STX_MEM(
>> -				BPF_SIZEOF(u32), treg, si->src_reg, 0);
>> +				BPF_SIZEOF(u32), treg, si->src_reg,
>> +				bpf_ctx_narrow_access_offset(
>> +					0, sizeof(u32), sizeof(loff_t)));
>>  			*insn++ = BPF_LDX_MEM(
>>  				BPF_DW, treg, si->dst_reg,
>>  				offsetof(struct bpf_sysctl_kern, tmp_reg));
>> @@ -1365,8 +1368,11 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
>>  				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
>>  				si->dst_reg, si->src_reg,
>>  				offsetof(struct bpf_sysctl_kern, ppos));
>> +			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
>>  			*insn++ = BPF_LDX_MEM(
>> -				BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
>> +				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
>> +				bpf_ctx_narrow_access_offset(
>> +					0, read_size, sizeof(loff_t)));
> 
> I see what you're doing, but generally I'm a bit puzzled on why we need these
> partial store/loads and cannot access the full loff_t value internally with the
> rewrite. Why was BPF_SIZEOF(u32) chosen in the first place? Looks like git history
> doesn't have any useful insight here ... Andrey mind to put some clarifications
> on this? Thx

Do you mean rewriting e.g. the following:

    BPF_LDX_MEM(access_size, si->dst_reg, si->src_reg,
                offsetof(struct bpf_sysctl, file_pos)));

as the following?

    BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
                si->dst_reg, si->src_reg,
                offsetof(struct bpf_sysctl_kern, ppos));
    BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, 0);  # full loff_t

while also doing

    *target_size = sizeof(loff_t);

in order to let is_narrower_load logic in convert_ctx_accesses () do the
masking? That would work, but wouldn't the generated code be less
efficient due to explicit masking?
Andrey Ignatov Sept. 8, 2019, 3:37 a.m. UTC | #4
Daniel Borkmann <daniel@iogearbox.net> [Wed, 4 Sep 2019 00:13:03 +0200]:
> On 8/16/19 12:53 PM, Ilya Leoshkevich wrote:
> > "ctx:file_pos sysctl:read write ok" fails on s390 with "Read value  !=
> > nux". This is because verifier rewrites a complete 32-bit
> > bpf_sysctl.file_pos update to a partial update of the first 32 bits of
> > 64-bit *bpf_sysctl_kern.ppos, which is not correct on big-endian
> > systems.
> > 
> > Fix by using an offset on big-endian systems.
> > 
> > Ditto for bpf_sysctl.file_pos reads. Currently the test does not detect
> > a problem there, since it expects to see 0, which it gets with high
> > probability in error cases, so change it to seek to offset 3 and expect
> > 3 in bpf_sysctl.file_pos.
> > 
> > Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx")
> > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> > ---
> > v1->v2: Merge bpf_ctx_narrow_load_shift and
> > bpf_ctx_narrow_access_offset.
> > 
> >   include/linux/filter.h                    |  8 ++++----
> >   kernel/bpf/cgroup.c                       | 10 ++++++++--
> >   kernel/bpf/verifier.c                     |  4 ++--
> >   tools/testing/selftests/bpf/test_sysctl.c |  9 ++++++++-
> >   4 files changed, 22 insertions(+), 9 deletions(-)
> > 
> > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > index 92c6e31fb008..2ce57645f3cd 100644
> > --- a/include/linux/filter.h
> > +++ b/include/linux/filter.h
> > @@ -749,14 +749,14 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
> >   }
> >   static inline u8
> > -bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
> > +bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
> >   {
> > -	u8 load_off = off & (size_default - 1);
> > +	u8 access_off = off & (size_default - 1);
> >   #ifdef __LITTLE_ENDIAN
> > -	return load_off * 8;
> > +	return access_off;
> >   #else
> > -	return (size_default - (load_off + size)) * 8;
> > +	return size_default - (access_off + size);
> >   #endif
> >   }
> > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> > index 0a00eaca6fae..00c4647ce92a 100644
> > --- a/kernel/bpf/cgroup.c
> > +++ b/kernel/bpf/cgroup.c
> > @@ -1325,6 +1325,7 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
> >   				     struct bpf_prog *prog, u32 *target_size)
> >   {
> >   	struct bpf_insn *insn = insn_buf;
> > +	u32 read_size;
> >   	switch (si->off) {
> >   	case offsetof(struct bpf_sysctl, write):
> > @@ -1356,7 +1357,9 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
> >   				treg, si->dst_reg,
> >   				offsetof(struct bpf_sysctl_kern, ppos));
> >   			*insn++ = BPF_STX_MEM(
> > -				BPF_SIZEOF(u32), treg, si->src_reg, 0);
> > +				BPF_SIZEOF(u32), treg, si->src_reg,
> > +				bpf_ctx_narrow_access_offset(
> > +					0, sizeof(u32), sizeof(loff_t)));
> >   			*insn++ = BPF_LDX_MEM(
> >   				BPF_DW, treg, si->dst_reg,
> >   				offsetof(struct bpf_sysctl_kern, tmp_reg));
> > @@ -1365,8 +1368,11 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
> >   				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
> >   				si->dst_reg, si->src_reg,
> >   				offsetof(struct bpf_sysctl_kern, ppos));
> > +			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
> >   			*insn++ = BPF_LDX_MEM(
> > -				BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
> > +				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
> > +				bpf_ctx_narrow_access_offset(
> > +					0, read_size, sizeof(loff_t)));
> 
> I see what you're doing, but generally I'm a bit puzzled on why we need these
> partial store/loads and cannot access the full loff_t value internally with the
> rewrite. Why was BPF_SIZEOF(u32) chosen in the first place? Looks like git history
> doesn't have any useful insight here ... Andrey mind to put some clarifications
> on this? Thx

Hi Daniel,

Sorry for delay, I took a week of vacation w/ pretty limited Internet
access and w/o access to my work e-mail (so answering from personal
one).

loff_t is signed and can be negative, but file position used in
bpf_sysctl ctx can only be >= 0 since negative file position doesn't
make sense here. When picking up the unsigned int size I made an
assumption that writing >4GB to sysctl file in /proc/sys would never be
needed in practice so I chose u32. This also allowed not to care about
negative values passed to loff_t since MSB is never touched and stays
zero.

You're right the commit message could be more detailed on this part.

One thing I missed in the initial patch though is the endianness and,
yes, making the code play well with both LE and BE requires this
additional work.


> >   		}
> >   		*target_size = sizeof(u32);
> >   		break;
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index c84d83f86141..d1d4c995a9eb 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -8616,8 +8616,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
> >   		}
> >   		if (is_narrower_load && size < target_size) {
> > -			u8 shift = bpf_ctx_narrow_load_shift(off, size,
> > -							     size_default);
> > +			u8 shift = bpf_ctx_narrow_access_offset(
> > +				off, size, size_default) * 8;
> >   			if (ctx_field_size <= 4) {
> >   				if (shift)
> >   					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
> > diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
> > index a3bebd7c68dd..abc26248a7f1 100644
> > --- a/tools/testing/selftests/bpf/test_sysctl.c
> > +++ b/tools/testing/selftests/bpf/test_sysctl.c
> > @@ -31,6 +31,7 @@ struct sysctl_test {
> >   	enum bpf_attach_type attach_type;
> >   	const char *sysctl;
> >   	int open_flags;
> > +	int seek;
> >   	const char *newval;
> >   	const char *oldval;
> >   	enum {
> > @@ -139,7 +140,7 @@ static struct sysctl_test tests[] = {
> >   			/* If (file_pos == X) */
> >   			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
> >   				    offsetof(struct bpf_sysctl, file_pos)),
> > -			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 2),
> > +			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 3, 2),
> >   			/* return ALLOW; */
> >   			BPF_MOV64_IMM(BPF_REG_0, 1),
> > @@ -152,6 +153,7 @@ static struct sysctl_test tests[] = {
> >   		.attach_type = BPF_CGROUP_SYSCTL,
> >   		.sysctl = "kernel/ostype",
> >   		.open_flags = O_RDONLY,
> > +		.seek = 3,
> >   		.result = SUCCESS,
> >   	},
> >   	{
> > @@ -1442,6 +1444,11 @@ static int access_sysctl(const char *sysctl_path,
> >   	if (fd < 0)
> >   		return fd;
> > +	if (test->seek && lseek(fd, test->seek, SEEK_SET) == -1) {
> > +		log_err("lseek(%d) failed", test->seek);
> > +		goto err;
> > +	}
> > +
> >   	if (test->open_flags == O_RDONLY) {
> >   		char buf[128];
> > 
>
Daniel Borkmann Sept. 16, 2019, 10:11 a.m. UTC | #5
On 8/16/19 12:53 PM, Ilya Leoshkevich wrote:
> "ctx:file_pos sysctl:read write ok" fails on s390 with "Read value  !=
> nux". This is because verifier rewrites a complete 32-bit
> bpf_sysctl.file_pos update to a partial update of the first 32 bits of
> 64-bit *bpf_sysctl_kern.ppos, which is not correct on big-endian
> systems.
> 
> Fix by using an offset on big-endian systems.
> 
> Ditto for bpf_sysctl.file_pos reads. Currently the test does not detect
> a problem there, since it expects to see 0, which it gets with high
> probability in error cases, so change it to seek to offset 3 and expect
> 3 in bpf_sysctl.file_pos.
> 
> Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx")
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>

Applied, thanks!

Patch
diff mbox series

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 92c6e31fb008..2ce57645f3cd 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -749,14 +749,14 @@  bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
 }
 
 static inline u8
-bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
+bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
 {
-	u8 load_off = off & (size_default - 1);
+	u8 access_off = off & (size_default - 1);
 
 #ifdef __LITTLE_ENDIAN
-	return load_off * 8;
+	return access_off;
 #else
-	return (size_default - (load_off + size)) * 8;
+	return size_default - (access_off + size);
 #endif
 }
 
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 0a00eaca6fae..00c4647ce92a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1325,6 +1325,7 @@  static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
 				     struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
+	u32 read_size;
 
 	switch (si->off) {
 	case offsetof(struct bpf_sysctl, write):
@@ -1356,7 +1357,9 @@  static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
 				treg, si->dst_reg,
 				offsetof(struct bpf_sysctl_kern, ppos));
 			*insn++ = BPF_STX_MEM(
-				BPF_SIZEOF(u32), treg, si->src_reg, 0);
+				BPF_SIZEOF(u32), treg, si->src_reg,
+				bpf_ctx_narrow_access_offset(
+					0, sizeof(u32), sizeof(loff_t)));
 			*insn++ = BPF_LDX_MEM(
 				BPF_DW, treg, si->dst_reg,
 				offsetof(struct bpf_sysctl_kern, tmp_reg));
@@ -1365,8 +1368,11 @@  static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
 				si->dst_reg, si->src_reg,
 				offsetof(struct bpf_sysctl_kern, ppos));
+			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
 			*insn++ = BPF_LDX_MEM(
-				BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
+				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
+				bpf_ctx_narrow_access_offset(
+					0, read_size, sizeof(loff_t)));
 		}
 		*target_size = sizeof(u32);
 		break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c84d83f86141..d1d4c995a9eb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8616,8 +8616,8 @@  static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		}
 
 		if (is_narrower_load && size < target_size) {
-			u8 shift = bpf_ctx_narrow_load_shift(off, size,
-							     size_default);
+			u8 shift = bpf_ctx_narrow_access_offset(
+				off, size, size_default) * 8;
 			if (ctx_field_size <= 4) {
 				if (shift)
 					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
index a3bebd7c68dd..abc26248a7f1 100644
--- a/tools/testing/selftests/bpf/test_sysctl.c
+++ b/tools/testing/selftests/bpf/test_sysctl.c
@@ -31,6 +31,7 @@  struct sysctl_test {
 	enum bpf_attach_type attach_type;
 	const char *sysctl;
 	int open_flags;
+	int seek;
 	const char *newval;
 	const char *oldval;
 	enum {
@@ -139,7 +140,7 @@  static struct sysctl_test tests[] = {
 			/* If (file_pos == X) */
 			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
 				    offsetof(struct bpf_sysctl, file_pos)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 2),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 3, 2),
 
 			/* return ALLOW; */
 			BPF_MOV64_IMM(BPF_REG_0, 1),
@@ -152,6 +153,7 @@  static struct sysctl_test tests[] = {
 		.attach_type = BPF_CGROUP_SYSCTL,
 		.sysctl = "kernel/ostype",
 		.open_flags = O_RDONLY,
+		.seek = 3,
 		.result = SUCCESS,
 	},
 	{
@@ -1442,6 +1444,11 @@  static int access_sysctl(const char *sysctl_path,
 	if (fd < 0)
 		return fd;
 
+	if (test->seek && lseek(fd, test->seek, SEEK_SET) == -1) {
+		log_err("lseek(%d) failed", test->seek);
+		goto err;
+	}
+
 	if (test->open_flags == O_RDONLY) {
 		char buf[128];