[bpf] bpf: fix accessing bpf_sysctl.file_pos on s390
diff mbox series

Message ID 20190815112044.38420-1-iii@linux.ibm.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series
  • [bpf] bpf: fix accessing bpf_sysctl.file_pos on s390
Related show

Commit Message

Ilya Leoshkevich Aug. 15, 2019, 11:20 a.m. UTC
"ctx:file_pos sysctl:read write ok" fails on s390 with "Read value  !=
nux". This is because verifier rewrites a complete 32-bit
bpf_sysctl.file_pos update to a partial update of the first 32 bits of
64-bit *bpf_sysctl_kern.ppos, which is not correct on big-endian
systems.

Fix by using an offset on big-endian systems.

Ditto for bpf_sysctl.file_pos reads. Currently the test does not detect
a problem there, since it expects to see 0, which it gets with high
probability in error cases, so change it to seek to offset 3 and expect
3 in bpf_sysctl.file_pos.

Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
---
 include/linux/filter.h                    | 10 ++++++++++
 kernel/bpf/cgroup.c                       |  9 +++++++--
 tools/testing/selftests/bpf/test_sysctl.c |  9 ++++++++-
 3 files changed, 25 insertions(+), 3 deletions(-)

Comments

Andrey Ignatov Aug. 15, 2019, 7:58 p.m. UTC | #1
Ilya Leoshkevich <iii@linux.ibm.com> [Thu, 2019-08-15 11:20 -0700]:
> "ctx:file_pos sysctl:read write ok" fails on s390 with "Read value  !=
> nux". This is because verifier rewrites a complete 32-bit
> bpf_sysctl.file_pos update to a partial update of the first 32 bits of
> 64-bit *bpf_sysctl_kern.ppos, which is not correct on big-endian
> systems.
> 
> Fix by using an offset on big-endian systems.
> 
> Ditto for bpf_sysctl.file_pos reads. Currently the test does not detect
> a problem there, since it expects to see 0, which it gets with high
> probability in error cases, so change it to seek to offset 3 and expect
> 3 in bpf_sysctl.file_pos.
> 
> Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx")
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>

Right, I missed this. Thanks for fixing!

Acked-by: Andrey Ignatov <rdna@fb.com>

> ---
>  include/linux/filter.h                    | 10 ++++++++++
>  kernel/bpf/cgroup.c                       |  9 +++++++--
>  tools/testing/selftests/bpf/test_sysctl.c |  9 ++++++++-
>  3 files changed, 25 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 92c6e31fb008..94e81c56d81c 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -760,6 +760,16 @@ bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
>  #endif
>  }
>  
> +static inline s16
> +bpf_ctx_narrow_access_offset(size_t variable_size, size_t access_size)
> +{
> +#ifdef __LITTLE_ENDIAN
> +	return 0;
> +#else
> +	return variable_size - access_size;
> +#endif
> +}
> +
>  #define bpf_ctx_wide_access_ok(off, size, type, field)			\
>  	(size == sizeof(__u64) &&					\
>  	off >= offsetof(type, field) &&					\
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 0a00eaca6fae..b835fbb13ea8 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -1356,7 +1356,9 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
>  				treg, si->dst_reg,
>  				offsetof(struct bpf_sysctl_kern, ppos));
>  			*insn++ = BPF_STX_MEM(
> -				BPF_SIZEOF(u32), treg, si->src_reg, 0);
> +				BPF_SIZEOF(u32), treg, si->src_reg,
> +				bpf_ctx_narrow_access_offset(
> +					sizeof(loff_t), sizeof(u32)));
>  			*insn++ = BPF_LDX_MEM(
>  				BPF_DW, treg, si->dst_reg,
>  				offsetof(struct bpf_sysctl_kern, tmp_reg));
> @@ -1366,7 +1368,10 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
>  				si->dst_reg, si->src_reg,
>  				offsetof(struct bpf_sysctl_kern, ppos));
>  			*insn++ = BPF_LDX_MEM(
> -				BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
> +				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
> +				bpf_ctx_narrow_access_offset(
> +					sizeof(loff_t),
> +					bpf_size_to_bytes(BPF_SIZE(si->code))));
>  		}
>  		*target_size = sizeof(u32);
>  		break;
> diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
> index a3bebd7c68dd..abc26248a7f1 100644
> --- a/tools/testing/selftests/bpf/test_sysctl.c
> +++ b/tools/testing/selftests/bpf/test_sysctl.c
> @@ -31,6 +31,7 @@ struct sysctl_test {
>  	enum bpf_attach_type attach_type;
>  	const char *sysctl;
>  	int open_flags;
> +	int seek;
>  	const char *newval;
>  	const char *oldval;
>  	enum {
> @@ -139,7 +140,7 @@ static struct sysctl_test tests[] = {
>  			/* If (file_pos == X) */
>  			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
>  				    offsetof(struct bpf_sysctl, file_pos)),
> -			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 2),
> +			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 3, 2),
>  
>  			/* return ALLOW; */
>  			BPF_MOV64_IMM(BPF_REG_0, 1),
> @@ -152,6 +153,7 @@ static struct sysctl_test tests[] = {
>  		.attach_type = BPF_CGROUP_SYSCTL,
>  		.sysctl = "kernel/ostype",
>  		.open_flags = O_RDONLY,
> +		.seek = 3,
>  		.result = SUCCESS,
>  	},
>  	{
> @@ -1442,6 +1444,11 @@ static int access_sysctl(const char *sysctl_path,
>  	if (fd < 0)
>  		return fd;
>  
> +	if (test->seek && lseek(fd, test->seek, SEEK_SET) == -1) {
> +		log_err("lseek(%d) failed", test->seek);
> +		goto err;
> +	}
> +
>  	if (test->open_flags == O_RDONLY) {
>  		char buf[128];
>
Yonghong Song Aug. 15, 2019, 11:01 p.m. UTC | #2
On 8/15/19 4:20 AM, Ilya Leoshkevich wrote:
> "ctx:file_pos sysctl:read write ok" fails on s390 with "Read value  !=
> nux". This is because verifier rewrites a complete 32-bit
> bpf_sysctl.file_pos update to a partial update of the first 32 bits of
> 64-bit *bpf_sysctl_kern.ppos, which is not correct on big-endian
> systems.
> 
> Fix by using an offset on big-endian systems.
> 
> Ditto for bpf_sysctl.file_pos reads. Currently the test does not detect
> a problem there, since it expects to see 0, which it gets with high
> probability in error cases, so change it to seek to offset 3 and expect
> 3 in bpf_sysctl.file_pos.
> 
> Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx")
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> ---
>   include/linux/filter.h                    | 10 ++++++++++
>   kernel/bpf/cgroup.c                       |  9 +++++++--
>   tools/testing/selftests/bpf/test_sysctl.c |  9 ++++++++-
>   3 files changed, 25 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 92c6e31fb008..94e81c56d81c 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -760,6 +760,16 @@ bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
>   #endif
>   }
>   
> +static inline s16
> +bpf_ctx_narrow_access_offset(size_t variable_size, size_t access_size)
> +{
> +#ifdef __LITTLE_ENDIAN
> +	return 0;
> +#else
> +	return variable_size - access_size;
> +#endif
> +}

The change looks correct to me.
But now in include/linux/filter.h we have two inline helpers:

/*
 * Existing helper: shift amount for a narrow load of `size` bytes at context
 * offset `off` from a field whose full width is `size_default` bytes.
 * The result is a byte distance multiplied by 8 (presumably a bit count used
 * to shift the wanted bytes down to the low end of the loaded value).
 */
static inline u8
bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
{
         /* Byte offset of the access within the field; the mask assumes
          * size_default is a power of two -- TODO confirm with callers.
          */
         u8 load_off = off & (size_default - 1);

#ifdef __LITTLE_ENDIAN
         /* Little-endian: distance from the start of the field. */
         return load_off * 8;
#else
         /* Otherwise: distance from the end of the access to the end of
          * the field.
          */
         return (size_default - (load_off + size)) * 8;
#endif
}

/*
 * Helper added by this patch (v1): byte offset to add to the address of a
 * `variable_size`-byte variable when only `access_size` bytes of it are
 * accessed, as used for the 32-bit file_pos access to the 64-bit *ppos.
 */
static inline s16
bpf_ctx_narrow_access_offset(size_t variable_size, size_t access_size)
{
#ifdef __LITTLE_ENDIAN
         /* Little-endian: the narrow access starts at the variable itself. */
         return 0;
#else
         /* Otherwise: skip the leading bytes so that the access covers the
          * last access_size bytes of the variable.
          */
         return variable_size - access_size;
#endif
}

It would be good if we can have ifdef __LITTLE_ENDIAN only in one place.
How about something like below:

/*
 * Proposed replacement: one helper holding the only endianness #ifdef.
 * Returns a byte offset for a `size`-byte access at context offset `off`
 * into a field that is `size_default` bytes wide.
 */
static inline u8
bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
{
         /* Byte offset of the access within the field; the mask assumes
          * size_default is a power of two -- TODO confirm with callers.
          */
         u8 access_off = off & (size_default - 1);

#ifdef __LITTLE_ENDIAN
         /* Little-endian: offset from the start of the field. */
         return access_off;
#else
         /* Otherwise: bytes remaining between the end of the access and
          * the end of the field.
          */
         return size_default - (access_off + size);
#endif
}

/*
 * Proposed rewrite of the load-shift helper on top of
 * bpf_ctx_narrow_access_offset(): the byte offset it returns multiplied by 8,
 * so no separate endianness #ifdef is needed here.
 */
static inline u8
bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
{
         return bpf_ctx_narrow_access_offset(off, size, size_default) * 8;
}

> +
>   #define bpf_ctx_wide_access_ok(off, size, type, field)			\
>   	(size == sizeof(__u64) &&					\
>   	off >= offsetof(type, field) &&					\
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 0a00eaca6fae..b835fbb13ea8 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -1356,7 +1356,9 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
>   				treg, si->dst_reg,
>   				offsetof(struct bpf_sysctl_kern, ppos));
>   			*insn++ = BPF_STX_MEM(
> -				BPF_SIZEOF(u32), treg, si->src_reg, 0);
> +				BPF_SIZEOF(u32), treg, si->src_reg,
> +				bpf_ctx_narrow_access_offset(
> +					sizeof(loff_t), sizeof(u32)));
>   			*insn++ = BPF_LDX_MEM(
>   				BPF_DW, treg, si->dst_reg,
>   				offsetof(struct bpf_sysctl_kern, tmp_reg));
> @@ -1366,7 +1368,10 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
>   				si->dst_reg, si->src_reg,
>   				offsetof(struct bpf_sysctl_kern, ppos));
>   			*insn++ = BPF_LDX_MEM(
> -				BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
> +				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
> +				bpf_ctx_narrow_access_offset(
> +					sizeof(loff_t),
> +					bpf_size_to_bytes(BPF_SIZE(si->code))));
>   		}
>   		*target_size = sizeof(u32);
>   		break;
> diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
> index a3bebd7c68dd..abc26248a7f1 100644
> --- a/tools/testing/selftests/bpf/test_sysctl.c
> +++ b/tools/testing/selftests/bpf/test_sysctl.c
> @@ -31,6 +31,7 @@ struct sysctl_test {
>   	enum bpf_attach_type attach_type;
>   	const char *sysctl;
>   	int open_flags;
> +	int seek;
>   	const char *newval;
>   	const char *oldval;
>   	enum {
> @@ -139,7 +140,7 @@ static struct sysctl_test tests[] = {
>   			/* If (file_pos == X) */
>   			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
>   				    offsetof(struct bpf_sysctl, file_pos)),
> -			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 2),
> +			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 3, 2),
>   
>   			/* return ALLOW; */
>   			BPF_MOV64_IMM(BPF_REG_0, 1),
> @@ -152,6 +153,7 @@ static struct sysctl_test tests[] = {
>   		.attach_type = BPF_CGROUP_SYSCTL,
>   		.sysctl = "kernel/ostype",
>   		.open_flags = O_RDONLY,
> +		.seek = 3,
>   		.result = SUCCESS,
>   	},
>   	{
> @@ -1442,6 +1444,11 @@ static int access_sysctl(const char *sysctl_path,
>   	if (fd < 0)
>   		return fd;
>   
> +	if (test->seek && lseek(fd, test->seek, SEEK_SET) == -1) {
> +		log_err("lseek(%d) failed", test->seek);
> +		goto err;
> +	}
> +
>   	if (test->open_flags == O_RDONLY) {
>   		char buf[128];
>   
>
Ilya Leoshkevich Aug. 16, 2019, 10:34 a.m. UTC | #3
> Am 16.08.2019 um 01:01 schrieb Yonghong Song <yhs@fb.com>:
> 
> 
> 
> On 8/15/19 4:20 AM, Ilya Leoshkevich wrote:
>> "ctx:file_pos sysctl:read write ok" fails on s390 with "Read value  !=
>> nux". This is because verifier rewrites a complete 32-bit
>> bpf_sysctl.file_pos update to a partial update of the first 32 bits of
>> 64-bit *bpf_sysctl_kern.ppos, which is not correct on big-endian
>> systems.
>> 
>> Fix by using an offset on big-endian systems.
>> 
>> Ditto for bpf_sysctl.file_pos reads. Currently the test does not detect
>> a problem there, since it expects to see 0, which it gets with high
>> probability in error cases, so change it to seek to offset 3 and expect
>> 3 in bpf_sysctl.file_pos.
>> 
>> Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx")
>> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
>> ---
>>  include/linux/filter.h                    | 10 ++++++++++
>>  kernel/bpf/cgroup.c                       |  9 +++++++--
>>  tools/testing/selftests/bpf/test_sysctl.c |  9 ++++++++-
>>  3 files changed, 25 insertions(+), 3 deletions(-)
>> 
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 92c6e31fb008..94e81c56d81c 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -760,6 +760,16 @@ bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
>>  #endif
>>  }
>> 
>> +static inline s16
>> +bpf_ctx_narrow_access_offset(size_t variable_size, size_t access_size)
>> +{
>> +#ifdef __LITTLE_ENDIAN
>> +	return 0;
>> +#else
>> +	return variable_size - access_size;
>> +#endif
>> +}
> 
> The change looks correct to me.
> But now in include/linux/filter.h we have two inline helpers:
> 
> static inline u8
> bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
> {
>         u8 load_off = off & (size_default - 1);
> 
> #ifdef __LITTLE_ENDIAN
>         return load_off * 8;
> #else
>         return (size_default - (load_off + size)) * 8;
> #endif
> }
> 
> static inline s16
> bpf_ctx_narrow_access_offset(size_t variable_size, size_t access_size)
> {
> #ifdef __LITTLE_ENDIAN
>         return 0;
> #else
>         return variable_size - access_size;
> #endif
> }
> 
> It would be good if we can have ifdef __LITTLE_ENDIAN only in one place.
> How about something like below:
> 
> static inline u8
> bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
> {
>         u8 access_off = off & (size_default - 1);
> 
> #ifdef __LITTLE_ENDIAN
>         return access_off;
> #else
>         return size_default - (access_off + size);
> #endif
> }
> 
> static inline u8
> bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
> {
>         return bpf_ctx_narrow_access_offset(off, size, size_default) * 8;
> }

This does indeed look better, thanks! In this case, we don't even need
bpf_ctx_narrow_load_shift() anymore, since doing

u8 shift = bpf_ctx_narrow_access_offset(
       off, size, size_default) * 8;

directly is quite readable. I will test and send a v2.

Patch
diff mbox series

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 92c6e31fb008..94e81c56d81c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -760,6 +760,16 @@  bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default)
 #endif
 }
 
+static inline s16
+bpf_ctx_narrow_access_offset(size_t variable_size, size_t access_size)
+{
+#ifdef __LITTLE_ENDIAN
+	return 0;
+#else
+	return variable_size - access_size;
+#endif
+}
+
 #define bpf_ctx_wide_access_ok(off, size, type, field)			\
 	(size == sizeof(__u64) &&					\
 	off >= offsetof(type, field) &&					\
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 0a00eaca6fae..b835fbb13ea8 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1356,7 +1356,9 @@  static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
 				treg, si->dst_reg,
 				offsetof(struct bpf_sysctl_kern, ppos));
 			*insn++ = BPF_STX_MEM(
-				BPF_SIZEOF(u32), treg, si->src_reg, 0);
+				BPF_SIZEOF(u32), treg, si->src_reg,
+				bpf_ctx_narrow_access_offset(
+					sizeof(loff_t), sizeof(u32)));
 			*insn++ = BPF_LDX_MEM(
 				BPF_DW, treg, si->dst_reg,
 				offsetof(struct bpf_sysctl_kern, tmp_reg));
@@ -1366,7 +1368,10 @@  static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
 				si->dst_reg, si->src_reg,
 				offsetof(struct bpf_sysctl_kern, ppos));
 			*insn++ = BPF_LDX_MEM(
-				BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
+				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
+				bpf_ctx_narrow_access_offset(
+					sizeof(loff_t),
+					bpf_size_to_bytes(BPF_SIZE(si->code))));
 		}
 		*target_size = sizeof(u32);
 		break;
diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
index a3bebd7c68dd..abc26248a7f1 100644
--- a/tools/testing/selftests/bpf/test_sysctl.c
+++ b/tools/testing/selftests/bpf/test_sysctl.c
@@ -31,6 +31,7 @@  struct sysctl_test {
 	enum bpf_attach_type attach_type;
 	const char *sysctl;
 	int open_flags;
+	int seek;
 	const char *newval;
 	const char *oldval;
 	enum {
@@ -139,7 +140,7 @@  static struct sysctl_test tests[] = {
 			/* If (file_pos == X) */
 			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
 				    offsetof(struct bpf_sysctl, file_pos)),
-			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 2),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 3, 2),
 
 			/* return ALLOW; */
 			BPF_MOV64_IMM(BPF_REG_0, 1),
@@ -152,6 +153,7 @@  static struct sysctl_test tests[] = {
 		.attach_type = BPF_CGROUP_SYSCTL,
 		.sysctl = "kernel/ostype",
 		.open_flags = O_RDONLY,
+		.seek = 3,
 		.result = SUCCESS,
 	},
 	{
@@ -1442,6 +1444,11 @@  static int access_sysctl(const char *sysctl_path,
 	if (fd < 0)
 		return fd;
 
+	if (test->seek && lseek(fd, test->seek, SEEK_SET) == -1) {
+		log_err("lseek(%d) failed", test->seek);
+		goto err;
+	}
+
 	if (test->open_flags == O_RDONLY) {
 		char buf[128];