diff mbox series

[bpf-next] bpf: enable btf for use in all maps

Message ID 20180809194220.17484-1-daniel@iogearbox.net
State Changes Requested, archived
Delegated to: BPF Maintainers
Headers show
Series [bpf-next] bpf: enable btf for use in all maps | expand

Commit Message

Daniel Borkmann Aug. 9, 2018, 7:42 p.m. UTC
Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
the basic arraymap") enabled support for BTF and dumping via
BPF fs for arraymap. However, both can be decoupled from each
other such that all BPF maps can be supported for attaching
BTF key/value information, while not all maps necessarily
need to dump via map_seq_show_elem() callback.

The check in array_map_check_btf() can be generalized as
ultimatively the key and value size is the only contraint
that needs to match for the map. The fact that the key needs
to be of type int is optional; it could be any data type as
long as it matches the 4 byte key size, just like hash table
key or others could be of any data type as well.

Minimal example of a hash table dump which then works out
of the box for bpftool:

  # bpftool map dump id 19
  [{
          "key": {
              "": {
                  "vip": 0,
                  "vipv6": []
              },
              "port": 0,
              "family": 0,
              "proto": 0
          },
          "value": {
              "flags": 0,
              "vip_num": 0
          }
      }
  ]

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h   |  4 +---
 kernel/bpf/arraymap.c | 27 ---------------------------
 kernel/bpf/inode.c    |  3 ++-
 kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
 4 files changed, 23 insertions(+), 35 deletions(-)

Comments

Yonghong Song Aug. 9, 2018, 8:50 p.m. UTC | #1
On 8/9/18 12:42 PM, Daniel Borkmann wrote:
> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
> the basic arraymap") enabled support for BTF and dumping via
> BPF fs for arraymap. However, both can be decoupled from each
> other such that all BPF maps can be supported for attaching
> BTF key/value information, while not all maps necessarily
> need to dump via map_seq_show_elem() callback.
> 
> The check in array_map_check_btf() can be generalized as
> ultimatively the key and value size is the only contraint
> that needs to match for the map. The fact that the key needs
> to be of type int is optional; it could be any data type as
> long as it matches the 4 byte key size, just like hash table
> key or others could be of any data type as well.
> 
> Minimal example of a hash table dump which then works out
> of the box for bpftool:
> 
>    # bpftool map dump id 19
>    [{
>            "key": {
>                "": {
>                    "vip": 0,
>                    "vipv6": []
>                },
>                "port": 0,
>                "family": 0,
>                "proto": 0
>            },
>            "value": {
>                "flags": 0,
>                "vip_num": 0
>            }
>        }
>    ]
> 
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Yonghong Song <yhs@fb.com>

LGTM. Thanks!
Acked-by: Yonghong Song <yhs@fb.com>
Alexei Starovoitov Aug. 9, 2018, 9:14 p.m. UTC | #2
On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote:
> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
> the basic arraymap") enabled support for BTF and dumping via
> BPF fs for arraymap. However, both can be decoupled from each
> other such that all BPF maps can be supported for attaching
> BTF key/value information, while not all maps necessarily
> need to dump via map_seq_show_elem() callback.
> 
> The check in array_map_check_btf() can be generalized as
> ultimatively the key and value size is the only contraint
> that needs to match for the map. The fact that the key needs
> to be of type int is optional; it could be any data type as
> long as it matches the 4 byte key size, just like hash table
> key or others could be of any data type as well.
> 
> Minimal example of a hash table dump which then works out
> of the box for bpftool:
> 
>   # bpftool map dump id 19
>   [{
>           "key": {
>               "": {
>                   "vip": 0,
>                   "vipv6": []
>               },
>               "port": 0,
>               "family": 0,
>               "proto": 0
>           },
>           "value": {
>               "flags": 0,
>               "vip_num": 0
>           }
>       }
>   ]
> 
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h   |  4 +---
>  kernel/bpf/arraymap.c | 27 ---------------------------
>  kernel/bpf/inode.c    |  3 ++-
>  kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
>  4 files changed, 23 insertions(+), 35 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index cd8790d..eb76e8e 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -48,8 +48,6 @@ struct bpf_map_ops {
>  	u32 (*map_fd_sys_lookup_elem)(void *ptr);
>  	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
>  				  struct seq_file *m);
> -	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
> -			     u32 key_type_id, u32 value_type_id);
>  };
>  
>  struct bpf_map {
> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
>  
>  static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
>  {
> -	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
> +	return map->btf && map->ops->map_seq_show_elem;
>  }
>  
>  extern const struct bpf_map_ops bpf_map_offload_ops;
> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> index 2aa55d030..67f0bdf 100644
> --- a/kernel/bpf/arraymap.c
> +++ b/kernel/bpf/arraymap.c
> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
>  	rcu_read_unlock();
>  }
>  
> -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
> -			       u32 btf_key_id, u32 btf_value_id)
> -{
> -	const struct btf_type *key_type, *value_type;
> -	u32 key_size, value_size;
> -	u32 int_data;
> -
> -	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
> -	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> -		return -EINVAL;
> -
> -	int_data = *(u32 *)(key_type + 1);
> -	/* bpf array can only take a u32 key.  This check makes
> -	 * sure that the btf matches the attr used during map_create.
> -	 */
> -	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
> -	    BTF_INT_OFFSET(int_data))
> -		return -EINVAL;

I think most of these checks are still necessary for array type.
Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM
is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't makes sense.

For hash maps we probably need hash specific checks too. Otherwise
such sanity checks would need to be in kernel pretty printer and later
in user space too (bpftool and everything that will consume BTF),
since user space won't be able to trust kernel with sane key types.
Daniel Borkmann Aug. 9, 2018, 9:30 p.m. UTC | #3
On 08/09/2018 11:14 PM, Alexei Starovoitov wrote:
> On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote:
>> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
>> the basic arraymap") enabled support for BTF and dumping via
>> BPF fs for arraymap. However, both can be decoupled from each
>> other such that all BPF maps can be supported for attaching
>> BTF key/value information, while not all maps necessarily
>> need to dump via map_seq_show_elem() callback.
>>
>> The check in array_map_check_btf() can be generalized as
>> ultimatively the key and value size is the only contraint
>> that needs to match for the map. The fact that the key needs
>> to be of type int is optional; it could be any data type as
>> long as it matches the 4 byte key size, just like hash table
>> key or others could be of any data type as well.
>>
>> Minimal example of a hash table dump which then works out
>> of the box for bpftool:
>>
>>   # bpftool map dump id 19
>>   [{
>>           "key": {
>>               "": {
>>                   "vip": 0,
>>                   "vipv6": []
>>               },
>>               "port": 0,
>>               "family": 0,
>>               "proto": 0
>>           },
>>           "value": {
>>               "flags": 0,
>>               "vip_num": 0
>>           }
>>       }
>>   ]
>>
>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>> Cc: Yonghong Song <yhs@fb.com>
>> ---
>>  include/linux/bpf.h   |  4 +---
>>  kernel/bpf/arraymap.c | 27 ---------------------------
>>  kernel/bpf/inode.c    |  3 ++-
>>  kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
>>  4 files changed, 23 insertions(+), 35 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index cd8790d..eb76e8e 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -48,8 +48,6 @@ struct bpf_map_ops {
>>  	u32 (*map_fd_sys_lookup_elem)(void *ptr);
>>  	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
>>  				  struct seq_file *m);
>> -	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
>> -			     u32 key_type_id, u32 value_type_id);
>>  };
>>  
>>  struct bpf_map {
>> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
>>  
>>  static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
>>  {
>> -	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
>> +	return map->btf && map->ops->map_seq_show_elem;
>>  }
>>  
>>  extern const struct bpf_map_ops bpf_map_offload_ops;
>> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
>> index 2aa55d030..67f0bdf 100644
>> --- a/kernel/bpf/arraymap.c
>> +++ b/kernel/bpf/arraymap.c
>> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
>>  	rcu_read_unlock();
>>  }
>>  
>> -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
>> -			       u32 btf_key_id, u32 btf_value_id)
>> -{
>> -	const struct btf_type *key_type, *value_type;
>> -	u32 key_size, value_size;
>> -	u32 int_data;
>> -
>> -	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
>> -	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
>> -		return -EINVAL;
>> -
>> -	int_data = *(u32 *)(key_type + 1);
>> -	/* bpf array can only take a u32 key.  This check makes
>> -	 * sure that the btf matches the attr used during map_create.
>> -	 */
>> -	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
>> -	    BTF_INT_OFFSET(int_data))
>> -		return -EINVAL;
> 
> I think most of these checks are still necessary for array type.
> Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM
> is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't makes sense.

Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array,
on 32 bit it may be allowed due to sizeof(void *) == 4. BTF_KIND_ARRAY
could be array of u8 foo[4], for example, or u16 foo[2]. But how would
it ultimately be different from e.g. having 'struct a' versus 'struct b'
where both are of same size and while actual key has 'struct a', the one
who writes the prog resp. loads the BTF into the kernel would lie about
it stating it's of type 'struct b' instead? It's basically trusting the
app that it advertised sane key types which kernel is propagating back.

Thanks,
Daniel
Alexei Starovoitov Aug. 9, 2018, 9:44 p.m. UTC | #4
On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote:
> On 08/09/2018 11:14 PM, Alexei Starovoitov wrote:
> > On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote:
> >> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
> >> the basic arraymap") enabled support for BTF and dumping via
> >> BPF fs for arraymap. However, both can be decoupled from each
> >> other such that all BPF maps can be supported for attaching
> >> BTF key/value information, while not all maps necessarily
> >> need to dump via map_seq_show_elem() callback.
> >>
> >> The check in array_map_check_btf() can be generalized as
> >> ultimatively the key and value size is the only contraint
> >> that needs to match for the map. The fact that the key needs
> >> to be of type int is optional; it could be any data type as
> >> long as it matches the 4 byte key size, just like hash table
> >> key or others could be of any data type as well.
> >>
> >> Minimal example of a hash table dump which then works out
> >> of the box for bpftool:
> >>
> >>   # bpftool map dump id 19
> >>   [{
> >>           "key": {
> >>               "": {
> >>                   "vip": 0,
> >>                   "vipv6": []
> >>               },
> >>               "port": 0,
> >>               "family": 0,
> >>               "proto": 0
> >>           },
> >>           "value": {
> >>               "flags": 0,
> >>               "vip_num": 0
> >>           }
> >>       }
> >>   ]
> >>
> >> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> >> Cc: Yonghong Song <yhs@fb.com>
> >> ---
> >>  include/linux/bpf.h   |  4 +---
> >>  kernel/bpf/arraymap.c | 27 ---------------------------
> >>  kernel/bpf/inode.c    |  3 ++-
> >>  kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
> >>  4 files changed, 23 insertions(+), 35 deletions(-)
> >>
> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >> index cd8790d..eb76e8e 100644
> >> --- a/include/linux/bpf.h
> >> +++ b/include/linux/bpf.h
> >> @@ -48,8 +48,6 @@ struct bpf_map_ops {
> >>  	u32 (*map_fd_sys_lookup_elem)(void *ptr);
> >>  	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
> >>  				  struct seq_file *m);
> >> -	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
> >> -			     u32 key_type_id, u32 value_type_id);
> >>  };
> >>  
> >>  struct bpf_map {
> >> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
> >>  
> >>  static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
> >>  {
> >> -	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
> >> +	return map->btf && map->ops->map_seq_show_elem;
> >>  }
> >>  
> >>  extern const struct bpf_map_ops bpf_map_offload_ops;
> >> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> >> index 2aa55d030..67f0bdf 100644
> >> --- a/kernel/bpf/arraymap.c
> >> +++ b/kernel/bpf/arraymap.c
> >> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
> >>  	rcu_read_unlock();
> >>  }
> >>  
> >> -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
> >> -			       u32 btf_key_id, u32 btf_value_id)
> >> -{
> >> -	const struct btf_type *key_type, *value_type;
> >> -	u32 key_size, value_size;
> >> -	u32 int_data;
> >> -
> >> -	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
> >> -	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> >> -		return -EINVAL;
> >> -
> >> -	int_data = *(u32 *)(key_type + 1);
> >> -	/* bpf array can only take a u32 key.  This check makes
> >> -	 * sure that the btf matches the attr used during map_create.
> >> -	 */
> >> -	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
> >> -	    BTF_INT_OFFSET(int_data))
> >> -		return -EINVAL;
> > 
> > I think most of these checks are still necessary for array type.
> > Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM
> > is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't makes sense.
> 
> Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array,
> on 32 bit it may be allowed due to sizeof(void *) == 4. BTF_KIND_ARRAY
> could be array of u8 foo[4], for example, or u16 foo[2]. But how would
> it ultimately be different from e.g. having 'struct a' versus 'struct b'
> where both are of same size and while actual key has 'struct a', the one
> who writes the prog resp. loads the BTF into the kernel would lie about
> it stating it's of type 'struct b' instead? It's basically trusting the
> app that it advertised sane key types which kernel is propagating back.

for hash map - yes. the kernel cannot yet catch the lie that
key == 'struct a' that user said in BTF is not what program used
(which used 'struct b' of the same size).
Eventually we will annotate all load/store in the program and will
make sure that memory access match what BTF said.
For array we can catch the lie today that key is not 4 byte int,
since it matters from pretty printing point of view.
If it's PTR or ARRAY or STRUCT, the printer will go nuts.
When userspace can trust kernel that array key is u32 it can print
int arr[10];
just like gdb does:
(gdb) p arr
$1 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
(gdb) ptype arr
type = int [10]

What user printer suppose to do if kernel says that key=PTR or, worse, key=STRUCT ?
I cannot think of sane way of printing such array.
Even key=ENUM is not trivial to print, but I think it can be useful and
practical to use ENUM as a key, but for now I'd stick to INT only
like the check does today.
Daniel Borkmann Aug. 9, 2018, 10:43 p.m. UTC | #5
On 08/09/2018 11:44 PM, Alexei Starovoitov wrote:
> On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote:
>> On 08/09/2018 11:14 PM, Alexei Starovoitov wrote:
>>> On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote:
>>>> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
>>>> the basic arraymap") enabled support for BTF and dumping via
>>>> BPF fs for arraymap. However, both can be decoupled from each
>>>> other such that all BPF maps can be supported for attaching
>>>> BTF key/value information, while not all maps necessarily
>>>> need to dump via map_seq_show_elem() callback.
>>>>
>>>> The check in array_map_check_btf() can be generalized as
>>>> ultimatively the key and value size is the only contraint
>>>> that needs to match for the map. The fact that the key needs
>>>> to be of type int is optional; it could be any data type as
>>>> long as it matches the 4 byte key size, just like hash table
>>>> key or others could be of any data type as well.
>>>>
>>>> Minimal example of a hash table dump which then works out
>>>> of the box for bpftool:
>>>>
>>>>   # bpftool map dump id 19
>>>>   [{
>>>>           "key": {
>>>>               "": {
>>>>                   "vip": 0,
>>>>                   "vipv6": []
>>>>               },
>>>>               "port": 0,
>>>>               "family": 0,
>>>>               "proto": 0
>>>>           },
>>>>           "value": {
>>>>               "flags": 0,
>>>>               "vip_num": 0
>>>>           }
>>>>       }
>>>>   ]
>>>>
>>>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>>>> Cc: Yonghong Song <yhs@fb.com>
>>>> ---
>>>>  include/linux/bpf.h   |  4 +---
>>>>  kernel/bpf/arraymap.c | 27 ---------------------------
>>>>  kernel/bpf/inode.c    |  3 ++-
>>>>  kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
>>>>  4 files changed, 23 insertions(+), 35 deletions(-)
>>>>
>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>> index cd8790d..eb76e8e 100644
>>>> --- a/include/linux/bpf.h
>>>> +++ b/include/linux/bpf.h
>>>> @@ -48,8 +48,6 @@ struct bpf_map_ops {
>>>>  	u32 (*map_fd_sys_lookup_elem)(void *ptr);
>>>>  	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
>>>>  				  struct seq_file *m);
>>>> -	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
>>>> -			     u32 key_type_id, u32 value_type_id);
>>>>  };
>>>>  
>>>>  struct bpf_map {
>>>> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
>>>>  
>>>>  static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
>>>>  {
>>>> -	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
>>>> +	return map->btf && map->ops->map_seq_show_elem;
>>>>  }
>>>>  
>>>>  extern const struct bpf_map_ops bpf_map_offload_ops;
>>>> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
>>>> index 2aa55d030..67f0bdf 100644
>>>> --- a/kernel/bpf/arraymap.c
>>>> +++ b/kernel/bpf/arraymap.c
>>>> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
>>>>  	rcu_read_unlock();
>>>>  }
>>>>  
>>>> -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
>>>> -			       u32 btf_key_id, u32 btf_value_id)
>>>> -{
>>>> -	const struct btf_type *key_type, *value_type;
>>>> -	u32 key_size, value_size;
>>>> -	u32 int_data;
>>>> -
>>>> -	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
>>>> -	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
>>>> -		return -EINVAL;
>>>> -
>>>> -	int_data = *(u32 *)(key_type + 1);
>>>> -	/* bpf array can only take a u32 key.  This check makes
>>>> -	 * sure that the btf matches the attr used during map_create.
>>>> -	 */
>>>> -	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
>>>> -	    BTF_INT_OFFSET(int_data))
>>>> -		return -EINVAL;
>>>
>>> I think most of these checks are still necessary for array type.
>>> Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM
>>> is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't makes sense.
>>
>> Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array,
>> on 32 bit it may be allowed due to sizeof(void *) == 4. BTF_KIND_ARRAY
>> could be array of u8 foo[4], for example, or u16 foo[2]. But how would
>> it ultimately be different from e.g. having 'struct a' versus 'struct b'
>> where both are of same size and while actual key has 'struct a', the one
>> who writes the prog resp. loads the BTF into the kernel would lie about
>> it stating it's of type 'struct b' instead? It's basically trusting the
>> app that it advertised sane key types which kernel is propagating back.
> 
> for hash map - yes. the kernel cannot yet catch the lie that
> key == 'struct a' that user said in BTF is not what program used
> (which used 'struct b' of the same size).
> Eventually we will annotate all load/store in the program and will
> make sure that memory access match what BTF said.

But in that case, would you reject the program? E.g. from prog point of
view, it's just a buffer of x bytes, so key could be casted to different
struct/types potentially and used for lookup; similar with value if you
would go the route to annotate all access into it. I don't think this
serves as a security feature (since you might as well just load the prog
without BTF just fine), but it can be used to help verifier to perform
rewrites like in tracing for implicit bpf_probe_read() based on member
access. But also in that case, if you might have e.g. stale or wrong BTF
data e.g. of the whole kernel or some application it would follow/walk
that one instead. But such user error would be "acceptable" since it serves
as a hint, roughly similar to (explicitly) walking the data structures
based on the headers today, you do have better control in terms of header
mismatches in that you can ship the BTF directly from the build, but there's
still no guarantee in that sense that you "verified" that these bytes
originally were mapped to struct foo somewhere in a C program.

> For array we can catch the lie today that key is not 4 byte int,
> since it matters from pretty printing point of view.
> If it's PTR or ARRAY or STRUCT, the printer will go nuts.

In that case, would you enforce a hash map key size of 4 also to INT-only
instead of e.g. allowing STRUCT and various others? I don't think it's
that much different from pretty printer PoV (e.g. see bpftool transparently
handling them via btf_dumper_type()), it's some key with 4 bytes of memory
and a type matching these 4 bytes in size for pretty printer.

Thanks,
Daniel
Alexei Starovoitov Aug. 10, 2018, 2:13 a.m. UTC | #6
On Fri, Aug 10, 2018 at 12:43:20AM +0200, Daniel Borkmann wrote:
> On 08/09/2018 11:44 PM, Alexei Starovoitov wrote:
> > On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote:
> >> On 08/09/2018 11:14 PM, Alexei Starovoitov wrote:
> >>> On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote:
> >>>> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
> >>>> the basic arraymap") enabled support for BTF and dumping via
> >>>> BPF fs for arraymap. However, both can be decoupled from each
> >>>> other such that all BPF maps can be supported for attaching
> >>>> BTF key/value information, while not all maps necessarily
> >>>> need to dump via map_seq_show_elem() callback.
> >>>>
> >>>> The check in array_map_check_btf() can be generalized as
> >>>> ultimatively the key and value size is the only contraint
> >>>> that needs to match for the map. The fact that the key needs
> >>>> to be of type int is optional; it could be any data type as
> >>>> long as it matches the 4 byte key size, just like hash table
> >>>> key or others could be of any data type as well.
> >>>>
> >>>> Minimal example of a hash table dump which then works out
> >>>> of the box for bpftool:
> >>>>
> >>>>   # bpftool map dump id 19
> >>>>   [{
> >>>>           "key": {
> >>>>               "": {
> >>>>                   "vip": 0,
> >>>>                   "vipv6": []
> >>>>               },
> >>>>               "port": 0,
> >>>>               "family": 0,
> >>>>               "proto": 0
> >>>>           },
> >>>>           "value": {
> >>>>               "flags": 0,
> >>>>               "vip_num": 0
> >>>>           }
> >>>>       }
> >>>>   ]
> >>>>
> >>>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> >>>> Cc: Yonghong Song <yhs@fb.com>
> >>>> ---
> >>>>  include/linux/bpf.h   |  4 +---
> >>>>  kernel/bpf/arraymap.c | 27 ---------------------------
> >>>>  kernel/bpf/inode.c    |  3 ++-
> >>>>  kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
> >>>>  4 files changed, 23 insertions(+), 35 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >>>> index cd8790d..eb76e8e 100644
> >>>> --- a/include/linux/bpf.h
> >>>> +++ b/include/linux/bpf.h
> >>>> @@ -48,8 +48,6 @@ struct bpf_map_ops {
> >>>>  	u32 (*map_fd_sys_lookup_elem)(void *ptr);
> >>>>  	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
> >>>>  				  struct seq_file *m);
> >>>> -	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
> >>>> -			     u32 key_type_id, u32 value_type_id);
> >>>>  };
> >>>>  
> >>>>  struct bpf_map {
> >>>> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
> >>>>  
> >>>>  static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
> >>>>  {
> >>>> -	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
> >>>> +	return map->btf && map->ops->map_seq_show_elem;
> >>>>  }
> >>>>  
> >>>>  extern const struct bpf_map_ops bpf_map_offload_ops;
> >>>> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> >>>> index 2aa55d030..67f0bdf 100644
> >>>> --- a/kernel/bpf/arraymap.c
> >>>> +++ b/kernel/bpf/arraymap.c
> >>>> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
> >>>>  	rcu_read_unlock();
> >>>>  }
> >>>>  
> >>>> -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
> >>>> -			       u32 btf_key_id, u32 btf_value_id)
> >>>> -{
> >>>> -	const struct btf_type *key_type, *value_type;
> >>>> -	u32 key_size, value_size;
> >>>> -	u32 int_data;
> >>>> -
> >>>> -	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
> >>>> -	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> >>>> -		return -EINVAL;
> >>>> -
> >>>> -	int_data = *(u32 *)(key_type + 1);
> >>>> -	/* bpf array can only take a u32 key.  This check makes
> >>>> -	 * sure that the btf matches the attr used during map_create.
> >>>> -	 */
> >>>> -	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
> >>>> -	    BTF_INT_OFFSET(int_data))
> >>>> -		return -EINVAL;
> >>>
> >>> I think most of these checks are still necessary for array type.
> >>> Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM
> >>> is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't makes sense.
> >>
> >> Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array,
> >> on 32 bit it may be allowed due to sizeof(void *) == 4. BTF_KIND_ARRAY
> >> could be array of u8 foo[4], for example, or u16 foo[2]. But how would
> >> it ultimately be different from e.g. having 'struct a' versus 'struct b'
> >> where both are of same size and while actual key has 'struct a', the one
> >> who writes the prog resp. loads the BTF into the kernel would lie about
> >> it stating it's of type 'struct b' instead? It's basically trusting the
> >> app that it advertised sane key types which kernel is propagating back.
> > 
> > for hash map - yes. the kernel cannot yet catch the lie that
> > key == 'struct a' that user said in BTF is not what program used
> > (which used 'struct b' of the same size).
> > Eventually we will annotate all load/store in the program and will
> > make sure that memory access match what BTF said.
> 
> But in that case, would you reject the program? E.g. from prog point of
> view, it's just a buffer of x bytes, so key could be casted to different
> struct/types potentially and used for lookup; similar with value if you
> would go the route to annotate all access into it. I don't think this
> serves as a security feature (since you might as well just load the prog
> without BTF just fine), but it can be used to help verifier to perform
> rewrites like in tracing for implicit bpf_probe_read() based on member
> access. But also in that case, if you might have e.g. stale or wrong BTF
> data e.g. of the whole kernel or some application it would follow/walk
> that one instead. But such user error would be "acceptable" since it serves
> as a hint, roughly similar to (explicitly) walking the data structures
> based on the headers today, you do have better control in terms of header
> mismatches in that you can ship the BTF directly from the build, but there's
> still no guarantee in that sense that you "verified" that these bytes
> originally were mapped to struct foo somewhere in a C program.

I wouldn't view such key checks as safety feature, but rather
as honesty check. Meaning that user space shouldn't be able to cheat
by passing completely bogus BTF into the kernel that only
statisfies single size check.
Say, BTF has a key that is a 4 byte struct { char a,b,c,d; };
but program operates with it as u32. I very much would like
the verifier to notice and reject such program, since if C code
has struct { char a,b,c,d; }; the compiler won't generate u32 access
to it unless C code type casts, but then llvm will warn. So for u32
to legitimately appear in the generated code the struct should be:
union {
  struct { char a,b,c,d;}
  u32 e;
};
Narrow access (like u8 load/store in the bpf prog form u32 BTF field)
is ok, since that's normal compiler optimization, but any other
field/size mismatch the verifier should reject to prevent cheating.

In other words even proprietary bpf programs should not be able to cheat.
If they provide BTF to the kernel, it should correspond exactly to the program.
That's the key to _trusted_ introspection.
Admin that ssh-es into the box and operates with bpftool should be
certain that the map introspection represent the real situation of
the program. If proprietary prog is paranoid about layout of map
it shouldn't be using BTF, but if it does, BTF should match.
In the future I'd like to enfore availability of BTF for
new program types.

> > For array we can catch the lie today that key is not 4 byte int,
> > since it matters from pretty printing point of view.
> > If it's PTR or ARRAY or STRUCT, the printer will go nuts.
> 
> In that case, would you enforce a hash map key size of 4 also to INT-only
> instead of e.g. allowing STRUCT and various others?

hash map key of 4 bytes can be anything and printing is ideally
done similar to the way gdb prints stl c++:
#include <map>

struct K {
  int a, b;
};
struct V {
  int c, d;
};
int operator<(const K& a, const K& b) {return a.a < b.a;}
std::map<K, V> m;

int main()
{
  K k1 = {1, 2};
  K k2 = {3, 4};
  V v1 = {5, 6};
  V v2 = {7, 8};
  m[k1] = v1;
  m[k2] = v2;
...
(gdb) p m
$3 = std::map with 2 elements = {[{a = 1, b = 2}] = {c = 5, d = 6}, [{a = 3, b = 4}] = {c = 7, d = 8}}
(gdb) set print pretty on
(gdb) p m
$4 = std::map with 2 elements = {
  [{
    a = 1,
    b = 2
  }] = {
    c = 5,
    d = 6
  },
  [{
    a = 3,
    b = 4
  }] = {
    c = 7,
    d = 8
  }
}
Daniel Borkmann Aug. 10, 2018, 7:55 a.m. UTC | #7
On 08/10/2018 04:13 AM, Alexei Starovoitov wrote:
> On Fri, Aug 10, 2018 at 12:43:20AM +0200, Daniel Borkmann wrote:
>> On 08/09/2018 11:44 PM, Alexei Starovoitov wrote:
>>> On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote:
>>>> On 08/09/2018 11:14 PM, Alexei Starovoitov wrote:
>>>>> On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote:
>>>>>> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
>>>>>> the basic arraymap") enabled support for BTF and dumping via
>>>>>> BPF fs for arraymap. However, both can be decoupled from each
>>>>>> other such that all BPF maps can be supported for attaching
>>>>>> BTF key/value information, while not all maps necessarily
>>>>>> need to dump via map_seq_show_elem() callback.
>>>>>>
>>>>>> The check in array_map_check_btf() can be generalized as
>>>>>> ultimatively the key and value size is the only contraint
>>>>>> that needs to match for the map. The fact that the key needs
>>>>>> to be of type int is optional; it could be any data type as
>>>>>> long as it matches the 4 byte key size, just like hash table
>>>>>> key or others could be of any data type as well.
>>>>>>
>>>>>> Minimal example of a hash table dump which then works out
>>>>>> of the box for bpftool:
>>>>>>
>>>>>>   # bpftool map dump id 19
>>>>>>   [{
>>>>>>           "key": {
>>>>>>               "": {
>>>>>>                   "vip": 0,
>>>>>>                   "vipv6": []
>>>>>>               },
>>>>>>               "port": 0,
>>>>>>               "family": 0,
>>>>>>               "proto": 0
>>>>>>           },
>>>>>>           "value": {
>>>>>>               "flags": 0,
>>>>>>               "vip_num": 0
>>>>>>           }
>>>>>>       }
>>>>>>   ]
>>>>>>
>>>>>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>>>>>> Cc: Yonghong Song <yhs@fb.com>
>>>>>> ---
>>>>>>  include/linux/bpf.h   |  4 +---
>>>>>>  kernel/bpf/arraymap.c | 27 ---------------------------
>>>>>>  kernel/bpf/inode.c    |  3 ++-
>>>>>>  kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
>>>>>>  4 files changed, 23 insertions(+), 35 deletions(-)
>>>>>>
>>>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>>>> index cd8790d..eb76e8e 100644
>>>>>> --- a/include/linux/bpf.h
>>>>>> +++ b/include/linux/bpf.h
>>>>>> @@ -48,8 +48,6 @@ struct bpf_map_ops {
>>>>>>  	u32 (*map_fd_sys_lookup_elem)(void *ptr);
>>>>>>  	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
>>>>>>  				  struct seq_file *m);
>>>>>> -	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
>>>>>> -			     u32 key_type_id, u32 value_type_id);
>>>>>>  };
>>>>>>  
>>>>>>  struct bpf_map {
>>>>>> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
>>>>>>  
>>>>>>  static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
>>>>>>  {
>>>>>> -	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
>>>>>> +	return map->btf && map->ops->map_seq_show_elem;
>>>>>>  }
>>>>>>  
>>>>>>  extern const struct bpf_map_ops bpf_map_offload_ops;
>>>>>> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
>>>>>> index 2aa55d030..67f0bdf 100644
>>>>>> --- a/kernel/bpf/arraymap.c
>>>>>> +++ b/kernel/bpf/arraymap.c
>>>>>> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
>>>>>>  	rcu_read_unlock();
>>>>>>  }
>>>>>>  
>>>>>> -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
>>>>>> -			       u32 btf_key_id, u32 btf_value_id)
>>>>>> -{
>>>>>> -	const struct btf_type *key_type, *value_type;
>>>>>> -	u32 key_size, value_size;
>>>>>> -	u32 int_data;
>>>>>> -
>>>>>> -	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
>>>>>> -	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
>>>>>> -		return -EINVAL;
>>>>>> -
>>>>>> -	int_data = *(u32 *)(key_type + 1);
>>>>>> -	/* bpf array can only take a u32 key.  This check makes
>>>>>> -	 * sure that the btf matches the attr used during map_create.
>>>>>> -	 */
>>>>>> -	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
>>>>>> -	    BTF_INT_OFFSET(int_data))
>>>>>> -		return -EINVAL;
>>>>>
>>>>> I think most of these checks are still necessary for array type.
>>>>> Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM
>>>>> is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't makes sense.
>>>>
>>>> Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array,
>>>> on 32 bit it may be allowed due to sizeof(void *) == 4. BTF_KIND_ARRAY
>>>> could be array of u8 foo[4], for example, or u16 foo[2]. But how would
>>>> it ultimately be different from e.g. having 'struct a' versus 'struct b'
>>>> where both are of same size and while actual key has 'struct a', the one
>>>> who writes the prog resp. loads the BTF into the kernel would lie about
>>>> it stating it's of type 'struct b' instead? It's basically trusting the
>>>> app that it advertised sane key types which kernel is propagating back.
>>>
>>> for hash map - yes. the kernel cannot yet catch the lie that
>>> key == 'struct a' that user said in BTF is not what program used
>>> (which used 'struct b' of the same size).
>>> Eventually we will annotate all load/store in the program and will
>>> make sure that memory access match what BTF said.
>>
>> But in that case, would you reject the program? E.g. from prog point of
>> view, it's just a buffer of x bytes, so key could be casted to different
>> struct/types potentially and used for lookup; similar with value if you
>> would go the route to annotate all access into it. I don't think this
>> serves as a security feature (since you might as well just load the prog
>> without BTF just fine), but it can be used to help verifier to perform
>> rewrites like in tracing for implicit bpf_probe_read() based on member
>> access. But also in that case, if you might have e.g. stale or wrong BTF
>> data e.g. of the whole kernel or some application it would follow/walk
>> that one instead. But such user error would be "acceptable" since it serves
>> as a hint, roughly similar to (explicitly) walking the data structures
>> based on the headers today, you do have better control in terms of header
>> mismatches in that you can ship the BTF directly from the build, but there's
>> still no guarantee in that sense that you "verified" that these bytes
>> originally were mapped to struct foo somewhere in a C program.
> 
> I wouldn't view such key checks as safety feature, but rather
> as honesty check. Meaning that user space shouldn't be able to cheat
> by passing completely bogus BTF into the kernel that only
> statisfies single size check.

Ok, meaning, we agree that BTF is passed as a hint to the kernel. It is
a 'should not cheat' requirement but not 'cannot cheat', meaning if it
was the latter, BTF is core part of the verifier's safety analysis /
checks, but as it's optional on top of it (well, due to compatibility it
kind of needs to be anyway), it doesn't affect kernel's safety in
relation to what the program is doing internally.

> Say, BTF has a key that is a 4 byte struct { char a,b,c,d; };
> but program operates with it as u32. I very much would like
> the verifier to notice and reject such program, since if C code
> has struct { char a,b,c,d; }; the compiler won't generate u32 access
> to it unless C code type casts, but then llvm will warn. So for u32
> to legitimately appear in the generated code the struct should be:
> union {
>   struct { char a,b,c,d;}
>   u32 e;
> };
> Narrow access (like u8 load/store in the bpf prog form u32 BTF field)
> is ok, since that's normal compiler optimization, but any other
> field/size mismatch the verifier should reject to prevent cheating.
> 
> In other words even proprietary bpf programs should not be able to cheat.
> If they provide BTF to the kernel, it should correspond exactly to the program.
> That's the key to _trusted_ introspection.

But then wouldn't this artificially force users into programming /
thinking style that verifier dictates upon them similar as we have
today as opposed to further moving away from it to allow more C-style
programs to be accepted?

But even if that is the goal, it is still used as a hint today, e.g.
even for an array I could tell the compiler that the key is '__u32'
for BTF sake while using the underlying key differently (e.g. as struct,
enum, etc) in order to pass the array_map_check_btf() check in the
kernel. Those type of programs would still need to be accepted due
to compatibility reasons. Same if we only test on size match in other
maps, it's nothing different. I don't see how verifier would start
enforcing programs to be rejected based on access patterns on the
types; while it might work for newly added program types that this
could be enforced, it cannot for all the many existing program types,
but I also don't really think it needs to be. Unless it is declared
a safety feature for future program types.

> Admin that ssh-es into the box and operates with bpftool should be
> certain that the map introspection represent the real situation of
> the program. If proprietary prog is paranoid about layout of map
> it shouldn't be using BTF, but if it does, BTF should match.
> In the future I'd like to enfore availability of BTF for
> new program types.

Sure, and in vast majority / normal cases it will represent the real
situation. Do you mistrust DWARF debugging data that gets shipped with
your distro, for example? pahole and other tools on top of it? It is
pretty much the same situation. Given gdb example below, does gdb do
any verification of the binary in order to analyze load/store patterns
and whether it matches with what sits in DWARF as types? It would just
as well map the types into memory and dump it as pretty print instead.
And that's okay, the one thing that needs to be verified for correctness
resp. validity is the debugging format itself so it can be used for
that.

>>> For array we can catch the lie today that key is not 4 byte int,
>>> since it matters from pretty printing point of view.
>>> If it's PTR or ARRAY or STRUCT, the printer will go nuts.
>>
>> In that case, would you enforce a hash map key size of 4 also to INT-only
>> instead of e.g. allowing STRUCT and various others?
> 
> hash map key of 4 bytes can be anything and printing is ideally
> done similar to the way gdb prints stl c++:
> #include <map>
> 
> struct K {
>   int a, b;
> };
> struct V {
>   int c, d;
> };
> int operator<(const K& a, const K& b) {return a.a < b.a;}
> std::map<K, V> m;
> 
> int main()
> {
>   K k1 = {1, 2};
>   K k2 = {3, 4};
>   V v1 = {5, 6};
>   V v2 = {7, 8};
>   m[k1] = v1;
>   m[k2] = v2;
> ...
> (gdb) p m
> $3 = std::map with 2 elements = {[{a = 1, b = 2}] = {c = 5, d = 6}, [{a = 3, b = 4}] = {c = 7, d = 8}}
> (gdb) set print pretty on
> (gdb) p m
> $4 = std::map with 2 elements = {
>   [{
>     a = 1,
>     b = 2
>   }] = {
>     c = 5,
>     d = 6
>   },
>   [{
>     a = 3,
>     b = 4
>   }] = {
>     c = 7,
>     d = 8
>   }
> }
Martin KaFai Lau Aug. 10, 2018, 12:54 p.m. UTC | #8
On Fri, Aug 10, 2018 at 09:55:35AM +0200, Daniel Borkmann wrote:
> On 08/10/2018 04:13 AM, Alexei Starovoitov wrote:
> > On Fri, Aug 10, 2018 at 12:43:20AM +0200, Daniel Borkmann wrote:
> >> On 08/09/2018 11:44 PM, Alexei Starovoitov wrote:
> >>> On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote:
> >>>> On 08/09/2018 11:14 PM, Alexei Starovoitov wrote:
> >>>>> On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote:
> >>>>>> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
> >>>>>> the basic arraymap") enabled support for BTF and dumping via
> >>>>>> BPF fs for arraymap. However, both can be decoupled from each
> >>>>>> other such that all BPF maps can be supported for attaching
> >>>>>> BTF key/value information, while not all maps necessarily
> >>>>>> need to dump via map_seq_show_elem() callback.
> >>>>>>
> >>>>>> The check in array_map_check_btf() can be generalized as
> >>>>>> ultimatively the key and value size is the only contraint
> >>>>>> that needs to match for the map. The fact that the key needs
> >>>>>> to be of type int is optional; it could be any data type as
> >>>>>> long as it matches the 4 byte key size, just like hash table
> >>>>>> key or others could be of any data type as well.
> >>>>>>
> >>>>>> Minimal example of a hash table dump which then works out
> >>>>>> of the box for bpftool:
> >>>>>>
> >>>>>>   # bpftool map dump id 19
> >>>>>>   [{
> >>>>>>           "key": {
> >>>>>>               "": {
> >>>>>>                   "vip": 0,
> >>>>>>                   "vipv6": []
> >>>>>>               },
> >>>>>>               "port": 0,
> >>>>>>               "family": 0,
> >>>>>>               "proto": 0
> >>>>>>           },
> >>>>>>           "value": {
> >>>>>>               "flags": 0,
> >>>>>>               "vip_num": 0
> >>>>>>           }
> >>>>>>       }
> >>>>>>   ]
> >>>>>>
> >>>>>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> >>>>>> Cc: Yonghong Song <yhs@fb.com>
> >>>>>> ---
> >>>>>>  include/linux/bpf.h   |  4 +---
> >>>>>>  kernel/bpf/arraymap.c | 27 ---------------------------
> >>>>>>  kernel/bpf/inode.c    |  3 ++-
> >>>>>>  kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
> >>>>>>  4 files changed, 23 insertions(+), 35 deletions(-)
> >>>>>>
> >>>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >>>>>> index cd8790d..eb76e8e 100644
> >>>>>> --- a/include/linux/bpf.h
> >>>>>> +++ b/include/linux/bpf.h
> >>>>>> @@ -48,8 +48,6 @@ struct bpf_map_ops {
> >>>>>>  	u32 (*map_fd_sys_lookup_elem)(void *ptr);
> >>>>>>  	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
> >>>>>>  				  struct seq_file *m);
> >>>>>> -	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
> >>>>>> -			     u32 key_type_id, u32 value_type_id);
> >>>>>>  };
> >>>>>>  
> >>>>>>  struct bpf_map {
> >>>>>> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
> >>>>>>  
> >>>>>>  static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
> >>>>>>  {
> >>>>>> -	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
> >>>>>> +	return map->btf && map->ops->map_seq_show_elem;
> >>>>>>  }
> >>>>>>  
> >>>>>>  extern const struct bpf_map_ops bpf_map_offload_ops;
> >>>>>> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> >>>>>> index 2aa55d030..67f0bdf 100644
> >>>>>> --- a/kernel/bpf/arraymap.c
> >>>>>> +++ b/kernel/bpf/arraymap.c
> >>>>>> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
> >>>>>>  	rcu_read_unlock();
> >>>>>>  }
> >>>>>>  
> >>>>>> -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
> >>>>>> -			       u32 btf_key_id, u32 btf_value_id)
> >>>>>> -{
> >>>>>> -	const struct btf_type *key_type, *value_type;
> >>>>>> -	u32 key_size, value_size;
> >>>>>> -	u32 int_data;
> >>>>>> -
> >>>>>> -	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
> >>>>>> -	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> >>>>>> -		return -EINVAL;
> >>>>>> -
> >>>>>> -	int_data = *(u32 *)(key_type + 1);
> >>>>>> -	/* bpf array can only take a u32 key.  This check makes
> >>>>>> -	 * sure that the btf matches the attr used during map_create.
> >>>>>> -	 */
> >>>>>> -	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
> >>>>>> -	    BTF_INT_OFFSET(int_data))
> >>>>>> -		return -EINVAL;
> >>>>>
> >>>>> I think most of these checks are still necessary for array type.
> >>>>> Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM
> >>>>> is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't makes sense.
> >>>>
> >>>> Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array,
> >>>> on 32 bit it may be allowed due to sizeof(void *) == 4. BTF_KIND_ARRAY
> >>>> could be array of u8 foo[4], for example, or u16 foo[2]. But how would
> >>>> it ultimately be different from e.g. having 'struct a' versus 'struct b'
> >>>> where both are of same size and while actual key has 'struct a', the one
> >>>> who writes the prog resp. loads the BTF into the kernel would lie about
> >>>> it stating it's of type 'struct b' instead? It's basically trusting the
> >>>> app that it advertised sane key types which kernel is propagating back.
> >>>
> >>> for hash map - yes. the kernel cannot yet catch the lie that
> >>> key == 'struct a' that user said in BTF is not what program used
> >>> (which used 'struct b' of the same size).
> >>> Eventually we will annotate all load/store in the program and will
> >>> make sure that memory access match what BTF said.
> >>
> >> But in that case, would you reject the program? E.g. from prog point of
> >> view, it's just a buffer of x bytes, so key could be casted to different
> >> struct/types potentially and used for lookup; similar with value if you
> >> would go the route to annotate all access into it. I don't think this
> >> serves as a security feature (since you might as well just load the prog
> >> without BTF just fine), but it can be used to help verifier to perform
> >> rewrites like in tracing for implicit bpf_probe_read() based on member
> >> access. But also in that case, if you might have e.g. stale or wrong BTF
> >> data e.g. of the whole kernel or some application it would follow/walk
> >> that one instead. But such user error would be "acceptable" since it serves
> >> as a hint, roughly similar to (explicitly) walking the data structures
> >> based on the headers today, you do have better control in terms of header
> >> mismatches in that you can ship the BTF directly from the build, but there's
> >> still no guarantee in that sense that you "verified" that these bytes
> >> originally were mapped to struct foo somewhere in a C program.
> > 
> > I wouldn't view such key checks as safety feature, but rather
> > as honesty check. Meaning that user space shouldn't be able to cheat
> > by passing completely bogus BTF into the kernel that only
> > statisfies single size check.
> 
> Ok, meaning, we agree that BTF is passed as a hint to the kernel. It is
> a 'should not cheat' requirement but not 'cannot cheat', meaning if it
> was the latter, BTF is core part of the verifier's safety analysis /
> checks, but as it's optional on top of it (well, due to compatibility it
> kind of needs to be anyway), it doesn't affect kernel's safety in
> relation to what the program is doing internally.
> 
> > Say, BTF has a key that is a 4 byte struct { char a,b,c,d; };
> > but program operates with it as u32. I very much would like
> > the verifier to notice and reject such program, since if C code
> > has struct { char a,b,c,d; }; the compiler won't generate u32 access
> > to it unless C code type casts, but then llvm will warn. So for u32
> > to legitimately appear in the generated code the struct should be:
> > union {
> >   struct { char a,b,c,d;}
> >   u32 e;
> > };
> > Narrow access (like u8 load/store in the bpf prog form u32 BTF field)
> > is ok, since that's normal compiler optimization, but any other
> > field/size mismatch the verifier should reject to prevent cheating.
> > 
> > In other words even proprietary bpf programs should not be able to cheat.
> > If they provide BTF to the kernel, it should correspond exactly to the program.
> > That's the key to _trusted_ introspection.
> 
> But then wouldn't this artificially force users into programming /
> thinking style that verifier dictates upon them similar as we have
> today as opposed to further moving away from it to allow more C-style
> programs to be accepted?
> 
> But even if that is the goal, it is still used as a hint today, e.g.
> even for an array I could tell the compiler that the key is '__u32'
> for BTF sake while using the underlying key differently (e.g. as struct,
> enum, etc) in order to pass the array_map_check_btf() check in the
> kernel. Those type of programs would still need to be accepted due
> to compatibility reasons. Same if we only test on size match in other
> maps, it's nothing different. I don't see how verifier would start
> enforcing programs to be rejected based on access patterns on the
> types; while it might work for newly added program types that this
> could be enforced, it cannot for all the many existing program types,
> but I also don't really think it needs to be. Unless it is declared
> a safety feature for future program types.
> 
> > Admin that ssh-es into the box and operates with bpftool should be
> > certain that the map introspection represent the real situation of
> > the program. If proprietary prog is paranoid about layout of map
> > it shouldn't be using BTF, but if it does, BTF should match.
> > In the future I'd like to enfore availability of BTF for
> > new program types.
> 
> Sure, and in vast majority / normal cases it will represent the real
> situation. Do you mistrust DWARF debugging data that gets shipped with
> your distro, for example? pahole and other tools on top of it? It is
> pretty much the same situation. Given gdb example below, does gdb do
> any verification of the binary in order to analyze load/store patterns
> and whether it matches with what sits in DWARF as types? It would just
> as well map the types into memory and dump it as pretty print instead.
> And that's okay, the one thing that needs to be verified for correctness
> resp. validity is the debugging format itself so it can be used for
> that.
I think decoupling map_seq_show_elem() requirement from BTF support
during map_create() is ok.  There is plan to do that going forward if
it ever hits some map types that is difficult to do map_seq_show_elem().
However, I think most of the maps should have pretty straight
forward seq_show implementation considering most of the
heavy lifting has already been done in btf_*_seq_show()
and bpffs_map_seq_ops.

Regarding one general map_check_btf() for all maps' purpose,
I think not all bpf maps are behaving the same.

There are some property that the kernel implies on a particular
map type.  Some of them even have kernel generated value in it.
For example, map-in-map, the lookup_elem() value is a map's id
which must be an integer generated by kernel's idr_alloc_cyclic().
Treating it as enum/struct/array...etc seems wrong.
Hence, I think each map type should be able to provide
its own map_check_btf().  If a map does not have
specific key/value requirement, it can use the default one which
is the min requirement for kernel safety reason.

The map can provide a more strict requirement first and relax it
later when use cases come up.

> 
> >>> For array we can catch the lie today that key is not 4 byte int,
> >>> since it matters from pretty printing point of view.
> >>> If it's PTR or ARRAY or STRUCT, the printer will go nuts.
> >>
> >> In that case, would you enforce a hash map key size of 4 also to INT-only
> >> instead of e.g. allowing STRUCT and various others?
> > 
> > hash map key of 4 bytes can be anything and printing is ideally
> > done similar to the way gdb prints stl c++:
> > #include <map>
> > 
> > struct K {
> >   int a, b;
> > };
> > struct V {
> >   int c, d;
> > };
> > int operator<(const K& a, const K& b) {return a.a < b.a;}
> > std::map<K, V> m;
> > 
> > int main()
> > {
> >   K k1 = {1, 2};
> >   K k2 = {3, 4};
> >   V v1 = {5, 6};
> >   V v2 = {7, 8};
> >   m[k1] = v1;
> >   m[k2] = v2;
> > ...
> > (gdb) p m
> > $3 = std::map with 2 elements = {[{a = 1, b = 2}] = {c = 5, d = 6}, [{a = 3, b = 4}] = {c = 7, d = 8}}
> > (gdb) set print pretty on
> > (gdb) p m
> > $4 = std::map with 2 elements = {
> >   [{
> >     a = 1,
> >     b = 2
> >   }] = {
> >     c = 5,
> >     d = 6
> >   },
> >   [{
> >     a = 3,
> >     b = 4
> >   }] = {
> >     c = 7,
> >     d = 8
> >   }
> > }
>
Daniel Borkmann Aug. 10, 2018, 1:30 p.m. UTC | #9
On 08/10/2018 02:54 PM, Martin KaFai Lau wrote:
> On Fri, Aug 10, 2018 at 09:55:35AM +0200, Daniel Borkmann wrote:
>> On 08/10/2018 04:13 AM, Alexei Starovoitov wrote:
>>> On Fri, Aug 10, 2018 at 12:43:20AM +0200, Daniel Borkmann wrote:
>>>> On 08/09/2018 11:44 PM, Alexei Starovoitov wrote:
>>>>> On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote:
>>>>>> On 08/09/2018 11:14 PM, Alexei Starovoitov wrote:
>>>>>>> On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote:
>>>>>>>> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
>>>>>>>> the basic arraymap") enabled support for BTF and dumping via
>>>>>>>> BPF fs for arraymap. However, both can be decoupled from each
>>>>>>>> other such that all BPF maps can be supported for attaching
>>>>>>>> BTF key/value information, while not all maps necessarily
>>>>>>>> need to dump via map_seq_show_elem() callback.
>>>>>>>>
>>>>>>>> The check in array_map_check_btf() can be generalized as
>>>>>>>> ultimatively the key and value size is the only contraint
>>>>>>>> that needs to match for the map. The fact that the key needs
>>>>>>>> to be of type int is optional; it could be any data type as
>>>>>>>> long as it matches the 4 byte key size, just like hash table
>>>>>>>> key or others could be of any data type as well.
>>>>>>>>
>>>>>>>> Minimal example of a hash table dump which then works out
>>>>>>>> of the box for bpftool:
>>>>>>>>
>>>>>>>>   # bpftool map dump id 19
>>>>>>>>   [{
>>>>>>>>           "key": {
>>>>>>>>               "": {
>>>>>>>>                   "vip": 0,
>>>>>>>>                   "vipv6": []
>>>>>>>>               },
>>>>>>>>               "port": 0,
>>>>>>>>               "family": 0,
>>>>>>>>               "proto": 0
>>>>>>>>           },
>>>>>>>>           "value": {
>>>>>>>>               "flags": 0,
>>>>>>>>               "vip_num": 0
>>>>>>>>           }
>>>>>>>>       }
>>>>>>>>   ]
>>>>>>>>
>>>>>>>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>>>>>>>> Cc: Yonghong Song <yhs@fb.com>
>>>>>>>> ---
>>>>>>>>  include/linux/bpf.h   |  4 +---
>>>>>>>>  kernel/bpf/arraymap.c | 27 ---------------------------
>>>>>>>>  kernel/bpf/inode.c    |  3 ++-
>>>>>>>>  kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
>>>>>>>>  4 files changed, 23 insertions(+), 35 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>>>>>> index cd8790d..eb76e8e 100644
>>>>>>>> --- a/include/linux/bpf.h
>>>>>>>> +++ b/include/linux/bpf.h
>>>>>>>> @@ -48,8 +48,6 @@ struct bpf_map_ops {
>>>>>>>>  	u32 (*map_fd_sys_lookup_elem)(void *ptr);
>>>>>>>>  	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
>>>>>>>>  				  struct seq_file *m);
>>>>>>>> -	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
>>>>>>>> -			     u32 key_type_id, u32 value_type_id);
>>>>>>>>  };
>>>>>>>>  
>>>>>>>>  struct bpf_map {
>>>>>>>> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
>>>>>>>>  
>>>>>>>>  static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
>>>>>>>>  {
>>>>>>>> -	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
>>>>>>>> +	return map->btf && map->ops->map_seq_show_elem;
>>>>>>>>  }
>>>>>>>>  
>>>>>>>>  extern const struct bpf_map_ops bpf_map_offload_ops;
>>>>>>>> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
>>>>>>>> index 2aa55d030..67f0bdf 100644
>>>>>>>> --- a/kernel/bpf/arraymap.c
>>>>>>>> +++ b/kernel/bpf/arraymap.c
>>>>>>>> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
>>>>>>>>  	rcu_read_unlock();
>>>>>>>>  }
>>>>>>>>  
>>>>>>>> -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
>>>>>>>> -			       u32 btf_key_id, u32 btf_value_id)
>>>>>>>> -{
>>>>>>>> -	const struct btf_type *key_type, *value_type;
>>>>>>>> -	u32 key_size, value_size;
>>>>>>>> -	u32 int_data;
>>>>>>>> -
>>>>>>>> -	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
>>>>>>>> -	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
>>>>>>>> -		return -EINVAL;
>>>>>>>> -
>>>>>>>> -	int_data = *(u32 *)(key_type + 1);
>>>>>>>> -	/* bpf array can only take a u32 key.  This check makes
>>>>>>>> -	 * sure that the btf matches the attr used during map_create.
>>>>>>>> -	 */
>>>>>>>> -	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
>>>>>>>> -	    BTF_INT_OFFSET(int_data))
>>>>>>>> -		return -EINVAL;
>>>>>>>
>>>>>>> I think most of these checks are still necessary for array type.
>>>>>>> Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM
>>>>>>> is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't makes sense.
>>>>>>
>>>>>> Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array,
>>>>>> on 32 bit it may be allowed due to sizeof(void *) == 4. BTF_KIND_ARRAY
>>>>>> could be array of u8 foo[4], for example, or u16 foo[2]. But how would
>>>>>> it ultimately be different from e.g. having 'struct a' versus 'struct b'
>>>>>> where both are of same size and while actual key has 'struct a', the one
>>>>>> who writes the prog resp. loads the BTF into the kernel would lie about
>>>>>> it stating it's of type 'struct b' instead? It's basically trusting the
>>>>>> app that it advertised sane key types which kernel is propagating back.
>>>>>
>>>>> for hash map - yes. the kernel cannot yet catch the lie that
>>>>> key == 'struct a' that user said in BTF is not what program used
>>>>> (which used 'struct b' of the same size).
>>>>> Eventually we will annotate all load/store in the program and will
>>>>> make sure that memory access match what BTF said.
>>>>
>>>> But in that case, would you reject the program? E.g. from prog point of
>>>> view, it's just a buffer of x bytes, so key could be casted to different
>>>> struct/types potentially and used for lookup; similar with value if you
>>>> would go the route to annotate all access into it. I don't think this
>>>> serves as a security feature (since you might as well just load the prog
>>>> without BTF just fine), but it can be used to help verifier to perform
>>>> rewrites like in tracing for implicit bpf_probe_read() based on member
>>>> access. But also in that case, if you might have e.g. stale or wrong BTF
>>>> data e.g. of the whole kernel or some application it would follow/walk
>>>> that one instead. But such user error would be "acceptable" since it serves
>>>> as a hint, roughly similar to (explicitly) walking the data structures
>>>> based on the headers today, you do have better control in terms of header
>>>> mismatches in that you can ship the BTF directly from the build, but there's
>>>> still no guarantee in that sense that you "verified" that these bytes
>>>> originally were mapped to struct foo somewhere in a C program.
>>>
>>> I wouldn't view such key checks as safety feature, but rather
>>> as honesty check. Meaning that user space shouldn't be able to cheat
>>> by passing completely bogus BTF into the kernel that only
>>> statisfies single size check.
>>
>> Ok, meaning, we agree that BTF is passed as a hint to the kernel. It is
>> a 'should not cheat' requirement but not 'cannot cheat', meaning if it
>> was the latter, BTF is core part of the verifier's safety analysis /
>> checks, but as it's optional on top of it (well, due to compatibility it
>> kind of needs to be anyway), it doesn't affect kernel's safety in
>> relation to what the program is doing internally.
>>
>>> Say, BTF has a key that is a 4 byte struct { char a,b,c,d; };
>>> but program operates with it as u32. I very much would like
>>> the verifier to notice and reject such program, since if C code
>>> has struct { char a,b,c,d; }; the compiler won't generate u32 access
>>> to it unless C code type casts, but then llvm will warn. So for u32
>>> to legitimately appear in the generated code the struct should be:
>>> union {
>>>   struct { char a,b,c,d;}
>>>   u32 e;
>>> };
>>> Narrow access (like u8 load/store in the bpf prog form u32 BTF field)
>>> is ok, since that's normal compiler optimization, but any other
>>> field/size mismatch the verifier should reject to prevent cheating.
>>>
>>> In other words even proprietary bpf programs should not be able to cheat.
>>> If they provide BTF to the kernel, it should correspond exactly to the program.
>>> That's the key to _trusted_ introspection.
>>
>> But then wouldn't this artificially force users into programming /
>> thinking style that verifier dictates upon them similar as we have
>> today as opposed to further moving away from it to allow more C-style
>> programs to be accepted?
>>
>> But even if that is the goal, it is still used as a hint today, e.g.
>> even for an array I could tell the compiler that the key is '__u32'
>> for BTF sake while using the underlying key differently (e.g. as struct,
>> enum, etc) in order to pass the array_map_check_btf() check in the
>> kernel. Those type of programs would still need to be accepted due
>> to compatibility reasons. Same if we only test on size match in other
>> maps, it's nothing different. I don't see how verifier would start
>> enforcing programs to be rejected based on access patterns on the
>> types; while it might work for newly added program types that this
>> could be enforced, it cannot for all the many existing program types,
>> but I also don't really think it needs to be. Unless it is declared
>> a safety feature for future program types.
>>
>>> Admin that ssh-es into the box and operates with bpftool should be
>>> certain that the map introspection represent the real situation of
>>> the program. If proprietary prog is paranoid about layout of map
>>> it shouldn't be using BTF, but if it does, BTF should match.
>>> In the future I'd like to enfore availability of BTF for
>>> new program types.
>>
>> Sure, and in vast majority / normal cases it will represent the real
>> situation. Do you mistrust DWARF debugging data that gets shipped with
>> your distro, for example? pahole and other tools on top of it? It is
>> pretty much the same situation. Given gdb example below, does gdb do
>> any verification of the binary in order to analyze load/store patterns
>> and whether it matches with what sits in DWARF as types? It would just
>> as well map the types into memory and dump it as pretty print instead.
>> And that's okay, the one thing that needs to be verified for correctness
>> resp. validity is the debugging format itself so it can be used for
>> that.
> I think decoupling map_seq_show_elem() requirement from BTF support
> during map_create() is ok.  There is plan to do that going forward if
> it ever hits some map types that is difficult to do map_seq_show_elem().
> However, I think most of the maps should have pretty straight
> forward seq_show implementation considering most of the
> heavy lifting has already been done in btf_*_seq_show()
> and bpffs_map_seq_ops.

I think potentially this could also be made generic for majority of maps
in that you do the lookup and then the key/value btf_type_seq_show()
dump from it under RCU.

> Regarding one general map_check_btf() for all maps' purpose,
> I think not all bpf maps are behaving the same.
> 
> There are some property that the kernel implies on a particular
> map type.  Some of them even have kernel generated value in it.
> For example, map-in-map, the lookup_elem() value is a map's id
> which must be an integer generated by kernel's idr_alloc_cyclic().
> Treating it as enum/struct/array...etc seems wrong.
> Hence, I think each map type should be able to provide
> its own map_check_btf().  If a map does not have
> specific key/value requirement, it can use the default one which
> is the min requirement for kernel safety reason.

Yeah, fair point, I think potentially for the in-kernel generated ones at
some point they could provide their own in-kernel BTF description. Ok,
I'll rework to have a default and stricter map specific override if present
which those in-kernel ones will reject then.

Thanks,
Daniel
Alexei Starovoitov Aug. 10, 2018, 4:40 p.m. UTC | #10
On Fri, Aug 10, 2018 at 09:55:35AM +0200, Daniel Borkmann wrote:
> On 08/10/2018 04:13 AM, Alexei Starovoitov wrote:
> > On Fri, Aug 10, 2018 at 12:43:20AM +0200, Daniel Borkmann wrote:
> >> On 08/09/2018 11:44 PM, Alexei Starovoitov wrote:
> >>> On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote:
> >>>> On 08/09/2018 11:14 PM, Alexei Starovoitov wrote:
> >>>>> On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote:
> >>>>>> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
> >>>>>> the basic arraymap") enabled support for BTF and dumping via
> >>>>>> BPF fs for arraymap. However, both can be decoupled from each
> >>>>>> other such that all BPF maps can be supported for attaching
> >>>>>> BTF key/value information, while not all maps necessarily
> >>>>>> need to dump via map_seq_show_elem() callback.
> >>>>>>
> >>>>>> The check in array_map_check_btf() can be generalized as
> >>>>>> ultimatively the key and value size is the only contraint
> >>>>>> that needs to match for the map. The fact that the key needs
> >>>>>> to be of type int is optional; it could be any data type as
> >>>>>> long as it matches the 4 byte key size, just like hash table
> >>>>>> key or others could be of any data type as well.
> >>>>>>
> >>>>>> Minimal example of a hash table dump which then works out
> >>>>>> of the box for bpftool:
> >>>>>>
> >>>>>>   # bpftool map dump id 19
> >>>>>>   [{
> >>>>>>           "key": {
> >>>>>>               "": {
> >>>>>>                   "vip": 0,
> >>>>>>                   "vipv6": []
> >>>>>>               },
> >>>>>>               "port": 0,
> >>>>>>               "family": 0,
> >>>>>>               "proto": 0
> >>>>>>           },
> >>>>>>           "value": {
> >>>>>>               "flags": 0,
> >>>>>>               "vip_num": 0
> >>>>>>           }
> >>>>>>       }
> >>>>>>   ]
> >>>>>>
> >>>>>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> >>>>>> Cc: Yonghong Song <yhs@fb.com>
> >>>>>> ---
> >>>>>>  include/linux/bpf.h   |  4 +---
> >>>>>>  kernel/bpf/arraymap.c | 27 ---------------------------
> >>>>>>  kernel/bpf/inode.c    |  3 ++-
> >>>>>>  kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
> >>>>>>  4 files changed, 23 insertions(+), 35 deletions(-)
> >>>>>>
> >>>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >>>>>> index cd8790d..eb76e8e 100644
> >>>>>> --- a/include/linux/bpf.h
> >>>>>> +++ b/include/linux/bpf.h
> >>>>>> @@ -48,8 +48,6 @@ struct bpf_map_ops {
> >>>>>>  	u32 (*map_fd_sys_lookup_elem)(void *ptr);
> >>>>>>  	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
> >>>>>>  				  struct seq_file *m);
> >>>>>> -	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
> >>>>>> -			     u32 key_type_id, u32 value_type_id);
> >>>>>>  };
> >>>>>>  
> >>>>>>  struct bpf_map {
> >>>>>> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
> >>>>>>  
> >>>>>>  static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
> >>>>>>  {
> >>>>>> -	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
> >>>>>> +	return map->btf && map->ops->map_seq_show_elem;
> >>>>>>  }
> >>>>>>  
> >>>>>>  extern const struct bpf_map_ops bpf_map_offload_ops;
> >>>>>> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> >>>>>> index 2aa55d030..67f0bdf 100644
> >>>>>> --- a/kernel/bpf/arraymap.c
> >>>>>> +++ b/kernel/bpf/arraymap.c
> >>>>>> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
> >>>>>>  	rcu_read_unlock();
> >>>>>>  }
> >>>>>>  
> >>>>>> -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
> >>>>>> -			       u32 btf_key_id, u32 btf_value_id)
> >>>>>> -{
> >>>>>> -	const struct btf_type *key_type, *value_type;
> >>>>>> -	u32 key_size, value_size;
> >>>>>> -	u32 int_data;
> >>>>>> -
> >>>>>> -	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
> >>>>>> -	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> >>>>>> -		return -EINVAL;
> >>>>>> -
> >>>>>> -	int_data = *(u32 *)(key_type + 1);
> >>>>>> -	/* bpf array can only take a u32 key.  This check makes
> >>>>>> -	 * sure that the btf matches the attr used during map_create.
> >>>>>> -	 */
> >>>>>> -	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
> >>>>>> -	    BTF_INT_OFFSET(int_data))
> >>>>>> -		return -EINVAL;
> >>>>>
> >>>>> I think most of these checks are still necessary for array type.
> >>>>> Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM
> >>>>> is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't makes sense.
> >>>>
> >>>> Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array,
> >>>> on 32 bit it may be allowed due to sizeof(void *) == 4. BTF_KIND_ARRAY
> >>>> could be array of u8 foo[4], for example, or u16 foo[2]. But how would
> >>>> it ultimately be different from e.g. having 'struct a' versus 'struct b'
> >>>> where both are of same size and while actual key has 'struct a', the one
> >>>> who writes the prog resp. loads the BTF into the kernel would lie about
> >>>> it stating it's of type 'struct b' instead? It's basically trusting the
> >>>> app that it advertised sane key types which kernel is propagating back.
> >>>
> >>> for hash map - yes. the kernel cannot yet catch the lie that
> >>> key == 'struct a' that user said in BTF is not what program used
> >>> (which used 'struct b' of the same size).
> >>> Eventually we will annotate all load/store in the program and will
> >>> make sure that memory access match what BTF said.
> >>
> >> But in that case, would you reject the program? E.g. from prog point of
> >> view, it's just a buffer of x bytes, so key could be casted to different
> >> struct/types potentially and used for lookup; similar with value if you
> >> would go the route to annotate all access into it. I don't think this
> >> serves as a security feature (since you might as well just load the prog
> >> without BTF just fine), but it can be used to help verifier to perform
> >> rewrites like in tracing for implicit bpf_probe_read() based on member
> >> access. But also in that case, if you might have e.g. stale or wrong BTF
> >> data e.g. of the whole kernel or some application it would follow/walk
> >> that one instead. But such user error would be "acceptable" since it serves
> >> as a hint, roughly similar to (explicitly) walking the data structures
> >> based on the headers today, you do have better control in terms of header
> >> mismatches in that you can ship the BTF directly from the build, but there's
> >> still no guarantee in that sense that you "verified" that these bytes
> >> originally were mapped to struct foo somewhere in a C program.
> > 
> > I wouldn't view such key checks as safety feature, but rather
> > as honesty check. Meaning that user space shouldn't be able to cheat
> > by passing completely bogus BTF into the kernel that only
> > statisfies single size check.
> 
> Ok, meaning, we agree that BTF is passed as a hint to the kernel. It is
> a 'should not cheat' requirement but not 'cannot cheat', meaning if it
> was the latter, BTF is core part of the verifier's safety analysis /
> checks, but as it's optional on top of it (well, due to compatibility it
> kind of needs to be anyway), it doesn't affect kernel's safety in
> relation to what the program is doing internally.

Eventually BTF will not be optional, since it's not another dwarf-like debug info.
Today its type information serves the purpose of debug info,
but it's more than that. It's a type description.
Meaning that the kernel verifies it and the verifier will eventually use
it to make safety decisions.
Right now BTF is disjoined from the code, but later we'll tie
them together and the verifer will start rejecting programs
where memory accesses don't match the description.
At that point if the code has annotated load/store they gotta be correct.
We won't be able to do gradual tightening due to compatibility
reasons, so when it happens it's gotta be fully strict from the start.
So I really see it as "cannot cheat".

> > Say, BTF has a key that is a 4 byte struct { char a,b,c,d; };
> > but program operates with it as u32. I very much would like
> > the verifier to notice and reject such program, since if C code
> > has struct { char a,b,c,d; }; the compiler won't generate u32 access
> > to it unless C code type casts, but then llvm will warn. So for u32
> > to legitimately appear in the generated code the struct should be:
> > union {
> >   struct { char a,b,c,d;}
> >   u32 e;
> > };
> > Narrow access (like u8 load/store in the bpf prog form u32 BTF field)
> > is ok, since that's normal compiler optimization, but any other
> > field/size mismatch the verifier should reject to prevent cheating.
> > 
> > In other words even proprietary bpf programs should not be able to cheat.
> > If they provide BTF to the kernel, it should correspond exactly to the program.
> > That's the key to _trusted_ introspection.
> 
> But then wouldn't this artificially force users into programming /
> thinking style that verifier dictates upon them similar as we have
> today as opposed to further moving away from it to allow more C-style
> programs to be accepted?

I think it will be the opposite. Strict code<->BTF check will force
users to use llvm with C source (or other language with proper
compiler support). Hacking in asm will get harder for folks
who want to take advantage of BTF.

> But even if that is the goal, it is still used as a hint today, e.g.
> even for an array I could tell the compiler that the key is '__u32'
> for BTF sake while using the underlying key differently (e.g. as struct,
> enum, etc) in order to pass the array_map_check_btf() check in the
> kernel. 

correct that's possible today.

> Those type of programs would still need to be accepted due
> to compatibility reasons. 

I don't think there will be a compatibility issue.
The programs need to be tied with BTF first. The verifier cannot
just get smarter and understand the field accesses.
For load/store we only check alignment and range.
We don't really know the field within a struct. In very simple cases
of fixed offset the verifier can figure that out, but it's too limited
to start acting on it.
I think all load/stores have to be annotated with BTF.
Including packet, stack, and even bpf_probe_read.
Then the verifier will see which ethernet or IP header is being loaded
and can act on it. The life of HW JITs will become easier too.

> Same if we only test on size match in other
> maps, it's nothing different. I don't see how verifier would start
> enforcing programs to be rejected based on access patterns on the
> types;

right. In the current shape of progs and BTF I don't see how
the verifier can do that.

> while it might work for newly added program types that this
> could be enforced, it cannot for all the many existing program types,

even for existing program types it's ok, since recompiling the code
with new annotations is an explicit act of the user.
Future code annotations+BTF is not '-g' flag. It's not debug info.
It's a new feature that is using ISA extensions.
It's very similar to alu32 llvm flag.
A bunch of existing programs will fail to load if they're simply
recompiled with alu32.

> > Admin that ssh-es into the box and operates with bpftool should be
> > certain that the map introspection represent the real situation of
> > the program. If proprietary prog is paranoid about layout of map
> > it shouldn't be using BTF, but if it does, BTF should match.
> > In the future I'd like to enfore availability of BTF for
> > new program types.
> 
> Sure, and in vast majority / normal cases it will represent the real
> situation. Do you mistrust DWARF debugging data that gets shipped with
> your distro, for example? pahole and other tools on top of it? It is

dwarf is not used to make safety checks or optimizations.
well, unless we consider eh_frame to be part of dwarf, then yes
dwarf has to be correct otherwise programs will crash
and user space has to trust it for safety.

> pretty much the same situation. Given gdb example below, does gdb do
> any verification of the binary in order to analyze load/store patterns
> and whether it matches with what sits in DWARF as types?

gdb doesn't do it, since it treats dwarf as debug info only.

> as well map the types into memory and dump it as pretty print instead.
> And that's okay, the one thing that needs to be verified for correctness
> resp. validity is the debugging format itself so it can be used for
> that.

I don't think BTF belongs in kernel if it's used for pretty print only.
Daniel Borkmann Aug. 10, 2018, 6:45 p.m. UTC | #11
On 08/10/2018 06:40 PM, Alexei Starovoitov wrote:
> On Fri, Aug 10, 2018 at 09:55:35AM +0200, Daniel Borkmann wrote:
>> On 08/10/2018 04:13 AM, Alexei Starovoitov wrote:
>>> On Fri, Aug 10, 2018 at 12:43:20AM +0200, Daniel Borkmann wrote:
>>>> On 08/09/2018 11:44 PM, Alexei Starovoitov wrote:
>>>>> On Thu, Aug 09, 2018 at 11:30:52PM +0200, Daniel Borkmann wrote:
>>>>>> On 08/09/2018 11:14 PM, Alexei Starovoitov wrote:
>>>>>>> On Thu, Aug 09, 2018 at 09:42:20PM +0200, Daniel Borkmann wrote:
>>>>>>>> Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
>>>>>>>> the basic arraymap") enabled support for BTF and dumping via
>>>>>>>> BPF fs for arraymap. However, both can be decoupled from each
>>>>>>>> other such that all BPF maps can be supported for attaching
>>>>>>>> BTF key/value information, while not all maps necessarily
>>>>>>>> need to dump via map_seq_show_elem() callback.
>>>>>>>>
>>>>>>>> The check in array_map_check_btf() can be generalized as
>>>>>>>> ultimatively the key and value size is the only contraint
>>>>>>>> that needs to match for the map. The fact that the key needs
>>>>>>>> to be of type int is optional; it could be any data type as
>>>>>>>> long as it matches the 4 byte key size, just like hash table
>>>>>>>> key or others could be of any data type as well.
>>>>>>>>
>>>>>>>> Minimal example of a hash table dump which then works out
>>>>>>>> of the box for bpftool:
>>>>>>>>
>>>>>>>>   # bpftool map dump id 19
>>>>>>>>   [{
>>>>>>>>           "key": {
>>>>>>>>               "": {
>>>>>>>>                   "vip": 0,
>>>>>>>>                   "vipv6": []
>>>>>>>>               },
>>>>>>>>               "port": 0,
>>>>>>>>               "family": 0,
>>>>>>>>               "proto": 0
>>>>>>>>           },
>>>>>>>>           "value": {
>>>>>>>>               "flags": 0,
>>>>>>>>               "vip_num": 0
>>>>>>>>           }
>>>>>>>>       }
>>>>>>>>   ]
>>>>>>>>
>>>>>>>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>>>>>>>> Cc: Yonghong Song <yhs@fb.com>
>>>>>>>> ---
>>>>>>>>  include/linux/bpf.h   |  4 +---
>>>>>>>>  kernel/bpf/arraymap.c | 27 ---------------------------
>>>>>>>>  kernel/bpf/inode.c    |  3 ++-
>>>>>>>>  kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
>>>>>>>>  4 files changed, 23 insertions(+), 35 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>>>>>> index cd8790d..eb76e8e 100644
>>>>>>>> --- a/include/linux/bpf.h
>>>>>>>> +++ b/include/linux/bpf.h
>>>>>>>> @@ -48,8 +48,6 @@ struct bpf_map_ops {
>>>>>>>>  	u32 (*map_fd_sys_lookup_elem)(void *ptr);
>>>>>>>>  	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
>>>>>>>>  				  struct seq_file *m);
>>>>>>>> -	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
>>>>>>>> -			     u32 key_type_id, u32 value_type_id);
>>>>>>>>  };
>>>>>>>>  
>>>>>>>>  struct bpf_map {
>>>>>>>> @@ -118,7 +116,7 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
>>>>>>>>  
>>>>>>>>  static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
>>>>>>>>  {
>>>>>>>> -	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
>>>>>>>> +	return map->btf && map->ops->map_seq_show_elem;
>>>>>>>>  }
>>>>>>>>  
>>>>>>>>  extern const struct bpf_map_ops bpf_map_offload_ops;
>>>>>>>> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
>>>>>>>> index 2aa55d030..67f0bdf 100644
>>>>>>>> --- a/kernel/bpf/arraymap.c
>>>>>>>> +++ b/kernel/bpf/arraymap.c
>>>>>>>> @@ -358,32 +358,6 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
>>>>>>>>  	rcu_read_unlock();
>>>>>>>>  }
>>>>>>>>  
>>>>>>>> -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
>>>>>>>> -			       u32 btf_key_id, u32 btf_value_id)
>>>>>>>> -{
>>>>>>>> -	const struct btf_type *key_type, *value_type;
>>>>>>>> -	u32 key_size, value_size;
>>>>>>>> -	u32 int_data;
>>>>>>>> -
>>>>>>>> -	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
>>>>>>>> -	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
>>>>>>>> -		return -EINVAL;
>>>>>>>> -
>>>>>>>> -	int_data = *(u32 *)(key_type + 1);
>>>>>>>> -	/* bpf array can only take a u32 key.  This check makes
>>>>>>>> -	 * sure that the btf matches the attr used during map_create.
>>>>>>>> -	 */
>>>>>>>> -	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
>>>>>>>> -	    BTF_INT_OFFSET(int_data))
>>>>>>>> -		return -EINVAL;
>>>>>>>
>>>>>>> I think most of these checks are still necessary for array type.
>>>>>>> Relaxing BTF array key from BTF_KIND_INT to, for example, BTF_KIND_ENUM
>>>>>>> is probably ok, but key being BTF_KIND_PTR or BTF_KIND_ARRAY doesn't makes sense.
>>>>>>
>>>>>> Hmm, so on 64 bit archs BTF_KIND_PTR would get rejected for array,
>>>>>> on 32 bit it may be allowed due to sizeof(void *) == 4. BTF_KIND_ARRAY
>>>>>> could be array of u8 foo[4], for example, or u16 foo[2]. But how would
>>>>>> it ultimately be different from e.g. having 'struct a' versus 'struct b'
>>>>>> where both are of same size and while actual key has 'struct a', the one
>>>>>> who writes the prog resp. loads the BTF into the kernel would lie about
>>>>>> it stating it's of type 'struct b' instead? It's basically trusting the
>>>>>> app that it advertised sane key types which kernel is propagating back.
>>>>>
>>>>> for hash map - yes. the kernel cannot yet catch the lie that
>>>>> key == 'struct a' that user said in BTF is not what program used
>>>>> (which used 'struct b' of the same size).
>>>>> Eventually we will annotate all load/store in the program and will
>>>>> make sure that memory access match what BTF said.
>>>>
>>>> But in that case, would you reject the program? E.g. from prog point of
>>>> view, it's just a buffer of x bytes, so key could be casted to different
>>>> struct/types potentially and used for lookup; similar with value if you
>>>> would go the route to annotate all access into it. I don't think this
>>>> serves as a security feature (since you might as well just load the prog
>>>> without BTF just fine), but it can be used to help verifier to perform
>>>> rewrites like in tracing for implicit bpf_probe_read() based on member
>>>> access. But also in that case, if you might have e.g. stale or wrong BTF
>>>> data e.g. of the whole kernel or some application it would follow/walk
>>>> that one instead. But such user error would be "acceptable" since it serves
>>>> as a hint, roughly similar to (explicitly) walking the data structures
>>>> based on the headers today, you do have better control in terms of header
>>>> mismatches in that you can ship the BTF directly from the build, but there's
>>>> still no guarantee in that sense that you "verified" that these bytes
>>>> originally were mapped to struct foo somewhere in a C program.
>>>
>>> I wouldn't view such key checks as safety feature, but rather
>>> as honesty check. Meaning that user space shouldn't be able to cheat
>>> by passing completely bogus BTF into the kernel that only
>>> statisfies single size check.
>>
>> Ok, meaning, we agree that BTF is passed as a hint to the kernel. It is
>> a 'should not cheat' requirement but not 'cannot cheat', meaning if it
>> was the latter, BTF is core part of the verifier's safety analysis /
>> checks, but as it's optional on top of it (well, due to compatibility it
>> kind of needs to be anyway), it doesn't affect kernel's safety in
>> relation to what the program is doing internally.
> 
> Eventually BTF will not be optional, since it's not another dwarf-like debug info.
> Today its type information serves the purpose of debug info,
> but it's more than that. It's a type description.
> Meaning that the kernel verifies it and the verifier will eventually use
> it to make safety decisions.
> Right now BTF is disjoined from the code, but later we'll tie
> them together and the verifer will start rejecting programs
> where memory accesses don't match the description.
> At that point if the code has annotated load/store they gotta be correct.
> We won't be able to do gradual tightening due to compatibility
> reasons, so when it happens it's gotta be fully strict from the start.
> So I really see it as "cannot cheat".

Okay.

>>> Say, BTF has a key that is a 4 byte struct { char a,b,c,d; };
>>> but program operates with it as u32. I very much would like
>>> the verifier to notice and reject such program, since if C code
>>> has struct { char a,b,c,d; }; the compiler won't generate u32 access
>>> to it unless C code type casts, but then llvm will warn. So for u32
>>> to legitimately appear in the generated code the struct should be:
>>> union {
>>>   struct { char a,b,c,d;}
>>>   u32 e;
>>> };
>>> Narrow access (like u8 load/store in the bpf prog form u32 BTF field)
>>> is ok, since that's normal compiler optimization, but any other
>>> field/size mismatch the verifier should reject to prevent cheating.
>>>
>>> In other words even proprietary bpf programs should not be able to cheat.
>>> If they provide BTF to the kernel, it should correspond exactly to the program.
>>> That's the key to _trusted_ introspection.
>>
>> But then wouldn't this artificially force users into programming /
>> thinking style that verifier dictates upon them similar as we have
>> today as opposed to further moving away from it to allow more C-style
>> programs to be accepted?
> 
> I think it will be the opposite. Strict code<->BTF check will force
> users to use llvm with C source (or other language with proper
> compiler support). Hacking in asm will get harder for folks
> who want to take advantage of BTF.
> 
>> But even if that is the goal, it is still used as a hint today, e.g.
>> even for an array I could tell the compiler that the key is '__u32'
>> for BTF sake while using the underlying key differently (e.g. as struct,
>> enum, etc) in order to pass the array_map_check_btf() check in the
>> kernel. 
> 
> correct that's possible today.
> 
>> Those type of programs would still need to be accepted due
>> to compatibility reasons. 
> 
> I don't think there will be a compatibility issue.
> The programs need to be tied with BTF first. The verifier cannot
> just get smarter and understand the field accesses.
> For load/store we only check alignment and range.
> We don't really know the field within a struct. In very simple cases
> of fixed offset the verifier can figure that out, but it's too limited
> to start acting on it.

Right, agree, for verifier w/o additional help via annotations, it's
likely not possible.

> I think all load/stores have to be annotated with BTF.
> Including packet, stack, and even bpf_probe_read.
> Then the verifier will see which ethernet or IP header is being loaded
> and can act on it. The life of HW JITs will become easier too.

That makes more sense, so really _all_ load / stores would have to
be annotated in the program with BTF info and verified, and at that
point we could be able to start relying on it in future.

>> Same if we only test on size match in other
>> maps, it's nothing different. I don't see how verifier would start
>> enforcing programs to be rejected based on access patterns on the
>> types;
> 
> right. In the current shape of progs and BTF I don't see how
> the verifier can do that.
> 
>> while it might work for newly added program types that this
>> could be enforced, it cannot for all the many existing program types,
> 
> even for existing program types it's ok, since recompiling the code
> with new annotations is an explicit act of the user.
> Future code annotations+BTF is not '-g' flag. It's not debug info.
> It's a new feature that is using ISA extensions.
> It's very similar to alu32 llvm flag.
> A bunch of existing programs will fail to load if they're simply
> recompiled with alu32.

To make them an ISA extension would be fine, and they could also
help with other features potentially (like loops). For annotations
and BTF to opt into, I think there would need to be additional
incentives from developer point of view, but they could certainly
be in the case of abstracting details away to make writing programs
more user friendly (as in mentioned implicit bpf_probe_read() case,
or perhaps to hide away pt_regs entirely).

>>> Admin that ssh-es into the box and operates with bpftool should be
>>> certain that the map introspection represent the real situation of
>>> the program. If proprietary prog is paranoid about layout of map
>>> it shouldn't be using BTF, but if it does, BTF should match.
>>> In the future I'd like to enfore availability of BTF for
>>> new program types.
>>
>> Sure, and in vast majority / normal cases it will represent the real
>> situation. Do you mistrust DWARF debugging data that gets shipped with
>> your distro, for example? pahole and other tools on top of it? It is
> 
> dwarf is not used to make safety checks or optimizations.
> well, unless we consider eh_frame to be part of dwarf, then yes
> dwarf has to be correct otherwise programs will crash
> and user space has to trust it for safety.
> 
>> pretty much the same situation. Given gdb example below, does gdb do
>> any verification of the binary in order to analyze load/store patterns
>> and whether it matches with what sits in DWARF as types?
> 
> gdb doesn't do it, since it treats dwarf as debug info only.
> 
>> as well map the types into memory and dump it as pretty print instead.
>> And that's okay, the one thing that needs to be verified for correctness
>> resp. validity is the debugging format itself so it can be used for
>> that.
> 
> I don't think BTF belongs in kernel if it's used for pretty print only.
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd8790d..eb76e8e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -48,8 +48,6 @@  struct bpf_map_ops {
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
 	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
 				  struct seq_file *m);
-	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
-			     u32 key_type_id, u32 value_type_id);
 };
 
 struct bpf_map {
@@ -118,7 +116,7 @@  static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
 
 static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
 {
-	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
+	return map->btf && map->ops->map_seq_show_elem;
 }
 
 extern const struct bpf_map_ops bpf_map_offload_ops;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2aa55d030..67f0bdf 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -358,32 +358,6 @@  static void array_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
-static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
-			       u32 btf_key_id, u32 btf_value_id)
-{
-	const struct btf_type *key_type, *value_type;
-	u32 key_size, value_size;
-	u32 int_data;
-
-	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
-	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
-		return -EINVAL;
-
-	int_data = *(u32 *)(key_type + 1);
-	/* bpf array can only take a u32 key.  This check makes
-	 * sure that the btf matches the attr used during map_create.
-	 */
-	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
-	    BTF_INT_OFFSET(int_data))
-		return -EINVAL;
-
-	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
-	if (!value_type || value_size != map->value_size)
-		return -EINVAL;
-
-	return 0;
-}
-
 const struct bpf_map_ops array_map_ops = {
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -394,7 +368,6 @@  const struct bpf_map_ops array_map_ops = {
 	.map_delete_elem = array_map_delete_elem,
 	.map_gen_lookup = array_map_gen_lookup,
 	.map_seq_show_elem = array_map_seq_show_elem,
-	.map_check_btf = array_map_check_btf,
 };
 
 const struct bpf_map_ops percpu_array_map_ops = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 76efe9a..400f27d 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -332,7 +332,8 @@  static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
 	struct bpf_map *map = arg;
 
 	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
-			     map->btf ? &bpffs_map_fops : &bpffs_obj_fops);
+			     bpf_map_support_seq_show(map) ?
+			     &bpffs_map_fops : &bpffs_obj_fops);
 }
 
 static struct dentry *
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5af4e9e..0b6f6e8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -455,6 +455,23 @@  static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
+static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+			 u32 btf_key_id, u32 btf_value_id)
+{
+	const struct btf_type *key_type, *value_type;
+	u32 key_size, value_size;
+
+	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
+	if (!key_type || key_size != map->key_size)
+		return -EINVAL;
+
+	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
+	if (!value_type || value_size != map->value_size)
+		return -EINVAL;
+
+	return 0;
+}
+
 #define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
@@ -489,8 +506,7 @@  static int map_create(union bpf_attr *attr)
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
-	if (bpf_map_support_seq_show(map) &&
-	    (attr->btf_key_type_id || attr->btf_value_type_id)) {
+	if (attr->btf_key_type_id || attr->btf_value_type_id) {
 		struct btf *btf;
 
 		if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
@@ -504,8 +520,8 @@  static int map_create(union bpf_attr *attr)
 			goto free_map_nouncharge;
 		}
 
-		err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id,
-					      attr->btf_value_type_id);
+		err = map_check_btf(map, btf, attr->btf_key_type_id,
+				    attr->btf_value_type_id);
 		if (err) {
 			btf_put(btf);
 			goto free_map_nouncharge;