
[RFC,bpf-next,2/6] bpf: add bpf_get_stack helper

Message ID 20180406214846.916265-3-yhs@fb.com
State RFC, archived
Delegated to: BPF Maintainers
Headers show
Series bpf: add bpf_get_stack helper

Commit Message

Yonghong Song April 6, 2018, 9:48 p.m. UTC
Currently, the stackmap and the bpf_get_stackid helper are provided
for bpf programs to get the stack trace. This approach has a
limitation, though: if two stack traces have the same hash, only one
of them is stored in the stackmap table, so some stack traces are
missing from the user's perspective.

This patch implements a new helper, bpf_get_stack, which returns
stack traces directly to the bpf program. The bpf program can then
see all stack traces, and either do in-kernel processing or send the
stack traces to user space through a shared map or
bpf_perf_event_output.
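
As a rough usage sketch (not part of this patch), a tracing program in the
samples/bpf style of the time could collect frames and forward them with
bpf_perf_event_output; the map name, the attach point and the helper
declarations assumed to be in bpf_helpers.h are made up for the example:

#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") stack_events = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(__u32),
	.max_entries = 128,
};

SEC("kprobe/sys_write")
int trace_stack(struct pt_regs *ctx)
{
	__u64 stack[32];	/* raw kernel ips, 256 bytes */
	int len;

	/* Fill stack[] with up to 32 kernel frames; len is the byte count. */
	len = bpf_get_stack(ctx, stack, sizeof(stack), 0);
	if (len <= 0 || len > sizeof(stack))
		return 0;

	/* Ship only the frames that were actually collected. */
	bpf_perf_event_output(ctx, &stack_events, BPF_F_CURRENT_CPU,
			      stack, len);
	return 0;
}

char _license[] SEC("license") = "GPL";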

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h      |  1 +
 include/linux/filter.h   |  3 ++-
 include/uapi/linux/bpf.h | 17 +++++++++++++--
 kernel/bpf/stackmap.c    | 56 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c     | 12 ++++++++++-
 kernel/bpf/verifier.c    |  3 +++
 kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 137 insertions(+), 5 deletions(-)

Comments

Alexei Starovoitov April 9, 2018, 3:34 a.m. UTC | #1
On 4/6/18 2:48 PM, Yonghong Song wrote:
> Currently, stackmap and bpf_get_stackid helper are provided
> for bpf program to get the stack trace. This approach has
> a limitation though. If two stack traces have the same hash,
> only one will get stored in the stackmap table,
> so some stack traces are missing from user perspective.
>
> This patch implements a new helper, bpf_get_stack, will
> send stack traces directly to bpf program. The bpf program
> is able to see all stack traces, and then can do in-kernel
> processing or send stack traces to user space through
> shared map or bpf_perf_event_output.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h      |  1 +
>  include/linux/filter.h   |  3 ++-
>  include/uapi/linux/bpf.h | 17 +++++++++++++--
>  kernel/bpf/stackmap.c    | 56 ++++++++++++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c     | 12 ++++++++++-
>  kernel/bpf/verifier.c    |  3 +++
>  kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++++++++-
>  7 files changed, 137 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 95a7abd..72ccb9a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -676,6 +676,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
>  extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
>  extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
>  extern const struct bpf_func_proto bpf_get_stackid_proto;
> +extern const struct bpf_func_proto bpf_get_stack_proto;
>  extern const struct bpf_func_proto bpf_sock_map_update_proto;
>
>  /* Shared helpers among cBPF and eBPF. */
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index fc4e8f9..9b64f63 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -467,7 +467,8 @@ struct bpf_prog {
>  				dst_needed:1,	/* Do we need dst entry? */
>  				blinded:1,	/* Was blinded */
>  				is_func:1,	/* program is a bpf function */
> -				kprobe_override:1; /* Do we override a kprobe? */
> +				kprobe_override:1, /* Do we override a kprobe? */
> +				need_callchain_buf:1; /* Needs callchain buffer? */
>  	enum bpf_prog_type	type;		/* Type of BPF program */
>  	enum bpf_attach_type	expected_attach_type; /* For some prog types */
>  	u32			len;		/* Number of filter blocks */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index c5ec897..a4ff5b7 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -517,6 +517,17 @@ union bpf_attr {
>   *             other bits - reserved
>   *     Return: >= 0 stackid on success or negative error
>   *
> + * int bpf_get_stack(ctx, buf, size, flags)
> + *     walk user or kernel stack and store the ips in buf
> + *     @ctx: struct pt_regs*
> + *     @buf: user buffer to fill stack
> + *     @size: the buf size
> + *     @flags: bits 0-7 - numer of stack frames to skip
> + *             bit 8 - collect user stack instead of kernel
> + *             bit 11 - get build-id as well if user stack
> + *             other bits - reserved
> + *     Return: >= 0 size copied on success or negative error
> + *
>   * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
>   *     calculate csum diff
>   *     @from: raw from buffer
> @@ -821,7 +832,8 @@ union bpf_attr {
>  	FN(msg_apply_bytes),		\
>  	FN(msg_cork_bytes),		\
>  	FN(msg_pull_data),		\
> -	FN(bind),
> +	FN(bind),			\
> +	FN(get_stack),
>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -855,11 +867,12 @@ enum bpf_func_id {
>  /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
>  #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
>
> -/* BPF_FUNC_get_stackid flags. */
> +/* BPF_FUNC_get_stackid and BPF_FUNC_get_stack flags. */
>  #define BPF_F_SKIP_FIELD_MASK		0xffULL
>  #define BPF_F_USER_STACK		(1ULL << 8)
>  #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
>  #define BPF_F_REUSE_STACKID		(1ULL << 10)
> +#define BPF_F_USER_BUILD_ID		(1ULL << 11)

the comment above is not quite correct.
This new flag is only available for new helper.

>
>  /* BPF_FUNC_skb_set_tunnel_key flags. */
>  #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 04f6ec1..371c72e 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -402,6 +402,62 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
>  	.arg3_type	= ARG_ANYTHING,
>  };
>
> +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
> +	   u64, flags)
> +{
> +	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
> +	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
> +	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
> +	bool user = flags & BPF_F_USER_STACK;
> +	struct perf_callchain_entry *trace;
> +	bool kernel = !user;
> +	u64 *ips;
> +
> +	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
> +			       BPF_F_USER_BUILD_ID)))
> +		return -EINVAL;
> +
> +	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
> +					    : sizeof(u64);
> +	if (unlikely(size % elem_size))
> +		return -EINVAL;
> +
> +	num_elem = size / elem_size;
> +	if (sysctl_perf_event_max_stack < num_elem)
> +		init_nr = 0;

prog's buffer should be zero padded in this case since it
points to uninit_mem.

> +	else
> +		init_nr = sysctl_perf_event_max_stack - num_elem;
> +	trace = get_perf_callchain(regs, init_nr, kernel, user,
> +				   sysctl_perf_event_max_stack, false, false);
> +	if (unlikely(!trace))
> +		return -EFAULT;
> +
> +	trace_nr = trace->nr - init_nr;
> +	if (trace_nr <= skip)
> +		return -EFAULT;
> +
> +	trace_nr -= skip;
> +	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
> +	copy_len = trace_nr * elem_size;
> +	ips = trace->ip + skip + init_nr;
> +	if (user && user_build_id)

the combination of kern + user_build_id should probably be rejected
earlier with einval.

> +		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
> +	else
> +		memcpy(buf, ips, copy_len);
> +
> +	return copy_len;
> +}
> +
> +const struct bpf_func_proto bpf_get_stack_proto = {
> +	.func		= bpf_get_stack,
> +	.gpl_only	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
> +	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,

why allow zero size?
I'm not sure the helper will work correctly when size=0

> +	.arg4_type	= ARG_ANYTHING,
> +};
> +
>  /* Called from eBPF program */
>  static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
>  {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 0244973..2aa3a65 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -984,10 +984,13 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
>  static void __bpf_prog_put_rcu(struct rcu_head *rcu)
>  {
>  	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
> +	bool need_callchain_buf = aux->prog->need_callchain_buf;
>
>  	free_used_maps(aux);
>  	bpf_prog_uncharge_memlock(aux->prog);
>  	security_bpf_prog_free(aux);
> +	if (need_callchain_buf)
> +		put_callchain_buffers();
>  	bpf_prog_free(aux->prog);
>  }
>
> @@ -1004,7 +1007,8 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
>  			bpf_prog_kallsyms_del(prog->aux->func[i]);
>  		bpf_prog_kallsyms_del(prog);
>
> -		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
> +		synchronize_rcu();
> +		__bpf_prog_put_rcu(&prog->aux->rcu);

there should have been lockdep splat.
We cannot call synchronize_rcu here, since we cannot sleep
in some cases.
Yonghong Song April 9, 2018, 4:53 a.m. UTC | #2
On 4/8/18 8:34 PM, Alexei Starovoitov wrote:
> On 4/6/18 2:48 PM, Yonghong Song wrote:
>> Currently, stackmap and bpf_get_stackid helper are provided
>> for bpf program to get the stack trace. This approach has
>> a limitation though. If two stack traces have the same hash,
>> only one will get stored in the stackmap table,
>> so some stack traces are missing from user perspective.
>>
>> This patch implements a new helper, bpf_get_stack, will
>> send stack traces directly to bpf program. The bpf program
>> is able to see all stack traces, and then can do in-kernel
>> processing or send stack traces to user space through
>> shared map or bpf_perf_event_output.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>  include/linux/bpf.h      |  1 +
>>  include/linux/filter.h   |  3 ++-
>>  include/uapi/linux/bpf.h | 17 +++++++++++++--
>>  kernel/bpf/stackmap.c    | 56 
>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>  kernel/bpf/syscall.c     | 12 ++++++++++-
>>  kernel/bpf/verifier.c    |  3 +++
>>  kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++++++++-
>>  7 files changed, 137 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 95a7abd..72ccb9a 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -676,6 +676,7 @@ extern const struct bpf_func_proto 
>> bpf_get_current_comm_proto;
>>  extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
>>  extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
>>  extern const struct bpf_func_proto bpf_get_stackid_proto;
>> +extern const struct bpf_func_proto bpf_get_stack_proto;
>>  extern const struct bpf_func_proto bpf_sock_map_update_proto;
>>
>>  /* Shared helpers among cBPF and eBPF. */
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index fc4e8f9..9b64f63 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -467,7 +467,8 @@ struct bpf_prog {
>>                  dst_needed:1,    /* Do we need dst entry? */
>>                  blinded:1,    /* Was blinded */
>>                  is_func:1,    /* program is a bpf function */
>> -                kprobe_override:1; /* Do we override a kprobe? */
>> +                kprobe_override:1, /* Do we override a kprobe? */
>> +                need_callchain_buf:1; /* Needs callchain buffer? */
>>      enum bpf_prog_type    type;        /* Type of BPF program */
>>      enum bpf_attach_type    expected_attach_type; /* For some prog 
>> types */
>>      u32            len;        /* Number of filter blocks */
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index c5ec897..a4ff5b7 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -517,6 +517,17 @@ union bpf_attr {
>>   *             other bits - reserved
>>   *     Return: >= 0 stackid on success or negative error
>>   *
>> + * int bpf_get_stack(ctx, buf, size, flags)
>> + *     walk user or kernel stack and store the ips in buf
>> + *     @ctx: struct pt_regs*
>> + *     @buf: user buffer to fill stack
>> + *     @size: the buf size
>> + *     @flags: bits 0-7 - numer of stack frames to skip
>> + *             bit 8 - collect user stack instead of kernel
>> + *             bit 11 - get build-id as well if user stack
>> + *             other bits - reserved
>> + *     Return: >= 0 size copied on success or negative error
>> + *
>>   * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
>>   *     calculate csum diff
>>   *     @from: raw from buffer
>> @@ -821,7 +832,8 @@ union bpf_attr {
>>      FN(msg_apply_bytes),        \
>>      FN(msg_cork_bytes),        \
>>      FN(msg_pull_data),        \
>> -    FN(bind),
>> +    FN(bind),            \
>> +    FN(get_stack),
>>
>>  /* integer value in 'imm' field of BPF_CALL instruction selects which 
>> helper
>>   * function eBPF program intends to call
>> @@ -855,11 +867,12 @@ enum bpf_func_id {
>>  /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
>>  #define BPF_F_TUNINFO_IPV6        (1ULL << 0)
>>
>> -/* BPF_FUNC_get_stackid flags. */
>> +/* BPF_FUNC_get_stackid and BPF_FUNC_get_stack flags. */
>>  #define BPF_F_SKIP_FIELD_MASK        0xffULL
>>  #define BPF_F_USER_STACK        (1ULL << 8)
>>  #define BPF_F_FAST_STACK_CMP        (1ULL << 9)
>>  #define BPF_F_REUSE_STACKID        (1ULL << 10)
>> +#define BPF_F_USER_BUILD_ID        (1ULL << 11)
> 
> the comment above is not quite correct.
> This new flag is only available for new helper.

Right, some flags are used for both helpers and some are only used for 
one of them. Will make it clear in the next revision.
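
For illustration, one way the header comment could be split so each flag's
scope is explicit (wording is only a sketch, not necessarily what the next
revision used):

/* BPF_FUNC_get_stackid and BPF_FUNC_get_stack flags. */
#define BPF_F_SKIP_FIELD_MASK		0xffULL
#define BPF_F_USER_STACK		(1ULL << 8)
/* BPF_FUNC_get_stackid only. */
#define BPF_F_FAST_STACK_CMP		(1ULL << 9)
#define BPF_F_REUSE_STACKID		(1ULL << 10)
/* BPF_FUNC_get_stack only. */
#define BPF_F_USER_BUILD_ID		(1ULL << 11)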

> 
>>
>>  /* BPF_FUNC_skb_set_tunnel_key flags. */
>>  #define BPF_F_ZERO_CSUM_TX        (1ULL << 1)
>> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
>> index 04f6ec1..371c72e 100644
>> --- a/kernel/bpf/stackmap.c
>> +++ b/kernel/bpf/stackmap.c
>> @@ -402,6 +402,62 @@ const struct bpf_func_proto bpf_get_stackid_proto 
>> = {
>>      .arg3_type    = ARG_ANYTHING,
>>  };
>>
>> +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, 
>> size,
>> +       u64, flags)
>> +{
>> +    u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
>> +    bool user_build_id = flags & BPF_F_USER_BUILD_ID;
>> +    u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
>> +    bool user = flags & BPF_F_USER_STACK;
>> +    struct perf_callchain_entry *trace;
>> +    bool kernel = !user;
>> +    u64 *ips;
>> +
>> +    if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
>> +                   BPF_F_USER_BUILD_ID)))
>> +        return -EINVAL;
>> +
>> +    elem_size = (user && user_build_id) ? sizeof(struct 
>> bpf_stack_build_id)
>> +                        : sizeof(u64);
>> +    if (unlikely(size % elem_size))
>> +        return -EINVAL;
>> +
>> +    num_elem = size / elem_size;
>> +    if (sysctl_perf_event_max_stack < num_elem)
>> +        init_nr = 0;
> 
> prog's buffer should be zero padded in this case since it
> points to uninit_mem.

Will make the change in the next revision.
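
For reference, a rough sketch of where such padding could go in
bpf_get_stack() (illustrative only, not the posted code):

	copy_len = trace_nr * elem_size;
	ips = trace->ip + skip + init_nr;
	if (user && user_build_id)
		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
	else
		memcpy(buf, ips, copy_len);

	/* buf is ARG_PTR_TO_UNINIT_MEM, so clear the unused tail rather
	 * than leaving it uninitialized. */
	if (size > copy_len)
		memset(buf + copy_len, 0, size - copy_len);

	return copy_len;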

> 
>> +    else
>> +        init_nr = sysctl_perf_event_max_stack - num_elem;
>> +    trace = get_perf_callchain(regs, init_nr, kernel, user,
>> +                   sysctl_perf_event_max_stack, false, false);
>> +    if (unlikely(!trace))
>> +        return -EFAULT;
>> +
>> +    trace_nr = trace->nr - init_nr;
>> +    if (trace_nr <= skip)
>> +        return -EFAULT;
>> +
>> +    trace_nr -= skip;
>> +    trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
>> +    copy_len = trace_nr * elem_size;
>> +    ips = trace->ip + skip + init_nr;
>> +    if (user && user_build_id)
> 
> the combination of kern + user_build_id should probably be rejected
> earlier with einval.

Right, I missed this case. It should be rejected.
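
A possible shape of that check, next to the existing flags validation
(sketch only):

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		return -EINVAL;

	/* Build-id translation only makes sense for user stacks. */
	if (unlikely(user_build_id && !user))
		return -EINVAL;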

> 
>> +        stack_map_get_build_id_offset(buf, ips, trace_nr, user);
>> +    else
>> +        memcpy(buf, ips, copy_len);
>> +
>> +    return copy_len;
>> +}
>> +
>> +const struct bpf_func_proto bpf_get_stack_proto = {
>> +    .func        = bpf_get_stack,
>> +    .gpl_only    = true,
>> +    .ret_type    = RET_INTEGER,
>> +    .arg1_type    = ARG_PTR_TO_CTX,
>> +    .arg2_type    = ARG_PTR_TO_UNINIT_MEM,
>> +    .arg3_type    = ARG_CONST_SIZE_OR_ZERO,
> 
> why allow zero size?
> I'm not sure the helper will work correctly when size=0

The only reason I had is to make bpf programs easier to write,
so they do not need to test for zero size, similar to
bpf_probe_read/bpf_perf_event_output.

I have double-checked my implementation, and it should
handle zero size properly.

Let me double check whether disallowing zero size still
allows reasonable bpf program code that passes the verifier.
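
For illustration, the usual way a program handles a size that is not a
compile-time constant is to clamp it before the call; the extra zero test
would only be needed if the argument type were plain ARG_CONST_SIZE
(the names here are made up for the sketch):

	/* Fragment from a program like the sketch in the commit message;
	 * ctx and requested stand in for values the program already has. */
	__u64 frames[32];
	__u32 size = requested;
	int len;

	if (size > sizeof(frames))
		size = sizeof(frames);
	if (size == 0)		/* only required without _OR_ZERO */
		return 0;

	len = bpf_get_stack(ctx, frames, size, 0);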

> 
>> +    .arg4_type    = ARG_ANYTHING,
>> +};
>> +
>>  /* Called from eBPF program */
>>  static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
>>  {
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 0244973..2aa3a65 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -984,10 +984,13 @@ void bpf_prog_free_id(struct bpf_prog *prog, 
>> bool do_idr_lock)
>>  static void __bpf_prog_put_rcu(struct rcu_head *rcu)
>>  {
>>      struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, 
>> rcu);
>> +    bool need_callchain_buf = aux->prog->need_callchain_buf;
>>
>>      free_used_maps(aux);
>>      bpf_prog_uncharge_memlock(aux->prog);
>>      security_bpf_prog_free(aux);
>> +    if (need_callchain_buf)
>> +        put_callchain_buffers();
>>      bpf_prog_free(aux->prog);
>>  }
>>
>> @@ -1004,7 +1007,8 @@ static void __bpf_prog_put(struct bpf_prog 
>> *prog, bool do_idr_lock)
>>              bpf_prog_kallsyms_del(prog->aux->func[i]);
>>          bpf_prog_kallsyms_del(prog);
>>
>> -        call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
>> +        synchronize_rcu();
>> +        __bpf_prog_put_rcu(&prog->aux->rcu);
> 
> there should have been lockdep splat.
> We cannot call synchronize_rcu here, since we cannot sleep
> in some cases.

Let me double check this. The following is the reason
why I am using synchronize_rcu().

With call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu),
__bpf_prog_put_rcu() calls put_callchain_buffers(), which
takes a mutex, so a runtime with CONFIG_DEBUG_ATOMIC_SLEEP=y
complains, since a potential sleep inside an RCU callback is not
allowed.
Alexei Starovoitov April 9, 2018, 5:02 a.m. UTC | #3
On 4/8/18 9:53 PM, Yonghong Song wrote:
>>> @@ -1004,7 +1007,8 @@ static void __bpf_prog_put(struct bpf_prog
>>> *prog, bool do_idr_lock)
>>>              bpf_prog_kallsyms_del(prog->aux->func[i]);
>>>          bpf_prog_kallsyms_del(prog);
>>>
>>> -        call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
>>> +        synchronize_rcu();
>>> +        __bpf_prog_put_rcu(&prog->aux->rcu);
>>
>> there should have been lockdep splat.
>> We cannot call synchronize_rcu here, since we cannot sleep
>> in some cases.
>
> Let me double check this. The following is the reason
> why I am using synchronize_rcu().
>
> With call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu) and
> _bpf_prog_put_rcu calls put_callchain_buffers() which
> calls mutex_lock(), the runtime with CONFIG_DEBUG_ATOMIC_SLEEP=y
> will complains since potential sleep inside the call_rcu is not
> allowed.

I see. Indeed. We cannot call put_callchain_buffers() from rcu callback,
but doing synchronize_rcu() here is also not possible.
How about moving put_callchain into bpf_prog_free_deferred() ?
Daniel Borkmann April 9, 2018, 10:01 a.m. UTC | #4
On 04/09/2018 07:02 AM, Alexei Starovoitov wrote:
> On 4/8/18 9:53 PM, Yonghong Song wrote:
>>>> @@ -1004,7 +1007,8 @@ static void __bpf_prog_put(struct bpf_prog
>>>> *prog, bool do_idr_lock)
>>>>              bpf_prog_kallsyms_del(prog->aux->func[i]);
>>>>          bpf_prog_kallsyms_del(prog);
>>>>
>>>> -        call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
>>>> +        synchronize_rcu();
>>>> +        __bpf_prog_put_rcu(&prog->aux->rcu);
>>>
>>> there should have been lockdep splat.
>>> We cannot call synchronize_rcu here, since we cannot sleep
>>> in some cases.
>>
>> Let me double check this. The following is the reason
>> why I am using synchronize_rcu().
>>
>> With call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu) and
>> _bpf_prog_put_rcu calls put_callchain_buffers() which
>> calls mutex_lock(), the runtime with CONFIG_DEBUG_ATOMIC_SLEEP=y
>> will complains since potential sleep inside the call_rcu is not
>> allowed.
> 
> I see. Indeed. We cannot call put_callchain_buffers() from rcu callback,
> but doing synchronize_rcu() here is also not possible.
> How about moving put_callchain into bpf_prog_free_deferred() ?

+1, the assumption is that you can call bpf_prog_put() and also the
bpf_map_put() from any context. Sleeping here for a long time might
subtly break things badly.
Yonghong Song April 9, 2018, 4:52 p.m. UTC | #5
On 4/9/18 3:01 AM, Daniel Borkmann wrote:
> On 04/09/2018 07:02 AM, Alexei Starovoitov wrote:
>> On 4/8/18 9:53 PM, Yonghong Song wrote:
>>>>> @@ -1004,7 +1007,8 @@ static void __bpf_prog_put(struct bpf_prog
>>>>> *prog, bool do_idr_lock)
>>>>>               bpf_prog_kallsyms_del(prog->aux->func[i]);
>>>>>           bpf_prog_kallsyms_del(prog);
>>>>>
>>>>> -        call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
>>>>> +        synchronize_rcu();
>>>>> +        __bpf_prog_put_rcu(&prog->aux->rcu);
>>>>
>>>> there should have been lockdep splat.
>>>> We cannot call synchronize_rcu here, since we cannot sleep
>>>> in some cases.
>>>
>>> Let me double check this. The following is the reason
>>> why I am using synchronize_rcu().
>>>
>>> With call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu) and
>>> _bpf_prog_put_rcu calls put_callchain_buffers() which
>>> calls mutex_lock(), the runtime with CONFIG_DEBUG_ATOMIC_SLEEP=y
>>> will complains since potential sleep inside the call_rcu is not
>>> allowed.
>>
>> I see. Indeed. We cannot call put_callchain_buffers() from rcu callback,
>> but doing synchronize_rcu() here is also not possible.
>> How about moving put_callchain into bpf_prog_free_deferred() ?
> 
> +1, the assumption is that you can call bpf_prog_put() and also the
> bpf_map_put() from any context. Sleeping here for a long time might
> subtly break things badly.

Thanks for the suggestion! This should work.
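
For illustration, the suggested direction would roughly mean releasing the
buffers from the workqueue-deferred free path in kernel/bpf/core.c, where
taking a mutex is fine (sketch only, not necessarily the code that was
eventually merged):

static void bpf_prog_free_deferred(struct work_struct *work)
{
	struct bpf_prog_aux *aux = container_of(work, struct bpf_prog_aux, work);

	if (aux->prog->need_callchain_buf)
		put_callchain_buffers();

	/* ... existing teardown (offload destroy, subprog and jit free) ... */
}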

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 95a7abd..72ccb9a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -676,6 +676,7 @@  extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 
 /* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index fc4e8f9..9b64f63 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -467,7 +467,8 @@  struct bpf_prog {
 				dst_needed:1,	/* Do we need dst entry? */
 				blinded:1,	/* Was blinded */
 				is_func:1,	/* program is a bpf function */
-				kprobe_override:1; /* Do we override a kprobe? */
+				kprobe_override:1, /* Do we override a kprobe? */
+				need_callchain_buf:1; /* Needs callchain buffer? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec897..a4ff5b7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -517,6 +517,17 @@  union bpf_attr {
  *             other bits - reserved
  *     Return: >= 0 stackid on success or negative error
  *
+ * int bpf_get_stack(ctx, buf, size, flags)
+ *     walk user or kernel stack and store the ips in buf
+ *     @ctx: struct pt_regs*
+ *     @buf: user buffer to fill stack
+ *     @size: the buf size
+ *     @flags: bits 0-7 - number of stack frames to skip
+ *             bit 8 - collect user stack instead of kernel
+ *             bit 11 - get build-id as well if user stack
+ *             other bits - reserved
+ *     Return: >= 0 size copied on success or negative error
+ *
  * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
  *     calculate csum diff
  *     @from: raw from buffer
@@ -821,7 +832,8 @@  union bpf_attr {
 	FN(msg_apply_bytes),		\
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
-	FN(bind),
+	FN(bind),			\
+	FN(get_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -855,11 +867,12 @@  enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
-/* BPF_FUNC_get_stackid flags. */
+/* BPF_FUNC_get_stackid and BPF_FUNC_get_stack flags. */
 #define BPF_F_SKIP_FIELD_MASK		0xffULL
 #define BPF_F_USER_STACK		(1ULL << 8)
 #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
 #define BPF_F_REUSE_STACKID		(1ULL << 10)
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 04f6ec1..371c72e 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -402,6 +402,62 @@  const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+	   u64, flags)
+{
+	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+	bool user = flags & BPF_F_USER_STACK;
+	struct perf_callchain_entry *trace;
+	bool kernel = !user;
+	u64 *ips;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_USER_BUILD_ID)))
+		return -EINVAL;
+
+	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
+					    : sizeof(u64);
+	if (unlikely(size % elem_size))
+		return -EINVAL;
+
+	num_elem = size / elem_size;
+	if (sysctl_perf_event_max_stack < num_elem)
+		init_nr = 0;
+	else
+		init_nr = sysctl_perf_event_max_stack - num_elem;
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
+	if (unlikely(!trace))
+		return -EFAULT;
+
+	trace_nr = trace->nr - init_nr;
+	if (trace_nr <= skip)
+		return -EFAULT;
+
+	trace_nr -= skip;
+	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+	copy_len = trace_nr * elem_size;
+	ips = trace->ip + skip + init_nr;
+	if (user && user_build_id)
+		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
+	else
+		memcpy(buf, ips, copy_len);
+
+	return copy_len;
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+	.func		= bpf_get_stack,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0244973..2aa3a65 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -984,10 +984,13 @@  void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 {
 	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+	bool need_callchain_buf = aux->prog->need_callchain_buf;
 
 	free_used_maps(aux);
 	bpf_prog_uncharge_memlock(aux->prog);
 	security_bpf_prog_free(aux);
+	if (need_callchain_buf)
+		put_callchain_buffers();
 	bpf_prog_free(aux->prog);
 }
 
@@ -1004,7 +1007,8 @@  static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 			bpf_prog_kallsyms_del(prog->aux->func[i]);
 		bpf_prog_kallsyms_del(prog);
 
-		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+		synchronize_rcu();
+		__bpf_prog_put_rcu(&prog->aux->rcu);
 	}
 }
 
@@ -1341,6 +1345,12 @@  static int bpf_prog_load(union bpf_attr *attr)
 	if (err)
 		goto free_used_maps;
 
+	if (prog->need_callchain_buf) {
+		err = get_callchain_buffers(sysctl_perf_event_max_stack);
+		if (err)
+			goto free_used_maps;
+	}
+
 	err = bpf_prog_new_fd(prog);
 	if (err < 0) {
 		/* failed to allocate fd.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5dd1dcb..aba9425 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2460,6 +2460,9 @@  static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
+	if (func_id == BPF_FUNC_get_stack)
+		env->prog->need_callchain_buf = true;
+
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d88e96d..fe8476f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,7 @@ 
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 /**
  * trace_call_bpf - invoke BPF program
@@ -577,6 +578,8 @@  kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
@@ -664,6 +667,25 @@  static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
+	   u64, flags)
+{
+	struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_tp = {
+	.func		= bpf_get_stack_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -672,6 +694,8 @@  tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
@@ -734,6 +758,8 @@  pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	case BPF_FUNC_perf_prog_read_value:
 		return &bpf_perf_prog_read_value_proto;
 	default:
@@ -744,7 +770,7 @@  pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 /*
  * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
  * to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output and/or bpf_get_stack_id
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
  */
 static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
 BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
@@ -787,6 +813,26 @@  static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+	perf_fetch_caller_regs(regs);
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+	.func		= bpf_get_stack_raw_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -795,6 +841,8 @@  raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_raw_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_raw_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_raw_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
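
Finally, a rough sketch (again not part of the patch) of asking for a user
stack with build-id translation; struct bpf_stack_build_id comes from the
existing stackmap build-id support, and the attach point is an arbitrary
example:

SEC("kprobe/sys_nanosleep")
int trace_user_stack(struct pt_regs *ctx)
{
	struct bpf_stack_build_id entries[8];
	int len;

	len = bpf_get_stack(ctx, entries, sizeof(entries),
			    BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
	if (len <= 0)
		return 0;

	/* len is a byte count: len / sizeof(entries[0]) frames were captured,
	 * each carrying a build id plus a file offset (or a raw ip when the
	 * offset cannot be resolved).  They could now be forwarded via a map
	 * or bpf_perf_event_output(), as in the earlier sketch. */
	return 0;
}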