diff mbox

[v3,3/6] bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands

Message ID 1472241532-11682-4-git-send-email-daniel@zonque.org
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Daniel Mack Aug. 26, 2016, 7:58 p.m. UTC
Extend the bpf(2) syscall by two new commands, BPF_PROG_ATTACH and
BPF_PROG_DETACH which allow attaching and detaching eBPF programs
to a target.

On the API level, the target could be anything that has an fd in
userspace, hence the name of the field in union bpf_attr is called
'target_fd'.

When called with BPF_ATTACH_TYPE_CGROUP_INET_{E,IN}GRESS, the target is
expected to be a valid file descriptor of a cgroup v2 directory which
has the bpf controller enabled. These are the only use-cases
implemented by this patch at this point, but more can be added.

If a program of the given type already exists in the given cgroup,
the program is swapped automically, so userspace does not have to drop
an existing program first before installing a new one, which would
otherwise leave a gap in which no program is attached.

For more information on the propagation logic to subcgroups, please
refer to the bpf cgroup controller implementation.

The API is guarded by CAP_NET_ADMIN.

Signed-off-by: Daniel Mack <daniel@zonque.org>
syscall
---
 include/uapi/linux/bpf.h |  9 ++++++
 kernel/bpf/syscall.c     | 83 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)

Comments

Alexei Starovoitov Aug. 27, 2016, 12:08 a.m. UTC | #1
On Fri, Aug 26, 2016 at 09:58:49PM +0200, Daniel Mack wrote:
> Extend the bpf(2) syscall by two new commands, BPF_PROG_ATTACH and
> BPF_PROG_DETACH which allow attaching and detaching eBPF programs
> to a target.
> 
> On the API level, the target could be anything that has an fd in
> userspace, hence the name of the field in union bpf_attr is called
> 'target_fd'.
> 
> When called with BPF_ATTACH_TYPE_CGROUP_INET_{E,IN}GRESS, the target is
> expected to be a valid file descriptor of a cgroup v2 directory which
> has the bpf controller enabled. These are the only use-cases
> implemented by this patch at this point, but more can be added.
> 
> If a program of the given type already exists in the given cgroup,
> the program is swapped automically, so userspace does not have to drop
> an existing program first before installing a new one, which would
> otherwise leave a gap in which no program is attached.
> 
> For more information on the propagation logic to subcgroups, please
> refer to the bpf cgroup controller implementation.
> 
> The API is guarded by CAP_NET_ADMIN.
> 
> Signed-off-by: Daniel Mack <daniel@zonque.org>
> syscall
> ---
>  include/uapi/linux/bpf.h |  9 ++++++
>  kernel/bpf/syscall.c     | 83 ++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 92 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 1d5db42..4cc2dcf 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -73,6 +73,8 @@ enum bpf_cmd {
>  	BPF_PROG_LOAD,
>  	BPF_OBJ_PIN,
>  	BPF_OBJ_GET,
> +	BPF_PROG_ATTACH,
> +	BPF_PROG_DETACH,
>  };
>  
>  enum bpf_map_type {
> @@ -147,6 +149,13 @@ union bpf_attr {
>  		__aligned_u64	pathname;
>  		__u32		bpf_fd;
>  	};
> +
> +	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
> +		__u32		target_fd;	/* container object to attach to */
> +		__u32		attach_bpf_fd;	/* eBPF program to attach */
> +		__u32		attach_type;	/* BPF_ATTACH_TYPE_* */
> +		__u64		attach_flags;
> +	};

there is a 4 byte hole in this struct. Can we pack it differently?

>  } __attribute__((aligned(8)));
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 228f962..cc4d603 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -822,6 +822,79 @@ static int bpf_obj_get(const union bpf_attr *attr)
>  	return bpf_obj_get_user(u64_to_ptr(attr->pathname));
>  }
>  
> +#ifdef CONFIG_CGROUP_BPF
> +static int bpf_prog_attach(const union bpf_attr *attr)
> +{
> +	struct bpf_prog *prog;
> +
> +	if (!capable(CAP_NET_ADMIN))
> +		return -EPERM;
> +
> +	/* Flags are unused for now */
> +	if (attr->attach_flags != 0)
> +		return -EINVAL;
> +
> +	switch (attr->attach_type) {
> +	case BPF_ATTACH_TYPE_CGROUP_INET_INGRESS:
> +	case BPF_ATTACH_TYPE_CGROUP_INET_EGRESS: {
> +		struct cgroup *cgrp;
> +
> +		prog = bpf_prog_get_type(attr->attach_bpf_fd,
> +					 BPF_PROG_TYPE_CGROUP_SOCKET_FILTER);
> +		if (IS_ERR(prog))
> +			return PTR_ERR(prog);
> +
> +		cgrp = cgroup_get_from_fd(attr->target_fd);
> +		if (IS_ERR(cgrp)) {
> +			bpf_prog_put(prog);
> +			return PTR_ERR(cgrp);
> +		}
> +
> +		cgroup_bpf_update(cgrp, prog, attr->attach_type);
> +		cgroup_put(cgrp);
> +
> +		break;
> +	}

this } formatting style is confusing. The above } looks
like it matches 'switch () {'.
May be move 'struct cgroup *cgrp' to the top to avoid that?
Daniel Borkmann Aug. 29, 2016, 11 p.m. UTC | #2
On 08/26/2016 09:58 PM, Daniel Mack wrote:
> Extend the bpf(2) syscall by two new commands, BPF_PROG_ATTACH and
> BPF_PROG_DETACH which allow attaching and detaching eBPF programs
> to a target.
>
> On the API level, the target could be anything that has an fd in
> userspace, hence the name of the field in union bpf_attr is called
> 'target_fd'.
>
> When called with BPF_ATTACH_TYPE_CGROUP_INET_{E,IN}GRESS, the target is
> expected to be a valid file descriptor of a cgroup v2 directory which
> has the bpf controller enabled. These are the only use-cases
> implemented by this patch at this point, but more can be added.
>
> If a program of the given type already exists in the given cgroup,
> the program is swapped automically, so userspace does not have to drop
> an existing program first before installing a new one, which would
> otherwise leave a gap in which no program is attached.
>
> For more information on the propagation logic to subcgroups, please
> refer to the bpf cgroup controller implementation.
>
> The API is guarded by CAP_NET_ADMIN.
>
> Signed-off-by: Daniel Mack <daniel@zonque.org>
> syscall

^^^ slipped in?

> ---
>   include/uapi/linux/bpf.h |  9 ++++++
>   kernel/bpf/syscall.c     | 83 ++++++++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 92 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 1d5db42..4cc2dcf 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -73,6 +73,8 @@ enum bpf_cmd {
>   	BPF_PROG_LOAD,
>   	BPF_OBJ_PIN,
>   	BPF_OBJ_GET,
> +	BPF_PROG_ATTACH,
> +	BPF_PROG_DETACH,
>   };
>
>   enum bpf_map_type {
> @@ -147,6 +149,13 @@ union bpf_attr {
>   		__aligned_u64	pathname;
>   		__u32		bpf_fd;
>   	};
> +
> +	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
> +		__u32		target_fd;	/* container object to attach to */
> +		__u32		attach_bpf_fd;	/* eBPF program to attach */
> +		__u32		attach_type;	/* BPF_ATTACH_TYPE_* */
> +		__u64		attach_flags;

Could we just do ...

__u32 dst_fd;
__u32 src_fd;
__u32 attach_type;

... and leave flags out, since unused anyway? Also see below.

> +	};
>   } __attribute__((aligned(8)));
>
>   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 228f962..cc4d603 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -822,6 +822,79 @@ static int bpf_obj_get(const union bpf_attr *attr)
>   	return bpf_obj_get_user(u64_to_ptr(attr->pathname));
>   }
>
> +#ifdef CONFIG_CGROUP_BPF

#define BPF_PROG_ATTACH_LAST_FIELD attach_type
#define BPF_PROG_DETACH_LAST_FIELD BPF_PROG_ATTACH_LAST_FIELD

> +static int bpf_prog_attach(const union bpf_attr *attr)
> +{
> +	struct bpf_prog *prog;
> +
> +	if (!capable(CAP_NET_ADMIN))
> +		return -EPERM;
> +
> +	/* Flags are unused for now */
> +	if (attr->attach_flags != 0)
> +		return -EINVAL;

Correct way would be to:

	if (CHECK_ATTR(BPF_PROG_ATTACH))
		return -EINVAL;

> +
> +	switch (attr->attach_type) {
> +	case BPF_ATTACH_TYPE_CGROUP_INET_INGRESS:
> +	case BPF_ATTACH_TYPE_CGROUP_INET_EGRESS: {
> +		struct cgroup *cgrp;
> +
> +		prog = bpf_prog_get_type(attr->attach_bpf_fd,
> +					 BPF_PROG_TYPE_CGROUP_SOCKET_FILTER);
> +		if (IS_ERR(prog))
> +			return PTR_ERR(prog);
> +
> +		cgrp = cgroup_get_from_fd(attr->target_fd);
> +		if (IS_ERR(cgrp)) {
> +			bpf_prog_put(prog);
> +			return PTR_ERR(cgrp);
> +		}
> +
> +		cgroup_bpf_update(cgrp, prog, attr->attach_type);
> +		cgroup_put(cgrp);
> +
> +		break;
> +	}
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static int bpf_prog_detach(const union bpf_attr *attr)
> +{
> +	if (!capable(CAP_NET_ADMIN))
> +		return -EPERM;
> +
> +	/* Flags are unused for now */
> +	if (attr->attach_flags != 0)
> +		return -EINVAL;

	if (CHECK_ATTR(BPF_PROG_DETACH))
		return -EINVAL;

> +	switch (attr->attach_type) {
> +	case BPF_ATTACH_TYPE_CGROUP_INET_INGRESS:
> +	case BPF_ATTACH_TYPE_CGROUP_INET_EGRESS: {
> +		struct cgroup *cgrp;
> +
> +		cgrp = cgroup_get_from_fd(attr->target_fd);
> +		if (IS_ERR(cgrp))
> +			return PTR_ERR(cgrp);
> +
> +		cgroup_bpf_update(cgrp, NULL, attr->attach_type);
> +		cgroup_put(cgrp);
> +
> +		break;
> +	}
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +#endif /* CONFIG_CGROUP_BPF */
> +
>   SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
>   {
>   	union bpf_attr attr = {};
> @@ -888,6 +961,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>   	case BPF_OBJ_GET:
>   		err = bpf_obj_get(&attr);
>   		break;
> +
> +#ifdef CONFIG_CGROUP_BPF
> +	case BPF_PROG_ATTACH:
> +		err = bpf_prog_attach(&attr);
> +		break;
> +	case BPF_PROG_DETACH:
> +		err = bpf_prog_detach(&attr);
> +		break;
> +#endif
> +
>   	default:
>   		err = -EINVAL;
>   		break;
>
Daniel Mack Sept. 5, 2016, 12:54 p.m. UTC | #3
On 08/30/2016 01:00 AM, Daniel Borkmann wrote:
> On 08/26/2016 09:58 PM, Daniel Mack wrote:

>>   enum bpf_map_type {
>> @@ -147,6 +149,13 @@ union bpf_attr {
>>   		__aligned_u64	pathname;
>>   		__u32		bpf_fd;
>>   	};
>> +
>> +	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
>> +		__u32		target_fd;	/* container object to attach to */
>> +		__u32		attach_bpf_fd;	/* eBPF program to attach */
>> +		__u32		attach_type;	/* BPF_ATTACH_TYPE_* */
>> +		__u64		attach_flags;
> 
> Could we just do ...
> 
> __u32 dst_fd;
> __u32 src_fd;
> __u32 attach_type;
> 
> ... and leave flags out, since unused anyway? Also see below.

I'd really like to keep the flags, even if they're unused right now.
This only adds 8 bytes during the syscall operation, so it doesn't harm.
However, we cannot change the userspace API after the fact, and who
knows what this (rather generic) interface will be used for later on.

> #define BPF_PROG_ATTACH_LAST_FIELD attach_type
> #define BPF_PROG_DETACH_LAST_FIELD BPF_PROG_ATTACH_LAST_FIELD

...

> 
> Correct way would be to:
> 
> 	if (CHECK_ATTR(BPF_PROG_ATTACH))
> 		return -EINVAL;

Will add - thanks!


Daniel
Daniel Mack Sept. 5, 2016, 12:56 p.m. UTC | #4
On 08/27/2016 02:08 AM, Alexei Starovoitov wrote:
> On Fri, Aug 26, 2016 at 09:58:49PM +0200, Daniel Mack wrote:

>> +
>> +	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
>> +		__u32		target_fd;	/* container object to attach to */
>> +		__u32		attach_bpf_fd;	/* eBPF program to attach */
>> +		__u32		attach_type;	/* BPF_ATTACH_TYPE_* */
>> +		__u64		attach_flags;
>> +	};
> 
> there is a 4 byte hole in this struct. Can we pack it differently?

Okay - I swapped "type" and "flags" to repair this.

>> +	switch (attr->attach_type) {
>> +	case BPF_ATTACH_TYPE_CGROUP_INET_INGRESS:
>> +	case BPF_ATTACH_TYPE_CGROUP_INET_EGRESS: {
>> +		struct cgroup *cgrp;
>> +
>> +		prog = bpf_prog_get_type(attr->attach_bpf_fd,
>> +					 BPF_PROG_TYPE_CGROUP_SOCKET_FILTER);
>> +		if (IS_ERR(prog))
>> +			return PTR_ERR(prog);
>> +
>> +		cgrp = cgroup_get_from_fd(attr->target_fd);
>> +		if (IS_ERR(cgrp)) {
>> +			bpf_prog_put(prog);
>> +			return PTR_ERR(cgrp);
>> +		}
>> +
>> +		cgroup_bpf_update(cgrp, prog, attr->attach_type);
>> +		cgroup_put(cgrp);
>> +
>> +		break;
>> +	}
> 
> this } formatting style is confusing. The above } looks
> like it matches 'switch () {'.
> May be move 'struct cgroup *cgrp' to the top to avoid that?

I kept it local to its users, but you're right, it's not worth it. Will
change.


Thanks,
Daniel
Daniel Borkmann Sept. 5, 2016, 1:56 p.m. UTC | #5
On 09/05/2016 02:54 PM, Daniel Mack wrote:
> On 08/30/2016 01:00 AM, Daniel Borkmann wrote:
>> On 08/26/2016 09:58 PM, Daniel Mack wrote:
>
>>>    enum bpf_map_type {
>>> @@ -147,6 +149,13 @@ union bpf_attr {
>>>    		__aligned_u64	pathname;
>>>    		__u32		bpf_fd;
>>>    	};
>>> +
>>> +	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
>>> +		__u32		target_fd;	/* container object to attach to */
>>> +		__u32		attach_bpf_fd;	/* eBPF program to attach */
>>> +		__u32		attach_type;	/* BPF_ATTACH_TYPE_* */
>>> +		__u64		attach_flags;
>>
>> Could we just do ...
>>
>> __u32 dst_fd;
>> __u32 src_fd;
>> __u32 attach_type;
>>
>> ... and leave flags out, since unused anyway? Also see below.
>
> I'd really like to keep the flags, even if they're unused right now.
> This only adds 8 bytes during the syscall operation, so it doesn't harm.
> However, we cannot change the userspace API after the fact, and who
> knows what this (rather generic) interface will be used for later on.

With the below suggestion added, then flags doesn't need to be
added currently as it can be done safely at a later point in time
with respecting old binaries. See also the syscall handling code
in kernel/bpf/syscall.c +825 and the CHECK_ATTR() macro. The
underlying idea of this was taken from perf_event_open() syscall
back then, see [1] for a summary.

   [1] https://lkml.org/lkml/2014/8/26/116

>> #define BPF_PROG_ATTACH_LAST_FIELD attach_type
>> #define BPF_PROG_DETACH_LAST_FIELD BPF_PROG_ATTACH_LAST_FIELD
>
> ...
>
>>
>> Correct way would be to:
>>
>> 	if (CHECK_ATTR(BPF_PROG_ATTACH))
>> 		return -EINVAL;
>
> Will add - thanks!
>
>
> Daniel
>
Daniel Mack Sept. 5, 2016, 2:09 p.m. UTC | #6
On 09/05/2016 03:56 PM, Daniel Borkmann wrote:
> On 09/05/2016 02:54 PM, Daniel Mack wrote:
>> On 08/30/2016 01:00 AM, Daniel Borkmann wrote:
>>> On 08/26/2016 09:58 PM, Daniel Mack wrote:
>>
>>>>    enum bpf_map_type {
>>>> @@ -147,6 +149,13 @@ union bpf_attr {
>>>>    		__aligned_u64	pathname;
>>>>    		__u32		bpf_fd;
>>>>    	};
>>>> +
>>>> +	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
>>>> +		__u32		target_fd;	/* container object to attach to */
>>>> +		__u32		attach_bpf_fd;	/* eBPF program to attach */
>>>> +		__u32		attach_type;	/* BPF_ATTACH_TYPE_* */
>>>> +		__u64		attach_flags;
>>>
>>> Could we just do ...
>>>
>>> __u32 dst_fd;
>>> __u32 src_fd;
>>> __u32 attach_type;
>>>
>>> ... and leave flags out, since unused anyway? Also see below.
>>
>> I'd really like to keep the flags, even if they're unused right now.
>> This only adds 8 bytes during the syscall operation, so it doesn't harm.
>> However, we cannot change the userspace API after the fact, and who
>> knows what this (rather generic) interface will be used for later on.
> 
> With the below suggestion added, then flags doesn't need to be
> added currently as it can be done safely at a later point in time
> with respecting old binaries. See also the syscall handling code
> in kernel/bpf/syscall.c +825 and the CHECK_ATTR() macro. The
> underlying idea of this was taken from perf_event_open() syscall
> back then, see [1] for a summary.
> 
>    [1] https://lkml.org/lkml/2014/8/26/116

Yes, I know that's possible, and I like the idea, but I don't think any
new interface should come without flags really, as flags are something
that will most certainly be needed at some point anyway. I didn't have
them in my first shot, but Alexei pointed out that they should be added,
and I agree.

Also, this optimization wouldn't make the transported struct payload any
smaller anyway, because the member of that union used by BPF_PROG_LOAD
is still by far the biggest.

I really don't think it's worth sparing 8 bytes here and then do the
binary compat dance after flags are added, for no real gain.



Thanks,
Daniel
David Laight Sept. 5, 2016, 3:30 p.m. UTC | #7
From: Daniel Mack

> >> +

> >> +	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */

> >> +		__u32		target_fd;	/* container object to attach to */

> >> +		__u32		attach_bpf_fd;	/* eBPF program to attach */

> >> +		__u32		attach_type;	/* BPF_ATTACH_TYPE_* */

> >> +		__u64		attach_flags;

> >> +	};

> >

> > there is a 4 byte hole in this struct. Can we pack it differently?

> 

> Okay - I swapped "type" and "flags" to repair this.


That just moves the pad to the end of the structure.
Still likely to cause a problem for 32bit apps on a 64bit kernel.
If you can't think of any flags, why 64 of them?

	David
Daniel Mack Sept. 5, 2016, 3:40 p.m. UTC | #8
On 09/05/2016 05:30 PM, David Laight wrote:
> From: Daniel Mack
>>>> +
>>>> +	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
>>>> +		__u32		target_fd;	/* container object to attach to */
>>>> +		__u32		attach_bpf_fd;	/* eBPF program to attach */
>>>> +		__u32		attach_type;	/* BPF_ATTACH_TYPE_* */
>>>> +		__u64		attach_flags;
>>>> +	};
>>>
>>> there is a 4 byte hole in this struct. Can we pack it differently?
>>
>> Okay - I swapped "type" and "flags" to repair this.
> 
> That just moves the pad to the end of the structure.
> Still likely to cause a problem for 32bit apps on a 64bit kernel.

What kind of problem do you have in mind? Again, this is embedded in a
union of much bigger total size, and the API is not used in any kind of
hot-path.

> If you can't think of any flags, why 64 of them?

I can't think of them right now, but this is about defining an API that
can be used in other context as well. Also, it doesn't matter at all,
they don't harm. IMO, it's just better to have them right away than to
do a binary compat dance once someone needs them.


Thanks,
Daniel
Daniel Borkmann Sept. 5, 2016, 5:09 p.m. UTC | #9
On 09/05/2016 04:09 PM, Daniel Mack wrote:
> On 09/05/2016 03:56 PM, Daniel Borkmann wrote:
>> On 09/05/2016 02:54 PM, Daniel Mack wrote:
>>> On 08/30/2016 01:00 AM, Daniel Borkmann wrote:
>>>> On 08/26/2016 09:58 PM, Daniel Mack wrote:
>>>
>>>>>     enum bpf_map_type {
>>>>> @@ -147,6 +149,13 @@ union bpf_attr {
>>>>>     		__aligned_u64	pathname;
>>>>>     		__u32		bpf_fd;
>>>>>     	};
>>>>> +
>>>>> +	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
>>>>> +		__u32		target_fd;	/* container object to attach to */
>>>>> +		__u32		attach_bpf_fd;	/* eBPF program to attach */
>>>>> +		__u32		attach_type;	/* BPF_ATTACH_TYPE_* */
>>>>> +		__u64		attach_flags;
>>>>
>>>> Could we just do ...
>>>>
>>>> __u32 dst_fd;
>>>> __u32 src_fd;
>>>> __u32 attach_type;
>>>>
>>>> ... and leave flags out, since unused anyway? Also see below.
>>>
>>> I'd really like to keep the flags, even if they're unused right now.
>>> This only adds 8 bytes during the syscall operation, so it doesn't harm.
>>> However, we cannot change the userspace API after the fact, and who
>>> knows what this (rather generic) interface will be used for later on.
>>
>> With the below suggestion added, then flags doesn't need to be
>> added currently as it can be done safely at a later point in time
>> with respecting old binaries. See also the syscall handling code
>> in kernel/bpf/syscall.c +825 and the CHECK_ATTR() macro. The
>> underlying idea of this was taken from perf_event_open() syscall
>> back then, see [1] for a summary.
>>
>>     [1] https://lkml.org/lkml/2014/8/26/116
>
> Yes, I know that's possible, and I like the idea, but I don't think any
> new interface should come without flags really, as flags are something
> that will most certainly be needed at some point anyway. I didn't have
> them in my first shot, but Alexei pointed out that they should be added,
> and I agree.
>
> Also, this optimization wouldn't make the transported struct payload any
> smaller anyway, because the member of that union used by BPF_PROG_LOAD
> is still by far the biggest.
>
> I really don't think it's worth sparing 8 bytes here and then do the
> binary compat dance after flags are added, for no real gain.

Sure, but there's not much of a dance needed, see for example how map_flags
were added some time ago. So, iff there's really no foreseeable use-case in
sight and since we have this flexibility in place already, then I don't quite
follow why it's needed, if there's zero pain to add it later on. I would
understand it of course, if it cannot be handled later on anymore.
Joe Perches Sept. 5, 2016, 5:29 p.m. UTC | #10
On Mon, 2016-09-05 at 14:56 +0200, Daniel Mack wrote:
> On 08/27/2016 02:08 AM, Alexei Starovoitov wrote:
[]
> > +	switch (attr->attach_type) {
> > +	case BPF_ATTACH_TYPE_CGROUP_INET_INGRESS:
> > +	case BPF_ATTACH_TYPE_CGROUP_INET_EGRESS: {
> > +		struct cgroup *cgrp;
> > +
> > +		prog = bpf_prog_get_type(attr->attach_bpf_fd,
> > +					 BPF_PROG_TYPE_CGROUP_SOCKET_FILTER);
> > +		if (IS_ERR(prog))
> > +			return PTR_ERR(prog);
> > +
> > +		cgrp = cgroup_get_from_fd(attr->target_fd);
> > +		if (IS_ERR(cgrp)) {
> > +			bpf_prog_put(prog);
> > +			return PTR_ERR(cgrp);
> > +		}
> > +
> > +		cgroup_bpf_update(cgrp, prog, attr->attach_type);
> > +		cgroup_put(cgrp);
> > +
> > +		break;
> > +	}
> this } formatting style is confusing. The above } looks
> like it matches 'switch () {'.
> May be move 'struct cgroup *cgrp' to the top to avoid that?

This style of case statements that declare local variables
with an open brace after the case statement

	switch (bar) {
	[cases...]
	case foo: {
		local declarations;

		code...
	}
	[cases...]
	}

is used quite frequently in the kernel.
I think it's fine as is.
Alexei Starovoitov Sept. 5, 2016, 6:32 p.m. UTC | #11
On 9/5/16 10:09 AM, Daniel Borkmann wrote:
> On 09/05/2016 04:09 PM, Daniel Mack wrote:
>> On 09/05/2016 03:56 PM, Daniel Borkmann wrote:
>>> On 09/05/2016 02:54 PM, Daniel Mack wrote:
>>>> On 08/30/2016 01:00 AM, Daniel Borkmann wrote:
>>>>> On 08/26/2016 09:58 PM, Daniel Mack wrote:
>>>>
>>>>>>     enum bpf_map_type {
>>>>>> @@ -147,6 +149,13 @@ union bpf_attr {
>>>>>>             __aligned_u64    pathname;
>>>>>>             __u32        bpf_fd;
>>>>>>         };
>>>>>> +
>>>>>> +    struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH
>>>>>> commands */
>>>>>> +        __u32        target_fd;    /* container object to attach
>>>>>> to */
>>>>>> +        __u32        attach_bpf_fd;    /* eBPF program to attach */
>>>>>> +        __u32        attach_type;    /* BPF_ATTACH_TYPE_* */
>>>>>> +        __u64        attach_flags;
>>>>>
>>>>> Could we just do ...
>>>>>
>>>>> __u32 dst_fd;
>>>>> __u32 src_fd;
>>>>> __u32 attach_type;
>>>>>
>>>>> ... and leave flags out, since unused anyway? Also see below.
>>>>
>>>> I'd really like to keep the flags, even if they're unused right now.
>>>> This only adds 8 bytes during the syscall operation, so it doesn't
>>>> harm.
>>>> However, we cannot change the userspace API after the fact, and who
>>>> knows what this (rather generic) interface will be used for later on.
>>>
>>> With the below suggestion added, then flags doesn't need to be
>>> added currently as it can be done safely at a later point in time
>>> with respecting old binaries. See also the syscall handling code
>>> in kernel/bpf/syscall.c +825 and the CHECK_ATTR() macro. The
>>> underlying idea of this was taken from perf_event_open() syscall
>>> back then, see [1] for a summary.
>>>
>>>     [1] https://lkml.org/lkml/2014/8/26/116
>>
>> Yes, I know that's possible, and I like the idea, but I don't think any
>> new interface should come without flags really, as flags are something
>> that will most certainly be needed at some point anyway. I didn't have
>> them in my first shot, but Alexei pointed out that they should be added,
>> and I agree.
>>
>> Also, this optimization wouldn't make the transported struct payload any
>> smaller anyway, because the member of that union used by BPF_PROG_LOAD
>> is still by far the biggest.
>>
>> I really don't think it's worth sparing 8 bytes here and then do the
>> binary compat dance after flags are added, for no real gain.
>
> Sure, but there's not much of a dance needed, see for example how map_flags
> were added some time ago. So, iff there's really no foreseeable use-case in
> sight and since we have this flexibility in place already, then I don't
> quite
> follow why it's needed, if there's zero pain to add it later on. I would
> understand it of course, if it cannot be handled later on anymore.

I agree with Daniel B. Since flags are completely unused right now,
there is no plan to use it for anything in the coming months and
even worse they make annoying hole in the struct, let's not
add them. We can safely do that later. CHECK_ATTR() allows us to
do it easily. It's not like syscall where flags are must have,
since we cannot add it later. Here it's done trivially.
Daniel Mack Sept. 5, 2016, 6:43 p.m. UTC | #12
On 09/05/2016 08:32 PM, Alexei Starovoitov wrote:
> On 9/5/16 10:09 AM, Daniel Borkmann wrote:
>> On 09/05/2016 04:09 PM, Daniel Mack wrote:

>>> I really don't think it's worth sparing 8 bytes here and then do the
>>> binary compat dance after flags are added, for no real gain.
>>
>> Sure, but there's not much of a dance needed, see for example how map_flags
>> were added some time ago. So, iff there's really no foreseeable use-case in
>> sight and since we have this flexibility in place already, then I don't
>> quite
>> follow why it's needed, if there's zero pain to add it later on. I would
>> understand it of course, if it cannot be handled later on anymore.
> 
> I agree with Daniel B. Since flags are completely unused right now,
> there is no plan to use it for anything in the coming months and
> even worse they make annoying hole in the struct, let's not
> add them. We can safely do that later. CHECK_ATTR() allows us to
> do it easily. It's not like syscall where flags are must have,
> since we cannot add it later. Here it's done trivially.

Okay then. If you both agree, I won't interfere :)


Daniel
diff mbox

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1d5db42..4cc2dcf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -73,6 +73,8 @@  enum bpf_cmd {
 	BPF_PROG_LOAD,
 	BPF_OBJ_PIN,
 	BPF_OBJ_GET,
+	BPF_PROG_ATTACH,
+	BPF_PROG_DETACH,
 };
 
 enum bpf_map_type {
@@ -147,6 +149,13 @@  union bpf_attr {
 		__aligned_u64	pathname;
 		__u32		bpf_fd;
 	};
+
+	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
+		__u32		target_fd;	/* container object to attach to */
+		__u32		attach_bpf_fd;	/* eBPF program to attach */
+		__u32		attach_type;	/* BPF_ATTACH_TYPE_* */
+		__u64		attach_flags;
+	};
 } __attribute__((aligned(8)));
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962..cc4d603 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -822,6 +822,79 @@  static int bpf_obj_get(const union bpf_attr *attr)
 	return bpf_obj_get_user(u64_to_ptr(attr->pathname));
 }
 
+#ifdef CONFIG_CGROUP_BPF
+static int bpf_prog_attach(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* Flags are unused for now */
+	if (attr->attach_flags != 0)
+		return -EINVAL;
+
+	switch (attr->attach_type) {
+	case BPF_ATTACH_TYPE_CGROUP_INET_INGRESS:
+	case BPF_ATTACH_TYPE_CGROUP_INET_EGRESS: {
+		struct cgroup *cgrp;
+
+		prog = bpf_prog_get_type(attr->attach_bpf_fd,
+					 BPF_PROG_TYPE_CGROUP_SOCKET_FILTER);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+
+		cgrp = cgroup_get_from_fd(attr->target_fd);
+		if (IS_ERR(cgrp)) {
+			bpf_prog_put(prog);
+			return PTR_ERR(cgrp);
+		}
+
+		cgroup_bpf_update(cgrp, prog, attr->attach_type);
+		cgroup_put(cgrp);
+
+		break;
+	}
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int bpf_prog_detach(const union bpf_attr *attr)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* Flags are unused for now */
+	if (attr->attach_flags != 0)
+		return -EINVAL;
+
+	switch (attr->attach_type) {
+	case BPF_ATTACH_TYPE_CGROUP_INET_INGRESS:
+	case BPF_ATTACH_TYPE_CGROUP_INET_EGRESS: {
+		struct cgroup *cgrp;
+
+		cgrp = cgroup_get_from_fd(attr->target_fd);
+		if (IS_ERR(cgrp))
+			return PTR_ERR(cgrp);
+
+		cgroup_bpf_update(cgrp, NULL, attr->attach_type);
+		cgroup_put(cgrp);
+
+		break;
+	}
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+#endif /* CONFIG_CGROUP_BPF */
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -888,6 +961,16 @@  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_OBJ_GET:
 		err = bpf_obj_get(&attr);
 		break;
+
+#ifdef CONFIG_CGROUP_BPF
+	case BPF_PROG_ATTACH:
+		err = bpf_prog_attach(&attr);
+		break;
+	case BPF_PROG_DETACH:
+		err = bpf_prog_detach(&attr);
+		break;
+#endif
+
 	default:
 		err = -EINVAL;
 		break;