
[net-next,2/3] bpf: Add new cgroups prog type to enable sock modifications

Message ID: 1477434613-3169-3-git-send-email-dsa@cumulusnetworks.com
State: Changes Requested, archived
Delegated to: David Miller

Commit Message

David Ahern Oct. 25, 2016, 10:30 p.m. UTC
Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
BPF_PROG_TYPE_CGROUP_SKB, programs can be attached to a cgroup and run
any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
Currently only sk_bound_dev_if is exported to userspace for modification
by a bpf program.

This allows a cgroup to be configured such that AF_INET{6} sockets opened
by processes are automatically bound to a specific device. In turn, this
enables the running of programs that do not support SO_BINDTODEVICE in a
specific VRF context / L3 domain.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/linux/filter.h   |  2 +-
 include/uapi/linux/bpf.h | 15 ++++++++
 kernel/bpf/cgroup.c      |  9 +++++
 kernel/bpf/syscall.c     |  4 +++
 net/core/filter.c        | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/core/sock.c          |  7 ++++
 6 files changed, 128 insertions(+), 1 deletion(-)
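
For orientation before the comments: a minimal restricted-C sketch of such a program, written against the struct bpf_sock this patch adds to the uapi. The section name is a loader convention and ifindex 42 is a placeholder; per __cgroup_bpf_run_filter_sk_create() in the patch, returning 1 allows the socket and any other value rejects it.

    /* Minimal sketch: bind every new AF_INET{6} socket in the attached
     * cgroup to a fixed device. Compile with clang -target bpf.
     */
    #include <linux/bpf.h>

    __attribute__((section("cgroup/sock"), used))
    int set_bound_dev_if(struct bpf_sock *sk)
    {
            sk->bound_dev_if = 42;  /* placeholder VRF ifindex */
            return 1;               /* 1 = allow socket creation */
    }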

Comments

Daniel Borkmann Oct. 25, 2016, 11:28 p.m. UTC | #1
On 10/26/2016 12:30 AM, David Ahern wrote:
> Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
> BPF_PROG_TYPE_CGROUP_SKB, programs can be attached to a cgroup and run
> any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
> Currently only sk_bound_dev_if is exported to userspace for modification
> by a bpf program.
>
> This allows a cgroup to be configured such that AF_INET{6} sockets opened
> by processes are automatically bound to a specific device. In turn, this
> enables the running of programs that do not support SO_BINDTODEVICE in a
> specific VRF context / L3 domain.
>
> Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
[...]
> @@ -524,6 +535,10 @@ struct bpf_tunnel_key {
>   	__u32 tunnel_label;
>   };
>
> +struct bpf_sock {
> +	__u32 bound_dev_if;
> +};
> +
>   /* User return codes for XDP prog type.
>    * A valid XDP program must return one of these defined values. All other
>    * return codes are reserved for future use. Unknown return codes will result
[...]
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 4552b8c93b99..775802881b01 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2482,6 +2482,27 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
>   	.arg5_type	= ARG_CONST_STACK_SIZE,
>   };
>
> +BPF_CALL_3(bpf_sock_store_u32, struct sock *, sk, u32, offset, u32, val)
> +{
> +	u8 *ptr = (u8 *)sk;
> +
> +	if (unlikely(offset > sizeof(*sk)))
> +		return -EFAULT;
> +
> +	*((u32 *)ptr) = val;
> +
> +	return 0;
> +}

Seems strange to me. So, this helper allows overwriting arbitrary memory
of a struct sock instance. Potentially we could crash the kernel.

And in your sock_filter_convert_ctx_access(), you already implement inline
read/write for the context ...

Your demo code does in pseudocode:

   r1 = sk
   r2 = offsetof(struct bpf_sock, bound_dev_if)
   r3 = idx
   r1->sk_bound_dev_if = idx
   sock_store_u32(r1, r2, r3) // updates sk_bound_dev_if again to idx
   return 1

Dropping that helper from the patch, the only thing a program can do here
is to read/write the sk_bound_dev_if field per cgroup. Hmm ... dunno. So
this really has to be for cgroups v2, right?
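
For reference, minus the redundant helper call, the pseudocode above corresponds to roughly the following raw insns (a sketch; IDX stands in for the target ifindex, and the insn macros come from include/linux/filter.h). The ctx store is the part that sock_filter_convert_ctx_access() rewrites into a write of sk->sk_bound_dev_if:

    struct bpf_insn prog[] = {
            /* r1 = ctx (struct bpf_sock *) on entry */
            BPF_MOV64_IMM(BPF_REG_3, IDX),
            BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
                        offsetof(struct bpf_sock, bound_dev_if)),
            BPF_MOV64_IMM(BPF_REG_0, 1),    /* return 1 = allow */
            BPF_EXIT_INSN(),
    };
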
Eric Dumazet Oct. 25, 2016, 11:39 p.m. UTC | #2
On Tue, 2016-10-25 at 15:30 -0700, David Ahern wrote:
> Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
> BPF_PROG_TYPE_CGROUP_SKB, programs can be attached to a cgroup and run
> any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
> Currently only sk_bound_dev_if is exported to userspace for modification
> by a bpf program.
> 
> This allows a cgroup to be configured such that AF_INET{6} sockets opened
> by processes are automatically bound to a specific device. In turn, this
> enables the running of programs that do not support SO_BINDTODEVICE in a
> specific VRF context / L3 domain.

Does this mean that these programs no longer can use loopback ?
Alexei Starovoitov Oct. 26, 2016, 1:55 a.m. UTC | #3
On Wed, Oct 26, 2016 at 01:28:24AM +0200, Daniel Borkmann wrote:
> On 10/26/2016 12:30 AM, David Ahern wrote:
> >Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
> >BPF_PROG_TYPE_CGROUP_SKB, programs can be attached to a cgroup and run
> >any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
> >Currently only sk_bound_dev_if is exported to userspace for modification
> >by a bpf program.
> >
> >This allows a cgroup to be configured such that AF_INET{6} sockets opened
> >by processes are automatically bound to a specific device. In turn, this
> >enables the running of programs that do not support SO_BINDTODEVICE in a
> >specific VRF context / L3 domain.
> >
> >Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
> [...]
> >@@ -524,6 +535,10 @@ struct bpf_tunnel_key {
> >  	__u32 tunnel_label;
> >  };
> >
> >+struct bpf_sock {
> >+	__u32 bound_dev_if;
> >+};
> >+
> >  /* User return codes for XDP prog type.
> >   * A valid XDP program must return one of these defined values. All other
> >   * return codes are reserved for future use. Unknown return codes will result
> [...]
> >diff --git a/net/core/filter.c b/net/core/filter.c
> >index 4552b8c93b99..775802881b01 100644
> >--- a/net/core/filter.c
> >+++ b/net/core/filter.c
> >@@ -2482,6 +2482,27 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
> >  	.arg5_type	= ARG_CONST_STACK_SIZE,
> >  };
> >
> >+BPF_CALL_3(bpf_sock_store_u32, struct sock *, sk, u32, offset, u32, val)
> >+{
> >+	u8 *ptr = (u8 *)sk;
> >+
> >+	if (unlikely(offset > sizeof(*sk)))
> >+		return -EFAULT;
> >+
> >+	*((u32 *)ptr) = val;
> >+
> >+	return 0;
> >+}
> 
> Seems strange to me. So, this helper allows overwriting arbitrary memory
> of a struct sock instance. Potentially we could crash the kernel.
> 
> And in your sock_filter_convert_ctx_access(), you already implement inline
> read/write for the context ...
> 
> Your demo code does in pseudocode:
> 
>   r1 = sk
>   r2 = offsetof(struct bpf_sock, bound_dev_if)
>   r3 = idx
>   r1->sk_bound_dev_if = idx
>   sock_store_u32(r1, r2, r3) // updates sk_bound_dev_if again to idx
>   return 1
> 
> Dropping that helper from the patch, the only thing a program can do here
> is to read/write the sk_bound_dev_if field per cgroup. Hmm ... dunno. So
> this really has to be for cgroups v2, right?

Looks pretty cool.
Same question as Daniel... why extra helper?
If program overwrites bpf_sock->sk_bound_dev_if can we use that
after program returns?
Also do you think it's possible to extend this patch to prototype
the port bind restrictions that were proposed a few months back using
the same bpf_sock input structure?
Probably the check would need to be moved into a different
place instead of sk_alloc(), but then we'll have more
opportunities to overwrite bound_dev_if, look at ports and so on ?
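
A hedged sketch of how the bpf_sock context could grow to cover such bind-time checks; none of these extra fields are part of this patch:

    struct bpf_sock {
            __u32 bound_dev_if;
            /* hypothetical read-only fields for bind()-time policy */
            __u32 family;
            __u32 type;
            __u32 protocol;
            __u32 src_port;         /* port being bound, host byte order */
    };
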
David Ahern Oct. 26, 2016, 2:05 a.m. UTC | #4
On 10/25/16 5:28 PM, Daniel Borkmann wrote:
>> +BPF_CALL_3(bpf_sock_store_u32, struct sock *, sk, u32, offset, u32, val)
>> +{
>> +    u8 *ptr = (u8 *)sk;
>> +
>> +    if (unlikely(offset > sizeof(*sk)))
>> +        return -EFAULT;
>> +
>> +    *((u32 *)ptr) = val;
>> +
>> +    return 0;
>> +}
> 
> Seems strange to me. So, this helper allows overwriting arbitrary memory
> of a struct sock instance. Potentially we could crash the kernel.
> 
> And in your sock_filter_convert_ctx_access(), you already implement inline
> read/write for the context ...
> 
> Your demo code does in pseudocode:
> 
>   r1 = sk
>   r2 = offsetof(struct bpf_sock, bound_dev_if)
>   r3 = idx
>   r1->sk_bound_dev_if = idx
>   sock_store_u32(r1, r2, r3) // updates sk_bound_dev_if again to idx
>   return 1
> 
> Dropping that helper from the patch, the only thing a program can do here
> is to read/write the sk_bound_dev_if field per cgroup. Hmm ... dunno. So
> this really has to be for cgroups v2, right?

Showing my inexperience with the bpf code. The helper can be dropped. I'll do that for v2.

Yes, Daniel's patch set provides the infra for this one and it has a cgroups v2 limitation.
David Ahern Oct. 26, 2016, 2:21 a.m. UTC | #5
On 10/25/16 5:39 PM, Eric Dumazet wrote:
> On Tue, 2016-10-25 at 15:30 -0700, David Ahern wrote:
>> Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
>> BPF_PROG_TYPE_CGROUP_SKB, programs can be attached to a cgroup and run
>> any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
>> Currently only sk_bound_dev_if is exported to userspace for modification
>> by a bpf program.
>>
>> This allows a cgroup to be configured such that AF_INET{6} sockets opened
>> by processes are automatically bound to a specific device. In turn, this
>> enables the running of programs that do not support SO_BINDTODEVICE in a
>> specific VRF context / L3 domain.
> 
> Does this mean that these programs no longer can use loopback ?

I am probably misunderstanding your question, so I'll ramble a bit and see if I cover it.

This patch set generically allows sk_bound_dev_if to be set to any value. It does not check that an index corresponds to a device at that moment (either bpf prog install or execution of the filter), and even if it did the device can be deleted at any moment. That seems to be standard operating procedure with bpf filters (user mistakes mean packets go nowhere and in this case a socket is bound to a non-existent device).

The index can be any interface (e.g., eth0) or an L3 device (e.g., a VRF). Loopback (index 1) is allowed.

The VRF device is the loopback device for the domain, so binding to it covers addresses on the VRF device as well as interfaces enslaved to it.

Did you mean something else?
David Ahern Oct. 26, 2016, 2:38 a.m. UTC | #6
On 10/25/16 7:55 PM, Alexei Starovoitov wrote:
> Same question as Daniel... why extra helper?

It can be dropped. Wrong path while learning this code.

> If program overwrites bpf_sock->sk_bound_dev_if can we use that
> after program returns?
> Also do you think it's possible to extend this patch to prototype
> the port bind restrictions that were proposed a few months back using
> the same bpf_sock input structure?
> Probably the check would need to be moved into a different
> place instead of sk_alloc(), but then we'll have more
> opportunities to overwrite bound_dev_if, look at ports and so on ?
> 

I think the sk_bound_dev_if should be set when the socket is created versus waiting until it is used (bind, connect, sendmsg, recvmsg). That said, the filter could (should?) be run in the protocol family create function (inet_create and inet6_create) versus sk_alloc. That would allow the filter to allocate a local port based on its logic. I'd prefer interested parties to look into the details of that use case.

I'll move the running of the filter to the end of the create functions for v2.
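
A sketch of that v2 placement at the tail of inet{6}_create(), reusing this patch's cgroup_bpf_run_filter() entry point; the release-on-error path shown here is illustrative:

    /* sketch: tail of inet_create() with the hook moved out of sk_alloc() */
    if (!kern) {
            err = cgroup_bpf_run_filter(sk, NULL,
                                        BPF_CGROUP_INET_SOCK_CREATE);
            if (err)
                    sk_common_release(sk); /* illustrative error path */
    }
    return err;
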
Eric Dumazet Oct. 26, 2016, 2:48 a.m. UTC | #7
On Tue, 2016-10-25 at 20:21 -0600, David Ahern wrote:
> On 10/25/16 5:39 PM, Eric Dumazet wrote:
> > On Tue, 2016-10-25 at 15:30 -0700, David Ahern wrote:
> >> Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
> >> BPF_PROG_TYPE_CGROUP_SKB, programs can be attached to a cgroup and run
> >> any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
> >> Currently only sk_bound_dev_if is exported to userspace for modification
> >> by a bpf program.
> >>
> >> This allows a cgroup to be configured such that AF_INET{6} sockets opened
> >> by processes are automatically bound to a specific device. In turn, this
> >> enables the running of programs that do not support SO_BINDTODEVICE in a
> >> specific VRF context / L3 domain.
> > 
> > Does this mean that these programs no longer can use loopback ?
> 
> I am probably misunderstanding your question, so I'll ramble a bit and
> see if I cover it.
> 
> This patch set generically allows sk_bound_dev_if to be set to any
> value. It does not check that an index corresponds to a device at that
> moment (either bpf prog install or execution of the filter), and even
> if it did the device can be deleted at any moment. That seems to be
> standard operating procedure with bpf filters (user mistakes mean
> packets go nowhere and in this case a socket is bound to a
> non-existent device).
> 
> The index can be any interface (e.g., eth0) or an L3 device (e.g., a
> VRF). Loopback (index 1) is allowed.
> 
> The VRF device is the loopback device for the domain, so binding to it
> covers addresses on the VRF device as well as interfaces enslaved to
> it.
> 
> Did you mean something else?

Maybe I do not understand how you plan to use this.

Let say you want a filter to force a BIND_TO_DEVICE xxx because a task
runs in a cgroup yyy

Then a program doing a socket() + connect (127.0.0.1)  will fail ?

I do not see how a BPF filter at socket() time can be selective.
David Ahern Oct. 26, 2016, 3:09 a.m. UTC | #8
On 10/25/16 8:48 PM, Eric Dumazet wrote:
> Maybe I do not understand how you plan to use this.
> 
> Let say you want a filter to force a BIND_TO_DEVICE xxx because a task
> runs in a cgroup yyy
> 
> Then a program doing a socket() + connect (127.0.0.1)  will fail ?

Maybe. VRF devices can have a 127.0.0.1 address, in which case the connect would succeed. ntpq uses 127.0.0.1 to talk to ntpd, for example. If ntpd is bound to a Management VRF, then you need this context for ntpq to talk to it.

> 
> I do not see how a BPF filter at socket() time can be selective.

Here's my use case - and this is what we are doing today with the l3mdev cgroup (a patch which has not been accepted upstream):

1. create VRF device

2. create cgroup and configure it

   in this case it means loading the bpf program that sets sk_bound_dev_if to the vrf device that was just created (a userspace attach sketch follows below)

3. Add shell to cgroup

   For Management VRF this can be done automatically at login so a user does not need to take any action.

At this point any command run in the shell runs in the VRF context (PS1 for bash can show the VRF, to keep you from going crazy over why a connect fails :-)), so any IPv4/IPv6 sockets have that VRF scope.

For example, if the VRF is a management VRF, sockets opened by apt-get are automatically bound to the VRF at create time, so requests go out the eth0 (management) interface.

In a similar fashion, using a cgroup and assigning tasks to it allows automated launch of systemd services with instances running in a VRF context - one dhcrelay in vrf red, one in vrf blue with both using a parameterized instance file.
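
For step 2, the userspace side could look like the following C sketch: attach a prog fd from BPF_PROG_LOAD to an open cgroup v2 directory fd with BPF_PROG_ATTACH (attr field names per the cgroup-bpf series this patch builds on; error handling omitted):

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    static int attach_sock_prog(int cgroup_fd, int prog_fd)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.target_fd = cgroup_fd;     /* open("/sys/fs/cgroup/...") */
            attr.attach_bpf_fd = prog_fd;   /* fd from BPF_PROG_LOAD */
            attr.attach_type = BPF_CGROUP_INET_SOCK_CREATE;

            return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
    }
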
Daniel Borkmann Oct. 26, 2016, 8:33 a.m. UTC | #9
On 10/26/2016 04:05 AM, David Ahern wrote:
> On 10/25/16 5:28 PM, Daniel Borkmann wrote:
>>> +BPF_CALL_3(bpf_sock_store_u32, struct sock *, sk, u32, offset, u32, val)
>>> +{
>>> +    u8 *ptr = (u8 *)sk;
>>> +
>>> +    if (unlikely(offset > sizeof(*sk)))
>>> +        return -EFAULT;
>>> +
>>> +    *((u32 *)ptr) = val;
>>> +
>>> +    return 0;
>>> +}
>>
>> Seems strange to me. So, this helper allows overwriting arbitrary memory
>> of a struct sock instance. Potentially we could crash the kernel.
>>
>> And in your sock_filter_convert_ctx_access(), you already implement inline
>> read/write for the context ...
>>
>> Your demo code does in pseudocode:
>>
>>    r1 = sk
>>    r2 = offsetof(struct bpf_sock, bound_dev_if)
>>    r3 = idx
>>    r1->sk_bound_dev_if = idx
>>    sock_store_u32(r1, r2, r3) // updates sk_bound_dev_if again to idx
>>    return 1
>>
>> Dropping that helper from the patch, the only thing a program can do here
>> is to read/write the sk_bound_dev_if field per cgroup. Hmm ... dunno. So
>> this really has to be for cgroups v2, right?
>
> Showing my inexperience with the bpf code. The helper can be dropped. I'll do that for v2.
>
> Yes, Daniel's patch set provides the infra for this one and it has a cgroups v2 limitation.

Sure, I understand that, and I know it was brought up at netconf, I'm
just still wondering in general if BPF is a good fit here in the sense
that what the program can do is just really really limited (at least
right now). Hmm, just trying to understand where this would go long term.
Probably okay'ish, if it's guaranteed that it can also integrate various
other use cases as well for the new program type like the ones proposed
by Anoop from net cgroup.

If BPF_PROG_TYPE_CGROUP_SOCK gets reused from more than just the sk_alloc()
hook, programs can then change sk_bound_dev_if from elsewhere as well, since
it's a fixed part of the context, and attaching to the cgroup comes after
the program was verified and a program fd was returned to the user. I guess
that might be expected, right?

I mean non-cooperative processes in that cgroup could already overwrite
the policy set in sk_alloc() anyway with SO_BINDTODEVICE, no? What is the
expectation if processes are moved from one cgroup to another one? Is it
expected that also sk_bound_dev_if updates (not yet seeing how that would
work from a BPF program)? If sk_bound_dev_if is enforced from cgroup side,
should that lock out processes from changing it (maybe similar to what we
do in SOCK_FILTER_LOCKED)?
Thomas Graf Oct. 26, 2016, 8:41 a.m. UTC | #10
On 10/25/16 at 03:30pm, David Ahern wrote:
> @@ -171,6 +177,9 @@ int __cgroup_bpf_run_filter(struct sock *sk,
>  		case BPF_CGROUP_INET_EGRESS:
>  			ret = __cgroup_bpf_run_filter_skb(skb, prog);
>  			break;
> +		case BPF_CGROUP_INET_SOCK_CREATE:
> +			ret = __cgroup_bpf_run_filter_sk_create(sk, prog);
> +			break;
>  		/* make gcc happy else complains about missing enum value */
>  		default:
>  			return 0;

Thinking further ahead of your simple example. Instead of adding yet
another prog type for the same hook, we can make this compatible with
BPF_CGROUP_INET_EGRESS instead, which would then provide a ctx that
contains both the sk and the skb.

The ctx concept is very flexible. We can keep the existing dummy skb
representation and add sk_ fields which are only valid for BPF at
socket layer, e.g skb->sk_bound_dev_if would translate to
sk->bound_dev_if.
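
A hedged illustration of that idea as a read-only ctx-access case: the __sk_buff field sk_bound_dev_if is hypothetical (not in this patch), and the two loads dereference skb->sk before reading sk->sk_bound_dev_if:

    case offsetof(struct __sk_buff, sk_bound_dev_if):
            *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                  dst_reg, src_reg,
                                  offsetof(struct sk_buff, sk));
            *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
                                  offsetof(struct sock, sk_bound_dev_if));
            break;
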
David Ahern Oct. 26, 2016, 3:44 p.m. UTC | #11
On 10/26/16 2:33 AM, Daniel Borkmann wrote:
> Sure, I understand that, and I know it was brought up at netconf, I'm
> just still wondering in general if BPF is a good fit here in the sense
> that what the program can do is just really really limited (at least
> right now). Hmm, just trying to understand where this would go long term.
> Probably okay'ish, if it's guaranteed that it can also integrate various
> other use cases as well for the new program type like the ones proposed
> by Anoop from net cgroup.
> 
> If that would reuse BPF_PROG_TYPE_CGROUP_SOCK from not only sk_alloc()
> hook, programs can thus change sk_bound_dev_if also from elsewhere since
> it's a fixed part of the context, and attaching to the cgroup comes after
> program was verified and returned a program fd back to the user. I guess
> it might be expected, right?

Sure.

> 
> I mean non-cooperative processes in that cgroup could already overwrite
> the policy set in sk_alloc() anyway with SO_BINDTODEVICE, no? What is the

Yes. If a process running as root wants to change the binding, it can. For example, if a shell is set in the Management VRF context and the user wants to ping out a data plane port, the -I arg would do that.

> expectation if processes are moved from one cgroup to another one? Is it 
> expected that also sk_bound_dev_if updates (not yet seeing how that would
> work from a BPF program)? If sk_bound_dev_if is enforced from cgroup side,
> should that lock out processes from changing it (maybe similar to what we
> do in SOCK_FILTER_LOCKED)?

Existing sockets would not be affected by the cgroup program.
David Ahern Oct. 26, 2016, 4:08 p.m. UTC | #12
On 10/26/16 2:41 AM, Thomas Graf wrote:
> On 10/25/16 at 03:30pm, David Ahern wrote:
>> @@ -171,6 +177,9 @@ int __cgroup_bpf_run_filter(struct sock *sk,
>>  		case BPF_CGROUP_INET_EGRESS:
>>  			ret = __cgroup_bpf_run_filter_skb(skb, prog);
>>  			break;
>> +		case BPF_CGROUP_INET_SOCK_CREATE:
>> +			ret = __cgroup_bpf_run_filter_sk_create(sk, prog);
>> +			break;
>>  		/* make gcc happy else complains about missing enum value */
>>  		default:
>>  			return 0;
> 
> Thinking further ahead of your simple example. Instead of adding yet
> another prog type for the same hook, we can make this compatible with
> BPF_CGROUP_INET_EGRESS instead, which would then provide a ctx that
> contains both the sk and the skb.
> 
> The ctx concept is very flexible. We can keep the existing dummy skb
> representation and add sk_ fields which are only valid for BPF at
> socket layer, e.g skb->sk_bound_dev_if would translate to
> sk->bound_dev_if.
> 

It's an odd user semantic to me to put sock elements into the shadow sk_buff and to reuse BPF_CGROUP_INET_EGRESS. 

I can drop the _CREATE and just make it BPF_CGROUP_INET_SOCK so it works for any sock modification someone wants to add -- e.g., the port binding use case.
Thomas Graf Oct. 26, 2016, 6:57 p.m. UTC | #13
On 10/26/16 at 10:08am, David Ahern wrote:
> On 10/26/16 2:41 AM, Thomas Graf wrote:
> > On 10/25/16 at 03:30pm, David Ahern wrote:
> >> @@ -171,6 +177,9 @@ int __cgroup_bpf_run_filter(struct sock *sk,
> >>  		case BPF_CGROUP_INET_EGRESS:
> >>  			ret = __cgroup_bpf_run_filter_skb(skb, prog);
> >>  			break;
> >> +		case BPF_CGROUP_INET_SOCK_CREATE:
> >> +			ret = __cgroup_bpf_run_filter_sk_create(sk, prog);
> >> +			break;
> >>  		/* make gcc happy else complains about missing enum value */
> >>  		default:
> >>  			return 0;
> > 
> > Thinking further ahead of your simple example. Instead of adding yet
> > another prog type for the same hook, we can make this compatible with
> > BPF_CGROUP_INET_EGRESS instead, which would then provide a ctx that
> > contains both the sk and the skb.
> > 
> > The ctx concept is very flexible. We can keep the existing dummy skb
> > representation and add sk_ fields which are only valid for BPF at
> > socket layer, e.g skb->sk_bound_dev_if would translate to
> > sk->bound_dev_if.
> > 
> 
> It's an odd user semantic to me to put sock elements into the shadow sk_buff and to reuse BPF_CGROUP_INET_EGRESS. 
> 
> I can drop the _CREATE and just make it BPF_CGROUP_INET_SOCK so it works for any sock modification someone wants to add -- e.g., the port binding use case.

Your specific sk_alloc hook won't have an skb but the majority of
socket BPF programs will want to see both skb and sk. It is not
ideal to introduce a new bpf_prog_type for every minimal difference
in capability if the majority of capabilities and restrictions are
shared. Instead, the subtype approach implemented by the Landlock
series looks much cleaner to me.

I think we should think about how to do this properly for all BPF
programs which operate in socket cgroup context.
David Ahern Oct. 26, 2016, 8:42 p.m. UTC | #14
On 10/26/16 2:31 PM, Mahesh Bandewar (महेश बंडेवार) wrote:
> The hook insertion in sk_alloc() may not solve all control-path checks, as not much can be done during allocation (probably apart from changing sk_bound_dev_if); hooks in bind(), listen(), setsockopt(), etc. (not a complete list) will be needed. Also, by pushing this change (changing sk_bound_dev_if) later than sk_alloc(), you might avoid the case dumazet@ has mentioned.

For v2 I moved the running of the filter to the end of inet{6}_create.

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1f09c521adfe..808e158742a2 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -408,7 +408,7 @@  struct bpf_prog {
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
-	unsigned int		(*bpf_func)(const struct sk_buff *skb,
+	unsigned int		(*bpf_func)(const void *ctx,
 					    const struct bpf_insn *filter);
 	/* Instructions for interpreter */
 	union {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6b62ee9a2f78..ce5283f221e7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -99,11 +99,13 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
 	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_CGROUP_SOCK,
 };
 
 enum bpf_attach_type {
 	BPF_CGROUP_INET_INGRESS,
 	BPF_CGROUP_INET_EGRESS,
+	BPF_CGROUP_INET_SOCK_CREATE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -449,6 +451,15 @@  enum bpf_func_id {
 	 */
 	BPF_FUNC_get_numa_node_id,
 
+	/**
+	 * sock_store_u32(sk, offset, val) - store bytes into sock
+	 * @sk: pointer to sock
+	 * @offset: offset within sock
+	 * @val: value to write
+	 * Return: 0 on success
+	 */
+	BPF_FUNC_sock_store_u32,
+
 	__BPF_FUNC_MAX_ID,
 };
 
@@ -524,6 +535,10 @@  struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+struct bpf_sock {
+	__u32 bound_dev_if;
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 918c01a6f129..4fcb58013a3a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -117,6 +117,12 @@  void __cgroup_bpf_update(struct cgroup *cgrp,
 	}
 }
 
+static int __cgroup_bpf_run_filter_sk_create(struct sock *sk,
+					     struct bpf_prog *prog)
+{
+	return prog->bpf_func(sk, prog->insnsi) == 1 ? 0 : -EPERM;
+}
+
 static int __cgroup_bpf_run_filter_skb(struct sk_buff *skb,
 				       struct bpf_prog *prog)
 {
@@ -171,6 +177,9 @@  int __cgroup_bpf_run_filter(struct sock *sk,
 		case BPF_CGROUP_INET_EGRESS:
 			ret = __cgroup_bpf_run_filter_skb(skb, prog);
 			break;
+		case BPF_CGROUP_INET_SOCK_CREATE:
+			ret = __cgroup_bpf_run_filter_sk_create(sk, prog);
+			break;
 		/* make gcc happy else complains about missing enum value */
 		default:
 			return 0;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9abc88deabbc..3b7e30e28cd3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -844,6 +844,9 @@  static int bpf_prog_attach(const union bpf_attr *attr)
 		ptype = BPF_PROG_TYPE_CGROUP_SKB;
 		break;
 
+	case BPF_CGROUP_INET_SOCK_CREATE:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -879,6 +882,7 @@  static int bpf_prog_detach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+	case BPF_CGROUP_INET_SOCK_CREATE:
 		cgrp = cgroup_get_from_fd(attr->target_fd);
 		if (IS_ERR(cgrp))
 			return PTR_ERR(cgrp);
diff --git a/net/core/filter.c b/net/core/filter.c
index 4552b8c93b99..775802881b01 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2482,6 +2482,27 @@  static const struct bpf_func_proto bpf_xdp_event_output_proto = {
 	.arg5_type	= ARG_CONST_STACK_SIZE,
 };
 
+BPF_CALL_3(bpf_sock_store_u32, struct sock *, sk, u32, offset, u32, val)
+{
+	u8 *ptr = (u8 *)sk;
+
+	if (unlikely(offset > sizeof(*sk)))
+		return -EFAULT;
+
+	*((u32 *)ptr) = val;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_store_u32_proto = {
+	.func		= bpf_sock_store_u32,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 sk_filter_func_proto(enum bpf_func_id func_id)
 {
@@ -2593,6 +2614,17 @@  cg_skb_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+cg_sock_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_sock_store_u32:
+		return &bpf_sock_store_u32_proto;
+	default:
+		return NULL;
+	}
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -2630,6 +2662,30 @@  static bool sk_filter_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool sock_filter_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					enum bpf_reg_type *reg_type)
+{
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct bpf_sock, bound_dev_if):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	if (off < 0 || off >= sizeof(struct bpf_sock))
+		return false;
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+	if (size != sizeof(__u32))
+		return false;
+
+	return true;
+}
+
 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 			       const struct bpf_prog *prog)
 {
@@ -2888,6 +2944,30 @@  static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
+					  int dst_reg, int src_reg,
+					  int ctx_off,
+					  struct bpf_insn *insn_buf,
+					  struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (ctx_off) {
+	case offsetof(struct bpf_sock, bound_dev_if):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+					offsetof(struct sock, sk_bound_dev_if));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sock, sk_bound_dev_if));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 					 int src_reg, int ctx_off,
 					 struct bpf_insn *insn_buf,
@@ -2961,6 +3041,12 @@  static const struct bpf_verifier_ops cg_skb_ops = {
 	.convert_ctx_access	= sk_filter_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops cg_sock_ops = {
+	.get_func_proto		= cg_sock_func_proto,
+	.is_valid_access	= sock_filter_is_valid_access,
+	.convert_ctx_access	= sock_filter_convert_ctx_access,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
@@ -2986,6 +3072,11 @@  static struct bpf_prog_type_list cg_skb_type __read_mostly = {
 	.type	= BPF_PROG_TYPE_CGROUP_SKB,
 };
 
+static struct bpf_prog_type_list cg_sock_type __read_mostly = {
+	.ops	= &cg_sock_ops,
+	.type	= BPF_PROG_TYPE_CGROUP_SOCK
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
@@ -2993,6 +3084,7 @@  static int __init register_sk_filter_ops(void)
 	bpf_register_prog_type(&sched_act_type);
 	bpf_register_prog_type(&xdp_type);
 	bpf_register_prog_type(&cg_skb_type);
+	bpf_register_prog_type(&cg_sock_type);
 
 	return 0;
 }
diff --git a/net/core/sock.c b/net/core/sock.c
index d8e4532e89e7..936f221cc6c6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1404,6 +1404,13 @@  struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		cgroup_sk_alloc(&sk->sk_cgrp_data);
 		sock_update_classid(&sk->sk_cgrp_data);
 		sock_update_netprioidx(&sk->sk_cgrp_data);
+
+		if (!kern &&
+		    cgroup_bpf_run_filter(sk, NULL,
+					  BPF_CGROUP_INET_SOCK_CREATE)) {
+			sk_free(sk);
+			sk = NULL;
+		}
 	}
 
 	return sk;