diff mbox

[net-next,v3,07/15] bpf: Add setsockopt helper function to bpf

Message ID 20170620030048.3275347-8-brakmo@fb.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Lawrence Brakmo June 20, 2017, 3 a.m. UTC
Added support for calling a subset of socket setsockopts from
BPF_PROG_TYPE_SOCK_OPS programs. The code was duplicated rather
than making the changes to call the socket setsockopt function because
the changes required would have been larger.

The ops supported are:
  SO_RCVBUF
  SO_SNDBUF
  SO_MAX_PACING_RATE
  SO_PRIORITY
  SO_RCVLOWAT
  SO_MARK

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h  | 14 ++++++++-
 net/core/filter.c         | 77 ++++++++++++++++++++++++++++++++++++++++++++++-
 samples/bpf/bpf_helpers.h |  3 ++
 3 files changed, 92 insertions(+), 2 deletions(-)

Comments

Craig Gallek June 20, 2017, 9:25 p.m. UTC | #1
On Mon, Jun 19, 2017 at 11:00 PM, Lawrence Brakmo <brakmo@fb.com> wrote:
> Added support for calling a subset of socket setsockopts from
> BPF_PROG_TYPE_SOCK_OPS programs. The code was duplicated rather
> than making the changes to call the socket setsockopt function because
> the changes required would have been larger.
>
> @@ -2671,6 +2672,69 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
>         .arg1_type      = ARG_PTR_TO_CTX,
>  };
>
> +BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
> +          int, level, int, optname, char *, optval, int, optlen)
> +{
> +       struct sock *sk = bpf_sock->sk;
> +       int ret = 0;
> +       int val;
> +
> +       if (bpf_sock->is_req_sock)
> +               return -EINVAL;
> +
> +       if (level == SOL_SOCKET) {
> +               /* Only some socketops are supported */
> +               val = *((int *)optval);
> +
> +               switch (optname) {
> +               case SO_RCVBUF:
> +                       sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
> +                       sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
> +                       break;
> +               case SO_SNDBUF:
> +                       sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
> +                       sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
> +                       break;
> +               case SO_MAX_PACING_RATE:
> +                       sk->sk_max_pacing_rate = val;
> +                       sk->sk_pacing_rate = min(sk->sk_pacing_rate,
> +                                                sk->sk_max_pacing_rate);
> +                       break;
> +               case SO_PRIORITY:
> +                       sk->sk_priority = val;
> +                       break;
> +               case SO_RCVLOWAT:
> +                       if (val < 0)
> +                               val = INT_MAX;
> +                       sk->sk_rcvlowat = val ? : 1;
> +                       break;
> +               case SO_MARK:
> +                       sk->sk_mark = val;
> +                       break;

Isn't the socket lock required when manipulating these fields?  It's
not obvious that the lock is held from every bpf hook point that could
trigger this function...
Lawrence Brakmo June 21, 2017, 4:51 p.m. UTC | #2
On 6/20/17, 2:25 PM, "Craig Gallek" <kraigatgoog@gmail.com> wrote:

    On Mon, Jun 19, 2017 at 11:00 PM, Lawrence Brakmo <brakmo@fb.com> wrote:
    > Added support for calling a subset of socket setsockopts from

    > BPF_PROG_TYPE_SOCK_OPS programs. The code was duplicated rather

    > than making the changes to call the socket setsockopt function because

    > the changes required would have been larger.

    >

    > @@ -2671,6 +2672,69 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {

    >         .arg1_type      = ARG_PTR_TO_CTX,

    >  };

    >

    > +BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,

    > +          int, level, int, optname, char *, optval, int, optlen)

    > +{

    > +       struct sock *sk = bpf_sock->sk;

    > +       int ret = 0;

    > +       int val;

    > +

    > +       if (bpf_sock->is_req_sock)

    > +               return -EINVAL;

    > +

    > +       if (level == SOL_SOCKET) {

    > +               /* Only some socketops are supported */

    > +               val = *((int *)optval);

    > +

    > +               switch (optname) {

    > +               case SO_RCVBUF:

    > +                       sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

    > +                       sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);

    > +                       break;

    > +               case SO_SNDBUF:

    > +                       sk->sk_userlocks |= SOCK_SNDBUF_LOCK;

    > +                       sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);

    > +                       break;

    > +               case SO_MAX_PACING_RATE:

    > +                       sk->sk_max_pacing_rate = val;

    > +                       sk->sk_pacing_rate = min(sk->sk_pacing_rate,

    > +                                                sk->sk_max_pacing_rate);

    > +                       break;

    > +               case SO_PRIORITY:

    > +                       sk->sk_priority = val;

    > +                       break;

    > +               case SO_RCVLOWAT:

    > +                       if (val < 0)

    > +                               val = INT_MAX;

    > +                       sk->sk_rcvlowat = val ? : 1;

    > +                       break;

    > +               case SO_MARK:

    > +                       sk->sk_mark = val;

    > +                       break;

    
    Isn't the socket lock required when manipulating these fields?  It's
    not obvious that the lock is held from every bpf hook point that could
    trigger this function...
    
The sock_ops BPF programs are being called from within the network
stack and my understanding is that the lock has already been taken. 
Currently they are only called:
(1) after a packet is received, where there is the call to
bh_lock_sock_nested() in tcp_v4_rcv() before calling
tcp_v4_do_rcv().
(2) in tcp_connect(), where there should be no issue

Just in case I added a check “sock_owned_by_me(sk)” in tcp_call_bpf()
Do you think this is enough, or should I explicitly add a bh_lock_sock_nested
in the bpf_setsockopt function?
Craig Gallek June 21, 2017, 5:13 p.m. UTC | #3
On Wed, Jun 21, 2017 at 12:51 PM, Lawrence Brakmo <brakmo@fb.com> wrote:
>
> On 6/20/17, 2:25 PM, "Craig Gallek" <kraigatgoog@gmail.com> wrote:
>
>     On Mon, Jun 19, 2017 at 11:00 PM, Lawrence Brakmo <brakmo@fb.com> wrote:
>     > Added support for calling a subset of socket setsockopts from
>     > BPF_PROG_TYPE_SOCK_OPS programs. The code was duplicated rather
>     > than making the changes to call the socket setsockopt function because
>     > the changes required would have been larger.
>     >
>     > @@ -2671,6 +2672,69 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
>     >         .arg1_type      = ARG_PTR_TO_CTX,
>     >  };
>     >
>     > +BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
>     > +          int, level, int, optname, char *, optval, int, optlen)
>     > +{
>     > +       struct sock *sk = bpf_sock->sk;
>     > +       int ret = 0;
>     > +       int val;
>     > +
>     > +       if (bpf_sock->is_req_sock)
>     > +               return -EINVAL;
>     > +
>     > +       if (level == SOL_SOCKET) {
>     > +               /* Only some socketops are supported */
>     > +               val = *((int *)optval);
>     > +
>     > +               switch (optname) {
>     > +               case SO_RCVBUF:
>     > +                       sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
>     > +                       sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
>     > +                       break;
>     > +               case SO_SNDBUF:
>     > +                       sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
>     > +                       sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
>     > +                       break;
>     > +               case SO_MAX_PACING_RATE:
>     > +                       sk->sk_max_pacing_rate = val;
>     > +                       sk->sk_pacing_rate = min(sk->sk_pacing_rate,
>     > +                                                sk->sk_max_pacing_rate);
>     > +                       break;
>     > +               case SO_PRIORITY:
>     > +                       sk->sk_priority = val;
>     > +                       break;
>     > +               case SO_RCVLOWAT:
>     > +                       if (val < 0)
>     > +                               val = INT_MAX;
>     > +                       sk->sk_rcvlowat = val ? : 1;
>     > +                       break;
>     > +               case SO_MARK:
>     > +                       sk->sk_mark = val;
>     > +                       break;
>
>     Isn't the socket lock required when manipulating these fields?  It's
>     not obvious that the lock is held from every bpf hook point that could
>     trigger this function...
>
> The sock_ops BPF programs are being called from within the network
> stack and my understanding is that  lock has already been taken.
> Currently they are only called:
> (1) after a packet is received, where there is the call to
> bh_lock_sock_nested() in tcp_v4_rcv() before calling
> tcp_v4_do_rcv().
> (2) in tcp_connect(), where there should be no issue
Someone who understands the TCP stack better than I should verify
this, but even if it's OK to do in these specific spots, it's not
unreasonable to believe that someone will add another socket-context
bpf hook in the future where it would not be safe.  Without some
additional check to prevent this setsockopt function from being called
in those spots, we could run into trouble.  The only other
socket-context point currently is the cgroup one, which happens during
socket creation and should also be safe.

> Just in case I added a check “sock_owned_by_me(sk)” in tcp_call_bpf()
> Do you think this is enough, or should I explicitly add a bh_lock_sock_nested
> in the bpf_setsockopt function?
Adding the check is certainly a way to test the behavior as
implemented, but this bpf function could be called by any
socket-context bpf (not just the tcp_call_bpf ones).  I believe the
current bpf hook points only guarantee RCU read-side lock.  Adding an
additional lock guarantee may have undesirable performance
implications.  If this is just for socket creation or other rare
events it's probably not a big deal, but if it's for a hook in the
fast path it's probably a non-starter.

I guess the higher level question is what should the locking
guarantees for socket-context bpf programs be?
Lawrence Brakmo June 21, 2017, 11:55 p.m. UTC | #4
On 6/21/17, 10:13 AM, "Craig Gallek" <kraigatgoog@gmail.com> wrote:

    On Wed, Jun 21, 2017 at 12:51 PM, Lawrence Brakmo <brakmo@fb.com> wrote:
    >

    > On 6/20/17, 2:25 PM, "Craig Gallek" <kraigatgoog@gmail.com> wrote:

    >

    >     On Mon, Jun 19, 2017 at 11:00 PM, Lawrence Brakmo <brakmo@fb.com> wrote:

    >     > Added support for calling a subset of socket setsockopts from

    >     > BPF_PROG_TYPE_SOCK_OPS programs. The code was duplicated rather

    >     > than making the changes to call the socket setsockopt function because

    >     > the changes required would have been larger.

    >     >

    >     > @@ -2671,6 +2672,69 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {

    >     >         .arg1_type      = ARG_PTR_TO_CTX,

    >     >  };

    >     >

    >     > +BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,

    >     > +          int, level, int, optname, char *, optval, int, optlen)

    >     > +{

    >     > +       struct sock *sk = bpf_sock->sk;

    >     > +       int ret = 0;

    >     > +       int val;

    >     > +

    >     > +       if (bpf_sock->is_req_sock)

    >     > +               return -EINVAL;

    >     > +

    >     > +       if (level == SOL_SOCKET) {

    >     > +               /* Only some socketops are supported */

    >     > +               val = *((int *)optval);

    >     > +

    >     > +               switch (optname) {

    >     > +               case SO_RCVBUF:

    >     > +                       sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

    >     > +                       sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);

    >     > +                       break;

    >     > +               case SO_SNDBUF:

    >     > +                       sk->sk_userlocks |= SOCK_SNDBUF_LOCK;

    >     > +                       sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);

    >     > +                       break;

    >     > +               case SO_MAX_PACING_RATE:

    >     > +                       sk->sk_max_pacing_rate = val;

    >     > +                       sk->sk_pacing_rate = min(sk->sk_pacing_rate,

    >     > +                                                sk->sk_max_pacing_rate);

    >     > +                       break;

    >     > +               case SO_PRIORITY:

    >     > +                       sk->sk_priority = val;

    >     > +                       break;

    >     > +               case SO_RCVLOWAT:

    >     > +                       if (val < 0)

    >     > +                               val = INT_MAX;

    >     > +                       sk->sk_rcvlowat = val ? : 1;

    >     > +                       break;

    >     > +               case SO_MARK:

    >     > +                       sk->sk_mark = val;

    >     > +                       break;

    >

    >     Isn't the socket lock required when manipulating these fields?  It's

    >     not obvious that the lock is held from every bpf hook point that could

    >     trigger this function...

    >

    > The sock_ops BPF programs are being called from within the network

    > stack and my understanding is that  lock has already been taken.

    > Currently they are only called:

    > (1) after a packet is received, where there is the call to

    > bh_lock_sock_nested() in tcp_v4_rcv() before calling

    > tcp_v4_do_rcv().

    > (2) in tcp_connect(), where there should be no issue

    Someone who understands the TCP stack better than I should verify
    this, but even if it's OK to do in these specific spots, it's not
    unreasonable to believe that someone will add another socket-context
    bpf hook in the future where it would not be safe.  Without some
    additional check to prevent this setsockopt function from being called
    in those spots, we could run into trouble.  The only other
    socket-context point currently is the cgroup one, which happens during
    socket creation and should also be safe.

The cgroup socket (BPF_PROG_TYPE_CGROUP_SOCK) is a different prog type
than BPF_PROG_TYPE_SOCK_OPS and it cannot call the bpf_setsockops function.
And it should not because the context arguments are different (struct sock *
vs. struct bpf_sock_ops_kern *) 
  
    > Just in case I added a check “sock_owned_by_me(sk)” in tcp_call_bpf()

    > Do you think this is enough, or should I explicitly add a bh_lock_sock_nested

    > in the bpf_setsockopt function?

    Adding the check is certainly a way to test the behavior as
    implemented, but this bpf function could be called by any
    socket-context bpf (not just the tcp_call_bpf ones).  I believe the

Can only be called by tcp_call_bpf ones

    current bpf hook points only guarantee RCU read-side lock.  Adding an
    additional lock guarantee may have undesirable performance
    implications.  If this is just for socket creation or other rare
    events it's probably not a big deal, but if it's for a hook in the
    fast path it's probably a non-starter.

I agree
    
    I guess the higher level question is what should the locking
    guarantees for socket-context bpf programs be?

Since the whole point of this new bpf prog type is to be able to
modify sock and TCP parameters, we should ensure that
tcp_call_bpf is only called when it is safe to change tcp sock state
(either because the sock lock is held or because it is safe for
other reasons).
diff mbox

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 314fdf3..86595f9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -520,6 +520,17 @@  union bpf_attr {
  *     Set full skb->hash.
  *     @skb: pointer to skb
  *     @hash: hash to set
+ *
+ * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
+ *     Calls setsockopt. Not all opts are available, only those with
+ *     integer optvals plus TCP_CONGESTION.
+ *     Supported levels: SOL_SOCKET and IPPROTO_TCP
+ *     @bpf_socket: pointer to bpf_socket
+ *     @level: SOL_SOCKET or IPPROTO_TCP
+ *     @optname: option name
+ *     @optval: pointer to option value
+ *     @optlen: length of optval in bytes
+ *     Return: 0 or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -570,7 +581,8 @@  union bpf_attr {
 	FN(probe_read_str),		\
 	FN(get_socket_cookie),		\
 	FN(get_socket_uid),		\
-	FN(set_hash),
+	FN(set_hash),			\
+	FN(setsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 7d69d16..b114ae1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -54,6 +54,7 @@ 
 #include <net/dst.h>
 #include <net/sock_reuseport.h>
 #include <net/busy_poll.h>
+#include <net/tcp.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -2671,6 +2672,69 @@  static const struct bpf_func_proto bpf_get_socket_uid_proto = {
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+	   int, level, int, optname, char *, optval, int, optlen)
+{
+	struct sock *sk = bpf_sock->sk;
+	int ret = 0;
+	int val;
+
+	if (bpf_sock->is_req_sock)
+		return -EINVAL;
+
+	if (level == SOL_SOCKET) {
+		/* Only some socketops are supported */
+		val = *((int *)optval);
+
+		switch (optname) {
+		case SO_RCVBUF:
+			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+			sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+			break;
+		case SO_SNDBUF:
+			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+			sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+			break;
+		case SO_MAX_PACING_RATE:
+			sk->sk_max_pacing_rate = val;
+			sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+						 sk->sk_max_pacing_rate);
+			break;
+		case SO_PRIORITY:
+			sk->sk_priority = val;
+			break;
+		case SO_RCVLOWAT:
+			if (val < 0)
+				val = INT_MAX;
+			sk->sk_rcvlowat = val ? : 1;
+			break;
+		case SO_MARK:
+			sk->sk_mark = val;
+			break;
+		default:
+			ret = -EINVAL;
+		}
+	} else if (level == SOL_TCP &&
+		   sk->sk_prot->setsockopt == tcp_setsockopt) {
+		/* Place holder */
+		ret = -EINVAL;
+	} else {
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_setsockopt_proto = {
+	.func		= bpf_setsockopt,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -2822,6 +2886,17 @@  lwt_inout_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
+	sock_ops_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_setsockopt:
+		return &bpf_setsockopt_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
 lwt_xmit_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -3578,7 +3653,7 @@  const struct bpf_verifier_ops cg_sock_prog_ops = {
 };
 
 const struct bpf_verifier_ops sock_ops_prog_ops = {
-	.get_func_proto		= bpf_base_func_proto,
+	.get_func_proto		= sock_ops_func_proto,
 	.is_valid_access	= sock_ops_is_valid_access,
 	.convert_ctx_access	= sock_ops_convert_ctx_access,
 };
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index f4840b8..d50ac34 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -60,6 +60,9 @@  static unsigned long long (*bpf_get_prandom_u32)(void) =
 	(void *) BPF_FUNC_get_prandom_u32;
 static int (*bpf_xdp_adjust_head)(void *ctx, int offset) =
 	(void *) BPF_FUNC_xdp_adjust_head;
+static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval,
+			     int optlen) =
+	(void *) BPF_FUNC_setsockopt;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions