diff mbox series

[net-next] bpf: Add access to snd_cwnd and others in sock_ops

Message ID 20171201181504.1040223-1-brakmo@fb.com
State Accepted, archived
Delegated to: BPF Maintainers
Headers show
Series [net-next] bpf: Add access to snd_cwnd and others in sock_ops | expand

Commit Message

Lawrence Brakmo Dec. 1, 2017, 6:15 p.m. UTC
Adds read access to snd_cwnd and srtt_us fields of tcp_sock. Since these
fields are only valid if the socket associated with the sock_ops program
call is a full socket, the field is_fullsock is also added to the
bpf_sock_ops struct. If the socket is not a full socket, reading these
fields returns 0.

Note that in most cases it will not be necessary to check is_fullsock to
know if there is a full socket. The context of the call, as specified by
the 'op' field, can sometimes determine whether there is a full socket.

The struct bpf_sock_ops has the following fields added:

  __u32 is_fullsock;      /* Some TCP fields are only valid if
                           * there is a full socket. If not, the
                           * fields read as zero.
			   */
  __u32 snd_cwnd;
  __u32 srtt_us;          /* Averaged RTT << 3 in usecs */

There is a new macro, SOCK_OPS_GET_TCP32(NAME), to make it easier to add
read access to more 32 bit tcp_sock fields.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/linux/filter.h   |  1 +
 include/net/tcp.h        |  6 ++++--
 include/uapi/linux/bpf.h |  6 ++++++
 net/core/filter.c        | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 2 deletions(-)

Comments

Alexei Starovoitov Dec. 3, 2017, 6:26 p.m. UTC | #1
On Fri, Dec 01, 2017 at 10:15:04AM -0800, Lawrence Brakmo wrote:
> Adds read access to snd_cwnd and srtt_us fields of tcp_sock. Since these
> fields are only valid if the socket associated with the sock_ops program
> call is a full socket, the field is_fullsock is also added to the
> bpf_sock_ops struct. If the socket is not a full socket, reading these
> fields returns 0.
> 
> Note that in most cases it will not be necessary to check is_fullsock to
> know if there is a full socket. The context of the call, as specified by
> the 'op' field, can sometimes determine whether there is a full socket.
> 
> The struct bpf_sock_ops has the following fields added:
> 
>   __u32 is_fullsock;      /* Some TCP fields are only valid if
>                            * there is a full socket. If not, the
>                            * fields read as zero.
> 			   */
>   __u32 snd_cwnd;
>   __u32 srtt_us;          /* Averaged RTT << 3 in usecs */
> 
> There is a new macro, SOCK_OPS_GET_TCP32(NAME), to make it easier to add
> read access to more 32 bit tcp_sock fields.
> 
> Signed-off-by: Lawrence Brakmo <brakmo@fb.com>

lgtm
Acked-by: Alexei Starovoitov <ast@kernel.org>
will wait for Daniel to either ack it or apply it.
Daniel Borkmann Dec. 4, 2017, 10:32 p.m. UTC | #2
Hi Lawrence,

On 12/01/2017 07:15 PM, Lawrence Brakmo wrote:
> Adds read access to snd_cwnd and srtt_us fields of tcp_sock. Since these
> fields are only valid if the socket associated with the sock_ops program
> call is a full socket, the field is_fullsock is also added to the
> bpf_sock_ops struct. If the socket is not a full socket, reading these
> fields returns 0.
> 
> Note that in most cases it will not be necessary to check is_fullsock to
> know if there is a full socket. The context of the call, as specified by
> the 'op' field, can sometimes determine whether there is a full socket.
> 
> The struct bpf_sock_ops has the following fields added:
> 
>   __u32 is_fullsock;      /* Some TCP fields are only valid if
>                            * there is a full socket. If not, the
>                            * fields read as zero.
> 			   */
>   __u32 snd_cwnd;
>   __u32 srtt_us;          /* Averaged RTT << 3 in usecs */
> 
> There is a new macro, SOCK_OPS_GET_TCP32(NAME), to make it easier to add
> read access to more 32 bit tcp_sock fields.
> 
> Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
> ---
>  include/linux/filter.h   |  1 +
>  include/net/tcp.h        |  6 ++++--
>  include/uapi/linux/bpf.h |  6 ++++++
>  net/core/filter.c        | 36 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 47 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 80b5b48..0062302 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -985,6 +985,7 @@ struct bpf_sock_ops_kern {
>  		u32 reply;
>  		u32 replylong[4];
>  	};
> +	u32	is_fullsock;
>  };
>  
>  #endif /* __LINUX_FILTER_H__ */
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 4e09398..89a6560 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -2012,10 +2012,12 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
>  	struct bpf_sock_ops_kern sock_ops;
>  	int ret;
>  
> -	if (sk_fullsock(sk))
> +	memset(&sock_ops, 0, sizeof(sock_ops));
> +	if (sk_fullsock(sk)) {
> +		sock_ops.is_fullsock = 1;
>  		sock_owned_by_me(sk);
> +	}
>  
> -	memset(&sock_ops, 0, sizeof(sock_ops));
>  	sock_ops.sk = sk;
>  	sock_ops.op = op;
>  
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 4c223ab..80d62e8 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -941,6 +941,12 @@ struct bpf_sock_ops {
>  	__u32 local_ip6[4];	/* Stored in network byte order */
>  	__u32 remote_port;	/* Stored in network byte order */
>  	__u32 local_port;	/* stored in host byte order */
> +	__u32 is_fullsock;	/* Some TCP fields are only valid if
> +				 * there is a full socket. If not, the
> +				 * fields read as zero.
> +				 */

Just a question for clarification: do you need to have this in
the uapi struct as well? Meaning, do you have a specific use case
where you do this check out of the BPF program given the below
two snd_cwnd/srtt_us check this internally in the ctx rewrite?

Do you plan to reuse the ctx assignment also for bpf_setsockopt()
and bpf_getsockopt() internally?

> +	__u32 snd_cwnd;
> +	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
>  };
>  
>  /* List of known BPF sock_ops operators.
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 6a85e67..bf31236 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4437,6 +4437,42 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
>  		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
>  				      offsetof(struct sock_common, skc_num));
>  		break;
> +
> +	case offsetof(struct bpf_sock_ops, is_fullsock):
> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> +						struct bpf_sock_ops_kern,
> +						is_fullsock),
> +				      si->dst_reg, si->src_reg,
> +				      offsetof(struct bpf_sock_ops_kern,
> +					       is_fullsock));
> +		break;
> +
> +/* Helper macro for adding read access to tcp_sock fields. */
> +#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \
> +	do {								      \
> +		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
> +						struct bpf_sock_ops_kern,     \
> +						is_fullsock),		      \
> +				      si->dst_reg, si->src_reg,		      \
> +				      offsetof(struct bpf_sock_ops_kern,      \
> +					       is_fullsock));		      \
> +		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \
> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
> +						struct bpf_sock_ops_kern, sk),\
> +				      si->dst_reg, si->src_reg,		      \
> +				      offsetof(struct bpf_sock_ops_kern, sk));\
> +		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \
> +				      offsetof(struct tcp_sock, FIELD_NAME)); \
> +	} while (0)
> +
> +	case offsetof(struct bpf_sock_ops, snd_cwnd):
> +		SOCK_OPS_GET_TCP32(snd_cwnd);
> +		break;
> +
> +	case offsetof(struct bpf_sock_ops, srtt_us):
> +		SOCK_OPS_GET_TCP32(srtt_us);
> +		break;
>  	}
>  	return insn - insn_buf;
>  }
>
Lawrence Brakmo Dec. 5, 2017, 12:21 a.m. UTC | #3
On 12/4/17, 2:32 PM, "netdev-owner@vger.kernel.org on behalf of Daniel Borkmann" <netdev-owner@vger.kernel.org on behalf of daniel@iogearbox.net> wrote:

    Hi Lawrence,
    
    On 12/01/2017 07:15 PM, Lawrence Brakmo wrote:
    > Adds read access to snd_cwnd and srtt_us fields of tcp_sock. Since these

    > fields are only valid if the socket associated with the sock_ops program

    > call is a full socket, the field is_fullsock is also added to the

    > bpf_sock_ops struct. If the socket is not a full socket, reading these

    > fields returns 0.

    > 

    > Note that in most cases it will not be necessary to check is_fullsock to

    > know if there is a full socket. The context of the call, as specified by

    > the 'op' field, can sometimes determine whether there is a full socket.

    > 

    > The struct bpf_sock_ops has the following fields added:

    > 

    >   __u32 is_fullsock;      /* Some TCP fields are only valid if

    >                            * there is a full socket. If not, the

    >                            * fields read as zero.

    > 			   */

    >   __u32 snd_cwnd;

    >   __u32 srtt_us;          /* Averaged RTT << 3 in usecs */

    > 

    > There is a new macro, SOCK_OPS_GET_TCP32(NAME), to make it easier to add

    > read access to more 32 bit tcp_sock fields.

    > 

    > Signed-off-by: Lawrence Brakmo <brakmo@fb.com>

    > ---

    >  include/linux/filter.h   |  1 +

    >  include/net/tcp.h        |  6 ++++--

    >  include/uapi/linux/bpf.h |  6 ++++++

    >  net/core/filter.c        | 36 ++++++++++++++++++++++++++++++++++++

    >  4 files changed, 47 insertions(+), 2 deletions(-)

    > 

    > diff --git a/include/linux/filter.h b/include/linux/filter.h

    > index 80b5b48..0062302 100644

    > --- a/include/linux/filter.h

    > +++ b/include/linux/filter.h

    > @@ -985,6 +985,7 @@ struct bpf_sock_ops_kern {

    >  		u32 reply;

    >  		u32 replylong[4];

    >  	};

    > +	u32	is_fullsock;

    >  };

    >  

    >  #endif /* __LINUX_FILTER_H__ */

    > diff --git a/include/net/tcp.h b/include/net/tcp.h

    > index 4e09398..89a6560 100644

    > --- a/include/net/tcp.h

    > +++ b/include/net/tcp.h

    > @@ -2012,10 +2012,12 @@ static inline int tcp_call_bpf(struct sock *sk, int op)

    >  	struct bpf_sock_ops_kern sock_ops;

    >  	int ret;

    >  

    > -	if (sk_fullsock(sk))

    > +	memset(&sock_ops, 0, sizeof(sock_ops));

    > +	if (sk_fullsock(sk)) {

    > +		sock_ops.is_fullsock = 1;

    >  		sock_owned_by_me(sk);

    > +	}

    >  

    > -	memset(&sock_ops, 0, sizeof(sock_ops));

    >  	sock_ops.sk = sk;

    >  	sock_ops.op = op;

    >  

    > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

    > index 4c223ab..80d62e8 100644

    > --- a/include/uapi/linux/bpf.h

    > +++ b/include/uapi/linux/bpf.h

    > @@ -941,6 +941,12 @@ struct bpf_sock_ops {

    >  	__u32 local_ip6[4];	/* Stored in network byte order */

    >  	__u32 remote_port;	/* Stored in network byte order */

    >  	__u32 local_port;	/* stored in host byte order */

    > +	__u32 is_fullsock;	/* Some TCP fields are only valid if

    > +				 * there is a full socket. If not, the

    > +				 * fields read as zero.

    > +				 */

    
    Just a question for clarification: do you need to have this in
    the uapi struct as well? Meaning, do you have a specific use case
    where you do this check out of the BPF program given the below
    two snd_cwnd/srtt_us check this internally in the ctx rewrite?
    
Although is_fullsock is checked internally, the bpf program may want to
verify if a return value of zero is due to a non fullsock state. The issue is
that there are some ops that are called from both active and passive paths
so it is not possible to know just by the op type.

    Do you plan to reuse the ctx assignment also for bpf_setsockopt()
    and bpf_getsockopt() internally?

If you mean, check is_fullsock instead of the call to sk_fullsock, then yes, it is
a great idea. However, I would rather do it in another patch so we don’t delay
this one (I have other things almost ready that depend on this patch).

    > +	__u32 snd_cwnd;

    > +	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */

    >  };

    >  

    >  /* List of known BPF sock_ops operators.

    > diff --git a/net/core/filter.c b/net/core/filter.c

    > index 6a85e67..bf31236 100644

    > --- a/net/core/filter.c

    > +++ b/net/core/filter.c

    > @@ -4437,6 +4437,42 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,

    >  		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,

    >  				      offsetof(struct sock_common, skc_num));

    >  		break;

    > +

    > +	case offsetof(struct bpf_sock_ops, is_fullsock):

    > +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(

    > +						struct bpf_sock_ops_kern,

    > +						is_fullsock),

    > +				      si->dst_reg, si->src_reg,

    > +				      offsetof(struct bpf_sock_ops_kern,

    > +					       is_fullsock));

    > +		break;

    > +

    > +/* Helper macro for adding read access to tcp_sock fields. */

    > +#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \

    > +	do {								      \

    > +		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \

    > +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \

    > +						struct bpf_sock_ops_kern,     \

    > +						is_fullsock),		      \

    > +				      si->dst_reg, si->src_reg,		      \

    > +				      offsetof(struct bpf_sock_ops_kern,      \

    > +					       is_fullsock));		      \

    > +		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \

    > +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \

    > +						struct bpf_sock_ops_kern, sk),\

    > +				      si->dst_reg, si->src_reg,		      \

    > +				      offsetof(struct bpf_sock_ops_kern, sk));\

    > +		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \

    > +				      offsetof(struct tcp_sock, FIELD_NAME)); \

    > +	} while (0)

    > +

    > +	case offsetof(struct bpf_sock_ops, snd_cwnd):

    > +		SOCK_OPS_GET_TCP32(snd_cwnd);

    > +		break;

    > +

    > +	case offsetof(struct bpf_sock_ops, srtt_us):

    > +		SOCK_OPS_GET_TCP32(srtt_us);

    > +		break;

    >  	}

    >  	return insn - insn_buf;

    >  }

    >
Daniel Borkmann Dec. 5, 2017, 1:59 p.m. UTC | #4
On 12/05/2017 01:21 AM, Lawrence Brakmo wrote:
[...]
>     Just a question for clarification: do you need to have this in
>     the uapi struct as well? Meaning, do you have a specific use case
>     where you do this check out of the BPF program given the below
>     two snd_cwnd/srtt_us check this internally in the ctx rewrite?
>     
> Although is_fullsock is checked internally, the bpf program may want to
> verify if a return value of zero is due to a non fullsock state. The issue is
> that there are some ops that are called from both active and passive paths
> so it is not possible to know just by the op type.

Ok, fair point. Patch applied to bpf-next, thanks Lawrence!

>     Do you plan to reuse the ctx assignment also for bpf_setsockopt()
>     and bpf_getsockopt() internally?
> 
> If you mean, check is_fullsock instead of the call to sk_fullsock, then yes, it is
> a great idea. However, I would rather do it in another patch so we don’t delay
> this one (I have other things almost ready that depend on this patch).
That's fine by me.
diff mbox series

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 80b5b48..0062302 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -985,6 +985,7 @@  struct bpf_sock_ops_kern {
 		u32 reply;
 		u32 replylong[4];
 	};
+	u32	is_fullsock;
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4e09398..89a6560 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2012,10 +2012,12 @@  static inline int tcp_call_bpf(struct sock *sk, int op)
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
 
-	if (sk_fullsock(sk))
+	memset(&sock_ops, 0, sizeof(sock_ops));
+	if (sk_fullsock(sk)) {
+		sock_ops.is_fullsock = 1;
 		sock_owned_by_me(sk);
+	}
 
-	memset(&sock_ops, 0, sizeof(sock_ops));
 	sock_ops.sk = sk;
 	sock_ops.op = op;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4c223ab..80d62e8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -941,6 +941,12 @@  struct bpf_sock_ops {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	__u32 is_fullsock;	/* Some TCP fields are only valid if
+				 * there is a full socket. If not, the
+				 * fields read as zero.
+				 */
+	__u32 snd_cwnd;
+	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
 };
 
 /* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 6a85e67..bf31236 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4437,6 +4437,42 @@  static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_num));
 		break;
+
+	case offsetof(struct bpf_sock_ops, is_fullsock):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern,
+						is_fullsock),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern,
+					       is_fullsock));
+		break;
+
+/* Helper macro for adding read access to tcp_sock fields. */
+#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \
+	do {								      \
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern,     \
+						is_fullsock),		      \
+				      si->dst_reg, si->src_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       is_fullsock));		      \
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern, sk),\
+				      si->dst_reg, si->src_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern, sk));\
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \
+				      offsetof(struct tcp_sock, FIELD_NAME)); \
+	} while (0)
+
+	case offsetof(struct bpf_sock_ops, snd_cwnd):
+		SOCK_OPS_GET_TCP32(snd_cwnd);
+		break;
+
+	case offsetof(struct bpf_sock_ops, srtt_us):
+		SOCK_OPS_GET_TCP32(srtt_us);
+		break;
 	}
 	return insn - insn_buf;
 }