
[bpf-next,4/9] bpf: Add bpf helper bpf_tcp_check_probe_timer

Message ID: 20190219053833.2086766-1-brakmo@fb.com
State: Changes Requested
Delegated to: BPF Maintainers

Commit Message

Lawrence Brakmo Feb. 19, 2019, 5:38 a.m. UTC
This patch adds a new bpf helper BPF_FUNC_tcp_check_probe_timer,
"int bpf_tcp_check_probe_timer(struct bpf_tcp_sock *tp, u32 when_us)".
It is available to BPF_PROG_TYPE_CGROUP_SKB programs, which can
currently be attached to the ingress and egress paths.

The helper clamps when_us to at least TCP_TIMEOUT_MIN (currently
2 jiffies) and at most TCP_RTO_MIN (currently 200ms).

When using a bpf_prog to limit the egress bandwidth of a cgroup,
it can happen that we drop a packet for a connection that has no
packets out. In this case, the connection may not retry sending
the packet until the probe timer fires. Since the default value
of the probe timer is at least 200ms, this can cause link
underutilization (i.e. the cgroup egress bandwidth being smaller
than the specified rate) and thus increased tail latency.
This helper allows the bpf_prog to arm a smaller probe timer in
that case.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h | 12 +++++++++++-
 net/core/filter.c        | 27 +++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)
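
[Editor's note] For illustration, a minimal sketch of a cgroup egress
limiter using the new helper. This is not part of the patch: the
program and function names (tcp_eg, over_rate), the 20 us value, and
the stubbed-out rate check are made up for the example, and the
helpers are declared by BPF_FUNC id since they are not in
bpf_helpers.h yet (it assumes the sk_fullsock/tcp_sock helpers from
earlier in this tree are available to cgroup_skb programs).

/* Example only -- not part of this patch. */
#include <linux/bpf.h>
#include <linux/types.h>

#define SEC(name) __attribute__((section(name), used))

/* New helpers are not in bpf_helpers.h yet; declare them by id. */
static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) =
	(void *) BPF_FUNC_sk_fullsock;
static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) =
	(void *) BPF_FUNC_tcp_sock;
static int (*bpf_tcp_check_probe_timer)(struct bpf_tcp_sock *tp,
					__u32 when_us) =
	(void *) BPF_FUNC_tcp_check_probe_timer;

/* Placeholder rate check; a real limiter would track credit in a map. */
static __attribute__((always_inline)) int over_rate(struct __sk_buff *skb)
{
	return 0;
}

SEC("cgroup_skb/egress")
int tcp_eg(struct __sk_buff *skb)
{
	struct bpf_sock *sk;
	struct bpf_tcp_sock *tp;

	if (!over_rate(skb))
		return 1;	/* under the limit: allow */

	sk = skb->sk;
	if (!sk)
		return 0;
	sk = bpf_sk_fullsock(sk);
	if (!sk)
		return 0;
	tp = bpf_tcp_sock(sk);
	if (!tp)
		return 0;

	/* About to drop: if nothing is in flight, have the connection
	 * retry in ~2ms instead of waiting ~200ms for probe0.
	 */
	bpf_tcp_check_probe_timer(tp, 20);
	return 0;	/* drop */
}

char _license[] SEC("license") = "GPL";

Note the clamping in practice: with HZ=1000, usecs_to_jiffies(20) is
below TCP_TIMEOUT_MIN, so the timer is armed at 2 jiffies (~2ms);
anything above 200ms would be clamped down to TCP_RTO_MIN.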

Comments

Daniel Borkmann Feb. 19, 2019, 10:56 a.m. UTC | #1
On 02/19/2019 06:38 AM, brakmo wrote:
> [...]
> +BPF_CALL_2(bpf_tcp_check_probe_timer, struct tcp_sock *, tp, u32, when_us)
> +{
> +	struct sock *sk = (struct sock *) tp;
> +	unsigned long when = usecs_to_jiffies(when_us);
> +
> +	if (!tp->packets_out && !inet_csk(sk)->icsk_pending) {
> +		if (when < TCP_TIMEOUT_MIN)
> +			when = TCP_TIMEOUT_MIN;
> +		else if (when > TCP_RTO_MIN)
> +			when = TCP_RTO_MIN;
> +
> +		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
> +					  when, TCP_RTO_MAX);

Should this be using tcp_reset_xmit_timer() instead to take pacing
into account? (If not, would be good to have a comment explaining
why it's okay to use directly here.)

> +	}
> +	return 0;
> +}
> [...]
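
[Editor's note] The alternative Daniel points to would look roughly
like this: a sketch only, assuming the in-tree signature of
tcp_reset_xmit_timer() at this point, which takes an extra skb
argument used by tcp_pacing_delay() (NULL falls back to the socket's
tcp_wstamp_ns):

-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-					  when, TCP_RTO_MAX);
+		/* adds tcp_pacing_delay() to 'when' so a paced socket
+		 * does not probe before it may transmit again
+		 */
+		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+				     when, TCP_RTO_MAX, NULL);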

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5daf404511f7..a78936acccae 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2372,6 +2372,15 @@ union bpf_attr {
  *		val should be one of 0, 1, 2, 3.
  *	Return
  *		-EINVAL on error (e.g. val > 3), 0 otherwise.
+ *
+ * int bpf_tcp_check_probe_timer(struct bpf_tcp_sock *tp, u32 when_us)
+ *	Description
+ *		If the socket has no packets out and no pending timer,
+ *		clamps *when_us* to at least TCP_TIMEOUT_MIN (2 jiffies)
+ *		and at most TCP_RTO_MIN (200ms), then arms the probe
+ *		timer for the clamped interval.
+ *	Return
+ *		0
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2472,7 +2481,8 @@ union bpf_attr {
 	FN(sk_fullsock),		\
 	FN(tcp_sock),			\
 	FN(tcp_enter_cwr),		\
-	FN(skb_set_ecn),
+	FN(skb_set_ecn),		\
+	FN(tcp_check_probe_timer),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 275acfb2117d..2b975e651a04 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5465,6 +5465,31 @@ static const struct bpf_func_proto bpf_skb_set_ecn_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_ANYTHING,
 };
+
+BPF_CALL_2(bpf_tcp_check_probe_timer, struct tcp_sock *, tp, u32, when_us)
+{
+	struct sock *sk = (struct sock *) tp;
+	unsigned long when = usecs_to_jiffies(when_us);
+
+	if (!tp->packets_out && !inet_csk(sk)->icsk_pending) {
+		if (when < TCP_TIMEOUT_MIN)
+			when = TCP_TIMEOUT_MIN;
+		else if (when > TCP_RTO_MIN)
+			when = TCP_RTO_MIN;
+
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+					  when, TCP_RTO_MAX);
+	}
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_tcp_check_probe_timer_proto = {
+	.func		= bpf_tcp_check_probe_timer,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TCP_SOCK,
+	.arg2_type	= ARG_ANYTHING,
+};
 #endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
@@ -5628,6 +5653,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_tcp_enter_cwr_proto;
 	case BPF_FUNC_skb_set_ecn:
 		return &bpf_skb_set_ecn_proto;
+	case BPF_FUNC_tcp_check_probe_timer:
+		return &bpf_tcp_check_probe_timer_proto;
 #endif
 	default:
 		return sk_filter_func_proto(func_id, prog);
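
[Editor's note] For completeness, a rough sketch of the userspace
side: loading the object and attaching it to a cgroup's egress hook
with tools/lib/bpf from this tree. The function name, paths, and
error handling here are illustrative only, not part of the patch.

#include <fcntl.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int attach_egress_limiter(const char *obj_path, const char *cgrp_path)
{
	struct bpf_object *obj;
	int prog_fd, cgrp_fd, err;

	/* load the BPF_PROG_TYPE_CGROUP_SKB object file */
	err = bpf_prog_load(obj_path, BPF_PROG_TYPE_CGROUP_SKB,
			    &obj, &prog_fd);
	if (err)
		return err;

	/* attach to the egress hook of the cgroup v2 directory */
	cgrp_fd = open(cgrp_path, O_DIRECTORY | O_RDONLY);
	if (cgrp_fd < 0)
		return -1;

	err = bpf_prog_attach(prog_fd, cgrp_fd, BPF_CGROUP_INET_EGRESS, 0);
	close(cgrp_fd);
	return err;
}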