diff mbox

[4/5] net: tcp: more detailed ACK events, and events for CE marked packets

Message ID 1399928384-24143-5-git-send-email-fw@strlen.de
State Deferred, archived
Delegated to: David Miller
Headers show

Commit Message

Florian Westphal May 12, 2014, 8:59 p.m. UTC
DataCenter TCP (DCTCP) determines cwnd growth based on ECN information
and ACK properties, e.g. ACK that updates window is treated differently
than DUPACK.

Also DCTCP needs information whether ACK was delayed ACK. Furthermore,
DCTCP also implements a CE state machine that keeps track of CE markings
of incoming packets.

Therefore, extend the congestion control framework to provide these
event types, so that DCTCP can be properly implemented as a normal
congestion algorithm module outside the core stack.

Joint work with Daniel Borkmann and Glenn Judd.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/tcp.h     |  9 ++++++++-
 net/ipv4/tcp_input.c  | 22 ++++++++++++++++++----
 net/ipv4/tcp_output.c |  4 ++++
 3 files changed, 30 insertions(+), 5 deletions(-)

Comments

Yuchung Cheng May 13, 2014, 4:41 a.m. UTC | #1
On Mon, May 12, 2014 at 1:59 PM, Florian Westphal <fw@strlen.de> wrote:
>
> DataCenter TCP (DCTCP) determines cwnd growth based on ECN information
> and ACK properties, e.g. ACK that updates window is treated differently
> than DUPACK.
>
> Also DCTCP needs information whether ACK was delayed ACK. Furthermore,
> DCTCP also implements a CE state machine that keeps track of CE markings
> of incoming packets.
>
> Therefore, extend the congestion control framework to provide these
> event types, so that DCTCP can be properly implemented as a normal
> congestion algorithm module outside the core stack.
>
> Joint work with Daniel Borkmann and Glenn Judd.
>
> Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
> Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  include/net/tcp.h     |  9 ++++++++-
>  net/ipv4/tcp_input.c  | 22 ++++++++++++++++++----
>  net/ipv4/tcp_output.c |  4 ++++
>  3 files changed, 30 insertions(+), 5 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 0d767d2..56bf383 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -754,10 +754,17 @@ enum tcp_ca_event {
>         CA_EVENT_CWND_RESTART,  /* congestion window restart */
>         CA_EVENT_COMPLETE_CWR,  /* end of congestion recovery */
>         CA_EVENT_LOSS,          /* loss timeout */
> +       CA_EVENT_ECN_NO_CE,     /* ECT set, but not CE marked */
> +       CA_EVENT_ECN_IS_CE,     /* received CE marked IP packet */
> +       CA_EVENT_DELAYED_ACK,   /* Delayed ack is sent */
> +       CA_EVENT_NON_DELAYED_ACK,
do we need NON_DELAYED_ACK event? is there a third kind?

>  };
>
> +/* information about inbound ACK, passed to cong_ops->in_ack_event() */
>  enum tcp_ca_ack_event_flags {
> -       CA_ACK_SLOWPATH = (1 << 0),
> +       CA_ACK_SLOWPATH         = (1 << 0),     /* in slow path processing */
> +       CA_ACK_WIN_UPDATE       = (1 << 1),     /* ACK updated window */
> +       CA_ACK_ECE              = (1 << 2),     /* ECE bit is set on ack */
>  };
>
>
>  /*
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 7fab1da..bf0f734 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -232,14 +232,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
>                         tcp_enter_quickack_mode((struct sock *)tp);
>                 break;
>         case INET_ECN_CE:
> +               if (tcp_ca_needs_ecn((struct sock *)tp))
> +                       tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
> +
>                 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
>                         /* Better not delay acks, sender can have a very low cwnd */
>                         tcp_enter_quickack_mode((struct sock *)tp);
>                         tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
>                 }
> -               /* fallinto */
> +               tp->ecn_flags |= TCP_ECN_SEEN;
> +               break;
>         default:
> +               if (tcp_ca_needs_ecn((struct sock *)tp))
> +                       tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
>                 tp->ecn_flags |= TCP_ECN_SEEN;
> +               break;
>         }
>  }
>
> @@ -3421,10 +3428,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
>                 tp->snd_una = ack;
>                 flag |= FLAG_WIN_UPDATE;
>
> -               tcp_in_ack_event(sk, 0);
> +               tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
These CA_ACK_xxx are identical to the ACK flag so might as well just
pass the flag to the event handler?

>
>                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
>         } else {
> +               u32 ack_ev_flags = CA_ACK_SLOWPATH;
> +
>                 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
>                         flag |= FLAG_DATA;
>                 else
> @@ -3436,10 +3445,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
>                         flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
>                                                         &sack_rtt_us);
>
> -               if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
> +               if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) {
>                         flag |= FLAG_ECE;
> +                       ack_ev_flags |= CA_ACK_ECE;
> +               }
> +
> +               if (flag & FLAG_WIN_UPDATE)
> +                       ack_ev_flags |= CA_ACK_WIN_UPDATE;
>
> -               tcp_in_ack_event(sk, CA_ACK_SLOWPATH);
> +               tcp_in_ack_event(sk, ack_ev_flags);
>         }
>
>         /* We passed data and got it acked, remove any soft error
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 1f983dd..1f5e04a 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -3119,6 +3119,8 @@ void tcp_send_delayed_ack(struct sock *sk)
>         int ato = icsk->icsk_ack.ato;
>         unsigned long timeout;
>
> +       tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
> +
>         if (ato > TCP_DELACK_MIN) {
>                 const struct tcp_sock *tp = tcp_sk(sk);
>                 int max_ato = HZ / 2;
> @@ -3175,6 +3177,8 @@ void tcp_send_ack(struct sock *sk)
>         if (sk->sk_state == TCP_CLOSE)
>                 return;
>
> +       tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
> +
>         /* We are not putting this on the write queue, so
>          * tcp_transmit_skb() will set the ownership to this
>          * sock.
> --
> 1.8.1.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal May 13, 2014, 10:26 a.m. UTC | #2
Yuchung Cheng <ycheng@google.com> wrote:
> On Mon, May 12, 2014 at 1:59 PM, Florian Westphal <fw@strlen.de> wrote:
> >
> > DataCenter TCP (DCTCP) determines cwnd growth based on ECN information
> > and ACK properties, e.g. ACK that updates window is treated differently
> > than DUPACK.
> >
> > Also DCTCP needs information whether ACK was delayed ACK. Furthermore,
> > DCTCP also implements a CE state machine that keeps track of CE markings
> > of incoming packets.
> >
> > Therefore, extend the congestion control framework to provide these
> > event types, so that DCTCP can be properly implemented as a normal
> > congestion algorithm module outside the core stack.
> >
> > Joint work with Daniel Borkmann and Glenn Judd.
> >
> > Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
> > Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
> > Signed-off-by: Florian Westphal <fw@strlen.de>
> > ---
> >  include/net/tcp.h     |  9 ++++++++-
> >  net/ipv4/tcp_input.c  | 22 ++++++++++++++++++----
> >  net/ipv4/tcp_output.c |  4 ++++
> >  3 files changed, 30 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index 0d767d2..56bf383 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -754,10 +754,17 @@ enum tcp_ca_event {
> >         CA_EVENT_CWND_RESTART,  /* congestion window restart */
> >         CA_EVENT_COMPLETE_CWR,  /* end of congestion recovery */
> >         CA_EVENT_LOSS,          /* loss timeout */
> > +       CA_EVENT_ECN_NO_CE,     /* ECT set, but not CE marked */
> > +       CA_EVENT_ECN_IS_CE,     /* received CE marked IP packet */
> > +       CA_EVENT_DELAYED_ACK,   /* Delayed ack is sent */
> > +       CA_EVENT_NON_DELAYED_ACK,
> do we need NON_DELAYED_ACK event? is there a third kind?

Could you elaborate? I am not sure what you mean.

> > @@ -3421,10 +3428,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
> >                 tp->snd_una = ack;
> >                 flag |= FLAG_WIN_UPDATE;
> >
> > -               tcp_in_ack_event(sk, 0);
> > +               tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
> These CA_ACK_xxx are identical to the ACK flag so might as well just
> pass the flag to the event handler?

We felt that exposing all the FLAG_xxx values (which are private to
tcp_input.c) to congestion modules would be overkill, especially
since we would fail to notify the cong_ops module(s) about all of these.

I think if we were to expose all of them we'd also have to call
rename tcp_in_ack_event() (since most of the flags are not ack related)
and then consistently call that for all the flag changes (which is just
needless overhead since no cong_ops module would react to these).

We can update the tcp_ca_ack_event_flags to include
all FLAG_xxx values and then convert tcp_input.c  to use them, but it
would mean that cong_ops modules could only rely on a selected few of
these flags to actually be set/propagated consistently.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0d767d2..56bf383 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -754,10 +754,17 @@  enum tcp_ca_event {
 	CA_EVENT_CWND_RESTART,	/* congestion window restart */
 	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
 	CA_EVENT_LOSS,		/* loss timeout */
+	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
+	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
+	CA_EVENT_DELAYED_ACK,	/* Delayed ack is sent */
+	CA_EVENT_NON_DELAYED_ACK,
 };
 
+/* information about inbound ACK, passed to cong_ops->in_ack_event() */
 enum tcp_ca_ack_event_flags {
-	CA_ACK_SLOWPATH = (1 << 0),
+	CA_ACK_SLOWPATH		= (1 << 0),	/* in slow path processing */
+	CA_ACK_WIN_UPDATE	= (1 << 1),	/* ACK updated window */
+	CA_ACK_ECE		= (1 << 2),	/* ECE bit is set on ack */
 };
 
 /*
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7fab1da..bf0f734 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -232,14 +232,21 @@  static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
 			tcp_enter_quickack_mode((struct sock *)tp);
 		break;
 	case INET_ECN_CE:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
 			tcp_enter_quickack_mode((struct sock *)tp);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
-		/* fallinto */
+		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
 	default:
+		if (tcp_ca_needs_ecn((struct sock *)tp))
+			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
 		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
 	}
 }
 
@@ -3421,10 +3428,12 @@  static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tp->snd_una = ack;
 		flag |= FLAG_WIN_UPDATE;
 
-		tcp_in_ack_event(sk, 0);
+		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
 
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
 	} else {
+		u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
 		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
 			flag |= FLAG_DATA;
 		else
@@ -3436,10 +3445,15 @@  static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 							&sack_rtt_us);
 
-		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
+		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) {
 			flag |= FLAG_ECE;
+			ack_ev_flags |= CA_ACK_ECE;
+		}
+
+		if (flag & FLAG_WIN_UPDATE)
+			ack_ev_flags |= CA_ACK_WIN_UPDATE;
 
-		tcp_in_ack_event(sk, CA_ACK_SLOWPATH);
+		tcp_in_ack_event(sk, ack_ev_flags);
 	}
 
 	/* We passed data and got it acked, remove any soft error
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1f983dd..1f5e04a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3119,6 +3119,8 @@  void tcp_send_delayed_ack(struct sock *sk)
 	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
+	tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
+
 	if (ato > TCP_DELACK_MIN) {
 		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ / 2;
@@ -3175,6 +3177,8 @@  void tcp_send_ack(struct sock *sk)
 	if (sk->sk_state == TCP_CLOSE)
 		return;
 
+	tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
+
 	/* We are not putting this on the write queue, so
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.