diff mbox

tcp: Increase the initial congestion window to 10.

Message ID 20110202.170750.229739784.davem@davemloft.net
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

David Miller Feb. 3, 2011, 1:07 a.m. UTC
Signed-off-by: David S. Miller <davem@davemloft.net>
---

I've left the DCCP code to keep using RFC3390 logic, if they
wish to adopt this change in their code they can do so by
simply deleting the rfc33390_bytes_to_packets() function and
using TCP_INIT_CWND in their assignment.

 include/net/tcp.h      |   12 +++---------
 net/dccp/ccids/ccid2.c |    9 +++++++++
 net/ipv4/tcp_input.c   |    2 +-
 3 files changed, 13 insertions(+), 10 deletions(-)

Comments

Eric Dumazet Feb. 3, 2011, 1:53 a.m. UTC | #1
Le mercredi 02 février 2011 à 17:07 -0800, David Miller a écrit :
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
> 

Hmm, you forgot a Changelog David ;)

I thought Tom and Google guys were preparing a nice one ?



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nandita Dukkipati Feb. 3, 2011, 2:25 a.m. UTC | #2
Acked-by: Nandita Dukkipati <nanditad@google.com>

On Wed, Feb 2, 2011 at 5:07 PM, David Miller <davem@davemloft.net> wrote:
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>
> I've left the DCCP code to keep using RFC3390 logic, if they
> wish to adopt this change in their code they can do so by
> simply deleting the rfc33390_bytes_to_packets() function and
> using TCP_INIT_CWND in their assignment.
>
>  include/net/tcp.h      |   12 +++---------
>  net/dccp/ccids/ccid2.c |    9 +++++++++
>  net/ipv4/tcp_input.c   |    2 +-
>  3 files changed, 13 insertions(+), 10 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 9179111..7118668 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
>  /* TCP thin-stream limits */
>  #define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */
>
> +/* TCP initial congestion window */
> +#define TCP_INIT_CWND          10
> +
>  extern struct inet_timewait_death_row tcp_death_row;
>
>  /* sysctl variables for tcp */
> @@ -799,15 +802,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
>  /* Use define here intentionally to get WARN_ON location shown at the caller */
>  #define tcp_verify_left_out(tp)        WARN_ON(tcp_left_out(tp) > tp->packets_out)
>
> -/*
> - * Convert RFC 3390 larger initial window into an equivalent number of packets.
> - * This is based on the numbers specified in RFC 5681, 3.1.
> - */
> -static inline u32 rfc3390_bytes_to_packets(const u32 smss)
> -{
> -       return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
> -}
> -
>  extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
>  extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
>
> diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
> index e96d5e8..fadecd2 100644
> --- a/net/dccp/ccids/ccid2.c
> +++ b/net/dccp/ccids/ccid2.c
> @@ -583,6 +583,15 @@ done:
>        dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
>  }
>
> +/*
> + * Convert RFC 3390 larger initial window into an equivalent number of packets.
> + * This is based on the numbers specified in RFC 5681, 3.1.
> + */
> +static inline u32 rfc3390_bytes_to_packets(const u32 smss)
> +{
> +       return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
> +}
> +
>  static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
>  {
>        struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index eb7f82e..2f692ce 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
>        __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
>
>        if (!cwnd)
> -               cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
> +               cwnd = TCP_INIT_CWND;
>        return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
>  }
>
> --
> 1.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Feb. 3, 2011, 3:34 a.m. UTC | #3
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 03 Feb 2011 02:53:43 +0100

> Le mercredi 02 février 2011 à 17:07 -0800, David Miller a écrit :
>> Signed-off-by: David S. Miller <davem@davemloft.net>
>> ---
>> 
> 
> Hmm, you forgot a Changelog David ;)

Maybe, or maybe not, frankly it's quite self-evident if you ask
me. :-)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hagen Paul Pfeifer Feb. 3, 2011, 2 p.m. UTC | #4
On Wed, 02 Feb 2011 17:07:50 -0800 (PST), David Miller wrote:

> +/* TCP initial congestion window */
> +#define TCP_INIT_CWND		10
> +

Davem, there is _no_ research how this huge IW will behave in environments
with a small BDP. Belief it or not, but there are networks out there with a
BDP similar to ~1980. Why for heaven's sake should this work in these
environments? There are only _two_ extensive analysis one from Cherry and
and one from Ilpo. Both analysis focus on current mainstream BDP. I started
to setup a ns-2 environment but due to lack of time I am a little bit
behind schedule. Anyway, why not make this a tunable per route knob so that
it is easier to fix things, e.g. set IW back to 3.

HGN
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Heffner Feb. 3, 2011, 2:17 p.m. UTC | #5
On Thu, Feb 3, 2011 at 9:00 AM, Hagen Paul Pfeifer <hagen@jauu.net> wrote:
>
> On Wed, 02 Feb 2011 17:07:50 -0800 (PST), David Miller wrote:
>
>> +/* TCP initial congestion window */
>> +#define TCP_INIT_CWND                10
>> +
>
> Davem, there is _no_ research how this huge IW will behave in environments
> with a small BDP. Belief it or not, but there are networks out there with a
> BDP similar to ~1980. Why for heaven's sake should this work in these
> environments? There are only _two_ extensive analysis one from Cherry and
> and one from Ilpo. Both analysis focus on current mainstream BDP. I started
> to setup a ns-2 environment but due to lack of time I am a little bit
> behind schedule. Anyway, why not make this a tunable per route knob so that
> it is easier to fix things, e.g. set IW back to 3.


There's already a per-route tunable, right (RTAX_INITCWND)?

  -John
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hagen Paul Pfeifer Feb. 3, 2011, 2:26 p.m. UTC | #6
On Thu, 3 Feb 2011 09:17:14 -0500, John Heffner wrote:

> There's already a per-route tunable, right (RTAX_INITCWND)?

Yes, __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

I was a little bit nervous ... ;)

HGN
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hagen Paul Pfeifer Feb. 3, 2011, 2:43 p.m. UTC | #7
On Thu, 03 Feb 2011 15:00:28 +0100, Hagen Paul Pfeifer wrote:

> Davem, there is _no_ research how this huge IW will behave in
environments
> with a small BDP. Belief it or not, but there are networks out there
with a
> BDP similar to ~1980. Why for heaven's sake should this work in these
> environments? There are only _two_ extensive analysis one from Cherry
and

s/Cherry/Jerry/  ;-)

Hagen
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuchung Cheng Feb. 3, 2011, 8:28 p.m. UTC | #8
Thanks for doing this!

Acked-by: Yuchung Cheng <ycheng@google.com>

On Wed, Feb 2, 2011 at 5:07 PM, David Miller <davem@davemloft.net> wrote:
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>
> I've left the DCCP code to keep using RFC3390 logic, if they
> wish to adopt this change in their code they can do so by
> simply deleting the rfc33390_bytes_to_packets() function and
> using TCP_INIT_CWND in their assignment.
>
>  include/net/tcp.h      |   12 +++---------
>  net/dccp/ccids/ccid2.c |    9 +++++++++
>  net/ipv4/tcp_input.c   |    2 +-
>  3 files changed, 13 insertions(+), 10 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 9179111..7118668 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
>  /* TCP thin-stream limits */
>  #define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */
>
> +/* TCP initial congestion window */
> +#define TCP_INIT_CWND          10
> +
>  extern struct inet_timewait_death_row tcp_death_row;
>
>  /* sysctl variables for tcp */
> @@ -799,15 +802,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
>  /* Use define here intentionally to get WARN_ON location shown at the caller */
>  #define tcp_verify_left_out(tp)        WARN_ON(tcp_left_out(tp) > tp->packets_out)
>
> -/*
> - * Convert RFC 3390 larger initial window into an equivalent number of packets.
> - * This is based on the numbers specified in RFC 5681, 3.1.
> - */
> -static inline u32 rfc3390_bytes_to_packets(const u32 smss)
> -{
> -       return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
> -}
> -
>  extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
>  extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
>
> diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
> index e96d5e8..fadecd2 100644
> --- a/net/dccp/ccids/ccid2.c
> +++ b/net/dccp/ccids/ccid2.c
> @@ -583,6 +583,15 @@ done:
>        dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
>  }
>
> +/*
> + * Convert RFC 3390 larger initial window into an equivalent number of packets.
> + * This is based on the numbers specified in RFC 5681, 3.1.
> + */
> +static inline u32 rfc3390_bytes_to_packets(const u32 smss)
> +{
> +       return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
> +}
> +
>  static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
>  {
>        struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index eb7f82e..2f692ce 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
>        __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
>
>        if (!cwnd)
> -               cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
> +               cwnd = TCP_INIT_CWND;
>        return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
>  }
>
> --
> 1.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
H.K. Jerry Chu Feb. 3, 2011, 9:17 p.m. UTC | #9
On Thu, Feb 3, 2011 at 6:00 AM, Hagen Paul Pfeifer <hagen@jauu.net> wrote:
>
> On Wed, 02 Feb 2011 17:07:50 -0800 (PST), David Miller wrote:
>
>> +/* TCP initial congestion window */
>> +#define TCP_INIT_CWND                10
>> +
>
> Davem, there is _no_ research how this huge IW will behave in environments
> with a small BDP. Belief it or not, but there are networks out there with a
> BDP similar to ~1980. Why for heaven's sake should this work in these
> environments? There are only _two_ extensive analysis one from Cherry and
> and one from Ilpo. Both analysis focus on current mainstream BDP. I started

It's more than that. We have also conducted extensive, large scale
experiments covering
all BDP/RTT combinations and the result was documented in details in a
SIGCOMM/CCR paper that can be found from a summary page
http://code.google.com/speed/protocols/tcpm-IW10.html covering all our
IW10 work.

> to setup a ns-2 environment but due to lack of time I am a little bit
> behind schedule. Anyway, why not make this a tunable per route knob so that
> it is easier to fix things, e.g. set IW back to 3.

You've seemed to get this backwards. Shouldn't we set the default for the normal
cases and leave the corner cases to the initcwnd and initrwnd knobs to cover?

Jerry

>
> HGN
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Leandro Sales Feb. 3, 2011, 10 p.m. UTC | #10
Just forwarding my reply to the list netdev and dccp lists, the text
was in HTML format and Majordomo rejected it...

---------- Forwarded message ----------
From: Leandro Melo de Sales <leandroal@gmail.com>
Date: Thu, Feb 3, 2011 at 2:48 PM
Subject: Re: [PATCH] tcp: Increase the initial congestion window to 10.
To: David Miller <davem@davemloft.net>
Cc: netdev@vger.kernel.org, dccp@vger.kernel.org, therbert@google.com,
Angelo Perkusich <Angelo.Perkusich@gmail.com>, Hyggo Almeida
<hyggo.almeida@gmail.com>


On Wed, Feb 2, 2011 at 10:07 PM, David Miller <davem@davemloft.net> wrote:
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>
> I've left the DCCP code to keep using RFC3390 logic, if they
> wish to adopt this change in their code they can do so by
> simply deleting the rfc33390_bytes_to_packets() function and
> using TCP_INIT_CWND in their assignment.
>
>  include/net/tcp.h      |   12 +++---------
>  net/dccp/ccids/ccid2.c |    9 +++++++++
>  net/ipv4/tcp_input.c   |    2 +-
>  3 files changed, 13 insertions(+), 10 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 9179111..7118668 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
>  /* TCP thin-stream limits */
>  #define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */
>
> +/* TCP initial congestion window */
> +#define TCP_INIT_CWND          10
> +
>  extern struct inet_timewait_death_row tcp_death_row;
>
>  /* sysctl variables for tcp */
> @@ -799,15 +802,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
>  /* Use define here intentionally to get WARN_ON location shown at the caller */
>  #define tcp_verify_left_out(tp)        WARN_ON(tcp_left_out(tp) > tp->packets_out)
>
> -/*
> - * Convert RFC 3390 larger initial window into an equivalent number of packets.
> - * This is based on the numbers specified in RFC 5681, 3.1.
> - */
> -static inline u32 rfc3390_bytes_to_packets(const u32 smss)
> -{
> -       return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
> -}
> -
>  extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
>  extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
>
> diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
> index e96d5e8..fadecd2 100644
> --- a/net/dccp/ccids/ccid2.c
> +++ b/net/dccp/ccids/ccid2.c
> @@ -583,6 +583,15 @@ done:
>        dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
>  }
>
> +/*
> + * Convert RFC 3390 larger initial window into an equivalent number of packets.
> + * This is based on the numbers specified in RFC 5681, 3.1.
> + */
> +static inline u32 rfc3390_bytes_to_packets(const u32 smss)
> +{
> +       return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
> +}
> +
>  static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
>  {
>        struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index eb7f82e..2f692ce 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
>        __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
>
>        if (!cwnd)
> -               cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
> +               cwnd = TCP_INIT_CWND;
>        return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
>  }
>
> --
> 1.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe dccp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

David and others,
    I don't think this change will make sense for DCCP, once IW10 is
for the cases of short-lived flows. DCCP is used on scenarios for
multimedia flows, which are, in general, long-lived flows. So, IMO the
way how we calculate IW for DCCP is appropriate, at least considering
a quick answer. In fact, nowadays we need better congestion control
algorithms for DCCP, and in this sense we are working on adapt TCP
Cubic for DCCP as a CCID, and also we started some work to make DCCP
support TCP pluggable mechanism , which will allow us to use TCP
congestion control algorithms in DCCP.
    I've been following all the discussions about IW10 in the TCPM
discussion list and also been reading all materials in this context
(papers and ppts) provided by Tom Herbert and Nandita Dukkipati (along
with other researchers from Networking Research Lab at North Carolina
State University), but although the results is really great (at least
for scenarios they've experimented and regardless to all opinions
expressed at TCPM), DCCP should calculate its IW as specified in
RFC3390 rather than set it to IW10. For the moment, I understand that
for DCCP, we have to discuss more about this in a separated thread.

Leandro.

--
Leandro Melo de Sales
Professor at Institute of Computing at Federal University of Alagoas, Brazil
PhD candidate in Computer Science
Pervasive and Embedded Computing Laboratory, UFCG
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ilpo Järvinen Feb. 3, 2011, 10:43 p.m. UTC | #11
It would perhaps be useful to change receiver advertized window to include 
some extra segs initially. It should be >= IW + peer's dupThresh-1 as 
otherwise limited transmit won't work for the initial window because we 
won't open more receiver window with dupacks (IIRC, I suppose Jerry might 
be able to correct me right away if I'm wrong and we open window with 
dupacks too?).  I think initial receiver window code used to have some 
surplus but it was broken by the rfc3390-func conversion (against my 
advice on how to do the conversion).
Hagen Paul Pfeifer Feb. 3, 2011, 10:55 p.m. UTC | #12
* H.K. Jerry Chu | 2011-02-03 13:17:19 [-0800]:

>> to setup a ns-2 environment but due to lack of time I am a little bit
>> behind schedule. Anyway, why not make this a tunable per route knob so that
>> it is easier to fix things, e.g. set IW back to 3.
>
>You've seemed to get this backwards. Shouldn't we set the default for the normal
>cases and leave the corner cases to the initcwnd and initrwnd knobs to cover?

Yes Jerry, I phrase this a little bit abstruse. Apply David patch and use per
route options to reduce the IW if some curious behavior appear.

It seems the time is come to increase the IW. ;-)

Hagen


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuchung Cheng Feb. 3, 2011, 11:54 p.m. UTC | #13
On Thu, Feb 3, 2011 at 2:43 PM, Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> wrote:
> It would perhaps be useful to change receiver advertized window to include
> some extra segs initially. It should be >= IW + peer's dupThresh-1 as
That's a very good point.

Maybe IRW should be IW + ssthresh - 1 since Linux also performs
limited-transmit during fast-recovery as described in RFC 3517. This
way sender can keep sending new data during the recovery as long as
cwnd allows.

> otherwise limited transmit won't work for the initial window because we
> won't open more receiver window with dupacks (IIRC, I suppose Jerry might
> be able to correct me right away if I'm wrong and we open window with
> dupacks too?).  I think initial receiver window code used to have some
> surplus but it was broken by the rfc3390-func conversion (against my
> advice on how to do the conversion).
>
> --
>  i.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
H.K. Jerry Chu Feb. 4, 2011, 2:01 a.m. UTC | #14
Hi Ilpo,

On Thu, Feb 3, 2011 at 2:43 PM, Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> wrote:
> It would perhaps be useful to change receiver advertized window to include
> some extra segs initially. It should be >= IW + peer's dupThresh-1 as
> otherwise limited transmit won't work for the initial window because we
> won't open more receiver window with dupacks (IIRC, I suppose Jerry might
> be able to correct me right away if I'm wrong and we open window with
> dupacks too?).

Sorry I don't know how the receive window is updated in Linux,
autotuning or not.
But I just wonder why would it have to do with dupacks, i.e., why would it not
slide forward as long as the left edge of the window slides forward,
regardless of
OOO pkt arrival?

I am of the opinion that rwnd is for flow control purpose only thus should be
fully decoupled from the cwnd of the other (sender) side. Therefore
initrwnd should
normally be based on projected BDP and local memory pressure, e.g., 64KB, not
bearing any relation with IW of the other side. Only under special
circumstances should it be used to constrain the sender, e.g., for
devices behind slow links with
very small buffer.

Jerry

>I think initial receiver window code used to have some
> surplus but it was broken by the rfc3390-func conversion (against my
> advice on how to do the conversion).
>
> --
>  i.
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Zimmermann Feb. 4, 2011, 8:38 a.m. UTC | #15
Hi David,

Am 03.02.2011 um 02:07 schrieb David Miller:

> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
> 
> I've left the DCCP code to keep using RFC3390 logic, if they
> wish to adopt this change in their code they can do so by
> simply deleting the rfc33390_bytes_to_packets() function and
> using TCP_INIT_CWND in their assignment.
> 
> include/net/tcp.h      |   12 +++---------
> net/dccp/ccids/ccid2.c |    9 +++++++++
> net/ipv4/tcp_input.c   |    2 +-
> 3 files changed, 13 insertions(+), 10 deletions(-)
> 
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 9179111..7118668 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
> /* TCP thin-stream limits */
> #define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */
> 

Could you add a reference to draft?

> +/* TCP initial congestion window */
> +#define TCP_INIT_CWND		10
> +
> extern struct inet_timewait_death_row tcp_death_row;
> 
> /* sysctl variables for tcp */
> @@ -799,15 +802,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
> /* Use define here intentionally to get WARN_ON location shown at the caller */
> #define tcp_verify_left_out(tp)	WARN_ON(tcp_left_out(tp) > tp->packets_out)
> 
> -/*
> - * Convert RFC 3390 larger initial window into an equivalent number of packets.
> - * This is based on the numbers specified in RFC 5681, 3.1.
> - */
> -static inline u32 rfc3390_bytes_to_packets(const u32 smss)
> -{
> -	return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
> -}
> -
> extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
> extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
>
Ilpo Järvinen Feb. 4, 2011, 7:43 p.m. UTC | #16
On Thu, 3 Feb 2011, H.K. Jerry Chu wrote:

> On Thu, Feb 3, 2011 at 2:43 PM, Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> wrote:
> > It would perhaps be useful to change receiver advertized window to include
> > some extra segs initially. It should be >= IW + peer's dupThresh-1 as
> > otherwise limited transmit won't work for the initial window because we
> > won't open more receiver window with dupacks (IIRC, I suppose Jerry might
> > be able to correct me right away if I'm wrong and we open window with
> > dupacks too?).
> 
> Sorry I don't know how the receive window is updated in Linux,
> autotuning or not.
> But I just wonder why would it have to do with dupacks, i.e., why would 
> it not slide forward as long as the left edge of the window slides 
> forward, regardless of OOO pkt arrival?

?? DupACK by defination does not slide the left edge?!? :-) ...It
certainly makes a difference whether the ACK is cumulative or not. 
Anyway, I tcpdumped it now and confirmed that advertized window is not 
advanced if OOO packet arrives.

> I am of the opinion that rwnd is for flow control purpose only thus should be
> fully decoupled from the cwnd of the other (sender) side. Therefore
> initrwnd should
> normally be based on projected BDP and local memory pressure, e.g., 64KB, not
> bearing any relation with IW of the other side. Only under special
> circumstances should it be used to constrain the sender, e.g., for
> devices behind slow links with
> very small buffer.

I also think along the lines that the advertized window autotuning code 
is just unnecessarily preventive (besides the IW change, also Quickstart 
couldn't be used that efficiently because of it).
Ilpo Järvinen Feb. 4, 2011, 7:50 p.m. UTC | #17
On Thu, 3 Feb 2011, Yuchung Cheng wrote:

> On Thu, Feb 3, 2011 at 2:43 PM, Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> wrote:
> > It would perhaps be useful to change receiver advertized window to include
> > some extra segs initially. It should be >= IW + peer's dupThresh-1 as
> That's a very good point.
> 
> Maybe IRW should be IW + ssthresh - 1 since Linux also performs
> limited-transmit during fast-recovery as described in RFC 3517. This
> way sender can keep sending new data during the recovery as long as
> cwnd allows.

You're of course right, I forgot the recovery altogether :-).
Jerry Chu Feb. 5, 2011, 3:39 a.m. UTC | #18
On Fri, Feb 4, 2011 at 11:43 AM, Ilpo Järvinen
<ilpo.jarvinen@helsinki.fi> wrote:
>
> On Thu, 3 Feb 2011, H.K. Jerry Chu wrote:
>
> > On Thu, Feb 3, 2011 at 2:43 PM, Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> wrote:
> > > It would perhaps be useful to change receiver advertized window to include
> > > some extra segs initially. It should be >= IW + peer's dupThresh-1 as
> > > otherwise limited transmit won't work for the initial window because we
> > > won't open more receiver window with dupacks (IIRC, I suppose Jerry might
> > > be able to correct me right away if I'm wrong and we open window with
> > > dupacks too?).
> >
> > Sorry I don't know how the receive window is updated in Linux,
> > autotuning or not.
> > But I just wonder why would it have to do with dupacks, i.e., why would
> > it not slide forward as long as the left edge of the window slides
> > forward, regardless of OOO pkt arrival?
>
> ?? DupACK by defination does not slide the left edge?!? :-) ...It
> certainly makes a difference whether the ACK is cumulative or not.
> Anyway, I tcpdumped it now and confirmed that advertized window is not
> advanced if OOO packet arrives.

Cwnd discounts packets that have left the network but rwnd won't discharge
packets until they are consumed by ULP so you are right that in case
the packets from or near the head of the retransmission queue get
dropped rwnd won't
open up room for more packets even though cwnd will. To cover that case initrwnd
needs to be larger than initcwnd.

Jerry

>
> > I am of the opinion that rwnd is for flow control purpose only thus should be
> > fully decoupled from the cwnd of the other (sender) side. Therefore
> > initrwnd should
> > normally be based on projected BDP and local memory pressure, e.g., 64KB, not
> > bearing any relation with IW of the other side. Only under special
> > circumstances should it be used to constrain the sender, e.g., for
> > devices behind slow links with
> > very small buffer.
>
> I also think along the lines that the advertized window autotuning code
> is just unnecessarily preventive (besides the IW change, also Quickstart
> couldn't be used that efficiently because of it).
>
> --
>  i.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gerrit Renker Feb. 5, 2011, 1:57 p.m. UTC | #19
| I've left the DCCP code to keep using RFC3390 logic, if they
| wish to adopt this change in their code they can do so by
| simply deleting the rfc33390_bytes_to_packets() function and
| using TCP_INIT_CWND in their assignment.
| 
|  include/net/tcp.h      |   12 +++---------
|  net/dccp/ccids/ccid2.c |    9 +++++++++
Thank you for keeping the compatility, this seems an excellent way forward.

The change only affects CCID-2, TCP-like congestion control. This still has
some unresolved issues (related to cwnd) in its implementation.

Once these have been resolved, we can return to tracking the state of the 
art in TCP, for the moment your patch works best.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gerrit Renker Feb. 5, 2011, 1:58 p.m. UTC | #20
Leandro,
| >  include/net/tcp.h      |   12 +++---------
| >  net/dccp/ccids/ccid2.c |    9 +++++++++
|
David's change only affects CCID-2, CCID-3/4 remain unaffected,
they use rfc3390_initial_rate() (which does almost the same).
 
|     I don't think this change will make sense for DCCP, once IW10 is
| for the cases of short-lived flows. DCCP is used on scenarios for
| multimedia flows, which are, in general, long-lived flows. So, IMO the
| way how we calculate IW for DCCP is appropriate, at least considering
| a quick answer. 
I agree with you, and also vote to eventually bring CCID-2 on a par with
the state of art in TCP. Before we can do this, the CCID-2 implementation
needs some work (as per separate thread with Samuel Jero).

| In fact, nowadays we need better congestion control algorithms for DCCP,
| and in this sense we are working on adapt TCP Cubic for DCCP as a CCID,
| and also we started some work to make DCCP support TCP pluggable
| mechanism , which will allow us to use TCP congestion control
| algorithms in DCCP.
This is very good news and am very much looking forward to the new approach.


| DCCP should calculate its IW as specified in RFC3390 rather than set
| it to IW10. For the moment, I understand that for DCCP, we have to 
| discuss more about this in a separated thread.
Agree that we should keep an eye on this - perhaps too early to turn it into a
should. Networking speeds are growing, and many of the simulations were based
on TCP Reno which stems from the 10 Mbit ethernet era.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9179111..7118668 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -196,6 +196,9 @@  extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* TCP thin-stream limits */
 #define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */
 
+/* TCP initial congestion window */
+#define TCP_INIT_CWND		10
+
 extern struct inet_timewait_death_row tcp_death_row;
 
 /* sysctl variables for tcp */
@@ -799,15 +802,6 @@  static inline __u32 tcp_current_ssthresh(const struct sock *sk)
 /* Use define here intentionally to get WARN_ON location shown at the caller */
 #define tcp_verify_left_out(tp)	WARN_ON(tcp_left_out(tp) > tp->packets_out)
 
-/*
- * Convert RFC 3390 larger initial window into an equivalent number of packets.
- * This is based on the numbers specified in RFC 5681, 3.1.
- */
-static inline u32 rfc3390_bytes_to_packets(const u32 smss)
-{
-	return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
-}
-
 extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
 extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
 
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index e96d5e8..fadecd2 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -583,6 +583,15 @@  done:
 	dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
 }
 
+/*
+ * Convert RFC 3390 larger initial window into an equivalent number of packets.
+ * This is based on the numbers specified in RFC 5681, 3.1.
+ */
+static inline u32 rfc3390_bytes_to_packets(const u32 smss)
+{
+	return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
+}
+
 static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 {
 	struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eb7f82e..2f692ce 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -817,7 +817,7 @@  __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 
 	if (!cwnd)
-		cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
+		cwnd = TCP_INIT_CWND;
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }