diff mbox

[net-next,2/3] ipv4: Use binary search to choose tcp PMTU probe_size

Message ID 1423815405-32644-3-git-send-email-fan.du@intel.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Fan Du Feb. 13, 2015, 8:16 a.m. UTC
Current probe_size is chosen by doubling mss_cache,
the initial mss base is 512 Bytes, as a result the
converged probe_size will only be 1024 Bytes; there
is still a big gap between 1024 and the common 1500
bytes of mtu.

Use binary search to choose probe_size in a fine-
grained manner, so that an optimal mss will be found
to boost performance to its maximum.

Test env:
Docker instance with vxlan encapsulation(82599EB)
iperf -c 10.0.0.24  -t 60

before this patch:
1.26 Gbits/sec

After this patch: increase 26%
1.59 Gbits/sec

Signed-off-by: Fan Du <fan.du@intel.com>
---
 include/net/inet_connection_sock.h |    3 +++
 net/ipv4/tcp_input.c               |    5 ++++-
 net/ipv4/tcp_output.c              |   12 +++++++++---
 net/ipv4/tcp_timer.c               |    2 +-
 4 files changed, 17 insertions(+), 5 deletions(-)

Comments

John Heffner Feb. 13, 2015, 5:52 p.m. UTC | #1
On Fri, Feb 13, 2015 at 3:16 AM, Fan Du <fan.du@intel.com> wrote:
> Current probe_size is chosen by doubling mss_cache,
> the initial mss base is 512 Bytes, as a result the
> converged probe_size will only be 1024 Bytes, there
> is still big gap between 1024 and common 1500 bytes
> of mtu.
>
> Use binary search to choose probe_size in a fine
> granularity manner, an optimal mss will be found
> to boost performance as its maxmium.
>
> Test env:
> Docker instance with vxlan encapuslation(82599EB)
> iperf -c 10.0.0.24  -t 60
>
> before this patch:
> 1.26 Gbits/sec
>
> After this patch: increase 26%
> 1.59 Gbits/sec
>
> Signed-off-by: Fan Du <fan.du@intel.com>

Thanks for looking into making mtu probing better.  Improving the
search strategy is commendable.  One high level comment though is that
there's some cost associated with probing and diminishing returns the
smaller the interval (search_high - search_low), so there should be
some threshold below which further probing is deemed no longer useful.

Aside from that, some things in this patch don't look right to me.
Comments inline below.


> ---
>  include/net/inet_connection_sock.h |    3 +++
>  net/ipv4/tcp_input.c               |    5 ++++-
>  net/ipv4/tcp_output.c              |   12 +++++++++---
>  net/ipv4/tcp_timer.c               |    2 +-
>  4 files changed, 17 insertions(+), 5 deletions(-)
>
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index 5976bde..3d0932e 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -124,6 +124,9 @@ struct inet_connection_sock {
>                 int               search_high;
>                 int               search_low;
>
> +               int               search_high_sav;
> +               int               search_low_sav;
> +
>                 /* Information on the current probe. */
>                 int               probe_size;
>         } icsk_mtup;


What are these for?  They're assigned but not used.


> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 8fdd27b..20b28e9 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -2613,7 +2613,10 @@ static void tcp_mtup_probe_success(struct sock *sk)
>         tp->snd_cwnd_stamp = tcp_time_stamp;
>         tp->snd_ssthresh = tcp_current_ssthresh(sk);
>
> -       icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
> +       if (icsk->icsk_mtup.search_low == icsk->icsk_mtup.probe_size)
> +               icsk->icsk_mtup.search_low = icsk->icsk_mtup.search_high;
> +       else
> +               icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
>         icsk->icsk_mtup.probe_size = 0;
>         tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
>  }

It would be cleaner to handle this in tcp_mtu_probe, in deciding
whether to issue a probe, than to change the semantics of search_high
and search_low.  Issuing a probe where probe_size == search_low seems
like the wrong thing to do.


> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index a2a796c..0a60deb 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -1349,10 +1349,13 @@ void tcp_mtup_init(struct sock *sk)
>         struct inet_connection_sock *icsk = inet_csk(sk);
>         struct net *net = sock_net(sk);
>
> -       icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
> +       icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing;
>         icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
>                                icsk->icsk_af_ops->net_header_len;
>         icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
> +
> +       icsk->icsk_mtup.search_high_sav = icsk->icsk_mtup.search_high;
> +       icsk->icsk_mtup.search_low_sav = icsk->icsk_mtup.search_low;
>         icsk->icsk_mtup.probe_size = 0;
>  }
>  EXPORT_SYMBOL(tcp_mtup_init);

You're changing the meaning of sysctl_tcp_mtu_probing.  I don't think
that's what you want.  From Documentation/networking/ip-sysctl.txt:

tcp_mtu_probing - INTEGER
Controls TCP Packetization-Layer Path MTU Discovery.  Takes three
values:
 0 - Disabled
 1 - Disabled by default, enabled when an ICMP black hole detected
 2 - Always enabled, use initial MSS of tcp_base_mss.



> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index 0732b78..9d1cfe0 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -113,7 +113,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
>                         struct tcp_sock *tp = tcp_sk(sk);
>                         int mss;
>
> -                       mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
> +                       mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low);
>                         mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
>                         mss = max(mss, 68 - tp->tcp_header_len);
>                         icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);

Why did you change this?  I think this breaks black hole detection.

Thanks,
  -John
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
FengYu LeiDian Feb. 16, 2015, 5:27 a.m. UTC | #2
于 2015年02月14日 01:52, John Heffner 写道:
> On Fri, Feb 13, 2015 at 3:16 AM, Fan Du <fan.du@intel.com> wrote:
>> Current probe_size is chosen by doubling mss_cache,
>> the initial mss base is 512 Bytes, as a result the
>> converged probe_size will only be 1024 Bytes, there
>> is still big gap between 1024 and common 1500 bytes
>> of mtu.
>>
>> Use binary search to choose probe_size in a fine
>> granularity manner, an optimal mss will be found
>> to boost performance as its maxmium.
>>
>> Test env:
>> Docker instance with vxlan encapuslation(82599EB)
>> iperf -c 10.0.0.24  -t 60
>>
>> before this patch:
>> 1.26 Gbits/sec
>>
>> After this patch: increase 26%
>> 1.59 Gbits/sec
>>
>> Signed-off-by: Fan Du <fan.du@intel.com>
>
> Thanks for looking into making mtu probing better.  Improving the
> search strategy is commendable.  One high level comment though is that
> there's some cost associated with probing and diminishing returns the
> smaller the interval (search_high - search_low), so there should be
> some threshold below which further probing is deemed no longer useful.
>
> Aside from that, some things in this patch don't look right to me.
> Comments inline below.
>
>
>> ---
>>   include/net/inet_connection_sock.h |    3 +++
>>   net/ipv4/tcp_input.c               |    5 ++++-
>>   net/ipv4/tcp_output.c              |   12 +++++++++---
>>   net/ipv4/tcp_timer.c               |    2 +-
>>   4 files changed, 17 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
>> index 5976bde..3d0932e 100644
>> --- a/include/net/inet_connection_sock.h
>> +++ b/include/net/inet_connection_sock.h
>> @@ -124,6 +124,9 @@ struct inet_connection_sock {
>>                  int               search_high;
>>                  int               search_low;
>>
>> +               int               search_high_sav;
>> +               int               search_low_sav;
>> +
>>                  /* Information on the current probe. */
>>                  int               probe_size;
>>          } icsk_mtup;
>
>
> What are these for?  They're assigned but not used.

It's used by the probe timer to restore original search range.
See patch3/3.

>
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index 8fdd27b..20b28e9 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -2613,7 +2613,10 @@ static void tcp_mtup_probe_success(struct sock *sk)
>>          tp->snd_cwnd_stamp = tcp_time_stamp;
>>          tp->snd_ssthresh = tcp_current_ssthresh(sk);
>>
>> -       icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
>> +       if (icsk->icsk_mtup.search_low == icsk->icsk_mtup.probe_size)
>> +               icsk->icsk_mtup.search_low = icsk->icsk_mtup.search_high;
>> +       else
>> +               icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
>>          icsk->icsk_mtup.probe_size = 0;
>>          tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
>>   }
>
> It would be cleaner to handle this in tcp_mtu_probe, in deciding
> whether to issue a probe, than to change the semantics of search_high
> and search_low.  Issuing a probe where probe_size == search_low seems
> like the wrong thing to do.
That's my original thoughts, the seconds thoughts is every BYTE in datacenter
cost money, so why not to get optimal performance by using every possible byte
available.

Anyway, a sysctl threshold will also do the job, will incorporate this in next version.

>
>> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
>> index a2a796c..0a60deb 100644
>> --- a/net/ipv4/tcp_output.c
>> +++ b/net/ipv4/tcp_output.c
>> @@ -1349,10 +1349,13 @@ void tcp_mtup_init(struct sock *sk)
>>          struct inet_connection_sock *icsk = inet_csk(sk);
>>          struct net *net = sock_net(sk);
>>
>> -       icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
>> +       icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing;
>>          icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
>>                                 icsk->icsk_af_ops->net_header_len;
>>          icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
>> +
>> +       icsk->icsk_mtup.search_high_sav = icsk->icsk_mtup.search_high;
>> +       icsk->icsk_mtup.search_low_sav = icsk->icsk_mtup.search_low;
>>          icsk->icsk_mtup.probe_size = 0;
>>   }
>>   EXPORT_SYMBOL(tcp_mtup_init);
>
> You're changing the meaning of sysctl_tcp_mtu_probing.  I don't think
> that's what you want.  From Documentation/networking/ip-sysctl.txt:
>
> tcp_mtu_probing - INTEGER
> Controls TCP Packetization-Layer Path MTU Discovery.  Takes three
> values:
>   0 - Disabled
>   1 - Disabled by default, enabled when an ICMP black hole detected
>   2 - Always enabled, use initial MSS of tcp_base_mss.
yes, will honor the original enable theme.

>
>> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>> index 0732b78..9d1cfe0 100644
>> --- a/net/ipv4/tcp_timer.c
>> +++ b/net/ipv4/tcp_timer.c
>> @@ -113,7 +113,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
>>                          struct tcp_sock *tp = tcp_sk(sk);
>>                          int mss;
>>
>> -                       mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
>> +                       mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low);
>>                          mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
>>                          mss = max(mss, 68 - tp->tcp_header_len);
>>                          icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
>
> Why did you change this?  I think this breaks black hole detection.
hmm, I misunderstood this part.
In case of pure black hole detection, lowering the current tcp mss instead of search_low,
will make more sense, as current tcp mss still got lost.

-                       mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
+                       /* try mss smaller than current mss */
+                       mss = tcp_current_mss(sk) >> 1;

Black hole seems more like a misconfiguration in administrative level on intermediate node,
rather than a stack issue, why keep shrinking mss to get packet through with poor performance?


> Thanks,
>    -John
>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Heffner Feb. 16, 2015, 11:59 p.m. UTC | #3
On Mon, Feb 16, 2015 at 12:27 AM, Fan Du <fengyuleidian0615@gmail.com> wrote:
> 于 2015年02月14日 01:52, John Heffner 写道:
>
>> On Fri, Feb 13, 2015 at 3:16 AM, Fan Du <fan.du@intel.com> wrote:
>>> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>>> index 0732b78..9d1cfe0 100644
>>> --- a/net/ipv4/tcp_timer.c
>>> +++ b/net/ipv4/tcp_timer.c
>>> @@ -113,7 +113,7 @@ static void tcp_mtu_probing(struct
>>> inet_connection_sock *icsk, struct sock *sk)
>>>                          struct tcp_sock *tp = tcp_sk(sk);
>>>                          int mss;
>>>
>>> -                       mss = tcp_mtu_to_mss(sk,
>>> icsk->icsk_mtup.search_low) >> 1;
>>> +                       mss = tcp_mtu_to_mss(sk,
>>> icsk->icsk_mtup.search_low);
>>>                          mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
>>>                          mss = max(mss, 68 - tp->tcp_header_len);
>>>                          icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk,
>>> mss);
>>
>>
>> Why did you change this?  I think this breaks black hole detection.
>
> hmm, I misunderstood this part.
> In case of pure black hole detection, lowering the current tcp mss instead
> of search_low,
> will make more sense, as current tcp mss still got lost.
>
> -                       mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)
>>> 1;
> +                       /* try mss smaller than current mss */
> +                       mss = tcp_current_mss(sk) >> 1;
>
> Black hole seems more like a misconfiguration in administrative level on
> intermediate node,
> rather than a stack issue, why keep shrinking mss to get packet through with
> poor performance?

The idea here is to make it robust to route changes, where the new
path has a lower MTU.  This also protects us against paths that have a
lower MTU than the base mss (which you're also trying to raise in this
patch series).

Thanks,
  -John
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 5976bde..3d0932e 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -124,6 +124,9 @@  struct inet_connection_sock {
 		int		  search_high;
 		int		  search_low;
 
+		int		  search_high_sav;
+		int		  search_low_sav;
+
 		/* Information on the current probe. */
 		int		  probe_size;
 	} icsk_mtup;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8fdd27b..20b28e9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2613,7 +2613,10 @@  static void tcp_mtup_probe_success(struct sock *sk)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 	tp->snd_ssthresh = tcp_current_ssthresh(sk);
 
-	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
+	if (icsk->icsk_mtup.search_low == icsk->icsk_mtup.probe_size)
+		icsk->icsk_mtup.search_low = icsk->icsk_mtup.search_high;
+	else
+		icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
 	icsk->icsk_mtup.probe_size = 0;
 	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 }
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a2a796c..0a60deb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1349,10 +1349,13 @@  void tcp_mtup_init(struct sock *sk)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct net *net = sock_net(sk);
 
-	icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
+	icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing;
 	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
 			       icsk->icsk_af_ops->net_header_len;
 	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
+
+	icsk->icsk_mtup.search_high_sav = icsk->icsk_mtup.search_high;
+	icsk->icsk_mtup.search_low_sav = icsk->icsk_mtup.search_low;
 	icsk->icsk_mtup.probe_size = 0;
 }
 EXPORT_SYMBOL(tcp_mtup_init);
@@ -1854,9 +1857,12 @@  static int tcp_mtu_probe(struct sock *sk)
 	    tp->rx_opt.num_sacks || tp->rx_opt.dsack)
 		return -1;
 
-	/* Very simple search strategy: just double the MSS. */
+	/* Use binary search for probe_size between tcp_mss_base
+	 * and current mss_clamp.
+	 */
 	mss_now = tcp_current_mss(sk);
-	probe_size = 2 * tp->mss_cache;
+
+	probe_size = (icsk->icsk_mtup.search_high + icsk->icsk_mtup.search_low) >> 1;
 	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
 	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
 		/* TODO: set timer for probe_converge_event */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0732b78..9d1cfe0 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -113,7 +113,7 @@  static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
 			struct tcp_sock *tp = tcp_sk(sk);
 			int mss;
 
-			mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
+			mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low);
 			mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
 			mss = max(mss, 68 - tp->tcp_header_len);
 			icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);