diff mbox

[RFC] tcp: Export TCP Delayed ACK parameters to user

Message ID 1319756841-2051-1-git-send-email-dbaluta@ixiacom.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Daniel Baluta Oct. 27, 2011, 11:07 p.m. UTC
RFC2581 ($4.2) specifies when an ACK should be generated as follows:

" .. an ACK SHOULD be generated for at least every second
  full-sized segment, and MUST be generated within 500 ms
  of the arrival of the first unacknowledged packet.
"

We export the number of segments and the timeout limits
specified above, so that users can tune them according
to their needs.

Specifically:
	* /proc/sys/net/ipv4/tcp_delack_segs, represents
	the threshold for the number of segments.
	* /proc/sys/net/ipv4/tcp_delack_min, specifies
	the minimum timeout value
	* /proc/sys/net/ipv4/tcp_delack_max, specifies
	the maximum timeout value.

Signed-off-by: Daniel Baluta <dbaluta@ixiacom.com>
---
 include/net/tcp.h          |   20 +++++++++++++++++---
 net/ipv4/sysctl_net_ipv4.c |   21 +++++++++++++++++++++
 net/ipv4/tcp.c             |    5 +++--
 net/ipv4/tcp_input.c       |    7 +++++--
 net/ipv4/tcp_output.c      |    4 +++-
 5 files changed, 49 insertions(+), 8 deletions(-)

Comments

Eric Dumazet Oct. 28, 2011, 12:01 a.m. UTC | #1
Le vendredi 28 octobre 2011 à 02:07 +0300, Daniel Baluta a écrit :
> RFC2581 ($4.2) specifies when an ACK should be generated as follows:
> 
> " .. an ACK SHOULD be generated for at least every second
>   full-sized segment, and MUST be generated within 500 ms
>   of the arrival of the first unacknowledged packet.
> "
> 
> We export the number of segments and the timeout limits
> specified above, so that a user can tune them according
> to its needs.
> 

Well, this requires that the user has exclusive use of the machine :)

> Specifically:
> 	* /proc/sys/net/ipv4/tcp_delack_segs, represents
> 	the threshold for the number of segments.
> 	* /proc/sys/net/ipv4/tcp_delack_min, specifies
> 	the minimum timeout value
> 	* /proc/sys/net/ipv4/tcp_delack_max, specifies
> 	the maximum timeout value.
> 


> Signed-off-by: Daniel Baluta <dbaluta@ixiacom.com>
> ---
>  include/net/tcp.h          |   20 +++++++++++++++++---
>  net/ipv4/sysctl_net_ipv4.c |   21 +++++++++++++++++++++
>  net/ipv4/tcp.c             |    5 +++--
>  net/ipv4/tcp_input.c       |    7 +++++--
>  net/ipv4/tcp_output.c      |    4 +++-
>  5 files changed, 49 insertions(+), 8 deletions(-)
> 

Missing Documentation changes

> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index e147f42..f3b0c17 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -111,14 +111,21 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
>  				  * TIME-WAIT timer.
>  				  */
>  
> -#define TCP_DELACK_MAX	((unsigned)(HZ/5))	/* maximal time to delay before sending an ACK */
> +/* default maximum time to delay before sending an ACK */
> +#define TCP_DELACK_MAX_DEFAULT	((unsigned)(HZ/5))
> +
>  #if HZ >= 100
> -#define TCP_DELACK_MIN	((unsigned)(HZ/25))	/* minimal time to delay before sending an ACK */
> +/* default minimum time to delay before sending an ACK */
> +#define TCP_DELACK_MIN_DEFAULT	((unsigned)(HZ/25))
>  #define TCP_ATO_MIN	((unsigned)(HZ/25))
>  #else
> -#define TCP_DELACK_MIN	4U
> +#define TCP_DELACK_MIN_DEFAULT	4U
>  #define TCP_ATO_MIN	4U
>  #endif
> +
> +#define TCP_DELACK_MIN sysctl_tcp_delack_min
> +#define TCP_DELACK_MAX sysctl_tcp_delack_max

Hmm, please try to compile dccp as a module :)

You need some EXPORT_SYMBOL() definitions.

Frankly, I suggest removing TCP_DELACK_{MIN|MAX} to avoid an unnecessary
layer, and using sysctl_tcp_delack_{min|max} instead


> +
>  #define TCP_RTO_MAX	((unsigned)(120*HZ))
>  #define TCP_RTO_MIN	((unsigned)(HZ/5))
>  #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC2988bis initial RTO value	*/
> @@ -251,6 +258,9 @@ extern int sysctl_tcp_max_ssthresh;
>  extern int sysctl_tcp_cookie_size;
>  extern int sysctl_tcp_thin_linear_timeouts;
>  extern int sysctl_tcp_thin_dupack;
> +extern int sysctl_tcp_delack_segs;
> +extern int sysctl_tcp_delack_min;
> +extern int sysctl_tcp_delack_max;
>  
>  extern atomic_long_t tcp_memory_allocated;
>  extern struct percpu_counter tcp_sockets_allocated;
> @@ -1557,6 +1567,10 @@ static inline struct tcp_extend_values *tcp_xv(struct request_values *rvp)
>  {
>  	return (struct tcp_extend_values *)rvp;
>  }


> +static inline int tcp_snd_thresh(struct sock *sk)

I am not sure the name is properly chosen; it's about delack, or not?

const struct sock *sk

> +{
> +	return inet_csk(sk)->icsk_ack.rcv_mss * sysctl_tcp_delack_segs;
> +}
>  

Thanks !


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Baluta Oct. 28, 2011, 8:01 a.m. UTC | #2
On Fri, Oct 28, 2011 at 3:01 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le vendredi 28 octobre 2011 à 02:07 +0300, Daniel Baluta a écrit :
>> RFC2581 ($4.2) specifies when an ACK should be generated as follows:
>>
>> " .. an ACK SHOULD be generated for at least every second
>>   full-sized segment, and MUST be generated within 500 ms
>>   of the arrival of the first unacknowledged packet.
>> "
>>
>> We export the number of segments and the timeout limits
>> specified above, so that a user can tune them according
>> to its needs.
>>
>
> Well, this requires user has a machine exclusive use :)

So, this means that setting parameters system wide
isn't an option?

On Windows there is a global setting TcpAckFrequency [1],
which is similar to our tcp_delack_{min,max}.

On Solaris there is a global option tcp_deferred_acks_max [2],
which is similar to our tcp_delack_segs.

Thanks for your comments, I will post an updated patch asap.

Daniel.

[1] http://support.microsoft.com/kb/328890
[2] http://www.sean.de/Solaris/soltune.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Oct. 28, 2011, 8:44 a.m. UTC | #3
Le vendredi 28 octobre 2011 à 11:01 +0300, Daniel Baluta a écrit :
> On Fri, Oct 28, 2011 at 3:01 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > Le vendredi 28 octobre 2011 à 02:07 +0300, Daniel Baluta a écrit :
> >> RFC2581 ($4.2) specifies when an ACK should be generated as follows:
> >>
> >> " .. an ACK SHOULD be generated for at least every second
> >>   full-sized segment, and MUST be generated within 500 ms
> >>   of the arrival of the first unacknowledged packet.
> >> "
> >>
> >> We export the number of segments and the timeout limits
> >> specified above, so that a user can tune them according
> >> to its needs.
> >>
> >
> > Well, this requires user has a machine exclusive use :)
> 
> So, this means that setting parameters system wide
> isn't an option?
> 

It is a first step, but we can notice a global setting might please one
application but negatively impact other applications.

I guess some users will want a per socket option, but this can come
later. Another idea to save space on socket structures would be to
select two sets of values depending on TOS/TCLASS.

I can imagine ssh (lowdelay) and scp (throughput) wanting different
behavior here.

> On Windows there is a global setting TcpAckFrequency [1],
> which is similar with our tcp_delack_{min,max}.
> 
> On Solaris there is a global option tcp_deferred_acks_max [2],
> which is similar with our tcp_delack_segs.
> 

and also has tcp_deferred_ack_interval

> Thanks for your comments, I will post an updated patch asap.
> 
> Daniel.
> 
> [1] http://support.microsoft.com/kb/328890
> [2] http://www.sean.de/Solaris/soltune.html

Don't forget to CC Andy Lutomirski <luto@amacapital.net>, he might be
interested in being part of the process.

Thanks


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rick Jones Oct. 28, 2011, 4:38 p.m. UTC | #4
>> On Solaris there is a global option tcp_deferred_acks_max [2],
>> which is similar with our tcp_delack_segs.
>>
>
> and also has tcp_deferred_ack_interval

And those have similar settings in HP-UX 11.X.

For the sake of completeness, the ACK avoidance heuristic in HP-UX, and 
I presume Solaris (as they share a common "Mentat" heritage) includes a 
mechanism to reduce the per-connection effective number of segments per 
ACKnowledgement.  I believe this is done to handle cases where the 
sender may have reduced her cwnd.  That would have deployment going back 
to 1997 in the case of HP-UX 11.0, and presumably a few years before 
that in the case of Solaris.  That mechanism in their ACK avoidance 
heuristics may be the reason neither have gone so far as to make the 
settings per-route or per-connection (though I could be wrong).  I 
believe that Solaris does though have two deferred ACK limits - one for 
perceived to be local connections and one (lower) for perceived to be 
remote connections.

There can be "fun" interactions with senders which increase cwnd per ACK 
rather than per bytes ACKed.

Still, I myself am somewhat fond of ACK avoidance heuristics.

rick jones

PS - when discussing the performance benefits of an ACK avoidance 
heuristic, feel free to use netperf and service demand numbers :)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e147f42..f3b0c17 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -111,14 +111,21 @@  extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 				  * TIME-WAIT timer.
 				  */
 
-#define TCP_DELACK_MAX	((unsigned)(HZ/5))	/* maximal time to delay before sending an ACK */
+/* default maximum time to delay before sending an ACK */
+#define TCP_DELACK_MAX_DEFAULT	((unsigned)(HZ/5))
+
 #if HZ >= 100
-#define TCP_DELACK_MIN	((unsigned)(HZ/25))	/* minimal time to delay before sending an ACK */
+/* default minimum time to delay before sending an ACK */
+#define TCP_DELACK_MIN_DEFAULT	((unsigned)(HZ/25))
 #define TCP_ATO_MIN	((unsigned)(HZ/25))
 #else
-#define TCP_DELACK_MIN	4U
+#define TCP_DELACK_MIN_DEFAULT	4U
 #define TCP_ATO_MIN	4U
 #endif
+
+#define TCP_DELACK_MIN sysctl_tcp_delack_min
+#define TCP_DELACK_MAX sysctl_tcp_delack_max
+
 #define TCP_RTO_MAX	((unsigned)(120*HZ))
 #define TCP_RTO_MIN	((unsigned)(HZ/5))
 #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC2988bis initial RTO value	*/
@@ -251,6 +258,9 @@  extern int sysctl_tcp_max_ssthresh;
 extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
+extern int sysctl_tcp_delack_segs;
+extern int sysctl_tcp_delack_min;
+extern int sysctl_tcp_delack_max;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -1557,6 +1567,10 @@  static inline struct tcp_extend_values *tcp_xv(struct request_values *rvp)
 {
 	return (struct tcp_extend_values *)rvp;
 }
+static inline int tcp_snd_thresh(struct sock *sk)
+{
+	return inet_csk(sk)->icsk_ack.rcv_mss * sysctl_tcp_delack_segs;
+}
 
 extern void tcp_v4_init(void);
 extern void tcp_init(void);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 69fd720..c22c4c5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -639,6 +639,27 @@  static struct ctl_table ipv4_table[] = {
 		.proc_handler   = proc_dointvec
 	},
 	{
+		.procname	= "tcp_delack_segs",
+		.data		= &sysctl_tcp_delack_segs,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_delack_min",
+		.data		= &sysctl_tcp_delack_min,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies
+	},
+	{
+		.procname	= "tcp_delack_max",
+		.data		= &sysctl_tcp_delack_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies
+	},
+	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 34f5db1..0aad29b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1204,8 +1204,9 @@  void tcp_cleanup_rbuf(struct sock *sk, int copied)
 		   /* Delayed ACKs frequently hit locked sockets during bulk
 		    * receive. */
 		if (icsk->icsk_ack.blocked ||
-		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
-		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
+		    /* More than once-per-tcp_delack_segs-segments ACK
+		     * was not sent by tcp_input.c */
+		    tp->rcv_nxt - tp->rcv_wup > tcp_snd_thresh(sk) ||
 		    /*
 		     * If this read emptied read buffer, we send ACK, if
 		     * connection is not bidirectional, user drained
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 52b5c2d..1e02a80 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -98,6 +98,9 @@  int sysctl_tcp_thin_dupack __read_mostly;
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_abc __read_mostly;
 
+int sysctl_tcp_delack_segs __read_mostly = 1;
+
+
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
 #define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
@@ -4993,8 +4996,8 @@  static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	    /* More than one full frame received... */
-	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+	    /* More than tcp_delack_segs full frame(s) received... */
+	if (((tp->rcv_nxt - tp->rcv_wup) > tcp_snd_thresh(sk) &&
 	     /* ... and right edge of window advances far enough.
 	      * (tcp_recvmsg() will send ACK otherwise). Or...
 	      */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 980b98f..0ec31af 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -63,6 +63,8 @@  int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+int sysctl_tcp_delack_min __read_mostly = TCP_DELACK_MIN_DEFAULT;
+int sysctl_tcp_delack_max __read_mostly = TCP_DELACK_MAX_DEFAULT;
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -2685,7 +2687,7 @@  void tcp_send_delayed_ack(struct sock *sk)
 		 * directly.
 		 */
 		if (tp->srtt) {
-			int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
+			int rtt = max_t(unsigned, tp->srtt >> 3, TCP_DELACK_MIN);
 
 			if (rtt < max_ato)
 				max_ato = rtt;