diff mbox

[RFC,net-next,1/2] tcp: Add DPIFL thin stream detection mechanism

Message ID 1445633413-3532-2-git-send-email-bro.devel+kernel@gmail.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Bendik Rønning Opstad Oct. 23, 2015, 8:50 p.m. UTC
The existing mechanism for detecting thin streams (tcp_stream_is_thin)
is based on a static limit of less than 4 packets in flight. This treats
streams differently depending on the connection's RTT, such that a stream
on a high RTT link may never be considered thin, whereas the same
application would produce a stream that would always be thin in a low RTT
scenario (e.g. data center).

By calculating a dynamic packets in flight limit (DPIFL), the thin stream
detection will be independent of the RTT and treat streams equally based
on the transmission pattern, i.e. the inter-transmission time (ITT).

Cc: Andreas Petlund <apetlund@simula.no>
Cc: Carsten Griwodz <griff@simula.no>
Cc: Pål Halvorsen <paalh@simula.no>
Cc: Jonas Markussen <jonassm@ifi.uio.no>
Cc: Kristian Evensen <kristian.evensen@gmail.com>
Cc: Kenneth Klette Jonassen <kennetkl@ifi.uio.no>
Signed-off-by: Bendik Rønning Opstad <bro.devel+kernel@gmail.com>
---
 Documentation/networking/ip-sysctl.txt |  8 ++++++++
 include/linux/tcp.h                    |  6 ++++++
 include/net/tcp.h                      | 20 ++++++++++++++++++++
 net/ipv4/sysctl_net_ipv4.c             |  9 +++++++++
 net/ipv4/tcp.c                         |  3 +++
 5 files changed, 46 insertions(+)

Comments

Eric Dumazet Oct. 23, 2015, 9:44 p.m. UTC | #1
On Fri, 2015-10-23 at 22:50 +0200, Bendik Rønning Opstad wrote:

>  
> +/**
> + * tcp_stream_is_thin_dpifl() - Tests if the stream is thin based on dynamic PIF
> + *                              limit
> + * @tp: the tcp_sock struct
> + *
> + * Return: true if current packets in flight (PIF) count is lower than
> + *         the dynamic PIF limit, else false
> + */
> +static inline bool tcp_stream_is_thin_dpifl(const struct tcp_sock *tp)
> +{
> +	u64 dpif_lim = tp->srtt_us >> 3;
> +	/* Div by is_thin_min_itt_lim, the minimum allowed ITT
> +	 * (Inter-transmission time) in usecs.
> +	 */
> +	do_div(dpif_lim, tp->thin_dpifl_itt_lower_bound);
> +	return tcp_packets_in_flight(tp) < dpif_lim;
> +}
> +
This is very strange :

You are using a do_div() while both operands are 32bits.  A regular
divide would be ok :

u32 dpif_lim = (tp->srtt_us >> 3) / tp->thin_dpifl_itt_lower_bound;

But then, you can avoid the divide by using a multiply, less expensive :

return	(u64)tcp_packets_in_flight(tp) * tp->thin_dpifl_itt_lower_bound <
	(tp->srtt_us >> 3);


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bendik Rønning Opstad Oct. 25, 2015, 5:56 a.m. UTC | #2
On Friday, October 23, 2015 02:44:14 PM Eric Dumazet wrote:
> On Fri, 2015-10-23 at 22:50 +0200, Bendik Rønning Opstad wrote:
> 
> >  
> > +/**
> > + * tcp_stream_is_thin_dpifl() - Tests if the stream is thin based on dynamic PIF
> > + *                              limit
> > + * @tp: the tcp_sock struct
> > + *
> > + * Return: true if current packets in flight (PIF) count is lower than
> > + *         the dynamic PIF limit, else false
> > + */
> > +static inline bool tcp_stream_is_thin_dpifl(const struct tcp_sock *tp)
> > +{
> > +	u64 dpif_lim = tp->srtt_us >> 3;
> > +	/* Div by is_thin_min_itt_lim, the minimum allowed ITT
> > +	 * (Inter-transmission time) in usecs.
> > +	 */
> > +	do_div(dpif_lim, tp->thin_dpifl_itt_lower_bound);
> > +	return tcp_packets_in_flight(tp) < dpif_lim;
> > +}
> > +
> This is very strange :
> 
> You are using a do_div() while both operands are 32bits.  A regular
> divide would be ok :
> 
> u32 dpif_lim = (tp->srtt_us >> 3) / tp->thin_dpifl_itt_lower_bound;
> 
> But then, you can avoid the divide by using a multiply, less expensive :
> 
> return	(u64)tcp_packets_in_flight(tp) * tp->thin_dpifl_itt_lower_bound <
> 	(tp->srtt_us >> 3);
> 

You are of course correct. Will fix this and use multiply. Thanks.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 85752c8..b841a76 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -700,6 +700,14 @@  tcp_thin_dupack - BOOLEAN
 	Documentation/networking/tcp-thin.txt
 	Default: 0
 
+tcp_thin_dpifl_itt_lower_bound - INTEGER
+	Controls the lower bound for ITT (inter-transmission time) threshold
+	for when a stream is considered thin. The value is specified in
+	microseconds, and may not be lower than 10000 (10 ms). This threshold
+	is used to calculate a dynamic packets in flight limit (DPIFL) which
+	is used to classify whether a stream is thin.
+	Default: 10000
+
 tcp_limit_output_bytes - INTEGER
 	Controls TCP Small Queue limit per tcp socket.
 	TCP bulk sender tends to increase packets in flight until it
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index c906f45..fc885db 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -269,6 +269,12 @@  struct tcp_sock {
 	struct sk_buff* lost_skb_hint;
 	struct sk_buff *retransmit_skb_hint;
 
+	/* The limit used to identify when a stream is thin, based on a minimum
+	 * allowed inter-transmission time (ITT) in microseconds. This is used
+	 * to dynamically calculate a max packets in flight limit (DPIFL).
+	 */
+	int thin_dpifl_itt_lower_bound;
+
 	/* OOO segments go in this list. Note that socket lock must be held,
 	 * as we do not use sk_buff_head lock.
 	 */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4fc457b..6534836 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -215,6 +215,7 @@  void tcp_time_wait(struct sock *sk, int state, int timeo);
 
 /* TCP thin-stream limits */
 #define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */
+#define TCP_THIN_DPIFL_ITT_LOWER_BOUND_MIN 10000  /* Minimum lower bound is 10 ms (10000 usec) */
 
 /* TCP initial congestion window as per draft-hkchu-tcpm-initcwnd-01 */
 #define TCP_INIT_CWND		10
@@ -274,6 +275,7 @@  extern int sysctl_tcp_workaround_signed_windows;
 extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
+extern int sysctl_tcp_thin_dpifl_itt_lower_bound;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
@@ -1631,6 +1633,24 @@  static inline bool tcp_stream_is_thin(struct tcp_sock *tp)
 	return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp);
 }
 
+/**
+ * tcp_stream_is_thin_dpifl() - Tests if the stream is thin based on dynamic PIF
+ *                              limit
+ * @tp: the tcp_sock struct
+ *
+ * Return: true if current packets in flight (PIF) count is lower than
+ *         the dynamic PIF limit, else false
+ */
+static inline bool tcp_stream_is_thin_dpifl(const struct tcp_sock *tp)
+{
+	/* Thin when PIF * ITT lower bound < (srtt_us >> 3).
+	 * thin_dpifl_itt_lower_bound is the minimum allowed ITT
+	 * (inter-transmission time) in usecs; the u64 multiply avoids a
+	 * divide and 32-bit overflow.
+	 */
+	return (u64)tcp_packets_in_flight(tp) * tp->thin_dpifl_itt_lower_bound <
+	       (tp->srtt_us >> 3);
+}
+
 /* /proc */
 enum tcp_seq_states {
 	TCP_SEQ_STATE_LISTENING,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 25300c5..917fdde 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -42,6 +42,7 @@  static int tcp_syn_retries_min = 1;
 static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
 static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+static int tcp_thin_dpifl_itt_lower_bound_min = TCP_THIN_DPIFL_ITT_LOWER_BOUND_MIN;
 
 /* Update system visible IP port range */
 static void set_local_port_range(struct net *net, int range[2])
@@ -709,6 +710,14 @@  static struct ctl_table ipv4_table[] = {
 		.proc_handler   = proc_dointvec
 	},
 	{
+		.procname	= "tcp_thin_dpifl_itt_lower_bound",
+		.data		= &sysctl_tcp_thin_dpifl_itt_lower_bound,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &tcp_thin_dpifl_itt_lower_bound_min,
+	},
+	{
 		.procname	= "tcp_early_retrans",
 		.data		= &sysctl_tcp_early_retrans,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0cfa7c0..f712d7c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -287,6 +287,8 @@  int sysctl_tcp_min_tso_segs __read_mostly = 2;
 
 int sysctl_tcp_autocorking __read_mostly = 1;
 
+int sysctl_tcp_thin_dpifl_itt_lower_bound __read_mostly = TCP_THIN_DPIFL_ITT_LOWER_BOUND_MIN;
+
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
@@ -406,6 +408,7 @@  void tcp_init_sock(struct sock *sk)
 	u64_stats_init(&tp->syncp);
 
 	tp->reordering = sysctl_tcp_reordering;
+	tp->thin_dpifl_itt_lower_bound = sysctl_tcp_thin_dpifl_itt_lower_bound;
 	tcp_enable_early_retrans(tp);
 	tcp_assign_congestion_control(sk);