diff mbox

[net,2/2] RDS: TCP: Synchrnozize accept() and connect() paths on t_conn_lock.

Message ID 5fc507116182afdda4b824173008e09d5d464a33.1462127059.git.sowmini.varadhan@oracle.com
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Sowmini Varadhan May 1, 2016, 11:10 p.m. UTC
An arbitration scheme for duelling SYNs is implemented as part of
commit 241b271952eb ("RDS-TCP: Reset tcp callbacks if re-using an
outgoing socket in rds_tcp_accept_one()") which ensures that both nodes
involved will arrive at the same arbitration decision. However, this
needs to be synchronized with an outgoing SYN to be generated by
rds_tcp_conn_connect(). This commit achieves the synchronization
through the t_conn_lock mutex in struct rds_tcp_connection.

The rds_conn_state is checked in rds_tcp_conn_connect() after acquiring
the t_conn_lock mutex.  A SYN is sent out only if the RDS connection is
not already UP (an UP would indicate that rds_tcp_accept_one() has
completed 3WH, so no SYN needs to be generated).

Similarly, the rds_conn_state is checked in rds_tcp_accept_one() after
acquiring the t_conn_lock mutex. The only acceptable states (to
allow continuation of the arbitration logic) are UP (i.e., outgoing SYN
was SYN-ACKed by peer after it sent us the SYN) or CONNECTING (we sent
outgoing SYN before we saw incoming SYN).

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
 net/rds/tcp.c         |    1 +
 net/rds/tcp.h         |    4 ++++
 net/rds/tcp_connect.c |    8 ++++++++
 net/rds/tcp_listen.c  |   30 ++++++++++++++++++++----------
 4 files changed, 33 insertions(+), 10 deletions(-)

Comments

Santosh Shilimkar May 2, 2016, 4:33 p.m. UTC | #1
On 5/1/2016 4:10 PM, Sowmini Varadhan wrote:
> An arbitration scheme for duelling SYNs is implemented as part of
> commit 241b271952eb ("RDS-TCP: Reset tcp callbacks if re-using an
> outgoing socket in rds_tcp_accept_one()") which ensures that both nodes
> involved will arrive at the same arbitration decision. However, this
> needs to be synchronized with an outgoing SYN to be generated by
> rds_tcp_conn_connect(). This commit achieves the synchronization
> through the t_conn_lock mutex in struct rds_tcp_connection.
>
> The rds_conn_state is checked in rds_tcp_conn_connect() after acquiring
> the t_conn_lock mutex.  A SYN is sent out only if the RDS connection is
> not already UP (an UP would indicate that rds_tcp_accept_one() has
> completed 3WH, so no SYN needs to be generated).
>
> Similarly, the rds_conn_state is checked in rds_tcp_accept_one() after
> acquiring the t_conn_lock mutex. The only acceptable states (to
> allow continuation of the arbitration logic) are UP (i.e., outgoing SYN
> was SYN-ACKed by peer after it sent us the SYN) or CONNECTING (we sent
> outgoing SYN before we saw incoming SYN).
>
> Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
> ---
>  net/rds/tcp.c         |    1 +
>  net/rds/tcp.h         |    4 ++++
>  net/rds/tcp_connect.c |    8 ++++++++
>  net/rds/tcp_listen.c  |   30 ++++++++++++++++++++----------
>  4 files changed, 33 insertions(+), 10 deletions(-)
>
[...]

> diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
> index 5cb1687..49a3fcf 100644
> --- a/net/rds/tcp_connect.c
> +++ b/net/rds/tcp_connect.c
> @@ -78,7 +78,14 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
>  	struct socket *sock = NULL;
>  	struct sockaddr_in src, dest;
>  	int ret;
> +	struct rds_tcp_connection *tc = conn->c_transport_data;
> +
> +	mutex_lock(&tc->t_conn_lock);
>
> +	if (rds_conn_up(conn)) {
> +		mutex_unlock(&tc->t_conn_lock);
> +		return 0;
> +	}
>  	ret = sock_create_kern(rds_conn_net(conn), PF_INET,
>  			       SOCK_STREAM, IPPROTO_TCP, &sock);
>  	if (ret < 0)
> @@ -120,6 +127,7 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
>  	}
>
>  out:
> +	mutex_unlock(&tc->t_conn_lock);
Just wondering whether the spin_lock() would better here considering
entry into rds_tcp_conn_connect() & rds_tcp_accept_one() might be
from softirq context. Ignore it if its not applicable.

>  	if (sock)
>  		sock_release(sock);
>  	return ret;
> diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
> index 0896187..cc8496f 100644
> --- a/net/rds/tcp_listen.c
> +++ b/net/rds/tcp_listen.c
> @@ -76,7 +76,9 @@ int rds_tcp_accept_one(struct socket *sock)
>  	struct rds_connection *conn;
>  	int ret;
>  	struct inet_sock *inet;
> -	struct rds_tcp_connection *rs_tcp;
> +	struct rds_tcp_connection *rs_tcp = NULL;
> +	int conn_state;
> +	struct sock *nsk;
>
>  	ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family,
>  			       sock->sk->sk_type, sock->sk->sk_protocol,
> @@ -116,6 +118,10 @@ int rds_tcp_accept_one(struct socket *sock)
>  	 */
>  	rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
>  	rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
Like patch 1/2, probably we can leverage return value of above.


> +	conn_state = rds_conn_state(conn);
> +	if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP)
You probably don't need the local 'conn_state' and below should work.
	if (!rds_conn_connecting(conn) && !rds_conn_up(conn))

Regards,
Santosh
Sowmini Varadhan May 2, 2016, 4:43 p.m. UTC | #2
On (05/02/16 09:33), Santosh Shilimkar wrote:
> >+	mutex_unlock(&tc->t_conn_lock);
> Just wondering whether the spin_lock() would better here considering
> entry into rds_tcp_conn_connect() & rds_tcp_accept_one() might be
> from softirq context. Ignore it if its not applicable.

It's not from softirq context (both are workqs), but I used a mutex
to follow c_cm_lock (which I considered reusing, given that it 
is only IB specific?) But spin_lock vs mutex may not be a big
differentiator here- this is really a one-time start up (corner-case)
issue in the control path.

> > 	rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
> Like patch 1/2, probably we can leverage return value of above. 
     :
> You probably don't need the local 'conn_state' and below should work.
> 	if (!rds_conn_connecting(conn) && !rds_conn_up(conn))

see explanation for comment to 1/2.


--Sowmini
Santosh Shilimkar May 2, 2016, 6:08 p.m. UTC | #3
On 5/2/2016 9:43 AM, Sowmini Varadhan wrote:
> On (05/02/16 09:33), Santosh Shilimkar wrote:
>>> +	mutex_unlock(&tc->t_conn_lock);
>> Just wondering whether the spin_lock() would better here considering
>> entry into rds_tcp_conn_connect() & rds_tcp_accept_one() might be
>> from softirq context. Ignore it if its not applicable.
>
> It's not from softirq context (both are workqs), but I used a mutex
> to follow c_cm_lock (which I considered reusing, given that it
> is only IB specific?) But spin_lock vs mutex may not be a big
> differentiator here- this is really a one-time start up (corner-case)
> issue in the control path.
>
That should be fine then.

>>> 	rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
>> Like patch 1/2, probably we can leverage return value of above.
>      :
>> You probably don't need the local 'conn_state' and below should work.
>> 	if (!rds_conn_connecting(conn) && !rds_conn_up(conn))
>
> see explanation for comment to 1/2.
>
Yep.

 > +rst_nsk:
 > +	/* rest the newly returned accept sock and bail */
s/rest/reset

With typo fixed,
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
diff mbox

Patch

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 9134544..86187da 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -216,6 +216,7 @@  static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 	if (!tc)
 		return -ENOMEM;
 
+	mutex_init(&tc->t_conn_lock);
 	tc->t_sock = NULL;
 	tc->t_tinc = NULL;
 	tc->t_tinc_hdr_rem = sizeof(struct rds_header);
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 64f873c..41c2283 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -12,6 +12,10 @@  struct rds_tcp_connection {
 
 	struct list_head	t_tcp_node;
 	struct rds_connection   *conn;
+	/* t_conn_lock synchronizes the connection establishment between
+	 * rds_tcp_accept_one and rds_tcp_conn_connect
+	 */
+	struct mutex		t_conn_lock;
 	struct socket		*t_sock;
 	void			*t_orig_write_space;
 	void			*t_orig_data_ready;
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 5cb1687..49a3fcf 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -78,7 +78,14 @@  int rds_tcp_conn_connect(struct rds_connection *conn)
 	struct socket *sock = NULL;
 	struct sockaddr_in src, dest;
 	int ret;
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+
+	mutex_lock(&tc->t_conn_lock);
 
+	if (rds_conn_up(conn)) {
+		mutex_unlock(&tc->t_conn_lock);
+		return 0;
+	}
 	ret = sock_create_kern(rds_conn_net(conn), PF_INET,
 			       SOCK_STREAM, IPPROTO_TCP, &sock);
 	if (ret < 0)
@@ -120,6 +127,7 @@  int rds_tcp_conn_connect(struct rds_connection *conn)
 	}
 
 out:
+	mutex_unlock(&tc->t_conn_lock);
 	if (sock)
 		sock_release(sock);
 	return ret;
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 0896187..cc8496f 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -76,7 +76,9 @@  int rds_tcp_accept_one(struct socket *sock)
 	struct rds_connection *conn;
 	int ret;
 	struct inet_sock *inet;
-	struct rds_tcp_connection *rs_tcp;
+	struct rds_tcp_connection *rs_tcp = NULL;
+	int conn_state;
+	struct sock *nsk;
 
 	ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family,
 			       sock->sk->sk_type, sock->sk->sk_protocol,
@@ -116,6 +118,10 @@  int rds_tcp_accept_one(struct socket *sock)
 	 */
 	rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
 	rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
+	mutex_lock(&rs_tcp->t_conn_lock);
+	conn_state = rds_conn_state(conn);
+	if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP)
+		goto rst_nsk;
 	if (rs_tcp->t_sock) {
 		/* Need to resolve a duelling SYN between peers.
 		 * We have an outstanding SYN to this peer, which may
@@ -126,14 +132,7 @@  int rds_tcp_accept_one(struct socket *sock)
 		wait_event(conn->c_waitq,
 			   !test_bit(RDS_IN_XMIT, &conn->c_flags));
 		if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) {
-			struct sock *nsk = new_sock->sk;
-
-			nsk->sk_user_data = NULL;
-			nsk->sk_prot->disconnect(nsk, 0);
-			tcp_done(nsk);
-			new_sock = NULL;
-			ret = 0;
-			goto out;
+			goto rst_nsk;
 		} else if (rs_tcp->t_sock) {
 			rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp);
 			conn->c_outgoing = 0;
@@ -143,8 +142,19 @@  int rds_tcp_accept_one(struct socket *sock)
 	rds_connect_complete(conn); /* marks RDS_CONN_UP */
 	new_sock = NULL;
 	ret = 0;
-
+	goto out;
+rst_nsk:
+	/* rest the newly returned accept sock and bail */
+	nsk = new_sock->sk;
+	rds_tcp_stats_inc(s_tcp_listen_closed_stale);
+	nsk->sk_user_data = NULL;
+	nsk->sk_prot->disconnect(nsk, 0);
+	tcp_done(nsk);
+	new_sock = NULL;
+	ret = 0;
 out:
+	if (rs_tcp)
+		mutex_unlock(&rs_tcp->t_conn_lock);
 	if (new_sock)
 		sock_release(new_sock);
 	return ret;