diff mbox series

[net-next,v3,04/19] mptcp: Handle MP_CAPABLE options for outgoing connections

Message ID 20200122005633.21229-5-cpaasch@apple.com
State Deferred, archived
Headers show
Series Multipath TCP part 2: Single subflow & RFC8684 support | expand

Commit Message

Christoph Paasch Jan. 22, 2020, 12:56 a.m. UTC
From: Peter Krystad <peter.krystad@linux.intel.com>

Add hooks to tcp_output.c to add MP_CAPABLE to an outgoing SYN request,
to capture the MP_CAPABLE in the received SYN-ACK, to add MP_CAPABLE to
the final ACK of the three-way handshake.

Use the .sk_rx_dst_set() handler in the subflow proto to capture when the
responding SYN-ACK is received and notify the MPTCP connection layer.

Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Peter Krystad <peter.krystad@linux.intel.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
---
 include/linux/tcp.h   |   3 +
 include/net/mptcp.h   |  57 +++++++++
 net/ipv4/tcp_input.c  |   6 +
 net/ipv4/tcp_output.c |  44 +++++++
 net/ipv6/tcp_ipv6.c   |   6 +
 net/mptcp/options.c   | 100 ++++++++++++++++
 net/mptcp/protocol.c  | 163 +++++++++++++++++++++----
 net/mptcp/protocol.h  |  40 ++++++-
 net/mptcp/subflow.c   | 268 +++++++++++++++++++++++++++++++++++++++++-
 9 files changed, 663 insertions(+), 24 deletions(-)

Comments

Eric Dumazet Jan. 22, 2020, 7:29 a.m. UTC | #1
On 1/21/20 4:56 PM, Christoph Paasch wrote:
> From: Peter Krystad <peter.krystad@linux.intel.com>
> 
> Add hooks to tcp_output.c to add MP_CAPABLE to an outgoing SYN request,
> to capture the MP_CAPABLE in the received SYN-ACK, to add MP_CAPABLE to
> the final ACK of the three-way handshake.
> 
> Use the .sk_rx_dst_set() handler in the subflow proto to capture when the
> responding SYN-ACK is received and notify the MPTCP connection layer.
> 
> Co-developed-by: Paolo Abeni <pabeni@redhat.com>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> Co-developed-by: Florian Westphal <fw@strlen.de>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> Signed-off-by: Peter Krystad <peter.krystad@linux.intel.com>
> Signed-off-by: Christoph Paasch <cpaasch@apple.com>

Please make sure to CC me (edumazet@google.com) on TCP patches in general,
and on the whole MPTCP series while pushing them upstream.

netdev@ has delivered only 5 patches to my gmail.com account.

Thanks.
diff mbox series

Patch

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 877947475814..e9ee06d887fa 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -137,6 +137,9 @@  struct tcp_request_sock {
 	const struct tcp_request_sock_ops *af_specific;
 	u64				snt_synack; /* first SYNACK sent time */
 	bool				tfo_listener;
+#if IS_ENABLED(CONFIG_MPTCP)
+	bool				is_mptcp;
+#endif
 	u32				txhash;
 	u32				rcv_isn;
 	u32				snt_isn;
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 3daec2ceb3ff..eabc57c3fde4 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -39,8 +39,27 @@  struct mptcp_out_options {
 
 void mptcp_init(void);
 
+static inline bool sk_is_mptcp(const struct sock *sk)
+{
+	return tcp_sk(sk)->is_mptcp;
+}
+
+static inline bool rsk_is_mptcp(const struct request_sock *req)
+{
+	return tcp_rsk(req)->is_mptcp;
+}
+
 void mptcp_parse_option(const unsigned char *ptr, int opsize,
 			struct tcp_options_received *opt_rx);
+bool mptcp_syn_options(struct sock *sk, unsigned int *size,
+		       struct mptcp_out_options *opts);
+void mptcp_rcv_synsent(struct sock *sk);
+bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
+			  struct mptcp_out_options *opts);
+bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
+			       unsigned int *size, unsigned int remaining,
+			       struct mptcp_out_options *opts);
+
 void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts);
 
 /* move the skb extension owership, with the assumption that 'to' is
@@ -89,11 +108,47 @@  static inline void mptcp_init(void)
 {
 }
 
+static inline bool sk_is_mptcp(const struct sock *sk)
+{
+	return false;
+}
+
+static inline bool rsk_is_mptcp(const struct request_sock *req)
+{
+	return false;
+}
+
 static inline void mptcp_parse_option(const unsigned char *ptr, int opsize,
 				      struct tcp_options_received *opt_rx)
 {
 }
 
+static inline bool mptcp_syn_options(struct sock *sk, unsigned int *size,
+				     struct mptcp_out_options *opts)
+{
+	return false;
+}
+
+static inline void mptcp_rcv_synsent(struct sock *sk)
+{
+}
+
+static inline bool mptcp_synack_options(const struct request_sock *req,
+					unsigned int *size,
+					struct mptcp_out_options *opts)
+{
+	return false;
+}
+
+static inline bool mptcp_established_options(struct sock *sk,
+					     struct sk_buff *skb,
+					     unsigned int *size,
+					     unsigned int remaining,
+					     struct mptcp_out_options *opts)
+{
+	return false;
+}
+
 static inline void mptcp_skb_ext_move(struct sk_buff *to,
 				      const struct sk_buff *from)
 {
@@ -107,6 +162,8 @@  static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
 
 #endif /* CONFIG_MPTCP */
 
+void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped);
+
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 int mptcpv6_init(void);
 #elif IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3458ee13e6f0..5165c8de47ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5978,6 +5978,9 @@  static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
+		if (sk_is_mptcp(sk))
+			mptcp_rcv_synsent(sk);
+
 		/* Remember, tcp_poll() does not lock socket!
 		 * Change state from SYN-SENT only after copied_seq
 		 * is initialized. */
@@ -6600,6 +6603,9 @@  int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
 	tcp_rsk(req)->af_specific = af_ops;
 	tcp_rsk(req)->ts_off = 0;
+#if IS_ENABLED(CONFIG_MPTCP)
+	tcp_rsk(req)->is_mptcp = 0;
+#endif
 
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = af_ops->mss_clamp;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b6cf87b02b01..c6797a388921 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -597,6 +597,22 @@  static void smc_set_option_cond(const struct tcp_sock *tp,
 #endif
 }
 
+static void mptcp_set_option_cond(const struct request_sock *req,
+				  struct tcp_out_options *opts,
+				  unsigned int *remaining)
+{
+	if (rsk_is_mptcp(req)) {
+		unsigned int size;
+
+		if (mptcp_synack_options(req, &size, &opts->mptcp)) {
+			if (*remaining >= size) {
+				opts->options |= OPTION_MPTCP;
+				*remaining -= size;
+			}
+		}
+	}
+}
+
 /* Compute TCP options for SYN packets. This is not the final
  * network wire format yet.
  */
@@ -666,6 +682,15 @@  static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 
 	smc_set_option(tp, opts, &remaining);
 
+	if (sk_is_mptcp(sk)) {
+		unsigned int size;
+
+		if (mptcp_syn_options(sk, &size, &opts->mptcp)) {
+			opts->options |= OPTION_MPTCP;
+			remaining -= size;
+		}
+	}
+
 	return MAX_TCP_OPTION_SPACE - remaining;
 }
 
@@ -727,6 +752,8 @@  static unsigned int tcp_synack_options(const struct sock *sk,
 		}
 	}
 
+	mptcp_set_option_cond(req, opts, &remaining);
+
 	smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
 
 	return MAX_TCP_OPTION_SPACE - remaining;
@@ -764,6 +791,23 @@  static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 		size += TCPOLEN_TSTAMP_ALIGNED;
 	}
 
+	/* MPTCP options have precedence over SACK for the limited TCP
+	 * option space because a MPTCP connection would be forced to
+	 * fall back to regular TCP if a required multipath option is
+	 * missing. SACK still gets a chance to use whatever space is
+	 * left.
+	 */
+	if (sk_is_mptcp(sk)) {
+		unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+		unsigned int opt_size = 0;
+
+		if (mptcp_established_options(sk, skb, &opt_size, remaining,
+					      &opts->mptcp)) {
+			opts->options |= OPTION_MPTCP;
+			size += opt_size;
+		}
+	}
+
 	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
 	if (unlikely(eff_sacks)) {
 		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 60068ffde1d9..33a578a3eb3a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -238,6 +238,8 @@  static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
 
 		icsk->icsk_af_ops = &ipv6_mapped;
+		if (sk_is_mptcp(sk))
+			mptcp_handle_ipv6_mapped(sk, true);
 		sk->sk_backlog_rcv = tcp_v4_do_rcv;
 #ifdef CONFIG_TCP_MD5SIG
 		tp->af_specific = &tcp_sock_ipv6_mapped_specific;
@@ -248,6 +250,8 @@  static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		if (err) {
 			icsk->icsk_ext_hdr_len = exthdrlen;
 			icsk->icsk_af_ops = &ipv6_specific;
+			if (sk_is_mptcp(sk))
+				mptcp_handle_ipv6_mapped(sk, false);
 			sk->sk_backlog_rcv = tcp_v6_do_rcv;
 #ifdef CONFIG_TCP_MD5SIG
 			tp->af_specific = &tcp_sock_ipv6_specific;
@@ -1203,6 +1207,8 @@  static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 		newnp->saddr = newsk->sk_v6_rcv_saddr;
 
 		inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
+		if (sk_is_mptcp(newsk))
+			mptcp_handle_ipv6_mapped(newsk, true);
 		newsk->sk_backlog_rcv = tcp_v4_do_rcv;
 #ifdef CONFIG_TCP_MD5SIG
 		newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index b7a31c0e5283..52ff2301b68b 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -72,14 +72,114 @@  void mptcp_parse_option(const unsigned char *ptr, int opsize,
 	}
 }
 
+void mptcp_get_options(const struct sk_buff *skb,
+		       struct tcp_options_received *opt_rx)
+{
+	const unsigned char *ptr;
+	const struct tcphdr *th = tcp_hdr(skb);
+	int length = (th->doff * 4) - sizeof(struct tcphdr);
+
+	ptr = (const unsigned char *)(th + 1);
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return;
+		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
+			length--;
+			continue;
+		default:
+			opsize = *ptr++;
+			if (opsize < 2) /* "silly options" */
+				return;
+			if (opsize > length)
+				return;	/* don't parse partial options */
+			if (opcode == TCPOPT_MPTCP)
+				mptcp_parse_option(ptr, opsize, opt_rx);
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+}
+
+bool mptcp_syn_options(struct sock *sk, unsigned int *size,
+		       struct mptcp_out_options *opts)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+	if (subflow->request_mptcp) {
+		pr_debug("local_key=%llu", subflow->local_key);
+		opts->suboptions = OPTION_MPTCP_MPC_SYN;
+		opts->sndr_key = subflow->local_key;
+		*size = TCPOLEN_MPTCP_MPC_SYN;
+		return true;
+	}
+	return false;
+}
+
+void mptcp_rcv_synsent(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	pr_debug("subflow=%p", subflow);
+	if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
+		subflow->mp_capable = 1;
+		subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
+	} else {
+		tcp_sk(sk)->is_mptcp = 0;
+	}
+}
+
+bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
+			       unsigned int *size, unsigned int remaining,
+			       struct mptcp_out_options *opts)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+	if (subflow->mp_capable && !subflow->fourth_ack) {
+		opts->suboptions = OPTION_MPTCP_MPC_ACK;
+		opts->sndr_key = subflow->local_key;
+		opts->rcvr_key = subflow->remote_key;
+		*size = TCPOLEN_MPTCP_MPC_ACK;
+		subflow->fourth_ack = 1;
+		pr_debug("subflow=%p, local_key=%llu, remote_key=%llu",
+			 subflow, subflow->local_key, subflow->remote_key);
+		return true;
+	}
+	return false;
+}
+
+bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
+			  struct mptcp_out_options *opts)
+{
+	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+
+	if (subflow_req->mp_capable) {
+		opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
+		opts->sndr_key = subflow_req->local_key;
+		*size = TCPOLEN_MPTCP_MPC_SYNACK;
+		pr_debug("subflow_req=%p, local_key=%llu",
+			 subflow_req, subflow_req->local_key);
+		return true;
+	}
+	return false;
+}
+
 void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
 {
 	if ((OPTION_MPTCP_MPC_SYN |
+	     OPTION_MPTCP_MPC_SYNACK |
 	     OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
 		u8 len;
 
 		if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
 			len = TCPOLEN_MPTCP_MPC_SYN;
+		else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
+			len = TCPOLEN_MPTCP_MPC_SYNACK;
 		else
 			len = TCPOLEN_MPTCP_MPC_ACK;
 
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 294b03a0393a..bdd58da1e4f6 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -25,12 +25,28 @@ 
  */
 static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
 {
-	if (!msk->subflow)
+	if (!msk->subflow || mptcp_subflow_ctx(msk->subflow->sk)->fourth_ack)
 		return NULL;
 
 	return msk->subflow;
 }
 
+/* if msk has a single subflow, and the mp_capable handshake is failed,
+ * return it.
+ * Otherwise returns NULL
+ */
+static struct socket *__mptcp_tcp_fallback(const struct mptcp_sock *msk)
+{
+	struct socket *ssock = __mptcp_nmpc_socket(msk);
+
+	sock_owned_by_me((const struct sock *)msk);
+
+	if (!ssock || sk_is_mptcp(ssock->sk))
+		return NULL;
+
+	return ssock;
+}
+
 static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk)
 {
 	return ((struct sock *)msk)->sk_state == TCP_CLOSE;
@@ -56,6 +72,7 @@  static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
 
 	msk->subflow = ssock;
 	subflow = mptcp_subflow_ctx(ssock->sk);
+	list_add(&subflow->node, &msk->conn_list);
 	subflow->request_mptcp = 1;
 
 set_state:
@@ -64,66 +81,169 @@  static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
 	return ssock;
 }
 
+static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
+{
+	struct mptcp_subflow_context *subflow;
+
+	sock_owned_by_me((const struct sock *)msk);
+
+	mptcp_for_each_subflow(msk, subflow) {
+		return mptcp_subflow_tcp_sock(subflow);
+	}
+
+	return NULL;
+}
+
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	struct socket *subflow = msk->subflow;
+	struct socket *ssock;
+	struct sock *ssk;
+	int ret;
 
 	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
 		return -EOPNOTSUPP;
 
-	return sock_sendmsg(subflow, msg);
+	lock_sock(sk);
+	ssock = __mptcp_tcp_fallback(msk);
+	if (ssock) {
+		pr_debug("fallback passthrough");
+		ret = sock_sendmsg(ssock, msg);
+		release_sock(sk);
+		return ret;
+	}
+
+	ssk = mptcp_subflow_get(msk);
+	if (!ssk) {
+		release_sock(sk);
+		return -ENOTCONN;
+	}
+
+	ret = sock_sendmsg(ssk->sk_socket, msg);
+
+	release_sock(sk);
+	return ret;
 }
 
 static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			 int nonblock, int flags, int *addr_len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	struct socket *subflow = msk->subflow;
+	struct socket *ssock;
+	struct sock *ssk;
+	int copied = 0;
 
 	if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
 		return -EOPNOTSUPP;
 
-	return sock_recvmsg(subflow, msg, flags);
+	lock_sock(sk);
+	ssock = __mptcp_tcp_fallback(msk);
+	if (ssock) {
+		pr_debug("fallback-read subflow=%p",
+			 mptcp_subflow_ctx(ssock->sk));
+		copied = sock_recvmsg(ssock, msg, flags);
+		release_sock(sk);
+		return copied;
+	}
+
+	ssk = mptcp_subflow_get(msk);
+	if (!ssk) {
+		release_sock(sk);
+		return -ENOTCONN;
+	}
+
+	copied = sock_recvmsg(ssk->sk_socket, msg, flags);
+
+	release_sock(sk);
+
+	return copied;
+}
+
+/* subflow sockets can be either outgoing (connect) or incoming
+ * (accept).
+ *
+ * Outgoing subflows use in-kernel sockets.
+ * Incoming subflows do not have their own 'struct socket' allocated,
+ * so we need to use tcp_close() after detaching them from the mptcp
+ * parent socket.
+ */
+static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+			      struct mptcp_subflow_context *subflow,
+			      long timeout)
+{
+	struct socket *sock = READ_ONCE(ssk->sk_socket);
+
+	list_del(&subflow->node);
+
+	if (sock && sock != sk->sk_socket) {
+		/* outgoing subflow */
+		sock_release(sock);
+	} else {
+		/* incoming subflow */
+		tcp_close(ssk, timeout);
+	}
 }
 
 static int mptcp_init_sock(struct sock *sk)
 {
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	INIT_LIST_HEAD(&msk->conn_list);
+
 	return 0;
 }
 
 static void mptcp_close(struct sock *sk, long timeout)
 {
+	struct mptcp_subflow_context *subflow, *tmp;
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	struct socket *ssock;
 
 	inet_sk_state_store(sk, TCP_CLOSE);
 
-	ssock = __mptcp_nmpc_socket(msk);
-	if (ssock) {
-		pr_debug("subflow=%p", mptcp_subflow_ctx(ssock->sk));
-		sock_release(ssock);
+	lock_sock(sk);
+
+	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+		__mptcp_close_ssk(sk, ssk, subflow, timeout);
 	}
 
-	sock_orphan(sk);
-	sock_put(sk);
+	release_sock(sk);
+	sk_common_release(sk);
 }
 
-static int mptcp_connect(struct sock *sk, struct sockaddr *saddr, int len)
+static int mptcp_get_port(struct sock *sk, unsigned short snum)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	int err;
+	struct socket *ssock;
 
-	saddr->sa_family = AF_INET;
+	ssock = __mptcp_nmpc_socket(msk);
+	pr_debug("msk=%p, subflow=%p", msk, ssock);
+	if (WARN_ON_ONCE(!ssock))
+		return -EINVAL;
 
-	pr_debug("msk=%p, subflow=%p", msk,
-		 mptcp_subflow_ctx(msk->subflow->sk));
+	return inet_csk_get_port(ssock->sk, snum);
+}
 
-	err = kernel_connect(msk->subflow, saddr, len, 0);
+void mptcp_finish_connect(struct sock *ssk)
+{
+	struct mptcp_subflow_context *subflow;
+	struct mptcp_sock *msk;
+	struct sock *sk;
 
-	sk->sk_state = TCP_ESTABLISHED;
+	subflow = mptcp_subflow_ctx(ssk);
 
-	return err;
+	if (!subflow->mp_capable)
+		return;
+
+	sk = subflow->conn;
+	msk = mptcp_sk(sk);
+
+	/* the socket is not connected yet, no msk/subflow ops can access/race
+	 * accessing the field below
+	 */
+	WRITE_ONCE(msk->remote_key, subflow->remote_key);
+	WRITE_ONCE(msk->local_key, subflow->local_key);
 }
 
 static struct proto mptcp_prot = {
@@ -132,13 +252,12 @@  static struct proto mptcp_prot = {
 	.init		= mptcp_init_sock,
 	.close		= mptcp_close,
 	.accept		= inet_csk_accept,
-	.connect	= mptcp_connect,
 	.shutdown	= tcp_shutdown,
 	.sendmsg	= mptcp_sendmsg,
 	.recvmsg	= mptcp_recvmsg,
 	.hash		= inet_hash,
 	.unhash		= inet_unhash,
-	.get_port	= inet_csk_get_port,
+	.get_port	= mptcp_get_port,
 	.obj_size	= sizeof(struct mptcp_sock),
 	.no_autobind	= true,
 };
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 543d4d5d8985..bd66e7415515 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -40,19 +40,47 @@ 
 struct mptcp_sock {
 	/* inet_connection_sock must be the first member */
 	struct inet_connection_sock sk;
+	u64		local_key;
+	u64		remote_key;
+	struct list_head conn_list;
 	struct socket	*subflow; /* outgoing connect/listener/!mp_capable */
 };
 
+#define mptcp_for_each_subflow(__msk, __subflow)			\
+	list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+
 static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
 {
 	return (struct mptcp_sock *)sk;
 }
 
+struct mptcp_subflow_request_sock {
+	struct	tcp_request_sock sk;
+	u8	mp_capable : 1,
+		mp_join : 1,
+		backup : 1;
+	u64	local_key;
+	u64	remote_key;
+};
+
+static inline struct mptcp_subflow_request_sock *
+mptcp_subflow_rsk(const struct request_sock *rsk)
+{
+	return (struct mptcp_subflow_request_sock *)rsk;
+}
+
 /* MPTCP subflow context */
 struct mptcp_subflow_context {
-	u32	request_mptcp : 1;  /* send MP_CAPABLE */
+	struct	list_head node;/* conn_list of subflows */
+	u64	local_key;
+	u64	remote_key;
+	u32	request_mptcp : 1,  /* send MP_CAPABLE */
+		mp_capable : 1,	    /* remote is MPTCP capable */
+		fourth_ack : 1,	    /* send initial DSS */
+		conn_finished : 1;
 	struct	sock *tcp_sock;	    /* tcp sk backpointer */
 	struct	sock *conn;	    /* parent mptcp_sock */
+	const	struct inet_connection_sock_af_ops *icsk_af_ops;
 	struct	rcu_head rcu;
 };
 
@@ -74,4 +102,14 @@  mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow)
 void mptcp_subflow_init(void);
 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
 
+extern const struct inet_connection_sock_af_ops ipv4_specific;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+extern const struct inet_connection_sock_af_ops ipv6_specific;
+#endif
+
+void mptcp_get_options(const struct sk_buff *skb,
+		       struct tcp_options_received *opt_rx);
+
+void mptcp_finish_connect(struct sock *sk);
+
 #endif /* __MPTCP_PROTOCOL_H */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index bf8139353653..df3192305967 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -12,9 +12,188 @@ 
 #include <net/inet_hashtables.h>
 #include <net/protocol.h>
 #include <net/tcp.h>
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+#include <net/ip6_route.h>
+#endif
 #include <net/mptcp.h>
 #include "protocol.h"
 
+static void subflow_init_req(struct request_sock *req,
+			     const struct sock *sk_listener,
+			     struct sk_buff *skb)
+{
+	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
+	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+	struct tcp_options_received rx_opt;
+
+	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
+
+	memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp));
+	mptcp_get_options(skb, &rx_opt);
+
+	subflow_req->mp_capable = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
+	 * TCP option space.
+	 */
+	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
+		return;
+#endif
+
+	if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
+		subflow_req->mp_capable = 1;
+		subflow_req->remote_key = rx_opt.mptcp.sndr_key;
+	}
+}
+
+static void subflow_v4_init_req(struct request_sock *req,
+				const struct sock *sk_listener,
+				struct sk_buff *skb)
+{
+	tcp_rsk(req)->is_mptcp = 1;
+
+	tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);
+
+	subflow_init_req(req, sk_listener, skb);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static void subflow_v6_init_req(struct request_sock *req,
+				const struct sock *sk_listener,
+				struct sk_buff *skb)
+{
+	tcp_rsk(req)->is_mptcp = 1;
+
+	tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);
+
+	subflow_init_req(req, sk_listener, skb);
+}
+#endif
+
+static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
+
+	if (subflow->conn && !subflow->conn_finished) {
+		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
+			 subflow->remote_key);
+		mptcp_finish_connect(sk);
+		subflow->conn_finished = 1;
+	}
+}
+
+static struct request_sock_ops subflow_request_sock_ops;
+static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
+
+static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+	pr_debug("subflow=%p", subflow);
+
+	/* Never answer to SYNs sent to broadcast or multicast */
+	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		goto drop;
+
+	return tcp_conn_request(&subflow_request_sock_ops,
+				&subflow_request_sock_ipv4_ops,
+				sk, skb);
+drop:
+	tcp_listendrop(sk);
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
+static struct inet_connection_sock_af_ops subflow_v6_specific;
+static struct inet_connection_sock_af_ops subflow_v6m_specific;
+
+static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+	pr_debug("subflow=%p", subflow);
+
+	if (skb->protocol == htons(ETH_P_IP))
+		return subflow_v4_conn_request(sk, skb);
+
+	if (!ipv6_unicast_destination(skb))
+		goto drop;
+
+	return tcp_conn_request(&subflow_request_sock_ops,
+				&subflow_request_sock_ipv6_ops, sk, skb);
+
+drop:
+	tcp_listendrop(sk);
+	return 0; /* don't send reset */
+}
+#endif
+
+static struct sock *subflow_syn_recv_sock(const struct sock *sk,
+					  struct sk_buff *skb,
+					  struct request_sock *req,
+					  struct dst_entry *dst,
+					  struct request_sock *req_unhash,
+					  bool *own_req)
+{
+	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
+	struct sock *child;
+
+	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
+
+	/* if the sk is MP_CAPABLE, we already received the client key */
+
+	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
+						     req_unhash, own_req);
+
+	if (child && *own_req) {
+		if (!mptcp_subflow_ctx(child)) {
+			pr_debug("Closing child socket");
+			inet_sk_set_state(child, TCP_CLOSE);
+			sock_set_flag(child, SOCK_DEAD);
+			inet_csk_destroy_sock(child);
+			child = NULL;
+		}
+	}
+
+	return child;
+}
+
+static struct inet_connection_sock_af_ops subflow_specific;
+
+static struct inet_connection_sock_af_ops *
+subflow_default_af_ops(struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	if (sk->sk_family == AF_INET6)
+		return &subflow_v6_specific;
+#endif
+	return &subflow_specific;
+}
+
+void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_connection_sock_af_ops *target;
+
+	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
+
+	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
+	         subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
+
+	if (likely(icsk->icsk_af_ops == target))
+		return;
+
+	subflow->icsk_af_ops = icsk->icsk_af_ops;
+	icsk->icsk_af_ops = target;
+#endif
+}
+
 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
 {
 	struct mptcp_subflow_context *subflow;
@@ -22,7 +201,8 @@  int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
 	struct socket *sf;
 	int err;
 
-	err = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sf);
+	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
+			       &sf);
 	if (err)
 		return err;
 
@@ -60,6 +240,7 @@  static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
 		return NULL;
 
 	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
+	INIT_LIST_HEAD(&ctx->node);
 
 	pr_debug("subflow=%p", ctx);
 
@@ -70,6 +251,7 @@  static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
 
 static int subflow_ulp_init(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct mptcp_subflow_context *ctx;
 	struct tcp_sock *tp = tcp_sk(sk);
 	int err = 0;
@@ -91,6 +273,8 @@  static int subflow_ulp_init(struct sock *sk)
 	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
 
 	tp->is_mptcp = 1;
+	ctx->icsk_af_ops = icsk->icsk_af_ops;
+	icsk->icsk_af_ops = subflow_default_af_ops(sk);
 out:
 	return err;
 }
@@ -105,15 +289,97 @@  static void subflow_ulp_release(struct sock *sk)
 	kfree_rcu(ctx, rcu);
 }
 
+static void subflow_ulp_fallback(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_ulp_ops = NULL;
+	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
+	tcp_sk(sk)->is_mptcp = 0;
+}
+
+static void subflow_ulp_clone(const struct request_sock *req,
+			      struct sock *newsk,
+			      const gfp_t priority)
+{
+	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
+	struct mptcp_subflow_context *new_ctx;
+
+	if (!subflow_req->mp_capable) {
+		subflow_ulp_fallback(newsk);
+		return;
+	}
+
+	new_ctx = subflow_create_ctx(newsk, priority);
+	if (new_ctx == NULL) {
+		subflow_ulp_fallback(newsk);
+		return;
+	}
+
+	new_ctx->conn_finished = 1;
+	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
+	new_ctx->mp_capable = 1;
+	new_ctx->fourth_ack = 1;
+	new_ctx->remote_key = subflow_req->remote_key;
+	new_ctx->local_key = subflow_req->local_key;
+}
+
 static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
 	.name		= "mptcp",
 	.owner		= THIS_MODULE,
 	.init		= subflow_ulp_init,
 	.release	= subflow_ulp_release,
+	.clone		= subflow_ulp_clone,
 };
 
+static int subflow_ops_init(struct request_sock_ops *subflow_ops)
+{
+	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
+	subflow_ops->slab_name = "request_sock_subflow";
+
+	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
+					      subflow_ops->obj_size, 0,
+					      SLAB_ACCOUNT |
+					      SLAB_TYPESAFE_BY_RCU,
+					      NULL);
+	if (!subflow_ops->slab)
+		return -ENOMEM;
+
+	return 0;
+}
+
 void mptcp_subflow_init(void)
 {
+	subflow_request_sock_ops = tcp_request_sock_ops;
+	if (subflow_ops_init(&subflow_request_sock_ops) != 0)
+		panic("MPTCP: failed to init subflow request sock ops\n");
+
+	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
+	subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;
+
+	subflow_specific = ipv4_specific;
+	subflow_specific.conn_request = subflow_v4_conn_request;
+	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
+	subflow_specific.sk_rx_dst_set = subflow_finish_connect;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
+	subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;
+
+	subflow_v6_specific = ipv6_specific;
+	subflow_v6_specific.conn_request = subflow_v6_conn_request;
+	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
+	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
+
+	subflow_v6m_specific = subflow_v6_specific;
+	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
+	subflow_v6m_specific.send_check = ipv4_specific.send_check;
+	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
+	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
+	subflow_v6m_specific.net_frag_header_len = 0;
+#endif
+
 	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
 		panic("MPTCP: failed to register subflows to ULP\n");
 }