Message ID | 20201105170126.5627-4-fw@strlen.de |
---|---|
State | Superseded, archived |
Delegated to: | Florian Westphal |
Headers | show |
Series | mptcp: add reset and fastclose option support | expand |
On Thu, 5 Nov 2020, Florian Westphal wrote: > The MPTCP reset option allows to carry a mptcp-specific error > code that provides more information on the nature of a connection > reset. > > The Reset option data received gets stored in the mptcp skb > extension structure so it can be consumed by e.g. path management. > > When a subflow is closed, the desired error code that should be sent > to the peer is placed in the subflow context structure. > > If close happens before a suitable tcp socket has been created > (for example, when HMAC fails validation), the reset code can be placed > in the mptcp skb extension which then gets added to the TCP reset skb. > > Signed-off-by: Florian Westphal <fw@strlen.de> > --- > include/net/mptcp.h | 6 ++++-- > include/net/tcp.h | 3 +++ > net/ipv4/tcp_ipv4.c | 21 ++++++++++++++++++++- > net/ipv6/tcp_ipv6.c | 19 +++++++++++++++++++ > net/mptcp/options.c | 42 +++++++++++++++++++++++++++++++++++++----- > net/mptcp/protocol.c | 12 +++++++++--- > net/mptcp/protocol.h | 18 ++++++++++++++++++ > net/mptcp/subflow.c | 27 ++++++++++++++++++++++++--- > 8 files changed, 134 insertions(+), 14 deletions(-) > > diff --git a/include/net/mptcp.h b/include/net/mptcp.h > index 3d57607982fa..0aed06330a25 100644 > --- a/include/net/mptcp.h > +++ b/include/net/mptcp.h > @@ -30,8 +30,8 @@ struct mptcp_ext { > ack64:1, > mpc_map:1, > frozen:1, > - __unused:1; > - /* one byte hole */ > + reset_transient:1; > + u8 reset_reason:4; > }; > > struct mptcp_out_options { > @@ -50,6 +50,8 @@ struct mptcp_out_options { > u8 rm_id; > u8 join_id; > u8 backup; > + u8 reset_reason:4; > + u8 reset_transient:1; > u32 nonce; > u64 thmac; > u32 token; > diff --git a/include/net/tcp.h b/include/net/tcp.h > index c0fef9e9ba20..899f87346b49 100644 > --- a/include/net/tcp.h > +++ b/include/net/tcp.h > @@ -193,6 +193,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); > #define TCPOPT_FASTOPEN_MAGIC 0xF989 > #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 > > +/* MPTCP suboptions used in TCP */ > +#define MPTCPOPT_RST 8 > /* > * TCP option lengths > */ > @@ -216,6 +218,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); > #define TCPOLEN_MD5SIG_ALIGNED 20 > #define TCPOLEN_MSS_ALIGNED 4 > #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 > +#define TCPOLEN_MPTCP_RST 4 > > /* Flags in tp->nonagle */ > #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c > index 7352c097ae48..c96aea5514c6 100644 > --- a/net/ipv4/tcp_ipv4.c > +++ b/net/ipv4/tcp_ipv4.c > @@ -660,9 +660,11 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) > const struct tcphdr *th = tcp_hdr(skb); > struct { > struct tcphdr th; > + __be32 opt[(TCPOLEN_MPTCP_RST >> 2) Works for me - doesn't seem worthwhile to ifdef out one 32-bit value on the stack if MPTCP is not enabled, and having to deal with all the preprocessor combinations of MD5SIG and MPTCP. > #ifdef CONFIG_TCP_MD5SIG > - __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; > + + (TCPOLEN_MD5SIG_ALIGNED >> 2) > #endif > + ]; > } rep; > struct ip_reply_arg arg; > #ifdef CONFIG_TCP_MD5SIG > @@ -770,6 +772,23 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) > ip_hdr(skb)->daddr, &rep.th); > } > #endif > + /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ If they can't coexist, do we need space for both? (And, yes, I know I'm asking this after saying the single 32-bit value was not a big deal above) > + if (sk && sk_fullsock(sk) && sk_is_mptcp(sk) && rep.opt[0] == 0) { > + const struct mptcp_ext *ext = mptcp_get_ext(skb); > + u8 flags = 0, reason = 0; > + > + if (ext) { > + flags = ext->reset_transient; > + reason = ext->reset_reason; > + } > + > + rep.opt[0] = mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, > + flags, reason); > + > + arg.iov[0].iov_len += TCPOLEN_MPTCP_RST; > + rep.th.doff = arg.iov[0].iov_len / 4; > + } > + > arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, > ip_hdr(skb)->saddr, /* XXX */ > arg.iov[0].iov_len, IPPROTO_TCP, 0); > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c > index 8db59f4e5f13..cfe8d6b4c34c 100644 > --- a/net/ipv6/tcp_ipv6.c > +++ b/net/ipv6/tcp_ipv6.c > @@ -868,6 +868,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 > struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); > struct sock *ctl_sk = net->ipv6.tcp_sk; > unsigned int tot_len = sizeof(struct tcphdr); > + bool mptcp_reset = false; > struct dst_entry *dst; > __be32 *topt; > __u32 mark = 0; > @@ -879,6 +880,11 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 > tot_len += TCPOLEN_MD5SIG_ALIGNED; > #endif > > + if (rst && sk && sk_fullsock(sk) && sk_is_mptcp(sk) && !key) { > + tot_len += TCPOLEN_MPTCP_RST; > + mptcp_reset = true; > + } > + > buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, > GFP_ATOMIC); > if (!buff) > @@ -909,6 +915,19 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 > *topt++ = htonl(tsecr); > } > > + if (mptcp_reset) { > + const struct mptcp_ext *ext = mptcp_get_ext(skb); > + u8 flags = 0, reason = 0; > + > + if (ext) { > + flags = ext->reset_transient; > + reason = ext->reset_reason; > + } > + > + *topt++ = mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, > + flags, reason); > + } > + > #ifdef CONFIG_TCP_MD5SIG > if (key) { > *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | > diff --git a/net/mptcp/options.c b/net/mptcp/options.c > index 248e3930c0cb..785a9f4e7da8 100644 > --- a/net/mptcp/options.c > +++ b/net/mptcp/options.c > @@ -281,7 +281,17 @@ static void mptcp_parse_option(const struct sk_buff *skb, > mp_opt->rm_id = *ptr++; > pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); > break; > + case MPTCPOPT_RST: > + if (opsize != TCPOLEN_MPTCP_RST) > + break; > > + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) > + break; > + mp_opt->reset = 1; > + flags = *ptr++; > + mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; > + mp_opt->reset_reason = *ptr; > + break; > default: > break; > } > @@ -302,6 +312,7 @@ void mptcp_get_options(const struct sk_buff *skb, > mp_opt->port = 0; > mp_opt->rm_addr = 0; > mp_opt->dss = 0; > + mp_opt->reset = 0; > > length = (th->doff * 4) - sizeof(struct tcphdr); > ptr = (const unsigned char *)(th + 1); > @@ -660,6 +671,22 @@ static bool mptcp_established_options_rm_addr(struct sock *sk, > return true; > } > > +static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, I'm assuming this is 'noinline' to keep mptcp_established_options() code size slightly smaller in the cache? I'm ok with that, of course, but curious if you think (or have data) that further code size reduction on that fast path would be worthwhile. Thanks, Mat > + unsigned int *size, > + unsigned int remaining, > + struct mptcp_out_options *opts) > +{ > + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); > + > + if (remaining < TCPOLEN_MPTCP_RST) > + return; > + > + *size = TCPOLEN_MPTCP_RST; > + opts->suboptions |= OPTION_MPTCP_RST; > + opts->reset_transient = subflow->reset_transient; > + opts->reset_reason = subflow->reset_reason; > +} > + > bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, > unsigned int *size, unsigned int remaining, > struct mptcp_out_options *opts) > @@ -672,11 +699,10 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, > if (unlikely(mptcp_check_fallback(sk))) > return false; > > - /* prevent adding of any MPTCP related options on reset packet > - * until we support MP_TCPRST/MP_FASTCLOSE > - */ > - if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) > - return false; > + if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { > + mptcp_established_options_rst(sk, skb, size, remaining, opts); > + return true; > + } > > if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) > ret = true; > @@ -1137,6 +1163,12 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, > ptr += 5; > } > > + if (OPTION_MPTCP_RST & opts->suboptions) > + *ptr++ = mptcp_option(MPTCPOPT_RST, > + TCPOLEN_MPTCP_RST, > + opts->reset_transient, > + opts->reset_reason); > + > if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { > struct mptcp_ext *mpext = &opts->ext_copy; > u8 len = TCPOLEN_MPTCP_DSS_BASE; > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c > index a6bd06c724d5..71e556540161 100644 > --- a/net/mptcp/protocol.c > +++ b/net/mptcp/protocol.c > @@ -2608,14 +2608,18 @@ bool mptcp_finish_join(struct sock *ssk) > pr_debug("msk=%p, subflow=%p", msk, subflow); > > /* mptcp socket already closing? */ > - if (!mptcp_is_fully_established(parent)) > + if (!mptcp_is_fully_established(parent)) { > + subflow->reset_reason = MPTCP_RST_EMPTCP; > return false; > + } > > if (!msk->pm.server_side) > return true; > > - if (!mptcp_pm_allow_new_subflow(msk)) > + if (!mptcp_pm_allow_new_subflow(msk)) { > + subflow->reset_reason = MPTCP_RST_EPROHIBIT; > return false; > + } > > /* active connections are already on conn_list, and we can't acquire > * msk lock here. > @@ -2629,8 +2633,10 @@ bool mptcp_finish_join(struct sock *ssk) > sock_hold(ssk); > } > spin_unlock_bh(&msk->join_list_lock); > - if (!ret) > + if (!ret) { > + subflow->reset_reason = MPTCP_RST_EPROHIBIT; > return false; > + } > > /* attach to msk socket only after we are sure he will deal with us > * at close time > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h > index 66bd4d096753..8a247e50d326 100644 > --- a/net/mptcp/protocol.h > +++ b/net/mptcp/protocol.h > @@ -23,6 +23,7 @@ > #define OPTION_MPTCP_ADD_ADDR BIT(6) > #define OPTION_MPTCP_ADD_ADDR6 BIT(7) > #define OPTION_MPTCP_RM_ADDR BIT(8) > +#define OPTION_MPTCP_RST BIT(9) > > /* MPTCP option subtypes */ > #define MPTCPOPT_MP_CAPABLE 0 > @@ -84,6 +85,18 @@ > #define MPTCP_ADDR_IPVERSION_4 4 > #define MPTCP_ADDR_IPVERSION_6 6 > > +/* MPTCP TCPRST flags */ > +#define MPTCP_RST_TRANSIENT BIT(0) > + > +/* MPTCP Reset reason codes, rfc8684 */ > +#define MPTCP_RST_EUNSPEC 0 > +#define MPTCP_RST_EMPTCP 1 > +#define MPTCP_RST_ERESOURCE 2 > +#define MPTCP_RST_EPROHIBIT 3 > +#define MPTCP_RST_EWQ2BIG 4 > +#define MPTCP_RST_EBADPERF 5 > +#define MPTCP_RST_EMIDDLEBOX 6 > + > /* MPTCP socket flags */ > #define MPTCP_DATA_READY 0 > #define MPTCP_NOSPACE 1 > @@ -109,6 +122,7 @@ struct mptcp_options_received { > u16 data_len; > u16 mp_capable : 1, > mp_join : 1, > + reset : 1, > dss : 1, > add_addr : 1, > rm_addr : 1, > @@ -129,6 +143,8 @@ struct mptcp_options_received { > __unused:2; > u8 addr_id; > u8 rm_id; > + u8 reset_reason:4; > + u8 reset_transient:1; > union { > struct in_addr addr; > #if IS_ENABLED(CONFIG_MPTCP_IPV6) > @@ -365,6 +381,8 @@ struct mptcp_subflow_context { > u8 hmac[MPTCPOPT_HMAC_LEN]; > u8 local_id; > u8 remote_id; > + u8 reset_transient:1; > + u8 reset_reason:4; > > struct sock *tcp_sock; /* tcp sk backpointer */ > struct sock *conn; /* parent mptcp_sock */ > diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c > index 1e9a72af67dc..e0da6712a5c3 100644 > --- a/net/mptcp/subflow.c > +++ b/net/mptcp/subflow.c > @@ -325,8 +325,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) > } else if (subflow->request_join) { > u8 hmac[SHA256_DIGEST_SIZE]; > > - if (!mp_opt.mp_join) > + if (!mp_opt.mp_join) { > + subflow->reset_reason = MPTCP_RST_EMPTCP; > goto do_reset; > + } > > subflow->thmac = mp_opt.thmac; > subflow->remote_nonce = mp_opt.nonce; > @@ -335,6 +337,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) > > if (!subflow_thmac_valid(subflow)) { > MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); > + subflow->reset_reason = MPTCP_RST_EMPTCP; > goto do_reset; > } > > @@ -356,6 +359,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) > return; > > do_reset: > + subflow->reset_transient = 0; > mptcp_subflow_reset(sk); > } > > @@ -505,6 +509,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, > struct mptcp_options_received mp_opt; > bool fallback, fallback_is_fatal; > struct sock *new_msk = NULL; > + struct mptcp_ext *mpext; > struct sock *child; > > pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); > @@ -565,8 +570,15 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, > * to reset the context to non MPTCP status. > */ > if (!ctx || fallback) { > - if (fallback_is_fatal) > + if (fallback_is_fatal) { > + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); > + if (mpext) { > + memset(mpext, 0, sizeof(*mpext)); > + mpext->reset_reason = MPTCP_RST_EMPTCP; > + } > + > goto dispose_child; > + } > > subflow_drop_ctx(child); > goto out; > @@ -600,8 +612,15 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, > struct mptcp_sock *owner; > > owner = subflow_req->msk; > - if (!owner) > + if (!owner) { > + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); > + if (mpext) { > + memset(mpext, 0, sizeof(*mpext)); > + mpext->reset_reason = MPTCP_RST_EPROHIBIT; > + } > + > goto dispose_child; > + } > > /* move the msk reference ownership to the subflow */ > subflow_req->msk = NULL; > @@ -936,6 +955,8 @@ static bool subflow_check_data_avail(struct sock *ssk) > smp_wmb(); > ssk->sk_error_report(ssk); > tcp_set_state(ssk, TCP_CLOSE); > + subflow->reset_transient = 0; > + subflow->reset_reason = MPTCP_RST_EMPTCP; > tcp_send_active_reset(ssk, GFP_ATOMIC); > subflow->data_avail = 0; > return false; > -- > 2.26.2 -- Mat Martineau Intel
Mat Martineau <mathew.j.martineau@linux.intel.com> wrote: > > #ifdef CONFIG_TCP_MD5SIG > > - __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; > > + + (TCPOLEN_MD5SIG_ALIGNED >> 2) > > #endif > > + ]; > > } rep; > > struct ip_reply_arg arg; > > #ifdef CONFIG_TCP_MD5SIG > > @@ -770,6 +772,23 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) > > ip_hdr(skb)->daddr, &rep.th); > > } > > #endif > > + /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ > > If they can't coexist, do we need space for both? (And, yes, I know I'm > asking this after saying the single 32-bit value was not a big deal above) Mhhh, that comment and .. > > + if (sk && sk_fullsock(sk) && sk_is_mptcp(sk) && rep.opt[0] == 0) { the rep.opt[0] test can be removed. I do not see how we can have sk_is_mptcp() evaluate to true while md5sig is enabled at same time. Wrt. space -- right, we do not need both, so maybe add #ifdef CONFIG_TCP_MD5SIG # define TCP_RST_OPTLEN_WORDS (TCPOLEN_MD5SIG_ALIGNED >> 2) #else # define TCP_RST_OPTLEN_WORDS (TCPOLEN_MPTCP_RST >> 2) #endif and use that instead for opt[] size. > > +static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, > > I'm assuming this is 'noinline' to keep mptcp_established_options() code > size slightly smaller in the cache? Yes, the idea was that this code path rarely deals with tcp reset packets so that there is no need for inlining. > I'm ok with that, of course, but curious > if you think (or have data) that further code size reduction on that fast > path would be worthwhile. No, I do not have data for that (and I would not mind removing the noinline). As for making more size reductions, I do not think it makes sense to look into such micro optimizations at this time.
diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 3d57607982fa..0aed06330a25 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -30,8 +30,8 @@ struct mptcp_ext { ack64:1, mpc_map:1, frozen:1, - __unused:1; - /* one byte hole */ + reset_transient:1; + u8 reset_reason:4; }; struct mptcp_out_options { @@ -50,6 +50,8 @@ struct mptcp_out_options { u8 rm_id; u8 join_id; u8 backup; + u8 reset_reason:4; + u8 reset_transient:1; u32 nonce; u64 thmac; u32 token; diff --git a/include/net/tcp.h b/include/net/tcp.h index c0fef9e9ba20..899f87346b49 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -193,6 +193,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 +/* MPTCP suboptions used in TCP */ +#define MPTCPOPT_RST 8 /* * TCP option lengths */ @@ -216,6 +218,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 +#define TCPOLEN_MPTCP_RST 4 /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7352c097ae48..c96aea5514c6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -660,9 +660,11 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; + __be32 opt[(TCPOLEN_MPTCP_RST >> 2) #ifdef CONFIG_TCP_MD5SIG - __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; + + (TCPOLEN_MD5SIG_ALIGNED >> 2) #endif + ]; } rep; struct ip_reply_arg arg; #ifdef CONFIG_TCP_MD5SIG @@ -770,6 +772,23 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ip_hdr(skb)->daddr, &rep.th); } #endif + /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ + if (sk && sk_fullsock(sk) && sk_is_mptcp(sk) && rep.opt[0] == 0) { + const struct mptcp_ext *ext = mptcp_get_ext(skb); + u8 flags = 0, reason = 0; + + if (ext) { + flags = ext->reset_transient; + reason = ext->reset_reason; + } + + rep.opt[0] = mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, + flags, reason); + + arg.iov[0].iov_len += TCPOLEN_MPTCP_RST; + rep.th.doff = arg.iov[0].iov_len / 4; + } + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8db59f4e5f13..cfe8d6b4c34c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -868,6 +868,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); + bool mptcp_reset = false; struct dst_entry *dst; __be32 *topt; __u32 mark = 0; @@ -879,6 +880,11 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 tot_len += TCPOLEN_MD5SIG_ALIGNED; #endif + if (rst && sk && sk_fullsock(sk) && sk_is_mptcp(sk) && !key) { + tot_len += TCPOLEN_MPTCP_RST; + mptcp_reset = true; + } + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, GFP_ATOMIC); if (!buff) @@ -909,6 +915,19 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 *topt++ = htonl(tsecr); } + if (mptcp_reset) { + const struct mptcp_ext *ext = mptcp_get_ext(skb); + u8 flags = 0, reason = 0; + + if (ext) { + flags = ext->reset_transient; + reason = ext->reset_reason; + } + + *topt++ = mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, + flags, reason); + } + #ifdef CONFIG_TCP_MD5SIG if (key) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 248e3930c0cb..785a9f4e7da8 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -281,7 +281,17 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->rm_id = *ptr++; pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); break; + case MPTCPOPT_RST: + if (opsize != TCPOLEN_MPTCP_RST) + break; + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) + break; + mp_opt->reset = 1; + flags = *ptr++; + mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; + mp_opt->reset_reason = *ptr; + break; default: break; } @@ -302,6 +312,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; + mp_opt->reset = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -660,6 +671,22 @@ static bool mptcp_established_options_rm_addr(struct sock *sk, return true; } +static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if (remaining < TCPOLEN_MPTCP_RST) + return; + + *size = TCPOLEN_MPTCP_RST; + opts->suboptions |= OPTION_MPTCP_RST; + opts->reset_transient = subflow->reset_transient; + opts->reset_reason = subflow->reset_reason; +} + bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -672,11 +699,10 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, if (unlikely(mptcp_check_fallback(sk))) return false; - /* prevent adding of any MPTCP related options on reset packet - * until we support MP_TCPRST/MP_FASTCLOSE - */ - if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) - return false; + if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { + mptcp_established_options_rst(sk, skb, size, remaining, opts); + return true; + } if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; @@ -1137,6 +1163,12 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, ptr += 5; } + if (OPTION_MPTCP_RST & opts->suboptions) + *ptr++ = mptcp_option(MPTCPOPT_RST, + TCPOLEN_MPTCP_RST, + opts->reset_transient, + opts->reset_reason); + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a6bd06c724d5..71e556540161 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2608,14 +2608,18 @@ bool mptcp_finish_join(struct sock *ssk) pr_debug("msk=%p, subflow=%p", msk, subflow); /* mptcp socket already closing? */ - if (!mptcp_is_fully_established(parent)) + if (!mptcp_is_fully_established(parent)) { + subflow->reset_reason = MPTCP_RST_EMPTCP; return false; + } if (!msk->pm.server_side) return true; - if (!mptcp_pm_allow_new_subflow(msk)) + if (!mptcp_pm_allow_new_subflow(msk)) { + subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; + } /* active connections are already on conn_list, and we can't acquire * msk lock here. @@ -2629,8 +2633,10 @@ bool mptcp_finish_join(struct sock *ssk) sock_hold(ssk); } spin_unlock_bh(&msk->join_list_lock); - if (!ret) + if (!ret) { + subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; + } /* attach to msk socket only after we are sure he will deal with us * at close time diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 66bd4d096753..8a247e50d326 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -23,6 +23,7 @@ #define OPTION_MPTCP_ADD_ADDR BIT(6) #define OPTION_MPTCP_ADD_ADDR6 BIT(7) #define OPTION_MPTCP_RM_ADDR BIT(8) +#define OPTION_MPTCP_RST BIT(9) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -84,6 +85,18 @@ #define MPTCP_ADDR_IPVERSION_4 4 #define MPTCP_ADDR_IPVERSION_6 6 +/* MPTCP TCPRST flags */ +#define MPTCP_RST_TRANSIENT BIT(0) + +/* MPTCP Reset reason codes, rfc8684 */ +#define MPTCP_RST_EUNSPEC 0 +#define MPTCP_RST_EMPTCP 1 +#define MPTCP_RST_ERESOURCE 2 +#define MPTCP_RST_EPROHIBIT 3 +#define MPTCP_RST_EWQ2BIG 4 +#define MPTCP_RST_EBADPERF 5 +#define MPTCP_RST_EMIDDLEBOX 6 + /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 #define MPTCP_NOSPACE 1 @@ -109,6 +122,7 @@ struct mptcp_options_received { u16 data_len; u16 mp_capable : 1, mp_join : 1, + reset : 1, dss : 1, add_addr : 1, rm_addr : 1, @@ -129,6 +143,8 @@ struct mptcp_options_received { __unused:2; u8 addr_id; u8 rm_id; + u8 reset_reason:4; + u8 reset_transient:1; union { struct in_addr addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -365,6 +381,8 @@ struct mptcp_subflow_context { u8 hmac[MPTCPOPT_HMAC_LEN]; u8 local_id; u8 remote_id; + u8 reset_transient:1; + u8 reset_reason:4; struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 1e9a72af67dc..e0da6712a5c3 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -325,8 +325,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; - if (!mp_opt.mp_join) + if (!mp_opt.mp_join) { + subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; + } subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; @@ -335,6 +337,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); + subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } @@ -356,6 +359,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) return; do_reset: + subflow->reset_transient = 0; mptcp_subflow_reset(sk); } @@ -505,6 +509,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_options_received mp_opt; bool fallback, fallback_is_fatal; struct sock *new_msk = NULL; + struct mptcp_ext *mpext; struct sock *child; pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); @@ -565,8 +570,15 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * to reset the context to non MPTCP status. */ if (!ctx || fallback) { - if (fallback_is_fatal) + if (fallback_is_fatal) { + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); + if (mpext) { + memset(mpext, 0, sizeof(*mpext)); + mpext->reset_reason = MPTCP_RST_EMPTCP; + } + goto dispose_child; + } subflow_drop_ctx(child); goto out; @@ -600,8 +612,15 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_sock *owner; owner = subflow_req->msk; - if (!owner) + if (!owner) { + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); + if (mpext) { + memset(mpext, 0, sizeof(*mpext)); + mpext->reset_reason = MPTCP_RST_EPROHIBIT; + } + goto dispose_child; + } /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; @@ -936,6 +955,8 @@ static bool subflow_check_data_avail(struct sock *ssk) smp_wmb(); ssk->sk_error_report(ssk); tcp_set_state(ssk, TCP_CLOSE); + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMPTCP; tcp_send_active_reset(ssk, GFP_ATOMIC); subflow->data_avail = 0; return false;
The MPTCP reset option allows to carry a mptcp-specific error code that provides more information on the nature of a connection reset. The Reset option data received gets stored in the mptcp skb extension structure so it can be consumed by e.g. path management. When a subflow is closed, the desired error code that should be sent to the peer is placed in the subflow context structure. If close happens before a suitable tcp socket has been created (for example, when HMAC fails validation), the reset code can be placed in the mptcp skb extension which then gets added to the TCP reset skb. Signed-off-by: Florian Westphal <fw@strlen.de> --- include/net/mptcp.h | 6 ++++-- include/net/tcp.h | 3 +++ net/ipv4/tcp_ipv4.c | 21 ++++++++++++++++++++- net/ipv6/tcp_ipv6.c | 19 +++++++++++++++++++ net/mptcp/options.c | 42 +++++++++++++++++++++++++++++++++++++----- net/mptcp/protocol.c | 12 +++++++++--- net/mptcp/protocol.h | 18 ++++++++++++++++++ net/mptcp/subflow.c | 27 ++++++++++++++++++++++++--- 8 files changed, 134 insertions(+), 14 deletions(-)