diff mbox series

[bpf-next,01/10] tcp: Use a struct to represent a saved_syn

Message ID 20200626175508.1460345-1-kafai@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series BPF TCP header options | expand

Commit Message

Martin KaFai Lau June 26, 2020, 5:55 p.m. UTC
The total length of the saved syn packet is currently stored in
the first 4 bytes (u32) and the actual packet data is stored after that.

A latter patch will also want to store an offset (bpf_hdr_opt_off) to
a TCP header option which the bpf program will be interested in parsing.
Instead of anonymously storing this offset into the second 4 bytes,
this patch creates a struct for the existing saved_syn.
It can give a readable name to the stored lengths instead of implicitly
using the first few u32(s) to do that.

The new TCP bpf header offset (bpf_hdr_opt_off) added in a latter patch is
an offset from the tcp header instead of from the network header.
It will make the bpf programming side easier.  Thus, this patch stores
the network header length instead of the total length of the syn
header.  The total length can be obtained by the
"network header len + tcp_hdrlen".  The latter patch can
then also gets the offset to the TCP bpf header option by
"network header len + bpf_hdr_opt_off".

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/linux/tcp.h        | 11 ++++++++++-
 include/net/request_sock.h |  7 ++++++-
 net/core/filter.c          |  4 ++--
 net/ipv4/tcp.c             |  9 +++++----
 net/ipv4/tcp_input.c       | 12 ++++++------
 5 files changed, 29 insertions(+), 14 deletions(-)

Comments

Eric Dumazet June 27, 2020, 5:41 p.m. UTC | #1
On Fri, Jun 26, 2020 at 10:55 AM Martin KaFai Lau <kafai@fb.com> wrote:
>
> The total length of the saved syn packet is currently stored in
> the first 4 bytes (u32) and the actual packet data is stored after that.
>
> A latter patch will also want to store an offset (bpf_hdr_opt_off) to
> a TCP header option which the bpf program will be interested in parsing.
> Instead of anonymously storing this offset into the second 4 bytes,
> this patch creates a struct for the existing saved_syn.
> It can give a readable name to the stored lengths instead of implicitly
> using the first few u32(s) to do that.
>
> The new TCP bpf header offset (bpf_hdr_opt_off) added in a latter patch is
> an offset from the tcp header instead of from the network header.
> It will make the bpf programming side easier.  Thus, this patch stores
> the network header length instead of the total length of the syn
> header.  The total length can be obtained by the
> "network header len + tcp_hdrlen".  The latter patch can
> then also gets the offset to the TCP bpf header option by
> "network header len + bpf_hdr_opt_off".
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  include/linux/tcp.h        | 11 ++++++++++-
>  include/net/request_sock.h |  7 ++++++-
>  net/core/filter.c          |  4 ++--
>  net/ipv4/tcp.c             |  9 +++++----
>  net/ipv4/tcp_input.c       | 12 ++++++------
>  5 files changed, 29 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 3bdec31ce8f4..9d50132d95e6 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -404,7 +404,7 @@ struct tcp_sock {
>          * socket. Used to retransmit SYNACKs etc.
>          */
>         struct request_sock __rcu *fastopen_rsk;
> -       u32     *saved_syn;
> +       struct saved_syn *saved_syn;
>  };
>
>  enum tsq_enum {
> @@ -482,6 +482,15 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
>         tp->saved_syn = NULL;
>  }
>
> +static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn)
> +{
> +       const struct tcphdr *th;
> +
> +       th = (void *)saved_syn->data + saved_syn->network_hdrlen;
> +
> +       return saved_syn->network_hdrlen + __tcp_hdrlen(th);
> +}


Ah... We have a patch extending TCP_SAVE_SYN to save all headers, so
keeping the length in a proper field would be better than going back
to TCP header to get __tcp_hdrlen(th)

I am not sure why trying to save 4 bytes in this saved_syn would matter ;)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index c9fcfa4ec43f3f0d75763e2bc6773e15bd38d68f..8ecdc5f87788439c7a08d3b72f9567e6369e7c4e
100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -258,7 +258,7 @@ struct tcp_sock {
                fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
                fastopen_no_cookie:1, /* Allow send/recv SYN+data
without a cookie */
                is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
-               unused:2;
+               save_syn:2;     /* Save headers of SYN packet */
        u8      nonagle     : 4,/* Disable Nagle algorithm?             */
                thin_lto    : 1,/* Use linear timeouts for thin streams */
                recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
@@ -270,7 +270,7 @@ struct tcp_sock {
                syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
                syn_fastopen_ch:1, /* Active TFO re-enabling probe */
                syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
-               save_syn:1,     /* Save headers of SYN packet */
+               unused_save_syn:1,      /* Moved above */
                is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
                syn_smc:1;      /* SYN includes SMC */
        u32     tlp_high_seq;   /* snd_nxt at the time of TLP retransmit. */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fa62baf509c8075cb7da30ee0f65059ac35c1c60..7e108de07fb4e45a994d3d75331489ad82f9deb7
100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3097,7 +3097,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                break;

        case TCP_SAVE_SYN:
-               if (val < 0 || val > 1)
+               /* 0: disable, 1: enable, 2: start from ether_header */
+               if (val < 0 || val > 2)
                        err = -EINVAL;
                else
                        tp->save_syn = val;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7ce5bad2308134954133f612ec129cf56946d9a1..5513f8aaae9f6c0303fac4d2c590ead1a6076502
100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6708,11 +6708,19 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
        if (tcp_sk(sk)->save_syn) {
                u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
                u32 *copy;
+               void *base;
+
+               if (tcp_sk(sk)->save_syn == 2) {  /* Save full header. */
+                       len += skb->network_header - skb->mac_header;
+                       base = skb_mac_header(skb);
+               } else {
+                       base = skb_network_header(skb);
+               }

                copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
                if (copy) {
                        copy[0] = len;
-                       memcpy(&copy[1], skb_network_header(skb), len);
+                       memcpy(&copy[1], base, len);
                        req->saved_syn = copy;
                }
        }
Martin KaFai Lau June 30, 2020, 11:24 p.m. UTC | #2
On Sat, Jun 27, 2020 at 10:41:32AM -0700, Eric Dumazet wrote:
> On Fri, Jun 26, 2020 at 10:55 AM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > The total length of the saved syn packet is currently stored in
> > the first 4 bytes (u32) and the actual packet data is stored after that.
> >
> > A latter patch will also want to store an offset (bpf_hdr_opt_off) to
> > a TCP header option which the bpf program will be interested in parsing.
> > Instead of anonymously storing this offset into the second 4 bytes,
> > this patch creates a struct for the existing saved_syn.
> > It can give a readable name to the stored lengths instead of implicitly
> > using the first few u32(s) to do that.
> >
> > The new TCP bpf header offset (bpf_hdr_opt_off) added in a latter patch is
> > an offset from the tcp header instead of from the network header.
> > It will make the bpf programming side easier.  Thus, this patch stores
> > the network header length instead of the total length of the syn
> > header.  The total length can be obtained by the
> > "network header len + tcp_hdrlen".  The latter patch can
> > then also gets the offset to the TCP bpf header option by
> > "network header len + bpf_hdr_opt_off".
> >
> > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > ---
> >  include/linux/tcp.h        | 11 ++++++++++-
> >  include/net/request_sock.h |  7 ++++++-
> >  net/core/filter.c          |  4 ++--
> >  net/ipv4/tcp.c             |  9 +++++----
> >  net/ipv4/tcp_input.c       | 12 ++++++------
> >  5 files changed, 29 insertions(+), 14 deletions(-)
> >
> > diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> > index 3bdec31ce8f4..9d50132d95e6 100644
> > --- a/include/linux/tcp.h
> > +++ b/include/linux/tcp.h
> > @@ -404,7 +404,7 @@ struct tcp_sock {
> >          * socket. Used to retransmit SYNACKs etc.
> >          */
> >         struct request_sock __rcu *fastopen_rsk;
> > -       u32     *saved_syn;
> > +       struct saved_syn *saved_syn;
> >  };
> >
> >  enum tsq_enum {
> > @@ -482,6 +482,15 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
> >         tp->saved_syn = NULL;
> >  }
> >
> > +static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn)
> > +{
> > +       const struct tcphdr *th;
> > +
> > +       th = (void *)saved_syn->data + saved_syn->network_hdrlen;
> > +
> > +       return saved_syn->network_hdrlen + __tcp_hdrlen(th);
> > +}
> 
> 
> Ah... We have a patch extending TCP_SAVE_SYN to save all headers, so
> keeping the length in a proper field would be better than going back
> to TCP header to get __tcp_hdrlen(th)
> 
> I am not sure why trying to save 4 bytes in this saved_syn would matter ;)
I will use another 4 bytes to store __tcp_hdrlen().

> 
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index c9fcfa4ec43f3f0d75763e2bc6773e15bd38d68f..8ecdc5f87788439c7a08d3b72f9567e6369e7c4e
> 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -258,7 +258,7 @@ struct tcp_sock {
>                 fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
>                 fastopen_no_cookie:1, /* Allow send/recv SYN+data
> without a cookie */
>                 is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
> -               unused:2;
> +               save_syn:2;     /* Save headers of SYN packet */
Interesting idea.  I don't have an immediate use case in the mac header of
the SYN but I think it may be useful going forward.

Although bpf_hdr_opt_off may be no longer needed in v2,
it will still be convenient for the bpf prog to be able to get the TCP header
only instead of reparsing from the mac/ip[46] and also save some stack space
of the bpf prog.  Thus, storing the length of each hdr would still be nice
so that the bpf helper can directly offset to the start of the required
header.

Do you prefer to incorporate this "save_syn:2" idea in this set
so that mac hdrlen can be stored in the "struct saved_syn"?

This "unused:2" bits have already been used by "fastopen_client_fail:2".
May be get 2 bits from repair_queue?


>         u8      nonagle     : 4,/* Disable Nagle algorithm?             */
>                 thin_lto    : 1,/* Use linear timeouts for thin streams */
>                 recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
> @@ -270,7 +270,7 @@ struct tcp_sock {
>                 syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
>                 syn_fastopen_ch:1, /* Active TFO re-enabling probe */
>                 syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
> -               save_syn:1,     /* Save headers of SYN packet */
> +               unused_save_syn:1,      /* Moved above */
>                 is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
>                 syn_smc:1;      /* SYN includes SMC */
>         u32     tlp_high_seq;   /* snd_nxt at the time of TLP retransmit. */
Eric Dumazet June 30, 2020, 11:35 p.m. UTC | #3
On Tue, Jun 30, 2020 at 4:24 PM Martin KaFai Lau <kafai@fb.com> wrote:

> Interesting idea.  I don't have an immediate use case in the mac header of
> the SYN but I think it may be useful going forward.
>
> Although bpf_hdr_opt_off may be no longer needed in v2,
> it will still be convenient for the bpf prog to be able to get the TCP header
> only instead of reparsing from the mac/ip[46] and also save some stack space
> of the bpf prog.  Thus, storing the length of each hdr would still be nice
> so that the bpf helper can directly offset to the start of the required
> header.
>
> Do you prefer to incorporate this "save_syn:2" idea in this set
> so that mac hdrlen can be stored in the "struct saved_syn"?

Sure, please go ahead.

>
> This "unused:2" bits have already been used by "fastopen_client_fail:2".

Oh that is possible, I have sent the patch as it was when we merged it
years ago.

> May be get 2 bits from repair_queue?

Hmm... this repair_queue is used in fast path, I would rather keep it
as u8 for better code generation.
diff mbox series

Patch

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3bdec31ce8f4..9d50132d95e6 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -404,7 +404,7 @@  struct tcp_sock {
 	 * socket. Used to retransmit SYNACKs etc.
 	 */
 	struct request_sock __rcu *fastopen_rsk;
-	u32	*saved_syn;
+	struct saved_syn *saved_syn;
 };
 
 enum tsq_enum {
@@ -482,6 +482,15 @@  static inline void tcp_saved_syn_free(struct tcp_sock *tp)
 	tp->saved_syn = NULL;
 }
 
+static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn)
+{
+	const struct tcphdr *th;
+
+	th = (void *)saved_syn->data + saved_syn->network_hdrlen;
+
+	return saved_syn->network_hdrlen + __tcp_hdrlen(th);
+}
+
 struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk);
 
 static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss)
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index cf8b33213bbc..d77237ec9fb4 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -41,6 +41,11 @@  struct request_sock_ops {
 
 int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req);
 
+struct saved_syn {
+	u32 network_hdrlen;
+	u8 data[];
+};
+
 /* struct request_sock - mini sock to represent a connection request
  */
 struct request_sock {
@@ -60,7 +65,7 @@  struct request_sock {
 	struct timer_list		rsk_timer;
 	const struct request_sock_ops	*rsk_ops;
 	struct sock			*sk;
-	u32				*saved_syn;
+	struct saved_syn		*saved_syn;
 	u32				secid;
 	u32				peer_secid;
 };
diff --git a/net/core/filter.c b/net/core/filter.c
index c796e141ea8e..19dbcc8448d8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4522,9 +4522,9 @@  static int _bpf_getsockopt(struct sock *sk, int level, int optname,
 			tp = tcp_sk(sk);
 
 			if (optlen <= 0 || !tp->saved_syn ||
-			    optlen > tp->saved_syn[0])
+			    optlen > tcp_saved_syn_len(tp->saved_syn))
 				goto err_clear;
-			memcpy(optval, tp->saved_syn + 1, optlen);
+			memcpy(optval, tp->saved_syn->data, optlen);
 			break;
 		default:
 			goto err_clear;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index de36c91d32ea..60093a211f4d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3805,20 +3805,21 @@  static int do_tcp_getsockopt(struct sock *sk, int level,
 
 		lock_sock(sk);
 		if (tp->saved_syn) {
-			if (len < tp->saved_syn[0]) {
-				if (put_user(tp->saved_syn[0], optlen)) {
+			if (len < tcp_saved_syn_len(tp->saved_syn)) {
+				if (put_user(tcp_saved_syn_len(tp->saved_syn),
+					     optlen)) {
 					release_sock(sk);
 					return -EFAULT;
 				}
 				release_sock(sk);
 				return -EINVAL;
 			}
-			len = tp->saved_syn[0];
+			len = tcp_saved_syn_len(tp->saved_syn);
 			if (put_user(len, optlen)) {
 				release_sock(sk);
 				return -EFAULT;
 			}
-			if (copy_to_user(optval, tp->saved_syn + 1, len)) {
+			if (copy_to_user(optval, tp->saved_syn->data, len)) {
 				release_sock(sk);
 				return -EFAULT;
 			}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 12fda8f27b08..eb0e32b2def9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6557,13 +6557,13 @@  static void tcp_reqsk_record_syn(const struct sock *sk,
 {
 	if (tcp_sk(sk)->save_syn) {
 		u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
-		u32 *copy;
+		struct saved_syn *saved_syn;
 
-		copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
-		if (copy) {
-			copy[0] = len;
-			memcpy(&copy[1], skb_network_header(skb), len);
-			req->saved_syn = copy;
+		saved_syn = kmalloc(len + sizeof(*saved_syn), GFP_ATOMIC);
+		if (saved_syn) {
+			saved_syn->network_hdrlen = skb_network_header_len(skb);
+			memcpy(saved_syn->data, skb_network_header(skb), len);
+			req->saved_syn = saved_syn;
 		}
 	}
 }