diff mbox series

[bpf-next,02/10] tcp: bpf: Parse BPF experimental header option

Message ID 20200626175514.1460570-1-kafai@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series BPF TCP header options | expand

Commit Message

Martin KaFai Lau June 26, 2020, 5:55 p.m. UTC
This patch adds logic to parse experimental kind 254 with the 16-bit magic
0xeB9F.  A later patch will allow a bpf prog to write and parse data
under this experimental kind and magic.

A one-byte bpf_hdr_opt_off is added to tcp_skb_cb by using an existing
4-byte hole.  It is only used in rx.  It stores the offset to the
bpf experimental option and will be made available to the BPF prog
in a later patch.  This offset is also stored in the saved_syn.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/net/request_sock.h | 1 +
 include/net/tcp.h          | 3 +++
 net/ipv4/tcp_input.c       | 6 ++++++
 net/ipv4/tcp_ipv4.c        | 1 +
 net/ipv6/tcp_ipv6.c        | 1 +
 5 files changed, 12 insertions(+)

Comments

Eric Dumazet June 27, 2020, 4:44 p.m. UTC | #1
On Fri, Jun 26, 2020 at 10:55 AM Martin KaFai Lau <kafai@fb.com> wrote:
>
> This patch adds logic to parse experimental kind 254 with 16 bit magic
> 0xeB9F.  The latter patch will allow bpf prog to write and parse data
> under this experimental kind and magic.
>
> A one byte bpf_hdr_opt_off is added to tcp_skb_cb by using an existing
> 4 byte hole.  It is only used in rx.  It stores the offset to the
> bpf experimental option and will be made available to BPF prog
> in a latter patch.  This offset is also stored in the saved_syn.
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  include/net/request_sock.h | 1 +
>  include/net/tcp.h          | 3 +++
>  net/ipv4/tcp_input.c       | 6 ++++++
>  net/ipv4/tcp_ipv4.c        | 1 +
>  net/ipv6/tcp_ipv6.c        | 1 +
>  5 files changed, 12 insertions(+)
>
> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
> index d77237ec9fb4..55297286c066 100644
> --- a/include/net/request_sock.h
> +++ b/include/net/request_sock.h
> @@ -43,6 +43,7 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req);
>
>  struct saved_syn {
>         u32 network_hdrlen;
> +       u32 bpf_hdr_opt_off;
>         u8 data[];
>  };
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index eab1c7d0facb..07a9dfe35242 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -191,6 +191,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
>   */
>  #define TCPOPT_FASTOPEN_MAGIC  0xF989
>  #define TCPOPT_SMC_MAGIC       0xE2D4C3D9
> +#define TCPOPT_BPF_MAGIC       0xEB9F
>
>  /*
>   *     TCP option lengths
> @@ -204,6 +205,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
>  #define TCPOLEN_FASTOPEN_BASE  2
>  #define TCPOLEN_EXP_FASTOPEN_BASE  4
>  #define TCPOLEN_EXP_SMC_BASE   6
> +#define TCPOLEN_EXP_BPF_BASE   4
>
>  /* But this is what stacks really send out. */
>  #define TCPOLEN_TSTAMP_ALIGNED         12
> @@ -857,6 +859,7 @@ struct tcp_skb_cb {
>                         has_rxtstamp:1, /* SKB has a RX timestamp       */
>                         unused:5;
>         __u32           ack_seq;        /* Sequence number ACK'd        */
> +       __u8            bpf_hdr_opt_off;/* offset to bpf hdr option. rx only. */
>         union {
>                 struct {
>                         /* There is space for up to 24 bytes */
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index eb0e32b2def9..640408a80b3d 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3924,6 +3924,10 @@ void tcp_parse_options(const struct net *net,
>                                         tcp_parse_fastopen_option(opsize -
>                                                 TCPOLEN_EXP_FASTOPEN_BASE,
>                                                 ptr + 2, th->syn, foc, true);
> +                               else if (opsize >= TCPOLEN_EXP_BPF_BASE &&
> +                                        get_unaligned_be16(ptr) ==
> +                                        TCPOPT_BPF_MAGIC)
> +                                       TCP_SKB_CB(skb)->bpf_hdr_opt_off = (ptr - 2) - (unsigned char *)th;
>                                 else
>                                         smc_parse_options(th, opt_rx, ptr,
>                                                           opsize);
> @@ -6562,6 +6566,8 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
>                 saved_syn = kmalloc(len + sizeof(*saved_syn), GFP_ATOMIC);
>                 if (saved_syn) {
>                         saved_syn->network_hdrlen = skb_network_header_len(skb);
> +                       saved_syn->bpf_hdr_opt_off =
> +                               TCP_SKB_CB(skb)->bpf_hdr_opt_off;
>                         memcpy(saved_syn->data, skb_network_header(skb), len);
>                         req->saved_syn = saved_syn;
>                 }
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index ea0df9fd7618..a3535b7fe002 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1864,6 +1864,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
>         TCP_SKB_CB(skb)->sacked  = 0;
>         TCP_SKB_CB(skb)->has_rxtstamp =
>                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
> +       TCP_SKB_CB(skb)->bpf_hdr_opt_off = 0;
>  }
>
>  /*
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index f67d45ff00b4..8356d0562279 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1545,6 +1545,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
>         TCP_SKB_CB(skb)->sacked = 0;
>         TCP_SKB_CB(skb)->has_rxtstamp =
>                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
> +       TCP_SKB_CB(skb)->bpf_hdr_opt_off = 0;
>  }
>
>  INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
> --
> 2.24.1
>
Eric Dumazet June 27, 2020, 5:17 p.m. UTC | #2
On Fri, Jun 26, 2020 at 10:55 AM Martin KaFai Lau <kafai@fb.com> wrote:
>
> This patch adds logic to parse experimental kind 254 with 16 bit magic
> 0xeB9F.  The latter patch will allow bpf prog to write and parse data
> under this experimental kind and magic.
>
> A one byte bpf_hdr_opt_off is added to tcp_skb_cb by using an existing
> 4 byte hole.  It is only used in rx.  It stores the offset to the
> bpf experimental option and will be made available to BPF prog
> in a latter patch.  This offset is also stored in the saved_syn.
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  include/net/request_sock.h | 1 +
>  include/net/tcp.h          | 3 +++
>  net/ipv4/tcp_input.c       | 6 ++++++
>  net/ipv4/tcp_ipv4.c        | 1 +
>  net/ipv6/tcp_ipv6.c        | 1 +
>  5 files changed, 12 insertions(+)
>
> diff --git a/include/net/request_sock.h b/include/net/request_sock.h
> index d77237ec9fb4..55297286c066 100644
> --- a/include/net/request_sock.h
> +++ b/include/net/request_sock.h
> @@ -43,6 +43,7 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req);
>
>  struct saved_syn {
>         u32 network_hdrlen;
> +       u32 bpf_hdr_opt_off;
>         u8 data[];
>  };
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index eab1c7d0facb..07a9dfe35242 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -191,6 +191,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
>   */
>  #define TCPOPT_FASTOPEN_MAGIC  0xF989
>  #define TCPOPT_SMC_MAGIC       0xE2D4C3D9
> +#define TCPOPT_BPF_MAGIC       0xEB9F
>
>  /*
>   *     TCP option lengths
> @@ -204,6 +205,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
>  #define TCPOLEN_FASTOPEN_BASE  2
>  #define TCPOLEN_EXP_FASTOPEN_BASE  4
>  #define TCPOLEN_EXP_SMC_BASE   6
> +#define TCPOLEN_EXP_BPF_BASE   4
>
>  /* But this is what stacks really send out. */
>  #define TCPOLEN_TSTAMP_ALIGNED         12
> @@ -857,6 +859,7 @@ struct tcp_skb_cb {
>                         has_rxtstamp:1, /* SKB has a RX timestamp       */
>                         unused:5;
>         __u32           ack_seq;        /* Sequence number ACK'd        */
> +       __u8            bpf_hdr_opt_off;/* offset to bpf hdr option. rx only. */
>         union {
>                 struct {
>                         /* There is space for up to 24 bytes */
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index eb0e32b2def9..640408a80b3d 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3924,6 +3924,10 @@ void tcp_parse_options(const struct net *net,
>                                         tcp_parse_fastopen_option(opsize -
>                                                 TCPOLEN_EXP_FASTOPEN_BASE,
>                                                 ptr + 2, th->syn, foc, true);
> +                               else if (opsize >= TCPOLEN_EXP_BPF_BASE &&
> +                                        get_unaligned_be16(ptr) ==
> +                                        TCPOPT_BPF_MAGIC)
> +                                       TCP_SKB_CB(skb)->bpf_hdr_opt_off = (ptr - 2) - (unsigned char *)th;
>                                 else
>                                         smc_parse_options(th, opt_rx, ptr,
>                                                           opsize);
> @@ -6562,6 +6566,8 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
>                 saved_syn = kmalloc(len + sizeof(*saved_syn), GFP_ATOMIC);
>                 if (saved_syn) {
>                         saved_syn->network_hdrlen = skb_network_header_len(skb);
> +                       saved_syn->bpf_hdr_opt_off =
> +                               TCP_SKB_CB(skb)->bpf_hdr_opt_off;
>                         memcpy(saved_syn->data, skb_network_header(skb), len);
>                         req->saved_syn = saved_syn;
>                 }
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index ea0df9fd7618..a3535b7fe002 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1864,6 +1864,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
>         TCP_SKB_CB(skb)->sacked  = 0;
>         TCP_SKB_CB(skb)->has_rxtstamp =
>                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
> +       TCP_SKB_CB(skb)->bpf_hdr_opt_off = 0;
>  }
>
>  /*
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index f67d45ff00b4..8356d0562279 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1545,6 +1545,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
>         TCP_SKB_CB(skb)->sacked = 0;
>         TCP_SKB_CB(skb)->has_rxtstamp =
>                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
> +       TCP_SKB_CB(skb)->bpf_hdr_opt_off = 0;
>  }
>
>  INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
> --
> 2.24.1
>

(Sorry for the prior empty reply, I accidentally clicked the wrong area)

It seems strange that we want to add code in TCP stack only to cover a
limited use case (kind 254 and 0xEB9F magic)

For something like the work Petar Penkov did (to be able to generate
SYNCOOKIES from XDP), we do not go through tcp_parse_options(), and the
BPF program would have to implement its own parsing (without having an
SKB at hand), probably calling a helper function, with no
TCP_SKB_CB(skb)->bpf_hdr_opt_off.

This patch is hard-coding a specific option and will prevent anyone
using private option(s) from using this infrastructure in the future,
while they would still be paying the extra overhead.

TCP_SKB_CB(skb) is tight, I would prefer keeping the space in it for
standard TCP stack features.

If an optional BPF program needs to re-parse the TCP options to find a
specific option, maybe the extra cost is noise (especially if this is
only for SYN & SYNACK packets) ?

Thanks
Martin KaFai Lau June 28, 2020, 11:44 p.m. UTC | #3
On Sat, Jun 27, 2020 at 10:17:26AM -0700, Eric Dumazet wrote:
> On Fri, Jun 26, 2020 at 10:55 AM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > This patch adds logic to parse experimental kind 254 with 16 bit magic
> > 0xeB9F.  The latter patch will allow bpf prog to write and parse data
> > under this experimental kind and magic.
> >
> > A one byte bpf_hdr_opt_off is added to tcp_skb_cb by using an existing
> > 4 byte hole.  It is only used in rx.  It stores the offset to the
> > bpf experimental option and will be made available to BPF prog
> > in a latter patch.  This offset is also stored in the saved_syn.
> >
> > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > ---
> >  include/net/request_sock.h | 1 +
> >  include/net/tcp.h          | 3 +++
> >  net/ipv4/tcp_input.c       | 6 ++++++
> >  net/ipv4/tcp_ipv4.c        | 1 +
> >  net/ipv6/tcp_ipv6.c        | 1 +
> >  5 files changed, 12 insertions(+)
> >
> > diff --git a/include/net/request_sock.h b/include/net/request_sock.h
> > index d77237ec9fb4..55297286c066 100644
> > --- a/include/net/request_sock.h
> > +++ b/include/net/request_sock.h
> > @@ -43,6 +43,7 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req);
> >
> >  struct saved_syn {
> >         u32 network_hdrlen;
> > +       u32 bpf_hdr_opt_off;
> >         u8 data[];
> >  };
> >
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index eab1c7d0facb..07a9dfe35242 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -191,6 +191,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
> >   */
> >  #define TCPOPT_FASTOPEN_MAGIC  0xF989
> >  #define TCPOPT_SMC_MAGIC       0xE2D4C3D9
> > +#define TCPOPT_BPF_MAGIC       0xEB9F
> >
> >  /*
> >   *     TCP option lengths
> > @@ -204,6 +205,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
> >  #define TCPOLEN_FASTOPEN_BASE  2
> >  #define TCPOLEN_EXP_FASTOPEN_BASE  4
> >  #define TCPOLEN_EXP_SMC_BASE   6
> > +#define TCPOLEN_EXP_BPF_BASE   4
> >
> >  /* But this is what stacks really send out. */
> >  #define TCPOLEN_TSTAMP_ALIGNED         12
> > @@ -857,6 +859,7 @@ struct tcp_skb_cb {
> >                         has_rxtstamp:1, /* SKB has a RX timestamp       */
> >                         unused:5;
> >         __u32           ack_seq;        /* Sequence number ACK'd        */
> > +       __u8            bpf_hdr_opt_off;/* offset to bpf hdr option. rx only. */
> >         union {
> >                 struct {
> >                         /* There is space for up to 24 bytes */
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index eb0e32b2def9..640408a80b3d 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -3924,6 +3924,10 @@ void tcp_parse_options(const struct net *net,
> >                                         tcp_parse_fastopen_option(opsize -
> >                                                 TCPOLEN_EXP_FASTOPEN_BASE,
> >                                                 ptr + 2, th->syn, foc, true);
> > +                               else if (opsize >= TCPOLEN_EXP_BPF_BASE &&
> > +                                        get_unaligned_be16(ptr) ==
> > +                                        TCPOPT_BPF_MAGIC)
> > +                                       TCP_SKB_CB(skb)->bpf_hdr_opt_off = (ptr - 2) - (unsigned char *)th;
> >                                 else
> >                                         smc_parse_options(th, opt_rx, ptr,
> >                                                           opsize);
> > @@ -6562,6 +6566,8 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
> >                 saved_syn = kmalloc(len + sizeof(*saved_syn), GFP_ATOMIC);
> >                 if (saved_syn) {
> >                         saved_syn->network_hdrlen = skb_network_header_len(skb);
> > +                       saved_syn->bpf_hdr_opt_off =
> > +                               TCP_SKB_CB(skb)->bpf_hdr_opt_off;
> >                         memcpy(saved_syn->data, skb_network_header(skb), len);
> >                         req->saved_syn = saved_syn;
> >                 }
> > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > index ea0df9fd7618..a3535b7fe002 100644
> > --- a/net/ipv4/tcp_ipv4.c
> > +++ b/net/ipv4/tcp_ipv4.c
> > @@ -1864,6 +1864,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
> >         TCP_SKB_CB(skb)->sacked  = 0;
> >         TCP_SKB_CB(skb)->has_rxtstamp =
> >                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
> > +       TCP_SKB_CB(skb)->bpf_hdr_opt_off = 0;
> >  }
> >
> >  /*
> > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> > index f67d45ff00b4..8356d0562279 100644
> > --- a/net/ipv6/tcp_ipv6.c
> > +++ b/net/ipv6/tcp_ipv6.c
> > @@ -1545,6 +1545,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
> >         TCP_SKB_CB(skb)->sacked = 0;
> >         TCP_SKB_CB(skb)->has_rxtstamp =
> >                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
> > +       TCP_SKB_CB(skb)->bpf_hdr_opt_off = 0;
> >  }
> >
> >  INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
> > --
> > 2.24.1
> >
> 
> (Sorry for the prior empty reply, accidentally click the wrong area)
> 
> It seems strange that we want to add code in TCP stack only to cover a
> limited use case (kind 254 and 0xEB9F magic)
> 
> For something like the work Petar Penkov did (to be able to generate
> SYNCOOKIES from XDP), we do not go through tcp_parse_options() and BPF
> program
> would have to implement its own parsing (without having an SKB at
> hand), probably calling a helper function, with no
> TCP_SKB_CB(skb)->bpf_hdr_opt_off.
> 
> This patch is hard coding a specific option and will prevent anyone
> using private option(s) from using this infrastructure in the future,
> yet paying the extra overhead.
> 
> TCP_SKB_CB(skb) is tight, I would prefer keeping the space in it for
> standard TCP stack features.
> 
> If an optional BPF program needs to re-parse the TCP options to find a
> specific option, maybe the extra cost is noise (especially if this is
> only for SYN & SYNACK packets) ?
Thanks for the feedback.

Re: syn & synack only

The bpf tcp hdr option infrastructure is not only limited to syn
and synack.  It is available to the data/ack/fin pkt also, although
most of the use cases may be limited to syn and synack.
e.g. the latter example tests parsing the 0xeB9F option in FIN.

After a connection is established, the bpf prog may choose to continue
listening for (kind 254 and 0xEB9F magic).  bpf_hdr_opt_off is also used to
decide if the tcphdr has the 0xeB9F option and then call the bpf prog
to handle it.

Re: the spaces in TCP_SKB_CB(skb).  I think I can avoid tapping into it.

bpf_hdr_opt_off is only needed up to calling the bpf prog.  i.e. after
the bpf prog returns, the bpf_hdr_opt_off is no longer needed in TCP_SKB_CB.
Like "struct tcp_fastopen_cookie *foc", "u8 *bpf_hdr_opt_off" can be
added to tcp_parse_options() instead of saving it in TCP_SKB_CB(skb).
Then pass it all the way to the bpf prog and also save this to "saved_syn".
Does it address the concern about the space in TCP_SKB_CB(skb)?
Martin KaFai Lau June 29, 2020, 12:45 a.m. UTC | #4
On Sat, Jun 27, 2020 at 10:17:26AM -0700, Eric Dumazet wrote:
[ ... ]

> It seems strange that we want to add code in TCP stack only to cover a
> limited use case (kind 254 and 0xEB9F magic)
> 
> For something like the work Petar Penkov did (to be able to generate
> SYNCOOKIES from XDP), we do not go through tcp_parse_options() and BPF
> program
> would have to implement its own parsing (without having an SKB at
> hand), probably calling a helper function, with no
> TCP_SKB_CB(skb)->bpf_hdr_opt_off.
> 
> This patch is hard coding a specific option and will prevent anyone
> using private option(s) from using this infrastructure in the future,
> yet paying the extra overhead.
There is a discussion in patch 4 about not limiting this patch set
to option kind 254.  That will affect the usefulness of bpf_hdr_opt_off.

> 
> TCP_SKB_CB(skb) is tight, I would prefer keeping the space in it for
> standard TCP stack features.
> 
> If an optional BPF program needs to re-parse the TCP options to find a
> specific option, maybe the extra cost is noise (especially if this is
> only for SYN & SYNACK packets) ?
> 
> Thanks
diff mbox series

Patch

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index d77237ec9fb4..55297286c066 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -43,6 +43,7 @@  int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req);
 
 struct saved_syn {
 	u32 network_hdrlen;
+	u32 bpf_hdr_opt_off;
 	u8 data[];
 };
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index eab1c7d0facb..07a9dfe35242 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -191,6 +191,7 @@  void tcp_time_wait(struct sock *sk, int state, int timeo);
  */
 #define TCPOPT_FASTOPEN_MAGIC	0xF989
 #define TCPOPT_SMC_MAGIC	0xE2D4C3D9
+#define TCPOPT_BPF_MAGIC	0xEB9F
 
 /*
  *     TCP option lengths
@@ -204,6 +205,7 @@  void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_FASTOPEN_BASE  2
 #define TCPOLEN_EXP_FASTOPEN_BASE  4
 #define TCPOLEN_EXP_SMC_BASE   6
+#define TCPOLEN_EXP_BPF_BASE   4
 
 /* But this is what stacks really send out. */
 #define TCPOLEN_TSTAMP_ALIGNED		12
@@ -857,6 +859,7 @@  struct tcp_skb_cb {
 			has_rxtstamp:1,	/* SKB has a RX timestamp	*/
 			unused:5;
 	__u32		ack_seq;	/* Sequence number ACK'd	*/
+	__u8            bpf_hdr_opt_off;/* offset to bpf hdr option. rx only. */
 	union {
 		struct {
 			/* There is space for up to 24 bytes */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eb0e32b2def9..640408a80b3d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3924,6 +3924,10 @@  void tcp_parse_options(const struct net *net,
 					tcp_parse_fastopen_option(opsize -
 						TCPOLEN_EXP_FASTOPEN_BASE,
 						ptr + 2, th->syn, foc, true);
+				else if (opsize >= TCPOLEN_EXP_BPF_BASE &&
+					 get_unaligned_be16(ptr) ==
+					 TCPOPT_BPF_MAGIC)
+					TCP_SKB_CB(skb)->bpf_hdr_opt_off = (ptr - 2) - (unsigned char *)th;
 				else
 					smc_parse_options(th, opt_rx, ptr,
 							  opsize);
@@ -6562,6 +6566,8 @@  static void tcp_reqsk_record_syn(const struct sock *sk,
 		saved_syn = kmalloc(len + sizeof(*saved_syn), GFP_ATOMIC);
 		if (saved_syn) {
 			saved_syn->network_hdrlen = skb_network_header_len(skb);
+			saved_syn->bpf_hdr_opt_off =
+				TCP_SKB_CB(skb)->bpf_hdr_opt_off;
 			memcpy(saved_syn->data, skb_network_header(skb), len);
 			req->saved_syn = saved_syn;
 		}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ea0df9fd7618..a3535b7fe002 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1864,6 +1864,7 @@  static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
 	TCP_SKB_CB(skb)->sacked	 = 0;
 	TCP_SKB_CB(skb)->has_rxtstamp =
 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
+	TCP_SKB_CB(skb)->bpf_hdr_opt_off = 0;
 }
 
 /*
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f67d45ff00b4..8356d0562279 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1545,6 +1545,7 @@  static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
 	TCP_SKB_CB(skb)->sacked = 0;
 	TCP_SKB_CB(skb)->has_rxtstamp =
 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
+	TCP_SKB_CB(skb)->bpf_hdr_opt_off = 0;
 }
 
 INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)