diff mbox series

[v2,4/8] bpf: add helper to check for a valid SYN cookie

Message ID 20190319102103.7380-5-lmb@cloudflare.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series [v2,1/8] bpf: track references based on is_acquire_func | expand

Commit Message

Lorenz Bauer March 19, 2019, 10:20 a.m. UTC
Using bpf_skc_lookup_tcp it's possible to ascertain whether a packet
belongs to a known connection. However, there is one corner case: no
sockets are created if SYN cookies are active. This means that the final
ACK in the 3WHS is misclassified.

Using the helper, we can look up the listening socket via
bpf_skc_lookup_tcp and then check whether a packet is a valid SYN
cookie ACK.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
---
 include/uapi/linux/bpf.h | 18 +++++++++-
 net/core/filter.c        | 72 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 1 deletion(-)

Comments

Alexei Starovoitov March 19, 2019, 10:17 p.m. UTC | #1
On Tue, Mar 19, 2019 at 10:20:59AM +0000, Lorenz Bauer wrote:
> Using bpf_skc_lookup_tcp it's possible to ascertain whether a packet
> belongs to a known connection. However, there is one corner case: no
> sockets are created if SYN cookies are active. This means that the final
> ACK in the 3WHS is misclassified.
> 
> Using the helper, we can look up the listening socket via
> bpf_skc_lookup_tcp and then check whether a packet is a valid SYN
> cookie ACK.
> 
> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> ---
>  include/uapi/linux/bpf.h | 18 +++++++++-
>  net/core/filter.c        | 72 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 89 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 8e4f8276942a..587d7a3295bf 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2391,6 +2391,21 @@ union bpf_attr {
>   *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
>   *		For sockets with reuseport option, the **struct bpf_sock**
>   *		result is from **reuse->socks**\ [] using the hash of the tuple.
> + *
> + * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
> + * 	Description
> + * 		Check whether iph and th contain a valid SYN cookie ACK for
> + * 		the listening socket in sk.
> + *
> + * 		iph points to the start of the IPv4 or IPv6 header, while
> + * 		iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr).
> + *
> + * 		th points to the start of the TCP header, while th_len contains
> + * 		sizeof(struct tcphdr).
> + *
> + * 	Return
> + * 		0 if iph and th are a valid SYN cookie ACK, or a negative error
> + * 		otherwise.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -2492,7 +2507,8 @@ union bpf_attr {
>  	FN(tcp_sock),			\
>  	FN(skb_ecn_set_ce),		\
>  	FN(get_listener_sock),		\
> -	FN(skc_lookup_tcp),
> +	FN(skc_lookup_tcp),		\
> +	FN(tcp_check_syncookie),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> diff --git a/net/core/filter.c b/net/core/filter.c
> index f5210773cfd8..45a46fbe4ee0 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5546,6 +5546,74 @@ static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
>  	.ret_type       = RET_INTEGER,
>  	.arg1_type      = ARG_PTR_TO_CTX,
>  };
> +
> +BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
> +	   struct tcphdr *, th, u32, th_len)

what was the reason to pass ip and tcp header pointers separately?
Why not to pass 'void* iph' with len long enough to cover both?
the helper can compute tcp header and can take into account variable length ipv4 hdr
while checking that resulting ptr is within 'iph + len' validated by the verifier.
Last patch example is buggy in that sense, since it's doing:
tcph = data + sizeof(struct ethhdr) + sizeof(struct iphdr);

If we drop two args then there will be room for 'flags'.
It can be used for example to indicate whether LINUX_MIB_SYNCOOKIESFAILED
should be incremented or not.
May be it shold be unconditionally?

> +{
> +#ifdef CONFIG_SYN_COOKIES
> +	u32 cookie;
> +	int ret;
> +
> +	if (unlikely(th_len < sizeof(*th)))
> +		return -EINVAL;
> +
> +	/* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
> +	if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
> +		return -EINVAL;
> +
> +	if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
> +		return -EINVAL;
> +
> +	if (!th->ack || th->rst || th->syn)
> +		return -ENOENT;
> +
> +	if (tcp_synq_no_recent_overflow(sk))
> +		return -ENOENT;
> +
> +	cookie = ntohl(th->ack_seq) - 1;
> +
> +	switch (sk->sk_family) {
> +	case AF_INET:
> +		if (unlikely(iph_len < sizeof(struct iphdr)))
> +			return -EINVAL;
> +
> +		ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
> +		break;
> +
> +#if IS_BUILTIN(CONFIG_IPV6)
> +	case AF_INET6:
> +		if (unlikely(iph_len < sizeof(struct ipv6hdr)))
> +			return -EINVAL;
> +
> +		ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);

I guess this is fast path and you don't want indirect call via ipv6_bpf_stub ?
Lorenz Bauer March 21, 2019, 2:09 a.m. UTC | #2
On Tue, 19 Mar 2019 at 22:17, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Mar 19, 2019 at 10:20:59AM +0000, Lorenz Bauer wrote:
> > Using bpf_skc_lookup_tcp it's possible to ascertain whether a packet
> > belongs to a known connection. However, there is one corner case: no
> > sockets are created if SYN cookies are active. This means that the final
> > ACK in the 3WHS is misclassified.
> >
> > Using the helper, we can look up the listening socket via
> > bpf_skc_lookup_tcp and then check whether a packet is a valid SYN
> > cookie ACK.
> >
> > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> > ---
> >  include/uapi/linux/bpf.h | 18 +++++++++-
> >  net/core/filter.c        | 72 ++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 89 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 8e4f8276942a..587d7a3295bf 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -2391,6 +2391,21 @@ union bpf_attr {
> >   *           Pointer to **struct bpf_sock**, or **NULL** in case of failure.
> >   *           For sockets with reuseport option, the **struct bpf_sock**
> >   *           result is from **reuse->socks**\ [] using the hash of the tuple.
> > + *
> > + * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
> > + *   Description
> > + *           Check whether iph and th contain a valid SYN cookie ACK for
> > + *           the listening socket in sk.
> > + *
> > + *           iph points to the start of the IPv4 or IPv6 header, while
> > + *           iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr).
> > + *
> > + *           th points to the start of the TCP header, while th_len contains
> > + *           sizeof(struct tcphdr).
> > + *
> > + *   Return
> > + *           0 if iph and th are a valid SYN cookie ACK, or a negative error
> > + *           otherwise.
> >   */
> >  #define __BPF_FUNC_MAPPER(FN)                \
> >       FN(unspec),                     \
> > @@ -2492,7 +2507,8 @@ union bpf_attr {
> >       FN(tcp_sock),                   \
> >       FN(skb_ecn_set_ce),             \
> >       FN(get_listener_sock),          \
> > -     FN(skc_lookup_tcp),
> > +     FN(skc_lookup_tcp),             \
> > +     FN(tcp_check_syncookie),
> >
> >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> >   * function eBPF program intends to call
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index f5210773cfd8..45a46fbe4ee0 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -5546,6 +5546,74 @@ static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
> >       .ret_type       = RET_INTEGER,
> >       .arg1_type      = ARG_PTR_TO_CTX,
> >  };
> > +
> > +BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
> > +        struct tcphdr *, th, u32, th_len)
>
> what was the reason to pass ip and tcp header pointers separately?
> Why not to pass 'void* iph' with len long enough to cover both?

Our use case is to look at Foo-over-UDP encapsulated packets before they are
decapsulated, and if necessary forward them to another machine.
Essentially a bit
of BPF which either XDP_PASSes when we find that a packet belongs to a known
connection, or XDP_TX if it doesn't.

This means we already have to have both ip and tcp header parsed,
otherwise it can't get a reference to a socket via bpf_skc_lookup_tcp.
Giving both pointers
to the syncookie helper saves us from doing that work twice.

> the helper can compute tcp header and can take into account variable length ipv4 hdr
> while checking that resulting ptr is within 'iph + len' validated by the verifier.

I guess you're thinking of a use case where ip and tcp are not already
parsed in eBPF, like checking
tcp_check_syncookie(skb->sk, ...)?

For IPv4 this seems straightforward, but for IPv6 I need to skip
extension headers.
ipv6_skip_exthdr requires an sk_buff, which isn't available when
calling from an XDP context.
Neither creating an sk_buff ad-hoc, nor duplicating ipv6_skip_exthdr
are particularly appealing.

Even if I have an skb context, I need to be able to specify at which offset in
the skb the TCP/IP header starts, since we're using FoU encapsulation.
In this case I also can't
use skb->sk, but have to do skc_lookup_tcp, which requires demuxing
again (to get a socket tuple).
If the helper only takes a single pointer + length, that work is done twice.

I think there is a tension here between integrating well with
bpf_sock_tuple based APIs and
using skb->sk directly. My best attempt at fixing this is

bpf_tcp_check_syncookie(struct sock * sk, void *tcpiph,
         u32 tcpiph_len, u32 th_off, u64 flags)

Where th_off is < tcpiph_len and gives the offset of the tcp header in tcpiph.
But using that API seems even more cumbersome? It still requires demux in eBPF
and figuring out th_off isn't very natural. It might be possible to
allow th_off to be
0 to invoke ipv6_skip_exthdr machinery if there is a way around sk_buff.

Alternatively:

bpf_tcp_check_syncookie(void *ctx, u32 iph_offset, struct sock * sk, u64 flags)

No idea whether this is feasible from XDP (due to sk_buff), and forces
duplicate work.
It would allow bpf_tcp_check_syncookie(skb, 0, NULL, 0) for checking skb
against it skb->skb, which seems nice, but I'm not sure whether it's
actually useful.

> Last patch example is buggy in that sense, since it's doing:
> tcph = data + sizeof(struct ethhdr) + sizeof(struct iphdr);

Oops!

>
> If we drop two args then there will be room for 'flags'.
> It can be used for example to indicate whether LINUX_MIB_SYNCOOKIESFAILED
> should be incremented or not.
> May be it shold be unconditionally?

No, I think that is only the right thing to do do if a) the packet is
destined for the local machine
and b) is dropped by BPF as a result of doing the check (otherwise
we'd double count).

For us, a) isn't necessarily true. For others, knowing to drop in case
b) to avoid double counting
seems harder than just letting the packet continue up the stack? I
guess I'm not exactly clear what
the use case could be.

I agree that having flags would be good though.

>
> > +{
> > +#ifdef CONFIG_SYN_COOKIES
> > +     u32 cookie;
> > +     int ret;
> > +
> > +     if (unlikely(th_len < sizeof(*th)))
> > +             return -EINVAL;
> > +
> > +     /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
> > +     if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
> > +             return -EINVAL;
> > +
> > +     if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
> > +             return -EINVAL;
> > +
> > +     if (!th->ack || th->rst || th->syn)
> > +             return -ENOENT;
> > +
> > +     if (tcp_synq_no_recent_overflow(sk))
> > +             return -ENOENT;
> > +
> > +     cookie = ntohl(th->ack_seq) - 1;
> > +
> > +     switch (sk->sk_family) {
> > +     case AF_INET:
> > +             if (unlikely(iph_len < sizeof(struct iphdr)))
> > +                     return -EINVAL;
> > +
> > +             ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
> > +             break;
> > +
> > +#if IS_BUILTIN(CONFIG_IPV6)
> > +     case AF_INET6:
> > +             if (unlikely(iph_len < sizeof(struct ipv6hdr)))
> > +                     return -EINVAL;
> > +
> > +             ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
>
> I guess this is fast path and you don't want indirect call via ipv6_bpf_stub ?
>

We run this on all inbound eyeball traffic, so yes. What is the
benefit of going via ipv6_bpf_stub?
Alexei Starovoitov March 21, 2019, 3:03 a.m. UTC | #3
On Thu, Mar 21, 2019 at 02:09:08AM +0000, Lorenz Bauer wrote:
> On Tue, 19 Mar 2019 at 22:17, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Mar 19, 2019 at 10:20:59AM +0000, Lorenz Bauer wrote:
> > > Using bpf_skc_lookup_tcp it's possible to ascertain whether a packet
> > > belongs to a known connection. However, there is one corner case: no
> > > sockets are created if SYN cookies are active. This means that the final
> > > ACK in the 3WHS is misclassified.
> > >
> > > Using the helper, we can look up the listening socket via
> > > bpf_skc_lookup_tcp and then check whether a packet is a valid SYN
> > > cookie ACK.
> > >
> > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> > > ---
> > >  include/uapi/linux/bpf.h | 18 +++++++++-
> > >  net/core/filter.c        | 72 ++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 89 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index 8e4f8276942a..587d7a3295bf 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -2391,6 +2391,21 @@ union bpf_attr {
> > >   *           Pointer to **struct bpf_sock**, or **NULL** in case of failure.
> > >   *           For sockets with reuseport option, the **struct bpf_sock**
> > >   *           result is from **reuse->socks**\ [] using the hash of the tuple.
> > > + *
> > > + * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
> > > + *   Description
> > > + *           Check whether iph and th contain a valid SYN cookie ACK for
> > > + *           the listening socket in sk.
> > > + *
> > > + *           iph points to the start of the IPv4 or IPv6 header, while
> > > + *           iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr).
> > > + *
> > > + *           th points to the start of the TCP header, while th_len contains
> > > + *           sizeof(struct tcphdr).
> > > + *
> > > + *   Return
> > > + *           0 if iph and th are a valid SYN cookie ACK, or a negative error
> > > + *           otherwise.
> > >   */
> > >  #define __BPF_FUNC_MAPPER(FN)                \
> > >       FN(unspec),                     \
> > > @@ -2492,7 +2507,8 @@ union bpf_attr {
> > >       FN(tcp_sock),                   \
> > >       FN(skb_ecn_set_ce),             \
> > >       FN(get_listener_sock),          \
> > > -     FN(skc_lookup_tcp),
> > > +     FN(skc_lookup_tcp),             \
> > > +     FN(tcp_check_syncookie),
> > >
> > >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> > >   * function eBPF program intends to call
> > > diff --git a/net/core/filter.c b/net/core/filter.c
> > > index f5210773cfd8..45a46fbe4ee0 100644
> > > --- a/net/core/filter.c
> > > +++ b/net/core/filter.c
> > > @@ -5546,6 +5546,74 @@ static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
> > >       .ret_type       = RET_INTEGER,
> > >       .arg1_type      = ARG_PTR_TO_CTX,
> > >  };
> > > +
> > > +BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
> > > +        struct tcphdr *, th, u32, th_len)
> >
> > what was the reason to pass ip and tcp header pointers separately?
> > Why not to pass 'void* iph' with len long enough to cover both?
> 
> Our use case is to look at Foo-over-UDP encapsulated packets before they are
> decapsulated, and if necessary forward them to another machine.
> Essentially a bit
> of BPF which either XDP_PASSes when we find that a packet belongs to a known
> connection, or XDP_TX if it doesn't.
> 
> This means we already have to have both ip and tcp header parsed,
> otherwise it can't get a reference to a socket via bpf_skc_lookup_tcp.
> Giving both pointers
> to the syncookie helper saves us from doing that work twice.
> 
> > the helper can compute tcp header and can take into account variable length ipv4 hdr
> > while checking that resulting ptr is within 'iph + len' validated by the verifier.
> 
> I guess you're thinking of a use case where ip and tcp are not already
> parsed in eBPF, like checking
> tcp_check_syncookie(skb->sk, ...)?
> 
> For IPv4 this seems straightforward, but for IPv6 I need to skip
> extension headers.
> ipv6_skip_exthdr requires an sk_buff, which isn't available when
> calling from an XDP context.
> Neither creating an sk_buff ad-hoc, nor duplicating ipv6_skip_exthdr
> are particularly appealing.
> 
> Even if I have an skb context, I need to be able to specify at which offset in
> the skb the TCP/IP header starts, since we're using FoU encapsulation.
> In this case I also can't
> use skb->sk, but have to do skc_lookup_tcp, which requires demuxing
> again (to get a socket tuple).
> If the helper only takes a single pointer + length, that work is done twice.
> 
> I think there is a tension here between integrating well with
> bpf_sock_tuple based APIs and
> using skb->sk directly. My best attempt at fixing this is
> 
> bpf_tcp_check_syncookie(struct sock * sk, void *tcpiph,
>          u32 tcpiph_len, u32 th_off, u64 flags)
> 
> Where th_off is < tcpiph_len and gives the offset of the tcp header in tcpiph.
> But using that API seems even more cumbersome? It still requires demux in eBPF
> and figuring out th_off isn't very natural. It might be possible to
> allow th_off to be
> 0 to invoke ipv6_skip_exthdr machinery if there is a way around sk_buff.
> 
> Alternatively:
> 
> bpf_tcp_check_syncookie(void *ctx, u32 iph_offset, struct sock * sk, u64 flags)
> 
> No idea whether this is feasible from XDP (due to sk_buff), and forces
> duplicate work.
> It would allow bpf_tcp_check_syncookie(skb, 0, NULL, 0) for checking skb
> against it skb->skb, which seems nice, but I'm not sure whether it's
> actually useful.
> 
> > Last patch example is buggy in that sense, since it's doing:
> > tcph = data + sizeof(struct ethhdr) + sizeof(struct iphdr);
> 
> Oops!
> 
> >
> > If we drop two args then there will be room for 'flags'.
> > It can be used for example to indicate whether LINUX_MIB_SYNCOOKIESFAILED
> > should be incremented or not.
> > May be it shold be unconditionally?
> 
> No, I think that is only the right thing to do do if a) the packet is
> destined for the local machine
> and b) is dropped by BPF as a result of doing the check (otherwise
> we'd double count).
> 
> For us, a) isn't necessarily true. For others, knowing to drop in case
> b) to avoid double counting
> seems harder than just letting the packet continue up the stack? I
> guess I'm not exactly clear what
> the use case could be.
> 
> I agree that having flags would be good though.

Ok. All makes sense.

Was thinking that something like this should be possible:
BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph,
        struct tcphdr *, th, u32, th_len, u64, flags)
and the helper would need to check that th - iph > sizeof(iphdr)
but that requires some non-trivial verifier hacks.

So please resubmit as-is.
It doesn't apply cleanly yet.

> > > +#if IS_BUILTIN(CONFIG_IPV6)
> > > +     case AF_INET6:
> > > +             if (unlikely(iph_len < sizeof(struct ipv6hdr)))
> > > +                     return -EINVAL;
> > > +
> > > +             ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
> >
> > I guess this is fast path and you don't want indirect call via ipv6_bpf_stub ?
> >
> 
> We run this on all inbound eyeball traffic, so yes. What is the
> benefit of going via ipv6_bpf_stub?

to keep this helper working when ipv6 is a module,
but I very much prefer to kill ipv6-as-module option,
so IS_BUILTIN(CONFIG_IPV6) is fine to me.
Lorenz Bauer March 21, 2019, 4:53 a.m. UTC | #4
On Thu, 21 Mar 2019 at 03:03, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Mar 21, 2019 at 02:09:08AM +0000, Lorenz Bauer wrote:
> > On Tue, 19 Mar 2019 at 22:17, Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Tue, Mar 19, 2019 at 10:20:59AM +0000, Lorenz Bauer wrote:
> > > > Using bpf_skc_lookup_tcp it's possible to ascertain whether a packet
> > > > belongs to a known connection. However, there is one corner case: no
> > > > sockets are created if SYN cookies are active. This means that the final
> > > > ACK in the 3WHS is misclassified.
> > > >
> > > > Using the helper, we can look up the listening socket via
> > > > bpf_skc_lookup_tcp and then check whether a packet is a valid SYN
> > > > cookie ACK.
> > > >
> > > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> > > > ---
> > > >  include/uapi/linux/bpf.h | 18 +++++++++-
> > > >  net/core/filter.c        | 72 ++++++++++++++++++++++++++++++++++++++++
> > > >  2 files changed, 89 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > > index 8e4f8276942a..587d7a3295bf 100644
> > > > --- a/include/uapi/linux/bpf.h
> > > > +++ b/include/uapi/linux/bpf.h
> > > > @@ -2391,6 +2391,21 @@ union bpf_attr {
> > > >   *           Pointer to **struct bpf_sock**, or **NULL** in case of failure.
> > > >   *           For sockets with reuseport option, the **struct bpf_sock**
> > > >   *           result is from **reuse->socks**\ [] using the hash of the tuple.
> > > > + *
> > > > + * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
> > > > + *   Description
> > > > + *           Check whether iph and th contain a valid SYN cookie ACK for
> > > > + *           the listening socket in sk.
> > > > + *
> > > > + *           iph points to the start of the IPv4 or IPv6 header, while
> > > > + *           iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr).
> > > > + *
> > > > + *           th points to the start of the TCP header, while th_len contains
> > > > + *           sizeof(struct tcphdr).
> > > > + *
> > > > + *   Return
> > > > + *           0 if iph and th are a valid SYN cookie ACK, or a negative error
> > > > + *           otherwise.
> > > >   */
> > > >  #define __BPF_FUNC_MAPPER(FN)                \
> > > >       FN(unspec),                     \
> > > > @@ -2492,7 +2507,8 @@ union bpf_attr {
> > > >       FN(tcp_sock),                   \
> > > >       FN(skb_ecn_set_ce),             \
> > > >       FN(get_listener_sock),          \
> > > > -     FN(skc_lookup_tcp),
> > > > +     FN(skc_lookup_tcp),             \
> > > > +     FN(tcp_check_syncookie),
> > > >
> > > >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> > > >   * function eBPF program intends to call
> > > > diff --git a/net/core/filter.c b/net/core/filter.c
> > > > index f5210773cfd8..45a46fbe4ee0 100644
> > > > --- a/net/core/filter.c
> > > > +++ b/net/core/filter.c
> > > > @@ -5546,6 +5546,74 @@ static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
> > > >       .ret_type       = RET_INTEGER,
> > > >       .arg1_type      = ARG_PTR_TO_CTX,
> > > >  };
> > > > +
> > > > +BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
> > > > +        struct tcphdr *, th, u32, th_len)
> > >
> > > what was the reason to pass ip and tcp header pointers separately?
> > > Why not to pass 'void* iph' with len long enough to cover both?
> >
> > Our use case is to look at Foo-over-UDP encapsulated packets before they are
> > decapsulated, and if necessary forward them to another machine.
> > Essentially a bit
> > of BPF which either XDP_PASSes when we find that a packet belongs to a known
> > connection, or XDP_TX if it doesn't.
> >
> > This means we already have to have both ip and tcp header parsed,
> > otherwise it can't get a reference to a socket via bpf_skc_lookup_tcp.
> > Giving both pointers
> > to the syncookie helper saves us from doing that work twice.
> >
> > > the helper can compute tcp header and can take into account variable length ipv4 hdr
> > > while checking that resulting ptr is within 'iph + len' validated by the verifier.
> >
> > I guess you're thinking of a use case where ip and tcp are not already
> > parsed in eBPF, like checking
> > tcp_check_syncookie(skb->sk, ...)?
> >
> > For IPv4 this seems straightforward, but for IPv6 I need to skip
> > extension headers.
> > ipv6_skip_exthdr requires an sk_buff, which isn't available when
> > calling from an XDP context.
> > Neither creating an sk_buff ad-hoc, nor duplicating ipv6_skip_exthdr
> > are particularly appealing.
> >
> > Even if I have an skb context, I need to be able to specify at which offset in
> > the skb the TCP/IP header starts, since we're using FoU encapsulation.
> > In this case I also can't
> > use skb->sk, but have to do skc_lookup_tcp, which requires demuxing
> > again (to get a socket tuple).
> > If the helper only takes a single pointer + length, that work is done twice.
> >
> > I think there is a tension here between integrating well with
> > bpf_sock_tuple based APIs and
> > using skb->sk directly. My best attempt at fixing this is
> >
> > bpf_tcp_check_syncookie(struct sock * sk, void *tcpiph,
> >          u32 tcpiph_len, u32 th_off, u64 flags)
> >
> > Where th_off is < tcpiph_len and gives the offset of the tcp header in tcpiph.
> > But using that API seems even more cumbersome? It still requires demux in eBPF
> > and figuring out th_off isn't very natural. It might be possible to
> > allow th_off to be
> > 0 to invoke ipv6_skip_exthdr machinery if there is a way around sk_buff.
> >
> > Alternatively:
> >
> > bpf_tcp_check_syncookie(void *ctx, u32 iph_offset, struct sock * sk, u64 flags)
> >
> > No idea whether this is feasible from XDP (due to sk_buff), and forces
> > duplicate work.
> > It would allow bpf_tcp_check_syncookie(skb, 0, NULL, 0) for checking skb
> > against it skb->skb, which seems nice, but I'm not sure whether it's
> > actually useful.
> >
> > > Last patch example is buggy in that sense, since it's doing:
> > > tcph = data + sizeof(struct ethhdr) + sizeof(struct iphdr);
> >
> > Oops!
> >
> > >
> > > If we drop two args then there will be room for 'flags'.
> > > It can be used for example to indicate whether LINUX_MIB_SYNCOOKIESFAILED
> > > should be incremented or not.
> > > May be it shold be unconditionally?
> >
> > No, I think that is only the right thing to do do if a) the packet is
> > destined for the local machine
> > and b) is dropped by BPF as a result of doing the check (otherwise
> > we'd double count).
> >
> > For us, a) isn't necessarily true. For others, knowing to drop in case
> > b) to avoid double counting
> > seems harder than just letting the packet continue up the stack? I
> > guess I'm not exactly clear what
> > the use case could be.
> >
> > I agree that having flags would be good though.
>
> Ok. All makes sense.
>
> Was thinking that something like this should be possible:
> BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph,
>         struct tcphdr *, th, u32, th_len, u64, flags)
> and the helper would need to check that th - iph > sizeof(iphdr)
> but that requires some non-trivial verifier hacks.
>
> So please resubmit as-is.
> It doesn't apply cleanly yet.

Sorry about this. It requires Martin KaFai Lau's fix for the
bpf_tcp_sock reference
tracking, since it depends on sk_release on PTR_TO_SOCK_COMMON. How do I fix
this for bpf-next?

>
> > > > +#if IS_BUILTIN(CONFIG_IPV6)
> > > > +     case AF_INET6:
> > > > +             if (unlikely(iph_len < sizeof(struct ipv6hdr)))
> > > > +                     return -EINVAL;
> > > > +
> > > > +             ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
> > >
> > > I guess this is fast path and you don't want indirect call via ipv6_bpf_stub ?
> > >
> >
> > We run this on all inbound eyeball traffic, so yes. What is the
> > benefit of going via ipv6_bpf_stub?
>
> to keep this helper working when ipv6 is a module,
> but I very much prefer to kill ipv6-as-module option,
> so IS_BUILTIN(CONFIG_IPV6) is fine to me.
>
Alexei Starovoitov March 21, 2019, 5:50 a.m. UTC | #5
On Wed, Mar 20, 2019 at 9:53 PM Lorenz Bauer <lmb@cloudflare.com> wrote:
> >
> > So please resubmit as-is.
> > It doesn't apply cleanly yet.
>
> Sorry about this. It requires Martin KaFai Lau's fix for the
> bpf_tcp_sock reference
> tracking, since it depends on sk_release on PTR_TO_SOCK_COMMON. How do I fix
> this for bpf-next?

Martin's fix is already in bpf-next.
Just rebase, may be?
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8e4f8276942a..587d7a3295bf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2391,6 +2391,21 @@  union bpf_attr {
  *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
  *		For sockets with reuseport option, the **struct bpf_sock**
  *		result is from **reuse->socks**\ [] using the hash of the tuple.
+ *
+ * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
+ * 	Description
+ * 		Check whether iph and th contain a valid SYN cookie ACK for
+ * 		the listening socket in sk.
+ *
+ * 		iph points to the start of the IPv4 or IPv6 header, while
+ * 		iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr).
+ *
+ * 		th points to the start of the TCP header, while th_len contains
+ * 		sizeof(struct tcphdr).
+ *
+ * 	Return
+ * 		0 if iph and th are a valid SYN cookie ACK, or a negative error
+ * 		otherwise.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2492,7 +2507,8 @@  union bpf_attr {
 	FN(tcp_sock),			\
 	FN(skb_ecn_set_ce),		\
 	FN(get_listener_sock),		\
-	FN(skc_lookup_tcp),
+	FN(skc_lookup_tcp),		\
+	FN(tcp_check_syncookie),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index f5210773cfd8..45a46fbe4ee0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5546,6 +5546,74 @@  static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
 	.ret_type       = RET_INTEGER,
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
+
+BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
+	   struct tcphdr *, th, u32, th_len)
+{
+#ifdef CONFIG_SYN_COOKIES
+	u32 cookie;
+	int ret;
+
+	if (unlikely(th_len < sizeof(*th)))
+		return -EINVAL;
+
+	/* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
+	if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
+		return -EINVAL;
+
+	if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
+		return -EINVAL;
+
+	if (!th->ack || th->rst || th->syn)
+		return -ENOENT;
+
+	if (tcp_synq_no_recent_overflow(sk))
+		return -ENOENT;
+
+	cookie = ntohl(th->ack_seq) - 1;
+
+	switch (sk->sk_family) {
+	case AF_INET:
+		if (unlikely(iph_len < sizeof(struct iphdr)))
+			return -EINVAL;
+
+		ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
+		break;
+
+#if IS_BUILTIN(CONFIG_IPV6)
+	case AF_INET6:
+		if (unlikely(iph_len < sizeof(struct ipv6hdr)))
+			return -EINVAL;
+
+		ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
+		break;
+#endif /* CONFIG_IPV6 */
+
+	default:
+		return -EPROTONOSUPPORT;
+	}
+
+	if (ret > 0)
+		return 0;
+
+	return -ENOENT;
+#else
+	return -ENOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
+	.func		= bpf_tcp_check_syncookie,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
+};
+
 #endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
@@ -5808,6 +5876,8 @@  tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_listener_sock_proto;
 	case BPF_FUNC_skc_lookup_tcp:
 		return &bpf_skc_lookup_tcp_proto;
+	case BPF_FUNC_tcp_check_syncookie:
+		return &bpf_tcp_check_syncookie_proto;
 #endif
 	default:
 		return bpf_base_func_proto(func_id);
@@ -5845,6 +5915,8 @@  xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_release_proto;
 	case BPF_FUNC_skc_lookup_tcp:
 		return &bpf_xdp_skc_lookup_tcp_proto;
+	case BPF_FUNC_tcp_check_syncookie:
+		return &bpf_tcp_check_syncookie_proto;
 #endif
 	default:
 		return bpf_base_func_proto(func_id);