diff mbox series

[PATCHv2,bpf-next,07/11] bpf: Add helper to retrieve socket in BPF

Message ID 20180921171043.20823-8-joe@wand.net.nz
State Changes Requested, archived
Delegated to: BPF Maintainers
Headers show
Series Add socket lookup support | expand

Commit Message

Joe Stringer Sept. 21, 2018, 5:10 p.m. UTC
This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and
bpf_sk_lookup_udp() which allows BPF programs to find out if there is a
socket listening on this host, and returns a socket pointer which the
BPF program can then access to determine, for instance, whether to
forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the
socket, so when a BPF program makes use of this function, it must
subsequently pass the returned pointer into the newly added sk_release()
to return the reference.

By way of example, the following pseudocode would filter inbound
connections at XDP if there is no corresponding service listening for
the traffic:

  struct bpf_sock_tuple tuple;
  struct bpf_sock_ops *sk;

  populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
  sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0);
  if (!sk) {
    // Couldn't find a socket listening for this traffic. Drop.
    return TC_ACT_SHOT;
  }
  bpf_sk_release(sk, 0);
  return TC_ACT_OK;

Signed-off-by: Joe Stringer <joe@wand.net.nz>

---

v2: Rework 'struct bpf_sock_tuple' to allow passing a packet pointer
    Limit netns_id field to 32 bits
    Fix compile error with CONFIG_IPV6 enabled
    Allow direct packet access from helper
---
 include/uapi/linux/bpf.h                  |  57 ++++++++-
 kernel/bpf/verifier.c                     |   8 +-
 net/core/filter.c                         | 149 ++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h            |  57 ++++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |  12 ++
 5 files changed, 280 insertions(+), 3 deletions(-)

Comments

Daniel Borkmann Sept. 24, 2018, 12:12 p.m. UTC | #1
Hi Joe,

couple of comments inline:

On 09/21/2018 07:10 PM, Joe Stringer wrote:
> This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and
> bpf_sk_lookup_udp() which allows BPF programs to find out if there is a
> socket listening on this host, and returns a socket pointer which the
> BPF program can then access to determine, for instance, whether to
> forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the
> socket, so when a BPF program makes use of this function, it must
> subsequently pass the returned pointer into the newly added sk_release()
> to return the reference.
> 
> By way of example, the following pseudocode would filter inbound
> connections at XDP if there is no corresponding service listening for
> the traffic:
> 
>   struct bpf_sock_tuple tuple;
>   struct bpf_sock_ops *sk;
> 
>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
>   sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0);
>   if (!sk) {
>     // Couldn't find a socket listening for this traffic. Drop.
>     return TC_ACT_SHOT;
>   }
>   bpf_sk_release(sk, 0);
>   return TC_ACT_OK;
> 
> Signed-off-by: Joe Stringer <joe@wand.net.nz>
> 
> ---
> 
> v2: Rework 'struct bpf_sock_tuple' to allow passing a packet pointer
>     Limit netns_id field to 32 bits
>     Fix compile error with CONFIG_IPV6 enabled
>     Allow direct packet access from helper
> ---
>  include/uapi/linux/bpf.h                  |  57 ++++++++-
>  kernel/bpf/verifier.c                     |   8 +-
>  net/core/filter.c                         | 149 ++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h            |  57 ++++++++-
>  tools/testing/selftests/bpf/bpf_helpers.h |  12 ++
>  5 files changed, 280 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index aa5ccd2385ed..620adbb09a94 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2143,6 +2143,41 @@ union bpf_attr {
>   *		request in the skb.
>   *	Return
>   *		0 on success, or a negative error in case of failure.
> + *
> + * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, flags)

Nit: could you add proper signature like in other cases that are documented?

> + * 	Decription
> + * 		Look for TCP socket matching 'tuple'. The return value must
> + * 		be checked, and if non-NULL, released via bpf_sk_release().
> + * 		@ctx: pointer to ctx
> + * 		@tuple: pointer to struct bpf_sock_tuple
> + * 		@tuple_size: size of the tuple
> + * 		@netns: network namespace id
> + * 		@flags: flags value

Should probably say in all three cases that it's unused right now and reserved
for future.

> + * 	Return
> + * 		pointer to socket ops on success, or
> + * 		NULL in case of failure
> + *
> + * struct bpf_sock_ops *bpf_sk_lookup_udp(ctx, tuple, tuple_size, netns, flags)
> + * 	Decription
> + * 		Look for UDP socket matching 'tuple'. The return value must
> + * 		be checked, and if non-NULL, released via bpf_sk_release().
> + * 		@ctx: pointer to ctx
> + * 		@tuple: pointer to struct bpf_sock_tuple
> + * 		@tuple_size: size of the tuple
> + * 		@netns: network namespace id
> + * 		@flags: flags value
> + * 	Return
> + * 		pointer to socket ops on success, or
> + * 		NULL in case of failure
> + *
> + *  int bpf_sk_release(sock, flags)
> + * 	Description
> + * 		Release the reference held by 'sock'.
> + * 		@sock: Pointer reference to release. Must be found via
> + * 		       bpf_sk_lookup_xxx().
> + * 		@flags: flags value
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -2228,7 +2263,10 @@ union bpf_attr {
>  	FN(get_current_cgroup_id),	\
>  	FN(get_local_storage),		\
>  	FN(sk_select_reuseport),	\
> -	FN(skb_ancestor_cgroup_id),
> +	FN(skb_ancestor_cgroup_id),	\
> +	FN(sk_lookup_tcp),		\
> +	FN(sk_lookup_udp),		\
> +	FN(sk_release),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2398,6 +2436,23 @@ struct bpf_sock {
>  				 */
>  };
>  
> +struct bpf_sock_tuple {
> +	union {
> +		struct {
> +			__be32 saddr;
> +			__be32 daddr;
> +			__be16 sport;
> +			__be16 dport;
> +		} ipv4;
> +		struct {
> +			__be32 saddr[4];
> +			__be32 daddr[4];
> +			__be16 sport;
> +			__be16 dport;
> +		} ipv6;
> +	};
> +};
> +
>  #define XDP_PACKET_HEADROOM 256
>  
>  /* User return codes for XDP prog type.
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 16818508b225..7b7fa94aba58 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
>   * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
>   * passes through a NULL-check conditional. For the branch wherein the state is
>   * changed to CONST_IMM, the verifier releases the reference.
> + *
> + * For each helper function that allocates a reference, such as
> + * bpf_sk_lookup_tcp(), there is a corresponding release function, such as
> + * bpf_sk_release(). When a reference type passes into the release function,
> + * the verifier also releases the reference. If any unchecked or unreleased
> + * reference remains at the end of the program, the verifier rejects it.
>   */
>  
>  /* verifier_state + insn_idx are pushed to stack when branch is encountered */
> @@ -300,7 +306,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type)
>   */
>  static bool is_release_function(enum bpf_func_id func_id)
>  {
> -	return false;
> +	return func_id == BPF_FUNC_sk_release;
>  }
>  
>  /* string representation of 'enum bpf_reg_type' */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 43f81883f31d..f0715e2e7b07 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -58,13 +58,17 @@
>  #include <net/busy_poll.h>
>  #include <net/tcp.h>
>  #include <net/xfrm.h>
> +#include <net/udp.h>
>  #include <linux/bpf_trace.h>
>  #include <net/xdp_sock.h>
>  #include <linux/inetdevice.h>
> +#include <net/inet_hashtables.h>
> +#include <net/inet6_hashtables.h>
>  #include <net/ip_fib.h>
>  #include <net/flow.h>
>  #include <net/arp.h>
>  #include <net/ipv6.h>
> +#include <net/net_namespace.h>
>  #include <linux/seg6_local.h>
>  #include <net/seg6.h>
>  #include <net/seg6_local.h>
> @@ -4812,6 +4816,139 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
>  };
>  #endif /* CONFIG_IPV6_SEG6_BPF */
>  
> +struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
> +		       struct sk_buff *skb, u8 family, u8 proto)
> +{
> +	int dif = skb->dev->ifindex;
> +	bool refcounted = false;
> +	struct sock *sk = NULL;
> +
> +	if (family == AF_INET) {
> +		__be32 src4 = tuple->ipv4.saddr;
> +		__be32 dst4 = tuple->ipv4.daddr;
> +		int sdif = inet_sdif(skb);
> +
> +		if (proto == IPPROTO_TCP)
> +			sk = __inet_lookup(net, &tcp_hashinfo, skb, 0,
> +					   src4, tuple->ipv4.sport,
> +					   dst4, tuple->ipv4.dport,
> +					   dif, sdif, &refcounted);
> +		else
> +			sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
> +					       dst4, tuple->ipv4.dport,
> +					       dif, sdif, &udp_table, skb);
> +#if IS_ENABLED(CONFIG_IPV6)
> +	} else {
> +		struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
> +		struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
> +		int sdif = inet6_sdif(skb);
> +
> +		if (proto == IPPROTO_TCP)
> +			sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0,
> +					    src6, tuple->ipv6.sport,
> +					    dst6, tuple->ipv6.dport,
> +					    dif, sdif, &refcounted);
> +		else
> +			sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport,
> +					       dst6, tuple->ipv6.dport,
> +					       dif, sdif, &udp_table, skb);
> +#endif
> +	}
> +
> +	if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
> +		WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
> +		sk = NULL;
> +	}
> +	return sk;
> +}
> +
> +/* bpf_sk_lookup performs the core lookup for different types of sockets,
> + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
> + * Returns the socket as an 'unsigned long' to simplify the casting in the
> + * callers to satisfy BPF_CALL declarations.
> + */
> +static unsigned long
> +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> +	      u8 proto, u64 netns_id, u64 flags)
> +{
> +	struct net *caller_net = dev_net(skb->dev);

For sk_skb programs, are we *always* guaranteed to have a skb->dev assigned?

This definitely holds true for tc programs, but afaik not for sk_skb ones where
you enable the helpers below, so this would result in a NULL ptr dereference.

> +	struct sock *sk = NULL;
> +	u8 family = AF_UNSPEC;
> +	struct net *net;
> +
> +	family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
> +	if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags))
> +		goto out;
> +
> +	if (netns_id)
> +		net = get_net_ns_by_id(caller_net, netns_id);
> +	else
> +		net = caller_net;
> +	if (unlikely(!net))
> +		goto out;
> +	sk = sk_lookup(net, tuple, skb, family, proto);
> +	put_net(net);
> +
> +	if (sk)
> +		sk = sk_to_full_sk(sk);
> +out:
> +	return (unsigned long) sk;
> +}
> +
> +BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
> +	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
> +{
> +	return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);
> +}
> +
> +static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
> +	.func		= bpf_sk_lookup_tcp,
> +	.gpl_only	= false,
> +	.pkt_access	= true,
> +	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_PTR_TO_MEM,
> +	.arg3_type	= ARG_CONST_SIZE,
> +	.arg4_type	= ARG_ANYTHING,
> +	.arg5_type	= ARG_ANYTHING,
> +};
> +
> +BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
> +	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
> +{
> +	return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);
> +}
> +
> +static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
> +	.func		= bpf_sk_lookup_udp,
> +	.gpl_only	= false,
> +	.pkt_access	= true,
> +	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_PTR_TO_MEM,
> +	.arg3_type	= ARG_CONST_SIZE,
> +	.arg4_type	= ARG_ANYTHING,
> +	.arg5_type	= ARG_ANYTHING,
> +};
> +
> +BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags)
> +{
> +	if (!sock_flag(sk, SOCK_RCU_FREE))
> +		sock_gen_put(sk);
> +
> +	if (unlikely(flags))
> +		return -EINVAL;

I guess it's probably okay to leave here, though I'm wondering whether it's
worth to have a flags part in general in this helper. We need to release the
reference in any case beforehands anyway as we otherwise leak it. Personally,
I'd probably that arg here.

> +	return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_sk_release_proto = {
> +	.func		= bpf_sk_release,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_SOCKET,
> +	.arg2_type	= ARG_ANYTHING,
> +};
> +
>  bool bpf_helper_changes_pkt_data(void *func)
>  {
>  	if (func == bpf_skb_vlan_push ||
> @@ -5018,6 +5155,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  	case BPF_FUNC_skb_ancestor_cgroup_id:
>  		return &bpf_skb_ancestor_cgroup_id_proto;
>  #endif
> +	case BPF_FUNC_sk_lookup_tcp:
> +		return &bpf_sk_lookup_tcp_proto;
> +	case BPF_FUNC_sk_lookup_udp:
> +		return &bpf_sk_lookup_udp_proto;
> +	case BPF_FUNC_sk_release:
> +		return &bpf_sk_release_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> @@ -5118,6 +5261,12 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_sk_redirect_hash_proto;
>  	case BPF_FUNC_get_local_storage:
>  		return &bpf_get_local_storage_proto;
> +	case BPF_FUNC_sk_lookup_tcp:
> +		return &bpf_sk_lookup_tcp_proto;
> +	case BPF_FUNC_sk_lookup_udp:
> +		return &bpf_sk_lookup_udp_proto;
> +	case BPF_FUNC_sk_release:
> +		return &bpf_sk_release_proto;

(See comment above.)

>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index aa5ccd2385ed..620adbb09a94 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -2143,6 +2143,41 @@ union bpf_attr {
>   *		request in the skb.
>   *	Return
>   *		0 on success, or a negative error in case of failure.
> + *
> + * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, flags)
> + * 	Decription
> + * 		Look for TCP socket matching 'tuple'. The return value must
> + * 		be checked, and if non-NULL, released via bpf_sk_release().
> + * 		@ctx: pointer to ctx
> + * 		@tuple: pointer to struct bpf_sock_tuple
> + * 		@tuple_size: size of the tuple
> + * 		@netns: network namespace id
> + * 		@flags: flags value
> + * 	Return
> + * 		pointer to socket ops on success, or
> + * 		NULL in case of failure
> + *
> + * struct bpf_sock_ops *bpf_sk_lookup_udp(ctx, tuple, tuple_size, netns, flags)
> + * 	Decription
> + * 		Look for UDP socket matching 'tuple'. The return value must
> + * 		be checked, and if non-NULL, released via bpf_sk_release().
> + * 		@ctx: pointer to ctx
> + * 		@tuple: pointer to struct bpf_sock_tuple
> + * 		@tuple_size: size of the tuple
> + * 		@netns: network namespace id
> + * 		@flags: flags value
> + * 	Return
> + * 		pointer to socket ops on success, or
> + * 		NULL in case of failure
> + *
> + *  int bpf_sk_release(sock, flags)
> + * 	Description
> + * 		Release the reference held by 'sock'.
> + * 		@sock: Pointer reference to release. Must be found via
> + * 		       bpf_sk_lookup_xxx().
> + * 		@flags: flags value
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -2228,7 +2263,10 @@ union bpf_attr {
>  	FN(get_current_cgroup_id),	\
>  	FN(get_local_storage),		\
>  	FN(sk_select_reuseport),	\
> -	FN(skb_ancestor_cgroup_id),
> +	FN(skb_ancestor_cgroup_id),	\
> +	FN(sk_lookup_tcp),		\
> +	FN(sk_lookup_udp),		\
> +	FN(sk_release),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2398,6 +2436,23 @@ struct bpf_sock {
>  				 */
>  };
>  
> +struct bpf_sock_tuple {
> +	union {
> +		struct {
> +			__be32 saddr;
> +			__be32 daddr;
> +			__be16 sport;
> +			__be16 dport;
> +		} ipv4;
> +		struct {
> +			__be32 saddr[4];
> +			__be32 daddr[4];
> +			__be16 sport;
> +			__be16 dport;
> +		} ipv6;
> +	};
> +};
> +
>  #define XDP_PACKET_HEADROOM 256
>  
>  /* User return codes for XDP prog type.
> diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
> index e4be7730222d..88ce00c3aa0f 100644
> --- a/tools/testing/selftests/bpf/bpf_helpers.h
> +++ b/tools/testing/selftests/bpf/bpf_helpers.h
> @@ -143,6 +143,18 @@ static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) =
>  	(void *) BPF_FUNC_skb_cgroup_id;
>  static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) =
>  	(void *) BPF_FUNC_skb_ancestor_cgroup_id;
> +static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx,
> +					     struct bpf_sock_tuple *tuple,
> +					     int size, unsigned int netns_id,
> +					     unsigned long long flags) =
> +	(void *) BPF_FUNC_sk_lookup_tcp;
> +static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx,
> +					     struct bpf_sock_tuple *tuple,
> +					     int size, unsigned int netns_id,
> +					     unsigned long long flags) =
> +	(void *) BPF_FUNC_sk_lookup_udp;
> +static int (*bpf_sk_release)(struct bpf_sock *sk, unsigned long long flags) =
> +	(void *) BPF_FUNC_sk_release;
>  
>  /* llvm builtin functions that eBPF C program may use to
>   * emit BPF_LD_ABS and BPF_LD_IND instructions
>
Daniel Borkmann Sept. 24, 2018, 12:38 p.m. UTC | #2
On 09/24/2018 02:12 PM, Daniel Borkmann wrote:
> Hi Joe,
> 
> couple of comments inline:
> 
> On 09/21/2018 07:10 PM, Joe Stringer wrote:
>> This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and
>> bpf_sk_lookup_udp() which allows BPF programs to find out if there is a
>> socket listening on this host, and returns a socket pointer which the
>> BPF program can then access to determine, for instance, whether to
>> forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the
>> socket, so when a BPF program makes use of this function, it must
>> subsequently pass the returned pointer into the newly added sk_release()
>> to return the reference.
>>
>> By way of example, the following pseudocode would filter inbound
>> connections at XDP if there is no corresponding service listening for
>> the traffic:
>>
>>   struct bpf_sock_tuple tuple;
>>   struct bpf_sock_ops *sk;
>>
>>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
>>   sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0);
>>   if (!sk) {
>>     // Couldn't find a socket listening for this traffic. Drop.
>>     return TC_ACT_SHOT;
>>   }
>>   bpf_sk_release(sk, 0);
>>   return TC_ACT_OK;
>>
>> Signed-off-by: Joe Stringer <joe@wand.net.nz>
>>
>> ---
>>
>> v2: Rework 'struct bpf_sock_tuple' to allow passing a packet pointer
>>     Limit netns_id field to 32 bits
>>     Fix compile error with CONFIG_IPV6 enabled
>>     Allow direct packet access from helper
>> ---
>>  include/uapi/linux/bpf.h                  |  57 ++++++++-
>>  kernel/bpf/verifier.c                     |   8 +-
>>  net/core/filter.c                         | 149 ++++++++++++++++++++++
>>  tools/include/uapi/linux/bpf.h            |  57 ++++++++-
>>  tools/testing/selftests/bpf/bpf_helpers.h |  12 ++
>>  5 files changed, 280 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index aa5ccd2385ed..620adbb09a94 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -2143,6 +2143,41 @@ union bpf_attr {
>>   *		request in the skb.
>>   *	Return
>>   *		0 on success, or a negative error in case of failure.
>> + *
>> + * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, flags)
> 
> Nit: could you add proper signature like in other cases that are documented?
> 
>> + * 	Decription
>> + * 		Look for TCP socket matching 'tuple'. The return value must
>> + * 		be checked, and if non-NULL, released via bpf_sk_release().
>> + * 		@ctx: pointer to ctx
>> + * 		@tuple: pointer to struct bpf_sock_tuple
>> + * 		@tuple_size: size of the tuple
>> + * 		@netns: network namespace id
>> + * 		@flags: flags value
> 
> Should probably say in all three cases that it's unused right now and reserved
> for future.

I think the two lookup helpers here also need to have a better documentation in
terms of semantics wrt netns, so it definitely needs to be documented when it's
derived from the skb's dev and when netns is used.

>> + * 	Return
>> + * 		pointer to socket ops on success, or
>> + * 		NULL in case of failure
>> + *
>> + * struct bpf_sock_ops *bpf_sk_lookup_udp(ctx, tuple, tuple_size, netns, flags)
>> + * 	Decription
>> + * 		Look for UDP socket matching 'tuple'. The return value must
>> + * 		be checked, and if non-NULL, released via bpf_sk_release().
>> + * 		@ctx: pointer to ctx
>> + * 		@tuple: pointer to struct bpf_sock_tuple
>> + * 		@tuple_size: size of the tuple
>> + * 		@netns: network namespace id
>> + * 		@flags: flags value
>> + * 	Return
>> + * 		pointer to socket ops on success, or
>> + * 		NULL in case of failure
>> + *
>> + *  int bpf_sk_release(sock, flags)
>> + * 	Description
>> + * 		Release the reference held by 'sock'.
>> + * 		@sock: Pointer reference to release. Must be found via
>> + * 		       bpf_sk_lookup_xxx().
>> + * 		@flags: flags value
>> + * 	Return
>> + * 		0 on success, or a negative error in case of failure.
>>   */
Daniel Borkmann Sept. 24, 2018, 12:51 p.m. UTC | #3
On 09/24/2018 02:12 PM, Daniel Borkmann wrote:
> On 09/21/2018 07:10 PM, Joe Stringer wrote:
[...]
>> +/* bpf_sk_lookup performs the core lookup for different types of sockets,
>> + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
>> + * Returns the socket as an 'unsigned long' to simplify the casting in the
>> + * callers to satisfy BPF_CALL declarations.
>> + */
>> +static unsigned long
>> +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
>> +	      u8 proto, u64 netns_id, u64 flags)
>> +{
>> +	struct net *caller_net = dev_net(skb->dev);
> 
> For sk_skb programs, are we *always* guaranteed to have a skb->dev assigned?
> 
> This definitely holds true for tc programs, but afaik not for sk_skb ones where
> you enable the helpers below, so this would result in a NULL ptr dereference.
> 
>> +	struct sock *sk = NULL;
>> +	u8 family = AF_UNSPEC;
>> +	struct net *net;
>> +
>> +	family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
>> +	if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags))
>> +		goto out;
>> +
>> +	if (netns_id)
>> +		net = get_net_ns_by_id(caller_net, netns_id);
>> +	else
>> +		net = caller_net;
>> +	if (unlikely(!net))
>> +		goto out;
>> +	sk = sk_lookup(net, tuple, skb, family, proto);
>> +	put_net(net);

Hmm, isn't this also resulting in a use-after-free on net ?

Presume we use dev_net(skb->dev) as net. It does read_pnet(&dev->nd_net), which
either gets you pnet->net when netns is configured or &init_net otherwise.

I don't see where an additional reference was taken in this path, hence you'll
get an imbalance which triggers freeing the underlying netns ..

>> +	if (sk)
>> +		sk = sk_to_full_sk(sk);
>> +out:
>> +	return (unsigned long) sk;
>> +}
>> +
>> +BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
>> +	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
>> +{
>> +	return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);
>> +}
Joe Stringer Sept. 24, 2018, 6:04 p.m. UTC | #4
On Mon, 24 Sep 2018 at 05:12, Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> Hi Joe,
>
> couple of comments inline:

Thanks for the review, I'll fix up those bits.

> On 09/21/2018 07:10 PM, Joe Stringer wrote:
> > This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and
> > bpf_sk_lookup_udp() which allows BPF programs to find out if there is a
> > socket listening on this host, and returns a socket pointer which the
> > BPF program can then access to determine, for instance, whether to
> > forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the
> > socket, so when a BPF program makes use of this function, it must
> > subsequently pass the returned pointer into the newly added sk_release()
> > to return the reference.
> >
> > By way of example, the following pseudocode would filter inbound
> > connections at XDP if there is no corresponding service listening for
> > the traffic:
> >
> >   struct bpf_sock_tuple tuple;
> >   struct bpf_sock_ops *sk;
> >
> >   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
> >   sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0);
> >   if (!sk) {
> >     // Couldn't find a socket listening for this traffic. Drop.
> >     return TC_ACT_SHOT;
> >   }
> >   bpf_sk_release(sk, 0);
> >   return TC_ACT_OK;
> >
> > Signed-off-by: Joe Stringer <joe@wand.net.nz>
> >
> > ---
> >
> > v2: Rework 'struct bpf_sock_tuple' to allow passing a packet pointer
> >     Limit netns_id field to 32 bits
> >     Fix compile error with CONFIG_IPV6 enabled
> >     Allow direct packet access from helper
> > ---
> >  include/uapi/linux/bpf.h                  |  57 ++++++++-
> >  kernel/bpf/verifier.c                     |   8 +-
> >  net/core/filter.c                         | 149 ++++++++++++++++++++++
> >  tools/include/uapi/linux/bpf.h            |  57 ++++++++-
> >  tools/testing/selftests/bpf/bpf_helpers.h |  12 ++
> >  5 files changed, 280 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index aa5ccd2385ed..620adbb09a94 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > ...
> > +/* bpf_sk_lookup performs the core lookup for different types of sockets,
> > + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
> > + * Returns the socket as an 'unsigned long' to simplify the casting in the
> > + * callers to satisfy BPF_CALL declarations.
> > + */
> > +static unsigned long
> > +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> > +           u8 proto, u64 netns_id, u64 flags)
> > +{
> > +     struct net *caller_net = dev_net(skb->dev);
>
> For sk_skb programs, are we *always* guaranteed to have a skb->dev assigned?
>
> This definitely holds true for tc programs, but afaik not for sk_skb ones where
> you enable the helpers below, so this would result in a NULL ptr dereference.

I'll update this to take the netns from the skb->sk in those hooks.

> > +BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags)
> > +{
> > +     if (!sock_flag(sk, SOCK_RCU_FREE))
> > +             sock_gen_put(sk);
> > +
> > +     if (unlikely(flags))
> > +             return -EINVAL;
>
> I guess it's probably okay to leave here, though I'm wondering whether it's
> worth to have a flags part in general in this helper. We need to release the
> reference in any case beforehands anyway as we otherwise leak it. Personally,
> I'd probably that arg here.

I agree, I can't think of a good reason to have optional behaviour on
socket release. I'll drop it.

> > +     return 0;
> > +}
> > +
> > +static const struct bpf_func_proto bpf_sk_release_proto = {
> > +     .func           = bpf_sk_release,
> > +     .gpl_only       = false,
> > +     .ret_type       = RET_INTEGER,
> > +     .arg1_type      = ARG_PTR_TO_SOCKET,
> > +     .arg2_type      = ARG_ANYTHING,
> > +};
> > +
> >  bool bpf_helper_changes_pkt_data(void *func)
> >  {
> >       if (func == bpf_skb_vlan_push ||
> > @@ -5018,6 +5155,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> >       case BPF_FUNC_skb_ancestor_cgroup_id:
> >               return &bpf_skb_ancestor_cgroup_id_proto;
> >  #endif
> > +     case BPF_FUNC_sk_lookup_tcp:
> > +             return &bpf_sk_lookup_tcp_proto;
> > +     case BPF_FUNC_sk_lookup_udp:
> > +             return &bpf_sk_lookup_udp_proto;
> > +     case BPF_FUNC_sk_release:
> > +             return &bpf_sk_release_proto;
> >       default:
> >               return bpf_base_func_proto(func_id);
> >       }
> > @@ -5118,6 +5261,12 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> >               return &bpf_sk_redirect_hash_proto;
> >       case BPF_FUNC_get_local_storage:
> >               return &bpf_get_local_storage_proto;
> > +     case BPF_FUNC_sk_lookup_tcp:
> > +             return &bpf_sk_lookup_tcp_proto;
> > +     case BPF_FUNC_sk_lookup_udp:
> > +             return &bpf_sk_lookup_udp_proto;
> > +     case BPF_FUNC_sk_release:
> > +             return &bpf_sk_release_proto;
>
> (See comment above.)
Joe Stringer Sept. 24, 2018, 6:05 p.m. UTC | #5
On Mon, 24 Sep 2018 at 05:38, Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 09/24/2018 02:12 PM, Daniel Borkmann wrote:
> > Hi Joe,
> >
> > couple of comments inline:
> >
> > On 09/21/2018 07:10 PM, Joe Stringer wrote:
> >> This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and
> >> bpf_sk_lookup_udp() which allows BPF programs to find out if there is a
> >> socket listening on this host, and returns a socket pointer which the
> >> BPF program can then access to determine, for instance, whether to
> >> forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the
> >> socket, so when a BPF program makes use of this function, it must
> >> subsequently pass the returned pointer into the newly added sk_release()
> >> to return the reference.
> >>
> >> By way of example, the following pseudocode would filter inbound
> >> connections at XDP if there is no corresponding service listening for
> >> the traffic:
> >>
> >>   struct bpf_sock_tuple tuple;
> >>   struct bpf_sock_ops *sk;
> >>
> >>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
> >>   sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0);
> >>   if (!sk) {
> >>     // Couldn't find a socket listening for this traffic. Drop.
> >>     return TC_ACT_SHOT;
> >>   }
> >>   bpf_sk_release(sk, 0);
> >>   return TC_ACT_OK;
> >>
> >> Signed-off-by: Joe Stringer <joe@wand.net.nz>
> >>
> >> ---
> >>
> >> v2: Rework 'struct bpf_sock_tuple' to allow passing a packet pointer
> >>     Limit netns_id field to 32 bits
> >>     Fix compile error with CONFIG_IPV6 enabled
> >>     Allow direct packet access from helper
> >> ---
> >>  include/uapi/linux/bpf.h                  |  57 ++++++++-
> >>  kernel/bpf/verifier.c                     |   8 +-
> >>  net/core/filter.c                         | 149 ++++++++++++++++++++++
> >>  tools/include/uapi/linux/bpf.h            |  57 ++++++++-
> >>  tools/testing/selftests/bpf/bpf_helpers.h |  12 ++
> >>  5 files changed, 280 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index aa5ccd2385ed..620adbb09a94 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -2143,6 +2143,41 @@ union bpf_attr {
> >>   *          request in the skb.
> >>   *  Return
> >>   *          0 on success, or a negative error in case of failure.
> >> + *
> >> + * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, flags)
> >
> > Nit: could you add proper signature like in other cases that are documented?
> >
> >> + *  Decription
> >> + *          Look for TCP socket matching 'tuple'. The return value must
> >> + *          be checked, and if non-NULL, released via bpf_sk_release().
> >> + *          @ctx: pointer to ctx
> >> + *          @tuple: pointer to struct bpf_sock_tuple
> >> + *          @tuple_size: size of the tuple
> >> + *          @netns: network namespace id
> >> + *          @flags: flags value
> >
> > Should probably say in all three cases that it's unused right now and reserved
> > for future.
>
> I think the two lookup helpers here also need to have a better documentation in
> terms of semantics wrt netns, so it definitely needs to be documented when it's
> derived from the skb's dev and when netns is used.

Good point, maybe some text like this:

If 'netns' is 0, then the netns associated with the ctx will be used.
For the tc hooks, this is the netns of the device in the skb. For
socket hooks, this is the netns of the socket. If netns is nonzero,
then this is the netns id relative to the netns associated with the
ctx.
> >> + *  Return
> >> + *          pointer to socket ops on success, or
> >> + *          NULL in case of failure
> >> + *
> >> + * struct bpf_sock_ops *bpf_sk_lookup_udp(ctx, tuple, tuple_size, netns, flags)
> >> + *  Decription
> >> + *          Look for UDP socket matching 'tuple'. The return value must
> >> + *          be checked, and if non-NULL, released via bpf_sk_release().
> >> + *          @ctx: pointer to ctx
> >> + *          @tuple: pointer to struct bpf_sock_tuple
> >> + *          @tuple_size: size of the tuple
> >> + *          @netns: network namespace id
> >> + *          @flags: flags value
> >> + *  Return
> >> + *          pointer to socket ops on success, or
> >> + *          NULL in case of failure
> >> + *
> >> + *  int bpf_sk_release(sock, flags)
> >> + *  Description
> >> + *          Release the reference held by 'sock'.
> >> + *          @sock: Pointer reference to release. Must be found via
> >> + *                 bpf_sk_lookup_xxx().
> >> + *          @flags: flags value
> >> + *  Return
> >> + *          0 on success, or a negative error in case of failure.
> >>   */
Joe Stringer Sept. 24, 2018, 6:09 p.m. UTC | #6
On Mon, 24 Sep 2018 at 05:51, Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 09/24/2018 02:12 PM, Daniel Borkmann wrote:
> > On 09/21/2018 07:10 PM, Joe Stringer wrote:
> [...]
> >> +/* bpf_sk_lookup performs the core lookup for different types of sockets,
> >> + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
> >> + * Returns the socket as an 'unsigned long' to simplify the casting in the
> >> + * callers to satisfy BPF_CALL declarations.
> >> + */
> >> +static unsigned long
> >> +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> >> +          u8 proto, u64 netns_id, u64 flags)
> >> +{
> >> +    struct net *caller_net = dev_net(skb->dev);
> >
> > For sk_skb programs, are we *always* guaranteed to have a skb->dev assigned?
> >
> > This definitely holds true for tc programs, but afaik not for sk_skb ones where
> > you enable the helpers below, so this would result in a NULL ptr dereference.
> >
> >> +    struct sock *sk = NULL;
> >> +    u8 family = AF_UNSPEC;
> >> +    struct net *net;
> >> +
> >> +    family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
> >> +    if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags))
> >> +            goto out;
> >> +
> >> +    if (netns_id)
> >> +            net = get_net_ns_by_id(caller_net, netns_id);
> >> +    else
> >> +            net = caller_net;
> >> +    if (unlikely(!net))
> >> +            goto out;
> >> +    sk = sk_lookup(net, tuple, skb, family, proto);
> >> +    put_net(net);
>
> Hmm, isn't this also resulting in a use-after-free on net ?
>
> Presume we use dev_net(skb->dev) as net. It does read_pnet(&dev->nd_net), which
> either gets you pnet->net when netns is configured or &init_net otherwise.
>
> I don't see where an additional reference was taken in this path, hence you'll
> get an imbalance which triggers freeing the underlying netns ..

Good catch, will fix.
Daniel Borkmann Sept. 24, 2018, 7:18 p.m. UTC | #7
On 09/24/2018 08:05 PM, Joe Stringer wrote:
> On Mon, 24 Sep 2018 at 05:38, Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 09/24/2018 02:12 PM, Daniel Borkmann wrote:
>>> Hi Joe,
>>>
>>> couple of comments inline:
>>>
>>> On 09/21/2018 07:10 PM, Joe Stringer wrote:
>>>> This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and
>>>> bpf_sk_lookup_udp() which allows BPF programs to find out if there is a
>>>> socket listening on this host, and returns a socket pointer which the
>>>> BPF program can then access to determine, for instance, whether to
>>>> forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the
>>>> socket, so when a BPF program makes use of this function, it must
>>>> subsequently pass the returned pointer into the newly added sk_release()
>>>> to return the reference.
>>>>
>>>> By way of example, the following pseudocode would filter inbound
>>>> connections at XDP if there is no corresponding service listening for
>>>> the traffic:
>>>>
>>>>   struct bpf_sock_tuple tuple;
>>>>   struct bpf_sock_ops *sk;
>>>>
>>>>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
>>>>   sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0);
>>>>   if (!sk) {
>>>>     // Couldn't find a socket listening for this traffic. Drop.
>>>>     return TC_ACT_SHOT;
>>>>   }
>>>>   bpf_sk_release(sk, 0);
>>>>   return TC_ACT_OK;
>>>>
>>>> Signed-off-by: Joe Stringer <joe@wand.net.nz>
>>>>
>>>> ---
>>>>
>>>> v2: Rework 'struct bpf_sock_tuple' to allow passing a packet pointer
>>>>     Limit netns_id field to 32 bits
>>>>     Fix compile error with CONFIG_IPV6 enabled
>>>>     Allow direct packet access from helper
>>>> ---
>>>>  include/uapi/linux/bpf.h                  |  57 ++++++++-
>>>>  kernel/bpf/verifier.c                     |   8 +-
>>>>  net/core/filter.c                         | 149 ++++++++++++++++++++++
>>>>  tools/include/uapi/linux/bpf.h            |  57 ++++++++-
>>>>  tools/testing/selftests/bpf/bpf_helpers.h |  12 ++
>>>>  5 files changed, 280 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index aa5ccd2385ed..620adbb09a94 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -2143,6 +2143,41 @@ union bpf_attr {
>>>>   *          request in the skb.
>>>>   *  Return
>>>>   *          0 on success, or a negative error in case of failure.
>>>> + *
>>>> + * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, flags)
>>>
>>> Nit: could you add proper signature like in other cases that are documented?
>>>
>>>> + *  Decription
>>>> + *          Look for TCP socket matching 'tuple'. The return value must
>>>> + *          be checked, and if non-NULL, released via bpf_sk_release().
>>>> + *          @ctx: pointer to ctx
>>>> + *          @tuple: pointer to struct bpf_sock_tuple
>>>> + *          @tuple_size: size of the tuple
>>>> + *          @netns: network namespace id
>>>> + *          @flags: flags value
>>>
>>> Should probably say in all three cases that it's unused right now and reserved
>>> for future.
>>
>> I think the two lookup helpers here also need to have a better documentation in
>> terms of semantics wrt netns, so it definitely needs to be documented when it's
>> derived from the skb's dev and when netns is used.
> 
> Good point, maybe some text like this:
> 
> If 'netns' is 0, then the netns associated with the ctx will be used.
> For the tc hooks, this is the netns of the device in the skb. For
> socket hooks, this is the netns of the socket. If netns is nonzero,
> then this is the netns id relative to the netns associated with the
> ctx.

Looks good, I'd probably add to the first one "[...] then the socket lookup
table in the netns associated with the ctx will be used."
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa5ccd2385ed..620adbb09a94 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2143,6 +2143,41 @@  union bpf_attr {
  *		request in the skb.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, flags)
+ * 	Decription
+ * 		Look for TCP socket matching 'tuple'. The return value must
+ * 		be checked, and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@netns: network namespace id
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ * struct bpf_sock_ops *bpf_sk_lookup_udp(ctx, tuple, tuple_size, netns, flags)
+ * 	Decription
+ * 		Look for UDP socket matching 'tuple'. The return value must
+ * 		be checked, and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@netns: network namespace id
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ *  int bpf_sk_release(sock, flags)
+ * 	Description
+ * 		Release the reference held by 'sock'.
+ * 		@sock: Pointer reference to release. Must be found via
+ * 		       bpf_sk_lookup_xxx().
+ * 		@flags: flags value
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2228,7 +2263,10 @@  union bpf_attr {
 	FN(get_current_cgroup_id),	\
 	FN(get_local_storage),		\
 	FN(sk_select_reuseport),	\
-	FN(skb_ancestor_cgroup_id),
+	FN(skb_ancestor_cgroup_id),	\
+	FN(sk_lookup_tcp),		\
+	FN(sk_lookup_udp),		\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2398,6 +2436,23 @@  struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		struct {
+			__be32 saddr;
+			__be32 daddr;
+			__be16 sport;
+			__be16 dport;
+		} ipv4;
+		struct {
+			__be32 saddr[4];
+			__be32 daddr[4];
+			__be16 sport;
+			__be16 dport;
+		} ipv6;
+	};
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 16818508b225..7b7fa94aba58 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,6 +153,12 @@  static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
  * passes through a NULL-check conditional. For the branch wherein the state is
  * changed to CONST_IMM, the verifier releases the reference.
+ *
+ * For each helper function that allocates a reference, such as
+ * bpf_sk_lookup_tcp(), there is a corresponding release function, such as
+ * bpf_sk_release(). When a reference type passes into the release function,
+ * the verifier also releases the reference. If any unchecked or unreleased
+ * reference remains at the end of the program, the verifier rejects it.
  */
 
 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
@@ -300,7 +306,7 @@  static bool arg_type_is_refcounted(enum bpf_arg_type type)
  */
 static bool is_release_function(enum bpf_func_id func_id)
 {
-	return false;
+	return func_id == BPF_FUNC_sk_release;
 }
 
 /* string representation of 'enum bpf_reg_type' */
diff --git a/net/core/filter.c b/net/core/filter.c
index 43f81883f31d..f0715e2e7b07 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -58,13 +58,17 @@ 
 #include <net/busy_poll.h>
 #include <net/tcp.h>
 #include <net/xfrm.h>
+#include <net/udp.h>
 #include <linux/bpf_trace.h>
 #include <net/xdp_sock.h>
 #include <linux/inetdevice.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
 #include <net/ip_fib.h>
 #include <net/flow.h>
 #include <net/arp.h>
 #include <net/ipv6.h>
+#include <net/net_namespace.h>
 #include <linux/seg6_local.h>
 #include <net/seg6.h>
 #include <net/seg6_local.h>
@@ -4812,6 +4816,139 @@  static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
 };
 #endif /* CONFIG_IPV6_SEG6_BPF */
 
+struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
+		       struct sk_buff *skb, u8 family, u8 proto)
+{
+	int dif = skb->dev->ifindex;
+	bool refcounted = false;
+	struct sock *sk = NULL;
+
+	if (family == AF_INET) {
+		__be32 src4 = tuple->ipv4.saddr;
+		__be32 dst4 = tuple->ipv4.daddr;
+		int sdif = inet_sdif(skb);
+
+		if (proto == IPPROTO_TCP)
+			sk = __inet_lookup(net, &tcp_hashinfo, skb, 0,
+					   src4, tuple->ipv4.sport,
+					   dst4, tuple->ipv4.dport,
+					   dif, sdif, &refcounted);
+		else
+			sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
+					       dst4, tuple->ipv4.dport,
+					       dif, sdif, &udp_table, skb);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
+		struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
+		int sdif = inet6_sdif(skb);
+
+		if (proto == IPPROTO_TCP)
+			sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0,
+					    src6, tuple->ipv6.sport,
+					    dst6, tuple->ipv6.dport,
+					    dif, sdif, &refcounted);
+		else
+			sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport,
+					       dst6, tuple->ipv6.dport,
+					       dif, sdif, &udp_table, skb);
+#endif
+	}
+
+	if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
+		WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+		sk = NULL;
+	}
+	return sk;
+}
+
+/* bpf_sk_lookup performs the core lookup for different types of sockets,
+ * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
+ * Returns the socket as an 'unsigned long' to simplify the casting in the
+ * callers to satisfy BPF_CALL declarations.
+ */
+static unsigned long
+bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+	      u8 proto, u64 netns_id, u64 flags)
+{
+	struct net *caller_net = dev_net(skb->dev);
+	struct sock *sk = NULL;
+	u8 family = AF_UNSPEC;
+	struct net *net;
+
+	family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
+	if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags))
+		goto out;
+
+	if (netns_id)
+		net = get_net_ns_by_id(caller_net, netns_id);
+	else
+		net = caller_net;
+	if (unlikely(!net))
+		goto out;
+	sk = sk_lookup(net, tuple, skb, family, proto);
+	put_net(net);
+
+	if (sk)
+		sk = sk_to_full_sk(sk);
+out:
+	return (unsigned long) sk;
+}
+
+BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+	return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
+	.func		= bpf_sk_lookup_tcp,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+	return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
+	.func		= bpf_sk_lookup_udp,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags)
+{
+	if (!sock_flag(sk, SOCK_RCU_FREE))
+		sock_gen_put(sk);
+
+	if (unlikely(flags))
+		return -EINVAL;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_release_proto = {
+	.func		= bpf_sk_release,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_SOCKET,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 bool bpf_helper_changes_pkt_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
@@ -5018,6 +5155,12 @@  tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_ancestor_cgroup_id:
 		return &bpf_skb_ancestor_cgroup_id_proto;
 #endif
+	case BPF_FUNC_sk_lookup_tcp:
+		return &bpf_sk_lookup_tcp_proto;
+	case BPF_FUNC_sk_lookup_udp:
+		return &bpf_sk_lookup_udp_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -5118,6 +5261,12 @@  sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_redirect_hash_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
+	case BPF_FUNC_sk_lookup_tcp:
+		return &bpf_sk_lookup_tcp_proto;
+	case BPF_FUNC_sk_lookup_udp:
+		return &bpf_sk_lookup_udp_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index aa5ccd2385ed..620adbb09a94 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2143,6 +2143,41 @@  union bpf_attr {
  *		request in the skb.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, flags)
+ * 	Decription
+ * 		Look for TCP socket matching 'tuple'. The return value must
+ * 		be checked, and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@netns: network namespace id
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ * struct bpf_sock_ops *bpf_sk_lookup_udp(ctx, tuple, tuple_size, netns, flags)
+ * 	Decription
+ * 		Look for UDP socket matching 'tuple'. The return value must
+ * 		be checked, and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@netns: network namespace id
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ *  int bpf_sk_release(sock, flags)
+ * 	Description
+ * 		Release the reference held by 'sock'.
+ * 		@sock: Pointer reference to release. Must be found via
+ * 		       bpf_sk_lookup_xxx().
+ * 		@flags: flags value
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2228,7 +2263,10 @@  union bpf_attr {
 	FN(get_current_cgroup_id),	\
 	FN(get_local_storage),		\
 	FN(sk_select_reuseport),	\
-	FN(skb_ancestor_cgroup_id),
+	FN(skb_ancestor_cgroup_id),	\
+	FN(sk_lookup_tcp),		\
+	FN(sk_lookup_udp),		\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2398,6 +2436,23 @@  struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		struct {
+			__be32 saddr;
+			__be32 daddr;
+			__be16 sport;
+			__be16 dport;
+		} ipv4;
+		struct {
+			__be32 saddr[4];
+			__be32 daddr[4];
+			__be16 sport;
+			__be16 dport;
+		} ipv6;
+	};
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index e4be7730222d..88ce00c3aa0f 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -143,6 +143,18 @@  static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) =
 	(void *) BPF_FUNC_skb_cgroup_id;
 static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) =
 	(void *) BPF_FUNC_skb_ancestor_cgroup_id;
+static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx,
+					     struct bpf_sock_tuple *tuple,
+					     int size, unsigned int netns_id,
+					     unsigned long long flags) =
+	(void *) BPF_FUNC_sk_lookup_tcp;
+static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx,
+					     struct bpf_sock_tuple *tuple,
+					     int size, unsigned int netns_id,
+					     unsigned long long flags) =
+	(void *) BPF_FUNC_sk_lookup_udp;
+static int (*bpf_sk_release)(struct bpf_sock *sk, unsigned long long flags) =
+	(void *) BPF_FUNC_sk_release;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions