diff mbox series

[RFC,bpf-next,8/9] bpf: Provide helper to do lookups in kernel FIB table

Message ID 20180425183449.25134-9-dsahern@gmail.com
State RFC, archived
Delegated to: BPF Maintainers
Headers show
Series bpf: Add helper to do FIB lookups | expand

Commit Message

David Ahern April 25, 2018, 6:34 p.m. UTC
Provide a helper for doing a FIB and neighbor lookup in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet continues up the stack.

If it is to be forwarded, the forwarding can be done directly if the
neighbor is already known. If the neighbor does not exist, the first
few packets go up the stack for neighbor resolution. Once resolved, the
xdp program provides the fast path.

On successful lookup the nexthop dmac, current device smac and egress
device index are returned.

The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
are implemented in this patch. The API includes layer 4 parameters if
the XDP program chooses to do deep packet inspection, allowing the flow
to be compared against ACLs implemented as FIB rules.

Header rewrite is left to the XDP program.

The lookup takes 2 flags:
- BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
  straight to the table associated with the device (expert setting for
  those looking to maximize throughput)

- BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
  Default is an ingress lookup.

Initial performance numbers collected by Jesper, forwarded packets/sec:

       Full stack    XDP FIB lookup    XDP Direct lookup
IPv4   1,947,969       7,074,156          7,415,333
IPv6   1,728,000       6,165,504          7,262,720


Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/uapi/linux/bpf.h |  68 +++++++++++++-
 net/core/filter.c        | 233 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 300 insertions(+), 1 deletion(-)

Comments

Daniel Borkmann April 25, 2018, 7:55 p.m. UTC | #1
On 04/25/2018 08:34 PM, David Ahern wrote:
> Provide a helper for doing a FIB and neighbor lookup in the kernel
> tables from an XDP program. The helper provides a fastpath for forwarding
> packets. If the packet is a local delivery or for any reason is not a
> simple lookup and forward, the packet continues up the stack.
> 
> If it is to be forwarded, the forwarding can be done directly if the
> neighbor is already known. If the neighbor does not exist, the first
> few packets go up the stack for neighbor resolution. Once resolved, the
> xdp program provides the fast path.
> 
> On successful lookup the nexthop dmac, current device smac and egress
> device index are returned.
> 
> The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
> are implemented in this patch. The API includes layer 4 parameters if
> the XDP program chooses to do deep packet inspection to allow compare
> against ACLs implemented as FIB rules.
> 
> Header rewrite is left to the XDP program.
> 
> The lookup takes 2 flags:
> - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
>   straight to the table associated with the device (expert setting for
>   those looking to maximize throughput)
> 
> - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
>   Default is an ingress lookup.
> 
> Initial performance numbers collected by Jesper, forwarded packets/sec:
> 
>        Full stack    XDP FIB lookup    XDP Direct lookup
> IPv4   1,947,969       7,074,156          7,415,333
> IPv6   1,728,000       6,165,504          7,262,720
> 
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>
> ---
>  include/uapi/linux/bpf.h |  68 +++++++++++++-
>  net/core/filter.c        | 233 +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 300 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e6679393b687..82601c132b9f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -10,6 +10,8 @@
>  
>  #include <linux/types.h>
>  #include <linux/bpf_common.h>
> +#include <linux/if_ether.h>
> +#include <linux/in6.h>
>  
>  /* Extended instruction set based on top of classic BPF */
>  
> @@ -783,6 +785,17 @@ union bpf_attr {
>   *     @size: size of 'struct bpf_xfrm_state'
>   *     @flags: room for future extensions
>   *     Return: 0 on success or negative error
> + *
> + * int bpf_fib_lookup(ctx, params, plen, flags)
> + *     Do a FIB lookup based on given parameters
> + *     @ctx:     pointer to context of type xdp_md

Nit: I would just say "pointer to context" here, since the helper is used with both XDP and skb contexts.

> + *     @params:  pointer to bpf_fib_lookup
> + *     @plen:    size of params argument
> + *     @flags:   u32 bitmask of BPF_FIB_LOOKUP_* flags
> + *     Return: egress device index if packet is to be forwarded,
> + *             0 for local delivery (anything that needs to be handled
> + *             by the full stack), or negative on error.
> + *             If index is > 0, output data in bpf_fib_lookup is set
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -851,7 +864,9 @@ union bpf_attr {
>  	FN(msg_pull_data),		\
>  	FN(bind),			\
>  	FN(xdp_adjust_tail),		\
> -	FN(skb_get_xfrm_state),
> +	FN(skb_get_xfrm_state),		\
> +	FN(fib_lookup),			\
> +
>  

Nit: there is a stray trailing '\' after FN(fib_lookup), and it is followed by a double newline.

>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
[...]

> diff --git a/net/core/filter.c b/net/core/filter.c
> index 8e45c6c7ab08..37602b2fb94a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -59,6 +59,10 @@
>  #include <net/tcp.h>
>  #include <net/xfrm.h>
>  #include <linux/bpf_trace.h>
> +#include <linux/inetdevice.h>
> +#include <net/ip_fib.h>
> +#include <net/flow.h>
> +#include <net/arp.h>
>  
>  /**
>   *	sk_filter_trim_cap - run a packet through a socket filter
> @@ -3787,6 +3791,231 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
>  };
>  #endif
>  
> +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
> +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
> +				  const struct neighbour *neigh,
> +				  const struct net_device *dev)
> +{
> +	memcpy(params->dmac, neigh->ha, ETH_ALEN);
> +	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
> +	params->h_vlan_TCI = 0;
> +	params->h_vlan_proto = 0;
> +
> +	return dev->ifindex;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_INET)
> +static int bpf_ipv4_fib_lookup(struct xdp_buff *ctx,

Instead of passing xdp_buff here, just pass the netdev pointer. See below
for why this is needed.

> +			       struct bpf_fib_lookup *params, u32 flags)
> +{
> +	struct net *net = dev_net(ctx->rxq->dev);
> +	struct in_device *in_dev;
> +	struct neighbour *neigh;
> +	struct net_device *dev;
> +	struct fib_result res;
> +	struct fib_nh *nh;
> +	struct flowi4 fl4;
> +	int err;
> +
> +	dev = dev_get_by_index_rcu(net, params->ifindex);
> +	if (unlikely(!dev))
> +		return -ENODEV;
> +
> +	/* verify forwarding is enabled on this interface */
> +	in_dev = __in_dev_get_rcu(dev);
> +	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
> +		return 0;
> +
> +	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> +		fl4.flowi4_iif = 1;
> +		fl4.flowi4_oif = params->ifindex;
> +	} else {
> +		fl4.flowi4_iif = params->ifindex;
> +		fl4.flowi4_oif = 0;
> +	}
> +	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
> +	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
> +	fl4.flowi4_flags = 0;
> +
> +	fl4.flowi4_proto = params->l4_protocol;
> +	fl4.daddr = params->ipv4_dst;
> +	fl4.saddr = params->ipv4_src;
> +	fl4.fl4_sport = params->sport;
> +	fl4.fl4_dport = params->dport;
> +
> +	if (flags & BPF_FIB_LOOKUP_DIRECT) {
> +		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> +		struct fib_table *tb;
> +
> +		tb = fib_get_table(net, tbid);
> +		if (unlikely(!tb))
> +			return 0;
> +
> +		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
> +	} else {
> +		fl4.flowi4_mark = 0;
> +		fl4.flowi4_secid = 0;
> +		fl4.flowi4_tun_key.tun_id = 0;
> +		fl4.flowi4_uid = sock_net_uid(net, NULL);
> +
> +		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
> +	}
> +
> +	if (err || res.type != RTN_UNICAST)
> +		return 0;
> +
> +	if (res.fi->fib_nhs > 1)
> +		fib_select_path(net, &res, &fl4, NULL);
> +
> +	nh = &res.fi->fib_nh[res.nh_sel];
> +
> +	/* do not handle lwt encaps right now */
> +	if (nh->nh_lwtstate)
> +		return 0;
> +
> +	dev = nh->nh_dev;
> +	if (unlikely(!dev))
> +		return 0;
> +
> +	if (nh->nh_gw)
> +		params->ipv4_dst = nh->nh_gw;
> +
> +	params->rt_metric = res.fi->fib_priority;
> +
> +	/* xdp and cls_bpf programs are run in RCU-bh so
> +	 * rcu_read_lock_bh is not needed here
> +	 */
> +	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
> +	if (neigh)
> +		return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> +	return 0;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +static int bpf_ipv6_fib_lookup(struct xdp_buff *ctx,

Same here.

> +			       struct bpf_fib_lookup *params, u32 flags)
> +{
> +	struct net *net = dev_net(ctx->rxq->dev);
> +	struct neighbour *neigh;
> +	struct net_device *dev;
> +	struct fib6_info *f6i;
> +	struct flowi6 fl6;
> +	int strict = 0;
> +	int oif;
> +
> +	/* link local addresses are never forwarded */
> +	if (rt6_need_strict(&params->ipv6_dst) ||
> +	    rt6_need_strict(&params->ipv6_src))
> +		return 0;
> +
> +	dev = dev_get_by_index_rcu(net, params->ifindex);
> +	if (unlikely(!dev))
> +		return -ENODEV;
> +
> +	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> +		fl6.flowi6_iif = 1;
> +		oif = fl6.flowi6_oif = params->ifindex;
> +	} else {
> +		oif = fl6.flowi6_iif = params->ifindex;
> +		fl6.flowi6_oif = 0;
> +		strict = RT6_LOOKUP_F_HAS_SADDR;
> +	}
> +	fl6.flowlabel = params->flowlabel;
> +	fl6.flowi6_scope = 0;
> +	fl6.flowi6_flags = 0;
> +	fl6.mp_hash = 0;
> +
> +	fl6.flowi6_proto = params->l4_protocol;
> +	fl6.daddr = params->ipv6_dst;
> +	fl6.saddr = params->ipv6_src;
> +	fl6.fl6_sport = params->sport;
> +	fl6.fl6_dport = params->dport;
> +
> +	if (flags & BPF_FIB_LOOKUP_DIRECT) {
> +		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> +		struct fib6_table *tb;
> +
> +		tb = ipv6_stub->fib6_get_table(net, tbid);
> +		if (unlikely(!tb))
> +			return 0;
> +
> +		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
> +	} else {
> +		fl6.flowi6_mark = 0;
> +		fl6.flowi6_secid = 0;
> +		fl6.flowi6_tun_key.tun_id = 0;
> +		fl6.flowi6_uid = sock_net_uid(net, NULL);
> +
> +		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
> +	}
> +
> +	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
> +		return 0;
> +
> +	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
> +	    f6i->fib6_type != RTN_UNICAST))
> +		return 0;
> +
> +	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
> +		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
> +						       fl6.flowi6_oif, NULL,
> +						       strict);
> +
> +	if (f6i->fib6_nh.nh_lwtstate)
> +		return 0;
> +
> +	if (f6i->fib6_flags & RTF_GATEWAY)
> +		params->ipv6_dst = f6i->fib6_nh.nh_gw;
> +
> +	dev = f6i->fib6_nh.nh_dev;
> +	params->rt_metric = f6i->fib6_metric;
> +
> +	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
> +	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
> +	 * because we need to get nd_tbl via the stub
> +	 */
> +	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
> +				      ndisc_hashfn, &params->ipv6_dst, dev);
> +	if (neigh)
> +		return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> +	return 0;
> +}
> +#endif
> +
> +BPF_CALL_4(bpf_fib_lookup, struct xdp_buff *, ctx,
> +	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
> +{
> +	if (plen < sizeof(*params))
> +		return -EINVAL;
> +
> +	switch (params->family) {
> +#if IS_ENABLED(CONFIG_INET)
> +	case AF_INET:
> +		return bpf_ipv4_fib_lookup(ctx, params, flags);
> +#endif
> +#if IS_ENABLED(CONFIG_IPV6)
> +	case AF_INET6:
> +		return bpf_ipv6_fib_lookup(ctx, params, flags);
> +#endif
> +	}
> +	return -ENOTSUPP;
> +}
> +
> +static const struct bpf_func_proto bpf_fib_lookup_proto = {
> +	.func		= bpf_fib_lookup,
> +	.gpl_only	= true,
> +	.pkt_access	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type      = ARG_PTR_TO_CTX,
> +	.arg2_type      = ARG_PTR_TO_MEM,
> +	.arg3_type      = ARG_CONST_SIZE,
> +	.arg4_type	= ARG_ANYTHING,
> +};
> +
>  static const struct bpf_func_proto *
>  bpf_base_func_proto(enum bpf_func_id func_id)
>  {
> @@ -3861,6 +4090,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_get_socket_cookie_proto;
>  	case BPF_FUNC_get_socket_uid:
>  		return &bpf_get_socket_uid_proto;
> +	case BPF_FUNC_fib_lookup:
> +		return &bpf_fib_lookup_proto;

This part doesn't belong in sk_filter_func_proto(); it belongs in
tc_cls_act_func_proto() instead.

>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> @@ -3957,6 +4188,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_xdp_redirect_map_proto;
>  	case BPF_FUNC_xdp_adjust_tail:
>  		return &bpf_xdp_adjust_tail_proto;
> +	case BPF_FUNC_fib_lookup:
> +		return &bpf_fib_lookup_proto;

Basically, you're using the very same bpf_fib_lookup_proto for
both XDP and skb. In the skb case, you're reusing the two functions
bpf_ipv{4,6}_fib_lookup(), so when you get the netdev pointer for
retrieving the netns, you'll crash at dev_net(ctx->rxq->dev) since
this is XDP only and not skb meta data.

Therefore, as mentioned, pass the netdev to bpf_ipv{4,6}_fib_lookup()
to have it generic and have bpf_xdp_fib_lookup_proto and
bpf_skb_fib_lookup_proto where both are under the case BPF_FUNC_fib_lookup
in the respective *func_proto(), but using the proper prototypes according
to their correct context. Meaning, both reuse bpf_ipv{4,6}_fib_lookup()
from each of their BPF_CALL_4() helper implementation.

>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
>
David Ahern April 25, 2018, 11:24 p.m. UTC | #2
On 4/25/18 1:55 PM, Daniel Borkmann wrote:
>> @@ -3861,6 +4090,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>>  		return &bpf_get_socket_cookie_proto;
>>  	case BPF_FUNC_get_socket_uid:
>>  		return &bpf_get_socket_uid_proto;
>> +	case BPF_FUNC_fib_lookup:
>> +		return &bpf_fib_lookup_proto;
> This part doesn't belong to sk_filter_func_proto(), but to the
> tc_cls_act_func_proto() instead.

oops, somewhere in all of the re-basing it got added to the wrong
function. Will fix.

> 
>>  	default:
>>  		return bpf_base_func_proto(func_id);
>>  	}
>> @@ -3957,6 +4188,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>>  		return &bpf_xdp_redirect_map_proto;
>>  	case BPF_FUNC_xdp_adjust_tail:
>>  		return &bpf_xdp_adjust_tail_proto;
>> +	case BPF_FUNC_fib_lookup:
>> +		return &bpf_fib_lookup_proto;
> Basically, you're using the very same bpf_fib_lookup_proto for
> both XDP and skb. In the skb case, you're reusing the two functions
> bpf_ipv{4,6}_fib_lookup(), so when you get the netdev pointer for
> retrieving the netns, you'll crash at dev_net(ctx->rxq->dev) since
> this is XDP only and not skb meta data.
> 
> Therefore, as mentioned, pass the netdev to bpf_ipv{4,6}_fib_lookup()
> to have it generic and have bpf_xdp_fib_lookup_proto and
> bpf_skb_fib_lookup_proto where both are under the case BPF_FUNC_fib_lookup
> in the respective *func_proto(), but using the proper prototypes according
> to their correct context. Meaning, both reuse bpf_ipv{4,6}_fib_lookup()
> from each of their BPF_CALL_4() helper implementation.

ok. I have been focused on the xdp program and not the tc path. Will fix.
Martin KaFai Lau April 27, 2018, 4:43 p.m. UTC | #3
On Wed, Apr 25, 2018 at 11:34:48AM -0700, David Ahern wrote:
> Provide a helper for doing a FIB and neighbor lookup in the kernel
> tables from an XDP program. The helper provides a fastpath for forwarding
> packets. If the packet is a local delivery or for any reason is not a
> simple lookup and forward, the packet continues up the stack.
> 
> If it is to be forwarded, the forwarding can be done directly if the
> neighbor is already known. If the neighbor does not exist, the first
> few packets go up the stack for neighbor resolution. Once resolved, the
> xdp program provides the fast path.
> 
> On successful lookup the nexthop dmac, current device smac and egress
> device index are returned.
> 
> The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
> are implemented in this patch. The API includes layer 4 parameters if
> the XDP program chooses to do deep packet inspection to allow compare
> against ACLs implemented as FIB rules.
> 
> Header rewrite is left to the XDP program.
> 
> The lookup takes 2 flags:
> - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
>   straight to the table associated with the device (expert setting for
>   those looking to maximize throughput)
> 
> - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
>   Default is an ingress lookup.
> 
> Initial performance numbers collected by Jesper, forwarded packets/sec:
> 
>        Full stack    XDP FIB lookup    XDP Direct lookup
> IPv4   1,947,969       7,074,156          7,415,333
> IPv6   1,728,000       6,165,504          7,262,720
> 
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>
> ---
>  include/uapi/linux/bpf.h |  68 +++++++++++++-
>  net/core/filter.c        | 233 +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 300 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e6679393b687..82601c132b9f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -10,6 +10,8 @@
>  
>  #include <linux/types.h>
>  #include <linux/bpf_common.h>
> +#include <linux/if_ether.h>
> +#include <linux/in6.h>
>  
>  /* Extended instruction set based on top of classic BPF */
>  
> @@ -783,6 +785,17 @@ union bpf_attr {
>   *     @size: size of 'struct bpf_xfrm_state'
>   *     @flags: room for future extensions
>   *     Return: 0 on success or negative error
> + *
> + * int bpf_fib_lookup(ctx, params, plen, flags)
> + *     Do a FIB lookup based on given parameters
> + *     @ctx:     pointer to context of type xdp_md
> + *     @params:  pointer to bpf_fib_lookup
> + *     @plen:    size of params argument
> + *     @flags:   u32 bitmask of BPF_FIB_LOOKUP_* flags
> + *     Return: egress device index if packet is to be forwarded,
> + *             0 for local delivery (anything that needs to be handled
> + *             by the full stack), or negative on error.
> + *             If index is > 0, output data in bpf_fib_lookup is set
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -851,7 +864,9 @@ union bpf_attr {
>  	FN(msg_pull_data),		\
>  	FN(bind),			\
>  	FN(xdp_adjust_tail),		\
> -	FN(skb_get_xfrm_state),
> +	FN(skb_get_xfrm_state),		\
> +	FN(fib_lookup),			\
> +
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -1255,4 +1270,55 @@ struct bpf_raw_tracepoint_args {
>  	__u64 args[0];
>  };
>  
> +/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
> + * OUTPUT:  Do lookup from egress perspective; default is ingress
> + */
> +#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
> +#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
> +
> +struct bpf_fib_lookup {
> +	/* input */
> +	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
> +
> +	/* set if lookup is to consider L4 data - e.g., FIB rules */
> +	__u8	l4_protocol;
> +	__be16	sport;
> +	__be16	dport;
> +
> +	/* total length of packet from network header - used for MTU check */
> +	__u16	tot_len;
> +	__u32	ifindex;  /* L3 device index for lookup */
> +
> +	union {
> +		/* inputs to lookup */
> +		__u8	tos;		/* AF_INET  */
> +		__be32	flowlabel;	/* AF_INET6 */
> +
> +		/* output: metric of fib result */
> +		__u32 rt_metric;
> +	};
> +
> +	union {
> +		__be32		mpls_in;
> +		__be32		ipv4_src;
> +		struct in6_addr	ipv6_src;
> +	};
> +
> +	/* input to bpf_fib_lookup, *dst is destination address.
> +	 * output: bpf_fib_lookup sets to gateway address
> +	 */
> +	union {
> +		/* return for MPLS lookups */
> +		__be32		mpls_out[4];  /* support up to 4 labels */
> +		__be32		ipv4_dst;
> +		struct in6_addr	ipv6_dst;
> +	};
> +
> +	/* output */
> +	__be16	h_vlan_proto;
> +	__be16	h_vlan_TCI;
> +	__u8	smac[ETH_ALEN];
> +	__u8	dmac[ETH_ALEN];
> +};
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 8e45c6c7ab08..37602b2fb94a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -59,6 +59,10 @@
>  #include <net/tcp.h>
>  #include <net/xfrm.h>
>  #include <linux/bpf_trace.h>
> +#include <linux/inetdevice.h>
> +#include <net/ip_fib.h>
> +#include <net/flow.h>
> +#include <net/arp.h>
>  
>  /**
>   *	sk_filter_trim_cap - run a packet through a socket filter
> @@ -3787,6 +3791,231 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
>  };
>  #endif
>  
> +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
> +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
> +				  const struct neighbour *neigh,
> +				  const struct net_device *dev)
> +{
> +	memcpy(params->dmac, neigh->ha, ETH_ALEN);
> +	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
> +	params->h_vlan_TCI = 0;
> +	params->h_vlan_proto = 0;
> +
> +	return dev->ifindex;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_INET)
> +static int bpf_ipv4_fib_lookup(struct xdp_buff *ctx,
> +			       struct bpf_fib_lookup *params, u32 flags)
> +{
> +	struct net *net = dev_net(ctx->rxq->dev);
> +	struct in_device *in_dev;
> +	struct neighbour *neigh;
> +	struct net_device *dev;
> +	struct fib_result res;
> +	struct fib_nh *nh;
> +	struct flowi4 fl4;
> +	int err;
> +
> +	dev = dev_get_by_index_rcu(net, params->ifindex);
> +	if (unlikely(!dev))
> +		return -ENODEV;
> +
> +	/* verify forwarding is enabled on this interface */
> +	in_dev = __in_dev_get_rcu(dev);
> +	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
> +		return 0;
> +
> +	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> +		fl4.flowi4_iif = 1;
> +		fl4.flowi4_oif = params->ifindex;
> +	} else {
> +		fl4.flowi4_iif = params->ifindex;
> +		fl4.flowi4_oif = 0;
> +	}
> +	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
> +	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
> +	fl4.flowi4_flags = 0;
> +
> +	fl4.flowi4_proto = params->l4_protocol;
> +	fl4.daddr = params->ipv4_dst;
> +	fl4.saddr = params->ipv4_src;
> +	fl4.fl4_sport = params->sport;
> +	fl4.fl4_dport = params->dport;
> +
> +	if (flags & BPF_FIB_LOOKUP_DIRECT) {
> +		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> +		struct fib_table *tb;
> +
> +		tb = fib_get_table(net, tbid);
> +		if (unlikely(!tb))
> +			return 0;
> +
> +		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
> +	} else {
> +		fl4.flowi4_mark = 0;
> +		fl4.flowi4_secid = 0;
> +		fl4.flowi4_tun_key.tun_id = 0;
> +		fl4.flowi4_uid = sock_net_uid(net, NULL);
> +
> +		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
> +	}
> +
> +	if (err || res.type != RTN_UNICAST)
> +		return 0;
> +
> +	if (res.fi->fib_nhs > 1)
> +		fib_select_path(net, &res, &fl4, NULL);
> +
> +	nh = &res.fi->fib_nh[res.nh_sel];
> +
> +	/* do not handle lwt encaps right now */
> +	if (nh->nh_lwtstate)
> +		return 0;
> +
> +	dev = nh->nh_dev;
> +	if (unlikely(!dev))
> +		return 0;
> +
> +	if (nh->nh_gw)
> +		params->ipv4_dst = nh->nh_gw;
> +
> +	params->rt_metric = res.fi->fib_priority;
> +
> +	/* xdp and cls_bpf programs are run in RCU-bh so
> +	 * rcu_read_lock_bh is not needed here
> +	 */
> +	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
> +	if (neigh)
> +		return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> +	return 0;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +static int bpf_ipv6_fib_lookup(struct xdp_buff *ctx,
> +			       struct bpf_fib_lookup *params, u32 flags)
> +{
> +	struct net *net = dev_net(ctx->rxq->dev);
> +	struct neighbour *neigh;
> +	struct net_device *dev;
> +	struct fib6_info *f6i;
> +	struct flowi6 fl6;
> +	int strict = 0;
> +	int oif;
> +
> +	/* link local addresses are never forwarded */
> +	if (rt6_need_strict(&params->ipv6_dst) ||
> +	    rt6_need_strict(&params->ipv6_src))
> +		return 0;
> +
> +	dev = dev_get_by_index_rcu(net, params->ifindex);
> +	if (unlikely(!dev))
> +		return -ENODEV;
> +
> +	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> +		fl6.flowi6_iif = 1;
Is the 1 here meant to be LOOPBACK_IFINDEX?

> +		oif = fl6.flowi6_oif = params->ifindex;
> +	} else {
> +		oif = fl6.flowi6_iif = params->ifindex;
> +		fl6.flowi6_oif = 0;
> +		strict = RT6_LOOKUP_F_HAS_SADDR;
> +	}
> +	fl6.flowlabel = params->flowlabel;
> +	fl6.flowi6_scope = 0;
> +	fl6.flowi6_flags = 0;
> +	fl6.mp_hash = 0;
> +
> +	fl6.flowi6_proto = params->l4_protocol;
> +	fl6.daddr = params->ipv6_dst;
> +	fl6.saddr = params->ipv6_src;
> +	fl6.fl6_sport = params->sport;
> +	fl6.fl6_dport = params->dport;
> +
> +	if (flags & BPF_FIB_LOOKUP_DIRECT) {
> +		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> +		struct fib6_table *tb;
> +
> +		tb = ipv6_stub->fib6_get_table(net, tbid);
> +		if (unlikely(!tb))
> +			return 0;
> +
> +		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
> +	} else {
> +		fl6.flowi6_mark = 0;
> +		fl6.flowi6_secid = 0;
> +		fl6.flowi6_tun_key.tun_id = 0;
> +		fl6.flowi6_uid = sock_net_uid(net, NULL);
> +
> +		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
> +	}
> +
> +	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
> +		return 0;
> +
> +	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
> +	    f6i->fib6_type != RTN_UNICAST))
> +		return 0;
> +
> +	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
> +		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
> +						       fl6.flowi6_oif, NULL,
> +						       strict);
> +
> +	if (f6i->fib6_nh.nh_lwtstate)
> +		return 0;
> +
> +	if (f6i->fib6_flags & RTF_GATEWAY)
> +		params->ipv6_dst = f6i->fib6_nh.nh_gw;
> +
> +	dev = f6i->fib6_nh.nh_dev;
> +	params->rt_metric = f6i->fib6_metric;
> +
> +	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
> +	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
> +	 * because we need to get nd_tbl via the stub
> +	 */
> +	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
> +				      ndisc_hashfn, &params->ipv6_dst, dev);
> +	if (neigh)
> +		return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> +	return 0;
> +}
> +#endif
> +
> +BPF_CALL_4(bpf_fib_lookup, struct xdp_buff *, ctx,
> +	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
> +{
> +	if (plen < sizeof(*params))
> +		return -EINVAL;
> +
> +	switch (params->family) {
> +#if IS_ENABLED(CONFIG_INET)
> +	case AF_INET:
> +		return bpf_ipv4_fib_lookup(ctx, params, flags);
> +#endif
> +#if IS_ENABLED(CONFIG_IPV6)
> +	case AF_INET6:
> +		return bpf_ipv6_fib_lookup(ctx, params, flags);
> +#endif
> +	}
> +	return -ENOTSUPP;
> +}
> +
> +static const struct bpf_func_proto bpf_fib_lookup_proto = {
> +	.func		= bpf_fib_lookup,
> +	.gpl_only	= true,
> +	.pkt_access	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type      = ARG_PTR_TO_CTX,
> +	.arg2_type      = ARG_PTR_TO_MEM,
> +	.arg3_type      = ARG_CONST_SIZE,
> +	.arg4_type	= ARG_ANYTHING,
> +};
> +
>  static const struct bpf_func_proto *
>  bpf_base_func_proto(enum bpf_func_id func_id)
>  {
> @@ -3861,6 +4090,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_get_socket_cookie_proto;
>  	case BPF_FUNC_get_socket_uid:
>  		return &bpf_get_socket_uid_proto;
> +	case BPF_FUNC_fib_lookup:
> +		return &bpf_fib_lookup_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> @@ -3957,6 +4188,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_xdp_redirect_map_proto;
>  	case BPF_FUNC_xdp_adjust_tail:
>  		return &bpf_xdp_adjust_tail_proto;
> +	case BPF_FUNC_fib_lookup:
> +		return &bpf_fib_lookup_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> -- 
> 2.11.0
>
David Ahern April 27, 2018, 4:49 p.m. UTC | #4
On 4/27/18 10:43 AM, Martin KaFai Lau wrote:
>> +#if IS_ENABLED(CONFIG_IPV6)
>> +static int bpf_ipv6_fib_lookup(struct xdp_buff *ctx,
>> +			       struct bpf_fib_lookup *params, u32 flags)
>> +{
>> +	struct net *net = dev_net(ctx->rxq->dev);
>> +	struct neighbour *neigh;
>> +	struct net_device *dev;
>> +	struct fib6_info *f6i;
>> +	struct flowi6 fl6;
>> +	int strict = 0;
>> +	int oif;
>> +
>> +	/* link local addresses are never forwarded */
>> +	if (rt6_need_strict(&params->ipv6_dst) ||
>> +	    rt6_need_strict(&params->ipv6_src))
>> +		return 0;
>> +
>> +	dev = dev_get_by_index_rcu(net, params->ifindex);
>> +	if (unlikely(!dev))
>> +		return -ENODEV;
>> +
>> +	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
>> +		fl6.flowi6_iif = 1;
> 1 is for LOOPBACK_IFINDEX?

yes. The intention is to mirror the flow struct created by full stack so
that routing in bpf == routing in IPv6 stack. ip6_route_output_flags
sets flowi6_iif to 1, so I repeated it here.

> 
>> +		oif = fl6.flowi6_oif = params->ifindex;
>> +	} else {
>> +		oif = fl6.flowi6_iif = params->ifindex;
>> +		fl6.flowi6_oif = 0;
>> +		strict = RT6_LOOKUP_F_HAS_SADDR;
>> +	}
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e6679393b687..82601c132b9f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -10,6 +10,8 @@ 
 
 #include <linux/types.h>
 #include <linux/bpf_common.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
 
 /* Extended instruction set based on top of classic BPF */
 
@@ -783,6 +785,17 @@  union bpf_attr {
  *     @size: size of 'struct bpf_xfrm_state'
  *     @flags: room for future extensions
  *     Return: 0 on success or negative error
+ *
+ * int bpf_fib_lookup(ctx, params, plen, flags)
+ *     Do a FIB lookup based on given parameters
+ *     @ctx:     pointer to context of type xdp_md
+ *     @params:  pointer to bpf_fib_lookup
+ *     @plen:    size of params argument
+ *     @flags:   u32 bitmask of BPF_FIB_LOOKUP_* flags
+ *     Return: egress device index if packet is to be forwarded,
+ *             0 for local delivery (anything that needs to be handled
+ *             by the full stack), or negative on error.
+ *             If index is > 0, output data in bpf_fib_lookup is set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -851,7 +864,9 @@  union bpf_attr {
 	FN(msg_pull_data),		\
 	FN(bind),			\
 	FN(xdp_adjust_tail),		\
-	FN(skb_get_xfrm_state),
+	FN(skb_get_xfrm_state),		\
+	FN(fib_lookup),			\
+
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -1255,4 +1270,55 @@  struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
 
+/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
+ * OUTPUT:  Do lookup from egress perspective; default is ingress
+ */
+#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
+#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
+
+struct bpf_fib_lookup {
+	/* input */
+	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+
+	/* set if lookup is to consider L4 data - e.g., FIB rules */
+	__u8	l4_protocol;
+	__be16	sport;
+	__be16	dport;
+
+	/* total length of packet from network header - used for MTU check */
+	__u16	tot_len;
+	__u32	ifindex;  /* L3 device index for lookup */
+
+	union {
+		/* inputs to lookup */
+		__u8	tos;		/* AF_INET  */
+		__be32	flowlabel;	/* AF_INET6 */
+
+		/* output: metric of fib result */
+		__u32 rt_metric;
+	};
+
+	union {
+		__be32		mpls_in;
+		__be32		ipv4_src;
+		struct in6_addr	ipv6_src;
+	};
+
+	/* input to bpf_fib_lookup, *dst is destination address.
+	 * output: bpf_fib_lookup sets to gateway address
+	 */
+	union {
+		/* return for MPLS lookups */
+		__be32		mpls_out[4];  /* support up to 4 labels */
+		__be32		ipv4_dst;
+		struct in6_addr	ipv6_dst;
+	};
+
+	/* output */
+	__be16	h_vlan_proto;
+	__be16	h_vlan_TCI;
+	__u8	smac[ETH_ALEN];
+	__u8	dmac[ETH_ALEN];
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index 8e45c6c7ab08..37602b2fb94a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -59,6 +59,10 @@ 
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <linux/bpf_trace.h>
+#include <linux/inetdevice.h>
+#include <net/ip_fib.h>
+#include <net/flow.h>
+#include <net/arp.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -3787,6 +3791,231 @@  static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 };
 #endif
 
+#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
+				  const struct neighbour *neigh,
+				  const struct net_device *dev)
+{
+	/* fill the output fields: VLAN info is zeroed, then smac and dmac */
+	params->h_vlan_proto = 0;
+	params->h_vlan_TCI = 0;
+	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+	memcpy(params->dmac, neigh->ha, ETH_ALEN);
+	return dev->ifindex;	/* egress device index handed back to the caller */
+}
+#endif
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_ipv4_fib_lookup(struct xdp_buff *ctx,
+			       struct bpf_fib_lookup *params, u32 flags)
+{	/* IPv4 leg of bpf_fib_lookup: route lookup, then neighbour lookup */
+	struct net *net = dev_net(ctx->rxq->dev); /* NOTE(review): assumes XDP ctx; helper is also listed in sk_filter_func_proto */
+	struct in_device *in_dev;
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct fib_result res;
+	struct fib_nh *nh;
+	struct flowi4 fl4;	/* no initializer: only fields assigned below are set */
+	int err;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	/* forwarding must be enabled on this interface, else return 0 */
+	in_dev = __in_dev_get_rcu(dev);
+	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+		return 0;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl4.flowi4_iif = 1;	/* NOTE(review): 1 looks like LOOPBACK_IFINDEX - confirm */
+		fl4.flowi4_oif = params->ifindex;
+	} else {
+		fl4.flowi4_iif = params->ifindex;
+		fl4.flowi4_oif = 0;
+	}
+	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_flags = 0;
+
+	fl4.flowi4_proto = params->l4_protocol;
+	fl4.daddr = params->ipv4_dst;
+	fl4.saddr = params->ipv4_src;
+	fl4.fl4_sport = params->sport;
+	fl4.fl4_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; /* l3mdev table if non-zero, else main */
+		struct fib_table *tb;
+
+		tb = fib_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+	} else {
+		fl4.flowi4_mark = 0;	/* extra flow fields set only for the fib_lookup() path */
+		fl4.flowi4_secid = 0;
+		fl4.flowi4_tun_key.tun_id = 0;
+		fl4.flowi4_uid = sock_net_uid(net, NULL);
+
+		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
+	}
+
+	if (err || res.type != RTN_UNICAST)	/* failed lookup or non-unicast result */
+		return 0;
+
+	if (res.fi->fib_nhs > 1)	/* several nexthops: fib_select_path() picks */
+		fib_select_path(net, &res, &fl4, NULL);
+
+	nh = &res.fi->fib_nh[res.nh_sel];
+
+	/* do not handle lwt encaps right now */
+	if (nh->nh_lwtstate)
+		return 0;
+
+	dev = nh->nh_dev;	/* from here on dev is the egress device */
+	if (unlikely(!dev))
+		return 0;
+
+	if (nh->nh_gw)	/* gateway route: report and resolve the gateway address */
+		params->ipv4_dst = nh->nh_gw;
+
+	params->rt_metric = res.fi->fib_priority;	/* shares a union with the tos input */
+
+	/* xdp and cls_bpf programs are run in RCU-bh so
+	 * rcu_read_lock_bh is not needed here
+	 */
+	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;	/* no neighbour entry: nothing to forward with, return 0 */
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_ipv6_fib_lookup(struct xdp_buff *ctx,
+			       struct bpf_fib_lookup *params, u32 flags)
+{	/* IPv6 leg of bpf_fib_lookup: route lookup, then neighbour lookup */
+	struct net *net = dev_net(ctx->rxq->dev);
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct fib6_info *f6i;
+	struct flowi6 fl6;	/* no initializer: only fields assigned below are set */
+	int strict = 0;
+	int oif;
+
+	/* link local addresses are never forwarded */
+	if (rt6_need_strict(&params->ipv6_dst) ||
+	    rt6_need_strict(&params->ipv6_src))
+		return 0;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+	/* NOTE(review): unlike the IPv4 path, no forwarding-enabled check here */
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl6.flowi6_iif = 1;	/* NOTE(review): 1 looks like LOOPBACK_IFINDEX - confirm */
+		oif = fl6.flowi6_oif = params->ifindex;
+	} else {
+		oif = fl6.flowi6_iif = params->ifindex;
+		fl6.flowi6_oif = 0;
+		strict = RT6_LOOKUP_F_HAS_SADDR;
+	}
+	fl6.flowlabel = params->flowlabel;
+	fl6.flowi6_scope = 0;
+	fl6.flowi6_flags = 0;
+	fl6.mp_hash = 0;
+
+	fl6.flowi6_proto = params->l4_protocol;
+	fl6.daddr = params->ipv6_dst;
+	fl6.saddr = params->ipv6_src;
+	fl6.fl6_sport = params->sport;
+	fl6.fl6_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; /* l3mdev table if non-zero, else main */
+		struct fib6_table *tb;
+
+		tb = ipv6_stub->fib6_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+	} else {
+		fl6.flowi6_mark = 0;	/* extra flow fields set only for the fib6_lookup() path */
+		fl6.flowi6_secid = 0;
+		fl6.flowi6_tun_key.tun_id = 0;
+		fl6.flowi6_uid = sock_net_uid(net, NULL);
+
+		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+	}
+
+	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) /* no usable route */
+		return 0;
+
+	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
+	    f6i->fib6_type != RTN_UNICAST))	/* reject or non-unicast entry */
+		return 0;
+
+	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)	/* sibling routes and no oif set */
+		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
+						       fl6.flowi6_oif, NULL,
+						       strict);
+
+	if (f6i->fib6_nh.nh_lwtstate)	/* nexthop carries lwt state: return 0 */
+		return 0;
+
+	if (f6i->fib6_flags & RTF_GATEWAY)	/* report and resolve the gateway address */
+		params->ipv6_dst = f6i->fib6_nh.nh_gw;
+
+	dev = f6i->fib6_nh.nh_dev;	/* from here on dev is the egress device */
+	params->rt_metric = f6i->fib6_metric;	/* shares a union with the flowlabel input */
+
+	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
+	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
+	 * because we need to get nd_tbl via the stub
+	 */
+	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+				      ndisc_hashfn, &params->ipv6_dst, dev);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;	/* no neighbour entry: nothing to forward with, return 0 */
+}
+#endif
+
+BPF_CALL_4(bpf_fib_lookup, struct xdp_buff *, ctx,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{	/* helper entry: validate arguments, then dispatch on params->family */
+	if (plen < sizeof(*params) ||	/* buffer cannot hold the struct */
+	    (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)))
+		return -EINVAL;	/* ... or flag bits this helper does not define */
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(ctx, params, flags);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(ctx, params, flags);
+#endif
+	}
+	return -ENOTSUPP;	/* family not handled here (AF_MPLS is not implemented) */
+}
+
+static const struct bpf_func_proto bpf_fib_lookup_proto = {
+	.func		= bpf_fib_lookup,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,	/* ctx */
+	.arg2_type	= ARG_PTR_TO_MEM,	/* params */
+	.arg3_type	= ARG_CONST_SIZE,	/* plen */
+	.arg4_type	= ARG_ANYTHING,		/* flags */
+	.gpl_only	= true,
+	.pkt_access	= true,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3861,6 +4090,8 @@  sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
 		return &bpf_get_socket_uid_proto;
+	case BPF_FUNC_fib_lookup:
+		return &bpf_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -3957,6 +4188,8 @@  xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_redirect_map_proto;
 	case BPF_FUNC_xdp_adjust_tail:
 		return &bpf_xdp_adjust_tail_proto;
+	case BPF_FUNC_fib_lookup:
+		return &bpf_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}