diff mbox series

[bpf-net] bpf: Change bpf_fib_lookup to return lookup status

Message ID 20180617151819.6824-1-dsahern@kernel.org
State Changes Requested, archived
Delegated to: BPF Maintainers
Headers show
Series [bpf-net] bpf: Change bpf_fib_lookup to return lookup status | expand

Commit Message

David Ahern June 17, 2018, 3:18 p.m. UTC
From: David Ahern <dsahern@gmail.com>

For ACLs implemented using either FIB rules or FIB entries, the BPF
program needs the FIB lookup status to be able to drop the packet.
Since the bpf_fib_lookup API has not reached a released kernel yet,
change the return code to contain an encoding of the FIB lookup
result and return the nexthop device index in the params struct.

In addition, inform the BPF program of any post FIB lookup reason as
to why the packet needs to go up the stack.

Update the sample program per the change in API.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/uapi/linux/bpf.h   | 28 ++++++++++++++----
 net/core/filter.c          | 74 ++++++++++++++++++++++++++++++++--------------
 samples/bpf/xdp_fwd_kern.c |  8 ++---
 3 files changed, 78 insertions(+), 32 deletions(-)

Comments

Martin KaFai Lau June 18, 2018, 6:11 p.m. UTC | #1
On Sun, Jun 17, 2018 at 08:18:19AM -0700, dsahern@kernel.org wrote:
> From: David Ahern <dsahern@gmail.com>
> 
> For ACLs implemented using either FIB rules or FIB entries, the BPF
> program needs the FIB lookup status to be able to drop the packet.
Except BPF_FIB_LKUP_RET_SUCCESS and BPF_FIB_LKUP_RET_NO_NEIGH,  can you
give an example on how the xdp_prog may decide XDP_PASS vs XDP_DROP based
on other BPF_FIB_LKUP_RET_*?

> Since the bpf_fib_lookup API has not reached a released kernel yet,
> change the return code to contain an encoding of the FIB lookup
> result and return the nexthop device index in the params struct.
> 
> In addition, inform the BPF program of any post FIB lookup reason as
> to why the packet needs to go up the stack.
> 
> Update the sample program per the change in API.
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>
> ---
>  include/uapi/linux/bpf.h   | 28 ++++++++++++++----
>  net/core/filter.c          | 74 ++++++++++++++++++++++++++++++++--------------
>  samples/bpf/xdp_fwd_kern.c |  8 ++---
>  3 files changed, 78 insertions(+), 32 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 59b19b6a40d7..ceb80071c341 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1857,7 +1857,8 @@ union bpf_attr {
>   *		is resolved), the nexthop address is returned in ipv4_dst
>   *		or ipv6_dst based on family, smac is set to mac address of
>   *		egress device, dmac is set to nexthop mac address, rt_metric
> - *		is set to metric from route (IPv4/IPv6 only).
> + *		is set to metric from route (IPv4/IPv6 only), and ifindex
> + *		is set to the device index of the nexthop from the FIB lookup.
>   *
>   *             *plen* argument is the size of the passed in struct.
>   *             *flags* argument can be a combination of one or more of the
> @@ -1873,9 +1874,9 @@ union bpf_attr {
>   *             *ctx* is either **struct xdp_md** for XDP programs or
>   *             **struct sk_buff** tc cls_act programs.
>   *     Return
> - *             Egress device index on success, 0 if packet needs to continue
> - *             up the stack for further processing or a negative error in case
> - *             of failure.
> + *		< 0 if any input argument is invalid
> + *		  0 on success (packet is forwarded and nexthop neighbor exists)
> + *		> 0 one of BPF_FIB_LKUP_RET_ codes on FIB lookup response
>   *
>   * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
>   *	Description
> @@ -2612,6 +2613,19 @@ struct bpf_raw_tracepoint_args {
>  #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
>  #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
>  
> +enum {
> +	BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
> +	BPF_FIB_LKUP_RET_BLACKHOLE,    /* dest is blackholed */
> +	BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable */
> +	BPF_FIB_LKUP_RET_PROHIBIT,     /* dest not allowed */
> +	BPF_FIB_LKUP_RET_NOT_FWDED,    /* pkt is not forwardded */
BPF_FIB_LKUP_RET_NOT_FWDED is a catch all?

> +	BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
> +	BPF_FIB_LKUP_RET_UNSUPP_LWT,   /* fwd requires unsupported encap */
> +	BPF_FIB_LKUP_RET_NO_NHDEV,     /* nh device does not exist */
> +	BPF_FIB_LKUP_RET_NO_NEIGH,     /* no neigh entry for nh */
> +	BPF_FIB_LKUP_RET_FRAG_NEEDED,  /* pkt too big to fwd */
> +};
> +
>  struct bpf_fib_lookup {
>  	/* input:  network family for lookup (AF_INET, AF_INET6)
>  	 * output: network family of egress nexthop
> @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
>  
>  	/* total length of packet from network header - used for MTU check */
>  	__u16	tot_len;
> -	__u32	ifindex;  /* L3 device index for lookup */
> +
> +	/* input: L3 device index for lookup
> +	 * output: nexthop device index from FIB lookup
> +	 */
> +	__u32	ifindex;
>  
>  	union {
>  		/* inputs to lookup */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index e7f12e9f598c..e758ca487878 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
>  	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
>  	params->h_vlan_TCI = 0;
>  	params->h_vlan_proto = 0;
> +	params->ifindex = dev->ifindex;
>  
> -	return dev->ifindex;
> +	return 0;
>  }
>  #endif
>  
> @@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  	/* verify forwarding is enabled on this interface */
>  	in_dev = __in_dev_get_rcu(dev);
>  	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
> -		return 0;
> +		return BPF_FIB_LKUP_RET_FWD_DISABLED;
>  
>  	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
>  		fl4.flowi4_iif = 1;
> @@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  
>  		tb = fib_get_table(net, tbid);
>  		if (unlikely(!tb))
> -			return 0;
> +			return BPF_FIB_LKUP_RET_NOT_FWDED;
>  
>  		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
>  	} else {
> @@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
>  	}
>  
> -	if (err || res.type != RTN_UNICAST)
> -		return 0;
> +	if (err) {
> +		/* map fib lookup errors to RTN_ type */
> +		if (err == -EINVAL)
> +			return BPF_FIB_LKUP_RET_BLACKHOLE;
> +		if (err == -EHOSTUNREACH)
> +			return BPF_FIB_LKUP_RET_UNREACHABLE;
> +		if (err == -EACCES)
> +			return BPF_FIB_LKUP_RET_PROHIBIT;
> +
> +		return BPF_FIB_LKUP_RET_NOT_FWDED;
> +	}
> +
> +	if (res.type != RTN_UNICAST)
> +		return BPF_FIB_LKUP_RET_NOT_FWDED;
>  
>  	if (res.fi->fib_nhs > 1)
>  		fib_select_path(net, &res, &fl4, NULL);
> @@ -4144,18 +4157,18 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  	if (check_mtu) {
>  		mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
>  		if (params->tot_len > mtu)
> -			return 0;
> +			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
>  	}
>  
>  	nh = &res.fi->fib_nh[res.nh_sel];
>  
>  	/* do not handle lwt encaps right now */
>  	if (nh->nh_lwtstate)
> -		return 0;
> +		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
>  
>  	dev = nh->nh_dev;
>  	if (unlikely(!dev))
> -		return 0;
> +		return BPF_FIB_LKUP_RET_NO_NHDEV;
>  
>  	if (nh->nh_gw)
>  		params->ipv4_dst = nh->nh_gw;
> @@ -4166,10 +4179,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  	 * rcu_read_lock_bh is not needed here
>  	 */
>  	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
> -	if (neigh)
> -		return bpf_fib_set_fwd_params(params, neigh, dev);
> +	if (!neigh)
> +		return BPF_FIB_LKUP_RET_NO_NEIGH;
>  
> -	return 0;
> +	return bpf_fib_set_fwd_params(params, neigh, dev);
>  }
>  #endif
>  
> @@ -4190,7 +4203,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  
>  	/* link local addresses are never forwarded */
>  	if (rt6_need_strict(dst) || rt6_need_strict(src))
> -		return 0;
> +		return BPF_FIB_LKUP_RET_NOT_FWDED;
>  
>  	dev = dev_get_by_index_rcu(net, params->ifindex);
>  	if (unlikely(!dev))
> @@ -4198,7 +4211,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  
>  	idev = __in6_dev_get_safely(dev);
>  	if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
> -		return 0;
> +		return BPF_FIB_LKUP_RET_FWD_DISABLED;
>  
>  	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
>  		fl6.flowi6_iif = 1;
> @@ -4225,7 +4238,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  
>  		tb = ipv6_stub->fib6_get_table(net, tbid);
>  		if (unlikely(!tb))
> -			return 0;
> +			return BPF_FIB_LKUP_RET_NOT_FWDED;
>  
>  		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
>  	} else {
> @@ -4238,11 +4251,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  	}
>  
>  	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
> -		return 0;
> +		return BPF_FIB_LKUP_RET_NOT_FWDED;
> +
> +	if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
> +		switch (f6i->fib6_type) {
> +		case RTN_BLACKHOLE:
> +			return BPF_FIB_LKUP_RET_BLACKHOLE;
> +		case RTN_UNREACHABLE:
> +			return BPF_FIB_LKUP_RET_UNREACHABLE;
> +		case RTN_PROHIBIT:
> +			return BPF_FIB_LKUP_RET_PROHIBIT;
> +		default:
> +			return BPF_FIB_LKUP_RET_NOT_FWDED;
> +		}
> +	}
>  
> -	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
> -	    f6i->fib6_type != RTN_UNICAST))
> -		return 0;
> +	if (f6i->fib6_type != RTN_UNICAST)
> +		return BPF_FIB_LKUP_RET_NOT_FWDED;
>  
>  	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
>  		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
> @@ -4252,16 +4277,19 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  	if (check_mtu) {
>  		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
>  		if (params->tot_len > mtu)
> -			return 0;
> +			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
>  	}
>  
>  	if (f6i->fib6_nh.nh_lwtstate)
> -		return 0;
> +		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
>  
>  	if (f6i->fib6_flags & RTF_GATEWAY)
>  		*dst = f6i->fib6_nh.nh_gw;
>  
>  	dev = f6i->fib6_nh.nh_dev;
> +	if (unlikely(!dev))
> +		return BPF_FIB_LKUP_RET_NO_NHDEV;
Is this a bug fix?

> +
>  	params->rt_metric = f6i->fib6_metric;
>  
>  	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
> @@ -4270,10 +4298,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  	 */
>  	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
>  				      ndisc_hashfn, dst, dev);
> -	if (neigh)
> -		return bpf_fib_set_fwd_params(params, neigh, dev);
> +	if (!neigh)
> +		return BPF_FIB_LKUP_RET_NO_NEIGH;
>  
> -	return 0;
> +	return bpf_fib_set_fwd_params(params, neigh, dev);
>  }
>  #endif
>  
> diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
> index 6673cdb9f55c..a7e94e7ff87d 100644
> --- a/samples/bpf/xdp_fwd_kern.c
> +++ b/samples/bpf/xdp_fwd_kern.c
> @@ -48,9 +48,9 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
>  	struct ethhdr *eth = data;
>  	struct ipv6hdr *ip6h;
>  	struct iphdr *iph;
> -	int out_index;
>  	u16 h_proto;
>  	u64 nh_off;
> +	int rc;
>  
>  	nh_off = sizeof(*eth);
>  	if (data + nh_off > data_end)
> @@ -101,7 +101,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
>  
>  	fib_params.ifindex = ctx->ingress_ifindex;
>  
> -	out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
> +	rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
>  
>  	/* verify egress index has xdp support
>  	 * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
> @@ -109,7 +109,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
>  	 * NOTE: without verification that egress index supports XDP
>  	 *       forwarding packets are dropped.
>  	 */
> -	if (out_index > 0) {
> +	if (rc == 0) {
>  		if (h_proto == htons(ETH_P_IP))
>  			ip_decrease_ttl(iph);
>  		else if (h_proto == htons(ETH_P_IPV6))
> @@ -117,7 +117,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
>  
>  		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
>  		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
> -		return bpf_redirect_map(&tx_port, out_index, 0);
> +		return bpf_redirect_map(&tx_port, fib_params.ifindex, 0);
>  	}
>  
>  	return XDP_PASS;
> -- 
> 2.11.0
>
David Ahern June 18, 2018, 6:27 p.m. UTC | #2
On 6/18/18 12:11 PM, Martin KaFai Lau wrote:
> On Sun, Jun 17, 2018 at 08:18:19AM -0700, dsahern@kernel.org wrote:
>> From: David Ahern <dsahern@gmail.com>
>>
>> For ACLs implemented using either FIB rules or FIB entries, the BPF
>> program needs the FIB lookup status to be able to drop the packet.
> Except BPF_FIB_LKUP_RET_SUCCESS and BPF_FIB_LKUP_RET_NO_NEIGH,  can you
> give an example on how the xdp_prog may decide XDP_PASS vs XDP_DROP based
> on other BPF_FIB_LKUP_RET_*?
> 

	rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
	if (rc == 0)
		packet is forwarded, do the redirect

	/* the program is misconfigured -- wrong parameters in struct or flags */
	if (rc < 0)
		....

	/* rc > 0 case */
	switch(rc) {
	case BPF_FIB_LKUP_RET_BLACKHOLE:
	case BPF_FIB_LKUP_RET_UNREACHABLE:
	case BPF_FIB_LKUP_RET_PROHIBIT:
		return XDP_DROP;
	}

For the others it becomes a question of do we share why the stack needs
to be involved? Maybe the program wants to collect stats to show traffic
patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).

Arguably BPF_FIB_LKUP_RET_NO_NHDEV is not needed. See below.

>> @@ -2612,6 +2613,19 @@ struct bpf_raw_tracepoint_args {
>>  #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
>>  #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
>>  
>> +enum {
>> +	BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
>> +	BPF_FIB_LKUP_RET_BLACKHOLE,    /* dest is blackholed */
>> +	BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable */
>> +	BPF_FIB_LKUP_RET_PROHIBIT,     /* dest not allowed */
>> +	BPF_FIB_LKUP_RET_NOT_FWDED,    /* pkt is not forwardded */
> BPF_FIB_LKUP_RET_NOT_FWDED is a catch all?
> 

Destination is local. More precisely, the FIB lookup is not unicast so
not forwarded. It could be RTN_LOCAL, RTN_BROADCAST, RTN_ANYCAST, or
RTN_MULTICAST. The next ones -- blackhole, reachable, prohibit -- are
called out.

>> @@ -4252,16 +4277,19 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>>  	if (check_mtu) {
>>  		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
>>  		if (params->tot_len > mtu)
>> -			return 0;
>> +			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
>>  	}
>>  
>>  	if (f6i->fib6_nh.nh_lwtstate)
>> -		return 0;
>> +		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
>>  
>>  	if (f6i->fib6_flags & RTF_GATEWAY)
>>  		*dst = f6i->fib6_nh.nh_gw;
>>  
>>  	dev = f6i->fib6_nh.nh_dev;
>> +	if (unlikely(!dev))
>> +		return BPF_FIB_LKUP_RET_NO_NHDEV;
> Is this a bug fix?
> 

Difference between IPv4 and IPv6. Making them consistent.

It is a major BUG in the kernel to reach this point in either protocol
to have a unicast route not tied to a device. IPv4 has checks; v6 does
not. I figured this being new code, why not make bpf_ipv{4,6}_fib_lookup
as close to the same as possible.
Martin KaFai Lau June 18, 2018, 8:55 p.m. UTC | #3
On Mon, Jun 18, 2018 at 12:27:07PM -0600, David Ahern wrote:
> On 6/18/18 12:11 PM, Martin KaFai Lau wrote:
> > On Sun, Jun 17, 2018 at 08:18:19AM -0700, dsahern@kernel.org wrote:
> >> From: David Ahern <dsahern@gmail.com>
> >>
> >> For ACLs implemented using either FIB rules or FIB entries, the BPF
> >> program needs the FIB lookup status to be able to drop the packet.
> > Except BPF_FIB_LKUP_RET_SUCCESS and BPF_FIB_LKUP_RET_NO_NEIGH,  can you
> > give an example on how the xdp_prog may decide XDP_PASS vs XDP_DROP based
> > on other BPF_FIB_LKUP_RET_*?
> > 
> 
> 	rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
> 	if (rc == 0)
> 		packet is forwarded, do the redirect
> 
> 	/* the program is misconfigured -- wrong parameters in struct or flags */
> 	if (rc < 0)
> 		....
> 
> 	/* rc > 0 case */
> 	switch(rc) {
> 	case BPF_FIB_LKUP_RET_BLACKHOLE:
> 	case BPF_FIB_LKUP_RET_UNREACHABLE:
> 	case BPF_FIB_LKUP_RET_PROHIBIT:
> 		return XDP_DROP;
> 	}
> 
> For the others it becomes a question of do we share why the stack needs
> to be involved? Maybe the program wants to collect stats to show traffic
> patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
> in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
> interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).
Thanks for the explanation.

Agree on the bpf able to collect stats will be useful.

I am wondering, if a new BPF_FIB_LKUP_RET_XYZ is added later,
how may the old xdp_prog work/not-work?  As of now, the return value
is straight forward, FWD, PASS (to stack) or DROP (error).
With this change, the xdp_prog needs to match/switch() the
BPF_FIB_LKUP_RET_* to at least PASS and DROP.

> 
> Arguably BPF_FIB_LKUP_RET_NO_NHDEV is not needed. See below.
> 
> >> @@ -2612,6 +2613,19 @@ struct bpf_raw_tracepoint_args {
> >>  #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
> >>  #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
> >>  
> >> +enum {
> >> +	BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
> >> +	BPF_FIB_LKUP_RET_BLACKHOLE,    /* dest is blackholed */
> >> +	BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable */
> >> +	BPF_FIB_LKUP_RET_PROHIBIT,     /* dest not allowed */
> >> +	BPF_FIB_LKUP_RET_NOT_FWDED,    /* pkt is not forwardded */
> > BPF_FIB_LKUP_RET_NOT_FWDED is a catch all?
> > 
> 
> Destination is local. More precisely, the FIB lookup is not unicast so
> not forwarded. It could be RTN_LOCAL, RTN_BROADCAST, RTN_ANYCAST, or
> RTN_MULTICAST. The next ones -- blackhole, reachable, prohibit -- are
> called out.
I think it also includes the tbid not found case.

> 
> >> @@ -4252,16 +4277,19 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
> >>  	if (check_mtu) {
> >>  		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
> >>  		if (params->tot_len > mtu)
> >> -			return 0;
> >> +			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
> >>  	}
> >>  
> >>  	if (f6i->fib6_nh.nh_lwtstate)
> >> -		return 0;
> >> +		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
> >>  
> >>  	if (f6i->fib6_flags & RTF_GATEWAY)
> >>  		*dst = f6i->fib6_nh.nh_gw;
> >>  
> >>  	dev = f6i->fib6_nh.nh_dev;
> >> +	if (unlikely(!dev))
> >> +		return BPF_FIB_LKUP_RET_NO_NHDEV;
> > Is this a bug fix?
> > 
> 
> Difference between IPv4 and IPv6. Making them consistent.
> 
> It is a major BUG in the kernel to reach this point in either protocol
> to have a unicast route not tied to a device. IPv4 has checks; v6 does
> not. I figured this being new code, why not make bpf_ipv{4,6}_fib_lookup
> as close to the same as possible.
Make sense.  A comment in the commit log will be useful if there is a
re-spin.
David Ahern June 18, 2018, 9:35 p.m. UTC | #4
On 6/18/18 2:55 PM, Martin KaFai Lau wrote:
>> 	/* rc > 0 case */
>> 	switch(rc) {
>> 	case BPF_FIB_LKUP_RET_BLACKHOLE:
>> 	case BPF_FIB_LKUP_RET_UNREACHABLE:
>> 	case BPF_FIB_LKUP_RET_PROHIBIT:
>> 		return XDP_DROP;
>> 	}
>>
>> For the others it becomes a question of do we share why the stack needs
>> to be involved? Maybe the program wants to collect stats to show traffic
>> patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
>> in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
>> interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).
> Thanks for the explanation.
> 
> Agree on the bpf able to collect stats will be useful.
> 
> I am wondering, if a new BPF_FIB_LKUP_RET_XYZ is added later,
> how may the old xdp_prog work/not-work?  As of now, the return value
> is straight forward, FWD, PASS (to stack) or DROP (error).
> With this change, the xdp_prog needs to match/switch() the
> BPF_FIB_LKUP_RET_* to at least PASS and DROP.

IMO, programs should only call XDP_DROP for known reasons - like the 3
above. Anything else punt to the stack.

If a new RET_XYZ comes along:
1. the new XYZ is a new ACL response where the packet is to be dropped.
If the program does not understand XYZ and punts to the stack
(recommendation), then a second lookup is done during normal packet
processing and the stack drops it.

2. the new XYZ is a new path in the kernel that is unsupported with
respect to XDP forwarding, nothing new for the program to do.

Either way I would expect stats on BPF_FIB_LKUP_RET_* to give a hint to
the program writer.

Worst case of punting packets to the stack for any rc != 0 means the
stack is doing 2 lookups - 1 in XDP based on its lookup parameters and 1
in normal stack processing - to handle the packet.

> 
>>
>> Arguably BPF_FIB_LKUP_RET_NO_NHDEV is not needed. See below.
>>
>>>> @@ -2612,6 +2613,19 @@ struct bpf_raw_tracepoint_args {
>>>>  #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
>>>>  #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
>>>>  
>>>> +enum {
>>>> +	BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
>>>> +	BPF_FIB_LKUP_RET_BLACKHOLE,    /* dest is blackholed */
>>>> +	BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable */
>>>> +	BPF_FIB_LKUP_RET_PROHIBIT,     /* dest not allowed */
>>>> +	BPF_FIB_LKUP_RET_NOT_FWDED,    /* pkt is not forwardded */
>>> BPF_FIB_LKUP_RET_NOT_FWDED is a catch all?
>>>
>>
>> Destination is local. More precisely, the FIB lookup is not unicast so
>> not forwarded. It could be RTN_LOCAL, RTN_BROADCAST, RTN_ANYCAST, or
>> RTN_MULTICAST. The next ones -- blackhole, reachable, prohibit -- are
>> called out.
> I think it also includes the tbid not found case.

Another one of those "should never happen scenarios". The user does not
specify the table; it is retrieved based on device association. Table
defaults to the main table - which always exists - and any VRF
enslavement of a device happens after the VRF device creates the table.

> 
>>
>>>> @@ -4252,16 +4277,19 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>>>>  	if (check_mtu) {
>>>>  		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
>>>>  		if (params->tot_len > mtu)
>>>> -			return 0;
>>>> +			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
>>>>  	}
>>>>  
>>>>  	if (f6i->fib6_nh.nh_lwtstate)
>>>> -		return 0;
>>>> +		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
>>>>  
>>>>  	if (f6i->fib6_flags & RTF_GATEWAY)
>>>>  		*dst = f6i->fib6_nh.nh_gw;
>>>>  
>>>>  	dev = f6i->fib6_nh.nh_dev;
>>>> +	if (unlikely(!dev))
>>>> +		return BPF_FIB_LKUP_RET_NO_NHDEV;
>>> Is this a bug fix?
>>>
>>
>> Difference between IPv4 and IPv6. Making them consistent.
>>
>> It is a major BUG in the kernel to reach this point in either protocol
>> to have a unicast route not tied to a device. IPv4 has checks; v6 does
>> not. I figured this being new code, why not make bpf_ipv{4,6}_fib_lookup
>> as close to the same as possible.
> Make sense.  A comment in the commit log will be useful if there is a
> re-spin.
> 

ok.
Quentin Monnet June 19, 2018, 9:36 a.m. UTC | #5
Hi David,

2018-06-17 08:18 UTC-0700 ~ dsahern@kernel.org
> From: David Ahern <dsahern@gmail.com>
> 
> For ACLs implemented using either FIB rules or FIB entries, the BPF
> program needs the FIB lookup status to be able to drop the packet.
> Since the bpf_fib_lookup API has not reached a released kernel yet,
> change the return code to contain an encoding of the FIB lookup
> result and return the nexthop device index in the params struct.
> 
> In addition, inform the BPF program of any post FIB lookup reason as
> to why the packet needs to go up the stack.
> 
> Update the sample program per the change in API.
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>
> ---
>  include/uapi/linux/bpf.h   | 28 ++++++++++++++----
>  net/core/filter.c          | 74 ++++++++++++++++++++++++++++++++--------------
>  samples/bpf/xdp_fwd_kern.c |  8 ++---
>  3 files changed, 78 insertions(+), 32 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 59b19b6a40d7..ceb80071c341 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1857,7 +1857,8 @@ union bpf_attr {
>   *		is resolved), the nexthop address is returned in ipv4_dst
>   *		or ipv6_dst based on family, smac is set to mac address of
>   *		egress device, dmac is set to nexthop mac address, rt_metric
> - *		is set to metric from route (IPv4/IPv6 only).
> + *		is set to metric from route (IPv4/IPv6 only), and ifindex
> + *		is set to the device index of the nexthop from the FIB lookup.
>   *
>   *             *plen* argument is the size of the passed in struct.
>   *             *flags* argument can be a combination of one or more of the
> @@ -1873,9 +1874,9 @@ union bpf_attr {
>   *             *ctx* is either **struct xdp_md** for XDP programs or
>   *             **struct sk_buff** tc cls_act programs.
>   *     Return
> - *             Egress device index on success, 0 if packet needs to continue
> - *             up the stack for further processing or a negative error in case
> - *             of failure.
> + *		< 0 if any input argument is invalid
> + *		  0 on success (packet is forwarded and nexthop neighbor exists)
> + *		> 0 one of BPF_FIB_LKUP_RET_ codes on FIB lookup response


Since you are about to respin (I think?), could you please also fix the
formatting in your change to the doc? The "BPF_FIB_LKUP_RET_" is not
emphasized (and will even cause an error message when producing the man
page, because of the trailing underscore that gets interpreted in RST),
and the three cases for the return value are not formatted properly for
the conversion.

Something like the following would work:

---
 *     Return
 *		* < 0 if any input argument is invalid.
 *		*   0 on success (packet is forwarded and nexthop neighbor exists).
 *		* > 0: one of **BPF_FIB_LKUP_RET_** codes on FIB lookup response.
---

Thank you,
Quentin
David Ahern June 19, 2018, 3:02 p.m. UTC | #6
On 6/19/18 3:36 AM, Quentin Monnet wrote:
> Since you are about to respin (I think?), could you please also fix the
> formatting in your change to the doc? The "BPF_FIB_LKUP_RET_" is not
> emphasized (and will even cause an error message when producing the man
> page, because of the trailing underscore that gets interpreted in RST),
> and the three cases for the return value are not formatted properly for
> the conversion.
> 
> Something like the following would work:
> 
> ---
>  *     Return
>  *		* < 0 if any input argument is invalid.
>  *		*   0 on success (packet is forwarded and nexthop neighbor exists).
>  *		* > 0: one of **BPF_FIB_LKUP_RET_** codes on FIB lookup response.
> ---
> 

Will do. thanks for the review.
David Ahern June 19, 2018, 3:11 p.m. UTC | #7
On 6/18/18 2:55 PM, Martin KaFai Lau wrote:
>>
>> Arguably BPF_FIB_LKUP_RET_NO_NHDEV is not needed. See below.
>>

...

>>>> @@ -4252,16 +4277,19 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>>>>  	if (check_mtu) {
>>>>  		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
>>>>  		if (params->tot_len > mtu)
>>>> -			return 0;
>>>> +			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
>>>>  	}
>>>>  
>>>>  	if (f6i->fib6_nh.nh_lwtstate)
>>>> -		return 0;
>>>> +		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
>>>>  
>>>>  	if (f6i->fib6_flags & RTF_GATEWAY)
>>>>  		*dst = f6i->fib6_nh.nh_gw;
>>>>  
>>>>  	dev = f6i->fib6_nh.nh_dev;
>>>> +	if (unlikely(!dev))
>>>> +		return BPF_FIB_LKUP_RET_NO_NHDEV;
>>> Is this a bug fix?
>>>
>>
>> Difference between IPv4 and IPv6. Making them consistent.
>>
>> It is a major BUG in the kernel to reach this point in either protocol
>> to have a unicast route not tied to a device. IPv4 has checks; v6 does
>> not. I figured this being new code, why not make bpf_ipv{4,6}_fib_lookup
>> as close to the same as possible.
> Make sense.  A comment in the commit log will be useful if there is a
> re-spin.
> 

Upon further review, I will remove BPF_FIB_LKUP_RET_NO_NHDEV. The dev
check is not needed in either ipv4 or ipv6. For IPv4 after fib_lookup
calls both __mkroute_input and ip_route_output_key_hash_rcu expect dev
to be set for unicast as it should be.
Martin KaFai Lau June 19, 2018, 3:25 p.m. UTC | #8
On Mon, Jun 18, 2018 at 03:35:25PM -0600, David Ahern wrote:
> On 6/18/18 2:55 PM, Martin KaFai Lau wrote:
> >> 	/* rc > 0 case */
> >> 	switch(rc) {
> >> 	case BPF_FIB_LKUP_RET_BLACKHOLE:
> >> 	case BPF_FIB_LKUP_RET_UNREACHABLE:
> >> 	case BPF_FIB_LKUP_RET_PROHIBIT:
> >> 		return XDP_DROP;
> >> 	}
> >>
> >> For the others it becomes a question of do we share why the stack needs
> >> to be involved? Maybe the program wants to collect stats to show traffic
> >> patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
> >> in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
> >> interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).
> > Thanks for the explanation.
> > 
> > Agree on the bpf able to collect stats will be useful.
> > 
> > I am wondering, if a new BPF_FIB_LKUP_RET_XYZ is added later,
> > how may the old xdp_prog work/not-work?  As of now, the return value
> > is straight forward, FWD, PASS (to stack) or DROP (error).
> > With this change, the xdp_prog needs to match/switch() the
> > BPF_FIB_LKUP_RET_* to at least PASS and DROP.
> 
> IMO, programs should only call XDP_DROP for known reasons - like the 3
> above. Anything else punt to the stack.
> 
> If a new RET_XYZ comes along:
> 1. the new XYZ is a new ACL response where the packet is to be dropped.
> If the program does not understand XYZ and punts to the stack
> (recommendation), then a second lookup is done during normal packet
> processing and the stack drops it.
> 
> 2. the new XYZ is a new path in the kernel that is unsupported with
> respect to XDP forwarding, nothing new for the program to do.
> 
> Either way I would expect stats on BPF_FIB_LKUP_RET_* to give a hint to
> the program writer.
> 
> Worst case of punting packets to the stack for any rc != 0 means the
> stack is doing 2 lookups - 1 in XDP based on its lookup parameters and 1
> in normal stack processing - to handle the packet.
Instead of having the xdp_prog to follow the meaning of what RET_SYZ is,
should the bpf_*_fib_lookup() return value be kept as is such that
the xdp_prog is clear what to do.  The reason can be returned in
the 'struct bpf_fib_lookup'.  The number of reasons can be extended.
If the xdp_prog does not understand a reason, it still will not
affect its decision because the return value is clear.
I think the situation here is similar to regular syscall which usually
uses -1 to clearly states error and errno to spells out the reason.

> 
> > 
> >>
> >> Arguably BPF_FIB_LKUP_RET_NO_NHDEV is not needed. See below.
> >>
> >>>> @@ -2612,6 +2613,19 @@ struct bpf_raw_tracepoint_args {
> >>>>  #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
> >>>>  #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
> >>>>  
> >>>> +enum {
> >>>> +	BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
> >>>> +	BPF_FIB_LKUP_RET_BLACKHOLE,    /* dest is blackholed */
> >>>> +	BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable */
> >>>> +	BPF_FIB_LKUP_RET_PROHIBIT,     /* dest not allowed */
> >>>> +	BPF_FIB_LKUP_RET_NOT_FWDED,    /* pkt is not forwardded */
> >>> BPF_FIB_LKUP_RET_NOT_FWDED is a catch all?
> >>>
> >>
> >> Destination is local. More precisely, the FIB lookup is not unicast so
> >> not forwarded. It could be RTN_LOCAL, RTN_BROADCAST, RTN_ANYCAST, or
> >> RTN_MULTICAST. The next ones -- blackhole, reachable, prohibit -- are
> >> called out.
> > I think it also includes the tbid not found case.
> 
> Another one of those "should never happen scenarios". The user does not
> specify the table; it is retrieved based on device association. Table
> defaults to the main table - which always exists - and any VRF
> enslavement of a device happens after the VRF device creates the table.
> 
> > 
> >>
> >>>> @@ -4252,16 +4277,19 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
> >>>>  	if (check_mtu) {
> >>>>  		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
> >>>>  		if (params->tot_len > mtu)
> >>>> -			return 0;
> >>>> +			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
> >>>>  	}
> >>>>  
> >>>>  	if (f6i->fib6_nh.nh_lwtstate)
> >>>> -		return 0;
> >>>> +		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
> >>>>  
> >>>>  	if (f6i->fib6_flags & RTF_GATEWAY)
> >>>>  		*dst = f6i->fib6_nh.nh_gw;
> >>>>  
> >>>>  	dev = f6i->fib6_nh.nh_dev;
> >>>> +	if (unlikely(!dev))
> >>>> +		return BPF_FIB_LKUP_RET_NO_NHDEV;
> >>> Is this a bug fix?
> >>>
> >>
> >> Difference between IPv4 and IPv6. Making them consistent.
> >>
> >> It is a major BUG in the kernel to reach this point in either protocol
> >> to have a unicast route not tied to a device. IPv4 has checks; v6 does
> >> not. I figured this being new code, why not make bpf_ipv{4,6}_fib_lookup
> >> as close to the same as possible.
> > Make sense.  A comment in the commit log will be useful if there is a
> > re-spin.
> > 
> 
> ok.
David Ahern June 19, 2018, 3:34 p.m. UTC | #9
On 6/19/18 9:25 AM, Martin KaFai Lau wrote:
> On Mon, Jun 18, 2018 at 03:35:25PM -0600, David Ahern wrote:
>> On 6/18/18 2:55 PM, Martin KaFai Lau wrote:
>>>> 	/* rc > 0 case */
>>>> 	switch(rc) {
>>>> 	case BPF_FIB_LKUP_RET_BLACKHOLE:
>>>> 	case BPF_FIB_LKUP_RET_UNREACHABLE:
>>>> 	case BPF_FIB_LKUP_RET_PROHIBIT:
>>>> 		return XDP_DROP;
>>>> 	}
>>>>
>>>> For the others it becomes a question of do we share why the stack needs
>>>> to be involved? Maybe the program wants to collect stats to show traffic
>>>> patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
>>>> in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
>>>> interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).
>>> Thanks for the explanation.
>>>
>>> Agree on the bpf able to collect stats will be useful.
>>>
>>> I am wondering, if a new BPF_FIB_LKUP_RET_XYZ is added later,
>>> how may the old xdp_prog work/not-work?  As of now, the return value
>>> is straight forward, FWD, PASS (to stack) or DROP (error).
>>> With this change, the xdp_prog needs to match/switch() the
>>> BPF_FIB_LKUP_RET_* to at least PASS and DROP.
>>
>> IMO, programs should only call XDP_DROP for known reasons - like the 3
>> above. Anything else punt to the stack.
>>
>> If a new RET_XYZ comes along:
>> 1. the new XYZ is a new ACL response where the packet is to be dropped.
>> If the program does not understand XYZ and punts to the stack
>> (recommendation), then a second lookup is done during normal packet
>> processing and the stack drops it.
>>
>> 2. the new XYZ is a new path in the kernel that is unsupported with
>> respect to XDP forwarding, nothing new for the program to do.
>>
>> Either way I would expect stats on BPF_FIB_LKUP_RET_* to give a hint to
>> the program writer.
>>
>> Worst case of punting packets to the stack for any rc != 0 means the
>> stack is doing 2 lookups - 1 in XDP based on its lookup parameters and 1
>> in normal stack processing - to handle the packet.
> Instead of having the xdp_prog to follow the meaning of what RET_SYZ is,
> should the bpf_*_fib_lookup() return value be kept as is such that
> the xdp_prog is clear what to do.  The reason can be returned in
> the 'struct bpf_fib_lookup'.  The number of reasons can be extended.
> If the xdp_prog does not understand a reason, it still will not
> affect its decision because the return value is clear.
> I think the situation here is similar to regular syscall which usually
> uses -1 to clearly states error and errno to spells out the reason.
> 

I did consider returning the status in struct bpf_fib_lookup. However,
it is 64 bytes and can not be extended without a big performance
penalty, so the only option there is to make an existing entry a union
the most logical of which is the ifindex. It seemed odd to me to have
the result by hidden in the struct as a union on ifindex and returning
the egress index from the function:

@@ -2625,7 +2636,11 @@ struct bpf_fib_lookup {

        /* total length of packet from network header - used for MTU
check */
        __u16   tot_len;
-       __u32   ifindex;  /* L3 device index for lookup */
+
+       union {
+               __u32   ifindex;  /* input: L3 device index for lookup */
+               __u32   result;   /* output: one of BPF_FIB_LKUP_RET_* */
+       };


It seemed more natural to have ifindex stay ifindex and only change
value on return:

@@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {

 	/* total length of packet from network header - used for MTU check */
 	__u16	tot_len;
-	__u32	ifindex;  /* L3 device index for lookup */
+
+	/* input: L3 device index for lookup
+	 * output: nexthop device index from FIB lookup
+	 */
+	__u32	ifindex;

 	union {
 		/* inputs to lookup */


From a program's perspective:

rc < 0  -- program is passing incorrect data
rc == 0 -- packet can be forwarded
rc > 0  -- packet can not be forwarded.

BPF programs are not required to track the LKUP_RET values any more than
a function returning multiple negative values - the caller just checks
rc < 0 means failure. If the program cares it can look at specific
values of rc to see the specific value.

The same applies with the LKUP_RET values - they are there to provide
insight into why the packet is not forwarded directly if the program
cares to know why.
Martin KaFai Lau June 19, 2018, 4:36 p.m. UTC | #10
On Tue, Jun 19, 2018 at 09:34:28AM -0600, David Ahern wrote:
> On 6/19/18 9:25 AM, Martin KaFai Lau wrote:
> > On Mon, Jun 18, 2018 at 03:35:25PM -0600, David Ahern wrote:
> >> On 6/18/18 2:55 PM, Martin KaFai Lau wrote:
> >>>> 	/* rc > 0 case */
> >>>> 	switch(rc) {
> >>>> 	case BPF_FIB_LKUP_RET_BLACKHOLE:
> >>>> 	case BPF_FIB_LKUP_RET_UNREACHABLE:
> >>>> 	case BPF_FIB_LKUP_RET_PROHIBIT:
> >>>> 		return XDP_DROP;
> >>>> 	}
> >>>>
> >>>> For the others it becomes a question of do we share why the stack needs
> >>>> to be involved? Maybe the program wants to collect stats to show traffic
> >>>> patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
> >>>> in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
> >>>> interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).
> >>> Thanks for the explanation.
> >>>
> >>> Agree on the bpf able to collect stats will be useful.
> >>>
> >>> I am wondering, if a new BPF_FIB_LKUP_RET_XYZ is added later,
> >>> how may the old xdp_prog work/not-work?  As of now, the return value
> >>> is straight forward, FWD, PASS (to stack) or DROP (error).
> >>> With this change, the xdp_prog needs to match/switch() the
> >>> BPF_FIB_LKUP_RET_* to at least PASS and DROP.
> >>
> >> IMO, programs should only call XDP_DROP for known reasons - like the 3
> >> above. Anything else punt to the stack.
> >>
> >> If a new RET_XYZ comes along:
> >> 1. the new XYZ is a new ACL response where the packet is to be dropped.
> >> If the program does not understand XYZ and punts to the stack
> >> (recommendation), then a second lookup is done during normal packet
> >> processing and the stack drops it.
> >>
> >> 2. the new XYZ is a new path in the kernel that is unsupported with
> >> respect to XDP forwarding, nothing new for the program to do.
> >>
> >> Either way I would expect stats on BPF_FIB_LKUP_RET_* to give a hint to
> >> the program writer.
> >>
> >> Worst case of punting packets to the stack for any rc != 0 means the
> >> stack is doing 2 lookups - 1 in XDP based on its lookup parameters and 1
> >> in normal stack processing - to handle the packet.
> > Instead of having the xdp_prog to follow the meaning of what RET_SYZ is,
> > should the bpf_*_fib_lookup() return value be kept as is such that
> > the xdp_prog is clear what to do.  The reason can be returned in
> > the 'struct bpf_fib_lookup'.  The number of reasons can be extended.
> > If the xdp_prog does not understand a reason, it still will not
> > affect its decision because the return value is clear.
> > I think the situation here is similar to regular syscall which usually
> > uses -1 to clearly states error and errno to spells out the reason.
> > 
> 
> I did consider returning the status in struct bpf_fib_lookup. However,
> it is 64 bytes and can not be extended without a big performance
> penalty, so the only option there is to make an existing entry a union
> the most logical of which is the ifindex. It seemed odd to me to have
> the result by hidden in the struct as a union on ifindex and returning
> the egress index from the function:
> 
> @@ -2625,7 +2636,11 @@ struct bpf_fib_lookup {
> 
>         /* total length of packet from network header - used for MTU
> check */
>         __u16   tot_len;
> -       __u32   ifindex;  /* L3 device index for lookup */
> +
> +       union {
> +               __u32   ifindex;  /* input: L3 device index for lookup */
> +               __u32   result;   /* output: one of BPF_FIB_LKUP_RET_* */
> +       };
> 
> 
> It seemed more natural to have ifindex stay ifindex and only change
> value on return:
> 
> @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
> 
>  	/* total length of packet from network header - used for MTU check */
>  	__u16	tot_len;
> -	__u32	ifindex;  /* L3 device index for lookup */
> +
> +	/* input: L3 device index for lookup
> +	 * output: nexthop device index from FIB lookup
> +	 */
> +	__u32	ifindex;
> 
>  	union {
>  		/* inputs to lookup */
> 
> 
> From a program's perspective:
> 
> rc < 0  -- program is passing incorrect data
> rc == 0 -- packet can be forwarded
> rc > 0  -- packet can not be forwarded.
> 
> BPF programs are not required to track the LKUP_RET values any more than
> a function returning multiple negative values - the caller just checks
> rc < 0 means failure. If the program cares it can look at specific
> values of rc to see the specific value.
> 
> The same applies with the LKUP_RET values - they are there to provide
> insight into why the packet is not forwarded directly if the program
> cares to know why.
hmm...ic. My concern is, the prog can interpret rc > 0 (in this patch) to be
drop vs pass (although we can advise them in bpf.h to always pass if it does
not understand a rc but it is not a strong contract),  it may catch people
a surprise if a xdp_prog suddenly drops everything when running in a
newer kernel where the upper stack can actually handle it.

while the current behavior (i.e. before this patch, rc == 0) is always pass
to the stack.

I think at least comments should be put in the enum such that
the xdp/tc_prog should expect the enum could be extended later, so
the suggested behavior should be a pass for unknown LKUP_RET and let
the stack to decide.
David Ahern June 19, 2018, 8:16 p.m. UTC | #11
On 6/19/18 10:36 AM, Martin KaFai Lau wrote:
> On Tue, Jun 19, 2018 at 09:34:28AM -0600, David Ahern wrote:
>> On 6/19/18 9:25 AM, Martin KaFai Lau wrote:
>>> On Mon, Jun 18, 2018 at 03:35:25PM -0600, David Ahern wrote:
>>>> On 6/18/18 2:55 PM, Martin KaFai Lau wrote:
>>>>>> 	/* rc > 0 case */
>>>>>> 	switch(rc) {
>>>>>> 	case BPF_FIB_LKUP_RET_BLACKHOLE:
>>>>>> 	case BPF_FIB_LKUP_RET_UNREACHABLE:
>>>>>> 	case BPF_FIB_LKUP_RET_PROHIBIT:
>>>>>> 		return XDP_DROP;
>>>>>> 	}
>>>>>>
>>>>>> For the others it becomes a question of do we share why the stack needs
>>>>>> to be involved? Maybe the program wants to collect stats to show traffic
>>>>>> patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
>>>>>> in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
>>>>>> interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).
>>>>> Thanks for the explanation.
>>>>>
>>>>> Agree on the bpf able to collect stats will be useful.
>>>>>
>>>>> I am wondering, if a new BPF_FIB_LKUP_RET_XYZ is added later,
>>>>> how may the old xdp_prog work/not-work?  As of now, the return value
>>>>> is straight forward, FWD, PASS (to stack) or DROP (error).
>>>>> With this change, the xdp_prog needs to match/switch() the
>>>>> BPF_FIB_LKUP_RET_* to at least PASS and DROP.
>>>>
>>>> IMO, programs should only call XDP_DROP for known reasons - like the 3
>>>> above. Anything else punt to the stack.
>>>>
>>>> If a new RET_XYZ comes along:
>>>> 1. the new XYZ is a new ACL response where the packet is to be dropped.
>>>> If the program does not understand XYZ and punts to the stack
>>>> (recommendation), then a second lookup is done during normal packet
>>>> processing and the stack drops it.
>>>>
>>>> 2. the new XYZ is a new path in the kernel that is unsupported with
>>>> respect to XDP forwarding, nothing new for the program to do.
>>>>
>>>> Either way I would expect stats on BPF_FIB_LKUP_RET_* to give a hint to
>>>> the program writer.
>>>>
>>>> Worst case of punting packets to the stack for any rc != 0 means the
>>>> stack is doing 2 lookups - 1 in XDP based on its lookup parameters and 1
>>>> in normal stack processing - to handle the packet.
>>> Instead of having the xdp_prog to follow the meaning of what RET_SYZ is,
>>> should the bpf_*_fib_lookup() return value be kept as is such that
>>> the xdp_prog is clear what to do.  The reason can be returned in
>>> the 'struct bpf_fib_lookup'.  The number of reasons can be extended.
>>> If the xdp_prog does not understand a reason, it still will not
>>> affect its decision because the return value is clear.
>>> I think the situation here is similar to regular syscall which usually
>>> uses -1 to clearly states error and errno to spells out the reason.
>>>
>>
>> I did consider returning the status in struct bpf_fib_lookup. However,
>> it is 64 bytes and can not be extended without a big performance
>> penalty, so the only option there is to make an existing entry a union
>> the most logical of which is the ifindex. It seemed odd to me to have
>> the result by hidden in the struct as a union on ifindex and returning
>> the egress index from the function:
>>
>> @@ -2625,7 +2636,11 @@ struct bpf_fib_lookup {
>>
>>         /* total length of packet from network header - used for MTU
>> check */
>>         __u16   tot_len;
>> -       __u32   ifindex;  /* L3 device index for lookup */
>> +
>> +       union {
>> +               __u32   ifindex;  /* input: L3 device index for lookup */
>> +               __u32   result;   /* output: one of BPF_FIB_LKUP_RET_* */
>> +       };
>>
>>
>> It seemed more natural to have ifindex stay ifindex and only change
>> value on return:
>>
>> @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
>>
>>  	/* total length of packet from network header - used for MTU check */
>>  	__u16	tot_len;
>> -	__u32	ifindex;  /* L3 device index for lookup */
>> +
>> +	/* input: L3 device index for lookup
>> +	 * output: nexthop device index from FIB lookup
>> +	 */
>> +	__u32	ifindex;
>>
>>  	union {
>>  		/* inputs to lookup */
>>
>>
>> From a program's perspective:
>>
>> rc < 0  -- program is passing incorrect data
>> rc == 0 -- packet can be forwarded
>> rc > 0  -- packet can not be forwarded.
>>
>> BPF programs are not required to track the LKUP_RET values any more than
>> a function returning multiple negative values - the caller just checks
>> rc < 0 means failure. If the program cares it can look at specific
>> values of rc to see the specific value.
>>
>> The same applies with the LKUP_RET values - they are there to provide
>> insight into why the packet is not forwarded directly if the program
>> cares to know why.
> hmm...ic. My concern is, the prog can interpret rc > 0 (in this patch) to be
> drop vs pass (although we can advise them in bpf.h to always pass if it does
> not understand a rc but it is not a strong contract),  it may catch people
> a surprise if a xdp_prog suddenly drops everything when running in a
> newer kernel where the upper stack can actually handle it.
> 
> while the current behavior (i.e. before this patch, rc == 0) is always pass
> to the stack.
> 
> I think at least comments should be put in the enum such that
> the xdp/tc_prog should expect the enum could be extended later, so
> the suggested behavior should be a pass for unknown LKUP_RET and let
> the stack to decide.
> 

All APIs with enums have the inherent quality that more can be added.
Nothing about rc > 0 says it is ok to drop the packet and nothing in the
documentation says it is ok to drop the packet. The program author needs
to look at the extra information provided by the rc. Specific values are
a hint that yes the packet can be dropped; others merely say 'packet
needs help from the stack' with a reason why it needs help.

My intention is to allow the XDP program to understand FIB based ACLs.
To that end a fib lookup result of blackhole, unreachable and prohibit
needs to be returned to the xdp program with the effective summary of
"can drop in the xdp program".

If the other return codes are going to cause confusion then less shorten
the list:

enum {
	BPF_FIB_LKUP_RET_SUCCESS,      /* packet is to be forwareded */
	BPF_FIB_LKUP_RET_CAN_DROP,     /* XDP program can drop the packet */
	BPF_FIB_LKUP_RET_NEED_STACK,   /* packet needs full stack assist */
};

But, that still does not solve the problem of rc > 0 means xdp program
can drop the packet, but then that is not the intention of rc > 0. The
intention is only "here's more information about why this packet can not
be forwarded at this layer"
Martin KaFai Lau June 19, 2018, 9:24 p.m. UTC | #12
On Tue, Jun 19, 2018 at 02:16:53PM -0600, David Ahern wrote:
> On 6/19/18 10:36 AM, Martin KaFai Lau wrote:
> > On Tue, Jun 19, 2018 at 09:34:28AM -0600, David Ahern wrote:
> >> On 6/19/18 9:25 AM, Martin KaFai Lau wrote:
> >>> On Mon, Jun 18, 2018 at 03:35:25PM -0600, David Ahern wrote:
> >>>> On 6/18/18 2:55 PM, Martin KaFai Lau wrote:
> >>>>>> 	/* rc > 0 case */
> >>>>>> 	switch(rc) {
> >>>>>> 	case BPF_FIB_LKUP_RET_BLACKHOLE:
> >>>>>> 	case BPF_FIB_LKUP_RET_UNREACHABLE:
> >>>>>> 	case BPF_FIB_LKUP_RET_PROHIBIT:
> >>>>>> 		return XDP_DROP;
> >>>>>> 	}
> >>>>>>
> >>>>>> For the others it becomes a question of do we share why the stack needs
> >>>>>> to be involved? Maybe the program wants to collect stats to show traffic
> >>>>>> patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
> >>>>>> in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
> >>>>>> interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).
> >>>>> Thanks for the explanation.
> >>>>>
> >>>>> Agree on the bpf able to collect stats will be useful.
> >>>>>
> >>>>> I am wondering, if a new BPF_FIB_LKUP_RET_XYZ is added later,
> >>>>> how may the old xdp_prog work/not-work?  As of now, the return value
> >>>>> is straight forward, FWD, PASS (to stack) or DROP (error).
> >>>>> With this change, the xdp_prog needs to match/switch() the
> >>>>> BPF_FIB_LKUP_RET_* to at least PASS and DROP.
> >>>>
> >>>> IMO, programs should only call XDP_DROP for known reasons - like the 3
> >>>> above. Anything else punt to the stack.
> >>>>
> >>>> If a new RET_XYZ comes along:
> >>>> 1. the new XYZ is a new ACL response where the packet is to be dropped.
> >>>> If the program does not understand XYZ and punts to the stack
> >>>> (recommendation), then a second lookup is done during normal packet
> >>>> processing and the stack drops it.
> >>>>
> >>>> 2. the new XYZ is a new path in the kernel that is unsupported with
> >>>> respect to XDP forwarding, nothing new for the program to do.
> >>>>
> >>>> Either way I would expect stats on BPF_FIB_LKUP_RET_* to give a hint to
> >>>> the program writer.
> >>>>
> >>>> Worst case of punting packets to the stack for any rc != 0 means the
> >>>> stack is doing 2 lookups - 1 in XDP based on its lookup parameters and 1
> >>>> in normal stack processing - to handle the packet.
> >>> Instead of having the xdp_prog to follow the meaning of what RET_SYZ is,
> >>> should the bpf_*_fib_lookup() return value be kept as is such that
> >>> the xdp_prog is clear what to do.  The reason can be returned in
> >>> the 'struct bpf_fib_lookup'.  The number of reasons can be extended.
> >>> If the xdp_prog does not understand a reason, it still will not
> >>> affect its decision because the return value is clear.
> >>> I think the situation here is similar to regular syscall which usually
> >>> uses -1 to clearly states error and errno to spells out the reason.
> >>>
> >>
> >> I did consider returning the status in struct bpf_fib_lookup. However,
> >> it is 64 bytes and can not be extended without a big performance
> >> penalty, so the only option there is to make an existing entry a union
> >> the most logical of which is the ifindex. It seemed odd to me to have
> >> the result by hidden in the struct as a union on ifindex and returning
> >> the egress index from the function:
> >>
> >> @@ -2625,7 +2636,11 @@ struct bpf_fib_lookup {
> >>
> >>         /* total length of packet from network header - used for MTU
> >> check */
> >>         __u16   tot_len;
> >> -       __u32   ifindex;  /* L3 device index for lookup */
> >> +
> >> +       union {
> >> +               __u32   ifindex;  /* input: L3 device index for lookup */
> >> +               __u32   result;   /* output: one of BPF_FIB_LKUP_RET_* */
> >> +       };
> >>
> >>
> >> It seemed more natural to have ifindex stay ifindex and only change
> >> value on return:
> >>
> >> @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
> >>
> >>  	/* total length of packet from network header - used for MTU check */
> >>  	__u16	tot_len;
> >> -	__u32	ifindex;  /* L3 device index for lookup */
> >> +
> >> +	/* input: L3 device index for lookup
> >> +	 * output: nexthop device index from FIB lookup
> >> +	 */
> >> +	__u32	ifindex;
> >>
> >>  	union {
> >>  		/* inputs to lookup */
> >>
> >>
> >> From a program's perspective:
> >>
> >> rc < 0  -- program is passing incorrect data
> >> rc == 0 -- packet can be forwarded
> >> rc > 0  -- packet can not be forwarded.
> >>
> >> BPF programs are not required to track the LKUP_RET values any more than
> >> a function returning multiple negative values - the caller just checks
> >> rc < 0 means failure. If the program cares it can look at specific
> >> values of rc to see the specific value.
> >>
> >> The same applies with the LKUP_RET values - they are there to provide
> >> insight into why the packet is not forwarded directly if the program
> >> cares to know why.
> > hmm...ic. My concern is, the prog can interpret rc > 0 (in this patch) to be
> > drop vs pass (although we can advise them in bpf.h to always pass if it does
> > not understand a rc but it is not a strong contract),  it may catch people
> > a surprise if a xdp_prog suddenly drops everything when running in a
> > newer kernel where the upper stack can actually handle it.
> > 
> > while the current behavior (i.e. before this patch, rc == 0) is always pass
> > to the stack.
> > 
> > I think at least comments should be put in the enum such that
> > the xdp/tc_prog should expect the enum could be extended later, so
> > the suggested behavior should be a pass for unknown LKUP_RET and let
> > the stack to decide.
> > 
> 
> All APIs with enums have the inherent quality that more can be added.
> Nothing about rc > 0 says it is ok to drop the packet and nothing in the
> documentation says it is ok to drop the packet. The program author needs
> to look at the extra information provided by the rc. Specific values are
> a hint that yes the packet can be dropped; others merely say 'packet
> needs help from the stack' with a reason why it needs help.
Right, I understand there are values that hint drop (as your earlier example)
and there are values that hint pass to stack.  and either set of these values
could be extended later.

> 
> My intention is to allow the XDP program to understand FIB based ACLs.
> To that end a fib lookup result of blackhole, unreachable and prohibit
> needs to be returned to the xdp program with the effective summary of
> "can drop in the xdp program".
> 
> If the other return codes are going to cause confusion then less shorten
> the list:
> 
> enum {
> 	BPF_FIB_LKUP_RET_SUCCESS,      /* packet is to be forwareded */
> 	BPF_FIB_LKUP_RET_CAN_DROP,     /* XDP program can drop the packet */
> 	BPF_FIB_LKUP_RET_NEED_STACK,   /* packet needs full stack assist */
> };
> 
> But, that still does not solve the problem of rc > 0 means xdp program
> can drop the packet, but then that is not the intention of rc > 0. The
> intention is only "here's more information about why this packet can not
> be forwarded at this layer"
ok. Please spell out this intention in bpf.h so that the writer will not
default to DROP for the reason that it does not understand.
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 59b19b6a40d7..ceb80071c341 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1857,7 +1857,8 @@  union bpf_attr {
  *		is resolved), the nexthop address is returned in ipv4_dst
  *		or ipv6_dst based on family, smac is set to mac address of
  *		egress device, dmac is set to nexthop mac address, rt_metric
- *		is set to metric from route (IPv4/IPv6 only).
+ *		is set to metric from route (IPv4/IPv6 only), and ifindex
+ *		is set to the device index of the nexthop from the FIB lookup.
  *
  *             *plen* argument is the size of the passed in struct.
  *             *flags* argument can be a combination of one or more of the
@@ -1873,9 +1874,9 @@  union bpf_attr {
  *             *ctx* is either **struct xdp_md** for XDP programs or
  *             **struct sk_buff** tc cls_act programs.
  *     Return
- *             Egress device index on success, 0 if packet needs to continue
- *             up the stack for further processing or a negative error in case
- *             of failure.
+ *		< 0 if any input argument is invalid
+ *		  0 on success (packet is forwarded and nexthop neighbor exists)
+ *		> 0 one of BPF_FIB_LKUP_RET_ codes on FIB lookup response
  *
  * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
  *	Description
@@ -2612,6 +2613,19 @@  struct bpf_raw_tracepoint_args {
 #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
 #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
 
+enum {
+	BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
+	BPF_FIB_LKUP_RET_BLACKHOLE,    /* dest is blackholed */
+	BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable */
+	BPF_FIB_LKUP_RET_PROHIBIT,     /* dest not allowed */
+	BPF_FIB_LKUP_RET_NOT_FWDED,    /* pkt is not forwardded */
+	BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
+	BPF_FIB_LKUP_RET_UNSUPP_LWT,   /* fwd requires unsupported encap */
+	BPF_FIB_LKUP_RET_NO_NHDEV,     /* nh device does not exist */
+	BPF_FIB_LKUP_RET_NO_NEIGH,     /* no neigh entry for nh */
+	BPF_FIB_LKUP_RET_FRAG_NEEDED,  /* pkt too big to fwd */
+};
+
 struct bpf_fib_lookup {
 	/* input:  network family for lookup (AF_INET, AF_INET6)
 	 * output: network family of egress nexthop
@@ -2625,7 +2639,11 @@  struct bpf_fib_lookup {
 
 	/* total length of packet from network header - used for MTU check */
 	__u16	tot_len;
-	__u32	ifindex;  /* L3 device index for lookup */
+
+	/* input: L3 device index for lookup
+	 * output: nexthop device index from FIB lookup
+	 */
+	__u32	ifindex;
 
 	union {
 		/* inputs to lookup */
diff --git a/net/core/filter.c b/net/core/filter.c
index e7f12e9f598c..e758ca487878 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4073,8 +4073,9 @@  static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
 	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
 	params->h_vlan_TCI = 0;
 	params->h_vlan_proto = 0;
+	params->ifindex = dev->ifindex;
 
-	return dev->ifindex;
+	return 0;
 }
 #endif
 
@@ -4098,7 +4099,7 @@  static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	/* verify forwarding is enabled on this interface */
 	in_dev = __in_dev_get_rcu(dev);
 	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
-		return 0;
+		return BPF_FIB_LKUP_RET_FWD_DISABLED;
 
 	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
 		fl4.flowi4_iif = 1;
@@ -4123,7 +4124,7 @@  static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 		tb = fib_get_table(net, tbid);
 		if (unlikely(!tb))
-			return 0;
+			return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
 	} else {
@@ -4135,8 +4136,20 @@  static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
 	}
 
-	if (err || res.type != RTN_UNICAST)
-		return 0;
+	if (err) {
+		/* map fib lookup errors to RTN_ type */
+		if (err == -EINVAL)
+			return BPF_FIB_LKUP_RET_BLACKHOLE;
+		if (err == -EHOSTUNREACH)
+			return BPF_FIB_LKUP_RET_UNREACHABLE;
+		if (err == -EACCES)
+			return BPF_FIB_LKUP_RET_PROHIBIT;
+
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
+	}
+
+	if (res.type != RTN_UNICAST)
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 	if (res.fi->fib_nhs > 1)
 		fib_select_path(net, &res, &fl4, NULL);
@@ -4144,18 +4157,18 @@  static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	if (check_mtu) {
 		mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
 		if (params->tot_len > mtu)
-			return 0;
+			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
 	}
 
 	nh = &res.fi->fib_nh[res.nh_sel];
 
 	/* do not handle lwt encaps right now */
 	if (nh->nh_lwtstate)
-		return 0;
+		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
 
 	dev = nh->nh_dev;
 	if (unlikely(!dev))
-		return 0;
+		return BPF_FIB_LKUP_RET_NO_NHDEV;
 
 	if (nh->nh_gw)
 		params->ipv4_dst = nh->nh_gw;
@@ -4166,10 +4179,10 @@  static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	 * rcu_read_lock_bh is not needed here
 	 */
 	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
-	if (neigh)
-		return bpf_fib_set_fwd_params(params, neigh, dev);
+	if (!neigh)
+		return BPF_FIB_LKUP_RET_NO_NEIGH;
 
-	return 0;
+	return bpf_fib_set_fwd_params(params, neigh, dev);
 }
 #endif
 
@@ -4190,7 +4203,7 @@  static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 	/* link local addresses are never forwarded */
 	if (rt6_need_strict(dst) || rt6_need_strict(src))
-		return 0;
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 	dev = dev_get_by_index_rcu(net, params->ifindex);
 	if (unlikely(!dev))
@@ -4198,7 +4211,7 @@  static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 	idev = __in6_dev_get_safely(dev);
 	if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
-		return 0;
+		return BPF_FIB_LKUP_RET_FWD_DISABLED;
 
 	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
 		fl6.flowi6_iif = 1;
@@ -4225,7 +4238,7 @@  static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 		tb = ipv6_stub->fib6_get_table(net, tbid);
 		if (unlikely(!tb))
-			return 0;
+			return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
 	} else {
@@ -4238,11 +4251,23 @@  static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	}
 
 	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
-		return 0;
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+	if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
+		switch (f6i->fib6_type) {
+		case RTN_BLACKHOLE:
+			return BPF_FIB_LKUP_RET_BLACKHOLE;
+		case RTN_UNREACHABLE:
+			return BPF_FIB_LKUP_RET_UNREACHABLE;
+		case RTN_PROHIBIT:
+			return BPF_FIB_LKUP_RET_PROHIBIT;
+		default:
+			return BPF_FIB_LKUP_RET_NOT_FWDED;
+		}
+	}
 
-	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
-	    f6i->fib6_type != RTN_UNICAST))
-		return 0;
+	if (f6i->fib6_type != RTN_UNICAST)
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
 		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
@@ -4252,16 +4277,19 @@  static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	if (check_mtu) {
 		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
 		if (params->tot_len > mtu)
-			return 0;
+			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
 	}
 
 	if (f6i->fib6_nh.nh_lwtstate)
-		return 0;
+		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
 
 	if (f6i->fib6_flags & RTF_GATEWAY)
 		*dst = f6i->fib6_nh.nh_gw;
 
 	dev = f6i->fib6_nh.nh_dev;
+	if (unlikely(!dev))
+		return BPF_FIB_LKUP_RET_NO_NHDEV;
+
 	params->rt_metric = f6i->fib6_metric;
 
 	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
@@ -4270,10 +4298,10 @@  static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	 */
 	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
 				      ndisc_hashfn, dst, dev);
-	if (neigh)
-		return bpf_fib_set_fwd_params(params, neigh, dev);
+	if (!neigh)
+		return BPF_FIB_LKUP_RET_NO_NEIGH;
 
-	return 0;
+	return bpf_fib_set_fwd_params(params, neigh, dev);
 }
 #endif
 
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
index 6673cdb9f55c..a7e94e7ff87d 100644
--- a/samples/bpf/xdp_fwd_kern.c
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -48,9 +48,9 @@  static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 	struct ethhdr *eth = data;
 	struct ipv6hdr *ip6h;
 	struct iphdr *iph;
-	int out_index;
 	u16 h_proto;
 	u64 nh_off;
+	int rc;
 
 	nh_off = sizeof(*eth);
 	if (data + nh_off > data_end)
@@ -101,7 +101,7 @@  static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 
 	fib_params.ifindex = ctx->ingress_ifindex;
 
-	out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
+	rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
 
 	/* verify egress index has xdp support
 	 * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
@@ -109,7 +109,7 @@  static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 	 * NOTE: without verification that egress index supports XDP
 	 *       forwarding packets are dropped.
 	 */
-	if (out_index > 0) {
+	if (rc == 0) {
 		if (h_proto == htons(ETH_P_IP))
 			ip_decrease_ttl(iph);
 		else if (h_proto == htons(ETH_P_IPV6))
@@ -117,7 +117,7 @@  static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 
 		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
 		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
-		return bpf_redirect_map(&tx_port, out_index, 0);
+		return bpf_redirect_map(&tx_port, fib_params.ifindex, 0);
 	}
 
 	return XDP_PASS;