Message ID | 20180429180752.15428-9-dsahern@gmail.com |
---|---|
State | RFC, archived |
Delegated to: | BPF Maintainers |
Headers | show |
Series | bpf: Add helper to do FIB lookups | expand |
On Sun, Apr 29, 2018 at 11:07:51AM -0700, David Ahern wrote: > Provide a helper for doing a FIB and neighbor lookup in the kernel > tables from an XDP program. The helper provides a fastpath for forwarding > packets. If the packet is a local delivery or for any reason is not a > simple lookup and forward, the packet continues up the stack. > > If it is to be forwarded, the forwarding can be done directly if the > neighbor is already known. If the neighbor does not exist, the first > few packets go up the stack for neighbor resolution. Once resolved, the > xdp program provides the fast path. > > On successful lookup the nexthop dmac, current device smac and egress > device index are returned. > > The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6 > are implemented in this patch. The API includes layer 4 parameters if > the XDP program chooses to do deep packet inspection to allow compare > against ACLs implemented as FIB rules. > > Header rewrite is left to the XDP program. > > The lookup takes 2 flags: > - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes > straight to the table associated with the device (expert setting for > those looking to maximize throughput) > > - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective. > Default is an ingress lookup. 
> > Initial performance numbers collected by Jesper, forwarded packets/sec: > > Full stack XDP FIB lookup XDP Direct lookup > IPv4 1,947,969 7,074,156 7,415,333 > IPv6 1,728,000 6,165,504 7,262,720 > > Signed-off-by: David Ahern <dsahern@gmail.com> > --- > include/uapi/linux/bpf.h | 83 ++++++++++++++- > net/core/filter.c | 263 +++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 345 insertions(+), 1 deletion(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 23b334bba1a6..52652507113e 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -10,6 +10,8 @@ > > #include <linux/types.h> > #include <linux/bpf_common.h> > +#include <linux/if_ether.h> > +#include <linux/in6.h> > > /* Extended instruction set based on top of classic BPF */ > > @@ -1801,6 +1803,33 @@ union bpf_attr { > * Return > * a non-negative value equal to or less than size on success, or > * a negative error in case of failure. > + * > + * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) > + * Description > + * Do FIB lookup in kernel tables using parameters in *params*. > + * If lookup is successful and result shows packets is to be > + * forwarded, the neighbor tables are searched for the nexthop. > + * If successful (ie., FIB lookup shows forwarding and nexthop > + * is resolved), the nexthop address is returned in ipv4_dst, > + * ipv6_dst or mpls_out based on family, smac is set to mac > + * address of egress device, dmac is set to nexthop mac address, > + * rt_metric is set to metric from route. > + * > + * *plen* argument is the size of the passed in struct. 
> + * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags: > + * > + * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs > + * full lookup using FIB rules > + * **BPF_FIB_LOOKUP_OUTPUT** mmeans do lookup from an egress > + * perspective (default is ingress) > + * > + * *ctx* is either **struct xdp_md** for XDP programs or > + * **struct sk_buff** tc cls_act programs. > + * > + * Return > + * Egress device index on success, 0 if packet needs to continue > + * up the stack for further processing or a negative error in case > + * of failure. > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -1870,7 +1899,8 @@ union bpf_attr { > FN(bind), \ > FN(xdp_adjust_tail), \ > FN(skb_get_xfrm_state), \ > - FN(get_stack), > + FN(get_stack), \ > + FN(fib_lookup), > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call > @@ -2278,4 +2308,55 @@ struct bpf_raw_tracepoint_args { > __u64 args[0]; > }; > > +/* DIRECT: Skip the FIB rules and go to FIB table associated with device > + * OUTPUT: Do lookup from egress perspective; default is ingress > + */ > +#define BPF_FIB_LOOKUP_DIRECT BIT(0) > +#define BPF_FIB_LOOKUP_OUTPUT BIT(1) > + > +struct bpf_fib_lookup { > + /* input */ > + __u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS */ > + > + /* set if lookup is to consider L4 data - e.g., FIB rules */ > + __u8 l4_protocol; > + __be16 sport; > + __be16 dport; > + > + /* total length of packet from network header - used for MTU check */ > + __u16 tot_len; > + __u32 ifindex; /* L3 device index for lookup */ > + > + union { > + /* inputs to lookup */ > + __u8 tos; /* AF_INET */ > + __be32 flowlabel; /* AF_INET6 */ > + > + /* output: metric of fib result */ > + __u32 rt_metric; > + }; > + > + union { > + __be32 mpls_in; > + __be32 ipv4_src; > + struct in6_addr ipv6_src; > + }; > + > + /* input to bpf_fib_lookup, *dst is destination address. 
> + * output: bpf_fib_lookup sets to gateway address > + */ > + union { > + /* return for MPLS lookups */ > + __be32 mpls_out[4]; /* support up to 4 labels */ > + __be32 ipv4_dst; > + struct in6_addr ipv6_dst; > + }; > + > + /* output */ > + __be16 h_vlan_proto; > + __be16 h_vlan_TCI; > + __u8 smac[ETH_ALEN]; > + __u8 dmac[ETH_ALEN]; > +}; > + > #endif /* _UAPI__LINUX_BPF_H__ */ > diff --git a/net/core/filter.c b/net/core/filter.c > index d3781daa26ab..c34ba2675a98 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -59,6 +59,10 @@ > #include <net/tcp.h> > #include <net/xfrm.h> > #include <linux/bpf_trace.h> > +#include <linux/inetdevice.h> > +#include <net/ip_fib.h> > +#include <net/flow.h> > +#include <net/arp.h> > > /** > * sk_filter_trim_cap - run a packet through a socket filter > @@ -3788,6 +3792,261 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { > }; > #endif > > +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) > +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, > + const struct neighbour *neigh, > + const struct net_device *dev) > +{ > + memcpy(params->dmac, neigh->ha, ETH_ALEN); > + memcpy(params->smac, dev->dev_addr, ETH_ALEN); > + params->h_vlan_TCI = 0; > + params->h_vlan_proto = 0; > + > + return dev->ifindex; > +} > +#endif > + > +#if IS_ENABLED(CONFIG_INET) > +static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, > + u32 flags) > +{ > + struct in_device *in_dev; > + struct neighbour *neigh; > + struct net_device *dev; > + struct fib_result res; > + struct fib_nh *nh; > + struct flowi4 fl4; > + int err; > + > + dev = dev_get_by_index_rcu(net, params->ifindex); > + if (unlikely(!dev)) > + return -ENODEV; > + > + /* verify forwarding is enabled on this interface */ > + in_dev = __in_dev_get_rcu(dev); > + if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) > + return 0; > + > + if (flags & BPF_FIB_LOOKUP_OUTPUT) { > + fl4.flowi4_iif = 1; > + fl4.flowi4_oif = 
params->ifindex; > + } else { > + fl4.flowi4_iif = params->ifindex; > + fl4.flowi4_oif = 0; > + } > + fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; > + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; > + fl4.flowi4_flags = 0; > + > + fl4.flowi4_proto = params->l4_protocol; > + fl4.daddr = params->ipv4_dst; > + fl4.saddr = params->ipv4_src; > + fl4.fl4_sport = params->sport; > + fl4.fl4_dport = params->dport; > + > + if (flags & BPF_FIB_LOOKUP_DIRECT) { > + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; > + struct fib_table *tb; > + > + tb = fib_get_table(net, tbid); > + if (unlikely(!tb)) > + return 0; > + > + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); > + } else { > + fl4.flowi4_mark = 0; > + fl4.flowi4_secid = 0; > + fl4.flowi4_tun_key.tun_id = 0; > + fl4.flowi4_uid = sock_net_uid(net, NULL); > + > + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); > + } > + > + if (err || res.type != RTN_UNICAST) > + return 0; should this be an error returned to the user instead of zero? Seems useful to indicate. > + > + if (res.fi->fib_nhs > 1) > + fib_select_path(net, &res, &fl4, NULL); > + > + nh = &res.fi->fib_nh[res.nh_sel]; > + > + /* do not handle lwt encaps right now */ > + if (nh->nh_lwtstate) > + return 0; adn return enotsupp here? > + > + dev = nh->nh_dev; > + if (unlikely(!dev)) > + return 0; enodev ? > + > + if (nh->nh_gw) > + params->ipv4_dst = nh->nh_gw; > + > + params->rt_metric = res.fi->fib_priority; > + > + /* xdp and cls_bpf programs are run in RCU-bh so > + * rcu_read_lock_bh is not needed here > + */ > + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); > + if (neigh) > + return bpf_fib_set_fwd_params(params, neigh, dev); > + > + return 0; Even this return 0 doesn't quite fit to what doc says: "0 if packet needs to continue up the stack for further processing" What stack suppose to do ? It will hit the same condition and packet will be dropped, right? 
Isn't it better to report all errors back to bpf prog and let the program make decision instead of 'return 0' almost everywhere? > +} > +#endif > + > +#if IS_ENABLED(CONFIG_IPV6) > +static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, > + u32 flags) > +{ > + struct neighbour *neigh; > + struct net_device *dev; > + struct fib6_info *f6i; > + struct flowi6 fl6; > + int strict = 0; > + int oif; > + > + /* link local addresses are never forwarded */ > + if (rt6_need_strict(¶ms->ipv6_dst) || > + rt6_need_strict(¶ms->ipv6_src)) > + return 0; > + > + dev = dev_get_by_index_rcu(net, params->ifindex); > + if (unlikely(!dev)) > + return -ENODEV; > + > + if (flags & BPF_FIB_LOOKUP_OUTPUT) { > + fl6.flowi6_iif = 1; > + oif = fl6.flowi6_oif = params->ifindex; > + } else { > + oif = fl6.flowi6_iif = params->ifindex; > + fl6.flowi6_oif = 0; > + strict = RT6_LOOKUP_F_HAS_SADDR; > + } > + fl6.flowlabel = params->flowlabel; > + fl6.flowi6_scope = 0; > + fl6.flowi6_flags = 0; > + fl6.mp_hash = 0; > + > + fl6.flowi6_proto = params->l4_protocol; > + fl6.daddr = params->ipv6_dst; > + fl6.saddr = params->ipv6_src; > + fl6.fl6_sport = params->sport; > + fl6.fl6_dport = params->dport; > + > + if (flags & BPF_FIB_LOOKUP_DIRECT) { > + u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; > + struct fib6_table *tb; > + > + tb = ipv6_stub->fib6_get_table(net, tbid); > + if (unlikely(!tb)) > + return 0; > + > + f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); > + } else { > + fl6.flowi6_mark = 0; > + fl6.flowi6_secid = 0; > + fl6.flowi6_tun_key.tun_id = 0; > + fl6.flowi6_uid = sock_net_uid(net, NULL); > + > + f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict); > + } > + > + if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) > + return 0; > + > + if (unlikely(f6i->fib6_flags & RTF_REJECT || > + f6i->fib6_type != RTN_UNICAST)) > + return 0; > + > + if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) > + f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, > + fl6.flowi6_oif, NULL, > + strict); > + > + if (f6i->fib6_nh.nh_lwtstate) > + return 0; > + > + if (f6i->fib6_flags & RTF_GATEWAY) > + params->ipv6_dst = f6i->fib6_nh.nh_gw; > + > + dev = f6i->fib6_nh.nh_dev; > + params->rt_metric = f6i->fib6_metric; > + > + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is > + * not needed here. Can not use __ipv6_neigh_lookup_noref here > + * because we need to get nd_tbl via the stub > + */ > + neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, > + ndisc_hashfn, ¶ms->ipv6_dst, dev); > + if (neigh) > + return bpf_fib_set_fwd_params(params, neigh, dev); > + > + return 0; > +} > +#endif > + > +BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, > + struct bpf_fib_lookup *, params, int, plen, u32, flags) > +{ > + if (plen < sizeof(*params)) > + return -EINVAL; there should be a check here that only two bits are used in 'flags' Otherwise flags will not be extendable. 
> + > + switch (params->family) { > +#if IS_ENABLED(CONFIG_INET) > + case AF_INET: > + return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, > + flags); > +#endif > +#if IS_ENABLED(CONFIG_IPV6) > + case AF_INET6: > + return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, > + flags); > +#endif > + } > + return -ENOTSUPP; > +} > + > +static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { > + .func = bpf_xdp_fib_lookup, > + .gpl_only = true, do you really want to force all users of this helper to be gpl only? I don't mind at all. That's your choice as the author of this helper. Just the rest of networking helpers don't require gpl-ness. > + .pkt_access = true, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_CTX, > + .arg2_type = ARG_PTR_TO_MEM, > + .arg3_type = ARG_CONST_SIZE, > + .arg4_type = ARG_ANYTHING, > +}; > + > +BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, > + struct bpf_fib_lookup *, params, int, plen, u32, flags) > +{ > + if (plen < sizeof(*params)) > + return -EINVAL; > + > + switch (params->family) { > +#if IS_ENABLED(CONFIG_INET) > + case AF_INET: > + return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags); > +#endif > +#if IS_ENABLED(CONFIG_IPV6) > + case AF_INET6: > + return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags); > +#endif > + } > + return -ENOTSUPP; > +} > + > +static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { > + .func = bpf_skb_fib_lookup, > + .gpl_only = true, > + .pkt_access = true, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_CTX, > + .arg2_type = ARG_PTR_TO_MEM, > + .arg3_type = ARG_CONST_SIZE, > + .arg4_type = ARG_ANYTHING, > +}; > + > static const struct bpf_func_proto * > bpf_base_func_proto(enum bpf_func_id func_id) > { > @@ -3933,6 +4192,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > case BPF_FUNC_skb_get_xfrm_state: > return &bpf_skb_get_xfrm_state_proto; > #endif > + case BPF_FUNC_fib_lookup: > + return 
&bpf_skb_fib_lookup_proto; > default: > return bpf_base_func_proto(func_id); > } > @@ -3958,6 +4219,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > return &bpf_xdp_redirect_map_proto; > case BPF_FUNC_xdp_adjust_tail: > return &bpf_xdp_adjust_tail_proto; > + case BPF_FUNC_fib_lookup: > + return &bpf_xdp_fib_lookup_proto; > default: > return bpf_base_func_proto(func_id); > } > -- > 2.11.0 >
On 4/29/18 5:36 PM, Alexei Starovoitov wrote: >> + if (flags & BPF_FIB_LOOKUP_DIRECT) { >> + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; >> + struct fib_table *tb; >> + >> + tb = fib_get_table(net, tbid); >> + if (unlikely(!tb)) >> + return 0; >> + >> + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); >> + } else { >> + fl4.flowi4_mark = 0; >> + fl4.flowi4_secid = 0; >> + fl4.flowi4_tun_key.tun_id = 0; >> + fl4.flowi4_uid = sock_net_uid(net, NULL); >> + >> + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); >> + } >> + >> + if (err || res.type != RTN_UNICAST) >> + return 0; > > should this be an error returned to the user instead of zero? > Seems useful to indicate. res.type != UNICAST is not an error; it means other delivery type (e.g., local). err < 0 means unreachable, prohibit, blackhole, etc. Arguably the error could be returned to the xdp program, but it is more complicated than that. Blackhole is a common default route or policy, but RTN_BLACKHOLE == -EINVAL which is also the error code if the user passes invalid arguments to the program. > >> + >> + if (res.fi->fib_nhs > 1) >> + fib_select_path(net, &res, &fl4, NULL); >> + >> + nh = &res.fi->fib_nh[res.nh_sel]; >> + >> + /* do not handle lwt encaps right now */ >> + if (nh->nh_lwtstate) >> + return 0; > > adn return enotsupp here? see below > >> + >> + dev = nh->nh_dev; >> + if (unlikely(!dev)) >> + return 0; > > enodev ? see below > >> + >> + if (nh->nh_gw) >> + params->ipv4_dst = nh->nh_gw; >> + >> + params->rt_metric = res.fi->fib_priority; >> + >> + /* xdp and cls_bpf programs are run in RCU-bh so >> + * rcu_read_lock_bh is not needed here >> + */ >> + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); >> + if (neigh) >> + return bpf_fib_set_fwd_params(params, neigh, dev); >> + >> + return 0; > > Even this return 0 doesn't quite fit to what doc says: > "0 if packet needs to continue up the stack for further processing" > What stack suppose to do ? 
For the first packet on a route, the nexthop may not be resolved. Without punting to the stack it never has an impetus to resolve that neighbor. > It will hit the same condition and packet will be dropped, right? no. It can resolve the neighbor so follow-up packets can be forwarded in the fast path. > Isn't it better to report all errors back to bpf prog and let > the program make decision instead of 'return 0' almost everywhere? The idea here is to fast pass packets that fit a supported profile and are to be forwarded. Everything else should continue up the stack as it has wider capabilities. The helper and XDP programs should make no assumptions on what the broader kernel and userspace might be monitoring or want to do with packets that can not be forwarded in the fast path. This is very similar to hardware forwarding when it punts packets to the CPU for control plane assistance.
On Sun, 29 Apr 2018 11:07:51 -0700 David Ahern <dsahern@gmail.com> wrote: > Initial performance numbers collected by Jesper, forwarded packets/sec: > > Full stack XDP FIB lookup XDP Direct lookup > IPv4 1,947,969 7,074,156 7,415,333 > IPv6 1,728,000 6,165,504 7,262,720 Do notice these number is single CPU core forwarding performance! On a Broadwell E5-1650 v4 @ 3.60GHz. Another interesting data point is that xdp_redirect_map performance is 13,365,161 pps, which allow us to calculate/isolate the overhead/cost of the FIB lookup. (1/13365161-1/7074156)*10^9 = -66.5 ns (1/13365161-1/7415333)*10^9 = -60.0 ns Which is very close to the measured 50 ns cost of the FIB lookup, done by Vincent Bernat. See: https://vincent.bernat.im/en/blog/2017-ipv4-route-lookup-linux Another way I calculate this is by (ran a new benchmark): Performance: 7641593 (7,641,593) <= tx_unicast /sec * Packet-gap: (1/7641593*10^9) = 130.86 ns Find all FIB related lookup functions in perf-report:: Samples: 93K of event 'cycles:ppp', Event count (approx.): 88553104553 Overhead Cost CPU Command Symbol 20.63 % 26.99 ns 002 ksoftirqd/2 [k] fib_table_lookup 12.92 % 16.90 ns 002 ksoftirqd/2 [k] bpf_fib_lookup 2.40 % 3.14 ns 002 ksoftirqd/2 [k] fib_select_path 0.83 % 1.09 ns 002 ksoftirqd/2 [k] fib_get_table 0.40 % 0.52 ns 002 ksoftirqd/2 [k] l3mdev_fib_table_rcu ----------- Tot:37.18 % (20.63+12.92+2.40+0.83+0.40) =========== Cost of FIB lookup: - 130.86/100*37.18 = 48.65 ns overhead by FIB lookup. Again very close to Vincent's IPv4 measurements of ~50 ns. Notice that the IPv6 measurements does not match up: https://vincent.bernat.im/en/blog/2017-ipv6-route-lookup-linux This is because, we/I'm just testing the IPv6 route cache here...
On 5/2/18 5:27 AM, Jesper Dangaard Brouer wrote: > On Sun, 29 Apr 2018 11:07:51 -0700 > David Ahern <dsahern@gmail.com> wrote: > >> Initial performance numbers collected by Jesper, forwarded packets/sec: >> >> Full stack XDP FIB lookup XDP Direct lookup >> IPv4 1,947,969 7,074,156 7,415,333 >> IPv6 1,728,000 6,165,504 7,262,720 > > Do notice these number is single CPU core forwarding performance! > On a Broadwell E5-1650 v4 @ 3.60GHz. I'll add that context to the commit message. Thanks, > > Another interesting data point is that xdp_redirect_map performance is > 13,365,161 pps, which allow us to calculate/isolate the overhead/cost > of the FIB lookup. > > (1/13365161-1/7074156)*10^9 = -66.5 ns > (1/13365161-1/7415333)*10^9 = -60.0 ns > > Which is very close to the measured 50 ns cost of the FIB lookup, done > by Vincent Bernat. > See: https://vincent.bernat.im/en/blog/2017-ipv4-route-lookup-linux > > > > Another way I calculate this is by (ran a new benchmark): > > Performance: 7641593 (7,641,593) <= tx_unicast /sec > * Packet-gap: (1/7641593*10^9) = 130.86 ns > > Find all FIB related lookup functions in perf-report:: > > Samples: 93K of event 'cycles:ppp', Event count (approx.): 88553104553 > Overhead Cost CPU Command Symbol > 20.63 % 26.99 ns 002 ksoftirqd/2 [k] fib_table_lookup > 12.92 % 16.90 ns 002 ksoftirqd/2 [k] bpf_fib_lookup > 2.40 % 3.14 ns 002 ksoftirqd/2 [k] fib_select_path > 0.83 % 1.09 ns 002 ksoftirqd/2 [k] fib_get_table > 0.40 % 0.52 ns 002 ksoftirqd/2 [k] l3mdev_fib_table_rcu > ----------- > Tot:37.18 % (20.63+12.92+2.40+0.83+0.40) > =========== > > Cost of FIB lookup: > - 130.86/100*37.18 = 48.65 ns overhead by FIB lookup. > > Again very close to Vincent's IPv4 measurements of ~50 ns. > > > > Notice that the IPv6 measurements does not match up: > https://vincent.bernat.im/en/blog/2017-ipv6-route-lookup-linux > This is because, we/I'm just testing the IPv6 route cache here... 
> Vincent's blog is before recent changes -- 4.15 getting the rcu locking, net-next getting separate fib entries and now this set adding a FIB lookup without the dst. To share numbers from recent testing I did using Vincent's modules, lookup times in nsec (using local_clock) with MULTIPLE_TABLES config disabled for IPv4 and IPv6 IPv4 IPv6-dst IPv6-fib6 baseline 49 126 52 I have other cases with combinations of configs and rules, but this shows the best possible case. IPv6 needs some more work to improve speeds with MULTIPLE_TABLES enabled (separate local and main tables unlike IPv4) and IPV6_SUBTREES enabled.
From: David Ahern <dsahern@gmail.com> Date: Wed, 2 May 2018 09:37:21 -0600 > To share numbers from recent testing I did using Vincent's modules, > lookup times in nsec (using local_clock) with MULTIPLE_TABLES config > disabled for IPv4 and IPv6 > > IPv4 IPv6-dst IPv6-fib6 > baseline 49 126 52 > > I have other cases with combinations of configs and rules, but this > shows the best possible case. > > IPv6 needs some more work to improve speeds with MULTIPLE_TABLES enabled > (separate local and main tables unlike IPv4) and IPV6_SUBTREES enabled. Yes, like for ipv4 sharing local and main tables will help a lot.
On 4/29/18 7:13 PM, David Ahern wrote: > > The idea here is to fast pass packets that fit a supported profile and > are to be forwarded. Everything else should continue up the stack as it > has wider capabilities. The helper and XDP programs should make no > assumptions on what the broader kernel and userspace might be monitoring > or want to do with packets that can not be forwarded in the fast path. > This is very similar to hardware forwarding when it punts packets to the > CPU for control plane assistance. > Thinking about this some more and how to return more information to the bpf program about the FIB lookup. bpf_fib_lookup struct is 64-bytes. It can not be expanded without hurting performance. I could do another union on an input parameter and return flags indicating why the returned index is 0. Something like this: diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 360a1168c353..75591522444c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2314,6 +2314,12 @@ struct bpf_raw_tracepoint_args { #define BPF_FIB_LOOKUP_DIRECT BIT(0) #define BPF_FIB_LOOKUP_OUTPUT BIT(1) +#define BPF_FIB_LKUP_RET_NO_FWD BIT(0) /* pkt is not fwded */ +#define BPF_FIB_LKUP_RET_UNSUPP_LWT BIT(1) /* fwd requires unsupp encap */ +#define BPF_FIB_LKUP_RET_NO_NHDEV BIT(2) /* nh device does not exist */ +#define BPF_FIB_LKUP_RET_NO_NEIGH BIT(3) /* no neigh entry for nh */ +#define BPF_FIB_LKUP_RET_FRAG_NEEDED BIT(4) /* pkt too big to fwd */ + struct bpf_fib_lookup { /* input */ __u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS */ @@ -2325,7 +2331,11 @@ struct bpf_fib_lookup { /* total length of packet from network header - used for MTU check */ __u16 tot_len; - __u32 ifindex; /* L3 device index for lookup */ + + union { + __u32 ifindex; /* in: L3 device index for lookup */ + __u32 ret_flags; /* out: BPF_FIB_LOOKUP_RET flags */ + } union { /* inputs to lookup */ Similarly for the fib result, it could be returned with a union on say 
family: union { __u8 family; /* in: network family, AF_INET, AF_INET6, AF_MPLS */ __u8 rt_type; /* out: FIB lookup route type */ }; Then if the fib result is -EINVAL/-EHOSTUNREACH/-EACCES, rt_type is set to RTN_BLACKHOLE/RTN_UNREACHABLE/RTN_PROHIBIT, allowing the XDP program to make an informed decision on dropping the packet. To avoid performance hits on the forwarding path, these return values would *only* be set if the ifindex returned is 0.
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 23b334bba1a6..52652507113e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -10,6 +10,8 @@ #include <linux/types.h> #include <linux/bpf_common.h> +#include <linux/if_ether.h> +#include <linux/in6.h> /* Extended instruction set based on top of classic BPF */ @@ -1801,6 +1803,33 @@ union bpf_attr { * Return * a non-negative value equal to or less than size on success, or * a negative error in case of failure. + * + * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * Description + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packets is to be + * forwarded, the neighbor tables are searched for the nexthop. + * If successful (ie., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst, + * ipv6_dst or mpls_out based on family, smac is set to mac + * address of egress device, dmac is set to nexthop mac address, + * rt_metric is set to metric from route. + * + * *plen* argument is the size of the passed in struct. + * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags: + * + * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs + * full lookup using FIB rules + * **BPF_FIB_LOOKUP_OUTPUT** mmeans do lookup from an egress + * perspective (default is ingress) + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. + * + * Return + * Egress device index on success, 0 if packet needs to continue + * up the stack for further processing or a negative error in case + * of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1870,7 +1899,8 @@ union bpf_attr { FN(bind), \ FN(xdp_adjust_tail), \ FN(skb_get_xfrm_state), \ - FN(get_stack), + FN(get_stack), \ + FN(fib_lookup), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2278,4 +2308,55 @@ struct bpf_raw_tracepoint_args { __u64 args[0]; }; +/* DIRECT: Skip the FIB rules and go to FIB table associated with device + * OUTPUT: Do lookup from egress perspective; default is ingress + */ +#define BPF_FIB_LOOKUP_DIRECT BIT(0) +#define BPF_FIB_LOOKUP_OUTPUT BIT(1) + +struct bpf_fib_lookup { + /* input */ + __u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS */ + + /* set if lookup is to consider L4 data - e.g., FIB rules */ + __u8 l4_protocol; + __be16 sport; + __be16 dport; + + /* total length of packet from network header - used for MTU check */ + __u16 tot_len; + __u32 ifindex; /* L3 device index for lookup */ + + union { + /* inputs to lookup */ + __u8 tos; /* AF_INET */ + __be32 flowlabel; /* AF_INET6 */ + + /* output: metric of fib result */ + __u32 rt_metric; + }; + + union { + __be32 mpls_in; + __be32 ipv4_src; + struct in6_addr ipv6_src; + }; + + /* input to bpf_fib_lookup, *dst is destination address. 
+ * output: bpf_fib_lookup sets to gateway address + */ + union { + /* return for MPLS lookups */ + __be32 mpls_out[4]; /* support up to 4 labels */ + __be32 ipv4_dst; + struct in6_addr ipv6_dst; + }; + + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + __u8 smac[ETH_ALEN]; + __u8 dmac[ETH_ALEN]; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index d3781daa26ab..c34ba2675a98 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -59,6 +59,10 @@ #include <net/tcp.h> #include <net/xfrm.h> #include <linux/bpf_trace.h> +#include <linux/inetdevice.h> +#include <net/ip_fib.h> +#include <net/flow.h> +#include <net/arp.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -3788,6 +3792,261 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { }; #endif +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, + const struct neighbour *neigh, + const struct net_device *dev) +{ + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); + params->h_vlan_TCI = 0; + params->h_vlan_proto = 0; + + return dev->ifindex; +} +#endif + +#if IS_ENABLED(CONFIG_INET) +static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags) +{ + struct in_device *in_dev; + struct neighbour *neigh; + struct net_device *dev; + struct fib_result res; + struct fib_nh *nh; + struct flowi4 fl4; + int err; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + /* verify forwarding is enabled on this interface */ + in_dev = __in_dev_get_rcu(dev); + if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) + return 0; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl4.flowi4_iif = 1; + fl4.flowi4_oif = params->ifindex; + } else { + fl4.flowi4_iif = params->ifindex; + fl4.flowi4_oif = 0; + } + fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; + fl4.flowi4_scope = 
RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; + + fl4.flowi4_proto = params->l4_protocol; + fl4.daddr = params->ipv4_dst; + fl4.saddr = params->ipv4_src; + fl4.fl4_sport = params->sport; + fl4.fl4_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib_table *tb; + + tb = fib_get_table(net, tbid); + if (unlikely(!tb)) + return 0; + + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); + } else { + fl4.flowi4_mark = 0; + fl4.flowi4_secid = 0; + fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_uid = sock_net_uid(net, NULL); + + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); + } + + if (err || res.type != RTN_UNICAST) + return 0; + + if (res.fi->fib_nhs > 1) + fib_select_path(net, &res, &fl4, NULL); + + nh = &res.fi->fib_nh[res.nh_sel]; + + /* do not handle lwt encaps right now */ + if (nh->nh_lwtstate) + return 0; + + dev = nh->nh_dev; + if (unlikely(!dev)) + return 0; + + if (nh->nh_gw) + params->ipv4_dst = nh->nh_gw; + + params->rt_metric = res.fi->fib_priority; + + /* xdp and cls_bpf programs are run in RCU-bh so + * rcu_read_lock_bh is not needed here + */ + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); + if (neigh) + return bpf_fib_set_fwd_params(params, neigh, dev); + + return 0; +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags) +{ + struct neighbour *neigh; + struct net_device *dev; + struct fib6_info *f6i; + struct flowi6 fl6; + int strict = 0; + int oif; + + /* link local addresses are never forwarded */ + if (rt6_need_strict(¶ms->ipv6_dst) || + rt6_need_strict(¶ms->ipv6_src)) + return 0; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl6.flowi6_iif = 1; + oif = fl6.flowi6_oif = params->ifindex; + } else { + oif = fl6.flowi6_iif = params->ifindex; + fl6.flowi6_oif = 0; + strict = 
RT6_LOOKUP_F_HAS_SADDR; + } + fl6.flowlabel = params->flowlabel; + fl6.flowi6_scope = 0; + fl6.flowi6_flags = 0; + fl6.mp_hash = 0; + + fl6.flowi6_proto = params->l4_protocol; + fl6.daddr = params->ipv6_dst; + fl6.saddr = params->ipv6_src; + fl6.fl6_sport = params->sport; + fl6.fl6_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib6_table *tb; + + tb = ipv6_stub->fib6_get_table(net, tbid); + if (unlikely(!tb)) + return 0; + + f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); + } else { + fl6.flowi6_mark = 0; + fl6.flowi6_secid = 0; + fl6.flowi6_tun_key.tun_id = 0; + fl6.flowi6_uid = sock_net_uid(net, NULL); + + f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict); + } + + if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) + return 0; + + if (unlikely(f6i->fib6_flags & RTF_REJECT || + f6i->fib6_type != RTN_UNICAST)) + return 0; + + if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) + f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, + fl6.flowi6_oif, NULL, + strict); + + if (f6i->fib6_nh.nh_lwtstate) + return 0; + + if (f6i->fib6_flags & RTF_GATEWAY) + params->ipv6_dst = f6i->fib6_nh.nh_gw; + + dev = f6i->fib6_nh.nh_dev; + params->rt_metric = f6i->fib6_metric; + + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is + * not needed here. 
 * NOTE(review): this source line begins inside the block comment opened on
 * the previous line; the span holds the tail of bpf_ipv6_fib_lookup()
 * (neighbour lookup via ___neigh_lookup_noref() on ipv6_stub->nd_tbl,
 * since __ipv6_neigh_lookup_noref cannot be used without direct nd_tbl
 * access), then the two BPF_CALL_4 helper entry points: bpf_xdp_fib_lookup
 * (xdp_buff ctx) and bpf_skb_fib_lookup (sk_buff ctx).  Both validate
 * plen >= sizeof(*params) before touching *params, dispatch on
 * params->family to the AF_INET/AF_INET6 lookup under the matching
 * CONFIG_* guards, and return -ENOTSUPP otherwise (MPLS from the commit
 * message is therefore not yet implemented).  Their bpf_func_proto
 * descriptors are gpl_only and take PTR_TO_MEM + CONST_SIZE for the
 * params/plen pair, so the verifier enforces the buffer bound.  The span
 * ends with the start of a diff hunk against tc_cls_act_func_proto().
 * NOTE(review): "¶ms" below is HTML-entity mojibake for "&params".
Can not use __ipv6_neigh_lookup_noref here + * because we need to get nd_tbl via the stub + */ + neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, + ndisc_hashfn, ¶ms->ipv6_dst, dev); + if (neigh) + return bpf_fib_set_fwd_params(params, neigh, dev); + + return 0; +} +#endif + +BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + if (plen < sizeof(*params)) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, + flags); +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, + flags); +#endif + } + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { + .func = bpf_xdp_fib_lookup, + .gpl_only = true, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + if (plen < sizeof(*params)) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags); +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags); +#endif + } + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { + .func = bpf_skb_fib_lookup, + .gpl_only = true, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3933,6 +4192,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case 
/* NOTE(review): two unified-diff hunks ("@@ ..." are hunk headers, not C)
 * adding the BPF_FUNC_fib_lookup dispatch: tc_cls_act_func_proto() gets
 * the skb-flavoured bpf_skb_fib_lookup_proto and xdp_func_proto() gets
 * bpf_xdp_fib_lookup_proto, so the same helper id resolves to the
 * context-appropriate implementation per program type.  Both enclosing
 * switch statements are only partially visible here.
 */
BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif + case BPF_FUNC_fib_lookup: + return &bpf_skb_fib_lookup_proto; default: return bpf_base_func_proto(func_id); } @@ -3958,6 +4219,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; + case BPF_FUNC_fib_lookup: + return &bpf_xdp_fib_lookup_proto; default: return bpf_base_func_proto(func_id); }
Provide a helper for doing a FIB and neighbor lookup in the kernel tables from an XDP program. The helper provides a fastpath for forwarding packets. If the packet is a local delivery or for any reason is not a simple lookup and forward, the packet continues up the stack. If it is to be forwarded, the forwarding can be done directly if the neighbor is already known. If the neighbor does not exist, the first few packets go up the stack for neighbor resolution. Once resolved, the XDP program provides the fast path. On successful lookup the nexthop dmac, current device smac and egress device index are returned. The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6 are implemented in this patch. The API includes layer 4 parameters if the XDP program chooses to do deep packet inspection to allow comparison against ACLs implemented as FIB rules. Header rewrite is left to the XDP program. The lookup takes 2 flags: - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes straight to the table associated with the device (expert setting for those looking to maximize throughput) - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective. Default is an ingress lookup. Initial performance numbers collected by Jesper, forwarded packets/sec: Full stack XDP FIB lookup XDP Direct lookup IPv4 1,947,969 7,074,156 7,415,333 IPv6 1,728,000 6,165,504 7,262,720 Signed-off-by: David Ahern <dsahern@gmail.com> --- include/uapi/linux/bpf.h | 83 ++++++++++++++- net/core/filter.c | 263 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 345 insertions(+), 1 deletion(-)