Message ID | 9e3f898c74cdd45c2d71676a0b60fbe6215b5e3d.1526565671.git.m.xhonneux@gmail.com |
---|---|
State | Changes Requested, archived |
Delegated to: | BPF Maintainers |
Headers | show |
Series | ipv6: sr: introduce seg6local End.BPF action | expand |
On 05/17/2018 04:28 PM, Mathieu Xhonneux wrote: > This patch adds the End.BPF action to the LWT seg6local infrastructure. > This action works like any other seg6local End action, meaning that an IPv6 > header with SRH is needed, whose DA has to be equal to the SID of the > action. It will also advance the SRH to the next segment, the BPF program > does not have to take care of this. > > Since the BPF program may not be a source of instability in the kernel, it > is important to ensure that the integrity of the packet is maintained > before yielding it back to the IPv6 layer. The hook hence keeps track if > the SRH has been altered through the helpers, and re-validates its > content if needed with seg6_validate_srh. The state kept for validation is > stored in a per-CPU buffer. The BPF program is not allowed to directly > write into the packet, and only some fields of the SRH can be altered > through the helper bpf_lwt_seg6_store_bytes. > > Performances profiling has shown that the SRH re-validation does not induce > a significant overhead. If the altered SRH is deemed as invalid, the packet > is dropped. > > This validation is also done before executing any action through > bpf_lwt_seg6_action, and will not be performed again if the SRH is not > modified after calling the action. > > The BPF program may return 3 types of return codes: > - BPF_OK: the End.BPF action will look up the next destination through > seg6_lookup_nexthop. > - BPF_REDIRECT: if an action has been executed through the > bpf_lwt_seg6_action helper, the BPF program should return this > value, as the skb's destination is already set and the default > lookup should not be performed. > - BPF_DROP : the packet will be dropped. > > Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com> > Acked-by: David Lebrun <dlebrun@google.com> [...] > static struct seg6_action_desc seg6_action_table[] = { > { > .action = SEG6_LOCAL_ACTION_END, > @@ -497,7 +568,13 @@ static struct seg6_action_desc seg6_action_table[] = { > .attrs = (1 << SEG6_LOCAL_SRH), > .input = input_action_end_b6_encap, > .static_headroom = sizeof(struct ipv6hdr), > - } > + }, > + { > + .action = SEG6_LOCAL_ACTION_END_BPF, > + .attrs = (1 << SEG6_LOCAL_BPF), > + .input = input_action_end_bpf, > + }, > + > }; > > static struct seg6_action_desc *__get_action_desc(int action) > @@ -542,6 +619,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { > .len = sizeof(struct in6_addr) }, > [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, > [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, > + [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, > }; > > static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) > @@ -719,6 +797,71 @@ static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b) > return 0; > } > > +#define MAX_PROG_NAME 256 > +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { > + [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, From UAPI point of view, I wouldn't name it LWT_BPF_PROG_FD but rather something like LWT_BPF_PROG for example. That way, the setup can contain the fd number, but on the dump you can put the prog->aux->id in there so that prog lookup can be done again. > + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, > + .len = MAX_PROG_NAME }, > +}; > + > +static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt) > +{ > + struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; > + struct bpf_prog *p; > + int ret; > + u32 fd; > + > + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attrs[SEG6_LOCAL_BPF], > + bpf_prog_policy, NULL); > + if (ret < 0) > + return ret; > + > + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) > + return -EINVAL; > + > + slwt->bpf.name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL); > + if (!slwt->bpf.name) > + return -ENOMEM; > + > + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); > + p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL); > + if (IS_ERR(p)) > + return PTR_ERR(p); Here in the above error path is definitely a bug in that you don't free the prior allocated slwt->bpf.name from nla_memdup(). Also when you destroy the struct seg6_local_lwt object, what I'm not getting is where you drop the prog reference again and free slwt->bpf.name there? > + > + slwt->bpf.prog = p; > + > + return 0; > +} > + > +static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) > +{ > + struct nlattr *nest; > + > + if (!slwt->bpf.prog) > + return 0; > + > + nest = nla_nest_start(skb, SEG6_LOCAL_BPF); > + if (!nest) > + return -EMSGSIZE; > + > + if (slwt->bpf.name && > + nla_put_string(skb, LWT_BPF_PROG_NAME, slwt->bpf.name)) > + return -EMSGSIZE; > + > + return nla_nest_end(skb, nest); > +} > + > +static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b) > +{ > + if (!a->bpf.name && !b->bpf.name) > + return 0; > + > + if (!a->bpf.name || !b->bpf.name) > + return 1; > + > + return strcmp(a->bpf.name, b->bpf.name); > +} > + > struct seg6_action_param { > int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); > int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt); > @@ -749,6 +892,11 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { > [SEG6_LOCAL_OIF] = { .parse = parse_nla_oif, > .put = put_nla_oif, > .cmp = cmp_nla_oif }, > + > + [SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf, > + .put = put_nla_bpf, > + .cmp = cmp_nla_bpf }, > + > }; > > static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) > @@ -797,7 +945,6 @@ static int seg6_local_build_state(struct nlattr *nla, unsigned int family, > > err = nla_parse_nested(tb, SEG6_LOCAL_MAX, nla, seg6_local_policy, > extack); > - > if (err < 0) > return err; > > @@ -886,6 +1033,11 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt) > if (attrs & (1 << SEG6_LOCAL_OIF)) > nlsize += nla_total_size(4); > > + if (attrs & (1 << SEG6_LOCAL_BPF)) > + nlsize += nla_total_size(sizeof(struct nlattr)) + > + nla_total_size(MAX_PROG_NAME) + > + nla_total_size(4); > + > return nlsize; > } > > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c > index 3dbe217bf23e..a29fed1dfce2 100644 > --- a/tools/lib/bpf/libbpf.c > +++ b/tools/lib/bpf/libbpf.c > @@ -1456,6 +1456,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type) > case BPF_PROG_TYPE_LWT_IN: > case BPF_PROG_TYPE_LWT_OUT: > case BPF_PROG_TYPE_LWT_XMIT: > + case BPF_PROG_TYPE_LWT_SEG6LOCAL: > case BPF_PROG_TYPE_SOCK_OPS: > case BPF_PROG_TYPE_SK_SKB: > case BPF_PROG_TYPE_CGROUP_DEVICE: >
2018-05-18 21:24 GMT+01:00 Daniel Borkmann <daniel@iogearbox.net>: >> +#define MAX_PROG_NAME 256 >> +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { >> + [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, > > From UAPI point of view, I wouldn't name it LWT_BPF_PROG_FD but rather something like > LWT_BPF_PROG for example. That way, the setup can contain the fd number, but on the > dump you can put the prog->aux->id in there so that prog lookup can be done again. Good idea. >> + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, >> + .len = MAX_PROG_NAME }, >> +}; >> + >> +static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt) >> +{ >> + struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; >> + struct bpf_prog *p; >> + int ret; >> + u32 fd; >> + >> + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attrs[SEG6_LOCAL_BPF], >> + bpf_prog_policy, NULL); >> + if (ret < 0) >> + return ret; >> + >> + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) >> + return -EINVAL; >> + >> + slwt->bpf.name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL); >> + if (!slwt->bpf.name) >> + return -ENOMEM; >> + >> + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); >> + p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL); >> + if (IS_ERR(p)) >> + return PTR_ERR(p); > > Here in the above error path is definitely a bug in that you don't free the > prior allocated slwt->bpf.name from nla_memdup(). Indeed, I took this part from the lwt bpf infrastructure. I sent a patch to fix it there too. > Also when you destroy the struct seg6_local_lwt object, what I'm not getting > is where you drop the prog reference again and free slwt->bpf.name there? I totally missed this. Sending a v7 with all of this fixed. Thanks for your comments !
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index aa5c8b878474..b161e506dcfc 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -12,6 +12,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 37f098ca822b..e8efb12d0a7d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -141,6 +141,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_PROG_TYPE_LWT_SEG6LOCAL, }; enum bpf_attach_type { diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h index ef2d8c3e76c1..aadcc11fb918 100644 --- a/include/uapi/linux/seg6_local.h +++ b/include/uapi/linux/seg6_local.h @@ -25,6 +25,7 @@ enum { SEG6_LOCAL_NH6, SEG6_LOCAL_IIF, SEG6_LOCAL_OIF, + SEG6_LOCAL_BPF, __SEG6_LOCAL_MAX, }; #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) @@ -59,6 +60,8 @@ enum { SEG6_LOCAL_ACTION_END_AS = 13, /* forward to SR-unaware VNF with masquerading */ SEG6_LOCAL_ACTION_END_AM = 14, + /* custom BPF action */ + SEG6_LOCAL_ACTION_END_BPF = 15, __SEG6_LOCAL_ACTION_MAX, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a9e4b1372da6..390142d62ba1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1262,6 +1262,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, switch (env->prog->type) { case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; diff --git a/net/core/filter.c b/net/core/filter.c index 39641ea567b4..8cf0065107a3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4893,6 +4893,21 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_seg6_store_bytes: + return &bpf_lwt_seg6_store_bytes_proto; + case BPF_FUNC_lwt_seg6_action: + return &bpf_lwt_seg6_action_proto; + case BPF_FUNC_lwt_seg6_adjust_srh: + return &bpf_lwt_seg6_adjust_srh_proto; + default: + return lwt_out_func_proto(func_id, prog); + } +} + static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -6493,6 +6508,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; +const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { + .get_func_proto = lwt_seg6local_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_seg6local_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index ae68c1ef8fb0..2ac887da63e2 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1,8 +1,9 @@ /* * SR-IPv6 implementation * - * Author: + * Authors: * David Lebrun <david.lebrun@uclouvain.be> + * eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com> * * * This program is free software; you can redistribute it and/or @@ -32,6 +33,7 @@ #endif #include <net/seg6_local.h> #include <linux/etherdevice.h> +#include <linux/bpf.h> struct seg6_local_lwt; @@ -42,6 +44,11 @@ struct seg6_action_desc { int static_headroom; }; +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + struct seg6_local_lwt { int action; struct ipv6_sr_hdr *srh; @@ -50,6 +57,7 @@ struct seg6_local_lwt { struct in6_addr nh6; int iif; int oif; + struct bpf_lwt_prog bpf; int headroom; struct seg6_action_desc *desc; @@ -451,6 +459,69 @@ static int input_action_end_b6_encap(struct sk_buff *skb, DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); +static int input_action_end_bpf(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct seg6_bpf_srh_state local_srh_state; + struct ipv6_sr_hdr *srh; + int srhoff = 0; + int ret; + + srh = get_and_validate_srh(skb); + if (!srh) + goto drop; + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + /* preempt_disable is needed to protect the per-CPU buffer srh_state, + * which is also accessed by the bpf_lwt_seg6_* helpers + */ + preempt_disable(); + srh_state->hdrlen = srh->hdrlen << 3; + srh_state->valid = 1; + + rcu_read_lock(); + bpf_compute_data_pointers(skb); + ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb); + rcu_read_unlock(); + + local_srh_state = *srh_state; + preempt_enable(); + + switch (ret) { + case BPF_OK: + case BPF_REDIRECT: + break; + case BPF_DROP: + goto drop; + default: + pr_warn_once("bpf-seg6local: Illegal return value %u\n", ret); + goto drop; + } + + if (unlikely((local_srh_state.hdrlen & 7) != 0)) + goto drop; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + goto drop; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3); + + if (!local_srh_state.valid && + unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + goto drop; + + if (ret != BPF_REDIRECT) + seg6_lookup_nexthop(skb, NULL, 0); + + return dst_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + static struct seg6_action_desc seg6_action_table[] = { { .action = SEG6_LOCAL_ACTION_END, @@ -497,7 +568,13 @@ static struct seg6_action_desc seg6_action_table[] = { .attrs = (1 << SEG6_LOCAL_SRH), .input = input_action_end_b6_encap, .static_headroom = sizeof(struct ipv6hdr), - } + }, + { + .action = SEG6_LOCAL_ACTION_END_BPF, + .attrs = (1 << SEG6_LOCAL_BPF), + .input = input_action_end_bpf, + }, + }; static struct seg6_action_desc *__get_action_desc(int action) @@ -542,6 +619,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { .len = sizeof(struct in6_addr) }, [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, + [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, }; static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) @@ -719,6 +797,71 @@ static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b) return 0; } +#define MAX_PROG_NAME 256 +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { + [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attrs[SEG6_LOCAL_BPF], + bpf_prog_policy, NULL); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) + return -EINVAL; + + slwt->bpf.name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL); + if (!slwt->bpf.name) + return -ENOMEM; + + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); + p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL); + if (IS_ERR(p)) + return PTR_ERR(p); + + slwt->bpf.prog = p; + + return 0; +} + +static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct nlattr *nest; + + if (!slwt->bpf.prog) + return 0; + + nest = nla_nest_start(skb, SEG6_LOCAL_BPF); + if (!nest) + return -EMSGSIZE; + + if (slwt->bpf.name && + nla_put_string(skb, LWT_BPF_PROG_NAME, slwt->bpf.name)) + return -EMSGSIZE; + + return nla_nest_end(skb, nest); +} + +static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + if (!a->bpf.name && !b->bpf.name) + return 0; + + if (!a->bpf.name || !b->bpf.name) + return 1; + + return strcmp(a->bpf.name, b->bpf.name); +} + struct seg6_action_param { int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt); @@ -749,6 +892,11 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_OIF] = { .parse = parse_nla_oif, .put = put_nla_oif, .cmp = cmp_nla_oif }, + + [SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf, + .put = put_nla_bpf, + .cmp = cmp_nla_bpf }, + }; static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) @@ -797,7 +945,6 @@ static int seg6_local_build_state(struct nlattr *nla, unsigned int family, err = nla_parse_nested(tb, SEG6_LOCAL_MAX, nla, seg6_local_policy, extack); - if (err < 0) return err; @@ -886,6 +1033,11 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt) if (attrs & (1 << SEG6_LOCAL_OIF)) nlsize += nla_total_size(4); + if (attrs & (1 << SEG6_LOCAL_BPF)) + nlsize += nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + + nla_total_size(4); + return nlsize; } diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3dbe217bf23e..a29fed1dfce2 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1456,6 +1456,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type) case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_CGROUP_DEVICE: