Message ID | 20200330030001.2312810-2-andriin@fb.com |
---|---|
State | Accepted |
Delegated to: | BPF Maintainers |
Headers | show |
Series | Add support for cgroup bpf_link | expand |
Andrii Nakryiko <andriin@fb.com> [Sun, 2020-03-29 20:00 -0700]: > Implement new sub-command to attach cgroup BPF programs and return FD-based > bpf_link back on success. bpf_link, once attached to cgroup, cannot be > replaced, except by owner having its FD. Cgroup bpf_link supports only > BPF_F_ALLOW_MULTI semantics. Both link-based and prog-based BPF_F_ALLOW_MULTI > attachments can be freely intermixed. > > To prevent bpf_cgroup_link from keeping cgroup alive past the point when no > BPF program can be executed, implement auto-detachment of link. When > cgroup_bpf_release() is called, all attached bpf_links are forced to release > cgroup refcounts, but they leave bpf_link otherwise active and allocated, as > well as still owning underlying bpf_prog. This is because user-space might > still have FDs open and active, so bpf_link as a user-referenced object can't > be freed yet. Once last active FD is closed, bpf_link will be freed and > underlying bpf_prog refcount will be dropped. But cgroup refcount won't be > touched, because cgroup is released already. > > The inherent race between bpf_cgroup_link release (from closing last FD) and > cgroup_bpf_release() is resolved by both operations taking cgroup_mutex. So > the only additional check required is when bpf_cgroup_link attempts to detach > itself from cgroup. At that time we need to check whether there is still > cgroup associated with that link. And if not, exit with success, because > bpf_cgroup_link was already successfully detached. 
> > Acked-by: Roman Gushchin <guro@fb.com> > Signed-off-by: Andrii Nakryiko <andriin@fb.com> > --- > include/linux/bpf-cgroup.h | 29 ++- > include/linux/bpf.h | 10 +- > include/uapi/linux/bpf.h | 10 +- > kernel/bpf/cgroup.c | 315 +++++++++++++++++++++++++-------- > kernel/bpf/syscall.c | 61 ++++++- > kernel/cgroup/cgroup.c | 14 +- > tools/include/uapi/linux/bpf.h | 10 +- > 7 files changed, 351 insertions(+), 98 deletions(-) > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h > index a7cd5c7a2509..d2d969669564 100644 > --- a/include/linux/bpf-cgroup.h > +++ b/include/linux/bpf-cgroup.h > @@ -51,9 +51,18 @@ struct bpf_cgroup_storage { > struct rcu_head rcu; > }; > > +struct bpf_cgroup_link { > + struct bpf_link link; > + struct cgroup *cgroup; > + enum bpf_attach_type type; > +}; > + > +extern const struct bpf_link_ops bpf_cgroup_link_lops; > + > struct bpf_prog_list { > struct list_head node; > struct bpf_prog *prog; > + struct bpf_cgroup_link *link; > struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; > }; > > @@ -84,20 +93,23 @@ struct cgroup_bpf { > int cgroup_bpf_inherit(struct cgroup *cgrp); > void cgroup_bpf_offline(struct cgroup *cgrp); > > -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > - struct bpf_prog *replace_prog, > +int __cgroup_bpf_attach(struct cgroup *cgrp, > + struct bpf_prog *prog, struct bpf_prog *replace_prog, > + struct bpf_cgroup_link *link, > enum bpf_attach_type type, u32 flags); > int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, > + struct bpf_cgroup_link *link, > enum bpf_attach_type type); > int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, > union bpf_attr __user *uattr); > > /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ > -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > - struct bpf_prog *replace_prog, enum bpf_attach_type type, > +int cgroup_bpf_attach(struct cgroup *cgrp, > + struct bpf_prog *prog, 
struct bpf_prog *replace_prog, > + struct bpf_cgroup_link *link, enum bpf_attach_type type, > u32 flags); > int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, > - enum bpf_attach_type type, u32 flags); > + enum bpf_attach_type type); > int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, > union bpf_attr __user *uattr); > > @@ -332,6 +344,7 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, > enum bpf_prog_type ptype, struct bpf_prog *prog); > int cgroup_bpf_prog_detach(const union bpf_attr *attr, > enum bpf_prog_type ptype); > +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); > int cgroup_bpf_prog_query(const union bpf_attr *attr, > union bpf_attr __user *uattr); > #else > @@ -354,6 +367,12 @@ static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, > return -EINVAL; > } > > +static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, > + struct bpf_prog *prog) > +{ > + return -EINVAL; > +} > + > static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, > union bpf_attr __user *uattr) > { > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 3bde59a8453b..56254d880293 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1082,15 +1082,23 @@ extern int sysctl_unprivileged_bpf_disabled; > int bpf_map_new_fd(struct bpf_map *map, int flags); > int bpf_prog_new_fd(struct bpf_prog *prog); > > -struct bpf_link; > +struct bpf_link { > + atomic64_t refcnt; > + const struct bpf_link_ops *ops; > + struct bpf_prog *prog; > + struct work_struct work; > +}; > > struct bpf_link_ops { > void (*release)(struct bpf_link *link); > void (*dealloc)(struct bpf_link *link); > + > }; > > void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, > struct bpf_prog *prog); > +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, > + int link_fd); > void bpf_link_inc(struct bpf_link *link); > void bpf_link_put(struct bpf_link *link); > 
int bpf_link_new_fd(struct bpf_link *link); > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index f1fbc36f58d3..8b3f1c098ac0 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -111,6 +111,7 @@ enum bpf_cmd { > BPF_MAP_LOOKUP_AND_DELETE_BATCH, > BPF_MAP_UPDATE_BATCH, > BPF_MAP_DELETE_BATCH, > + BPF_LINK_CREATE, > }; > > enum bpf_map_type { > @@ -541,7 +542,7 @@ union bpf_attr { > __u32 prog_cnt; > } query; > > - struct { > + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ > __u64 name; > __u32 prog_fd; > } raw_tracepoint; > @@ -569,6 +570,13 @@ union bpf_attr { > __u64 probe_offset; /* output: probe_offset */ > __u64 probe_addr; /* output: probe_addr */ > } task_fd_query; > + > + struct { /* struct used by BPF_LINK_CREATE command */ > + __u32 prog_fd; /* eBPF program to attach */ > + __u32 target_fd; /* object to attach to */ > + __u32 attach_type; /* attach type */ > + __u32 flags; /* extra flags */ > + } link_create; > } __attribute__((aligned(8))); > > /* The description below is an attempt at providing documentation to eBPF > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c > index 9c8472823a7f..c24029937431 100644 > --- a/kernel/bpf/cgroup.c > +++ b/kernel/bpf/cgroup.c > @@ -80,6 +80,17 @@ static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[]) > bpf_cgroup_storage_unlink(storages[stype]); > } > > +/* Called when bpf_cgroup_link is auto-detached from dying cgroup. > + * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It > + * doesn't free link memory, which will eventually be done by bpf_link's > + * release() callback, when its last FD is closed. 
> + */ > +static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link) > +{ > + cgroup_put(link->cgroup); > + link->cgroup = NULL; > +} > + > /** > * cgroup_bpf_release() - put references of all bpf programs and > * release all cgroup bpf data > @@ -100,7 +111,10 @@ static void cgroup_bpf_release(struct work_struct *work) > > list_for_each_entry_safe(pl, tmp, progs, node) { > list_del(&pl->node); > - bpf_prog_put(pl->prog); > + if (pl->prog) > + bpf_prog_put(pl->prog); > + if (pl->link) > + bpf_cgroup_link_auto_detach(pl->link); > bpf_cgroup_storages_unlink(pl->storage); > bpf_cgroup_storages_free(pl->storage); > kfree(pl); > @@ -134,6 +148,18 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) > queue_work(system_wq, &cgrp->bpf.release_work); > } > > +/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through > + * link or direct prog. > + */ > +static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) > +{ > + if (pl->prog) > + return pl->prog; > + if (pl->link) > + return pl->link->link.prog; > + return NULL; > +} > + > /* count number of elements in the list. 
> * it's slow but the list cannot be long > */ > @@ -143,7 +169,7 @@ static u32 prog_list_length(struct list_head *head) > u32 cnt = 0; > > list_for_each_entry(pl, head, node) { > - if (!pl->prog) > + if (!prog_list_prog(pl)) > continue; > cnt++; > } > @@ -212,11 +238,11 @@ static int compute_effective_progs(struct cgroup *cgrp, > continue; > > list_for_each_entry(pl, &p->bpf.progs[type], node) { > - if (!pl->prog) > + if (!prog_list_prog(pl)) > continue; > > item = &progs->items[cnt]; > - item->prog = pl->prog; > + item->prog = prog_list_prog(pl); > bpf_cgroup_storages_assign(item->cgroup_storage, > pl->storage); > cnt++; > @@ -333,19 +359,60 @@ static int update_effective_progs(struct cgroup *cgrp, > > #define BPF_CGROUP_MAX_PROGS 64 > > +static struct bpf_prog_list *find_attach_entry(struct list_head *progs, > + struct bpf_prog *prog, > + struct bpf_cgroup_link *link, > + struct bpf_prog *replace_prog, > + bool allow_multi) > +{ > + struct bpf_prog_list *pl; > + > + /* single-attach case */ > + if (!allow_multi) { > + if (list_empty(progs)) > + return NULL; > + return list_first_entry(progs, typeof(*pl), node); > + } > + > + list_for_each_entry(pl, progs, node) { > + if (prog && pl->prog == prog) > + /* disallow attaching the same prog twice */ > + return ERR_PTR(-EINVAL); > + if (link && pl->link == link) > + /* disallow attaching the same link twice */ > + return ERR_PTR(-EINVAL); > + } > + > + /* direct prog multi-attach w/ replacement case */ > + if (replace_prog) { > + list_for_each_entry(pl, progs, node) { > + if (pl->prog == replace_prog) > + /* a match found */ > + return pl; > + } > + /* prog to replace not found for cgroup */ > + return ERR_PTR(-ENOENT); > + } > + > + return NULL; > +} > + > /** > - * __cgroup_bpf_attach() - Attach the program to a cgroup, and > + * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and > * propagate the change to descendants > * @cgrp: The cgroup which descendants to traverse > * @prog: A program to 
attach > + * @link: A link to attach > * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set > * @type: Type of attach operation > * @flags: Option flags > * > + * Exactly one of @prog or @link can be non-null. > * Must be called with cgroup_mutex held. > */ > -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > - struct bpf_prog *replace_prog, > +int __cgroup_bpf_attach(struct cgroup *cgrp, > + struct bpf_prog *prog, struct bpf_prog *replace_prog, > + struct bpf_cgroup_link *link, > enum bpf_attach_type type, u32 flags) > { > u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); > @@ -353,13 +420,19 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > struct bpf_prog *old_prog = NULL; > struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], > *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; > - struct bpf_prog_list *pl, *replace_pl = NULL; > + struct bpf_prog_list *pl; > int err; > > if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || > ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) > /* invalid combination */ > return -EINVAL; > + if (link && (prog || replace_prog)) > + /* only either link or prog/replace_prog can be specified */ > + return -EINVAL; > + if (!!replace_prog != !!(flags & BPF_F_REPLACE)) > + /* replace_prog implies BPF_F_REPLACE, and vice versa */ > + return -EINVAL; > > if (!hierarchy_allows_attach(cgrp, type)) > return -EPERM; > @@ -374,26 +447,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) > return -E2BIG; > > - if (flags & BPF_F_ALLOW_MULTI) { > - list_for_each_entry(pl, progs, node) { > - if (pl->prog == prog) > - /* disallow attaching the same prog twice */ > - return -EINVAL; > - if (pl->prog == replace_prog) > - replace_pl = pl; > - } > - if ((flags & BPF_F_REPLACE) && !replace_pl) > - /* prog to replace not found for cgroup */ > - 
return -ENOENT; > - } else if (!list_empty(progs)) { > - replace_pl = list_first_entry(progs, typeof(*pl), node); > - } > + pl = find_attach_entry(progs, prog, link, replace_prog, > + flags & BPF_F_ALLOW_MULTI); > + if (IS_ERR(pl)) > + return PTR_ERR(pl); > > - if (bpf_cgroup_storages_alloc(storage, prog)) > + if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog)) > return -ENOMEM; > > - if (replace_pl) { > - pl = replace_pl; > + if (pl) { > old_prog = pl->prog; > bpf_cgroup_storages_unlink(pl->storage); > bpf_cgroup_storages_assign(old_storage, pl->storage); > @@ -407,6 +469,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > } > > pl->prog = prog; > + pl->link = link; > bpf_cgroup_storages_assign(pl->storage, storage); > cgrp->bpf.flags[type] = saved_flags; > > @@ -414,80 +477,93 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > if (err) > goto cleanup; > > - static_branch_inc(&cgroup_bpf_enabled_key); > bpf_cgroup_storages_free(old_storage); > - if (old_prog) { > + if (old_prog) > bpf_prog_put(old_prog); > - static_branch_dec(&cgroup_bpf_enabled_key); > - } > - bpf_cgroup_storages_link(storage, cgrp, type); > + else > + static_branch_inc(&cgroup_bpf_enabled_key); > + bpf_cgroup_storages_link(pl->storage, cgrp, type); > return 0; > > cleanup: > - /* and cleanup the prog list */ > - pl->prog = old_prog; > + if (old_prog) { > + pl->prog = old_prog; > + pl->link = NULL; > + } > bpf_cgroup_storages_free(pl->storage); > bpf_cgroup_storages_assign(pl->storage, old_storage); > bpf_cgroup_storages_link(pl->storage, cgrp, type); > - if (!replace_pl) { > + if (!old_prog) { > list_del(&pl->node); > kfree(pl); > } > return err; > } > > +static struct bpf_prog_list *find_detach_entry(struct list_head *progs, > + struct bpf_prog *prog, > + struct bpf_cgroup_link *link, > + bool allow_multi) > +{ > + struct bpf_prog_list *pl; > + > + if (!allow_multi) { > + if (list_empty(progs)) > + /* report error when trying to detach 
and nothing is attached */ > + return ERR_PTR(-ENOENT); > + > + /* to maintain backward compatibility NONE and OVERRIDE cgroups > + * allow detaching with invalid FD (prog==NULL) in legacy mode > + */ > + return list_first_entry(progs, typeof(*pl), node); > + } > + > + if (!prog && !link) > + /* to detach MULTI prog the user has to specify valid FD > + * of the program or link to be detached > + */ > + return ERR_PTR(-EINVAL); > + > + /* find the prog or link and detach it */ > + list_for_each_entry(pl, progs, node) { > + if (pl->prog == prog && pl->link == link) > + return pl; > + } > + return ERR_PTR(-ENOENT); > +} > + > /** > - * __cgroup_bpf_detach() - Detach the program from a cgroup, and > + * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and > * propagate the change to descendants > * @cgrp: The cgroup which descendants to traverse > * @prog: A program to detach or NULL > + * @link: A link to detach or NULL > * @type: Type of detach operation > * > + * At most one of @prog or @link can be non-NULL. > * Must be called with cgroup_mutex held. 
> */ > int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, > - enum bpf_attach_type type) > + struct bpf_cgroup_link *link, enum bpf_attach_type type) > { > struct list_head *progs = &cgrp->bpf.progs[type]; > u32 flags = cgrp->bpf.flags[type]; > - struct bpf_prog *old_prog = NULL; > struct bpf_prog_list *pl; > + struct bpf_prog *old_prog; > int err; > > - if (flags & BPF_F_ALLOW_MULTI) { > - if (!prog) > - /* to detach MULTI prog the user has to specify valid FD > - * of the program to be detached > - */ > - return -EINVAL; > - } else { > - if (list_empty(progs)) > - /* report error when trying to detach and nothing is attached */ > - return -ENOENT; > - } > + if (prog && link) > + /* only one of prog or link can be specified */ > + return -EINVAL; > > - if (flags & BPF_F_ALLOW_MULTI) { > - /* find the prog and detach it */ > - list_for_each_entry(pl, progs, node) { > - if (pl->prog != prog) > - continue; > - old_prog = prog; > - /* mark it deleted, so it's ignored while > - * recomputing effective > - */ > - pl->prog = NULL; > - break; > - } > - if (!old_prog) > - return -ENOENT; > - } else { > - /* to maintain backward compatibility NONE and OVERRIDE cgroups > - * allow detaching with invalid FD (prog==NULL) > - */ > - pl = list_first_entry(progs, typeof(*pl), node); > - old_prog = pl->prog; > - pl->prog = NULL; > - } > + pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI); > + if (IS_ERR(pl)) > + return PTR_ERR(pl); > + > + /* mark it deleted, so it's ignored while recomputing effective */ > + old_prog = pl->prog; > + pl->prog = NULL; > + pl->link = NULL; > > err = update_effective_progs(cgrp, type); > if (err) > @@ -501,14 +577,15 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, > if (list_empty(progs)) > /* last program was detached, reset flags to zero */ > cgrp->bpf.flags[type] = 0; > - > - bpf_prog_put(old_prog); > + if (old_prog) > + bpf_prog_put(old_prog); > static_branch_dec(&cgroup_bpf_enabled_key); 
> return 0; > > cleanup: > - /* and restore back old_prog */ > + /* restore back prog or link */ > pl->prog = old_prog; > + pl->link = link; > return err; > } > > @@ -521,6 +598,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, > struct list_head *progs = &cgrp->bpf.progs[type]; > u32 flags = cgrp->bpf.flags[type]; > struct bpf_prog_array *effective; > + struct bpf_prog *prog; > int cnt, ret = 0, i; > > effective = rcu_dereference_protected(cgrp->bpf.effective[type], > @@ -551,7 +629,8 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, > > i = 0; > list_for_each_entry(pl, progs, node) { > - id = pl->prog->aux->id; > + prog = prog_list_prog(pl); > + id = prog->aux->id; > if (copy_to_user(prog_ids + i, &id, sizeof(id))) > return -EFAULT; > if (++i == cnt) > @@ -581,8 +660,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, > } > } > > - ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type, > - attr->attach_flags); > + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, > + attr->attach_type, attr->attach_flags); > > if (replace_prog) > bpf_prog_put(replace_prog); > @@ -604,7 +683,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) > if (IS_ERR(prog)) > prog = NULL; > > - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); > + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type); > if (prog) > bpf_prog_put(prog); > > @@ -612,6 +691,90 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) > return ret; > } > > +static void bpf_cgroup_link_release(struct bpf_link *link) > +{ > + struct bpf_cgroup_link *cg_link = > + container_of(link, struct bpf_cgroup_link, link); > + > + /* link might have been auto-detached by dying cgroup already, > + * in that case our work is done here > + */ > + if (!cg_link->cgroup) > + return; > + > + mutex_lock(&cgroup_mutex); > + > + /* re-check cgroup under lock again */ > + if 
(!cg_link->cgroup) { > + mutex_unlock(&cgroup_mutex); > + return; > + } > + > + WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, > + cg_link->type)); > + > + mutex_unlock(&cgroup_mutex); > + cgroup_put(cg_link->cgroup); > +} > + > +static void bpf_cgroup_link_dealloc(struct bpf_link *link) > +{ > + struct bpf_cgroup_link *cg_link = > + container_of(link, struct bpf_cgroup_link, link); > + > + kfree(cg_link); > +} > + > +const struct bpf_link_ops bpf_cgroup_link_lops = { > + .release = bpf_cgroup_link_release, > + .dealloc = bpf_cgroup_link_dealloc, > +}; > + > +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) > +{ > + struct bpf_cgroup_link *link; > + struct file *link_file; > + struct cgroup *cgrp; > + int err, link_fd; > + > + if (attr->link_create.flags) > + return -EINVAL; > + > + cgrp = cgroup_get_from_fd(attr->link_create.target_fd); > + if (IS_ERR(cgrp)) > + return PTR_ERR(cgrp); > + > + link = kzalloc(sizeof(*link), GFP_USER); > + if (!link) { > + err = -ENOMEM; > + goto out_put_cgroup; > + } > + bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); > + link->cgroup = cgrp; > + link->type = attr->link_create.attach_type; > + > + link_file = bpf_link_new_file(&link->link, &link_fd); > + if (IS_ERR(link_file)) { > + kfree(link); > + err = PTR_ERR(link_file); > + goto out_put_cgroup; > + } > + > + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, > + BPF_F_ALLOW_MULTI); > + if (err) { > + bpf_link_cleanup(&link->link, link_file, link_fd); > + goto out_put_cgroup; > + } > + > + fd_install(link_fd, link_file); > + return link_fd; > + > +out_put_cgroup: > + cgroup_put(cgrp); > + return err; > +} > + > int cgroup_bpf_prog_query(const union bpf_attr *attr, > union bpf_attr __user *uattr) > { > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index a616b63f23b4..05412b83ed6c 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -2175,13 +2175,6 @@ static int bpf_obj_get(const union 
bpf_attr *attr) > attr->file_flags); > } > > -struct bpf_link { > - atomic64_t refcnt; > - const struct bpf_link_ops *ops; > - struct bpf_prog *prog; > - struct work_struct work; > -}; > - > void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, > struct bpf_prog *prog) > { > @@ -2195,8 +2188,8 @@ void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, > * anon_inode's release() call. This helper manages marking bpf_link as > * defunct, releases anon_inode file and puts reserved FD. > */ > -static void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, > - int link_fd) > +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, > + int link_fd) > { > link->prog = NULL; > fput(link_file); > @@ -2266,6 +2259,10 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) > link_type = "raw_tracepoint"; > else if (link->ops == &bpf_tracing_link_lops) > link_type = "tracing"; > +#ifdef CONFIG_CGROUP_BPF > + else if (link->ops == &bpf_cgroup_link_lops) > + link_type = "cgroup"; > +#endif > else > link_type = "unknown"; > > @@ -3553,6 +3550,49 @@ static int bpf_map_do_batch(const union bpf_attr *attr, > return err; > } > > +#define BPF_LINK_CREATE_LAST_FIELD link_create.flags > +static int link_create(union bpf_attr *attr) > +{ From what I see this function does not check any capability whether the existing bpf_prog_attach() checks for CAP_NET_ADMIN. This is pretty importnant difference but I don't see it clarified in the commit message or discussed (or I missed it?). Having a way to attach cgroup bpf prog by non-priv users is actually helpful in some use-cases, e.g. systemd required patching in the past to make it work with user (non-priv) sessions, see [0]. But in other cases it's also useful to limit the ability to attach programs to a cgroup while using bpf_link so that only the thing that controls cgroup setup can attach but not any non-priv process running in that cgroup. 
How is this use-case covered in BPF_LINK_CREATE? [0] https://github.com/systemd/systemd/pull/12745 > + enum bpf_prog_type ptype; > + struct bpf_prog *prog; > + int ret; > + > + if (CHECK_ATTR(BPF_LINK_CREATE)) > + return -EINVAL; > + > + ptype = attach_type_to_prog_type(attr->link_create.attach_type); > + if (ptype == BPF_PROG_TYPE_UNSPEC) > + return -EINVAL; > + > + prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype); > + if (IS_ERR(prog)) > + return PTR_ERR(prog); > + > + ret = bpf_prog_attach_check_attach_type(prog, > + attr->link_create.attach_type); > + if (ret) > + goto err_out; > + > + switch (ptype) { > + case BPF_PROG_TYPE_CGROUP_SKB: > + case BPF_PROG_TYPE_CGROUP_SOCK: > + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: > + case BPF_PROG_TYPE_SOCK_OPS: > + case BPF_PROG_TYPE_CGROUP_DEVICE: > + case BPF_PROG_TYPE_CGROUP_SYSCTL: > + case BPF_PROG_TYPE_CGROUP_SOCKOPT: > + ret = cgroup_bpf_link_attach(attr, prog); > + break; > + default: > + ret = -EINVAL; > + } > + > +err_out: > + if (ret < 0) > + bpf_prog_put(prog); > + return ret; > +} > + > SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) > { > union bpf_attr attr = {}; > @@ -3663,6 +3703,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz > case BPF_MAP_DELETE_BATCH: > err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); > break; > + case BPF_LINK_CREATE: > + err = link_create(&attr); > + break; > default: > err = -EINVAL; > break; > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > index 3dead0416b91..219624fba9ba 100644 > --- a/kernel/cgroup/cgroup.c > +++ b/kernel/cgroup/cgroup.c > @@ -6303,27 +6303,31 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) > #endif /* CONFIG_SOCK_CGROUP_DATA */ > > #ifdef CONFIG_CGROUP_BPF > -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, > - struct bpf_prog *replace_prog, enum bpf_attach_type type, > +int cgroup_bpf_attach(struct cgroup *cgrp, > + struct bpf_prog 
*prog, struct bpf_prog *replace_prog, > + struct bpf_cgroup_link *link, > + enum bpf_attach_type type, > u32 flags) > { > int ret; > > mutex_lock(&cgroup_mutex); > - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags); > + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); > mutex_unlock(&cgroup_mutex); > return ret; > } > + > int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, > - enum bpf_attach_type type, u32 flags) > + enum bpf_attach_type type) > { > int ret; > > mutex_lock(&cgroup_mutex); > - ret = __cgroup_bpf_detach(cgrp, prog, type); > + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); > mutex_unlock(&cgroup_mutex); > return ret; > } > + > int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, > union bpf_attr __user *uattr) > { > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index f1fbc36f58d3..8b3f1c098ac0 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -111,6 +111,7 @@ enum bpf_cmd { > BPF_MAP_LOOKUP_AND_DELETE_BATCH, > BPF_MAP_UPDATE_BATCH, > BPF_MAP_DELETE_BATCH, > + BPF_LINK_CREATE, > }; > > enum bpf_map_type { > @@ -541,7 +542,7 @@ union bpf_attr { > __u32 prog_cnt; > } query; > > - struct { > + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ > __u64 name; > __u32 prog_fd; > } raw_tracepoint; > @@ -569,6 +570,13 @@ union bpf_attr { > __u64 probe_offset; /* output: probe_offset */ > __u64 probe_addr; /* output: probe_addr */ > } task_fd_query; > + > + struct { /* struct used by BPF_LINK_CREATE command */ > + __u32 prog_fd; /* eBPF program to attach */ > + __u32 target_fd; /* object to attach to */ > + __u32 attach_type; /* attach type */ > + __u32 flags; /* extra flags */ > + } link_create; > } __attribute__((aligned(8))); > > /* The description below is an attempt at providing documentation to eBPF > -- > 2.17.1 >
On Mon, Mar 30, 2020 at 05:05:13PM -0700, Andrey Ignatov wrote: > > > > +#define BPF_LINK_CREATE_LAST_FIELD link_create.flags > > +static int link_create(union bpf_attr *attr) > > +{ > > From what I see this function does not check any capability whether the > existing bpf_prog_attach() checks for CAP_NET_ADMIN. Great catch! It's a bug. I fixed it up. > This is pretty important difference but I don't see it clarified in the > commit message or discussed (or I missed it?). > > Having a way to attach cgroup bpf prog by non-priv users is actually > helpful in some use-cases, e.g. systemd required patching in the past to > make it work with user (non-priv) sessions, see [0]. > > But in other cases it's also useful to limit the ability to attach > programs to a cgroup while using bpf_link so that only the thing that > controls cgroup setup can attach but not any non-priv process running in > that cgroup. How is this use-case covered in BPF_LINK_CREATE? > > > [0] https://github.com/systemd/systemd/pull/12745 yeah. we need to resurrect the discussion around CAP_BPF. PS pls trim your replies.
On Mon, Mar 30, 2020 at 5:38 PM Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > On Mon, Mar 30, 2020 at 05:05:13PM -0700, Andrey Ignatov wrote: > > > > > > +#define BPF_LINK_CREATE_LAST_FIELD link_create.flags > > > +static int link_create(union bpf_attr *attr) > > > +{ > > > > From what I see this function does not check any capability whether the > > existing bpf_prog_attach() checks for CAP_NET_ADMIN. > > Great catch! It's a bug. > I fixed it up. Thanks! > > > This is pretty importnant difference but I don't see it clarified in the > > commit message or discussed (or I missed it?). Yeah, not intentional, thanks for catching! > > > > Having a way to attach cgroup bpf prog by non-priv users is actually > > helpful in some use-cases, e.g. systemd required patching in the past to > > make it work with user (non-priv) sessions, see [0]. > > > > But in other cases it's also useful to limit the ability to attach > > programs to a cgroup while using bpf_link so that only the thing that > > controls cgroup setup can attach but not any non-priv process running in > > that cgroup. How is this use-case covered in BPF_LINK_CREATE? > > > > > > [0] https://github.com/systemd/systemd/pull/12745 > > yeah. we need to resurrect the discussion around CAP_BPF. > > PS > pls trim your replies.
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a7cd5c7a2509..d2d969669564 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -51,9 +51,18 @@ struct bpf_cgroup_storage { struct rcu_head rcu; }; +struct bpf_cgroup_link { + struct bpf_link link; + struct cgroup *cgroup; + enum bpf_attach_type type; +}; + +extern const struct bpf_link_ops bpf_cgroup_link_lops; + struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; + struct bpf_cgroup_link *link; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; }; @@ -84,20 +93,23 @@ struct cgroup_bpf { int cgroup_bpf_inherit(struct cgroup *cgrp); void cgroup_bpf_offline(struct cgroup *cgrp); -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, +int __cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type); int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, enum bpf_attach_type type, +int cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags); + enum bpf_attach_type type); int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -332,6 +344,7 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, 
enum bpf_prog_type ptype); +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); #else @@ -354,6 +367,12 @@ static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, return -EINVAL; } +static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3bde59a8453b..56254d880293 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1082,15 +1082,23 @@ extern int sysctl_unprivileged_bpf_disabled; int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); -struct bpf_link; +struct bpf_link { + atomic64_t refcnt; + const struct bpf_link_ops *ops; + struct bpf_prog *prog; + struct work_struct work; +}; struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); + }; void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, struct bpf_prog *prog); +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, + int link_fd); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1fbc36f58d3..8b3f1c098ac0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -111,6 +111,7 @@ enum bpf_cmd { BPF_MAP_LOOKUP_AND_DELETE_BATCH, BPF_MAP_UPDATE_BATCH, BPF_MAP_DELETE_BATCH, + BPF_LINK_CREATE, }; enum bpf_map_type { @@ -541,7 +542,7 @@ union bpf_attr { __u32 prog_cnt; } query; - struct { + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ __u64 name; __u32 prog_fd; } raw_tracepoint; @@ -569,6 +570,13 @@ union bpf_attr { __u64 probe_offset; /* output: 
probe_offset */ __u64 probe_addr; /* output: probe_addr */ } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + __u32 prog_fd; /* eBPF program to attach */ + __u32 target_fd; /* object to attach to */ + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + } link_create; } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9c8472823a7f..c24029937431 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -80,6 +80,17 @@ static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[]) bpf_cgroup_storage_unlink(storages[stype]); } +/* Called when bpf_cgroup_link is auto-detached from dying cgroup. + * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It + * doesn't free link memory, which will eventually be done by bpf_link's + * release() callback, when its last FD is closed. + */ +static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link) +{ + cgroup_put(link->cgroup); + link->cgroup = NULL; +} + /** * cgroup_bpf_release() - put references of all bpf programs and * release all cgroup bpf data @@ -100,7 +111,10 @@ static void cgroup_bpf_release(struct work_struct *work) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); - bpf_prog_put(pl->prog); + if (pl->prog) + bpf_prog_put(pl->prog); + if (pl->link) + bpf_cgroup_link_auto_detach(pl->link); bpf_cgroup_storages_unlink(pl->storage); bpf_cgroup_storages_free(pl->storage); kfree(pl); @@ -134,6 +148,18 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) queue_work(system_wq, &cgrp->bpf.release_work); } +/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through + * link or direct prog. 
+ */ +static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) +{ + if (pl->prog) + return pl->prog; + if (pl->link) + return pl->link->link.prog; + return NULL; +} + /* count number of elements in the list. * it's slow but the list cannot be long */ @@ -143,7 +169,7 @@ static u32 prog_list_length(struct list_head *head) u32 cnt = 0; list_for_each_entry(pl, head, node) { - if (!pl->prog) + if (!prog_list_prog(pl)) continue; cnt++; } @@ -212,11 +238,11 @@ static int compute_effective_progs(struct cgroup *cgrp, continue; list_for_each_entry(pl, &p->bpf.progs[type], node) { - if (!pl->prog) + if (!prog_list_prog(pl)) continue; item = &progs->items[cnt]; - item->prog = pl->prog; + item->prog = prog_list_prog(pl); bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage); cnt++; @@ -333,19 +359,60 @@ static int update_effective_progs(struct cgroup *cgrp, #define BPF_CGROUP_MAX_PROGS 64 +static struct bpf_prog_list *find_attach_entry(struct list_head *progs, + struct bpf_prog *prog, + struct bpf_cgroup_link *link, + struct bpf_prog *replace_prog, + bool allow_multi) +{ + struct bpf_prog_list *pl; + + /* single-attach case */ + if (!allow_multi) { + if (list_empty(progs)) + return NULL; + return list_first_entry(progs, typeof(*pl), node); + } + + list_for_each_entry(pl, progs, node) { + if (prog && pl->prog == prog) + /* disallow attaching the same prog twice */ + return ERR_PTR(-EINVAL); + if (link && pl->link == link) + /* disallow attaching the same link twice */ + return ERR_PTR(-EINVAL); + } + + /* direct prog multi-attach w/ replacement case */ + if (replace_prog) { + list_for_each_entry(pl, progs, node) { + if (pl->prog == replace_prog) + /* a match found */ + return pl; + } + /* prog to replace not found for cgroup */ + return ERR_PTR(-ENOENT); + } + + return NULL; +} + /** - * __cgroup_bpf_attach() - Attach the program to a cgroup, and + * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and * propagate the change to descendants * 
@cgrp: The cgroup which descendants to traverse * @prog: A program to attach + * @link: A link to attach * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags * + * Exactly one of @prog or @link can be non-null. * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, +int __cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); @@ -353,13 +420,19 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; - struct bpf_prog_list *pl, *replace_pl = NULL; + struct bpf_prog_list *pl; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; + if (link && (prog || replace_prog)) + /* only either link or prog/replace_prog can be specified */ + return -EINVAL; + if (!!replace_prog != !!(flags & BPF_F_REPLACE)) + /* replace_prog implies BPF_F_REPLACE, and vice versa */ + return -EINVAL; if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; @@ -374,26 +447,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; - if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) { - if (pl->prog == prog) - /* disallow attaching the same prog twice */ - return -EINVAL; - if (pl->prog == replace_prog) - replace_pl = pl; - } - if ((flags & BPF_F_REPLACE) && !replace_pl) - /* prog to replace not found for cgroup */ - return -ENOENT; - } else if 
(!list_empty(progs)) { - replace_pl = list_first_entry(progs, typeof(*pl), node); - } + pl = find_attach_entry(progs, prog, link, replace_prog, + flags & BPF_F_ALLOW_MULTI); + if (IS_ERR(pl)) + return PTR_ERR(pl); - if (bpf_cgroup_storages_alloc(storage, prog)) + if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog)) return -ENOMEM; - if (replace_pl) { - pl = replace_pl; + if (pl) { old_prog = pl->prog; bpf_cgroup_storages_unlink(pl->storage); bpf_cgroup_storages_assign(old_storage, pl->storage); @@ -407,6 +469,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } pl->prog = prog; + pl->link = link; bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[type] = saved_flags; @@ -414,80 +477,93 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (err) goto cleanup; - static_branch_inc(&cgroup_bpf_enabled_key); bpf_cgroup_storages_free(old_storage); - if (old_prog) { + if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); - } - bpf_cgroup_storages_link(storage, cgrp, type); + else + static_branch_inc(&cgroup_bpf_enabled_key); + bpf_cgroup_storages_link(pl->storage, cgrp, type); return 0; cleanup: - /* and cleanup the prog list */ - pl->prog = old_prog; + if (old_prog) { + pl->prog = old_prog; + pl->link = NULL; + } bpf_cgroup_storages_free(pl->storage); bpf_cgroup_storages_assign(pl->storage, old_storage); bpf_cgroup_storages_link(pl->storage, cgrp, type); - if (!replace_pl) { + if (!old_prog) { list_del(&pl->node); kfree(pl); } return err; } +static struct bpf_prog_list *find_detach_entry(struct list_head *progs, + struct bpf_prog *prog, + struct bpf_cgroup_link *link, + bool allow_multi) +{ + struct bpf_prog_list *pl; + + if (!allow_multi) { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return ERR_PTR(-ENOENT); + + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) 
in legacy mode + */ + return list_first_entry(progs, typeof(*pl), node); + } + + if (!prog && !link) + /* to detach MULTI prog the user has to specify valid FD + * of the program or link to be detached + */ + return ERR_PTR(-EINVAL); + + /* find the prog or link and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog && pl->link == link) + return pl; + } + return ERR_PTR(-ENOENT); +} + /** - * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to detach or NULL + * @link: A link to detach or NULL * @type: Type of detach operation * + * At most one of @prog or @link can be non-NULL. * Must be called with cgroup_mutex held. */ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) + struct bpf_cgroup_link *link, enum bpf_attach_type type) { struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; - struct bpf_prog *old_prog = NULL; struct bpf_prog_list *pl; + struct bpf_prog *old_prog; int err; - if (flags & BPF_F_ALLOW_MULTI) { - if (!prog) - /* to detach MULTI prog the user has to specify valid FD - * of the program to be detached - */ - return -EINVAL; - } else { - if (list_empty(progs)) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; - } + if (prog && link) + /* only one of prog or link can be specified */ + return -EINVAL; - if (flags & BPF_F_ALLOW_MULTI) { - /* find the prog and detach it */ - list_for_each_entry(pl, progs, node) { - if (pl->prog != prog) - continue; - old_prog = prog; - /* mark it deleted, so it's ignored while - * recomputing effective - */ - pl->prog = NULL; - break; - } - if (!old_prog) - return -ENOENT; - } else { - /* to maintain backward compatibility NONE and OVERRIDE cgroups - * allow detaching with invalid FD
(prog==NULL) - */ - pl = list_first_entry(progs, typeof(*pl), node); - old_prog = pl->prog; - pl->prog = NULL; - } + pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI); + if (IS_ERR(pl)) + return PTR_ERR(pl); + + /* mark it deleted, so it's ignored while recomputing effective */ + old_prog = pl->prog; + pl->prog = NULL; + pl->link = NULL; err = update_effective_progs(cgrp, type); if (err) @@ -501,14 +577,15 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, if (list_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[type] = 0; - - bpf_prog_put(old_prog); + if (old_prog) + bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); return 0; cleanup: - /* and restore back old_prog */ + /* restore back prog or link */ pl->prog = old_prog; + pl->link = link; return err; } @@ -521,6 +598,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog_array *effective; + struct bpf_prog *prog; int cnt, ret = 0, i; effective = rcu_dereference_protected(cgrp->bpf.effective[type], @@ -551,7 +629,8 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, i = 0; list_for_each_entry(pl, progs, node) { - id = pl->prog->aux->id; + prog = prog_list_prog(pl); + id = prog->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) return -EFAULT; if (++i == cnt) @@ -581,8 +660,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, } } - ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type, - attr->attach_flags); + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, + attr->attach_type, attr->attach_flags); if (replace_prog) bpf_prog_put(replace_prog); @@ -604,7 +683,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) if (IS_ERR(prog)) prog = NULL; - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + ret = 
cgroup_bpf_detach(cgrp, prog, attr->attach_type); if (prog) bpf_prog_put(prog); @@ -612,6 +691,90 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) return ret; } +static void bpf_cgroup_link_release(struct bpf_link *link) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + + /* link might have been auto-detached by dying cgroup already, + * in that case our work is done here + */ + if (!cg_link->cgroup) + return; + + mutex_lock(&cgroup_mutex); + + /* re-check cgroup under lock again */ + if (!cg_link->cgroup) { + mutex_unlock(&cgroup_mutex); + return; + } + + WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, + cg_link->type)); + + mutex_unlock(&cgroup_mutex); + cgroup_put(cg_link->cgroup); +} + +static void bpf_cgroup_link_dealloc(struct bpf_link *link) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + + kfree(cg_link); +} + +const struct bpf_link_ops bpf_cgroup_link_lops = { + .release = bpf_cgroup_link_release, + .dealloc = bpf_cgroup_link_dealloc, +}; + +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_cgroup_link *link; + struct file *link_file; + struct cgroup *cgrp; + int err, link_fd; + + if (attr->link_create.flags) + return -EINVAL; + + cgrp = cgroup_get_from_fd(attr->link_create.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_cgroup; + } + bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + link->cgroup = cgrp; + link->type = attr->link_create.attach_type; + + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_cgroup; + } + + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, + BPF_F_ALLOW_MULTI); + if (err) { + bpf_link_cleanup(&link->link, link_file, link_fd); + goto out_put_cgroup; 
+ } + + fd_install(link_fd, link_file); + return link_fd; + +out_put_cgroup: + cgroup_put(cgrp); + return err; +} + int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a616b63f23b4..05412b83ed6c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2175,13 +2175,6 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -struct bpf_link { - atomic64_t refcnt; - const struct bpf_link_ops *ops; - struct bpf_prog *prog; - struct work_struct work; -}; - void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, struct bpf_prog *prog) { @@ -2195,8 +2188,8 @@ void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, * anon_inode's release() call. This helper manages marking bpf_link as * defunct, releases anon_inode file and puts reserved FD. */ -static void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd) +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, + int link_fd) { link->prog = NULL; fput(link_file); @@ -2266,6 +2259,10 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) link_type = "raw_tracepoint"; else if (link->ops == &bpf_tracing_link_lops) link_type = "tracing"; +#ifdef CONFIG_CGROUP_BPF + else if (link->ops == &bpf_cgroup_link_lops) + link_type = "cgroup"; +#endif else link_type = "unknown"; @@ -3553,6 +3550,49 @@ static int bpf_map_do_batch(const union bpf_attr *attr, return err; } +#define BPF_LINK_CREATE_LAST_FIELD link_create.flags +static int link_create(union bpf_attr *attr) +{ + enum bpf_prog_type ptype; + struct bpf_prog *prog; + int ret; + + if (CHECK_ATTR(BPF_LINK_CREATE)) + return -EINVAL; + + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC) + return -EINVAL; + + prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + 
ret = bpf_prog_attach_check_attach_type(prog, + attr->link_create.attach_type); + if (ret) + goto err_out; + + switch (ptype) { + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + ret = cgroup_bpf_link_attach(attr, prog); + break; + default: + ret = -EINVAL; + } + +err_out: + if (ret < 0) + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -3663,6 +3703,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_DELETE_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); break; + case BPF_LINK_CREATE: + err = link_create(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3dead0416b91..219624fba9ba 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6303,27 +6303,31 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, enum bpf_attach_type type, +int cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, + enum bpf_attach_type type, u32 flags) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); mutex_unlock(&cgroup_mutex); return ret; } + int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags) + enum bpf_attach_type type) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_detach(cgrp, prog, type); + ret = 
__cgroup_bpf_detach(cgrp, prog, NULL, type); mutex_unlock(&cgroup_mutex); return ret; } + int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f1fbc36f58d3..8b3f1c098ac0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -111,6 +111,7 @@ enum bpf_cmd { BPF_MAP_LOOKUP_AND_DELETE_BATCH, BPF_MAP_UPDATE_BATCH, BPF_MAP_DELETE_BATCH, + BPF_LINK_CREATE, }; enum bpf_map_type { @@ -541,7 +542,7 @@ union bpf_attr { __u32 prog_cnt; } query; - struct { + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ __u64 name; __u32 prog_fd; } raw_tracepoint; @@ -569,6 +570,13 @@ union bpf_attr { __u64 probe_offset; /* output: probe_offset */ __u64 probe_addr; /* output: probe_addr */ } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + __u32 prog_fd; /* eBPF program to attach */ + __u32 target_fd; /* object to attach to */ + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + } link_create; } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF