Message ID | 20181213004237.3888568-1-mmullins@fb.com |
---|---|
State | Accepted, archived |
Delegated to: | BPF Maintainers |
Headers | show |
Series | [bpf-next,v2] bpf: support raw tracepoints in modules | expand |
On Wed, Dec 12, 2018 at 04:42:37PM -0800, Matt Mullins wrote: > Distributions build drivers as modules, including network and filesystem > drivers which export numerous tracepoints. This enables > bpf(BPF_RAW_TRACEPOINT_OPEN) to attach to those tracepoints. > > Signed-off-by: Matt Mullins <mmullins@fb.com> > --- > v1->v2: > * avoid taking the mutex in bpf_event_notify when op is neither COMING nor > GOING. > * check that kzalloc actually succeeded > > I didn't try to check list_empty before taking the mutex since I want to avoid > races between bpf_event_notify and bpf_get_raw_tracepoint. Additionally, > list_for_each_entry_safe is not strictly necessary upon MODULE_STATE_GOING, but > Alexei suggested I use it to protect against fragility if the subsequent break; > eventually disappears. > > include/linux/module.h | 4 ++ > include/linux/trace_events.h | 8 ++- > kernel/bpf/syscall.c | 11 ++-- > kernel/module.c | 5 ++ > kernel/trace/bpf_trace.c | 99 +++++++++++++++++++++++++++++++++++- > 5 files changed, 120 insertions(+), 7 deletions(-) > > diff --git a/include/linux/module.h b/include/linux/module.h > index fce6b4335e36..5f147dd5e709 100644 > --- a/include/linux/module.h > +++ b/include/linux/module.h > @@ -432,6 +432,10 @@ struct module { > unsigned int num_tracepoints; > tracepoint_ptr_t *tracepoints_ptrs; > #endif > +#ifdef CONFIG_BPF_EVENTS > + unsigned int num_bpf_raw_events; > + struct bpf_raw_event_map *bpf_raw_events; > +#endif > #ifdef HAVE_JUMP_LABEL > struct jump_entry *jump_entries; > unsigned int num_jump_entries; > diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h > index 4130a5497d40..8a62731673f7 100644 > --- a/include/linux/trace_events.h > +++ b/include/linux/trace_events.h > @@ -471,7 +471,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event); > int perf_event_query_prog_array(struct perf_event *event, void __user *info); > int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); > int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); > -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); > +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); > +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); > int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, > u32 *fd_type, const char **buf, > u64 *probe_offset, u64 *probe_addr); > @@ -502,10 +503,13 @@ static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf > { > return -EOPNOTSUPP; > } > -static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) > +static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) > { > return NULL; > } > +static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) > +{ > +} > static inline int bpf_get_perf_event_info(const struct perf_event *event, > u32 *prog_id, u32 *fd_type, > const char **buf, u64 *probe_offset, > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 70fb11106fc2..754370e3155e 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -1609,6 +1609,7 @@ static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) > bpf_probe_unregister(raw_tp->btp, raw_tp->prog); > bpf_prog_put(raw_tp->prog); > } > + bpf_put_raw_tracepoint(raw_tp->btp); > kfree(raw_tp); > return 0; > } > @@ -1634,13 +1635,15 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) > return -EFAULT; > tp_name[sizeof(tp_name) - 1] = 0; > > - btp = bpf_find_raw_tracepoint(tp_name); > + btp = bpf_get_raw_tracepoint(tp_name); > if (!btp) > return -ENOENT; > > raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); > - if (!raw_tp) > - return -ENOMEM; > + if (!raw_tp) { > + err = -ENOMEM; > + goto out_put_btp; > + } > raw_tp->btp = btp; > > prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, > @@ -1668,6 +1671,8 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) > bpf_prog_put(prog); > out_free_tp: > kfree(raw_tp); > +out_put_btp: > + bpf_put_raw_tracepoint(btp); > return err; > } > > diff --git a/kernel/module.c b/kernel/module.c > index 49a405891587..06ec68f08387 100644 > --- a/kernel/module.c > +++ b/kernel/module.c > @@ -3093,6 +3093,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) > sizeof(*mod->tracepoints_ptrs), > &mod->num_tracepoints); > #endif > +#ifdef CONFIG_BPF_EVENTS > + mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", > + sizeof(*mod->bpf_raw_events), > + &mod->num_bpf_raw_events); > +#endif > #ifdef HAVE_JUMP_LABEL > mod->jump_entries = section_objs(info, "__jump_table", > sizeof(*mod->jump_entries), > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > index 9864a35c8bb5..9ddb6fddb4e0 100644 > --- a/kernel/trace/bpf_trace.c > +++ b/kernel/trace/bpf_trace.c > @@ -17,6 +17,43 @@ > #include "trace_probe.h" > #include "trace.h" > > +#ifdef CONFIG_MODULES > +struct bpf_trace_module { > + struct module *module; > + struct list_head list; > +}; > + > +static LIST_HEAD(bpf_trace_modules); > +static DEFINE_MUTEX(bpf_module_mutex); > + > +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) > +{ > + struct bpf_raw_event_map *btp, *ret = NULL; > + struct bpf_trace_module *btm; > + unsigned int i; > + > + mutex_lock(&bpf_module_mutex); > + list_for_each_entry(btm, &bpf_trace_modules, list) { > + for (i = 0; i < btm->module->num_bpf_raw_events; ++i) { > + btp = &btm->module->bpf_raw_events[i]; > + if (!strcmp(btp->tp->name, name)) { > + if (try_module_get(btm->module)) > + ret = btp; > + goto out; > + } > + } > + } > +out: > + mutex_unlock(&bpf_module_mutex); > + return ret; > +} > +#else > +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) > +{ > + return NULL; > +} > +#endif /* CONFIG_MODULES */ > + > u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); > u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); > > @@ -1076,7 +1113,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) > extern struct bpf_raw_event_map __start__bpf_raw_tp[]; > extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; > > -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) > +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) > { > struct bpf_raw_event_map *btp = __start__bpf_raw_tp; > > @@ -1084,7 +1121,16 @@ struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) > if (!strcmp(btp->tp->name, name)) > return btp; > } > - return NULL; > + > + return bpf_get_raw_tracepoint_module(name); > +} > + > +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) > +{ > + struct module *mod = __module_address((unsigned long)btp); > + > + if (mod) > + module_put(mod); > } > > static __always_inline > @@ -1222,3 +1268,52 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, > > return err; > } > + > +#ifdef CONFIG_MODULES > +int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) > +{ > + struct bpf_trace_module *btm, *tmp; > + struct module *mod = module; > + > + if (mod->num_bpf_raw_events == 0 || > + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) > + return 0; > + > + mutex_lock(&bpf_module_mutex); > + > + switch (op) { > + case MODULE_STATE_COMING: > + btm = kzalloc(sizeof(*btm), GFP_KERNEL); > + if (btm) { > + btm->module = module; > + list_add(&btm->list, &bpf_trace_modules); > + } Is it fine to return 0 on !btm case? Other looks good. > + break; > + case MODULE_STATE_GOING: > + list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) { > + if (btm->module == module) { > + list_del(&btm->list); > + kfree(btm); > + break; > + } > + } > + break; > + } > + > + mutex_unlock(&bpf_module_mutex); > + > + return 0; > +} > + > +static struct notifier_block bpf_module_nb = { > + .notifier_call = bpf_event_notify, > +}; > + > +int __init bpf_event_init(void) > +{ > + register_module_notifier(&bpf_module_nb); > + return 0; > +} > + > +fs_initcall(bpf_event_init); > +#endif /* CONFIG_MODULES */ > -- > 2.17.1 >
On Thu, 2018-12-13 at 19:22 +0000, Martin Lau wrote: > On Wed, Dec 12, 2018 at 04:42:37PM -0800, Matt Mullins wrote: > > Distributions build drivers as modules, including network and filesystem > > drivers which export numerous tracepoints. This enables > > bpf(BPF_RAW_TRACEPOINT_OPEN) to attach to those tracepoints. > > > > Signed-off-by: Matt Mullins <mmullins@fb.com> > > --- > > v1->v2: > > * avoid taking the mutex in bpf_event_notify when op is neither COMING nor > > GOING. > > * check that kzalloc actually succeeded > > > > I didn't try to check list_empty before taking the mutex since I want to avoid > > races between bpf_event_notify and bpf_get_raw_tracepoint. Additionally, > > list_for_each_entry_safe is not strictly necessary upon MODULE_STATE_GOING, but > > Alexei suggested I use it to protect against fragility if the subsequent break; > > eventually disappears. > > > > include/linux/module.h | 4 ++ > > include/linux/trace_events.h | 8 ++- > > kernel/bpf/syscall.c | 11 ++-- > > kernel/module.c | 5 ++ > > kernel/trace/bpf_trace.c | 99 +++++++++++++++++++++++++++++++++++- > > 5 files changed, 120 insertions(+), 7 deletions(-) > > > > diff --git a/include/linux/module.h b/include/linux/module.h > > index fce6b4335e36..5f147dd5e709 100644 > > --- a/include/linux/module.h > > +++ b/include/linux/module.h > > @@ -432,6 +432,10 @@ struct module { > > unsigned int num_tracepoints; > > tracepoint_ptr_t *tracepoints_ptrs; > > #endif > > +#ifdef CONFIG_BPF_EVENTS > > + unsigned int num_bpf_raw_events; > > + struct bpf_raw_event_map *bpf_raw_events; > > +#endif > > #ifdef HAVE_JUMP_LABEL > > struct jump_entry *jump_entries; > > unsigned int num_jump_entries; > > diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h > > index 4130a5497d40..8a62731673f7 100644 > > --- a/include/linux/trace_events.h > > +++ b/include/linux/trace_events.h > > @@ -471,7 +471,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event); > > int perf_event_query_prog_array(struct perf_event *event, void __user *info); > > int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); > > int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); > > -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); > > +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); > > +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); > > int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, > > u32 *fd_type, const char **buf, > > u64 *probe_offset, u64 *probe_addr); > > @@ -502,10 +503,13 @@ static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf > > { > > return -EOPNOTSUPP; > > } > > -static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) > > +static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) > > { > > return NULL; > > } > > +static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) > > +{ > > +} > > static inline int bpf_get_perf_event_info(const struct perf_event *event, > > u32 *prog_id, u32 *fd_type, > > const char **buf, u64 *probe_offset, > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > > index 70fb11106fc2..754370e3155e 100644 > > --- a/kernel/bpf/syscall.c > > +++ b/kernel/bpf/syscall.c > > @@ -1609,6 +1609,7 @@ static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) > > bpf_probe_unregister(raw_tp->btp, raw_tp->prog); > > bpf_prog_put(raw_tp->prog); > > } > > + bpf_put_raw_tracepoint(raw_tp->btp); > > kfree(raw_tp); > > return 0; > > } > > @@ -1634,13 +1635,15 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) > > return -EFAULT; > > tp_name[sizeof(tp_name) - 1] = 0; > > > > - btp = bpf_find_raw_tracepoint(tp_name); > > + btp = bpf_get_raw_tracepoint(tp_name); > > if (!btp) > > return -ENOENT; > > > > raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); > > - if (!raw_tp) > > - return -ENOMEM; > > + if (!raw_tp) { > > + err = -ENOMEM; > > + goto out_put_btp; > > + } > > raw_tp->btp = btp; > > > > prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, > > @@ -1668,6 +1671,8 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) > > bpf_prog_put(prog); > > out_free_tp: > > kfree(raw_tp); > > +out_put_btp: > > + bpf_put_raw_tracepoint(btp); > > return err; > > } > > > > diff --git a/kernel/module.c b/kernel/module.c > > index 49a405891587..06ec68f08387 100644 > > --- a/kernel/module.c > > +++ b/kernel/module.c > > @@ -3093,6 +3093,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) > > sizeof(*mod->tracepoints_ptrs), > > &mod->num_tracepoints); > > #endif > > +#ifdef CONFIG_BPF_EVENTS > > + mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", > > + sizeof(*mod->bpf_raw_events), > > + &mod->num_bpf_raw_events); > > +#endif > > #ifdef HAVE_JUMP_LABEL > > mod->jump_entries = section_objs(info, "__jump_table", > > sizeof(*mod->jump_entries), > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > > index 9864a35c8bb5..9ddb6fddb4e0 100644 > > --- a/kernel/trace/bpf_trace.c > > +++ b/kernel/trace/bpf_trace.c > > @@ -17,6 +17,43 @@ > > #include "trace_probe.h" > > #include "trace.h" > > > > +#ifdef CONFIG_MODULES > > +struct bpf_trace_module { > > + struct module *module; > > + struct list_head list; > > +}; > > + > > +static LIST_HEAD(bpf_trace_modules); > > +static DEFINE_MUTEX(bpf_module_mutex); > > + > > +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) > > +{ > > + struct bpf_raw_event_map *btp, *ret = NULL; > > + struct bpf_trace_module *btm; > > + unsigned int i; > > + > > + mutex_lock(&bpf_module_mutex); > > + list_for_each_entry(btm, &bpf_trace_modules, list) { > > + for (i = 0; i < btm->module->num_bpf_raw_events; ++i) { > > + btp = &btm->module->bpf_raw_events[i]; > > + if (!strcmp(btp->tp->name, name)) { > > + if (try_module_get(btm->module)) > > + ret = btp; > > + goto out; > > + } > > + } > > + } > > +out: > > + mutex_unlock(&bpf_module_mutex); > > + return ret; > > +} > > +#else > > +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) > > +{ > > + return NULL; > > +} > > +#endif /* CONFIG_MODULES */ > > + > > u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); > > u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); > > > > @@ -1076,7 +1113,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) > > extern struct bpf_raw_event_map __start__bpf_raw_tp[]; > > extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; > > > > -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) > > +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) > > { > > struct bpf_raw_event_map *btp = __start__bpf_raw_tp; > > > > @@ -1084,7 +1121,16 @@ struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) > > if (!strcmp(btp->tp->name, name)) > > return btp; > > } > > - return NULL; > > + > > + return bpf_get_raw_tracepoint_module(name); > > +} > > + > > +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) > > +{ > > + struct module *mod = __module_address((unsigned long)btp); > > + > > + if (mod) > > + module_put(mod); > > } > > > > static __always_inline > > @@ -1222,3 +1268,52 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, > > > > return err; > > } > > + > > +#ifdef CONFIG_MODULES > > +int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) > > +{ > > + struct bpf_trace_module *btm, *tmp; > > + struct module *mod = module; > > + > > + if (mod->num_bpf_raw_events == 0 || > > + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) > > + return 0; > > + > > + mutex_lock(&bpf_module_mutex); > > + > > + switch (op) { > > + case MODULE_STATE_COMING: > > + btm = kzalloc(sizeof(*btm), GFP_KERNEL); > > + if (btm) { > > + btm->module = module; > > + list_add(&btm->list, &bpf_trace_modules); > > + } > > Is it fine to return 0 on !btm case? That effectively just means we'll be ignoring tracepoints for a module that is loaded while we can't allocate a bpf_trace_module (24 bytes) to track it. That feels like reasonable behavior to me. > > Other looks good. > > > + break; > > + case MODULE_STATE_GOING: > > + list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) { > > + if (btm->module == module) { > > + list_del(&btm->list); > > + kfree(btm); > > + break; > > + } > > + } > > + break; > > + } > > + > > + mutex_unlock(&bpf_module_mutex); > > + > > + return 0; > > +} > > + > > +static struct notifier_block bpf_module_nb = { > > + .notifier_call = bpf_event_notify, > > +}; > > + > > +int __init bpf_event_init(void) > > +{ > > + register_module_notifier(&bpf_module_nb); > > + return 0; > > +} > > + > > +fs_initcall(bpf_event_init); > > +#endif /* CONFIG_MODULES */ > > -- > > 2.17.1 > >
On Thu, Dec 13, 2018 at 11:38:51AM -0800, Matt Mullins wrote: > On Thu, 2018-12-13 at 19:22 +0000, Martin Lau wrote: > > On Wed, Dec 12, 2018 at 04:42:37PM -0800, Matt Mullins wrote: > > > Distributions build drivers as modules, including network and filesystem > > > drivers which export numerous tracepoints. This enables > > > bpf(BPF_RAW_TRACEPOINT_OPEN) to attach to those tracepoints. > > > Acked-by: Martin KaFai Lau <kafai@fb.com> [ ... ] > > > +#ifdef CONFIG_MODULES > > > +int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) > > > +{ > > > + struct bpf_trace_module *btm, *tmp; > > > + struct module *mod = module; > > > + > > > + if (mod->num_bpf_raw_events == 0 || > > > + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) > > > + return 0; > > > + > > > + mutex_lock(&bpf_module_mutex); > > > + > > > + switch (op) { > > > + case MODULE_STATE_COMING: > > > + btm = kzalloc(sizeof(*btm), GFP_KERNEL); > > > + if (btm) { > > > + btm->module = module; > > > + list_add(&btm->list, &bpf_trace_modules); > > > + } > > > > Is it fine to return 0 on !btm case? > > That effectively just means we'll be ignoring tracepoints for a module > that is loaded while we can't allocate a bpf_trace_module (24 bytes) to > track it. That feels like reasonable behavior to me. ok.
On Wed, Dec 12, 2018 at 04:42:37PM -0800, Matt Mullins wrote: > Distributions build drivers as modules, including network and filesystem > drivers which export numerous tracepoints. This enables > bpf(BPF_RAW_TRACEPOINT_OPEN) to attach to those tracepoints. > > Signed-off-by: Matt Mullins <mmullins@fb.com> > --- > v1->v2: > * avoid taking the mutex in bpf_event_notify when op is neither COMING nor > GOING. > * check that kzalloc actually succeeded Applied, Thanks
diff --git a/include/linux/module.h b/include/linux/module.h index fce6b4335e36..5f147dd5e709 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -432,6 +432,10 @@ struct module { unsigned int num_tracepoints; tracepoint_ptr_t *tracepoints_ptrs; #endif +#ifdef CONFIG_BPF_EVENTS + unsigned int num_bpf_raw_events; + struct bpf_raw_event_map *bpf_raw_events; +#endif #ifdef HAVE_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 4130a5497d40..8a62731673f7 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -471,7 +471,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr); @@ -502,10 +503,13 @@ static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf { return -EOPNOTSUPP; } -static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { return NULL; } +static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) +{ +} static inline int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 70fb11106fc2..754370e3155e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1609,6 +1609,7 @@ static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) bpf_probe_unregister(raw_tp->btp, raw_tp->prog); bpf_prog_put(raw_tp->prog); } + bpf_put_raw_tracepoint(raw_tp->btp); kfree(raw_tp); return 0; } @@ -1634,13 +1635,15 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) return -EFAULT; tp_name[sizeof(tp_name) - 1] = 0; - btp = bpf_find_raw_tracepoint(tp_name); + btp = bpf_get_raw_tracepoint(tp_name); if (!btp) return -ENOENT; raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); - if (!raw_tp) - return -ENOMEM; + if (!raw_tp) { + err = -ENOMEM; + goto out_put_btp; + } raw_tp->btp = btp; prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, @@ -1668,6 +1671,8 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) bpf_prog_put(prog); out_free_tp: kfree(raw_tp); +out_put_btp: + bpf_put_raw_tracepoint(btp); return err; } diff --git a/kernel/module.c b/kernel/module.c index 49a405891587..06ec68f08387 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3093,6 +3093,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->tracepoints_ptrs), &mod->num_tracepoints); #endif +#ifdef CONFIG_BPF_EVENTS + mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", + sizeof(*mod->bpf_raw_events), + &mod->num_bpf_raw_events); +#endif #ifdef HAVE_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", sizeof(*mod->jump_entries), diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9864a35c8bb5..9ddb6fddb4e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,6 +17,43 @@ #include "trace_probe.h" #include "trace.h" +#ifdef CONFIG_MODULES +struct bpf_trace_module { + struct module *module; + struct list_head list; +}; + +static LIST_HEAD(bpf_trace_modules); +static DEFINE_MUTEX(bpf_module_mutex); + +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) +{ + struct bpf_raw_event_map *btp, *ret = NULL; + struct bpf_trace_module *btm; + unsigned int i; + + mutex_lock(&bpf_module_mutex); + list_for_each_entry(btm, &bpf_trace_modules, list) { + for (i = 0; i < btm->module->num_bpf_raw_events; ++i) { + btp = &btm->module->bpf_raw_events[i]; + if (!strcmp(btp->tp->name, name)) { + if (try_module_get(btm->module)) + ret = btp; + goto out; + } + } + } +out: + mutex_unlock(&bpf_module_mutex); + return ret; +} +#else +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) +{ + return NULL; +} +#endif /* CONFIG_MODULES */ + u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -1076,7 +1113,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) extern struct bpf_raw_event_map __start__bpf_raw_tp[]; extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { struct bpf_raw_event_map *btp = __start__bpf_raw_tp; @@ -1084,7 +1121,16 @@ struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) if (!strcmp(btp->tp->name, name)) return btp; } - return NULL; + + return bpf_get_raw_tracepoint_module(name); +} + +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) +{ + struct module *mod = __module_address((unsigned long)btp); + + if (mod) + module_put(mod); } static __always_inline @@ -1222,3 +1268,52 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, return err; } + +#ifdef CONFIG_MODULES +int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) +{ + struct bpf_trace_module *btm, *tmp; + struct module *mod = module; + + if (mod->num_bpf_raw_events == 0 || + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + return 0; + + mutex_lock(&bpf_module_mutex); + + switch (op) { + case MODULE_STATE_COMING: + btm = kzalloc(sizeof(*btm), GFP_KERNEL); + if (btm) { + btm->module = module; + list_add(&btm->list, &bpf_trace_modules); + } + break; + case MODULE_STATE_GOING: + list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) { + if (btm->module == module) { + list_del(&btm->list); + kfree(btm); + break; + } + } + break; + } + + mutex_unlock(&bpf_module_mutex); + + return 0; +} + +static struct notifier_block bpf_module_nb = { + .notifier_call = bpf_event_notify, +}; + +int __init bpf_event_init(void) +{ + register_module_notifier(&bpf_module_nb); + return 0; +} + +fs_initcall(bpf_event_init); +#endif /* CONFIG_MODULES */
Distributions build drivers as modules, including network and filesystem drivers which export numerous tracepoints. This enables bpf(BPF_RAW_TRACEPOINT_OPEN) to attach to those tracepoints. Signed-off-by: Matt Mullins <mmullins@fb.com> --- v1->v2: * avoid taking the mutex in bpf_event_notify when op is neither COMING nor GOING. * check that kzalloc actually succeeded I didn't try to check list_empty before taking the mutex since I want to avoid races between bpf_event_notify and bpf_get_raw_tracepoint. Additionally, list_for_each_entry_safe is not strictly necessary upon MODULE_STATE_GOING, but Alexei suggested I use it to protect against fragility if the subsequent break; eventually disappears. include/linux/module.h | 4 ++ include/linux/trace_events.h | 8 ++- kernel/bpf/syscall.c | 11 ++-- kernel/module.c | 5 ++ kernel/trace/bpf_trace.c | 99 +++++++++++++++++++++++++++++++++++- 5 files changed, 120 insertions(+), 7 deletions(-)