Message ID | 20190116162931.1542429-2-songliubraving@fb.com |
---|---|
State | Changes Requested |
Delegated to: | BPF Maintainers |
Series | reveal invisible bpf programs
On Wed, Jan 16, 2019 at 08:29:23AM -0800, Song Liu wrote:

> +/* callback function to generate ksymbol name */
> +typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
> +extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
> +				bool unregister,
> +				perf_ksymbol_get_name_f get_name, void *data);
> +

I'm not liking the getname thing.. I know that's what BPF does, but can
we please do that in the caller or something?

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1114,11 +1114,8 @@ static inline void perf_event_task_sched
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
 
-/* callback function to generate ksymbol name */
-typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
 extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
-			       bool unregister,
-			       perf_ksymbol_get_name_f get_name, void *data);
+			       bool unregister, const char *name);
 
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -1341,11 +1338,8 @@ static inline int perf_unregister_guest_
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 
-typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
 static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
-				      bool unregister,
-				      perf_ksymbol_get_name_f get_name,
-				      void *data)			{ }
+				      bool unregister, const char *name) { }
 static inline void perf_event_exec(void)				{ }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
 static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7698,8 +7698,7 @@ static void perf_event_ksymbol_output(st
 	perf_output_end(&handle);
 }
 
-void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
-			perf_ksymbol_get_name_f get_name, void *data)
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym)
 {
 	struct perf_ksymbol_event ksymbol_event;
 	char name[KSYM_NAME_LEN];
@@ -7713,7 +7712,7 @@ void perf_event_ksymbol(u16 ksym_type, u
 	    ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
 		goto err;
 
-	get_name(name, KSYM_NAME_LEN, data);
+	strlcpy(name, sym, KSYM_NAME_LEN);
 	name_len = strlen(name) + 1;
 	while (!IS_ALIGNED(name_len, sizeof(u64)))
 		name[name_len++] = '\0';
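For illustration, the caller-side counterpart of that change could look roughly like the sketch below. It is not part of the posted patches; it simply reuses the bpf_get_prog_name()/bpf_get_prog_addr_region() helpers that already live in kernel/bpf/core.c and elides the existing kallsyms bookkeeping.

/*
 * Sketch only (not from the posted series): build the symbol name in
 * the BPF caller once perf_event_ksymbol() takes a plain string.
 */
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
	unsigned long sym_start, sym_end;
	char sym[KSYM_NAME_LEN];

	if (!bpf_prog_kallsyms_candidate(fp))
		return;

	bpf_get_prog_addr_region(fp, &sym_start, &sym_end);
	bpf_get_prog_name(fp, sym);		/* "bpf_prog_<tag>[_<name>]" */

	/* ... existing ksym_tnode/ksym_lnode insertion elided ... */

	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, sym_start,
			   sym_end - sym_start, false, sym);
}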
On Wed, Jan 16, 2019 at 08:29:23AM -0800, Song Liu wrote: > For better performance analysis of dynamically JITed and loaded kernel > functions, such as BPF programs, this patch introduces > PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol > register/unregister information to user space. > > The following data structure is used for PERF_RECORD_KSYMBOL. > > /* > * struct { > * struct perf_event_header header; > * u64 addr; > * u32 len; > * u16 ksym_type; > * u16 flags; > * char name[]; > * struct sample_id sample_id; > * }; > */ So I've cobbled together the attached patches to see how it would work out.. I didn't convert ftrace trampolines; because ftrace code has this uncanny ability to make my head hurt. But I don't think it should be hard, once one figures out the right structure to stick that kallsym_node thing in (ftrace_ops ?!). It is compiled only, so no testing what so ever (also, no changelogs). I didn't wire up the KSYM_TYPE thing; I'm wondering if we really need that, OTOH it really doesn't hurt having it either. One weird thing I noticed, wth does bpf_prog_kallsyms_add() check CAP_ADMIN ?! Surely even a non-priv JIT'ed program generates symbols, why hide those? Anyway; with the one nit about the get_names() thing sorted: Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> (thanks for sticking with this) Subject: From: Peter Zijlstra <peterz@infradead.org> Date: Thu Jan 17 11:41:01 CET 2019 Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> --- include/linux/rbtree_latch.h | 48 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) --- a/include/linux/rbtree_latch.h +++ b/include/linux/rbtree_latch.h @@ -211,4 +211,52 @@ latch_tree_find(void *key, struct latch_ return node; } +static __always_inline struct latch_tree_node * +latch_tree_first(struct latch_tree_root *root) +{ + struct latch_tree_node *ltn = NULL; + struct rb_node *node; + unsigned int seq; + + do { + struct rb_root *rbr; + + seq = raw_read_seqcount_latch(&root->seq); + rbr = &root->tree[seq & 1]; + node = rb_first(rbr); + } while (read_seqcount_retry(&root->seq, seq)); + + if (node) + ltn = __lt_from_rb(node, seq & 1); + + return ltn; +} + +/** + * latch_tree_next() - find the next @ltn in @root per sort order + * @root: trees to which @ltn belongs + * @ltn: nodes to start from + * + * Does a lockless lookup in the trees @root for the next node starting at + * @ltn. + * + * Using this function outside of the write side lock is rather dodgy but given + * latch_tree_erase() doesn't re-init the nodes and the whole iteration is done + * under a single RCU critical section, it should be non-fatal and generate some + * semblance of order - albeit possibly missing chunks of the tree. 
+ */ +static __always_inline struct latch_tree_node * +latch_tree_next(struct latch_tree_root *root, struct latch_tree_node *ltn) +{ + struct rb_node *node; + unsigned int seq; + + do { + seq = raw_read_seqcount_latch(&root->seq); + node = rb_next(<n->node[seq & 1]); + } while (read_seqcount_retry(&root->seq, seq)); + + return __lt_from_rb(node, seq & 1); +} + #endif /* RB_TREE_LATCH_H */ Subject: From: Peter Zijlstra <peterz@infradead.org> Date: Thu Jan 17 11:18:21 CET 2019 Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> --- include/linux/kallsyms.h | 14 +++ kernel/extable.c | 2 kernel/kallsyms.c | 187 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 202 insertions(+), 1 deletion(-) --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -11,6 +11,7 @@ #include <linux/stddef.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/rbtree_latch.h> #include <asm/sections.h> @@ -20,6 +21,19 @@ struct module; +struct kallsym_node +{ + struct latch_tree_node kn_node; + unsigned long kn_addr; + unsigned long kn_len; + void (*kn_names)(struct kallsym_node *kn, char *sym_name, char **mod_name); +}; + +extern void kallsym_tree_add(struct kallsym_node *kn); +extern void kallsym_tree_del(struct kallsym_node *kn); + +extern bool is_kallsym_tree_text_address(unsigned long addr); + static inline int is_kernel_inittext(unsigned long addr) { if (addr >= (unsigned long)_sinittext --- a/kernel/extable.c +++ b/kernel/extable.c @@ -145,6 +145,8 @@ int kernel_text_address(unsigned long ad if (is_module_text_address(addr)) goto out; + if (is_kallsym_tree_text_address(addr)) + goto out; if (is_ftrace_trampoline(addr)) goto out; if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -24,6 +24,8 @@ #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/compiler.h> +#include <linux/spinlock.h> +#include <linux/perf_event.h> /* * These will be re-linked against their real values @@ -48,6 +50,164 @@ extern const u16 kallsyms_token_index[] extern const unsigned int kallsyms_markers[] __weak; +static DEFINE_SPINLOCK(kallsym_lock); +static struct latch_tree_root kallsym_tree __cacheline_aligned; + +static __always_inline unsigned long +kallsym_node_addr(struct latch_tree_node *node) +{ + return ((struct kallsym_node *)node)->kn_addr; +} + +static __always_inline bool kallsym_tree_less(struct latch_tree_node *a, + struct latch_tree_node *b) +{ + return kallsym_node_addr(a) < kallsym_node_addr(b); +} + +static __always_inline int kallsym_tree_comp(void *key, + struct latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long sym_start, sym_end; + const struct kallsym_node *kn; + + kn = container_of(n, struct kallsym_node, kn_node); + sym_start = kn->kn_addr; + sym_end = sym_start + kn->kn_len; + + if (val < sym_start) + return -1; + if (val >= sym_end) + return 1; + + return 0; +} + +static const struct latch_tree_ops kallsym_tree_ops = { + .less = kallsym_tree_less, + .comp = kallsym_tree_comp, +}; + +void kallsym_tree_add(struct kallsym_node *kn) +{ + char namebuf[KSYM_NAME_LEN] = ""; + char *modname = NULL; + + spin_lock_irq(&kallsym_lock); + latch_tree_insert(&kn->kn_node, &kallsym_tree, &kallsym_tree_ops); + spin_unlock_irq(&kallsym_lock); + + kn->kn_names(kn, namebuf, &modname); + + if (modname) { + int len = strlen(namebuf); + + snprintf(namebuf + len, sizeof(namebuf) - len, " [%s]", modname); + } + + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_UNKNOWN, + kn->kn_addr, 
kn->kn_len, false, namebuf); +} + +void kallsym_tree_del(struct kallsym_node *kn) +{ + char namebuf[KSYM_NAME_LEN] = ""; + char *modname = NULL; + + kn->kn_names(kn, namebuf, &modname); + + if (modname) { + int len = strlen(namebuf); + + snprintf(namebuf + len, sizeof(namebuf) - len, " [%s]", modname); + } + + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_UNKNOWN, + kn->kn_addr, kn->kn_len, true, namebuf); + + spin_lock_irq(&kallsym_lock); + latch_tree_erase(&kn->kn_node, &kallsym_tree, &kallsym_tree_ops); + spin_unlock_irq(&kallsym_lock); +} + +static struct kallsym_node *kallsym_tree_find(unsigned long addr) +{ + struct kallsym_node *kn = NULL; + struct latch_tree_node *n; + + n = latch_tree_find((void *)addr, &kallsym_tree, &kallsym_tree_ops); + if (n) + kn = container_of(n, struct kallsym_node, kn_node); + + return kn; +} + +static char *kallsym_tree_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char **modname, char *sym) +{ + struct kallsym_node *kn; + char *ret = NULL; + + rcu_read_lock(); + kn = kallsym_tree_find(addr); + if (kn) { + kn->kn_names(kn, sym, modname); + + ret = sym; + if (size) + *size = kn->kn_len; + if (off) + *off = addr - kn->kn_addr; + } + rcu_read_unlock(); + + return ret; +} + +bool is_kallsym_tree_text_address(unsigned long addr) +{ + bool ret; + + rcu_read_lock(); + ret = kallsym_tree_find(addr) != NULL; + rcu_read_unlock(); + + return ret; +} + +static int kallsym_tree_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym, char *modname, int *exported) +{ + struct latch_tree_node *ltn; + int i, ret = -ERANGE; + + rcu_read_lock(); + for (i = 0, ltn = latch_tree_first(&kallsym_tree); i < symnum && ltn; + i++, ltn = latch_tree_next(&kallsym_tree, ltn)) + ; + + if (ltn) { + struct kallsym_node *kn; + char *mod; + + kn = container_of(ltn, struct kallsym_node, kn_node); + + kn->kn_names(kn, sym, &mod); + if (mod) + strlcpy(modname, mod, MODULE_NAME_LEN); + else + modname[0] = '\0'; + + *type = 't'; + *exported = 0; + ret = 0; + } + rcu_read_unlock(); + + return ret; +} + /* * Expand a compressed symbol data into the resulting uncompressed string, * if uncompressed string is too long (>= maxlen), it will be truncated, @@ -265,6 +425,7 @@ int kallsyms_lookup_size_offset(unsigned if (is_ksym_addr(addr)) return !!get_symbol_pos(addr, symbolsize, offset); return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || + !!kallsym_tree_address_lookup(addr, symbolsize, offset, NULL, namebuf) || !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } @@ -301,6 +462,10 @@ const char *kallsyms_lookup(unsigned lon ret = module_address_lookup(addr, symbolsize, offset, modname, namebuf); if (!ret) + ret = kallsym_tree_address_lookup(addr, symbolsize, + offset, modname, namebuf); + + if (!ret) ret = bpf_address_lookup(addr, symbolsize, offset, modname, namebuf); @@ -434,6 +599,7 @@ struct kallsym_iter { loff_t pos; loff_t pos_arch_end; loff_t pos_mod_end; + loff_t pos_tree_end; loff_t pos_ftrace_mod_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. 
*/ @@ -478,9 +644,24 @@ static int get_ksymbol_mod(struct kallsy return 1; } +static int get_ksymbol_tree(struct kallsym_iter *iter) +{ + int ret = kallsym_tree_kallsym(iter->pos - iter->pos_mod_end, + &iter->value, &iter->type, + iter->name, iter->module_name, + &iter->exported); + + if (ret < 0) { + iter->pos_tree_end = iter->pos; + return 0; + } + + return 1; +} + static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) { - int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, + int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_tree_end, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); @@ -545,6 +726,10 @@ static int update_iter_mod(struct kallsy get_ksymbol_mod(iter)) return 1; + if ((!iter->pos_tree_end || iter->pos_tree_end > pos) && + get_ksymbol_tree(iter)) + return 1; + if ((!iter->pos_ftrace_mod_end || iter->pos_ftrace_mod_end > pos) && get_ksymbol_ftrace_mod(iter)) return 1; Subject: From: Peter Zijlstra <peterz@infradead.org> Date: Thu Jan 17 13:19:25 CET 2019 Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> --- include/linux/bpf.h | 7 +- include/linux/filter.h | 42 ------------ kernel/bpf/core.c | 164 ++++--------------------------------------------- kernel/extable.c | 4 - kernel/kallsyms.c | 19 ----- 5 files changed, 22 insertions(+), 214 deletions(-) --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -13,7 +13,7 @@ #include <linux/file.h> #include <linux/percpu.h> #include <linux/err.h> -#include <linux/rbtree_latch.h> +#include <linux/kallsyms.h> #include <linux/numa.h> #include <linux/wait.h> @@ -307,8 +307,9 @@ struct bpf_prog_aux { bool offload_requested; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ - struct latch_tree_node ksym_tnode; - struct list_head ksym_lnode; + + struct kallsym_node ktn; + const struct bpf_prog_ops *ops; struct bpf_map **used_maps; struct bpf_prog *prog; --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -932,23 +932,6 @@ static inline bool bpf_jit_kallsyms_enab return false; } -const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char *sym); -bool is_bpf_text_address(unsigned long addr); -int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *sym); - -static inline const char * -bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym) -{ - const char *ret = __bpf_address_lookup(addr, size, off, sym); - - if (ret && modname) - *modname = NULL; - return ret; -} - void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); @@ -974,31 +957,6 @@ static inline bool bpf_jit_kallsyms_enab return false; } -static inline const char * -__bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char *sym) -{ - return NULL; -} - -static inline bool is_bpf_text_address(unsigned long addr) -{ - return false; -} - -static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *sym) -{ - return -ERANGE; -} - -static inline const char * -bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym) -{ - return NULL; -} - static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) { } --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -30,7 +30,6 @@ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/frame.h> -#include <linux/rbtree_latch.h> #include <linux/kallsyms.h> #include 
<linux/rcupdate.h> #include <linux/perf_event.h> @@ -100,8 +99,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); - INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); - return fp; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); @@ -530,86 +527,35 @@ static void bpf_get_prog_name(const stru *sym = 0; } -static __always_inline unsigned long -bpf_get_prog_addr_start(struct latch_tree_node *n) -{ - unsigned long symbol_start, symbol_end; - const struct bpf_prog_aux *aux; - - aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); - - return symbol_start; -} - -static __always_inline bool bpf_tree_less(struct latch_tree_node *a, - struct latch_tree_node *b) -{ - return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); -} - -static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) -{ - unsigned long val = (unsigned long)key; - unsigned long symbol_start, symbol_end; - const struct bpf_prog_aux *aux; - - aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); - - if (val < symbol_start) - return -1; - if (val >= symbol_end) - return 1; - - return 0; -} - -static const struct latch_tree_ops bpf_tree_ops = { - .less = bpf_tree_less, - .comp = bpf_tree_comp, -}; - -static DEFINE_SPINLOCK(bpf_lock); -static LIST_HEAD(bpf_kallsyms); -static struct latch_tree_root bpf_tree __cacheline_aligned; - -static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) -{ - WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); - list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); - latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); -} - -static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) -{ - if (list_empty(&aux->ksym_lnode)) - return; - - latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); - list_del_rcu(&aux->ksym_lnode); -} static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) { return fp->jited && !bpf_prog_was_classic(fp); } -static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) +static void bpf_kn_names(struct kallsym_node *kn, char *sym, char **modname) { - return list_empty(&fp->aux->ksym_lnode) || - fp->aux->ksym_lnode.prev == LIST_POISON2; + struct bpf_prog_aux *aux = container_of(kn, struct bpf_prog_aux, ktn); + + *modname = "eBPF-jit"; + bpf_get_prog_name(aux->prog, sym); } void bpf_prog_kallsyms_add(struct bpf_prog *fp) { + unsigned long sym_start, sym_end; + if (!bpf_prog_kallsyms_candidate(fp) || !capable(CAP_SYS_ADMIN)) return; - spin_lock_bh(&bpf_lock); - bpf_prog_ksym_node_add(fp->aux); - spin_unlock_bh(&bpf_lock); + bpf_get_prog_addr_region(fp, &sym_start, &sym_end); + + fp->aux->ktn.kn_addr = sym_start; + fp->aux->ktn.kn_len = sym_end - sym_start; + fp->aux->ktn.kn_names = bpf_kn_names; + + kallsym_tree_add(&fp->aux->ktn); } void bpf_prog_kallsyms_del(struct bpf_prog *fp) @@ -617,85 +563,7 @@ void bpf_prog_kallsyms_del(struct bpf_pr if (!bpf_prog_kallsyms_candidate(fp)) return; - spin_lock_bh(&bpf_lock); - bpf_prog_ksym_node_del(fp->aux); - spin_unlock_bh(&bpf_lock); -} - -static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) -{ - struct latch_tree_node *n; - - if (!bpf_jit_kallsyms_enabled()) - return NULL; - - n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); - return n ? 
- container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : - NULL; -} - -const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char *sym) -{ - unsigned long symbol_start, symbol_end; - struct bpf_prog *prog; - char *ret = NULL; - - rcu_read_lock(); - prog = bpf_prog_kallsyms_find(addr); - if (prog) { - bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); - bpf_get_prog_name(prog, sym); - - ret = sym; - if (size) - *size = symbol_end - symbol_start; - if (off) - *off = addr - symbol_start; - } - rcu_read_unlock(); - - return ret; -} - -bool is_bpf_text_address(unsigned long addr) -{ - bool ret; - - rcu_read_lock(); - ret = bpf_prog_kallsyms_find(addr) != NULL; - rcu_read_unlock(); - - return ret; -} - -int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *sym) -{ - struct bpf_prog_aux *aux; - unsigned int it = 0; - int ret = -ERANGE; - - if (!bpf_jit_kallsyms_enabled()) - return ret; - - rcu_read_lock(); - list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { - if (it++ != symnum) - continue; - - bpf_get_prog_name(aux->prog, sym); - - *value = (unsigned long)aux->prog->bpf_func; - *type = BPF_SYM_ELF_TYPE; - - ret = 0; - break; - } - rcu_read_unlock(); - - return ret; + kallsym_tree_del(&fp->aux->ktn); } static atomic_long_t bpf_jit_current; @@ -806,8 +674,6 @@ void __weak bpf_jit_free(struct bpf_prog bpf_jit_binary_unlock_ro(hdr); bpf_jit_binary_free(hdr); - - WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } bpf_prog_unlock_free(fp); --- a/kernel/extable.c +++ b/kernel/extable.c @@ -135,7 +135,7 @@ int kernel_text_address(unsigned long ad * coming back from idle, or cpu on or offlining. * * is_module_text_address() as well as the kprobe slots - * and is_bpf_text_address() require RCU to be watching. + * and is_kallsym_tree_text_address() require RCU to be watching. */ no_rcu = !rcu_is_watching(); @@ -151,8 +151,6 @@ int kernel_text_address(unsigned long ad goto out; if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) goto out; - if (is_bpf_text_address(addr)) - goto out; ret = 0; out: if (no_rcu) --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -425,8 +425,7 @@ int kallsyms_lookup_size_offset(unsigned if (is_ksym_addr(addr)) return !!get_symbol_pos(addr, symbolsize, offset); return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || - !!kallsym_tree_address_lookup(addr, symbolsize, offset, NULL, namebuf) || - !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); + !!kallsym_tree_address_lookup(addr, symbolsize, offset, NULL, namebuf); } /* @@ -464,11 +463,6 @@ const char *kallsyms_lookup(unsigned lon if (!ret) ret = kallsym_tree_address_lookup(addr, symbolsize, offset, modname, namebuf); - - if (!ret) - ret = bpf_address_lookup(addr, symbolsize, - offset, modname, namebuf); - if (!ret) ret = ftrace_mod_address_lookup(addr, symbolsize, offset, modname, namebuf); @@ -673,15 +667,6 @@ static int get_ksymbol_ftrace_mod(struct return 1; } -static int get_ksymbol_bpf(struct kallsym_iter *iter) -{ - iter->module_name[0] = '\0'; - iter->exported = 0; - return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, - &iter->value, &iter->type, - iter->name) < 0 ? 0 : 1; -} - /* Returns space to next name. */ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { @@ -734,7 +719,7 @@ static int update_iter_mod(struct kallsy get_ksymbol_ftrace_mod(iter)) return 1; - return get_ksymbol_bpf(iter); + return 0; } /* Returns false if pos at or past end of file. */
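As a usage sketch of the interface introduced above (not part of the patches; the trampoline structure and all "my_*" names are invented for illustration), a future caller such as an ftrace trampoline would embed a kallsym_node, provide a kn_names() callback, and register/unregister it:

/*
 * Sketch: hooking a hypothetical JIT/trampoline descriptor into the
 * proposed kallsym_node interface.  Only kallsym_tree_add()/del() and
 * the kn_* fields come from the patch above.
 */
struct my_trampoline {
	unsigned long		addr;
	unsigned long		size;
	struct kallsym_node	kn;
};

static void my_tramp_names(struct kallsym_node *kn, char *sym, char **modname)
{
	*modname = NULL;		/* report as a core-kernel symbol */
	strlcpy(sym, "my_trampoline", KSYM_NAME_LEN);
}

static void my_tramp_register(struct my_trampoline *t)
{
	t->kn.kn_addr  = t->addr;
	t->kn.kn_len   = t->size;
	t->kn.kn_names = my_tramp_names;

	/* inserts into the latch tree and emits the ksymbol event */
	kallsym_tree_add(&t->kn);
}

static void my_tramp_unregister(struct my_trampoline *t)
{
	kallsym_tree_del(&t->kn);
}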
> On Jan 17, 2019, at 4:56 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Wed, Jan 16, 2019 at 08:29:23AM -0800, Song Liu wrote:
>> For better performance analysis of dynamically JITed and loaded kernel
>> functions, such as BPF programs, this patch introduces
>> PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol
>> register/unregister information to user space.
>> 
>> The following data structure is used for PERF_RECORD_KSYMBOL.
>> 
>> /*
>>  * struct {
>>  *	struct perf_event_header header;
>>  *	u64 addr;
>>  *	u32 len;
>>  *	u16 ksym_type;
>>  *	u16 flags;
>>  *	char name[];
>>  *	struct sample_id sample_id;
>>  * };
>>  */
> 
> So I've cobbled together the attached patches to see how it would work
> out..
> 
> I didn't convert ftrace trampolines; because ftrace code has this
> uncanny ability to make my head hurt. But I don't think it should be
> hard, once one figures out the right structure to stick that
> kallsym_node thing in (ftrace_ops ?!).
> 
> It is compiled only, so no testing what so ever (also, no changelogs).
> 
> I didn't wire up the KSYM_TYPE thing; I'm wondering if we really need
> that, OTOH it really doesn't hurt having it either.
> 
> One weird thing I noticed, wth does bpf_prog_kallsyms_add() check
> CAP_ADMIN ?! Surely even a non-priv JIT'ed program generates symbols,
> why hide those?
> 
> Anyway; with the one nit about the get_names() thing sorted:
> 
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> (thanks for sticking with this)
> <peterz-latch-next.patch><peterz-kallsym.patch><peterz-kallsym-bpf.patch>

Aha, now I get the point on perf_event_ksymbol(). Yeah this approach is
definitely better.

While I run more tests with these patches, could we get the current set
into perf/core? This will enable the development of user space tools
like bcc.

Also, I currently base this set on the bpf-next tree, as tip/perf/core
is 4 weeks old. Shall I rebase the set on Linus' tree? Or shall I wait
for tip/perf/core?

Thanks again!
Song
Em Thu, Jan 17, 2019 at 02:49:10PM +0000, Song Liu escreveu:
> 
> 
> > On Jan 17, 2019, at 4:56 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > On Wed, Jan 16, 2019 at 08:29:23AM -0800, Song Liu wrote:
> >> For better performance analysis of dynamically JITed and loaded kernel
> >> functions, such as BPF programs, this patch introduces
> >> PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol
> >> register/unregister information to user space.
> >> 
> >> The following data structure is used for PERF_RECORD_KSYMBOL.
> >> 
> >> /*
> >>  * struct {
> >>  *	struct perf_event_header header;
> >>  *	u64 addr;
> >>  *	u32 len;
> >>  *	u16 ksym_type;
> >>  *	u16 flags;
> >>  *	char name[];
> >>  *	struct sample_id sample_id;
> >>  * };
> >>  */
> > 
> > So I've cobbled together the attached patches to see how it would work
> > out..
> > 
> > I didn't convert ftrace trampolines; because ftrace code has this
> > uncanny ability to make my head hurt. But I don't think it should be
> > hard, once one figures out the right structure to stick that
> > kallsym_node thing in (ftrace_ops ?!).
> > 
> > It is compiled only, so no testing what so ever (also, no changelogs).
> > 
> > I didn't wire up the KSYM_TYPE thing; I'm wondering if we really need
> > that, OTOH it really doesn't hurt having it either.
> > 
> > One weird thing I noticed, wth does bpf_prog_kallsyms_add() check
> > CAP_ADMIN ?! Surely even a non-priv JIT'ed program generates symbols,
> > why hide those?
> > 
> > Anyway; with the one nit about the get_names() thing sorted:
> > 
> > Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > 
> > (thanks for sticking with this)
> > <peterz-latch-next.patch><peterz-kallsym.patch><peterz-kallsym-bpf.patch>
> 
> Aha, now I get the point on perf_event_ksymbol(). Yeah this approach is
> definitely better.
> 
> While I run more tests with these patches, could we get the current set
> into perf/core? This will enable the development of user space tools
> like bcc.
> 
> Also, I currently base this set on the bpf-next tree, as tip/perf/core
> is 4 weeks old. Shall I rebase the set on Linus' tree? Or shall I wait
> for tip/perf/core?

So, can you post one last set, this time with PeterZ's Acked-by,
assuming you're sorting out the get_names() thing, and I can try merging
this into my perf/core branch, then pushing it out to Ingo, my perf/core
starts from tip/perf/urgent, so should be new enough.

I'd then right after testing it send a pull request to Ingo, synching
everything.

- Arnaldo
> On Jan 17, 2019, at 6:58 AM, Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> 
> Em Thu, Jan 17, 2019 at 02:49:10PM +0000, Song Liu escreveu:
>> 
>> 
>>> On Jan 17, 2019, at 4:56 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>> 
>>> On Wed, Jan 16, 2019 at 08:29:23AM -0800, Song Liu wrote:
>>>> For better performance analysis of dynamically JITed and loaded kernel
>>>> functions, such as BPF programs, this patch introduces
>>>> PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol
>>>> register/unregister information to user space.
>>>> 
>>>> The following data structure is used for PERF_RECORD_KSYMBOL.
>>>> 
>>>> /*
>>>>  * struct {
>>>>  *	struct perf_event_header header;
>>>>  *	u64 addr;
>>>>  *	u32 len;
>>>>  *	u16 ksym_type;
>>>>  *	u16 flags;
>>>>  *	char name[];
>>>>  *	struct sample_id sample_id;
>>>>  * };
>>>>  */
>>> 
>>> So I've cobbled together the attached patches to see how it would work
>>> out..
>>> 
>>> I didn't convert ftrace trampolines; because ftrace code has this
>>> uncanny ability to make my head hurt. But I don't think it should be
>>> hard, once one figures out the right structure to stick that
>>> kallsym_node thing in (ftrace_ops ?!).
>>> 
>>> It is compiled only, so no testing what so ever (also, no changelogs).
>>> 
>>> I didn't wire up the KSYM_TYPE thing; I'm wondering if we really need
>>> that, OTOH it really doesn't hurt having it either.
>>> 
>>> One weird thing I noticed, wth does bpf_prog_kallsyms_add() check
>>> CAP_ADMIN ?! Surely even a non-priv JIT'ed program generates symbols,
>>> why hide those?
>>> 
>>> Anyway; with the one nit about the get_names() thing sorted:
>>> 
>>> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>>> 
>>> (thanks for sticking with this)
>>> <peterz-latch-next.patch><peterz-kallsym.patch><peterz-kallsym-bpf.patch>
>> 
>> Aha, now I get the point on perf_event_ksymbol(). Yeah this approach is
>> definitely better.
>> 
>> While I run more tests with these patches, could we get the current set
>> into perf/core? This will enable the development of user space tools
>> like bcc.
>> 
>> Also, I currently base this set on the bpf-next tree, as tip/perf/core
>> is 4 weeks old. Shall I rebase the set on Linus' tree? Or shall I wait
>> for tip/perf/core?
> 
> So, can you post one last set, this time with PeterZ's Acked-by,
> assuming you're sorting out the get_names() thing, and I can try merging
> this into my perf/core branch, then pushing it out to Ingo, my perf/core
> starts from tip/perf/urgent, so should be new enough.
> 
> I'd then right after testing it send a pull request to Ingo, synching
> everything.
> 
> - Arnaldo

Thanks Arnaldo! I will send it soon.

Song
On Thu, Jan 17, 2019 at 11:58:54AM -0300, Arnaldo Carvalho de Melo wrote:
> So, can you post one last set, this time with PeterZ's Acked-by,
> assuming you're sorting out the get_names() thing, and I can try merging
> this into my perf/core branch, then pushing it out to Ingo, my perf/core
> starts from tip/perf/urgent, so should be new enough.

Works for me; thanks Arnaldo!
On Thu, Jan 17, 2019 at 01:56:53PM +0100, Peter Zijlstra wrote:
> +static __always_inline struct latch_tree_node *
> +latch_tree_first(struct latch_tree_root *root)
> +{
> +	struct latch_tree_node *ltn = NULL;
> +	struct rb_node *node;
> +	unsigned int seq;
> +
> +	do {
> +		struct rb_root *rbr;
> +
> +		seq = raw_read_seqcount_latch(&root->seq);
> +		rbr = &root->tree[seq & 1];
> +		node = rb_first(rbr);
> +	} while (read_seqcount_retry(&root->seq, seq));
> +
> +	if (node)
> +		ltn = __lt_from_rb(node, seq & 1);
> +
> +	return ltn;
> +}
> +
> +/**
> + * latch_tree_next() - find the next @ltn in @root per sort order
> + * @root: trees to which @ltn belongs
> + * @ltn: nodes to start from
> + *
> + * Does a lockless lookup in the trees @root for the next node starting at
> + * @ltn.
> + *
> + * Using this function outside of the write side lock is rather dodgy but given
> + * latch_tree_erase() doesn't re-init the nodes and the whole iteration is done
> + * under a single RCU critical section, it should be non-fatal and generate some
> + * semblance of order - albeit possibly missing chunks of the tree.
> + */
> +static __always_inline struct latch_tree_node *
> +latch_tree_next(struct latch_tree_root *root, struct latch_tree_node *ltn)
> +{
> +	struct rb_node *node;
> +	unsigned int seq;
> +
> +	do {
> +		seq = raw_read_seqcount_latch(&root->seq);
> +		node = rb_next(&ltn->node[seq & 1]);
> +	} while (read_seqcount_retry(&root->seq, seq));
> +
> +	return __lt_from_rb(node, seq & 1);
> +}
> +static int kallsym_tree_kallsym(unsigned int symnum, unsigned long *value, char *type,
> +				char *sym, char *modname, int *exported)
> +{
> +	struct latch_tree_node *ltn;
> +	int i, ret = -ERANGE;
> +
> +	rcu_read_lock();
> +	for (i = 0, ltn = latch_tree_first(&kallsym_tree); i < symnum && ltn;
> +	     i++, ltn = latch_tree_next(&kallsym_tree, ltn))
> +		;

On second thought; I don't think this will be good enough after all.
Missing entire subtrees is too much. The rcu-list iteration will only
miss newly added symbols, and for those we'll get the events, combined
we'll still have a complete picture. Not so when a whole subtree goes
missing.

I thought I could avoid the list this way, but alas, not so.

> +
> +	if (ltn) {
> +		struct kallsym_node *kn;
> +		char *mod;
> +
> +		kn = container_of(ltn, struct kallsym_node, kn_node);
> +
> +		kn->kn_names(kn, sym, &mod);
> +		if (mod)
> +			strlcpy(modname, mod, MODULE_NAME_LEN);
> +		else
> +			modname[0] = '\0';
> +
> +		*type = 't';
> +		*exported = 0;
> +		ret = 0;
> +	}
> +	rcu_read_unlock();
> +
> +	return ret;
> +}
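The rcu-list iteration referred to here is the pattern the current BPF code uses (the bpf_get_kallsym() that the third patch removes). Keeping such a list next to the latch tree would look roughly like the sketch below; it assumes a new "struct list_head kn_lnode" member in kallsym_node, which is not in the posted patches.

/*
 * Sketch only: pair the latch tree (address lookup) with an RCU list
 * (/proc/kallsyms iteration), mirroring the existing bpf_kallsyms list.
 */
static LIST_HEAD(kallsym_list);

void kallsym_tree_add(struct kallsym_node *kn)
{
	spin_lock_irq(&kallsym_lock);
	list_add_tail_rcu(&kn->kn_lnode, &kallsym_list);
	latch_tree_insert(&kn->kn_node, &kallsym_tree, &kallsym_tree_ops);
	spin_unlock_irq(&kallsym_lock);

	/* ... perf_event_ksymbol() notification as in the patch ... */
}

static int kallsym_tree_kallsym(unsigned int symnum, unsigned long *value,
				char *type, char *sym, char *modname,
				int *exported)
{
	struct kallsym_node *kn;
	unsigned int i = 0;
	int ret = -ERANGE;

	rcu_read_lock();
	list_for_each_entry_rcu(kn, &kallsym_list, kn_lnode) {
		char *mod = NULL;

		if (i++ != symnum)
			continue;

		kn->kn_names(kn, sym, &mod);
		if (mod)
			strlcpy(modname, mod, MODULE_NAME_LEN);
		else
			modname[0] = '\0';

		*value = kn->kn_addr;
		*type = 't';
		*exported = 0;
		ret = 0;
		break;
	}
	rcu_read_unlock();

	return ret;
}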
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1d5c551a5add..77b2560f2dc7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1113,6 +1113,13 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
+
+/* callback function to generate ksymbol name */
+typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
+extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
+			       bool unregister,
+			       perf_ksymbol_get_name_f get_name, void *data);
+
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -1333,6 +1340,12 @@ static inline int perf_unregister_guest_info_callbacks
 (struct perf_guest_info_callbacks *callbacks)				{ return 0; }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
+
+typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
+static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
+				      bool unregister,
+				      perf_ksymbol_get_name_f get_name,
+				      void *data)			{ }
 static inline void perf_event_exec(void)				{ }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
 static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9de8780ac8d9..68c4da0227c5 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -372,7 +372,8 @@ struct perf_event_attr {
 				context_switch :  1, /* context switch data */
 				write_backward :  1, /* Write ring buffer from end to beginning */
 				namespaces     :  1, /* include namespaces data */
-				__reserved_1   : 35;
+				ksymbol        :  1, /* include ksymbol events */
+				__reserved_1   : 34;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -965,9 +966,32 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_NAMESPACES			= 16,
 
+	/*
+	 * Record ksymbol register/unregister events:
+	 *
+	 * struct {
+	 *	struct perf_event_header header;
+	 *	u64 addr;
+	 *	u32 len;
+	 *	u16 ksym_type;
+	 *	u16 flags;
+	 *	char name[];
+	 *	struct sample_id sample_id;
+	 * };
+	 */
+	PERF_RECORD_KSYMBOL			= 17,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
+enum perf_record_ksymbol_type {
+	PERF_RECORD_KSYMBOL_TYPE_UNKNOWN	= 0,
+	PERF_RECORD_KSYMBOL_TYPE_BPF		= 1,
+	PERF_RECORD_KSYMBOL_TYPE_MAX		/* non-ABI */
+};
+
+#define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER	(1 << 0)
+
 #define PERF_MAX_STACK_DEPTH		127
 #define PERF_MAX_CONTEXTS_PER_STACK	  8
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3cd13a30f732..ef27f2776999 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -385,6 +385,7 @@ static atomic_t nr_namespaces_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 static atomic_t nr_freq_events __read_mostly;
 static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_ksymbol_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -4235,7 +4236,7 @@ static bool is_sb_event(struct perf_event *event)
 
 	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
 	    attr->comm || attr->comm_exec ||
-	    attr->task ||
+	    attr->task || attr->ksymbol ||
 	    attr->context_switch)
 		return true;
 	return false;
@@ -4305,6 +4306,8 @@ static void unaccount_event(struct perf_event *event)
 		dec = true;
 	if (has_branch_stack(event))
 		dec = true;
+	if (event->attr.ksymbol)
+		atomic_dec(&nr_ksymbol_events);
 
 	if (dec) {
 		if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -7650,6 +7653,97 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_output_end(&handle);
 }
 
+/*
+ * ksymbol register/unregister tracking
+ */
+
+struct perf_ksymbol_event {
+	const char	*name;
+	int		name_len;
+	struct {
+		struct perf_event_header	header;
+		u64				addr;
+		u32				len;
+		u16				ksym_type;
+		u16				flags;
+	} event_id;
+};
+
+static int perf_event_ksymbol_match(struct perf_event *event)
+{
+	return event->attr.ksymbol;
+}
+
+static void perf_event_ksymbol_output(struct perf_event *event, void *data)
+{
+	struct perf_ksymbol_event *ksymbol_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	int ret;
+
+	if (!perf_event_ksymbol_match(event))
+		return;
+
+	perf_event_header__init_id(&ksymbol_event->event_id.header,
+				   &sample, event);
+	ret = perf_output_begin(&handle, event,
+				ksymbol_event->event_id.header.size);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, ksymbol_event->event_id);
+	__output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
+			perf_ksymbol_get_name_f get_name, void *data)
+{
+	struct perf_ksymbol_event ksymbol_event;
+	char name[KSYM_NAME_LEN];
+	u16 flags = 0;
+	int name_len;
+
+	if (!atomic_read(&nr_ksymbol_events))
+		return;
+
+	if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
+	    ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
+		goto err;
+
+	get_name(name, KSYM_NAME_LEN, data);
+	name_len = strlen(name) + 1;
+	while (!IS_ALIGNED(name_len, sizeof(u64)))
+		name[name_len++] = '\0';
+	BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
+
+	if (unregister)
+		flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
+
+	ksymbol_event = (struct perf_ksymbol_event){
+		.name = name,
+		.name_len = name_len,
+		.event_id = {
+			.header = {
+				.type = PERF_RECORD_KSYMBOL,
+				.size = sizeof(ksymbol_event.event_id) +
+					name_len,
+			},
+			.addr = addr,
+			.len = len,
+			.ksym_type = ksym_type,
+			.flags = flags,
+		},
+	};
+
+	perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
+	return;
+err:
+	WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
+}
+
 void perf_event_itrace_started(struct perf_event *event)
 {
 	event->attach_state |= PERF_ATTACH_ITRACE;
@@ -9900,6 +9994,8 @@ static void account_event(struct perf_event *event)
 		inc = true;
 	if (is_cgroup_event(event))
 		inc = true;
+	if (event->attr.ksymbol)
+		atomic_inc(&nr_ksymbol_events);
 
 	if (inc) {
 		/*
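For tool writers, the record layout documented above maps onto a user-space view like the sketch below. The struct simply mirrors the uapi comment, the perf_event_open()/mmap ring-buffer plumbing around it is assumed, and the numeric constants are open-coded in case the installed headers predate this patch.

/*
 * Sketch: consuming PERF_RECORD_KSYMBOL in a user-space tool.  "ev"
 * points at one complete event copied out of the mmap'ed ring buffer.
 */
#include <stdint.h>
#include <stdio.h>
#include <linux/perf_event.h>

struct ksymbol_record {
	struct perf_event_header header;
	uint64_t	addr;
	uint32_t	len;
	uint16_t	ksym_type;
	uint16_t	flags;
	char		name[];		/* NUL-terminated, padded to 8 bytes */
	/* struct sample_id follows the name */
};

static void handle_event(struct perf_event_header *ev)
{
	if (ev->type == 17) {		/* PERF_RECORD_KSYMBOL, added above */
		struct ksymbol_record *ks = (struct ksymbol_record *)ev;
		int unreg = ks->flags & 0x1;	/* ..._FLAGS_UNREGISTER */

		printf("%s %s at %#llx, len %u, type %u\n",
		       unreg ? "unregister" : "register", ks->name,
		       (unsigned long long)ks->addr, (unsigned int)ks->len,
		       (unsigned int)ks->ksym_type);
	}
}

Note that the record is only delivered to events opened with the new attr.ksymbol bit set; perf_event_ksymbol_output() filters it out for everyone else.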