
[v10,perf,bpf-next,1/9] perf, bpf: Introduce PERF_RECORD_KSYMBOL

Message ID: 20190116162931.1542429-2-songliubraving@fb.com
State: Changes Requested
Delegated to: BPF Maintainers
Series: reveal invisible bpf programs

Commit Message

Song Liu Jan. 16, 2019, 4:29 p.m. UTC
For better performance analysis of dynamically JITed and loaded kernel
functions, such as BPF programs, this patch introduces
PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol
register/unregister information to user space.

The following data structure is used for PERF_RECORD_KSYMBOL.

    /*
     * struct {
     *      struct perf_event_header        header;
     *      u64                             addr;
     *      u32                             len;
     *      u16                             ksym_type;
     *      u16                             flags;
     *      char                            name[];
     *      struct sample_id                sample_id;
     * };
     */

Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/perf_event.h      | 13 +++++
 include/uapi/linux/perf_event.h | 26 ++++++++-
 kernel/events/core.c            | 98 ++++++++++++++++++++++++++++++++-
 3 files changed, 135 insertions(+), 2 deletions(-)
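
A minimal consumer-side sketch of how a tool might request and decode these
records; the attr.ksymbol bit and the record layout come from the patch below,
while the helper itself is only illustrative (error handling and the usual
one-event-per-CPU setup are omitted):

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <string.h>
    #include <unistd.h>

    /* Mirrors the PERF_RECORD_KSYMBOL layout documented above. */
    struct ksymbol_record {
            struct perf_event_header header;
            __u64 addr;
            __u32 len;
            __u16 ksym_type;
            __u16 flags;
            char  name[];           /* NUL-terminated, padded to a u64 boundary */
    };

    static int open_ksymbol_fd(int cpu)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_DUMMY;  /* side-band records only */
            attr.ksymbol = 1;                   /* new bit from this patch */
            attr.sample_id_all = 1;             /* append struct sample_id */

            /* all tasks on @cpu; subject to the usual perf_event_paranoid rules */
            return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
    }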

Comments

Peter Zijlstra Jan. 17, 2019, 12:48 p.m. UTC | #1
On Wed, Jan 16, 2019 at 08:29:23AM -0800, Song Liu wrote:
> +/* callback function to generate ksymbol name */
> +typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
> +extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
> +			       bool unregister,
> +			       perf_ksymbol_get_name_f get_name, void *data);
> +

I'm not liking the getname thing.. I know that's what BPF does, but can
we please do that in the caller or something?


--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1114,11 +1114,8 @@ static inline void perf_event_task_sched
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
 
-/* callback function to generate ksymbol name */
-typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
 extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
-			       bool unregister,
-			       perf_ksymbol_get_name_f get_name, void *data);
+			       bool unregister, const char *name);
 
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -1341,11 +1338,8 @@ static inline int perf_unregister_guest_
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 
-typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
 static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
-				      bool unregister,
-				      perf_ksymbol_get_name_f get_name,
-				      void *data)			{ }
+				      bool unregister, const char *name) { }
 static inline void perf_event_exec(void)				{ }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
 static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7698,8 +7698,7 @@ static void perf_event_ksymbol_output(st
 	perf_output_end(&handle);
 }
 
-void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
-			perf_ksymbol_get_name_f get_name, void *data)
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym)
 {
 	struct perf_ksymbol_event ksymbol_event;
 	char name[KSYM_NAME_LEN];
@@ -7713,7 +7712,7 @@ void perf_event_ksymbol(u16 ksym_type, u
 	    ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
 		goto err;
 
-	get_name(name, KSYM_NAME_LEN, data);
+	strlcpy(name, sym, KSYM_NAME_LEN);
 	name_len = strlen(name) + 1;
 	while (!IS_ALIGNED(name_len, sizeof(u64)))
 		name[name_len++] = '\0';
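
With the simplified signature, the caller builds the name itself; a rough
sketch of what the BPF side might then look like, reusing
bpf_get_prog_addr_region() and bpf_get_prog_name() already in
kernel/bpf/core.c (the existing latch-tree/list bookkeeping is elided):

    void bpf_prog_kallsyms_add(struct bpf_prog *fp)
    {
            unsigned long sym_start, sym_end;
            char sym[KSYM_NAME_LEN];

            if (!bpf_prog_kallsyms_candidate(fp) || !capable(CAP_SYS_ADMIN))
                    return;

            /* symbol bounds and the "bpf_prog_<tag>[_<name>]" string */
            bpf_get_prog_addr_region(fp, &sym_start, &sym_end);
            bpf_get_prog_name(fp, sym);

            /* ... existing ksym_tnode/ksym_lnode insertion unchanged ... */

            perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, sym_start,
                               sym_end - sym_start, false, sym);
    }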
Peter Zijlstra Jan. 17, 2019, 12:56 p.m. UTC | #2
On Wed, Jan 16, 2019 at 08:29:23AM -0800, Song Liu wrote:
> For better performance analysis of dynamically JITed and loaded kernel
> functions, such as BPF programs, this patch introduces
> PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol
> register/unregister information to user space.
> 
> The following data structure is used for PERF_RECORD_KSYMBOL.
> 
>     /*
>      * struct {
>      *      struct perf_event_header        header;
>      *      u64                             addr;
>      *      u32                             len;
>      *      u16                             ksym_type;
>      *      u16                             flags;
>      *      char                            name[];
>      *      struct sample_id                sample_id;
>      * };
>      */

So I've cobbled together the attached patches to see how it would work
out..

I didn't convert ftrace trampolines; because ftrace code has this
uncanny ability to make my head hurt. But I don't think it should be
hard, once one figures out the right structure to stick that
kallsym_node thing in (ftrace_ops ?!).

It is compiled only, so no testing what so ever (also, no changelogs).

I didn't wire up the KSYM_TYPE thing; I'm wondering if we really need
that, OTOH it really doesn't hurt having it either.

One weird thing I noticed, wth does bpf_prog_kallsyms_add() check
CAP_ADMIN ?! Surely even a non-priv JIT'ed program generates symbols,
why hide those?

Anyway; with the one nit about the get_names() thing sorted:

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

(thanks for sticking with this)
Subject: 
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Jan 17 11:41:01 CET 2019



Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/rbtree_latch.h |   48 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

--- a/include/linux/rbtree_latch.h
+++ b/include/linux/rbtree_latch.h
@@ -211,4 +211,52 @@ latch_tree_find(void *key, struct latch_
 	return node;
 }
 
+static __always_inline struct latch_tree_node *
+latch_tree_first(struct latch_tree_root *root)
+{
+	struct latch_tree_node *ltn = NULL;
+	struct rb_node *node;
+	unsigned int seq;
+
+	do {
+		struct rb_root *rbr;
+
+		seq = raw_read_seqcount_latch(&root->seq);
+		rbr = &root->tree[seq & 1];
+		node = rb_first(rbr);
+	} while (read_seqcount_retry(&root->seq, seq));
+
+	if (node)
+		ltn = __lt_from_rb(node, seq & 1);
+
+	return ltn;
+}
+
+/**
+ * latch_tree_next() - find the next @ltn in @root per sort order
+ * @root: trees to which @ltn belongs
+ * @ltn: nodes to start from
+ *
+ * Does a lockless lookup in the trees @root for the next node starting at
+ * @ltn.
+ *
+ * Using this function outside of the write side lock is rather dodgy but given
+ * latch_tree_erase() doesn't re-init the nodes and the whole iteration is done
+ * under a single RCU critical section, it should be non-fatal and generate some
+ * semblance of order - albeit possibly missing chunks of the tree.
+ */
+static __always_inline struct latch_tree_node *
+latch_tree_next(struct latch_tree_root *root, struct latch_tree_node *ltn)
+{
+	struct rb_node *node;
+	unsigned int seq;
+
+	do {
+		seq = raw_read_seqcount_latch(&root->seq);
+		node = rb_next(&ltn->node[seq & 1]);
+	} while (read_seqcount_retry(&root->seq, seq));
+
+	return __lt_from_rb(node, seq & 1);
+}
+
 #endif /* RB_TREE_LATCH_H */
Subject: 
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Jan 17 11:18:21 CET 2019



Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/kallsyms.h |   14 +++
 kernel/extable.c         |    2 
 kernel/kallsyms.c        |  187 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 202 insertions(+), 1 deletion(-)

--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -11,6 +11,7 @@
 #include <linux/stddef.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/rbtree_latch.h>
 
 #include <asm/sections.h>
 
@@ -20,6 +21,19 @@
 
 struct module;
 
+struct kallsym_node
+{
+	struct latch_tree_node kn_node;
+	unsigned long kn_addr;
+	unsigned long kn_len;
+	void (*kn_names)(struct kallsym_node *kn, char *sym_name, char **mod_name);
+};
+
+extern void kallsym_tree_add(struct kallsym_node *kn);
+extern void kallsym_tree_del(struct kallsym_node *kn);
+
+extern bool is_kallsym_tree_text_address(unsigned long addr);
+
 static inline int is_kernel_inittext(unsigned long addr)
 {
 	if (addr >= (unsigned long)_sinittext
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -145,6 +145,8 @@ int kernel_text_address(unsigned long ad
 
 	if (is_module_text_address(addr))
 		goto out;
+	if (is_kallsym_tree_text_address(addr))
+		goto out;
 	if (is_ftrace_trampoline(addr))
 		goto out;
 	if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,8 @@
 #include <linux/filter.h>
 #include <linux/ftrace.h>
 #include <linux/compiler.h>
+#include <linux/spinlock.h>
+#include <linux/perf_event.h>
 
 /*
  * These will be re-linked against their real values
@@ -48,6 +50,164 @@ extern const u16 kallsyms_token_index[]
 
 extern const unsigned int kallsyms_markers[] __weak;
 
+static DEFINE_SPINLOCK(kallsym_lock);
+static struct latch_tree_root kallsym_tree __cacheline_aligned;
+
+static __always_inline unsigned long
+kallsym_node_addr(struct latch_tree_node *node)
+{
+	return ((struct kallsym_node *)node)->kn_addr;
+}
+
+static __always_inline bool kallsym_tree_less(struct latch_tree_node *a,
+					      struct latch_tree_node *b)
+{
+	return kallsym_node_addr(a) < kallsym_node_addr(b);
+}
+
+static __always_inline int kallsym_tree_comp(void *key,
+					     struct latch_tree_node *n)
+{
+	unsigned long val = (unsigned long)key;
+	unsigned long sym_start, sym_end;
+	const struct kallsym_node *kn;
+
+	kn = container_of(n, struct kallsym_node, kn_node);
+	sym_start = kn->kn_addr;
+	sym_end = sym_start + kn->kn_len;
+
+	if (val < sym_start)
+		return -1;
+	if (val >= sym_end)
+		return 1;
+
+	return 0;
+}
+
+static const struct latch_tree_ops kallsym_tree_ops = {
+	.less = kallsym_tree_less,
+	.comp = kallsym_tree_comp,
+};
+
+void kallsym_tree_add(struct kallsym_node *kn)
+{
+	char namebuf[KSYM_NAME_LEN] = "";
+	char *modname = NULL;
+
+	spin_lock_irq(&kallsym_lock);
+	latch_tree_insert(&kn->kn_node, &kallsym_tree, &kallsym_tree_ops);
+	spin_unlock_irq(&kallsym_lock);
+
+	kn->kn_names(kn, namebuf, &modname);
+
+	if (modname) {
+		int len = strlen(namebuf);
+
+		snprintf(namebuf + len, sizeof(namebuf) - len, " [%s]", modname);
+	}
+
+	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_UNKNOWN,
+			   kn->kn_addr, kn->kn_len, false, namebuf);
+}
+
+void kallsym_tree_del(struct kallsym_node *kn)
+{
+	char namebuf[KSYM_NAME_LEN] = "";
+	char *modname = NULL;
+
+	kn->kn_names(kn, namebuf, &modname);
+
+	if (modname) {
+		int len = strlen(namebuf);
+
+		snprintf(namebuf + len, sizeof(namebuf) - len, " [%s]", modname);
+	}
+
+	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_UNKNOWN,
+			   kn->kn_addr, kn->kn_len, true, namebuf);
+
+	spin_lock_irq(&kallsym_lock);
+	latch_tree_erase(&kn->kn_node, &kallsym_tree, &kallsym_tree_ops);
+	spin_unlock_irq(&kallsym_lock);
+}
+
+static struct kallsym_node *kallsym_tree_find(unsigned long addr)
+{
+	struct kallsym_node *kn = NULL;
+	struct latch_tree_node *n;
+
+	n = latch_tree_find((void *)addr, &kallsym_tree, &kallsym_tree_ops);
+	if (n)
+		kn = container_of(n, struct kallsym_node, kn_node);
+
+	return kn;
+}
+
+static char *kallsym_tree_address_lookup(unsigned long addr, unsigned long *size,
+					 unsigned long *off, char **modname, char *sym)
+{
+	struct kallsym_node *kn;
+	char *ret = NULL;
+
+	rcu_read_lock();
+	kn = kallsym_tree_find(addr);
+	if (kn) {
+		kn->kn_names(kn, sym, modname);
+
+		ret = sym;
+		if (size)
+			*size = kn->kn_len;
+		if (off)
+			*off = addr - kn->kn_addr;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+bool is_kallsym_tree_text_address(unsigned long addr)
+{
+	bool ret;
+
+	rcu_read_lock();
+	ret = kallsym_tree_find(addr) != NULL;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int kallsym_tree_kallsym(unsigned int symnum, unsigned long *value, char *type,
+				char *sym, char *modname, int *exported)
+{
+	struct latch_tree_node *ltn;
+	int i, ret = -ERANGE;
+
+	rcu_read_lock();
+	for (i = 0, ltn = latch_tree_first(&kallsym_tree); i < symnum && ltn;
+	     i++, ltn = latch_tree_next(&kallsym_tree, ltn))
+		;
+
+	if (ltn) {
+		struct kallsym_node *kn;
+		char *mod;
+
+		kn = container_of(ltn, struct kallsym_node, kn_node);
+
+		kn->kn_names(kn, sym, &mod);
+		if (mod)
+			strlcpy(modname, mod, MODULE_NAME_LEN);
+		else
+			modname[0] = '\0';
+
+		*type = 't';
+		*exported = 0;
+		ret = 0;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
 /*
  * Expand a compressed symbol data into the resulting uncompressed string,
  * if uncompressed string is too long (>= maxlen), it will be truncated,
@@ -265,6 +425,7 @@ int kallsyms_lookup_size_offset(unsigned
 	if (is_ksym_addr(addr))
 		return !!get_symbol_pos(addr, symbolsize, offset);
 	return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
+	       !!kallsym_tree_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
 	       !!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
 }
 
@@ -301,6 +462,10 @@ const char *kallsyms_lookup(unsigned lon
 	ret = module_address_lookup(addr, symbolsize, offset,
 				    modname, namebuf);
 	if (!ret)
+		ret = kallsym_tree_address_lookup(addr, symbolsize,
+						  offset, modname, namebuf);
+
+	if (!ret)
 		ret = bpf_address_lookup(addr, symbolsize,
 					 offset, modname, namebuf);
 
@@ -434,6 +599,7 @@ struct kallsym_iter {
 	loff_t pos;
 	loff_t pos_arch_end;
 	loff_t pos_mod_end;
+	loff_t pos_tree_end;
 	loff_t pos_ftrace_mod_end;
 	unsigned long value;
 	unsigned int nameoff; /* If iterating in core kernel symbols. */
@@ -478,9 +644,24 @@ static int get_ksymbol_mod(struct kallsy
 	return 1;
 }
 
+static int get_ksymbol_tree(struct kallsym_iter *iter)
+{
+	int ret = kallsym_tree_kallsym(iter->pos - iter->pos_mod_end,
+				       &iter->value, &iter->type,
+				       iter->name, iter->module_name,
+				       &iter->exported);
+
+	if (ret < 0) {
+		iter->pos_tree_end = iter->pos;
+		return 0;
+	}
+
+	return 1;
+}
+
 static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
 {
-	int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
+	int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_tree_end,
 					 &iter->value, &iter->type,
 					 iter->name, iter->module_name,
 					 &iter->exported);
@@ -545,6 +726,10 @@ static int update_iter_mod(struct kallsy
 	    get_ksymbol_mod(iter))
 		return 1;
 
+	if ((!iter->pos_tree_end || iter->pos_tree_end > pos) &&
+	    get_ksymbol_tree(iter))
+		return 1;
+
 	if ((!iter->pos_ftrace_mod_end || iter->pos_ftrace_mod_end > pos) &&
 	    get_ksymbol_ftrace_mod(iter))
 		return 1;
Subject: 
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Jan 17 13:19:25 CET 2019



Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/bpf.h    |    7 +-
 include/linux/filter.h |   42 ------------
 kernel/bpf/core.c      |  164 ++++---------------------------------------------
 kernel/extable.c       |    4 -
 kernel/kallsyms.c      |   19 -----
 5 files changed, 22 insertions(+), 214 deletions(-)

--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -13,7 +13,7 @@
 #include <linux/file.h>
 #include <linux/percpu.h>
 #include <linux/err.h>
-#include <linux/rbtree_latch.h>
+#include <linux/kallsyms.h>
 #include <linux/numa.h>
 #include <linux/wait.h>
 
@@ -307,8 +307,9 @@ struct bpf_prog_aux {
 	bool offload_requested;
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
-	struct latch_tree_node ksym_tnode;
-	struct list_head ksym_lnode;
+
+	struct kallsym_node ktn;
+
 	const struct bpf_prog_ops *ops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -932,23 +932,6 @@ static inline bool bpf_jit_kallsyms_enab
 	return false;
 }
 
-const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
-				 unsigned long *off, char *sym);
-bool is_bpf_text_address(unsigned long addr);
-int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
-		    char *sym);
-
-static inline const char *
-bpf_address_lookup(unsigned long addr, unsigned long *size,
-		   unsigned long *off, char **modname, char *sym)
-{
-	const char *ret = __bpf_address_lookup(addr, size, off, sym);
-
-	if (ret && modname)
-		*modname = NULL;
-	return ret;
-}
-
 void bpf_prog_kallsyms_add(struct bpf_prog *fp);
 void bpf_prog_kallsyms_del(struct bpf_prog *fp);
 
@@ -974,31 +957,6 @@ static inline bool bpf_jit_kallsyms_enab
 	return false;
 }
 
-static inline const char *
-__bpf_address_lookup(unsigned long addr, unsigned long *size,
-		     unsigned long *off, char *sym)
-{
-	return NULL;
-}
-
-static inline bool is_bpf_text_address(unsigned long addr)
-{
-	return false;
-}
-
-static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
-				  char *type, char *sym)
-{
-	return -ERANGE;
-}
-
-static inline const char *
-bpf_address_lookup(unsigned long addr, unsigned long *size,
-		   unsigned long *off, char **modname, char *sym)
-{
-	return NULL;
-}
-
 static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
 }
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -30,7 +30,6 @@
 #include <linux/bpf.h>
 #include <linux/btf.h>
 #include <linux/frame.h>
-#include <linux/rbtree_latch.h>
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
 #include <linux/perf_event.h>
@@ -100,8 +99,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned
 	fp->aux->prog = fp;
 	fp->jit_requested = ebpf_jit_enabled();
 
-	INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
-
 	return fp;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_alloc);
@@ -530,86 +527,35 @@ static void bpf_get_prog_name(const stru
 		*sym = 0;
 }
 
-static __always_inline unsigned long
-bpf_get_prog_addr_start(struct latch_tree_node *n)
-{
-	unsigned long symbol_start, symbol_end;
-	const struct bpf_prog_aux *aux;
-
-	aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
-	bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
-
-	return symbol_start;
-}
-
-static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
-					  struct latch_tree_node *b)
-{
-	return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b);
-}
-
-static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
-{
-	unsigned long val = (unsigned long)key;
-	unsigned long symbol_start, symbol_end;
-	const struct bpf_prog_aux *aux;
-
-	aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
-	bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
-
-	if (val < symbol_start)
-		return -1;
-	if (val >= symbol_end)
-		return  1;
-
-	return 0;
-}
-
-static const struct latch_tree_ops bpf_tree_ops = {
-	.less	= bpf_tree_less,
-	.comp	= bpf_tree_comp,
-};
-
-static DEFINE_SPINLOCK(bpf_lock);
-static LIST_HEAD(bpf_kallsyms);
-static struct latch_tree_root bpf_tree __cacheline_aligned;
-
-static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
-{
-	WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
-	list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms);
-	latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
-}
-
-static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux)
-{
-	if (list_empty(&aux->ksym_lnode))
-		return;
-
-	latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
-	list_del_rcu(&aux->ksym_lnode);
-}
 
 static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
 {
 	return fp->jited && !bpf_prog_was_classic(fp);
 }
 
-static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
+static void bpf_kn_names(struct kallsym_node *kn, char *sym, char **modname)
 {
-	return list_empty(&fp->aux->ksym_lnode) ||
-	       fp->aux->ksym_lnode.prev == LIST_POISON2;
+	struct bpf_prog_aux *aux = container_of(kn, struct bpf_prog_aux, ktn);
+
+	*modname = "eBPF-jit";
+	bpf_get_prog_name(aux->prog, sym);
 }
 
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
+	unsigned long sym_start, sym_end;
+
 	if (!bpf_prog_kallsyms_candidate(fp) ||
 	    !capable(CAP_SYS_ADMIN))
 		return;
 
-	spin_lock_bh(&bpf_lock);
-	bpf_prog_ksym_node_add(fp->aux);
-	spin_unlock_bh(&bpf_lock);
+	bpf_get_prog_addr_region(fp, &sym_start, &sym_end);
+
+	fp->aux->ktn.kn_addr = sym_start;
+	fp->aux->ktn.kn_len = sym_end - sym_start;
+	fp->aux->ktn.kn_names = bpf_kn_names;
+
+	kallsym_tree_add(&fp->aux->ktn);
 }
 
 void bpf_prog_kallsyms_del(struct bpf_prog *fp)
@@ -617,85 +563,7 @@ void bpf_prog_kallsyms_del(struct bpf_pr
 	if (!bpf_prog_kallsyms_candidate(fp))
 		return;
 
-	spin_lock_bh(&bpf_lock);
-	bpf_prog_ksym_node_del(fp->aux);
-	spin_unlock_bh(&bpf_lock);
-}
-
-static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
-{
-	struct latch_tree_node *n;
-
-	if (!bpf_jit_kallsyms_enabled())
-		return NULL;
-
-	n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
-	return n ?
-	       container_of(n, struct bpf_prog_aux, ksym_tnode)->prog :
-	       NULL;
-}
-
-const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
-				 unsigned long *off, char *sym)
-{
-	unsigned long symbol_start, symbol_end;
-	struct bpf_prog *prog;
-	char *ret = NULL;
-
-	rcu_read_lock();
-	prog = bpf_prog_kallsyms_find(addr);
-	if (prog) {
-		bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end);
-		bpf_get_prog_name(prog, sym);
-
-		ret = sym;
-		if (size)
-			*size = symbol_end - symbol_start;
-		if (off)
-			*off  = addr - symbol_start;
-	}
-	rcu_read_unlock();
-
-	return ret;
-}
-
-bool is_bpf_text_address(unsigned long addr)
-{
-	bool ret;
-
-	rcu_read_lock();
-	ret = bpf_prog_kallsyms_find(addr) != NULL;
-	rcu_read_unlock();
-
-	return ret;
-}
-
-int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
-		    char *sym)
-{
-	struct bpf_prog_aux *aux;
-	unsigned int it = 0;
-	int ret = -ERANGE;
-
-	if (!bpf_jit_kallsyms_enabled())
-		return ret;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) {
-		if (it++ != symnum)
-			continue;
-
-		bpf_get_prog_name(aux->prog, sym);
-
-		*value = (unsigned long)aux->prog->bpf_func;
-		*type  = BPF_SYM_ELF_TYPE;
-
-		ret = 0;
-		break;
-	}
-	rcu_read_unlock();
-
-	return ret;
+	kallsym_tree_del(&fp->aux->ktn);
 }
 
 static atomic_long_t bpf_jit_current;
@@ -806,8 +674,6 @@ void __weak bpf_jit_free(struct bpf_prog
 
 		bpf_jit_binary_unlock_ro(hdr);
 		bpf_jit_binary_free(hdr);
-
-		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
 	}
 
 	bpf_prog_unlock_free(fp);
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -135,7 +135,7 @@ int kernel_text_address(unsigned long ad
 	 * coming back from idle, or cpu on or offlining.
 	 *
 	 * is_module_text_address() as well as the kprobe slots
-	 * and is_bpf_text_address() require RCU to be watching.
+	 * and is_kallsym_tree_text_address() require RCU to be watching.
 	 */
 	no_rcu = !rcu_is_watching();
 
@@ -151,8 +151,6 @@ int kernel_text_address(unsigned long ad
 		goto out;
 	if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
 		goto out;
-	if (is_bpf_text_address(addr))
-		goto out;
 	ret = 0;
 out:
 	if (no_rcu)
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -425,8 +425,7 @@ int kallsyms_lookup_size_offset(unsigned
 	if (is_ksym_addr(addr))
 		return !!get_symbol_pos(addr, symbolsize, offset);
 	return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
-	       !!kallsym_tree_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
-	       !!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
+	       !!kallsym_tree_address_lookup(addr, symbolsize, offset, NULL, namebuf);
 }
 
 /*
@@ -464,11 +463,6 @@ const char *kallsyms_lookup(unsigned lon
 	if (!ret)
 		ret = kallsym_tree_address_lookup(addr, symbolsize,
 						  offset, modname, namebuf);
-
-	if (!ret)
-		ret = bpf_address_lookup(addr, symbolsize,
-					 offset, modname, namebuf);
-
 	if (!ret)
 		ret = ftrace_mod_address_lookup(addr, symbolsize,
 						offset, modname, namebuf);
@@ -673,15 +667,6 @@ static int get_ksymbol_ftrace_mod(struct
 	return 1;
 }
 
-static int get_ksymbol_bpf(struct kallsym_iter *iter)
-{
-	iter->module_name[0] = '\0';
-	iter->exported = 0;
-	return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
-			       &iter->value, &iter->type,
-			       iter->name) < 0 ? 0 : 1;
-}
-
 /* Returns space to next name. */
 static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
 {
@@ -734,7 +719,7 @@ static int update_iter_mod(struct kallsy
 	    get_ksymbol_ftrace_mod(iter))
 		return 1;
 
-	return get_ksymbol_bpf(iter);
+	return 0;
 }
 
 /* Returns false if pos at or past end of file. */
Song Liu Jan. 17, 2019, 2:49 p.m. UTC | #3
> On Jan 17, 2019, at 4:56 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Wed, Jan 16, 2019 at 08:29:23AM -0800, Song Liu wrote:
>> For better performance analysis of dynamically JITed and loaded kernel
>> functions, such as BPF programs, this patch introduces
>> PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol
>> register/unregister information to user space.
>> 
>> The following data structure is used for PERF_RECORD_KSYMBOL.
>> 
>>    /*
>>     * struct {
>>     *      struct perf_event_header        header;
>>     *      u64                             addr;
>>     *      u32                             len;
>>     *      u16                             ksym_type;
>>     *      u16                             flags;
>>     *      char                            name[];
>>     *      struct sample_id                sample_id;
>>     * };
>>     */
> 
> So I've cobbled together the attached patches to see how it would work
> out..
> 
> I didn't convert ftrace trampolines; because ftrace code has this
> uncanny ability to make my head hurt. But I don't think it should be
> hard, once one figures out the right structure to stick that
> kallsym_node thing in (ftrace_ops ?!).
> 
> It is compiled only, so no testing what so ever (also, no changelogs).
> 
> I didn't wire up the KSYM_TYPE thing; I'm wondering if we really need
> that, OTOH it really doesn't hurt having it either.
> 
> One weird thing I noticed, wth does bpf_prog_kallsyms_add() check
> CAP_ADMIN ?! Surely even a non-priv JIT'ed program generates symbols,
> why hide those?
> 
> Anyway; with the one nit about the get_names() thing sorted:
> 
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> (thanks for sticking with this)
> <peterz-latch-next.patch><peterz-kallsym.patch><peterz-kallsym-bpf.patch>

Aha, now I get the point on perf_event_ksymbol(). Yeah this approach is 
definitely better. 

While I run more tests with these patches, could we get the current set into
perf/core? This will enable the development of user space tools like
bcc.

Also, I currently base this set on the bpf-next tree, as tip/perf/core is
4 weeks old. Shall I rebase the set on Linus' tree? Or shall I wait for
tip/perf/core?

Thanks again!
Song
Arnaldo Carvalho de Melo Jan. 17, 2019, 2:58 p.m. UTC | #4
Em Thu, Jan 17, 2019 at 02:49:10PM +0000, Song Liu escreveu:
> 
> 
> > On Jan 17, 2019, at 4:56 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > On Wed, Jan 16, 2019 at 08:29:23AM -0800, Song Liu wrote:
> >> For better performance analysis of dynamically JITed and loaded kernel
> >> functions, such as BPF programs, this patch introduces
> >> PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol
> >> register/unregister information to user space.
> >> 
> >> The following data structure is used for PERF_RECORD_KSYMBOL.
> >> 
> >>    /*
> >>     * struct {
> >>     *      struct perf_event_header        header;
> >>     *      u64                             addr;
> >>     *      u32                             len;
> >>     *      u16                             ksym_type;
> >>     *      u16                             flags;
> >>     *      char                            name[];
> >>     *      struct sample_id                sample_id;
> >>     * };
> >>     */
> > 
> > So I've cobbled together the attached patches to see how it would work
> > out..
> > 
> > I didn't convert ftrace trampolines; because ftrace code has this
> > uncanny ability to make my head hurt. But I don't think it should be
> > hard, once one figures out the right structure to stick that
> > kallsym_node thing in (ftrace_ops ?!).
> > 
> > It is compiled only, so no testing what so ever (also, no changelogs).
> > 
> > I didn't wire up the KSYM_TYPE thing; I'm wondering if we really need
> > that, OTOH it really doesn't hurt having it either.
> > 
> > One weird thing I noticed, wth does bpf_prog_kallsyms_add() check
> > CAP_ADMIN ?! Surely even a non-priv JIT'ed program generates symbols,
> > why hide those?
> > 
> > Anyway; with the one nit about the get_names() thing sorted:
> > 
> > Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > 
> > (thanks for sticking with this)
> > <peterz-latch-next.patch><peterz-kallsym.patch><peterz-kallsym-bpf.patch>
> 
> Aha, now I get the point on perf_event_ksymbol(). Yeah this approach is 
> definitely better. 
> 
> While I run more tests with these patches, could we get the current set into
> perf/core? This will enable the development of user space tools like
> bcc.
> 
> Also, I currently base this set on the bpf-next tree, as tip/perf/core is
> 4 weeks old. Shall I rebase the set on Linus' tree? Or shall I wait for
> tip/perf/core?

So, can you post one last set, this time with PeterZ's Acked-by,
assuming you're sorting out the get_names() thing, and I can try merging
this into my perf/core branch, then pushing it out to Ingo; my perf/core
starts from tip/perf/urgent, so it should be new enough.

I'd then right after testing it send a pull request to Ingo, synching
everything.

- Arnaldo
Song Liu Jan. 17, 2019, 3:02 p.m. UTC | #5
> On Jan 17, 2019, at 6:58 AM, Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> 
> Em Thu, Jan 17, 2019 at 02:49:10PM +0000, Song Liu escreveu:
>> 
>> 
>>> On Jan 17, 2019, at 4:56 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>> 
>>> On Wed, Jan 16, 2019 at 08:29:23AM -0800, Song Liu wrote:
>>>> For better performance analysis of dynamically JITed and loaded kernel
>>>> functions, such as BPF programs, this patch introduces
>>>> PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol
>>>> register/unregister information to user space.
>>>> 
>>>> The following data structure is used for PERF_RECORD_KSYMBOL.
>>>> 
>>>>   /*
>>>>    * struct {
>>>>    *      struct perf_event_header        header;
>>>>    *      u64                             addr;
>>>>    *      u32                             len;
>>>>    *      u16                             ksym_type;
>>>>    *      u16                             flags;
>>>>    *      char                            name[];
>>>>    *      struct sample_id                sample_id;
>>>>    * };
>>>>    */
>>> 
>>> So I've cobbled together the attached patches to see how it would work
>>> out..
>>> 
>>> I didn't convert ftrace trampolines; because ftrace code has this
>>> uncanny ability to make my head hurt. But I don't think it should be
>>> hard, once one figures out the right structure to stick that
>>> kallsym_node thing in (ftrace_ops ?!).
>>> 
>>> It is compiled only, so no testing what so ever (also, no changelogs).
>>> 
>>> I didn't wire up the KSYM_TYPE thing; I'm wondering if we really need
>>> that, OTOH it really doesn't hurt having it either.
>>> 
>>> One weird thing I noticed, wth does bpf_prog_kallsyms_add() check
>>> CAP_ADMIN ?! Surely even a non-priv JIT'ed program generates symbols,
>>> why hide those?
>>> 
>>> Anyway; with the one nit about the get_names() thing sorted:
>>> 
>>> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>>> 
>>> (thanks for sticking with this)
>>> <peterz-latch-next.patch><peterz-kallsym.patch><peterz-kallsym-bpf.patch>
>> 
>> Aha, now I get the point on perf_event_ksymbol(). Yeah this approach is 
>> definitely better. 
>> 
>> While I run more tests with these patches, could we get the current set into
>> perf/core? This will enable the development of user space tools like
>> bcc.
>> 
>> Also, I currently base this set on the bpf-next tree, as tip/perf/core is
>> 4 weeks old. Shall I rebase the set on Linus' tree? Or shall I wait for
>> tip/perf/core?
> 
> So, can you post one last set, this time with PeterZ's Acked-by,
> assuming you're sorting out the get_names() thing, and I can try merging
> this into my perf/core branch, then pushing it out to Ingo; my perf/core
> starts from tip/perf/urgent, so it should be new enough.
> 
> I'd then right after testing it send a pull request to Ingo, synching
> everything.
> 
> - Arnaldo

Thanks Arnaldo! I will send it soon. 

Song
Peter Zijlstra Jan. 18, 2019, 8:38 a.m. UTC | #6
On Thu, Jan 17, 2019 at 11:58:54AM -0300, Arnaldo Carvalho de Melo wrote:
> So, can you post one last set, this time with PeterZ's Acked-by,
> assuming you're sorting out the get_names() thing, and I can try merging
> this into my perf/core branch, then pushing it out to Ingo, my perf/core
> starts from tip/perf/urgent, so should be new enough.

Works for me; thanks Arnaldo!
Peter Zijlstra Jan. 18, 2019, 8:41 a.m. UTC | #7
On Thu, Jan 17, 2019 at 01:56:53PM +0100, Peter Zijlstra wrote:
> +static __always_inline struct latch_tree_node *
> +latch_tree_first(struct latch_tree_root *root)
> +{
> +	struct latch_tree_node *ltn = NULL;
> +	struct rb_node *node;
> +	unsigned int seq;
> +
> +	do {
> +		struct rb_root *rbr;
> +
> +		seq = raw_read_seqcount_latch(&root->seq);
> +		rbr = &root->tree[seq & 1];
> +		node = rb_first(rbr);
> +	} while (read_seqcount_retry(&root->seq, seq));
> +
> +	if (node)
> +		ltn = __lt_from_rb(node, seq & 1);
> +
> +	return ltn;
> +}
> +
> +/**
> + * latch_tree_next() - find the next @ltn in @root per sort order
> + * @root: trees to which @ltn belongs
> + * @ltn: nodes to start from
> + *
> + * Does a lockless lookup in the trees @root for the next node starting at
> + * @ltn.
> + *
> + * Using this function outside of the write side lock is rather dodgy but given
> + * latch_tree_erase() doesn't re-init the nodes and the whole iteration is done
> + * under a single RCU critical section, it should be non-fatal and generate some
> + * semblance of order - albeit possibly missing chunks of the tree.
> + */
> +static __always_inline struct latch_tree_node *
> +latch_tree_next(struct latch_tree_root *root, struct latch_tree_node *ltn)
> +{
> +	struct rb_node *node;
> +	unsigned int seq;
> +
> +	do {
> +		seq = raw_read_seqcount_latch(&root->seq);
> +		node = rb_next(&ltn->node[seq & 1]);
> +	} while (read_seqcount_retry(&root->seq, seq));
> +
> +	return __lt_from_rb(node, seq & 1);
> +}

> +static int kallsym_tree_kallsym(unsigned int symnum, unsigned long *value, char *type,
> +				char *sym, char *modname, int *exported)
> +{
> +	struct latch_tree_node *ltn;
> +	int i, ret = -ERANGE;
> +
> +	rcu_read_lock();
> +	for (i = 0, ltn = latch_tree_first(&kallsym_tree); i < symnum && ltn;
> +	     i++, ltn = latch_tree_next(&kallsym_tree, ltn))
> +		;

On second thought; I don't think this will be good enough after all.
Missing entire subtrees is too much.

The rcu-list iteration will only miss newly added symbols, and for those
we'll get the events, combined we'll still have a complete picture. Not
so when a whole subtree goes missing.

I thought I could avoid the list this way, but alas, not so.

> +
> +	if (ltn) {
> +		struct kallsym_node *kn;
> +		char *mod;
> +
> +		kn = container_of(ltn, struct kallsym_node, kn_node);
> +
> +		kn->kn_names(kn, sym, &mod);
> +		if (mod)
> +			strlcpy(modname, mod, MODULE_NAME_LEN);
> +		else
> +			modname[0] = '\0';
> +
> +		*type = 't';
> +		*exported = 0;
> +		ret = 0;
> +	}
> +	rcu_read_unlock();
> +
> +	return ret;
> +}
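
For contrast, a minimal sketch of the RCU-list walk referred to above, modeled
on the bpf_get_kallsym() loop removed by the third patch; the kallsym_list head
and the kn_lnode member are assumed additions to kallsym_node, not code from
this thread:

    static int kallsym_list_get_kallsym(unsigned int symnum, unsigned long *value,
                                        char *sym)
    {
            struct kallsym_node *kn;
            unsigned int it = 0;
            int ret = -ERANGE;
            char *mod;

            rcu_read_lock();
            /*
             * Entries are only added with list_add_tail_rcu() and removed with
             * list_del_rcu(), so a reader can at worst miss symbols registered
             * after it started iterating (those are announced via
             * PERF_RECORD_KSYMBOL anyway), but it never skips whole subtrees
             * the way the latch-tree walk can.
             */
            list_for_each_entry_rcu(kn, &kallsym_list, kn_lnode) {
                    if (it++ != symnum)
                            continue;

                    kn->kn_names(kn, sym, &mod);
                    *value = kn->kn_addr;
                    ret = 0;
                    break;
            }
            rcu_read_unlock();

            return ret;
    }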

Patch

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1d5c551a5add..77b2560f2dc7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1113,6 +1113,13 @@  static inline void perf_event_task_sched_out(struct task_struct *prev,
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
+
+/* callback function to generate ksymbol name */
+typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
+extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
+			       bool unregister,
+			       perf_ksymbol_get_name_f get_name, void *data);
+
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -1333,6 +1340,12 @@  static inline int perf_unregister_guest_info_callbacks
 (struct perf_guest_info_callbacks *callbacks)				{ return 0; }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
+
+typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
+static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
+				      bool unregister,
+				      perf_ksymbol_get_name_f get_name,
+				      void *data)			{ }
 static inline void perf_event_exec(void)				{ }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
 static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9de8780ac8d9..68c4da0227c5 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -372,7 +372,8 @@  struct perf_event_attr {
 				context_switch :  1, /* context switch data */
 				write_backward :  1, /* Write ring buffer from end to beginning */
 				namespaces     :  1, /* include namespaces data */
-				__reserved_1   : 35;
+				ksymbol        :  1, /* include ksymbol events */
+				__reserved_1   : 34;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -965,9 +966,32 @@  enum perf_event_type {
 	 */
 	PERF_RECORD_NAMESPACES			= 16,
 
+	/*
+	 * Record ksymbol register/unregister events:
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				addr;
+	 *	u32				len;
+	 *	u16				ksym_type;
+	 *	u16				flags;
+	 *	char				name[];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_KSYMBOL			= 17,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
+enum perf_record_ksymbol_type {
+	PERF_RECORD_KSYMBOL_TYPE_UNKNOWN	= 0,
+	PERF_RECORD_KSYMBOL_TYPE_BPF		= 1,
+	PERF_RECORD_KSYMBOL_TYPE_MAX		/* non-ABI */
+};
+
+#define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER	(1 << 0)
+
 #define PERF_MAX_STACK_DEPTH		127
 #define PERF_MAX_CONTEXTS_PER_STACK	  8
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3cd13a30f732..ef27f2776999 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -385,6 +385,7 @@  static atomic_t nr_namespaces_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 static atomic_t nr_freq_events __read_mostly;
 static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_ksymbol_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -4235,7 +4236,7 @@  static bool is_sb_event(struct perf_event *event)
 
 	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
 	    attr->comm || attr->comm_exec ||
-	    attr->task ||
+	    attr->task || attr->ksymbol ||
 	    attr->context_switch)
 		return true;
 	return false;
@@ -4305,6 +4306,8 @@  static void unaccount_event(struct perf_event *event)
 		dec = true;
 	if (has_branch_stack(event))
 		dec = true;
+	if (event->attr.ksymbol)
+		atomic_dec(&nr_ksymbol_events);
 
 	if (dec) {
 		if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -7650,6 +7653,97 @@  static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_output_end(&handle);
 }
 
+/*
+ * ksymbol register/unregister tracking
+ */
+
+struct perf_ksymbol_event {
+	const char	*name;
+	int		name_len;
+	struct {
+		struct perf_event_header        header;
+		u64				addr;
+		u32				len;
+		u16				ksym_type;
+		u16				flags;
+	} event_id;
+};
+
+static int perf_event_ksymbol_match(struct perf_event *event)
+{
+	return event->attr.ksymbol;
+}
+
+static void perf_event_ksymbol_output(struct perf_event *event, void *data)
+{
+	struct perf_ksymbol_event *ksymbol_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	int ret;
+
+	if (!perf_event_ksymbol_match(event))
+		return;
+
+	perf_event_header__init_id(&ksymbol_event->event_id.header,
+				   &sample, event);
+	ret = perf_output_begin(&handle, event,
+				ksymbol_event->event_id.header.size);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, ksymbol_event->event_id);
+	__output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
+			perf_ksymbol_get_name_f get_name, void *data)
+{
+	struct perf_ksymbol_event ksymbol_event;
+	char name[KSYM_NAME_LEN];
+	u16 flags = 0;
+	int name_len;
+
+	if (!atomic_read(&nr_ksymbol_events))
+		return;
+
+	if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
+	    ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
+		goto err;
+
+	get_name(name, KSYM_NAME_LEN, data);
+	name_len = strlen(name) + 1;
+	while (!IS_ALIGNED(name_len, sizeof(u64)))
+		name[name_len++] = '\0';
+	BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
+
+	if (unregister)
+		flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
+
+	ksymbol_event = (struct perf_ksymbol_event){
+		.name = name,
+		.name_len = name_len,
+		.event_id = {
+			.header = {
+				.type = PERF_RECORD_KSYMBOL,
+				.size = sizeof(ksymbol_event.event_id) +
+					name_len,
+			},
+			.addr = addr,
+			.len = len,
+			.ksym_type = ksym_type,
+			.flags = flags,
+		},
+	};
+
+	perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
+	return;
+err:
+	WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
+}
+
 void perf_event_itrace_started(struct perf_event *event)
 {
 	event->attach_state |= PERF_ATTACH_ITRACE;
@@ -9900,6 +9994,8 @@  static void account_event(struct perf_event *event)
 		inc = true;
 	if (is_cgroup_event(event))
 		inc = true;
+	if (event->attr.ksymbol)
+		atomic_inc(&nr_ksymbol_events);
 
 	if (inc) {
 		/*