diff mbox series

[1/6] perf: Add new type PERF_TYPE_PROBE

Message ID 20171115172339.1791161-3-songliubraving@fb.com
State Not Applicable, archived
Delegated to: David Miller
Headers show
Series enable creating [k,u]probe with perf_event_open | expand

Commit Message

Song Liu Nov. 15, 2017, 5:23 p.m. UTC
A new perf type PERF_TYPE_PROBE is added to allow creating [k,u]probe
with perf_event_open. These [k,u]probe are associated with the file
decriptor created by perf_event_open, thus are easy to clean when
the file descriptor is destroyed.

Struct probe_desc and two flags, is_uprobe and is_return, are added
to describe the probe being created with perf_event_open.

Note: We use type __u64 for pointer probe_desc instead of __aligned_u64.
The reason here is to avoid changing the size of struct perf_event_attr,
and breaking new-kernel-old-utility scenario. To avoid alignment problem
with the pointer, we will (in the following patches) copy probe_desc to
__aligned_u64 before using it as pointer.

Signed-off-by: Song Liu <songliubraving@fb.com>
Reviewed-by: Yonghong Song <yhs@fb.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/perf_event.h | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

Comments

Peter Zijlstra Nov. 23, 2017, 10:02 a.m. UTC | #1
On Wed, Nov 15, 2017 at 09:23:33AM -0800, Song Liu wrote:

> Note: We use type __u64 for pointer probe_desc instead of __aligned_u64.
> The reason here is to avoid changing the size of struct perf_event_attr,
> and breaking new-kernel-old-utility scenario. To avoid alignment problem
> with the pointer, we will (in the following patches) copy probe_desc to
> __aligned_u64 before using it as pointer.

ISTR there are only relatively few architectures where __u64 and
__aligned_u64 are not the same thing.

The comment that goes with it seems to suggest i386 has short alignment
for u64 but my compiler says differently:

        printf("%d, %d\n", sizeof(unsigned long long), __alignof__(unsigned long long));

$ gcc -m32 -o align align.c && ./align
8, 8
Peter Zijlstra Nov. 23, 2017, 10:22 a.m. UTC | #2
On Wed, Nov 15, 2017 at 09:23:33AM -0800, Song Liu wrote:
> A new perf type PERF_TYPE_PROBE is added to allow creating [k,u]probe
> with perf_event_open. These [k,u]probe are associated with the file
> decriptor created by perf_event_open, thus are easy to clean when
> the file descriptor is destroyed.
> 
> Struct probe_desc and two flags, is_uprobe and is_return, are added
> to describe the probe being created with perf_event_open.

> ---
>  include/uapi/linux/perf_event.h | 35 +++++++++++++++++++++++++++++++++--
>  1 file changed, 33 insertions(+), 2 deletions(-)
> 
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 362493a..cc42d59 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -33,6 +33,7 @@ enum perf_type_id {
>  	PERF_TYPE_HW_CACHE			= 3,
>  	PERF_TYPE_RAW				= 4,
>  	PERF_TYPE_BREAKPOINT			= 5,
> +	PERF_TYPE_PROBE				= 6,

Not required.. these fixed types are mostly legacy at this point.

>  
>  	PERF_TYPE_MAX,				/* non-ABI */
>  };
> @@ -299,6 +300,29 @@ enum perf_event_read_format {
>  #define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
>  #define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
>  
> +#define MAX_PROBE_FUNC_NAME_LEN	64
> +/*
> + * Describe a kprobe or uprobe for PERF_TYPE_PROBE.
> + * perf_event_attr.probe_desc will point to this structure. is_uprobe
> + * and is_return are used to differentiate different types of probe
> + * (k/u, probe/retprobe).
> + *
> + * The two unions should be used as follows:
> + * For uprobe: use path and offset;
> + * For kprobe: if func is empty, use addr
> + *             if func is not emtpy, use func and offset
> + */
> +struct probe_desc {
> +	union {
> +		__aligned_u64	func;
> +		__aligned_u64	path;
> +	};
> +	union {
> +		__aligned_u64	addr;
> +		__u64		offset;
> +	};
> +};
> +
>  /*
>   * Hardware event_id to monitor via a performance monitoring event:
>   *
> @@ -320,7 +344,10 @@ struct perf_event_attr {
>  	/*
>  	 * Type specific configuration information.
>  	 */
> -	__u64			config;
> +	union {
> +		__u64		config;
> +		__u64		probe_desc; /* ptr to struct probe_desc */
> +	};
>  
>  	union {
>  		__u64		sample_period;
> @@ -370,7 +397,11 @@ struct perf_event_attr {
>  				context_switch :  1, /* context switch data */
>  				write_backward :  1, /* Write ring buffer from end to beginning */
>  				namespaces     :  1, /* include namespaces data */
> -				__reserved_1   : 35;
> +
> +				/* For PERF_TYPE_PROBE */
> +				is_uprobe      :  1, /* 0: kprobe, 1: uprobe */
> +				is_return      :  1, /* 0: probe, 1: retprobe */
> +				__reserved_1   : 33;
>  
>  	union {
>  		__u32		wakeup_events;	  /* wakeup every n events */


So I hate this... there's so much that doesn't make sense.. not in the
least that __align_u64 fixation. Who cares about that?

Why does probe_desc exist? You could have overloaded config{1,2}
further, just like the breakpoint crap did.

And the extra flags seem misplaced too, why are they not config?

You could have simply registered two PMU types:

  perf_pmu_register(&perf_uprobe, "uprobe", -1);
  perf_pmu_register(&perf_kprobe, "kprobe", -1);

Where perf_{u,k}probe differ only in the init function.

Then for uprobe you use config1 as pointer to a path string and config2
as offset. And for kprobe you use config1 as function string pointer and
config2 as either address or offset.

This leaves you config in both cases to stuff further modifiers, like
for example the is_return thing for kprobes.
Alexei Starovoitov Nov. 24, 2017, 6:31 a.m. UTC | #3
On 11/23/17 2:02 AM, Peter Zijlstra wrote:
> On Wed, Nov 15, 2017 at 09:23:33AM -0800, Song Liu wrote:
>
>> Note: We use type __u64 for pointer probe_desc instead of __aligned_u64.
>> The reason here is to avoid changing the size of struct perf_event_attr,
>> and breaking new-kernel-old-utility scenario. To avoid alignment problem
>> with the pointer, we will (in the following patches) copy probe_desc to
>> __aligned_u64 before using it as pointer.
>
> ISTR there are only relatively few architectures where __u64 and
> __aligned_u64 are not the same thing.
>
> The comment that goes with it seems to suggest i386 has short alignment
> for u64 but my compiler says differently:
>
>         printf("%d, %d\n", sizeof(unsigned long long), __alignof__(unsigned long long));
>
> $ gcc -m32 -o align align.c && ./align
> 8, 8

unfortunately 32-bit is more screwed than it seems:

$ cat align.c
#include <stdio.h>

struct S {
   unsigned long long a;
} s;

struct U {
   unsigned long long a;
} u;

int main()
{
         printf("%d, %d\n", sizeof(unsigned long long),
                __alignof__(unsigned long long));
         printf("%d, %d\n", sizeof(s), __alignof__(s));
         printf("%d, %d\n", sizeof(u), __alignof__(u));
}
$ gcc -m32 align.c
$ ./a.out
8, 8
8, 4
8, 4

so we have to use __aligned_u64 in uapi.

Otherwise, yes, we could have used config1 and config2 to pass pointers
to the kernel, but since they're defined as __u64 already we cannot
change them and have to do this ugly dance around 'config' field.
If you prefer we can do the same around 'config1', but it's not
any prettier.
We considered adding __aligned_u64 to the end of
'struct perf_event_attr', but it's a waste for most users, so reusing
the space of 'config' field like this seems the least evil.
Peter Zijlstra Nov. 24, 2017, 8:28 a.m. UTC | #4
On Thu, Nov 23, 2017 at 10:31:29PM -0800, Alexei Starovoitov wrote:
> unfortunately 32-bit is more screwed than it seems:
> 
> $ cat align.c
> #include <stdio.h>
> 
> struct S {
>   unsigned long long a;
> } s;
> 
> struct U {
>   unsigned long long a;
> } u;
> 
> int main()
> {
>         printf("%d, %d\n", sizeof(unsigned long long),
>                __alignof__(unsigned long long));
>         printf("%d, %d\n", sizeof(s), __alignof__(s));
>         printf("%d, %d\n", sizeof(u), __alignof__(u));
> }
> $ gcc -m32 align.c
> $ ./a.out
> 8, 8
> 8, 4
> 8, 4

*blink* how is that even correct? I understood the spec to say the
alignment of composite types should be the max alignment of any of its
member types (otherwise it cannot guarantee the alignment of its
members).

> so we have to use __aligned_u64 in uapi.

Ideally yes, but effectively it most often doesn't matter.

> Otherwise, yes, we could have used config1 and config2 to pass pointers
> to the kernel, but since they're defined as __u64 already we cannot
> change them and have to do this ugly dance around 'config' field.

I don't understand the reasoning why you cannot use them. Even if they
are not naturally aligned on x86_32, why would it matter?

x86_32 needs two loads in any case, but there is no concurrency, so
split loads is not a problem. Add to that that 'intptr_t' on ILP32
is in fact only a single u32 and thus the other u32 will always be 0.

So yes, alignment is screwy, but I really don't see who cares and why it
would matter in practise.

Please explain.
Alexei Starovoitov Nov. 26, 2017, 1:59 a.m. UTC | #5
On 11/24/17 12:28 AM, Peter Zijlstra wrote:
> On Thu, Nov 23, 2017 at 10:31:29PM -0800, Alexei Starovoitov wrote:
>> unfortunately 32-bit is more screwed than it seems:
>>
>> $ cat align.c
>> #include <stdio.h>
>>
>> struct S {
>>   unsigned long long a;
>> } s;
>>
>> struct U {
>>   unsigned long long a;
>> } u;
>>
>> int main()
>> {
>>         printf("%d, %d\n", sizeof(unsigned long long),
>>                __alignof__(unsigned long long));
>>         printf("%d, %d\n", sizeof(s), __alignof__(s));
>>         printf("%d, %d\n", sizeof(u), __alignof__(u));
>> }
>> $ gcc -m32 align.c
>> $ ./a.out
>> 8, 8
>> 8, 4
>> 8, 4
>
> *blink* how is that even correct? I understood the spec to say the
> alignment of composite types should be the max alignment of any of its
> member types (otherwise it cannot guarantee the alignment of its
> members).
>
>> so we have to use __aligned_u64 in uapi.
>
> Ideally yes, but effectively it most often doesn't matter.
>
>> Otherwise, yes, we could have used config1 and config2 to pass pointers
>> to the kernel, but since they're defined as __u64 already we cannot
>> change them and have to do this ugly dance around 'config' field.
>
> I don't understand the reasoning why you cannot use them. Even if they
> are not naturally aligned on x86_32, why would it matter?
>
> x86_32 needs two loads in any case, but there is no concurrency, so
> split loads is not a problem. Add to that that 'intptr_t' on ILP32
> is in fact only a single u32 and thus the other u32 will always be 0.
>
> So yes, alignment is screwy, but I really don't see who cares and why it
> would matter in practise.

If we were poking into 'struct perf_event_attr __user *uptr'
directly like get|put_user(.., &uptr->config)
then 32-bit user space with 4-byte aligned u64s would cause
64-bit kernel to trap on archs like sparc.
But in this case you're right. We can use config[12] as-is, since these
u64 fields are passing the value one way only (into the kernel) and
we do full perf_copy_attr() first and all further accesses are from
copied structure and u64_to_user_ptr(event->attr.config) will be fine.

Do you mind we do
union {
  __u64 file_path;
  __u64 func_name;
  __u64 config;
};
and similar with config1 ?
Or prefer that we use 'config/config1' to store string+offset there?
I think config/config1 is cleaner than config1/config2
Peter Zijlstra Nov. 27, 2017, 7:58 a.m. UTC | #6
On Sat, Nov 25, 2017 at 05:59:54PM -0800, Alexei Starovoitov wrote:

> If we were poking into 'struct perf_event_attr __user *uptr'
> directly like get|put_user(.., &uptr->config)
> then 32-bit user space with 4-byte aligned u64s would cause
> 64-bit kernel to trap on archs like sparc.

But surely archs that have hardware alignment requirements have __u64 ==
__aligned_u64 ?

It's just that the structure layout can change between archs that have
__u64 != __aligned_u64 and __u64 == __aligned_u64.

But I would argue an architecture that has hardware alignment
requirements and has an unaligned __u64 is just plain broken.

> But in this case you're right. We can use config[12] as-is, since these
> u64 fields are passing the value one way only (into the kernel) and
> we do full perf_copy_attr() first and all further accesses are from
> copied structure and u64_to_user_ptr(event->attr.config) will be fine.

Right. Also note that there are no holes in perf_event_attr, if the
structure itself is allocated aligned the individual fields will be
aligned.

> Do you mind we do
> union {
>  __u64 file_path;
>  __u64 func_name;
>  __u64 config;
> };
> and similar with config1 ?

> Or prefer that we use 'config/config1' to store string+offset there?
> I think config/config1 is cleaner than config1/config2

I would prefer you use config1/config2 for this and leave config itself
for modifiers (like the retprobe thing). It also better lines up with
the BP stuff.

A little something like so perhaps:

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 362493a2f950..b6e76512f757 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -380,10 +380,14 @@ struct perf_event_attr {
 	__u32			bp_type;
 	union {
 		__u64		bp_addr;
+		__u64		uprobe_path;
+		__u64		kprobe_func;
 		__u64		config1; /* extension of config */
 	};
 	union {
 		__u64		bp_len;
+		__u64		kprobe_addr;
+		__u64		probe_offset;
 		__u64		config2; /* extension of config1 */
 	};
 	__u64	branch_sample_type; /* enum perf_branch_sample_type */
Song Liu Nov. 30, 2017, 1:43 a.m. UTC | #7
> On Nov 23, 2017, at 2:22 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Wed, Nov 15, 2017 at 09:23:33AM -0800, Song Liu wrote:
>> A new perf type PERF_TYPE_PROBE is added to allow creating [k,u]probe
>> with perf_event_open. These [k,u]probe are associated with the file
>> decriptor created by perf_event_open, thus are easy to clean when
>> the file descriptor is destroyed.
>> 
>> Struct probe_desc and two flags, is_uprobe and is_return, are added
>> to describe the probe being created with perf_event_open.
> 
>> ---
>> include/uapi/linux/perf_event.h | 35 +++++++++++++++++++++++++++++++++--
>> 1 file changed, 33 insertions(+), 2 deletions(-)
>> 
>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index 362493a..cc42d59 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -33,6 +33,7 @@ enum perf_type_id {
>> 	PERF_TYPE_HW_CACHE			= 3,
>> 	PERF_TYPE_RAW				= 4,
>> 	PERF_TYPE_BREAKPOINT			= 5,
>> +	PERF_TYPE_PROBE				= 6,
> 
> Not required.. these fixed types are mostly legacy at this point.

Dear Peter,

Thanks a lot for your feedback. I have incorporated them in the next version
(sending soon). 

I added two fixed types (PERF_TYPE_KPROBE and PERF_TYPE_UPROBE) in the new 
version. I know that perf doesn't need them any more. But currently bcc still 
relies on these fixed types to use the probes/tracepoints. 

Thanks,
Song
Peter Zijlstra Nov. 30, 2017, 1:37 p.m. UTC | #8
On Thu, Nov 30, 2017 at 01:43:06AM +0000, Song Liu wrote:
> I added two fixed types (PERF_TYPE_KPROBE and PERF_TYPE_UPROBE) in the new 
> version. I know that perf doesn't need them any more. But currently bcc still 
> relies on these fixed types to use the probes/tracepoints. 

Yeah, sorry, that's just not going to fly. Fix bcc.
diff mbox series

Patch

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 362493a..cc42d59 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -33,6 +33,7 @@  enum perf_type_id {
 	PERF_TYPE_HW_CACHE			= 3,
 	PERF_TYPE_RAW				= 4,
 	PERF_TYPE_BREAKPOINT			= 5,
+	PERF_TYPE_PROBE				= 6,
 
 	PERF_TYPE_MAX,				/* non-ABI */
 };
@@ -299,6 +300,29 @@  enum perf_event_read_format {
 #define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
 #define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
 
+#define MAX_PROBE_FUNC_NAME_LEN	64
+/*
+ * Describe a kprobe or uprobe for PERF_TYPE_PROBE.
+ * perf_event_attr.probe_desc will point to this structure. is_uprobe
+ * and is_return are used to differentiate different types of probe
+ * (k/u, probe/retprobe).
+ *
+ * The two unions should be used as follows:
+ * For uprobe: use path and offset;
+ * For kprobe: if func is empty, use addr
+ *             if func is not emtpy, use func and offset
+ */
+struct probe_desc {
+	union {
+		__aligned_u64	func;
+		__aligned_u64	path;
+	};
+	union {
+		__aligned_u64	addr;
+		__u64		offset;
+	};
+};
+
 /*
  * Hardware event_id to monitor via a performance monitoring event:
  *
@@ -320,7 +344,10 @@  struct perf_event_attr {
 	/*
 	 * Type specific configuration information.
 	 */
-	__u64			config;
+	union {
+		__u64		config;
+		__u64		probe_desc; /* ptr to struct probe_desc */
+	};
 
 	union {
 		__u64		sample_period;
@@ -370,7 +397,11 @@  struct perf_event_attr {
 				context_switch :  1, /* context switch data */
 				write_backward :  1, /* Write ring buffer from end to beginning */
 				namespaces     :  1, /* include namespaces data */
-				__reserved_1   : 35;
+
+				/* For PERF_TYPE_PROBE */
+				is_uprobe      :  1, /* 0: kprobe, 1: uprobe */
+				is_return      :  1, /* 0: probe, 1: retprobe */
+				__reserved_1   : 33;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */