Patchwork [08/40] tracing: remove syscall bitmaps in preparation for compat support

login
register
mail settings
Submitter Ian Munsie
Date June 23, 2010, 10:02 a.m.
Message ID <1277287401-28571-9-git-send-email-imunsie@au1.ibm.com>
Download mbox | patch
Permalink /patch/56634/
State Not Applicable
Headers show

Comments

Ian Munsie - June 23, 2010, 10:02 a.m.
From: Jason Baron <jbaron@redhat.com>

In preparation for compat syscall tracing support, let's store the enabled
syscalls, with the struct syscall_metadata itself. That way we don't duplicate
enabled information when the compat table points to an entry in the regular
syscall table. Also, allows us to remove the bitmap data structures completely.

Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
---
 include/linux/syscalls.h      |    8 +++++++
 include/trace/syscall.h       |    4 +++
 kernel/trace/trace_syscalls.c |   42 +++++++++++++++++++---------------------
 3 files changed, 32 insertions(+), 22 deletions(-)
Steven Rostedt - June 23, 2010, 3:16 p.m.
On Wed, 2010-06-23 at 20:02 +1000, Ian Munsie wrote:
> From: Jason Baron <jbaron@redhat.com>
> 
> In preparation for compat syscall tracing support, let's store the enabled
> syscalls, with the struct syscall_metadata itself. That way we don't duplicate
> enabled information when the compat table points to an entry in the regular
> syscall table. Also, allows us to remove the bitmap data structures completely.
> 
> Signed-off-by: Jason Baron <jbaron@redhat.com>
> Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
> ---
>  include/linux/syscalls.h      |    8 +++++++
>  include/trace/syscall.h       |    4 +++
>  kernel/trace/trace_syscalls.c |   42 +++++++++++++++++++---------------------
>  3 files changed, 32 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 86f082b..755d05b 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -163,6 +163,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>  		.nb_args 	= nb,				\
>  		.types		= types_##sname,		\
>  		.args		= args_##sname,			\
> +		.ftrace_enter	= 0,				\
> +		.ftrace_exit	= 0,				\
> +		.perf_enter	= 0,				\
> +		.perf_exit	= 0,				\

I really hate this change!

You just removed a nice compressed bitmap (1 bit per syscall) to add 4
bytes per syscall. On my box I have 308 syscalls being traced. That was
308 bits per bitmask = 39 bytes * 2 = 78 * 2 (perf and ftrace) = 156.

Now we have 8 bytes per syscall (enter and exit), which is 1232 bytes.

Thus this change added 1076 bytes.

This may not seem as much, but the change is not worth 1K. Can't we just
add another bitmask or something for the compat case?

I also hate the moving of ftrace and perf internal data to an external
interface.

-- Steve

>  		.enter_event	= &event_enter_##sname,		\
>  		.exit_event	= &event_exit_##sname,		\
>  		.enter_fields	= LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
> @@ -179,6 +183,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>  		.name 		= "sys_"#sname,			\
>  		.syscall_nr	= -1,	/* Filled in at boot */	\
>  		.nb_args 	= 0,				\
> +		.ftrace_enter	= 0,				\
> +		.ftrace_exit	= 0,				\
> +		.perf_enter	= 0,				\
> +		.perf_exit	= 0,				\
>  		.enter_event	= &event_enter__##sname,	\
>  		.exit_event	= &event_exit__##sname,		\
>  		.enter_fields	= LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \
Jason Baron - June 23, 2010, 7:14 p.m.
On Wed, Jun 23, 2010 at 11:16:44AM -0400, Steven Rostedt wrote:
> On Wed, 2010-06-23 at 20:02 +1000, Ian Munsie wrote:
> > From: Jason Baron <jbaron@redhat.com>
> > 
> > In preparation for compat syscall tracing support, let's store the enabled
> > syscalls, with the struct syscall_metadata itself. That way we don't duplicate
> > enabled information when the compat table points to an entry in the regular
> > syscall table. Also, allows us to remove the bitmap data structures completely.
> > 
> > Signed-off-by: Jason Baron <jbaron@redhat.com>
> > Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
> > ---
> >  include/linux/syscalls.h      |    8 +++++++
> >  include/trace/syscall.h       |    4 +++
> >  kernel/trace/trace_syscalls.c |   42 +++++++++++++++++++---------------------
> >  3 files changed, 32 insertions(+), 22 deletions(-)
> > 
> > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > index 86f082b..755d05b 100644
> > --- a/include/linux/syscalls.h
> > +++ b/include/linux/syscalls.h
> > @@ -163,6 +163,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
> >  		.nb_args 	= nb,				\
> >  		.types		= types_##sname,		\
> >  		.args		= args_##sname,			\
> > +		.ftrace_enter	= 0,				\
> > +		.ftrace_exit	= 0,				\
> > +		.perf_enter	= 0,				\
> > +		.perf_exit	= 0,				\
> 
> I really hate this change!
> 
> You just removed a nice compressed bitmap (1 bit per syscall) to add 4
> bytes per syscall. On my box I have 308 syscalls being traced. That was
> 308 bits per bitmask = 39 bytes * 2 = 78 * 2 (perf and ftrace) = 156.
> 
> Now we have 8 bytes per syscall (enter and exit), which is 1232 bytes.
> 
> Thus this change added 1076 bytes.
> 
> This may not seem as much, but the change is not worth 1K. Can't we just
> add another bitmask or something for the compat case?
> 
> I also hate the moving of ftrace and perf internal data to an external
> interface.
> 
> -- Steve
> 

I made this change (I also wrote the original bitmap), b/c compat
syscalls can share "regular" syscalls. That is the compat syscall table
points to syscalls from non-compat mode. (looking at ia32 on x86 it
looks like at least half).

Thus, if we continue along the bitmap path, we would have to introduce
another 4 bitmaps for compat. 2 for enter and exit and 2 for perf and
ftrace. Thus, using your math above: 39 bytes * 8 = 312 bytes. So
approximately 1 byte per system call.

Instead, if we store this data in the syscall metadata, we actually only
need 4 bits per syscall. Now, the above implementation uses 4 chars,
where we really only need 1 char (or really 4 bits, which we could
eventually store in the last bit of the four existing pointers
assuming they are 2 byte aligned for no increased storage space at all).
But even assuming we use 1 byte per system call we are going to have in
the worst case the above 312 bytes + (1 byte * # of non-shared compat
syscalls). So, yes we might need a little more storage in this scheme.
Another consideration too, is obviously the alignment of
syscall_metadata, since the extra 1 byte, might be more...

However, we don't have to compute the location of the bits in the
compat syscall map each time a tracing syscall is enabled/disabled. This
would be more expensive, especially if we don't store the compat syscall
number with each syscall meta data structure (which you have proposed
dropping). So with compat syscalls, we are setting two bit locations
with each enable/disable instead of 1 with this new scheme.

Also, I think the more important reason to store these bits in the
syscall meta data structure is simplicity. Not all arches start their tables
counting from 0 (requiring a constant shift factor), and obviously we
waste bits for non-implemented syscalls. I don't want to have to deal
with these arch specific implementation issues, if I don't need to.

thanks,

-Jason
Jason Baron - June 23, 2010, 7:34 p.m.
On Wed, Jun 23, 2010 at 03:14:54PM -0400, Jason Baron wrote:
> On Wed, Jun 23, 2010 at 11:16:44AM -0400, Steven Rostedt wrote:
> > On Wed, 2010-06-23 at 20:02 +1000, Ian Munsie wrote:
> > > From: Jason Baron <jbaron@redhat.com>
> > > 
> > > In preparation for compat syscall tracing support, let's store the enabled
> > > syscalls, with the struct syscall_metadata itself. That way we don't duplicate
> > > enabled information when the compat table points to an entry in the regular
> > > syscall table. Also, allows us to remove the bitmap data structures completely.
> > > 
> > > Signed-off-by: Jason Baron <jbaron@redhat.com>
> > > Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
> > > ---
> > >  include/linux/syscalls.h      |    8 +++++++
> > >  include/trace/syscall.h       |    4 +++
> > >  kernel/trace/trace_syscalls.c |   42 +++++++++++++++++++---------------------
> > >  3 files changed, 32 insertions(+), 22 deletions(-)
> > > 
> > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > > index 86f082b..755d05b 100644
> > > --- a/include/linux/syscalls.h
> > > +++ b/include/linux/syscalls.h
> > > @@ -163,6 +163,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
> > >  		.nb_args 	= nb,				\
> > >  		.types		= types_##sname,		\
> > >  		.args		= args_##sname,			\
> > > +		.ftrace_enter	= 0,				\
> > > +		.ftrace_exit	= 0,				\
> > > +		.perf_enter	= 0,				\
> > > +		.perf_exit	= 0,				\
> > 
> > I really hate this change!
> > 
> > You just removed a nice compressed bitmap (1 bit per syscall) to add 4
> > bytes per syscall. On my box I have 308 syscalls being traced. That was
> > 308 bits per bitmask = 39 bytes * 2 = 78 * 2 (perf and ftrace) = 156.
> > 
> > Now we have 8 bytes per syscall (enter and exit), which is 1232 bytes.
> > 
> > Thus this change added 1076 bytes.
> > 
> > This may not seem as much, but the change is not worth 1K. Can't we just
> > add another bitmask or something for the compat case?
> > 
> > I also hate the moving of ftrace and perf internal data to an external
> > interface.
> > 
> > -- Steve
> > 
> 
> I made this change (I also wrote the original bitmap), b/c compat
> syscalls can share "regular" syscalls. That is the compat syscall table
> points to syscalls from non-compat mode. (looking at ia32 on x86 it
> looks like at least half).
> 
> Thus, if we continue along the bitmap path, we would have to introduce
> another 4 bitmaps for compat. 2 for enter and exit and 2 for perf and
> ftrace. Thus, using your math above: 39 bytes * 8 = 312 bytes. So
> approximately 1 byte per system call.
> 
> Instead, if we store this data in the syscall metadata, we actually only
> need 4 bits per syscall. Now, the above implementation uses 4 chars,
> where we really only need 1 char (or really 4 bits, which we could
> eventually store in the last bit of the four existing pointers
> assuming they are 2 byte aligned for no increased storage space at all).
> But even assuming we use 1 byte per system call we are going to have in
> the worst case the above 312 bytes + (1 byte * # of non-shared compat
> syscalls). So, yes we might need a little more storage in this scheme.
> Another consideration too, is obviously the alignment of
> syscall_metadata, since the extra 1 byte, might be more...
> 
> However, we don't have to compute the location of the bits in the
> compat syscall map each time a tracing syscall is enabled/disabled. This
> would be more expensive, especially if we don't store the compat syscall
> number with each syscall meta data structure (which you have proposed
> dropping). So with compat syscalls, we are setting two bit locations
> with each enable/disable instead of 1 with this new scheme.
> 
> Also, I think the more important reason to store these bits in the
> syscall meta data structure is simplicity. Not all arches start their tables
> counting from 0 (requiring a constant shift factor), and obviously we
> waste bits for non-implemented syscalls. I don't want to have to deal
> with these arch specific implementation issues, if I don't need to.
> 
> thanks,
> 
> -Jason
> 

Actually, looking at this further, what we probably want to do is change
the "int nb_args" field, which is already in syscall_metadata into a bit
field. nb_args I think can be at most 6, or 3 bits, and we only need 4
bits for storing the enabled/disabled data, so we could even make it a
char. Thus, actually saving space with this patch :) (at least as far as
the syscall_metadata field is concerned).

thanks,

-Jason
Steven Rostedt - June 23, 2010, 7:45 p.m.
On Wed, 2010-06-23 at 15:34 -0400, Jason Baron wrote:

> Actually, looking at this further, what we probably want to do is change
> the "int nb_args" field, which is already in syscall_metadata into a bit
> field. nb_args I think can be at most 6, or 3 bits, and we only need 4
> bits for storing the enabled/disabled data, so we could even make it a
> char. Thus, actually saving space with this patch :) (at least as far as
> the syscall_metadata field is concerned).

Yeah, I'm fine with turning that into a count/flags field.

-- Steve

Patch

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 86f082b..755d05b 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -163,6 +163,10 @@  extern struct trace_event_functions exit_syscall_print_funcs;
 		.nb_args 	= nb,				\
 		.types		= types_##sname,		\
 		.args		= args_##sname,			\
+		.ftrace_enter	= 0,				\
+		.ftrace_exit	= 0,				\
+		.perf_enter	= 0,				\
+		.perf_exit	= 0,				\
 		.enter_event	= &event_enter_##sname,		\
 		.exit_event	= &event_exit_##sname,		\
 		.enter_fields	= LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
@@ -179,6 +183,10 @@  extern struct trace_event_functions exit_syscall_print_funcs;
 		.name 		= "sys_"#sname,			\
 		.syscall_nr	= -1,	/* Filled in at boot */	\
 		.nb_args 	= 0,				\
+		.ftrace_enter	= 0,				\
+		.ftrace_exit	= 0,				\
+		.perf_enter	= 0,				\
+		.perf_exit	= 0,				\
 		.enter_event	= &event_enter__##sname,	\
 		.exit_event	= &event_exit__##sname,		\
 		.enter_fields	= LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 257e089..75f3dce 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -27,6 +27,10 @@  struct syscall_metadata {
 	const char	**args;
 	struct list_head enter_fields;
 	struct list_head exit_fields;
+	char		ftrace_enter;
+	char		ftrace_exit;
+	char		perf_enter;
+	char		perf_exit;
 
 	struct ftrace_event_call *enter_event;
 	struct ftrace_event_call *exit_event;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 18f27bb..f5ddb9c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -12,8 +12,6 @@ 
 static DEFINE_MUTEX(syscall_trace_lock);
 static int sys_refcount_enter;
 static int sys_refcount_exit;
-static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
-static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
 
 static int syscall_enter_register(struct ftrace_event_call *event,
 				 enum trace_reg type);
@@ -299,13 +297,14 @@  void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	syscall_nr = syscall_get_nr(current, regs);
 	if (syscall_nr < 0)
 		return;
-	if (!test_bit(syscall_nr, enabled_enter_syscalls))
-		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
 	if (!sys_data)
 		return;
 
+	if (!sys_data->ftrace_enter)
+		return;
+
 	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
 	event = trace_current_buffer_lock_reserve(&buffer,
@@ -333,13 +332,14 @@  void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	syscall_nr = syscall_get_nr(current, regs);
 	if (syscall_nr < 0)
 		return;
-	if (!test_bit(syscall_nr, enabled_exit_syscalls))
-		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
 	if (!sys_data)
 		return;
 
+	if (!sys_data->ftrace_exit)
+		return;
+
 	event = trace_current_buffer_lock_reserve(&buffer,
 			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
 	if (!event)
@@ -366,7 +366,7 @@  int reg_event_syscall_enter(struct ftrace_event_call *call)
 	if (!sys_refcount_enter)
 		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
 	if (!ret) {
-		set_bit(num, enabled_enter_syscalls);
+		((struct syscall_metadata *)call->data)->ftrace_enter = 1;
 		sys_refcount_enter++;
 	}
 	mutex_unlock(&syscall_trace_lock);
@@ -382,7 +382,7 @@  void unreg_event_syscall_enter(struct ftrace_event_call *call)
 		return;
 	mutex_lock(&syscall_trace_lock);
 	sys_refcount_enter--;
-	clear_bit(num, enabled_enter_syscalls);
+	((struct syscall_metadata *)call->data)->ftrace_enter = 0;
 	if (!sys_refcount_enter)
 		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
 	mutex_unlock(&syscall_trace_lock);
@@ -400,7 +400,7 @@  int reg_event_syscall_exit(struct ftrace_event_call *call)
 	if (!sys_refcount_exit)
 		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
 	if (!ret) {
-		set_bit(num, enabled_exit_syscalls);
+		((struct syscall_metadata *)call->data)->ftrace_exit = 1;
 		sys_refcount_exit++;
 	}
 	mutex_unlock(&syscall_trace_lock);
@@ -416,7 +416,7 @@  void unreg_event_syscall_exit(struct ftrace_event_call *call)
 		return;
 	mutex_lock(&syscall_trace_lock);
 	sys_refcount_exit--;
-	clear_bit(num, enabled_exit_syscalls);
+	((struct syscall_metadata *)call->data)->ftrace_exit = 0;
 	if (!sys_refcount_exit)
 		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
 	mutex_unlock(&syscall_trace_lock);
@@ -492,8 +492,6 @@  core_initcall(init_ftrace_syscalls);
 
 #ifdef CONFIG_PERF_EVENTS
 
-static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
-static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 static int sys_perf_refcount_enter;
 static int sys_perf_refcount_exit;
 
@@ -507,13 +505,13 @@  static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	int size;
 
 	syscall_nr = syscall_get_nr(current, regs);
-	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
-		return;
-
 	sys_data = syscall_nr_to_meta(syscall_nr);
 	if (!sys_data)
 		return;
 
+	if (!sys_data->perf_enter)
+		return;
+
 	/* get the size after alignment with the u32 buffer size field */
 	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
@@ -550,7 +548,7 @@  int perf_sysenter_enable(struct ftrace_event_call *call)
 		pr_info("event trace: Could not activate"
 				"syscall entry trace point");
 	} else {
-		set_bit(num, enabled_perf_enter_syscalls);
+		((struct syscall_metadata *)call->data)->perf_enter = 1;
 		sys_perf_refcount_enter++;
 	}
 	mutex_unlock(&syscall_trace_lock);
@@ -565,7 +563,7 @@  void perf_sysenter_disable(struct ftrace_event_call *call)
 
 	mutex_lock(&syscall_trace_lock);
 	sys_perf_refcount_enter--;
-	clear_bit(num, enabled_perf_enter_syscalls);
+	((struct syscall_metadata *)call->data)->perf_enter = 0;
 	if (!sys_perf_refcount_enter)
 		unregister_trace_sys_enter(perf_syscall_enter, NULL);
 	mutex_unlock(&syscall_trace_lock);
@@ -581,13 +579,13 @@  static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	int size;
 
 	syscall_nr = syscall_get_nr(current, regs);
-	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
-		return;
-
 	sys_data = syscall_nr_to_meta(syscall_nr);
 	if (!sys_data)
 		return;
 
+	if (!sys_data->perf_exit)
+		return;
+
 	/* We can probably do that at build time */
 	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
@@ -626,7 +624,7 @@  int perf_sysexit_enable(struct ftrace_event_call *call)
 		pr_info("event trace: Could not activate"
 				"syscall exit trace point");
 	} else {
-		set_bit(num, enabled_perf_exit_syscalls);
+		((struct syscall_metadata *)call->data)->perf_exit = 1;
 		sys_perf_refcount_exit++;
 	}
 	mutex_unlock(&syscall_trace_lock);
@@ -641,7 +639,7 @@  void perf_sysexit_disable(struct ftrace_event_call *call)
 
 	mutex_lock(&syscall_trace_lock);
 	sys_perf_refcount_exit--;
-	clear_bit(num, enabled_perf_exit_syscalls);
+	((struct syscall_metadata *)call->data)->perf_exit = 0;
 	if (!sys_perf_refcount_exit)
 		unregister_trace_sys_exit(perf_syscall_exit, NULL);
 	mutex_unlock(&syscall_trace_lock);