um: Try to avoid kmalloc in signal handling

Message ID 20190104100517.9809-1-anton.ivanov@cambridgegreys.com
State New
Headers show
Series
  • um: Try to avoid kmalloc in signal handling
Related show

Commit Message

Anton Ivanov Jan. 4, 2019, 10:05 a.m.
From: Anton Ivanov <anton.ivanov@cambridgegreys.com>

Signal handling (which maps to interrupt handling in UML) needs
to pass current registers to the relevant handlers and was
allocating a structure for that using kmalloc. It is possible
to avoid this kmalloc by using a small "signal register stack".
A depth of 4 suffices for normal use. If it is exceeded
further sets of registers are allocated as before via kmalloc.

The end result is >10% performance increase in networking
as measured by iperf and >5% across the board.

Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
---
 arch/um/include/shared/os.h |  4 ++++
 arch/um/kernel/signal.c     | 31 +++++++++++++++++++++++++++++++
 arch/um/os-Linux/signal.c   | 16 ++++------------
 3 files changed, 39 insertions(+), 12 deletions(-)

Comments

Richard Weinberger Jan. 4, 2019, 10:13 a.m. | #1
Am Freitag, 4. Januar 2019, 11:05:17 CET schrieb anton.ivanov@cambridgegreys.com:
> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> 
> Signal handling (which maps to interrupt handling in UML) needs
> to pass current registers to the relevant handlers and was
> allocating a structure for that using kmalloc. It is possible
> to avoid this kmalloc by using a small "signal register stack".
> A depth of 4 suffices for normal use. If it is exceeded
> further sets of registers are allocated as before via kmalloc.
> 
> The end result is >10% performance increase in networking
> as measured by iperf and >5% across the board.
> 
> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> ---
>  arch/um/include/shared/os.h |  4 ++++
>  arch/um/kernel/signal.c     | 31 +++++++++++++++++++++++++++++++
>  arch/um/os-Linux/signal.c   | 16 ++++------------
>  3 files changed, 39 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
> index ebf23012a59b..4ec1bc63213b 100644
> --- a/arch/um/include/shared/os.h
> +++ b/arch/um/include/shared/os.h
> @@ -319,4 +319,8 @@ extern unsigned long os_get_top_address(void);
>  
>  long syscall(long number, ...);
>  
> +/* signal.c */
> +extern struct uml_pt_regs *get_save_register_state(void);
> +extern void release_save_register_state(struct uml_pt_regs *r);
> +
>  #endif
> diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
> index 57acbd67d85d..558ac7e4df97 100644
> --- a/arch/um/kernel/signal.c
> +++ b/arch/um/kernel/signal.c
> @@ -6,11 +6,18 @@
>  #include <linux/module.h>
>  #include <linux/ptrace.h>
>  #include <linux/sched.h>
> +#include <linux/slab.h>
>  #include <asm/siginfo.h>
>  #include <asm/signal.h>
>  #include <asm/unistd.h>
>  #include <frame_kern.h>
>  #include <kern_util.h>
> +#include <os.h>
> +
> +static atomic_t reg_depth;
> +
> +static struct uml_pt_regs reg_file[4];
> +#define REG_SWITCH_TO_KMALLOC_DEPTH 3

I fear this is not correct.
Think of task switching, if you push something to reg_file[],
it is not guaranteed that the same task will run next and pop the item.
The order can change.

Thanks,
//richard
Anton Ivanov Jan. 4, 2019, 10:19 a.m. | #2
On 1/4/19 10:13 AM, Richard Weinberger wrote:
> Am Freitag, 4. Januar 2019, 11:05:17 CET schrieb anton.ivanov@cambridgegreys.com:
>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>
>> Signal handling (which maps to interrupt handling in UML) needs
>> to pass current registers to the relevant handlers and was
>> allocating a structure for that using kmalloc. It is possible
>> to avoid this kmalloc by using a small "signal register stack".
>> A depth of 4 suffices for normal use. If it is exceeded
>> further sets of registers are allocated as before via kmalloc.
>>
>> The end result is >10% performance increase in networking
>> as measured by iperf and >5% across the board.
>>
>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>> ---
>>   arch/um/include/shared/os.h |  4 ++++
>>   arch/um/kernel/signal.c     | 31 +++++++++++++++++++++++++++++++
>>   arch/um/os-Linux/signal.c   | 16 ++++------------
>>   3 files changed, 39 insertions(+), 12 deletions(-)
>>
>> diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
>> index ebf23012a59b..4ec1bc63213b 100644
>> --- a/arch/um/include/shared/os.h
>> +++ b/arch/um/include/shared/os.h
>> @@ -319,4 +319,8 @@ extern unsigned long os_get_top_address(void);
>>   
>>   long syscall(long number, ...);
>>   
>> +/* signal.c */
>> +extern struct uml_pt_regs *get_save_register_state(void);
>> +extern void release_save_register_state(struct uml_pt_regs *r);
>> +
>>   #endif
>> diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
>> index 57acbd67d85d..558ac7e4df97 100644
>> --- a/arch/um/kernel/signal.c
>> +++ b/arch/um/kernel/signal.c
>> @@ -6,11 +6,18 @@
>>   #include <linux/module.h>
>>   #include <linux/ptrace.h>
>>   #include <linux/sched.h>
>> +#include <linux/slab.h>
>>   #include <asm/siginfo.h>
>>   #include <asm/signal.h>
>>   #include <asm/unistd.h>
>>   #include <frame_kern.h>
>>   #include <kern_util.h>
>> +#include <os.h>
>> +
>> +static atomic_t reg_depth;
>> +
>> +static struct uml_pt_regs reg_file[4];
>> +#define REG_SWITCH_TO_KMALLOC_DEPTH 3
> 
> I fear this is not correct.
> Think of task switching, if you push something to reg_file[],
> it is not guaranteed that the same task will run next and pop the item.
> The order can change.

It should not occur for the signals that map onto IRQ (timer and IO) - 
there it is all kernel.

I cannot trigger that for the other ones, but it may be possible.

IMO we should just bite the bullet and go back to using the stack 
increasing the minimum stack order needed to 1 for 32 bit and to 2 for 
64 bit.

x86 if I read it correctly is still using the stack even for the new 
register sets.


> 
> Thanks,
> //richard
> 
> 
> 
> 
>
Richard Weinberger Jan. 4, 2019, 10:33 a.m. | #3
Am Freitag, 4. Januar 2019, 11:19:46 CET schrieb Anton Ivanov:
> > I fear this is not correct.
> > Think of task switching, if you push something to reg_file[],
> > it is not guaranteed that the same task will run next and pop the item.
> > The order can change.
> 
> It should not occur for the signals that map onto IRQ (timer and IO) - 
> there it is all kernel.
> 
> I cannot trigger that for the other ones, but it may be possible.

We need to be super sure about this.
 
> IMO we should just bite the bullet and go back to using the stack 
> increasing the minimum stack order needed to 1 for 32 bit and to 2 for 
> 64 bit.

Increasing the kernel stack just for this is also a huge overhead.
But I never benchmarked it myself.

> x86 if I read it correctly is still using the stack even for the new 
> register sets.

Are you sure? I don't think so. The kernel stack is small and the xregset is
huge.
Where did you see this?

Thanks,
//richard
Anton Ivanov Jan. 4, 2019, 10:49 a.m. | #4
On 1/4/19 10:33 AM, Richard Weinberger wrote:
> Am Freitag, 4. Januar 2019, 11:19:46 CET schrieb Anton Ivanov:
>>> I fear this is not correct.
>>> Think of task switching, if you push something to reg_file[],
>>> it is not guaranteed that the same task will run next and pop the item.
>>> The order can change.
>>
>> It should not occur for the signals that map onto IRQ (timer and IO) -
>> there it is all kernel.
>>
>> I cannot trigger that for the other ones, but it may be possible.
> 
> We need to be super sure about this.

I am pretty sure about the IRQ ones. That call chain goes cleanly down 
and up resulting in whatever was pushed to be popped.

I am trying to get my head around what can possibly happen in the 
various fault ones that will result in a task switch to a task which is 
half-done with handling signal and is about to return (and pop the wrong 
one).

>   
>> IMO we should just bite the bullet and go back to using the stack
>> increasing the minimum stack order needed to 1 for 32 bit and to 2 for
>> 64 bit.
> 
> Increasing the kernel stack just for this is also a huge overhead.
> But I never benchmarked it myself.
> 
>> x86 if I read it correctly is still using the stack even for the new
>> register sets.
> 
> Are you sure? I don't think so. The kernel stack is small and the xregset is
> huge.
> Where did you see this?

It looked like that when I skimmed across get_sigframe and 
fpu_alloc_mathframe in x86 signal.c in first read, but I may be wrong.

> 
> Thanks,
> //richard
> 
> 
>
Anton Ivanov Jan. 4, 2019, 11:31 a.m. | #5
On 1/4/19 10:33 AM, Richard Weinberger wrote:
> Am Freitag, 4. Januar 2019, 11:19:46 CET schrieb Anton Ivanov:
>>> I fear this is not correct.
>>> Think of task switching, if you push something to reg_file[],
>>> it is not guaranteed that the same task will run next and pop the item.
>>> The order can change.
>>
>> It should not occur for the signals that map onto IRQ (timer and IO) -
>> there it is all kernel.
>>
>> I cannot trigger that for the other ones, but it may be possible.
> 
> We need to be super sure about this.
>   
>> IMO we should just bite the bullet and go back to using the stack
>> increasing the minimum stack order needed to 1 for 32 bit and to 2 for
>> 64 bit.
> 
> Increasing the kernel stack just for this is also a huge overhead.
> But I never benchmarked it myself.

I just did. I wrote a quick alternative to put it back on the stack.

Stack is marginally ~ 1% or thereabouts faster than the data segment 
patch on network tests. That makes it 14% faster than kmalloc-ing the 
pt_regs in the signal handling path.

The difference for other stuff (disk, etc) is within statistical error.

It is indeed using more stack - the minimum stack order needed for my 
setup on 64 bit is now 2 (On AMD A12).

> 
>> x86 if I read it correctly is still using the stack even for the new
>> register sets.
> 
> Are you sure? I don't think so. The kernel stack is small and the xregset is
> huge.
> Where did you see this?
> 
> Thanks,
> //richard
> 
> 
>

Patch

diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index ebf23012a59b..4ec1bc63213b 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -319,4 +319,8 @@  extern unsigned long os_get_top_address(void);
 
 long syscall(long number, ...);
 
+/* signal.c */
+extern struct uml_pt_regs *get_save_register_state(void);
+extern void release_save_register_state(struct uml_pt_regs *r);
+
 #endif
diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index 57acbd67d85d..558ac7e4df97 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -6,11 +6,18 @@ 
 #include <linux/module.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <asm/siginfo.h>
 #include <asm/signal.h>
 #include <asm/unistd.h>
 #include <frame_kern.h>
 #include <kern_util.h>
+#include <os.h>
+
+static atomic_t reg_depth;
+
+static struct uml_pt_regs reg_file[4];
+#define REG_SWITCH_TO_KMALLOC_DEPTH 3
 
 EXPORT_SYMBOL(block_signals);
 EXPORT_SYMBOL(unblock_signals);
@@ -111,3 +118,27 @@  void do_signal(struct pt_regs *regs)
 	if (!handled_sig)
 		restore_saved_sigmask();
 }
+
+struct uml_pt_regs *get_save_register_state(void)
+{
+	struct uml_pt_regs *r;
+	int depth = atomic_inc_return(&reg_depth);
+
+	if (depth > REG_SWITCH_TO_KMALLOC_DEPTH) {
+		WARN(1, "Exceeded uml save register space, should not occur");
+		r = kmalloc(sizeof(struct uml_pt_regs), GFP_ATOMIC);
+		if (!r)
+			panic("Cannot allocate extra save register space");
+	} else {
+		r = &reg_file[depth - 1];
+	}
+	return r;
+}
+
+void release_save_register_state(struct uml_pt_regs *r)
+{
+	if (atomic_dec_return(&reg_depth) > REG_SWITCH_TO_KMALLOC_DEPTH)
+		kfree(r);
+}
+
+
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index bf0acb8aad8b..81765abe1c15 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -31,13 +31,9 @@  void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = {
 
 static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
 {
-	struct uml_pt_regs *r;
+	struct uml_pt_regs *r = get_save_register_state();
 	int save_errno = errno;
 
-	r = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC);
-	if (!r)
-		panic("out of memory");
-
 	r->is_user = 0;
 	if (sig == SIGSEGV) {
 		/* For segfaults, we want the data from the sigcontext. */
@@ -53,7 +49,7 @@  static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
 
 	errno = save_errno;
 
-	free(r);
+	release_save_register_state(r);
 }
 
 /*
@@ -91,17 +87,13 @@  void sig_handler(int sig, struct siginfo *si, mcontext_t *mc)
 
 static void timer_real_alarm_handler(mcontext_t *mc)
 {
-	struct uml_pt_regs *regs;
-
-	regs = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC);
-	if (!regs)
-		panic("out of memory");
+	struct uml_pt_regs *regs = get_save_register_state();
 
 	if (mc != NULL)
 		get_regs_from_mc(regs, mc);
 	timer_handler(SIGALRM, NULL, regs);
 
-	free(regs);
+	release_save_register_state(regs);
 }
 
 void timer_alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc)