diff mbox series

um: Try to avoid kmalloc in signal handling

Message ID 20190103160917.10217-1-anton.ivanov@cambridgegreys.com
State Superseded
Headers show
Series um: Try to avoid kmalloc in signal handling | expand

Commit Message

Anton Ivanov Jan. 3, 2019, 4:09 p.m. UTC
From: Anton Ivanov <anton.ivanov@cambridgegreys.com>

Signal handling (which maps to interrupt handling in UML) needs
to pass current registers to the relevant handlers and was
allocating a structure for that using kmalloc. It is possible
to avoid this kmalloc by using a small "signal register stack".
A depth of 4 suffices 99%+ of the time. If it is exceeded
further sets of registers are allocated as before via kmalloc.

The end result is >10% performance increase in networking
as measured by iperf.

Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
---
 arch/um/os-Linux/signal.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

Comments

Anton Ivanov Jan. 3, 2019, 5:13 p.m. UTC | #1
On 1/3/19 4:09 PM, anton.ivanov@cambridgegreys.com wrote:
> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> 
> Signal handling (which maps to interrupt handling in UML) needs
> to pass current registers to the relevant handlers and was
> allocating a structure for that using kmalloc. It is possible
> to avoid this kmalloc by using a small "signal register stack".
> A depth of 4 suffices 99%+ of the time. If it is exceeded
> further sets of registers are allocated as before via kmalloc.
> 
> The end result is >10% performance increase in networking
> as measured by iperf.
> 
> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> ---
>   arch/um/os-Linux/signal.c | 20 ++++++++++++++++----
>   1 file changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
> index bf0acb8aad8b..21536581ab57 100644
> --- a/arch/um/os-Linux/signal.c
> +++ b/arch/um/os-Linux/signal.c
> @@ -29,14 +29,24 @@ void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = {
>   	[SIGALRM]	= timer_handler
>   };
>   
> +static int reg_depth;
> +
> +static struct uml_pt_regs reg_file[4];
> +#define REG_SWITCH_TO_KMALLOC_DEPTH 3
> +
>   static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
>   {
>   	struct uml_pt_regs *r;
>   	int save_errno = errno;
>   
> -	r = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC);
> -	if (!r)
> -		panic("out of memory");
> +	if (reg_depth >= REG_SWITCH_TO_KMALLOC_DEPTH) {
> +		r = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC);
> +		if (!r)
> +			panic("out of memory");
> +	} else {
> +		r = &reg_file[reg_depth];
> +	}
> +	reg_depth++;
>   
>   	r->is_user = 0;
>   	if (sig == SIGSEGV) {
> @@ -53,7 +63,9 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
>   
>   	errno = save_errno;
>   
> -	free(r);
> +	reg_depth--;
> +	if (reg_depth >= REG_SWITCH_TO_KMALLOC_DEPTH)
> +		free(r);
>   }
>   
>   /*
> 

This redresses the performance loss introduced in the IRQ handling path 
by b6024b21fec8367ef961a771cc9dde31f1831965. Mea culpa, I should have 
spotted that one at the time.

In my tests the depth has never managed to exceed 2. So a threshold for 
switching to kmalloc set to 3 is possibly an overkill and can be reduced 
to 2.
Anton Ivanov Jan. 3, 2019, 9:29 p.m. UTC | #2
On 03/01/2019 17:13, Anton Ivanov wrote:
> 
> 
> On 1/3/19 4:09 PM, anton.ivanov@cambridgegreys.com wrote:
>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>
>> Signal handling (which maps to interrupt handling in UML) needs
>> to pass current registers to the relevant handlers and was
>> allocating a structure for that using kmalloc. It is possible
>> to avoid this kmalloc by using a small "signal register stack".
>> A depth of 4 suffices 99%+ of the time. If it is exceeded
>> further sets of registers are allocated as before via kmalloc.
>>
>> The end result is >10% performance increase in networking
>> as measured by iperf.
>>
>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>> ---
>>   arch/um/os-Linux/signal.c | 20 ++++++++++++++++----
>>   1 file changed, 16 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
>> index bf0acb8aad8b..21536581ab57 100644
>> --- a/arch/um/os-Linux/signal.c
>> +++ b/arch/um/os-Linux/signal.c
>> @@ -29,14 +29,24 @@ void (*sig_info[NSIG])(int, struct siginfo *, 
>> struct uml_pt_regs *) = {
>>       [SIGALRM]    = timer_handler
>>   };
>> +static int reg_depth;
>> +
>> +static struct uml_pt_regs reg_file[4];
>> +#define REG_SWITCH_TO_KMALLOC_DEPTH 3
>> +
>>   static void sig_handler_common(int sig, struct siginfo *si, 
>> mcontext_t *mc)
>>   {
>>       struct uml_pt_regs *r;
>>       int save_errno = errno;
>> -    r = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC);
>> -    if (!r)
>> -        panic("out of memory");
>> +    if (reg_depth >= REG_SWITCH_TO_KMALLOC_DEPTH) {
>> +        r = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC);
>> +        if (!r)
>> +            panic("out of memory");
>> +    } else {
>> +        r = &reg_file[reg_depth];
>> +    }
>> +    reg_depth++;
>>       r->is_user = 0;
>>       if (sig == SIGSEGV) {
>> @@ -53,7 +63,9 @@ static void sig_handler_common(int sig, struct 
>> siginfo *si, mcontext_t *mc)
>>       errno = save_errno;
>> -    free(r);
>> +    reg_depth--;
>> +    if (reg_depth >= REG_SWITCH_TO_KMALLOC_DEPTH)
>> +        free(r);
>>   }
>>   /*
>>
> 
> This redresses the performance loss introduced in the IRQ handling path 
> by b6024b21fec8367ef961a771cc9dde31f1831965. Mea culpa, I should have 
> spotted that one at the time.
> 
> In my tests the depth has never managed to exceed 2. So a threshold for 
> switching to kmalloc set to 3 is possibly an overkill and can be reduced 
> to 2.
> 

Second thought - this should not use kmalloc at all if it can. It should 
either go on the stack the way it used to do before 
b6024b21fec8367ef961a771cc9dde31f1831965 or it should have a sufficient 
"register stack" to push registers for all use cases.

Kmalloc/ATOMIC can fail just because it feels like it which means that 
this can throw a panic() under very heavy IO.

I am surprised this was not triggered before. The vector IO regularly 
has "holes" in the vector from failed atomic allocations in the same path.
Richard Weinberger Jan. 3, 2019, 9:42 p.m. UTC | #3
Am Donnerstag, 3. Januar 2019, 22:29:17 CET schrieb Anton Ivanov:
> Second thought - this should not use kmalloc at all if it can. It should 
> either go on the stack the way it used to do before 
> b6024b21fec8367ef961a771cc9dde31f1831965 or it should have a sufficient 
> "register stack" to push registers for all use cases.
> 
> Kmalloc/ATOMIC can fail just because it feels like it which means that 
> this can throw a panic() under very heavy IO.

Not good. :-(
We cannot go back to stack based allocation. The stack is too small for
AVX register sets.
Maybe we can preallocate it.
(And check what arch/x86/ does)

Thanks,
//richard
Anton Ivanov Jan. 4, 2019, 7:07 a.m. UTC | #4
On 03/01/2019 21:42, Richard Weinberger wrote:
> Am Donnerstag, 3. Januar 2019, 22:29:17 CET schrieb Anton Ivanov:
>> Second thought - this should not use kmalloc at all if it can. It should
>> either go on the stack the way it used to do before
>> b6024b21fec8367ef961a771cc9dde31f1831965 or it should have a sufficient
>> "register stack" to push registers for all use cases.
>>
>> Kmalloc/ATOMIC can fail just because it feels like it which means that
>> this can throw a panic() under very heavy IO.
> 
> Not good. :-(
> We cannot go back to stack based allocation. The stack is too small for
> AVX register sets.
> Maybe we can preallocate it.

I can update the patch so it uses a preallocated buffer instead of a 
data segment. IMO for this particular case there is little benefit in 
kmallocing that at init. It is not something you can get away without so 
data segment is probably OK.

Unless I am mistaken, the worst case scenario for depth there is 4 (as 
in the patch at the moment).

IO + timer + fault + FPU.

We can still leave the "overfill kmalloc" and add a WARN() on that and 
see if it gets triggered in the wild.

> (And check what arch/x86/ does)

I will have a look and update the patch.

> 
> Thanks,
> //richard
> 
> 
>
Anton Ivanov Jan. 4, 2019, 7:48 a.m. UTC | #5
On 03/01/2019 21:42, Richard Weinberger wrote:
> Am Donnerstag, 3. Januar 2019, 22:29:17 CET schrieb Anton Ivanov:
>> Second thought - this should not use kmalloc at all if it can. It should
>> either go on the stack the way it used to do before
>> b6024b21fec8367ef961a771cc9dde31f1831965 or it should have a sufficient
>> "register stack" to push registers for all use cases.
>>
>> Kmalloc/ATOMIC can fail just because it feels like it which means that
>> this can throw a panic() under very heavy IO.
> 
> Not good. :-(
> We cannot go back to stack based allocation. The stack is too small for
> AVX register sets.

If I read it correctly it still uses the stack even for the new register 
sets.

> Maybe we can preallocate it.
> (And check what arch/x86/ does)
> 
> Thanks,
> //richard
> 
> 
>
Anton Ivanov Jan. 4, 2019, 3:50 p.m. UTC | #6
On 1/3/19 9:42 PM, Richard Weinberger wrote:
> Am Donnerstag, 3. Januar 2019, 22:29:17 CET schrieb Anton Ivanov:
>> Second thought - this should not use kmalloc at all if it can. It should
>> either go on the stack the way it used to do before
>> b6024b21fec8367ef961a771cc9dde31f1831965 or it should have a sufficient
>> "register stack" to push registers for all use cases.
>>
>> Kmalloc/ATOMIC can fail just because it feels like it which means that
>> this can throw a panic() under very heavy IO.
> 
> Not good. :-(
> We cannot go back to stack based allocation. The stack is too small for
> AVX register sets.
> Maybe we can preallocate it.

I will write a pre-allocation alternative, but that may take some time 
because it needs to do additional allocations at some "safe" point in 
time like system idle, not in the IO/signal path where they can fail.

In the meantime we have to make sure that we are not sitting on a latent 
crash in the IO/Fault path so I have sent out a patch which removes 
kmalloc/ATOMIC and puts things back on the stack.

There are a few other options which are worth investigating like sizing 
the storage dynamically based on CPU and/or storing fp regs only if they 
have been touched, not every time (afaik MIPS was doing something like 
that). Once again, this may take a while to figure out if it is possible 
and some time to make it happen after that. That may be worth doing 
anyway for performance reasons. If memory serves me right, storing FPU 
state is a high cost operation. Not doing it for cases where it is not 
needed should yield some performance gains.

> (And check what arch/x86/ does)
> 
> Thanks,
> //richard
> 
> 
>
Anton Ivanov Jan. 7, 2019, 10:07 a.m. UTC | #7
On 1/4/19 3:50 PM, Anton Ivanov wrote:
> 
> 
> On 1/3/19 9:42 PM, Richard Weinberger wrote:
>> Am Donnerstag, 3. Januar 2019, 22:29:17 CET schrieb Anton Ivanov:
>>> Second thought - this should not use kmalloc at all if it can. It should
>>> either go on the stack the way it used to do before
>>> b6024b21fec8367ef961a771cc9dde31f1831965 or it should have a sufficient
>>> "register stack" to push registers for all use cases.
>>>
>>> Kmalloc/ATOMIC can fail just because it feels like it which means that
>>> this can throw a panic() under very heavy IO.
>>
>> Not good. :-(
>> We cannot go back to stack based allocation. The stack is too small for
>> AVX register sets.
>> Maybe we can preallocate it.
> 
> I will write a pre-allocation alternative, but that may take some time 
> because it needs to do additional allocations at some "safe" point in 
> time like system idle, not in the IO/signal path where they can fail.
> 
> In the meantime we have to make sure that we are not sitting on a latent 
> crash in the IO/Fault path so I have sent out a patch which removes 
> kmalloc/ATOMIC and puts things back on the stack.
> 
> There are a few other options which are worth investigating like sizing 
> the storage dynamically based on CPU and/or storing fp regs only if they 
> have been touched, not every time (afaik MIPS was doing something like 
> that). Once again, this may take a while to figure out if it is possible 
> and some time to make it happen after that. That may be worth doing 
> anyway for performance reasons. If memory serves me right, storing FPU 
> state is a high cost operation. Not doing it for cases where it is not 
> needed should yield some performance gains.
> 
>> (And check what arch/x86/ does)
>>
>> Thanks,
>> //richard
>>
>>
>>
> 

After digging both x86 and uml sources, I think that the long term goal 
should be to get rid of regs->fp altogether.

x86 does not have it and keeps fp state separate under most 
circumstances. As a result it has no issues pushing pt_reqs on the stack 
- it is small.

It is not easy though - references to it are all over the place and I am 
having quite an interesting time figuring out where we can replace them 
with the fpu field in the thread_info structure and where we have to do 
something else :)
diff mbox series

Patch

diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index bf0acb8aad8b..21536581ab57 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -29,14 +29,24 @@  void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = {
 	[SIGALRM]	= timer_handler
 };
 
+static int reg_depth;
+
+static struct uml_pt_regs reg_file[4];
+#define REG_SWITCH_TO_KMALLOC_DEPTH 3
+
 static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
 {
 	struct uml_pt_regs *r;
 	int save_errno = errno;
 
-	r = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC);
-	if (!r)
-		panic("out of memory");
+	if (reg_depth >= REG_SWITCH_TO_KMALLOC_DEPTH) {
+		r = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC);
+		if (!r)
+			panic("out of memory");
+	} else {
+		r = &reg_file[reg_depth];
+	}
+	reg_depth++;
 
 	r->is_user = 0;
 	if (sig == SIGSEGV) {
@@ -53,7 +63,9 @@  static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
 
 	errno = save_errno;
 
-	free(r);
+	reg_depth--;
+	if (reg_depth >= REG_SWITCH_TO_KMALLOC_DEPTH)
+		free(r);
 }
 
 /*