
[3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode

Message ID 20230404134224.137038-4-ypodemsk@redhat.com
State New
Series send tlb_remove_table_smp_sync IPI only to necessary CPUs

Commit Message

Yair Podemsky April 4, 2023, 1:42 p.m. UTC
The tlb_remove_table_smp_sync IPI is used to ensure that an outdated
page-table page is no longer being accessed and can be cleared.
This is guaranteed once all CPUs have left the lockless GUP code section:
if they re-enter the page-table walk, the pointers will already reference
the new pages.
Therefore the IPI is only needed for CPUs currently in kernel mode.
By preventing the IPI from being sent to CPUs not in kernel mode,
latencies are reduced.

Race conditions considerations:
The context state check is vulnerable to race conditions between the
moment the context state is read to when the IPI is sent (or not).

Here are these scenarios.
case 1:
CPU-A                                             CPU-B

                                                  state == CONTEXT_KERNEL
int state = atomic_read(&ct->state);
                                                  Kernel-exit:
                                                  state == CONTEXT_USER
if (state & CT_STATE_MASK == CONTEXT_KERNEL)

In this case, the IPI will be sent to CPU-B even though it is no longer
in the kernel. The only consequence is an unnecessary IPI being handled
by CPU-B, which costs it some latency; without this patch this was the
case every time.

case 2:
CPU-A                                             CPU-B

modify pagetables
tlb_flush (memory barrier)
                                                  state == CONTEXT_USER
int state = atomic_read(&ct->state);
                                                  Kernel-enter:
                                                  state == CONTEXT_KERNEL
                                                  READ(pagetable values)
if (state & CT_STATE_MASK == CONTEXT_USER)

In this case, the IPI will not be sent to CPU-B even though it has
returned to the kernel and even read the pagetable.
However, since CPU-B started its pagetable walk after the modification,
it is reading the new, safe values.

The only case in which the IPI is truly necessary is when CPU-B entered
the lockless GUP code section before the pagetable modification and has
not yet left it, in which case it is still in the kernel.

Signed-off-by: Yair Podemsky <ypodemsk@redhat.com>
---
 mm/mmu_gather.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)
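
For illustration, a minimal sketch of the predicate with the race analysis
above folded into a comment (the parenthesization of the mask test is
written defensively here and the comment text is indicative, not taken
verbatim from the patch):

/*
 * Sketch only: a CPU observed in CONTEXT_KERNEL may still exit to user
 * space before the IPI arrives (case 1 above: a spurious but harmless
 * IPI), and a CPU observed in CONTEXT_USER may re-enter the kernel
 * afterwards (case 2 above: it then walks the already-updated page
 * tables, so no IPI is needed). The only CPUs that must be interrupted
 * are those that entered the lockless GUP section before the pagetable
 * modification and have not yet left it.
 */
static bool cpu_in_kernel(int cpu, void *info)
{
	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
	int state = atomic_read(&ct->state);

	return (state & CT_STATE_MASK) == CONTEXT_KERNEL;
}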

Comments

David Hildenbrand April 4, 2023, 2:03 p.m. UTC | #1
On 04.04.23 15:42, Yair Podemsky wrote:
> The tlb_remove_table_smp_sync IPI is used to ensure the outdated tlb page
> is not currently being accessed and can be cleared.
> This occurs once all CPUs have left the lockless gup code section.
> If they reenter the page table walk, the pointers will be to the new
> pages.
> Therefore the IPI is only needed for CPUs in kernel mode.
> By preventing the IPI from being sent to CPUs not in kernel mode,
> Latencies are reduced.
> 
> Race conditions considerations:
> The context state check is vulnerable to race conditions between the
> moment the context state is read to when the IPI is sent (or not).
> 
> Here are these scenarios.
> case 1:
> CPU-A                                             CPU-B
> 
>                                                    state == CONTEXT_KERNEL
> int state = atomic_read(&ct->state);
>                                                    Kernel-exit:
>                                                    state == CONTEXT_USER
> if (state & CT_STATE_MASK == CONTEXT_KERNEL)
> 
> In this case, the IPI will be sent to CPU-B despite it is no longer in
> the kernel. The consequence of which would be an unnecessary IPI being
> handled by CPU-B, causing a reduction in latency.
> This would have been the case every time without this patch.
> 
> case 2:
> CPU-A                                             CPU-B
> 
> modify pagetables
> tlb_flush (memory barrier)
>                                                    state == CONTEXT_USER
> int state = atomic_read(&ct->state);
>                                                    Kernel-enter:
>                                                    state == CONTEXT_KERNEL
>                                                    READ(pagetable values)
> if (state & CT_STATE_MASK == CONTEXT_USER)
> 
> In this case, the IPI will not be sent to CPU-B despite it returning to
> the kernel and even reading the pagetable.
> However since this CPU-B has entered the pagetable after the
> modification it is reading the new, safe values.
> 
> The only case when this IPI is truly necessary is when CPU-B has entered
> the lockless gup code section before the pagetable modifications and
> has yet to exit them, in which case it is still in the kernel.
> 
> Signed-off-by: Yair Podemsky <ypodemsk@redhat.com>
> ---
>   mm/mmu_gather.c | 19 +++++++++++++++++--
>   1 file changed, 17 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> index 5ea9be6fb87c..731d955e152d 100644
> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -9,6 +9,7 @@
>   #include <linux/smp.h>
>   #include <linux/swap.h>
>   #include <linux/rmap.h>
> +#include <linux/context_tracking_state.h>
>   
>   #include <asm/pgalloc.h>
>   #include <asm/tlb.h>
> @@ -191,6 +192,20 @@ static void tlb_remove_table_smp_sync(void *arg)
>   	/* Simply deliver the interrupt */
>   }
>   
> +
> +#ifdef CONFIG_CONTEXT_TRACKING
> +static bool cpu_in_kernel(int cpu, void *info)
> +{
> +	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
> +	int state = atomic_read(&ct->state);
> +	/* will return true only for cpus in kernel space */
> +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> +}
> +#define CONTEXT_PREDICATE cpu_in_kernel
> +#else
> +#define CONTEXT_PREDICATE NULL
> +#endif /* CONFIG_CONTEXT_TRACKING */
> +
>   #ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
>   #define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
>   #else
> @@ -206,8 +221,8 @@ void tlb_remove_table_sync_one(struct mm_struct *mm)
>   	 * It is however sufficient for software page-table walkers that rely on
>   	 * IRQ disabling.
>   	 */
> -	on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
> -			NULL, true);
> +	on_each_cpu_cond_mask(CONTEXT_PREDICATE, tlb_remove_table_smp_sync,
> +			NULL, true, REMOVE_TABLE_IPI_MASK);
>   }
>   
>   static void tlb_remove_table_rcu(struct rcu_head *head)


Maybe a bit cleaner by avoiding CONTEXT_PREDICATE, still not completely nice
(an empty dummy function "cpu_maybe_in_kernel" might be cleanest but would
be slightly slower for !CONFIG_CONTEXT_TRACKING):

#ifdef CONFIG_CONTEXT_TRACKING
static bool cpu_in_kernel(int cpu, void *info)
{
	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
	int state = atomic_read(&ct->state);
	/* will return true only for cpus in kernel space */
	return state & CT_STATE_MASK == CONTEXT_KERNEL;
}
#endif /* CONFIG_CONTEXT_TRACKING */


...
#ifdef CONFIG_CONTEXT_TRACKING
	on_each_cpu_cond_mask(cpu_in_kernel, tlb_remove_table_smp_sync,
			      NULL, true, REMOVE_TABLE_IPI_MASK);
#else /* CONFIG_CONTEXT_TRACKING */
	on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
			 NULL, true);
#endif /* CONFIG_CONTEXT_TRACKING */
Peter Zijlstra April 4, 2023, 3:12 p.m. UTC | #2
On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> The tlb_remove_table_smp_sync IPI is used to ensure the outdated tlb page
> is not currently being accessed and can be cleared.
> This occurs once all CPUs have left the lockless gup code section.
> If they reenter the page table walk, the pointers will be to the new
> pages.
> Therefore the IPI is only needed for CPUs in kernel mode.
> By preventing the IPI from being sent to CPUs not in kernel mode,
> Latencies are reduced.
> 
> Race conditions considerations:
> The context state check is vulnerable to race conditions between the
> moment the context state is read to when the IPI is sent (or not).
> 
> Here are these scenarios.
> case 1:
> CPU-A                                             CPU-B
> 
>                                                   state == CONTEXT_KERNEL
> int state = atomic_read(&ct->state);
>                                                   Kernel-exit:
>                                                   state == CONTEXT_USER
> if (state & CT_STATE_MASK == CONTEXT_KERNEL)
> 
> In this case, the IPI will be sent to CPU-B despite it is no longer in
> the kernel. The consequence of which would be an unnecessary IPI being
> handled by CPU-B, causing a reduction in latency.
> This would have been the case every time without this patch.
> 
> case 2:
> CPU-A                                             CPU-B
> 
> modify pagetables
> tlb_flush (memory barrier)
>                                                   state == CONTEXT_USER
> int state = atomic_read(&ct->state);
>                                                   Kernel-enter:
>                                                   state == CONTEXT_KERNEL
>                                                   READ(pagetable values)
> if (state & CT_STATE_MASK == CONTEXT_USER)
> 
> In this case, the IPI will not be sent to CPU-B despite it returning to
> the kernel and even reading the pagetable.
> However since this CPU-B has entered the pagetable after the
> modification it is reading the new, safe values.
> 
> The only case when this IPI is truly necessary is when CPU-B has entered
> the lockless gup code section before the pagetable modifications and
> has yet to exit them, in which case it is still in the kernel.
> 
> Signed-off-by: Yair Podemsky <ypodemsk@redhat.com>
> ---
>  mm/mmu_gather.c | 19 +++++++++++++++++--
>  1 file changed, 17 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> index 5ea9be6fb87c..731d955e152d 100644
> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -9,6 +9,7 @@
>  #include <linux/smp.h>
>  #include <linux/swap.h>
>  #include <linux/rmap.h>
> +#include <linux/context_tracking_state.h>
>  
>  #include <asm/pgalloc.h>
>  #include <asm/tlb.h>
> @@ -191,6 +192,20 @@ static void tlb_remove_table_smp_sync(void *arg)
>  	/* Simply deliver the interrupt */
>  }
>  
> +
> +#ifdef CONFIG_CONTEXT_TRACKING
> +static bool cpu_in_kernel(int cpu, void *info)
> +{
> +	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
> +	int state = atomic_read(&ct->state);
> +	/* will return true only for cpus in kernel space */
> +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> +}
> +#define CONTEXT_PREDICATE cpu_in_kernel
> +#else
> +#define CONTEXT_PREDICATE NULL
> +#endif /* CONFIG_CONTEXT_TRACKING */
> +
>  #ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
>  #define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
>  #else
> @@ -206,8 +221,8 @@ void tlb_remove_table_sync_one(struct mm_struct *mm)
>  	 * It is however sufficient for software page-table walkers that rely on
>  	 * IRQ disabling.
>  	 */
> -	on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
> -			NULL, true);
> +	on_each_cpu_cond_mask(CONTEXT_PREDICATE, tlb_remove_table_smp_sync,
> +			NULL, true, REMOVE_TABLE_IPI_MASK);
>  }

I think this is correct; but... I would like much of the changelog
included in a comment above cpu_in_kernel(). I'm sure someone will try
and read this code and wonder about those race conditions.

Of crucial importance is the fact that the page-table modification comes
before the tlbi.

Also, do we really not already have this helper function somewhere, it
seems like something obvious to already have, Frederic?
Peter Zijlstra April 4, 2023, 4 p.m. UTC | #3
On Tue, Apr 04, 2023 at 05:12:17PM +0200, Peter Zijlstra wrote:
> > case 2:
> > CPU-A                                             CPU-B
> > 
> > modify pagetables
> > tlb_flush (memory barrier)
> >                                                   state == CONTEXT_USER
> > int state = atomic_read(&ct->state);
> >                                                   Kernel-enter:
> >                                                   state == CONTEXT_KERNEL
> >                                                   READ(pagetable values)
> > if (state & CT_STATE_MASK == CONTEXT_USER)
> > 


Hmm, hold up; what about memory ordering? We need store-load ordering
between the page-table write and the context tracking load, and
store-load ordering between the context tracking update and the software
page-table walker's loads.

Now, iirc page-table modification is done under pte_lock (or
page_table_lock) and that only provides a RELEASE barrier on this end,
which is insufficient to order against a later load.

Is there anything else?

On the state tracking side, we have ct_state_inc() which is
atomic_add_return() which should provide full barrier and is sufficient.
Frederic Weisbecker April 5, 2023, 10:43 a.m. UTC | #4
On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> @@ -191,6 +192,20 @@ static void tlb_remove_table_smp_sync(void *arg)
>  	/* Simply deliver the interrupt */
>  }
>  
> +
> +#ifdef CONFIG_CONTEXT_TRACKING
> +static bool cpu_in_kernel(int cpu, void *info)
> +{
> +	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);

Like Peter said, an smp_mb() is required here before the read (unless there is
already one between the page table modification and that ct->state read?).

So that you have this pairing:


           WRITE page_table                  WRITE ct->state
	   smp_mb()                          smp_mb() // implied by atomic_fetch_or
           READ ct->state                    READ page_table

> +	int state = atomic_read(&ct->state);
> +	/* will return true only for cpus in kernel space */
> +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> +}

Also note that this doesn't strictly prevent userspace from being interrupted.
You may well observe the CPU in kernel but it may receive the IPI later after
switching to userspace.

We could arrange to avoid that by marking ct->state with a pending work bit
to flush upon user entry/exit but that's a bit more overhead so I first need to
know about your expectations here, ie: can you tolerate such an occasional
interruption or not?

Thanks.
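
For illustration, a sketch of what the barrier Frederic describes could look
like on the reader side (hypothetical; whether an existing barrier between
the pagetable modification and this read already provides the ordering is
exactly the open question):

static bool cpu_in_kernel(int cpu, void *info)
{
	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
	int state;

	/*
	 * Pairs with the full barrier implied by the atomic ct->state
	 * update on kernel entry/exit: order the earlier pagetable
	 * write against this ct->state read.
	 */
	smp_mb();
	state = atomic_read(&ct->state);
	return (state & CT_STATE_MASK) == CONTEXT_KERNEL;
}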
Frederic Weisbecker April 5, 2023, 11:10 a.m. UTC | #5
On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
> On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > +	int state = atomic_read(&ct->state);
> > +	/* will return true only for cpus in kernel space */
> > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > +}
> 
> Also note that this doesn't stricly prevent userspace from being interrupted.
> You may well observe the CPU in kernel but it may receive the IPI later after
> switching to userspace.
> 
> We could arrange for avoiding that with marking ct->state with a pending work bit
> to flush upon user entry/exit but that's a bit more overhead so I first need to
> know about your expectations here, ie: can you tolerate such an occasional
> interruption or not?

Bah, actually what can we do to prevent that racy IPI? Not much I fear...
Peter Zijlstra April 5, 2023, 11:41 a.m. UTC | #6
On Wed, Apr 05, 2023 at 01:10:07PM +0200, Frederic Weisbecker wrote:
> On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
> > On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > > +	int state = atomic_read(&ct->state);
> > > +	/* will return true only for cpus in kernel space */
> > > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > > +}
> > 
> > Also note that this doesn't stricly prevent userspace from being interrupted.
> > You may well observe the CPU in kernel but it may receive the IPI later after
> > switching to userspace.
> > 
> > We could arrange for avoiding that with marking ct->state with a pending work bit
> > to flush upon user entry/exit but that's a bit more overhead so I first need to
> > know about your expectations here, ie: can you tolerate such an occasional
> > interruption or not?
> 
> Bah, actually what can we do to prevent from that racy IPI? Not much I fear...

Yeah, so I don't think that's actually a problem. The premise is that
*IFF* NOHZ_FULL stays in userspace, then it will never observe the IPI.

If it violates this by doing syscalls or other kernel entries; it gets
to keep the pieces.
David Hildenbrand April 5, 2023, noon UTC | #7
On 05.04.23 13:41, Peter Zijlstra wrote:
> On Wed, Apr 05, 2023 at 01:10:07PM +0200, Frederic Weisbecker wrote:
>> On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
>>> On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
>>>> +	int state = atomic_read(&ct->state);
>>>> +	/* will return true only for cpus in kernel space */
>>>> +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
>>>> +}
>>>
>>> Also note that this doesn't stricly prevent userspace from being interrupted.
>>> You may well observe the CPU in kernel but it may receive the IPI later after
>>> switching to userspace.
>>>
>>> We could arrange for avoiding that with marking ct->state with a pending work bit
>>> to flush upon user entry/exit but that's a bit more overhead so I first need to
>>> know about your expectations here, ie: can you tolerate such an occasional
>>> interruption or not?
>>
>> Bah, actually what can we do to prevent from that racy IPI? Not much I fear...
> 
> Yeah, so I don't think that's actually a problem. The premise is that
> *IFF* NOHZ_FULL stays in userspace, then it will never observe the IPI.
> 
> If it violates this by doing syscalls or other kernel entries; it gets
> to keep the pieces.

Yair is currently on vacation, so I'm replying on his behalf.

Indeed, RT userspace is supposed to not call into the kernel, that's the 
premise.
Frederic Weisbecker April 5, 2023, 12:05 p.m. UTC | #8
On Wed, Apr 05, 2023 at 01:41:48PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 05, 2023 at 01:10:07PM +0200, Frederic Weisbecker wrote:
> > On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
> > > On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > > > +	int state = atomic_read(&ct->state);
> > > > +	/* will return true only for cpus in kernel space */
> > > > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > > > +}
> > > 
> > > Also note that this doesn't stricly prevent userspace from being interrupted.
> > > You may well observe the CPU in kernel but it may receive the IPI later after
> > > switching to userspace.
> > > 
> > > We could arrange for avoiding that with marking ct->state with a pending work bit
> > > to flush upon user entry/exit but that's a bit more overhead so I first need to
> > > know about your expectations here, ie: can you tolerate such an occasional
> > > interruption or not?
> > 
> > Bah, actually what can we do to prevent from that racy IPI? Not much I fear...
> 
> Yeah, so I don't think that's actually a problem. The premise is that
> *IFF* NOHZ_FULL stays in userspace, then it will never observe the IPI.
> 
> If it violates this by doing syscalls or other kernel entries; it gets
> to keep the pieces.

Ok so how about the following (only build tested)?

Two things:

1) It has the advantage to check context tracking _after_ the llist_add(), so
   it really can't be misused ordering-wise.

2) The IPI callback is always enqueued and then executed upon return
   from userland. The ordering makes sure it will either IPI or execute
   upon return to userspace.

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 4a4d56f77180..dc4b56da1747 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -137,10 +137,23 @@ static __always_inline int ct_state(void)
 	return ret;
 }
 
+static __always_inline int ct_state_cpu(int cpu)
+{
+	struct context_tracking *ct;
+
+	if (!context_tracking_enabled())
+		return CONTEXT_DISABLED;
+
+	ct = per_cpu_ptr(&context_tracking, cpu);
+
+	return atomic_read(&ct->state) & CT_STATE_MASK;
+}
+
 #else
 static __always_inline bool context_tracking_enabled(void) { return false; }
 static __always_inline bool context_tracking_enabled_cpu(int cpu) { return false; }
 static __always_inline bool context_tracking_enabled_this_cpu(void) { return false; }
+static inline int ct_state_cpu(int cpu) { return CONTEXT_DISABLED; }
 #endif /* CONFIG_CONTEXT_TRACKING_USER */
 
 #endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 846add8394c4..cdc7e8a59acc 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -10,6 +10,7 @@
 #include <linux/audit.h>
 #include <linux/tick.h>
 
+#include "../kernel/sched/smp.h"
 #include "common.h"
 
 #define CREATE_TRACE_POINTS
@@ -27,6 +28,10 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
 	instrumentation_begin();
 	kmsan_unpoison_entry_regs(regs);
 	trace_hardirqs_off_finish();
+
+	/* Flush delayed IPI queue on nohz_full */
+	if (context_tracking_enabled_this_cpu())
+		flush_smp_call_function_queue();
 	instrumentation_end();
 }
 
diff --git a/kernel/smp.c b/kernel/smp.c
index 06a413987a14..14b25d25ef3a 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -878,6 +878,8 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
  */
 #define SCF_WAIT	(1U << 0)
 #define SCF_RUN_LOCAL	(1U << 1)
+#define SCF_NO_USER	(1U << 2)
+
 
 static void smp_call_function_many_cond(const struct cpumask *mask,
 					smp_call_func_t func, void *info,
@@ -946,10 +948,13 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
 #endif
 			cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
 			if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
-				__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
-				nr_cpus++;
-				last_cpu = cpu;
-
+				if (!(scf_flags & SCF_NO_USER) ||
+				    !IS_ENABLED(CONFIG_GENERIC_ENTRY) ||
+				     ct_state_cpu(cpu) != CONTEXT_USER) {
+					__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
+					nr_cpus++;
+					last_cpu = cpu;
+				}
 				cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI);
 			} else {
 				cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI);
@@ -1121,6 +1126,24 @@ void __init smp_init(void)
 	smp_cpus_done(setup_max_cpus);
 }
 
+static void __on_each_cpu_cond_mask(smp_cond_func_t cond_func,
+				    smp_call_func_t func,
+				    void *info, bool wait, bool nouser,
+				    const struct cpumask *mask)
+{
+	unsigned int scf_flags = SCF_RUN_LOCAL;
+
+	if (wait)
+		scf_flags |= SCF_WAIT;
+
+	if (nouser)
+		scf_flags |= SCF_NO_USER;
+
+	preempt_disable();
+	smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
+	preempt_enable();
+}
+
 /*
  * on_each_cpu_cond(): Call a function on each processor for which
  * the supplied function cond_func returns true, optionally waiting
@@ -1146,17 +1169,18 @@ void __init smp_init(void)
 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 			   void *info, bool wait, const struct cpumask *mask)
 {
-	unsigned int scf_flags = SCF_RUN_LOCAL;
-
-	if (wait)
-		scf_flags |= SCF_WAIT;
-
-	preempt_disable();
-	smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
-	preempt_enable();
+	__on_each_cpu_cond_mask(cond_func, func, info, wait, false, mask);
 }
 EXPORT_SYMBOL(on_each_cpu_cond_mask);
 
+void on_each_cpu_cond_nouser_mask(smp_cond_func_t cond_func,
+				  smp_call_func_t func,
+				  void *info, bool wait,
+				  const struct cpumask *mask)
+{
+	__on_each_cpu_cond_mask(cond_func, func, info, wait, true, mask);
+}
+
 static void do_nothing(void *unused)
 {
 }
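
Presumably the mmu_gather side would then switch to the new helper. A
hypothetical caller-side change, not part of the sketch above, might look
like this (shown with wait=true for parity with the current call; whether
waiting for CPUs whose IPI was deferred to their next kernel entry is
acceptable is part of what gets discussed further down the thread):

/* Hypothetical: queue the sync on every CPU in the mask, but skip the
 * IPI for CPUs currently in userspace; those run the queued callback
 * from flush_smp_call_function_queue() on their next kernel entry. */
void tlb_remove_table_sync_one(struct mm_struct *mm)
{
	on_each_cpu_cond_nouser_mask(NULL, tlb_remove_table_smp_sync,
				     NULL, true, REMOVE_TABLE_IPI_MASK);
}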
Frederic Weisbecker April 5, 2023, 12:31 p.m. UTC | #9
On Wed, Apr 05, 2023 at 02:05:13PM +0200, Frederic Weisbecker wrote:
> On Wed, Apr 05, 2023 at 01:41:48PM +0200, Peter Zijlstra wrote:
> 1) It has the advantage to check context tracking _after_ the llist_add(), so
>    it really can't be misused ordering-wise.
> 
> 2) The IPI callback is always enqueued and then executed upon return
>    from userland. The ordering makes sure it will either IPI or execute
>    upon return to userspace.

*from userspace
Valentin Schneider April 5, 2023, 12:45 p.m. UTC | #10
On 05/04/23 14:05, Frederic Weisbecker wrote:
>  static void smp_call_function_many_cond(const struct cpumask *mask,
>                                       smp_call_func_t func, void *info,
> @@ -946,10 +948,13 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
>  #endif
>                       cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
>                       if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
> -				__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
> -				nr_cpus++;
> -				last_cpu = cpu;
> -
> +				if (!(scf_flags & SCF_NO_USER) ||
> +				    !IS_ENABLED(CONFIG_GENERIC_ENTRY) ||
> +				     ct_state_cpu(cpu) != CONTEXT_USER) {
> +					__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
> +					nr_cpus++;
> +					last_cpu = cpu;
> +				}

I've been hacking on something like this (CSD deferral for NOHZ-full),
and unfortunately this uses the CPU-local cfd_data storage thing, which
means any further smp_call_function() from the same CPU to the same
destination will spin on csd_lock_wait(), waiting for the target CPU to
come out of userspace and flush the queue - and we've just spent extra
effort into *not* disturbing it, so that'll take a while :(

I don't have much that is in a shareable state yet (though I'm supposed to
talk some more about it at OSPM in <2 weeks, so I'll have to get there),
but ATM I'm playing with
o a bitmask (like in [1]) for coalescable stuff such as do_sync_core() for
  x86 instruction patching
o a CSD-like queue for things that need to pass data around, using
  statically-allocated storage (so with a limit on how much it can be used) - the
  alternative being allocating a struct on sending, since you don't have a
  bound on how much crap you can queue on an undisturbed NOHZ-full CPU...

[1]: https://lore.kernel.org/all/20210929152429.067060646@infradead.org/
Marcelo Tosatti April 5, 2023, 7:43 p.m. UTC | #11
On Wed, Apr 05, 2023 at 12:43:58PM +0200, Frederic Weisbecker wrote:
> On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > @@ -191,6 +192,20 @@ static void tlb_remove_table_smp_sync(void *arg)
> >  	/* Simply deliver the interrupt */
> >  }
> >  
> > +
> > +#ifdef CONFIG_CONTEXT_TRACKING
> > +static bool cpu_in_kernel(int cpu, void *info)
> > +{
> > +	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
> 
> Like Peter said, an smp_mb() is required here before the read (unless there is
> already one between the page table modification and that ct->state read?).
> 
> So that you have this pairing:
> 
> 
>            WRITE page_table                  WRITE ct->state
> 	   smp_mb()                          smp_mb() // implied by atomic_fetch_or
>            READ ct->state                    READ page_table
> 
> > +	int state = atomic_read(&ct->state);
> > +	/* will return true only for cpus in kernel space */
> > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > +}
> 
> Also note that this doesn't stricly prevent userspace from being interrupted.
> You may well observe the CPU in kernel but it may receive the IPI later after
> switching to userspace.
> 
> We could arrange for avoiding that with marking ct->state with a pending work bit
> to flush upon user entry/exit but that's a bit more overhead so I first need to
> know about your expectations here, ie: can you tolerate such an occasional
> interruption or not?

Two points:

1) For a virtualized system, the overhead is not just that of executing the
IPI, but:

	VM-exit
	run VM-exit code in host
	handle IPI
	run VM-entry code in host
	VM-entry

2) Depends on the application and the definition of "occasional".

For certain types of applications (for example PLC software or
RAN processing), upon occurrence of an event, it is necessary to
complete a certain task in a maximum amount of time (deadline).

One way to express this requirement is with a pair of numbers,
deadline time and execution time, where:

       	* deadline time: length of time between event and deadline.
       	* execution time: length of time it takes for processing of event
                          to occur on a particular hardware platform
                          (uninterrupted).
Marcelo Tosatti April 5, 2023, 7:45 p.m. UTC | #12
On Wed, Apr 05, 2023 at 01:10:07PM +0200, Frederic Weisbecker wrote:
> On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
> > On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > > +	int state = atomic_read(&ct->state);
> > > +	/* will return true only for cpus in kernel space */
> > > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > > +}
> > 
> > Also note that this doesn't stricly prevent userspace from being interrupted.
> > You may well observe the CPU in kernel but it may receive the IPI later after
> > switching to userspace.
> > 
> > We could arrange for avoiding that with marking ct->state with a pending work bit
> > to flush upon user entry/exit but that's a bit more overhead so I first need to
> > know about your expectations here, ie: can you tolerate such an occasional
> > interruption or not?
> 
> Bah, actually what can we do to prevent from that racy IPI? Not much I fear...

Use a mechanism other than an IPI to ensure that an in-progress
__get_free_pages_fast() has finished execution.

Isn't this codepath slow-path enough that it can use
synchronize_rcu_expedited()?
Peter Zijlstra April 5, 2023, 7:52 p.m. UTC | #13
On Wed, Apr 05, 2023 at 04:45:32PM -0300, Marcelo Tosatti wrote:
> On Wed, Apr 05, 2023 at 01:10:07PM +0200, Frederic Weisbecker wrote:
> > On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
> > > On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > > > +	int state = atomic_read(&ct->state);
> > > > +	/* will return true only for cpus in kernel space */
> > > > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > > > +}
> > > 
> > > Also note that this doesn't stricly prevent userspace from being interrupted.
> > > You may well observe the CPU in kernel but it may receive the IPI later after
> > > switching to userspace.
> > > 
> > > We could arrange for avoiding that with marking ct->state with a pending work bit
> > > to flush upon user entry/exit but that's a bit more overhead so I first need to
> > > know about your expectations here, ie: can you tolerate such an occasional
> > > interruption or not?
> > 
> > Bah, actually what can we do to prevent from that racy IPI? Not much I fear...
> 
> Use a different mechanism other than an IPI to ensure in progress
> __get_free_pages_fast() has finished execution.
> 
> Isnt this codepath slow path enough that it can use
> synchronize_rcu_expedited?

To actually hit this path you're doing something really dodgy.
Peter Zijlstra April 5, 2023, 7:54 p.m. UTC | #14
On Wed, Apr 05, 2023 at 04:43:14PM -0300, Marcelo Tosatti wrote:

> Two points:
> 
> 1) For a virtualized system, the overhead is not only of executing the
> IPI but:
> 
> 	VM-exit
> 	run VM-exit code in host
> 	handle IPI
> 	run VM-entry code in host
> 	VM-entry

I thought we could do IPIs without VMexit these days? Also virt... /me
walks away.

> 2) Depends on the application and the definition of "occasional".
> 
> For certain types of applications (for example PLC software or
> RAN processing), upon occurrence of an event, it is necessary to
> complete a certain task in a maximum amount of time (deadline).

If the application is properly NOHZ_FULL and never does a kernel entry,
it will never get that IPI. If it is a pile of shit and does kernel
entries while it pretends to be NOHZ_FULL it gets to keep the pieces and
no amount of crying will get me to care.
Marcelo Tosatti April 6, 2023, 12:38 p.m. UTC | #15
On Wed, Apr 05, 2023 at 09:52:26PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 05, 2023 at 04:45:32PM -0300, Marcelo Tosatti wrote:
> > On Wed, Apr 05, 2023 at 01:10:07PM +0200, Frederic Weisbecker wrote:
> > > On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
> > > > On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > > > > +	int state = atomic_read(&ct->state);
> > > > > +	/* will return true only for cpus in kernel space */
> > > > > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > > > > +}
> > > > 
> > > > Also note that this doesn't stricly prevent userspace from being interrupted.
> > > > You may well observe the CPU in kernel but it may receive the IPI later after
> > > > switching to userspace.
> > > > 
> > > > We could arrange for avoiding that with marking ct->state with a pending work bit
> > > > to flush upon user entry/exit but that's a bit more overhead so I first need to
> > > > know about your expectations here, ie: can you tolerate such an occasional
> > > > interruption or not?
> > > 
> > > Bah, actually what can we do to prevent from that racy IPI? Not much I fear...
> > 
> > Use a different mechanism other than an IPI to ensure in progress
> > __get_free_pages_fast() has finished execution.
> > 
> > Isnt this codepath slow path enough that it can use
> > synchronize_rcu_expedited?
> 
> To actually hit this path you're doing something really dodgy.

Apparently khugepaged is using the same infrastructure:

$ grep tlb_remove_table khugepaged.c 
	tlb_remove_table_sync_one();
	tlb_remove_table_sync_one();

So just enabling khugepaged will hit that path.
Marcelo Tosatti April 6, 2023, 12:49 p.m. UTC | #16
On Wed, Apr 05, 2023 at 09:54:57PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 05, 2023 at 04:43:14PM -0300, Marcelo Tosatti wrote:
> 
> > Two points:
> > 
> > 1) For a virtualized system, the overhead is not only of executing the
> > IPI but:
> > 
> > 	VM-exit
> > 	run VM-exit code in host
> > 	handle IPI
> > 	run VM-entry code in host
> > 	VM-entry
> 
> I thought we could do IPIs without VMexit these days? 

Yes, IPIs to a vCPU (guest context). In this case we are considering
an IPI to the host pCPU (which requires a VM-exit from guest context).

> Also virt... /me walks away.
> 
> > 2) Depends on the application and the definition of "occasional".
> > 
> > For certain types of applications (for example PLC software or
> > RAN processing), upon occurrence of an event, it is necessary to
> > complete a certain task in a maximum amount of time (deadline).
> 
> If the application is properly NOHZ_FULL and never does a kernel entry,
> it will never get that IPI. If it is a pile of shit and does kernel
> entries while it pretends to be NOHZ_FULL it gets to keep the pieces and
> no amount of crying will get me to care.

I suppose it's common practice to use certain system calls in
latency-sensitive applications, for example nanosleep. Some examples:

1) cyclictest		(nanosleep)
2) PLC programs		(nanosleep)

A system call does not necessarily have to take locks, does it?

Or, even if the application does system calls but runs under a VM,
you are then requiring it to never VM-exit.

This reduces the flexibility of developing such applications.
Peter Zijlstra April 6, 2023, 1:29 p.m. UTC | #17
On Thu, Apr 06, 2023 at 09:38:50AM -0300, Marcelo Tosatti wrote:

> > To actually hit this path you're doing something really dodgy.
> 
> Apparently khugepaged is using the same infrastructure:
> 
> $ grep tlb_remove_table khugepaged.c 
> 	tlb_remove_table_sync_one();
> 	tlb_remove_table_sync_one();
> 
> So just enabling khugepaged will hit that path.

Urgh, WTF..

Let me go read that stuff :/
Peter Zijlstra April 6, 2023, 1:32 p.m. UTC | #18
On Thu, Apr 06, 2023 at 09:49:22AM -0300, Marcelo Tosatti wrote:

> > > 2) Depends on the application and the definition of "occasional".
> > > 
> > > For certain types of applications (for example PLC software or
> > > RAN processing), upon occurrence of an event, it is necessary to
> > > complete a certain task in a maximum amount of time (deadline).
> > 
> > If the application is properly NOHZ_FULL and never does a kernel entry,
> > it will never get that IPI. If it is a pile of shit and does kernel
> > entries while it pretends to be NOHZ_FULL it gets to keep the pieces and
> > no amount of crying will get me to care.
> 
> I suppose its common practice to use certain system calls in latency
> sensitive applications, for example nanosleep. Some examples:
> 
> 1) cyclictest		(nanosleep)

cyclictest is not a NOHZ_FULL application; if you think it is, you're
deluded.

> 2) PLC programs		(nanosleep)

What's a PLC? Programmable Logic Circuit?

> A system call does not necessarily have to take locks, does it ?

This all is unrelated to locks

> Or even if application does system calls, but runs under a VM,
> then you are requiring it to never VM-exit.

That seems to be a goal for performance anyway.

> This reduces the flexibility of developing such applications.

Yeah, that's the cards you're dealt, deal with it.
Peter Zijlstra April 6, 2023, 1:38 p.m. UTC | #19
On Wed, Apr 05, 2023 at 01:45:02PM +0100, Valentin Schneider wrote:
> On 05/04/23 14:05, Frederic Weisbecker wrote:
> >  static void smp_call_function_many_cond(const struct cpumask *mask,
> >                                       smp_call_func_t func, void *info,
> > @@ -946,10 +948,13 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
> >  #endif
> >                       cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
> >                       if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
> > -				__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
> > -				nr_cpus++;
> > -				last_cpu = cpu;
> > -
> > +				if (!(scf_flags & SCF_NO_USER) ||
> > +				    !IS_ENABLED(CONFIG_GENERIC_ENTRY) ||
> > +				     ct_state_cpu(cpu) != CONTEXT_USER) {
> > +					__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
> > +					nr_cpus++;
> > +					last_cpu = cpu;
> > +				}
> 
> I've been hacking on something like this (CSD deferral for NOHZ-full),
> and unfortunately this uses the CPU-local cfd_data storage thing, which
> means any further smp_call_function() from the same CPU to the same
> destination will spin on csd_lock_wait(), waiting for the target CPU to
> come out of userspace and flush the queue - and we've just spent extra
> effort into *not* disturbing it, so that'll take a while :(

I'm not sure I buy into deferring stuff.. a NOHZ_FULL cpu might 'never'
come back. Queueing data just in case it does seems wasteful.
Peter Zijlstra April 6, 2023, 2:04 p.m. UTC | #20
On Thu, Apr 06, 2023 at 03:29:28PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 09:38:50AM -0300, Marcelo Tosatti wrote:
> 
> > > To actually hit this path you're doing something really dodgy.
> > 
> > Apparently khugepaged is using the same infrastructure:
> > 
> > $ grep tlb_remove_table khugepaged.c 
> > 	tlb_remove_table_sync_one();
> > 	tlb_remove_table_sync_one();
> > 
> > So just enabling khugepaged will hit that path.
> 
> Urgh, WTF..
> 
> Let me go read that stuff :/

At the very least the one on collapse_and_free_pmd() could easily become
a call_rcu() based free.

I'm not sure I'm following what collapse_huge_page() does just yet.
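
For illustration, a rough sketch of what a call_rcu()-based free of the
detached PTE table could look like (hypothetical, not an actual posted
patch; it assumes the table is reached via its struct page and that a grace
period covers concurrent GUP-fast walkers, as with
CONFIG_MMU_GATHER_RCU_TABLE_FREE):

#include <linux/mm.h>
#include <linux/rcupdate.h>

/* Hypothetical RCU callback: free the detached PTE-table page once a
 * grace period guarantees no lockless (GUP-fast) walker still holds a
 * reference to it. A real version would also undo the pgtable
 * constructor/accounting before freeing. */
static void pte_table_free_rcu(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);

	__free_page(page);
}

/* Hypothetical call site: after the PMD entry has been cleared and the
 * TLB flushed, defer the free instead of IPI-syncing and freeing now. */
static void collapse_free_pte_table(struct page *pte_page)
{
	call_rcu(&pte_page->rcu_head, pte_table_free_rcu);
}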
Valentin Schneider April 6, 2023, 2:11 p.m. UTC | #21
On 06/04/23 15:38, Peter Zijlstra wrote:
> On Wed, Apr 05, 2023 at 01:45:02PM +0100, Valentin Schneider wrote:
>>
>> I've been hacking on something like this (CSD deferral for NOHZ-full),
>> and unfortunately this uses the CPU-local cfd_data storage thing, which
>> means any further smp_call_function() from the same CPU to the same
>> destination will spin on csd_lock_wait(), waiting for the target CPU to
>> come out of userspace and flush the queue - and we've just spent extra
>> effort into *not* disturbing it, so that'll take a while :(
>
> I'm not sure I buy into deferring stuff.. a NOHZ_FULL cpu might 'never'
> come back. Queueing data just in case it does seems wasteful.

Putting those callbacks straight into the bin would make my life much
easier!

Unfortunately, even if they really should, I don't believe all of the
things being crammed onto NOHZ_FULL CPUs have the same definition of
'never' as we do :/
Peter Zijlstra April 6, 2023, 2:39 p.m. UTC | #22
On Thu, Apr 06, 2023 at 03:11:52PM +0100, Valentin Schneider wrote:
> On 06/04/23 15:38, Peter Zijlstra wrote:
> > On Wed, Apr 05, 2023 at 01:45:02PM +0100, Valentin Schneider wrote:
> >>
> >> I've been hacking on something like this (CSD deferral for NOHZ-full),
> >> and unfortunately this uses the CPU-local cfd_data storage thing, which
> >> means any further smp_call_function() from the same CPU to the same
> >> destination will spin on csd_lock_wait(), waiting for the target CPU to
> >> come out of userspace and flush the queue - and we've just spent extra
> >> effort into *not* disturbing it, so that'll take a while :(
> >
> > I'm not sure I buy into deferring stuff.. a NOHZ_FULL cpu might 'never'
> > come back. Queueing data just in case it does seems wasteful.
> 
> Putting those callbacks straight into the bin would make my life much
> easier!

Well, it's either they get inhibited at the source like the parent patch
does, or they go through. I really don't see a sane middle way here.

> Unfortunately, even if they really should, I don't believe all of the
> things being crammed onto NOHZ_FULL CPUs have the same definition of
> 'never' as we do :/

That's not entirely the point, the point is that there are proper
NOHZ_FULL users that won't return to the kernel until the machine shuts
down. Buffering stuff for them is more or less a direct memory leak.
David Hildenbrand April 6, 2023, 2:42 p.m. UTC | #23
On 06.04.23 16:04, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 03:29:28PM +0200, Peter Zijlstra wrote:
>> On Thu, Apr 06, 2023 at 09:38:50AM -0300, Marcelo Tosatti wrote:
>>
>>>> To actually hit this path you're doing something really dodgy.
>>>
>>> Apparently khugepaged is using the same infrastructure:
>>>
>>> $ grep tlb_remove_table khugepaged.c
>>> 	tlb_remove_table_sync_one();
>>> 	tlb_remove_table_sync_one();
>>>
>>> So just enabling khugepaged will hit that path.
>>
>> Urgh, WTF..
>>
>> Let me go read that stuff :/
> 
> At the very least the one on collapse_and_free_pmd() could easily become
> a call_rcu() based free.
> 
> I'm not sure I'm following what collapse_huge_page() does just yet.

It wants to replace a leaf page table by a THP (Transparent Huge Page 
mapped by a PMD). So we want to rip out a leaf page table while other 
code (GUP-fast) might still be walking it. In contrast to freeing the 
page table, we put it into a list where it can be reused when having to 
PTE-map a THP again.

Now, similar to after freeing the page table, someone else could reuse 
that page table and modify it.

If we have GUP-fast walking the page table while that is happening, 
we're in trouble. So we have to make sure GUP-fast is done before 
enqueuing the now-free page table.

That's why the tlb_remove_table_sync_one() was recently added (by Jann 
IIRC).
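
In code terms, the khugepaged pattern being discussed is roughly the
following (simplified sketch, not the exact mm/khugepaged.c code):

	/* Simplified outline of the collapse path around the sync: */
	pmd_t _pmd = pmdp_collapse_flush(vma, address, pmd);	/* detach the PTE table */
	...
	tlb_remove_table_sync_one();	/* IPI: wait out concurrent GUP-fast walkers */
	...
	/* later: stash the old table for a possible future PTE-mapping */
	pgtable_trans_huge_deposit(mm, pmd, pmd_pgtable(_pmd));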
Peter Zijlstra April 6, 2023, 3:02 p.m. UTC | #24
On Thu, Apr 06, 2023 at 04:04:23PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 03:29:28PM +0200, Peter Zijlstra wrote:
> > On Thu, Apr 06, 2023 at 09:38:50AM -0300, Marcelo Tosatti wrote:
> > 
> > > > To actually hit this path you're doing something really dodgy.
> > > 
> > > Apparently khugepaged is using the same infrastructure:
> > > 
> > > $ grep tlb_remove_table khugepaged.c 
> > > 	tlb_remove_table_sync_one();
> > > 	tlb_remove_table_sync_one();
> > > 
> > > So just enabling khugepaged will hit that path.
> > 
> > Urgh, WTF..
> > 
> > Let me go read that stuff :/
> 
> At the very least the one on collapse_and_free_pmd() could easily become
> a call_rcu() based free.
> 
> I'm not sure I'm following what collapse_huge_page() does just yet.

DavidH, what do you think about reviving Jann's patches here:

  https://bugs.chromium.org/p/project-zero/issues/detail?id=2365#c1

Those are far more invasive, but afaict they seem to do the right thing.
Peter Zijlstra April 6, 2023, 3:06 p.m. UTC | #25
On Thu, Apr 06, 2023 at 04:42:02PM +0200, David Hildenbrand wrote:
> On 06.04.23 16:04, Peter Zijlstra wrote:
> > On Thu, Apr 06, 2023 at 03:29:28PM +0200, Peter Zijlstra wrote:
> > > On Thu, Apr 06, 2023 at 09:38:50AM -0300, Marcelo Tosatti wrote:
> > > 
> > > > > To actually hit this path you're doing something really dodgy.
> > > > 
> > > > Apparently khugepaged is using the same infrastructure:
> > > > 
> > > > $ grep tlb_remove_table khugepaged.c
> > > > 	tlb_remove_table_sync_one();
> > > > 	tlb_remove_table_sync_one();
> > > > 
> > > > So just enabling khugepaged will hit that path.
> > > 
> > > Urgh, WTF..
> > > 
> > > Let me go read that stuff :/
> > 
> > At the very least the one on collapse_and_free_pmd() could easily become
> > a call_rcu() based free.
> > 
> > I'm not sure I'm following what collapse_huge_page() does just yet.
> 
> It wants to replace a leaf page table by a THP (Transparent Huge Page mapped
> by a PMD). So we want to rip out a leaf page table while other code
> (GUP-fast) might still be walking it. 

Right, I got that far.

> In contrast to freeing the page table,
> we put it into a list where it can be reuse when having to PTE-map a THP
> again.

Yeah, this is the bit I couldn't find, that code is a bit of a maze.

> Now, similar to after freeing the page table, someone else could reuse that
> page table and modify it.

So ideally we'll RCU free the page instead of sticking it on that list.
David Hildenbrand April 6, 2023, 3:51 p.m. UTC | #26
On 06.04.23 17:02, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 04:04:23PM +0200, Peter Zijlstra wrote:
>> On Thu, Apr 06, 2023 at 03:29:28PM +0200, Peter Zijlstra wrote:
>>> On Thu, Apr 06, 2023 at 09:38:50AM -0300, Marcelo Tosatti wrote:
>>>
>>>>> To actually hit this path you're doing something really dodgy.
>>>>
>>>> Apparently khugepaged is using the same infrastructure:
>>>>
>>>> $ grep tlb_remove_table khugepaged.c
>>>> 	tlb_remove_table_sync_one();
>>>> 	tlb_remove_table_sync_one();
>>>>
>>>> So just enabling khugepaged will hit that path.
>>>
>>> Urgh, WTF..
>>>
>>> Let me go read that stuff :/
>>
>> At the very least the one on collapse_and_free_pmd() could easily become
>> a call_rcu() based free.
>>
>> I'm not sure I'm following what collapse_huge_page() does just yet.
> 
> DavidH, what do you thikn about reviving Jann's patches here:
> 
>    https://bugs.chromium.org/p/project-zero/issues/detail?id=2365#c1
> 
> Those are far more invasive, but afaict they seem to do the right thing.
> 

I recall seeing those while they were discussed on security@kernel.org. What we 
currently have was (IMHO for good reasons) deemed better to fix the 
issue, especially when caring about backports and getting it right.

The alternative that was discussed in that context IIRC was to simply 
allocate a fresh page table, place the fresh page table into the list 
instead, and simply free the old page table (then using common machinery).

TBH, I'd wish (and recently raised) that we could just stop wasting 
memory on page tables for THPs that are maybe never going to get 
PTE-mapped ... and eventually just allocate on demand (with some 
caching?) and handle the places where we're OOM and cannot PTE-map a THP 
in some decent way.

... instead of trying to figure out how to deal with these page tables 
we cannot free but have to special-case simply because of GUP-fast.
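
A rough outline of the "deposit a fresh table" alternative mentioned above
(hypothetical; the helper name and the assumption that an mmu_gather is
available at this point are illustrative, not how khugepaged is currently
structured):

/* Hypothetical: instead of stashing the just-detached PTE table for
 * later reuse, deposit a freshly allocated one and hand the old table
 * to the common mmu_gather machinery, which already copes with
 * concurrent GUP-fast. */
static void collapse_swap_deposited_table(struct mmu_gather *tlb,
					  struct mm_struct *mm, pmd_t *pmd,
					  pgtable_t old, unsigned long addr)
{
	pgtable_t fresh = pte_alloc_one(mm);

	if (!fresh)
		return;	/* would fall back to the existing IPI-based sync */

	pgtable_trans_huge_deposit(mm, pmd, fresh);
	pte_free_tlb(tlb, old, addr);	/* common, GUP-fast-safe freeing */
}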
Peter Zijlstra April 6, 2023, 6:27 p.m. UTC | #27
On Thu, Apr 06, 2023 at 05:51:52PM +0200, David Hildenbrand wrote:
> On 06.04.23 17:02, Peter Zijlstra wrote:

> > DavidH, what do you thikn about reviving Jann's patches here:
> > 
> >    https://bugs.chromium.org/p/project-zero/issues/detail?id=2365#c1
> > 
> > Those are far more invasive, but afaict they seem to do the right thing.
> > 
> 
> I recall seeing those while discussed on security@kernel.org. What we
> currently have was (IMHO for good reasons) deemed better to fix the issue,
> especially when caring about backports and getting it right.

Yes, and I think that was the right call. However, we can now revisit
without having the pressure of a known defect and backport
considerations.

> The alternative that was discussed in that context IIRC was to simply
> allocate a fresh page table, place the fresh page table into the list
> instead, and simply free the old page table (then using common machinery).
> 
> TBH, I'd wish (and recently raised) that we could just stop wasting memory
> on page tables for THPs that are maybe never going to get PTE-mapped ... and
> eventually just allocate on demand (with some caching?) and handle the
> places where we're OOM and cannot PTE-map a THP in some descend way.
> 
> ... instead of trying to figure out how to deal with these page tables we
> cannot free but have to special-case simply because of GUP-fast.

Not keeping them around sounds good to me, but I'm not *that* familiar
with the THP code, most of that happened after I stopped tracking mm. So
I'm not sure how feasible it is.

But it does look entirely feasible to rework this page-table freeing
along the lines Jann did.
Marcelo Tosatti April 19, 2023, 11:01 a.m. UTC | #28
On Thu, Apr 06, 2023 at 03:32:06PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 09:49:22AM -0300, Marcelo Tosatti wrote:
> 
> > > > 2) Depends on the application and the definition of "occasional".
> > > > 
> > > > For certain types of applications (for example PLC software or
> > > > RAN processing), upon occurrence of an event, it is necessary to
> > > > complete a certain task in a maximum amount of time (deadline).
> > > 
> > > If the application is properly NOHZ_FULL and never does a kernel entry,
> > > it will never get that IPI. If it is a pile of shit and does kernel
> > > entries while it pretends to be NOHZ_FULL it gets to keep the pieces and
> > > no amount of crying will get me to care.
> > 
> > I suppose its common practice to use certain system calls in latency
> > sensitive applications, for example nanosleep. Some examples:
> > 
> > 1) cyclictest		(nanosleep)
> 
> cyclictest is not a NOHZ_FULL application, if you tihnk it is, you're
> deluded.

In the field (what end users do in production):

cyclictest runs on NOHZ_FULL cores.
PLC type programs run on NOHZ_FULL cores.

So, according to the physical reality I observe, I am not deluded.

> > 2) PLC programs		(nanosleep)
> 
> What's a PLC? Programmable Logic Circuit?

Programmable logic controller.

> > A system call does not necessarily have to take locks, does it ?
> 
> This all is unrelated to locks

OK.

> > Or even if application does system calls, but runs under a VM,
> > then you are requiring it to never VM-exit.
> 
> That seems to be a goal for performance anyway.

Not sure what you mean.

> > This reduces the flexibility of developing such applications.
> 
> Yeah, that's the cards you're dealt, deal with it.

This is not what happens in the field.
David Hildenbrand April 19, 2023, 11:30 a.m. UTC | #29
On 06.04.23 20:27, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 05:51:52PM +0200, David Hildenbrand wrote:
>> On 06.04.23 17:02, Peter Zijlstra wrote:
> 
>>> DavidH, what do you thikn about reviving Jann's patches here:
>>>
>>>     https://bugs.chromium.org/p/project-zero/issues/detail?id=2365#c1
>>>
>>> Those are far more invasive, but afaict they seem to do the right thing.
>>>
>>
>> I recall seeing those while discussed on security@kernel.org. What we
>> currently have was (IMHO for good reasons) deemed better to fix the issue,
>> especially when caring about backports and getting it right.
> 
> Yes, and I think that was the right call. However, we can now revisit
> without having the pressure of a known defect and backport
> considerations.
> 
>> The alternative that was discussed in that context IIRC was to simply
>> allocate a fresh page table, place the fresh page table into the list
>> instead, and simply free the old page table (then using common machinery).
>>
>> TBH, I'd wish (and recently raised) that we could just stop wasting memory
>> on page tables for THPs that are maybe never going to get PTE-mapped ... and
>> eventually just allocate on demand (with some caching?) and handle the
>> places where we're OOM and cannot PTE-map a THP in some descend way.
>>
>> ... instead of trying to figure out how to deal with these page tables we
>> cannot free but have to special-case simply because of GUP-fast.
> 
> Not keeping them around sounds good to me, but I'm not *that* familiar
> with the THP code, most of that happened after I stopped tracking mm. So
> I'm not sure how feasible is it.
> 
> But it does look entirely feasible to rework this page-table freeing
> along the lines Jann did.

It's most probably more feasible, although the easiest would be to just 
allocate a fresh page table to deposit and free the old one using the 
mmu gatherer.

This way we can avoid the khugepaged use of tlb_remove_table_smp_sync(), but 
not the tlb_remove_table_one() usage. I suspect khugepaged isn't really 
relevant in RT kernels (IIRC, most RT setups disable THP completely).

tlb_remove_table_one() only triggers if __get_free_page(GFP_NOWAIT | 
__GFP_NOWARN); fails. IIUC, that can happen easily under memory pressure 
because it doesn't wait for direct reclaim.

I don't know much about RT workloads (so I'd appreciate some feedback), 
but I guess we can run into memory pressure as well due to some !rt 
housekeeping task on the system?
Marcelo Tosatti April 19, 2023, 11:39 a.m. UTC | #30
On Wed, Apr 19, 2023 at 01:30:57PM +0200, David Hildenbrand wrote:
> On 06.04.23 20:27, Peter Zijlstra wrote:
> > On Thu, Apr 06, 2023 at 05:51:52PM +0200, David Hildenbrand wrote:
> > > On 06.04.23 17:02, Peter Zijlstra wrote:
> > 
> > > > DavidH, what do you thikn about reviving Jann's patches here:
> > > > 
> > > >     https://bugs.chromium.org/p/project-zero/issues/detail?id=2365#c1
> > > > 
> > > > Those are far more invasive, but afaict they seem to do the right thing.
> > > > 
> > > 
> > > I recall seeing those while discussed on security@kernel.org. What we
> > > currently have was (IMHO for good reasons) deemed better to fix the issue,
> > > especially when caring about backports and getting it right.
> > 
> > Yes, and I think that was the right call. However, we can now revisit
> > without having the pressure of a known defect and backport
> > considerations.
> > 
> > > The alternative that was discussed in that context IIRC was to simply
> > > allocate a fresh page table, place the fresh page table into the list
> > > instead, and simply free the old page table (then using common machinery).
> > > 
> > > TBH, I'd wish (and recently raised) that we could just stop wasting memory
> > > on page tables for THPs that are maybe never going to get PTE-mapped ... and
> > > eventually just allocate on demand (with some caching?) and handle the
> > > places where we're OOM and cannot PTE-map a THP in some descend way.
> > > 
> > > ... instead of trying to figure out how to deal with these page tables we
> > > cannot free but have to special-case simply because of GUP-fast.
> > 
> > Not keeping them around sounds good to me, but I'm not *that* familiar
> > with the THP code, most of that happened after I stopped tracking mm. So
> > I'm not sure how feasible is it.
> > 
> > But it does look entirely feasible to rework this page-table freeing
> > along the lines Jann did.
> 
> It's most probably more feasible, although the easiest would be to just
> allocate a fresh page table to deposit and free the old one using the mmu
> gatherer.
> 
> This way we can avoid the khugepaged of tlb_remove_table_smp_sync(), but not
> the tlb_remove_table_one() usage. I suspect khugepaged isn't really relevant
> in RT kernels (IIRC, most of RT setups disable THP completely).

People will disable khugepaged because it causes IPIs (and the fact one
has to disable khugepaged is a configuration overhead, and a source of
headaches when configuring the realtime system, since one can forget to
do that, etc.).

But people do want to run non-RT applications along with RT applications
(in case you have a single box in a privileged location, for example).

> 
> tlb_remove_table_one() only triggers if __get_free_page(GFP_NOWAIT |
> __GFP_NOWARN); fails. IIUC, that can happen easily under memory pressure
> because it doesn't wait for direct reclaim.
> 
> I don't know much about RT workloads (so I'd appreciate some feedback), but
> I guess we can run int memory pressure as well due to some !rt housekeeping
> task on the system?

Yes, exactly (memory for -RT app will be mlocked).

Patch

diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 5ea9be6fb87c..731d955e152d 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -9,6 +9,7 @@ 
 #include <linux/smp.h>
 #include <linux/swap.h>
 #include <linux/rmap.h>
+#include <linux/context_tracking_state.h>
 
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
@@ -191,6 +192,20 @@  static void tlb_remove_table_smp_sync(void *arg)
 	/* Simply deliver the interrupt */
 }
 
+
+#ifdef CONFIG_CONTEXT_TRACKING
+static bool cpu_in_kernel(int cpu, void *info)
+{
+	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
+	int state = atomic_read(&ct->state);
+	/* will return true only for cpus in kernel space */
+	return state & CT_STATE_MASK == CONTEXT_KERNEL;
+}
+#define CONTEXT_PREDICATE cpu_in_kernel
+#else
+#define CONTEXT_PREDICATE NULL
+#endif /* CONFIG_CONTEXT_TRACKING */
+
 #ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
 #define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
 #else
@@ -206,8 +221,8 @@  void tlb_remove_table_sync_one(struct mm_struct *mm)
 	 * It is however sufficient for software page-table walkers that rely on
 	 * IRQ disabling.
 	 */
-	on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
-			NULL, true);
+	on_each_cpu_cond_mask(CONTEXT_PREDICATE, tlb_remove_table_smp_sync,
+			NULL, true, REMOVE_TABLE_IPI_MASK);
 }
 
 static void tlb_remove_table_rcu(struct rcu_head *head)