@@ -590,7 +590,7 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
* by sending an IPI to all the cpus and executing a dummy
* function there.
*/
- kick_all_cpus_sync();
+ poke_nonidle_cpus_sync();
/*
* Now invalidate the hpte entries in the range
* covered by pmd. This makes sure we take a
@@ -670,7 +670,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,
* This ensures that generic code that relies on IRQ disabling
* to prevent a parallel THP split works as expected.
*/
- kick_all_cpus_sync();
+ poke_nonidle_cpus_sync();
}
/*
@@ -855,7 +855,7 @@ pmd_t pmdp_get_and_clear(struct mm_struct *mm,
* different code paths. So make sure we wait for the parallel
* find_linux_pte_or_hugepage to finish.
*/
- kick_all_cpus_sync();
+ poke_nonidle_cpus_sync();
return old_pmd;
}
@@ -101,6 +101,14 @@ int smp_call_function_any(const struct cpumask *mask,
void kick_all_cpus_sync(void);
void wake_up_all_idle_cpus(void);
+#ifdef CONFIG_NO_HZ_COMMON
+void poke_nonidle_cpus_sync(void);
+#else
+static inline void poke_nonidle_cpus_sync(void)
+{
+ kick_all_cpus_sync();
+}
+#endif
/*
* Generic and arch helpers
@@ -150,6 +158,7 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func,
static inline void kick_all_cpus_sync(void) { }
static inline void wake_up_all_idle_cpus(void) { }
+static inline void poke_nonidle_cpus_sync(void) { }
#ifdef CONFIG_UP_LATE_INIT
extern void __init up_late_init(void);
@@ -7513,6 +7513,25 @@ static int sched_ilb_notifier(struct notifier_block *nfb,
return NOTIFY_DONE;
}
}
+
+static void do_nothing(void *unused)
+{
+}
+
+void poke_nonidle_cpus_sync(void)
+{
+ struct cpumask mask;
+
+ /*
+ * Make sure the change is visible before we poke the cpus
+ */
+ smp_mb();
+ preempt_disable();
+ cpumask_andnot(&mask, cpu_online_mask, nohz.idle_cpus_mask);
+ smp_call_function_many(&mask, do_nothing, NULL, 1);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(poke_nonidle_cpus_sync);
#endif
static DEFINE_SPINLOCK(balancing);
What we really need is the ability to wait for other cpus to leave a
local_irq_save()/local_irq_restore() region; we don't need to send an
IPI to idle cpus for that. Add a variant of kick_all_cpus_sync() that
skips them. If nohz.idle_cpus_mask changes during the call, we should
still be ok because:

1) New cpus got added: when they enter the critical section they will
   have seen the new values modified before the smp_mb().
2) Cpus got removed: we merely send them a stray IPI, which is harmless.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
NOTE: This needs closer review, because I am new to the area of cpu
masks.

 arch/powerpc/mm/pgtable_64.c |  6 +++---
 include/linux/smp.h          |  9 +++++++++
 kernel/sched/fair.c          | 19 +++++++++++++++++++
 3 files changed, 31 insertions(+), 3 deletions(-)
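
For reviewers unfamiliar with the trick, here is an illustrative sketch
of the reader/writer pattern this patch relies on. It is not part of the
patch: lockless_walk(), invalidate_and_wait() and walk_page_tables() are
hypothetical names standing in for code like find_linux_pte_or_hugepage()
and the pmdp_* callers above.

#include <linux/irqflags.h>
#include <linux/mm.h>
#include <linux/smp.h>

/*
 * Reader side: a lockless page-table walker. While interrupts are
 * disabled the dummy IPI cannot be delivered, so any cpu that has
 * acknowledged the IPI is guaranteed to be outside this section.
 */
static struct page *lockless_walk(struct mm_struct *mm, unsigned long addr)
{
	struct page *page;
	unsigned long flags;

	local_irq_save(flags);			/* holds off the dummy IPI */
	page = walk_page_tables(mm, addr);	/* hypothetical helper */
	local_irq_restore(flags);		/* IPI can be taken again */

	return page;
}

/*
 * Writer side: tear down the mapping, then wait until every cpu that
 * might be inside lockless_walk() has re-enabled interrupts. Idle
 * cpus cannot be walking page tables, so they need not be poked.
 */
static void invalidate_and_wait(pmd_t *pmdp)
{
	pmd_clear(pmdp);
	poke_nonidle_cpus_sync();
}

A cpu entering idle concurrently is safe to skip because it cannot be in
the walker, and a cpu leaving idle will observe the pmd_clear() ordered
before the smp_mb() in poke_nonidle_cpus_sync().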