[1/5] ARM: tlb: don't perform inner-shareable invalidation for local TLB ops

Message ID 1383748487-13886-2-git-send-email-paolo.pisati@canonical.com
State New

Commit Message

Paolo Pisati Nov. 6, 2013, 2:34 p.m. UTC
From: Will Deacon <will.deacon@arm.com>

BugLink: http://bugs.launchpad.net/bugs/1239800

Inner-shareable TLB invalidation is typically more expensive than local
(non-shareable) invalidation, so performing the broadcasting for
local_flush_tlb_* operations is a waste of cycles and needlessly
clobbers entries in the TLBs of other CPUs.

This patch introduces __flush_tlb_* versions for many of the TLB
invalidation functions, which only respect inner-shareable variants of
the invalidation instructions when presented with the TLB_V7_UIS_FULL
flag. The local version is also inlined to prevent SMP_ON_UP kernels
from missing flushes, where the __flush variant would be called with
the UP flags.

This gains us around 0.5% in hackbench scores for a dual-core A15, but I
would expect this to improve as more cores (and clusters) are added to
the equation.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reported-by: Albin Tonnerre <Albin.Tonnerre@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
(cherry picked from commit f0915781bd5edf78b1154e61efe962dc15872d09 via rmk's
tree http://ftp.arm.linux.org.uk/pub/linux/arm/kernel/git-cur/linux-2.6-arm.git
for-next)
Signed-off-by: Paolo Pisati <paolo.pisati@canonical.com>
---
 arch/arm/include/asm/tlbflush.h | 138 ++++++++++++++++++++++++++++++++++------
 arch/arm/kernel/smp_tlb.c       |   8 +--
 arch/arm/mm/context.c           |   7 +-
 3 files changed, 123 insertions(+), 30 deletions(-)
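
The whole local vs. inner-shareable distinction above comes down to which
CP15 TLB maintenance encoding an mcr instruction targets: on ARMv7 the
"c8, c7" operations act on the local TLB only, while the "c8, c3"
operations (TLBIALLIS and friends) broadcast to every CPU in the
inner-shareable domain. A minimal illustrative sketch follows; the
sketch_* helpers are invented for this note and are not part of the
patch, but the encodings match the tlb_op() arguments in the diff.

/* Sketch only: local full-TLB invalidation (TLBIALL). */
static inline void sketch_local_tlbiall(void)
{
	const int zero = 0;

	/* Invalidate the entire TLB on this CPU only. */
	asm volatile("mcr p15, 0, %0, c8, c7, 0" : : "r" (zero) : "memory");
}

/* Sketch only: broadcast full-TLB invalidation (TLBIALLIS). */
static inline void sketch_tlbiallis(void)
{
	const int zero = 0;

	/* Invalidate the entire TLB on all CPUs in the inner-shareable
	   domain; this is the broadcast that purely local flushes skip. */
	asm volatile("mcr p15, 0, %0, c8, c3, 0" : : "r" (zero) : "memory");
}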

Comments

Andy Whitcroft Nov. 7, 2013, 2:26 p.m. UTC | #1
On Wed, Nov 06, 2013 at 03:34:43PM +0100, Paolo Pisati wrote:
> From: Will Deacon <will.deacon@arm.com>
> 
> [...]

Not a trivial patch to even think about reviewing in detail.  I assume
this would get exposed pretty quickly under load testing.

-apw

Patch

diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index f467e9b..3316264 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -319,6 +319,16 @@ extern struct cpu_tlb_fns cpu_tlb;
 #define tlb_op(f, regs, arg)	__tlb_op(f, "p15, 0, %0, " regs, arg)
 #define tlb_l2_op(f, regs, arg)	__tlb_op(f, "p15, 1, %0, " regs, arg)
 
+static inline void __local_flush_tlb_all(void)
+{
+	const int zero = 0;
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
+	tlb_op(TLB_V4_U_FULL | TLB_V6_U_FULL, "c8, c7, 0", zero);
+	tlb_op(TLB_V4_D_FULL | TLB_V6_D_FULL, "c8, c6, 0", zero);
+	tlb_op(TLB_V4_I_FULL | TLB_V6_I_FULL, "c8, c5, 0", zero);
+}
+
 static inline void local_flush_tlb_all(void)
 {
 	const int zero = 0;
@@ -327,10 +337,8 @@ static inline void local_flush_tlb_all(void)
 	if (tlb_flag(TLB_WB))
 		dsb();
 
-	tlb_op(TLB_V4_U_FULL | TLB_V6_U_FULL, "c8, c7, 0", zero);
-	tlb_op(TLB_V4_D_FULL | TLB_V6_D_FULL, "c8, c6, 0", zero);
-	tlb_op(TLB_V4_I_FULL | TLB_V6_I_FULL, "c8, c5, 0", zero);
-	tlb_op(TLB_V7_UIS_FULL, "c8, c3, 0", zero);
+	__local_flush_tlb_all();
+	tlb_op(TLB_V7_UIS_FULL, "c8, c7, 0", zero);
 
 	if (tlb_flag(TLB_BARRIER)) {
 		dsb();
@@ -338,31 +346,69 @@ static inline void local_flush_tlb_all(void)
 	}
 }
 
-static inline void local_flush_tlb_mm(struct mm_struct *mm)
+static inline void __flush_tlb_all(void)
 {
 	const int zero = 0;
-	const int asid = ASID(mm);
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
 	if (tlb_flag(TLB_WB))
 		dsb();
 
+	__local_flush_tlb_all();
+	tlb_op(TLB_V7_UIS_FULL, "c8, c3, 0", zero);
+
+	if (tlb_flag(TLB_BARRIER)) {
+		dsb();
+		isb();
+	}
+}
+
+static inline void __local_flush_tlb_mm(struct mm_struct *mm)
+{
+	const int zero = 0;
+	const int asid = ASID(mm);
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
 	if (possible_tlb_flags & (TLB_V4_U_FULL|TLB_V4_D_FULL|TLB_V4_I_FULL)) {
-		if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
+		if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) {
 			tlb_op(TLB_V4_U_FULL, "c8, c7, 0", zero);
 			tlb_op(TLB_V4_D_FULL, "c8, c6, 0", zero);
 			tlb_op(TLB_V4_I_FULL, "c8, c5, 0", zero);
 		}
-		put_cpu();
 	}
 
 	tlb_op(TLB_V6_U_ASID, "c8, c7, 2", asid);
 	tlb_op(TLB_V6_D_ASID, "c8, c6, 2", asid);
 	tlb_op(TLB_V6_I_ASID, "c8, c5, 2", asid);
+}
+
+static inline void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	const int asid = ASID(mm);
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
+	if (tlb_flag(TLB_WB))
+		dsb();
+
+	__local_flush_tlb_mm(mm);
+	tlb_op(TLB_V7_UIS_ASID, "c8, c7, 2", asid);
+
+	if (tlb_flag(TLB_BARRIER))
+		dsb();
+}
+
+static inline void __flush_tlb_mm(struct mm_struct *mm)
+{
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
+	if (tlb_flag(TLB_WB))
+		dsb();
+
+	__local_flush_tlb_mm(mm);
 #ifdef CONFIG_ARM_ERRATA_720789
-	tlb_op(TLB_V7_UIS_ASID, "c8, c3, 0", zero);
+	tlb_op(TLB_V7_UIS_ASID, "c8, c3, 0", 0);
 #else
-	tlb_op(TLB_V7_UIS_ASID, "c8, c3, 2", asid);
+	tlb_op(TLB_V7_UIS_ASID, "c8, c3, 2", ASID(mm));
 #endif
 
 	if (tlb_flag(TLB_BARRIER))
@@ -370,16 +416,13 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
 }
 
 static inline void
-local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
+__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
 {
 	const int zero = 0;
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
 	uaddr = (uaddr & PAGE_MASK) | ASID(vma->vm_mm);
 
-	if (tlb_flag(TLB_WB))
-		dsb();
-
 	if (possible_tlb_flags & (TLB_V4_U_PAGE|TLB_V4_D_PAGE|TLB_V4_I_PAGE|TLB_V4_I_FULL) &&
 	    cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
 		tlb_op(TLB_V4_U_PAGE, "c8, c7, 1", uaddr);
@@ -392,6 +435,36 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
 	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", uaddr);
 	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", uaddr);
 	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", uaddr);
+}
+
+static inline void
+local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
+{
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
+	uaddr = (uaddr & PAGE_MASK) | ASID(vma->vm_mm);
+
+	if (tlb_flag(TLB_WB))
+		dsb();
+
+	__local_flush_tlb_page(vma, uaddr);
+	tlb_op(TLB_V7_UIS_PAGE, "c8, c7, 1", uaddr);
+
+	if (tlb_flag(TLB_BARRIER))
+		dsb();
+}
+
+static inline void
+__flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
+{
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
+	uaddr = (uaddr & PAGE_MASK) | ASID(vma->vm_mm);
+
+	if (tlb_flag(TLB_WB))
+		dsb();
+
+	__local_flush_tlb_page(vma, uaddr);
 #ifdef CONFIG_ARM_ERRATA_720789
 	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 3", uaddr & PAGE_MASK);
 #else
@@ -402,16 +475,11 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
 		dsb();
 }
 
-static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
+static inline void __local_flush_tlb_kernel_page(unsigned long kaddr)
 {
 	const int zero = 0;
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
-	kaddr &= PAGE_MASK;
-
-	if (tlb_flag(TLB_WB))
-		dsb();
-
 	tlb_op(TLB_V4_U_PAGE, "c8, c7, 1", kaddr);
 	tlb_op(TLB_V4_D_PAGE, "c8, c6, 1", kaddr);
 	tlb_op(TLB_V4_I_PAGE, "c8, c5, 1", kaddr);
@@ -421,6 +489,36 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
 	tlb_op(TLB_V6_U_PAGE, "c8, c7, 1", kaddr);
 	tlb_op(TLB_V6_D_PAGE, "c8, c6, 1", kaddr);
 	tlb_op(TLB_V6_I_PAGE, "c8, c5, 1", kaddr);
+}
+
+static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
+{
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
+	kaddr &= PAGE_MASK;
+
+	if (tlb_flag(TLB_WB))
+		dsb();
+
+	__local_flush_tlb_kernel_page(kaddr);
+	tlb_op(TLB_V7_UIS_PAGE, "c8, c7, 1", kaddr);
+
+	if (tlb_flag(TLB_BARRIER)) {
+		dsb();
+		isb();
+	}
+}
+
+static inline void __flush_tlb_kernel_page(unsigned long kaddr)
+{
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
+	kaddr &= PAGE_MASK;
+
+	if (tlb_flag(TLB_WB))
+		dsb();
+
+	__local_flush_tlb_kernel_page(kaddr);
 	tlb_op(TLB_V7_UIS_PAGE, "c8, c3, 1", kaddr);
 
 	if (tlb_flag(TLB_BARRIER)) {
diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c
index c2edfff..5883b8a 100644
--- a/arch/arm/kernel/smp_tlb.c
+++ b/arch/arm/kernel/smp_tlb.c
@@ -104,7 +104,7 @@ void flush_tlb_all(void)
 	if (tlb_ops_need_broadcast())
 		on_each_cpu(ipi_flush_tlb_all, NULL, 1);
 	else
-		local_flush_tlb_all();
+		__flush_tlb_all();
 	broadcast_tlb_a15_erratum();
 }
 
@@ -113,7 +113,7 @@ void flush_tlb_mm(struct mm_struct *mm)
 	if (tlb_ops_need_broadcast())
 		on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, mm, 1);
 	else
-		local_flush_tlb_mm(mm);
+		__flush_tlb_mm(mm);
 	broadcast_tlb_mm_a15_erratum(mm);
 }
 
@@ -126,7 +126,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
 		on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page,
 					&ta, 1);
 	} else
-		local_flush_tlb_page(vma, uaddr);
+		__flush_tlb_page(vma, uaddr);
 	broadcast_tlb_mm_a15_erratum(vma->vm_mm);
 }
 
@@ -137,7 +137,7 @@ void flush_tlb_kernel_page(unsigned long kaddr)
 		ta.ta_start = kaddr;
 		on_each_cpu(ipi_flush_tlb_kernel_page, &ta, 1);
 	} else
-		local_flush_tlb_kernel_page(kaddr);
+		__flush_tlb_kernel_page(kaddr);
 	broadcast_tlb_a15_erratum();
 }
 
diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c
index 4a05444..84e6f77 100644
--- a/arch/arm/mm/context.c
+++ b/arch/arm/mm/context.c
@@ -162,10 +162,7 @@ static void flush_context(unsigned int cpu)
 	}
 
 	/* Queue a TLB invalidate and flush the I-cache if necessary. */
-	if (!tlb_ops_need_broadcast())
-		cpumask_set_cpu(cpu, &tlb_flush_pending);
-	else
-		cpumask_setall(&tlb_flush_pending);
+	cpumask_setall(&tlb_flush_pending);
 
 	if (icache_is_vivt_asid_tagged())
 		__flush_icache_all();
@@ -245,8 +242,6 @@ void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk)
 	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) {
 		local_flush_bp_all();
 		local_flush_tlb_all();
-		if (erratum_a15_798181())
-			dummy_flush_tlb_a15_erratum();
 	}
 
 	atomic64_set(&per_cpu(active_asids, cpu), asid);
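
A closing note on the mechanism that makes the new __flush_tlb_*()
variants safe: every tlb_op() above is guarded twice, once at compile
time and once at run time, so operations a given build can never need
vanish entirely. Below is a simplified model of that guard, a sketch
rather than the real macro (the real __tlb_op() in tlbflush.h emits
conditional mcr instructions from inline asm); possible_tlb_flags and
__tlb_flag are the real identifiers, do_mcr is a stand-in.

/* Simplified sketch of the tlb_op() guard logic. */
#define sketch_tlb_op(f, do_mcr, arg)					\
	do {								\
		/* Compile time: ops no configured CPU has are dropped. */ \
		if (possible_tlb_flags & (f))				\
			/* Run time: skip ops this CPU does not use. */	\
			if (__tlb_flag & (f))				\
				do_mcr(arg);				\
	} while (0)

This is also why local_flush_tlb_*() inlines __local_flush_tlb_*() and
adds its own local V7 operation instead of simply calling
__flush_tlb_*(): on an SMP_ON_UP kernel that boots on a uniprocessor,
the run-time flags are the UP set, the inner-shareable ops in the
__flush variants would be skipped, and the requested flush would never
happen.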