[U,L,1/1] s390/mm: add support for RDP (Reset DAT-Protection)

Message ID: 20230228140809.2083989-2-frank.heimes@canonical.com
State: New
Series: Support for new IBM Z Hardware (IBM z16) - Reset DAT-Protection facility support (LP: 1982378)

Commit Message

Frank Heimes Feb. 28, 2023, 2:08 p.m. UTC
From: Gerald Schaefer <gerald.schaefer@linux.ibm.com>

BugLink: https://bugs.launchpad.net/bugs/1982378

The RDP instruction allows resetting the DAT-protection bit in a PTE with
less CPU synchronization overhead than the IPTE instruction. In particular,
IPTE can cause machine-wide synchronization, and excessive IPTE usage can
negatively impact machine performance.

RDP can be used instead of IPTE if the new PTE differs only in SW bits and
the _PAGE_PROTECT HW bit, i.e. for PTE protection changes from RO to RW.
SW PTE bit changes are allowed, e.g. for dirty and young tracking, but no
other HW-defined part of the PTE may change. This is because the
architecture forbids such changes to an active and valid PTE, which
is why invalidation with IPTE is always used first, before writing a new
entry.
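
As a minimal sketch (rdp_transition_ok() is a made-up name; the bit
definitions are the ones this patch adds, see pte_allow_rdp() in the diff
below), the rule reduces to a RO->RW check plus a mask comparison:

	/*
	 * Sketch only; the patch implements this as pte_allow_rdp(), with
	 * _PAGE_RDP_MASK == ~(_PAGE_PROTECT | _PAGE_SW_BITS).
	 */
	static inline int rdp_transition_ok(pte_t old, pte_t new)
	{
		/* Only RO -> RW: old must be protected, new must not be. */
		if (!(pte_val(old) & _PAGE_PROTECT) || (pte_val(new) & _PAGE_PROTECT))
			return 0;
		/* No HW-defined bit other than _PAGE_PROTECT may differ. */
		return (pte_val(old) & _PAGE_RDP_MASK) == (pte_val(new) & _PAGE_RDP_MASK);
	}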

The RDP optimization helps mainly with fault-driven SW dirty-bit tracking.
Writable PTEs are initially always mapped with the HW _PAGE_PROTECT bit set,
to allow SW dirty-bit accounting on the first write-protection fault, where
the DAT-protection is then reset. This reset is now done with RDP instead
of IPTE, if the RDP instruction is available.
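
Condensed, the first-write path looks roughly like this (a sketch that
abbreviates the generic mm fault plumbing; first_write_fault() is a
made-up name):

	static void first_write_fault(struct vm_area_struct *vma,
				      unsigned long addr, pte_t *ptep)
	{
		pte_t entry = *ptep;	/* writable mapping, _PAGE_PROTECT still set */

		/* SW dirty/young accounting on the first write... */
		entry = pte_mkdirty(pte_mkyoung(entry));
		/* ...then reset DAT-protection; with RDP this avoids IPTE. */
		ptep_set_access_flags(vma, addr, ptep, entry, 1);
	}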

RDP cannot always guarantee that the DAT-protection reset is propagated
to all CPUs immediately, which means that spurious TLB protection faults
can now occur on other CPUs. For this, common code provides a
flush_tlb_fix_spurious_fault() handler, which is now used to do a
CPU-local TLB flush. However, this clears the whole TLB of a CPU, not
just the affected entry. More fine-grained flushing, e.g. by simply
doing a (local) RDP again, would require flush_tlb_fix_spurious_fault()
to also provide the PTE pointer.
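
A hypothetical sketch of that fine-grained variant (the ptep parameter
does not exist in the common-code interface today, which is exactly the
limitation described above):

	static inline void flush_spurious_fault_fine_grained(struct vm_area_struct *vma,
							     unsigned long address,
							     pte_t *ptep)
	{
		/* Without RDP, IPTE already synchronized; nothing to do. */
		if (MACHINE_HAS_RDP)
			__ptep_rdp(address, ptep, 0, 0, 1);	/* CPU-local, single entry */
	}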

Note that spurious TLB protection faults cannot really be distinguished
from racing pagetable updates, where another thread has already installed
the correct PTE. In such a case the local TLB flush is unnecessary
overhead, but the overall reduction in CPU synchronization overhead from
avoiding IPTE is still expected to be beneficial.

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
(cherry picked from commit 0807b856521f3313d3912ebb52a9144215c4ff08 linux-next)
Signed-off-by: Frank Heimes <frank.heimes@canonical.com>
---
 arch/s390/include/asm/pgtable.h | 62 ++++++++++++++++++++++++++++++++-
 arch/s390/include/asm/setup.h   |  2 ++
 arch/s390/kernel/early.c        |  2 ++
 arch/s390/mm/pgtable.c          | 25 +++++++++++++
 4 files changed, 90 insertions(+), 1 deletion(-)

Patch

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index b26cbf1c533c..b91e7ac96794 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -181,12 +181,20 @@  static inline int is_module_addr(void *addr)
 #define _PAGE_SOFT_DIRTY 0x000
 #endif
 
+#define _PAGE_SW_BITS	0xffUL		/* All SW bits */
+
 #define _PAGE_SWP_EXCLUSIVE _PAGE_LARGE	/* SW pte exclusive swap bit */
 
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK		(PAGE_MASK | _PAGE_SPECIAL | _PAGE_DIRTY | \
 				 _PAGE_YOUNG | _PAGE_SOFT_DIRTY)
 
+/*
+ * Mask of bits that must not be changed with RDP. Allow only _PAGE_PROTECT
+ * HW bit and all SW bits.
+ */
+#define _PAGE_RDP_MASK		~(_PAGE_PROTECT | _PAGE_SW_BITS)
+
 /*
  * handle_pte_fault uses pte_present and pte_none to find out the pte type
  * WITHOUT holding the page table lock. The _PAGE_PRESENT bit is used to
@@ -1045,6 +1053,19 @@  static inline pte_t pte_mkhuge(pte_t pte)
 #define IPTE_NODAT	0x400
 #define IPTE_GUEST_ASCE	0x800
 
+static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep,
+				       unsigned long opt, unsigned long asce,
+				       int local)
+{
+	unsigned long pto;
+
+	pto = __pa(ptep) & ~(PTRS_PER_PTE * sizeof(pte_t) - 1);
+	asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%[asce],%[m4]"
+		     : "+m" (*ptep)
+		     : [r1] "a" (pto), [r2] "a" ((addr & PAGE_MASK) | opt),
+		       [asce] "a" (asce), [m4] "i" (local));
+}
+
 static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep,
 					unsigned long opt, unsigned long asce,
 					int local)
@@ -1195,6 +1216,42 @@  static inline void ptep_set_wrprotect(struct mm_struct *mm,
 		ptep_xchg_lazy(mm, addr, ptep, pte_wrprotect(pte));
 }
 
+/*
+ * Check if PTEs only differ in _PAGE_PROTECT HW bit, but also allow SW PTE
+ * bits in the comparison. Those might change e.g. because of dirty and young
+ * tracking.
+ */
+static inline int pte_allow_rdp(pte_t old, pte_t new)
+{
+	/*
+	 * Only allow changes from RO to RW
+	 */
+	if (!(pte_val(old) & _PAGE_PROTECT) || pte_val(new) & _PAGE_PROTECT)
+		return 0;
+
+	return (pte_val(old) & _PAGE_RDP_MASK) == (pte_val(new) & _PAGE_RDP_MASK);
+}
+
+static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
+						unsigned long address)
+{
+	/*
+	 * RDP might not have propagated the PTE protection reset to all CPUs,
+	 * so there could be spurious TLB protection faults.
+	 * NOTE: This will also be called when a racing pagetable update on
+	 * another thread already installed the correct PTE. Both cases cannot
+	 * really be distinguished.
+	 * Therefore, only do the local TLB flush when RDP can be used, to avoid
+	 * unnecessary overhead.
+	 */
+	if (MACHINE_HAS_RDP)
+		asm volatile("ptlb" : : : "memory");
+}
+#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
+
+void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+			 pte_t new);
+
 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 static inline int ptep_set_access_flags(struct vm_area_struct *vma,
 					unsigned long addr, pte_t *ptep,
@@ -1202,7 +1259,10 @@  static inline int ptep_set_access_flags(struct vm_area_struct *vma,
 {
 	if (pte_same(*ptep, entry))
 		return 0;
-	ptep_xchg_direct(vma->vm_mm, addr, ptep, entry);
+	if (MACHINE_HAS_RDP && !mm_has_pgste(vma->vm_mm) && pte_allow_rdp(*ptep, entry))
+		ptep_reset_dat_prot(vma->vm_mm, addr, ptep, entry);
+	else
+		ptep_xchg_direct(vma->vm_mm, addr, ptep, entry);
 	return 1;
 }
 
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index 77e6506898f5..27e21a1390bb 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -34,6 +34,7 @@ 
 #define MACHINE_FLAG_GS		BIT(16)
 #define MACHINE_FLAG_SCC	BIT(17)
 #define MACHINE_FLAG_PCI_MIO	BIT(18)
+#define MACHINE_FLAG_RDP	BIT(19)
 
 #define LPP_MAGIC		BIT(31)
 #define LPP_PID_MASK		_AC(0xffffffff, UL)
@@ -95,6 +96,7 @@  extern unsigned long mio_wb_bit_mask;
 #define MACHINE_HAS_GS		(S390_lowcore.machine_flags & MACHINE_FLAG_GS)
 #define MACHINE_HAS_SCC		(S390_lowcore.machine_flags & MACHINE_FLAG_SCC)
 #define MACHINE_HAS_PCI_MIO	(S390_lowcore.machine_flags & MACHINE_FLAG_PCI_MIO)
+#define MACHINE_HAS_RDP		(S390_lowcore.machine_flags & MACHINE_FLAG_RDP)
 
 /*
  * Console mode. Override with conmode=
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 6030fdd6997b..9e058ed24d5b 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -227,6 +227,8 @@  static __init void detect_machine_facilities(void)
 		S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO;
 		/* the control bit is set during PCI initialization */
 	}
+	if (test_facility(194))
+		S390_lowcore.machine_flags |= MACHINE_FLAG_RDP;
 }
 
 static inline void save_vector_registers(void)
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 4909dcd762e8..6effb24de6d9 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -302,6 +302,31 @@  pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL(ptep_xchg_direct);
 
+/*
+ * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that
+ * RDP can be used instead of IPTE. See also comments at pte_allow_rdp().
+ */
+void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+			 pte_t new)
+{
+	preempt_disable();
+	atomic_inc(&mm->context.flush_count);
+	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+		__ptep_rdp(addr, ptep, 0, 0, 1);
+	else
+		__ptep_rdp(addr, ptep, 0, 0, 0);
+	/*
+	 * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That
+	 * means it is still valid and active, and must not be changed according
+	 * to the architecture. But writing a new value that only differs in SW
+	 * bits is allowed.
+	 */
+	set_pte(ptep, new);
+	atomic_dec(&mm->context.flush_count);
+	preempt_enable();
+}
+EXPORT_SYMBOL(ptep_reset_dat_prot);
+
 pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t new)
 {