diff mbox series

[RFC,v0,5/5] pseries: Asynchronous page fault support

Message ID 20210805072439.501481-6-bharata@linux.ibm.com
State New
Headers show
Series [RFC,v0,1/5] powerpc: Define Expropriation interrupt bit to VPA byte offset 0xB9 | expand

Commit Message

Bharata B Rao Aug. 5, 2021, 7:24 a.m. UTC
Add asynchronous page fault support for pseries guests.

1. Setup the guest to handle async-pf
   - Issue H_REG_SNS hcall to register the SNS region.
   - Setup the subvention interrupt irq.
   - Enable async-pf by updating the byte_b9 of VPA for each
     CPU.
2. Check if the page fault is an expropriation notification
   (SRR1_PROGTRAP set in SRR1) and if so put the task on
   wait queue based on the expropriation correlation number
   read from the VPA.
3. Handle subvention interrupt to wake any waiting tasks.
   The wait and wakeup mechanism from x86 async-pf implementation
   is being reused here.

TODO:
- Check how to keep this feature together with other CMO features.
- The async-pf check in the page fault handler path is limited to
  guest with an #ifdef. This isn't sufficient and hence needs to
  be replaced by an appropriate check.

Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
---
 arch/powerpc/include/asm/async-pf.h       |  12 ++
 arch/powerpc/mm/fault.c                   |   7 +-
 arch/powerpc/platforms/pseries/Makefile   |   2 +-
 arch/powerpc/platforms/pseries/async-pf.c | 219 ++++++++++++++++++++++
 4 files changed, 238 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/include/asm/async-pf.h
 create mode 100644 arch/powerpc/platforms/pseries/async-pf.c

Comments

Nicholas Piggin Aug. 13, 2021, 4:06 a.m. UTC | #1
Excerpts from Bharata B Rao's message of August 5, 2021 5:24 pm:
> Add asynchronous page fault support for pseries guests.
> 
> 1. Setup the guest to handle async-pf
>    - Issue H_REG_SNS hcall to register the SNS region.
>    - Setup the subvention interrupt irq.
>    - Enable async-pf by updating the byte_b9 of VPA for each
>      CPU.
> 2. Check if the page fault is an expropriation notification
>    (SRR1_PROGTRAP set in SRR1) and if so put the task on
>    wait queue based on the expropriation correlation number
>    read from the VPA.
> 3. Handle subvention interrupt to wake any waiting tasks.
>    The wait and wakeup mechanism from x86 async-pf implementation
>    is being reused here.

I don't know too much about the background of this.

How much benefit does this give? What situations? Does PowerVM implement 
it? Do other architectures KVM have something similar?

The SRR1 setting for the DSI is in PAPR? In that case it should be okay,
it might be good to add a small comment in exceptions-64s.S.

[...]

> @@ -395,6 +395,11 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
>  	vm_fault_t fault, major = 0;
>  	bool kprobe_fault = kprobe_page_fault(regs, 11);
>  
> +#ifdef CONFIG_PPC_PSERIES
> +	if (handle_async_page_fault(regs, address))
> +		return 0;
> +#endif
> +
>  	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
>  		return 0;

[...]

> +int handle_async_page_fault(struct pt_regs *regs, unsigned long addr)
> +{
> +	struct async_pf_sleep_node n;
> +	DECLARE_SWAITQUEUE(wait);
> +	unsigned long exp_corr_nr;
> +
> +	/* Is this Expropriation notification? */
> +	if (!(mfspr(SPRN_SRR1) & SRR1_PROGTRAP))
> +		return 0;

Yep this should be an inline that is guarded by a static key, and then 
probably have an inline check for SRR1_PROGTRAP. You shouldn't need to
mfspr here, but just use regs->msr.

> +
> +	if (unlikely(!user_mode(regs)))
> +		panic("Host injected async PF in kernel mode\n");

Hmm. Is there anything in the PAPR interface that specifies that the
OS can only deal with problem state access faults here? Or is that
inherent in the expropriation feature?

Thanks,
Nick
Bharata B Rao Aug. 13, 2021, 4:54 a.m. UTC | #2
On Fri, Aug 13, 2021 at 02:06:40PM +1000, Nicholas Piggin wrote:
> Excerpts from Bharata B Rao's message of August 5, 2021 5:24 pm:
> > Add asynchronous page fault support for pseries guests.
> > 
> > 1. Setup the guest to handle async-pf
> >    - Issue H_REG_SNS hcall to register the SNS region.
> >    - Setup the subvention interrupt irq.
> >    - Enable async-pf by updating the byte_b9 of VPA for each
> >      CPU.
> > 2. Check if the page fault is an expropriation notification
> >    (SRR1_PROGTRAP set in SRR1) and if so put the task on
> >    wait queue based on the expropriation correlation number
> >    read from the VPA.
> > 3. Handle subvention interrupt to wake any waiting tasks.
> >    The wait and wakeup mechanism from x86 async-pf implementation
> >    is being reused here.
> 
> I don't know too much about the background of this.
> 
> How much benefit does this give? What situations?

I haven't yet gotten into measuring the benefit of this. Once
the patches are bit more stable than what they are currently,
we need to measure and evaluate the benefits.

> Does PowerVM implement it?

I suppose so, need to check though.

> Do other architectures KVM have something similar?

Yes, x86 and s390 KVM have had this feature for a while now
and generic KVM interfaces exist to support it.

> 
> The SRR1 setting for the DSI is in PAPR? In that case it should be okay,
> it might be good to add a small comment in exceptions-64s.S.

Yes, SRR1 setting is part of PAPR.

> 
> [...]
> 
> > @@ -395,6 +395,11 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
> >  	vm_fault_t fault, major = 0;
> >  	bool kprobe_fault = kprobe_page_fault(regs, 11);
> >  
> > +#ifdef CONFIG_PPC_PSERIES
> > +	if (handle_async_page_fault(regs, address))
> > +		return 0;
> > +#endif
> > +
> >  	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
> >  		return 0;
> 
> [...]
> 
> > +int handle_async_page_fault(struct pt_regs *regs, unsigned long addr)
> > +{
> > +	struct async_pf_sleep_node n;
> > +	DECLARE_SWAITQUEUE(wait);
> > +	unsigned long exp_corr_nr;
> > +
> > +	/* Is this Expropriation notification? */
> > +	if (!(mfspr(SPRN_SRR1) & SRR1_PROGTRAP))
> > +		return 0;
> 
> Yep this should be an inline that is guarded by a static key, and then 
> probably have an inline check for SRR1_PROGTRAP. You shouldn't need to
> mfspr here, but just use regs->msr.

Right.

> 
> > +
> > +	if (unlikely(!user_mode(regs)))
> > +		panic("Host injected async PF in kernel mode\n");
> 
> Hmm. Is there anything in the PAPR interface that specifies that the
> OS can only deal with problem state access faults here? Or is that
> inherent in the expropriation feature?

Didn't see anything specific to that effect in PAPR. However since
this puts the faulting guest process to sleep until the page
becomes ready in the host, I have limited it to guest user space
faults.

Regards,
Bharata.
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/async-pf.h b/arch/powerpc/include/asm/async-pf.h
new file mode 100644
index 000000000000..95d6c3da9f50
--- /dev/null
+++ b/arch/powerpc/include/asm/async-pf.h
@@ -0,0 +1,12 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Async page fault support via PAPR Expropriation/Subvention Notification
+ * option(ESN)
+ *
+ * Copyright 2020 Bharata B Rao, IBM Corp. <bharata@linux.ibm.com>
+ */
+
+#ifndef _ASM_POWERPC_ASYNC_PF_H
+int handle_async_page_fault(struct pt_regs *regs, unsigned long addr);
+#define _ASM_POWERPC_ASYNC_PF_H
+#endif
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index a8d0ce85d39a..bbdc61605885 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -44,7 +44,7 @@ 
 #include <asm/debug.h>
 #include <asm/kup.h>
 #include <asm/inst.h>
-
+#include <asm/async-pf.h>
 
 /*
  * do_page_fault error handling helpers
@@ -395,6 +395,11 @@  static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
 	vm_fault_t fault, major = 0;
 	bool kprobe_fault = kprobe_page_fault(regs, 11);
 
+#ifdef CONFIG_PPC_PSERIES
+	if (handle_async_page_fault(regs, address))
+		return 0;
+#endif
+
 	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
 		return 0;
 
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 4cda0ef87be0..e0ada605ef20 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -6,7 +6,7 @@  obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
 			   of_helpers.o \
 			   setup.o iommu.o event_sources.o ras.o \
 			   firmware.o power.o dlpar.o mobility.o rng.o \
-			   pci.o pci_dlpar.o eeh_pseries.o msi.o
+			   pci.o pci_dlpar.o eeh_pseries.o msi.o async-pf.o
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_SCANLOG)	+= scanlog.o
 obj-$(CONFIG_KEXEC_CORE)	+= kexec.o
diff --git a/arch/powerpc/platforms/pseries/async-pf.c b/arch/powerpc/platforms/pseries/async-pf.c
new file mode 100644
index 000000000000..c2f3bbc0d674
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/async-pf.c
@@ -0,0 +1,219 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Async page fault support via PAPR Expropriation/Subvention Notification
+ * option(ESN)
+ *
+ * Copyright 2020 Bharata B Rao, IBM Corp. <bharata@linux.ibm.com>
+ */
+
+#include <linux/interrupt.h>
+#include <linux/swait.h>
+#include <linux/irqdomain.h>
+#include <asm/machdep.h>
+#include <asm/hvcall.h>
+#include <asm/paca.h>
+
+static char sns_buffer[PAGE_SIZE] __aligned(4096);
+static uint16_t *esn_q = (uint16_t *)sns_buffer + 1;
+static unsigned long next_eq_entry, nr_eq_entries;
+
+#define ASYNC_PF_SLEEP_HASHBITS 8
+#define ASYNC_PF_SLEEP_HASHSIZE (1<<ASYNC_PF_SLEEP_HASHBITS)
+
+/* Controls access to SNS buffer */
+static DEFINE_RAW_SPINLOCK(async_sns_guest_lock);
+
+/* Wait queue handling is from x86 asyn-pf implementation */
+struct async_pf_sleep_node {
+	struct hlist_node link;
+	struct swait_queue_head wq;
+	u64 token;
+	int cpu;
+};
+
+static struct async_pf_sleep_head {
+	raw_spinlock_t lock;
+	struct hlist_head list;
+} async_pf_sleepers[ASYNC_PF_SLEEP_HASHSIZE];
+
+static struct async_pf_sleep_node *_find_apf_task(struct async_pf_sleep_head *b,
+						  u64 token)
+{
+	struct hlist_node *p;
+
+	hlist_for_each(p, &b->list) {
+		struct async_pf_sleep_node *n =
+			hlist_entry(p, typeof(*n), link);
+		if (n->token == token)
+			return n;
+	}
+
+	return NULL;
+}
+static int async_pf_queue_task(u64 token, struct async_pf_sleep_node *n)
+{
+	u64 key = hash_64(token, ASYNC_PF_SLEEP_HASHBITS);
+	struct async_pf_sleep_head *b = &async_pf_sleepers[key];
+	struct async_pf_sleep_node *e;
+
+	raw_spin_lock(&b->lock);
+	e = _find_apf_task(b, token);
+	if (e) {
+		/* dummy entry exist -> wake up was delivered ahead of PF */
+		hlist_del(&e->link);
+		raw_spin_unlock(&b->lock);
+		kfree(e);
+		return false;
+	}
+
+	n->token = token;
+	n->cpu = smp_processor_id();
+	init_swait_queue_head(&n->wq);
+	hlist_add_head(&n->link, &b->list);
+	raw_spin_unlock(&b->lock);
+	return true;
+}
+
+/*
+ * Handle Expropriation notification.
+ */
+int handle_async_page_fault(struct pt_regs *regs, unsigned long addr)
+{
+	struct async_pf_sleep_node n;
+	DECLARE_SWAITQUEUE(wait);
+	unsigned long exp_corr_nr;
+
+	/* Is this Expropriation notification? */
+	if (!(mfspr(SPRN_SRR1) & SRR1_PROGTRAP))
+		return 0;
+
+	if (unlikely(!user_mode(regs)))
+		panic("Host injected async PF in kernel mode\n");
+
+	exp_corr_nr = be16_to_cpu(get_lppaca()->exp_corr_nr);
+	if (!async_pf_queue_task(exp_corr_nr, &n))
+		return 0;
+
+	for (;;) {
+		prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (hlist_unhashed(&n.link))
+			break;
+
+		local_irq_enable();
+		schedule();
+		local_irq_disable();
+	}
+
+	finish_swait(&n.wq, &wait);
+	return 1;
+}
+
+static void apf_task_wake_one(struct async_pf_sleep_node *n)
+{
+	hlist_del_init(&n->link);
+	if (swq_has_sleeper(&n->wq))
+		swake_up_one(&n->wq);
+}
+
+static void async_pf_wake_task(u64 token)
+{
+	u64 key = hash_64(token, ASYNC_PF_SLEEP_HASHBITS);
+	struct async_pf_sleep_head *b = &async_pf_sleepers[key];
+	struct async_pf_sleep_node *n;
+
+again:
+	raw_spin_lock(&b->lock);
+	n = _find_apf_task(b, token);
+	if (!n) {
+		/*
+		 * async PF was not yet handled.
+		 * Add dummy entry for the token.
+		 */
+		n = kzalloc(sizeof(*n), GFP_ATOMIC);
+		if (!n) {
+			/*
+			 * Allocation failed! Busy wait while other cpu
+			 * handles async PF.
+			 */
+			raw_spin_unlock(&b->lock);
+			cpu_relax();
+			goto again;
+		}
+		n->token = token;
+		n->cpu = smp_processor_id();
+		init_swait_queue_head(&n->wq);
+		hlist_add_head(&n->link, &b->list);
+	} else {
+		apf_task_wake_one(n);
+	}
+	raw_spin_unlock(&b->lock);
+}
+
+/*
+ * Handle Subvention notification.
+ */
+static irqreturn_t async_pf_handler(int irq, void *dev_id)
+{
+	uint16_t exp_token, old;
+
+	raw_spin_lock(&async_sns_guest_lock);
+	do {
+		exp_token = *(esn_q + next_eq_entry);
+		if (!exp_token)
+			break;
+
+		old = arch_cmpxchg(esn_q + next_eq_entry, exp_token, 0);
+		BUG_ON(old != exp_token);
+
+		async_pf_wake_task(exp_token);
+		next_eq_entry = (next_eq_entry + 1) % nr_eq_entries;
+	} while (1);
+	raw_spin_unlock(&async_sns_guest_lock);
+	return IRQ_HANDLED;
+}
+
+static int __init pseries_async_pf_init(void)
+{
+	long rc;
+	unsigned long ret[PLPAR_HCALL_BUFSIZE];
+	unsigned int irq, cpu;
+	int i;
+
+	/* Register buffer via H_REG_SNS */
+	rc = plpar_hcall(H_REG_SNS, ret, __pa(sns_buffer), PAGE_SIZE);
+	if (rc != H_SUCCESS)
+		return -1;
+
+	nr_eq_entries = (PAGE_SIZE - 2) / sizeof(uint16_t);
+
+	/* Register irq handler */
+	irq = irq_create_mapping(NULL, ret[1]);
+	if (!irq) {
+		plpar_hcall(H_REG_SNS, ret, -1, PAGE_SIZE);
+		return -1;
+	}
+
+	rc = request_irq(irq, async_pf_handler, 0, "sns-interrupt", NULL);
+	if (rc < 0) {
+		plpar_hcall(H_REG_SNS, ret, -1, PAGE_SIZE);
+		return -1;
+	}
+
+	for (i = 0; i < ASYNC_PF_SLEEP_HASHSIZE; i++)
+		raw_spin_lock_init(&async_pf_sleepers[i].lock);
+
+	/*
+	 * Enable subvention notifications from the hypervisor
+	 * by setting bit 0, byte 0 of SNS buffer
+	 */
+	*sns_buffer |= 0x1;
+
+	/* Enable LPPACA_EXP_INT_ENABLED in VPA */
+	for_each_possible_cpu(cpu)
+		lppaca_of(cpu).byte_b9 |= LPPACA_EXP_INT_ENABLED;
+
+	pr_err("%s: Enabled Async PF\n", __func__);
+	return 0;
+}
+
+machine_arch_initcall(pseries, pseries_async_pf_init);