diff mbox

[2/2] powerpc/pseries: Dynamically increase RMA size

Message ID 20160809171131.GA2329@us.ibm.com (mailing list archive)
State Changes Requested
Headers show

Commit Message

Sukadev Bhattiprolu Aug. 9, 2016, 5:11 p.m. UTC
Paul Clarke [pc@us.ibm.com] wrote:
> Only nits from me...(see below)

Paul,

I agree with your comments and fixed them. Here is the updated patch.

---

From f9e9e8460206bc3fa7eaa741b9a2bde22870b9e0 Mon Sep 17 00:00:00 2001
From: root <sukadev@linux.vnet.ibm.com>
Date: Thu, 4 Aug 2016 23:13:37 -0400
Subject: [PATCH 2/2] powerpc/pseries: Dynamically grow RMA size

When booting a very large system with a large initrd we run out of space
for the flattened device tree (FDT). To fix this we must increase the
space allocated for the RMA region.

The RMA size is hard-coded in the 'ibm_architecture_vec[]' and increasing
the size there will apply to all systems, large and small, so we want to
increase the RMA region only when necessary.

When we run out of room for the FDT, set a new OF property, 'ibm,new-rma-size'
to the new RMA size (512MB) and issue a client-architecture-support (CAS)
call to the firmware. This will initiate a system reboot. Upon reboot we
notice the new property and update the RMA size accordingly.

Fix suggested by Michael Ellerman.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---

[v3]:	- [Paul Clarke] Fix a few nits.

[v2]:	- Add a comment in code regarding 'fixup_nr_cores_done'
	- Fix build break when CONFIG_PPC_PSERIES=n
---
 arch/powerpc/kernel/prom_init.c | 97 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 96 insertions(+), 1 deletion(-)

Comments

Michael Ellerman Feb. 1, 2017, 5:37 a.m. UTC | #1
Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com> writes:
> Paul Clarke [pc@us.ibm.com] wrote:
> ---
>
> From f9e9e8460206bc3fa7eaa741b9a2bde22870b9e0 Mon Sep 17 00:00:00 2001

I know it's been a while but I think it would still be good to get this
in a shape that we can merge it.

Comments inline ...

> From: root <sukadev@linux.vnet.ibm.com>
> Date: Thu, 4 Aug 2016 23:13:37 -0400
> Subject: [PATCH 2/2] powerpc/pseries: Dynamically grow RMA size
>
> When booting a very large system with a large initrd we run out of space
> for the flattened device tree (FDT). To fix this we must increase the
> space allocated for the RMA region.
>
> The RMA size is hard-coded in the 'ibm_architecture_vec[]' and increasing
> the size there will apply to all systems, large and small, so we want to
> increase the RMA region only when necessary.
>
> When we run out of room for the FDT, set a new OF property, 'ibm,new-rma-size'
> to the new RMA size (512MB) and issue a client-architecture-support (CAS)
> call to the firmware. This will initiate a system reboot. Upon reboot we
> notice the new property and update the RMA size accordingly.
>
> Fix suggested by Michael Ellerman.
>
> Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
>
> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
> index f612a99..d1aaeda 100644
> --- a/arch/powerpc/kernel/prom_init.c
> +++ b/arch/powerpc/kernel/prom_init.c
> @@ -87,6 +87,9 @@
>  int of_workarounds;
>  #endif
>  
> +#define IBM_NEW_RMA_SIZE_PROP	"ibm,new-rma-size"
> +#define IBM_NEW_RMA_SIZE_STR	"512"

The property name should really start with "linux,", as it's a Linux
property, not used by firmware at all.

And does it need to contain a value? Just its existence is a flag that
we want to increase the RMA size.

So it could just be called "linux,increase-rma-size".

And we don't need a #define for the name, it's not going to change once
the code is in, and a #define just obscures the actual name.

> @@ -898,6 +910,42 @@ static void fixup_nr_cores(void)
>  		ptcores[1] = (cores >> 16) & 0xff;
>  		ptcores[2] = (cores >> 8) & 0xff;
>  		ptcores[3] = cores & 0xff;
> +		fixup_nr_cores_done = true;

That code has changed upstream, so that won't apply. But that's OK, I
don't think we need to do it anyway.

> +static void __init fixup_rma_size(void)
> +{
> +	int rc;
> +	u64 size;
> +	unsigned char *min_rmap;
> +	phandle optnode;
> +	char str[64];
> +
> +	optnode = call_prom("finddevice", 1, 1, ADDR("/options"));
> +	if (!PHANDLE_VALID(optnode))
> +		prom_panic("Cannot find /options");
> +
> +	/*
> +	 * If a prior boot specified a new RMA size, use that size in
> +	 * ibm_architecture_vec[]. See also increase_rma_size().
> +	 */
> +	size = 0ULL;
> +	memset(str, 0, sizeof(str));
> +	rc = prom_getprop(optnode, IBM_NEW_RMA_SIZE_PROP, &str, sizeof(str));
> +	if (rc <= 0)
> +		return;

So this can just become something like:

	rc = prom_getprop(optnode, "linux,increase-rma-size", NULL, 0)
	if (rc == PROM_ERROR)
        	return;
                
	val = be32_to_cpu(ibm_architecture_vec.vec2.min_rma);
	ibm_architecture_vec.vec2.min_rma = cpu_to_be32(val * 2);

> @@ -946,6 +996,49 @@ static void __init prom_send_capabilities(void)
>  	}
>  #endif /* __BIG_ENDIAN__ */
>  }
> +
> +static void __init increase_rma_size(void)
> +{
> +	int rc, len;
> +	char str[64];
> +	phandle optnode;
> +
> +	optnode = call_prom("finddevice", 1, 1, ADDR("/options"));
> +	if (!PHANDLE_VALID(optnode))
> +		prom_panic("Cannot find /options");
> +
> +	/*
> +	 * If we already increased the RMA size, return.
> +	 */
> +	memset(str, 0, sizeof(str));
> +	rc = prom_getprop(optnode, IBM_NEW_RMA_SIZE_PROP, &str, sizeof(str));
> +
> +	if (!strcmp(str, IBM_NEW_RMA_SIZE_STR)) {
> +		prom_printf("RMA size already at %.3s.\n", str);
> +		return;
> +	}
> +	/*
> +	 * Otherwise, set the ibm,new-rma-size property and initiate a CAS
> +	 * reboot so the RMA size can take effect. See also init_rma_size().
> +	 */
> +	len = strlen(IBM_NEW_RMA_SIZE_STR) + 1;
> +	memcpy(str, IBM_NEW_RMA_SIZE_STR, len);
> +
> +	prom_printf("Setting %s property to %s\n", IBM_NEW_RMA_SIZE_PROP, str);
> +	rc = prom_setprop(optnode, "/options", IBM_NEW_RMA_SIZE_PROP, str, len);

We should check rc there shouldn't we?

Again that code can be simpler if the property is just a flag.

> +	/* Force a reboot. Will work only if ibm,fw-override-cas==false */
> +	prom_send_capabilities();
> +
> +	prom_printf("No CAS initiated reboot? Try setting ibm,fw-override-cas to 'false' in Open Firmware\n");

I'm not sure if we want to be referring to ibm,fw-override-case. I don't
thing it's a documented property (not in PAPR anyway), and it's
certainly IBM PFW specific even if it is.

I know for a fact that on KVM you won't get rebooted here, so I think if
the CAS returns we should just reboot directly.

cheers
Thiago Jung Bauermann Feb. 1, 2017, 5:49 p.m. UTC | #2
Hello,

Am Mittwoch, 1. Februar 2017, 16:37:58 BRST schrieb Michael Ellerman:
> Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com> writes:
> > Paul Clarke [pc@us.ibm.com] wrote:
> > ---
> > 
> > From f9e9e8460206bc3fa7eaa741b9a2bde22870b9e0 Mon Sep 17 00:00:00 2001
> 
> I know it's been a while but I think it would still be good to get this
> in a shape that we can merge it.

Sorry if this has been tried and didn't work or if I'm missing something 
obvious:

Instead of this method of trying a small RMA size and rebooting to try a 
bigger size, could the "min RMA percentage of total RAM" field of the 
ibm_architecture_vec be used?

LoPAPR says that "The Initial size of the RMA is set to the greater of the 
values indicated by bytes 24-27 [min RMA] or 32 [min RMA percentage of total 
RAM] of option vector number 2 “Open Firmware” or minimum RMA size supported 
by the platform and capped by the maximum memory defined for the partition and 
the maximum size of the RMA supported by the platform. The respective selected 
values are reported in the length of the first memory property."

My understanding is that these patches are intended for big guests with many 
processors, but the RMA size isn't changed to 512MB outright because of 
worries that it could affect smaller guests. Since guests with many processors 
tend to have more RAM as well, specifying a min RMA size of 256MB and a min 
RMA percentage of, say, 10% or 20% could make the host automatically allocate 
an adequate RMA size in the first boot.
Sukadev Bhattiprolu Feb. 1, 2017, 6:11 p.m. UTC | #3
Thiago Jung Bauermann [bauerman@linux.vnet.ibm.com] wrote:
> Instead of this method of trying a small RMA size and rebooting to try a 
> bigger size, could the "min RMA percentage of total RAM" field of the 
> ibm_architecture_vec be used?

We tried that and concluded that even 1% could end up reserving a lot of
memory on systems with lot of memory.

Sukadev
diff mbox

Patch

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index f612a99..d1aaeda 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -87,6 +87,9 @@ 
 int of_workarounds;
 #endif
 
+#define IBM_NEW_RMA_SIZE_PROP	"ibm,new-rma-size"
+#define IBM_NEW_RMA_SIZE_STR	"512"
+
 #define OF_WA_CLAIM	1	/* do phys/virt claim separately, then map */
 #define OF_WA_LONGTRAIL	2	/* work around longtrail bugs */
 
@@ -679,6 +682,7 @@  unsigned char ibm_architecture_vec[] = {
 	W(0xffffffff),			/* virt_base */
 	W(0xffffffff),			/* virt_size */
 	W(0xffffffff),			/* load_base */
+#define IBM_ARCH_VEC_MIN_RMA_OFFSET	108
 	W(256),				/* 256MB min RMA */
 	W(0xffffffff),			/* full client load */
 	0,				/* min RMA percentage of total RAM */
@@ -867,6 +871,14 @@  static void fixup_nr_cores(void)
 {
 	u32 cores;
 	unsigned char *ptcores;
+	static bool fixup_nr_cores_done = false;
+
+	/*
+	 * If this is a second CAS call in the same boot sequence, (see
+	 * increase_rma_size()), we don't need to do the fixup again.
+	 */
+	if (fixup_nr_cores_done)
+		return;
 
 	/* We need to tell the FW about the number of cores we support.
 	 *
@@ -898,6 +910,42 @@  static void fixup_nr_cores(void)
 		ptcores[1] = (cores >> 16) & 0xff;
 		ptcores[2] = (cores >> 8) & 0xff;
 		ptcores[3] = cores & 0xff;
+		fixup_nr_cores_done = true;
+	}
+}
+
+static void __init fixup_rma_size(void)
+{
+	int rc;
+	u64 size;
+	unsigned char *min_rmap;
+	phandle optnode;
+	char str[64];
+
+	optnode = call_prom("finddevice", 1, 1, ADDR("/options"));
+	if (!PHANDLE_VALID(optnode))
+		prom_panic("Cannot find /options");
+
+	/*
+	 * If a prior boot specified a new RMA size, use that size in
+	 * ibm_architecture_vec[]. See also increase_rma_size().
+	 */
+	size = 0ULL;
+	memset(str, 0, sizeof(str));
+	rc = prom_getprop(optnode, IBM_NEW_RMA_SIZE_PROP, &str, sizeof(str));
+	if (rc <= 0)
+		return;
+
+	size = prom_strtoul(str, NULL);
+	min_rmap = &ibm_architecture_vec[IBM_ARCH_VEC_MIN_RMA_OFFSET];
+
+	if (size) {
+		prom_printf("Using RMA size %lu from %s.\n", size,
+						IBM_NEW_RMA_SIZE_PROP);
+		min_rmap[0] = (size >> 24) & 0xff;
+		min_rmap[1] = (size >> 16) & 0xff;
+		min_rmap[2] = (size >> 8) & 0xff;
+		min_rmap[3] = size & 0xff;
 	}
 }
 
@@ -911,6 +959,8 @@  static void __init prom_send_capabilities(void)
 
 		fixup_nr_cores();
 
+		fixup_rma_size();
+
 		/* try calling the ibm,client-architecture-support method */
 		prom_printf("Calling ibm,client-architecture-support...");
 		if (call_prom_ret("call-method", 3, 2, &ret,
@@ -946,6 +996,49 @@  static void __init prom_send_capabilities(void)
 	}
 #endif /* __BIG_ENDIAN__ */
 }
+
+static void __init increase_rma_size(void)
+{
+	int rc, len;
+	char str[64];
+	phandle optnode;
+
+	optnode = call_prom("finddevice", 1, 1, ADDR("/options"));
+	if (!PHANDLE_VALID(optnode))
+		prom_panic("Cannot find /options");
+
+	/*
+	 * If we already increased the RMA size, return.
+	 */
+	memset(str, 0, sizeof(str));
+	rc = prom_getprop(optnode, IBM_NEW_RMA_SIZE_PROP, &str, sizeof(str));
+
+	if (!strcmp(str, IBM_NEW_RMA_SIZE_STR)) {
+		prom_printf("RMA size already at %.3s.\n", str);
+		return;
+	}
+	/*
+	 * Otherwise, set the ibm,new-rma-size property and initiate a CAS
+	 * reboot so the RMA size can take effect. See also init_rma_size().
+	 */
+	len = strlen(IBM_NEW_RMA_SIZE_STR) + 1;
+	memcpy(str, IBM_NEW_RMA_SIZE_STR, len);
+
+	prom_printf("Setting %s property to %s\n", IBM_NEW_RMA_SIZE_PROP, str);
+	rc = prom_setprop(optnode, "/options", IBM_NEW_RMA_SIZE_PROP, str, len);
+
+	/* Force a reboot. Will work only if ibm,fw-override-cas==false */
+	prom_send_capabilities();
+
+	prom_printf("No CAS initiated reboot? Try setting ibm,fw-override-cas to 'false' in Open Firmware\n");
+}
+
+#else
+
+static void __init increase_rma_size(void)
+{
+}
+
 #endif /* #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
 
 /*
@@ -2027,9 +2120,11 @@  static void __init *make_room(unsigned long *mem_start, unsigned long *mem_end,
 		room = alloc_top - alloc_bottom;
 		if (room > DEVTREE_CHUNK_SIZE)
 			room = DEVTREE_CHUNK_SIZE;
-		if (room < PAGE_SIZE)
+		if (room < PAGE_SIZE) {
+			increase_rma_size();
 			prom_panic("No memory for flatten_device_tree "
 				   "(no room)\n");
+		}
 		chunk = alloc_up(room, 0);
 		if (chunk == 0)
 			prom_panic("No memory for flatten_device_tree "