[RFC,v2,2/6] fast-reboot: parallel memory clearing

Message ID 20180628025501.20676-3-stewart@linux.ibm.com
State RFC
Headers show
Series
  • Faster fast reboot (2x!)
Related show

Commit Message

Stewart Smith June 28, 2018, 2:54 a.m.
Arbitrarily pick 16GB as the unit of parallelism, and
split up clearing memory into jobs and schedule them
node-local to the memory (or on node 0 if we can't
work that out because it's the memory up to SKIBOOT_BASE)

This seems to cut at least ~40% time from memory zeroing on
fast-reboot on a 256GB Boston system.

Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
---
 core/device.c                                 |  2 +-
 core/mem_region.c                             | 94 ++++++++++++++++++-
 core/test/dummy-cpu.h                         | 40 ++++++++
 core/test/run-malloc-speed.c                  |  7 +-
 core/test/run-malloc.c                        |  7 +-
 core/test/run-mem_range_is_reserved.c         |  8 +-
 core/test/run-mem_region.c                    | 10 +-
 core/test/run-mem_region_init.c               |  8 +-
 core/test/run-mem_region_next.c               |  8 +-
 core/test/run-mem_region_release_unused.c     |  8 +-
 .../run-mem_region_release_unused_noalloc.c   |  8 +-
 core/test/run-mem_region_reservations.c       |  8 +-
 core/test/stubs.c                             | 44 +++++++++
 hdata/test/hdata_to_dt.c                      |  9 ++
 hdata/test/stubs.c                            | 47 ++++++++++
 include/device.h                              |  5 +-
 16 files changed, 255 insertions(+), 58 deletions(-)
 create mode 100644 core/test/dummy-cpu.h

Comments

Nicholas Piggin July 3, 2018, 6:30 a.m. | #1
On Thu, 28 Jun 2018 12:54:57 +1000
Stewart Smith <stewart@linux.ibm.com> wrote:

> Arbitrarily pick 16GB as the unit of parallelism, and
> split up clearing memory into jobs and schedule them
> node-local to the memory (or on node 0 if we can't
> work that out because it's the memory up to SKIBOOT_BASE)
> 
> This seems to cut at least ~40% time from memory zeroing on
> fast-reboot on a 256GB Boston system.
> 
> Signed-off-by: Stewart Smith <stewart@linux.ibm.com>

[...]

> +		while(l > MEM_REGION_CLEAR_JOB_SIZE) {
> +			job_args[i].s = s+l - MEM_REGION_CLEAR_JOB_SIZE;
> +			job_args[i].e = s+l;
> +			l-=MEM_REGION_CLEAR_JOB_SIZE;
> +			job_args[i].job_name = malloc(sizeof(char)*100);
> +			total+=MEM_REGION_CLEAR_JOB_SIZE;
> +			chip_id = __dt_get_chip_id(r->node);
> +			if (chip_id == -1)
> +				chip_id = 0;
> +			path = dt_get_path(r->node);
> +			snprintf(job_args[i].job_name, 100,
> +				 "clear %s, %s 0x%"PRIx64" len: %"PRIx64" on %d",
> +				 r->name, path,
> +				 job_args[i].s,
> +				 (job_args[i].e - job_args[i].s),
> +				 chip_id);
> +			free(path);
> +			printf("job: %s\n", job_args[i].job_name);
> +			jobs[i] = cpu_queue_job_on_node(chip_id,
> +							job_args[i].job_name,
> +							mem_region_clear_job,
> +							&job_args[i]);

Ahh, the API will return NULL if it can't be scheduled on the requested
node it will return. So you'll want to re-run cpu_queue_job() in that
case to ensure it runs.

Thanks,
Nick
Stewart Smith July 17, 2018, 4:08 a.m. | #2
Nicholas Piggin <npiggin@gmail.com> writes:
> On Thu, 28 Jun 2018 12:54:57 +1000
> Stewart Smith <stewart@linux.ibm.com> wrote:
>
>> Arbitrarily pick 16GB as the unit of parallelism, and
>> split up clearing memory into jobs and schedule them
>> node-local to the memory (or on node 0 if we can't
>> work that out because it's the memory up to SKIBOOT_BASE)
>> 
>> This seems to cut at least ~40% time from memory zeroing on
>> fast-reboot on a 256GB Boston system.
>> 
>> Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
>
> [...]
>
>> +		while(l > MEM_REGION_CLEAR_JOB_SIZE) {
>> +			job_args[i].s = s+l - MEM_REGION_CLEAR_JOB_SIZE;
>> +			job_args[i].e = s+l;
>> +			l-=MEM_REGION_CLEAR_JOB_SIZE;
>> +			job_args[i].job_name = malloc(sizeof(char)*100);
>> +			total+=MEM_REGION_CLEAR_JOB_SIZE;
>> +			chip_id = __dt_get_chip_id(r->node);
>> +			if (chip_id == -1)
>> +				chip_id = 0;
>> +			path = dt_get_path(r->node);
>> +			snprintf(job_args[i].job_name, 100,
>> +				 "clear %s, %s 0x%"PRIx64" len: %"PRIx64" on %d",
>> +				 r->name, path,
>> +				 job_args[i].s,
>> +				 (job_args[i].e - job_args[i].s),
>> +				 chip_id);
>> +			free(path);
>> +			printf("job: %s\n", job_args[i].job_name);
>> +			jobs[i] = cpu_queue_job_on_node(chip_id,
>> +							job_args[i].job_name,
>> +							mem_region_clear_job,
>> +							&job_args[i]);
>
> Ahh, the API will return NULL if it can't be scheduled on the requested
> node it will return. So you'll want to re-run cpu_queue_job() in that
> case to ensure it runs.

Ahh yep, good point. Fixed in next revision.

Patch

diff --git a/core/device.c b/core/device.c
index 38dbdfc09f30..19a8d8d60cd2 100644
--- a/core/device.c
+++ b/core/device.c
@@ -955,7 +955,7 @@  u64 dt_get_address(const struct dt_node *node, unsigned int index,
 	return dt_get_number(p->prop + pos, na);
 }
 
-static u32 __dt_get_chip_id(const struct dt_node *node)
+u32 __dt_get_chip_id(const struct dt_node *node)
 {
 	const struct dt_property *prop;
 
diff --git a/core/mem_region.c b/core/mem_region.c
index 8ae49bb790fd..a8f18b01a17b 100644
--- a/core/mem_region.c
+++ b/core/mem_region.c
@@ -1206,19 +1206,52 @@  static void mem_clear_range(uint64_t s, uint64_t e)
 		return;
 	}
 
-	prlog(PR_NOTICE, "Clearing region %llx-%llx\n",
+	prlog(PR_DEBUG, "Clearing region %llx-%llx\n",
 	      (long long)s, (long long)e);
 	memset((void *)s, 0, e - s);
 }
 
+struct mem_region_clear_job_args {
+	char *job_name;
+	uint64_t s,e;
+};
+
+static void mem_region_clear_job(void *data)
+{
+	struct mem_region_clear_job_args *arg = (struct mem_region_clear_job_args*)data;
+	mem_clear_range(arg->s, arg->e);
+}
+
+#define MEM_REGION_CLEAR_JOB_SIZE (16ULL*(1<<30))
+
 void mem_region_clear_unused(void)
 {
+	int njobs = 0;
+	struct cpu_job **jobs;
 	struct mem_region *r;
+	struct mem_region_clear_job_args *job_args;
+	uint64_t s,l;
+	uint64_t total = 0;
+	uint32_t chip_id;
+	char *path;
+	int i;
 
 	lock(&mem_region_lock);
 	assert(mem_regions_finalised);
 
+	list_for_each(&regions, r, list) {
+		if (!(r->type == REGION_OS))
+			continue;
+		njobs++;
+		/* One job per 16GB */
+		njobs += r->len / MEM_REGION_CLEAR_JOB_SIZE;
+	}
+
+	jobs = malloc(njobs * sizeof(struct cpu_job*));
+	job_args = malloc(njobs * sizeof(struct mem_region_clear_job_args));
+
 	prlog(PR_NOTICE, "Clearing unused memory:\n");
+	i = 0;
 	list_for_each(&regions, r, list) {
 		/* If it's not unused, ignore it. */
 		if (!(r->type == REGION_OS))
@@ -1226,9 +1259,66 @@  void mem_region_clear_unused(void)
 
 		assert(r != &skiboot_heap);
 
-		mem_clear_range(r->start, r->start + r->len);
+		s = r->start;
+		l = r->len;
+		while(l > MEM_REGION_CLEAR_JOB_SIZE) {
+			job_args[i].s = s+l - MEM_REGION_CLEAR_JOB_SIZE;
+			job_args[i].e = s+l;
+			l-=MEM_REGION_CLEAR_JOB_SIZE;
+			job_args[i].job_name = malloc(sizeof(char)*100);
+			total+=MEM_REGION_CLEAR_JOB_SIZE;
+			chip_id = __dt_get_chip_id(r->node);
+			if (chip_id == -1)
+				chip_id = 0;
+			path = dt_get_path(r->node);
+			snprintf(job_args[i].job_name, 100,
+				 "clear %s, %s 0x%"PRIx64" len: %"PRIx64" on %d",
+				 r->name, path,
+				 job_args[i].s,
+				 (job_args[i].e - job_args[i].s),
+				 chip_id);
+			free(path);
+			printf("job: %s\n", job_args[i].job_name);
+			jobs[i] = cpu_queue_job_on_node(chip_id,
+							job_args[i].job_name,
+							mem_region_clear_job,
+							&job_args[i]);
+			i++;
+		}
+		job_args[i].s = s;
+		job_args[i].e = s+l;
+		job_args[i].job_name = malloc(sizeof(char)*100);
+		total+=l;
+		chip_id = __dt_get_chip_id(r->node);
+		if (chip_id == -1)
+			chip_id = 0;
+		path = dt_get_path(r->node);
+		snprintf(job_args[i].job_name,100,
+			 "clear %s, %s 0x%"PRIx64" len: 0x%"PRIx64" on %d",
+			 r->name, path,
+			 job_args[i].s,
+			 (job_args[i].e - job_args[i].s),
+			 chip_id);
+		free(path);
+		printf("job: %s\n", job_args[i].job_name);
+		jobs[i] = cpu_queue_job_on_node(chip_id,
+						job_args[i].job_name,
+					mem_region_clear_job,
+					&job_args[i]);
+		i++;
+	}
+	cpu_process_local_jobs();
+	l = 0;
+	for(i=0; i < njobs; i++) {
+		cpu_wait_job(jobs[i], true);
+		l += (job_args[i].e - job_args[i].s);
+		printf("Clearing memory... %"PRIu64"/%"PRIu64"GB done\n",
+		       l>>30, total>>30);
+		free(job_args[i].job_name);
 	}
 	unlock(&mem_region_lock);
+	free(jobs);
+	free(job_args);
 }
 
 static void mem_region_add_dt_reserved_node(struct dt_node *parent,
diff --git a/core/test/dummy-cpu.h b/core/test/dummy-cpu.h
new file mode 100644
index 000000000000..46f180cde1e8
--- /dev/null
+++ b/core/test/dummy-cpu.h
@@ -0,0 +1,40 @@ 
+/* Copyright 2013-2018 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* A dummy cpu.h for tests.
+ * We don't want to include the real skiboot cpu.h, it's PPC-specific
+ */
+
+#ifndef __CPU_H
+#define __CPU_H
+
+#include <stdint.h>
+#include <stdbool.h>
+
+static unsigned int cpu_max_pir = 1;
+struct cpu_thread {
+	unsigned int			chip_id;
+};
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+				const char *name,
+				void (*func)(void *data), void *data,
+				bool no_return);
+void cpu_wait_job(struct cpu_job *job, bool free_it);
+void cpu_process_local_jobs(void);
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+				       const char *name,
+				       void (*func)(void *data), void *data);
+#endif /* __CPU_H */
diff --git a/core/test/run-malloc-speed.c b/core/test/run-malloc-speed.c
index d842bd6432c4..8ecef3a09f24 100644
--- a/core/test/run-malloc-speed.c
+++ b/core/test/run-malloc-speed.c
@@ -17,12 +17,7 @@ 
 #include <config.h>
 
 #define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
-	unsigned int			chip_id;
-};
+#include "dummy-cpu.h"
 
 #include <stdlib.h>
 
diff --git a/core/test/run-malloc.c b/core/test/run-malloc.c
index 2feaacb95974..0204e77d6326 100644
--- a/core/test/run-malloc.c
+++ b/core/test/run-malloc.c
@@ -18,12 +18,7 @@ 
 
 #define BITS_PER_LONG (sizeof(long) * 8)
 
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
-	unsigned int			chip_id;
-};
+#include "dummy-cpu.h"
 
 #include <stdlib.h>
 
diff --git a/core/test/run-mem_range_is_reserved.c b/core/test/run-mem_range_is_reserved.c
index 37f7db3f744c..f44f1c2db6fd 100644
--- a/core/test/run-mem_range_is_reserved.c
+++ b/core/test/run-mem_range_is_reserved.c
@@ -17,12 +17,8 @@ 
 #include <config.h>
 
 #define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
-	unsigned int			chip_id;
-};
+
+#include "dummy-cpu.h"
 
 #include <stdlib.h>
 
diff --git a/core/test/run-mem_region.c b/core/test/run-mem_region.c
index f2506d65f19a..1fd20937e12f 100644
--- a/core/test/run-mem_region.c
+++ b/core/test/run-mem_region.c
@@ -15,14 +15,12 @@ 
  */
 
 #include <config.h>
+#include <stdbool.h>
+#include <stdint.h>
 
 #define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
-	unsigned int			chip_id;
-};
+
+#include "dummy-cpu.h"
 
 #include <stdlib.h>
 #include <string.h>
diff --git a/core/test/run-mem_region_init.c b/core/test/run-mem_region_init.c
index f1028da32703..f70d70f85280 100644
--- a/core/test/run-mem_region_init.c
+++ b/core/test/run-mem_region_init.c
@@ -17,12 +17,8 @@ 
 #include <config.h>
 
 #define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
-	unsigned int			chip_id;
-};
+
+#include "dummy-cpu.h"
 
 #include <stdlib.h>
 
diff --git a/core/test/run-mem_region_next.c b/core/test/run-mem_region_next.c
index 72d02a981a6e..fec5df8f7c13 100644
--- a/core/test/run-mem_region_next.c
+++ b/core/test/run-mem_region_next.c
@@ -17,12 +17,8 @@ 
 #include <config.h>
 
 #define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
-	unsigned int			chip_id;
-};
+
+#include "dummy-cpu.h"
 
 #include <stdlib.h>
 #include <string.h>
diff --git a/core/test/run-mem_region_release_unused.c b/core/test/run-mem_region_release_unused.c
index fdd273a7f55b..4fe62ca5dc39 100644
--- a/core/test/run-mem_region_release_unused.c
+++ b/core/test/run-mem_region_release_unused.c
@@ -17,12 +17,8 @@ 
 #include <config.h>
 
 #define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
-	unsigned int			chip_id;
-};
+
+#include "dummy-cpu.h"
 
 #include <stdlib.h>
 
diff --git a/core/test/run-mem_region_release_unused_noalloc.c b/core/test/run-mem_region_release_unused_noalloc.c
index 6ae79591f08e..fe571350d957 100644
--- a/core/test/run-mem_region_release_unused_noalloc.c
+++ b/core/test/run-mem_region_release_unused_noalloc.c
@@ -17,12 +17,8 @@ 
 #include <config.h>
 
 #define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
-	unsigned int			chip_id;
-};
+
+#include "dummy-cpu.h"
 
 #include <stdlib.h>
 
diff --git a/core/test/run-mem_region_reservations.c b/core/test/run-mem_region_reservations.c
index ae885829645f..b0e484740ff0 100644
--- a/core/test/run-mem_region_reservations.c
+++ b/core/test/run-mem_region_reservations.c
@@ -17,12 +17,8 @@ 
 #include <config.h>
 
 #define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
-	unsigned int			chip_id;
-};
+
+#include "dummy-cpu.h"
 
 #include <stdlib.h>
 
diff --git a/core/test/stubs.c b/core/test/stubs.c
index 39ff18d8f103..939e3dc7950b 100644
--- a/core/test/stubs.c
+++ b/core/test/stubs.c
@@ -16,6 +16,7 @@ 
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdarg.h>
+#include <stdint.h>
 
 #include "../../ccan/list/list.c"
 
@@ -41,6 +42,49 @@  static void stub_function(void)
 	abort();
 }
 
+struct cpu_thread;
+
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+				const char *name,
+				void (*func)(void *data), void *data,
+				bool no_return);
+void cpu_wait_job(struct cpu_job *job, bool free_it);
+void cpu_process_local_jobs(void);
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+				       const char *name,
+				       void (*func)(void *data), void *data);
+
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+				       const char *name,
+				       void (*func)(void *data), void *data)
+{
+	(void)chip_id;
+	return __cpu_queue_job(NULL, name, func, data, false);
+}
+
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+				const char *name,
+				void (*func)(void *data), void *data,
+				bool no_return)
+{
+	(void)cpu;
+	(void)name;
+	(func)(data);
+	(void)no_return;
+	return NULL;
+}
+
+void cpu_wait_job(struct cpu_job *job, bool free_it)
+{
+	(void)job;
+	(void)free_it;
+	return;
+}
+
+void cpu_process_local_jobs(void)
+{
+}
+
 #define STUB(fnname) \
 	void fnname(void) __attribute__((weak, alias ("stub_function")))
 
diff --git a/hdata/test/hdata_to_dt.c b/hdata/test/hdata_to_dt.c
index 8c61b4f62a48..bd11fb0b40e7 100644
--- a/hdata/test/hdata_to_dt.c
+++ b/hdata/test/hdata_to_dt.c
@@ -83,6 +83,15 @@  struct cpu_thread {
 	uint32_t			pir;
 	uint32_t			chip_id;
 };
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+				const char *name,
+				void (*func)(void *data), void *data,
+				bool no_return);
+void cpu_wait_job(struct cpu_job *job, bool free_it);
+void cpu_process_local_jobs(void);
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+				       const char *name,
+				       void (*func)(void *data), void *data);
 
 struct cpu_thread __boot_cpu, *boot_cpu = &__boot_cpu;
 static unsigned long fake_pvr = PVR_P7;
diff --git a/hdata/test/stubs.c b/hdata/test/stubs.c
index 592257051bd2..f7b1da105e0e 100644
--- a/hdata/test/stubs.c
+++ b/hdata/test/stubs.c
@@ -18,6 +18,7 @@ 
 #include <stdarg.h>
 #include <string.h>
 #include <malloc.h>
+#include <stdint.h>
 
 #include <compiler.h>
 
@@ -84,6 +85,52 @@  void *__zalloc(size_t bytes, const char *location)
 	return p;
 }
 
+struct cpu_thread;
+
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+				const char *name,
+				void (*func)(void *data), void *data,
+				bool no_return);
+
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+				      const char *name,
+				      void (*func)(void *data), void *data);
+
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+				       const char *name,
+				       void (*func)(void *data), void *data)
+{
+	(void)chip_id;
+	return __cpu_queue_job(NULL, name, func, data, false);
+}
+
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+				const char *name,
+				void (*func)(void *data), void *data,
+				bool no_return)
+{
+	(void)cpu;
+	(void)name;
+	(func)(data);
+	(void)no_return;
+	return NULL;
+}
+
+void cpu_wait_job(struct cpu_job *job, bool free_it);
+
+void cpu_wait_job(struct cpu_job *job, bool free_it)
+{
+	(void)job;
+	(void)free_it;
+	return;
+}
+
+void cpu_process_local_jobs(void);
+
+void cpu_process_local_jobs(void)
+{
+}
+
 /* Add any stub functions required for linking here. */
 static void stub_function(void)
 {
diff --git a/include/device.h b/include/device.h
index fd66df11a1f2..a447dcd4b727 100644
--- a/include/device.h
+++ b/include/device.h
@@ -237,9 +237,12 @@  u32 dt_n_size_cells(const struct dt_node *node);
 u64 dt_get_number(const void *pdata, unsigned int cells);
 
 /* Find an ibm,chip-id property in this node; if not found, walk up the parent
- * nodes. Returns -1 if no chip-id property exists. */
+ * nodes. */
 u32 dt_get_chip_id(const struct dt_node *node);
 
+/* Same as dt_get_chip_id except Returns -1 if no chip-id property exists. */
+u32 __dt_get_chip_id(const struct dt_node *node);
+
 /* Address accessors ("reg" properties parsing). No translation,
  * only support "simple" address forms (1 or 2 cells). Asserts
  * if address doesn't exist