@@ -955,7 +955,7 @@ u64 dt_get_address(const struct dt_node *node, unsigned int index,
return dt_get_number(p->prop + pos, na);
}
-static u32 __dt_get_chip_id(const struct dt_node *node)
+u32 __dt_get_chip_id(const struct dt_node *node)
{
const struct dt_property *prop;
@@ -1206,19 +1206,52 @@ static void mem_clear_range(uint64_t s, uint64_t e)
return;
}
- prlog(PR_NOTICE, "Clearing region %llx-%llx\n",
+ prlog(PR_DEBUG, "Clearing region %llx-%llx\n",
(long long)s, (long long)e);
memset((void *)s, 0, e - s);
}
+struct mem_region_clear_job_args {
+ char *job_name;
+ uint64_t s,e;
+};
+
+static void mem_region_clear_job(void *data)
+{
+ struct mem_region_clear_job_args *arg = (struct mem_region_clear_job_args*)data;
+ mem_clear_range(arg->s, arg->e);
+}
+
+#define MEM_REGION_CLEAR_JOB_SIZE (16ULL*(1<<30))
+
void mem_region_clear_unused(void)
{
+ int njobs = 0;
+ struct cpu_job **jobs;
struct mem_region *r;
+ struct mem_region_clear_job_args *job_args;
+ uint64_t s,l;
+ uint64_t total = 0;
+ uint32_t chip_id;
+ char *path;
+ int i;
lock(&mem_region_lock);
assert(mem_regions_finalised);
+ list_for_each(®ions, r, list) {
+ if (!(r->type == REGION_OS))
+ continue;
+ njobs++;
+ /* One job per 16GB */
+ njobs += r->len / MEM_REGION_CLEAR_JOB_SIZE;
+ }
+
+ jobs = malloc(njobs * sizeof(struct cpu_job*));
+ job_args = malloc(njobs * sizeof(struct mem_region_clear_job_args));
+
prlog(PR_NOTICE, "Clearing unused memory:\n");
+ i = 0;
list_for_each(®ions, r, list) {
/* If it's not unused, ignore it. */
if (!(r->type == REGION_OS))
@@ -1226,9 +1259,66 @@ void mem_region_clear_unused(void)
assert(r != &skiboot_heap);
- mem_clear_range(r->start, r->start + r->len);
+ s = r->start;
+ l = r->len;
+ while(l > MEM_REGION_CLEAR_JOB_SIZE) {
+ job_args[i].s = s+l - MEM_REGION_CLEAR_JOB_SIZE;
+ job_args[i].e = s+l;
+ l-=MEM_REGION_CLEAR_JOB_SIZE;
+ job_args[i].job_name = malloc(sizeof(char)*100);
+ total+=MEM_REGION_CLEAR_JOB_SIZE;
+ chip_id = __dt_get_chip_id(r->node);
+ if (chip_id == -1)
+ chip_id = 0;
+ path = dt_get_path(r->node);
+ snprintf(job_args[i].job_name, 100,
+ "clear %s, %s 0x%"PRIx64" len: %"PRIx64" on %d",
+ r->name, path,
+ job_args[i].s,
+ (job_args[i].e - job_args[i].s),
+ chip_id);
+ free(path);
+ printf("job: %s\n", job_args[i].job_name);
+ jobs[i] = cpu_queue_job_on_node(chip_id,
+ job_args[i].job_name,
+ mem_region_clear_job,
+ &job_args[i]);
+ i++;
+ }
+ job_args[i].s = s;
+ job_args[i].e = s+l;
+ job_args[i].job_name = malloc(sizeof(char)*100);
+ total+=l;
+ chip_id = __dt_get_chip_id(r->node);
+ if (chip_id == -1)
+ chip_id = 0;
+ path = dt_get_path(r->node);
+ snprintf(job_args[i].job_name,100,
+ "clear %s, %s 0x%"PRIx64" len: 0x%"PRIx64" on %d",
+ r->name, path,
+ job_args[i].s,
+ (job_args[i].e - job_args[i].s),
+ chip_id);
+ free(path);
+ printf("job: %s\n", job_args[i].job_name);
+ jobs[i] = cpu_queue_job_on_node(chip_id,
+ job_args[i].job_name,
+ mem_region_clear_job,
+ &job_args[i]);
+ i++;
+ }
+ cpu_process_local_jobs();
+ l = 0;
+ for(i=0; i < njobs; i++) {
+ cpu_wait_job(jobs[i], true);
+ l += (job_args[i].e - job_args[i].s);
+ printf("Clearing memory... %"PRIu64"/%"PRIu64"GB done\n",
+ l>>30, total>>30);
+ free(job_args[i].job_name);
}
unlock(&mem_region_lock);
+ free(jobs);
+ free(job_args);
}
static void mem_region_add_dt_reserved_node(struct dt_node *parent,
new file mode 100644
@@ -0,0 +1,40 @@
+/* Copyright 2013-2018 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* A dummy cpu.h for tests.
+ * We don't want to include the real skiboot cpu.h, it's PPC-specific
+ */
+
+#ifndef __CPU_H
+#define __CPU_H
+
+#include <stdint.h>
+#include <stdbool.h>
+
+static unsigned int cpu_max_pir = 1;
+struct cpu_thread {
+ unsigned int chip_id;
+};
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+ const char *name,
+ void (*func)(void *data), void *data,
+ bool no_return);
+void cpu_wait_job(struct cpu_job *job, bool free_it);
+void cpu_process_local_jobs(void);
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+ const char *name,
+ void (*func)(void *data), void *data);
+#endif /* __CPU_H */
@@ -17,12 +17,7 @@
#include <config.h>
#define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
- unsigned int chip_id;
-};
+#include "dummy-cpu.h"
#include <stdlib.h>
@@ -18,12 +18,7 @@
#define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
- unsigned int chip_id;
-};
+#include "dummy-cpu.h"
#include <stdlib.h>
@@ -17,12 +17,8 @@
#include <config.h>
#define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
- unsigned int chip_id;
-};
+
+#include "dummy-cpu.h"
#include <stdlib.h>
@@ -15,14 +15,12 @@
*/
#include <config.h>
+#include <stdbool.h>
+#include <stdint.h>
#define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
- unsigned int chip_id;
-};
+
+#include "dummy-cpu.h"
#include <stdlib.h>
#include <string.h>
@@ -17,12 +17,8 @@
#include <config.h>
#define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
- unsigned int chip_id;
-};
+
+#include "dummy-cpu.h"
#include <stdlib.h>
@@ -17,12 +17,8 @@
#include <config.h>
#define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
- unsigned int chip_id;
-};
+
+#include "dummy-cpu.h"
#include <stdlib.h>
#include <string.h>
@@ -17,12 +17,8 @@
#include <config.h>
#define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
- unsigned int chip_id;
-};
+
+#include "dummy-cpu.h"
#include <stdlib.h>
@@ -17,12 +17,8 @@
#include <config.h>
#define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
- unsigned int chip_id;
-};
+
+#include "dummy-cpu.h"
#include <stdlib.h>
@@ -17,12 +17,8 @@
#include <config.h>
#define BITS_PER_LONG (sizeof(long) * 8)
-/* Don't include this, it's PPC-specific */
-#define __CPU_H
-static unsigned int cpu_max_pir = 1;
-struct cpu_thread {
- unsigned int chip_id;
-};
+
+#include "dummy-cpu.h"
#include <stdlib.h>
@@ -16,6 +16,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
+#include <stdint.h>
#include "../../ccan/list/list.c"
@@ -41,6 +42,49 @@ static void stub_function(void)
abort();
}
+struct cpu_thread;
+
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+ const char *name,
+ void (*func)(void *data), void *data,
+ bool no_return);
+void cpu_wait_job(struct cpu_job *job, bool free_it);
+void cpu_process_local_jobs(void);
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+ const char *name,
+ void (*func)(void *data), void *data);
+
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+ const char *name,
+ void (*func)(void *data), void *data)
+{
+ (void)chip_id;
+ return __cpu_queue_job(NULL, name, func, data, false);
+}
+
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+ const char *name,
+ void (*func)(void *data), void *data,
+ bool no_return)
+{
+ (void)cpu;
+ (void)name;
+ (func)(data);
+ (void)no_return;
+ return NULL;
+}
+
+void cpu_wait_job(struct cpu_job *job, bool free_it)
+{
+ (void)job;
+ (void)free_it;
+ return;
+}
+
+void cpu_process_local_jobs(void)
+{
+}
+
#define STUB(fnname) \
void fnname(void) __attribute__((weak, alias ("stub_function")))
@@ -83,6 +83,15 @@ struct cpu_thread {
uint32_t pir;
uint32_t chip_id;
};
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+ const char *name,
+ void (*func)(void *data), void *data,
+ bool no_return);
+void cpu_wait_job(struct cpu_job *job, bool free_it);
+void cpu_process_local_jobs(void);
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+ const char *name,
+ void (*func)(void *data), void *data);
struct cpu_thread __boot_cpu, *boot_cpu = &__boot_cpu;
static unsigned long fake_pvr = PVR_P7;
@@ -18,6 +18,7 @@
#include <stdarg.h>
#include <string.h>
#include <malloc.h>
+#include <stdint.h>
#include <compiler.h>
@@ -84,6 +85,52 @@ void *__zalloc(size_t bytes, const char *location)
return p;
}
+struct cpu_thread;
+
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+ const char *name,
+ void (*func)(void *data), void *data,
+ bool no_return);
+
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+ const char *name,
+ void (*func)(void *data), void *data);
+
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+ const char *name,
+ void (*func)(void *data), void *data)
+{
+ (void)chip_id;
+ return __cpu_queue_job(NULL, name, func, data, false);
+}
+
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+ const char *name,
+ void (*func)(void *data), void *data,
+ bool no_return)
+{
+ (void)cpu;
+ (void)name;
+ (func)(data);
+ (void)no_return;
+ return NULL;
+}
+
+void cpu_wait_job(struct cpu_job *job, bool free_it);
+
+void cpu_wait_job(struct cpu_job *job, bool free_it)
+{
+ (void)job;
+ (void)free_it;
+ return;
+}
+
+void cpu_process_local_jobs(void);
+
+void cpu_process_local_jobs(void)
+{
+}
+
/* Add any stub functions required for linking here. */
static void stub_function(void)
{
@@ -237,9 +237,12 @@ u32 dt_n_size_cells(const struct dt_node *node);
u64 dt_get_number(const void *pdata, unsigned int cells);
/* Find an ibm,chip-id property in this node; if not found, walk up the parent
- * nodes. Returns -1 if no chip-id property exists. */
+ * nodes. */
u32 dt_get_chip_id(const struct dt_node *node);
+/* Same as dt_get_chip_id except Returns -1 if no chip-id property exists. */
+u32 __dt_get_chip_id(const struct dt_node *node);
+
/* Address accessors ("reg" properties parsing). No translation,
* only support "simple" address forms (1 or 2 cells). Asserts
* if address doesn't exist
Arbitrarily pick 16GB as the unit of parallelism, and split up clearing memory into jobs and schedule them node-local to the memory (or on node 0 if we can't work that out because it's the memory up to SKIBOOT_BASE) This seems to cut at least ~40% time from memory zeroing on fast-reboot on a 256GB Boston system. Signed-off-by: Stewart Smith <stewart@linux.ibm.com> --- core/device.c | 2 +- core/mem_region.c | 94 ++++++++++++++++++- core/test/dummy-cpu.h | 40 ++++++++ core/test/run-malloc-speed.c | 7 +- core/test/run-malloc.c | 7 +- core/test/run-mem_range_is_reserved.c | 8 +- core/test/run-mem_region.c | 10 +- core/test/run-mem_region_init.c | 8 +- core/test/run-mem_region_next.c | 8 +- core/test/run-mem_region_release_unused.c | 8 +- .../run-mem_region_release_unused_noalloc.c | 8 +- core/test/run-mem_region_reservations.c | 8 +- core/test/stubs.c | 44 +++++++++ hdata/test/hdata_to_dt.c | 9 ++ hdata/test/stubs.c | 47 ++++++++++ include/device.h | 5 +- 16 files changed, 255 insertions(+), 58 deletions(-) create mode 100644 core/test/dummy-cpu.h