fast-reboot: parallel memory clearing

Message ID 20180417084607.18252-1-stewart@linux.ibm.com
State Superseded
Headers show
Series
  • fast-reboot: parallel memory clearing
Related show

Commit Message

Stewart Smith April 17, 2018, 8:46 a.m.
A real simple hack to split things up into 16GB jobs,
and name them as how far we are into zeroing, so we can
simply print progress out as XGB cleared as we wait for
the jobs to finish.

This seems to cut at least ~40% time from memory zeroing on
fast-reboot on a 256GB Boston system.

Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
---
 core/mem_region.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 2 deletions(-)

Comments

Nicholas Piggin April 17, 2018, 9:30 a.m. | #1
On Tue, 17 Apr 2018 18:46:07 +1000
Stewart Smith <stewart@linux.ibm.com> wrote:

> A real simple hack to split things up into 16GB jobs,
> and name them as how far we are into zeroing, so we can
> simply print progress out as XGB cleared as we wait for
> the jobs to finish.
> 
> This seems to cut at least ~40% time from memory zeroing on
> fast-reboot on a 256GB Boston system.

Cool! I'm assuming this is just an initial hack, but I'll give some
comments anyway.

> 
> Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
> ---
>  core/mem_region.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 61 insertions(+), 2 deletions(-)
> 
> diff --git a/core/mem_region.c b/core/mem_region.c
> index 8ae49bb790fd..a06e725db3ec 100644
> --- a/core/mem_region.c
> +++ b/core/mem_region.c
> @@ -1206,19 +1206,47 @@ static void mem_clear_range(uint64_t s, uint64_t e)
>  		return;
>  	}
>  
> -	prlog(PR_NOTICE, "Clearing region %llx-%llx\n",
> +	prlog(PR_DEBUG, "Clearing region %llx-%llx\n",
>  	      (long long)s, (long long)e);
>  	memset((void *)s, 0, e - s);
>  }
>  
> +static void mem_region_clear_job(void *data)
> +{
> +	uint64_t *arg = (uint64_t*)data;
> +	mem_clear_range(arg[0], arg[1]);
> +}
> +
> +#define MEM_REGION_CLEAR_JOB_SIZE (16ULL*(1<<30))
> +
>  void mem_region_clear_unused(void)
>  {
> +	int njobs = 0;
> +	struct cpu_job **jobs;
>  	struct mem_region *r;
> +	uint64_t *job_args;
> +	uint64_t s,l;
> +	uint64_t total = 0;
> +	char **job_names;
> +	int i;

Can you do a struct for all of these dynamic allocations?

>  
>  	lock(&mem_region_lock);
>  	assert(mem_regions_finalised);
>  
> +	list_for_each(&regions, r, list) {
> +		if (!(r->type == REGION_OS))
> +			continue;
> +		njobs++;
> +		/* One job per 16GB */
> +		njobs += r->len / MEM_REGION_CLEAR_JOB_SIZE;

A quick glance and I thought you had an off by one there in case of not
16GB length, but no, because that initial increment. So maybe you have
one too many if it is 16GB multiple?

How about

  njobs += (r->len + MEM_REGION_CLEAR_JOB_SIZE - 1) /
                   MEM_REGION_CLEAR_JOB_SIZE;


> @@ -1226,9 +1254,40 @@ void mem_region_clear_unused(void)
>  
>  		assert(r != &skiboot_heap);
>  
> -		mem_clear_range(r->start, r->start + r->len);
> +		s = r->start;
> +		l = r->len;
> +		while(l > MEM_REGION_CLEAR_JOB_SIZE) {
> +			job_args[i*2] = s+l - MEM_REGION_CLEAR_JOB_SIZE;
> +			job_args[i*2+1] = s+l;
> +			l-=MEM_REGION_CLEAR_JOB_SIZE;
> +			job_names[i] = malloc(sizeof(char)*40);
> +			total+=MEM_REGION_CLEAR_JOB_SIZE;
> +			snprintf(job_names[i], 40, "%lldGB cleared", total/(1<<30));
> +			jobs[i] = cpu_queue_job(NULL, job_names[i],
> +						mem_region_clear_job,
> +						&job_args[i*2]);
> +			i++;

I'd be going the other way and incrementing s by the job size on each
round, and assigning a length of max(job size, l - s) each iteration,
so you should get them all into the one loop. Just my preference.

Bonus points for scheduling memory clearing on to node local cores.
That looks like it will take a bit of work in the job scheduler code,
so probably best left to patch 2.

Thanks,
Nick
Stewart Smith April 17, 2018, 10:11 a.m. | #2
Nicholas Piggin <nicholas.piggin@gmail.com> writes:

> On Tue, 17 Apr 2018 18:46:07 +1000
> Stewart Smith <stewart@linux.ibm.com> wrote:
>
>> A real simple hack to split things up into 16GB jobs,
>> and name them as how far we are into zeroing, so we can
>> simply print progress out as XGB cleared as we wait for
>> the jobs to finish.
>> 
>> This seems to cut at least ~40% time from memory zeroing on
>> fast-reboot on a 256GB Boston system.
>
> Cool! I'm assuming this is just an initial hack, but I'll give some
> comments anyway.

Initial hack *while* distracted and trying not to accidentally 'git
reset --hard' the wrong tree. Miracle it boots at all. git-send-email is
my backup system.

>> Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
>> ---
>>  core/mem_region.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 61 insertions(+), 2 deletions(-)
>> 
>> diff --git a/core/mem_region.c b/core/mem_region.c
>> index 8ae49bb790fd..a06e725db3ec 100644
>> --- a/core/mem_region.c
>> +++ b/core/mem_region.c
>> @@ -1206,19 +1206,47 @@ static void mem_clear_range(uint64_t s, uint64_t e)
>>  		return;
>>  	}
>>  
>> -	prlog(PR_NOTICE, "Clearing region %llx-%llx\n",
>> +	prlog(PR_DEBUG, "Clearing region %llx-%llx\n",
>>  	      (long long)s, (long long)e);
>>  	memset((void *)s, 0, e - s);
>>  }
>>  
>> +static void mem_region_clear_job(void *data)
>> +{
>> +	uint64_t *arg = (uint64_t*)data;
>> +	mem_clear_range(arg[0], arg[1]);
>> +}
>> +
>> +#define MEM_REGION_CLEAR_JOB_SIZE (16ULL*(1<<30))
>> +
>>  void mem_region_clear_unused(void)
>>  {
>> +	int njobs = 0;
>> +	struct cpu_job **jobs;
>>  	struct mem_region *r;
>> +	uint64_t *job_args;
>> +	uint64_t s,l;
>> +	uint64_t total = 0;
>> +	char **job_names;
>> +	int i;
>
> Can you do a struct for all of these dynamic allocations?

Yeah, that makes a lot more sense. I should probably name the jobs
separately from what I log too.

>>  	lock(&mem_region_lock);
>>  	assert(mem_regions_finalised);
>>  
>> +	list_for_each(&regions, r, list) {
>> +		if (!(r->type == REGION_OS))
>> +			continue;
>> +		njobs++;
>> +		/* One job per 16GB */
>> +		njobs += r->len / MEM_REGION_CLEAR_JOB_SIZE;
>
> A quick glance and I thought you had an off by one there in case of not
> 16GB length, but no, because that initial increment. So maybe you have
> one too many if it is 16GB multiple?
>
> How about
>
>   njobs += (r->len + MEM_REGION_CLEAR_JOB_SIZE - 1) /
>                    MEM_REGION_CLEAR_JOB_SIZE;

Yeah, that looks better.

>> @@ -1226,9 +1254,40 @@ void mem_region_clear_unused(void)
>>  
>>  		assert(r != &skiboot_heap);
>>  
>> -		mem_clear_range(r->start, r->start + r->len);
>> +		s = r->start;
>> +		l = r->len;
>> +		while(l > MEM_REGION_CLEAR_JOB_SIZE) {
>> +			job_args[i*2] = s+l - MEM_REGION_CLEAR_JOB_SIZE;
>> +			job_args[i*2+1] = s+l;
>> +			l-=MEM_REGION_CLEAR_JOB_SIZE;
>> +			job_names[i] = malloc(sizeof(char)*40);
>> +			total+=MEM_REGION_CLEAR_JOB_SIZE;
>> +			snprintf(job_names[i], 40, "%lldGB cleared", total/(1<<30));
>> +			jobs[i] = cpu_queue_job(NULL, job_names[i],
>> +						mem_region_clear_job,
>> +						&job_args[i*2]);
>> +			i++;
>
> I'd be going the other way and incrementing s by the job size on each
> round, and assigning a length of max(job size, l - s) each iteration,
> so you should get them all into the one loop. Just my preference.

Yeah, that's nicer.

> Bonus points for scheduling memory clearing on to node local cores.
> That looks like it will take a bit of work in the job scheduler code,
> so probably best left to patch 2.

Yeah, I initially started down that route and quickly realised that it
needed multiple patches :)
Nicholas Piggin April 18, 2018, 3:55 a.m. | #3
On Tue, 17 Apr 2018 20:11:41 +1000
Stewart Smith <stewart@linux.ibm.com> wrote:

> Nicholas Piggin <nicholas.piggin@gmail.com> writes:

> > Bonus points for scheduling memory clearing on to node local cores.
> > That looks like it will take a bit of work in the job scheduler code,
> > so probably best left to patch 2.  
> 
> Yeah, I initially started down that route and quickly realised that it
> needed multiple patches :)
> 

I made a start on the job scheduler side when I did the clearing patch,
before giving up on the bit you just did :)

Here's the patch if you want (not really tested). I'd be nice to put the
chip id into memory regions, then it seems like it should be pretty easy
to add on top of your patch.

I made it strict (run on this chip_id or return NULL) because I figured
firmware might get some particular requirements like that. But it's
easy to make it a fallback, just test if (!cpu) cpu = cpu_find_job_target(-1);
-- you'd have to put a fallback into the caller otherwise.

Thanks,
Nick

---
 core/cpu.c    | 77 ++++++++++++++++++++++++++++++++++++++++-----------
 include/cpu.h |  4 +++
 2 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/core/cpu.c b/core/cpu.c
index 6826fee0..04de8c34 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -103,7 +103,11 @@ static void cpu_wake(struct cpu_thread *cpu)
 	}
 }
 
-static struct cpu_thread *cpu_find_job_target(void)
+/*
+ * If chip_id is >= 0, schedule the job on that node.
+ * Otherwise schedule the job anywhere.
+ */
+static struct cpu_thread *cpu_find_job_target(int32_t chip_id)
 {
 	struct cpu_thread *cpu, *best, *me = this_cpu();
 	uint32_t best_count;
@@ -123,6 +127,8 @@ static struct cpu_thread *cpu_find_job_target(void)
 	for_each_available_cpu(cpu) {
 		if (cpu == me || !cpu_is_thread0(cpu) || cpu->job_has_no_return)
 			continue;
+		if (chip_id >= 0 && cpu->chip_id != chip_id)
+			continue;
 		if (cpu->job_count)
 			continue;
 		lock(&cpu->job_lock);
@@ -142,6 +148,8 @@ static struct cpu_thread *cpu_find_job_target(void)
 	for_each_available_cpu(cpu) {
 		if (cpu == me || cpu->job_has_no_return)
 			continue;
+		if (chip_id >= 0 && cpu->chip_id != chip_id)
+			continue;
 		if (!best || cpu->job_count < best_count) {
 			best = cpu;
 			best_count = cpu->job_count;
@@ -164,6 +172,26 @@ static struct cpu_thread *cpu_find_job_target(void)
 	return NULL;
 }
 
+/* job_lock is held, returns with it released */
+static void queue_job_on_cpu(struct cpu_thread *cpu, struct cpu_job *job)
+{
+	/* That's bad, the job will never run */
+	if (cpu->job_has_no_return) {
+		prlog(PR_WARNING, "WARNING ! Job %s scheduled on CPU 0x%x"
+		      " which has a no-return job on its queue !\n",
+		      job->name, cpu->pir);
+		backtrace();
+	}
+	list_add_tail(&cpu->job_queue, &job->link);
+	if (job->no_return)
+		cpu->job_has_no_return = true;
+	else
+		cpu->job_count++;
+	if (pm_enabled)
+		cpu_wake(cpu);
+	unlock(&cpu->job_lock);
+}
+
 struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
 				const char *name,
 				void (*func)(void *data), void *data,
@@ -193,7 +221,7 @@ struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
 
 	/* Pick a candidate. Returns with target queue locked */
 	if (cpu == NULL)
-		cpu = cpu_find_job_target();
+		cpu = cpu_find_job_target(-1);
 	else if (cpu != this_cpu())
 		lock(&cpu->job_lock);
 	else
@@ -206,21 +234,38 @@ struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
 		return job;
 	}
 
-	/* That's bad, the job will never run */
-	if (cpu->job_has_no_return) {
-		prlog(PR_WARNING, "WARNING ! Job %s scheduled on CPU 0x%x"
-		      " which has a no-return job on its queue !\n",
-		      job->name, cpu->pir);
-		backtrace();
+	queue_job_on_cpu(cpu, job);
+
+	return job;
+}
+
+struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+				const char *name,
+				void (*func)(void *data), void *data)
+{
+	struct cpu_thread *cpu;
+	struct cpu_job *job;
+
+	job = zalloc(sizeof(struct cpu_job));
+	if (!job)
+		return NULL;
+	job->func = func;
+	job->data = data;
+	job->name = name;
+	job->complete = false;
+	job->no_return = false;
+
+	/* Pick a candidate. Returns with target queue locked */
+	cpu = cpu_find_job_target(chip_id);
+
+	/* Can't be scheduled, run it now */
+	if (cpu == NULL) {
+		unlock(&cpu->job_lock);
+		free(job);
+		return NULL;
 	}
-	list_add_tail(&cpu->job_queue, &job->link);
-	if (no_return)
-		cpu->job_has_no_return = true;
-	else
-		cpu->job_count++;
-	if (pm_enabled)
-		cpu_wake(cpu);
-	unlock(&cpu->job_lock);
+
+	queue_job_on_cpu(cpu, job);
 
 	return job;
 }
diff --git a/include/cpu.h b/include/cpu.h
index b7cd588d..175b7c28 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -281,6 +281,10 @@ static inline struct cpu_job *cpu_queue_job(struct cpu_thread *cpu,
 	return __cpu_queue_job(cpu, name, func, data, false);
 }
 
+extern struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
+				       const char *name,
+				       void (*func)(void *data), void *data);
+
 
 /* Poll job status, returns true if completed */
 extern bool cpu_poll_job(struct cpu_job *job);

Patch

diff --git a/core/mem_region.c b/core/mem_region.c
index 8ae49bb790fd..a06e725db3ec 100644
--- a/core/mem_region.c
+++ b/core/mem_region.c
@@ -1206,19 +1206,47 @@  static void mem_clear_range(uint64_t s, uint64_t e)
 		return;
 	}
 
-	prlog(PR_NOTICE, "Clearing region %llx-%llx\n",
+	prlog(PR_DEBUG, "Clearing region %llx-%llx\n",
 	      (long long)s, (long long)e);
 	memset((void *)s, 0, e - s);
 }
 
+static void mem_region_clear_job(void *data)
+{
+	uint64_t *arg = (uint64_t*)data;
+	mem_clear_range(arg[0], arg[1]);
+}
+
+#define MEM_REGION_CLEAR_JOB_SIZE (16ULL*(1<<30))
+
 void mem_region_clear_unused(void)
 {
+	int njobs = 0;
+	struct cpu_job **jobs;
 	struct mem_region *r;
+	uint64_t *job_args;
+	uint64_t s,l;
+	uint64_t total = 0;
+	char **job_names;
+	int i;
 
 	lock(&mem_region_lock);
 	assert(mem_regions_finalised);
 
+	list_for_each(&regions, r, list) {
+		if (!(r->type == REGION_OS))
+			continue;
+		njobs++;
+		/* One job per 16GB */
+		njobs += r->len / MEM_REGION_CLEAR_JOB_SIZE;
+	}
+
+	jobs = malloc(njobs * sizeof(struct cpu_job*));
+	job_args = malloc(2*njobs*sizeof(uint64_t));
+	job_names = malloc(njobs*sizeof(char*));
+
 	prlog(PR_NOTICE, "Clearing unused memory:\n");
+	i = 0;
 	list_for_each(&regions, r, list) {
 		/* If it's not unused, ignore it. */
 		if (!(r->type == REGION_OS))
@@ -1226,9 +1254,40 @@  void mem_region_clear_unused(void)
 
 		assert(r != &skiboot_heap);
 
-		mem_clear_range(r->start, r->start + r->len);
+		s = r->start;
+		l = r->len;
+		while(l > MEM_REGION_CLEAR_JOB_SIZE) {
+			job_args[i*2] = s+l - MEM_REGION_CLEAR_JOB_SIZE;
+			job_args[i*2+1] = s+l;
+			l-=MEM_REGION_CLEAR_JOB_SIZE;
+			job_names[i] = malloc(sizeof(char)*40);
+			total+=MEM_REGION_CLEAR_JOB_SIZE;
+			snprintf(job_names[i], 40, "%lldGB cleared", total/(1<<30));
+			jobs[i] = cpu_queue_job(NULL, job_names[i],
+						mem_region_clear_job,
+						&job_args[i*2]);
+			i++;
+		}
+		job_args[i*2] = s;
+		job_args[i*2+1] = s+l;
+		job_names[i] = malloc(sizeof(char)*40);
+		total+=l;
+		snprintf(job_names[i], 40, "%lldGB cleared", total/(1<<30));
+		jobs[i] = cpu_queue_job(NULL, job_names[i],
+					mem_region_clear_job,
+					&job_args[i*2]);
+		i++;
+	}
+	cpu_process_local_jobs();
+	for(i=0; i < njobs; i++) {
+		cpu_wait_job(jobs[i], true);
+		printf("%s\n",job_names[i]);
+		free(job_names[i]);
 	}
 	unlock(&mem_region_lock);
+	free(jobs);
+	free(job_args);
+	free(job_names);
 }
 
 static void mem_region_add_dt_reserved_node(struct dt_node *parent,