Patchwork [12/12] : sparc64: Use new dynamic per-cpu allocator.

login
register
mail settings
Submitter David Miller
Date April 9, 2009, 5:37 a.m.
Message ID <20090408.223758.81953207.davem@davemloft.net>
Download mbox | patch
Permalink /patch/25762/
State Accepted
Delegated to: David Miller
Headers show

Comments

David Miller - April 9, 2009, 5:37 a.m.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/Kconfig         |    3 +
 arch/sparc/kernel/smp_64.c |  165 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 159 insertions(+), 9 deletions(-)
Sam Ravnborg - April 9, 2009, 6:44 a.m.
On Wed, Apr 08, 2009 at 10:37:58PM -0700, David Miller wrote:
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  arch/sparc/Kconfig         |    3 +
>  arch/sparc/kernel/smp_64.c |  165 +++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 159 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
> index cc12cd4..2185cf9 100644
> --- a/arch/sparc/Kconfig
> +++ b/arch/sparc/Kconfig
> @@ -93,6 +93,9 @@ config AUDIT_ARCH
>  config HAVE_SETUP_PER_CPU_AREA
>  	def_bool y if SPARC64
>  
> +config HAVE_DYNAMIC_PER_CPU_AREA
> +	def_bool y if SPARC64
> +

Not related to this specific patch - but I wonder if there
is any good reason these HAVE_ variables do not follow the
normal pattern with one definition
and users using select to enable it.

	Sam
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo - April 9, 2009, 11:48 a.m.
Hello,

The percpu part looks good to me.  Just one question below.

David Miller wrote:
>  void __init setup_per_cpu_areas(void)
>  {
> -	unsigned long size, i, nr_possible_cpus = num_possible_cpus();
> -	char *ptr;
> +	size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
> +	static struct vm_struct vm;
> +	unsigned long delta, cpu;
> +	size_t pcpu_unit_size;
> +	size_t ptrs_size;
> +
> +	pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
> +			       PERCPU_DYNAMIC_RESERVE);
> +	dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;

Isn't it better to use embedding allocator for !NUMA cases (one less
TLB entry usage for each CPU)?

Thanks.
David Miller - April 9, 2009, 9:29 p.m.
From: Tejun Heo <tj@kernel.org>
Date: Thu, 09 Apr 2009 04:48:12 -0700

> 
> David Miller wrote:
>>  void __init setup_per_cpu_areas(void)
>>  {
>> -	unsigned long size, i, nr_possible_cpus = num_possible_cpus();
>> -	char *ptr;
>> +	size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
>> +	static struct vm_struct vm;
>> +	unsigned long delta, cpu;
>> +	size_t pcpu_unit_size;
>> +	size_t ptrs_size;
>> +
>> +	pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
>> +			       PERCPU_DYNAMIC_RESERVE);
>> +	dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;
> 
> Isn't it better to use embedding allocator for !NUMA cases (one less
> TLB entry usage for each CPU)?

Heck, the embedding case would probably be optimal for Niagara NUMA
systems too.

On Niagara systems all of the "possible" cpu numbers are linear and in
order.  No holes, gaps, or other stuff like this.  So just allocating
big TLB mapping chunks and chopping them up to the individual cpus is
the best scheme possible.

Indeed, these are the kinds of things I plan to experiment with.
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo - April 9, 2009, 9:45 p.m.
David Miller wrote:
>> Isn't it better to use embedding allocator for !NUMA cases (one less
>> TLB entry usage for each CPU)?
> 
> Heck, the embedding case would probably be optimal for Niagara NUMA
> systems too.
> 
> On Niagara systems all of the "possible" cpu numbers are linear and in
> order.  No holes, gaps, or other stuff like this.  So just allocating
> big TLB mapping chunks and chopping them up to the individual cpus is
> the best scheme possible.

Sounds great.  I wonder whether the remap allocator could be replaced
with the embed allocator, given the right parameters, on x86 too.

Thanks.

Patch

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index cc12cd4..2185cf9 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -93,6 +93,9 @@  config AUDIT_ARCH
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y if SPARC64
 
+config HAVE_DYNAMIC_PER_CPU_AREA
+	def_bool y if SPARC64
+
 config GENERIC_HARDIRQS_NO__DO_IRQ
 	bool
 	def_bool y if SPARC64
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 168025f..f1c8208 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -21,6 +21,7 @@ 
 #include <linux/jiffies.h>
 #include <linux/profile.h>
 #include <linux/bootmem.h>
+#include <linux/vmalloc.h>
 #include <linux/cpu.h>
 
 #include <asm/head.h>
@@ -1371,19 +1372,165 @@  void smp_send_stop(void)
 {
 }
 
+/**
+ * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
+ * @cpu: cpu to allocate for
+ * @size: size allocation in bytes
+ * @align: alignment
+ *
+ * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
+ * does the right thing for NUMA regardless of the current
+ * configuration.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
+ */
+static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
+					unsigned long align)
+{
+	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	int node = cpu_to_node(cpu);
+	void *ptr;
+
+	if (!node_online(node) || !NODE_DATA(node)) {
+		ptr = __alloc_bootmem(size, align, goal);
+		pr_info("cpu %d has no node %d or node-local memory\n",
+			cpu, node);
+		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
+			 cpu, size, __pa(ptr));
+	} else {
+		ptr = __alloc_bootmem_node(NODE_DATA(node),
+					   size, align, goal);
+		pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
+			 "%016lx\n", cpu, size, node, __pa(ptr));
+	}
+	return ptr;
+#else
+	return __alloc_bootmem(size, align, goal);
+#endif
+}
+
+static size_t pcpur_size __initdata;
+static void **pcpur_ptrs __initdata;
+
+static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
+{
+	size_t off = (size_t)pageno << PAGE_SHIFT;
+
+	if (off >= pcpur_size)
+		return NULL;
+
+	return virt_to_page(pcpur_ptrs[cpu] + off);
+}
+
+#define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL)
+
+static void __init pcpu_map_range(unsigned long start, unsigned long end,
+				  struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long pte_base;
+
+	BUG_ON((pfn<<PAGE_SHIFT)&(PCPU_CHUNK_SIZE - 1UL));
+
+	pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U |
+		    _PAGE_CP_4U | _PAGE_CV_4U |
+		    _PAGE_P_4U | _PAGE_W_4U);
+	if (tlb_type == hypervisor)
+		pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V |
+			    _PAGE_CP_4V | _PAGE_CV_4V |
+			    _PAGE_P_4V | _PAGE_W_4V);
+
+	while (start < end) {
+		pgd_t *pgd = pgd_offset_k(start);
+		unsigned long this_end;
+		pud_t *pud;
+		pmd_t *pmd;
+		pte_t *pte;
+
+		pud = pud_offset(pgd, start);
+		if (pud_none(*pud)) {
+			pmd_t *new;
+
+			new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
+			pud_populate(&init_mm, pud, new);
+		}
+
+		pmd = pmd_offset(pud, start);
+		if (!pmd_present(*pmd)) {
+			pte_t *new;
+
+			new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
+			pmd_populate_kernel(&init_mm, pmd, new);
+		}
+
+		pte = pte_offset_kernel(pmd, start);
+		this_end = (start + PMD_SIZE) & PMD_MASK;
+		if (this_end > end)
+			this_end = end;
+
+		while (start < this_end) {
+			unsigned long paddr = pfn << PAGE_SHIFT;
+
+			pte_val(*pte) = (paddr | pte_base);
+
+			start += PAGE_SIZE;
+			pte++;
+			pfn++;
+		}
+	}
+}
+
 void __init setup_per_cpu_areas(void)
 {
-	unsigned long size, i, nr_possible_cpus = num_possible_cpus();
-	char *ptr;
+	size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
+	static struct vm_struct vm;
+	unsigned long delta, cpu;
+	size_t pcpu_unit_size;
+	size_t ptrs_size;
+
+	pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+			       PERCPU_DYNAMIC_RESERVE);
+	dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;
+
+
+	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
+	pcpur_ptrs = alloc_bootmem(ptrs_size);
+
+	for_each_possible_cpu(cpu) {
+		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
+						     PCPU_CHUNK_SIZE);
+
+		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
+			     PCPU_CHUNK_SIZE - pcpur_size);
+
+		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+	}
+
+	/* allocate address and map */
+	vm.flags = VM_ALLOC;
+	vm.size = num_possible_cpus() * PCPU_CHUNK_SIZE;
+	vm_area_register_early(&vm, PCPU_CHUNK_SIZE);
+
+	for_each_possible_cpu(cpu) {
+		unsigned long start = (unsigned long) vm.addr;
+		unsigned long end;
+
+		start += cpu * PCPU_CHUNK_SIZE;
+		end = start + PCPU_CHUNK_SIZE;
+		pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu]));
+	}
+
+	pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+						PERCPU_MODULE_RESERVE, dyn_size,
+						PCPU_CHUNK_SIZE, vm.addr, NULL);
 
-	/* Copy section for each CPU (we discard the original) */
-	size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
-	ptr = alloc_bootmem_pages(size * nr_possible_cpus);
+	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
 
-	for_each_possible_cpu(i) {
-		__per_cpu_offset(i) = ptr - __per_cpu_start;
-		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-		ptr += size;
+	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+	for_each_possible_cpu(cpu) {
+		__per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
 	}
 
 	/* Setup %g5 for the boot cpu.  */