diff mbox series

[kernel,v3,10/22] powerpc/pseries/iommu: Use memory@ nodes in max RAM address calculation

Message ID 20181113082823.2440-11-aik@ozlabs.ru (mailing list archive)
State Superseded
Headers show
Series powerpc/powernv/npu, vfio: NVIDIA V100 + P9 passthrough | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch warning next/apply_patch Patch failed to apply
snowpatch_ozlabs/apply_patch fail Failed to apply to any branch

Commit Message

Alexey Kardashevskiy Nov. 13, 2018, 8:28 a.m. UTC
We might have memory@ nodes with "linux,usable-memory" set to zero
(for example, to replicate powernv's behaviour for GPU coherent memory)
which means that the memory needs an extra initialization but since
it can be used afterwards, the pseries platform will try mapping it
for DMA so the DMA window needs to cover those memory regions too.

This walks through the memory nodes to find the highest RAM address to
let a huge DMA window cover that too in case this memory gets onlined
later.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/platforms/pseries/iommu.c | 43 +++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

Comments

David Gibson Nov. 16, 2018, 5:23 a.m. UTC | #1
On Tue, Nov 13, 2018 at 07:28:11PM +1100, Alexey Kardashevskiy wrote:
> We might have memory@ nodes with "linux,usable-memory" set to zero
> (for example, to replicate powernv's behaviour for GPU coherent memory)
> which means that the memory needs an extra initialization but since
> it can be used afterwards, the pseries platform will try mapping it
> for DMA so the DMA window needs to cover those memory regions too.
> 
> This walks through the memory nodes to find the highest RAM address to
> let a huge DMA window cover that too in case this memory gets onlined
> later.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 43 +++++++++++++++++++++++++-
>  1 file changed, 42 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 78473ac..f818737 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -967,6 +967,47 @@ struct failed_ddw_pdn {
>  
>  static LIST_HEAD(failed_ddw_pdn_list);
>  
> +static unsigned long read_n_cells(int n, const __be32 **buf)
> +{
> +	unsigned long result = 0;
> +
> +	while (n--) {
> +		result = (result << 32) | of_read_number(*buf, 1);
> +		(*buf)++;
> +	}
> +	return result;
> +}

Um.. this appears to be re-implementing of_read_number() in terms of
of_read_number().   Wat!?

> +static phys_addr_t ddw_memory_hotplug_max(void)
> +{
> +	phys_addr_t max_addr = memory_hotplug_max();
> +	struct device_node *memory;
> +
> +	for_each_node_by_type(memory, "memory") {
> +		unsigned long start, size;
> +		int ranges, n_mem_addr_cells, n_mem_size_cells, len;
> +		const __be32 *memcell_buf;
> +
> +		memcell_buf = of_get_property(memory, "reg", &len);
> +		if (!memcell_buf || len <= 0)
> +			continue;
> +
> +		n_mem_addr_cells = of_n_addr_cells(memory);
> +		n_mem_size_cells = of_n_size_cells(memory);
> +
> +		/* ranges in cell */
> +		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
> +
> +		/* these are order-sensitive, and modify the buffer pointer */
> +		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
> +		size = read_n_cells(n_mem_size_cells, &memcell_buf);
> +
> +		max_addr = max_t(phys_addr_t, max_addr, start + size);
> +	}
> +
> +	return max_addr;
> +}

Is there really no existing place we keep track of maximum possible
memory address?

>  /*
>   * If the PE supports dynamic dma windows, and there is space for a table
>   * that can map all pages in a linear offset, then setup such a table,
> @@ -1067,7 +1108,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn,
>  	}
>  	/* verify the window * number of ptes will map the partition */
>  	/* check largest block * page size > max memory hotplug addr */
> -	max_addr = memory_hotplug_max();
> +	max_addr = ddw_memory_hotplug_max();
>  	if (query.largest_available_block < (max_addr >> page_shift)) {
>  		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
>  			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
Alexey Kardashevskiy Nov. 19, 2018, 7:43 a.m. UTC | #2
On 16/11/2018 16:23, David Gibson wrote:
> On Tue, Nov 13, 2018 at 07:28:11PM +1100, Alexey Kardashevskiy wrote:
>> We might have memory@ nodes with "linux,usable-memory" set to zero
>> (for example, to replicate powernv's behaviour for GPU coherent memory)
>> which means that the memory needs an extra initialization but since
>> it can be used afterwards, the pseries platform will try mapping it
>> for DMA so the DMA window needs to cover those memory regions too.
>>
>> This walks through the memory nodes to find the highest RAM address to
>> let a huge DMA window cover that too in case this memory gets onlined
>> later.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  arch/powerpc/platforms/pseries/iommu.c | 43 +++++++++++++++++++++++++-
>>  1 file changed, 42 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>> index 78473ac..f818737 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -967,6 +967,47 @@ struct failed_ddw_pdn {
>>  
>>  static LIST_HEAD(failed_ddw_pdn_list);
>>  
>> +static unsigned long read_n_cells(int n, const __be32 **buf)
>> +{
>> +	unsigned long result = 0;
>> +
>> +	while (n--) {
>> +		result = (result << 32) | of_read_number(*buf, 1);
>> +		(*buf)++;
>> +	}
>> +	return result;
>> +}
> 
> Um.. this appears to be re-implementing of_read_number() in terms of
> of_read_number().   Wat!?


This is a cut-n-paste from arch/powerpc/mm/numa.c :) My bad, I did not
think much when I did this.


> 
>> +static phys_addr_t ddw_memory_hotplug_max(void)
>> +{
>> +	phys_addr_t max_addr = memory_hotplug_max();
>> +	struct device_node *memory;
>> +
>> +	for_each_node_by_type(memory, "memory") {
>> +		unsigned long start, size;
>> +		int ranges, n_mem_addr_cells, n_mem_size_cells, len;
>> +		const __be32 *memcell_buf;
>> +
>> +		memcell_buf = of_get_property(memory, "reg", &len);
>> +		if (!memcell_buf || len <= 0)
>> +			continue;
>> +
>> +		n_mem_addr_cells = of_n_addr_cells(memory);
>> +		n_mem_size_cells = of_n_size_cells(memory);
>> +
>> +		/* ranges in cell */
>> +		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
>> +
>> +		/* these are order-sensitive, and modify the buffer pointer */
>> +		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
>> +		size = read_n_cells(n_mem_size_cells, &memcell_buf);
>> +
>> +		max_addr = max_t(phys_addr_t, max_addr, start + size);
>> +	}
>> +
>> +	return max_addr;
>> +}
> 
> Is there really no existing place we keep track of maximum possible
> memory address?

There are:

1. memblocks from mm/memblock.c - populated at the boot time from
"usable" memory@ nodes and mine are not "usable";

2. drmem from mm/drmem.c - populated from ibm,dynamic-memory-v2 - these
things do not support sparse regions so when I tried these with a GPU
RAM region mapped at 0x244000000000 - the device tree quickly grew to
over 1 MB and then qemu crashed, I did not debug any further as this
memory is not hotpluggable anyway from the rtas/qemu perspective, in
other words it is not something the user can hotplug or unplug.

And that is it afaict.


> 
>>  /*
>>   * If the PE supports dynamic dma windows, and there is space for a table
>>   * that can map all pages in a linear offset, then setup such a table,
>> @@ -1067,7 +1108,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn,
>>  	}
>>  	/* verify the window * number of ptes will map the partition */
>>  	/* check largest block * page size > max memory hotplug addr */
>> -	max_addr = memory_hotplug_max();
>> +	max_addr = ddw_memory_hotplug_max();
>>  	if (query.largest_available_block < (max_addr >> page_shift)) {
>>  		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
>>  			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
>
diff mbox series

Patch

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 78473ac..f818737 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -967,6 +967,47 @@  struct failed_ddw_pdn {
 
 static LIST_HEAD(failed_ddw_pdn_list);
 
+/*
+ * Highest RAM address a DMA window must cover: memory_hotplug_max() raised
+ * to the end of every "reg" range of every "memory" node. These nodes are
+ * walked regardless of "linux,usable-memory", so memory that is not usable
+ * at boot but may get onlined later is covered too.
+ */
+static phys_addr_t ddw_memory_hotplug_max(void)
+{
+	phys_addr_t max_addr = memory_hotplug_max();
+	struct device_node *memory;
+
+	for_each_node_by_type(memory, "memory") {
+		int ranges, n_mem_addr_cells, n_mem_size_cells, len;
+		const __be32 *memcell_buf;
+
+		memcell_buf = of_get_property(memory, "reg", &len);
+		if (!memcell_buf || len <= 0)
+			continue;
+
+		n_mem_addr_cells = of_n_addr_cells(memory);
+		n_mem_size_cells = of_n_size_cells(memory);
+		if (n_mem_addr_cells + n_mem_size_cells <= 0)
+			continue;
+
+		/* number of complete (address, size) pairs in "reg" */
+		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
+		while (ranges-- > 0) {
+			u64 start, size;
+
+			start = of_read_number(memcell_buf, n_mem_addr_cells);
+			memcell_buf += n_mem_addr_cells;
+			size = of_read_number(memcell_buf, n_mem_size_cells);
+			memcell_buf += n_mem_size_cells;
+
+			max_addr = max_t(phys_addr_t, max_addr, start + size);
+		}
+	}
+
+	return max_addr;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1067,7 +1108,7 @@  static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn,
 	}
 	/* verify the window * number of ptes will map the partition */
 	/* check largest block * page size > max memory hotplug addr */
-	max_addr = memory_hotplug_max();
+	max_addr = ddw_memory_hotplug_max();
 	if (query.largest_available_block < (max_addr >> page_shift)) {
 		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
 			  "%llu-sized pages\n", max_addr,  query.largest_available_block,