Patchwork [v2] powerpc/mm: using two zones for freescale 64 bit kernel

login
register
mail settings
Submitter shaohui xie
Date Aug. 24, 2012, 10:50 a.m.
Message ID <1345805425-3829-1-git-send-email-Shaohui.Xie@freescale.com>
Download mbox | patch
Permalink /patch/179828/
State Changes Requested
Headers show

Comments

shaohui xie - Aug. 24, 2012, 10:50 a.m.
The PowerPC platform only supports the ZONE_DMA zone for 64-bit kernels, so
all memory is put into this zone. If the memory size is greater than the
device's DMA capability and the device uses dma_alloc_coherent to allocate
memory, it can get an address beyond the device's DMA addressing range, and
the device will fail.

So we split the memory into two zones: ZONE_DMA32 and ZONE_NORMAL. Since
we already allocate PCICSRBAR/PEXCSRBAR right below the 4G boundary (if the
lowest PCI address is above 4G), we constrain ZONE_DMA32 to 2GB. We also
clear the __GFP_DMA and __GFP_DMA32 flags, and set __GFP_DMA32 only if the
device's dma_mask < total memory size. By doing this, devices which cannot
DMA to all of memory are limited to ZONE_DMA32, while devices which can DMA
to all of memory are not affected by this limitation.

Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
Signed-off-by: Mingkai Hu <Mingkai.hu@freescale.com>
Signed-off-by: Chen Yuanquan <B41889@freescale.com>
---
changes for v2:
1. use a config option for using two zones (ZONE_DMA32 & ZONE_NORMAL) in
freescale 64 bit kernel.

 arch/powerpc/Kconfig      |    3 +++
 arch/powerpc/kernel/dma.c |   15 +++++++++++++++
 arch/powerpc/mm/mem.c     |    4 ++++
 3 files changed, 22 insertions(+), 0 deletions(-)
Kumar Gala - Aug. 30, 2012, 8:49 p.m.
On Aug 24, 2012, at 5:50 AM, Shaohui Xie wrote:

> PowerPC platform only supports ZONE_DMA zone for 64bit kernel, so all the
> memory will be put into this zone. If the memory size is greater than
> the device's DMA capability and device uses dma_alloc_coherent to allocate
> memory, it will get an address which is over the device's DMA addressing,
> the device will fail.
> 
> So we split the memory to two zones: zone ZONE_DMA32 & ZONE_NORMAL, since
> we already allocate PCICSRBAR/PEXCSRBAR right below the 4G boundary (if the
> lowest PCI address is above 4G), so we constrain the DMA zone ZONE_DMA32
> to 2GB, also, we clear flag __GFP_DMA & __GFP_DMA32 and set __GFP_DMA32 only
> if the device's dma_mask < total memory size. By doing this, devices which
> cannot DMA all the memory will be limited to ZONE_DMA32, but devices which
> can DMA all the memory will not be affected by this limitation.
> 
> Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
> Signed-off-by: Mingkai Hu <Mingkai.hu@freescale.com>
> Signed-off-by: Chen Yuanquan <B41889@freescale.com>
> ---
> changes for v2:
> 1. use a config option for using two zones (ZONE_DMA32 & ZONE_NORMAL) in
> freescale 64 bit kernel.
> 
> arch/powerpc/Kconfig      |    3 +++
> arch/powerpc/kernel/dma.c |   15 +++++++++++++++
> arch/powerpc/mm/mem.c     |    4 ++++
> 3 files changed, 22 insertions(+), 0 deletions(-)

Ben,

What's the feeling of doing this on ppc64 always? 

- k

> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 352f416..a96fbbb 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -629,6 +629,9 @@ config ZONE_DMA
> 	bool
> 	default y
> 
> +config ZONE_DMA32
> +	def_bool (PPC64 && PPC_FSL_BOOK3E)
> +
> config NEED_DMA_MAP_STATE
> 	def_bool (PPC64 || NOT_COHERENT_CACHE)
> 
> diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
> index 355b9d8..cbf5ac1 100644
> --- a/arch/powerpc/kernel/dma.c
> +++ b/arch/powerpc/kernel/dma.c
> @@ -41,9 +41,24 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size,
> #else
> 	struct page *page;
> 	int node = dev_to_node(dev);
> +#ifdef CONFIG_ZONE_DMA32
> +	phys_addr_t top_ram_pfn = memblock_end_of_DRAM();
> 
> +	/*
> +	 * check for crappy device which has dma_mask < ZONE_DMA, and
> +	 * we are not going to support it, just warn and fail.
> +	 */
> +	if (*dev->dma_mask < DMA_BIT_MASK(31)) {
> +		dev_err(dev, "Unsupported dma_mask 0x%llx\n", *dev->dma_mask);
> +		return NULL;
> +	}
> 	/* ignore region specifiers */
> +	flag  &= ~(__GFP_HIGHMEM | __GFP_DMA | __GFP_DMA32);
> +	if (*dev->dma_mask < top_ram_pfn - 1)
> +		flag |= __GFP_DMA32;
> +#else
> 	flag  &= ~(__GFP_HIGHMEM);
> +#endif
> 
> 	page = alloc_pages_node(node, flag, get_order(size));
> 	if (page == NULL)
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index baaafde..2a11e49 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -280,6 +280,10 @@ void __init paging_init(void)
> #ifdef CONFIG_HIGHMEM
> 	max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT;
> 	max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT;
> +#elif defined CONFIG_ZONE_DMA32
> +	max_zone_pfns[ZONE_DMA32] = min_t(phys_addr_t, top_of_ram,
> +					1ull << 31) >> PAGE_SHIFT;
> +	max_zone_pfns[ZONE_NORMAL] = top_of_ram >> PAGE_SHIFT;
> #else
> 	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
> #endif
> -- 
> 1.6.4
> 
> 
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
Benjamin Herrenschmidt - Sept. 9, 2012, 11:37 p.m.
On Thu, 2012-08-30 at 15:49 -0500, Kumar Gala wrote:
> On Aug 24, 2012, at 5:50 AM, Shaohui Xie wrote:
> 
> > PowerPC platform only supports ZONE_DMA zone for 64bit kernel, so all the
> > memory will be put into this zone. If the memory size is greater than
> > the device's DMA capability and device uses dma_alloc_coherent to allocate
> > memory, it will get an address which is over the device's DMA addressing,
> > the device will fail.
> > 
> > So we split the memory to two zones: zone ZONE_DMA32 & ZONE_NORMAL, since
> > we already allocate PCICSRBAR/PEXCSRBAR right below the 4G boundary (if the
> > lowest PCI address is above 4G), so we constrain the DMA zone ZONE_DMA32
> > to 2GB, also, we clear flag __GFP_DMA & __GFP_DMA32 and set __GFP_DMA32 only
> > if the device's dma_mask < total memory size. By doing this, devices which
> > cannot DMA all the memory will be limited to ZONE_DMA32, but devices which
> > can DMA all the memory will not be affected by this limitation.
> > 
> > Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
> > Signed-off-by: Mingkai Hu <Mingkai.hu@freescale.com>
> > Signed-off-by: Chen Yuanquan <B41889@freescale.com>
> > ---
> > changes for v2:
> > 1. use a config option for using two zones (ZONE_DMA32 & ZONE_NORMAL) in
> > freescale 64 bit kernel.
> > 

There must have been a misunderstanding. I think this should be a
runtime choice, possibly by the platform code. Any reason that can't be
done ?

Also how does Intel do it ? Do they have iommu and ZONE_DMA32 co-exist ?

Cheers,
Ben.
shaohui xie - Sept. 10, 2012, 9:51 a.m.
> -----Original Message-----
> From: Benjamin Herrenschmidt [mailto:benh@kernel.crashing.org]
> Sent: Monday, September 10, 2012 7:38 AM
> To: Kumar Gala
> Cc: Xie Shaohui-B21989; linuxppc-dev@lists.ozlabs.org list; Hu Mingkai-
> B21284; Chen Yuanquan-B41889
> Subject: Re: [PATCH][v2] powerpc/mm: using two zones for freescale 64 bit
> kernel
> 
> On Thu, 2012-08-30 at 15:49 -0500, Kumar Gala wrote:
> > On Aug 24, 2012, at 5:50 AM, Shaohui Xie wrote:
> >
> > > PowerPC platform only supports ZONE_DMA zone for 64bit kernel, so
> > > all the memory will be put into this zone. If the memory size is
> > > greater than the device's DMA capability and device uses
> > > dma_alloc_coherent to allocate memory, it will get an address which
> > > is over the device's DMA addressing, the device will fail.
> > >
> > > So we split the memory to two zones: zone ZONE_DMA32 & ZONE_NORMAL,
> > > since we already allocate PCICSRBAR/PEXCSRBAR right below the 4G
> > > boundary (if the lowest PCI address is above 4G), so we constrain
> > > the DMA zone ZONE_DMA32 to 2GB, also, we clear flag __GFP_DMA &
> > > __GFP_DMA32 and set __GFP_DMA32 only if the device's dma_mask <
> > > total memory size. By doing this, devices which cannot DMA all the
> > > memory will be limited to ZONE_DMA32, but devices which can DMA all
> the memory will not be affected by this limitation.
> > >
> > > Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
> > > Signed-off-by: Mingkai Hu <Mingkai.hu@freescale.com>
> > > Signed-off-by: Chen Yuanquan <B41889@freescale.com>
> > > ---
> > > changes for v2:
> > > 1. use a config option for using two zones (ZONE_DMA32 &
> > > ZONE_NORMAL) in freescale 64 bit kernel.
> > >
> 
> There must have been a misunderstanding. I think this should be a runtime
> choice, possibly by the platform code. Any reason that can't be done ?
> 
[S.H] Do you mean this:

phys_addr_t platform_dma_size (maybe a default value should be used, then platform code will change it)

if (top_of_ram > platform_dma_size)
	max_zone_pfns[ZONE_DMA] = platform_dma_size >> PAGE_SHIFT;
else
	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;

max_zone_pfns[ZONE_NORMAL] = top_of_ram >> PAGE_SHIFT;
	
> Also how does Intel do it ? 
[S.H] Below is the corresponding code for Intel (x86):

403 void __init zone_sizes_init(void)
404 {       
405         unsigned long max_zone_pfns[MAX_NR_ZONES];
406         
407         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
408                 
409 #ifdef CONFIG_ZONE_DMA
410         max_zone_pfns[ZONE_DMA]         = MAX_DMA_PFN;
411 #endif
412 #ifdef CONFIG_ZONE_DMA32
413         max_zone_pfns[ZONE_DMA32]       = MAX_DMA32_PFN;
414 #endif
415         max_zone_pfns[ZONE_NORMAL]      = max_low_pfn;
416 #ifdef CONFIG_HIGHMEM
417         max_zone_pfns[ZONE_HIGHMEM]     = max_pfn;
418 #endif
419  
      
For x86_64, there is no CONFIG_HIGHMEM, so there will be three zones: 
ZONE_DMA/ZONE_DMA32/ZONE_NORMAL.

>Do they have iommu and ZONE_DMA32 co-exist ?

I'm not familiar with IOMMU. I read some kernel docs: in Documentation/kernel-parameters.txt,
the amd_iommu & intel_iommu parameters are available, and Documentation/x86/x86_64/boot-options.txt
says "Currently four x86-64 PCI-DMA mapping implementations exist".

Does PPC64 support IOMMU, and if so, how is it used?


Best Regards, 
Shaohui Xie
shaohui xie - Sept. 20, 2012, 10:14 a.m.
> > On Thu, 2012-08-30 at 15:49 -0500, Kumar Gala wrote:
> > > On Aug 24, 2012, at 5:50 AM, Shaohui Xie wrote:
> > >
> > > > PowerPC platform only supports ZONE_DMA zone for 64bit kernel, so
> > > > all the memory will be put into this zone. If the memory size is
> > > > greater than the device's DMA capability and device uses
> > > > dma_alloc_coherent to allocate memory, it will get an address
> > > > which is over the device's DMA addressing, the device will fail.
> > > >
> > > > So we split the memory to two zones: zone ZONE_DMA32 &
> > > > ZONE_NORMAL, since we already allocate PCICSRBAR/PEXCSRBAR right
> > > > below the 4G boundary (if the lowest PCI address is above 4G), so
> > > > we constrain the DMA zone ZONE_DMA32 to 2GB, also, we clear flag
> > > > __GFP_DMA &
> > > > __GFP_DMA32 and set __GFP_DMA32 only if the device's dma_mask <
> > > > total memory size. By doing this, devices which cannot DMA all the
> > > > memory will be limited to ZONE_DMA32, but devices which can DMA
> > > > all
> > the memory will not be affected by this limitation.
> > > >
> > > > Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
> > > > Signed-off-by: Mingkai Hu <Mingkai.hu@freescale.com>
> > > > Signed-off-by: Chen Yuanquan <B41889@freescale.com>
> > > > ---
> > > > changes for v2:
> > > > 1. use a config option for using two zones (ZONE_DMA32 &
> > > > ZONE_NORMAL) in freescale 64 bit kernel.
> > > >
> >
> > There must have been a misunderstanding. I think this should be a
> > runtime choice, possibly by the platform code. Any reason that can't be
> done ?
> >
> [S.H] Do you mean this:
> 
> phys_addr_t platform_dma_size (maybe a default value should be used, then
> platform code will change it)
> 
> if (top_of_ram > platform_dma_size)
> 	max_zone_pfns[ZONE_DMA] = platform_dma_size >> PAGE_SHIFT; else
> 	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
> 
> max_zone_pfns[ZONE_NORMAL] = top_of_ram >> PAGE_SHIFT;
> 
> > Also how does Intel do it ?
> [S.H] below are codes in Intel:
> 
> 403 void __init zone_sizes_init(void)
> 404 {
> 405         unsigned long max_zone_pfns[MAX_NR_ZONES];
> 406
> 407         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
> 408
> 409 #ifdef CONFIG_ZONE_DMA
> 410         max_zone_pfns[ZONE_DMA]         = MAX_DMA_PFN;
> 411 #endif
> 412 #ifdef CONFIG_ZONE_DMA32
> 413         max_zone_pfns[ZONE_DMA32]       = MAX_DMA32_PFN;
> 414 #endif
> 415         max_zone_pfns[ZONE_NORMAL]      = max_low_pfn;
> 416 #ifdef CONFIG_HIGHMEM
> 417         max_zone_pfns[ZONE_HIGHMEM]     = max_pfn;
> 418 #endif
> 419
> 
> For x86_64, there is no CONFIG_HIGHMEM, so there will be three zones:
> ZONE_DMA/ZONE_DMA32/ZONE_NORMAL.
> 
[S.H] Hello, Ben,

I have some questions, though I'm still awaiting your comments.
PPC does not have ZONE_DMA32 by default; if we want to use it, we need to add "config ZONE_DMA32" in Kconfig first.
If we set up multiple zones without ZONE_DMA, kmalloc in "include/linux/slab_def.h" will fail when it is called with the GFP_DMA flag.
For the runtime choice in a 64-bit kernel, exactly which zones should be used:
"ZONE_DMA & ZONE_NORMAL" or "ZONE_DMA & ZONE_DMA32 & ZONE_NORMAL"?
And what sizes should be set for each of them?

Please comment, Thanks!


Best Regards, 
Shaohui Xie
Kumar Gala - Sept. 20, 2012, 1:36 p.m.
On Sep 20, 2012, at 5:14 AM, Xie Shaohui-B21989 wrote:

>>> On Thu, 2012-08-30 at 15:49 -0500, Kumar Gala wrote:
>>>> On Aug 24, 2012, at 5:50 AM, Shaohui Xie wrote:
>>>> 
>>>>> PowerPC platform only supports ZONE_DMA zone for 64bit kernel, so
>>>>> all the memory will be put into this zone. If the memory size is
>>>>> greater than the device's DMA capability and device uses
>>>>> dma_alloc_coherent to allocate memory, it will get an address
>>>>> which is over the device's DMA addressing, the device will fail.
>>>>> 
>>>>> So we split the memory to two zones: zone ZONE_DMA32 &
>>>>> ZONE_NORMAL, since we already allocate PCICSRBAR/PEXCSRBAR right
>>>>> below the 4G boundary (if the lowest PCI address is above 4G), so
>>>>> we constrain the DMA zone ZONE_DMA32 to 2GB, also, we clear flag
>>>>> __GFP_DMA &
>>>>> __GFP_DMA32 and set __GFP_DMA32 only if the device's dma_mask <
>>>>> total memory size. By doing this, devices which cannot DMA all the
>>>>> memory will be limited to ZONE_DMA32, but devices which can DMA
>>>>> all
>>> the memory will not be affected by this limitation.
>>>>> 
>>>>> Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
>>>>> Signed-off-by: Mingkai Hu <Mingkai.hu@freescale.com>
>>>>> Signed-off-by: Chen Yuanquan <B41889@freescale.com>
>>>>> ---
>>>>> changes for v2:
>>>>> 1. use a config option for using two zones (ZONE_DMA32 &
>>>>> ZONE_NORMAL) in freescale 64 bit kernel.
>>>>> 
>>> 
>>> There must have been a misunderstanding. I think this should be a
>>> runtime choice, possibly by the platform code. Any reason that can't be
>> done ?
>>> 
>> [S.H] Do you mean this:
>> 
>> phys_addr_t platform_dma_size (maybe a default value should be used, then
>> platform code will change it)
>> 
>> if (top_of_ram > platform_dma_size)
>> 	max_zone_pfns[ZONE_DMA] = platform_dma_size >> PAGE_SHIFT; else
>> 	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
>> 
>> max_zone_pfns[ZONE_NORMAL] = top_of_ram >> PAGE_SHIFT;
>> 
>>> Also how does Intel do it ?
>> [S.H] below are codes in Intel:
>> 
>> 403 void __init zone_sizes_init(void)
>> 404 {
>> 405         unsigned long max_zone_pfns[MAX_NR_ZONES];
>> 406
>> 407         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
>> 408
>> 409 #ifdef CONFIG_ZONE_DMA
>> 410         max_zone_pfns[ZONE_DMA]         = MAX_DMA_PFN;
>> 411 #endif
>> 412 #ifdef CONFIG_ZONE_DMA32
>> 413         max_zone_pfns[ZONE_DMA32]       = MAX_DMA32_PFN;
>> 414 #endif
>> 415         max_zone_pfns[ZONE_NORMAL]      = max_low_pfn;
>> 416 #ifdef CONFIG_HIGHMEM
>> 417         max_zone_pfns[ZONE_HIGHMEM]     = max_pfn;
>> 418 #endif
>> 419
>> 
>> For x86_64, there is no CONFIG_HIGHMEM, so there will be three zones:
>> ZONE_DMA/ZONE_DMA32/ZONE_NORMAL.
>> 
> [S.H] Hello, Ben,
> 
> I have some questions, though I'm still expecting your comments.
> PPC does not have ZONE_DMA32 by default, if we want to use it, we need to add "config ZONE_DMA32" in Kconfig first.
> If setting multiple zones without ZONE_DMA, kmalloc in "include/linux/slab_def.h" will fail if it uses flag GFP_DMA.
> For the runtime choice in 64-bit kernel, what exactly multiple zones should be used?
> "ZONE_DMA & ZONE_NORMAL" or "ZONE_DMA & ZONE_DMA32 & ZONE_NORMAL"?
> Then what the size should be set for them respectively?
> 
> Please comment, Thanks!

I think Ben is saying that Kconfig would enable ZONE_DMA32 for all PPC64, but make it a runtime/per-platform decision how we set up the zones, such that ZONE_DMA32 is either set to MAX_DMA32_PFN or set to the same value as ZONE_DMA.

However that's just a guess.

- k
Kumar Gala - Sept. 24, 2012, 12:31 p.m.
On Sep 20, 2012, at 8:36 AM, Kumar Gala wrote:

> 
> On Sep 20, 2012, at 5:14 AM, Xie Shaohui-B21989 wrote:
> 
>>>> On Thu, 2012-08-30 at 15:49 -0500, Kumar Gala wrote:
>>>>> On Aug 24, 2012, at 5:50 AM, Shaohui Xie wrote:
>>>>> 
>>>>>> PowerPC platform only supports ZONE_DMA zone for 64bit kernel, so
>>>>>> all the memory will be put into this zone. If the memory size is
>>>>>> greater than the device's DMA capability and device uses
>>>>>> dma_alloc_coherent to allocate memory, it will get an address
>>>>>> which is over the device's DMA addressing, the device will fail.
>>>>>> 
>>>>>> So we split the memory to two zones: zone ZONE_DMA32 &
>>>>>> ZONE_NORMAL, since we already allocate PCICSRBAR/PEXCSRBAR right
>>>>>> below the 4G boundary (if the lowest PCI address is above 4G), so
>>>>>> we constrain the DMA zone ZONE_DMA32 to 2GB, also, we clear flag
>>>>>> __GFP_DMA &
>>>>>> __GFP_DMA32 and set __GFP_DMA32 only if the device's dma_mask <
>>>>>> total memory size. By doing this, devices which cannot DMA all the
>>>>>> memory will be limited to ZONE_DMA32, but devices which can DMA
>>>>>> all
>>>> the memory will not be affected by this limitation.
>>>>>> 
>>>>>> Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
>>>>>> Signed-off-by: Mingkai Hu <Mingkai.hu@freescale.com>
>>>>>> Signed-off-by: Chen Yuanquan <B41889@freescale.com>
>>>>>> ---
>>>>>> changes for v2:
>>>>>> 1. use a config option for using two zones (ZONE_DMA32 &
>>>>>> ZONE_NORMAL) in freescale 64 bit kernel.
>>>>>> 
>>>> 
>>>> There must have been a misunderstanding. I think this should be a
>>>> runtime choice, possibly by the platform code. Any reason that can't be
>>> done ?
>>>> 
>>> [S.H] Do you mean this:
>>> 
>>> phys_addr_t platform_dma_size (maybe a default value should be used, then
>>> platform code will change it)
>>> 
>>> if (top_of_ram > platform_dma_size)
>>> 	max_zone_pfns[ZONE_DMA] = platform_dma_size >> PAGE_SHIFT; else
>>> 	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
>>> 
>>> max_zone_pfns[ZONE_NORMAL] = top_of_ram >> PAGE_SHIFT;
>>> 
>>>> Also how does Intel do it ?
>>> [S.H] below are codes in Intel:
>>> 
>>> 403 void __init zone_sizes_init(void)
>>> 404 {
>>> 405         unsigned long max_zone_pfns[MAX_NR_ZONES];
>>> 406
>>> 407         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
>>> 408
>>> 409 #ifdef CONFIG_ZONE_DMA
>>> 410         max_zone_pfns[ZONE_DMA]         = MAX_DMA_PFN;
>>> 411 #endif
>>> 412 #ifdef CONFIG_ZONE_DMA32
>>> 413         max_zone_pfns[ZONE_DMA32]       = MAX_DMA32_PFN;
>>> 414 #endif
>>> 415         max_zone_pfns[ZONE_NORMAL]      = max_low_pfn;
>>> 416 #ifdef CONFIG_HIGHMEM
>>> 417         max_zone_pfns[ZONE_HIGHMEM]     = max_pfn;
>>> 418 #endif
>>> 419
>>> 
>>> For x86_64, there is no CONFIG_HIGHMEM, so there will be three zones:
>>> ZONE_DMA/ZONE_DMA32/ZONE_NORMAL.
>>> 
>> [S.H] Hello, Ben,
>> 
>> I have some questions, though I'm still expecting your comments.
>> PPC does not have ZONE_DMA32 by default, if we want to use it, we need to add "config ZONE_DMA32" in Kconfig first.
>> If setting multiple zones without ZONE_DMA, kmalloc in "include/linux/slab_def.h" will fail if it uses flag GFP_DMA.
>> For the runtime choice in 64-bit kernel, what exactly multiple zones should be used?
>> "ZONE_DMA & ZONE_NORMAL" or "ZONE_DMA & ZONE_DMA32 & ZONE_NORMAL"?
>> Then what the size should be set for them respectively?
>> 
>> Please comment, Thanks!
> 
> I think Ben is saying that Kconfig would enable ZONE_DMA32 for all PPC64, but make it runtime/per platform how we setup the zone's such that either ZONE_DMA32 is set to MAX_DMA32_PFN or it set to same value as ZONE_DMA.
> 
> However that's just a guess.

Ben,

Can you help clarify your thoughts here?

thanks

- k
Kumar Gala - Sept. 27, 2012, 12:37 p.m.
On Sep 24, 2012, at 7:31 AM, Kumar Gala wrote:

> 
> On Sep 20, 2012, at 8:36 AM, Kumar Gala wrote:
> 
>> 
>> On Sep 20, 2012, at 5:14 AM, Xie Shaohui-B21989 wrote:
>> 
>>>>> On Thu, 2012-08-30 at 15:49 -0500, Kumar Gala wrote:
>>>>>> On Aug 24, 2012, at 5:50 AM, Shaohui Xie wrote:
>>>>>> 
>>>>>>> PowerPC platform only supports ZONE_DMA zone for 64bit kernel, so
>>>>>>> all the memory will be put into this zone. If the memory size is
>>>>>>> greater than the device's DMA capability and device uses
>>>>>>> dma_alloc_coherent to allocate memory, it will get an address
>>>>>>> which is over the device's DMA addressing, the device will fail.
>>>>>>> 
>>>>>>> So we split the memory to two zones: zone ZONE_DMA32 &
>>>>>>> ZONE_NORMAL, since we already allocate PCICSRBAR/PEXCSRBAR right
>>>>>>> below the 4G boundary (if the lowest PCI address is above 4G), so
>>>>>>> we constrain the DMA zone ZONE_DMA32 to 2GB, also, we clear flag
>>>>>>> __GFP_DMA &
>>>>>>> __GFP_DMA32 and set __GFP_DMA32 only if the device's dma_mask <
>>>>>>> total memory size. By doing this, devices which cannot DMA all the
>>>>>>> memory will be limited to ZONE_DMA32, but devices which can DMA
>>>>>>> all
>>>>> the memory will not be affected by this limitation.
>>>>>>> 
>>>>>>> Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
>>>>>>> Signed-off-by: Mingkai Hu <Mingkai.hu@freescale.com>
>>>>>>> Signed-off-by: Chen Yuanquan <B41889@freescale.com>
>>>>>>> ---
>>>>>>> changes for v2:
>>>>>>> 1. use a config option for using two zones (ZONE_DMA32 &
>>>>>>> ZONE_NORMAL) in freescale 64 bit kernel.
>>>>>>> 
>>>>> 
>>>>> There must have been a misunderstanding. I think this should be a
>>>>> runtime choice, possibly by the platform code. Any reason that can't be
>>>> done ?
>>>>> 
>>>> [S.H] Do you mean this:
>>>> 
>>>> phys_addr_t platform_dma_size (maybe a default value should be used, then
>>>> platform code will change it)
>>>> 
>>>> if (top_of_ram > platform_dma_size)
>>>> 	max_zone_pfns[ZONE_DMA] = platform_dma_size >> PAGE_SHIFT; else
>>>> 	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
>>>> 
>>>> max_zone_pfns[ZONE_NORMAL] = top_of_ram >> PAGE_SHIFT;
>>>> 
>>>>> Also how does Intel do it ?
>>>> [S.H] below are codes in Intel:
>>>> 
>>>> 403 void __init zone_sizes_init(void)
>>>> 404 {
>>>> 405         unsigned long max_zone_pfns[MAX_NR_ZONES];
>>>> 406
>>>> 407         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
>>>> 408
>>>> 409 #ifdef CONFIG_ZONE_DMA
>>>> 410         max_zone_pfns[ZONE_DMA]         = MAX_DMA_PFN;
>>>> 411 #endif
>>>> 412 #ifdef CONFIG_ZONE_DMA32
>>>> 413         max_zone_pfns[ZONE_DMA32]       = MAX_DMA32_PFN;
>>>> 414 #endif
>>>> 415         max_zone_pfns[ZONE_NORMAL]      = max_low_pfn;
>>>> 416 #ifdef CONFIG_HIGHMEM
>>>> 417         max_zone_pfns[ZONE_HIGHMEM]     = max_pfn;
>>>> 418 #endif
>>>> 419
>>>> 
>>>> For x86_64, there is no CONFIG_HIGHMEM, so there will be three zones:
>>>> ZONE_DMA/ZONE_DMA32/ZONE_NORMAL.
>>>> 
>>> [S.H] Hello, Ben,
>>> 
>>> I have some questions, though I'm still expecting your comments.
>>> PPC does not have ZONE_DMA32 by default, if we want to use it, we need to add "config ZONE_DMA32" in Kconfig first.
>>> If setting multiple zones without ZONE_DMA, kmalloc in "include/linux/slab_def.h" will fail if it uses flag GFP_DMA.
>>> For the runtime choice in 64-bit kernel, what exactly multiple zones should be used?
>>> "ZONE_DMA & ZONE_NORMAL" or "ZONE_DMA & ZONE_DMA32 & ZONE_NORMAL"?
>>> Then what the size should be set for them respectively?
>>> 
>>> Please comment, Thanks!
>> 
>> I think Ben is saying that Kconfig would enable ZONE_DMA32 for all PPC64, but make it runtime/per platform how we setup the zone's such that either ZONE_DMA32 is set to MAX_DMA32_PFN or it set to same value as ZONE_DMA.
>> 
>> However that's just a guess.
> 
> Ben,
> 
> Can you help clarify your thoughts here.
> 
> thanks

Ben?

- k

Patch

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 352f416..a96fbbb 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -629,6 +629,9 @@  config ZONE_DMA
 	bool
 	default y
 
+config ZONE_DMA32
+	def_bool (PPC64 && PPC_FSL_BOOK3E)
+
 config NEED_DMA_MAP_STATE
 	def_bool (PPC64 || NOT_COHERENT_CACHE)
 
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 355b9d8..cbf5ac1 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -41,9 +41,24 @@  void *dma_direct_alloc_coherent(struct device *dev, size_t size,
 #else
 	struct page *page;
 	int node = dev_to_node(dev);
+#ifdef CONFIG_ZONE_DMA32
+	phys_addr_t top_ram_pfn = memblock_end_of_DRAM();
 
+	/*
+	 * check for crappy device which has dma_mask < ZONE_DMA, and
+	 * we are not going to support it, just warn and fail.
+	 */
+	if (*dev->dma_mask < DMA_BIT_MASK(31)) {
+		dev_err(dev, "Unsupported dma_mask 0x%llx\n", *dev->dma_mask);
+		return NULL;
+	}
 	/* ignore region specifiers */
+	flag  &= ~(__GFP_HIGHMEM | __GFP_DMA | __GFP_DMA32);
+	if (*dev->dma_mask < top_ram_pfn - 1)
+		flag |= __GFP_DMA32;
+#else
 	flag  &= ~(__GFP_HIGHMEM);
+#endif
 
 	page = alloc_pages_node(node, flag, get_order(size));
 	if (page == NULL)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index baaafde..2a11e49 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -280,6 +280,10 @@  void __init paging_init(void)
 #ifdef CONFIG_HIGHMEM
 	max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT;
 	max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT;
+#elif defined CONFIG_ZONE_DMA32
+	max_zone_pfns[ZONE_DMA32] = min_t(phys_addr_t, top_of_ram,
+					1ull << 31) >> PAGE_SHIFT;
+	max_zone_pfns[ZONE_NORMAL] = top_of_ram >> PAGE_SHIFT;
 #else
 	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
 #endif