diff mbox

[V2,1/2] mm/page_alloc: Replace set_dma_reserve to set_memory_reserve

Message ID 1470330729-6273-1-git-send-email-srikar@linux.vnet.ibm.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Srikar Dronamraju Aug. 4, 2016, 5:12 p.m. UTC
Expand the scope of the existing dma_reserve to accommodate other memory
reserves too. Accordingly rename variable dma_reserve to
nr_memory_reserve.

set_memory_reserve also takes a new parameter that helps to identify if
the current value needs to be incremented.

Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/x86/kernel/e820.c |  2 +-
 include/linux/mm.h     |  2 +-
 mm/page_alloc.c        | 20 ++++++++++++--------
 3 files changed, 14 insertions(+), 10 deletions(-)

Comments

Vlastimil Babka Aug. 5, 2016, 6:45 a.m. UTC | #1
On 08/04/2016 07:12 PM, Srikar Dronamraju wrote:
> Expand the scope of the existing dma_reserve to accommodate other memory
> reserves too. Accordingly rename variable dma_reserve to
> nr_memory_reserve.
>
> set_memory_reserve also takes a new parameter that helps to identify if
> the current value needs to be incremented.
>
> Suggested-by: Mel Gorman <mgorman@techsingularity.net>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
>  arch/x86/kernel/e820.c |  2 +-
>  include/linux/mm.h     |  2 +-
>  mm/page_alloc.c        | 20 ++++++++++++--------
>  3 files changed, 14 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
> index 621b501..d935983 100644
> --- a/arch/x86/kernel/e820.c
> +++ b/arch/x86/kernel/e820.c
> @@ -1188,6 +1188,6 @@ void __init memblock_find_dma_reserve(void)
>  			nr_free_pages += end_pfn - start_pfn;
>  	}
>
> -	set_dma_reserve(nr_pages - nr_free_pages);
> +	set_memory_reserve(nr_pages - nr_free_pages, false);
>  #endif
>  }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 8f468e0..c884ffb 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1886,7 +1886,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn,
>  					struct mminit_pfnnid_cache *state);
>  #endif
>
> -extern void set_dma_reserve(unsigned long new_dma_reserve);
> +extern void set_memory_reserve(unsigned long nr_reserve, bool inc);
>  extern void memmap_init_zone(unsigned long, int, unsigned long,
>  				unsigned long, enum memmap_context);
>  extern void setup_per_zone_wmarks(void);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index c1069ef..a154c2f 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -253,7 +253,7 @@ int watermark_scale_factor = 10;
>
>  static unsigned long __meminitdata nr_kernel_pages;
>  static unsigned long __meminitdata nr_all_pages;
> -static unsigned long __meminitdata dma_reserve;
> +static unsigned long __meminitdata nr_memory_reserve;
>
>  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
>  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
> @@ -5493,10 +5493,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
>  		}
>
>  		/* Account for reserved pages */
> -		if (j == 0 && freesize > dma_reserve) {
> -			freesize -= dma_reserve;
> +		if (j == 0 && freesize > nr_memory_reserve) {

Will this really work (together with patch 2) as intended?
This j == 0 means that we are doing this only for the first zone, which 
is ZONE_DMA (or ZONE_DMA32) on node 0 on many systems. I.e. I don't 
think it's really true that "dma_reserve has nothing to do with DMA or 
ZONE_DMA".

This zone will have limited amount of memory, so the "freesize > 
nr_memory_reserve" will easily be false once you set this to many 
gigabytes, so in fact nothing will get subtracted.

On the other hand if the kernel has both CONFIG_ZONE_DMA and 
CONFIG_ZONE_DMA32 disabled, then j == 0 will be true for ZONE_NORMAL. 
This zone might be present on multiple nodes (unless they are configured 
as movable) and then the value intended to be global will be subtracted 
from several nodes.

I don't know what's the exact ppc64 situation here, perhaps there are 
indeed no DMA/DMA32 zones, and the fadump kernel only uses one node, so 
it works in the end, but it doesn't seem much robust to me?

> +			freesize -= nr_memory_reserve;
>  			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
> -					zone_names[0], dma_reserve);
> +					zone_names[0], nr_memory_reserve);
>  		}
>
>  		if (!is_highmem_idx(j))
> @@ -6186,8 +6186,9 @@ void __init mem_init_print_info(const char *str)
>  }
>
>  /**
> - * set_dma_reserve - set the specified number of pages reserved in the first zone
> - * @new_dma_reserve: The number of pages to mark reserved
> + * set_memory_reserve - set number of pages reserved in the first zone
> + * @nr_reserve: The number of pages to mark reserved
> + * @inc: true increment to existing value; false set new value.
>   *
>   * The per-cpu batchsize and zone watermarks are determined by managed_pages.
>   * In the DMA zone, a significant percentage may be consumed by kernel image
> @@ -6196,9 +6197,12 @@ void __init mem_init_print_info(const char *str)
>   * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
>   * smaller per-cpu batchsize.
>   */
> -void __init set_dma_reserve(unsigned long new_dma_reserve)
> +void __init set_memory_reserve(unsigned long nr_reserve, bool inc)
>  {
> -	dma_reserve = new_dma_reserve;
> +	if (inc)
> +		nr_memory_reserve += nr_reserve;
> +	else
> +		nr_memory_reserve = nr_reserve;
>  }
>
>  void __init free_area_init(unsigned long *zones_size)
>
Mel Gorman Aug. 5, 2016, 6:47 a.m. UTC | #2
On Thu, Aug 04, 2016 at 10:42:08PM +0530, Srikar Dronamraju wrote:
> Expand the scope of the existing dma_reserve to accommodate other memory
> reserves too. Accordingly rename variable dma_reserve to
> nr_memory_reserve.
> 
> set_memory_reserve also takes a new parameter that helps to identify if
> the current value needs to be incremented.
> 

I think the parameter is ugly and it should have been just
inc_memory_reserve but at least it works.
Srikar Dronamraju Aug. 5, 2016, 7:24 a.m. UTC | #3
* Vlastimil Babka <vbabka@suse.cz> [2016-08-05 08:45:03]:

> >@@ -5493,10 +5493,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
> > 		}
> >
> > 		/* Account for reserved pages */
> >-		if (j == 0 && freesize > dma_reserve) {
> >-			freesize -= dma_reserve;
> >+		if (j == 0 && freesize > nr_memory_reserve) {
> 
> Will this really work (together with patch 2) as intended?
> This j == 0 means that we are doing this only for the first zone, which is
> ZONE_DMA (or ZONE_DMA32) on node 0 on many systems. I.e. I don't think it's
> really true that "dma_reserve has nothing to do with DMA or ZONE_DMA".
> 
> This zone will have limited amount of memory, so the "freesize >
> nr_memory_reserve" will easily be false once you set this to many gigabytes,
> so in fact nothing will get subtracted.
> 
> On the other hand if the kernel has both CONFIG_ZONE_DMA and
> CONFIG_ZONE_DMA32 disabled, then j == 0 will be true for ZONE_NORMAL. This
> zone might be present on multiple nodes (unless they are configured as
> movable) and then the value intended to be global will be subtracted from
> several nodes.
> 
> I don't know what's the exact ppc64 situation here, perhaps there are indeed
> no DMA/DMA32 zones, and the fadump kernel only uses one node, so it works in
> the end, but it doesn't seem much robust to me?
> 

At the page initialization time, powerpc seems to have just one zone
spread across the 16 nodes.

From the dmesg.

[    0.000000] Memory hole size: 0MB
[    0.000000] Zone ranges:
[    0.000000]   DMA      [mem 0x0000000000000000-0x00001f5c8fffffff]
[    0.000000]   DMA32    empty
[    0.000000]   Normal   empty
[    0.000000] Movable zone start for each node
[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000000000000-0x000001fb4fffffff]
[    0.000000]   node   1: [mem 0x000001fb50000000-0x000003fa8fffffff]
[    0.000000]   node   2: [mem 0x000003fa90000000-0x000005f9cfffffff]
[    0.000000]   node   3: [mem 0x000005f9d0000000-0x000007f8efffffff]
[    0.000000]   node   4: [mem 0x000007f8f0000000-0x000009f81fffffff]
[    0.000000]   node   5: [mem 0x000009f820000000-0x00000bf77fffffff]
[    0.000000]   node   6: [mem 0x00000bf780000000-0x00000df6dfffffff]
[    0.000000]   node   7: [mem 0x00000df6e0000000-0x00000ff63fffffff]
[    0.000000]   node   8: [mem 0x00000ff640000000-0x000011f58fffffff]
[    0.000000]   node   9: [mem 0x000011f590000000-0x000013644fffffff]
[    0.000000]   node  10: [mem 0x0000136450000000-0x00001563afffffff]
[    0.000000]   node  11: [mem 0x00001563b0000000-0x000017630fffffff]
[    0.000000]   node  12: [mem 0x0000176310000000-0x000019625fffffff]
[    0.000000]   node  13: [mem 0x0000196260000000-0x00001b5dcfffffff]
[    0.000000]   node  14: [mem 0x00001b5dd0000000-0x00001d5d2fffffff]
[    0.000000]   node  15: [mem 0x00001d5d30000000-0x00001f5c8fffffff]


The config has the below.

CONFIG_ZONE_DMA32=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_ZONE_DMA_FLAG=1
CONFIG_FORCE_MAX_ZONEORDER=9
CONFIG_ZONE_DMA=y

I tried forcing CONFIG_ZONE_DMA to be unset, but make always picks it.
The source file arch/powerpc/Kconfig marks CONFIG_ZONE_DMA as "default y".
Srikar Dronamraju Aug. 5, 2016, 7:36 a.m. UTC | #4
* Mel Gorman <mgorman@techsingularity.net> [2016-08-05 07:47:47]:

> On Thu, Aug 04, 2016 at 10:42:08PM +0530, Srikar Dronamraju wrote:
> > Expand the scope of the existing dma_reserve to accommodate other memory
> > reserves too. Accordingly rename variable dma_reserve to
> > nr_memory_reserve.
> > 
> > set_memory_reserve also takes a new parameter that helps to identify if
> > the current value needs to be incremented.
> > 
> 
> I think the parameter is ugly and it should have been just
> inc_memory_reserve but at least it works.
> 

Yes, while the parameter is definitely ugly, the only other use
case in arch/x86/kernel/e820.c seems to be written with the intention of
setting an absolute value.

It was "set_dma_reserve(nr_pages - nr_free_pages)". Both of them
nr_pages and nr_free_pages are calculated after walking through the mem
blocks. I didn't want to take a chance where some other code path also
starts to set the reserve value and then the code in e820.c just
increments it.

However, if you still feel strongly about using inc_memory_reserve
rather than set_memory_reserve, I will respin.
Vlastimil Babka Aug. 5, 2016, 9:09 a.m. UTC | #5
On 08/05/2016 09:24 AM, Srikar Dronamraju wrote:
> * Vlastimil Babka <vbabka@suse.cz> [2016-08-05 08:45:03]:
>
>>> @@ -5493,10 +5493,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
>>> 		}
>>>
>>> 		/* Account for reserved pages */
>>> -		if (j == 0 && freesize > dma_reserve) {
>>> -			freesize -= dma_reserve;
>>> +		if (j == 0 && freesize > nr_memory_reserve) {
>>
>> Will this really work (together with patch 2) as intended?
>> This j == 0 means that we are doing this only for the first zone, which is
>> ZONE_DMA (or ZONE_DMA32) on node 0 on many systems. I.e. I don't think it's
>> really true that "dma_reserve has nothing to do with DMA or ZONE_DMA".
>>
>> This zone will have limited amount of memory, so the "freesize >
>> nr_memory_reserve" will easily be false once you set this to many gigabytes,
>> so in fact nothing will get subtracted.
>>
>> On the other hand if the kernel has both CONFIG_ZONE_DMA and
>> CONFIG_ZONE_DMA32 disabled, then j == 0 will be true for ZONE_NORMAL. This
>> zone might be present on multiple nodes (unless they are configured as
>> movable) and then the value intended to be global will be subtracted from
>> several nodes.
>>
>> I don't know what's the exact ppc64 situation here, perhaps there are indeed
>> no DMA/DMA32 zones, and the fadump kernel only uses one node, so it works in
>> the end, but it doesn't seem much robust to me?
>>
>
> At the page initialization time, powerpc seems to have just one zone
> spread across the 16 nodes.
>
> From the dmesg.
>
> [    0.000000] Memory hole size: 0MB
> [    0.000000] Zone ranges:
> [    0.000000]   DMA      [mem 0x0000000000000000-0x00001f5c8fffffff]
> [    0.000000]   DMA32    empty
> [    0.000000]   Normal   empty
> [    0.000000] Movable zone start for each node
> [    0.000000] Early memory node ranges
> [    0.000000]   node   0: [mem 0x0000000000000000-0x000001fb4fffffff]
> [    0.000000]   node   1: [mem 0x000001fb50000000-0x000003fa8fffffff]
> [    0.000000]   node   2: [mem 0x000003fa90000000-0x000005f9cfffffff]
> [    0.000000]   node   3: [mem 0x000005f9d0000000-0x000007f8efffffff]
> [    0.000000]   node   4: [mem 0x000007f8f0000000-0x000009f81fffffff]
> [    0.000000]   node   5: [mem 0x000009f820000000-0x00000bf77fffffff]
> [    0.000000]   node   6: [mem 0x00000bf780000000-0x00000df6dfffffff]
> [    0.000000]   node   7: [mem 0x00000df6e0000000-0x00000ff63fffffff]
> [    0.000000]   node   8: [mem 0x00000ff640000000-0x000011f58fffffff]
> [    0.000000]   node   9: [mem 0x000011f590000000-0x000013644fffffff]
> [    0.000000]   node  10: [mem 0x0000136450000000-0x00001563afffffff]
> [    0.000000]   node  11: [mem 0x00001563b0000000-0x000017630fffffff]
> [    0.000000]   node  12: [mem 0x0000176310000000-0x000019625fffffff]
> [    0.000000]   node  13: [mem 0x0000196260000000-0x00001b5dcfffffff]
> [    0.000000]   node  14: [mem 0x00001b5dd0000000-0x00001d5d2fffffff]
> [    0.000000]   node  15: [mem 0x00001d5d30000000-0x00001f5c8fffffff]

Hmm so it will work for ppc64 and its fadump, but I'm not happy that we 
made the function name sound like it's generic (unlike when the name 
contained "dma"), while it only works as intended in specific corner 
cases. The next user might be surprised...
diff mbox

Patch

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 621b501..d935983 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1188,6 +1188,6 @@  void __init memblock_find_dma_reserve(void)
 			nr_free_pages += end_pfn - start_pfn;
 	}
 
-	set_dma_reserve(nr_pages - nr_free_pages);
+	set_memory_reserve(nr_pages - nr_free_pages, false);
 #endif
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8f468e0..c884ffb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1886,7 +1886,7 @@  extern int __meminit __early_pfn_to_nid(unsigned long pfn,
 					struct mminit_pfnnid_cache *state);
 #endif
 
-extern void set_dma_reserve(unsigned long new_dma_reserve);
+extern void set_memory_reserve(unsigned long nr_reserve, bool inc);
 extern void memmap_init_zone(unsigned long, int, unsigned long,
 				unsigned long, enum memmap_context);
 extern void setup_per_zone_wmarks(void);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c1069ef..a154c2f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -253,7 +253,7 @@  int watermark_scale_factor = 10;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
-static unsigned long __meminitdata dma_reserve;
+static unsigned long __meminitdata nr_memory_reserve;
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
@@ -5493,10 +5493,10 @@  static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 		}
 
 		/* Account for reserved pages */
-		if (j == 0 && freesize > dma_reserve) {
-			freesize -= dma_reserve;
+		if (j == 0 && freesize > nr_memory_reserve) {
+			freesize -= nr_memory_reserve;
 			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
-					zone_names[0], dma_reserve);
+					zone_names[0], nr_memory_reserve);
 		}
 
 		if (!is_highmem_idx(j))
@@ -6186,8 +6186,9 @@  void __init mem_init_print_info(const char *str)
 }
 
 /**
- * set_dma_reserve - set the specified number of pages reserved in the first zone
- * @new_dma_reserve: The number of pages to mark reserved
+ * set_memory_reserve - set number of pages reserved in the first zone
+ * @nr_reserve: The number of pages to mark reserved
+ * @inc: true increment to existing value; false set new value.
  *
  * The per-cpu batchsize and zone watermarks are determined by managed_pages.
  * In the DMA zone, a significant percentage may be consumed by kernel image
@@ -6196,9 +6197,12 @@  void __init mem_init_print_info(const char *str)
  * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
  * smaller per-cpu batchsize.
  */
-void __init set_dma_reserve(unsigned long new_dma_reserve)
+void __init set_memory_reserve(unsigned long nr_reserve, bool inc)
 {
-	dma_reserve = new_dma_reserve;
+	if (inc)
+		nr_memory_reserve += nr_reserve;
+	else
+		nr_memory_reserve = nr_reserve;
 }
 
 void __init free_area_init(unsigned long *zones_size)