Message ID | 20180209100854.30232-1-colin.king@canonical.com
---|---
State | New
Series | [SRU,ARTFUL] Revert "mm, memory_hotplug: do not associate hotadded memory to zones until online"
Acked-by: Kamal Mostafa <kamal@canonical.com> On Fri, Feb 09, 2018 at 10:08:54AM +0000, Colin King wrote: > From: Colin Ian King <colin.king@canonical.com> > > BugLink: http://bugs.launchpad.net/bugs/1747069 > > Hotplug removal causes i386 crashes when exercised with the kernel > selftest mem-on-off-test script. A fix occurs in 4.15 however this > requires a large set of changes that are way too large to be SRU'able > and the least risky way forward is to revert the offending commit. > > This fix reverts commit f1dd2cd13c4b ("mm, memory_hotplug: do not > associate hotadded memory to zones until online"), however the > revert required some manual fix-ups because of subsequent fixes after > this commit. > > Note that running the mem-on-off-script is not always sufficient to > trigger the bug. A good reproducer is to run in a 4 CPU VM with 2GB > of memory and after running the script run sync and then re-install > the kernel packages to trip the issue. This has been thoroughly > tested on i386 and amd64 and given a solid soak test using the > ADT tests too. > > Signed-off-by: Colin Ian King <colin.king@canonical.com> > --- > arch/ia64/mm/init.c | 11 +- > arch/powerpc/mm/mem.c | 12 +- > arch/s390/mm/init.c | 32 ++- > arch/sh/mm/init.c | 10 +- > arch/x86/mm/init_32.c | 7 +- > arch/x86/mm/init_64.c | 11 +- > drivers/base/memory.c | 52 +++-- > include/linux/memory_hotplug.h | 19 +- > include/linux/mmzone.h | 16 -- > kernel/memremap.c | 6 +- > mm/memory_hotplug.c | 457 +++++++++++++++++++++++++++-------------- > mm/sparse.c | 3 +- > 12 files changed, 409 insertions(+), 227 deletions(-) > > diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c > index a4e8d6b..39e2aeb 100644 > --- a/arch/ia64/mm/init.c > +++ b/arch/ia64/mm/init.c > @@ -646,13 +646,20 @@ mem_init (void) > } > > #ifdef CONFIG_MEMORY_HOTPLUG > -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) > { > + pg_data_t *pgdat; > + struct zone *zone; > unsigned long start_pfn = start >> PAGE_SHIFT; > unsigned long nr_pages = size >> PAGE_SHIFT; > int ret; > > - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); > + pgdat = NODE_DATA(nid); > + > + zone = pgdat->node_zones + > + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); > + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); > + > if (ret) > printk("%s: Problem encountered in __add_pages() as ret=%d\n", > __func__, ret); > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c > index 46b4e67..331a78b 100644 > --- a/arch/powerpc/mm/mem.c > +++ b/arch/powerpc/mm/mem.c > @@ -127,14 +127,18 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) > return -ENODEV; > } > > -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) > { > + struct pglist_data *pgdata; > + struct zone *zone; > unsigned long start_pfn = start >> PAGE_SHIFT; > unsigned long nr_pages = size >> PAGE_SHIFT; > int rc; > > resize_hpt_for_hotplug(memblock_phys_mem_size()); > > + pgdata = NODE_DATA(nid); > + > start = (unsigned long)__va(start); > rc = create_section_mapping(start, start + size); > if (rc) { > @@ -144,7 +148,11 @@ int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > return -EFAULT; > } > > - return __add_pages(nid, start_pfn, nr_pages, want_memblock); > + /* this should work for most non-highmem platforms */ > + zone = pgdata->node_zones + > + 
zone_for_memory(nid, start, size, 0, for_device); > + > + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); > } > > #ifdef CONFIG_MEMORY_HOTREMOVE > diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c > index 8111694..a3d5499 100644 > --- a/arch/s390/mm/init.c > +++ b/arch/s390/mm/init.c > @@ -166,17 +166,43 @@ unsigned long memory_block_size_bytes(void) > } > > #ifdef CONFIG_MEMORY_HOTPLUG > -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) > { > + unsigned long zone_start_pfn, zone_end_pfn, nr_pages; > unsigned long start_pfn = PFN_DOWN(start); > unsigned long size_pages = PFN_DOWN(size); > - int rc; > + pg_data_t *pgdat = NODE_DATA(nid); > + struct zone *zone; > + int rc, i; > > rc = vmem_add_mapping(start, size); > if (rc) > return rc; > > - rc = __add_pages(nid, start_pfn, size_pages, want_memblock); > + for (i = 0; i < MAX_NR_ZONES; i++) { > + zone = pgdat->node_zones + i; > + if (zone_idx(zone) != ZONE_MOVABLE) { > + /* Add range within existing zone limits, if possible */ > + zone_start_pfn = zone->zone_start_pfn; > + zone_end_pfn = zone->zone_start_pfn + > + zone->spanned_pages; > + } else { > + /* Add remaining range to ZONE_MOVABLE */ > + zone_start_pfn = start_pfn; > + zone_end_pfn = start_pfn + size_pages; > + } > + if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) > + continue; > + nr_pages = (start_pfn + size_pages > zone_end_pfn) ? > + zone_end_pfn - start_pfn : size_pages; > + rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); > + if (rc) > + break; > + start_pfn += nr_pages; > + size_pages -= nr_pages; > + if (!size_pages) > + break; > + } > if (rc) > vmem_remove_mapping(start, size); > return rc; > diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c > index bf726af..a9d57f7 100644 > --- a/arch/sh/mm/init.c > +++ b/arch/sh/mm/init.c > @@ -485,14 +485,20 @@ void free_initrd_mem(unsigned long start, unsigned long end) > #endif > > #ifdef CONFIG_MEMORY_HOTPLUG > -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) > { > + pg_data_t *pgdat; > unsigned long start_pfn = PFN_DOWN(start); > unsigned long nr_pages = size >> PAGE_SHIFT; > int ret; > > + pgdat = NODE_DATA(nid); > + > /* We only have ZONE_NORMAL, so this is easy.. 
*/ > - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); > + ret = __add_pages(nid, pgdat->node_zones + > + zone_for_memory(nid, start, size, ZONE_NORMAL, > + for_device), > + start_pfn, nr_pages, !for_device); > if (unlikely(ret)) > printk("%s: Failed, __add_pages() == %d\n", __func__, ret); > > diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c > index 135c9a7..e7bae21 100644 > --- a/arch/x86/mm/init_32.c > +++ b/arch/x86/mm/init_32.c > @@ -829,12 +829,15 @@ void __init mem_init(void) > } > > #ifdef CONFIG_MEMORY_HOTPLUG > -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) > { > + struct pglist_data *pgdata = NODE_DATA(nid); > + struct zone *zone = pgdata->node_zones + > + zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); > unsigned long start_pfn = start >> PAGE_SHIFT; > unsigned long nr_pages = size >> PAGE_SHIFT; > > - return __add_pages(nid, start_pfn, nr_pages, want_memblock); > + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); > } > > #ifdef CONFIG_MEMORY_HOTREMOVE > diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c > index 902983c..456b452 100644 > --- a/arch/x86/mm/init_64.c > +++ b/arch/x86/mm/init_64.c > @@ -772,15 +772,22 @@ static void update_end_of_memory_vars(u64 start, u64 size) > } > } > > -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > +/* > + * Memory is added always to NORMAL zone. This means you will never get > + * additional DMA/DMA32 memory. > + */ > +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) > { > + struct pglist_data *pgdat = NODE_DATA(nid); > + struct zone *zone = pgdat->node_zones + > + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); > unsigned long start_pfn = start >> PAGE_SHIFT; > unsigned long nr_pages = size >> PAGE_SHIFT; > int ret; > > init_memory_mapping(start, start + size); > > - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); > + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); > WARN_ON_ONCE(ret); > > /* update max_pfn, max_low_pfn and high_memory */ > diff --git a/drivers/base/memory.c b/drivers/base/memory.c > index c7c4e03..1e884d8 100644 > --- a/drivers/base/memory.c > +++ b/drivers/base/memory.c > @@ -392,43 +392,39 @@ static ssize_t show_valid_zones(struct device *dev, > struct device_attribute *attr, char *buf) > { > struct memory_block *mem = to_memory_block(dev); > - unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); > + unsigned long start_pfn, end_pfn; > + unsigned long valid_start, valid_end, valid_pages; > unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; > - unsigned long valid_start_pfn, valid_end_pfn; > - bool append = false; > - int nid; > + struct zone *zone; > + int zone_shift = 0; > > - /* > - * The block contains more than one zone can not be offlined. > - * This can happen e.g. for ZONE_DMA and ZONE_DMA32 > - */ > - if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn)) > + start_pfn = section_nr_to_pfn(mem->start_section_nr); > + end_pfn = start_pfn + nr_pages; > + > + /* The block contains more than one zone can not be offlined. 
*/ > + if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) > return sprintf(buf, "none\n"); > > - start_pfn = valid_start_pfn; > - nr_pages = valid_end_pfn - start_pfn; > + zone = page_zone(pfn_to_page(valid_start)); > + valid_pages = valid_end - valid_start; > > - /* > - * Check the existing zone. Make sure that we do that only on the > - * online nodes otherwise the page_zone is not reliable > - */ > - if (mem->state == MEM_ONLINE) { > - strcat(buf, page_zone(pfn_to_page(start_pfn))->name); > - goto out; > - } > + /* MMOP_ONLINE_KEEP */ > + sprintf(buf, "%s", zone->name); > > - nid = pfn_to_nid(start_pfn); > - if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { > - strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name); > - append = true; > + /* MMOP_ONLINE_KERNEL */ > + zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift); > + if (zone_shift) { > + strcat(buf, " "); > + strcat(buf, (zone + zone_shift)->name); > } > > - if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) { > - if (append) > - strcat(buf, " "); > - strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name); > + /* MMOP_ONLINE_MOVABLE */ > + zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift); > + if (zone_shift) { > + strcat(buf, " "); > + strcat(buf, (zone + zone_shift)->name); > } > -out: > + > strcat(buf, "\n"); > > return strlen(buf); > diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h > index c8a5056..b18a1b8 100644 > --- a/include/linux/memory_hotplug.h > +++ b/include/linux/memory_hotplug.h > @@ -129,8 +129,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, > unsigned long nr_pages); > #endif /* CONFIG_MEMORY_HOTREMOVE */ > > -/* reasonably generic interface to expand the physical pages */ > -extern int __add_pages(int nid, unsigned long start_pfn, > +/* reasonably generic interface to expand the physical pages in a zone */ > +extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, > unsigned long nr_pages, bool want_memblock); > > #ifdef CONFIG_NUMA > @@ -306,19 +306,18 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, > void *arg, int (*func)(struct memory_block *, void *)); > extern int add_memory(int nid, u64 start, u64 size); > extern int add_memory_resource(int nid, struct resource *resource, bool online); > -extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); > -extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, > - unsigned long nr_pages); > +extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, > + bool for_device); > +extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); > extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); > extern bool is_memblock_offlined(struct memory_block *mem); > extern void remove_memory(int nid, u64 start, u64 size); > -extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn); > +extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); > extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, > unsigned long map_offset); > extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, > unsigned long pnum); > -extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, > - int online_type); > -extern struct zone *default_zone_for_pfn(int nid, 
unsigned long pfn, > - unsigned long nr_pages); > +extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, > + enum zone_type target, int *zone_shift); > + > #endif /* __LINUX_MEMORY_HOTPLUG_H */ > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 9c6c001..ceb86e9 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -533,22 +533,6 @@ static inline bool zone_is_empty(struct zone *zone) > } > > /* > - * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty > - * intersection with the given zone > - */ > -static inline bool zone_intersects(struct zone *zone, > - unsigned long start_pfn, unsigned long nr_pages) > -{ > - if (zone_is_empty(zone)) > - return false; > - if (start_pfn >= zone_end_pfn(zone) || > - start_pfn + nr_pages <= zone->zone_start_pfn) > - return false; > - > - return true; > -} > - > -/* > * The "priority" of VM scanning is how much of the queues we will scan in one > * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the > * queues ("queue_length >> 12") during an aging round. > diff --git a/kernel/memremap.c b/kernel/memremap.c > index 124bed7..23a6483 100644 > --- a/kernel/memremap.c > +++ b/kernel/memremap.c > @@ -358,11 +358,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, > goto err_pfn_remap; > > mem_hotplug_begin(); > - error = arch_add_memory(nid, align_start, align_size, false); > - if (!error) > - move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], > - align_start >> PAGE_SHIFT, > - align_size >> PAGE_SHIFT); > + error = arch_add_memory(nid, align_start, align_size, true); > mem_hotplug_done(); > if (error) > goto err_add_memory; > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c > index 8dccc31..a4724e7 100644 > --- a/mm/memory_hotplug.c > +++ b/mm/memory_hotplug.c > @@ -243,35 +243,216 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) > } > #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ > > -static int __meminit __add_section(int nid, unsigned long phys_start_pfn, > - bool want_memblock) > +static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, > + unsigned long end_pfn) > +{ > + unsigned long old_zone_end_pfn; > + > + zone_span_writelock(zone); > + > + old_zone_end_pfn = zone_end_pfn(zone); > + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) > + zone->zone_start_pfn = start_pfn; > + > + zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - > + zone->zone_start_pfn; > + > + zone_span_writeunlock(zone); > +} > + > +static void resize_zone(struct zone *zone, unsigned long start_pfn, > + unsigned long end_pfn) > +{ > + zone_span_writelock(zone); > + > + if (end_pfn - start_pfn) { > + zone->zone_start_pfn = start_pfn; > + zone->spanned_pages = end_pfn - start_pfn; > + } else { > + /* > + * make it consist as free_area_init_core(), > + * if spanned_pages = 0, then keep start_pfn = 0 > + */ > + zone->zone_start_pfn = 0; > + zone->spanned_pages = 0; > + } > + > + zone_span_writeunlock(zone); > +} > + > +static void fix_zone_id(struct zone *zone, unsigned long start_pfn, > + unsigned long end_pfn) > +{ > + enum zone_type zid = zone_idx(zone); > + int nid = zone->zone_pgdat->node_id; > + unsigned long pfn; > + > + for (pfn = start_pfn; pfn < end_pfn; pfn++) > + set_page_links(pfn_to_page(pfn), zid, nid, pfn); > +} > + > +static void __ref ensure_zone_is_initialized(struct zone *zone, > + unsigned long start_pfn, unsigned long num_pages) > +{ > + if (!zone_is_initialized(zone)) 
> + init_currently_empty_zone(zone, start_pfn, num_pages); > +} > + > +static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, > + unsigned long start_pfn, unsigned long end_pfn) > +{ > + unsigned long flags; > + unsigned long z1_start_pfn; > + > + ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); > + > + pgdat_resize_lock(z1->zone_pgdat, &flags); > + > + /* can't move pfns which are higher than @z2 */ > + if (end_pfn > zone_end_pfn(z2)) > + goto out_fail; > + /* the move out part must be at the left most of @z2 */ > + if (start_pfn > z2->zone_start_pfn) > + goto out_fail; > + /* must included/overlap */ > + if (end_pfn <= z2->zone_start_pfn) > + goto out_fail; > + > + /* use start_pfn for z1's start_pfn if z1 is empty */ > + if (!zone_is_empty(z1)) > + z1_start_pfn = z1->zone_start_pfn; > + else > + z1_start_pfn = start_pfn; > + > + resize_zone(z1, z1_start_pfn, end_pfn); > + resize_zone(z2, end_pfn, zone_end_pfn(z2)); > + > + pgdat_resize_unlock(z1->zone_pgdat, &flags); > + > + fix_zone_id(z1, start_pfn, end_pfn); > + > + return 0; > +out_fail: > + pgdat_resize_unlock(z1->zone_pgdat, &flags); > + return -1; > +} > + > +static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, > + unsigned long start_pfn, unsigned long end_pfn) > +{ > + unsigned long flags; > + unsigned long z2_end_pfn; > + > + ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); > + > + pgdat_resize_lock(z1->zone_pgdat, &flags); > + > + /* can't move pfns which are lower than @z1 */ > + if (z1->zone_start_pfn > start_pfn) > + goto out_fail; > + /* the move out part mast at the right most of @z1 */ > + if (zone_end_pfn(z1) > end_pfn) > + goto out_fail; > + /* must included/overlap */ > + if (start_pfn >= zone_end_pfn(z1)) > + goto out_fail; > + > + /* use end_pfn for z2's end_pfn if z2 is empty */ > + if (!zone_is_empty(z2)) > + z2_end_pfn = zone_end_pfn(z2); > + else > + z2_end_pfn = end_pfn; > + > + resize_zone(z1, z1->zone_start_pfn, start_pfn); > + resize_zone(z2, start_pfn, z2_end_pfn); > + > + pgdat_resize_unlock(z1->zone_pgdat, &flags); > + > + fix_zone_id(z2, start_pfn, end_pfn); > + > + return 0; > +out_fail: > + pgdat_resize_unlock(z1->zone_pgdat, &flags); > + return -1; > +} > + > +static struct zone * __meminit move_pfn_range(int zone_shift, > + unsigned long start_pfn, unsigned long end_pfn) > +{ > + struct zone *zone = page_zone(pfn_to_page(start_pfn)); > + int ret = 0; > + > + if (zone_shift < 0) > + ret = move_pfn_range_left(zone + zone_shift, zone, > + start_pfn, end_pfn); > + else if (zone_shift) > + ret = move_pfn_range_right(zone, zone + zone_shift, > + start_pfn, end_pfn); > + > + if (ret) > + return NULL; > + > + return zone + zone_shift; > +} > + > +static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, > + unsigned long end_pfn) > +{ > + unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); > + > + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) > + pgdat->node_start_pfn = start_pfn; > + > + pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - > + pgdat->node_start_pfn; > +} > + > +static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) > +{ > + struct pglist_data *pgdat = zone->zone_pgdat; > + int nr_pages = PAGES_PER_SECTION; > + int nid = pgdat->node_id; > + int zone_type; > + unsigned long flags, pfn; > + > + zone_type = zone - pgdat->node_zones; > + ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); > + > + 
pgdat_resize_lock(zone->zone_pgdat, &flags); > + grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); > + grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, > + phys_start_pfn + nr_pages); > + pgdat_resize_unlock(zone->zone_pgdat, &flags); > + memmap_init_zone(nr_pages, nid, zone_type, > + phys_start_pfn, MEMMAP_HOTPLUG); > + > + /* online_page_range is called later and expects pages reserved */ > + for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { > + if (!pfn_valid(pfn)) > + continue; > + > + SetPageReserved(pfn_to_page(pfn)); > + } > + return 0; > +} > + > +static int __meminit __add_section(int nid, struct zone *zone, > + unsigned long phys_start_pfn, bool want_memblock) > { > int ret; > - int i; > > if (pfn_valid(phys_start_pfn)) > return -EEXIST; > > - ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn); > + ret = sparse_add_one_section(zone, phys_start_pfn); > + > if (ret < 0) > return ret; > > - /* > - * Make all the pages reserved so that nobody will stumble over half > - * initialized state. > - * FIXME: We also have to associate it with a node because pfn_to_node > - * relies on having page with the proper node. > - */ > - for (i = 0; i < PAGES_PER_SECTION; i++) { > - unsigned long pfn = phys_start_pfn + i; > - struct page *page; > - if (!pfn_valid(pfn)) > - continue; > + ret = __add_zone(zone, phys_start_pfn); > > - page = pfn_to_page(pfn); > - set_page_node(page, nid); > - SetPageReserved(page); > - } > + if (ret < 0) > + return ret; > > if (!want_memblock) > return 0; > @@ -285,7 +466,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, > * call this function after deciding the zone to which to > * add the new pages. > */ > -int __ref __add_pages(int nid, unsigned long phys_start_pfn, > +int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, > unsigned long nr_pages, bool want_memblock) > { > unsigned long i; > @@ -293,6 +474,8 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, > int start_sec, end_sec; > struct vmem_altmap *altmap; > > + clear_zone_contiguous(zone); > + > /* during initialize mem_map, align hot-added range to section */ > start_sec = pfn_to_section_nr(phys_start_pfn); > end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); > @@ -312,7 +495,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, > } > > for (i = start_sec; i <= end_sec; i++) { > - err = __add_section(nid, section_nr_to_pfn(i), want_memblock); > + err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock); > > /* > * EEXIST is finally dealt with by ioresource collision > @@ -325,6 +508,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, > } > vmemmap_populate_print_last(); > out: > + set_zone_contiguous(zone); > return err; > } > EXPORT_SYMBOL_GPL(__add_pages); > @@ -699,6 +883,23 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, > return 0; > } > > +#ifdef CONFIG_MOVABLE_NODE > +/* > + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have > + * normal memory. 
> + */ > +static bool can_online_high_movable(int nid) > +{ > + return true; > +} > +#else /* CONFIG_MOVABLE_NODE */ > +/* ensure every online node has NORMAL memory */ > +static bool can_online_high_movable(int nid) > +{ > + return node_state(nid, N_NORMAL_MEMORY); > +} > +#endif /* CONFIG_MOVABLE_NODE */ > + > /* check which state of node_states will be changed when online memory */ > static void node_states_check_changes_online(unsigned long nr_pages, > struct zone *zone, struct memory_notify *arg) > @@ -773,144 +974,39 @@ static void node_states_set_node(int node, struct memory_notify *arg) > node_set_state(node, N_MEMORY); > } > > -bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) > +bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, > + enum zone_type target, int *zone_shift) > { > - struct pglist_data *pgdat = NODE_DATA(nid); > - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; > - struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); > - > - /* > - * TODO there shouldn't be any inherent reason to have ZONE_NORMAL > - * physically before ZONE_MOVABLE. All we need is they do not > - * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE > - * though so let's stick with it for simplicity for now. > - * TODO make sure we do not overlap with ZONE_DEVICE > - */ > - if (online_type == MMOP_ONLINE_KERNEL) { > - if (zone_is_empty(movable_zone)) > - return true; > - return movable_zone->zone_start_pfn >= pfn + nr_pages; > - } else if (online_type == MMOP_ONLINE_MOVABLE) { > - return zone_end_pfn(default_zone) <= pfn; > - } > - > - /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ > - return online_type == MMOP_ONLINE_KEEP; > -} > - > -static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, > - unsigned long nr_pages) > -{ > - unsigned long old_end_pfn = zone_end_pfn(zone); > - > - if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) > - zone->zone_start_pfn = start_pfn; > - > - zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; > -} > - > -static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, > - unsigned long nr_pages) > -{ > - unsigned long old_end_pfn = pgdat_end_pfn(pgdat); > - > - if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) > - pgdat->node_start_pfn = start_pfn; > - > - pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; > -} > - > -void __ref move_pfn_range_to_zone(struct zone *zone, > - unsigned long start_pfn, unsigned long nr_pages) > -{ > - struct pglist_data *pgdat = zone->zone_pgdat; > - int nid = pgdat->node_id; > - unsigned long flags; > - > - if (zone_is_empty(zone)) > - init_currently_empty_zone(zone, start_pfn, nr_pages); > - > - clear_zone_contiguous(zone); > - > - /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ > - pgdat_resize_lock(pgdat, &flags); > - zone_span_writelock(zone); > - resize_zone_range(zone, start_pfn, nr_pages); > - zone_span_writeunlock(zone); > - resize_pgdat_range(pgdat, start_pfn, nr_pages); > - pgdat_resize_unlock(pgdat, &flags); > - > - /* > - * TODO now we have a visible range of pages which are not associated > - * with their zone properly. Not nice but set_pfnblock_flags_mask > - * expects the zone spans the pfn range. 
All the pages in the range > - * are reserved so nobody should be touching them so we should be safe > - */ > - memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG); > - > - set_zone_contiguous(zone); > -} > + struct zone *zone = page_zone(pfn_to_page(pfn)); > + enum zone_type idx = zone_idx(zone); > + int i; > > -/* > - * Returns a default kernel memory zone for the given pfn range. > - * If no kernel zone covers this pfn range it will automatically go > - * to the ZONE_NORMAL. > - */ > -struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, > - unsigned long nr_pages) > -{ > - struct pglist_data *pgdat = NODE_DATA(nid); > - int zid; > + *zone_shift = 0; > > - for (zid = 0; zid <= ZONE_NORMAL; zid++) { > - struct zone *zone = &pgdat->node_zones[zid]; > + if (idx < target) { > + /* pages must be at end of current zone */ > + if (pfn + nr_pages != zone_end_pfn(zone)) > + return false; > > - if (zone_intersects(zone, start_pfn, nr_pages)) > - return zone; > + /* no zones in use between current zone and target */ > + for (i = idx + 1; i < target; i++) > + if (zone_is_initialized(zone - idx + i)) > + return false; > } > > - return &pgdat->node_zones[ZONE_NORMAL]; > -} > - > -static inline bool movable_pfn_range(int nid, struct zone *default_zone, > - unsigned long start_pfn, unsigned long nr_pages) > -{ > - if (!allow_online_pfn_range(nid, start_pfn, nr_pages, > - MMOP_ONLINE_KERNEL)) > - return true; > - > - if (!movable_node_is_enabled()) > - return false; > - > - return !zone_intersects(default_zone, start_pfn, nr_pages); > -} > - > -/* > - * Associates the given pfn range with the given node and the zone appropriate > - * for the given online type. > - */ > -static struct zone * __meminit move_pfn_range(int online_type, int nid, > - unsigned long start_pfn, unsigned long nr_pages) > -{ > - struct pglist_data *pgdat = NODE_DATA(nid); > - struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); > + if (target < idx) { > + /* pages must be at beginning of current zone */ > + if (pfn != zone->zone_start_pfn) > + return false; > > - if (online_type == MMOP_ONLINE_KEEP) { > - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; > - /* > - * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use > - * movable zone if that is not possible (e.g. we are within > - * or past the existing movable zone). movable_node overrides > - * this default and defaults to movable zone > - */ > - if (movable_pfn_range(nid, zone, start_pfn, nr_pages)) > - zone = movable_zone; > - } else if (online_type == MMOP_ONLINE_MOVABLE) { > - zone = &pgdat->node_zones[ZONE_MOVABLE]; > + /* no zones in use between current zone and target */ > + for (i = target + 1; i < idx; i++) > + if (zone_is_initialized(zone - idx + i)) > + return false; > } > > - move_pfn_range_to_zone(zone, start_pfn, nr_pages); > - return zone; > + *zone_shift = target - idx; > + return true; > } > > /* Must be protected by mem_hotplug_begin() */ > @@ -923,18 +1019,38 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ > int nid; > int ret; > struct memory_notify arg; > + int zone_shift = 0; > > - nid = pfn_to_nid(pfn); > - if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) > + /* > + * This doesn't need a lock to do pfn_to_page(). > + * The section can't be removed here because of the > + * memory_block->state_mutex. 
> + */ > + zone = page_zone(pfn_to_page(pfn)); > + > + if ((zone_idx(zone) > ZONE_NORMAL || > + online_type == MMOP_ONLINE_MOVABLE) && > + !can_online_high_movable(pfn_to_nid(pfn))) > return -EINVAL; > > - /* associate pfn range with the zone */ > - zone = move_pfn_range(online_type, nid, pfn, nr_pages); > + if (online_type == MMOP_ONLINE_KERNEL) { > + if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) > + return -EINVAL; > + } else if (online_type == MMOP_ONLINE_MOVABLE) { > + if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) > + return -EINVAL; > + } > + > + zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); > + if (!zone) > + return -EINVAL; > > arg.start_pfn = pfn; > arg.nr_pages = nr_pages; > node_states_check_changes_online(nr_pages, zone, &arg); > > + nid = zone_to_nid(zone); > + > ret = memory_notify(MEM_GOING_ONLINE, &arg); > ret = notifier_to_errno(ret); > if (ret) > @@ -1129,6 +1245,39 @@ static int check_hotplug_memory_range(u64 start, u64 size) > return 0; > } > > +/* > + * If movable zone has already been setup, newly added memory should be check. > + * If its address is higher than movable zone, it should be added as movable. > + * Without this check, movable zone may overlap with other zone. > + */ > +static int should_add_memory_movable(int nid, u64 start, u64 size) > +{ > + unsigned long start_pfn = start >> PAGE_SHIFT; > + pg_data_t *pgdat = NODE_DATA(nid); > + struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; > + > + if (zone_is_empty(movable_zone)) > + return 0; > + > + if (movable_zone->zone_start_pfn <= start_pfn) > + return 1; > + > + return 0; > +} > + > +int zone_for_memory(int nid, u64 start, u64 size, int zone_default, > + bool for_device) > +{ > +#ifdef CONFIG_ZONE_DEVICE > + if (for_device) > + return ZONE_DEVICE; > +#endif > + if (should_add_memory_movable(nid, start, size)) > + return ZONE_MOVABLE; > + > + return zone_default; > +} > + > static int online_memory_block(struct memory_block *mem, void *arg) > { > return device_online(&mem->dev); > diff --git a/mm/sparse.c b/mm/sparse.c > index 9c48e4f..293cb5d 100644 > --- a/mm/sparse.c > +++ b/mm/sparse.c > @@ -776,9 +776,10 @@ static void free_map_bootmem(struct page *memmap) > * set. If this is <=0, then that means that the passed-in > * map was not consumed and must be freed. > */ > -int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn) > +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) > { > unsigned long section_nr = pfn_to_section_nr(start_pfn); > + struct pglist_data *pgdat = zone->zone_pgdat; > struct mem_section *ms; > struct page *memmap; > unsigned long *usemap; > -- > 2.7.4 > > > -- > kernel-team mailing list > kernel-team@lists.ubuntu.com > https://lists.ubuntu.com/mailman/listinfo/kernel-team
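For anyone who wants to exercise the reproducer described in the commit message, here is a rough sketch of the steps, assuming an Ubuntu guest and the selftest script at its usual in-tree location (tools/testing/selftests/memory-hotplug/); the exact invocation and package names are illustrative, not taken from the patch:

    # In a 4-CPU VM with 2GB of RAM:
    cd tools/testing/selftests/memory-hotplug
    sudo ./mem-on-off-test.sh        # offline/online hotpluggable memory sections

    # The script alone does not always trip the bug; flush dirty pages
    # and then generate I/O load, e.g. by reinstalling the kernel packages:
    sync
    sudo apt-get install --reinstall "linux-image-$(uname -r)"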
On 2018-02-09 10:08:54 , Colin King wrote: > From: Colin Ian King <colin.king@canonical.com> > > BugLink: http://bugs.launchpad.net/bugs/1747069 > > Hotplug removal causes i386 crashes when exercised with the kernel > selftest mem-on-off-test script. A fix occurs in 4.15 however this > requires a large set of changes that are way too large to be SRU'able > and the least risky way forward is to revert the offending commit. > > This fix reverts commit f1dd2cd13c4b ("mm, memory_hotplug: do not > associate hotadded memory to zones until online"), however the > revert required some manual fix-ups because of subsequent fixes after > this commit. > > Note that running the mem-on-off-script is not always sufficient to > trigger the bug. A good reproducer is to run in a 4 CPU VM with 2GB > of memory and after running the script run sync and then re-install > the kernel packages to trip the issue. This has been thoroughly > tested on i386 and amd64 and given a solid soak test using the > ADT tests too. > > Signed-off-by: Colin Ian King <colin.king@canonical.com> > --- > arch/ia64/mm/init.c | 11 +- > arch/powerpc/mm/mem.c | 12 +- > arch/s390/mm/init.c | 32 ++- > arch/sh/mm/init.c | 10 +- > arch/x86/mm/init_32.c | 7 +- > arch/x86/mm/init_64.c | 11 +- > drivers/base/memory.c | 52 +++-- > include/linux/memory_hotplug.h | 19 +- > include/linux/mmzone.h | 16 -- > kernel/memremap.c | 6 +- > mm/memory_hotplug.c | 457 +++++++++++++++++++++++++++-------------- > mm/sparse.c | 3 +- > 12 files changed, 409 insertions(+), 227 deletions(-) > > diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c > index a4e8d6b..39e2aeb 100644 > --- a/arch/ia64/mm/init.c > +++ b/arch/ia64/mm/init.c > @@ -646,13 +646,20 @@ mem_init (void) > } > > #ifdef CONFIG_MEMORY_HOTPLUG > -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) > { > + pg_data_t *pgdat; > + struct zone *zone; > unsigned long start_pfn = start >> PAGE_SHIFT; > unsigned long nr_pages = size >> PAGE_SHIFT; > int ret; > > - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); > + pgdat = NODE_DATA(nid); > + > + zone = pgdat->node_zones + > + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); > + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); > + > if (ret) > printk("%s: Problem encountered in __add_pages() as ret=%d\n", > __func__, ret); > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c > index 46b4e67..331a78b 100644 > --- a/arch/powerpc/mm/mem.c > +++ b/arch/powerpc/mm/mem.c > @@ -127,14 +127,18 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) > return -ENODEV; > } > > -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) > { > + struct pglist_data *pgdata; > + struct zone *zone; > unsigned long start_pfn = start >> PAGE_SHIFT; > unsigned long nr_pages = size >> PAGE_SHIFT; > int rc; > > resize_hpt_for_hotplug(memblock_phys_mem_size()); > > + pgdata = NODE_DATA(nid); > + > start = (unsigned long)__va(start); > rc = create_section_mapping(start, start + size); > if (rc) { > @@ -144,7 +148,11 @@ int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > return -EFAULT; > } > > - return __add_pages(nid, start_pfn, nr_pages, want_memblock); > + /* this should work for most non-highmem platforms */ > + zone = pgdata->node_zones + > + zone_for_memory(nid, start, size, 0, for_device); > + > + return 
__add_pages(nid, zone, start_pfn, nr_pages, !for_device); > } > > #ifdef CONFIG_MEMORY_HOTREMOVE > diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c > index 8111694..a3d5499 100644 > --- a/arch/s390/mm/init.c > +++ b/arch/s390/mm/init.c > @@ -166,17 +166,43 @@ unsigned long memory_block_size_bytes(void) > } > > #ifdef CONFIG_MEMORY_HOTPLUG > -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) > { > + unsigned long zone_start_pfn, zone_end_pfn, nr_pages; > unsigned long start_pfn = PFN_DOWN(start); > unsigned long size_pages = PFN_DOWN(size); > - int rc; > + pg_data_t *pgdat = NODE_DATA(nid); > + struct zone *zone; > + int rc, i; > > rc = vmem_add_mapping(start, size); > if (rc) > return rc; > > - rc = __add_pages(nid, start_pfn, size_pages, want_memblock); > + for (i = 0; i < MAX_NR_ZONES; i++) { > + zone = pgdat->node_zones + i; > + if (zone_idx(zone) != ZONE_MOVABLE) { > + /* Add range within existing zone limits, if possible */ > + zone_start_pfn = zone->zone_start_pfn; > + zone_end_pfn = zone->zone_start_pfn + > + zone->spanned_pages; > + } else { > + /* Add remaining range to ZONE_MOVABLE */ > + zone_start_pfn = start_pfn; > + zone_end_pfn = start_pfn + size_pages; > + } > + if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) > + continue; > + nr_pages = (start_pfn + size_pages > zone_end_pfn) ? > + zone_end_pfn - start_pfn : size_pages; > + rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); > + if (rc) > + break; > + start_pfn += nr_pages; > + size_pages -= nr_pages; > + if (!size_pages) > + break; > + } > if (rc) > vmem_remove_mapping(start, size); > return rc; > diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c > index bf726af..a9d57f7 100644 > --- a/arch/sh/mm/init.c > +++ b/arch/sh/mm/init.c > @@ -485,14 +485,20 @@ void free_initrd_mem(unsigned long start, unsigned long end) > #endif > > #ifdef CONFIG_MEMORY_HOTPLUG > -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) > { > + pg_data_t *pgdat; > unsigned long start_pfn = PFN_DOWN(start); > unsigned long nr_pages = size >> PAGE_SHIFT; > int ret; > > + pgdat = NODE_DATA(nid); > + > /* We only have ZONE_NORMAL, so this is easy.. 
*/ > - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); > + ret = __add_pages(nid, pgdat->node_zones + > + zone_for_memory(nid, start, size, ZONE_NORMAL, > + for_device), > + start_pfn, nr_pages, !for_device); > if (unlikely(ret)) > printk("%s: Failed, __add_pages() == %d\n", __func__, ret); > > diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c > index 135c9a7..e7bae21 100644 > --- a/arch/x86/mm/init_32.c > +++ b/arch/x86/mm/init_32.c > @@ -829,12 +829,15 @@ void __init mem_init(void) > } > > #ifdef CONFIG_MEMORY_HOTPLUG > -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) > { > + struct pglist_data *pgdata = NODE_DATA(nid); > + struct zone *zone = pgdata->node_zones + > + zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); > unsigned long start_pfn = start >> PAGE_SHIFT; > unsigned long nr_pages = size >> PAGE_SHIFT; > > - return __add_pages(nid, start_pfn, nr_pages, want_memblock); > + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); > } > > #ifdef CONFIG_MEMORY_HOTREMOVE > diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c > index 902983c..456b452 100644 > --- a/arch/x86/mm/init_64.c > +++ b/arch/x86/mm/init_64.c > @@ -772,15 +772,22 @@ static void update_end_of_memory_vars(u64 start, u64 size) > } > } > > -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) > +/* > + * Memory is added always to NORMAL zone. This means you will never get > + * additional DMA/DMA32 memory. > + */ > +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) > { > + struct pglist_data *pgdat = NODE_DATA(nid); > + struct zone *zone = pgdat->node_zones + > + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); > unsigned long start_pfn = start >> PAGE_SHIFT; > unsigned long nr_pages = size >> PAGE_SHIFT; > int ret; > > init_memory_mapping(start, start + size); > > - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); > + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); > WARN_ON_ONCE(ret); > > /* update max_pfn, max_low_pfn and high_memory */ > diff --git a/drivers/base/memory.c b/drivers/base/memory.c > index c7c4e03..1e884d8 100644 > --- a/drivers/base/memory.c > +++ b/drivers/base/memory.c > @@ -392,43 +392,39 @@ static ssize_t show_valid_zones(struct device *dev, > struct device_attribute *attr, char *buf) > { > struct memory_block *mem = to_memory_block(dev); > - unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); > + unsigned long start_pfn, end_pfn; > + unsigned long valid_start, valid_end, valid_pages; > unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; > - unsigned long valid_start_pfn, valid_end_pfn; > - bool append = false; > - int nid; > + struct zone *zone; > + int zone_shift = 0; > > - /* > - * The block contains more than one zone can not be offlined. > - * This can happen e.g. for ZONE_DMA and ZONE_DMA32 > - */ > - if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn)) > + start_pfn = section_nr_to_pfn(mem->start_section_nr); > + end_pfn = start_pfn + nr_pages; > + > + /* The block contains more than one zone can not be offlined. 
*/ > + if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) > return sprintf(buf, "none\n"); > > - start_pfn = valid_start_pfn; > - nr_pages = valid_end_pfn - start_pfn; > + zone = page_zone(pfn_to_page(valid_start)); > + valid_pages = valid_end - valid_start; > > - /* > - * Check the existing zone. Make sure that we do that only on the > - * online nodes otherwise the page_zone is not reliable > - */ > - if (mem->state == MEM_ONLINE) { > - strcat(buf, page_zone(pfn_to_page(start_pfn))->name); > - goto out; > - } > + /* MMOP_ONLINE_KEEP */ > + sprintf(buf, "%s", zone->name); > > - nid = pfn_to_nid(start_pfn); > - if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { > - strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name); > - append = true; > + /* MMOP_ONLINE_KERNEL */ > + zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift); > + if (zone_shift) { > + strcat(buf, " "); > + strcat(buf, (zone + zone_shift)->name); > } > > - if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) { > - if (append) > - strcat(buf, " "); > - strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name); > + /* MMOP_ONLINE_MOVABLE */ > + zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift); > + if (zone_shift) { > + strcat(buf, " "); > + strcat(buf, (zone + zone_shift)->name); > } > -out: > + > strcat(buf, "\n"); > > return strlen(buf); > diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h > index c8a5056..b18a1b8 100644 > --- a/include/linux/memory_hotplug.h > +++ b/include/linux/memory_hotplug.h > @@ -129,8 +129,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, > unsigned long nr_pages); > #endif /* CONFIG_MEMORY_HOTREMOVE */ > > -/* reasonably generic interface to expand the physical pages */ > -extern int __add_pages(int nid, unsigned long start_pfn, > +/* reasonably generic interface to expand the physical pages in a zone */ > +extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, > unsigned long nr_pages, bool want_memblock); > > #ifdef CONFIG_NUMA > @@ -306,19 +306,18 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, > void *arg, int (*func)(struct memory_block *, void *)); > extern int add_memory(int nid, u64 start, u64 size); > extern int add_memory_resource(int nid, struct resource *resource, bool online); > -extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); > -extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, > - unsigned long nr_pages); > +extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, > + bool for_device); > +extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); > extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); > extern bool is_memblock_offlined(struct memory_block *mem); > extern void remove_memory(int nid, u64 start, u64 size); > -extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn); > +extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); > extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, > unsigned long map_offset); > extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, > unsigned long pnum); > -extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, > - int online_type); > -extern struct zone *default_zone_for_pfn(int nid, 
unsigned long pfn, > - unsigned long nr_pages); > +extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, > + enum zone_type target, int *zone_shift); > + > #endif /* __LINUX_MEMORY_HOTPLUG_H */ > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 9c6c001..ceb86e9 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -533,22 +533,6 @@ static inline bool zone_is_empty(struct zone *zone) > } > > /* > - * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty > - * intersection with the given zone > - */ > -static inline bool zone_intersects(struct zone *zone, > - unsigned long start_pfn, unsigned long nr_pages) > -{ > - if (zone_is_empty(zone)) > - return false; > - if (start_pfn >= zone_end_pfn(zone) || > - start_pfn + nr_pages <= zone->zone_start_pfn) > - return false; > - > - return true; > -} > - > -/* > * The "priority" of VM scanning is how much of the queues we will scan in one > * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the > * queues ("queue_length >> 12") during an aging round. > diff --git a/kernel/memremap.c b/kernel/memremap.c > index 124bed7..23a6483 100644 > --- a/kernel/memremap.c > +++ b/kernel/memremap.c > @@ -358,11 +358,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, > goto err_pfn_remap; > > mem_hotplug_begin(); > - error = arch_add_memory(nid, align_start, align_size, false); > - if (!error) > - move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], > - align_start >> PAGE_SHIFT, > - align_size >> PAGE_SHIFT); > + error = arch_add_memory(nid, align_start, align_size, true); > mem_hotplug_done(); > if (error) > goto err_add_memory; > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c > index 8dccc31..a4724e7 100644 > --- a/mm/memory_hotplug.c > +++ b/mm/memory_hotplug.c > @@ -243,35 +243,216 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) > } > #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ > > -static int __meminit __add_section(int nid, unsigned long phys_start_pfn, > - bool want_memblock) > +static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, > + unsigned long end_pfn) > +{ > + unsigned long old_zone_end_pfn; > + > + zone_span_writelock(zone); > + > + old_zone_end_pfn = zone_end_pfn(zone); > + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) > + zone->zone_start_pfn = start_pfn; > + > + zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - > + zone->zone_start_pfn; > + > + zone_span_writeunlock(zone); > +} > + > +static void resize_zone(struct zone *zone, unsigned long start_pfn, > + unsigned long end_pfn) > +{ > + zone_span_writelock(zone); > + > + if (end_pfn - start_pfn) { > + zone->zone_start_pfn = start_pfn; > + zone->spanned_pages = end_pfn - start_pfn; > + } else { > + /* > + * make it consist as free_area_init_core(), > + * if spanned_pages = 0, then keep start_pfn = 0 > + */ > + zone->zone_start_pfn = 0; > + zone->spanned_pages = 0; > + } > + > + zone_span_writeunlock(zone); > +} > + > +static void fix_zone_id(struct zone *zone, unsigned long start_pfn, > + unsigned long end_pfn) > +{ > + enum zone_type zid = zone_idx(zone); > + int nid = zone->zone_pgdat->node_id; > + unsigned long pfn; > + > + for (pfn = start_pfn; pfn < end_pfn; pfn++) > + set_page_links(pfn_to_page(pfn), zid, nid, pfn); > +} > + > +static void __ref ensure_zone_is_initialized(struct zone *zone, > + unsigned long start_pfn, unsigned long num_pages) > +{ > + if (!zone_is_initialized(zone)) 
> + init_currently_empty_zone(zone, start_pfn, num_pages); > +} > + > +static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, > + unsigned long start_pfn, unsigned long end_pfn) > +{ > + unsigned long flags; > + unsigned long z1_start_pfn; > + > + ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); > + > + pgdat_resize_lock(z1->zone_pgdat, &flags); > + > + /* can't move pfns which are higher than @z2 */ > + if (end_pfn > zone_end_pfn(z2)) > + goto out_fail; > + /* the move out part must be at the left most of @z2 */ > + if (start_pfn > z2->zone_start_pfn) > + goto out_fail; > + /* must included/overlap */ > + if (end_pfn <= z2->zone_start_pfn) > + goto out_fail; > + > + /* use start_pfn for z1's start_pfn if z1 is empty */ > + if (!zone_is_empty(z1)) > + z1_start_pfn = z1->zone_start_pfn; > + else > + z1_start_pfn = start_pfn; > + > + resize_zone(z1, z1_start_pfn, end_pfn); > + resize_zone(z2, end_pfn, zone_end_pfn(z2)); > + > + pgdat_resize_unlock(z1->zone_pgdat, &flags); > + > + fix_zone_id(z1, start_pfn, end_pfn); > + > + return 0; > +out_fail: > + pgdat_resize_unlock(z1->zone_pgdat, &flags); > + return -1; > +} > + > +static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, > + unsigned long start_pfn, unsigned long end_pfn) > +{ > + unsigned long flags; > + unsigned long z2_end_pfn; > + > + ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); > + > + pgdat_resize_lock(z1->zone_pgdat, &flags); > + > + /* can't move pfns which are lower than @z1 */ > + if (z1->zone_start_pfn > start_pfn) > + goto out_fail; > + /* the move out part mast at the right most of @z1 */ > + if (zone_end_pfn(z1) > end_pfn) > + goto out_fail; > + /* must included/overlap */ > + if (start_pfn >= zone_end_pfn(z1)) > + goto out_fail; > + > + /* use end_pfn for z2's end_pfn if z2 is empty */ > + if (!zone_is_empty(z2)) > + z2_end_pfn = zone_end_pfn(z2); > + else > + z2_end_pfn = end_pfn; > + > + resize_zone(z1, z1->zone_start_pfn, start_pfn); > + resize_zone(z2, start_pfn, z2_end_pfn); > + > + pgdat_resize_unlock(z1->zone_pgdat, &flags); > + > + fix_zone_id(z2, start_pfn, end_pfn); > + > + return 0; > +out_fail: > + pgdat_resize_unlock(z1->zone_pgdat, &flags); > + return -1; > +} > + > +static struct zone * __meminit move_pfn_range(int zone_shift, > + unsigned long start_pfn, unsigned long end_pfn) > +{ > + struct zone *zone = page_zone(pfn_to_page(start_pfn)); > + int ret = 0; > + > + if (zone_shift < 0) > + ret = move_pfn_range_left(zone + zone_shift, zone, > + start_pfn, end_pfn); > + else if (zone_shift) > + ret = move_pfn_range_right(zone, zone + zone_shift, > + start_pfn, end_pfn); > + > + if (ret) > + return NULL; > + > + return zone + zone_shift; > +} > + > +static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, > + unsigned long end_pfn) > +{ > + unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); > + > + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) > + pgdat->node_start_pfn = start_pfn; > + > + pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - > + pgdat->node_start_pfn; > +} > + > +static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) > +{ > + struct pglist_data *pgdat = zone->zone_pgdat; > + int nr_pages = PAGES_PER_SECTION; > + int nid = pgdat->node_id; > + int zone_type; > + unsigned long flags, pfn; > + > + zone_type = zone - pgdat->node_zones; > + ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); > + > + 
pgdat_resize_lock(zone->zone_pgdat, &flags); > + grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); > + grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, > + phys_start_pfn + nr_pages); > + pgdat_resize_unlock(zone->zone_pgdat, &flags); > + memmap_init_zone(nr_pages, nid, zone_type, > + phys_start_pfn, MEMMAP_HOTPLUG); > + > + /* online_page_range is called later and expects pages reserved */ > + for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { > + if (!pfn_valid(pfn)) > + continue; > + > + SetPageReserved(pfn_to_page(pfn)); > + } > + return 0; > +} > + > +static int __meminit __add_section(int nid, struct zone *zone, > + unsigned long phys_start_pfn, bool want_memblock) > { > int ret; > - int i; > > if (pfn_valid(phys_start_pfn)) > return -EEXIST; > > - ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn); > + ret = sparse_add_one_section(zone, phys_start_pfn); > + > if (ret < 0) > return ret; > > - /* > - * Make all the pages reserved so that nobody will stumble over half > - * initialized state. > - * FIXME: We also have to associate it with a node because pfn_to_node > - * relies on having page with the proper node. > - */ > - for (i = 0; i < PAGES_PER_SECTION; i++) { > - unsigned long pfn = phys_start_pfn + i; > - struct page *page; > - if (!pfn_valid(pfn)) > - continue; > + ret = __add_zone(zone, phys_start_pfn); > > - page = pfn_to_page(pfn); > - set_page_node(page, nid); > - SetPageReserved(page); > - } > + if (ret < 0) > + return ret; > > if (!want_memblock) > return 0; > @@ -285,7 +466,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, > * call this function after deciding the zone to which to > * add the new pages. > */ > -int __ref __add_pages(int nid, unsigned long phys_start_pfn, > +int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, > unsigned long nr_pages, bool want_memblock) > { > unsigned long i; > @@ -293,6 +474,8 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, > int start_sec, end_sec; > struct vmem_altmap *altmap; > > + clear_zone_contiguous(zone); > + > /* during initialize mem_map, align hot-added range to section */ > start_sec = pfn_to_section_nr(phys_start_pfn); > end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); > @@ -312,7 +495,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, > } > > for (i = start_sec; i <= end_sec; i++) { > - err = __add_section(nid, section_nr_to_pfn(i), want_memblock); > + err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock); > > /* > * EEXIST is finally dealt with by ioresource collision > @@ -325,6 +508,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, > } > vmemmap_populate_print_last(); > out: > + set_zone_contiguous(zone); > return err; > } > EXPORT_SYMBOL_GPL(__add_pages); > @@ -699,6 +883,23 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, > return 0; > } > > +#ifdef CONFIG_MOVABLE_NODE > +/* > + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have > + * normal memory. 
> + */ > +static bool can_online_high_movable(int nid) > +{ > + return true; > +} > +#else /* CONFIG_MOVABLE_NODE */ > +/* ensure every online node has NORMAL memory */ > +static bool can_online_high_movable(int nid) > +{ > + return node_state(nid, N_NORMAL_MEMORY); > +} > +#endif /* CONFIG_MOVABLE_NODE */ > + > /* check which state of node_states will be changed when online memory */ > static void node_states_check_changes_online(unsigned long nr_pages, > struct zone *zone, struct memory_notify *arg) > @@ -773,144 +974,39 @@ static void node_states_set_node(int node, struct memory_notify *arg) > node_set_state(node, N_MEMORY); > } > > -bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) > +bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, > + enum zone_type target, int *zone_shift) > { > - struct pglist_data *pgdat = NODE_DATA(nid); > - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; > - struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); > - > - /* > - * TODO there shouldn't be any inherent reason to have ZONE_NORMAL > - * physically before ZONE_MOVABLE. All we need is they do not > - * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE > - * though so let's stick with it for simplicity for now. > - * TODO make sure we do not overlap with ZONE_DEVICE > - */ > - if (online_type == MMOP_ONLINE_KERNEL) { > - if (zone_is_empty(movable_zone)) > - return true; > - return movable_zone->zone_start_pfn >= pfn + nr_pages; > - } else if (online_type == MMOP_ONLINE_MOVABLE) { > - return zone_end_pfn(default_zone) <= pfn; > - } > - > - /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ > - return online_type == MMOP_ONLINE_KEEP; > -} > - > -static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, > - unsigned long nr_pages) > -{ > - unsigned long old_end_pfn = zone_end_pfn(zone); > - > - if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) > - zone->zone_start_pfn = start_pfn; > - > - zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; > -} > - > -static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, > - unsigned long nr_pages) > -{ > - unsigned long old_end_pfn = pgdat_end_pfn(pgdat); > - > - if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) > - pgdat->node_start_pfn = start_pfn; > - > - pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; > -} > - > -void __ref move_pfn_range_to_zone(struct zone *zone, > - unsigned long start_pfn, unsigned long nr_pages) > -{ > - struct pglist_data *pgdat = zone->zone_pgdat; > - int nid = pgdat->node_id; > - unsigned long flags; > - > - if (zone_is_empty(zone)) > - init_currently_empty_zone(zone, start_pfn, nr_pages); > - > - clear_zone_contiguous(zone); > - > - /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ > - pgdat_resize_lock(pgdat, &flags); > - zone_span_writelock(zone); > - resize_zone_range(zone, start_pfn, nr_pages); > - zone_span_writeunlock(zone); > - resize_pgdat_range(pgdat, start_pfn, nr_pages); > - pgdat_resize_unlock(pgdat, &flags); > - > - /* > - * TODO now we have a visible range of pages which are not associated > - * with their zone properly. Not nice but set_pfnblock_flags_mask > - * expects the zone spans the pfn range. 
All the pages in the range > - * are reserved so nobody should be touching them so we should be safe > - */ > - memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG); > - > - set_zone_contiguous(zone); > -} > + struct zone *zone = page_zone(pfn_to_page(pfn)); > + enum zone_type idx = zone_idx(zone); > + int i; > > -/* > - * Returns a default kernel memory zone for the given pfn range. > - * If no kernel zone covers this pfn range it will automatically go > - * to the ZONE_NORMAL. > - */ > -struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, > - unsigned long nr_pages) > -{ > - struct pglist_data *pgdat = NODE_DATA(nid); > - int zid; > + *zone_shift = 0; > > - for (zid = 0; zid <= ZONE_NORMAL; zid++) { > - struct zone *zone = &pgdat->node_zones[zid]; > + if (idx < target) { > + /* pages must be at end of current zone */ > + if (pfn + nr_pages != zone_end_pfn(zone)) > + return false; > > - if (zone_intersects(zone, start_pfn, nr_pages)) > - return zone; > + /* no zones in use between current zone and target */ > + for (i = idx + 1; i < target; i++) > + if (zone_is_initialized(zone - idx + i)) > + return false; > } > > - return &pgdat->node_zones[ZONE_NORMAL]; > -} > - > -static inline bool movable_pfn_range(int nid, struct zone *default_zone, > - unsigned long start_pfn, unsigned long nr_pages) > -{ > - if (!allow_online_pfn_range(nid, start_pfn, nr_pages, > - MMOP_ONLINE_KERNEL)) > - return true; > - > - if (!movable_node_is_enabled()) > - return false; > - > - return !zone_intersects(default_zone, start_pfn, nr_pages); > -} > - > -/* > - * Associates the given pfn range with the given node and the zone appropriate > - * for the given online type. > - */ > -static struct zone * __meminit move_pfn_range(int online_type, int nid, > - unsigned long start_pfn, unsigned long nr_pages) > -{ > - struct pglist_data *pgdat = NODE_DATA(nid); > - struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); > + if (target < idx) { > + /* pages must be at beginning of current zone */ > + if (pfn != zone->zone_start_pfn) > + return false; > > - if (online_type == MMOP_ONLINE_KEEP) { > - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; > - /* > - * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use > - * movable zone if that is not possible (e.g. we are within > - * or past the existing movable zone). movable_node overrides > - * this default and defaults to movable zone > - */ > - if (movable_pfn_range(nid, zone, start_pfn, nr_pages)) > - zone = movable_zone; > - } else if (online_type == MMOP_ONLINE_MOVABLE) { > - zone = &pgdat->node_zones[ZONE_MOVABLE]; > + /* no zones in use between current zone and target */ > + for (i = target + 1; i < idx; i++) > + if (zone_is_initialized(zone - idx + i)) > + return false; > } > > - move_pfn_range_to_zone(zone, start_pfn, nr_pages); > - return zone; > + *zone_shift = target - idx; > + return true; > } > > /* Must be protected by mem_hotplug_begin() */ > @@ -923,18 +1019,38 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ > int nid; > int ret; > struct memory_notify arg; > + int zone_shift = 0; > > - nid = pfn_to_nid(pfn); > - if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) > + /* > + * This doesn't need a lock to do pfn_to_page(). > + * The section can't be removed here because of the > + * memory_block->state_mutex. 
> + */ > + zone = page_zone(pfn_to_page(pfn)); > + > + if ((zone_idx(zone) > ZONE_NORMAL || > + online_type == MMOP_ONLINE_MOVABLE) && > + !can_online_high_movable(pfn_to_nid(pfn))) > return -EINVAL; > > - /* associate pfn range with the zone */ > - zone = move_pfn_range(online_type, nid, pfn, nr_pages); > + if (online_type == MMOP_ONLINE_KERNEL) { > + if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) > + return -EINVAL; > + } else if (online_type == MMOP_ONLINE_MOVABLE) { > + if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) > + return -EINVAL; > + } > + > + zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); > + if (!zone) > + return -EINVAL; > > arg.start_pfn = pfn; > arg.nr_pages = nr_pages; > node_states_check_changes_online(nr_pages, zone, &arg); > > + nid = zone_to_nid(zone); > + > ret = memory_notify(MEM_GOING_ONLINE, &arg); > ret = notifier_to_errno(ret); > if (ret) > @@ -1129,6 +1245,39 @@ static int check_hotplug_memory_range(u64 start, u64 size) > return 0; > } > > +/* > + * If movable zone has already been setup, newly added memory should be check. > + * If its address is higher than movable zone, it should be added as movable. > + * Without this check, movable zone may overlap with other zone. > + */ > +static int should_add_memory_movable(int nid, u64 start, u64 size) > +{ > + unsigned long start_pfn = start >> PAGE_SHIFT; > + pg_data_t *pgdat = NODE_DATA(nid); > + struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; > + > + if (zone_is_empty(movable_zone)) > + return 0; > + > + if (movable_zone->zone_start_pfn <= start_pfn) > + return 1; > + > + return 0; > +} > + > +int zone_for_memory(int nid, u64 start, u64 size, int zone_default, > + bool for_device) > +{ > +#ifdef CONFIG_ZONE_DEVICE > + if (for_device) > + return ZONE_DEVICE; > +#endif > + if (should_add_memory_movable(nid, start, size)) > + return ZONE_MOVABLE; > + > + return zone_default; > +} > + > static int online_memory_block(struct memory_block *mem, void *arg) > { > return device_online(&mem->dev); > diff --git a/mm/sparse.c b/mm/sparse.c > index 9c48e4f..293cb5d 100644 > --- a/mm/sparse.c > +++ b/mm/sparse.c > @@ -776,9 +776,10 @@ static void free_map_bootmem(struct page *memmap) > * set. If this is <=0, then that means that the passed-in > * map was not consumed and must be freed. > */ > -int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn) > +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) > { > unsigned long section_nr = pfn_to_section_nr(start_pfn); > + struct pglist_data *pgdat = zone->zone_pgdat; > struct mem_section *ms; > struct page *memmap; > unsigned long *usemap; I don't really get this code, but it's a revert and has been thoroughly tested and discussed, so: Acked-by: Khalid Elmously <khalid.elmously@canonical.com>
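For readers who, like Khalid, want a feel for what the revert restores: zone_can_shift() (quoted above) only lets online_pages() re-zone a range that sits flush against a zone boundary, with no zone in use between the current zone and the target, and reports the zone-index delta through *zone_shift. Below is a minimal userspace model of that check. This is an editorial sketch only: the struct, the single-node zones[] array, and the pfn layout in main() are illustrative stand-ins for kernel types, and zone_is_initialized() is approximated by a non-zero span.

/*
 * Toy model of the restored zone_can_shift() logic.
 * Build with: cc -std=c99 zone_shift_demo.c
 */
#include <stdbool.h>
#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_MOVABLE, MAX_NR_ZONES };

struct zone {
	unsigned long start_pfn;
	unsigned long spanned;		/* 0 means empty/uninitialized */
};

static struct zone zones[MAX_NR_ZONES];	/* one fake node */

static unsigned long zone_end_pfn(const struct zone *z)
{
	return z->start_pfn + z->spanned;
}

/* mirrors the quoted kernel logic, minus the page_zone() lookup */
static bool zone_can_shift(enum zone_type idx, unsigned long pfn,
			   unsigned long nr_pages, enum zone_type target,
			   int *zone_shift)
{
	const struct zone *zone = &zones[idx];
	int i;

	*zone_shift = 0;

	if (idx < target) {
		/* range must sit at the end of its current zone */
		if (pfn + nr_pages != zone_end_pfn(zone))
			return false;
		/* no zone in use between the current zone and target */
		for (i = idx + 1; i < target; i++)
			if (zones[i].spanned)
				return false;
	}
	if (target < idx) {
		/* range must sit at the beginning of its current zone */
		if (pfn != zone->start_pfn)
			return false;
		for (i = target + 1; i < idx; i++)
			if (zones[i].spanned)
				return false;
	}

	*zone_shift = (int)target - (int)idx;
	return true;
}

int main(void)
{
	int shift;

	/* pretend ZONE_NORMAL currently spans pfns 0x10000-0x20000 */
	zones[ZONE_NORMAL] = (struct zone){ 0x10000, 0x10000 };

	/* the tail of ZONE_NORMAL may shift right into ZONE_MOVABLE */
	if (zone_can_shift(ZONE_NORMAL, 0x18000, 0x8000,
			   ZONE_MOVABLE, &shift))
		printf("tail range: ok, zone_shift=%d\n", shift);

	/* a range in the middle of the zone may not */
	if (!zone_can_shift(ZONE_NORMAL, 0x14000, 0x1000,
			    ZONE_MOVABLE, &shift))
		printf("middle range: rejected\n");

	return 0;
}

In the quoted kernel code the same check runs under mem_hotplug_begin(), and the computed shift is handed to move_pfn_range(), which resizes the two zones and fixes up the page links.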
Applied to artful/master-next

On 2018-02-09 10:08:54 , Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
>
> BugLink: http://bugs.launchpad.net/bugs/1747069
>
> [...]
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
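The revert likewise restores zone selection at hot-add time: zone_for_memory() (see the mm/memory_hotplug.c hunk below) returns ZONE_DEVICE for device memory, ZONE_MOVABLE when a movable zone is already set up and starts at or below the new range, and the arch-supplied default otherwise, so arch_add_memory() picks the zone up front instead of deferring it to onlining. A reduced userspace sketch follows; the single fake movable zone and these helper bodies mirror the restored functions, but everything here is an illustrative stand-in, not a kernel API.

#include <stdbool.h>
#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_MOVABLE, ZONE_DEVICE };

static const char *zone_names[] = { "DMA", "Normal", "Movable", "Device" };

/* one fake node: ZONE_MOVABLE already set up at pfn 0x80000 */
static struct {
	unsigned long start_pfn, spanned;
} movable = { 0x80000, 0x40000 };

static bool should_add_memory_movable(unsigned long start_pfn)
{
	if (!movable.spanned)		/* no movable zone set up yet */
		return false;
	/* ranges at or above the movable zone start must stay movable */
	return movable.start_pfn <= start_pfn;
}

static enum zone_type zone_for_memory(unsigned long start_pfn,
				      enum zone_type zone_default,
				      bool for_device)
{
	if (for_device)
		return ZONE_DEVICE;
	if (should_add_memory_movable(start_pfn))
		return ZONE_MOVABLE;
	return zone_default;
}

int main(void)
{
	/* above the movable zone start: forced to ZONE_MOVABLE */
	printf("%s\n", zone_names[zone_for_memory(0x90000, ZONE_NORMAL, false)]);
	/* below it: the arch-supplied default wins */
	printf("%s\n", zone_names[zone_for_memory(0x40000, ZONE_NORMAL, false)]);
	/* device memory: always ZONE_DEVICE */
	printf("%s\n", zone_names[zone_for_memory(0x90000, ZONE_NORMAL, true)]);
	return 0;
}

This is the behavioural flip visible in the kernel/memremap.c hunk below: in the reverted-to world, devm_memremap_pages() simply calls arch_add_memory(..., true) and the ZONE_DEVICE association happens inside zone_for_memory(), rather than via a later move_pfn_range_to_zone() call.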
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index a4e8d6b..39e2aeb 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -646,13 +646,20 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { + pg_data_t *pgdat; + struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); + pgdat = NODE_DATA(nid); + + zone = pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 46b4e67..331a78b 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -127,14 +127,18 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { + struct pglist_data *pgdata; + struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int rc; resize_hpt_for_hotplug(memblock_phys_mem_size()); + pgdata = NODE_DATA(nid); + start = (unsigned long)__va(start); rc = create_section_mapping(start, start + size); if (rc) { @@ -144,7 +148,11 @@ int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) return -EFAULT; } - return __add_pages(nid, start_pfn, nr_pages, want_memblock); + /* this should work for most non-highmem platforms */ + zone = pgdata->node_zones + + zone_for_memory(nid, start, size, 0, for_device); + + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8111694..a3d5499 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -166,17 +166,43 @@ unsigned long memory_block_size_bytes(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { + unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); - int rc; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + int rc, i; rc = vmem_add_mapping(start, size); if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, want_memblock); + for (i = 0; i < MAX_NR_ZONES; i++) { + zone = pgdat->node_zones + i; + if (zone_idx(zone) != ZONE_MOVABLE) { + /* Add range within existing zone limits, if possible */ + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone->zone_start_pfn + + zone->spanned_pages; + } else { + /* Add remaining range to ZONE_MOVABLE */ + zone_start_pfn = start_pfn; + zone_end_pfn = start_pfn + size_pages; + } + if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) + continue; + nr_pages = (start_pfn + size_pages > zone_end_pfn) ? 
+ zone_end_pfn - start_pfn : size_pages; + rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + if (rc) + break; + start_pfn += nr_pages; + size_pages -= nr_pages; + if (!size_pages) + break; + } if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index bf726af..a9d57f7 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -485,14 +485,20 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { + pg_data_t *pgdat; unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; + pgdat = NODE_DATA(nid); + /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); + ret = __add_pages(nid, pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL, + for_device), + start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 135c9a7..e7bae21 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -829,12 +829,15 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { + struct pglist_data *pgdata = NODE_DATA(nid); + struct zone *zone = pgdata->node_zones + + zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, want_memblock); + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 902983c..456b452 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,15 +772,22 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +/* + * Memory is added always to NORMAL zone. This means you will never get + * additional DMA/DMA32 memory. 
+ */ +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; init_memory_mapping(start, start + size); - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c7c4e03..1e884d8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -392,43 +392,39 @@ static ssize_t show_valid_zones(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); - unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); + unsigned long start_pfn, end_pfn; + unsigned long valid_start, valid_end, valid_pages; unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - unsigned long valid_start_pfn, valid_end_pfn; - bool append = false; - int nid; + struct zone *zone; + int zone_shift = 0; - /* - * The block contains more than one zone can not be offlined. - * This can happen e.g. for ZONE_DMA and ZONE_DMA32 - */ - if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn)) + start_pfn = section_nr_to_pfn(mem->start_section_nr); + end_pfn = start_pfn + nr_pages; + + /* The block contains more than one zone can not be offlined. */ + if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) return sprintf(buf, "none\n"); - start_pfn = valid_start_pfn; - nr_pages = valid_end_pfn - start_pfn; + zone = page_zone(pfn_to_page(valid_start)); + valid_pages = valid_end - valid_start; - /* - * Check the existing zone. 
Make sure that we do that only on the - * online nodes otherwise the page_zone is not reliable - */ - if (mem->state == MEM_ONLINE) { - strcat(buf, page_zone(pfn_to_page(start_pfn))->name); - goto out; - } + /* MMOP_ONLINE_KEEP */ + sprintf(buf, "%s", zone->name); - nid = pfn_to_nid(start_pfn); - if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { - strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name); - append = true; + /* MMOP_ONLINE_KERNEL */ + zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift); + if (zone_shift) { + strcat(buf, " "); + strcat(buf, (zone + zone_shift)->name); } - if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) { - if (append) - strcat(buf, " "); - strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name); + /* MMOP_ONLINE_MOVABLE */ + zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift); + if (zone_shift) { + strcat(buf, " "); + strcat(buf, (zone + zone_shift)->name); } -out: + strcat(buf, "\n"); return strlen(buf); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c8a5056..b18a1b8 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -129,8 +129,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); #endif /* CONFIG_MEMORY_HOTREMOVE */ -/* reasonably generic interface to expand the physical pages */ -extern int __add_pages(int nid, unsigned long start_pfn, +/* reasonably generic interface to expand the physical pages in a zone */ +extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA @@ -306,19 +306,18 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, bool online); -extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); -extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages); +extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, + bool for_device); +extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); -extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn); +extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, - int online_type); -extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn, - unsigned long nr_pages); +extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, + enum zone_type target, int *zone_shift); + #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9c6c001..ceb86e9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -533,22 +533,6 @@ static inline bool zone_is_empty(struct zone *zone) } /* - * Return true if [start_pfn, start_pfn + 
nr_pages) range has a non-empty - * intersection with the given zone - */ -static inline bool zone_intersects(struct zone *zone, - unsigned long start_pfn, unsigned long nr_pages) -{ - if (zone_is_empty(zone)) - return false; - if (start_pfn >= zone_end_pfn(zone) || - start_pfn + nr_pages <= zone->zone_start_pfn) - return false; - - return true; -} - -/* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. diff --git a/kernel/memremap.c b/kernel/memremap.c index 124bed7..23a6483 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -358,11 +358,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, goto err_pfn_remap; mem_hotplug_begin(); - error = arch_add_memory(nid, align_start, align_size, false); - if (!error) - move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], - align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT); + error = arch_add_memory(nid, align_start, align_size, true); mem_hotplug_done(); if (error) goto err_add_memory; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8dccc31..a4724e7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -243,35 +243,216 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static int __meminit __add_section(int nid, unsigned long phys_start_pfn, - bool want_memblock) +static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long old_zone_end_pfn; + + zone_span_writelock(zone); + + old_zone_end_pfn = zone_end_pfn(zone); + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - + zone->zone_start_pfn; + + zone_span_writeunlock(zone); +} + +static void resize_zone(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + zone_span_writelock(zone); + + if (end_pfn - start_pfn) { + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + } else { + /* + * make it consist as free_area_init_core(), + * if spanned_pages = 0, then keep start_pfn = 0 + */ + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + } + + zone_span_writeunlock(zone); +} + +static void fix_zone_id(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + enum zone_type zid = zone_idx(zone); + int nid = zone->zone_pgdat->node_id; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) + set_page_links(pfn_to_page(pfn), zid, nid, pfn); +} + +static void __ref ensure_zone_is_initialized(struct zone *zone, + unsigned long start_pfn, unsigned long num_pages) +{ + if (!zone_is_initialized(zone)) + init_currently_empty_zone(zone, start_pfn, num_pages); +} + +static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long flags; + unsigned long z1_start_pfn; + + ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); + + pgdat_resize_lock(z1->zone_pgdat, &flags); + + /* can't move pfns which are higher than @z2 */ + if (end_pfn > zone_end_pfn(z2)) + goto out_fail; + /* the move out part must be at the left most of @z2 */ + if (start_pfn > z2->zone_start_pfn) + goto out_fail; + /* must included/overlap */ + if (end_pfn <= z2->zone_start_pfn) + goto out_fail; + + /* use start_pfn for z1's 
start_pfn if z1 is empty */
> +	if (!zone_is_empty(z1))
> +		z1_start_pfn = z1->zone_start_pfn;
> +	else
> +		z1_start_pfn = start_pfn;
> +
> +	resize_zone(z1, z1_start_pfn, end_pfn);
> +	resize_zone(z2, end_pfn, zone_end_pfn(z2));
> +
> +	pgdat_resize_unlock(z1->zone_pgdat, &flags);
> +
> +	fix_zone_id(z1, start_pfn, end_pfn);
> +
> +	return 0;
> +out_fail:
> +	pgdat_resize_unlock(z1->zone_pgdat, &flags);
> +	return -1;
> +}
> +
> +static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
> +	unsigned long start_pfn, unsigned long end_pfn)
> +{
> +	unsigned long flags;
> +	unsigned long z2_end_pfn;
> +
> +	ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
> +
> +	pgdat_resize_lock(z1->zone_pgdat, &flags);
> +
> +	/* can't move pfns which are lower than @z1 */
> +	if (z1->zone_start_pfn > start_pfn)
> +		goto out_fail;
> +	/* the move out part mast at the right most of @z1 */
> +	if (zone_end_pfn(z1) > end_pfn)
> +		goto out_fail;
> +	/* must included/overlap */
> +	if (start_pfn >= zone_end_pfn(z1))
> +		goto out_fail;
> +
> +	/* use end_pfn for z2's end_pfn if z2 is empty */
> +	if (!zone_is_empty(z2))
> +		z2_end_pfn = zone_end_pfn(z2);
> +	else
> +		z2_end_pfn = end_pfn;
> +
> +	resize_zone(z1, z1->zone_start_pfn, start_pfn);
> +	resize_zone(z2, start_pfn, z2_end_pfn);
> +
> +	pgdat_resize_unlock(z1->zone_pgdat, &flags);
> +
> +	fix_zone_id(z2, start_pfn, end_pfn);
> +
> +	return 0;
> +out_fail:
> +	pgdat_resize_unlock(z1->zone_pgdat, &flags);
> +	return -1;
> +}
> +
> +static struct zone * __meminit move_pfn_range(int zone_shift,
> +	unsigned long start_pfn, unsigned long end_pfn)
> +{
> +	struct zone *zone = page_zone(pfn_to_page(start_pfn));
> +	int ret = 0;
> +
> +	if (zone_shift < 0)
> +		ret = move_pfn_range_left(zone + zone_shift, zone,
> +					  start_pfn, end_pfn);
> +	else if (zone_shift)
> +		ret = move_pfn_range_right(zone, zone + zone_shift,
> +					   start_pfn, end_pfn);
> +
> +	if (ret)
> +		return NULL;
> +
> +	return zone + zone_shift;
> +}
> +
> +static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
> +				      unsigned long end_pfn)
> +{
> +	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
> +
> +	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
> +		pgdat->node_start_pfn = start_pfn;
> +
> +	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
> +					pgdat->node_start_pfn;
> +}
> +
> +static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
> +{
> +	struct pglist_data *pgdat = zone->zone_pgdat;
> +	int nr_pages = PAGES_PER_SECTION;
> +	int nid = pgdat->node_id;
> +	int zone_type;
> +	unsigned long flags, pfn;
> +
> +	zone_type = zone - pgdat->node_zones;
> +	ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
> +
> +	pgdat_resize_lock(zone->zone_pgdat, &flags);
> +	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
> +	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
> +			phys_start_pfn + nr_pages);
> +	pgdat_resize_unlock(zone->zone_pgdat, &flags);
> +	memmap_init_zone(nr_pages, nid, zone_type,
> +			 phys_start_pfn, MEMMAP_HOTPLUG);
> +
> +	/* online_page_range is called later and expects pages reserved */
> +	for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) {
> +		if (!pfn_valid(pfn))
> +			continue;
> +
> +		SetPageReserved(pfn_to_page(pfn));
> +	}
> +	return 0;
> +}
> +
> +static int __meminit __add_section(int nid, struct zone *zone,
> +		unsigned long phys_start_pfn, bool want_memblock)
>  {
>  	int ret;
> -	int i;
>  
>  	if (pfn_valid(phys_start_pfn))
>  		return -EEXIST;
>  
> -	ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
> +	ret = sparse_add_one_section(zone, phys_start_pfn);
> +
>  	if (ret < 0)
>  		return ret;
>  
> -	/*
> -	 * Make all the pages reserved so that nobody will stumble over half
> -	 * initialized state.
> -	 * FIXME: We also have to associate it with a node because pfn_to_node
> -	 * relies on having page with the proper node.
> -	 */
> -	for (i = 0; i < PAGES_PER_SECTION; i++) {
> -		unsigned long pfn = phys_start_pfn + i;
> -		struct page *page;
> -		if (!pfn_valid(pfn))
> -			continue;
> +	ret = __add_zone(zone, phys_start_pfn);
>  
> -		page = pfn_to_page(pfn);
> -		set_page_node(page, nid);
> -		SetPageReserved(page);
> -	}
> +	if (ret < 0)
> +		return ret;
>  
>  	if (!want_memblock)
>  		return 0;
> @@ -285,7 +466,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
>   * call this function after deciding the zone to which to
>   * add the new pages.
>   */
> -int __ref __add_pages(int nid, unsigned long phys_start_pfn,
> +int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
>  	unsigned long nr_pages, bool want_memblock)
>  {
>  	unsigned long i;
> @@ -293,6 +474,8 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
>  	int start_sec, end_sec;
>  	struct vmem_altmap *altmap;
>  
> +	clear_zone_contiguous(zone);
> +
>  	/* during initialize mem_map, align hot-added range to section */
>  	start_sec = pfn_to_section_nr(phys_start_pfn);
>  	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
> @@ -312,7 +495,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
>  	}
>  
>  	for (i = start_sec; i <= end_sec; i++) {
> -		err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
> +		err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock);
>  
>  		/*
>  		 * EEXIST is finally dealt with by ioresource collision
> @@ -325,6 +508,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
>  	}
>  	vmemmap_populate_print_last();
>  out:
> +	set_zone_contiguous(zone);
>  	return err;
>  }
>  EXPORT_SYMBOL_GPL(__add_pages);
> @@ -699,6 +883,23 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
>  	return 0;
>  }
>  
> +#ifdef CONFIG_MOVABLE_NODE
> +/*
> + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
> + * normal memory.
> + */
> +static bool can_online_high_movable(int nid)
> +{
> +	return true;
> +}
> +#else /* CONFIG_MOVABLE_NODE */
> +/* ensure every online node has NORMAL memory */
> +static bool can_online_high_movable(int nid)
> +{
> +	return node_state(nid, N_NORMAL_MEMORY);
> +}
> +#endif /* CONFIG_MOVABLE_NODE */
> +
>  /* check which state of node_states will be changed when online memory */
>  static void node_states_check_changes_online(unsigned long nr_pages,
>  	struct zone *zone, struct memory_notify *arg)
> @@ -773,144 +974,39 @@ static void node_states_set_node(int node, struct memory_notify *arg)
>  		node_set_state(node, N_MEMORY);
>  }
>  
> -bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
> +bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
> +		   enum zone_type target, int *zone_shift)
>  {
> -	struct pglist_data *pgdat = NODE_DATA(nid);
> -	struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
> -	struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
> -
> -	/*
> -	 * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
> -	 * physically before ZONE_MOVABLE. All we need is they do not
> -	 * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
> -	 * though so let's stick with it for simplicity for now.
> -	 * TODO make sure we do not overlap with ZONE_DEVICE
> -	 */
> -	if (online_type == MMOP_ONLINE_KERNEL) {
> -		if (zone_is_empty(movable_zone))
> -			return true;
> -		return movable_zone->zone_start_pfn >= pfn + nr_pages;
> -	} else if (online_type == MMOP_ONLINE_MOVABLE) {
> -		return zone_end_pfn(default_zone) <= pfn;
> -	}
> -
> -	/* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
> -	return online_type == MMOP_ONLINE_KEEP;
> -}
> -
> -static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
> -		unsigned long nr_pages)
> -{
> -	unsigned long old_end_pfn = zone_end_pfn(zone);
> -
> -	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
> -		zone->zone_start_pfn = start_pfn;
> -
> -	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
> -}
> -
> -static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
> -		unsigned long nr_pages)
> -{
> -	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
> -
> -	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
> -		pgdat->node_start_pfn = start_pfn;
> -
> -	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
> -}
> -
> -void __ref move_pfn_range_to_zone(struct zone *zone,
> -		unsigned long start_pfn, unsigned long nr_pages)
> -{
> -	struct pglist_data *pgdat = zone->zone_pgdat;
> -	int nid = pgdat->node_id;
> -	unsigned long flags;
> -
> -	if (zone_is_empty(zone))
> -		init_currently_empty_zone(zone, start_pfn, nr_pages);
> -
> -	clear_zone_contiguous(zone);
> -
> -	/* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
> -	pgdat_resize_lock(pgdat, &flags);
> -	zone_span_writelock(zone);
> -	resize_zone_range(zone, start_pfn, nr_pages);
> -	zone_span_writeunlock(zone);
> -	resize_pgdat_range(pgdat, start_pfn, nr_pages);
> -	pgdat_resize_unlock(pgdat, &flags);
> -
> -	/*
> -	 * TODO now we have a visible range of pages which are not associated
> -	 * with their zone properly. Not nice but set_pfnblock_flags_mask
> -	 * expects the zone spans the pfn range. All the pages in the range
> -	 * are reserved so nobody should be touching them so we should be safe
> -	 */
> -	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);
> -
> -	set_zone_contiguous(zone);
> -}
> +	struct zone *zone = page_zone(pfn_to_page(pfn));
> +	enum zone_type idx = zone_idx(zone);
> +	int i;
>  
> -/*
> - * Returns a default kernel memory zone for the given pfn range.
> - * If no kernel zone covers this pfn range it will automatically go
> - * to the ZONE_NORMAL.
> - */
> -struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
> -		unsigned long nr_pages)
> -{
> -	struct pglist_data *pgdat = NODE_DATA(nid);
> -	int zid;
> +	*zone_shift = 0;
>  
> -	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
> -		struct zone *zone = &pgdat->node_zones[zid];
> +	if (idx < target) {
> +		/* pages must be at end of current zone */
> +		if (pfn + nr_pages != zone_end_pfn(zone))
> +			return false;
>  
> -		if (zone_intersects(zone, start_pfn, nr_pages))
> -			return zone;
> +		/* no zones in use between current zone and target */
> +		for (i = idx + 1; i < target; i++)
> +			if (zone_is_initialized(zone - idx + i))
> +				return false;
>  	}
>  
> -	return &pgdat->node_zones[ZONE_NORMAL];
> -}
> -
> -static inline bool movable_pfn_range(int nid, struct zone *default_zone,
> -		unsigned long start_pfn, unsigned long nr_pages)
> -{
> -	if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
> -				MMOP_ONLINE_KERNEL))
> -		return true;
> -
> -	if (!movable_node_is_enabled())
> -		return false;
> -
> -	return !zone_intersects(default_zone, start_pfn, nr_pages);
> -}
> -
> -/*
> - * Associates the given pfn range with the given node and the zone appropriate
> - * for the given online type.
> - */
> -static struct zone * __meminit move_pfn_range(int online_type, int nid,
> -		unsigned long start_pfn, unsigned long nr_pages)
> -{
> -	struct pglist_data *pgdat = NODE_DATA(nid);
> -	struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
> +	if (target < idx) {
> +		/* pages must be at beginning of current zone */
> +		if (pfn != zone->zone_start_pfn)
> +			return false;
>  
> -	if (online_type == MMOP_ONLINE_KEEP) {
> -		struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
> -		/*
> -		 * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use
> -		 * movable zone if that is not possible (e.g. we are within
> -		 * or past the existing movable zone). movable_node overrides
> -		 * this default and defaults to movable zone
> -		 */
> -		if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
> -			zone = movable_zone;
> -	} else if (online_type == MMOP_ONLINE_MOVABLE) {
> -		zone = &pgdat->node_zones[ZONE_MOVABLE];
> +		/* no zones in use between current zone and target */
> +		for (i = target + 1; i < idx; i++)
> +			if (zone_is_initialized(zone - idx + i))
> +				return false;
>  	}
>  
> -	move_pfn_range_to_zone(zone, start_pfn, nr_pages);
> -	return zone;
> +	*zone_shift = target - idx;
> +	return true;
>  }
>  
>  /* Must be protected by mem_hotplug_begin() */
> @@ -923,18 +1019,38 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
>  	int nid;
>  	int ret;
>  	struct memory_notify arg;
> +	int zone_shift = 0;
>  
> -	nid = pfn_to_nid(pfn);
> -	if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
> +	/*
> +	 * This doesn't need a lock to do pfn_to_page().
> +	 * The section can't be removed here because of the
> +	 * memory_block->state_mutex.
> +	 */
> +	zone = page_zone(pfn_to_page(pfn));
> +
> +	if ((zone_idx(zone) > ZONE_NORMAL ||
> +	    online_type == MMOP_ONLINE_MOVABLE) &&
> +	    !can_online_high_movable(pfn_to_nid(pfn)))
>  		return -EINVAL;
>  
> -	/* associate pfn range with the zone */
> -	zone = move_pfn_range(online_type, nid, pfn, nr_pages);
> +	if (online_type == MMOP_ONLINE_KERNEL) {
> +		if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
> +			return -EINVAL;
> +	} else if (online_type == MMOP_ONLINE_MOVABLE) {
> +		if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
> +			return -EINVAL;
> +	}
> +
> +	zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
> +	if (!zone)
> +		return -EINVAL;
>  
>  	arg.start_pfn = pfn;
>  	arg.nr_pages = nr_pages;
>  	node_states_check_changes_online(nr_pages, zone, &arg);
>  
> +	nid = zone_to_nid(zone);
> +
>  	ret = memory_notify(MEM_GOING_ONLINE, &arg);
>  	ret = notifier_to_errno(ret);
>  	if (ret)
> @@ -1129,6 +1245,39 @@ static int check_hotplug_memory_range(u64 start, u64 size)
>  	return 0;
>  }
>  
> +/*
> + * If movable zone has already been setup, newly added memory should be check.
> + * If its address is higher than movable zone, it should be added as movable.
> + * Without this check, movable zone may overlap with other zone.
> + */
> +static int should_add_memory_movable(int nid, u64 start, u64 size)
> +{
> +	unsigned long start_pfn = start >> PAGE_SHIFT;
> +	pg_data_t *pgdat = NODE_DATA(nid);
> +	struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE;
> +
> +	if (zone_is_empty(movable_zone))
> +		return 0;
> +
> +	if (movable_zone->zone_start_pfn <= start_pfn)
> +		return 1;
> +
> +	return 0;
> +}
> +
> +int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
> +		bool for_device)
> +{
> +#ifdef CONFIG_ZONE_DEVICE
> +	if (for_device)
> +		return ZONE_DEVICE;
> +#endif
> +	if (should_add_memory_movable(nid, start, size))
> +		return ZONE_MOVABLE;
> +
> +	return zone_default;
> +}
> +
>  static int online_memory_block(struct memory_block *mem, void *arg)
>  {
>  	return device_online(&mem->dev);
> diff --git a/mm/sparse.c b/mm/sparse.c
> index 9c48e4f..293cb5d 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -776,9 +776,10 @@ static void free_map_bootmem(struct page *memmap)
>   * set. If this is <=0, then that means that the passed-in
>   * map was not consumed and must be freed.
>   */
> -int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn)
> +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
>  {
>  	unsigned long section_nr = pfn_to_section_nr(start_pfn);
> +	struct pglist_data *pgdat = zone->zone_pgdat;
>  	struct mem_section *ms;
>  	struct page *memmap;
>  	unsigned long *usemap;