Message ID | 4C3B3895.3040209@austin.ibm.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
On Mon, 12 Jul 2010 10:45:25 -0500 Nathan Fontenot <nfont@austin.ibm.com> wrote: > This patch introduces the new 'split' file in each memory sysfs > directory and the associated routines needed to handle splitting > a directory. > > Signed-off-by; Nathan Fontenot <nfont@austin.ibm.com> > --- pleae check diff option... > drivers/base/memory.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 98 insertions(+), 1 deletion(-) > > Index: linux-2.6/drivers/base/memory.c > =================================================================== > --- linux-2.6.orig/drivers/base/memory.c 2010-07-09 14:23:20.000000000 -0500 > +++ linux-2.6/drivers/base/memory.c 2010-07-09 14:38:09.000000000 -0500 > @@ -32,6 +32,9 @@ > > static int sections_per_block; > > +static int register_memory(struct memory_block *, struct mem_section *, > + int, enum mem_add_context); > + > static inline int base_memory_block_id(int section_nr) > { > return (section_nr / sections_per_block) * sections_per_block; > @@ -309,11 +312,100 @@ > return sprintf(buf, "%d\n", mem->phys_device); > } > > +static void update_memory_block_phys_indexes(struct memory_block *mem) > +{ > + struct list_head *pos; > + struct memory_block_section *mbs; > + unsigned long min_index = 0xffffffff; > + unsigned long max_index = 0; > + > + list_for_each(pos, &mem->sections) { > + mbs = list_entry(pos, struct memory_block_section, next); > + > + if (mbs->phys_index < min_index) > + min_index = mbs->phys_index; > + > + if (mbs->phys_index > max_index) > + max_index = mbs->phys_index; > + } > + > + mem->start_phys_index = min_index; > + mem->end_phys_index = max_index; > +} > + > +static ssize_t > +store_mem_split_block(struct sys_device *dev, struct sysdev_attribute *attr, > + const char *buf, size_t count) > +{ > + struct memory_block *mem, *new_mem_blk; > + struct memory_block_section *mbs; > + struct list_head *pos, *tmp; > + struct mem_section *section; > + int min_scn_nr = 0; > + int max_scn_nr = 0; > + int total_scns = 0; > + int new_blk_min, new_blk_total; > + int ret = -EINVAL; > + > + mem = container_of(dev, struct memory_block, sysdev); > + > + if (list_is_singular(&mem->sections)) > + return -EINVAL; What this means ? > + > + mutex_lock(&mem->state_mutex); > + > + list_for_each(pos, &mem->sections) { > + mbs = list_entry(pos, struct memory_block_section, next); > + > + total_scns++; > + > + if (min_scn_nr > mbs->phys_index) > + min_scn_nr = mbs->phys_index; > + > + if (max_scn_nr < mbs->phys_index) > + max_scn_nr = mbs->phys_index; > + } > + > + new_mem_blk = kzalloc(sizeof(*new_mem_blk), GFP_KERNEL); > + if (!new_mem_blk) > + return -ENOMEM; > + > + mutex_init(&new_mem_blk->state_mutex); > + INIT_LIST_HEAD(&new_mem_blk->sections); > + new_mem_blk->state = mem->state; > + > + mutex_lock(&new_mem_blk->state_mutex); > + > + new_blk_total = total_scns / 2; > + new_blk_min = max_scn_nr - new_blk_total + 1; > + > + section = __nr_to_section(new_blk_min); > + ret = register_memory(new_mem_blk, section, 0, HOTPLUG); > + 'nid' is always 0 ? And for what purpose this interface is ? Does this split memory block into 2 pieces of the same size ?? sounds __very__ strange interface to me. If this is necessary, I hope move the whole things to configfs rather than something tricky. Bye. -Kame
On 07/13/2010 01:28 AM, KAMEZAWA Hiroyuki wrote: > On Mon, 12 Jul 2010 10:45:25 -0500 > Nathan Fontenot <nfont@austin.ibm.com> wrote: > >> This patch introduces the new 'split' file in each memory sysfs >> directory and the associated routines needed to handle splitting >> a directory. >> >> Signed-off-by; Nathan Fontenot <nfont@austin.ibm.com> >> --- > > pleae check diff option... > > >> drivers/base/memory.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++- >> 1 file changed, 98 insertions(+), 1 deletion(-) >> >> Index: linux-2.6/drivers/base/memory.c >> =================================================================== >> --- linux-2.6.orig/drivers/base/memory.c 2010-07-09 14:23:20.000000000 -0500 >> +++ linux-2.6/drivers/base/memory.c 2010-07-09 14:38:09.000000000 -0500 >> @@ -32,6 +32,9 @@ >> >> static int sections_per_block; >> >> +static int register_memory(struct memory_block *, struct mem_section *, >> + int, enum mem_add_context); >> + >> static inline int base_memory_block_id(int section_nr) >> { >> return (section_nr / sections_per_block) * sections_per_block; >> @@ -309,11 +312,100 @@ >> return sprintf(buf, "%d\n", mem->phys_device); >> } >> >> +static void update_memory_block_phys_indexes(struct memory_block *mem) >> +{ >> + struct list_head *pos; >> + struct memory_block_section *mbs; >> + unsigned long min_index = 0xffffffff; >> + unsigned long max_index = 0; >> + >> + list_for_each(pos, &mem->sections) { >> + mbs = list_entry(pos, struct memory_block_section, next); >> + >> + if (mbs->phys_index < min_index) >> + min_index = mbs->phys_index; >> + >> + if (mbs->phys_index > max_index) >> + max_index = mbs->phys_index; >> + } >> + >> + mem->start_phys_index = min_index; >> + mem->end_phys_index = max_index; >> +} >> + >> +static ssize_t >> +store_mem_split_block(struct sys_device *dev, struct sysdev_attribute *attr, >> + const char *buf, size_t count) >> +{ >> + struct memory_block *mem, *new_mem_blk; >> + struct memory_block_section *mbs; >> + struct list_head *pos, *tmp; >> + struct mem_section *section; >> + int min_scn_nr = 0; >> + int max_scn_nr = 0; >> + int total_scns = 0; >> + int new_blk_min, new_blk_total; >> + int ret = -EINVAL; >> + >> + mem = container_of(dev, struct memory_block, sysdev); >> + >> + if (list_is_singular(&mem->sections)) >> + return -EINVAL; > > What this means ? list_is_singular() will return true if there is only one item on the list. In this case we cannot split a memory_block with only one memory_block_section. > > >> + >> + mutex_lock(&mem->state_mutex); >> + >> + list_for_each(pos, &mem->sections) { >> + mbs = list_entry(pos, struct memory_block_section, next); >> + >> + total_scns++; >> + >> + if (min_scn_nr > mbs->phys_index) >> + min_scn_nr = mbs->phys_index; >> + >> + if (max_scn_nr < mbs->phys_index) >> + max_scn_nr = mbs->phys_index; >> + } >> + >> + new_mem_blk = kzalloc(sizeof(*new_mem_blk), GFP_KERNEL); >> + if (!new_mem_blk) >> + return -ENOMEM; >> + >> + mutex_init(&new_mem_blk->state_mutex); >> + INIT_LIST_HEAD(&new_mem_blk->sections); >> + new_mem_blk->state = mem->state; >> + >> + mutex_lock(&new_mem_blk->state_mutex); >> + >> + new_blk_total = total_scns / 2; >> + new_blk_min = max_scn_nr - new_blk_total + 1; >> + >> + section = __nr_to_section(new_blk_min); >> + ret = register_memory(new_mem_blk, section, 0, HOTPLUG); >> + > 'nid' is always 0 ? Ahh.. good catch. it may not be. I'll look into finding the correct nid. > > And for what purpose this interface is ? Does this split memory block into 2 pieces > of the same size ?? sounds __very__ strange interface to me. Yes, this splits the memory_block into two blocks of the same size. This was suggested as something we may want to do. From ppc perspective I am not sure we would use this. The split functionality is not required. The main goal of the patch set is to reduce the number of memory sysfs directories created. From a ppc perspective the split functionality is not really needed. > > If this is necessary, I hope move the whole things to configfs rather than > something tricky. > > Bye. > -Kame >
On Tue, 13 Jul 2010 10:51:58 -0500 Nathan Fontenot <nfont@austin.ibm.com> wrote: > > > > And for what purpose this interface is ? Does this split memory block into 2 pieces > > of the same size ?? sounds __very__ strange interface to me. > > Yes, this splits the memory_block into two blocks of the same size. This was > suggested as something we may want to do. From ppc perspective I am not sure we > would use this. > > The split functionality is not required. The main goal of the patch set is to > reduce the number of memory sysfs directories created. From a ppc perspective > the split functionality is not really needed. > Okay, this is an offer from me. 1. I think you can add an boot option as "don't create memory sysfs". please do. 2. I'd like to write a configfs module for handling memory hotplug even when sysfs directroy is not created. Because configfs support rmdir/mkdir, the user (ppc's daemon?) has to do When offlining section X. # insmod configfs_memory.ko # mount -t configfs none /configfs # mkdir /configfs/memoryX # echo offline > /configfs/memoryX/state # rmdir /configfs/memoryX And making this operation as the default bahavior for all arch's memory hotplug may be better... Dave, how do you think ? Because ppc guys uses "probe" interface already, this can be handled... no ? One problem is that I don't have enough knowledge about configfs..it seems complex. Thanks, -Kame
On 07/13/2010 07:35 PM, KAMEZAWA Hiroyuki wrote: > On Tue, 13 Jul 2010 10:51:58 -0500 > Nathan Fontenot <nfont@austin.ibm.com> wrote: > >>> >>> And for what purpose this interface is ? Does this split memory block into 2 pieces >>> of the same size ?? sounds __very__ strange interface to me. >> >> Yes, this splits the memory_block into two blocks of the same size. This was >> suggested as something we may want to do. From ppc perspective I am not sure we >> would use this. >> >> The split functionality is not required. The main goal of the patch set is to >> reduce the number of memory sysfs directories created. From a ppc perspective >> the split functionality is not really needed. >> > > Okay, this is an offer from me. > > 1. I think you can add an boot option as "don't create memory sysfs". > please do. I posted a patch to do that a week or so ago, it didn't go over very well. > > 2. I'd like to write a configfs module for handling memory hotplug even when > sysfs directroy is not created. > Because configfs support rmdir/mkdir, the user (ppc's daemon?) has to do > > When offlining section X. > # insmod configfs_memory.ko > # mount -t configfs none /configfs > # mkdir /configfs/memoryX > # echo offline > /configfs/memoryX/state > # rmdir /configfs/memoryX > > And making this operation as the default bahavior for all arch's memory hotplug may > be better... > > Dave, how do you think ? Because ppc guys uses "probe" interface already, > this can be handled... no ? ppc would still require the existance of the 'probe' interface. Are you objecting to the 'split' functionality? If so I do not see any reason from ppc perspective that it is needed. This was something Dave suggested, unless I am missing something. Since ppc needs the 'probe' interface in sysfs, and for ppc having mutliple memory_block_sections reside under a single memory_block makes memory hotplug simpler. On ppc we do emory hotplug operations on an LMB size basis. With my patches this now lets us set each memory_block to span an LMB's worth of memory. Now we could do emory hotplug in a single operation instead of multiple operations to offline/online all of the memory sections in an LMB. Of course the easy solution would be to increase SECTION_SIZE_BITS, but we need support hardware that can have LMB's from 16 MB to 256 MB in size so the SECTION_SIZE_BITS value has to remain small. > > One problem is that I don't have enough knowledge about configfs..it seems complex. Me neither, thoug I will take a look at it. -Nathan
On Tue, 13 Jul 2010 22:18:03 -0500 Nathan Fontenot <nfont@austin.ibm.com> wrote: > On 07/13/2010 07:35 PM, KAMEZAWA Hiroyuki wrote: > > On Tue, 13 Jul 2010 10:51:58 -0500 > > Nathan Fontenot <nfont@austin.ibm.com> wrote: > > > >>> > >>> And for what purpose this interface is ? Does this split memory block into 2 pieces > >>> of the same size ?? sounds __very__ strange interface to me. > >> > >> Yes, this splits the memory_block into two blocks of the same size. This was > >> suggested as something we may want to do. From ppc perspective I am not sure we > >> would use this. > >> > >> The split functionality is not required. The main goal of the patch set is to > >> reduce the number of memory sysfs directories created. From a ppc perspective > >> the split functionality is not really needed. > >> > > > > Okay, this is an offer from me. > > > > 1. I think you can add an boot option as "don't create memory sysfs". > > please do. > > I posted a patch to do that a week or so ago, it didn't go over very well. > > > > > 2. I'd like to write a configfs module for handling memory hotplug even when > > sysfs directroy is not created. > > Because configfs support rmdir/mkdir, the user (ppc's daemon?) has to do > > > > When offlining section X. > > # insmod configfs_memory.ko > > # mount -t configfs none /configfs > > # mkdir /configfs/memoryX > > # echo offline > /configfs/memoryX/state > > # rmdir /configfs/memoryX > > > > And making this operation as the default bahavior for all arch's memory hotplug may > > be better... > > > > Dave, how do you think ? Because ppc guys uses "probe" interface already, > > this can be handled... no ? > > ppc would still require the existance of the 'probe' interface. > > Are you objecting to the 'split' functionality? yes. > If so I do not see any reason from ppc > perspective that it is needed. This was something Dave suggested, unless I am missing > something. > > Since ppc needs the 'probe' interface in sysfs, and for ppc having mutliple > memory_block_sections reside under a single memory_block makes memory hotplug > simpler. On ppc we do emory hotplug operations on an LMB size basis. With my > patches this now lets us set each memory_block to span an LMB's worth of > memory. Now we could do emory hotplug in a single operation instead of multiple > operations to offline/online all of the memory sections in an LMB. > Why per-section memory offlining is provided is for allowing good success-rate of memory offlining. Because memory-hotplug has to "migrate or free" all used page under a section, possibility of memory unplug depends on usage of memory. If a section contains unmovable page(kernel page), we can't offline sectin. For example, comparing 1. offlining 128MB of memory at once 2. offlining 8 chunks of 16MB memory "2" can get very good possibility and system-busy time can be much reduced. IIUC, ppc's 1st requirement is "resizing" not "hot-removing some memory device", "2" is much welcomed. So, some fine-grained interface to section_size is appreciated. So, "multiple operations" is much better than single operation. As I posted show/hide patch, I'm writing it in configfs. I think it meets IBM's requirements. _But_, it's IBM's issue not Fujitsu's. So, final decistion will depend on you guys. Anyway, I don't like a too fancy interface as "split". Thanks, -Kame
On Wed, 2010-07-14 at 09:35 +0900, KAMEZAWA Hiroyuki wrote: > 2. I'd like to write a configfs module for handling memory hotplug even when > sysfs directroy is not created. > Because configfs support rmdir/mkdir, the user (ppc's daemon?) has to do > > When offlining section X. > # insmod configfs_memory.ko > # mount -t configfs none /configfs > # mkdir /configfs/memoryX > # echo offline > /configfs/memoryX/state > # rmdir /configfs/memoryX > > And making this operation as the default bahavior for all arch's memory hotplug may > be better... > > Dave, how do you think ? Because ppc guys uses "probe" interface already, > this can be handled... no ? I think creating a interface to duplicate the existing sysfs one is a bad idea. I also think removing the existing sysfs one isn't feasible since there are users, and it's truly part of the ABI. So, I'm not really a fan on the configfs interface. :( I really do think the sysfs interface is fixable. We should at least give it a good shot before largely duplicating its functionality. -- Dave
On 07/13/2010 10:26 PM, Dave Hansen wrote: > On Wed, 2010-07-14 at 09:35 +0900, KAMEZAWA Hiroyuki wrote: >> 2. I'd like to write a configfs module for handling memory hotplug even when >> sysfs directroy is not created. >> Because configfs support rmdir/mkdir, the user (ppc's daemon?) has to do >> >> When offlining section X. >> # insmod configfs_memory.ko >> # mount -t configfs none /configfs >> # mkdir /configfs/memoryX >> # echo offline > /configfs/memoryX/state >> # rmdir /configfs/memoryX >> >> And making this operation as the default bahavior for all arch's memory hotplug may >> be better... >> >> Dave, how do you think ? Because ppc guys uses "probe" interface already, >> this can be handled... no ? > > I think creating a interface to duplicate the existing sysfs one is a > bad idea. I also think removing the existing sysfs one isn't feasible > since there are users, and it's truly part of the ABI. So, I'm not > really a fan on the configfs interface. :( > > I really do think the sysfs interface is fixable. We should at least > give it a good shot before largely duplicating its functionality. I agree with Dave, I don't think another memory hotplug interface is needed. I am working to update the patchset to remove the split functionality and fix other items commented on. this new patch will just split the memory_block structure so that a memory_block can span multiple memory sections. Kame, I understand that offlining 16 MB is easier than 256 MB. From the ppc perspective though, we are still offlining 256 MB. We do memory add/remove on LMB size chunks, so we have to add/remove all of the memory sections contained in an LMB. If any one memory section covered by a LMB fails to add/remove, we restore the memory sections to their orignal state an fail the add/remove operation. NOTE: the code doing this is not in the kernel, but in the user-space drmgr command (from powerpc-utils package). -Nathan
Index: linux-2.6/drivers/base/memory.c =================================================================== --- linux-2.6.orig/drivers/base/memory.c 2010-07-09 14:23:20.000000000 -0500 +++ linux-2.6/drivers/base/memory.c 2010-07-09 14:38:09.000000000 -0500 @@ -32,6 +32,9 @@ static int sections_per_block; +static int register_memory(struct memory_block *, struct mem_section *, + int, enum mem_add_context); + static inline int base_memory_block_id(int section_nr) { return (section_nr / sections_per_block) * sections_per_block; @@ -309,11 +312,100 @@ return sprintf(buf, "%d\n", mem->phys_device); } +static void update_memory_block_phys_indexes(struct memory_block *mem) +{ + struct list_head *pos; + struct memory_block_section *mbs; + unsigned long min_index = 0xffffffff; + unsigned long max_index = 0; + + list_for_each(pos, &mem->sections) { + mbs = list_entry(pos, struct memory_block_section, next); + + if (mbs->phys_index < min_index) + min_index = mbs->phys_index; + + if (mbs->phys_index > max_index) + max_index = mbs->phys_index; + } + + mem->start_phys_index = min_index; + mem->end_phys_index = max_index; +} + +static ssize_t +store_mem_split_block(struct sys_device *dev, struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + struct memory_block *mem, *new_mem_blk; + struct memory_block_section *mbs; + struct list_head *pos, *tmp; + struct mem_section *section; + int min_scn_nr = 0; + int max_scn_nr = 0; + int total_scns = 0; + int new_blk_min, new_blk_total; + int ret = -EINVAL; + + mem = container_of(dev, struct memory_block, sysdev); + + if (list_is_singular(&mem->sections)) + return -EINVAL; + + mutex_lock(&mem->state_mutex); + + list_for_each(pos, &mem->sections) { + mbs = list_entry(pos, struct memory_block_section, next); + + total_scns++; + + if (min_scn_nr > mbs->phys_index) + min_scn_nr = mbs->phys_index; + + if (max_scn_nr < mbs->phys_index) + max_scn_nr = mbs->phys_index; + } + + new_mem_blk = kzalloc(sizeof(*new_mem_blk), GFP_KERNEL); + if (!new_mem_blk) + return -ENOMEM; + + mutex_init(&new_mem_blk->state_mutex); + INIT_LIST_HEAD(&new_mem_blk->sections); + new_mem_blk->state = mem->state; + + mutex_lock(&new_mem_blk->state_mutex); + + new_blk_total = total_scns / 2; + new_blk_min = max_scn_nr - new_blk_total + 1; + + section = __nr_to_section(new_blk_min); + ret = register_memory(new_mem_blk, section, 0, HOTPLUG); + + list_for_each_safe(pos, tmp, &mem->sections) { + unsigned long scn_nr; + + mbs = list_entry(pos, struct memory_block_section, next); + scn_nr = mbs->phys_index; + + if ((scn_nr >= new_blk_min) && (scn_nr <= max_scn_nr)) + list_move(&mbs->next, &new_mem_blk->sections); + } + + update_memory_block_phys_indexes(mem); + update_memory_block_phys_indexes(new_mem_blk); + + mutex_unlock(&new_mem_blk->state_mutex); + mutex_unlock(&mem->state_mutex); + return count; +} + static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL); static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL); static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL); static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL); +static SYSDEV_ATTR(split, 0200, NULL, store_mem_split_block); #define mem_create_simple_file(mem, attr_name) \ sysdev_create_file(&mem->sysdev, &attr_##attr_name) @@ -343,6 +435,8 @@ ret = mem_create_simple_file(memory, phys_device); if (!ret) ret = mem_create_simple_file(memory, removable); + if (!ret) + ret = mem_create_simple_file(memory, split); if (!ret) { if (context == HOTPLUG) ret = register_mem_sect_under_node(memory, nid); @@ -361,6 +455,7 @@ mem_remove_simple_file(memory, state); mem_remove_simple_file(memory, phys_device); mem_remove_simple_file(memory, removable); + mem_remove_simple_file(memory, split); /* drop the ref. we got in remove_memory_block() */ kobject_put(&memory->sysdev.kobj); @@ -568,8 +663,10 @@ kobject_put(&mem->sysdev.kobj); } - if (!ret) + if (!ret) { ret = add_mem_block_section(mem, __section_nr(section), state); + update_memory_block_phys_indexes(mem); + } return ret; }