Message ID | 55887781.2040709@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Michael Ellerman |
Headers | show |
On Mon, 2015-22-06 at 21:00:49 UTC, Nathan Fontenot wrote: > Add the ability to dlpar remove CPUs via hotplug rtas events, either by > specifying the drc-index of the CPU to remove or providing a count of cpus > to remove. > > To accomplish we create a list of possible dr cpus and their drc indexes What does "dr" mean? I know but not everyone does. If it's an acronymn it should be uppercase and spelt out at its first usage. > diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c > index 7890b2f..49b7196 100644 > --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c > +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c > @@ -506,6 +507,207 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) > return rc; > } > > +static struct device_node *cpu_drc_index_to_dn(struct device_node *parent, > + u32 drc_index) > +{ > + struct device_node *dn; > + u32 my_index; > + int rc; > + > + for_each_child_of_node(parent, dn) { > + if (of_node_cmp(dn->type, "cpu") != 0) > + continue; parent is always "/cpus", so I wonder if this should just be: for_each_node_by_type(dn, "cpu") { And then you don't need parent in here at all. Or do we realistically expect to be looking for cpus under a node other than "/cpus" ? > + rc = of_property_read_u32(dn, "ibm,my-drc-index", &my_index); > + if (rc) > + continue; > + > + if (my_index == drc_index) > + break; > + } > + > + return dn; > +} > + > +static int dlpar_cpu_remove_by_index(struct device_node *parent, > + u32 drc_index) > +{ > + struct device_node *dn; > + int rc; > + > + dn = cpu_drc_index_to_dn(parent, drc_index); > + if (!dn) > + return -ENODEV; > + > + rc = dlpar_cpu_remove(dn, drc_index); > + of_node_put(dn); > + return rc; > +} > + > +static int dlpar_cpus_possible(struct device_node *parent) > +{ > + int dr_cpus_possible; > + > + /* The first u32 in the ibm,drc-indexes property is the numnber > + * of drc entries in the property, which is the possible number > + * number of dr capable cpus. > + */ > + of_property_read_u32(parent, "ibm,drc-indexes", &dr_cpus_possible); > + return dr_cpus_possible; > +} > + > +struct dr_cpu { > + u32 drc_index; > + bool present; > + bool modified; > +}; > + > +static struct dr_cpu *get_dlpar_cpus(struct device_node *parent) > +{ > + struct device_node *dn; > + struct property *prop; > + struct dr_cpu *dr_cpus; > + const __be32 *p; > + u32 drc_index; > + int dr_cpus_possible, index; > + bool first; > + > + dr_cpus_possible = dlpar_cpus_possible(parent); > + dr_cpus = kcalloc(dr_cpus_possible, sizeof(*dr_cpus), GFP_KERNEL); > + if (!dr_cpus) > + return NULL; > + > + first = true; > + index = 0; > + of_property_for_each_u32(parent, "ibm,drc-indexes", prop, p, > + drc_index) { > + if (first) { What about: if (index == 0) { Then you don't need first at all? > + /* The first entry is the number of drc indexes in > + * the property, skip it. > + */ > + first = false; > + continue; > + } > + > + dr_cpus[index].drc_index = drc_index; > + > + dn = cpu_drc_index_to_dn(parent, drc_index); The outer loop is iterating over all drc indexes (ie. one per cpu usually?), and then cpu_drc_index_to_dn() will iterate over all cpus. So that's n^2 for n = nr_cpus which is not ideal. I realise we can't do much better, but what we can do is limit the number of iterations of the outer loop. Because usually we will be here because we want to offline less than nr_cpus cpus. ie. if I ask to offline 1 cpu on a 4096 cpu system we will still do up to 16M iterations in here, when all we needed to do was one iteration of the outer loop. So I think this routine should take the number of cpus we're trying to remove and only iterate until it's found that many. > + if (dn) { > + dr_cpus[index].present = true; > + of_node_put(dn); > + } Why do we put non present ones in the dr_cpus array at all? It just means you have to skip them later? (in two separate places) > + > + index++; > + } > + > + return dr_cpus; > +} > + > +static int dlpar_cpu_remove_by_count(struct device_node *parent, > + u32 cpus_to_remove) > +{ > + struct dr_cpu *dr_cpus; > + int dr_cpus_removed = 0; > + int dr_cpus_present = 0; > + int dr_cpus_possible; > + int i, rc; > + > + pr_info("Attempting to hot-remove %d CPUs\n", cpus_to_remove); > + > + dr_cpus = get_dlpar_cpus(parent); So I think this should be: > + dr_cpus = get_dlpar_cpus(parent, cpus_to_remove); > + if (!dr_cpus) { > + pr_info("Could not gather dr CPU info\n"); > + return -EINVAL; > + } And get_dlpar_cpus() should return cpus_to_remove worth of present cpus, or it should error, meaning the below then won't be needed: > + dr_cpus_possible = dlpar_cpus_possible(parent); > + > + for (i = 0; i < dr_cpus_possible; i++) { > + if (dr_cpus[i].present) > + dr_cpus_present++; > + } > + > + /* Validate the available CPUs to remove. > + * NOTE: we can't remove the last CPU. > + */ > + if (cpus_to_remove >= dr_cpus_present) { > + pr_err("Insufficient CPUs (%d) to satisfy remove request\n", > + dr_cpus_present); > + kfree(dr_cpus); > + return -EINVAL; > + } > + Having only present cpus in dr_cpus should also simplify this loop because you don't need to skip non-present ones: > + for (i = 0; i < dr_cpus_possible; i++) { > + if (dr_cpus_removed == cpus_to_remove) > + break; > + > + if (!dr_cpus[i].present) > + continue; > + > + rc = dlpar_cpu_remove_by_index(parent, dr_cpus[i].drc_index); > + if (!rc) { > + dr_cpus_removed++; > + dr_cpus[i].modified = true; > + } else .. ? Surely you should bail on the first error? Definitely you should, because you're going to undo everything below anyway: > + } > + > + if (dr_cpus_removed != cpus_to_remove) { > + pr_info("CPU hot-remove failed, adding any removed CPUs\n"); > + > + for (i = 0; i < dr_cpus_possible; i++) { > + if (!dr_cpus[i].modified) > + continue; And if you bail on the first error above you shouldn't need modified, instead you just iterate in reverse from i using a new counter, eg. j. > + > + rc = dlpar_cpu_add(parent, dr_cpus[i].drc_index); > + if (rc) > + pr_info("Failed to re-add CPU (%x)\n", > + dr_cpus[i].drc_index); > + } > + > + rc = -EINVAL; > + } else { > + rc = 0; > + } > + > + kfree(dr_cpus); > + return rc; > +} > + > +int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) > +{ > + struct device_node *parent; > + u32 count, drc_index; > + int rc; > + > + count = hp_elog->_drc_u.drc_count; > + drc_index = hp_elog->_drc_u.drc_index; Those both need be32_to_cpu(). arch/powerpc/platforms/pseries/hotplug-cpu.c:750:15: warning: incorrect type in assignment (different base types) arch/powerpc/platforms/pseries/hotplug-cpu.c:750:15: expected unsigned int [unsigned] [usertype] count arch/powerpc/platforms/pseries/hotplug-cpu.c:750:15: got restricted __be32 [usertype] drc_count arch/powerpc/platforms/pseries/hotplug-cpu.c:751:19: warning: incorrect type in assignment (different base types) arch/powerpc/platforms/pseries/hotplug-cpu.c:751:19: expected unsigned int [unsigned] [usertype] drc_index arch/powerpc/platforms/pseries/hotplug-cpu.c:751:19: got restricted __be32 [usertype] drc_index Though looking closer I don't see where we ever pass or receive a pseries_hp_errorlog to or from firmware? So I'm a bit confused why we're bothering with the __be32 shenanigans. Hopefully I've just missed a detail somewhere. > + parent = of_find_node_by_path("/cpus"); > + if (!parent) > + return -ENODEV; > + > + lock_device_hotplug(); > + > + switch (hp_elog->action) { > + case PSERIES_HP_ELOG_ACTION_REMOVE: > + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) > + rc = dlpar_cpu_remove_by_count(parent, count); > + else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) > + rc = dlpar_cpu_remove_by_index(parent, drc_index); > + else > + rc = -EINVAL; > + break; > + default: > + pr_err("Invalid action (%d) specified\n", hp_elog->action); > + rc = -EINVAL; > + break; > + } > + > + unlock_device_hotplug(); > + of_node_put(parent); > + return rc; > +} > + > #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE > > static ssize_t dlpar_cpu_probe(const char *buf, size_t count) > diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h > index 8411c27..58892f1 100644 > --- a/arch/powerpc/platforms/pseries/pseries.h > +++ b/arch/powerpc/platforms/pseries/pseries.h > @@ -66,11 +66,16 @@ extern int dlpar_release_drc(u32 drc_index); > > #ifdef CONFIG_MEMORY_HOTPLUG > int dlpar_memory(struct pseries_hp_errorlog *hp_elog); > +int dlpar_cpu(struct pseries_hp_errorlog *hp_elog); I don't think this should be under CONFIG_MEMORY_HOTPLUG ? cheers
On Mon, 2015-22-06 at 21:00:49 UTC, Nathan Fontenot wrote: > Add the ability to dlpar remove CPUs via hotplug rtas events, either by > specifying the drc-index of the CPU to remove or providing a count of cpus > to remove. > > To accomplish we create a list of possible dr cpus and their drc indexes What does "dr" mean? I know but not everyone does. If it's an acronymn it should be uppercase and spelt out at its first usage. > diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c > index 7890b2f..49b7196 100644 > --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c > +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c > @@ -506,6 +507,207 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) > return rc; > } > > +static struct device_node *cpu_drc_index_to_dn(struct device_node *parent, > + u32 drc_index) > +{ > + struct device_node *dn; > + u32 my_index; > + int rc; > + > + for_each_child_of_node(parent, dn) { > + if (of_node_cmp(dn->type, "cpu") != 0) > + continue; parent is always "/cpus", so I wonder if this should just be: for_each_node_by_type(dn, "cpu") { And then you don't need parent in here at all. Or do we realistically expect to be looking for cpus under a node other than "/cpus" ? > + rc = of_property_read_u32(dn, "ibm,my-drc-index", &my_index); > + if (rc) > + continue; > + > + if (my_index == drc_index) > + break; > + } > + > + return dn; > +} > + > +static int dlpar_cpu_remove_by_index(struct device_node *parent, > + u32 drc_index) > +{ > + struct device_node *dn; > + int rc; > + > + dn = cpu_drc_index_to_dn(parent, drc_index); > + if (!dn) > + return -ENODEV; > + > + rc = dlpar_cpu_remove(dn, drc_index); > + of_node_put(dn); > + return rc; > +} > + > +static int dlpar_cpus_possible(struct device_node *parent) > +{ > + int dr_cpus_possible; > + > + /* The first u32 in the ibm,drc-indexes property is the numnber > + * of drc entries in the property, which is the possible number > + * number of dr capable cpus. > + */ > + of_property_read_u32(parent, "ibm,drc-indexes", &dr_cpus_possible); > + return dr_cpus_possible; > +} > + > +struct dr_cpu { > + u32 drc_index; > + bool present; > + bool modified; > +}; > + > +static struct dr_cpu *get_dlpar_cpus(struct device_node *parent) > +{ > + struct device_node *dn; > + struct property *prop; > + struct dr_cpu *dr_cpus; > + const __be32 *p; > + u32 drc_index; > + int dr_cpus_possible, index; > + bool first; > + > + dr_cpus_possible = dlpar_cpus_possible(parent); > + dr_cpus = kcalloc(dr_cpus_possible, sizeof(*dr_cpus), GFP_KERNEL); > + if (!dr_cpus) > + return NULL; > + > + first = true; > + index = 0; > + of_property_for_each_u32(parent, "ibm,drc-indexes", prop, p, > + drc_index) { > + if (first) { What about: if (index == 0) { Then you don't need first at all? > + /* The first entry is the number of drc indexes in > + * the property, skip it. > + */ > + first = false; > + continue; > + } > + > + dr_cpus[index].drc_index = drc_index; > + > + dn = cpu_drc_index_to_dn(parent, drc_index); The outer loop is iterating over all drc indexes (ie. one per cpu usually?), and then cpu_drc_index_to_dn() will iterate over all cpus. So that's n^2 for n = nr_cpus which is not ideal. I realise we can't do much better, but what we can do is limit the number of iterations of the outer loop. Because usually we will be here because we want to offline less than nr_cpus cpus. ie. if I ask to offline 1 cpu on a 4096 cpu system we will still do up to 16M iterations in here, when all we needed to do was one iteration of the outer loop. So I think this routine should take the number of cpus we're trying to remove and only iterate until it's found that many. > + if (dn) { > + dr_cpus[index].present = true; > + of_node_put(dn); > + } Why do we put non present ones in the dr_cpus array at all? It just means you have to skip them later? (in two separate places) > + > + index++; > + } > + > + return dr_cpus; > +} > + > +static int dlpar_cpu_remove_by_count(struct device_node *parent, > + u32 cpus_to_remove) > +{ > + struct dr_cpu *dr_cpus; > + int dr_cpus_removed = 0; > + int dr_cpus_present = 0; > + int dr_cpus_possible; > + int i, rc; > + > + pr_info("Attempting to hot-remove %d CPUs\n", cpus_to_remove); > + > + dr_cpus = get_dlpar_cpus(parent); So I think this should be: > + dr_cpus = get_dlpar_cpus(parent, cpus_to_remove); > + if (!dr_cpus) { > + pr_info("Could not gather dr CPU info\n"); > + return -EINVAL; > + } And get_dlpar_cpus() should return cpus_to_remove worth of present cpus, or it should error, meaning the below then won't be needed: > + dr_cpus_possible = dlpar_cpus_possible(parent); > + > + for (i = 0; i < dr_cpus_possible; i++) { > + if (dr_cpus[i].present) > + dr_cpus_present++; > + } > + > + /* Validate the available CPUs to remove. > + * NOTE: we can't remove the last CPU. > + */ > + if (cpus_to_remove >= dr_cpus_present) { > + pr_err("Insufficient CPUs (%d) to satisfy remove request\n", > + dr_cpus_present); > + kfree(dr_cpus); > + return -EINVAL; > + } > + Having only present cpus in dr_cpus should also simplify this loop because you don't need to skip non-present ones: > + for (i = 0; i < dr_cpus_possible; i++) { > + if (dr_cpus_removed == cpus_to_remove) > + break; > + > + if (!dr_cpus[i].present) > + continue; > + > + rc = dlpar_cpu_remove_by_index(parent, dr_cpus[i].drc_index); > + if (!rc) { > + dr_cpus_removed++; > + dr_cpus[i].modified = true; > + } else .. ? Surely you should bail on the first error? Definitely you should, because you're going to undo everything below anyway: > + } > + > + if (dr_cpus_removed != cpus_to_remove) { > + pr_info("CPU hot-remove failed, adding any removed CPUs\n"); > + > + for (i = 0; i < dr_cpus_possible; i++) { > + if (!dr_cpus[i].modified) > + continue; And if you bail on the first error above you shouldn't need modified, instead you just iterate in reverse from i using a new counter, eg. j. > + > + rc = dlpar_cpu_add(parent, dr_cpus[i].drc_index); > + if (rc) > + pr_info("Failed to re-add CPU (%x)\n", > + dr_cpus[i].drc_index); > + } > + > + rc = -EINVAL; > + } else { > + rc = 0; > + } > + > + kfree(dr_cpus); > + return rc; > +} > + > +int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) > +{ > + struct device_node *parent; > + u32 count, drc_index; > + int rc; > + > + count = hp_elog->_drc_u.drc_count; > + drc_index = hp_elog->_drc_u.drc_index; Those both need be32_to_cpu(). arch/powerpc/platforms/pseries/hotplug-cpu.c:750:15: warning: incorrect type in assignment (different base types) arch/powerpc/platforms/pseries/hotplug-cpu.c:750:15: expected unsigned int [unsigned] [usertype] count arch/powerpc/platforms/pseries/hotplug-cpu.c:750:15: got restricted __be32 [usertype] drc_count arch/powerpc/platforms/pseries/hotplug-cpu.c:751:19: warning: incorrect type in assignment (different base types) arch/powerpc/platforms/pseries/hotplug-cpu.c:751:19: expected unsigned int [unsigned] [usertype] drc_index arch/powerpc/platforms/pseries/hotplug-cpu.c:751:19: got restricted __be32 [usertype] drc_index Though looking closer I don't see where we ever pass or receive a pseries_hp_errorlog to or from firmware? So I'm a bit confused why we're bothering with the __be32 shenanigans. Hopefully I've just missed a detail somewhere. > + parent = of_find_node_by_path("/cpus"); > + if (!parent) > + return -ENODEV; > + > + lock_device_hotplug(); > + > + switch (hp_elog->action) { > + case PSERIES_HP_ELOG_ACTION_REMOVE: > + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) > + rc = dlpar_cpu_remove_by_count(parent, count); > + else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) > + rc = dlpar_cpu_remove_by_index(parent, drc_index); > + else > + rc = -EINVAL; > + break; > + default: > + pr_err("Invalid action (%d) specified\n", hp_elog->action); > + rc = -EINVAL; > + break; > + } > + > + unlock_device_hotplug(); > + of_node_put(parent); > + return rc; > +} > + > #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE > > static ssize_t dlpar_cpu_probe(const char *buf, size_t count) > diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h > index 8411c27..58892f1 100644 > --- a/arch/powerpc/platforms/pseries/pseries.h > +++ b/arch/powerpc/platforms/pseries/pseries.h > @@ -66,11 +66,16 @@ extern int dlpar_release_drc(u32 drc_index); > > #ifdef CONFIG_MEMORY_HOTPLUG > int dlpar_memory(struct pseries_hp_errorlog *hp_elog); > +int dlpar_cpu(struct pseries_hp_errorlog *hp_elog); I don't think this should be under CONFIG_MEMORY_HOTPLUG ? cheers
On 07/21/2015 04:27 AM, Michael Ellerman wrote: > On Mon, 2015-22-06 at 21:00:49 UTC, Nathan Fontenot wrote: >> Add the ability to dlpar remove CPUs via hotplug rtas events, either by >> specifying the drc-index of the CPU to remove or providing a count of cpus >> to remove. >> >> To accomplish we create a list of possible dr cpus and their drc indexes > > What does "dr" mean? I know but not everyone does. If it's an acronymn it > should be uppercase and spelt out at its first usage. > Good point. >> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c >> index 7890b2f..49b7196 100644 >> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c >> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c >> @@ -506,6 +507,207 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) >> return rc; >> } >> >> +static struct device_node *cpu_drc_index_to_dn(struct device_node *parent, >> + u32 drc_index) >> +{ >> + struct device_node *dn; >> + u32 my_index; >> + int rc; >> + >> + for_each_child_of_node(parent, dn) { >> + if (of_node_cmp(dn->type, "cpu") != 0) >> + continue; > > parent is always "/cpus", so I wonder if this should just be: > > for_each_node_by_type(dn, "cpu") { > > And then you don't need parent in here at all. > > Or do we realistically expect to be looking for cpus under a node other than "/cpus" ? This, combined with the comments below about iterating over the list of dr_cpus, is going away in the next version of the patch. The 'parent' node pointer is not passed around anymore. > >> + rc = of_property_read_u32(dn, "ibm,my-drc-index", &my_index); >> + if (rc) >> + continue; >> + >> + if (my_index == drc_index) >> + break; >> + } >> + >> + return dn; >> +} >> + >> +static int dlpar_cpu_remove_by_index(struct device_node *parent, >> + u32 drc_index) >> +{ >> + struct device_node *dn; >> + int rc; >> + >> + dn = cpu_drc_index_to_dn(parent, drc_index); >> + if (!dn) >> + return -ENODEV; >> + >> + rc = dlpar_cpu_remove(dn, drc_index); >> + of_node_put(dn); >> + return rc; >> +} >> + >> +static int dlpar_cpus_possible(struct device_node *parent) >> +{ >> + int dr_cpus_possible; >> + >> + /* The first u32 in the ibm,drc-indexes property is the numnber >> + * of drc entries in the property, which is the possible number >> + * number of dr capable cpus. >> + */ >> + of_property_read_u32(parent, "ibm,drc-indexes", &dr_cpus_possible); >> + return dr_cpus_possible; >> +} >> + >> +struct dr_cpu { >> + u32 drc_index; >> + bool present; >> + bool modified; >> +}; >> + >> +static struct dr_cpu *get_dlpar_cpus(struct device_node *parent) >> +{ >> + struct device_node *dn; >> + struct property *prop; >> + struct dr_cpu *dr_cpus; >> + const __be32 *p; >> + u32 drc_index; >> + int dr_cpus_possible, index; >> + bool first; >> + >> + dr_cpus_possible = dlpar_cpus_possible(parent); >> + dr_cpus = kcalloc(dr_cpus_possible, sizeof(*dr_cpus), GFP_KERNEL); >> + if (!dr_cpus) >> + return NULL; >> + >> + first = true; >> + index = 0; >> + of_property_for_each_u32(parent, "ibm,drc-indexes", prop, p, >> + drc_index) { >> + if (first) { > > What about: > > if (index == 0) { > > Then you don't need first at all? > This block of code is getting re-written, no need to look for the first node in the new version. >> + /* The first entry is the number of drc indexes in >> + * the property, skip it. >> + */ >> + first = false; >> + continue; >> + } >> + >> + dr_cpus[index].drc_index = drc_index; >> + >> + dn = cpu_drc_index_to_dn(parent, drc_index); > > The outer loop is iterating over all drc indexes (ie. one per cpu usually?), > and then cpu_drc_index_to_dn() will iterate over all cpus. > > So that's n^2 for n = nr_cpus which is not ideal. > > I realise we can't do much better, but what we can do is limit the number of > iterations of the outer loop. Because usually we will be here because we want > to offline less than nr_cpus cpus. > > ie. if I ask to offline 1 cpu on a 4096 cpu system we will still do up to 16M > iterations in here, when all we needed to do was one iteration of the outer > loop. > > So I think this routine should take the number of cpus we're trying to remove > and only iterate until it's found that many. > Yeah, now that you point it out there is a lot of no good in that code. Re-writing this with a new approach to only create lists of what is present. >> + if (dn) { >> + dr_cpus[index].present = true; >> + of_node_put(dn); >> + } > > Why do we put non present ones in the dr_cpus array at all? It just means you > have to skip them later? (in two separate places) > >> + >> + index++; >> + } >> + >> + return dr_cpus; >> +} >> + >> +static int dlpar_cpu_remove_by_count(struct device_node *parent, >> + u32 cpus_to_remove) >> +{ >> + struct dr_cpu *dr_cpus; >> + int dr_cpus_removed = 0; >> + int dr_cpus_present = 0; >> + int dr_cpus_possible; >> + int i, rc; >> + >> + pr_info("Attempting to hot-remove %d CPUs\n", cpus_to_remove); >> + >> + dr_cpus = get_dlpar_cpus(parent); > > So I think this should be: > >> + dr_cpus = get_dlpar_cpus(parent, cpus_to_remove); > >> + if (!dr_cpus) { >> + pr_info("Could not gather dr CPU info\n"); >> + return -EINVAL; >> + } > > And get_dlpar_cpus() should return cpus_to_remove worth of present cpus, or it > should error, meaning the below then won't be needed: > When adding cpus by count we may need more than just cpus_to_remove worth of present cpus. The goal was to provide all possibilities so we could continue trying to satisfy the request even if one or more cpus fails to remove. From this comment and comments below I think your approach is that we should bail if any error occurs during cpu remove. Is this what we should be doing? >> + dr_cpus_possible = dlpar_cpus_possible(parent); >> + >> + for (i = 0; i < dr_cpus_possible; i++) { >> + if (dr_cpus[i].present) >> + dr_cpus_present++; >> + } >> + >> + /* Validate the available CPUs to remove. >> + * NOTE: we can't remove the last CPU. >> + */ >> + if (cpus_to_remove >= dr_cpus_present) { >> + pr_err("Insufficient CPUs (%d) to satisfy remove request\n", >> + dr_cpus_present); >> + kfree(dr_cpus); >> + return -EINVAL; >> + } >> + > > Having only present cpus in dr_cpus should also simplify this loop because you > don't need to skip non-present ones: Fixed in updated code. > >> + for (i = 0; i < dr_cpus_possible; i++) { >> + if (dr_cpus_removed == cpus_to_remove) >> + break; >> + >> + if (!dr_cpus[i].present) >> + continue; >> + >> + rc = dlpar_cpu_remove_by_index(parent, dr_cpus[i].drc_index); >> + if (!rc) { >> + dr_cpus_removed++; >> + dr_cpus[i].modified = true; >> + } > > else .. ? > > Surely you should bail on the first error? > > Definitely you should, because you're going to undo everything below anyway: See comment above, I kept going here in the hopes that we could satisfy the request even if a failure occurred. > >> + } >> + >> + if (dr_cpus_removed != cpus_to_remove) { >> + pr_info("CPU hot-remove failed, adding any removed CPUs\n"); >> + >> + for (i = 0; i < dr_cpus_possible; i++) { >> + if (!dr_cpus[i].modified) >> + continue; > > And if you bail on the first error above you shouldn't need modified, instead > you just iterate in reverse from i using a new counter, eg. j. Yep, I'll work that into the new code. > >> + >> + rc = dlpar_cpu_add(parent, dr_cpus[i].drc_index); >> + if (rc) >> + pr_info("Failed to re-add CPU (%x)\n", >> + dr_cpus[i].drc_index); >> + } >> + >> + rc = -EINVAL; >> + } else { >> + rc = 0; >> + } >> + >> + kfree(dr_cpus); >> + return rc; >> +} >> + >> +int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) >> +{ >> + struct device_node *parent; >> + u32 count, drc_index; >> + int rc; >> + >> + count = hp_elog->_drc_u.drc_count; >> + drc_index = hp_elog->_drc_u.drc_index; > > Those both need be32_to_cpu(). > > arch/powerpc/platforms/pseries/hotplug-cpu.c:750:15: warning: incorrect type in assignment (different base types) > arch/powerpc/platforms/pseries/hotplug-cpu.c:750:15: expected unsigned int [unsigned] [usertype] count > arch/powerpc/platforms/pseries/hotplug-cpu.c:750:15: got restricted __be32 [usertype] drc_count > arch/powerpc/platforms/pseries/hotplug-cpu.c:751:19: warning: incorrect type in assignment (different base types) > arch/powerpc/platforms/pseries/hotplug-cpu.c:751:19: expected unsigned int [unsigned] [usertype] drc_index > arch/powerpc/platforms/pseries/hotplug-cpu.c:751:19: got restricted __be32 [usertype] drc_index > Will fix. > > Though looking closer I don't see where we ever pass or receive a > pseries_hp_errorlog to or from firmware? So I'm a bit confused why we're > bothering with the __be32 shenanigans. Hopefully I've just missed a detail > somewhere. > That patch is coming. For hotplug in KVM guests the pseries_hp_errorlog is received when we call rtas_check_exception(). Currently these are sent up to rtas_errd in userspace, When this partchset goes in I planned on sending a patch to have cpu and memory hotplug requests handled entirely in the kernel instead of going to userspace. >> + parent = of_find_node_by_path("/cpus"); >> + if (!parent) >> + return -ENODEV; >> + >> + lock_device_hotplug(); >> + >> + switch (hp_elog->action) { >> + case PSERIES_HP_ELOG_ACTION_REMOVE: >> + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) >> + rc = dlpar_cpu_remove_by_count(parent, count); >> + else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) >> + rc = dlpar_cpu_remove_by_index(parent, drc_index); >> + else >> + rc = -EINVAL; >> + break; >> + default: >> + pr_err("Invalid action (%d) specified\n", hp_elog->action); >> + rc = -EINVAL; >> + break; >> + } >> + >> + unlock_device_hotplug(); >> + of_node_put(parent); >> + return rc; >> +} >> + >> #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE >> >> static ssize_t dlpar_cpu_probe(const char *buf, size_t count) >> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h >> index 8411c27..58892f1 100644 >> --- a/arch/powerpc/platforms/pseries/pseries.h >> +++ b/arch/powerpc/platforms/pseries/pseries.h >> @@ -66,11 +66,16 @@ extern int dlpar_release_drc(u32 drc_index); >> >> #ifdef CONFIG_MEMORY_HOTPLUG >> int dlpar_memory(struct pseries_hp_errorlog *hp_elog); >> +int dlpar_cpu(struct pseries_hp_errorlog *hp_elog); > > I don't think this should be under CONFIG_MEMORY_HOTPLUG ? > No it should not. Thanks for the review. -Nathan
On Tue, 2015-07-21 at 16:34 -0500, Nathan Fontenot wrote: > On 07/21/2015 04:27 AM, Michael Ellerman wrote: > > On Mon, 2015-22-06 at 21:00:49 UTC, Nathan Fontenot wrote: > >> +static int dlpar_cpu_remove_by_count(struct device_node *parent, > >> + u32 cpus_to_remove) > >> +{ > >> + struct dr_cpu *dr_cpus; > >> + int dr_cpus_removed = 0; > >> + int dr_cpus_present = 0; > >> + int dr_cpus_possible; > >> + int i, rc; > >> + > >> + pr_info("Attempting to hot-remove %d CPUs\n", cpus_to_remove); > >> + > >> + dr_cpus = get_dlpar_cpus(parent); > > > > So I think this should be: > > > >> + dr_cpus = get_dlpar_cpus(parent, cpus_to_remove); > > > >> + if (!dr_cpus) { > >> + pr_info("Could not gather dr CPU info\n"); > >> + return -EINVAL; > >> + } > > > > And get_dlpar_cpus() should return cpus_to_remove worth of present cpus, or it > > should error, meaning the below then won't be needed: > > When adding cpus by count we may need more than just cpus_to_remove worth > of present cpus. The goal was to provide all possibilities so we could > continue trying to satisfy the request even if one or more cpus fails > to remove. > > From this comment and comments below I think your approach is that we > should bail if any error occurs during cpu remove. Is this what we > should be doing? I think so. But you can convince me otherwise if you like :) It seems to me that we don't expect failures in the general case, so a failure genuinely indicates something has gone wrong. In which case it's best to stop and back out the request, rather than trying to continue and possibly making things worse. There's also the dilemma that if you get an error offlining one cpu, but then continue and manage to offline enough cpus, should you report an error to the caller (userspace)? So I think it's better to bail on the first error, undo what's been done, and then report the error to the caller. > > Though looking closer I don't see where we ever pass or receive a > > pseries_hp_errorlog to or from firmware? So I'm a bit confused why we're > > bothering with the __be32 shenanigans. Hopefully I've just missed a detail > > somewhere. > > > > That patch is coming. For hotplug in KVM guests the pseries_hp_errorlog > is received when we call rtas_check_exception(). Currently these are > sent up to rtas_errd in userspace, When this partchset goes in I planned > on sending a patch to have cpu and memory hotplug requests handled entirely > in the kernel instead of going to userspace. OK that makes sense. I'll wait to see that patch, but when it comes I'll probably tell you to do the endian swaps once when we receive the error log, rather than at each usage. cheers
On 07/21/2015 08:11 PM, Michael Ellerman wrote: > On Tue, 2015-07-21 at 16:34 -0500, Nathan Fontenot wrote: >> On 07/21/2015 04:27 AM, Michael Ellerman wrote: >>> On Mon, 2015-22-06 at 21:00:49 UTC, Nathan Fontenot wrote: >>>> +static int dlpar_cpu_remove_by_count(struct device_node *parent, >>>> + u32 cpus_to_remove) >>>> +{ >>>> + struct dr_cpu *dr_cpus; >>>> + int dr_cpus_removed = 0; >>>> + int dr_cpus_present = 0; >>>> + int dr_cpus_possible; >>>> + int i, rc; >>>> + >>>> + pr_info("Attempting to hot-remove %d CPUs\n", cpus_to_remove); >>>> + >>>> + dr_cpus = get_dlpar_cpus(parent); >>> >>> So I think this should be: >>> >>>> + dr_cpus = get_dlpar_cpus(parent, cpus_to_remove); >>> >>>> + if (!dr_cpus) { >>>> + pr_info("Could not gather dr CPU info\n"); >>>> + return -EINVAL; >>>> + } >>> >>> And get_dlpar_cpus() should return cpus_to_remove worth of present cpus, or it >>> should error, meaning the below then won't be needed: >> >> When adding cpus by count we may need more than just cpus_to_remove worth >> of present cpus. The goal was to provide all possibilities so we could >> continue trying to satisfy the request even if one or more cpus fails >> to remove. >> >> From this comment and comments below I think your approach is that we >> should bail if any error occurs during cpu remove. Is this what we >> should be doing? > > I think so. But you can convince me otherwise if you like :) > > It seems to me that we don't expect failures in the general case, so a failure > genuinely indicates something has gone wrong. In which case it's best to stop > and back out the request, rather than trying to continue and possibly making > things worse. > > There's also the dilemma that if you get an error offlining one cpu, but then > continue and manage to offline enough cpus, should you report an error to the > caller (userspace)? > > So I think it's better to bail on the first error, undo what's been done, and > then report the error to the caller. Thinking through this and I cannot come up with a reason good enough to justify not bailing on the first error. I'll re-work the patch for this and resend. > >>> Though looking closer I don't see where we ever pass or receive a >>> pseries_hp_errorlog to or from firmware? So I'm a bit confused why we're >>> bothering with the __be32 shenanigans. Hopefully I've just missed a detail >>> somewhere. >>> >> >> That patch is coming. For hotplug in KVM guests the pseries_hp_errorlog >> is received when we call rtas_check_exception(). Currently these are >> sent up to rtas_errd in userspace, When this partchset goes in I planned >> on sending a patch to have cpu and memory hotplug requests handled entirely >> in the kernel instead of going to userspace. > > OK that makes sense. > > I'll wait to see that patch, but when it comes I'll probably tell you to do the > endian swaps once when we receive the error log, rather than at each usage. > I'll make a note to look into this. The issue I find is that it gets ugly because we need to use some of these values (such as drc_index) in cpu format at times and in BE at different times (such as comparing to device tree values or making rtas calls). My goal was to pass around the values in cpu format and do the endian swaps when needed. -Nathan
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 7890b2f..49b7196 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -26,6 +26,7 @@ #include <linux/sched.h> /* for idle_task_exit */ #include <linux/cpu.h> #include <linux/of.h> +#include <linux/slab.h> #include <asm/prom.h> #include <asm/rtas.h> #include <asm/firmware.h> @@ -506,6 +507,207 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) return rc; } +static struct device_node *cpu_drc_index_to_dn(struct device_node *parent, + u32 drc_index) +{ + struct device_node *dn; + u32 my_index; + int rc; + + for_each_child_of_node(parent, dn) { + if (of_node_cmp(dn->type, "cpu") != 0) + continue; + + rc = of_property_read_u32(dn, "ibm,my-drc-index", &my_index); + if (rc) + continue; + + if (my_index == drc_index) + break; + } + + return dn; +} + +static int dlpar_cpu_remove_by_index(struct device_node *parent, + u32 drc_index) +{ + struct device_node *dn; + int rc; + + dn = cpu_drc_index_to_dn(parent, drc_index); + if (!dn) + return -ENODEV; + + rc = dlpar_cpu_remove(dn, drc_index); + of_node_put(dn); + return rc; +} + +static int dlpar_cpus_possible(struct device_node *parent) +{ + int dr_cpus_possible; + + /* The first u32 in the ibm,drc-indexes property is the numnber + * of drc entries in the property, which is the possible number + * number of dr capable cpus. + */ + of_property_read_u32(parent, "ibm,drc-indexes", &dr_cpus_possible); + return dr_cpus_possible; +} + +struct dr_cpu { + u32 drc_index; + bool present; + bool modified; +}; + +static struct dr_cpu *get_dlpar_cpus(struct device_node *parent) +{ + struct device_node *dn; + struct property *prop; + struct dr_cpu *dr_cpus; + const __be32 *p; + u32 drc_index; + int dr_cpus_possible, index; + bool first; + + dr_cpus_possible = dlpar_cpus_possible(parent); + dr_cpus = kcalloc(dr_cpus_possible, sizeof(*dr_cpus), GFP_KERNEL); + if (!dr_cpus) + return NULL; + + first = true; + index = 0; + of_property_for_each_u32(parent, "ibm,drc-indexes", prop, p, + drc_index) { + if (first) { + /* The first entry is the number of drc indexes in + * the property, skip it. + */ + first = false; + continue; + } + + dr_cpus[index].drc_index = drc_index; + + dn = cpu_drc_index_to_dn(parent, drc_index); + if (dn) { + dr_cpus[index].present = true; + of_node_put(dn); + } + + index++; + } + + return dr_cpus; +} + +static int dlpar_cpu_remove_by_count(struct device_node *parent, + u32 cpus_to_remove) +{ + struct dr_cpu *dr_cpus; + int dr_cpus_removed = 0; + int dr_cpus_present = 0; + int dr_cpus_possible; + int i, rc; + + pr_info("Attempting to hot-remove %d CPUs\n", cpus_to_remove); + + dr_cpus = get_dlpar_cpus(parent); + if (!dr_cpus) { + pr_info("Could not gather dr CPU info\n"); + return -EINVAL; + } + + dr_cpus_possible = dlpar_cpus_possible(parent); + + for (i = 0; i < dr_cpus_possible; i++) { + if (dr_cpus[i].present) + dr_cpus_present++; + } + + /* Validate the available CPUs to remove. + * NOTE: we can't remove the last CPU. + */ + if (cpus_to_remove >= dr_cpus_present) { + pr_err("Insufficient CPUs (%d) to satisfy remove request\n", + dr_cpus_present); + kfree(dr_cpus); + return -EINVAL; + } + + for (i = 0; i < dr_cpus_possible; i++) { + if (dr_cpus_removed == cpus_to_remove) + break; + + if (!dr_cpus[i].present) + continue; + + rc = dlpar_cpu_remove_by_index(parent, dr_cpus[i].drc_index); + if (!rc) { + dr_cpus_removed++; + dr_cpus[i].modified = true; + } + } + + if (dr_cpus_removed != cpus_to_remove) { + pr_info("CPU hot-remove failed, adding any removed CPUs\n"); + + for (i = 0; i < dr_cpus_possible; i++) { + if (!dr_cpus[i].modified) + continue; + + rc = dlpar_cpu_add(parent, dr_cpus[i].drc_index); + if (rc) + pr_info("Failed to re-add CPU (%x)\n", + dr_cpus[i].drc_index); + } + + rc = -EINVAL; + } else { + rc = 0; + } + + kfree(dr_cpus); + return rc; +} + +int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) +{ + struct device_node *parent; + u32 count, drc_index; + int rc; + + count = hp_elog->_drc_u.drc_count; + drc_index = hp_elog->_drc_u.drc_index; + + parent = of_find_node_by_path("/cpus"); + if (!parent) + return -ENODEV; + + lock_device_hotplug(); + + switch (hp_elog->action) { + case PSERIES_HP_ELOG_ACTION_REMOVE: + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) + rc = dlpar_cpu_remove_by_count(parent, count); + else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) + rc = dlpar_cpu_remove_by_index(parent, drc_index); + else + rc = -EINVAL; + break; + default: + pr_err("Invalid action (%d) specified\n", hp_elog->action); + rc = -EINVAL; + break; + } + + unlock_device_hotplug(); + of_node_put(parent); + return rc; +} + #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE static ssize_t dlpar_cpu_probe(const char *buf, size_t count) diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 8411c27..58892f1 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -66,11 +66,16 @@ extern int dlpar_release_drc(u32 drc_index); #ifdef CONFIG_MEMORY_HOTPLUG int dlpar_memory(struct pseries_hp_errorlog *hp_elog); +int dlpar_cpu(struct pseries_hp_errorlog *hp_elog); #else static inline int dlpar_memory(struct pseries_hp_errorlog *hp_elog) { return -EOPNOTSUPP; } +static inline int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) +{ + return -EOPNOTSUPP; +} #endif /* PCI root bridge prepare function override for pseries */
Add the ability to dlpar remove CPUs via hotplug rtas events, either by specifying the drc-index of the CPU to remove or providing a count of cpus to remove. To accomplish we create a list of possible dr cpus and their drc indexes so we can easily traverse the list looking for candidates to remove and easily clean up in any cases of failure. Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com> --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 202 ++++++++++++++++++++++++++ arch/powerpc/platforms/pseries/pseries.h | 5 + 2 files changed, 207 insertions(+)