Patchwork [v3,1/5] powernv: cpufreq driver for powernv platform

login
register
mail settings
Submitter Gautham R. Shenoy
Date March 20, 2014, 12:10 p.m.
Message ID <1395317460-14811-2-git-send-email-ego@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/332096/
State Changes Requested
Headers show

Comments

Gautham R. Shenoy - March 20, 2014, 12:10 p.m.
From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>

Backend driver to dynamically set voltage and frequency on
IBM POWER non-virtualized platforms.  Power management SPRs
are used to set the required PState.

This driver works in conjunction with cpufreq governors
like 'ondemand' to provide a demand based frequency and
voltage setting on IBM POWER non-virtualized platforms.

PState table is obtained from OPAL v3 firmware through device
tree.

powernv_cpufreq back-end driver would parse the relevant device-tree
nodes and initialise the cpufreq subsystem on powernv platform.

The policy->cpus needs to be populated in a hotplug-invariant manner
instead of using cpu_sibling_mask() which varies with cpu-hotplug. This
is because the cpufreq core code copies this content into policy->related_cpus
mask which is should not vary on cpu-hotplug.

[Change log updated by ego@linux.vnet.ibm.com]

Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/reg.h         |   4 +
 arch/powerpc/platforms/powernv/Kconfig |   1 +
 drivers/cpufreq/Kconfig                |   1 +
 drivers/cpufreq/Kconfig.powerpc        |  13 ++
 drivers/cpufreq/Makefile               |   1 +
 drivers/cpufreq/powernv-cpufreq.c      | 277 +++++++++++++++++++++++++++++++++
 6 files changed, 297 insertions(+)
 create mode 100644 drivers/cpufreq/powernv-cpufreq.c
viresh kumar - March 21, 2014, 8:41 a.m.
On Thu, Mar 20, 2014 at 5:40 PM, Gautham R. Shenoy
<ego@linux.vnet.ibm.com> wrote:
> From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>

Hi Vaidy,

> diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
> index 4b029c0..4ba1632 100644
> --- a/drivers/cpufreq/Kconfig
> +++ b/drivers/cpufreq/Kconfig
> @@ -48,6 +48,7 @@ config CPU_FREQ_STAT_DETAILS
>  choice
>         prompt "Default CPUFreq governor"
>         default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1100_CPUFREQ || ARM_SA1110_CPUFREQ
> +       default CPU_FREQ_DEFAULT_GOV_ONDEMAND if POWERNV_CPUFREQ

Probably we should remove SA1100's entry as well from here. This is
not the right way of doing it. Imagine 100 platforms having entries here.
If you want it, then select it from your platforms Kconfig.

> diff --git a/drivers/cpufreq/Kconfig.powerpc b/drivers/cpufreq/Kconfig.powerpc
> index ca0021a..93f8689 100644
> --- a/drivers/cpufreq/Kconfig.powerpc
> +++ b/drivers/cpufreq/Kconfig.powerpc
> @@ -54,3 +54,16 @@ config PPC_PASEMI_CPUFREQ
>         help
>           This adds the support for frequency switching on PA Semi
>           PWRficient processors.
> +
> +config POWERNV_CPUFREQ
> +       tristate "CPU frequency scaling for IBM POWERNV platform"
> +       depends on PPC_POWERNV
> +       select CPU_FREQ_GOV_PERFORMANCE
> +       select CPU_FREQ_GOV_POWERSAVE
> +       select CPU_FREQ_GOV_USERSPACE
> +       select CPU_FREQ_GOV_ONDEMAND
> +       select CPU_FREQ_GOV_CONSERVATIVE

Nice :)

People normally do this from the defconfigs instead. And logically speaking
we might better have only dependencies here, isn't it? And obviously
governors aren't a dependency for this driver. :)

> +       default y
> +       help
> +        This adds support for CPU frequency switching on IBM POWERNV
> +        platform

> diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
> new file mode 100644
> index 0000000..ab1551f
> --- /dev/null
> +++ b/drivers/cpufreq/powernv-cpufreq.c
> @@ -0,0 +1,277 @@
> +/*
> + * POWERNV cpufreq driver for the IBM POWER processors
> + *
> + * (C) Copyright IBM 2014
> + *
> + * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2, or (at your option)
> + * any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + */
> +
> +#define pr_fmt(fmt)    "powernv-cpufreq: " fmt
> +
> +#include <linux/module.h>
> +#include <linux/cpufreq.h>
> +#include <linux/of.h>
> +#include <asm/cputhreads.h>

That's it? Sure?

Even if things compile for you, you must explicitly include all the
files on which
you depend.

> +/* FIXME: Make this per-core */
> +static DEFINE_MUTEX(freq_switch_mutex);
> +
> +#define POWERNV_MAX_PSTATES    256
> +
> +static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
> +static int powernv_pstate_ids[POWERNV_MAX_PSTATES+1];
> +
> +/*
> + * Initialize the freq table based on data obtained
> + * from the firmware passed via device-tree
> + */
> +
> +static int init_powernv_pstates(void)
> +{
> +       struct device_node *power_mgt;
> +       int nr_pstates = 0;
> +       int pstate_min, pstate_max, pstate_nominal;
> +       const __be32 *pstate_ids, *pstate_freqs;
> +       int i;
> +       u32 len_ids, len_freqs;
> +
> +       power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
> +       if (!power_mgt) {
> +               pr_warn("power-mgt node not found\n");
> +               return -ENODEV;
> +       }
> +
> +       if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) {
> +               pr_warn("ibm,pstate-min node not found\n");
> +               return -ENODEV;
> +       }
> +
> +       if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) {
> +               pr_warn("ibm,pstate-max node not found\n");
> +               return -ENODEV;
> +       }
> +
> +       if (of_property_read_u32(power_mgt, "ibm,pstate-nominal",
> +                                &pstate_nominal)) {
> +               pr_warn("ibm,pstate-nominal not found\n");
> +               return -ENODEV;
> +       }
> +       pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
> +               pstate_nominal, pstate_max);
> +
> +       pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids);
> +       if (!pstate_ids) {
> +               pr_warn("ibm,pstate-ids not found\n");
> +               return -ENODEV;
> +       }
> +
> +       pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz",
> +                                     &len_freqs);
> +       if (!pstate_freqs) {
> +               pr_warn("ibm,pstate-frequencies-mhz not found\n");
> +               return -ENODEV;
> +       }
> +
> +       WARN_ON(len_ids != len_freqs);
> +       nr_pstates = min(len_ids, len_freqs) / sizeof(u32);
> +       WARN_ON(!nr_pstates);

Why do you want to continue here?

> +       pr_debug("NR PStates %d\n", nr_pstates);
> +       for (i = 0; i < nr_pstates; i++) {
> +               u32 id = be32_to_cpu(pstate_ids[i]);
> +               u32 freq = be32_to_cpu(pstate_freqs[i]);
> +
> +               pr_debug("PState id %d freq %d MHz\n", id, freq);
> +               powernv_freqs[i].driver_data = i;

I don't think you are using this field at all and this is the field you can
use for driver_data and so you can get rid of powernv_pstate_ids[ ].

> +               powernv_freqs[i].frequency = freq * 1000; /* kHz */
> +               powernv_pstate_ids[i] = id;
> +       }
> +       /* End of list marker entry */
> +       powernv_freqs[i].driver_data = 0;

Not required.

> +       powernv_freqs[i].frequency = CPUFREQ_TABLE_END;
> +
> +       /* Print frequency table */
> +       for (i = 0; powernv_freqs[i].frequency != CPUFREQ_TABLE_END; i++)
> +               pr_debug("%d: %d\n", i, powernv_freqs[i].frequency);

You have already printed this table earlier..

> +
> +       return 0;
> +}
> +
> +static struct freq_attr *powernv_cpu_freq_attr[] = {
> +       &cpufreq_freq_attr_scaling_available_freqs,
> +       NULL,
> +};

Can use this instead: cpufreq_generic_attr?

> +/* Helper routines */
> +
> +/* Access helpers to power mgt SPR */
> +
> +static inline unsigned long get_pmspr(unsigned long sprn)

Looks big enough not be inlined?

> +{
> +       switch (sprn) {
> +       case SPRN_PMCR:
> +               return mfspr(SPRN_PMCR);
> +
> +       case SPRN_PMICR:
> +               return mfspr(SPRN_PMICR);
> +
> +       case SPRN_PMSR:
> +               return mfspr(SPRN_PMSR);
> +       }
> +       BUG();
> +}
> +
> +static inline void set_pmspr(unsigned long sprn, unsigned long val)
> +{

Same here..

> +       switch (sprn) {
> +       case SPRN_PMCR:
> +               mtspr(SPRN_PMCR, val);
> +               return;
> +
> +       case SPRN_PMICR:
> +               mtspr(SPRN_PMICR, val);
> +               return;
> +
> +       case SPRN_PMSR:
> +               mtspr(SPRN_PMSR, val);
> +               return;
> +       }
> +       BUG();
> +}
> +
> +static void set_pstate(void *pstate)
> +{
> +       unsigned long val;
> +       unsigned long pstate_ul = *(unsigned long *) pstate;

Why not sending value only to this routine instead of pointer?

> +
> +       val = get_pmspr(SPRN_PMCR);
> +       val = val & 0x0000ffffffffffffULL;

Maybe a blank line here?

> +       /* Set both global(bits 56..63) and local(bits 48..55) PStates */
> +       val = val | (pstate_ul << 56) | (pstate_ul << 48);

here as well?

> +       pr_debug("Setting cpu %d pmcr to %016lX\n", smp_processor_id(), val);
> +       set_pmspr(SPRN_PMCR, val);
> +}
> +
> +static int powernv_set_freq(cpumask_var_t cpus, unsigned int new_index)
> +{
> +       unsigned long val = (unsigned long) powernv_pstate_ids[new_index];

I think automatic type conversion will happen here.

> +
> +       /*
> +        * Use smp_call_function to send IPI and execute the
> +        * mtspr on target cpu.  We could do that without IPI
> +        * if current CPU is within policy->cpus (core)
> +        */

Hmm, interesting I also feel there are cases where this routine can
get called from other CPUs. Can you list those use cases where it can
happen? Governors will mostly call this from one of the CPUs present
in policy->cpus.

> +       val = val & 0xFF;
> +       smp_call_function_any(cpus, set_pstate, &val, 1);
> +       return 0;
> +}
> +
> +static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
> +{
> +       int base, i;
> +
> +#ifdef CONFIG_SMP

What will break if you don't have this ifdef here? Without that as well
below code should work?

> +       base = cpu_first_thread_sibling(policy->cpu);
> +
> +       for (i = 0; i < threads_per_core; i++)
> +               cpumask_set_cpu(base + i, policy->cpus);
> +#endif
> +       policy->cpuinfo.transition_latency = 25000;
> +
> +       policy->cur = powernv_freqs[0].frequency;
> +       cpufreq_frequency_table_get_attr(powernv_freqs, policy->cpu);

This doesn't exist anymore.

> +       return cpufreq_frequency_table_cpuinfo(policy, powernv_freqs);

Have you written this driver long time back? CPUFreq core has been
cleaned up heavily since last few kernel releases and I think there are
better helper routines available now.

> +}
> +
> +static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
> +{
> +       cpufreq_frequency_table_put_attr(policy->cpu);
> +       return 0;
> +}

You don't need this..

> +static int powernv_cpufreq_verify(struct cpufreq_policy *policy)
> +{
> +       return cpufreq_frequency_table_verify(policy, powernv_freqs);
> +}

use generic verify function pointer instead..

> +static int powernv_cpufreq_target(struct cpufreq_policy *policy,
> +                             unsigned int target_freq,
> +                             unsigned int relation)

use target_index() instead..

> +{
> +       int rc;
> +       struct cpufreq_freqs freqs;
> +       unsigned int new_index;
> +
> +       cpufreq_frequency_table_target(policy, powernv_freqs, target_freq,
> +                                      relation, &new_index);
> +
> +       freqs.old = policy->cur;
> +       freqs.new = powernv_freqs[new_index].frequency;
> +       freqs.cpu = policy->cpu;
> +
> +       mutex_lock(&freq_switch_mutex);

Why do you need this lock for?

> +       cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE);
> +
> +       pr_debug("setting frequency for cpu %d to %d kHz index %d pstate %d",
> +                policy->cpu,
> +                powernv_freqs[new_index].frequency,
> +                new_index,
> +                powernv_pstate_ids[new_index]);
> +
> +       rc = powernv_set_freq(policy->cpus, new_index);
> +
> +       cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE);
> +       mutex_unlock(&freq_switch_mutex);
> +
> +       return rc;
> +}
> +
> +static struct cpufreq_driver powernv_cpufreq_driver = {
> +       .verify         = powernv_cpufreq_verify,
> +       .target         = powernv_cpufreq_target,

I think you have Srivatsa there who has seen lots of cpufreq code and
could have helped you a lot :)

> +       .init           = powernv_cpufreq_cpu_init,
> +       .exit           = powernv_cpufreq_cpu_exit,
> +       .name           = "powernv-cpufreq",
> +       .flags          = CPUFREQ_CONST_LOOPS,
> +       .attr           = powernv_cpu_freq_attr,

Would be better if you keep these callbacks in the order they are declared
in cpufreq.h..

> +};
> +
> +static int __init powernv_cpufreq_init(void)
> +{
> +       int rc = 0;
> +
> +       /* Discover pstates from device tree and init */
> +

Remove blank line.

> +       rc = init_powernv_pstates();
> +

same here..

> +       if (rc) {
> +               pr_info("powernv-cpufreq disabled\n");
> +               return rc;
> +       }
> +
> +       rc = cpufreq_register_driver(&powernv_cpufreq_driver);
> +       return rc;

Merge above two lines.

> +}
> +
> +static void __exit powernv_cpufreq_exit(void)
> +{
> +       cpufreq_unregister_driver(&powernv_cpufreq_driver);
> +}
> +
> +module_init(powernv_cpufreq_init);
> +module_exit(powernv_cpufreq_exit);

Place these right after the routines without any blank lines in between.

> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");
Gautham R. Shenoy - March 21, 2014, 10:43 a.m.
Hi Viresh,

On Fri, Mar 21, 2014 at 02:11:32PM +0530, Viresh Kumar wrote:
> On Thu, Mar 20, 2014 at 5:40 PM, Gautham R. Shenoy
> <ego@linux.vnet.ibm.com> wrote:
> > From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
> 
> Hi Vaidy,
> 
> > diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
> > index 4b029c0..4ba1632 100644
> > --- a/drivers/cpufreq/Kconfig
> > +++ b/drivers/cpufreq/Kconfig
> > @@ -48,6 +48,7 @@ config CPU_FREQ_STAT_DETAILS
> >  choice
> >         prompt "Default CPUFreq governor"
> >         default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1100_CPUFREQ || ARM_SA1110_CPUFREQ
> > +       default CPU_FREQ_DEFAULT_GOV_ONDEMAND if POWERNV_CPUFREQ
> 
> Probably we should remove SA1100's entry as well from here. This is
> not the right way of doing it. Imagine 100 platforms having entries here.
> If you want it, then select it from your platforms Kconfig.

Sure. Will move these bits and the other governor related bits to the
Powernv Kconfig.

> > diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
> > new file mode 100644
> > index 0000000..ab1551f
> > --- /dev/null
> > +
> > +#define pr_fmt(fmt)    "powernv-cpufreq: " fmt
> > +
> > +#include <linux/module.h>
> > +#include <linux/cpufreq.h>
> > +#include <linux/of.h>
> > +#include <asm/cputhreads.h>
> 
> That's it? Sure?
> 
> Even if things compile for you, you must explicitly include all the
> files on which
> you depend.

Ok. 

> 
> > +
> > +       WARN_ON(len_ids != len_freqs);
> > +       nr_pstates = min(len_ids, len_freqs) / sizeof(u32);
> > +       WARN_ON(!nr_pstates);
> 
> Why do you want to continue here?

Good point. We might be better off exiting at this point. 

> 
> > +       pr_debug("NR PStates %d\n", nr_pstates);
> > +       for (i = 0; i < nr_pstates; i++) {
> > +               u32 id = be32_to_cpu(pstate_ids[i]);
> > +               u32 freq = be32_to_cpu(pstate_freqs[i]);
> > +
> > +               pr_debug("PState id %d freq %d MHz\n", id, freq);
> > +               powernv_freqs[i].driver_data = i;
> 
> I don't think you are using this field at all and this is the field you can
> use for driver_data and so you can get rid of powernv_pstate_ids[ ].

Using driver_data to record powernv_pstate_ids won't work since
powernv_pstate_ids can be negative. So a pstate_id -3 can be confused
with CPUFREQ_BOOST_FREQ thereby not displaying the frequency
corresponding to pstate id -3. So for now I think we will be retaining
powernv_pstate_ids.

> 
> > +               powernv_freqs[i].frequency = freq * 1000; /* kHz */
> > +               powernv_pstate_ids[i] = id;
> > +       }
> > +       /* End of list marker entry */
> > +       powernv_freqs[i].driver_data = 0;
> 
> Not required.

Ok.
> 
> > +       powernv_freqs[i].frequency = CPUFREQ_TABLE_END;
> > +
> > +       /* Print frequency table */
> > +       for (i = 0; powernv_freqs[i].frequency != CPUFREQ_TABLE_END; i++)
> > +               pr_debug("%d: %d\n", i, powernv_freqs[i].frequency);
> 
> You have already printed this table earlier..

Fair enough.

> 
> > +
> > +       return 0;
> > +}
> > +
> > +static struct freq_attr *powernv_cpu_freq_attr[] = {
> > +       &cpufreq_freq_attr_scaling_available_freqs,
> > +       NULL,
> > +};
> 
> Can use this instead: cpufreq_generic_attr?

In this patch yes. But later patch introduces an additional attribute
for displaying the nominal frequency. Will handle that part in a clean
way in the next version.

> 
> > +/* Helper routines */
> > +
> > +/* Access helpers to power mgt SPR */
> > +
> > +static inline unsigned long get_pmspr(unsigned long sprn)
> 
> Looks big enough not be inlined?

It is called from one function. It has been defined separately for
readability. 

> 
> > +{
> > +       switch (sprn) {
> > +       case SPRN_PMCR:
> > +               return mfspr(SPRN_PMCR);
> > +
> > +       case SPRN_PMICR:
> > +               return mfspr(SPRN_PMICR);
> > +
> > +       case SPRN_PMSR:
> > +               return mfspr(SPRN_PMSR);
> > +       }
> > +       BUG();
> > +}
> > +
> > +static inline void set_pmspr(unsigned long sprn, unsigned long val)
> > +{
> 
> Same here..

Same reason as above.

> 
> > +       switch (sprn) {
> > +       case SPRN_PMCR:
> > +               mtspr(SPRN_PMCR, val);
> > +               return;
> > +
> > +       case SPRN_PMICR:
> > +               mtspr(SPRN_PMICR, val);
> > +               return;
> > +
> > +       case SPRN_PMSR:
> > +               mtspr(SPRN_PMSR, val);
> > +               return;
> > +       }
> > +       BUG();
> > +}
> > +
> > +static void set_pstate(void *pstate)
> > +{
> > +       unsigned long val;
> > +       unsigned long pstate_ul = *(unsigned long *) pstate;
> 
> Why not sending value only to this routine instead of pointer?

Well this function is called via an smp_call_function. so, cannot send
a value :(

> 
> > +
> > +       val = get_pmspr(SPRN_PMCR);
> > +       val = val & 0x0000ffffffffffffULL;
> 
> Maybe a blank line here?

Ok.

> 
> > +       /* Set both global(bits 56..63) and local(bits 48..55) PStates */
> > +       val = val | (pstate_ul << 56) | (pstate_ul << 48);
> 
> here as well?

Ok.
> 

> > +       pr_debug("Setting cpu %d pmcr to %016lX\n", smp_processor_id(), val);
> > +       set_pmspr(SPRN_PMCR, val);
> > +}
> > +
> > +static int powernv_set_freq(cpumask_var_t cpus, unsigned int new_index)
> > +{
> > +       unsigned long val = (unsigned long) powernv_pstate_ids[new_index];
> 
> I think automatic type conversion will happen here.

Ok. Will fix this.

> 
> > +
> > +       /*
> > +        * Use smp_call_function to send IPI and execute the
> > +        * mtspr on target cpu.  We could do that without IPI
> > +        * if current CPU is within policy->cpus (core)
> > +        */
> 
> Hmm, interesting I also feel there are cases where this routine can
> get called from other CPUs. Can you list those use cases where it can
> happen? Governors will mostly call this from one of the CPUs present
> in policy->cpus.

Consider the case when the governor is userspace and we are executing 

    # echo freqvalue  > 
         /sys/devices/system/cpu/cpu<i>/cpufreq/scaling_setspeed 

from a cpu <j> which is not in policy->cpus of cpu i. 


> > +static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
> > +{
> > +       int base, i;
> > +
> > +#ifdef CONFIG_SMP
> 
> What will break if you don't have this ifdef here? Without that as well
> below code should work?
> 
> > +       base = cpu_first_thread_sibling(policy->cpu);
> > +
> > +       for (i = 0; i < threads_per_core; i++)
> > +               cpumask_set_cpu(base + i, policy->cpus);
> > +#endif
> > +       policy->cpuinfo.transition_latency = 25000;
> > +
> > +       policy->cur = powernv_freqs[0].frequency;
> > +       cpufreq_frequency_table_get_attr(powernv_freqs, policy->cpu);
> 
> This doesn't exist anymore.

Didn't get this comment!

> 
> > +       return cpufreq_frequency_table_cpuinfo(policy, powernv_freqs);
> 
> Have you written this driver long time back? CPUFreq core has been
> cleaned up heavily since last few kernel releases and I think there are
> better helper routines available now.

Yup it was written quite a while ago. And yeah, CPUFreq has changed
quite a bit since the last time I saw it :-)

> 
> > +}
> > +
> > +static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
> > +{
> > +       cpufreq_frequency_table_put_attr(policy->cpu);
> > +       return 0;
> > +}
> 
> You don't need this..

Why not ?

> 
> > +static int powernv_cpufreq_verify(struct cpufreq_policy *policy)
> > +{
> > +       return cpufreq_frequency_table_verify(policy, powernv_freqs);
> > +}
> 
> use generic verify function pointer instead..
> 
> > +static int powernv_cpufreq_target(struct cpufreq_policy *policy,
> > +                             unsigned int target_freq,
> > +                             unsigned int relation)
> 
> use target_index() instead..
> 
> > +{
> > +       int rc;
> > +       struct cpufreq_freqs freqs;
> > +       unsigned int new_index;
> > +
> > +       cpufreq_frequency_table_target(policy, powernv_freqs, target_freq,
> > +                                      relation, &new_index);
> > +
> > +       freqs.old = policy->cur;
> > +       freqs.new = powernv_freqs[new_index].frequency;
> > +       freqs.cpu = policy->cpu;
> > +
> > +       mutex_lock(&freq_switch_mutex);
> 
> Why do you need this lock for?

I guess it was to serialize accesses to PMCR. But Srivatsa's patch
converts this to a per-core lock which probably is no longer required
after your cpufreq_freq_transition_begin/end() patch series.

> 
> > +       cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE);
> > +
> > +       pr_debug("setting frequency for cpu %d to %d kHz index %d pstate %d",
> > +                policy->cpu,
> > +                powernv_freqs[new_index].frequency,
> > +                new_index,
> > +                powernv_pstate_ids[new_index]);
> > +
> > +       rc = powernv_set_freq(policy->cpus, new_index);
> > +
> > +       cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE);
> > +       mutex_unlock(&freq_switch_mutex);
> > +
> > +       return rc;
> > +}
> > +
> > +static struct cpufreq_driver powernv_cpufreq_driver = {
> > +       .verify         = powernv_cpufreq_verify,
> > +       .target         = powernv_cpufreq_target,
> 
> I think you have Srivatsa there who has seen lots of cpufreq code and
> could have helped you a lot :)
> 
> > +       .init           = powernv_cpufreq_cpu_init,
> > +       .exit           = powernv_cpufreq_cpu_exit,
> > +       .name           = "powernv-cpufreq",
> > +       .flags          = CPUFREQ_CONST_LOOPS,
> > +       .attr           = powernv_cpu_freq_attr,
> 
> Would be better if you keep these callbacks in the order they are declared
> in cpufreq.h..

Sure.

> 
> > +module_init(powernv_cpufreq_init);
> > +module_exit(powernv_cpufreq_exit);
> 
> Place these right after the routines without any blank lines in
between.

Is this the new convention ?

> 
> > +MODULE_LICENSE("GPL");
> > +MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");
> 

Thanks for the detailed review.

--
Regards
gautham.
viresh kumar - March 21, 2014, 10:54 a.m.
On 21 March 2014 16:13, Gautham R Shenoy <ego@linux.vnet.ibm.com> wrote:
> On Fri, Mar 21, 2014 at 02:11:32PM +0530, Viresh Kumar wrote:

>> > +               pr_debug("PState id %d freq %d MHz\n", id, freq);
>> > +               powernv_freqs[i].driver_data = i;
>>
>> I don't think you are using this field at all and this is the field you can
>> use for driver_data and so you can get rid of powernv_pstate_ids[ ].
>
> Using driver_data to record powernv_pstate_ids won't work since
> powernv_pstate_ids can be negative. So a pstate_id -3 can be confused
> with CPUFREQ_BOOST_FREQ thereby not displaying the frequency
> corresponding to pstate id -3. So for now I think we will be retaining
> powernv_pstate_ids.

As I said earlier, this field is only used by cpufreq drivers and cpufreq core
isn't supposed to touch it. CPUFREQ_BOOST_FREQ and other macros
are there for .frequency field and not this one.


>> > +static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
>> > +{
>> > +       int base, i;
>> > +
>> > +#ifdef CONFIG_SMP
>>
>> What will break if you don't have this ifdef here? Without that as well
>> below code should work?

Missed this one?

>> > +       base = cpu_first_thread_sibling(policy->cpu);
>> > +
>> > +       for (i = 0; i < threads_per_core; i++)
>> > +               cpumask_set_cpu(base + i, policy->cpus);
>> > +#endif
>> > +       policy->cpuinfo.transition_latency = 25000;
>> > +
>> > +       policy->cur = powernv_freqs[0].frequency;
>> > +       cpufreq_frequency_table_get_attr(powernv_freqs, policy->cpu);
>>
>> This doesn't exist anymore.
>
> Didn't get this comment!

cpufreq_frequency_table_get_attr() routine doesn't exist anymore.

>> > +static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
>> > +{
>> > +       cpufreq_frequency_table_put_attr(policy->cpu);
>> > +       return 0;
>> > +}
>>
>> You don't need this..
>
> Why not ?

Because cpufreq_frequency_table_get_attr() and
cpufreq_frequency_table_put_attr() don't exist anymore :)

>> > +module_init(powernv_cpufreq_init);
>> > +module_exit(powernv_cpufreq_exit);
>>
>> Place these right after the routines without any blank lines in
> between.
>
> Is this the new convention ?

Don't know I have been following this since long time, probably from the
time I started with Mainline stuff.. I have seen many maintainers advising this
as it make code more readable, specially if these routines are quite big..

Probably it isn't mentioned in coding guidelines but people follow it most of
the times.
Srivatsa S. Bhat - March 21, 2014, 11:40 a.m.
On 03/21/2014 04:24 PM, Viresh Kumar wrote:
> On 21 March 2014 16:13, Gautham R Shenoy <ego@linux.vnet.ibm.com> wrote:
>> On Fri, Mar 21, 2014 at 02:11:32PM +0530, Viresh Kumar wrote:
> 
>>>> +               pr_debug("PState id %d freq %d MHz\n", id, freq);
>>>> +               powernv_freqs[i].driver_data = i;
>>>
[...] 
>>>> +static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
>>>> +{
>>>> +       int base, i;
>>>> +
>>>> +#ifdef CONFIG_SMP
>>>
>>> What will break if you don't have this ifdef here? Without that as well
>>> below code should work?
> 
> Missed this one?
> 

Nothing will break, its just that the code size will be a tiny bit
lesser on UP configs, that's all :-) Anyway, I think removing the ifdef
improves the readability (and doesn't add any noticeable overhead on UP
kernels), so let's get rid of it.

Regards,
Srivatsa S. Bhat

>>>> +       base = cpu_first_thread_sibling(policy->cpu);
>>>> +
>>>> +       for (i = 0; i < threads_per_core; i++)
>>>> +               cpumask_set_cpu(base + i, policy->cpus);
>>>> +#endif
>>>> +       policy->cpuinfo.transition_latency = 25000;
>>>> +
>>>> +       policy->cur = powernv_freqs[0].frequency;
>>>> +       cpufreq_frequency_table_get_attr(powernv_freqs, policy->cpu);
>>>
Srivatsa S. Bhat - March 21, 2014, 11:45 a.m.
On 03/21/2014 02:11 PM, Viresh Kumar wrote:
> On Thu, Mar 20, 2014 at 5:40 PM, Gautham R. Shenoy
> <ego@linux.vnet.ibm.com> wrote:
>> From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
> 
> Hi Vaidy,
> 
>> diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
>> index 4b029c0..4ba1632 100644
>> --- a/drivers/cpufreq/Kconfig
>> +++ b/drivers/cpufreq/Kconfig
>> @@ -48,6 +48,7 @@ config CPU_FREQ_STAT_DETAILS
>>  choice
[...]
>> +static struct cpufreq_driver powernv_cpufreq_driver = {
>> +       .verify         = powernv_cpufreq_verify,
>> +       .target         = powernv_cpufreq_target,
> 
> I think you have Srivatsa there who has seen lots of cpufreq code and
> could have helped you a lot :)
>

:-)

I have followed the locking and synchronization aspects of cpufreq
closely, but unfortunately, I haven't been able to keep up with some of
the other details that well :-( Besides, its hard to keep up with the
rate of your changes, you are way too fast! ;-)

Regards,
Srivatsa S. Bhat
viresh kumar - March 21, 2014, 11:47 a.m.
On 21 March 2014 17:15, Srivatsa S. Bhat
<srivatsa.bhat@linux.vnet.ibm.com> wrote:
>> I think you have Srivatsa there who has seen lots of cpufreq code and
>> could have helped you a lot :)
>>
>
> :-)

I was waiting for your reply here :)

> I have followed the locking and synchronization aspects of cpufreq
> closely, but unfortunately, I haven't been able to keep up with some of
> the other details that well :-( Besides, its hard to keep up with the
> rate of your changes, you are way too fast! ;-)

Haha.. I am a very slow guy, really :)
Gautham R. Shenoy - March 21, 2014, 1:23 p.m.
Hi Viresh,

On Fri, Mar 21, 2014 at 04:24:27PM +0530, Viresh Kumar wrote:
> On 21 March 2014 16:13, Gautham R Shenoy <ego@linux.vnet.ibm.com> wrote:
> > On Fri, Mar 21, 2014 at 02:11:32PM +0530, Viresh Kumar wrote:
> 
> >> > +               pr_debug("PState id %d freq %d MHz\n", id, freq);
> >> > +               powernv_freqs[i].driver_data = i;
> >>
> >> I don't think you are using this field at all and this is the field you can
> >> use for driver_data and so you can get rid of powernv_pstate_ids[ ].
> >
> > Using driver_data to record powernv_pstate_ids won't work since
> > powernv_pstate_ids can be negative. So a pstate_id -3 can be confused
> > with CPUFREQ_BOOST_FREQ thereby not displaying the frequency
> > corresponding to pstate id -3. So for now I think we will be retaining
> > powernv_pstate_ids.
> 
> As I said earlier, this field is only used by cpufreq drivers and cpufreq core
> isn't supposed to touch it. CPUFREQ_BOOST_FREQ and other macros
> are there for .frequency field and not this one.
> 

Ok, I had based my code on linus's git tree. I checked the 'pm-cpufreq'
branch of Rafael's 'linux-pm' tree and freq_table.c contains the
following code snippet in show_available_frequencies:

	if (show_boost ^ (table[i].driver_data == CPUFREQ_BOOST_FREQ))
			continue;

I suspect we're talking about different code bases. So could you
please tell me which one should I be looking at ?

> 
> >> > +static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
> >> > +{
> >> > +       int base, i;
> >> > +
> >> > +#ifdef CONFIG_SMP
> >>
> >> What will break if you don't have this ifdef here? Without that as well
> >> below code should work?
> 
> Missed this one?
> 
> >> > +       base = cpu_first_thread_sibling(policy->cpu);
> >> > +
> >> > +       for (i = 0; i < threads_per_core; i++)
> >> > +               cpumask_set_cpu(base + i, policy->cpus);
> >> > +#endif
> >> > +       policy->cpuinfo.transition_latency = 25000;
> >> > +
> >> > +       policy->cur = powernv_freqs[0].frequency;
> >> > +       cpufreq_frequency_table_get_attr(powernv_freqs, policy->cpu);
> >>
> >> This doesn't exist anymore.
> >
> > Didn't get this comment!
> 
> cpufreq_frequency_table_get_attr() routine doesn't exist anymore.
> 
> >> > +static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
> >> > +{
> >> > +       cpufreq_frequency_table_put_attr(policy->cpu);
> >> > +       return 0;
> >> > +}
> >>
> >> You don't need this..
> >
> > Why not ?
> 
> Because cpufreq_frequency_table_get_attr() and
> cpufreq_frequency_table_put_attr() don't exist anymore :)

Well, it does in the codebases that I was looking at. May be I've been
looking at the wrong place.

> 
> >> > +module_init(powernv_cpufreq_init);
> >> > +module_exit(powernv_cpufreq_exit);
> >>
> >> Place these right after the routines without any blank lines in
> > between.
> >
> > Is this the new convention ?
> 
> Don't know I have been following this since long time, probably from the
> time I started with Mainline stuff.. I have seen many maintainers advising this
> as it make code more readable, specially if these routines are quite big..
> 
> Probably it isn't mentioned in coding guidelines but people follow it most of
> the times.

Ok. I was not aware of this. Good to know :-)
> 

--
Thanks and Regards
gautham.
viresh kumar - March 21, 2014, 1:34 p.m.
On 21 March 2014 18:53, Gautham R Shenoy <ego@linux.vnet.ibm.com> wrote:
> Ok, I had based my code on linus's git tree. I checked the 'pm-cpufreq'

Always check his bleeding-edge or linux-next branch.

> branch of Rafael's 'linux-pm' tree and freq_table.c contains the
> following code snippet in show_available_frequencies:
>
>         if (show_boost ^ (table[i].driver_data == CPUFREQ_BOOST_FREQ))
>                         continue;
>
> I suspect we're talking about different code bases. So could you
> please tell me which one should I be looking at ?

Okay, there is some problem here. I will check how can I simplify this..
And yes, you were correct.

>> Because cpufreq_frequency_table_get_attr() and
>> cpufreq_frequency_table_put_attr() don't exist anymore :)
>
> Well, it does in the codebases that I was looking at. May be I've been
> looking at the wrong place.

Check the one I mentioned.
Vaidyanathan Srinivasan - March 21, 2014, 2:48 p.m.
* Gautham R Shenoy <ego@linux.vnet.ibm.com> [2014-03-21 16:13:17]:

> Hi Viresh,
> 
> On Fri, Mar 21, 2014 at 02:11:32PM +0530, Viresh Kumar wrote:
> > On Thu, Mar 20, 2014 at 5:40 PM, Gautham R. Shenoy
> > <ego@linux.vnet.ibm.com> wrote:
> > > From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
> > 
> > Hi Vaidy,
> > 

Hi Viresh,

Thanks for the detailed review.

> > > diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
> > > index 4b029c0..4ba1632 100644
> > > --- a/drivers/cpufreq/Kconfig
> > > +++ b/drivers/cpufreq/Kconfig
> > > @@ -48,6 +48,7 @@ config CPU_FREQ_STAT_DETAILS
> > >  choice
> > >         prompt "Default CPUFreq governor"
> > >         default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1100_CPUFREQ || ARM_SA1110_CPUFREQ
> > > +       default CPU_FREQ_DEFAULT_GOV_ONDEMAND if POWERNV_CPUFREQ
> > 
> > Probably we should remove SA1100's entry as well from here. This is
> > not the right way of doing it. Imagine 100 platforms having entries here.
> > If you want it, then select it from your platforms Kconfig.
> 
> Sure. Will move these bits and the other governor related bits to the
> Powernv Kconfig.
> 
> > > diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
> > > new file mode 100644
> > > index 0000000..ab1551f
> > > --- /dev/null
> > > +
> > > +#define pr_fmt(fmt)    "powernv-cpufreq: " fmt
> > > +
> > > +#include <linux/module.h>
> > > +#include <linux/cpufreq.h>
> > > +#include <linux/of.h>
> > > +#include <asm/cputhreads.h>
> > 
> > That's it? Sure?
> > 
> > Even if things compile for you, you must explicitly include all the
> > files on which
> > you depend.
> 
> Ok. 
> 
> > 
> > > +
> > > +       WARN_ON(len_ids != len_freqs);
> > > +       nr_pstates = min(len_ids, len_freqs) / sizeof(u32);
> > > +       WARN_ON(!nr_pstates);
> > 
> > Why do you want to continue here?
> 
> Good point. We might be better off exiting at this point. 

Yes, we could return here.  We do not generally come till this point
if the platform has a problem discovering PStates from device tree.
But there is no useful debug info after this point if nr_pstates is 0.

> > 
> > > +       pr_debug("NR PStates %d\n", nr_pstates);
> > > +       for (i = 0; i < nr_pstates; i++) {
> > > +               u32 id = be32_to_cpu(pstate_ids[i]);
> > > +               u32 freq = be32_to_cpu(pstate_freqs[i]);
> > > +
> > > +               pr_debug("PState id %d freq %d MHz\n", id, freq);
> > > +               powernv_freqs[i].driver_data = i;
> > 
> > I don't think you are using this field at all and this is the field you can
> > use for driver_data and so you can get rid of powernv_pstate_ids[ ].
> 
> Using driver_data to record powernv_pstate_ids won't work since
> powernv_pstate_ids can be negative. So a pstate_id -3 can be confused
> with CPUFREQ_BOOST_FREQ thereby not displaying the frequency
> corresponding to pstate id -3. So for now I think we will be retaining
> powernv_pstate_ids.

Yeah, I had the driver written using driver_data to store pstates.
Gautham found the bug that we are missing one PState when we match the
ID with CPUFREQ_BOOST_FREQ!

We did not know that you have taken care of those issues.  Ideally
I did expect that driver_data should not be touched by the framework.
Thanks for fixing that and allowing the back-end driver to use
driver_data.
 
> > 
> > > +               powernv_freqs[i].frequency = freq * 1000; /* kHz */
> > > +               powernv_pstate_ids[i] = id;
> > > +       }
> > > +       /* End of list marker entry */
> > > +       powernv_freqs[i].driver_data = 0;
> > 
> > Not required.
> 
> Ok.
> > 
> > > +       powernv_freqs[i].frequency = CPUFREQ_TABLE_END;
> > > +
> > > +       /* Print frequency table */
> > > +       for (i = 0; powernv_freqs[i].frequency != CPUFREQ_TABLE_END; i++)
> > > +               pr_debug("%d: %d\n", i, powernv_freqs[i].frequency);
> > 
> > You have already printed this table earlier..
> 
> Fair enough.
> 
> > 
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static struct freq_attr *powernv_cpu_freq_attr[] = {
> > > +       &cpufreq_freq_attr_scaling_available_freqs,
> > > +       NULL,
> > > +};
> > 
> > Can use this instead: cpufreq_generic_attr?
> 
> In this patch yes. But later patch introduces an additional attribute
> for displaying the nominal frequency. Will handle that part in a clean
> way in the next version.
> 
> > 
> > > +/* Helper routines */
> > > +
> > > +/* Access helpers to power mgt SPR */
> > > +
> > > +static inline unsigned long get_pmspr(unsigned long sprn)
> > 
> > Looks big enough not be inlined?
> 
> It is called from one function. It has been defined separately for
> readability. 

Let the compiler decide.  The code is straight forward :)

I wanted to keep this SPR operations in a separate function to
facilitate debug and also introduce any timing/sequence change here if
required.  Keeping this separate is helpful, if inlining is
successful, we get a bonus :)

> > 
> > > +{
> > > +       switch (sprn) {
> > > +       case SPRN_PMCR:
> > > +               return mfspr(SPRN_PMCR);
> > > +
> > > +       case SPRN_PMICR:
> > > +               return mfspr(SPRN_PMICR);
> > > +
> > > +       case SPRN_PMSR:
> > > +               return mfspr(SPRN_PMSR);
> > > +       }
> > > +       BUG();
> > > +}
> > > +
> > > +static inline void set_pmspr(unsigned long sprn, unsigned long val)
> > > +{
> > 
> > Same here..
> 
> Same reason as above.
> 
> > 
> > > +       switch (sprn) {
> > > +       case SPRN_PMCR:
> > > +               mtspr(SPRN_PMCR, val);
> > > +               return;
> > > +
> > > +       case SPRN_PMICR:
> > > +               mtspr(SPRN_PMICR, val);
> > > +               return;
> > > +
> > > +       case SPRN_PMSR:
> > > +               mtspr(SPRN_PMSR, val);
> > > +               return;
> > > +       }
> > > +       BUG();
> > > +}
> > > +
> > > +static void set_pstate(void *pstate)
> > > +{
> > > +       unsigned long val;
> > > +       unsigned long pstate_ul = *(unsigned long *) pstate;
> > 
> > Why not sending value only to this routine instead of pointer?
> 
> Well this function is called via an smp_call_function. so, cannot send
> a value :(
> 
> > 
> > > +
> > > +       val = get_pmspr(SPRN_PMCR);
> > > +       val = val & 0x0000ffffffffffffULL;
> > 
> > Maybe a blank line here?
> 
> Ok.
> 
> > 
> > > +       /* Set both global(bits 56..63) and local(bits 48..55) PStates */
> > > +       val = val | (pstate_ul << 56) | (pstate_ul << 48);
> > 
> > here as well?
> 
> Ok.
> > 
> 
> > > +       pr_debug("Setting cpu %d pmcr to %016lX\n", smp_processor_id(), val);
> > > +       set_pmspr(SPRN_PMCR, val);
> > > +}
> > > +
> > > +static int powernv_set_freq(cpumask_var_t cpus, unsigned int new_index)
> > > +{
> > > +       unsigned long val = (unsigned long) powernv_pstate_ids[new_index];
> > 
> > I think automatic type conversion will happen here.
> 
> Ok. Will fix this.

Most of the conversions are verbose mainly to help readability of the
required sign-extensions.  There is scope to make these concise.

> > 
> > > +
> > > +       /*
> > > +        * Use smp_call_function to send IPI and execute the
> > > +        * mtspr on target cpu.  We could do that without IPI
> > > +        * if current CPU is within policy->cpus (core)
> > > +        */
> > 
> > Hmm, interesting I also feel there are cases where this routine can
> > get called from other CPUs. Can you list those use cases where it can
> > happen? Governors will mostly call this from one of the CPUs present
> > in policy->cpus.
> 
> Consider the case when the governor is userspace and we are executing 
> 
>     # echo freqvalue  > 
>          /sys/devices/system/cpu/cpu<i>/cpufreq/scaling_setspeed 
> 
> from a cpu <j> which is not in policy->cpus of cpu i. 
> 
> 
> > > +static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
> > > +{
> > > +       int base, i;
> > > +
> > > +#ifdef CONFIG_SMP
> > 
> > What will break if you don't have this ifdef here? Without that as well
> > below code should work?
> > 
> > > +       base = cpu_first_thread_sibling(policy->cpu);
> > > +
> > > +       for (i = 0; i < threads_per_core; i++)
> > > +               cpumask_set_cpu(base + i, policy->cpus);
> > > +#endif
> > > +       policy->cpuinfo.transition_latency = 25000;
> > > +
> > > +       policy->cur = powernv_freqs[0].frequency;
> > > +       cpufreq_frequency_table_get_attr(powernv_freqs, policy->cpu);
> > 
> > This doesn't exist anymore.
> 
> Didn't get this comment!
> 
> > 
> > > +       return cpufreq_frequency_table_cpuinfo(policy, powernv_freqs);
> > 
> > Have you written this driver long time back? CPUFreq core has been
> > cleaned up heavily since last few kernel releases and I think there are
> > better helper routines available now.
> 
> Yup it was written quite a while ago. And yeah, CPUFreq has changed
> quite a bit since the last time I saw it :-)

Yes, the driver is more than a year old and based on even older
implementation that I had written.  I kept up until I got a compiler
warning or functional issue.  Definitely could not catch-up with your
cleanups :)

> > 
> > > +}
> > > +
> > > +static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
> > > +{
> > > +       cpufreq_frequency_table_put_attr(policy->cpu);
> > > +       return 0;
> > > +}
> > 
> > You don't need this..
> 
> Why not ?
> 
> > 
> > > +static int powernv_cpufreq_verify(struct cpufreq_policy *policy)
> > > +{
> > > +       return cpufreq_frequency_table_verify(policy, powernv_freqs);
> > > +}
> > 
> > use generic verify function pointer instead..
> > 
> > > +static int powernv_cpufreq_target(struct cpufreq_policy *policy,
> > > +                             unsigned int target_freq,
> > > +                             unsigned int relation)
> > 
> > use target_index() instead..
> > 
> > > +{
> > > +       int rc;
> > > +       struct cpufreq_freqs freqs;
> > > +       unsigned int new_index;
> > > +
> > > +       cpufreq_frequency_table_target(policy, powernv_freqs, target_freq,
> > > +                                      relation, &new_index);
> > > +
> > > +       freqs.old = policy->cur;
> > > +       freqs.new = powernv_freqs[new_index].frequency;
> > > +       freqs.cpu = policy->cpu;
> > > +
> > > +       mutex_lock(&freq_switch_mutex);
> > 
> > Why do you need this lock for?
> 
> I guess it was to serialize accesses to PMCR. But Srivatsa's patch
> converts this to a per-core lock which probably is no longer required
> after your cpufreq_freq_transition_begin/end() patch series.

Right.  Frequency transitions are per-core, the h/w SPRs are per-core.
We need to prevent threads from colliding on h/w SPR access.  We do
make the lock per-core later in the series as mentioned above.

Gautham has addressed most comments.

Thanks,
Vaidy
Gautham R. Shenoy - March 21, 2014, 2:54 p.m.
On Fri, Mar 21, 2014 at 04:24:27PM +0530, Viresh Kumar wrote:
> >> > +static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
> >> > +{
> >> > +       int base, i;
> >> > +
> >> > +       base = cpu_first_thread_sibling(policy->cpu);
> >> > +
> >> > +       for (i = 0; i < threads_per_core; i++)
> >> > +               cpumask_set_cpu(base + i, policy->cpus);
> >> > +       policy->cpuinfo.transition_latency = 25000;
> >> > +
> >> > +       policy->cur = powernv_freqs[0].frequency;
> >> > +       cpufreq_frequency_table_get_attr(powernv_freqs, policy->cpu);
> >>
> >> This doesn't exist anymore.

Ok, I guess the right thing to do at this point is call

   cpufreq_table_validate_and_show(policy, powernv_freqs);

Will fix the code to take care of this.
--
Thanks and Regards
gautham.
viresh kumar - March 22, 2014, 3:43 a.m.
On 21 March 2014 20:18, Vaidyanathan Srinivasan
<svaidy@linux.vnet.ibm.com> wrote:
> Yeah, I had the driver written using driver_data to store pstates.
> Gautham found the bug that we are missing one PState when we match the
> ID with CPUFREQ_BOOST_FREQ!

I see..

> We did not know that you have taken care of those issues.  Ideally
> I did expect that driver_data should not be touched by the framework.
> Thanks for fixing that and allowing the back-end driver to use
> driver_data.

No, I haven't fixed anything yet. And this piece of code still exists.
I will see if I can get this fixed, by that time you can continue the
way your code is there in this version.
viresh kumar - March 22, 2014, 3:43 a.m.
On 21 March 2014 20:24, Gautham R Shenoy <ego@linux.vnet.ibm.com> wrote:
> Ok, I guess the right thing to do at this point is call
>
>    cpufreq_table_validate_and_show(policy, powernv_freqs);
>
> Will fix the code to take care of this.

Yes.

Patch

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 90c06ec..84f92ca 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -271,6 +271,10 @@ 
 #define SPRN_HSRR1	0x13B	/* Hypervisor Save/Restore 1 */
 #define SPRN_IC		0x350	/* Virtual Instruction Count */
 #define SPRN_VTB	0x351	/* Virtual Time Base */
+#define SPRN_PMICR	0x354   /* Power Management Idle Control Reg */
+#define SPRN_PMSR	0x355   /* Power Management Status Reg */
+#define SPRN_PMCR	0x374	/* Power Management Control Register */
+
 /* HFSCR and FSCR bit numbers are the same */
 #define FSCR_TAR_LG	8	/* Enable Target Address Register */
 #define FSCR_EBB_LG	7	/* Enable Event Based Branching */
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 895e8a2..1fe12b1 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -11,6 +11,7 @@  config PPC_POWERNV
 	select PPC_UDBG_16550
 	select PPC_SCOM
 	select ARCH_RANDOM
+	select CPU_FREQ
 	default y
 
 config PPC_POWERNV_RTAS
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 4b029c0..4ba1632 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -48,6 +48,7 @@  config CPU_FREQ_STAT_DETAILS
 choice
 	prompt "Default CPUFreq governor"
 	default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1100_CPUFREQ || ARM_SA1110_CPUFREQ
+	default CPU_FREQ_DEFAULT_GOV_ONDEMAND if POWERNV_CPUFREQ
 	default CPU_FREQ_DEFAULT_GOV_PERFORMANCE
 	help
 	  This option sets which CPUFreq governor shall be loaded at
diff --git a/drivers/cpufreq/Kconfig.powerpc b/drivers/cpufreq/Kconfig.powerpc
index ca0021a..93f8689 100644
--- a/drivers/cpufreq/Kconfig.powerpc
+++ b/drivers/cpufreq/Kconfig.powerpc
@@ -54,3 +54,16 @@  config PPC_PASEMI_CPUFREQ
 	help
 	  This adds the support for frequency switching on PA Semi
 	  PWRficient processors.
+
+config POWERNV_CPUFREQ
+       tristate "CPU frequency scaling for IBM POWERNV platform"
+       depends on PPC_POWERNV
+       select CPU_FREQ_GOV_PERFORMANCE
+       select CPU_FREQ_GOV_POWERSAVE
+       select CPU_FREQ_GOV_USERSPACE
+       select CPU_FREQ_GOV_ONDEMAND
+       select CPU_FREQ_GOV_CONSERVATIVE
+       default y
+       help
+	 This adds support for CPU frequency switching on IBM POWERNV
+	 platform
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 7494565..0dbb963 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -86,6 +86,7 @@  obj-$(CONFIG_PPC_CORENET_CPUFREQ)   += ppc-corenet-cpufreq.o
 obj-$(CONFIG_CPU_FREQ_PMAC)		+= pmac32-cpufreq.o
 obj-$(CONFIG_CPU_FREQ_PMAC64)		+= pmac64-cpufreq.o
 obj-$(CONFIG_PPC_PASEMI_CPUFREQ)	+= pasemi-cpufreq.o
+obj-$(CONFIG_POWERNV_CPUFREQ)		+= powernv-cpufreq.o
 
 ##################################################################################
 # Other platform drivers
diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
new file mode 100644
index 0000000..ab1551f
--- /dev/null
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -0,0 +1,277 @@ 
+/*
+ * POWERNV cpufreq driver for the IBM POWER processors
+ *
+ * (C) Copyright IBM 2014
+ *
+ * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define pr_fmt(fmt)	"powernv-cpufreq: " fmt
+
+#include <linux/module.h>
+#include <linux/cpufreq.h>
+#include <linux/of.h>
+#include <asm/cputhreads.h>
+
+/* FIXME: Make this per-core */
+static DEFINE_MUTEX(freq_switch_mutex);
+
+#define POWERNV_MAX_PSTATES	256
+
+static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
+static int powernv_pstate_ids[POWERNV_MAX_PSTATES+1];
+
+/*
+ * Initialize the freq table based on data obtained
+ * from the firmware passed via device-tree
+ */
+
+static int init_powernv_pstates(void)
+{
+	struct device_node *power_mgt;
+	int nr_pstates = 0;
+	int pstate_min, pstate_max, pstate_nominal;
+	const __be32 *pstate_ids, *pstate_freqs;
+	int i;
+	u32 len_ids, len_freqs;
+
+	power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
+	if (!power_mgt) {
+		pr_warn("power-mgt node not found\n");
+		return -ENODEV;
+	}
+
+	if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) {
+		pr_warn("ibm,pstate-min node not found\n");
+		return -ENODEV;
+	}
+
+	if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) {
+		pr_warn("ibm,pstate-max node not found\n");
+		return -ENODEV;
+	}
+
+	if (of_property_read_u32(power_mgt, "ibm,pstate-nominal",
+				 &pstate_nominal)) {
+		pr_warn("ibm,pstate-nominal not found\n");
+		return -ENODEV;
+	}
+	pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
+		pstate_nominal, pstate_max);
+
+	pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids);
+	if (!pstate_ids) {
+		pr_warn("ibm,pstate-ids not found\n");
+		return -ENODEV;
+	}
+
+	pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz",
+				      &len_freqs);
+	if (!pstate_freqs) {
+		pr_warn("ibm,pstate-frequencies-mhz not found\n");
+		return -ENODEV;
+	}
+
+	WARN_ON(len_ids != len_freqs);
+	nr_pstates = min(len_ids, len_freqs) / sizeof(u32);
+	WARN_ON(!nr_pstates);
+
+	pr_debug("NR PStates %d\n", nr_pstates);
+	for (i = 0; i < nr_pstates; i++) {
+		u32 id = be32_to_cpu(pstate_ids[i]);
+		u32 freq = be32_to_cpu(pstate_freqs[i]);
+
+		pr_debug("PState id %d freq %d MHz\n", id, freq);
+		powernv_freqs[i].driver_data = i;
+		powernv_freqs[i].frequency = freq * 1000; /* kHz */
+		powernv_pstate_ids[i] = id;
+	}
+	/* End of list marker entry */
+	powernv_freqs[i].driver_data = 0;
+	powernv_freqs[i].frequency = CPUFREQ_TABLE_END;
+
+	/* Print frequency table */
+	for (i = 0; powernv_freqs[i].frequency != CPUFREQ_TABLE_END; i++)
+		pr_debug("%d: %d\n", i, powernv_freqs[i].frequency);
+
+	return 0;
+}
+
+static struct freq_attr *powernv_cpu_freq_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	NULL,
+};
+
+/* Helper routines */
+
+/* Access helpers to power mgt SPR */
+
+static inline unsigned long get_pmspr(unsigned long sprn)
+{
+	switch (sprn) {
+	case SPRN_PMCR:
+		return mfspr(SPRN_PMCR);
+
+	case SPRN_PMICR:
+		return mfspr(SPRN_PMICR);
+
+	case SPRN_PMSR:
+		return mfspr(SPRN_PMSR);
+	}
+	BUG();
+}
+
+static inline void set_pmspr(unsigned long sprn, unsigned long val)
+{
+	switch (sprn) {
+	case SPRN_PMCR:
+		mtspr(SPRN_PMCR, val);
+		return;
+
+	case SPRN_PMICR:
+		mtspr(SPRN_PMICR, val);
+		return;
+
+	case SPRN_PMSR:
+		mtspr(SPRN_PMSR, val);
+		return;
+	}
+	BUG();
+}
+
+static void set_pstate(void *pstate)
+{
+	unsigned long val;
+	unsigned long pstate_ul = *(unsigned long *) pstate;
+
+	val = get_pmspr(SPRN_PMCR);
+	val = val & 0x0000ffffffffffffULL;
+	/* Set both global(bits 56..63) and local(bits 48..55) PStates */
+	val = val | (pstate_ul << 56) | (pstate_ul << 48);
+	pr_debug("Setting cpu %d pmcr to %016lX\n", smp_processor_id(), val);
+	set_pmspr(SPRN_PMCR, val);
+}
+
+static int powernv_set_freq(cpumask_var_t cpus, unsigned int new_index)
+{
+	unsigned long val = (unsigned long) powernv_pstate_ids[new_index];
+
+	/*
+	 * Use smp_call_function to send IPI and execute the
+	 * mtspr on target cpu.  We could do that without IPI
+	 * if current CPU is within policy->cpus (core)
+	 */
+
+	val = val & 0xFF;
+	smp_call_function_any(cpus, set_pstate, &val, 1);
+	return 0;
+}
+
+static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+	int base, i;
+
+#ifdef CONFIG_SMP
+	base = cpu_first_thread_sibling(policy->cpu);
+
+	for (i = 0; i < threads_per_core; i++)
+		cpumask_set_cpu(base + i, policy->cpus);
+#endif
+	policy->cpuinfo.transition_latency = 25000;
+
+	policy->cur = powernv_freqs[0].frequency;
+	cpufreq_frequency_table_get_attr(powernv_freqs, policy->cpu);
+	return cpufreq_frequency_table_cpuinfo(policy, powernv_freqs);
+}
+
+static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+	cpufreq_frequency_table_put_attr(policy->cpu);
+	return 0;
+}
+
+static int powernv_cpufreq_verify(struct cpufreq_policy *policy)
+{
+	return cpufreq_frequency_table_verify(policy, powernv_freqs);
+}
+
+static int powernv_cpufreq_target(struct cpufreq_policy *policy,
+			      unsigned int target_freq,
+			      unsigned int relation)
+{
+	int rc;
+	struct cpufreq_freqs freqs;
+	unsigned int new_index;
+
+	cpufreq_frequency_table_target(policy, powernv_freqs, target_freq,
+				       relation, &new_index);
+
+	freqs.old = policy->cur;
+	freqs.new = powernv_freqs[new_index].frequency;
+	freqs.cpu = policy->cpu;
+
+	mutex_lock(&freq_switch_mutex);
+	cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE);
+
+	pr_debug("setting frequency for cpu %d to %d kHz index %d pstate %d",
+		 policy->cpu,
+		 powernv_freqs[new_index].frequency,
+		 new_index,
+		 powernv_pstate_ids[new_index]);
+
+	rc = powernv_set_freq(policy->cpus, new_index);
+
+	cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE);
+	mutex_unlock(&freq_switch_mutex);
+
+	return rc;
+}
+
+static struct cpufreq_driver powernv_cpufreq_driver = {
+	.verify		= powernv_cpufreq_verify,
+	.target		= powernv_cpufreq_target,
+	.init		= powernv_cpufreq_cpu_init,
+	.exit		= powernv_cpufreq_cpu_exit,
+	.name		= "powernv-cpufreq",
+	.flags		= CPUFREQ_CONST_LOOPS,
+	.attr		= powernv_cpu_freq_attr,
+};
+
+static int __init powernv_cpufreq_init(void)
+{
+	int rc = 0;
+
+	/* Discover pstates from device tree and init */
+
+	rc = init_powernv_pstates();
+
+	if (rc) {
+		pr_info("powernv-cpufreq disabled\n");
+		return rc;
+	}
+
+	rc = cpufreq_register_driver(&powernv_cpufreq_driver);
+	return rc;
+}
+
+static void __exit powernv_cpufreq_exit(void)
+{
+	cpufreq_unregister_driver(&powernv_cpufreq_driver);
+}
+
+module_init(powernv_cpufreq_init);
+module_exit(powernv_cpufreq_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");