[v2,2/2] cpufreq: powernv: Ramp-down global pstate slower than local-pstate

Message ID 1460701739-31549-3-git-send-email-akshay.adiga@linux.vnet.ibm.com (mailing list archive)
State Not Applicable

Commit Message

Akshay Adiga April 15, 2016, 6:28 a.m. UTC
The frequency transition latency from pmin to pmax is observed to be on
the order of a few milliseconds, which typically incurs a performance
penalty during sudden frequency ramp-up requests.

This patch set solves this problem by using an entity called "global
pstates". The global pstate is a chip-level entity, so the global entity
(voltage) is managed across the cores. The local pstate is a core-level
entity, so the local entity (frequency) is managed across threads.
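
For illustration only (not part of this patch), a minimal sketch of how the
two values end up in the PMCR SPR, mirroring the set_pstate() change further
below: the global pstate goes into bits 56..63 and the local pstate into
bits 48..55. The encode_pmcr() helper name is hypothetical.

    /* Illustration: mirrors the PMCR encoding done by set_pstate() below */
    static unsigned long encode_pmcr(unsigned long pmcr_val,
                                     unsigned long gpstate_id,
                                     unsigned long pstate_id)
    {
            /* Clear bits 48..63, then set global (56..63) and local (48..55) */
            pmcr_val &= 0x0000FFFFFFFFFFFFULL;
            return pmcr_val | ((gpstate_id & 0xFF) << 56) |
                              ((pstate_id & 0xFF) << 48);
    }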

This patch brings down the global pstate at a slower rate than the local
pstate. Holding the global pstate higher than the local pstate makes
subsequent ramp-ups faster.

A per-policy structure is maintained to keep track of the global and
local pstate changes. The global pstate is brought down using a parabolic
equation. The ramp-down time to pmin is set to ~5 seconds. To make sure
that the global pstate is dropped at regular intervals, a timer is
queued every 2 seconds during the ramp-down phase, which eventually brings
the pstate down to the local pstate.
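
As an illustration (not part of the patch), the small standalone C program
below evaluates the same quadratic form used by ramp_down_percent() in the
patch, (ms * ms) >> 18, at each 2-second timer tick and at the 5120 ms cap:

    #include <stdio.h>

    /* Same form as ramp_down_percent() in the patch: (ms * ms) >> 18 */
    static unsigned int ramp_down_percent(unsigned int ms)
    {
            return (ms * ms) >> 18;
    }

    int main(void)
    {
            unsigned int t;

            /* Sample at each 2000 ms timer interval, then at the 5120 ms cap */
            for (t = 0; t < 5120; t += 2000)
                    printf("%4u ms -> %3u%%\n", t, ramp_down_percent(t));
            printf("5120 ms -> %3u%%\n", ramp_down_percent(5120));
            return 0;
    }

This prints 0%, 15% and 61% at 0, 2000 and 4000 ms, reaching 100% at 5120 ms,
i.e. the global pstate is dropped slowly at first and much faster towards the
end of the ramp-down window.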

Iozone results show a fairly consistent performance boost.
YCSB on Redis shows improved max latencies in most cases.

Iozone write/rewrite tests were run with file sizes of 200704 KB and 401408 KB
and different record sizes. The following table shows I/O operations per second
with and without the patch.

Iozone results (in ops/sec, mean over 3 iterations)
----------------------------------------------------------------------
filesize-recordsize-IOtype      with patch      without patch  %change
----------------------------------------------------------------------
200704-1-SeqWrite               1616532         1615425         0.06
200704-1-Rewrite                2423195         2303130         5.21
200704-2-SeqWrite               1628577         1602620         1.61
200704-2-Rewrite                2428264         2312154         5.02
200704-4-SeqWrite               1617605         1617182         0.02
200704-4-Rewrite                2430524         2351238         3.37
200704-8-SeqWrite               1629478         1600436         1.81
200704-8-Rewrite                2415308         2298136         5.09
200704-16-SeqWrite              1619632         1618250         0.08
200704-16-Rewrite               2396650         2352591         1.87
200704-32-SeqWrite              1632544         1598083         2.15
200704-32-Rewrite               2425119         2329743         4.09
200704-64-SeqWrite              1617812         1617235         0.03
200704-64-Rewrite               2402021         2321080         3.48
200704-128-SeqWrite             1631998         1600256         1.98
200704-128-Rewrite              2422389         2304954         5.09
200704-256-SeqWrite             1617065         1616962         0.00
200704-256-Rewrite              2432539         2301980         5.67
200704-512-SeqWrite             1632599         1598656         2.12
200704-512-Rewrite              2429270         2323676         4.54
200704-1024-SeqWrite            1618758         1616156         0.16
200704-1024-Rewrite             2431631         2315889         4.99
401408-1-SeqWrite               1631479         1608132         1.45
401408-1-Rewrite                2501550         2459409         1.71
401408-2-SeqWrite               1617095         1626069         -0.55
401408-2-Rewrite                2507557         2443621         2.61
401408-4-SeqWrite               1629601         1611869         1.10
401408-4-Rewrite                2505909         2462098         1.77
401408-8-SeqWrite               1617110         1626968         -0.60
401408-8-Rewrite                2512244         2456827         2.25
401408-16-SeqWrite              1632609         1609603         1.42
401408-16-Rewrite               2500792         2451405         2.01
401408-32-SeqWrite              1619294         1628167         -0.54
401408-32-Rewrite               2510115         2451292         2.39
401408-64-SeqWrite              1632709         1603746         1.80
401408-64-Rewrite               2506692         2433186         3.02
401408-128-SeqWrite             1619284         1627461         -0.50
401408-128-Rewrite              2518698         2453361         2.66
401408-256-SeqWrite             1634022         1610681         1.44
401408-256-Rewrite              2509987         2446328         2.60
401408-512-SeqWrite             1617524         1628016         -0.64
401408-512-Rewrite              2504409         2442899         2.51
401408-1024-SeqWrite            1629812         1611566         1.13
401408-1024-Rewrite             2507620         2442968         2.64

Tested with a YCSB workload (50% update + 50% read) over Redis with 1 million
records and 1 million operations. Each test was carried out with a target
operations-per-second rate and with persistence disabled.

Max latency (in us) (mean over 5 iterations)
---------------------------------------------------------------
op/s    Operation       with patch      without patch   %change
---------------------------------------------------------------
15000   Read            61480.6         50261.4         22.32
15000   cleanup         215.2           293.6           -26.70
15000   update          25666.2         25163.8         2.00

25000   Read            32626.2         89525.4         -63.56
25000   cleanup         292.2           263.0           11.10
25000   update          32293.4         90255.0         -64.22

35000   Read            34783.0         33119.0         5.02
35000   cleanup         321.2           395.8           -18.8
35000   update          36047.0         38747.8         -6.97

40000   Read            38562.2         42357.4         -8.96
40000   cleanup         371.8           384.6           -3.33
40000   update          27861.4         41547.8         -32.94

45000   Read            42271.0         88120.6         -52.03
45000   cleanup         263.6           383.0           -31.17
45000   update          29755.8         81359.0         -63.43

(test without target op/s)
47659   Read            83061.4         136440.6        -39.12
47659   cleanup         195.8           193.8           1.03
47659   update          73429.4         124971.8        -41.24

Signed-off-by: Akshay Adiga <akshay.adiga@linux.vnet.ibm.com>
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 261 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 252 insertions(+), 9 deletions(-)

Comments

Viresh Kumar April 18, 2016, 10:18 a.m. UTC | #1
On 15-04-16, 11:58, Akshay Adiga wrote:
>  static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
> -				unsigned long action, void *unused)
> +					   unsigned long action, void *unused)

Unrelated change.. better don't add such changes..

>  {
>  	int cpu;
>  	struct cpufreq_policy cpu_policy;
> @@ -603,15 +843,18 @@ static struct notifier_block powernv_cpufreq_opal_nb = {
>  static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
>  {
>  	struct powernv_smp_call_data freq_data;
> -
> +	struct global_pstate_info *gpstates = policy->driver_data;

You removed a blank line here and I feel the code looks better with
that.

>  	freq_data.pstate_id = powernv_pstate_info.min;
> +	freq_data.gpstate_id = powernv_pstate_info.min;
>  	smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
> +	del_timer_sync(&gpstates->timer);
>  }
>  
>  static struct cpufreq_driver powernv_cpufreq_driver = {
>  	.name		= "powernv-cpufreq",
>  	.flags		= CPUFREQ_CONST_LOOPS,
>  	.init		= powernv_cpufreq_cpu_init,
> +	.exit		= powernv_cpufreq_cpu_exit,
>  	.verify		= cpufreq_generic_frequency_table_verify,
>  	.target_index	= powernv_cpufreq_target_index,
>  	.get		= powernv_cpufreq_get,

None of the above comments are mandatory for you to fix..

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Akshay Adiga April 19, 2016, 9:55 a.m. UTC | #2
Hi Viresh,

On 04/18/2016 03:48 PM, Viresh Kumar wrote:
> On 15-04-16, 11:58, Akshay Adiga wrote:
>>   static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
>> -				unsigned long action, void *unused)
>> +					   unsigned long action, void *unused)
> Unrelated change.. better don't add such changes..

Posting out v3 without this unrelated change.

>>   {
>>   	int cpu;
>>   	struct cpufreq_policy cpu_policy;
>> @@ -603,15 +843,18 @@ static struct notifier_block powernv_cpufreq_opal_nb = {
>>   static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
>>   {
>>   	struct powernv_smp_call_data freq_data;
>> -
>> +	struct global_pstate_info *gpstates = policy->driver_data;
> You removed a blank line here and I feel the code looks better with
> that.
>
>>   	freq_data.pstate_id = powernv_pstate_info.min;
>> +	freq_data.gpstate_id = powernv_pstate_info.min;
>>   	smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
>> +	del_timer_sync(&gpstates->timer);
>>   }
>>   
>>   static struct cpufreq_driver powernv_cpufreq_driver = {
>>   	.name		= "powernv-cpufreq",
>>   	.flags		= CPUFREQ_CONST_LOOPS,
>>   	.init		= powernv_cpufreq_cpu_init,
>> +	.exit		= powernv_cpufreq_cpu_exit,
>>   	.verify		= cpufreq_generic_frequency_table_verify,
>>   	.target_index	= powernv_cpufreq_target_index,
>>   	.get		= powernv_cpufreq_get,
> None of the above comments are mandatory for you to fix..
>
> Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
>
Thanks for the Ack :)
Stewart Smith April 20, 2016, 5:18 p.m. UTC | #3
Akshay Adiga <akshay.adiga@linux.vnet.ibm.com> writes:
> Iozone results show fairly consistent performance boost.
> YCSB on redis shows improved Max latencies in most cases.

What about power consumption?

> Iozone write/rewite test were made with filesizes 200704Kb and 401408Kb
> with different record sizes . The following table shows IOoperations/sec
> with and without patch.

> Iozone Results ( in op/sec) ( mean over 3 iterations )

What's the variance between runs?

> Tested with YCSB workload (50% update + 50% read) over redis for 1 million
> records and 1 million operation. Each test was carried out with target
> operations per second and persistence disabled.
>
> Max-latency (in us)( mean over 5 iterations )

What's the variance between runs?

std dev? 95th percentile?

> ---------------------------------------------------------------
> op/s    Operation       with patch      without patch   %change
> ---------------------------------------------------------------
> 15000   Read            61480.6         50261.4         22.32

This seems like a fairly significant regression. Any idea why at 15K op/s
there's such a regression?

> --- a/drivers/cpufreq/powernv-cpufreq.c
> +++ b/drivers/cpufreq/powernv-cpufreq.c
> @@ -36,12 +36,56 @@
>  #include <asm/reg.h>
>  #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
>  #include <asm/opal.h>
> +#include <linux/timer.h>
>  
>  #define POWERNV_MAX_PSTATES	256
>  #define PMSR_PSAFE_ENABLE	(1UL << 30)
>  #define PMSR_SPR_EM_DISABLE	(1UL << 31)
>  #define PMSR_MAX(x)		((x >> 32) & 0xFF)
>  
> +#define MAX_RAMP_DOWN_TIME				5120
> +/*
> + * On an idle system we want the global pstate to ramp-down from max value
> to
> + * min over a span of ~5 secs. Also we want it to initially ramp-down
> slowly and
> + * then ramp-down rapidly later on.

Where does 5 seconds come from?

Why 5 and not 10, or not 2? Is there some time period inherit in
hardware or software that this is computed from?

> +/* Interval after which the timer is queued to bring down global pstate */
> +#define GPSTATE_TIMER_INTERVAL				2000

in ms?
Akshay Adiga April 22, 2016, 5:35 p.m. UTC | #4
Hi Stewart,

On 04/20/2016 03:41 AM, Stewart Smith wrote:
> Akshay Adiga<akshay.adiga@linux.vnet.ibm.com>  writes:
>> Iozone results show fairly consistent performance boost.
>> YCSB on redis shows improved Max latencies in most cases.
> What about power consumption?
>
>> Iozone write/rewite test were made with filesizes 200704Kb and 401408Kb
>> with different record sizes . The following table shows IOoperations/sec
>> with and without patch.
>> Iozone Results ( in op/sec) ( mean over 3 iterations )
> What's the variance between runs?

Re-ran the Iozone test.

w/o : without patch, w : with patch, stdev : standard deviation, avg : average

Iozone Results for ReWrite
+----------+--------+-----------+------------+-----------+-----------+---------+
| filesize | reclen |  w/o(avg) | w/o(stdev) |   w(avg)  |  w(stdev) | change% |
+----------+--------+-----------+------------+-----------+-----------+---------+
|  200704  |   1    |  795070.4 |  5813.51   |  805127.8 |  16872.59 |  1.264  |
|  200704  |   2    | 1448973.8 |  23058.79  | 1472098.8 |  18062.73 |  1.595  |
|  200704  |   4    |  2413444  |  85988.09  | 2562535.8 |  48649.35 |  6.177  |
|  200704  |   8    |  3827453  |  87710.52  | 3846888.2 |  86438.51 |  0.507  |
|  200704  |   16   | 5276096.8 |  73208.19  | 5425961.6 | 170774.75 |  2.840  |
|  200704  |   32   | 6742930.6 |  22789.45  | 6848904.4 | 257768.84 |  1.571  |
|  200704  |   64   | 7059479.2 | 300725.26  |  7373635  | 285106.90 |  4.450  |
|  200704  |  128   | 7097647.2 | 408171.71  |  7716500  | 266139.68 |  8.719  |
|  200704  |  256   |  6710810  | 314594.13  | 7661752.6 | 454049.27 |  14.170 |
|  200704  |  512   | 7034675.4 | 516152.97  | 7378583.2 | 613617.57 |  4.888  |
|  200704  |  1024  | 6265317.2 | 446101.38  | 7540629.6 | 294865.20 |  20.355 |
|  401408  |   1    |  802233.2 |  4263.92   |   817507  |  17727.09 |  1.903  |
|  401408  |   2    | 1461892.8 |  53678.12  |  1482872  |  45670.30 |  1.435  |
|  401408  |   4    | 2629686.8 |  24365.33  | 2673196.2 |  41576.78 |  1.654  |
|  401408  |   8    | 4156353.8 |  70636.85  | 4149330.4 |  56521.84 |  -0.168 |
|  401408  |   16   |  5895437  |  63762.43  | 5924167.4 | 396311.75 |  0.487  |
|  401408  |   32   | 7330826.6 | 167080.53  | 7785889.2 | 245434.99 |  6.207  |
|  401408  |   64   | 8298555.2 | 328890.89  | 8482416.8 | 249698.02 |  2.215  |
|  401408  |  128   | 8241108.6 | 490560.96  |  8686478  | 224816.21 |  5.404  |
|  401408  |  256   | 8038080.6 | 327704.66  | 8372327.4 | 210978.18 |  4.158  |
|  401408  |  512   | 8229523.4 | 371701.73  | 8654695.2 | 296715.07 |  5.166  |
+----------+--------+-----------+------------+-----------+-----------+---------+

Iozone results for Write
+----------+--------+-----------+------------+-----------+------------+---------+
| filesize | reclen |  w/o(avg) | w/o(stdev) |   w(avg)  |  w(stdev)  | change% |
+----------+--------+-----------+------------+-----------+------------+---------+
|  200704  |   1    |   575825  |  7,876.69  |  569388.4 |  6,699.59  |  -1.12  |
|  200704  |   2    | 1061229.4 |  7,589.50  | 1045193.2 | 19,785.85  |  -1.51  |
|  200704  |   4    |  1808329  | 13,040.67  | 1798138.4 | 50,367.19  |  -0.56  |
|  200704  |   8    | 2822953.4 | 19,948.89  | 2830305.6 | 21,202.77  |   0.26  |
|  200704  |   16   |  3976987  | 62,201.72  | 3909063.8 | 268,640.51 |  -1.71  |
|  200704  |   32   | 4959358.2 | 112,052.99 |  4760303  | 330,343.73 |  -4.01  |
|  200704  |   64   | 5452454.6 | 628,078.72 | 5692265.6 | 190,562.91 |   4.40  |
|  200704  |  128   | 5645246.8 | 10,455.85  | 5653330.2 | 18,153.76  |   0.14  |
|  200704  |  256   | 5855897.2 | 184,854.25 |  5402069  | 538,523.04 |  -7.75  |
|  200704  |  512   |  5515904  | 326,198.86 | 5639976.4 |  8,480.46  |   2.25  |
|  200704  |  1024  | 5471718.2 | 415,179.15 | 5399414.6 | 686,124.50 |  -1.32  |
|  401408  |   1    |  584786.6 |  1,256.59  |  587237.2 |  6,552.55  |   0.42  |
|  401408  |   2    | 1047018.8 | 26,567.72  | 1040926.8 | 16,495.93  |  -0.58  |
|  401408  |   4    | 1815465.8 | 16,426.92  | 1773652.6 | 38,169.02  |  -2.30  |
|  401408  |   8    |  2814285  | 27,374.53  |  2756608  | 96,689.13  |  -2.05  |
|  401408  |   16   |  3931646  | 129,648.79 | 3805793.4 | 141,368.40 |  -3.20  |
|  401408  |   32   | 4875353.4 | 146,203.70 |  4884084  | 265,484.01 |   0.18  |
|  401408  |   64   | 5479805.8 | 349,995.36 | 5565292.2 | 20,645.45  |   1.56  |
|  401408  |  128   |  5598486  | 195,680.23 |  5645125  | 62,017.38  |   0.83  |
|  401408  |  256   |  5803148  | 328,683.02 |  5657215  | 20,579.28  |  -2.51  |
|  401408  |  512   | 5565091.4 | 166,123.57 | 5725974.4 | 169,506.29 |   2.89  |
+----------+--------+-----------+------------+-----------+------------+---------+

>> Tested with YCSB workload (50% update + 50% read) over redis for 1 million
>> records and 1 million operation. Each test was carried out with target
>> operations per second and persistence disabled.
>>
>> Max-latency (in us)( mean over 5 iterations )
> What's the variance between runs?
>
> std dev? 95th percentile?
>
>> ---------------------------------------------------------------
>> op/s    Operation       with patch      without patch   %change
>> ---------------------------------------------------------------
>> 15000   Read            61480.6         50261.4         22.32
> This seems fairly significant regression. Any idea why at 15K op/s
> there's such a regression?

Just re-ran the test to collect power numbers.
Results for the YCSB+Redis test.
P95 : 95th percentile
P99 : 99th percentile

Power numbers are taken from one run of the YCSB+Redis test, which has 50% Read + 50% Update.
Maximum latency has clearly gone down in all cases, with less than a 5% increase in power.


+------------+----------+--------+------------+---------+---------+----------------+
|   Op/sec   | Testcase | AvgLat |   MaxLat   |   P95   |   P99   |     Power      |
+------------+----------+--------+------------+---------+---------+----------------+
|   15000    |   Read   |   -    |     -      |    -    |    -    |       -        |
| w/o patch  | Average  |  51.8  |  127903.0  |   55.8  |  145.2  |     602.7      |
| w/o patch  |  StdDev  | 5.692  | 105355.497 |  11.232 |   2.04  |      5.11      |
| with patch | Average  | 53.28  |  30834.2   |   72.2  |  151.2  |     629.01     |
| with patch |  StdDev  | 2.348  |  8928.323  |  15.74  |  3.544  |      3.25      |
|     -      | Change%  |  2.86  |   -75.89   |  29.39  |   4.13  | 4.36535589846  |
|   25000    |   Read   |   -    |     -      |    -    |    -    |       -        |
| w/o patch  | Average  | 53.78  |  123743.0  |   85.4  |  152.2  |     617.95     |
| w/o patch  |  StdDev  | 4.593  |  80224.53  |  5.886  |   4.49  |      1.32      |
| with patch | Average  | 49.65  |  84101.4   |   84.2  |  154.4  |     651.64     |
| with patch |  StdDev  | 1.658  | 72656.042  |  4.261  |  2.332  |      8.76      |
|     -      | Change%  | -7.68  |   -32.04   |  -1.41  |   1.45  | 5.4518974027   |
|   35000    |   Read   |   -    |     -      |    -    |    -    |       -        |
| w/o patch  | Average  | 56.07  |  57391.0   |   93.0  |  147.6  |     636.39     |
| w/o patch  |  StdDev  | 1.391  | 34494.839  |  1.789  |  2.871  |      2.92      |
| with patch | Average  | 56.46  |  39634.2   |   95.0  |  149.2  |     653.44     |
| with patch |  StdDev  | 3.174  |  6089.848  |  3.347  |   3.37  |      4.4       |
|     -      | Change%  |  0.69  |   -30.94   |   2.15  |   1.08  | 2.6791747199   |
|   40000    |   Read   |   -    |     -      |    -    |    -    |       -        |
| w/o patch  | Average  |  58.6  |  80427.8   |   97.2  |  147.4  |     636.85     |
| w/o patch  |  StdDev  | 1.105  | 59327.584  |  0.748  |  2.498  |      1.51      |
| with patch | Average  | 58.76  |  45291.8   |   97.2  |  149.0  |     656.12     |
| with patch |  StdDev  | 1.675  | 10486.954  |  2.482  |  3.406  |      6.97      |
|     -      | Change%  |  0.27  |   -43.69   |   0.0   |   1.09  | 3.0258302583   |
|   45000    |   Read   |   -    |     -      |    -    |    -    |       -        |
| w/o patch  | Average  | 69.02  |  120027.8  |  102.6  |  149.6  |     640.68     |
| w/o patch  |  StdDev  |  0.74  | 96288.811  |  1.855  |  1.497  |      7.65      |
| with patch | Average  | 69.65  |  98024.6   |  102.0  |  147.8  |     653.09     |
| with patch |  StdDev  |  1.14  | 78041.439  |   2.28  |  1.939  |      3.91      |
|     -      | Change%  |  0.92  |   -18.33   |  -0.58  |  -1.2   | 1.93700443279  |
|   15000    |  Update  |   -    |     -      |    -    |    -    |       -        |
| w/o patch  | Average  | 48.144 |  86847.0   |   52.4  |  189.2  |     602.7      |
| w/o patch  |  StdDev  | 5.971  | 41580.919  |  16.427 |  8.376  |      5.11      |
| with patch | Average  | 47.964 |  31106.2   |   58.4  |  182.2  |     629.01     |
| with patch |  StdDev  | 3.003  |  4906.179  |  7.088  |  6.177  |      3.25      |
|     -      | Change%  | -0.37  |   -64.18   |  11.45  |  -3.7   | -3.69978858351 |
|   25000    |  Update  |   -    |     -      |    -    |    -    |       -        |
| w/o patch  | Average  | 51.856 |  102808.6  |   87.0  |  182.4  |     617.95     |
| w/o patch  |  StdDev  | 5.721  | 79308.823  |  4.899  |  7.965  |      1.32      |
| with patch | Average  | 46.07  |  74623.0   |   86.2  |  183.0  |     651.64     |
| with patch |  StdDev  | 1.779  | 77511.229  |  4.069  |  7.014  |      8.76      |
|     -      | Change%  | -11.16 |   -27.42   |  -0.92  |   0.33  | 0.328947368421 |
|   35000    |  Update  |   -    |     -      |    -    |    -    |       -        |
| w/o patch  | Average  | 54.142 |  51074.2   |   93.6  |  181.8  |     636.39     |
| w/o patch  |  StdDev  | 1.671  | 36877.588  |  1.497  |  8.035  |      2.92      |
| with patch | Average  | 54.034 |  44731.8   |   94.4  |  184.4  |     653.44     |
| with patch |  StdDev  | 3.363  |  13400.4   |   1.02  |  7.172  |      4.4       |
|     -      | Change%  |  -0.2  |   -12.42   |   0.85  |   1.43  | 1.4301430143   |
|   40000    |  Update  |   -    |     -      |    -    |    -    |       -        |
| w/o patch  | Average  | 57.528 |  71672.6   |   98.4  |  184.8  |     636.85     |
| w/o patch  |  StdDev  | 1.111  | 63103.862  |  1.744  |  9.282  |      1.51      |
| with patch | Average  | 57.738 |  32101.4   |   98.0  |  186.4  |     656.12     |
| with patch |  StdDev  | 1.294  |  4481.801  |  1.673  |   7.71  |      6.97      |
|     -      | Change%  |  0.37  |   -55.21   |  -0.41  |   0.87  | 0.865800865801 |
|   45000    |  Update  |   -    |     -      |    -    |    -    |       -        |
| w/o patch  | Average  | 69.97  |  117183.0  |  105.4  |  182.4  |     640.68     |
| w/o patch  |  StdDev  | 0.925  | 99836.076  |   1.2   |  9.091  |      7.65      |
| with patch | Average  | 70.508 |  104175.0  |  103.2  |  185.4  |     653.09     |
| with patch |  StdDev  | 1.463  |  74438.13  |   1.47  |  7.915  |      3.91      |
|     -      | Change%  |  0.77  |   -11.1    |  -2.09  |   1.64  | 1.64473684211  |
+------------+----------+--------+------------+---------+---------+----------------+

>> --- a/drivers/cpufreq/powernv-cpufreq.c
>> +++ b/drivers/cpufreq/powernv-cpufreq.c
>> @@ -36,12 +36,56 @@
>>   #include <asm/reg.h>
>>   #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
>>   #include <asm/opal.h>
>> +#include <linux/timer.h>
>>   
>>   #define POWERNV_MAX_PSTATES	256
>>   #define PMSR_PSAFE_ENABLE	(1UL << 30)
>>   #define PMSR_SPR_EM_DISABLE	(1UL << 31)
>>   #define PMSR_MAX(x)		((x >> 32) & 0xFF)
>>   
>> +#define MAX_RAMP_DOWN_TIME				5120
>> +/*
>> + * On an idle system we want the global pstate to ramp-down from max value to
>> + * min over a span of ~5 secs. Also we want it to initially ramp-down slowly and
>> + * then ramp-down rapidly later on.
> Where does 5 seconds come from?
>
> Why 5 and not 10, or not 2? Is there some time period inherit in
> hardware or software that this is computed from?

  As global pstates are per-chip and there are at most 12 cores per chip, if the system is
  really idle, considering 5 seconds for each core, it should take 60 seconds for the chip to
  go to pmin.

>> +/* Interval after which the timer is queued to bring down global pstate */
>> +#define GPSTATE_TIMER_INTERVAL				2000
> in ms?

Yes, it's 2000 ms.

Patch

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index e2e2219..78388c0 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -36,12 +36,56 @@ 
 #include <asm/reg.h>
 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
 #include <asm/opal.h>
+#include <linux/timer.h>
 
 #define POWERNV_MAX_PSTATES	256
 #define PMSR_PSAFE_ENABLE	(1UL << 30)
 #define PMSR_SPR_EM_DISABLE	(1UL << 31)
 #define PMSR_MAX(x)		((x >> 32) & 0xFF)
 
+#define MAX_RAMP_DOWN_TIME				5120
+/*
+ * On an idle system we want the global pstate to ramp-down from max value to
+ * min over a span of ~5 secs. Also we want it to initially ramp-down slowly and
+ * then ramp-down rapidly later on.
+ *
+ * This gives a percentage rampdown for time elapsed in milliseconds.
+ * ramp_down_percentage = ((ms * ms) >> 18)
+ *			~= 3.8 * (sec * sec)
+ *
+ * At 0 ms	ramp_down_percent = 0
+ * At 5120 ms	ramp_down_percent = 100
+ */
+#define ramp_down_percent(time)		((time * time) >> 18)
+
+/* Interval after which the timer is queued to bring down global pstate */
+#define GPSTATE_TIMER_INTERVAL				2000
+
+/**
+ * struct global_pstate_info -	Per policy data structure to maintain history of
+ *				global pstates
+ * @highest_lpstate:		The local pstate from which we are ramping down
+ * @elapsed_time:		Time in ms spent in ramping down from
+ *				highest_lpstate
+ * @last_sampled_time:		Time from boot in ms when global pstates were
+ *				last set
+ * @last_lpstate,last_gpstate:	Last set values for local and global pstates
+ * @timer:			Is used for ramping down if cpu goes idle for
+ *				a long time with global pstate held high
+ * @gpstate_lock:		A spinlock to maintain synchronization between
+ *				routines called by the timer handler and
+ *				governer's target_index calls
+ */
+struct global_pstate_info {
+	int highest_lpstate;
+	unsigned int elapsed_time;
+	unsigned int last_sampled_time;
+	int last_lpstate;
+	int last_gpstate;
+	spinlock_t gpstate_lock;
+	struct timer_list timer;
+};
+
 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
 static bool rebooting, throttled, occ_reset;
 
@@ -94,6 +138,17 @@  static struct powernv_pstate_info {
 	int nr_pstates;
 } powernv_pstate_info;
 
+static inline void reset_gpstates(struct cpufreq_policy *policy)
+{
+	struct global_pstate_info *gpstates = policy->driver_data;
+
+	gpstates->highest_lpstate = 0;
+	gpstates->elapsed_time = 0;
+	gpstates->last_sampled_time = 0;
+	gpstates->last_lpstate = 0;
+	gpstates->last_gpstate = 0;
+}
+
 /*
  * Initialize the freq table based on data obtained
  * from the firmware passed via device-tree
@@ -285,6 +340,7 @@  static inline void set_pmspr(unsigned long sprn, unsigned long val)
 struct powernv_smp_call_data {
 	unsigned int freq;
 	int pstate_id;
+	int gpstate_id;
 };
 
 /*
@@ -343,19 +399,21 @@  static unsigned int powernv_cpufreq_get(unsigned int cpu)
  * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
  * on this CPU should be present in freq_data->pstate_id.
  */
-static void set_pstate(void *freq_data)
+static void set_pstate(void *data)
 {
 	unsigned long val;
-	unsigned long pstate_ul =
-		((struct powernv_smp_call_data *) freq_data)->pstate_id;
+	struct powernv_smp_call_data *freq_data = data;
+	unsigned long pstate_ul = freq_data->pstate_id;
+	unsigned long gpstate_ul = freq_data->gpstate_id;
 
 	val = get_pmspr(SPRN_PMCR);
 	val = val & 0x0000FFFFFFFFFFFFULL;
 
 	pstate_ul = pstate_ul & 0xFF;
+	gpstate_ul = gpstate_ul & 0xFF;
 
 	/* Set both global(bits 56..63) and local(bits 48..55) PStates */
-	val = val | (pstate_ul << 56) | (pstate_ul << 48);
+	val = val | (gpstate_ul << 56) | (pstate_ul << 48);
 
 	pr_debug("Setting cpu %d pmcr to %016lX\n",
 			raw_smp_processor_id(), val);
@@ -424,6 +482,110 @@  next:
 	}
 }
 
+/**
+ * calc_global_pstate - Calculate global pstate
+ * @elapsed_time:	Elapsed time in milliseconds
+ * @local_pstate:	New local pstate
+ * @highest_lpstate:	pstate from which its ramping down
+ *
+ * Finds the appropriate global pstate based on the pstate from which its
+ * ramping down and the time elapsed in ramping down. It follows a quadratic
+ * equation which ensures that it reaches ramping down to pmin in 5sec.
+ */
+static inline int calc_global_pstate(unsigned int elapsed_time,
+				     int highest_lpstate, int local_pstate)
+{
+	int pstate_diff;
+
+	/*
+	 * Using ramp_down_percent we get the percentage of rampdown
+	 * that we are expecting to be dropping. Difference between
+	 * highest_lpstate and powernv_pstate_info.min will give a absolute
+	 * number of how many pstates we will drop eventually by the end of
+	 * 5 seconds, then just scale it get the number pstates to be dropped.
+	 */
+	pstate_diff =  ((int)ramp_down_percent(elapsed_time) *
+			(highest_lpstate - powernv_pstate_info.min)) / 100;
+
+	/* Ensure that global pstate is >= to local pstate */
+	if (highest_lpstate - pstate_diff < local_pstate)
+		return local_pstate;
+	else
+		return highest_lpstate - pstate_diff;
+}
+
+static inline void  queue_gpstate_timer(struct global_pstate_info *gpstates)
+{
+	unsigned int timer_interval;
+
+	/*
+	 * Setting up timer to fire after GPSTATE_TIMER_INTERVAL ms, But
+	 * if it exceeds MAX_RAMP_DOWN_TIME ms for ramp down time.
+	 * Set timer such that it fires exactly at MAX_RAMP_DOWN_TIME
+	 * seconds of ramp down time.
+	 */
+	if ((gpstates->elapsed_time + GPSTATE_TIMER_INTERVAL)
+	     > MAX_RAMP_DOWN_TIME)
+		timer_interval = MAX_RAMP_DOWN_TIME - gpstates->elapsed_time;
+	else
+		timer_interval = GPSTATE_TIMER_INTERVAL;
+
+	mod_timer_pinned(&gpstates->timer, jiffies +
+			msecs_to_jiffies(timer_interval));
+}
+
+/**
+ * gpstate_timer_handler
+ *
+ * @data: pointer to cpufreq_policy on which timer was queued
+ *
+ * This handler brings down the global pstate closer to the local pstate
+ * according quadratic equation. Queues a new timer if it is still not equal
+ * to local pstate
+ */
+void gpstate_timer_handler(unsigned long data)
+{
+	struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
+	struct global_pstate_info *gpstates = policy->driver_data;
+	int gpstate_id;
+	unsigned int time_diff = jiffies_to_msecs(jiffies)
+					- gpstates->last_sampled_time;
+	struct powernv_smp_call_data freq_data;
+
+	if (!spin_trylock(&gpstates->gpstate_lock))
+		return;
+
+	gpstates->last_sampled_time += time_diff;
+	gpstates->elapsed_time += time_diff;
+	freq_data.pstate_id = gpstates->last_lpstate;
+
+	if ((gpstates->last_gpstate == freq_data.pstate_id) ||
+	    (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) {
+		gpstate_id = freq_data.pstate_id;
+		reset_gpstates(policy);
+		gpstates->highest_lpstate = freq_data.pstate_id;
+	} else {
+		gpstate_id = calc_global_pstate(gpstates->elapsed_time,
+						gpstates->highest_lpstate,
+						freq_data.pstate_id);
+	}
+
+	/*
+	 * If local pstate is equal to global pstate, rampdown is over
+	 * So timer is not required to be queued.
+	 */
+	if (gpstate_id != freq_data.pstate_id)
+		queue_gpstate_timer(gpstates);
+
+	freq_data.gpstate_id = gpstate_id;
+	gpstates->last_gpstate = freq_data.gpstate_id;
+	gpstates->last_lpstate = freq_data.pstate_id;
+
+	/* Timer may get migrated to a different cpu on cpu hot unplug */
+	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
+	spin_unlock(&gpstates->gpstate_lock);
+}
+
 /*
  * powernv_cpufreq_target_index: Sets the frequency corresponding to
  * the cpufreq table entry indexed by new_index on the cpus in the
@@ -433,6 +595,9 @@  static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
 					unsigned int new_index)
 {
 	struct powernv_smp_call_data freq_data;
+	unsigned int cur_msec, gpstate_id;
+	unsigned long flags;
+	struct global_pstate_info *gpstates = policy->driver_data;
 
 	if (unlikely(rebooting) && new_index != get_nominal_index())
 		return 0;
@@ -440,22 +605,70 @@  static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
 	if (!throttled)
 		powernv_cpufreq_throttle_check(NULL);
 
+	cur_msec = jiffies_to_msecs(get_jiffies_64());
+
+	spin_lock_irqsave(&gpstates->gpstate_lock, flags);
 	freq_data.pstate_id = powernv_freqs[new_index].driver_data;
 
+	if (!gpstates->last_sampled_time) {
+		gpstate_id = freq_data.pstate_id;
+		gpstates->highest_lpstate = freq_data.pstate_id;
+		goto gpstates_done;
+	}
+
+	if (gpstates->last_gpstate > freq_data.pstate_id) {
+		gpstates->elapsed_time += cur_msec -
+						 gpstates->last_sampled_time;
+
+		/*
+		 * If its has been ramping down for more than MAX_RAMP_DOWN_TIME
+		 * we should be resetting all global pstate related data. Set it
+		 * equal to local pstate to start fresh.
+		 */
+		if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
+			reset_gpstates(policy);
+			gpstates->highest_lpstate = freq_data.pstate_id;
+			gpstate_id = freq_data.pstate_id;
+		} else {
+		/* Elaspsed_time is less than 5 seconds, continue to rampdown */
+			gpstate_id = calc_global_pstate(gpstates->elapsed_time,
+							gpstates->highest_lpstate,
+							freq_data.pstate_id);
+		}
+	} else {
+		reset_gpstates(policy);
+		gpstates->highest_lpstate = freq_data.pstate_id;
+		gpstate_id = freq_data.pstate_id;
+	}
+
+	/*
+	 * If local pstate is equal to global pstate, rampdown is over
+	 * So timer is not required to be queued.
+	 */
+	if (gpstate_id != freq_data.pstate_id)
+		queue_gpstate_timer(gpstates);
+
+gpstates_done:
+	freq_data.gpstate_id = gpstate_id;
+	gpstates->last_sampled_time = cur_msec;
+	gpstates->last_gpstate = freq_data.gpstate_id;
+	gpstates->last_lpstate = freq_data.pstate_id;
+
 	/*
 	 * Use smp_call_function to send IPI and execute the
 	 * mtspr on target CPU.  We could do that without IPI
 	 * if current CPU is within policy->cpus (core)
 	 */
 	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
-
+	spin_unlock_irqrestore(&gpstates->gpstate_lock, flags);
 	return 0;
 }
 
 static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-	int base, i;
+	int base, i, ret;
 	struct kernfs_node *kn;
+	struct global_pstate_info *gpstates;
 
 	base = cpu_first_thread_sibling(policy->cpu);
 
@@ -475,11 +688,38 @@  static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	} else {
 		kernfs_put(kn);
 	}
-	return cpufreq_table_validate_and_show(policy, powernv_freqs);
+
+	gpstates =  kzalloc(sizeof(*gpstates), GFP_KERNEL);
+	if (!gpstates)
+		return -ENOMEM;
+
+	policy->driver_data = gpstates;
+
+	/* initialize timer */
+	init_timer_deferrable(&gpstates->timer);
+	gpstates->timer.data = (unsigned long)policy;
+	gpstates->timer.function = gpstate_timer_handler;
+	gpstates->timer.expires = jiffies +
+				msecs_to_jiffies(GPSTATE_TIMER_INTERVAL);
+	spin_lock_init(&gpstates->gpstate_lock);
+	ret = cpufreq_table_validate_and_show(policy, powernv_freqs);
+
+	if (ret < 0)
+		kfree(policy->driver_data);
+
+	return ret;
+}
+
+static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+	/* timer is deleted in cpufreq_cpu_stop() */
+	kfree(policy->driver_data);
+
+	return 0;
 }
 
 static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
-				unsigned long action, void *unused)
+					   unsigned long action, void *unused)
 {
 	int cpu;
 	struct cpufreq_policy cpu_policy;
@@ -603,15 +843,18 @@  static struct notifier_block powernv_cpufreq_opal_nb = {
 static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
 {
 	struct powernv_smp_call_data freq_data;
-
+	struct global_pstate_info *gpstates = policy->driver_data;
 	freq_data.pstate_id = powernv_pstate_info.min;
+	freq_data.gpstate_id = powernv_pstate_info.min;
 	smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
+	del_timer_sync(&gpstates->timer);
 }
 
 static struct cpufreq_driver powernv_cpufreq_driver = {
 	.name		= "powernv-cpufreq",
 	.flags		= CPUFREQ_CONST_LOOPS,
 	.init		= powernv_cpufreq_cpu_init,
+	.exit		= powernv_cpufreq_cpu_exit,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= powernv_cpufreq_target_index,
 	.get		= powernv_cpufreq_get,