diff mbox

cpufreq: powernv: Redesign the presentation of throttle notification

Message ID 1450030657-9121-1-git-send-email-shilpa.bhat@linux.vnet.ibm.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Shilpasri G Bhat Dec. 13, 2015, 6:17 p.m. UTC
Replace the throttling event console messages with the perf trace event
"power:powernv_throttle" and with throttle counter stats which are
exported in sysfs. The newly added sysfs files are as follows:

1)/sys/devices/system/node/node0/throttle_frequencies
  This gives the throttle stats for each of the available frequencies.
  The throttle stat of a frequency is the total number of times the max
  frequency was reduced to that frequency.
  # cat /sys/devices/system/node/node0/throttle_frequencies
  4023000 0
  3990000 0
  3956000 1
  3923000 0
  3890000 0
  3857000 2
  3823000 0
  3790000 0
  3757000 2
  3724000 1
  3690000 1
  ...

2)/sys/devices/system/node/node0/throttle_reasons
  This gives the stats for each of the supported throttle reasons,
  i.e. the total number of times the frequency was throttled due
  to each of the reasons.
  # cat /sys/devices/system/node/node0/throttle_reasons
  No throttling 7
  Power Cap 0
  Processor Over Temperature 7
  Power Supply Failure 0
  Over Current 0
  OCC Reset 0

3)/sys/devices/system/node/node0/throttle_stat
  This gives the total number of throttle events that occurred in the
  turbo range of frequencies and in the non-turbo (below nominal) range
  of frequencies.
  # cat /sys/devices/system/node/node0/throttle_stat
  Turbo 7
  Nominal 0

Signed-off-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 186 +++++++++++++++++++++++++++++---------
 include/trace/events/power.h      |  22 +++++
 2 files changed, 166 insertions(+), 42 deletions(-)

Comments

Paul A. Clarke Dec. 14, 2015, 9:29 p.m. UTC | #1
On 12/13/2015 12:17 PM, Shilpasri G Bhat wrote:
> Replace the throttling event console messages to perf trace event
> "power:powernv_throttle" and throttle counter stats which are
> exported in sysfs. The newly added sysfs files are as follows:
>
> 1)/sys/devices/system/node/node0/throttle_frequencies
>    This gives the throttle stats for each of the available frequencies.
>    The throttle stat of a frequency is the total number of times the max
>    frequency was reduced to that frequency.
>    # cat /sys/devices/system/node/node0/throttle_frequencies
>    4023000 0
>    3990000 0
>    3956000 1
>    3923000 0
>    3890000 0
>    3857000 2
>    3823000 0
>    3790000 0
>    3757000 2
>    3724000 1
>    3690000 1
>    ...

Is this data useful?  It seems like "elapsed time" at each frequency might be 
more useful, if any.

> 2)/sys/devices/system/node/node0/throttle_reasons
>    This gives the stats for each of the supported throttle reasons.
>    This gives the total number of times the frequency was throttled due
>    to each of the reasons.
>    # cat /sys/devices/system/node/node0/throttle_reasons
>    No throttling 7
>    Power Cap 0
>    Processor Over Temperature 7
>    Power Supply Failure 0
>    Over Current 0
>    OCC Reset 0
>
> 3)/sys/devices/system/node/node0/throttle_stat
>    This gives the total number of throttle events occurred in turbo
>    range of frequencies and non-turbo(below nominal) range of
>    frequencies.

non-turbo should read "at or below nominal".  Maybe "sub-turbo" is a better 
term(?)

>    # cat /sys/devices/system/node/node0/throttle_stat
>    Turbo 7
>    Nominal 0

Should this read "Non-turbo" or "Sub-turbo" instead of "Nominal", since the
events could well occur when already operating below nominal?

> Signed-off-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
> ---
>   drivers/cpufreq/powernv-cpufreq.c | 186 +++++++++++++++++++++++++++++---------
>   include/trace/events/power.h      |  22 +++++
>   2 files changed, 166 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
> index cb50138..bdde9d6 100644
> --- a/drivers/cpufreq/powernv-cpufreq.c
> +++ b/drivers/cpufreq/powernv-cpufreq.c
> @@ -28,6 +28,9 @@
>   #include <linux/of.h>
>   #include <linux/reboot.h>
>   #include <linux/slab.h>
> +#include <trace/events/power.h>
> +#include <linux/device.h>
> +#include <linux/node.h>
>
>   #include <asm/cputhreads.h>
>   #include <asm/firmware.h>
> @@ -43,12 +46,27 @@
>   static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
>   static bool rebooting, throttled, occ_reset;
>
> +static char throttle_reason[][30] = {
> +					"No throttling",
> +					"Power Cap",
> +					"Processor Over Temperature",
> +					"Power Supply Failure",
> +					"Over Current",
> +					"OCC Reset"
> +				     };

I'm curious if this would be slightly more efficiently implemented as:
static const char *throttle_reason[] = { ... };

Do you need 30 characters per string for a reason?

Regardless, it should be const.

[...]
--
PC
Shilpasri G Bhat Jan. 1, 2016, 10:40 p.m. UTC | #2
Hi,

On 12/15/2015 02:59 AM, Paul Clarke wrote:
> On 12/13/2015 12:17 PM, Shilpasri G Bhat wrote:
>> Replace the throttling event console messages to perf trace event
>> "power:powernv_throttle" and throttle counter stats which are
>> exported in sysfs. The newly added sysfs files are as follows:
>>
>> 1)/sys/devices/system/node/node0/throttle_frequencies
>>    This gives the throttle stats for each of the available frequencies.
>>    The throttle stat of a frequency is the total number of times the max
>>    frequency was reduced to that frequency.
>>    # cat /sys/devices/system/node/node0/throttle_frequencies
>>    4023000 0
>>    3990000 0
>>    3956000 1
>>    3923000 0
>>    3890000 0
>>    3857000 2
>>    3823000 0
>>    3790000 0
>>    3757000 2
>>    3724000 1
>>    3690000 1
>>    ...
> 
> Is this data useful?  It seems like "elapsed time" at each frequency might be
> more useful, if any.
> 

Yes, elapsed time is the more useful data here. But the concern is with the
accuracy of the kernel's measurement/observation of elapsed time. OCC can
throttle/unthrottle the frequency at a granularity of 250us. Although OCC
updates the throttle status in the HOMER region immediately, there may be a
delay before the opal-poller propagates this message to the driver.

So instead we might want OCC to give us the throttled elapsed-time stat for each
frequency, so that the opal-poller/driver can take a snapshot of this info every
n seconds.

>> 2)/sys/devices/system/node/node0/throttle_reasons
>>    This gives the stats for each of the supported throttle reasons.
>>    This gives the total number of times the frequency was throttled due
>>    to each of the reasons.
>>    # cat /sys/devices/system/node/node0/throttle_reasons
>>    No throttling 7
>>    Power Cap 0
>>    Processor Over Temperature 7
>>    Power Supply Failure 0
>>    Over Current 0
>>    OCC Reset 0
>>
>> 3)/sys/devices/system/node/node0/throttle_stat
>>    This gives the total number of throttle events occurred in turbo
>>    range of frequencies and non-turbo(below nominal) range of
>>    frequencies.
> 
> non-turbo should read "at or below nominal".  Maybe "sub-turbo" is a better term(?)
> 
>>    # cat /sys/devices/system/node/node0/throttle_stat
>>    Turbo 7
>>    Nominal 0
> 
> Should this read "Non-turbo" or "Sub-turbo" instead of "Nominal", since the
> events could well occur when already operating below nominal.
> 

Agreed. Applied 'sub-turbo' in v2.

>> Signed-off-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
>> ---
>>   drivers/cpufreq/powernv-cpufreq.c | 186 +++++++++++++++++++++++++++++---------
>>   include/trace/events/power.h      |  22 +++++
>>   2 files changed, 166 insertions(+), 42 deletions(-)
>>
>> diff --git a/drivers/cpufreq/powernv-cpufreq.c
>> b/drivers/cpufreq/powernv-cpufreq.c
>> index cb50138..bdde9d6 100644
>> --- a/drivers/cpufreq/powernv-cpufreq.c
>> +++ b/drivers/cpufreq/powernv-cpufreq.c
>> @@ -28,6 +28,9 @@
>>   #include <linux/of.h>
>>   #include <linux/reboot.h>
>>   #include <linux/slab.h>
>> +#include <trace/events/power.h>
>> +#include <linux/device.h>
>> +#include <linux/node.h>
>>
>>   #include <asm/cputhreads.h>
>>   #include <asm/firmware.h>
>> @@ -43,12 +46,27 @@
>>   static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
>>   static bool rebooting, throttled, occ_reset;
>>
>> +static char throttle_reason[][30] = {
>> +                    "No throttling",
>> +                    "Power Cap",
>> +                    "Processor Over Temperature",
>> +                    "Power Supply Failure",
>> +                    "Over Current",
>> +                    "OCC Reset"
>> +                     };
> 
> I'm curious if this would be slightly more efficiently implemented as:
> static const char *throttle_reason[] = { ... };
> 
> Do you need 30 characters per string for a reason?
> 
> Regardless, it should be const.

Modified the declaration in the v2 version of the patch.

> 
> [...]
> -- 
> PC

Thanks and Regards,
Shilpa
diff mbox

Patch

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index cb50138..bdde9d6 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -28,6 +28,9 @@ 
 #include <linux/of.h>
 #include <linux/reboot.h>
 #include <linux/slab.h>
+#include <trace/events/power.h>
+#include <linux/device.h>
+#include <linux/node.h>
 
 #include <asm/cputhreads.h>
 #include <asm/firmware.h>
@@ -43,12 +46,27 @@ 
 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
 static bool rebooting, throttled, occ_reset;
 
+static char throttle_reason[][30] = {
+					"No throttling",
+					"Power Cap",
+					"Processor Over Temperature",
+					"Power Supply Failure",
+					"Over Current",
+					"OCC Reset"
+				     };
+
 static struct chip {
 	unsigned int id;
 	bool throttled;
 	cpumask_t mask;
 	struct work_struct throttle;
 	bool restore;
+	/* Pmax throttle stats */
+	int throt_reason;
+	int throt_turbo;
+	int throt_nominal;
+	int reason[OCC_MAX_THROTTLE_STATUS + 1];
+	int *pstate_stat;
 } *chips;
 
 static int nr_chips;
@@ -309,40 +327,54 @@  static inline unsigned int get_nominal_index(void)
 	return powernv_pstate_info.max - powernv_pstate_info.nominal;
 }
 
-static void powernv_cpufreq_throttle_check(void *data)
+static void powernv_cpufreq_read_pmax(void *data)
 {
 	unsigned int cpu = smp_processor_id();
 	unsigned long pmsr;
-	int pmsr_pmax, i;
-
-	pmsr = get_pmspr(SPRN_PMSR);
+	int pmsr_pmax, index, i;
 
 	for (i = 0; i < nr_chips; i++)
 		if (chips[i].id == cpu_to_chip_id(cpu))
 			break;
 
-	/* Check for Pmax Capping */
+	pmsr = get_pmspr(SPRN_PMSR);
 	pmsr_pmax = (s8)PMSR_MAX(pmsr);
 	if (pmsr_pmax != powernv_pstate_info.max) {
 		if (chips[i].throttled)
-			goto next;
+			return;
 		chips[i].throttled = true;
-		if (pmsr_pmax < powernv_pstate_info.nominal)
-			pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n",
+		if (pmsr_pmax < powernv_pstate_info.nominal) {
+			pr_warn("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n",
 				cpu, chips[i].id, pmsr_pmax,
 				powernv_pstate_info.nominal);
-		else
-			pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n",
-				cpu, chips[i].id, pmsr_pmax,
-				powernv_pstate_info.max);
+			chips[i].throt_nominal++;
+		} else {
+			chips[i].throt_turbo++;
+		}
+		index  = powernv_pstate_info.max - pmsr_pmax;
+		if (index >= 0 && index < powernv_pstate_info.nr_pstates)
+			chips[i].pstate_stat[index]++;
+		trace_powernv_throttle(chips[i].id,
+				       throttle_reason[chips[i].throt_reason],
+				       pmsr_pmax);
 	} else if (chips[i].throttled) {
 		chips[i].throttled = false;
-		pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu,
-			chips[i].id, pmsr_pmax);
+		trace_powernv_throttle(chips[i].id,
+				       throttle_reason[chips[i].throt_reason],
+				       pmsr_pmax);
 	}
+}
+
+static void powernv_cpufreq_throttle_check(void *data)
+{
+	unsigned long pmsr;
+
+	pmsr = get_pmspr(SPRN_PMSR);
+
+	/* Check for Pmax Capping */
+	powernv_cpufreq_read_pmax(NULL);
 
 	/* Check if Psafe_mode_active is set in PMSR. */
-next:
 	if (pmsr & PMSR_PSAFE_ENABLE) {
 		throttled = true;
 		pr_info("Pstate set to safe frequency\n");
@@ -356,7 +388,7 @@  next:
 
 	if (throttled) {
 		pr_info("PMSR = %16lx\n", pmsr);
-		pr_crit("CPU Frequency could be throttled\n");
+		pr_warn("CPU Frequency could be throttled\n");
 	}
 }
 
@@ -447,15 +479,6 @@  void powernv_cpufreq_work_fn(struct work_struct *work)
 	}
 }
 
-static char throttle_reason[][30] = {
-					"No throttling",
-					"Power Cap",
-					"Processor Over Temperature",
-					"Power Supply Failure",
-					"Over Current",
-					"OCC Reset"
-				     };
-
 static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
 				   unsigned long msg_type, void *_msg)
 {
@@ -481,7 +504,7 @@  static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
 		 */
 		if (!throttled) {
 			throttled = true;
-			pr_crit("CPU frequency is throttled for duration\n");
+			pr_warn("CPU frequency is throttled for duration\n");
 		}
 
 		break;
@@ -505,23 +528,19 @@  static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
 			return 0;
 		}
 
-		if (omsg.throttle_status &&
-		    omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS)
-			pr_info("OCC: Chip %u Pmax reduced due to %s\n",
-				(unsigned int)omsg.chip,
-				throttle_reason[omsg.throttle_status]);
-		else if (!omsg.throttle_status)
-			pr_info("OCC: Chip %u %s\n", (unsigned int)omsg.chip,
-				throttle_reason[omsg.throttle_status]);
-		else
-			return 0;
-
 		for (i = 0; i < nr_chips; i++)
-			if (chips[i].id == omsg.chip) {
-				if (!omsg.throttle_status)
-					chips[i].restore = true;
-				schedule_work(&chips[i].throttle);
-			}
+			if (chips[i].id == omsg.chip)
+				break;
+		if (omsg.throttle_status >= 0 &&
+		    omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) {
+			chips[i].reason[omsg.throttle_status]++;
+			chips[i].throt_reason = omsg.throttle_status;
+		}
+
+		if (!omsg.throttle_status)
+			chips[i].restore = true;
+
+		schedule_work(&chips[i].throttle);
 	}
 	return 0;
 }
@@ -532,6 +551,61 @@  static struct notifier_block powernv_cpufreq_opal_nb = {
 	.priority	= 0,
 };
 
+static ssize_t throttle_freq_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	int i, count = 0, id;
+
+	for (i = 0; i < nr_chips; i++)
+		if (chips[i].id == dev->id)
+			break;
+	id = i;
+	for (i = 0; i < powernv_pstate_info.nr_pstates; i++)
+		count += sprintf(&buf[count], "%d %d\n",
+				powernv_freqs[i].frequency,
+				chips[id].pstate_stat[i]);
+	count += sprintf(&buf[count], "\n");
+
+	return count;
+}
+
+static DEVICE_ATTR(throttle_frequencies, 0400, throttle_freq_show, NULL);
+
+static ssize_t throttle_reasons_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	int i, count = 0, id;
+
+	for (i = 0; i < nr_chips; i++)
+		if (chips[i].id == dev->id)
+			break;
+	id = i;
+	for (i = 0; i <= OCC_MAX_THROTTLE_STATUS; i++)
+		count += sprintf(&buf[count], "%s %d\n", throttle_reason[i],
+				 chips[id].reason[i]);
+
+	return count;
+}
+
+static DEVICE_ATTR(throttle_reasons, 0400, throttle_reasons_show, NULL);
+
+static ssize_t throttle_stat_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	int i, count = 0;
+
+	for (i = 0; i < nr_chips; i++)
+		if (chips[i].id == dev->id)
+			break;
+
+	count += sprintf(&buf[count], "Turbo %d\n", chips[i].throt_turbo);
+	count += sprintf(&buf[count], "Nominal %d\n", chips[i].throt_nominal);
+
+	return count;
+}
+
+static DEVICE_ATTR(throttle_stat, 0400, throttle_stat_show, NULL);
+
 static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
 {
 	struct powernv_smp_call_data freq_data;
@@ -571,11 +645,29 @@  static int init_chip_info(void)
 		return -ENOMEM;
 
 	for (i = 0; i < nr_chips; i++) {
+		unsigned int j;
+
 		chips[i].id = chip[i];
 		chips[i].throttled = false;
 		cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
 		INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
 		chips[i].restore = false;
+		chips[i].throt_nominal = 0;
+		chips[i].throt_turbo = 0;
+		chips[i].pstate_stat = kcalloc(powernv_pstate_info.nr_pstates,
+						sizeof(int), GFP_KERNEL);
+		if (!chips[i].pstate_stat)
+			return -ENOMEM;
+
+		for (j = 0; j <= OCC_MAX_THROTTLE_STATUS; j++)
+			chips[i].reason[j] = 0;
+
+		device_create_file(&node_devices[chips[i].id]->dev,
+				   &dev_attr_throttle_reasons);
+		device_create_file(&node_devices[chips[i].id]->dev,
+				   &dev_attr_throttle_frequencies);
+		device_create_file(&node_devices[chips[i].id]->dev,
+				   &dev_attr_throttle_stat);
 	}
 
 	return 0;
@@ -609,9 +701,19 @@  module_init(powernv_cpufreq_init);
 
 static void __exit powernv_cpufreq_exit(void)
 {
+	int i;
+
 	unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
 	opal_message_notifier_unregister(OPAL_MSG_OCC,
 					 &powernv_cpufreq_opal_nb);
+	for (i = 0; i < nr_chips; i++) {
+		device_remove_file(&node_devices[chips[i].id]->dev,
+				   &dev_attr_throttle_reasons);
+		device_remove_file(&node_devices[chips[i].id]->dev,
+				   &dev_attr_throttle_frequencies);
+		device_remove_file(&node_devices[chips[i].id]->dev,
+				   &dev_attr_throttle_stat);
+	}
 	cpufreq_unregister_driver(&powernv_cpufreq_driver);
 }
 module_exit(powernv_cpufreq_exit);
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 284244e..8a77228 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -38,6 +38,28 @@  DEFINE_EVENT(cpu, cpu_idle,
 	TP_ARGS(state, cpu_id)
 );
 
+TRACE_EVENT(powernv_throttle,
+
+	TP_PROTO(int chip_id, char *reason, int pmax),
+
+	TP_ARGS(chip_id, reason, pmax),
+
+	TP_STRUCT__entry(
+		__field(int, chip_id)
+		__string(reason, reason)
+		__field(int, pmax)
+	),
+
+	TP_fast_assign(
+		__entry->chip_id = chip_id;
+		__assign_str(reason, reason);
+		__entry->pmax = pmax;
+	),
+
+	TP_printk("Chip %d Pmax %d %s", __entry->chip_id,
+		  __entry->pmax, __get_str(reason))
+);
+
 TRACE_EVENT(pstate_sample,
 
 	TP_PROTO(u32 core_busy,