powerpc/pseries: new lparcfg key/value pair: partition_affinity_score

Message ID 20200619153419.3676392-1-cheloha@linux.ibm.com (mailing list archive)
State Superseded
Series powerpc/pseries: new lparcfg key/value pair: partition_affinity_score

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (c3405d517d606e965030026daec198d314f20195)
snowpatch_ozlabs/build-ppc64le success Build succeeded
snowpatch_ozlabs/build-ppc64be success Build succeeded
snowpatch_ozlabs/build-ppc64e success Build succeeded
snowpatch_ozlabs/build-pmac32 success Build succeeded
snowpatch_ozlabs/checkpatch warning Test checkpatch on branch powerpc/merge
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Scott Cheloha June 19, 2020, 3:34 p.m. UTC
The H_GetPerformanceCounterInfo PHYP hypercall has a subcall,
Affinity_Domain_Info_By_Partition, which returns, among other things,
a "partition affinity score" for a given LPAR.  This score, a value on
[0-100], represents the processor-memory affinity for the LPAR in
question.  A score of 0 indicates the worst possible affinity while a
score of 100 indicates perfect affinity.  The score can be used to
reason about performance.

This patch adds the score for the local LPAR to the lparcfg procfile
under a new 'partition_affinity_score' key.
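
For example, a userspace consumer could pull the score out with a trivial
reader like the sketch below (assuming the usual /proc/powerpc/lparcfg
location for the procfile):

/* Minimal sketch: print the new key/value pair from lparcfg. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/powerpc/lparcfg", "r");

	if (f == NULL)
		return 1;
	while (fgets(line, sizeof(line), f) != NULL) {
		/* Match the key exactly; the value follows the '='. */
		if (strncmp(line, "partition_affinity_score=", 25) == 0)
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}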

The H_GetPerformanceCounterInfo hypercall is already used elsewhere in
the kernel, in powerpc/perf/hv-gpci.c.  Refactoring that code and this
code into a more general API might be worthwhile if additional modules
require the hypercall in the future.

Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
---
 arch/powerpc/platforms/pseries/lparcfg.c | 53 ++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

Comments

Tyrel Datwyler June 19, 2020, 8:38 p.m. UTC | #1
On 6/19/20 8:34 AM, Scott Cheloha wrote:
> The H_GetPerformanceCounterInfo PHYP hypercall has a subcall,
> Affinity_Domain_Info_By_Partition, which returns, among other things,
> a "partition affinity score" for a given LPAR.  This score, a value on
> [0-100], represents the processor-memory affinity for the LPAR in
> question.  A score of 0 indicates the worst possible affinity while a
> score of 100 indicates perfect affinity.  The score can be used to
> reason about performance.
> 
> This patch adds the score for the local LPAR to the lparcfg procfile
> under a new 'partition_affinity_score' key.

I expect that you will probably get a NACK from Michael on this. The overall
desire is to move away from these dated /proc interfaces. While it's true that
I did add a new value recently, it was strictly to facilitate and correct the
calculation of a derived value that was already dependent on a couple of other
existing values in lparcfg.

With that said, I would expect that you will likely be advised to expose this as
a sysfs attribute. The question is where? We should probably put some thought
into this, as I would like to port each lparcfg value over to sysfs so that we
can move toward deprecating lparcfg. Putting everything under something like
/sys/kernel/lparcfg/* maybe. Michael may have a better suggestion.
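
To sketch what I'm picturing with /sys/kernel/lparcfg/* (hypothetical only;
the location, attribute name, and get_affinity_score() helper are all
stand-ins, the latter for the hcall plumbing in this patch):

#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

/*
 * Hypothetical sketch: a read-only attribute at
 * /sys/kernel/lparcfg/partition_affinity_score.
 */
static ssize_t partition_affinity_score_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	/* get_affinity_score() is a made-up stand-in for the hcall. */
	return sprintf(buf, "%u\n", get_affinity_score());
}

static struct kobj_attribute affinity_score_attr =
	__ATTR_RO(partition_affinity_score);

static int __init lparcfg_sysfs_init(void)
{
	struct kobject *kobj = kobject_create_and_add("lparcfg", kernel_kobj);

	if (!kobj)
		return -ENOMEM;
	return sysfs_create_file(kobj, &affinity_score_attr.attr);
}
device_initcall(lparcfg_sysfs_init);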

> 
> The H_GetPerformanceCounterInfo hypercall is already used elsewhere in
> the kernel, in powerpc/perf/hv-gpci.c.  Refactoring that code and this
> code into a more general API might be worthwhile if additional modules
> require the hypercall in the future.

If you are duplicating code, it's likely you should already be doing this. See
the rest of my comments below.

> 
> Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
> ---
>  arch/powerpc/platforms/pseries/lparcfg.c | 53 ++++++++++++++++++++++++
>  1 file changed, 53 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
> index b8d28ab88178..b75151eee0f0 100644
> --- a/arch/powerpc/platforms/pseries/lparcfg.c
> +++ b/arch/powerpc/platforms/pseries/lparcfg.c
> @@ -136,6 +136,57 @@ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
>  	return rc;
>  }
>  
> +/*
> + * Based on H_GetPerformanceCounterInfo v1.10.
> + */
> +static void show_gpci_data(struct seq_file *m)
> +{
> +	struct perf_counter_info_params {
> +		__be32 counter_request;
> +		__be32 starting_index;
> +		__be16 secondary_index;
> +		__be16 returned_values;
> +		__be32 detail_rc;
> +		__be16 counter_value_element_size;
> +		u8     counter_info_version_in;
> +		u8     counter_info_version_out;
> +		u8     reserved[0xC];
> +	} __packed;

This looks to duplicate the hv_get_perf_counter_info_params struct from
arch/powerpc/perf/hv-gpci.h. Maybe this include file needs to move to
arch/powerpc/include/asm so you don't have to redefine this struct.

> +	struct hv_gpci_request_buffer {
> +		struct perf_counter_info_params params;
> +		u8 output[4096 - sizeof(struct perf_counter_info_params)];
> +	} __packed;

This struct is code duplication of the one defined in
arch/powerpc/perf/hv-gpci.c and could be moved into hv-gpci.h along with
HGPCI_MAX_DATA_BYTES so that you can use those versions here.
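
Something like the following in the shared header would do it (a sketch;
it reuses the HGPCI_REQ_BUFFER_SIZE and HGPCI_MAX_DATA_BYTES names from
hv-gpci.c and assumes hv_get_perf_counter_info_params moves along with it):

/* Sketch: shared declarations so lparcfg.c and hv-gpci.c use one copy. */
#define HGPCI_REQ_BUFFER_SIZE	4096
#define HGPCI_MAX_DATA_BYTES \
	(HGPCI_REQ_BUFFER_SIZE - sizeof(struct hv_get_perf_counter_info_params))

struct hv_gpci_request_buffer {
	struct hv_get_perf_counter_info_params params;
	u8 bytes[HGPCI_MAX_DATA_BYTES];
} __packed;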

> +	struct hv_gpci_request_buffer *buf;
> +	long ret;
> +	unsigned int affinity_score;
> +
> +	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
> +	if (buf == NULL)
> +		return;
> +
> +	/*
> +	 * Show the local LPAR's affinity score.
> +	 *
> +	 * 0xB1 selects the Affinity_Domain_Info_By_Partition subcall.
> +	 * The score is at byte 0xB in the output buffer.
> +	 */
> +	memset(&buf->params, 0, sizeof(buf->params));
> +	buf->params.counter_request = cpu_to_be32(0xB1);
> +	buf->params.starting_index = cpu_to_be32(-1);	/* local LPAR */
> +	buf->params.counter_info_version_in = 0x5;	/* v5+ for score */
> +	ret = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO, virt_to_phys(buf),
> +				 sizeof(*buf));
> +	if (ret != H_SUCCESS) {
> +		pr_debug("hcall failed: H_GET_PERF_COUNTER_INFO: %ld, %x\n",
> +			 ret, be32_to_cpu(buf->params.detail_rc));
> +		goto out;
> +	}
> +	affinity_score = buf->output[0xB];
> +	seq_printf(m, "partition_affinity_score=%u\n", affinity_score);
> +out:
> +	kfree(buf);
> +}
> +

IIUC we should already be able to get this value from userspace using the perf
tool, right? If that's the case, can't we also programmatically retrieve it via
the perf_event interface in userspace as well?

-Tyrel

>  static unsigned h_pic(unsigned long *pool_idle_time,
>  		      unsigned long *num_procs)
>  {
> @@ -487,6 +538,8 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
>  			   partition_active_processors * 100);
>  	}
>  
> +	show_gpci_data(m);
> +
>  	seq_printf(m, "partition_active_processors=%d\n",
>  		   partition_active_processors);
>  
>
Nathan Lynch June 20, 2020, 12:32 a.m. UTC | #2
Hi Tyrel,

Tyrel Datwyler <tyreld@linux.ibm.com> writes:
> On 6/19/20 8:34 AM, Scott Cheloha wrote:
>> The H_GetPerformanceCounterInfo PHYP hypercall has a subcall,
>> Affinity_Domain_Info_By_Partition, which returns, among other things,
>> a "partition affinity score" for a given LPAR.  This score, a value on
>> [0-100], represents the processor-memory affinity for the LPAR in
>> question.  A score of 0 indicates the worst possible affinity while a
>> score of 100 indicates perfect affinity.  The score can be used to
>> reason about performance.
>> 
>> This patch adds the score for the local LPAR to the lparcfg procfile
>> under a new 'partition_affinity_score' key.
>
> I expect that you will probably get a NACK from Michael on this. The overall
> desire is to move away from these dated /proc interfaces. While it's true that
> I did add a new value recently, it was strictly to facilitate and correct the
> calculation of a derived value that was already dependent on a couple of other
> existing values in lparcfg.
>
> With that said, I would expect that you will likely be advised to expose this as
> a sysfs attribute. The question is where? We should probably put some thought
> into this, as I would like to port each lparcfg value over to sysfs so that we
> can move toward deprecating lparcfg. Putting everything under something like
> /sys/kernel/lparcfg/* maybe. Michael may have a better suggestion.

I think this score fits pretty naturally in lparcfg: it's a simple
metric that is specific to the pseries/papr platform, like everything
else in there.

A few dozen key=value pairs contained in a single file is simple and
efficient, unlike sysfs with its rather inconsistently applied
one-value-per-file convention. Surely it's OK if lparcfg gains a line
every few years?


>> The H_GetPerformanceCounterInfo hypercall is already used elsewhere in
>> the kernel, in powerpc/perf/hv-gpci.c.  Refactoring that code and this
>> code into a more general API might be worthwhile if additional modules
>> require the hypercall in the future.
>
> If you are duplicating code, it's likely you should already be doing this. See
> the rest of my comments below.
>
>> 
>> Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
>> ---
>>  arch/powerpc/platforms/pseries/lparcfg.c | 53 ++++++++++++++++++++++++
>>  1 file changed, 53 insertions(+)
>> 
>> diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
>> index b8d28ab88178..b75151eee0f0 100644
>> --- a/arch/powerpc/platforms/pseries/lparcfg.c
>> +++ b/arch/powerpc/platforms/pseries/lparcfg.c
>> @@ -136,6 +136,57 @@ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
>>  	return rc;
>>  }
>>  
>> +/*
>> + * Based on H_GetPerformanceCounterInfo v1.10.
>> + */
>> +static void show_gpci_data(struct seq_file *m)
>> +{
>> +	struct perf_counter_info_params {
>> +		__be32 counter_request;
>> +		__be32 starting_index;
>> +		__be16 secondary_index;
>> +		__be16 returned_values;
>> +		__be32 detail_rc;
>> +		__be16 counter_value_element_size;
>> +		u8     counter_info_version_in;
>> +		u8     counter_info_version_out;
>> +		u8     reserved[0xC];
>> +	} __packed;
>
> This looks to duplicate the hv_get_perf_counter_info_params struct from
> arch/powerpc/perf/hv-gpci.h. Maybe this include file needs to move to
> arch/powerpc/include/asm so you don't have to redefine this struct.
>
>> +	struct hv_gpci_request_buffer {
>> +		struct perf_counter_info_params params;
>> +		u8 output[4096 - sizeof(struct perf_counter_info_params)];
>> +	} __packed;
>
> This struct is code duplication of the one defined in
> arch/powerpc/perf/hv-gpci.c and could be moved into hv-gpci.h along with
> HGPCI_MAX_DATA_BYTES so that you can use those versions here.

I tend to agree with these comments.


>> +	struct hv_gpci_request_buffer *buf;
>> +	long ret;
>> +	unsigned int affinity_score;
>> +
>> +	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
>> +	if (buf == NULL)
>> +		return;
>> +
>> +	/*
>> +	 * Show the local LPAR's affinity score.
>> +	 *
>> +	 * 0xB1 selects the Affinity_Domain_Info_By_Partition subcall.
>> +	 * The score is at byte 0xB in the output buffer.
>> +	 */
>> +	memset(&buf->params, 0, sizeof(buf->params));
>> +	buf->params.counter_request = cpu_to_be32(0xB1);
>> +	buf->params.starting_index = cpu_to_be32(-1);	/* local LPAR */
>> +	buf->params.counter_info_version_in = 0x5;	/* v5+ for score */
>> +	ret = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO, virt_to_phys(buf),
>> +				 sizeof(*buf));
>> +	if (ret != H_SUCCESS) {
>> +		pr_debug("hcall failed: H_GET_PERF_COUNTER_INFO: %ld, %x\n",
>> +			 ret, be32_to_cpu(buf->params.detail_rc));
>> +		goto out;
>> +	}
>> +	affinity_score = buf->output[0xB];
>> +	seq_printf(m, "partition_affinity_score=%u\n", affinity_score);
>> +out:
>> +	kfree(buf);
>> +}
>> +
>
> IIUC we should already be able to get this value from userspace using the perf
> tool, right? If that's the case, can't we also programmatically retrieve it via
> the perf_event interface in userspace as well?

No... I had the same thought when I first looked at this, but perf is
for sampling counters, and the partition affinity score is not a
counter.
Tyrel Datwyler June 22, 2020, 6:59 p.m. UTC | #3
On 6/19/20 5:32 PM, Nathan Lynch wrote:
> Hi Tyrel,
> 
> Tyrel Datwyler <tyreld@linux.ibm.com> writes:
>> On 6/19/20 8:34 AM, Scott Cheloha wrote:
>>> The H_GetPerformanceCounterInfo PHYP hypercall has a subcall,
>>> Affinity_Domain_Info_By_Partition, which returns, among other things,
>>> a "partition affinity score" for a given LPAR.  This score, a value on
>>> [0-100], represents the processor-memory affinity for the LPAR in
>>> question.  A score of 0 indicates the worst possible affinity while a
>>> score of 100 indicates perfect affinity.  The score can be used to
>>> reason about performance.
>>>
>>> This patch adds the score for the local LPAR to the lparcfg procfile
>>> under a new 'partition_affinity_score' key.
>>
>> I expect that you will probably get a NACK from Michael on this. The overall
>> desire is to move away from these dated /proc interfaces. While it's true that
>> I did add a new value recently, it was strictly to facilitate and correct the
>> calculation of a derived value that was already dependent on a couple of other
>> existing values in lparcfg.
>>
>> With that said, I would expect that you will likely be advised to expose this as
>> a sysfs attribute. The question is where? We should probably put some thought
>> into this, as I would like to port each lparcfg value over to sysfs so that we
>> can move toward deprecating lparcfg. Putting everything under something like
>> /sys/kernel/lparcfg/* maybe. Michael may have a better suggestion.
> 
> I think this score fits pretty naturally in lparcfg: it's a simple
> metric that is specific to the pseries/papr platform, like everything
> else in there.
> 
> A few dozen key=value pairs contained in a single file is simple and
> efficient, unlike sysfs with its rather inconsistently applied
> one-value-per-file convention. Surely it's OK if lparcfg gains a line
> every few years?
> 

So, two things:

1.) I wanted to give forewarning that Michael is generally reluctant to add new
things to lparcfg, and I wanted to prepare you for that possibility.

2.) As a simple metric, who is consuming said metric? I've not seen any patches
to powerpc-utils in preparation, so should I be expecting this to be exposed via
lparstat, or are we expecting new tooling to also start parsing lparcfg? If we
are planning for users to consume it via existing powerpc-utils tooling, then
it's probably easier to make the argument that it fits into lparcfg. If we are
creating new tooling, or expecting users to extract the value on their own, I
think the sysfs route is going to be favored.

-Tyrel

> 
>>> The H_GetPerformanceCounterInfo hypercall is already used elsewhere in
>>> the kernel, in powerpc/perf/hv-gpci.c.  Refactoring that code and this
>>> code into a more general API might be worthwhile if additional modules
>>> require the hypercall in the future.
>>
>> If you are duplicating code, it's likely you should already be doing this. See
>> the rest of my comments below.
>>
>>>
>>> Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
>>> ---
>>>  arch/powerpc/platforms/pseries/lparcfg.c | 53 ++++++++++++++++++++++++
>>>  1 file changed, 53 insertions(+)
>>>
>>> diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
>>> index b8d28ab88178..b75151eee0f0 100644
>>> --- a/arch/powerpc/platforms/pseries/lparcfg.c
>>> +++ b/arch/powerpc/platforms/pseries/lparcfg.c
>>> @@ -136,6 +136,57 @@ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
>>>  	return rc;
>>>  }
>>>  
>>> +/*
>>> + * Based on H_GetPerformanceCounterInfo v1.10.
>>> + */
>>> +static void show_gpci_data(struct seq_file *m)
>>> +{
>>> +	struct perf_counter_info_params {
>>> +		__be32 counter_request;
>>> +		__be32 starting_index;
>>> +		__be16 secondary_index;
>>> +		__be16 returned_values;
>>> +		__be32 detail_rc;
>>> +		__be16 counter_value_element_size;
>>> +		u8     counter_info_version_in;
>>> +		u8     counter_info_version_out;
>>> +		u8     reserved[0xC];
>>> +	} __packed;
>>
>> This looks to duplicate the hv_get_perf_counter_info_params struct from
>> arch/powerpc/perf/hv-gpci.h. Maybe this include file needs to move to
>> arch/powerpc/include/asm so you don't have to redefine this struct.
>>
>>> +	struct hv_gpci_request_buffer {
>>> +		struct perf_counter_info_params params;
>>> +		u8 output[4096 - sizeof(struct perf_counter_info_params)];
>>> +	} __packed;
>>
>> This struct is code duplication of the one defined in
>> arch/powerpc/perf/hv-gpci.c and could be moved into hv-gpci.h along with
>> HGPCI_MAX_DATA_BYTES so that you can use those versions here.
> 
> I tend to agree with these comments.
> 
> 
>>> +	struct hv_gpci_request_buffer *buf;
>>> +	long ret;
>>> +	unsigned int affinity_score;
>>> +
>>> +	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
>>> +	if (buf == NULL)
>>> +		return;
>>> +
>>> +	/*
>>> +	 * Show the local LPAR's affinity score.
>>> +	 *
>>> +	 * 0xB1 selects the Affinity_Domain_Info_By_Partition subcall.
>>> +	 * The score is at byte 0xB in the output buffer.
>>> +	 */
>>> +	memset(&buf->params, 0, sizeof(buf->params));
>>> +	buf->params.counter_request = cpu_to_be32(0xB1);
>>> +	buf->params.starting_index = cpu_to_be32(-1);	/* local LPAR */
>>> +	buf->params.counter_info_version_in = 0x5;	/* v5+ for score */
>>> +	ret = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO, virt_to_phys(buf),
>>> +				 sizeof(*buf));
>>> +	if (ret != H_SUCCESS) {
>>> +		pr_debug("hcall failed: H_GET_PERF_COUNTER_INFO: %ld, %x\n",
>>> +			 ret, be32_to_cpu(buf->params.detail_rc));
>>> +		goto out;
>>> +	}
>>> +	affinity_score = buf->output[0xB];
>>> +	seq_printf(m, "partition_affinity_score=%u\n", affinity_score);
>>> +out:
>>> +	kfree(buf);
>>> +}
>>> +
>>
>> IIUC we should already be able to get this value from userspace using the perf
>> tool, right? If that's the case, can't we also programmatically retrieve it via
>> the perf_event interface in userspace as well?
> 
> No... I had the same thought when I first looked at this, but perf is
> for sampling counters, and the partition affinity score is not a
> counter.

But it's obtained through the same H_GET_PERF_COUNTER_INFO interface as the rest
of the performance counters. Shouldn't it be obtainable via 'perf stat -e
hv-gpci/*' and therefore also via perf_event? I really am not that familiar with
perf, so correct me on whatever I'm missing here.

-Tyrel

Patch

diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
index b8d28ab88178..b75151eee0f0 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -136,6 +136,57 @@  static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
 	return rc;
 }
 
+/*
+ * Based on H_GetPerformanceCounterInfo v1.10.
+ */
+static void show_gpci_data(struct seq_file *m)
+{
+	struct perf_counter_info_params {
+		__be32 counter_request;
+		__be32 starting_index;
+		__be16 secondary_index;
+		__be16 returned_values;
+		__be32 detail_rc;
+		__be16 counter_value_element_size;
+		u8     counter_info_version_in;
+		u8     counter_info_version_out;
+		u8     reserved[0xC];
+	} __packed;
+	struct hv_gpci_request_buffer {
+		struct perf_counter_info_params params;
+		u8 output[4096 - sizeof(struct perf_counter_info_params)];
+	} __packed;
+	struct hv_gpci_request_buffer *buf;
+	long ret;
+	unsigned int affinity_score;
+
+	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+	if (buf == NULL)
+		return;
+
+	/*
+	 * Show the local LPAR's affinity score.
+	 *
+	 * 0xB1 selects the Affinity_Domain_Info_By_Partition subcall.
+	 * The score is at byte 0xB in the output buffer.
+	 */
+	memset(&buf->params, 0, sizeof(buf->params));
+	buf->params.counter_request = cpu_to_be32(0xB1);
+	buf->params.starting_index = cpu_to_be32(-1);	/* local LPAR */
+	buf->params.counter_info_version_in = 0x5;	/* v5+ for score */
+	ret = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO, virt_to_phys(buf),
+				 sizeof(*buf));
+	if (ret != H_SUCCESS) {
+		pr_debug("hcall failed: H_GET_PERF_COUNTER_INFO: %ld, %x\n",
+			 ret, be32_to_cpu(buf->params.detail_rc));
+		goto out;
+	}
+	affinity_score = buf->output[0xB];
+	seq_printf(m, "partition_affinity_score=%u\n", affinity_score);
+out:
+	kfree(buf);
+}
+
 static unsigned h_pic(unsigned long *pool_idle_time,
 		      unsigned long *num_procs)
 {
@@ -487,6 +538,8 @@  static int pseries_lparcfg_data(struct seq_file *m, void *v)
 			   partition_active_processors * 100);
 	}
 
+	show_gpci_data(m);
+
 	seq_printf(m, "partition_active_processors=%d\n",
 		   partition_active_processors);