diff mbox series

occ: Set up OCC messaging even if we fail to setup pstates

Message ID 20180323012403.16334-1-stewart@linux.vnet.ibm.com
State Accepted
Headers show
Series occ: Set up OCC messaging even if we fail to setup pstates | expand

Commit Message

Stewart Smith March 23, 2018, 1:24 a.m. UTC
This means that we no longer hit this bug if we fail to get valid pstates
from the OCC.

[console-pexpect]#echo 1 > //sys/firmware/opal/sensor_groups//occ-csm0/clear
echo 1 > //sys/firmware/opal/sensor_groups//occ-csm0/clear
[   94.019971181,5] CPU ATTEMPT TO RE-ENTER FIRMWARE! PIR=083d cpu @0x33cf4000 -> pir=083d token=8
[   94.020098392,5] CPU ATTEMPT TO RE-ENTER FIRMWARE! PIR=083d cpu @0x33cf4000 -> pir=083d token=8
[   10.318805] Disabling lock debugging due to kernel taint
[   10.318808] Severe Machine check interrupt [Not recovered]
[   10.318812]   NIP [000000003003e434]: 0x3003e434
[   10.318813]   Initiator: CPU
[   10.318815]   Error type: Real address [Load/Store (foreign)]
[   10.318817] opal: Hardware platform error: Unrecoverable Machine Check exception
[   10.318821] CPU: 117 PID: 2745 Comm: sh Tainted: G   M             4.15.9-openpower1 #3
[   10.318823] NIP:  000000003003e434 LR: 000000003003025c CTR: 0000000030030240
[   10.318825] REGS: c00000003fa7bd80 TRAP: 0200   Tainted: G   M              (4.15.9-openpower1)
[   10.318826] MSR:  9000000000201002 <SF,HV,ME,RI>  CR: 48002888  XER: 20040000
[   10.318831] CFAR: 0000000030030258 DAR: 394a00147d5a03a6 DSISR: 00000008 SOFTE: 1

Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
---
 hw/occ.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

Comments

Shilpasri G Bhat March 23, 2018, 4:57 a.m. UTC | #1
Hi,

On 03/23/2018 06:54 AM, Stewart Smith wrote:
> This means that we no longer hit this bug if we fail to get valid pstates
> from the OCC.
> 
> [console-pexpect]#echo 1 > //sys/firmware/opal/sensor_groups//occ-csm0/clear
> echo 1 > //sys/firmware/opal/sensor_groups//occ-csm0/clear
> [   94.019971181,5] CPU ATTEMPT TO RE-ENTER FIRMWARE! PIR=083d cpu @0x33cf4000 -> pir=083d token=8
> [   94.020098392,5] CPU ATTEMPT TO RE-ENTER FIRMWARE! PIR=083d cpu @0x33cf4000 -> pir=083d token=8
> [   10.318805] Disabling lock debugging due to kernel taint
> [   10.318808] Severe Machine check interrupt [Not recovered]
> [   10.318812]   NIP [000000003003e434]: 0x3003e434
> [   10.318813]   Initiator: CPU
> [   10.318815]   Error type: Real address [Load/Store (foreign)]
> [   10.318817] opal: Hardware platform error: Unrecoverable Machine Check exception
> [   10.318821] CPU: 117 PID: 2745 Comm: sh Tainted: G   M             4.15.9-openpower1 #3
> [   10.318823] NIP:  000000003003e434 LR: 000000003003025c CTR: 0000000030030240
> [   10.318825] REGS: c00000003fa7bd80 TRAP: 0200   Tainted: G   M              (4.15.9-openpower1)
> [   10.318826] MSR:  9000000000201002 <SF,HV,ME,RI>  CR: 48002888  XER: 20040000
> [   10.318831] CFAR: 0000000030030258 DAR: 394a00147d5a03a6 DSISR: 00000008 SOFTE: 1
> 
> Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
> ---

Thanks for the patch. We will need the below diff along with this patch.
diff --git a/hw/occ.c b/hw/occ.c
index d2b4a84..53e10f2 100644
--- a/hw/occ.c
+++ b/hw/occ.c
@@ -1242,6 +1242,13 @@ static void occ_cmd_interface_init(void)
        struct proc_chip *chip;
        int i = 0;

+       /* Check if the OCC data is valid */
+       for_each_chip(chip) {
+               pdata = get_occ_pstate_table(chip);
+               if (!pdata->valid)
+                       return;
+       }
+
        chip = next_chip(NULL);
        pdata = get_occ_pstate_table(chip);
        if (pdata->version != 0x90)
@@ -1610,6 +1614,12 @@ void occ_add_sensor_groups(struct dt_node *sg, u32
*phandles, u32 *ptype,
        };
        int i, j;

+       /*
+        * Dont add sensor groups if cmd-interface is not intialized
+        */
+       if (!chips)
+               return;
+
        for (i = 0; i < nr_occs; i++)
                if (chips[i].chip_id == chipid)
                        break;

>  hw/occ.c | 16 ++++++++--------
>  1 file changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/occ.c b/hw/occ.c
> index 37cf73c30bbd..90d3105eb1a8 100644
> --- a/hw/occ.c
> +++ b/hw/occ.c
> @@ -1365,6 +1365,9 @@ int occ_set_powercap(u32 handle, int token, u32 pcap)
>  	if (powercap_get_attr(handle) != POWERCAP_OCC_CUR)
>  		return OPAL_PERMISSION;
>  
> +	if (!chips)
> +		return OPAL_HARDWARE;
> +

This check is not required as powercap is initialized after cmd-interface
initialization. So occ_set_powercap() will not be invoked by host if
cmd-interface is not initialized.

>  	for (i = 0; i < nr_occs; i++)
>  		if (chips[i].occ_role == OCC_ROLE_MASTER)
>  			break;
> @@ -1721,14 +1724,11 @@ void occ_pstates_init(void)
>  	if (!add_cpu_pstate_properties(&pstate_nom)) {
>  		log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
>  			"Skiping core cpufreq init due to OCC error\n");
> -		return;
> -	}
> -
> -	/*
> -	 * Setup host based pstates and set nominal frequency only in
> -	 * P8.
> -	 */
> -	if (proc_gen == proc_gen_p8) {
> +	} else if (proc_gen == proc_gen_p8) {
> +		/*
> +		 * Setup host based pstates and set nominal frequency only in
> +		 * P8.
> +		 */
>  		for_each_chip(chip)
>  			for_each_available_core_in_chip(c, chip->id)
>  				cpu_pstates_prepare_core(chip, c, pstate_nom);
> 

Reviewed-and-tested-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
Stewart Smith March 27, 2018, 4:54 a.m. UTC | #2
Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com> writes:
> On 03/23/2018 06:54 AM, Stewart Smith wrote:
>> This means that we no longer hit this bug if we fail to get valid pstates
>> from the OCC.
>> 
>> [console-pexpect]#echo 1 > //sys/firmware/opal/sensor_groups//occ-csm0/clear
>> echo 1 > //sys/firmware/opal/sensor_groups//occ-csm0/clear
>> [   94.019971181,5] CPU ATTEMPT TO RE-ENTER FIRMWARE! PIR=083d cpu @0x33cf4000 -> pir=083d token=8
>> [   94.020098392,5] CPU ATTEMPT TO RE-ENTER FIRMWARE! PIR=083d cpu @0x33cf4000 -> pir=083d token=8
>> [   10.318805] Disabling lock debugging due to kernel taint
>> [   10.318808] Severe Machine check interrupt [Not recovered]
>> [   10.318812]   NIP [000000003003e434]: 0x3003e434
>> [   10.318813]   Initiator: CPU
>> [   10.318815]   Error type: Real address [Load/Store (foreign)]
>> [   10.318817] opal: Hardware platform error: Unrecoverable Machine Check exception
>> [   10.318821] CPU: 117 PID: 2745 Comm: sh Tainted: G   M             4.15.9-openpower1 #3
>> [   10.318823] NIP:  000000003003e434 LR: 000000003003025c CTR: 0000000030030240
>> [   10.318825] REGS: c00000003fa7bd80 TRAP: 0200   Tainted: G   M              (4.15.9-openpower1)
>> [   10.318826] MSR:  9000000000201002 <SF,HV,ME,RI>  CR: 48002888  XER: 20040000
>> [   10.318831] CFAR: 0000000030030258 DAR: 394a00147d5a03a6 DSISR: 00000008 SOFTE: 1
>> 
>> Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
>> ---
>
> Thanks for the patch. We will need the below diff along with this patch.
> diff --git a/hw/occ.c b/hw/occ.c
> index d2b4a84..53e10f2 100644
> --- a/hw/occ.c
> +++ b/hw/occ.c
> @@ -1242,6 +1242,13 @@ static void occ_cmd_interface_init(void)
>         struct proc_chip *chip;
>         int i = 0;
>
> +       /* Check if the OCC data is valid */
> +       for_each_chip(chip) {
> +               pdata = get_occ_pstate_table(chip);
> +               if (!pdata->valid)
> +                       return;
> +       }
> +
>         chip = next_chip(NULL);
>         pdata = get_occ_pstate_table(chip);
>         if (pdata->version != 0x90)
> @@ -1610,6 +1614,12 @@ void occ_add_sensor_groups(struct dt_node *sg, u32
> *phandles, u32 *ptype,
>         };
>         int i, j;
>
> +       /*
> +        * Dont add sensor groups if cmd-interface is not intialized
> +        */
> +       if (!chips)
> +               return;
> +
>         for (i = 0; i < nr_occs; i++)
>                 if (chips[i].chip_id == chipid)
>                         break;
>
>>  hw/occ.c | 16 ++++++++--------
>>  1 file changed, 8 insertions(+), 8 deletions(-)
>> 
>> diff --git a/hw/occ.c b/hw/occ.c
>> index 37cf73c30bbd..90d3105eb1a8 100644
>> --- a/hw/occ.c
>> +++ b/hw/occ.c
>> @@ -1365,6 +1365,9 @@ int occ_set_powercap(u32 handle, int token, u32 pcap)
>>  	if (powercap_get_attr(handle) != POWERCAP_OCC_CUR)
>>  		return OPAL_PERMISSION;
>>  
>> +	if (!chips)
>> +		return OPAL_HARDWARE;
>> +
>
> This check is not required as powercap is initialized after cmd-interface
> initialization. So occ_set_powercap() will not be invoked by host if
> cmd-interface is not initialized.
>
>>  	for (i = 0; i < nr_occs; i++)
>>  		if (chips[i].occ_role == OCC_ROLE_MASTER)
>>  			break;
>> @@ -1721,14 +1724,11 @@ void occ_pstates_init(void)
>>  	if (!add_cpu_pstate_properties(&pstate_nom)) {
>>  		log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
>>  			"Skiping core cpufreq init due to OCC error\n");
>> -		return;
>> -	}
>> -
>> -	/*
>> -	 * Setup host based pstates and set nominal frequency only in
>> -	 * P8.
>> -	 */
>> -	if (proc_gen == proc_gen_p8) {
>> +	} else if (proc_gen == proc_gen_p8) {
>> +		/*
>> +		 * Setup host based pstates and set nominal frequency only in
>> +		 * P8.
>> +		 */
>>  		for_each_chip(chip)
>>  			for_each_available_core_in_chip(c, chip->id)
>>  				cpu_pstates_prepare_core(chip, c, pstate_nom);
>> 
>
> Reviewed-and-tested-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>

Thanks for taking a look into that, merged with your changes to master
as of 547377ddcd93257bc825cea4809c25c1e90ffcf3

and then to 5.10.x as of e37bb5a8afcdba38c294e096c3fd4b63f38de0da
because of that big customer we know of.
diff mbox series

Patch

diff --git a/hw/occ.c b/hw/occ.c
index 37cf73c30bbd..90d3105eb1a8 100644
--- a/hw/occ.c
+++ b/hw/occ.c
@@ -1365,6 +1365,9 @@  int occ_set_powercap(u32 handle, int token, u32 pcap)
 	if (powercap_get_attr(handle) != POWERCAP_OCC_CUR)
 		return OPAL_PERMISSION;
 
+	if (!chips)
+		return OPAL_HARDWARE;
+
 	for (i = 0; i < nr_occs; i++)
 		if (chips[i].occ_role == OCC_ROLE_MASTER)
 			break;
@@ -1721,14 +1724,11 @@  void occ_pstates_init(void)
 	if (!add_cpu_pstate_properties(&pstate_nom)) {
 		log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
 			"Skiping core cpufreq init due to OCC error\n");
-		return;
-	}
-
-	/*
-	 * Setup host based pstates and set nominal frequency only in
-	 * P8.
-	 */
-	if (proc_gen == proc_gen_p8) {
+	} else if (proc_gen == proc_gen_p8) {
+		/*
+		 * Setup host based pstates and set nominal frequency only in
+		 * P8.
+		 */
 		for_each_chip(chip)
 			for_each_available_core_in_chip(c, chip->id)
 				cpu_pstates_prepare_core(chip, c, pstate_nom);