diff mbox series

[58/61] P10 Cleanup special wakeup and xive stop api usage

Message ID 20210719132012.150948-59-hegdevasant@linux.vnet.ibm.com
State Superseded
Headers show
Series P10 Enablement | expand

Commit Message

Vasant Hegde July 19, 2021, 1:20 p.m. UTC
From: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>

Cleanup P9 code pending implementation for P10.

P10 stop-api integration will be needed for STOP11
support only.  STOP11 will be a restricted usage
for testing core re-init on P10.  Only stop0,2,3
will be available for general usage.

Also, do not treat gated core with SPW done as error.
Core gating bit is a software state updated by microcode,
while SPWU done bit comes from hardware logic to indicate
successful operation.  Print a warning if the status bits
are out of sync, but no need to fail the special wakeup
operation.

Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Pratik R. Sampat <psampat@linux.ibm.com>
Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
---
 core/direct-controls.c | 34 +++++++++++++++++++++++++++-------
 hw/slw.c               | 30 +++++++++---------------------
 hw/xive2.c             | 26 +-------------------------
 3 files changed, 37 insertions(+), 53 deletions(-)

Comments

Nicholas Piggin July 21, 2021, 11:32 a.m. UTC | #1
Excerpts from Vasant Hegde's message of July 19, 2021 11:20 pm:
> From: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> 
> Cleanup P9 code pending implementation for P10.
> 
> P10 stop-api integration will be needed for STOP11
> support only.  STOP11 will be a restricted usage
> for testing core re-init on P10.  Only stop0,2,3
> will be available for general usage.
> 
> Also, do not treat gated core with SPW done as error.
> Core gating bit is a software state updated by microcode,
> while SPWU done bit comes from hardware logic to indicate
> successful operation.  Print a warning if the status bits
> are out of sync, but no need to fail the special wakeup
> operation.

Hmm, I wonder if similar should be done with P9 code first,
and then some of the P10 updates merged into the P10 enable
patches.

> 
> Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Pratik R. Sampat <psampat@linux.ibm.com>
> Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
> ---
>  core/direct-controls.c | 34 +++++++++++++++++++++++++++-------
>  hw/slw.c               | 30 +++++++++---------------------
>  hw/xive2.c             | 26 +-------------------------
>  3 files changed, 37 insertions(+), 53 deletions(-)
> 
> diff --git a/core/direct-controls.c b/core/direct-controls.c
> index 879a537af..4795c19dc 100644
> --- a/core/direct-controls.c
> +++ b/core/direct-controls.c
> @@ -600,15 +600,35 @@ static int p10_core_set_special_wakeup(struct cpu_thread *cpu)
>  			 * CORE_GATED will be unset on a successful special
>  			 * wakeup of the core which indicates that the core is
>  			 * out of stop state. If CORE_GATED is still set then
> -			 * raise error.
> +			 * check SPWU register and raise error only if SPWU_DONE
> +			 * is not set, else print a warning and consider SPWU
> +			 * operation as successful.
>  			 */
>  			if (p10_core_is_gated(cpu)) {
> -				/* Deassert spwu for this strange error */
> -				xscom_write(chip_id, spwu_addr, 0);
> -				prlog(PR_ERR, "Failed special wakeup on %u:%u"
> -						" core remains gated.\n",
> -						chip_id, core_id);
> -				return OPAL_HARDWARE;
> +				if(xscom_read(chip_id, spwu_addr, &val)) {
> +					prlog(PR_ERR, "Core %u:%u:"
> +					      " unable to read QME_SPWU_HYP\n",
> +					      chip_id, core_id);
> +                                        return OPAL_HARDWARE;
> +                                }
> +				if (val & P10_SPWU_DONE) {
> +					/*
> +					 * If SPWU DONE bit is set then
> +					 * SPWU operation is complete
> +					 */
> +					prlog(PR_WARNING, "Special wakeup on "
> +					      "%u:%u: core remains gated while"
> +					      " SPWU_HYP DONE set\n",
> +					      chip_id, core_id);
> +					return 0;
> +				}
> +                                /* Deassert spwu for this strange error */
> +                                xscom_write(chip_id, spwu_addr, 0);
> +                                prlog(PR_ERR,
> +                                      "Failed special wakeup on %u:%u"
> +                                      " core remains gated.\n",
> +                                      chip_id, core_id);
> +                                return OPAL_HARDWARE;

Should the equivalent be done for P9 here?

>  			} else {
>  				return 0;
>  			}
> diff --git a/hw/slw.c b/hw/slw.c
> index e22d1bdde..52536db06 100644
> --- a/hw/slw.c
> +++ b/hw/slw.c
> @@ -228,32 +228,20 @@ static bool slw_set_overrides_p10(struct proc_chip *chip, struct cpu_thread *c)
>  	int rc;
>  	uint32_t core = pir_to_core_id(c->pir);
>  
> -	/* Clear special wakeup bits that could hold power mgt */
> -	rc = xscom_write(chip->id,
> -			 XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
> -			 0);
> -	if (rc) {
> -		log_simple_error(&e_info(OPAL_RC_SLW_SET),
> -			"SLW: Failed to write P10_QME_SPWU_HYP\n");
> -		return false;
> -	}
> -	/* Read back for debug */
> +	/* Special wakeup bits that could hold power mgt */
>  	rc = xscom_read(chip->id,
>  			XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
>  			&tmp);
> -	if (tmp)
> +        if (rc) {
> +          log_simple_error(&e_info(OPAL_RC_SLW_SET),
> +                           "SLW: Failed to read P10_QME_SPWU_HYP\n");
> +          return false;
> +        }

Ditto here -- should the p9 code be changed? I wonder if the special 
wakeup check should be made in direct controls init code rather than 
here.

> +        if (tmp & P10_SPWU_REQ)
>  		prlog(PR_WARNING,
> -			"SLW: core %d P10_QME_SPWU_HYP read  0x%016llx\n",
> -		     core, tmp);
> -#if 0
> -	rc = xscom_read(chip->id,
> -			XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_OTR),
> -			&tmp);
> -	if (tmp)
> -		prlog(PR_WARNING,
> -			"SLW: core %d P10_QME_SPWU_OTR read  0x%016llx\n",
> +		        "SLW: core %d P10_QME_SPWU_HYP requested 0x%016llx\n",
>  		      core, tmp);
> -#endif
> +
>  	return true;
>  }
>  
> diff --git a/hw/xive2.c b/hw/xive2.c
> index b79635cc9..f559b0f9a 100644
> --- a/hw/xive2.c
> +++ b/hw/xive2.c
> @@ -3022,34 +3022,10 @@ static void xive_configure_ex_special_bar(struct xive *x, struct cpu_thread *c)
>  
>  void xive2_late_init(void)
>  {
> -	struct cpu_thread *c;
> -
>  	prlog(PR_INFO, "SLW: Configuring self-restore for NCU_SPEC_BAR\n");
> -	for_each_present_cpu(c) {
> -		if(cpu_is_thread0(c)) {
> -			struct proc_chip *chip = get_chip(c->chip_id);
> -			struct xive *x = chip->xive;
> -			uint64_t xa, val, rc;
> -			xa = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir), P10_NCU_SPEC_BAR);
> -			val = (uint64_t)x->tm_base | P10_NCU_SPEC_BAR_ENABLE;
> -			/* Bail out if wakeup engine has already failed */
> -			if ( wakeup_engine_state != WAKEUP_ENGINE_PRESENT) {
> -				prlog(PR_ERR, "XIVE p9_stop_api fail detected\n");
> -				break;
> -			}
>  			/*
> -			 * TODO (p10): need P10 stop state engine
> +			 * TODO (p10): need P10 stop state engine and fix for STOP11
>  			 */
> -			rc = p9_stop_save_scom((void *)chip->homer_base, xa, val,
> -				P9_STOP_SCOM_REPLACE, P9_STOP_SECTION_EQ_SCOM);
> -			if (rc) {
> -				xive_cpu_err(c, "p9_stop_api failed for NCU_SPEC_BAR rc=%lld\n",
> -				rc);
> -				wakeup_engine_state = WAKEUP_ENGINE_FAILED;
> -			}
> -		}
> -	}

This should probably be merged withthe xive2 code that added it.

Thanks,
Nick
C├ędric Le Goater July 21, 2021, 1:35 p.m. UTC | #2
>> diff --git a/hw/xive2.c b/hw/xive2.c
>> index b79635cc9..f559b0f9a 100644
>> --- a/hw/xive2.c
>> +++ b/hw/xive2.c
>> @@ -3022,34 +3022,10 @@ static void xive_configure_ex_special_bar(struct xive *x, struct cpu_thread *c)
>>  
>>  void xive2_late_init(void)
>>  {
>> -	struct cpu_thread *c;
>> -
>>  	prlog(PR_INFO, "SLW: Configuring self-restore for NCU_SPEC_BAR\n");
>> -	for_each_present_cpu(c) {
>> -		if(cpu_is_thread0(c)) {
>> -			struct proc_chip *chip = get_chip(c->chip_id);
>> -			struct xive *x = chip->xive;
>> -			uint64_t xa, val, rc;
>> -			xa = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir), P10_NCU_SPEC_BAR);
>> -			val = (uint64_t)x->tm_base | P10_NCU_SPEC_BAR_ENABLE;
>> -			/* Bail out if wakeup engine has already failed */
>> -			if ( wakeup_engine_state != WAKEUP_ENGINE_PRESENT) {
>> -				prlog(PR_ERR, "XIVE p9_stop_api fail detected\n");
>> -				break;
>> -			}
>>  			/*
>> -			 * TODO (p10): need P10 stop state engine
>> +			 * TODO (p10): need P10 stop state engine and fix for STOP11
>>  			 */
>> -			rc = p9_stop_save_scom((void *)chip->homer_base, xa, val,
>> -				P9_STOP_SCOM_REPLACE, P9_STOP_SECTION_EQ_SCOM);
>> -			if (rc) {
>> -				xive_cpu_err(c, "p9_stop_api failed for NCU_SPEC_BAR rc=%lld\n",
>> -				rc);
>> -				wakeup_engine_state = WAKEUP_ENGINE_FAILED;
>> -			}
>> -		}
>> -	}
> 
> This should probably be merged with the xive2 code that added it.

yes. I agree.

C.
Pratik Sampat July 23, 2021, 2:58 p.m. UTC | #3
On 21/07/21 5:02 pm, Nicholas Piggin wrote:
> Excerpts from Vasant Hegde's message of July 19, 2021 11:20 pm:
>> From: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
>>
>> Cleanup P9 code pending implementation for P10.
>>
>> P10 stop-api integration will be needed for STOP11
>> support only.  STOP11 will be a restricted usage
>> for testing core re-init on P10.  Only stop0,2,3
>> will be available for general usage.
>>
>> Also, do not treat gated core with SPW done as error.
>> Core gating bit is a software state updated by microcode,
>> while SPWU done bit comes from hardware logic to indicate
>> successful operation.  Print a warning if the status bits
>> are out of sync, but no need to fail the special wakeup
>> operation.
> Hmm, I wonder if similar should be done with P9 code first,
> and then some of the P10 updates merged into the P10 enable
> patches.

I believe P9 didn't fail the SPW command if the core is gated, that is just the
behavior I observe on P10, is that what you mean?

>> Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
>> Signed-off-by: Pratik R. Sampat <psampat@linux.ibm.com>
>> Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
>> ---
>>   core/direct-controls.c | 34 +++++++++++++++++++++++++++-------
>>   hw/slw.c               | 30 +++++++++---------------------
>>   hw/xive2.c             | 26 +-------------------------
>>   3 files changed, 37 insertions(+), 53 deletions(-)
>>
>> diff --git a/core/direct-controls.c b/core/direct-controls.c
>> index 879a537af..4795c19dc 100644
>> --- a/core/direct-controls.c
>> +++ b/core/direct-controls.c
>> @@ -600,15 +600,35 @@ static int p10_core_set_special_wakeup(struct cpu_thread *cpu)
>>   			 * CORE_GATED will be unset on a successful special
>>   			 * wakeup of the core which indicates that the core is
>>   			 * out of stop state. If CORE_GATED is still set then
>> -			 * raise error.
>> +			 * check SPWU register and raise error only if SPWU_DONE
>> +			 * is not set, else print a warning and consider SPWU
>> +			 * operation as successful.
>>   			 */
>>   			if (p10_core_is_gated(cpu)) {
>> -				/* Deassert spwu for this strange error */
>> -				xscom_write(chip_id, spwu_addr, 0);
>> -				prlog(PR_ERR, "Failed special wakeup on %u:%u"
>> -						" core remains gated.\n",
>> -						chip_id, core_id);
>> -				return OPAL_HARDWARE;
>> +				if(xscom_read(chip_id, spwu_addr, &val)) {
>> +					prlog(PR_ERR, "Core %u:%u:"
>> +					      " unable to read QME_SPWU_HYP\n",
>> +					      chip_id, core_id);
>> +                                        return OPAL_HARDWARE;
>> +                                }
>> +				if (val & P10_SPWU_DONE) {
>> +					/*
>> +					 * If SPWU DONE bit is set then
>> +					 * SPWU operation is complete
>> +					 */
>> +					prlog(PR_WARNING, "Special wakeup on "
>> +					      "%u:%u: core remains gated while"
>> +					      " SPWU_HYP DONE set\n",
>> +					      chip_id, core_id);
>> +					return 0;
>> +				}
>> +                                /* Deassert spwu for this strange error */
>> +                                xscom_write(chip_id, spwu_addr, 0);
>> +                                prlog(PR_ERR,
>> +                                      "Failed special wakeup on %u:%u"
>> +                                      " core remains gated.\n",
>> +                                      chip_id, core_id);
>> +                                return OPAL_HARDWARE;
> Should the equivalent be done for P9 here?

I believe P9 does not fail on special wakeup when core is gated hence, it may
not be necessary.

>>   			} else {
>>   				return 0;
>>   			}
>> diff --git a/hw/slw.c b/hw/slw.c
>> index e22d1bdde..52536db06 100644
>> --- a/hw/slw.c
>> +++ b/hw/slw.c
>> @@ -228,32 +228,20 @@ static bool slw_set_overrides_p10(struct proc_chip *chip, struct cpu_thread *c)
>>   	int rc;
>>   	uint32_t core = pir_to_core_id(c->pir);
>>   
>> -	/* Clear special wakeup bits that could hold power mgt */
>> -	rc = xscom_write(chip->id,
>> -			 XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
>> -			 0);
>> -	if (rc) {
>> -		log_simple_error(&e_info(OPAL_RC_SLW_SET),
>> -			"SLW: Failed to write P10_QME_SPWU_HYP\n");
>> -		return false;
>> -	}
>> -	/* Read back for debug */
>> +	/* Special wakeup bits that could hold power mgt */
>>   	rc = xscom_read(chip->id,
>>   			XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
>>   			&tmp);
>> -	if (tmp)
>> +        if (rc) {
>> +          log_simple_error(&e_info(OPAL_RC_SLW_SET),
>> +                           "SLW: Failed to read P10_QME_SPWU_HYP\n");
>> +          return false;
>> +        }
> Ditto here -- should the p9 code be changed? I wonder if the special
> wakeup check should be made in direct controls init code rather than
> here.

P9 made those check here too and that's why I followed pattern, however if you
believe this belongs in direct controls I could maybe clean it up for both P9
and P10.

>
>> +        if (tmp & P10_SPWU_REQ)
>>   		prlog(PR_WARNING,
>> -			"SLW: core %d P10_QME_SPWU_HYP read  0x%016llx\n",
>> -		     core, tmp);
>> -#if 0
>> -	rc = xscom_read(chip->id,
>> -			XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_OTR),
>> -			&tmp);
>> -	if (tmp)
>> -		prlog(PR_WARNING,
>> -			"SLW: core %d P10_QME_SPWU_OTR read  0x%016llx\n",
>> +		        "SLW: core %d P10_QME_SPWU_HYP requested 0x%016llx\n",
>>   		      core, tmp);
>> -#endif
>> +
>>   	return true;
>>   }
>>   
>> diff --git a/hw/xive2.c b/hw/xive2.c
>> index b79635cc9..f559b0f9a 100644
>> --- a/hw/xive2.c
>> +++ b/hw/xive2.c
>> @@ -3022,34 +3022,10 @@ static void xive_configure_ex_special_bar(struct xive *x, struct cpu_thread *c)
>>   
>>   void xive2_late_init(void)
>>   {
>> -	struct cpu_thread *c;
>> -
>>   	prlog(PR_INFO, "SLW: Configuring self-restore for NCU_SPEC_BAR\n");
>> -	for_each_present_cpu(c) {
>> -		if(cpu_is_thread0(c)) {
>> -			struct proc_chip *chip = get_chip(c->chip_id);
>> -			struct xive *x = chip->xive;
>> -			uint64_t xa, val, rc;
>> -			xa = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir), P10_NCU_SPEC_BAR);
>> -			val = (uint64_t)x->tm_base | P10_NCU_SPEC_BAR_ENABLE;
>> -			/* Bail out if wakeup engine has already failed */
>> -			if ( wakeup_engine_state != WAKEUP_ENGINE_PRESENT) {
>> -				prlog(PR_ERR, "XIVE p9_stop_api fail detected\n");
>> -				break;
>> -			}
>>   			/*
>> -			 * TODO (p10): need P10 stop state engine
>> +			 * TODO (p10): need P10 stop state engine and fix for STOP11
>>   			 */
>> -			rc = p9_stop_save_scom((void *)chip->homer_base, xa, val,
>> -				P9_STOP_SCOM_REPLACE, P9_STOP_SECTION_EQ_SCOM);
>> -			if (rc) {
>> -				xive_cpu_err(c, "p9_stop_api failed for NCU_SPEC_BAR rc=%lld\n",
>> -				rc);
>> -				wakeup_engine_state = WAKEUP_ENGINE_FAILED;
>> -			}
>> -		}
>> -	}
> This should probably be merged withthe xive2 code that added it.

I agree, I could move this hunk within the xive2 patch.
Thanks for pointing it out!

Thanks,
Pratik

> Thanks,
> Nick
Nicholas Piggin July 27, 2021, 3:37 a.m. UTC | #4
Excerpts from Pratik Sampat's message of July 24, 2021 12:58 am:
> 
> 
> On 21/07/21 5:02 pm, Nicholas Piggin wrote:
>> Excerpts from Vasant Hegde's message of July 19, 2021 11:20 pm:
>>> From: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
>>>
>>> Cleanup P9 code pending implementation for P10.
>>>
>>> P10 stop-api integration will be needed for STOP11
>>> support only.  STOP11 will be a restricted usage
>>> for testing core re-init on P10.  Only stop0,2,3
>>> will be available for general usage.
>>>
>>> Also, do not treat gated core with SPW done as error.
>>> Core gating bit is a software state updated by microcode,
>>> while SPWU done bit comes from hardware logic to indicate
>>> successful operation.  Print a warning if the status bits
>>> are out of sync, but no need to fail the special wakeup
>>> operation.
>> Hmm, I wonder if similar should be done with P9 code first,
>> and then some of the P10 updates merged into the P10 enable
>> patches.
> 
> I believe P9 didn't fail the SPW command if the core is gated, that is just the
> behavior I observe on P10, is that what you mean?

I mean changing from treating core gated as a SPW failure as P9 does,
to allowing it.

And P9 may not require it, but if it's a valid sequence on P9 as well,
I would prefer to keep them in synch. If the P9 and P10 sequences must 
be different, a small comment might be in order to explain.

>>> Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
>>> Signed-off-by: Pratik R. Sampat <psampat@linux.ibm.com>
>>> Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
>>> ---
>>>   core/direct-controls.c | 34 +++++++++++++++++++++++++++-------
>>>   hw/slw.c               | 30 +++++++++---------------------
>>>   hw/xive2.c             | 26 +-------------------------
>>>   3 files changed, 37 insertions(+), 53 deletions(-)
>>>
>>> diff --git a/core/direct-controls.c b/core/direct-controls.c
>>> index 879a537af..4795c19dc 100644
>>> --- a/core/direct-controls.c
>>> +++ b/core/direct-controls.c
>>> @@ -600,15 +600,35 @@ static int p10_core_set_special_wakeup(struct cpu_thread *cpu)
>>>   			 * CORE_GATED will be unset on a successful special
>>>   			 * wakeup of the core which indicates that the core is
>>>   			 * out of stop state. If CORE_GATED is still set then
>>> -			 * raise error.
>>> +			 * check SPWU register and raise error only if SPWU_DONE
>>> +			 * is not set, else print a warning and consider SPWU
>>> +			 * operation as successful.
>>>   			 */
>>>   			if (p10_core_is_gated(cpu)) {
>>> -				/* Deassert spwu for this strange error */
>>> -				xscom_write(chip_id, spwu_addr, 0);
>>> -				prlog(PR_ERR, "Failed special wakeup on %u:%u"
>>> -						" core remains gated.\n",
>>> -						chip_id, core_id);
>>> -				return OPAL_HARDWARE;
>>> +				if(xscom_read(chip_id, spwu_addr, &val)) {
>>> +					prlog(PR_ERR, "Core %u:%u:"
>>> +					      " unable to read QME_SPWU_HYP\n",
>>> +					      chip_id, core_id);
>>> +                                        return OPAL_HARDWARE;
>>> +                                }

Shoud clear the spwu request in case of errors to be consistent.

>>> +				if (val & P10_SPWU_DONE) {
>>> +					/*
>>> +					 * If SPWU DONE bit is set then
>>> +					 * SPWU operation is complete
>>> +					 */
>>> +					prlog(PR_WARNING, "Special wakeup on "
>>> +					      "%u:%u: core remains gated while"
>>> +					      " SPWU_HYP DONE set\n",
>>> +					      chip_id, core_id);
>>> +					return 0;
>>> +				}

So succeeded, but we have this error message. Does that mean microcode 
has a bug? Under what circumstances does this happen?

>>> +                                /* Deassert spwu for this strange error */
>>> +                                xscom_write(chip_id, spwu_addr, 0);
>>> +                                prlog(PR_ERR,
>>> +                                      "Failed special wakeup on %u:%u"
>>> +                                      " core remains gated.\n",
>>> +                                      chip_id, core_id);
>>> +                                return OPAL_HARDWARE;
>> Should the equivalent be done for P9 here?
> 
> I believe P9 does not fail on special wakeup when core is gated hence, it may
> not be necessary.

Do you mean that P9 clears the core gated bit, but P10 does not?

> 
>>>   			} else {
>>>   				return 0;
>>>   			}
>>> diff --git a/hw/slw.c b/hw/slw.c
>>> index e22d1bdde..52536db06 100644
>>> --- a/hw/slw.c
>>> +++ b/hw/slw.c
>>> @@ -228,32 +228,20 @@ static bool slw_set_overrides_p10(struct proc_chip *chip, struct cpu_thread *c)
>>>   	int rc;
>>>   	uint32_t core = pir_to_core_id(c->pir);
>>>   
>>> -	/* Clear special wakeup bits that could hold power mgt */
>>> -	rc = xscom_write(chip->id,
>>> -			 XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
>>> -			 0);
>>> -	if (rc) {
>>> -		log_simple_error(&e_info(OPAL_RC_SLW_SET),
>>> -			"SLW: Failed to write P10_QME_SPWU_HYP\n");
>>> -		return false;
>>> -	}
>>> -	/* Read back for debug */
>>> +	/* Special wakeup bits that could hold power mgt */
>>>   	rc = xscom_read(chip->id,
>>>   			XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
>>>   			&tmp);
>>> -	if (tmp)
>>> +        if (rc) {
>>> +          log_simple_error(&e_info(OPAL_RC_SLW_SET),
>>> +                           "SLW: Failed to read P10_QME_SPWU_HYP\n");
>>> +          return false;
>>> +        }
>> Ditto here -- should the p9 code be changed? I wonder if the special
>> wakeup check should be made in direct controls init code rather than
>> here.
> 
> P9 made those check here too and that's why I followed pattern, however if you
> believe this belongs in direct controls I could maybe clean it up for both P9
> and P10.

Well P9 just does a blind clear, whereas you are checking it here (and 
seems you don't try to clear at all).

I just wonder if that's the better way to go for P9 code as well, so 
they don't have to diverge?

And yes I think it might be nice to just move that (at least P9 and P10)
out of SLW and into direct-controls init entirely. The only reason not 
to AFAIKS would be if some bring-up firmware without power management
just keeps special wakeup asserted.

Thanks,
Nick
Pratik Sampat July 30, 2021, 7:08 a.m. UTC | #5
Hello,

Apologies for late response, I was collating answers for the questions from
@vaidy and the firmware team.


On 27/07/21 9:07 am, Nicholas Piggin wrote:
> Excerpts from Pratik Sampat's message of July 24, 2021 12:58 am:
>>
>> On 21/07/21 5:02 pm, Nicholas Piggin wrote:
>>> Excerpts from Vasant Hegde's message of July 19, 2021 11:20 pm:
>>>> From: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
>>>>
>>>> Cleanup P9 code pending implementation for P10.
>>>>
>>>> P10 stop-api integration will be needed for STOP11
>>>> support only.  STOP11 will be a restricted usage
>>>> for testing core re-init on P10.  Only stop0,2,3
>>>> will be available for general usage.
>>>>
>>>> Also, do not treat gated core with SPW done as error.
>>>> Core gating bit is a software state updated by microcode,
>>>> while SPWU done bit comes from hardware logic to indicate
>>>> successful operation.  Print a warning if the status bits
>>>> are out of sync, but no need to fail the special wakeup
>>>> operation.
>>> Hmm, I wonder if similar should be done with P9 code first,
>>> and then some of the P10 updates merged into the P10 enable
>>> patches.
>> I believe P9 didn't fail the SPW command if the core is gated, that is just the
>> behavior I observe on P10, is that what you mean?
> I mean changing from treating core gated as a SPW failure as P9 does,
> to allowing it.
>
> And P9 may not require it, but if it's a valid sequence on P9 as well,
> I would prefer to keep them in synch. If the P9 and P10 sequences must
> be different, a small comment might be in order to explain.
>
On P9 the behavior on the check failing is properly defined to fail. However, on
P10 there does seem to be a bug which seems to say if the core gated but SPW is
done it is still a pass.

Sure, I could add a comment on it explaining about this in the code.

>>>> Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
>>>> Signed-off-by: Pratik R. Sampat <psampat@linux.ibm.com>
>>>> Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
>>>> ---
>>>>    core/direct-controls.c | 34 +++++++++++++++++++++++++++-------
>>>>    hw/slw.c               | 30 +++++++++---------------------
>>>>    hw/xive2.c             | 26 +-------------------------
>>>>    3 files changed, 37 insertions(+), 53 deletions(-)
>>>>
>>>> diff --git a/core/direct-controls.c b/core/direct-controls.c
>>>> index 879a537af..4795c19dc 100644
>>>> --- a/core/direct-controls.c
>>>> +++ b/core/direct-controls.c
>>>> @@ -600,15 +600,35 @@ static int p10_core_set_special_wakeup(struct cpu_thread *cpu)
>>>>    			 * CORE_GATED will be unset on a successful special
>>>>    			 * wakeup of the core which indicates that the core is
>>>>    			 * out of stop state. If CORE_GATED is still set then
>>>> -			 * raise error.
>>>> +			 * check SPWU register and raise error only if SPWU_DONE
>>>> +			 * is not set, else print a warning and consider SPWU
>>>> +			 * operation as successful.
>>>>    			 */
>>>>    			if (p10_core_is_gated(cpu)) {
>>>> -				/* Deassert spwu for this strange error */
>>>> -				xscom_write(chip_id, spwu_addr, 0);
>>>> -				prlog(PR_ERR, "Failed special wakeup on %u:%u"
>>>> -						" core remains gated.\n",
>>>> -						chip_id, core_id);
>>>> -				return OPAL_HARDWARE;
>>>> +				if(xscom_read(chip_id, spwu_addr, &val)) {
>>>> +					prlog(PR_ERR, "Core %u:%u:"
>>>> +					      " unable to read QME_SPWU_HYP\n",
>>>> +					      chip_id, core_id);
>>>> +                                        return OPAL_HARDWARE;
>>>> +                                }
> Shoud clear the spwu request in case of errors to be consistent.
>
If we fail to read QME_SPWU_HYP then we have to just fail and return because
chances of we writing same QME_SPWU_HYP xscom to clear SPW is less. This can
happen if xscom engine is broken or a code bug or we are targeting some dead
core/invalid core etc.

So I would be inclined to not try any other xscom, just return error as is.

>>>> +				if (val & P10_SPWU_DONE) {
>>>> +					/*
>>>> +					 * If SPWU DONE bit is set then
>>>> +					 * SPWU operation is complete
>>>> +					 */
>>>> +					prlog(PR_WARNING, "Special wakeup on "
>>>> +					      "%u:%u: core remains gated while"
>>>> +					      " SPWU_HYP DONE set\n",
>>>> +					      chip_id, core_id);
>>>> +					return 0;
>>>> +				}
> So succeeded, but we have this error message. Does that mean microcode
> has a bug? Under what circumstances does this happen?

I just confirmed that this is indeed a microcode bug seen only on P10, So
instead of failing SPW and hence the callers, we know that SPW has succeeded by
checking both the sources: SPWU_HYP and SSH_HYP.  So we print a warning and
make the SPW call success.

>>>> +                                /* Deassert spwu for this strange error */
>>>> +                                xscom_write(chip_id, spwu_addr, 0);
>>>> +                                prlog(PR_ERR,
>>>> +                                      "Failed special wakeup on %u:%u"
>>>> +                                      " core remains gated.\n",
>>>> +                                      chip_id, core_id);
>>>> +                                return OPAL_HARDWARE;
>>> Should the equivalent be done for P9 here?
>> I believe P9 does not fail on special wakeup when core is gated hence, it may
>> not be necessary.
> Do you mean that P9 clears the core gated bit, but P10 does not?

No, I think you're right. Even though we haven't hit this issue in P9. I think
we should still keep the de-assert logic mimicked for P9 as well.

>>>>    			} else {
>>>>    				return 0;
>>>>    			}
>>>> diff --git a/hw/slw.c b/hw/slw.c
>>>> index e22d1bdde..52536db06 100644
>>>> --- a/hw/slw.c
>>>> +++ b/hw/slw.c
>>>> @@ -228,32 +228,20 @@ static bool slw_set_overrides_p10(struct proc_chip *chip, struct cpu_thread *c)
>>>>    	int rc;
>>>>    	uint32_t core = pir_to_core_id(c->pir);
>>>>    
>>>> -	/* Clear special wakeup bits that could hold power mgt */
>>>> -	rc = xscom_write(chip->id,
>>>> -			 XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
>>>> -			 0);
>>>> -	if (rc) {
>>>> -		log_simple_error(&e_info(OPAL_RC_SLW_SET),
>>>> -			"SLW: Failed to write P10_QME_SPWU_HYP\n");
>>>> -		return false;
>>>> -	}
>>>> -	/* Read back for debug */
>>>> +	/* Special wakeup bits that could hold power mgt */
>>>>    	rc = xscom_read(chip->id,
>>>>    			XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
>>>>    			&tmp);
>>>> -	if (tmp)
>>>> +        if (rc) {
>>>> +          log_simple_error(&e_info(OPAL_RC_SLW_SET),
>>>> +                           "SLW: Failed to read P10_QME_SPWU_HYP\n");
>>>> +          return false;
>>>> +        }
>>> Ditto here -- should the p9 code be changed? I wonder if the special
>>> wakeup check should be made in direct controls init code rather than
>>> here.
>> P9 made those check here too and that's why I followed pattern, however if you
>> believe this belongs in direct controls I could maybe clean it up for both P9
>> and P10.
> Well P9 just does a blind clear, whereas you are checking it here (and
> seems you don't try to clear at all).
>
> I just wonder if that's the better way to go for P9 code as well, so
> they don't have to diverge?

I agree. For P9, we probably should not do a blind clear and just check for the
bit and report. I'll clean that up for P9.

>
> And yes I think it might be nice to just move that (at least P9 and P10)
> out of SLW and into direct-controls init entirely. The only reason not
> to AFAIKS would be if some bring-up firmware without power management
> just keeps special wakeup asserted.

I would still be inclined to keep this in SLW init as now we know that it's job
is to just check and report and moving this check to direct controls may cause
more confusion.
However, if you still believe that it would be better suited in direct-controls
I would be happy to move it there after testing once if there isn't any
microcode interaction that keeps SPW asserted at that point.

Thanks
Pratik

> Thanks,
> Nick
Nicholas Piggin Aug. 2, 2021, 2:13 a.m. UTC | #6
Excerpts from Pratik Sampat's message of July 30, 2021 5:08 pm:
> Hello,
> 
> Apologies for late response, I was collating answers for the questions from
> @vaidy and the firmware team.

No problem thanks for checking.

> On 27/07/21 9:07 am, Nicholas Piggin wrote:
>> Excerpts from Pratik Sampat's message of July 24, 2021 12:58 am:
>>>
>>> On 21/07/21 5:02 pm, Nicholas Piggin wrote:
>>>> Excerpts from Vasant Hegde's message of July 19, 2021 11:20 pm:
>>>>> From: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
>>>>>
>>>>> Cleanup P9 code pending implementation for P10.
>>>>>
>>>>> P10 stop-api integration will be needed for STOP11
>>>>> support only.  STOP11 will be a restricted usage
>>>>> for testing core re-init on P10.  Only stop0,2,3
>>>>> will be available for general usage.
>>>>>
>>>>> Also, do not treat gated core with SPW done as error.
>>>>> Core gating bit is a software state updated by microcode,
>>>>> while SPWU done bit comes from hardware logic to indicate
>>>>> successful operation.  Print a warning if the status bits
>>>>> are out of sync, but no need to fail the special wakeup
>>>>> operation.
>>>> Hmm, I wonder if similar should be done with P9 code first,
>>>> and then some of the P10 updates merged into the P10 enable
>>>> patches.
>>> I believe P9 didn't fail the SPW command if the core is gated, that is just the
>>> behavior I observe on P10, is that what you mean?
>> I mean changing from treating core gated as a SPW failure as P9 does,
>> to allowing it.
>>
>> And P9 may not require it, but if it's a valid sequence on P9 as well,
>> I would prefer to keep them in synch. If the P9 and P10 sequences must
>> be different, a small comment might be in order to explain.
>>
> On P9 the behavior on the check failing is properly defined to fail. However, on
> P10 there does seem to be a bug which seems to say if the core gated but SPW is
> done it is still a pass.
> 
> Sure, I could add a comment on it explaining about this in the code.

Yeah that would be good.

> 
>>>>> Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
>>>>> Signed-off-by: Pratik R. Sampat <psampat@linux.ibm.com>
>>>>> Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
>>>>> ---
>>>>>    core/direct-controls.c | 34 +++++++++++++++++++++++++++-------
>>>>>    hw/slw.c               | 30 +++++++++---------------------
>>>>>    hw/xive2.c             | 26 +-------------------------
>>>>>    3 files changed, 37 insertions(+), 53 deletions(-)
>>>>>
>>>>> diff --git a/core/direct-controls.c b/core/direct-controls.c
>>>>> index 879a537af..4795c19dc 100644
>>>>> --- a/core/direct-controls.c
>>>>> +++ b/core/direct-controls.c
>>>>> @@ -600,15 +600,35 @@ static int p10_core_set_special_wakeup(struct cpu_thread *cpu)
>>>>>    			 * CORE_GATED will be unset on a successful special
>>>>>    			 * wakeup of the core which indicates that the core is
>>>>>    			 * out of stop state. If CORE_GATED is still set then
>>>>> -			 * raise error.
>>>>> +			 * check SPWU register and raise error only if SPWU_DONE
>>>>> +			 * is not set, else print a warning and consider SPWU
>>>>> +			 * operation as successful.
>>>>>    			 */
>>>>>    			if (p10_core_is_gated(cpu)) {
>>>>> -				/* Deassert spwu for this strange error */
>>>>> -				xscom_write(chip_id, spwu_addr, 0);
>>>>> -				prlog(PR_ERR, "Failed special wakeup on %u:%u"
>>>>> -						" core remains gated.\n",
>>>>> -						chip_id, core_id);
>>>>> -				return OPAL_HARDWARE;
>>>>> +				if(xscom_read(chip_id, spwu_addr, &val)) {
>>>>> +					prlog(PR_ERR, "Core %u:%u:"
>>>>> +					      " unable to read QME_SPWU_HYP\n",
>>>>> +					      chip_id, core_id);
>>>>> +                                        return OPAL_HARDWARE;
>>>>> +                                }
>> Shoud clear the spwu request in case of errors to be consistent.
>>
> If we fail to read QME_SPWU_HYP then we have to just fail and return because
> chances of we writing same QME_SPWU_HYP xscom to clear SPW is less. This can
> happen if xscom engine is broken or a code bug or we are targeting some dead
> core/invalid core etc.
> 
> So I would be inclined to not try any other xscom, just return error as is.

Okay you're right, other code does not do this, it's only the strange 
gated error that does.

> 
>>>>> +				if (val & P10_SPWU_DONE) {
>>>>> +					/*
>>>>> +					 * If SPWU DONE bit is set then
>>>>> +					 * SPWU operation is complete
>>>>> +					 */
>>>>> +					prlog(PR_WARNING, "Special wakeup on "
>>>>> +					      "%u:%u: core remains gated while"
>>>>> +					      " SPWU_HYP DONE set\n",
>>>>> +					      chip_id, core_id);
>>>>> +					return 0;
>>>>> +				}
>> So succeeded, but we have this error message. Does that mean microcode
>> has a bug? Under what circumstances does this happen?
> 
> I just confirmed that this is indeed a microcode bug seen only on P10, So
> instead of failing SPW and hence the callers, we know that SPW has succeeded by
> checking both the sources: SPWU_HYP and SSH_HYP.  So we print a warning and
> make the SPW call success.

May not need to be a warning then unless it would be useful for 
debugging. In that case possibly use PR_DEBUG. Some skiboots are 
way too verbose and scary which we have to watch out for. If your
firmware starts printing messages like this then you'd be inclined to 
think the machine is failing or unstable.

> 
>>>>> +                                /* Deassert spwu for this strange error */
>>>>> +                                xscom_write(chip_id, spwu_addr, 0);
>>>>> +                                prlog(PR_ERR,
>>>>> +                                      "Failed special wakeup on %u:%u"
>>>>> +                                      " core remains gated.\n",
>>>>> +                                      chip_id, core_id);
>>>>> +                                return OPAL_HARDWARE;
>>>> Should the equivalent be done for P9 here?
>>> I believe P9 does not fail on special wakeup when core is gated hence, it may
>>> not be necessary.
>> Do you mean that P9 clears the core gated bit, but P10 does not?
> 
> No, I think you're right. Even though we haven't hit this issue in P9. I think
> we should still keep the de-assert logic mimicked for P9 as well.
> 
>>>>>    			} else {
>>>>>    				return 0;
>>>>>    			}
>>>>> diff --git a/hw/slw.c b/hw/slw.c
>>>>> index e22d1bdde..52536db06 100644
>>>>> --- a/hw/slw.c
>>>>> +++ b/hw/slw.c
>>>>> @@ -228,32 +228,20 @@ static bool slw_set_overrides_p10(struct proc_chip *chip, struct cpu_thread *c)
>>>>>    	int rc;
>>>>>    	uint32_t core = pir_to_core_id(c->pir);
>>>>>    
>>>>> -	/* Clear special wakeup bits that could hold power mgt */
>>>>> -	rc = xscom_write(chip->id,
>>>>> -			 XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
>>>>> -			 0);
>>>>> -	if (rc) {
>>>>> -		log_simple_error(&e_info(OPAL_RC_SLW_SET),
>>>>> -			"SLW: Failed to write P10_QME_SPWU_HYP\n");
>>>>> -		return false;
>>>>> -	}
>>>>> -	/* Read back for debug */
>>>>> +	/* Special wakeup bits that could hold power mgt */
>>>>>    	rc = xscom_read(chip->id,
>>>>>    			XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
>>>>>    			&tmp);
>>>>> -	if (tmp)
>>>>> +        if (rc) {
>>>>> +          log_simple_error(&e_info(OPAL_RC_SLW_SET),
>>>>> +                           "SLW: Failed to read P10_QME_SPWU_HYP\n");
>>>>> +          return false;
>>>>> +        }
>>>> Ditto here -- should the p9 code be changed? I wonder if the special
>>>> wakeup check should be made in direct controls init code rather than
>>>> here.
>>> P9 made those check here too and that's why I followed pattern, however if you
>>> believe this belongs in direct controls I could maybe clean it up for both P9
>>> and P10.
>> Well P9 just does a blind clear, whereas you are checking it here (and
>> seems you don't try to clear at all).
>>
>> I just wonder if that's the better way to go for P9 code as well, so
>> they don't have to diverge?
> 
> I agree. For P9, we probably should not do a blind clear and just check for the
> bit and report. I'll clean that up for P9.

Thanks, I agree.

> 
>>
>> And yes I think it might be nice to just move that (at least P9 and P10)
>> out of SLW and into direct-controls init entirely. The only reason not
>> to AFAIKS would be if some bring-up firmware without power management
>> just keeps special wakeup asserted.
> 
> I would still be inclined to keep this in SLW init as now we know that it's job
> is to just check and report and moving this check to direct controls may cause
> more confusion.
> However, if you still believe that it would be better suited in direct-controls
> I would be happy to move it there after testing once if there isn't any
> microcode interaction that keeps SPW asserted at that point.

Sure leave it there for now, it's not a big deal.

Thanks,
Nick
Pratik Sampat Aug. 2, 2021, 7:14 a.m. UTC | #7
On 02/08/21 7:43 am, Nicholas Piggin wrote:
> Excerpts from Pratik Sampat's message of July 30, 2021 5:08 pm:
>> Hello,
>>
>> Apologies for late response, I was collating answers for the questions from
>> @vaidy and the firmware team.
> No problem thanks for checking.
>
>> On 27/07/21 9:07 am, Nicholas Piggin wrote:
>>> Excerpts from Pratik Sampat's message of July 24, 2021 12:58 am:
>>>> On 21/07/21 5:02 pm, Nicholas Piggin wrote:
>>>>> Excerpts from Vasant Hegde's message of July 19, 2021 11:20 pm:
>>>>>> From: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
>>>>>>
>>>>>> Cleanup P9 code pending implementation for P10.
>>>>>>
>>>>>> P10 stop-api integration will be needed for STOP11
>>>>>> support only.  STOP11 will be a restricted usage
>>>>>> for testing core re-init on P10.  Only stop0,2,3
>>>>>> will be available for general usage.
>>>>>>
>>>>>> Also, do not treat gated core with SPW done as error.
>>>>>> Core gating bit is a software state updated by microcode,
>>>>>> while SPWU done bit comes from hardware logic to indicate
>>>>>> successful operation.  Print a warning if the status bits
>>>>>> are out of sync, but no need to fail the special wakeup
>>>>>> operation.
>>>>> Hmm, I wonder if similar should be done with P9 code first,
>>>>> and then some of the P10 updates merged into the P10 enable
>>>>> patches.
>>>> I believe P9 didn't fail the SPW command if the core is gated, that is just the
>>>> behavior I observe on P10, is that what you mean?
>>> I mean changing from treating core gated as a SPW failure as P9 does,
>>> to allowing it.
>>>
>>> And P9 may not require it, but if it's a valid sequence on P9 as well,
>>> I would prefer to keep them in synch. If the P9 and P10 sequences must
>>> be different, a small comment might be in order to explain.
>>>
>> On P9 the behavior on the check failing is properly defined to fail. However, on
>> P10 there does seem to be a bug which seems to say if the core gated but SPW is
>> done it is still a pass.
>>
>> Sure, I could add a comment on it explaining about this in the code.
> Yeah that would be good.
>
Sure, added.

>>>>>> Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
>>>>>> Signed-off-by: Pratik R. Sampat <psampat@linux.ibm.com>
>>>>>> Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
>>>>>> ---
>>>>>>     core/direct-controls.c | 34 +++++++++++++++++++++++++++-------
>>>>>>     hw/slw.c               | 30 +++++++++---------------------
>>>>>>     hw/xive2.c             | 26 +-------------------------
>>>>>>     3 files changed, 37 insertions(+), 53 deletions(-)
>>>>>>
>>>>>> diff --git a/core/direct-controls.c b/core/direct-controls.c
>>>>>> index 879a537af..4795c19dc 100644
>>>>>> --- a/core/direct-controls.c
>>>>>> +++ b/core/direct-controls.c
>>>>>> @@ -600,15 +600,35 @@ static int p10_core_set_special_wakeup(struct cpu_thread *cpu)
>>>>>>     			 * CORE_GATED will be unset on a successful special
>>>>>>     			 * wakeup of the core which indicates that the core is
>>>>>>     			 * out of stop state. If CORE_GATED is still set then
>>>>>> -			 * raise error.
>>>>>> +			 * check SPWU register and raise error only if SPWU_DONE
>>>>>> +			 * is not set, else print a warning and consider SPWU
>>>>>> +			 * operation as successful.
>>>>>>     			 */
>>>>>>     			if (p10_core_is_gated(cpu)) {
>>>>>> -				/* Deassert spwu for this strange error */
>>>>>> -				xscom_write(chip_id, spwu_addr, 0);
>>>>>> -				prlog(PR_ERR, "Failed special wakeup on %u:%u"
>>>>>> -						" core remains gated.\n",
>>>>>> -						chip_id, core_id);
>>>>>> -				return OPAL_HARDWARE;
>>>>>> +				if(xscom_read(chip_id, spwu_addr, &val)) {
>>>>>> +					prlog(PR_ERR, "Core %u:%u:"
>>>>>> +					      " unable to read QME_SPWU_HYP\n",
>>>>>> +					      chip_id, core_id);
>>>>>> +                                        return OPAL_HARDWARE;
>>>>>> +                                }
>>> Shoud clear the spwu request in case of errors to be consistent.
>>>
>> If we fail to read QME_SPWU_HYP then we have to just fail and return because
>> chances of we writing same QME_SPWU_HYP xscom to clear SPW is less. This can
>> happen if xscom engine is broken or a code bug or we are targeting some dead
>> core/invalid core etc.
>>
>> So I would be inclined to not try any other xscom, just return error as is.
> Okay you're right, other code does not do this, it's only the strange
> gated error that does.
>
>>>>>> +				if (val & P10_SPWU_DONE) {
>>>>>> +					/*
>>>>>> +					 * If SPWU DONE bit is set then
>>>>>> +					 * SPWU operation is complete
>>>>>> +					 */
>>>>>> +					prlog(PR_WARNING, "Special wakeup on "
>>>>>> +					      "%u:%u: core remains gated while"
>>>>>> +					      " SPWU_HYP DONE set\n",
>>>>>> +					      chip_id, core_id);
>>>>>> +					return 0;
>>>>>> +				}
>>> So succeeded, but we have this error message. Does that mean microcode
>>> has a bug? Under what circumstances does this happen?
>> I just confirmed that this is indeed a microcode bug seen only on P10, So
>> instead of failing SPW and hence the callers, we know that SPW has succeeded by
>> checking both the sources: SPWU_HYP and SSH_HYP.  So we print a warning and
>> make the SPW call success.
> May not need to be a warning then unless it would be useful for
> debugging. In that case possibly use PR_DEBUG. Some skiboots are
> way too verbose and scary which we have to watch out for. If your
> firmware starts printing messages like this then you'd be inclined to
> think the machine is failing or unstable.
>
Sure I understand, this warning can cause unnecessary worry.
I can convert this to a PR_DEBUG then.

>>>>>> +                                /* Deassert spwu for this strange error */
>>>>>> +                                xscom_write(chip_id, spwu_addr, 0);
>>>>>> +                                prlog(PR_ERR,
>>>>>> +                                      "Failed special wakeup on %u:%u"
>>>>>> +                                      " core remains gated.\n",
>>>>>> +                                      chip_id, core_id);
>>>>>> +                                return OPAL_HARDWARE;
>>>>> Should the equivalent be done for P9 here?
>>>> I believe P9 does not fail on special wakeup when core is gated hence, it may
>>>> not be necessary.
>>> Do you mean that P9 clears the core gated bit, but P10 does not?
>> No, I think you're right. Even though we haven't hit this issue in P9. I think
>> we should still keep the de-assert logic mimicked for P9 as well.
>>
>>>>>>     			} else {
>>>>>>     				return 0;
>>>>>>     			}
>>>>>> diff --git a/hw/slw.c b/hw/slw.c
>>>>>> index e22d1bdde..52536db06 100644
>>>>>> --- a/hw/slw.c
>>>>>> +++ b/hw/slw.c
>>>>>> @@ -228,32 +228,20 @@ static bool slw_set_overrides_p10(struct proc_chip *chip, struct cpu_thread *c)
>>>>>>     	int rc;
>>>>>>     	uint32_t core = pir_to_core_id(c->pir);
>>>>>>     
>>>>>> -	/* Clear special wakeup bits that could hold power mgt */
>>>>>> -	rc = xscom_write(chip->id,
>>>>>> -			 XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
>>>>>> -			 0);
>>>>>> -	if (rc) {
>>>>>> -		log_simple_error(&e_info(OPAL_RC_SLW_SET),
>>>>>> -			"SLW: Failed to write P10_QME_SPWU_HYP\n");
>>>>>> -		return false;
>>>>>> -	}
>>>>>> -	/* Read back for debug */
>>>>>> +	/* Special wakeup bits that could hold power mgt */
>>>>>>     	rc = xscom_read(chip->id,
>>>>>>     			XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
>>>>>>     			&tmp);
>>>>>> -	if (tmp)
>>>>>> +        if (rc) {
>>>>>> +          log_simple_error(&e_info(OPAL_RC_SLW_SET),
>>>>>> +                           "SLW: Failed to read P10_QME_SPWU_HYP\n");
>>>>>> +          return false;
>>>>>> +        }
>>>>> Ditto here -- should the p9 code be changed? I wonder if the special
>>>>> wakeup check should be made in direct controls init code rather than
>>>>> here.
>>>> P9 made those check here too and that's why I followed pattern, however if you
>>>> believe this belongs in direct controls I could maybe clean it up for both P9
>>>> and P10.
>>> Well P9 just does a blind clear, whereas you are checking it here (and
>>> seems you don't try to clear at all).
>>>
>>> I just wonder if that's the better way to go for P9 code as well, so
>>> they don't have to diverge?
>> I agree. For P9, we probably should not do a blind clear and just check for the
>> bit and report. I'll clean that up for P9.
> Thanks, I agree.
>
>>> And yes I think it might be nice to just move that (at least P9 and P10)
>>> out of SLW and into direct-controls init entirely. The only reason not
>>> to AFAIKS would be if some bring-up firmware without power management
>>> just keeps special wakeup asserted.
>> I would still be inclined to keep this in SLW init as now we know that it's job
>> is to just check and report and moving this check to direct controls may cause
>> more confusion.
>> However, if you still believe that it would be better suited in direct-controls
>> I would be happy to move it there after testing once if there isn't any
>> microcode interaction that keeps SPW asserted at that point.
> Sure leave it there for now, it's not a big deal.

Sure thing, I'll leave it as-is for now.

Thanks,
Pratik

> Thanks,
> Nick
diff mbox series

Patch

diff --git a/core/direct-controls.c b/core/direct-controls.c
index 879a537af..4795c19dc 100644
--- a/core/direct-controls.c
+++ b/core/direct-controls.c
@@ -600,15 +600,35 @@  static int p10_core_set_special_wakeup(struct cpu_thread *cpu)
 			 * CORE_GATED will be unset on a successful special
 			 * wakeup of the core which indicates that the core is
 			 * out of stop state. If CORE_GATED is still set then
-			 * raise error.
+			 * check SPWU register and raise error only if SPWU_DONE
+			 * is not set, else print a warning and consider SPWU
+			 * operation as successful.
 			 */
 			if (p10_core_is_gated(cpu)) {
-				/* Deassert spwu for this strange error */
-				xscom_write(chip_id, spwu_addr, 0);
-				prlog(PR_ERR, "Failed special wakeup on %u:%u"
-						" core remains gated.\n",
-						chip_id, core_id);
-				return OPAL_HARDWARE;
+				if(xscom_read(chip_id, spwu_addr, &val)) {
+					prlog(PR_ERR, "Core %u:%u:"
+					      " unable to read QME_SPWU_HYP\n",
+					      chip_id, core_id);
+                                        return OPAL_HARDWARE;
+                                }
+				if (val & P10_SPWU_DONE) {
+					/*
+					 * If SPWU DONE bit is set then
+					 * SPWU operation is complete
+					 */
+					prlog(PR_WARNING, "Special wakeup on "
+					      "%u:%u: core remains gated while"
+					      " SPWU_HYP DONE set\n",
+					      chip_id, core_id);
+					return 0;
+				}
+                                /* Deassert spwu for this strange error */
+                                xscom_write(chip_id, spwu_addr, 0);
+                                prlog(PR_ERR,
+                                      "Failed special wakeup on %u:%u"
+                                      " core remains gated.\n",
+                                      chip_id, core_id);
+                                return OPAL_HARDWARE;
 			} else {
 				return 0;
 			}
diff --git a/hw/slw.c b/hw/slw.c
index e22d1bdde..52536db06 100644
--- a/hw/slw.c
+++ b/hw/slw.c
@@ -228,32 +228,20 @@  static bool slw_set_overrides_p10(struct proc_chip *chip, struct cpu_thread *c)
 	int rc;
 	uint32_t core = pir_to_core_id(c->pir);
 
-	/* Clear special wakeup bits that could hold power mgt */
-	rc = xscom_write(chip->id,
-			 XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
-			 0);
-	if (rc) {
-		log_simple_error(&e_info(OPAL_RC_SLW_SET),
-			"SLW: Failed to write P10_QME_SPWU_HYP\n");
-		return false;
-	}
-	/* Read back for debug */
+	/* Special wakeup bits that could hold power mgt */
 	rc = xscom_read(chip->id,
 			XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_HYP),
 			&tmp);
-	if (tmp)
+        if (rc) {
+          log_simple_error(&e_info(OPAL_RC_SLW_SET),
+                           "SLW: Failed to read P10_QME_SPWU_HYP\n");
+          return false;
+        }
+        if (tmp & P10_SPWU_REQ)
 		prlog(PR_WARNING,
-			"SLW: core %d P10_QME_SPWU_HYP read  0x%016llx\n",
-		     core, tmp);
-#if 0
-	rc = xscom_read(chip->id,
-			XSCOM_ADDR_P10_QME_CORE(core, P10_QME_SPWU_OTR),
-			&tmp);
-	if (tmp)
-		prlog(PR_WARNING,
-			"SLW: core %d P10_QME_SPWU_OTR read  0x%016llx\n",
+		        "SLW: core %d P10_QME_SPWU_HYP requested 0x%016llx\n",
 		      core, tmp);
-#endif
+
 	return true;
 }
 
diff --git a/hw/xive2.c b/hw/xive2.c
index b79635cc9..f559b0f9a 100644
--- a/hw/xive2.c
+++ b/hw/xive2.c
@@ -3022,34 +3022,10 @@  static void xive_configure_ex_special_bar(struct xive *x, struct cpu_thread *c)
 
 void xive2_late_init(void)
 {
-	struct cpu_thread *c;
-
 	prlog(PR_INFO, "SLW: Configuring self-restore for NCU_SPEC_BAR\n");
-	for_each_present_cpu(c) {
-		if(cpu_is_thread0(c)) {
-			struct proc_chip *chip = get_chip(c->chip_id);
-			struct xive *x = chip->xive;
-			uint64_t xa, val, rc;
-			xa = XSCOM_ADDR_P10_NCU(pir_to_core_id(c->pir), P10_NCU_SPEC_BAR);
-			val = (uint64_t)x->tm_base | P10_NCU_SPEC_BAR_ENABLE;
-			/* Bail out if wakeup engine has already failed */
-			if ( wakeup_engine_state != WAKEUP_ENGINE_PRESENT) {
-				prlog(PR_ERR, "XIVE p9_stop_api fail detected\n");
-				break;
-			}
 			/*
-			 * TODO (p10): need P10 stop state engine
+			 * TODO (p10): need P10 stop state engine and fix for STOP11
 			 */
-			rc = p9_stop_save_scom((void *)chip->homer_base, xa, val,
-				P9_STOP_SCOM_REPLACE, P9_STOP_SECTION_EQ_SCOM);
-			if (rc) {
-				xive_cpu_err(c, "p9_stop_api failed for NCU_SPEC_BAR rc=%lld\n",
-				rc);
-				wakeup_engine_state = WAKEUP_ENGINE_FAILED;
-			}
-		}
-	}
-
 }
 static void xive_provision_cpu(struct xive_cpu_state *xs, struct cpu_thread *c)
 {