[v2] Add purging CPU L2 and L3 caches into NPU hreset.

Message ID 20181120234404.4311-1-rashmica.g@gmail.com
State Superseded
Headers show
Series
  • [v2] Add purging CPU L2 and L3 caches into NPU hreset.
Related show

Checks

Context Check Description
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot success Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/apply_patch success master/apply_patch Successfully applied

Commit Message

Rashmica Gupta Nov. 20, 2018, 11:44 p.m.
If a GPU is passed through to a guest and the guest unexpectedly terminates,
there can be cache lines in CPUs that belong to the GPU. So purge the caches
as part of the reset sequence. L1 is write through, so doesn't need to be purged.

The sequence to purge the L2 and L3 caches from the hw team:

"L2 purge:
 (1) initiate purge
 putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TYPE L2CAC_FLUSH -all
 putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER ON -all

 (2) check this is off in all caches to know purge completed
 getspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_REG_BUSY -all

 (3) putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER OFF -all

L3 purge:
 1) Start the purge:
 putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_TTYPE FULL_PURGE -all
 putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ ON -all

 2) Ensure that the purge has completed by checking the status bit:
 getspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ -all

 You should see it say OFF if it's done:
 p9n.ex k0:n0:s0:p00:c0
 EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ
 OFF"

Suggested-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
---

This is done synchronously for now as it doesn't seem to take *too* long
(purging the L2 and L3 caches after building the 4.16 linux kernel on a p9
with 16 cores took 1.57 ms, 1.49ms and 1.46ms).

v1->v2: Addressed comments from Alexey, Alistair, Nick and Oliver.


 hw/npu2.c           | 136 ++++++++++++++++++++++++++++++++++++++++++++
 include/npu2-regs.h |  11 ++++
 2 files changed, 147 insertions(+)

Comments

Alexey Kardashevskiy Nov. 30, 2018, 1:05 a.m. | #1
On 21/11/2018 10:44, Rashmica Gupta wrote:
> If a GPU is passed through to a guest and the guest unexpectedly terminates,
> there can be cache lines in CPUs that belong to the GPU. So purge the caches
> as part of the reset sequence. L1 is write through, so doesn't need to be purged.
> 
> The sequence to purge the L2 and L3 caches from the hw team:
> 
> "L2 purge:
>  (1) initiate purge
>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TYPE L2CAC_FLUSH -all
>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER ON -all
> 
>  (2) check this is off in all caches to know purge completed
>  getspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_REG_BUSY -all
> 
>  (3) putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER OFF -all
> 
> L3 purge:
>  1) Start the purge:
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_TTYPE FULL_PURGE -all
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ ON -all
> 
>  2) Ensure that the purge has completed by checking the status bit:
>  getspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ -all
> 
>  You should see it say OFF if it's done:
>  p9n.ex k0:n0:s0:p00:c0
>  EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ
>  OFF"
> 
> Suggested-by: Alistair Popple <alistair@popple.id.au>
> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
> ---
> 
> This is done synchronously for now as it doesn't seem to take *too* long
> (purging the L2 and L3 caches after building the 4.16 linux kernel on a p9
> with 16 cores took 1.57 ms, 1.49ms and 1.46ms).
> 
> v1->v2: Addressed comments from Alexey, Alistair, Nick and Oliver.
> 
> 
>  hw/npu2.c           | 136 ++++++++++++++++++++++++++++++++++++++++++++
>  include/npu2-regs.h |  11 ++++
>  2 files changed, 147 insertions(+)
> 
> diff --git a/hw/npu2.c b/hw/npu2.c
> index 30049f5b..a10d4439 100644
> --- a/hw/npu2.c
> +++ b/hw/npu2.c
> @@ -1109,6 +1109,141 @@ static int64_t npu2_get_power_state(struct pci_slot *slot __unused, uint8_t *val
>  	return OPAL_SUCCESS;
>  }
>  
> +static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> +	rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
> +			      L2_PRD_PURGE_CMD_TYPE_MASK);
> +	if (rc)
> +		goto out;
> +	rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
> +			      L2_PRD_PURGE_CMD_TRIGGER);
> +	if (rc)
> +out:		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
> +		      "failed %i\n", core_id, rc);


Nit: I find such use of goto with a label inside another scope rather
confusing. Goto is usually used to avoid code duplication on a cleanup
path and this is not the case.

I'd rather ditch "goto" and do "if (!rc) xscom_write_mask" for the
second xscom_write_mask()   or   move "out:" 1 line higher. Up to
Stewart really :)


> +	return rc;
> +

Unnecessary empty line.

> +}
> +
> +static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc;
> +	unsigned long now = mftb();
> +	unsigned long end = now + msecs_to_tb(2);
> +	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> +	while (val & L2_PRD_PURGE_CMD_REG_BUSY) {

for (val = L2_PRD_PURGE_CMD_REG_BUSY;;)

and....

> +		rc = xscom_read(chip_id, addr, &val);
> +		if (rc) {
> +			prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
> +			      "failed %i\n", core_id, rc);
> +			break;
> +		}


... break out here:
	if (val & L2_PRD_PURGE_CMD_REG_BUSY)
		break;

As if you read val with the bit set, then checking timeout is rather
pointless, no?


> +		now = mftb();
> +		if (tb_compare(now, end) == TB_AAFTERB) {
> +			prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
> +			      core_id, rc);
> +			return OPAL_BUSY;
> +		}
> +	}
> +
> +	/* We have to clear the trigger bit ourselves */
> +	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
> +	rc = xscom_write(chip_id, addr, val);
> +	if (rc)
> +		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
> +		      core_id, rc);
> +	return rc;
> +

Unnecessary empty line.


> +}
> +
> +static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc = 0;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> +	rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
> +			      L3_PRD_PURGE_TTYPE_MASK);
> +	if (rc)
> +		goto out;
> +	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
> +			      L3_PRD_PURGE_REQ);
> +	if (rc)
> +out:		prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
> +		      "failed %i\n", core_id, rc);
> +	return rc;
> +
> +}
> +
> +static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc;
> +	unsigned long now = mftb();
> +	unsigned long end = now + msecs_to_tb(2);
> +	uint64_t val = L3_PRD_PURGE_REQ;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> +	/* Trigger bit is automatically set to zero when flushing is done*/
> +	while (val & L3_PRD_PURGE_REQ) {
> +		rc = xscom_read(chip_id, addr, &val);
> +		if (rc) {
> +			prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
> +			      "failed %i\n", core_id, rc);
> +			break;
> +		}
> +		now = mftb();
> +		if (tb_compare(now, end) == TB_AAFTERB) {
> +			prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
> +			      core_id, rc);
> +			return OPAL_BUSY;
> +		}
> +	}
> +	return rc;
> +}
> +
> +static int64_t purge_l2_l3_caches(void)
> +{
> +	int rc = 0;
> +	struct cpu_thread *t;
> +	uint64_t core_id, prev_core_id = 0xdeadbeef;

I'd make it (uint64_t)-1 :) deafbeaf is more for poisoning memory.


> +
> +	if (proc_gen != proc_gen_p9)
> +		return OPAL_UNSUPPORTED;


I assume this is a leftover from when it was an OPAL call. There is no
even small chance for this to be called for anything but p9, right?


> +
> +	for_each_ungarded_cpu(t) {
> +		/* Only need to do it once per core chiplet */
> +		core_id = pir_to_core_id(t->pir);
> +		if (prev_core_id == core_id)
> +			continue;
> +		prev_core_id = core_id;
> +		if (start_l2_purge(t->chip_id, core_id))
> +			goto out;
> +		if (start_l3_purge(t->chip_id, core_id))
> +			goto out;
> +	}
> +
> +	for_each_ungarded_cpu(t) {
> +		/* Only need to do it once per core chiplet */
> +		core_id = pir_to_core_id(t->pir);
> +		if (prev_core_id == core_id)
> +			continue;

Looks like if there is a single core at all (quite unlikely but still a
possible situation), we won't wait below at all.

> +		prev_core_id = core_id;
> +
> +		if (wait_l2_purge(t->chip_id, core_id))
> +			goto out;
> +		if (wait_l3_purge(t->chip_id, core_id))
> +			goto out;
> +	}
> +	return rc;


return OPAL_SUCCESS;


> +out:
> +	prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);
> +	return rc;

return OPAL_BUSY_EVENT. Now it always returns 0.


> +
> +}
> +
>  static int64_t npu2_hreset(struct pci_slot *slot __unused)
>  {
>  	struct npu2 *p;
> @@ -1125,6 +1260,7 @@ static int64_t npu2_hreset(struct pci_slot *slot __unused)
>  			reset_ntl(ndev);
>  		}
>  	}
> +	purge_l2_l3_caches();


This takes care of the guest termination case (which is good).

Now in order to cover the period between the guest started rebooting and
the guest started onlining the memory, we also need purge_l2_l3_caches()
near npu2_dev_procedure_reset() in npu2_dev_cfg_exp_devcap() as this is
the reset method used for NPU when the guest reboots.



>  	return OPAL_SUCCESS;


return purge_l2_l3_caches(), otherwise the timeout error is lost.




>  }
>  
> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> index 165e0b79..20a61e28 100644
> --- a/include/npu2-regs.h
> +++ b/include/npu2-regs.h
> @@ -749,4 +749,15 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>  #define OB3_ODL1_TRAINING_STATUS		0xC01082F
>  #define   OB_ODL_TRAINING_STATUS_STS_RX_PATTERN_B PPC_BITMASK(8, 15)
>  
> +/* Registers and bits used to clear the L2 and L3 cache */
> +#define L2_PRD_PURGE_CMD_REG 			0x1080E
> +#define L2_PRD_PURGE_CMD_REG_BUSY 		0x0040000000000000
> +#define L2_PRD_PURGE_CMD_TYPE_MASK		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
> +#define L2_PRD_PURGE_CMD_TRIGGER		PPC_BIT(0)
> +#define L2CAC_FLUSH				0x0
> +#define L3_PRD_PURGE_REG			0x1180E
> +#define L3_PRD_PURGE_REQ			PPC_BIT(0)
> +#define L3_PRD_PURGE_TTYPE_MASK 		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
> +#define L3_FULL_PURGE				0x0
> +
>  #endif /* __NPU2_REGS_H */
>
Andrew Donnellan Nov. 30, 2018, 6:16 a.m. | #2
On 30/11/18 12:05 pm, Alexey Kardashevskiy wrote:>> +static int 
start_l2_purge(uint32_t chip_id, uint32_t core_id)
>> +{
>> +	int rc;
>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
>> +
>> +	rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
>> +			      L2_PRD_PURGE_CMD_TYPE_MASK);
>> +	if (rc)
>> +		goto out;
>> +	rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
>> +			      L2_PRD_PURGE_CMD_TRIGGER);
>> +	if (rc)
>> +out:		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
>> +		      "failed %i\n", core_id, rc);
> 
> 
> Nit: I find such use of goto with a label inside another scope rather
> confusing. Goto is usually used to avoid code duplication on a cleanup
> path and this is not the case.
> 
> I'd rather ditch "goto" and do "if (!rc) xscom_write_mask" for the
> second xscom_write_mask()   or   move "out:" 1 line higher. Up to
> Stewart really :)

Just handle the second one exactly like the first one.

+	if (rc)
+		goto out;
+	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
+			      L3_PRD_PURGE_REQ);
+	if (rc)
+		goto out;
+
+	return rc;
+
+out:
+	prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
+	      "failed %i\n", core_id, rc);
+	return rc;

> 
> 
>> +	return rc;
>> +
> 
> Unnecessary empty line.
> 
>> +}
>> +
>> +static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
>> +{
>> +	int rc;
>> +	unsigned long now = mftb();
>> +	unsigned long end = now + msecs_to_tb(2);
>> +	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
>> +
>> +	while (val & L2_PRD_PURGE_CMD_REG_BUSY) {
> 
> for (val = L2_PRD_PURGE_CMD_REG_BUSY;;)

If the concern is an unnecessary initialiser then use do/while? Better 
than using a for.

> 
> and....
> 
>> +		rc = xscom_read(chip_id, addr, &val);
>> +		if (rc) {
>> +			prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
>> +			      "failed %i\n", core_id, rc);
>> +			break;
>> +		}
> 
> 
> ... break out here:
> 	if (val & L2_PRD_PURGE_CMD_REG_BUSY)
> 		break;
> 
> As if you read val with the bit set, then checking timeout is rather
> pointless, no? >
> 
>> +		now = mftb();
>> +		if (tb_compare(now, end) == TB_AAFTERB) {
>> +			prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
>> +			      core_id, rc);
>> +			return OPAL_BUSY;
>> +		}
>> +	}
>> +
>> +	/* We have to clear the trigger bit ourselves */
>> +	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
>> +	rc = xscom_write(chip_id, addr, val);
>> +	if (rc)
>> +		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
>> +		      core_id, rc);
>> +	return rc;
>> +
> 
> Unnecessary empty line.
> 
> 
>> +}
>> +
>> +static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
>> +{
>> +	int rc = 0;
>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
>> +
>> +	rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
>> +			      L3_PRD_PURGE_TTYPE_MASK);
>> +	if (rc)
>> +		goto out;
>> +	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
>> +			      L3_PRD_PURGE_REQ);
>> +	if (rc)
>> +out:		prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
>> +		      "failed %i\n", core_id, rc);
>> +	return rc;
>> +
>> +}
>> +
>> +static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
>> +{
>> +	int rc;
>> +	unsigned long now = mftb();
>> +	unsigned long end = now + msecs_to_tb(2);
>> +	uint64_t val = L3_PRD_PURGE_REQ;
>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
>> +
>> +	/* Trigger bit is automatically set to zero when flushing is done*/
>> +	while (val & L3_PRD_PURGE_REQ) {
>> +		rc = xscom_read(chip_id, addr, &val);
>> +		if (rc) {
>> +			prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
>> +			      "failed %i\n", core_id, rc);
>> +			break;
>> +		}
>> +		now = mftb();
>> +		if (tb_compare(now, end) == TB_AAFTERB) {
>> +			prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
>> +			      core_id, rc);
>> +			return OPAL_BUSY;
>> +		}
>> +	}
>> +	return rc;
>> +}
>> +
>> +static int64_t purge_l2_l3_caches(void)
>> +{
>> +	int rc = 0;
>> +	struct cpu_thread *t;
>> +	uint64_t core_id, prev_core_id = 0xdeadbeef;
> 
> I'd make it (uint64_t)-1 :) deafbeaf is more for poisoning memory.
> 
> 
>> +
>> +	if (proc_gen != proc_gen_p9)
>> +		return OPAL_UNSUPPORTED;
> 
> 
> I assume this is a leftover from when it was an OPAL call. There is no
> even small chance for this to be called for anything but p9, right?
> 
> 
>> +
>> +	for_each_ungarded_cpu(t) {
>> +		/* Only need to do it once per core chiplet */
>> +		core_id = pir_to_core_id(t->pir);
>> +		if (prev_core_id == core_id)
>> +			continue;
>> +		prev_core_id = core_id;
>> +		if (start_l2_purge(t->chip_id, core_id))
>> +			goto out;
>> +		if (start_l3_purge(t->chip_id, core_id))
>> +			goto out;
>> +	}
>> +
>> +	for_each_ungarded_cpu(t) {
>> +		/* Only need to do it once per core chiplet */
>> +		core_id = pir_to_core_id(t->pir);
>> +		if (prev_core_id == core_id)
>> +			continue;
> 
> Looks like if there is a single core at all (quite unlikely but still a
> possible situation), we won't wait below at all.
> 
>> +		prev_core_id = core_id;
>> +
>> +		if (wait_l2_purge(t->chip_id, core_id))
>> +			goto out;
>> +		if (wait_l3_purge(t->chip_id, core_id))
>> +			goto out;
>> +	}
>> +	return rc;
> 
> 
> return OPAL_SUCCESS;
> 
> 
>> +out:
>> +	prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);
>> +	return rc;
> 
> return OPAL_BUSY_EVENT. Now it always returns 0.
> 
> 
>> +
>> +}
>> +
>>   static int64_t npu2_hreset(struct pci_slot *slot __unused)
>>   {
>>   	struct npu2 *p;
>> @@ -1125,6 +1260,7 @@ static int64_t npu2_hreset(struct pci_slot *slot __unused)
>>   			reset_ntl(ndev);
>>   		}
>>   	}
>> +	purge_l2_l3_caches();
> 
> 
> This takes care of the guest termination case (which is good).
> 
> Now in order to cover the period between the guest started rebooting and
> the guest started onlining the memory, we also need purge_l2_l3_caches()
> near npu2_dev_procedure_reset() in npu2_dev_cfg_exp_devcap() as this is
> the reset method used for NPU when the guest reboots.
> 
> 
> 
>>   	return OPAL_SUCCESS;
> 
> 
> return purge_l2_l3_caches(), otherwise the timeout error is lost.
> 
> 
> 
> 
>>   }
>>   
>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>> index 165e0b79..20a61e28 100644
>> --- a/include/npu2-regs.h
>> +++ b/include/npu2-regs.h
>> @@ -749,4 +749,15 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>>   #define OB3_ODL1_TRAINING_STATUS		0xC01082F
>>   #define   OB_ODL_TRAINING_STATUS_STS_RX_PATTERN_B PPC_BITMASK(8, 15)
>>   
>> +/* Registers and bits used to clear the L2 and L3 cache */
>> +#define L2_PRD_PURGE_CMD_REG 			0x1080E
>> +#define L2_PRD_PURGE_CMD_REG_BUSY 		0x0040000000000000
>> +#define L2_PRD_PURGE_CMD_TYPE_MASK		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
>> +#define L2_PRD_PURGE_CMD_TRIGGER		PPC_BIT(0)
>> +#define L2CAC_FLUSH				0x0
>> +#define L3_PRD_PURGE_REG			0x1180E
>> +#define L3_PRD_PURGE_REQ			PPC_BIT(0)
>> +#define L3_PRD_PURGE_TTYPE_MASK 		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
>> +#define L3_FULL_PURGE				0x0
>> +
>>   #endif /* __NPU2_REGS_H */
>>
>
Rashmica Gupta Dec. 2, 2018, 10:50 p.m. | #3
On Fri, 2018-11-30 at 12:05 +1100, Alexey Kardashevskiy wrote:
> > 
> On 21/11/2018 10:44, Rashmica Gupta wrote:

...

> > +static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
> > +{
> > +	int rc;
> > +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id,
> > L2_PRD_PURGE_CMD_REG);
> > +
> > +	rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
> > +			      L2_PRD_PURGE_CMD_TYPE_MASK);
> > +	if (rc)
> > +		goto out;
> > +	rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
> > +			      L2_PRD_PURGE_CMD_TRIGGER);
> > +	if (rc)
> > +out:		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM
> > write_mask "
> > +		      "failed %i\n", core_id, rc);
> 
> 
> Nit: I find such use of goto with a label inside another scope rather
> confusing. Goto is usually used to avoid code duplication on a
> cleanup
> path and this is not the case.
> 
> I'd rather ditch "goto" and do "if (!rc) xscom_write_mask" for the
> second xscom_write_mask()   or   move "out:" 1 line higher. Up to
> Stewart really :)
> 
> 
> > +	return rc;
> > +
> 
> Unnecessary empty line.
> 
> > +}
> > +
> > +static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
> > +{
> > +	int rc;
> > +	unsigned long now = mftb();
> > +	unsigned long end = now + msecs_to_tb(2);
> > +	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
> > +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id,
> > L2_PRD_PURGE_CMD_REG);
> > +
> > +	while (val & L2_PRD_PURGE_CMD_REG_BUSY) {
> 
> for (val = L2_PRD_PURGE_CMD_REG_BUSY;;)
> 
> and....
> 
> > +		rc = xscom_read(chip_id, addr, &val);
> > +		if (rc) {
> > +			prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM
> > read "
> > +			      "failed %i\n", core_id, rc);
> > +			break;
> > +		}
> 
> 
> ... break out here:
> 	if (val & L2_PRD_PURGE_CMD_REG_BUSY)
> 		break;
> 
> As if you read val with the bit set, then checking timeout is rather
> pointless, no?
> 

I think you mean:
if (!(val & L2_PRD_PURGE_CMD_REG_BUSY))
 		break;
??
Checking timeout is pointless if the purge is completed, but if it
hasn't completed then we do want to check the timeout. 

> 
> > +		now = mftb();
> > +		if (tb_compare(now, end) == TB_AAFTERB) {
> > +			prlog(PR_ERR, "PURGE L2 on core 0x%x timed out
> > %i\n",
> > +			      core_id, rc);
> > +			return OPAL_BUSY;
> > +		}
> > +	}
> > +
> > +	/* We have to clear the trigger bit ourselves */
> > +	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
> > +	rc = xscom_write(chip_id, addr, val);
> > +	if (rc)
> > +		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write
> > failed %i\n",
> > +		      core_id, rc);
> > +	return rc;
> > +
> 
> Unnecessary empty line.
> 
> 
> > +}
> > +
> > +static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
> > +{
> > +	int rc = 0;
> > +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> > +
> > +	rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
> > +			      L3_PRD_PURGE_TTYPE_MASK);
> > +	if (rc)
> > +		goto out;
> > +	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
> > +			      L3_PRD_PURGE_REQ);
> > +	if (rc)
> > +out:		prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM
> > write_mask "
> > +		      "failed %i\n", core_id, rc);
> > +	return rc;
> > +
> > +}
> > +
> > +static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
> > +{
> > +	int rc;
> > +	unsigned long now = mftb();
> > +	unsigned long end = now + msecs_to_tb(2);
> > +	uint64_t val = L3_PRD_PURGE_REQ;
> > +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> > +
> > +	/* Trigger bit is automatically set to zero when flushing is
> > done*/
> > +	while (val & L3_PRD_PURGE_REQ) {
> > +		rc = xscom_read(chip_id, addr, &val);
> > +		if (rc) {
> > +			prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM
> > read "
> > +			      "failed %i\n", core_id, rc);
> > +			break;
> > +		}
> > +		now = mftb();
> > +		if (tb_compare(now, end) == TB_AAFTERB) {
> > +			prlog(PR_ERR, "PURGE L3 on core 0x%x timed out
> > %i\n",
> > +			      core_id, rc);
> > +			return OPAL_BUSY;
> > +		}
> > +	}
> > +	return rc;
> > +}
> > +
> > +static int64_t purge_l2_l3_caches(void)
> > +{
> > +	int rc = 0;
> > +	struct cpu_thread *t;
> > +	uint64_t core_id, prev_core_id = 0xdeadbeef;
> 
> I'd make it (uint64_t)-1 :) deafbeaf is more for poisoning memory.
> 

Ok!

> 
> > +
> > +	if (proc_gen != proc_gen_p9)
> > +		return OPAL_UNSUPPORTED;
> 
> 
> I assume this is a leftover from when it was an OPAL call. There is
> no
> even small chance for this to be called for anything but p9, right?
> 

Good point :)

> 
> > +
> > +	for_each_ungarded_cpu(t) {
> > +		/* Only need to do it once per core chiplet */
> > +		core_id = pir_to_core_id(t->pir);
> > +		if (prev_core_id == core_id)
> > +			continue;
> > +		prev_core_id = core_id;
> > +		if (start_l2_purge(t->chip_id, core_id))
> > +			goto out;
> > +		if (start_l3_purge(t->chip_id, core_id))
> > +			goto out;
> > +	}
> > +
> > +	for_each_ungarded_cpu(t) {
> > +		/* Only need to do it once per core chiplet */
> > +		core_id = pir_to_core_id(t->pir);
> > +		if (prev_core_id == core_id)
> > +			continue;
> 
> Looks like if there is a single core at all (quite unlikely but still
> a
> possible situation), we won't wait below at all.

Sorry I don't understand why you think that. Can you please elaborate?

> 
> > +		prev_core_id = core_id;
> > +
> > +		if (wait_l2_purge(t->chip_id, core_id))
> > +			goto out;
> > +		if (wait_l3_purge(t->chip_id, core_id))
> > +			goto out;
> > +	}
> > +	return rc;
> 
> 
> return OPAL_SUCCESS;
> 
> 
> > +out:
> > +	prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);
> > +	return rc;
> 
> return OPAL_BUSY_EVENT. Now it always returns 0.
> 

woops, good spot :)

> 
> > +
> > +}
> > +
> >  static int64_t npu2_hreset(struct pci_slot *slot __unused)
> >  {
> >  	struct npu2 *p;
> > @@ -1125,6 +1260,7 @@ static int64_t npu2_hreset(struct pci_slot
> > *slot __unused)
> >  			reset_ntl(ndev);
> >  		}
> >  	}
> > +	purge_l2_l3_caches();
> 
> 
> This takes care of the guest termination case (which is good).
> 
> Now in order to cover the period between the guest started rebooting
> and
> the guest started onlining the memory, we also need
> purge_l2_l3_caches()
> near npu2_dev_procedure_reset() in npu2_dev_cfg_exp_devcap() as this
> is
> the reset method used for NPU when the guest reboots.
> 
> 

OK. Does it matter if purge_l2_l3_caches() is called before or after
npu2_dev_procedure_reset()? 

> 
> >  	return OPAL_SUCCESS;
> 
> 
> return purge_l2_l3_caches(), otherwise the timeout error is lost.
> 
> 

ack.

> 
> 
> >  }
> >  
> > diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> > index 165e0b79..20a61e28 100644
> > --- a/include/npu2-regs.h
> > +++ b/include/npu2-regs.h
> > @@ -749,4 +749,15 @@ void npu2_scom_write(uint64_t gcid, uint64_t
> > scom_base,
> >  #define OB3_ODL1_TRAINING_STATUS		0xC01082F
> >  #define   OB_ODL_TRAINING_STATUS_STS_RX_PATTERN_B PPC_BITMASK(8,
> > 15)
> >  
> > +/* Registers and bits used to clear the L2 and L3 cache */
> > +#define L2_PRD_PURGE_CMD_REG 			0x1080E
> > +#define L2_PRD_PURGE_CMD_REG_BUSY 		0x0040000000000000
> > +#define L2_PRD_PURGE_CMD_TYPE_MASK		PPC_BIT(1) | PPC_BIT(2)
> > | PPC_BIT(3) | PPC_BIT(4)
> > +#define L2_PRD_PURGE_CMD_TRIGGER		PPC_BIT(0)
> > +#define L2CAC_FLUSH				0x0
> > +#define L3_PRD_PURGE_REG			0x1180E
> > +#define L3_PRD_PURGE_REQ			PPC_BIT(0)
> > +#define L3_PRD_PURGE_TTYPE_MASK 		PPC_BIT(1) | PPC_BIT(2)
> > | PPC_BIT(3) | PPC_BIT(4)
> > +#define L3_FULL_PURGE				0x0
> > +
> >  #endif /* __NPU2_REGS_H */
> > 
> 
>
Alexey Kardashevskiy Dec. 3, 2018, 3:01 a.m. | #4
On 30/11/2018 17:16, Andrew Donnellan wrote:
> On 30/11/18 12:05 pm, Alexey Kardashevskiy wrote:>> +static int
> start_l2_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +    int rc;
>>> +    uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
>>> +
>>> +    rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
>>> +                  L2_PRD_PURGE_CMD_TYPE_MASK);
>>> +    if (rc)
>>> +        goto out;
>>> +    rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
>>> +                  L2_PRD_PURGE_CMD_TRIGGER);
>>> +    if (rc)
>>> +out:        prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
>>> +              "failed %i\n", core_id, rc);
>>
>>
>> Nit: I find such use of goto with a label inside another scope rather
>> confusing. Goto is usually used to avoid code duplication on a cleanup
>> path and this is not the case.
>>
>> I'd rather ditch "goto" and do "if (!rc) xscom_write_mask" for the
>> second xscom_write_mask()   or   move "out:" 1 line higher. Up to
>> Stewart really :)
> 
> Just handle the second one exactly like the first one.
> 
> +    if (rc)
> +        goto out;
> +    rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
> +                  L3_PRD_PURGE_REQ);
> +    if (rc)
> +        goto out;
> +
> +    return rc;
> +
> +out:
> +    prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
> +          "failed %i\n", core_id, rc);
> +    return rc;


I still like goto-less option better.


> 
>>
>>
>>> +    return rc;
>>> +
>>
>> Unnecessary empty line.
>>
>>> +}
>>> +
>>> +static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +    int rc;
>>> +    unsigned long now = mftb();
>>> +    unsigned long end = now + msecs_to_tb(2);
>>> +    uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
>>> +    uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
>>> +
>>> +    while (val & L2_PRD_PURGE_CMD_REG_BUSY) {
>>
>> for (val = L2_PRD_PURGE_CMD_REG_BUSY;;)
> 
> If the concern is an unnecessary initialiser then use do/while? Better
> than using a for.


imho it is not. We exit the loop because of that flag so making
initialization a part of the loop is a good thing. It is a very minor
thing though, I am fine either way.


> 
>>
>> and....
>>
>>> +        rc = xscom_read(chip_id, addr, &val);
>>> +        if (rc) {
>>> +            prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
>>> +                  "failed %i\n", core_id, rc);
>>> +            break;
>>> +        }
>>
>>
>> ... break out here:
>>     if (val & L2_PRD_PURGE_CMD_REG_BUSY)
>>         break;
>>
>> As if you read val with the bit set, then checking timeout is rather
>> pointless, no? >
>>
>>> +        now = mftb();
>>> +        if (tb_compare(now, end) == TB_AAFTERB) {
>>> +            prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
>>> +                  core_id, rc);
>>> +            return OPAL_BUSY;
>>> +        }
>>> +    }
>>> +
>>> +    /* We have to clear the trigger bit ourselves */
>>> +    val &= ~L2_PRD_PURGE_CMD_TRIGGER;
>>> +    rc = xscom_write(chip_id, addr, val);
>>> +    if (rc)
>>> +        prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
>>> +              core_id, rc);
>>> +    return rc;
>>> +
>>
>> Unnecessary empty line.
>>
>>
>>> +}
>>> +
>>> +static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +    int rc = 0;
>>> +    uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
>>> +
>>> +    rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
>>> +                  L3_PRD_PURGE_TTYPE_MASK);
>>> +    if (rc)
>>> +        goto out;
>>> +    rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
>>> +                  L3_PRD_PURGE_REQ);
>>> +    if (rc)
>>> +out:        prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
>>> +              "failed %i\n", core_id, rc);
>>> +    return rc;
>>> +
>>> +}
>>> +
>>> +static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +    int rc;
>>> +    unsigned long now = mftb();
>>> +    unsigned long end = now + msecs_to_tb(2);
>>> +    uint64_t val = L3_PRD_PURGE_REQ;
>>> +    uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
>>> +
>>> +    /* Trigger bit is automatically set to zero when flushing is done*/
>>> +    while (val & L3_PRD_PURGE_REQ) {
>>> +        rc = xscom_read(chip_id, addr, &val);
>>> +        if (rc) {
>>> +            prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
>>> +                  "failed %i\n", core_id, rc);
>>> +            break;
>>> +        }
>>> +        now = mftb();
>>> +        if (tb_compare(now, end) == TB_AAFTERB) {
>>> +            prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
>>> +                  core_id, rc);
>>> +            return OPAL_BUSY;
>>> +        }
>>> +    }
>>> +    return rc;
>>> +}
>>> +
>>> +static int64_t purge_l2_l3_caches(void)
>>> +{
>>> +    int rc = 0;
>>> +    struct cpu_thread *t;
>>> +    uint64_t core_id, prev_core_id = 0xdeadbeef;
>>
>> I'd make it (uint64_t)-1 :) deafbeaf is more for poisoning memory.
>>
>>
>>> +
>>> +    if (proc_gen != proc_gen_p9)
>>> +        return OPAL_UNSUPPORTED;
>>
>>
>> I assume this is a leftover from when it was an OPAL call. There is no
>> even small chance for this to be called for anything but p9, right?
>>
>>
>>> +
>>> +    for_each_ungarded_cpu(t) {
>>> +        /* Only need to do it once per core chiplet */
>>> +        core_id = pir_to_core_id(t->pir);
>>> +        if (prev_core_id == core_id)
>>> +            continue;
>>> +        prev_core_id = core_id;
>>> +        if (start_l2_purge(t->chip_id, core_id))
>>> +            goto out;
>>> +        if (start_l3_purge(t->chip_id, core_id))
>>> +            goto out;
>>> +    }
>>> +
>>> +    for_each_ungarded_cpu(t) {
>>> +        /* Only need to do it once per core chiplet */
>>> +        core_id = pir_to_core_id(t->pir);
>>> +        if (prev_core_id == core_id)
>>> +            continue;
>>
>> Looks like if there is a single core at all (quite unlikely but still a
>> possible situation), we won't wait below at all.
>>
>>> +        prev_core_id = core_id;
>>> +
>>> +        if (wait_l2_purge(t->chip_id, core_id))
>>> +            goto out;
>>> +        if (wait_l3_purge(t->chip_id, core_id))
>>> +            goto out;
>>> +    }
>>> +    return rc;
>>
>>
>> return OPAL_SUCCESS;
>>
>>
>>> +out:
>>> +    prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);
>>> +    return rc;
>>
>> return OPAL_BUSY_EVENT. Now it always returns 0.
>>
>>
>>> +
>>> +}
>>> +
>>>   static int64_t npu2_hreset(struct pci_slot *slot __unused)
>>>   {
>>>       struct npu2 *p;
>>> @@ -1125,6 +1260,7 @@ static int64_t npu2_hreset(struct pci_slot
>>> *slot __unused)
>>>               reset_ntl(ndev);
>>>           }
>>>       }
>>> +    purge_l2_l3_caches();
>>
>>
>> This takes care of the guest termination case (which is good).
>>
>> Now in order to cover the period between the guest started rebooting and
>> the guest started onlining the memory, we also need purge_l2_l3_caches()
>> near npu2_dev_procedure_reset() in npu2_dev_cfg_exp_devcap() as this is
>> the reset method used for NPU when the guest reboots.
>>
>>
>>
>>>       return OPAL_SUCCESS;
>>
>>
>> return purge_l2_l3_caches(), otherwise the timeout error is lost.
>>
>>
>>
>>
>>>   }
>>>   diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>>> index 165e0b79..20a61e28 100644
>>> --- a/include/npu2-regs.h
>>> +++ b/include/npu2-regs.h
>>> @@ -749,4 +749,15 @@ void npu2_scom_write(uint64_t gcid, uint64_t
>>> scom_base,
>>>   #define OB3_ODL1_TRAINING_STATUS        0xC01082F
>>>   #define   OB_ODL_TRAINING_STATUS_STS_RX_PATTERN_B PPC_BITMASK(8, 15)
>>>   +/* Registers and bits used to clear the L2 and L3 cache */
>>> +#define L2_PRD_PURGE_CMD_REG             0x1080E
>>> +#define L2_PRD_PURGE_CMD_REG_BUSY         0x0040000000000000
>>> +#define L2_PRD_PURGE_CMD_TYPE_MASK        PPC_BIT(1) | PPC_BIT(2) |
>>> PPC_BIT(3) | PPC_BIT(4)
>>> +#define L2_PRD_PURGE_CMD_TRIGGER        PPC_BIT(0)
>>> +#define L2CAC_FLUSH                0x0
>>> +#define L3_PRD_PURGE_REG            0x1180E
>>> +#define L3_PRD_PURGE_REQ            PPC_BIT(0)
>>> +#define L3_PRD_PURGE_TTYPE_MASK         PPC_BIT(1) | PPC_BIT(2) |
>>> PPC_BIT(3) | PPC_BIT(4)
>>> +#define L3_FULL_PURGE                0x0
>>> +
>>>   #endif /* __NPU2_REGS_H */
>>>
>>
>
Alexey Kardashevskiy Dec. 3, 2018, 3:24 a.m. | #5
On 03/12/2018 09:50, Rashmica Gupta wrote:
> On Fri, 2018-11-30 at 12:05 +1100, Alexey Kardashevskiy wrote:
>>>
>> On 21/11/2018 10:44, Rashmica Gupta wrote:
> 
> ...
> 
>>> +static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +	int rc;
>>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id,
>>> L2_PRD_PURGE_CMD_REG);
>>> +
>>> +	rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
>>> +			      L2_PRD_PURGE_CMD_TYPE_MASK);
>>> +	if (rc)
>>> +		goto out;
>>> +	rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
>>> +			      L2_PRD_PURGE_CMD_TRIGGER);
>>> +	if (rc)
>>> +out:		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM
>>> write_mask "
>>> +		      "failed %i\n", core_id, rc);
>>
>>
>> Nit: I find such use of goto with a label inside another scope rather
>> confusing. Goto is usually used to avoid code duplication on a
>> cleanup
>> path and this is not the case.
>>
>> I'd rather ditch "goto" and do "if (!rc) xscom_write_mask" for the
>> second xscom_write_mask()   or   move "out:" 1 line higher. Up to
>> Stewart really :)
>>
>>
>>> +	return rc;
>>> +
>>
>> Unnecessary empty line.
>>
>>> +}
>>> +
>>> +static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +	int rc;
>>> +	unsigned long now = mftb();
>>> +	unsigned long end = now + msecs_to_tb(2);
>>> +	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
>>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id,
>>> L2_PRD_PURGE_CMD_REG);
>>> +
>>> +	while (val & L2_PRD_PURGE_CMD_REG_BUSY) {
>>
>> for (val = L2_PRD_PURGE_CMD_REG_BUSY;;)
>>
>> and....
>>
>>> +		rc = xscom_read(chip_id, addr, &val);
>>> +		if (rc) {
>>> +			prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM
>>> read "
>>> +			      "failed %i\n", core_id, rc);
>>> +			break;
>>> +		}
>>
>>
>> ... break out here:
>> 	if (val & L2_PRD_PURGE_CMD_REG_BUSY)
>> 		break;
>>
>> As if you read val with the bit set, then checking timeout is rather
>> pointless, no?
>>
> 
> I think you mean:
> if (!(val & L2_PRD_PURGE_CMD_REG_BUSY))
>  		break;
> ??

Yes, my bad :)


> Checking timeout is pointless if the purge is completed, but if it
> hasn't completed then we do want to check the timeout. 
> 
>>
>>> +		now = mftb();
>>> +		if (tb_compare(now, end) == TB_AAFTERB) {
>>> +			prlog(PR_ERR, "PURGE L2 on core 0x%x timed out
>>> %i\n",
>>> +			      core_id, rc);
>>> +			return OPAL_BUSY;
>>> +		}
>>> +	}
>>> +
>>> +	/* We have to clear the trigger bit ourselves */
>>> +	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
>>> +	rc = xscom_write(chip_id, addr, val);
>>> +	if (rc)
>>> +		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write
>>> failed %i\n",
>>> +		      core_id, rc);
>>> +	return rc;
>>> +
>>
>> Unnecessary empty line.
>>
>>
>>> +}
>>> +
>>> +static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +	int rc = 0;
>>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
>>> +
>>> +	rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
>>> +			      L3_PRD_PURGE_TTYPE_MASK);
>>> +	if (rc)
>>> +		goto out;
>>> +	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
>>> +			      L3_PRD_PURGE_REQ);
>>> +	if (rc)
>>> +out:		prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM
>>> write_mask "
>>> +		      "failed %i\n", core_id, rc);
>>> +	return rc;
>>> +
>>> +}
>>> +
>>> +static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +	int rc;
>>> +	unsigned long now = mftb();
>>> +	unsigned long end = now + msecs_to_tb(2);
>>> +	uint64_t val = L3_PRD_PURGE_REQ;
>>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
>>> +
>>> +	/* Trigger bit is automatically set to zero when flushing is
>>> done*/
>>> +	while (val & L3_PRD_PURGE_REQ) {
>>> +		rc = xscom_read(chip_id, addr, &val);
>>> +		if (rc) {
>>> +			prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM
>>> read "
>>> +			      "failed %i\n", core_id, rc);
>>> +			break;
>>> +		}
>>> +		now = mftb();
>>> +		if (tb_compare(now, end) == TB_AAFTERB) {
>>> +			prlog(PR_ERR, "PURGE L3 on core 0x%x timed out
>>> %i\n",
>>> +			      core_id, rc);
>>> +			return OPAL_BUSY;
>>> +		}
>>> +	}
>>> +	return rc;
>>> +}
>>> +
>>> +static int64_t purge_l2_l3_caches(void)
>>> +{
>>> +	int rc = 0;
>>> +	struct cpu_thread *t;
>>> +	uint64_t core_id, prev_core_id = 0xdeadbeef;
>>
>> I'd make it (uint64_t)-1 :) deafbeaf is more for poisoning memory.
>>
> 
> Ok!
> 
>>
>>> +
>>> +	if (proc_gen != proc_gen_p9)
>>> +		return OPAL_UNSUPPORTED;
>>
>>
>> I assume this is a leftover from when it was an OPAL call. There is
>> no
>> even small chance for this to be called for anything but p9, right?
>>
> 
> Good point :)
> 
>>
>>> +
>>> +	for_each_ungarded_cpu(t) {
>>> +		/* Only need to do it once per core chiplet */
>>> +		core_id = pir_to_core_id(t->pir);
>>> +		if (prev_core_id == core_id)
>>> +			continue;
>>> +		prev_core_id = core_id;
>>> +		if (start_l2_purge(t->chip_id, core_id))
>>> +			goto out;
>>> +		if (start_l3_purge(t->chip_id, core_id))
>>> +			goto out;
>>> +	}
>>> +
>>> +	for_each_ungarded_cpu(t) {
>>> +		/* Only need to do it once per core chiplet */
>>> +		core_id = pir_to_core_id(t->pir);
>>> +		if (prev_core_id == core_id)
>>> +			continue;
>>
>> Looks like if there is a single core at all (quite unlikely but still
>> a
>> possible situation), we won't wait below at all.
> 
> Sorry I don't understand why you think that. Can you please elaborate?


If there is a single core, then after the first loop prev_core_id will
be the id of that single core so in the second loop you won't get to the
waits because prev_core_id == core_id. You need (prev_core_id = -1)
before the second loop.


> 
>>
>>> +		prev_core_id = core_id;
>>> +
>>> +		if (wait_l2_purge(t->chip_id, core_id))
>>> +			goto out;
>>> +		if (wait_l3_purge(t->chip_id, core_id))
>>> +			goto out;
>>> +	}
>>> +	return rc;
>>
>>
>> return OPAL_SUCCESS;
>>
>>
>>> +out:
>>> +	prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);
>>> +	return rc;
>>
>> return OPAL_BUSY_EVENT. Now it always returns 0.
>>
> 
> woops, good spot :)
> 
>>
>>> +
>>> +}
>>> +
>>>  static int64_t npu2_hreset(struct pci_slot *slot __unused)
>>>  {
>>>  	struct npu2 *p;
>>> @@ -1125,6 +1260,7 @@ static int64_t npu2_hreset(struct pci_slot
>>> *slot __unused)
>>>  			reset_ntl(ndev);
>>>  		}
>>>  	}
>>> +	purge_l2_l3_caches();
>>
>>
>> This takes care of the guest termination case (which is good).
>>
>> Now in order to cover the period between the guest started rebooting
>> and
>> the guest started onlining the memory, we also need
>> purge_l2_l3_caches()
>> near npu2_dev_procedure_reset() in npu2_dev_cfg_exp_devcap() as this
>> is
>> the reset method used for NPU when the guest reboots.
>>
>>
> 
> OK. Does it matter if purge_l2_l3_caches() is called before or after
> npu2_dev_procedure_reset()?


npu2_dev_procedure_reset() and then purge_l2_l3_caches() should do it.


> 
>>
>>>  	return OPAL_SUCCESS;
>>
>>
>> return purge_l2_l3_caches(), otherwise the timeout error is lost.
>>
>>
> 
> ack.
> 
>>
>>
>>>  }
>>>  
>>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>>> index 165e0b79..20a61e28 100644
>>> --- a/include/npu2-regs.h
>>> +++ b/include/npu2-regs.h
>>> @@ -749,4 +749,15 @@ void npu2_scom_write(uint64_t gcid, uint64_t
>>> scom_base,
>>>  #define OB3_ODL1_TRAINING_STATUS		0xC01082F
>>>  #define   OB_ODL_TRAINING_STATUS_STS_RX_PATTERN_B PPC_BITMASK(8,
>>> 15)
>>>  
>>> +/* Registers and bits used to clear the L2 and L3 cache */
>>> +#define L2_PRD_PURGE_CMD_REG 			0x1080E
>>> +#define L2_PRD_PURGE_CMD_REG_BUSY 		0x0040000000000000
>>> +#define L2_PRD_PURGE_CMD_TYPE_MASK		PPC_BIT(1) | PPC_BIT(2)
>>> | PPC_BIT(3) | PPC_BIT(4)
>>> +#define L2_PRD_PURGE_CMD_TRIGGER		PPC_BIT(0)
>>> +#define L2CAC_FLUSH				0x0
>>> +#define L3_PRD_PURGE_REG			0x1180E
>>> +#define L3_PRD_PURGE_REQ			PPC_BIT(0)
>>> +#define L3_PRD_PURGE_TTYPE_MASK 		PPC_BIT(1) | PPC_BIT(2)
>>> | PPC_BIT(3) | PPC_BIT(4)
>>> +#define L3_FULL_PURGE				0x0
>>> +
>>>  #endif /* __NPU2_REGS_H */
>>>
>>
>>
>

Patch

diff --git a/hw/npu2.c b/hw/npu2.c
index 30049f5b..a10d4439 100644
--- a/hw/npu2.c
+++ b/hw/npu2.c
@@ -1109,6 +1109,141 @@  static int64_t npu2_get_power_state(struct pci_slot *slot __unused, uint8_t *val
 	return OPAL_SUCCESS;
 }
 
+static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
+{
+	int rc;
+	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
+
+	rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
+			      L2_PRD_PURGE_CMD_TYPE_MASK);
+	if (rc)
+		goto out;
+	rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
+			      L2_PRD_PURGE_CMD_TRIGGER);
+	if (rc)
+out:		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
+		      "failed %i\n", core_id, rc);
+	return rc;
+
+}
+
+static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
+{
+	int rc;
+	unsigned long now = mftb();
+	unsigned long end = now + msecs_to_tb(2);
+	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
+	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
+
+	while (val & L2_PRD_PURGE_CMD_REG_BUSY) {
+		rc = xscom_read(chip_id, addr, &val);
+		if (rc) {
+			prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
+			      "failed %i\n", core_id, rc);
+			break;
+		}
+		now = mftb();
+		if (tb_compare(now, end) == TB_AAFTERB) {
+			prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
+			      core_id, rc);
+			return OPAL_BUSY;
+		}
+	}
+
+	/* We have to clear the trigger bit ourselves */
+	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
+	rc = xscom_write(chip_id, addr, val);
+	if (rc)
+		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
+		      core_id, rc);
+	return rc;
+
+}
+
+static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
+{
+	int rc = 0;
+	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
+
+	rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
+			      L3_PRD_PURGE_TTYPE_MASK);
+	if (rc)
+		goto out;
+	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
+			      L3_PRD_PURGE_REQ);
+	if (rc)
+out:		prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
+		      "failed %i\n", core_id, rc);
+	return rc;
+
+}
+
+static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
+{
+	int rc;
+	unsigned long now = mftb();
+	unsigned long end = now + msecs_to_tb(2);
+	uint64_t val = L3_PRD_PURGE_REQ;
+	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
+
+	/* Trigger bit is automatically set to zero when flushing is done*/
+	while (val & L3_PRD_PURGE_REQ) {
+		rc = xscom_read(chip_id, addr, &val);
+		if (rc) {
+			prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
+			      "failed %i\n", core_id, rc);
+			break;
+		}
+		now = mftb();
+		if (tb_compare(now, end) == TB_AAFTERB) {
+			prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
+			      core_id, rc);
+			return OPAL_BUSY;
+		}
+	}
+	return rc;
+}
+
+static int64_t purge_l2_l3_caches(void)
+{
+	int rc = 0;
+	struct cpu_thread *t;
+	uint64_t core_id, prev_core_id = 0xdeadbeef;
+
+	if (proc_gen != proc_gen_p9)
+		return OPAL_UNSUPPORTED;
+
+	for_each_ungarded_cpu(t) {
+		/* Only need to do it once per core chiplet */
+		core_id = pir_to_core_id(t->pir);
+		if (prev_core_id == core_id)
+			continue;
+		prev_core_id = core_id;
+		if (start_l2_purge(t->chip_id, core_id))
+			goto out;
+		if (start_l3_purge(t->chip_id, core_id))
+			goto out;
+	}
+
+	for_each_ungarded_cpu(t) {
+		/* Only need to do it once per core chiplet */
+		core_id = pir_to_core_id(t->pir);
+		if (prev_core_id == core_id)
+			continue;
+		prev_core_id = core_id;
+
+		if (wait_l2_purge(t->chip_id, core_id))
+			goto out;
+		if (wait_l3_purge(t->chip_id, core_id))
+			goto out;
+	}
+	return rc;
+out:
+	prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);
+	return rc;
+
+}
+
 static int64_t npu2_hreset(struct pci_slot *slot __unused)
 {
 	struct npu2 *p;
@@ -1125,6 +1260,7 @@  static int64_t npu2_hreset(struct pci_slot *slot __unused)
 			reset_ntl(ndev);
 		}
 	}
+	purge_l2_l3_caches();
 	return OPAL_SUCCESS;
 }
 
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index 165e0b79..20a61e28 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -749,4 +749,15 @@  void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
 #define OB3_ODL1_TRAINING_STATUS		0xC01082F
 #define   OB_ODL_TRAINING_STATUS_STS_RX_PATTERN_B PPC_BITMASK(8, 15)
 
+/* Registers and bits used to clear the L2 and L3 cache */
+#define L2_PRD_PURGE_CMD_REG 			0x1080E
+#define L2_PRD_PURGE_CMD_REG_BUSY 		0x0040000000000000
+#define L2_PRD_PURGE_CMD_TYPE_MASK		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
+#define L2_PRD_PURGE_CMD_TRIGGER		PPC_BIT(0)
+#define L2CAC_FLUSH				0x0
+#define L3_PRD_PURGE_REG			0x1180E
+#define L3_PRD_PURGE_REQ			PPC_BIT(0)
+#define L3_PRD_PURGE_TTYPE_MASK 		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
+#define L3_FULL_PURGE				0x0
+
 #endif /* __NPU2_REGS_H */