[v3] Add purging CPU L2 and L3 caches into NPU hreset.

Message ID 20181203064932.27598-1-rashmica.g@gmail.com
State Superseded
Headers show
Series
  • [v3] Add purging CPU L2 and L3 caches into NPU hreset.
Related show

Checks

Context Check Description
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot success Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/apply_patch success master/apply_patch Successfully applied

Commit Message

Rashmica Gupta Dec. 3, 2018, 6:49 a.m.
If a GPU is passed through to a guest and the guest unexpectedly terminates,
there can be cache lines in CPUs that belong to the GPU. So purge the caches
as part of the reset sequence. L1 is write through, so doesn't need to be purged.

This also needs to be called if the guest reboots so call it in
npu2_dev_cfg_exp_devcap().

The sequence to purge the L2 and L3 caches from the hw team:

"L2 purge:
 (1) initiate purge
 putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TYPE L2CAC_FLUSH -all
 putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER ON -all

 (2) check this is off in all caches to know purge completed
 getspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_REG_BUSY -all

 (3) putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER OFF -all

L3 purge:
 1) Start the purge:
 putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_TTYPE FULL_PURGE -all
 putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ ON -all

 2) Ensure that the purge has completed by checking the status bit:
 getspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ -all

 You should see it say OFF if it's done:
 p9n.ex k0:n0:s0:p00:c0
 EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ
 OFF"

Suggested-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
---

This is done synchronously for now as it doesn't seem to take *too* long
(purging the L2 and L3 caches after building the 4.16 linux kernel on a p9
with 16 cores took 1.57 ms, 1.49 ms and 1.46 ms).


 hw/npu2.c           | 135 +++++++++++++++++++++++++++++++++++++++++++-
 include/npu2-regs.h |  11 ++++
 2 files changed, 145 insertions(+), 1 deletion(-)

Comments

Oliver Dec. 4, 2018, 12:56 a.m. | #1
On Mon, Dec 3, 2018 at 5:49 PM Rashmica Gupta <rashmica.g@gmail.com> wrote:
>
> If a GPU is passed through to a guest and the guest unexpectedly terminates,
> there can be cache lines in CPUs that belong to the GPU. So purge the caches
> as part of the reset sequence. L1 is write through, so doesn't need to be purged.
>
> This also needs to be called if the guest reboots so call it in
> npu2_dev_cfg_exp_devcap().
>
> The sequence to purge the L2 and L3 caches from the hw team:
>
> "L2 purge:
>  (1) initiate purge
>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TYPE L2CAC_FLUSH -all
>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER ON -all
>
>  (2) check this is off in all caches to know purge completed
>  getspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_REG_BUSY -all
>
>  (3) putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER OFF -all
>
> L3 purge:
>  1) Start the purge:
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_TTYPE FULL_PURGE -all
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ ON -all
>
>  2) Ensure that the purge has completed by checking the status bit:
>  getspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ -all
>
>  You should see it say OFF if it's done:
>  p9n.ex k0:n0:s0:p00:c0
>  EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ
>  OFF"
>
> Suggested-by: Alistair Popple <alistair@popple.id.au>
> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
> ---
>
> This is done synchronously for now as it doesn't seem to take *too* long
> (purging the L2 and L3 caches after building the 4.16 linux kernel on a p9
> with 16 cores took 1.57 ms, 1.49ms and 1.46ms).
>
>
>  hw/npu2.c           | 135 +++++++++++++++++++++++++++++++++++++++++++-
>  include/npu2-regs.h |  11 ++++
>  2 files changed, 145 insertions(+), 1 deletion(-)
>
> diff --git a/hw/npu2.c b/hw/npu2.c
> index 30049f5b..9c0e6114 100644
> --- a/hw/npu2.c
> +++ b/hw/npu2.c
> @@ -326,6 +326,136 @@ static int64_t npu2_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
>         return npu2_cfg_read_bar(ndev, pcrf, offset, len, data);
>  }
>
> +static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +       int rc;
> +       uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> +       rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
> +                             L2_PRD_PURGE_CMD_TYPE_MASK);
> +       if (!rc)
> +               rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
> +                             L2_PRD_PURGE_CMD_TRIGGER);
> +       if (rc)
> +               prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
> +                     "failed %i\n", core_id, rc);
> +       return rc;
> +}
> +
> +static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +       int rc;
> +       unsigned long now = mftb();
> +       unsigned long end = now + msecs_to_tb(2);
> +       uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
> +       uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> +       while (val & L2_PRD_PURGE_CMD_REG_BUSY) {
> +               rc = xscom_read(chip_id, addr, &val);
> +               if (rc) {
> +                       prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
> +                             "failed %i\n", core_id, rc);
> +                       break;
> +               }
> +               if (!(val & L2_PRD_PURGE_CMD_REG_BUSY))
> +                       break;
> +               now = mftb();
> +               if (tb_compare(now, end) == TB_AAFTERB) {
> +                       prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
> +                             core_id, rc);
> +                       return OPAL_BUSY;
> +               }
> +       }
> +
> +       /* We have to clear the trigger bit ourselves */
> +       val &= ~L2_PRD_PURGE_CMD_TRIGGER;
> +       rc = xscom_write(chip_id, addr, val);
> +       if (rc)
> +               prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
> +                     core_id, rc);
> +       return rc;
> +}
> +
> +static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +       int rc;
> +       uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> +       rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
> +                             L3_PRD_PURGE_TTYPE_MASK);
> +       if (!rc)
> +               rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
> +                             L3_PRD_PURGE_REQ);
> +       if (rc)
> +               prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
> +                     "failed %i\n", core_id, rc);
> +       return rc;
> +}
> +
> +static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +       int rc;
> +       unsigned long now = mftb();
> +       unsigned long end = now + msecs_to_tb(2);
> +       uint64_t val = L3_PRD_PURGE_REQ;
> +       uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> +       /* Trigger bit is automatically set to zero when flushing is done */
> +       while (val & L3_PRD_PURGE_REQ) {
> +               rc = xscom_read(chip_id, addr, &val);
> +               if (rc) {
> +                       prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
> +                             "failed %i\n", core_id, rc);
> +                       break;
> +               }
> +               if (!(val & L3_PRD_PURGE_REQ))
> +                       break;
> +               now = mftb();
> +               if (tb_compare(now, end) == TB_AAFTERB) {
> +                       prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
> +                             core_id, rc);
> +                       return OPAL_BUSY;
> +               }
> +       }
> +       return rc;
> +}
> +
> +static int64_t purge_l2_l3_caches(void)
> +{
> +       struct cpu_thread *t;
> +       uint64_t core_id, prev_core_id = (uint64_t)-1;
> +
> +       for_each_ungarded_cpu(t) {
> +               /* Only need to do it once per core chiplet */
> +               core_id = pir_to_core_id(t->pir);
> +               if (prev_core_id == core_id)
> +                       continue;
> +               prev_core_id = core_id;
> +               if (start_l2_purge(t->chip_id, core_id))
> +                       goto out;
> +               if (start_l3_purge(t->chip_id, core_id))
> +                       goto out;
> +       }
> +
> +       prev_core_id = (uint64_t)-1;
> +       for_each_ungarded_cpu(t) {
> +               /* Only need to do it once per core chiplet */
> +               core_id = pir_to_core_id(t->pir);
> +               if (prev_core_id == core_id)
> +                       continue;
> +               prev_core_id = core_id;
> +
> +               if (wait_l2_purge(t->chip_id, core_id))
> +                       goto out;
> +               if (wait_l3_purge(t->chip_id, core_id))
> +                       goto out;
> +       }
> +       return OPAL_SUCCESS;
> +out:

> +       prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);

You can probably delete this. It provides no useful information and
the called functions already print an error message.

> +       return OPAL_BUSY_EVENT;
> +}
> +
>  static int64_t npu2_dev_cfg_exp_devcap(void *dev,
>                 struct pci_cfg_reg_filter *pcrf __unused,
>                 uint32_t offset, uint32_t size,
> @@ -346,6 +476,9 @@ static int64_t npu2_dev_cfg_exp_devcap(void *dev,
>         if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
>                 npu2_dev_procedure_reset(ndev);

> +       if (purge_l2_l3_caches())
> +               return OPAL_BUSY_EVENT;
> +
>         return OPAL_PARTIAL;
>  }
>
> @@ -1125,7 +1258,7 @@ static int64_t npu2_hreset(struct pci_slot *slot __unused)
>                         reset_ntl(ndev);
>                 }
>         }
> -       return OPAL_SUCCESS;
> +       return purge_l2_l3_caches();

This is more of a question for Alexey, but why are we returning
OPAL_BUSY_EVENT here? It seems like a weird hack to determine when the
cache flush failed.
Alexey Kardashevskiy Dec. 4, 2018, 1:01 a.m. | #2
On 03/12/2018 17:49, Rashmica Gupta wrote:
> If a GPU is passed through to a guest and the guest unexpectedly terminates,
> there can be cache lines in CPUs that belong to the GPU. So purge the caches
> as part of the reset sequence. L1 is write through, so doesn't need to be purged.
> 
> This also needs to be called if the guest reboots so call it in
> npu2_dev_cfg_exp_devcap().
> 
> The sequence to purge the L2 and L3 caches from the hw team:
> 
> "L2 purge:
>  (1) initiate purge
>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TYPE L2CAC_FLUSH -all
>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER ON -all
> 
>  (2) check this is off in all caches to know purge completed
>  getspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_REG_BUSY -all
> 
>  (3) putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER OFF -all
> 
> L3 purge:
>  1) Start the purge:
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_TTYPE FULL_PURGE -all
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ ON -all
> 
>  2) Ensure that the purge has completed by checking the status bit:
>  getspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ -all
> 
>  You should see it say OFF if it's done:
>  p9n.ex k0:n0:s0:p00:c0
>  EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ
>  OFF"
> 
> Suggested-by: Alistair Popple <alistair@popple.id.au>
> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>



Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>

It is correct as it is but there is also some room for bikeshedding, see
below :)


> ---
> 
> This is done synchronously for now as it doesn't seem to take *too* long
> (purging the L2 and L3 caches after building the 4.16 linux kernel on a p9
> with 16 cores took 1.57 ms, 1.49ms and 1.46ms).
> 
> 
>  hw/npu2.c           | 135 +++++++++++++++++++++++++++++++++++++++++++-
>  include/npu2-regs.h |  11 ++++
>  2 files changed, 145 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/npu2.c b/hw/npu2.c
> index 30049f5b..9c0e6114 100644
> --- a/hw/npu2.c
> +++ b/hw/npu2.c
> @@ -326,6 +326,136 @@ static int64_t npu2_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
>  	return npu2_cfg_read_bar(ndev, pcrf, offset, len, data);
>  }
>  
> +static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> +	rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
> +			      L2_PRD_PURGE_CMD_TYPE_MASK);
> +	if (!rc)
> +		rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
> +			      L2_PRD_PURGE_CMD_TRIGGER);
> +	if (rc)
> +		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
> +		      "failed %i\n", core_id, rc);
> +	return rc;
> +}
> +
> +static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc;
> +	unsigned long now = mftb();
> +	unsigned long end = now + msecs_to_tb(2);
> +	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> +	while (val & L2_PRD_PURGE_CMD_REG_BUSY) {


This check is pointless as you never break out of the loop because of it
(you do it explicitly below), could be as simple while(1) or for(;;).


> +		rc = xscom_read(chip_id, addr, &val);
> +		if (rc) {
> +			prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
> +			      "failed %i\n", core_id, rc);
> +			break;
> +		}
> +		if (!(val & L2_PRD_PURGE_CMD_REG_BUSY))
> +			break;
> +		now = mftb();
> +		if (tb_compare(now, end) == TB_AAFTERB) {
> +			prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
> +			      core_id, rc);
> +			return OPAL_BUSY;
> +		}
> +	}
> +
> +	/* We have to clear the trigger bit ourselves */
> +	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
> +	rc = xscom_write(chip_id, addr, val);
> +	if (rc)
> +		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
> +		      core_id, rc);
> +	return rc;
> +}
> +
> +static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> +	rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
> +			      L3_PRD_PURGE_TTYPE_MASK);
> +	if (!rc)
> +		rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
> +			      L3_PRD_PURGE_REQ);
> +	if (rc)
> +		prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
> +		      "failed %i\n", core_id, rc);
> +	return rc;
> +}
> +
> +static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc;
> +	unsigned long now = mftb();
> +	unsigned long end = now + msecs_to_tb(2);
> +	uint64_t val = L3_PRD_PURGE_REQ;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> +	/* Trigger bit is automatically set to zero when flushing is done */
> +	while (val & L3_PRD_PURGE_REQ) {

Same here.

> +		rc = xscom_read(chip_id, addr, &val);
> +		if (rc) {
> +			prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
> +			      "failed %i\n", core_id, rc);
> +			break;
> +		}
> +		if (!(val & L3_PRD_PURGE_REQ))
> +			break;
> +		now = mftb();
> +		if (tb_compare(now, end) == TB_AAFTERB) {
> +			prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
> +			      core_id, rc);
> +			return OPAL_BUSY;
> +		}
> +	}
> +	return rc;
> +}
> +
> +static int64_t purge_l2_l3_caches(void)
> +{
> +	struct cpu_thread *t;
> +	uint64_t core_id, prev_core_id = (uint64_t)-1;
> +
> +	for_each_ungarded_cpu(t) {
> +		/* Only need to do it once per core chiplet */
> +		core_id = pir_to_core_id(t->pir);
> +		if (prev_core_id == core_id)
> +			continue;
> +		prev_core_id = core_id;
> +		if (start_l2_purge(t->chip_id, core_id))
> +			goto out;


start_l2_purge() can return OPAL_BUSY but can also return other errors
from xscom_read()/xscom_write(), yet you convert them all to "busy" anyway.

A better approach would be:
ret = start_l2_purge(t->chip_id, core_id);
if (ret)
	goto out;

and...


> +		if (start_l3_purge(t->chip_id, core_id))
> +			goto out;
> +	}
> +
> +	prev_core_id = (uint64_t)-1;
> +	for_each_ungarded_cpu(t) {
> +		/* Only need to do it once per core chiplet */
> +		core_id = pir_to_core_id(t->pir);
> +		if (prev_core_id == core_id)
> +			continue;
> +		prev_core_id = core_id;
> +
> +		if (wait_l2_purge(t->chip_id, core_id))
> +			goto out;
> +		if (wait_l3_purge(t->chip_id, core_id))
> +			goto out;
> +	}
> +	return OPAL_SUCCESS;
> +out:
> +	prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);
> +	return OPAL_BUSY_EVENT;

... return ret here.

> +}
> +
>  static int64_t npu2_dev_cfg_exp_devcap(void *dev,
>  		struct pci_cfg_reg_filter *pcrf __unused,
>  		uint32_t offset, uint32_t size,
> @@ -346,6 +476,9 @@ static int64_t npu2_dev_cfg_exp_devcap(void *dev,
>  	if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
>  		npu2_dev_procedure_reset(ndev);
>  
> +	if (purge_l2_l3_caches())
> +		return OPAL_BUSY_EVENT;

We do not want to purge caches every time we touch this capability. It
is not going to happen often in practice, but I'd think more often than
just the cases when a driver wants to reset a device.

Also you convert every possible error from purge_l2_l3_caches() to just
"busy" while there are more options.

ret = purge_l2_l3_caches();
if (ret)
	return ret;

We have more than 30 error codes, let them be used :)


> +
>  	return OPAL_PARTIAL;
>  }
>  
> @@ -1125,7 +1258,7 @@ static int64_t npu2_hreset(struct pci_slot *slot __unused)
>  			reset_ntl(ndev);
>  		}
>  	}
> -	return OPAL_SUCCESS;
> +	return purge_l2_l3_caches();
>  }
>  
>  static int64_t npu2_freset(struct pci_slot *slot __unused)
> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> index 10a28166..8273b2be 100644
> --- a/include/npu2-regs.h
> +++ b/include/npu2-regs.h
> @@ -756,4 +756,15 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>  #define OB3_ODL0_ENDPOINT_INFO			0xC010832
>  #define OB3_ODL1_ENDPOINT_INFO			0xC010833
>  
> +/* Registers and bits used to clear the L2 and L3 cache */
> +#define L2_PRD_PURGE_CMD_REG 			0x1080E
> +#define L2_PRD_PURGE_CMD_REG_BUSY 		0x0040000000000000
> +#define L2_PRD_PURGE_CMD_TYPE_MASK		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
> +#define L2_PRD_PURGE_CMD_TRIGGER		PPC_BIT(0)
> +#define L2CAC_FLUSH				0x0
> +#define L3_PRD_PURGE_REG			0x1180E
> +#define L3_PRD_PURGE_REQ			PPC_BIT(0)
> +#define L3_PRD_PURGE_TTYPE_MASK 		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
> +#define L3_FULL_PURGE				0x0
> +
>  #endif /* __NPU2_REGS_H */
>

Patch

diff --git a/hw/npu2.c b/hw/npu2.c
index 30049f5b..9c0e6114 100644
--- a/hw/npu2.c
+++ b/hw/npu2.c
@@ -326,6 +326,136 @@  static int64_t npu2_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
 	return npu2_cfg_read_bar(ndev, pcrf, offset, len, data);
 }
 
+static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
+{
+	int rc;
+	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
+
+	rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
+			      L2_PRD_PURGE_CMD_TYPE_MASK);
+	if (!rc)
+		rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
+			      L2_PRD_PURGE_CMD_TRIGGER);
+	if (rc)
+		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
+		      "failed %i\n", core_id, rc);
+	return rc;
+}
+
+static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
+{
+	int rc;
+	unsigned long now = mftb();
+	unsigned long end = now + msecs_to_tb(2);
+	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
+	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
+
+	while (val & L2_PRD_PURGE_CMD_REG_BUSY) {
+		rc = xscom_read(chip_id, addr, &val);
+		if (rc) {
+			prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
+			      "failed %i\n", core_id, rc);
+			break;
+		}
+		if (!(val & L2_PRD_PURGE_CMD_REG_BUSY))
+			break;
+		now = mftb();
+		if (tb_compare(now, end) == TB_AAFTERB) {
+			prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
+			      core_id, rc);
+			return OPAL_BUSY;
+		}
+	}
+
+	/* We have to clear the trigger bit ourselves */
+	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
+	rc = xscom_write(chip_id, addr, val);
+	if (rc)
+		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
+		      core_id, rc);
+	return rc;
+}
+
+static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
+{
+	int rc;
+	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
+
+	rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
+			      L3_PRD_PURGE_TTYPE_MASK);
+	if (!rc)
+		rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
+			      L3_PRD_PURGE_REQ);
+	if (rc)
+		prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
+		      "failed %i\n", core_id, rc);
+	return rc;
+}
+
+static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
+{
+	int rc;
+	unsigned long now = mftb();
+	unsigned long end = now + msecs_to_tb(2);
+	uint64_t val = L3_PRD_PURGE_REQ;
+	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
+
+	/* Trigger bit is automatically set to zero when flushing is done */
+	while (val & L3_PRD_PURGE_REQ) {
+		rc = xscom_read(chip_id, addr, &val);
+		if (rc) {
+			prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
+			      "failed %i\n", core_id, rc);
+			break;
+		}
+		if (!(val & L3_PRD_PURGE_REQ))
+			break;
+		now = mftb();
+		if (tb_compare(now, end) == TB_AAFTERB) {
+			prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
+			      core_id, rc);
+			return OPAL_BUSY;
+		}
+	}
+	return rc;
+}
+
+static int64_t purge_l2_l3_caches(void)
+{
+	struct cpu_thread *t;
+	uint64_t core_id, prev_core_id = (uint64_t)-1;
+
+	for_each_ungarded_cpu(t) {
+		/* Only need to do it once per core chiplet */
+		core_id = pir_to_core_id(t->pir);
+		if (prev_core_id == core_id)
+			continue;
+		prev_core_id = core_id;
+		if (start_l2_purge(t->chip_id, core_id))
+			goto out;
+		if (start_l3_purge(t->chip_id, core_id))
+			goto out;
+	}
+
+	prev_core_id = (uint64_t)-1;
+	for_each_ungarded_cpu(t) {
+		/* Only need to do it once per core chiplet */
+		core_id = pir_to_core_id(t->pir);
+		if (prev_core_id == core_id)
+			continue;
+		prev_core_id = core_id;
+
+		if (wait_l2_purge(t->chip_id, core_id))
+			goto out;
+		if (wait_l3_purge(t->chip_id, core_id))
+			goto out;
+	}
+	return OPAL_SUCCESS;
+out:
+	prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);
+	return OPAL_BUSY_EVENT;
+}
+
 static int64_t npu2_dev_cfg_exp_devcap(void *dev,
 		struct pci_cfg_reg_filter *pcrf __unused,
 		uint32_t offset, uint32_t size,
@@ -346,6 +476,9 @@  static int64_t npu2_dev_cfg_exp_devcap(void *dev,
 	if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
 		npu2_dev_procedure_reset(ndev);
 
+	if (purge_l2_l3_caches())
+		return OPAL_BUSY_EVENT;
+
 	return OPAL_PARTIAL;
 }
 
@@ -1125,7 +1258,7 @@  static int64_t npu2_hreset(struct pci_slot *slot __unused)
 			reset_ntl(ndev);
 		}
 	}
-	return OPAL_SUCCESS;
+	return purge_l2_l3_caches();
 }
 
 static int64_t npu2_freset(struct pci_slot *slot __unused)
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index 10a28166..8273b2be 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -756,4 +756,15 @@  void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
 #define OB3_ODL0_ENDPOINT_INFO			0xC010832
 #define OB3_ODL1_ENDPOINT_INFO			0xC010833
 
+/* Registers and bits used to clear the L2 and L3 cache */
+#define L2_PRD_PURGE_CMD_REG 			0x1080E
+#define L2_PRD_PURGE_CMD_REG_BUSY 		0x0040000000000000
+#define L2_PRD_PURGE_CMD_TYPE_MASK		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
+#define L2_PRD_PURGE_CMD_TRIGGER		PPC_BIT(0)
+#define L2CAC_FLUSH				0x0
+#define L3_PRD_PURGE_REG			0x1180E
+#define L3_PRD_PURGE_REQ			PPC_BIT(0)
+#define L3_PRD_PURGE_TTYPE_MASK 		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
+#define L3_FULL_PURGE				0x0
+
 #endif /* __NPU2_REGS_H */