Add in new OPAL call to flush the L2 and L3 caches.

Message ID 20181029061907.31951-1-rashmica.g@gmail.com
State Superseded
Headers show
Series
  • Add in new OPAL call to flush the L2 and L3 caches.
Related show

Checks

Context Check Description
snowpatch_ozlabs/make_check success Test make_check on branch master
snowpatch_ozlabs/apply_patch success master/apply_patch Successfully applied

Commit Message

Rashmica Gupta Oct. 29, 2018, 6:19 a.m.
Using the dcbf instruction is really slow... This is much faster.

Suggested-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
---
Stewart: I realise that cpu.c is probably not where this should live...
Thoughts on where it should go?

Context: When resetting a GPU we want to make sure all dirty cache lines
in the CPU cache are cleared. Hit up Alexey and Alistair for the nitty
gritty details.

 core/cpu.c         | 108 +++++++++++++++++++++++++++++++++++++++++++++
 include/cpu.h      |   2 +
 include/opal-api.h |   3 +-
 3 files changed, 112 insertions(+), 1 deletion(-)

Comments

Alexey Kardashevskiy Nov. 2, 2018, 2:17 a.m. | #1
On 29/10/2018 17:19, Rashmica Gupta wrote:
> Using the dcbf instruction is really slow... This is much faster.
> 
> Suggested-by: Alistair Popple <alistair@popple.id.au>
> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
> ---
> Stewart: I realise that cpu.c is probably not where this should live...
> Thoughts on where it should go?

I cannot really see any better place for this though.


> 
> Context: When resetting a GPU we want to make sure all dirty cache lines
> in the CPU cache are cleared. Hit up Alexey and Alistair for the nitty
> gritty details.
> 
>  core/cpu.c         | 108 +++++++++++++++++++++++++++++++++++++++++++++
>  include/cpu.h      |   2 +
>  include/opal-api.h |   3 +-
>  3 files changed, 112 insertions(+), 1 deletion(-)
> 
> diff --git a/core/cpu.c b/core/cpu.c
> index 4f518a4c..bc4fcaf8 100644
> --- a/core/cpu.c
> +++ b/core/cpu.c
> @@ -1630,3 +1630,111 @@ static int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr)
>  	return rc;
>  }
>  opal_call(OPAL_NMMU_SET_PTCR, opal_nmmu_set_ptcr, 2);
> +
> +

One empty line should be fine :)

> +#define L2_PRD_PURGE_CMD_REG 0x1080E
> +#define L2_PRD_PURGE_CMD_REG_BUSY 0x0040000000000000
> +#define L2_PRD_PURGE_CMD_TRIGGER 0x8000000000000000
> +#define L3_PRD_PURGE_REG 0x1180E
> +#define L3_PRD_PURGE_REQ 0x8000000000000000

My guess is that these should go to include/xscom-p9-regs.h, where other
registers used with XSCOM_ADDR_P9_EX live. Also, the values written to
the registers are better be defined with PPC_BIT, just like everything
else in that file, it makes it easier to match the numbers with
scoms/ex_chiplet.html


> +#define TIMEOUT_MS 2
> +
> +static inline bool time_expired(unsigned long start)
> +{
> +	unsigned long time = tb_to_msecs(mftb());
> +
> +	if (time - start > TIMEOUT_MS) {
> +		return true;
> +	}


Does Stewart^wskiboot require curly braces for single line statements?
Also, is there 80 chars requirement (just curious)?


> +	return false;
> +}
> +
> +static int flush_l2_caches(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc, timeout = 0;
> +	unsigned long start_time;
> +	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> +	rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
> +			      L2_PRD_PURGE_CMD_TRIGGER);

The advise from the hw folks was:

 putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TYPE L2CAC_FLUSH -all

 putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER ON -all

I can see trigger action but not L2CAC_FLUSH.

> +	if (rc) {
> +		prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write_mask failed %i\n", core_id, rc);

Any reason not to "return rc" right here?

> +	}
> +	start_time = tb_to_msecs(mftb());
> +	while ((val & L2_PRD_PURGE_CMD_REG_BUSY) && !(timeout = time_expired(start_time))) {


Can we please not have '=' in a condition?


> +		rc = xscom_read(chip_id, addr, &val);
> +		if (rc) {
> +			prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM read failed %i\n", core_id, rc);
> +			break;
> +		}


if (tb_to_msecs(mftb()) - start_time > TIMEOUT_MS) {
	prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n", core_id, rc);
	return OPAL_BUSY;
}

...


> +	}
> +	if (timeout) {
> +		prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n", core_id, rc);
> +		return OPAL_BUSY;
> +	}

... and ditch this check. And the time_expired() helper (which does not
seem to help much anyway). And the local @timeout variable. And '=' in
the condition :)


> +
> +	/* We have to clear the trigger bit ourselves */
> +	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
> +	rc = xscom_write(chip_id, addr, val);
> +	if (rc)
> +		prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write failed %i\n", core_id, rc);
> +	return 0;


Why not return rc?


> +
> +}
> +
> +static int flush_l3_caches(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc, timeout = 0;
> +	unsigned long start_time;
> +	uint64_t val = L3_PRD_PURGE_REQ;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> +	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ, L3_PRD_PURGE_REQ);

Similar to L2CAC_FLUSH question here:

 putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_TTYPE FULL_PURGE -all

 putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ ON -all


There is L3_PRD_PURGE_REQ but no FULL_PURGE.


> +	if (rc) {
> +		prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM write_mask failed %i\n", core_id, rc);


return rc?

> +	}
> +
> +	/* Trigger bit is automatically set to zero when flushing is done*/
> +	start_time = tb_to_msecs(mftb());
> +	while ((val & L3_PRD_PURGE_REQ) && !(timeout = time_expired(start_time) )) {
> +		rc = xscom_read(chip_id, addr, &val);
> +		if (rc) {
> +			prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM read failed %i\n", core_id, rc);
> +			break;
> +		}
> +	}
> +	if (timeout) {
> +		prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n", core_id, rc);
> +		return OPAL_BUSY;
> +	}
> +
> +	return 0;

return rc?


> +}
> +
> +int flush_caches(void)

OPAL calls return int64_t. And the handlers are static so there is no
reason to export the symbol via include/cpu.h below.


> +{
> +	int rc = 0;
> +	struct cpu_thread *t;
> +	uint64_t chip_id, core_id, prev_core_id = 0xdeadbeef;
> +
> +	if ((mfspr(SPR_PVR) & PVR_TYPE_P9) != PVR_TYPE_P9)
> +		return OPAL_UNSUPPORTED;
> +
> +	for_each_cpu(t) {
> +		/* Only need to do it once per core chiplet */
> +		core_id = pir_to_core_id(t->pir);
> +		if (prev_core_id == core_id)
> +			continue;
> +		prev_core_id = core_id;
> +		chip_id = t->chip_id;


The chip_id local variable seems pretty useless, could just use t->chip_id.

> +
> +		rc |= flush_l2_caches(chip_id, core_id);
> +		rc |= flush_l3_caches(chip_id, core_id);
> +	}
> +
> +	return rc;
> +}
> +
> +

Extra empty line?

> +opal_call(OPAL_CLEAR_CACHE, flush_caches, 0);
> diff --git a/include/cpu.h b/include/cpu.h
> index 2fe47982..04c862c5 100644
> --- a/include/cpu.h
> +++ b/include/cpu.h
> @@ -329,4 +329,6 @@ int dctl_set_special_wakeup(struct cpu_thread *t);
>  int dctl_clear_special_wakeup(struct cpu_thread *t);
>  int dctl_core_is_gated(struct cpu_thread *t);
>  
> +extern int flush_caches(void);
> +
>  #endif /* __CPU_H */
> diff --git a/include/opal-api.h b/include/opal-api.h
> index 5f397c8e..c24838d2 100644
> --- a/include/opal-api.h
> +++ b/include/opal-api.h
> @@ -226,7 +226,8 @@
>  #define OPAL_NX_COPROC_INIT			167
>  #define OPAL_NPU_SET_RELAXED_ORDER		168
>  #define OPAL_NPU_GET_RELAXED_ORDER		169
> -#define OPAL_LAST				169
> +#define OPAL_CLEAR_CACHE			170


s/clear/purge/. The scoms calls it "purge", let's stick to this name.


> +#define OPAL_LAST				170
>  
>  #define QUIESCE_HOLD			1 /* Spin all calls at entry */
>  #define QUIESCE_REJECT			2 /* Fail all calls with OPAL_BUSY */
>
Alistair Popple Nov. 2, 2018, 2:22 a.m. | #2
Hi Rashmica,

> > +	}
> > +	start_time = tb_to_msecs(mftb());
> > +	while ((val & L2_PRD_PURGE_CMD_REG_BUSY) && !(timeout =
> > time_expired(start_time))) {
> Can we please not have '=' in a condition?

...

> ... and ditch this check. And the time_expired() helper (which does not
> seem to help much anyway). And the local @timeout variable. And '=' in
> the condition :)

Take a look at the tb_compare() function (and perhaps how others use it). I 
think that probably does what you want.

- Alistair

> > +
> > +	/* We have to clear the trigger bit ourselves */
> > +	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
> > +	rc = xscom_write(chip_id, addr, val);
> > +	if (rc)
> > +		prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write failed %i\n",
> > core_id, rc); +	return 0;
> 
> Why not return rc?
> 
> > +
> > +}
> > +
> > +static int flush_l3_caches(uint32_t chip_id, uint32_t core_id)
> > +{
> > +	int rc, timeout = 0;
> > +	unsigned long start_time;
> > +	uint64_t val = L3_PRD_PURGE_REQ;
> > +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> > +
> > +	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
> > L3_PRD_PURGE_REQ);
> 
> Similar to L2CAC_FLUSH question here:
> 
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_TTYPE FULL_PURGE -all
> 
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ ON -all
> 
> 
> There is L3_PRD_PURGE_REQ but no FULL_PURGE.
> 
> > +	if (rc) {
> > +		prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM write_mask failed %i
\n",
> > core_id, rc);
> return rc?
> 
> > +	}
> > +
> > +	/* Trigger bit is automatically set to zero when flushing is done*/
> > +	start_time = tb_to_msecs(mftb());
> > +	while ((val & L3_PRD_PURGE_REQ) && !(timeout = 
time_expired(start_time)
> > )) { +		rc = xscom_read(chip_id, addr, &val);
> > +		if (rc) {
> > +			prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM read failed %i\n",
> > core_id, rc); +			break;
> > +		}
> > +	}
> > +	if (timeout) {
> > +		prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n", core_id, 
rc);
> > +		return OPAL_BUSY;
> > +	}
> > +
> > +	return 0;
> 
> return rc?
> 
> > +}
> > +
> > +int flush_caches(void)
> 
> OPAL calls return int64_t. And the handlers are static so there is no
> reason to export the symbol via include/cpu.h below.
> 
> > +{
> > +	int rc = 0;
> > +	struct cpu_thread *t;
> > +	uint64_t chip_id, core_id, prev_core_id = 0xdeadbeef;
> > +
> > +	if ((mfspr(SPR_PVR) & PVR_TYPE_P9) != PVR_TYPE_P9)
> > +		return OPAL_UNSUPPORTED;
> > +
> > +	for_each_cpu(t) {
> > +		/* Only need to do it once per core chiplet */
> > +		core_id = pir_to_core_id(t->pir);
> > +		if (prev_core_id == core_id)
> > +			continue;
> > +		prev_core_id = core_id;
> > +		chip_id = t->chip_id;
> 
> The chip_id local variable seems pretty useless, could just use t->chip_id.
> 
> > +
> > +		rc |= flush_l2_caches(chip_id, core_id);
> > +		rc |= flush_l3_caches(chip_id, core_id);
> > +	}
> > +
> > +	return rc;
> > +}
> > +
> > +
> 
> Extra empty line?
> 
> > +opal_call(OPAL_CLEAR_CACHE, flush_caches, 0);
> > diff --git a/include/cpu.h b/include/cpu.h
> > index 2fe47982..04c862c5 100644
> > --- a/include/cpu.h
> > +++ b/include/cpu.h
> > @@ -329,4 +329,6 @@ int dctl_set_special_wakeup(struct cpu_thread *t);
> > 
> >  int dctl_clear_special_wakeup(struct cpu_thread *t);
> >  int dctl_core_is_gated(struct cpu_thread *t);
> > 
> > +extern int flush_caches(void);
> > +
> > 
> >  #endif /* __CPU_H */
> > 
> > diff --git a/include/opal-api.h b/include/opal-api.h
> > index 5f397c8e..c24838d2 100644
> > --- a/include/opal-api.h
> > +++ b/include/opal-api.h
> > @@ -226,7 +226,8 @@
> > 
> >  #define OPAL_NX_COPROC_INIT			167
> >  #define OPAL_NPU_SET_RELAXED_ORDER		168
> >  #define OPAL_NPU_GET_RELAXED_ORDER		169
> > 
> > -#define OPAL_LAST				169
> > +#define OPAL_CLEAR_CACHE			170
> 
> s/clear/purge/. The scoms calls it "purge", let's stick to this name.
> 
> > +#define OPAL_LAST				170
> > 
> >  #define QUIESCE_HOLD			1 /* Spin all calls at entry */
> >  #define QUIESCE_REJECT			2 /* Fail all calls with OPAL_BUSY */
Rashmica Gupta Nov. 2, 2018, 2:33 a.m. | #3
On Fri, 2018-11-02 at 13:22 +1100, Alistair Popple wrote:
> Hi Rashmica,
> 
> > > +	}
> > > +	start_time = tb_to_msecs(mftb());
> > > +	while ((val & L2_PRD_PURGE_CMD_REG_BUSY) && !(timeout =
> > > time_expired(start_time))) {
> > 
> > Can we please not have '=' in a condition?
> 
> ...
> 
> > ... and ditch this check. And the time_expired() helper (which does
> > not
> > seem to help much anyway). And the local @timeout variable. And '='
> > in
> > the condition :)
> 
> Take a look at the tb_compare() function (and perhaps how others use
> it). I 
> think that probably does what you want.

:P it sure does!

> 
> - Alistair
> 
> > > +
> > > +	/* We have to clear the trigger bit ourselves */
> > > +	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
> > > +	rc = xscom_write(chip_id, addr, val);
> > > +	if (rc)
> > > +		prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write
> > > failed %i\n",
> > > core_id, rc); +	return 0;
> > 
> > Why not return rc?
> > 
> > > +
> > > +}
> > > +
> > > +static int flush_l3_caches(uint32_t chip_id, uint32_t core_id)
> > > +{
> > > +	int rc, timeout = 0;
> > > +	unsigned long start_time;
> > > +	uint64_t val = L3_PRD_PURGE_REQ;
> > > +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> > > +
> > > +	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
> > > L3_PRD_PURGE_REQ);
> > 
> > Similar to L2CAC_FLUSH question here:
> > 
> >  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_TTYPE FULL_PURGE
> > -all
> > 
> >  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ ON -all
> > 
> > 
> > There is L3_PRD_PURGE_REQ but no FULL_PURGE.
> > 
> > > +	if (rc) {
> > > +		prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM write_mask
> > > failed %i
> 
> \n",
> > > core_id, rc);
> > 
> > return rc?
> > 
> > > +	}
> > > +
> > > +	/* Trigger bit is automatically set to zero when flushing is
> > > done*/
> > > +	start_time = tb_to_msecs(mftb());
> > > +	while ((val & L3_PRD_PURGE_REQ) && !(timeout = 
> 
> time_expired(start_time)
> > > )) { +		rc = xscom_read(chip_id, addr, &val);
> > > +		if (rc) {
> > > +			prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM
> > > read failed %i\n",
> > > core_id, rc); +			break;
> > > +		}
> > > +	}
> > > +	if (timeout) {
> > > +		prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n",
> > > core_id, 
> 
> rc);
> > > +		return OPAL_BUSY;
> > > +	}
> > > +
> > > +	return 0;
> > 
> > return rc?
> > 
> > > +}
> > > +
> > > +int flush_caches(void)
> > 
> > OPAL calls return int64_t. And the handlers are static so there is
> > no
> > reason to export the symbol via include/cpu.h below.
> > 
> > > +{
> > > +	int rc = 0;
> > > +	struct cpu_thread *t;
> > > +	uint64_t chip_id, core_id, prev_core_id = 0xdeadbeef;
> > > +
> > > +	if ((mfspr(SPR_PVR) & PVR_TYPE_P9) != PVR_TYPE_P9)
> > > +		return OPAL_UNSUPPORTED;
> > > +
> > > +	for_each_cpu(t) {
> > > +		/* Only need to do it once per core chiplet */
> > > +		core_id = pir_to_core_id(t->pir);
> > > +		if (prev_core_id == core_id)
> > > +			continue;
> > > +		prev_core_id = core_id;
> > > +		chip_id = t->chip_id;
> > 
> > The chip_id local variable seems pretty useless, could just use t-
> > >chip_id.
> > 
> > > +
> > > +		rc |= flush_l2_caches(chip_id, core_id);
> > > +		rc |= flush_l3_caches(chip_id, core_id);
> > > +	}
> > > +
> > > +	return rc;
> > > +}
> > > +
> > > +
> > 
> > Extra empty line?
> > 
> > > +opal_call(OPAL_CLEAR_CACHE, flush_caches, 0);
> > > diff --git a/include/cpu.h b/include/cpu.h
> > > index 2fe47982..04c862c5 100644
> > > --- a/include/cpu.h
> > > +++ b/include/cpu.h
> > > @@ -329,4 +329,6 @@ int dctl_set_special_wakeup(struct cpu_thread
> > > *t);
> > > 
> > >  int dctl_clear_special_wakeup(struct cpu_thread *t);
> > >  int dctl_core_is_gated(struct cpu_thread *t);
> > > 
> > > +extern int flush_caches(void);
> > > +
> > > 
> > >  #endif /* __CPU_H */
> > > 
> > > diff --git a/include/opal-api.h b/include/opal-api.h
> > > index 5f397c8e..c24838d2 100644
> > > --- a/include/opal-api.h
> > > +++ b/include/opal-api.h
> > > @@ -226,7 +226,8 @@
> > > 
> > >  #define OPAL_NX_COPROC_INIT			167
> > >  #define OPAL_NPU_SET_RELAXED_ORDER		168
> > >  #define OPAL_NPU_GET_RELAXED_ORDER		169
> > > 
> > > -#define OPAL_LAST				169
> > > +#define OPAL_CLEAR_CACHE			170
> > 
> > s/clear/purge/. The scoms calls it "purge", let's stick to this
> > name.
> > 
> > > +#define OPAL_LAST				170
> > > 
> > >  #define QUIESCE_HOLD			1 /* Spin all calls at
> > > entry */
> > >  #define QUIESCE_REJECT			2 /* Fail all calls
> > > with OPAL_BUSY */
> 
>
Rashmica Gupta Nov. 2, 2018, 2:48 a.m. | #4
On Fri, 2018-11-02 at 13:17 +1100, Alexey Kardashevskiy wrote:
> 
> On 29/10/2018 17:19, Rashmica Gupta wrote:
> > Using the dcbf instruction is really slow... This is much faster.
> > 
> > Suggested-by: Alistair Popple <alistair@popple.id.au>
> > Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
> > ---
> > Stewart: I realise that cpu.c is probably not where this should
> > live...
> > Thoughts on where it should go?
> 
> I cannot really see any better place for this though.
> 
> 
> > 
> > Context: When resetting a GPU we want to make sure all dirty cache
> > lines
> > in the CPU cache are cleared. Hit up Alexey and Alistair for the
> > nitty
> > gritty details.
> > 
> >  core/cpu.c         | 108
> > +++++++++++++++++++++++++++++++++++++++++++++
> >  include/cpu.h      |   2 +
> >  include/opal-api.h |   3 +-
> >  3 files changed, 112 insertions(+), 1 deletion(-)
> > 
> > diff --git a/core/cpu.c b/core/cpu.c
> > index 4f518a4c..bc4fcaf8 100644
> > --- a/core/cpu.c
> > +++ b/core/cpu.c
> > @@ -1630,3 +1630,111 @@ static int64_t opal_nmmu_set_ptcr(uint64_t
> > chip_id, uint64_t ptcr)
> >  	return rc;
> >  }
> >  opal_call(OPAL_NMMU_SET_PTCR, opal_nmmu_set_ptcr, 2);
> > +
> > +
> 
> One empty line should be fine :)
> 
> > +#define L2_PRD_PURGE_CMD_REG 0x1080E
> > +#define L2_PRD_PURGE_CMD_REG_BUSY 0x0040000000000000
> > +#define L2_PRD_PURGE_CMD_TRIGGER 0x8000000000000000
> > +#define L3_PRD_PURGE_REG 0x1180E
> > +#define L3_PRD_PURGE_REQ 0x8000000000000000
> 
> My guess is that these should go to include/xscom-p9-regs.h, where
> other
> registers used with XSCOM_ADDR_P9_EX live. Also, the values written
> to
> the registers are better be defined with PPC_BIT, just like
> everything
> else in that file, it makes it easier to match the numbers with
> scoms/ex_chiplet.html
> 

Good point :)

> 
> > +#define TIMEOUT_MS 2
> > +
> > +static inline bool time_expired(unsigned long start)
> > +{
> > +	unsigned long time = tb_to_msecs(mftb());
> > +
> > +	if (time - start > TIMEOUT_MS) {
> > +		return true;
> > +	}
> 
> 
> Does Stewart^wskiboot require curly braces for single line
> statements?
> Also, is there 80 chars requirement (just curious)?
> 
I'm not sure... but I think I sent the wrong version - I had tidied up
the lines to 80 chars and was returning rc (which you mentioned below).

> 
> > +	return false;
> > +}
> > +
> > +static int flush_l2_caches(uint32_t chip_id, uint32_t core_id)
> > +{
> > +	int rc, timeout = 0;
> > +	unsigned long start_time;
> > +	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
> > +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id,
> > L2_PRD_PURGE_CMD_REG);
> > +
> > +	rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
> > +			      L2_PRD_PURGE_CMD_TRIGGER);
> 
> The advise from the hw folks was:
> 
>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TYPE L2CAC_FLUSH
> -all
> 
>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER ON -all
> 
> I can see trigger action but not L2CAC_FLUSH.
> 

I lumped them together (as L2CAC_FLUSH=>0b0000). I'll split it up and
make it clearer though.

> > +	if (rc) {
> > +		prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write_mask
> > failed %i\n", core_id, rc);
> 
> Any reason not to "return rc" right here?

Not that I can think of. 

> 
> > +	}
> > +	start_time = tb_to_msecs(mftb());
> > +	while ((val & L2_PRD_PURGE_CMD_REG_BUSY) && !(timeout =
> > time_expired(start_time))) {
> 
> 
> Can we please not have '=' in a condition?
> 

That will disapear with Al's suggestion.

> 
> > +		rc = xscom_read(chip_id, addr, &val);
> > +		if (rc) {
> > +			prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM
> > read failed %i\n", core_id, rc);
> > +			break;
> > +		}
> 
> 
> if (tb_to_msecs(mftb()) - start_time > TIMEOUT_MS) {
> 	prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n", core_id,
> rc);
> 	return OPAL_BUSY;
> }
> 
> ...
> 
> 
> > +	}
> > +	if (timeout) {
> > +		prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n",
> > core_id, rc);
> > +		return OPAL_BUSY;
> > +	}
> 
> ... and ditch this check. 

Why? You don't care if we timed out trying to clear the caches?

> And the time_expired() helper (which does not
> seem to help much anyway). And the local @timeout variable. And '='
> in
> the condition :)
> 
> 
> > +
> > +	/* We have to clear the trigger bit ourselves */
> > +	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
> > +	rc = xscom_write(chip_id, addr, val);
> > +	if (rc)
> > +		prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write
> > failed %i\n", core_id, rc);
> > +	return 0;
> 
> 
> Why not return rc?
> 

As mentioned above

> 
> > +
> > +}
> > +
> > +static int flush_l3_caches(uint32_t chip_id, uint32_t core_id)
> > +{
> > +	int rc, timeout = 0;
> > +	unsigned long start_time;
> > +	uint64_t val = L3_PRD_PURGE_REQ;
> > +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> > +
> > +	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
> > L3_PRD_PURGE_REQ);
> 
> Similar to L2CAC_FLUSH question here:
> 
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_TTYPE FULL_PURGE
> -all
> 
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ ON -all
> 
> 
> There is L3_PRD_PURGE_REQ but no FULL_PURGE.
> 

Same as above, I combined them as FULL_PURGE=>0b0000. Will seperate and
make clearer :)

> 
> > +	if (rc) {
> > +		prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM write_mask
> > failed %i\n", core_id, rc);
> 
> 
> return rc?
> 
> > +	}
> > +
> > +	/* Trigger bit is automatically set to zero when flushing is
> > done*/
> > +	start_time = tb_to_msecs(mftb());
> > +	while ((val & L3_PRD_PURGE_REQ) && !(timeout =
> > time_expired(start_time) )) {
> > +		rc = xscom_read(chip_id, addr, &val);
> > +		if (rc) {
> > +			prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM
> > read failed %i\n", core_id, rc);
> > +			break;
> > +		}
> > +	}
> > +	if (timeout) {
> > +		prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n",
> > core_id, rc);
> > +		return OPAL_BUSY;
> > +	}
> > +
> > +	return 0;
> 
> return rc?
> 
> 
> > +}
> > +
> > +int flush_caches(void)
> 
> OPAL calls return int64_t. And the handlers are static so there is no
> reason to export the symbol via include/cpu.h below.
> 

Ok.

> 
> > +{
> > +	int rc = 0;
> > +	struct cpu_thread *t;
> > +	uint64_t chip_id, core_id, prev_core_id = 0xdeadbeef;
> > +
> > +	if ((mfspr(SPR_PVR) & PVR_TYPE_P9) != PVR_TYPE_P9)
> > +		return OPAL_UNSUPPORTED;
> > +
> > +	for_each_cpu(t) {
> > +		/* Only need to do it once per core chiplet */
> > +		core_id = pir_to_core_id(t->pir);
> > +		if (prev_core_id == core_id)
> > +			continue;
> > +		prev_core_id = core_id;
> > +		chip_id = t->chip_id;
> 
> 
> The chip_id local variable seems pretty useless, could just use t-
> >chip_id.

Yep.

> 
> > +
> > +		rc |= flush_l2_caches(chip_id, core_id);
> > +		rc |= flush_l3_caches(chip_id, core_id);
> > +	}
> > +
> > +	return rc;
> > +}
> > +
> > +
> 
> Extra empty line?
> 
> > +opal_call(OPAL_CLEAR_CACHE, flush_caches, 0);
> > diff --git a/include/cpu.h b/include/cpu.h
> > index 2fe47982..04c862c5 100644
> > --- a/include/cpu.h
> > +++ b/include/cpu.h
> > @@ -329,4 +329,6 @@ int dctl_set_special_wakeup(struct cpu_thread
> > *t);
> >  int dctl_clear_special_wakeup(struct cpu_thread *t);
> >  int dctl_core_is_gated(struct cpu_thread *t);
> >  
> > +extern int flush_caches(void);
> > +
> >  #endif /* __CPU_H */
> > diff --git a/include/opal-api.h b/include/opal-api.h
> > index 5f397c8e..c24838d2 100644
> > --- a/include/opal-api.h
> > +++ b/include/opal-api.h
> > @@ -226,7 +226,8 @@
> >  #define OPAL_NX_COPROC_INIT			167
> >  #define OPAL_NPU_SET_RELAXED_ORDER		168
> >  #define OPAL_NPU_GET_RELAXED_ORDER		169
> > -#define OPAL_LAST				169
> > +#define OPAL_CLEAR_CACHE			170
> 
> 
> s/clear/purge/. The scoms calls it "purge", let's stick to this name.
> 

Good point :)

> 
> > +#define OPAL_LAST				170
> >  
> >  #define QUIESCE_HOLD			1 /* Spin all calls at
> > entry */
> >  #define QUIESCE_REJECT			2 /* Fail all calls
> > with OPAL_BUSY */
> > 
> 
>
Alexey Kardashevskiy Nov. 2, 2018, 3:14 a.m. | #5
On 02/11/2018 13:48, Rashmica Gupta wrote:
> On Fri, 2018-11-02 at 13:17 +1100, Alexey Kardashevskiy wrote:
>>
>> On 29/10/2018 17:19, Rashmica Gupta wrote:
>>> Using the dcbf instruction is really slow... This is much faster.
>>>
>>> Suggested-by: Alistair Popple <alistair@popple.id.au>
>>> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
>>> ---
>>> Stewart: I realise that cpu.c is probably not where this should
>>> live...
>>> Thoughts on where it should go?
>>
>> I cannot really see any better place for this though.
>>
>>
>>>
>>> Context: When resetting a GPU we want to make sure all dirty cache
>>> lines
>>> in the CPU cache are cleared. Hit up Alexey and Alistair for the
>>> nitty
>>> gritty details.
>>>
>>>  core/cpu.c         | 108
>>> +++++++++++++++++++++++++++++++++++++++++++++
>>>  include/cpu.h      |   2 +
>>>  include/opal-api.h |   3 +-
>>>  3 files changed, 112 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/core/cpu.c b/core/cpu.c
>>> index 4f518a4c..bc4fcaf8 100644
>>> --- a/core/cpu.c
>>> +++ b/core/cpu.c
>>> @@ -1630,3 +1630,111 @@ static int64_t opal_nmmu_set_ptcr(uint64_t
>>> chip_id, uint64_t ptcr)
>>>  	return rc;
>>>  }
>>>  opal_call(OPAL_NMMU_SET_PTCR, opal_nmmu_set_ptcr, 2);
>>> +
>>> +
>>
>> One empty line should be fine :)
>>
>>> +#define L2_PRD_PURGE_CMD_REG 0x1080E
>>> +#define L2_PRD_PURGE_CMD_REG_BUSY 0x0040000000000000
>>> +#define L2_PRD_PURGE_CMD_TRIGGER 0x8000000000000000
>>> +#define L3_PRD_PURGE_REG 0x1180E
>>> +#define L3_PRD_PURGE_REQ 0x8000000000000000
>>
>> My guess is that these should go to include/xscom-p9-regs.h, where
>> other
>> registers used with XSCOM_ADDR_P9_EX live. Also, the values written
>> to
>> the registers are better be defined with PPC_BIT, just like
>> everything
>> else in that file, it makes it easier to match the numbers with
>> scoms/ex_chiplet.html
>>
> 
> Good point :)
> 
>>
>>> +#define TIMEOUT_MS 2
>>> +
>>> +static inline bool time_expired(unsigned long start)
>>> +{
>>> +	unsigned long time = tb_to_msecs(mftb());
>>> +
>>> +	if (time - start > TIMEOUT_MS) {
>>> +		return true;
>>> +	}
>>
>>
>> Does Stewart^wskiboot require curly braces for single line
>> statements?
>> Also, is there 80 chars requirement (just curious)?
>>
> I'm not sure... but I think I sent the wrong version - I had tidied up
> the lines to 80 chars and was returning rc (which you mentioned below).
> 
>>
>>> +	return false;
>>> +}
>>> +
>>> +static int flush_l2_caches(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +	int rc, timeout = 0;
>>> +	unsigned long start_time;
>>> +	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
>>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id,
>>> L2_PRD_PURGE_CMD_REG);
>>> +
>>> +	rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
>>> +			      L2_PRD_PURGE_CMD_TRIGGER);
>>
>> The advise from the hw folks was:
>>
>>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TYPE L2CAC_FLUSH
>> -all
>>
>>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER ON -all
>>
>> I can see trigger action but not L2CAC_FLUSH.
>>
> 
> I lumped them together (as L2CAC_FLUSH=>0b0000). I'll split it up and
> make it clearer though.


Ah, my bad, I was reading it as 0x0b0000 and was looking for 'b'
:)Anyway, useful to see them defined somewhere. Also, can it be a single
write to the register (looks like it can and then there is no need to
separation) or should it be 2 writes as the hw folks strangely suggested?

When you post another version, please include those 'putspy' commands to
the commit log for the reference.


> 
>>> +	if (rc) {
>>> +		prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write_mask
>>> failed %i\n", core_id, rc);
>>
>> Any reason not to "return rc" right here?
> 
> Not that I can think of. 
> 
>>
>>> +	}
>>> +	start_time = tb_to_msecs(mftb());
>>> +	while ((val & L2_PRD_PURGE_CMD_REG_BUSY) && !(timeout =
>>> time_expired(start_time))) {
>>
>>
>> Can we please not have '=' in a condition?
>>
> 
> That will disapear with Al's suggestion.
> 
>>
>>> +		rc = xscom_read(chip_id, addr, &val);
>>> +		if (rc) {
>>> +			prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM
>>> read failed %i\n", core_id, rc);
>>> +			break;
>>> +		}
>>
>>
>> if (tb_to_msecs(mftb()) - start_time > TIMEOUT_MS) {
>> 	prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n", core_id,
>> rc);
>> 	return OPAL_BUSY;
>> }
>>
>> ...
>>
>>
>>> +	}
>>> +	if (timeout) {
>>> +		prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n",
>>> core_id, rc);
>>> +		return OPAL_BUSY;
>>> +	}
>>
>> ... and ditch this check. 
> 
> Why? You don't care if we timed out trying to clear the caches?

I meant if you check-and-return for timeout inside the loop without any
extra @timeout variable, you won't need this one.



> 
>> And the time_expired() helper (which does not
>> seem to help much anyway). And the local @timeout variable. And '='
>> in
>> the condition :)
>>
>>
>>> +
>>> +	/* We have to clear the trigger bit ourselves */
>>> +	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
>>> +	rc = xscom_write(chip_id, addr, val);
>>> +	if (rc)
>>> +		prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write
>>> failed %i\n", core_id, rc);
>>> +	return 0;
>>
>>
>> Why not return rc?
>>
> 
> As mentioned above
> 
>>
>>> +
>>> +}
>>> +
>>> +static int flush_l3_caches(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +	int rc, timeout = 0;
>>> +	unsigned long start_time;
>>> +	uint64_t val = L3_PRD_PURGE_REQ;
>>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
>>> +
>>> +	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
>>> L3_PRD_PURGE_REQ);
>>
>> Similar to L2CAC_FLUSH question here:
>>
>>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_TTYPE FULL_PURGE
>> -all
>>
>>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ ON -all
>>
>>
>> There is L3_PRD_PURGE_REQ but no FULL_PURGE.
>>
> 
> Same as above, I combined them as FULL_PURGE=>0b0000. Will seperate and
> make clearer :)
> 
>>
>>> +	if (rc) {
>>> +		prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM write_mask
>>> failed %i\n", core_id, rc);
>>
>>
>> return rc?
>>
>>> +	}
>>> +
>>> +	/* Trigger bit is automatically set to zero when flushing is
>>> done*/
>>> +	start_time = tb_to_msecs(mftb());
>>> +	while ((val & L3_PRD_PURGE_REQ) && !(timeout =
>>> time_expired(start_time) )) {
>>> +		rc = xscom_read(chip_id, addr, &val);
>>> +		if (rc) {
>>> +			prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM
>>> read failed %i\n", core_id, rc);
>>> +			break;
>>> +		}
>>> +	}
>>> +	if (timeout) {
>>> +		prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n",
>>> core_id, rc);
>>> +		return OPAL_BUSY;
>>> +	}
>>> +
>>> +	return 0;
>>
>> return rc?
>>
>>
>>> +}
>>> +
>>> +int flush_caches(void)
>>
>> OPAL calls return int64_t. And the handlers are static so there is no
>> reason to export the symbol via include/cpu.h below.
>>
> 
> Ok.
> 
>>
>>> +{
>>> +	int rc = 0;
>>> +	struct cpu_thread *t;
>>> +	uint64_t chip_id, core_id, prev_core_id = 0xdeadbeef;
>>> +
>>> +	if ((mfspr(SPR_PVR) & PVR_TYPE_P9) != PVR_TYPE_P9)
>>> +		return OPAL_UNSUPPORTED;
>>> +
>>> +	for_each_cpu(t) {
>>> +		/* Only need to do it once per core chiplet */
>>> +		core_id = pir_to_core_id(t->pir);
>>> +		if (prev_core_id == core_id)
>>> +			continue;
>>> +		prev_core_id = core_id;
>>> +		chip_id = t->chip_id;
>>
>>
>> The chip_id local variable seems pretty useless, could just use t-
>>> chip_id.
> 
> Yep.
> 
>>
>>> +
>>> +		rc |= flush_l2_caches(chip_id, core_id);
>>> +		rc |= flush_l3_caches(chip_id, core_id);
>>> +	}
>>> +
>>> +	return rc;
>>> +}
>>> +
>>> +
>>
>> Extra empty line?
>>
>>> +opal_call(OPAL_CLEAR_CACHE, flush_caches, 0);
>>> diff --git a/include/cpu.h b/include/cpu.h
>>> index 2fe47982..04c862c5 100644
>>> --- a/include/cpu.h
>>> +++ b/include/cpu.h
>>> @@ -329,4 +329,6 @@ int dctl_set_special_wakeup(struct cpu_thread
>>> *t);
>>>  int dctl_clear_special_wakeup(struct cpu_thread *t);
>>>  int dctl_core_is_gated(struct cpu_thread *t);
>>>  
>>> +extern int flush_caches(void);
>>> +
>>>  #endif /* __CPU_H */
>>> diff --git a/include/opal-api.h b/include/opal-api.h
>>> index 5f397c8e..c24838d2 100644
>>> --- a/include/opal-api.h
>>> +++ b/include/opal-api.h
>>> @@ -226,7 +226,8 @@
>>>  #define OPAL_NX_COPROC_INIT			167
>>>  #define OPAL_NPU_SET_RELAXED_ORDER		168
>>>  #define OPAL_NPU_GET_RELAXED_ORDER		169
>>> -#define OPAL_LAST				169
>>> +#define OPAL_CLEAR_CACHE			170
>>
>>
>> s/clear/purge/. The scoms calls it "purge", let's stick to this name.
>>
> 
> Good point :)
> 
>>
>>> +#define OPAL_LAST				170
>>>  
>>>  #define QUIESCE_HOLD			1 /* Spin all calls at
>>> entry */
>>>  #define QUIESCE_REJECT			2 /* Fail all calls
>>> with OPAL_BUSY */
>>>
>>
>>
>
Rashmica Gupta Nov. 2, 2018, 4:19 a.m. | #6
On Fri, 2018-11-02 at 14:14 +1100, Alexey Kardashevskiy wrote:
> > > > 
> 
> On 02/11/2018 13:48, Rashmica Gupta wrote:
> > On Fri, 2018-11-02 at 13:17 +1100, Alexey Kardashevskiy wrote:
> > > 
> > > On 29/10/2018 17:19, Rashmica Gupta wrote:

...

> > > > 
> > > > +static int flush_l2_caches(uint32_t chip_id, uint32_t core_id)
> > > > +{
> > > > +	int rc, timeout = 0;
> > > > +	unsigned long start_time;
> > > > +	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
> > > > +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id,
> > > > L2_PRD_PURGE_CMD_REG);
> > > > +
> > > > +	rc = xscom_write_mask(chip_id, addr,
> > > > L2_PRD_PURGE_CMD_TRIGGER,
> > > > +			      L2_PRD_PURGE_CMD_TRIGGER);
> > > 
> > > The advise from the hw folks was:
> > > 
> > >  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TYPE
> > > L2CAC_FLUSH
> > > -all
> > > 
> > >  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER ON -all
> > > 
> > > I can see trigger action but not L2CAC_FLUSH.
> > > 
> > 
> > I lumped them together (as L2CAC_FLUSH=>0b0000). I'll split it up
> > and
> > make it clearer though.
> 
> 
> Ah, my bad, I was reading it as 0x0b0000 and was looking for 'b'
> :)Anyway, useful to see them defined somewhere. Also, can it be a
> single
> write to the register (looks like it can and then there is no need to
> separation) or should it be 2 writes as the hw folks strangely
> suggested?

I assumed that it would be fine to write the type of flush and trigger
it at the same time... But their suggestion did have them seperate, so 
maybe I should do it like that just to be safe?

> 
> When you post another version, please include those 'putspy' commands
> to
> the commit log for the reference.
> 
> 

Ok :)

> > 
> > > > +	if (rc) {
> > > > +		prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM
> > > > write_mask
> > > > failed %i\n", core_id, rc);
> > > 
> > > Any reason not to "return rc" right here?
> > 
> > Not that I can think of. 
> > 
> > > 
> > > > +	}
> > > > +	start_time = tb_to_msecs(mftb());
> > > > +	while ((val & L2_PRD_PURGE_CMD_REG_BUSY) && !(timeout =
> > > > time_expired(start_time))) {
> > > 
> > > 
> > > Can we please not have '=' in a condition?
> > > 
> > 
> > That will disapear with Al's suggestion.
> > 
> > > 
> > > > +		rc = xscom_read(chip_id, addr, &val);
> > > > +		if (rc) {
> > > > +			prlog(PR_ERR, "FLUSH L2 on core 0x%x:
> > > > XSCOM
> > > > read failed %i\n", core_id, rc);
> > > > +			break;
> > > > +		}
> > > 
> > > 
> > > if (tb_to_msecs(mftb()) - start_time > TIMEOUT_MS) {
> > > 	prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n", core_id,
> > > rc);
> > > 	return OPAL_BUSY;
> > > }
> > > 
> > > ...
> > > 
> > > 
> > > > +	}
> > > > +	if (timeout) {
> > > > +		prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out
> > > > %i\n",
> > > > core_id, rc);
> > > > +		return OPAL_BUSY;
> > > > +	}
> > > 
> > > ... and ditch this check. 
> > 
> > Why? You don't care if we timed out trying to clear the caches?
> 
> I meant if you check-and-return for timeout inside the loop without
> any
> extra @timeout variable, you won't need this one.
> 

Ah, gotcha.
Alexey Kardashevskiy Nov. 2, 2018, 4:40 a.m. | #7
On 02/11/2018 15:19, Rashmica Gupta wrote:
> On Fri, 2018-11-02 at 14:14 +1100, Alexey Kardashevskiy wrote:
>>>>>
>>
>> On 02/11/2018 13:48, Rashmica Gupta wrote:
>>> On Fri, 2018-11-02 at 13:17 +1100, Alexey Kardashevskiy wrote:
>>>>
>>>> On 29/10/2018 17:19, Rashmica Gupta wrote:
> 
> ...
> 
>>>>>
>>>>> +static int flush_l2_caches(uint32_t chip_id, uint32_t core_id)
>>>>> +{
>>>>> +	int rc, timeout = 0;
>>>>> +	unsigned long start_time;
>>>>> +	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
>>>>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id,
>>>>> L2_PRD_PURGE_CMD_REG);
>>>>> +
>>>>> +	rc = xscom_write_mask(chip_id, addr,
>>>>> L2_PRD_PURGE_CMD_TRIGGER,
>>>>> +			      L2_PRD_PURGE_CMD_TRIGGER);
>>>>
>>>> The advise from the hw folks was:
>>>>
>>>>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TYPE
>>>> L2CAC_FLUSH
>>>> -all
>>>>
>>>>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER ON -all
>>>>
>>>> I can see trigger action but not L2CAC_FLUSH.
>>>>
>>>
>>> I lumped them together (as L2CAC_FLUSH=>0b0000). I'll split it up
>>> and
>>> make it clearer though.
>>
>>
>> Ah, my bad, I was reading it as 0x0b0000 and was looking for 'b'
>> :)Anyway, useful to see them defined somewhere. Also, can it be a
>> single
>> write to the register (looks like it can and then there is no need to
>> separation) or should it be 2 writes as the hw folks strangely
>> suggested?
> 
> I assumed that it would be fine to write the type of flush and trigger
> it at the same time... But their suggestion did have them seperate, so 
> maybe I should do it like that just to be safe?


I suggest asking Alistair the expert  :)
Oliver Nov. 5, 2018, 1:57 a.m. | #8
On Mon, Oct 29, 2018 at 5:19 PM Rashmica Gupta <rashmica.g@gmail.com> wrote:
>
> Using the dcbf instruction is really slow... This is much faster.
>
> Suggested-by: Alistair Popple <alistair@popple.id.au>
> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
> ---
> Stewart: I realise that cpu.c is probably not where this should live...
> Thoughts on where it should go?
>
> Context: When resetting a GPU we want to make sure all dirty cache lines
> in the CPU cache are cleared. Hit up Alexey and Alistair for the nitty
> gritty details.
>
>  core/cpu.c         | 108 +++++++++++++++++++++++++++++++++++++++++++++
>  include/cpu.h      |   2 +
>  include/opal-api.h |   3 +-
>  3 files changed, 112 insertions(+), 1 deletion(-)
>
> diff --git a/core/cpu.c b/core/cpu.c
> index 4f518a4c..bc4fcaf8 100644
> --- a/core/cpu.c
> +++ b/core/cpu.c
> @@ -1630,3 +1630,111 @@ static int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr)
>         return rc;
>  }
>  opal_call(OPAL_NMMU_SET_PTCR, opal_nmmu_set_ptcr, 2);
> +
> +
> +#define L2_PRD_PURGE_CMD_REG 0x1080E
> +#define L2_PRD_PURGE_CMD_REG_BUSY 0x0040000000000000
> +#define L2_PRD_PURGE_CMD_TRIGGER 0x8000000000000000
> +#define L3_PRD_PURGE_REG 0x1180E
> +#define L3_PRD_PURGE_REQ 0x8000000000000000
> +#define TIMEOUT_MS 2
> +
> +static inline bool time_expired(unsigned long start)
> +{
> +       unsigned long time = tb_to_msecs(mftb());
> +
> +       if (time - start > TIMEOUT_MS) {
> +               return true;
> +       }
> +       return false;
> +}
> +
> +static int flush_l2_caches(uint32_t chip_id, uint32_t core_id)
> +{
> +       int rc, timeout = 0;
> +       unsigned long start_time;
> +       uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
> +       uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> +       rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
> +                             L2_PRD_PURGE_CMD_TRIGGER);
> +       if (rc) {
> +               prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write_mask failed %i\n", core_id, rc);
> +       }
> +       start_time = tb_to_msecs(mftb());
> +       while ((val & L2_PRD_PURGE_CMD_REG_BUSY) && !(timeout = time_expired(start_time))) {
> +               rc = xscom_read(chip_id, addr, &val);
> +               if (rc) {
> +                       prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM read failed %i\n", core_id, rc);
> +                       break;
> +               }
> +       }
> +       if (timeout) {
> +               prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n", core_id, rc);
> +               return OPAL_BUSY;
> +       }
> +
> +       /* We have to clear the trigger bit ourselves */
> +       val &= ~L2_PRD_PURGE_CMD_TRIGGER;
> +       rc = xscom_write(chip_id, addr, val);
> +       if (rc)
> +               prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write failed %i\n", core_id, rc);
> +       return 0;
> +
> +}
> +
> +static int flush_l3_caches(uint32_t chip_id, uint32_t core_id)
> +{
> +       int rc, timeout = 0;
> +       unsigned long start_time;
> +       uint64_t val = L3_PRD_PURGE_REQ;
> +       uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> +       rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ, L3_PRD_PURGE_REQ);
> +       if (rc) {
> +               prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM write_mask failed %i\n", core_id, rc);
> +       }
> +
> +       /* Trigger bit is automatically set to zero when flushing is done*/
> +       start_time = tb_to_msecs(mftb());
> +       while ((val & L3_PRD_PURGE_REQ) && !(timeout = time_expired(start_time) )) {
> +               rc = xscom_read(chip_id, addr, &val);
> +               if (rc) {
> +                       prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM read failed %i\n", core_id, rc);
> +                       break;
> +               }
> +       }

This seems like one of the few occasions where a do {} while() loop
makes sense. not a big deal though.

> +       if (timeout) {
> +               prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n", core_id, rc);
> +               return OPAL_BUSY;
> +       }
> +
> +       return 0;
> +}
> +
> +int flush_caches(void)
> +{
> +       int rc = 0;
> +       struct cpu_thread *t;
> +       uint64_t chip_id, core_id, prev_core_id = 0xdeadbeef;
> +

> +       if ((mfspr(SPR_PVR) & PVR_TYPE_P9) != PVR_TYPE_P9)
> +               return OPAL_UNSUPPORTED;

This looks broken. The processor type portion of the PVR is in the
upper 16 bits:
    #define SPR_PVR_TYPE                    0xffff0000
And the PVE_TYPE_* macros are shifted down:
   #define PVR_TYPE_P9     0x004e

If you want to check the PVR this way then you need:
PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P9 instead. That said, we
generally don't look at the PVR directly and use the "proc_gen" global
instead: e.g.
   if (proc_gen != proc_gen_p9)

> +       for_each_cpu(t) {

Does this need to be for_each_cpu_ungarded()?

> +               /* Only need to do it once per core chiplet */
> +               core_id = pir_to_core_id(t->pir);
> +               if (prev_core_id == core_id)
> +                       continue;

The L2 and L3 are shared between a pairs of cores (i.e. one per EX
chiplet) rather than being per-core so I think this is doubling up on
flushes. Also, do we need to do a special wakeup here? I'm not sure
what happens to the caches if both cores in the pair are in a deep
sleep state.

> +               prev_core_id = core_id;
> +               chip_id = t->chip_id;
> +
> +               rc |= flush_l2_caches(chip_id, core_id);
> +               rc |= flush_l3_caches(chip_id, core_id);

If each call fails in a different manner you'll get gibberish in rc
since they're ORed together. Maybe we don't care but eh... it seems a
bit bad.

> +       }
> +
> +       return rc;
> +}
> +
> +

> +opal_call(OPAL_CLEAR_CACHE, flush_caches, 0);

I'd rather you called this CLEAR_L2_L3 or something. It doesn't touch
the L1 at all so I think calling it "CLEAR_CACHE" is a bit misleading.
As an alternative, you could also adjust the API to something like
opal_clear_cache(pir, mask_of_levels) and make opal_clear_cache(-1,
PURGE_L2 | PURGE_L3) match the current behaviour.

> diff --git a/include/cpu.h b/include/cpu.h
> index 2fe47982..04c862c5 100644
> --- a/include/cpu.h
> +++ b/include/cpu.h
> @@ -329,4 +329,6 @@ int dctl_set_special_wakeup(struct cpu_thread *t);
>  int dctl_clear_special_wakeup(struct cpu_thread *t);
>  int dctl_core_is_gated(struct cpu_thread *t);
>
> +extern int flush_caches(void);
> +
>  #endif /* __CPU_H */
> diff --git a/include/opal-api.h b/include/opal-api.h
> index 5f397c8e..c24838d2 100644
> --- a/include/opal-api.h
> +++ b/include/opal-api.h
> @@ -226,7 +226,8 @@
>  #define OPAL_NX_COPROC_INIT                    167
>  #define OPAL_NPU_SET_RELAXED_ORDER             168
>  #define OPAL_NPU_GET_RELAXED_ORDER             169
> -#define OPAL_LAST                              169
> +#define OPAL_CLEAR_CACHE                       170
> +#define OPAL_LAST                              170
>
>  #define QUIESCE_HOLD                   1 /* Spin all calls at entry */
>  #define QUIESCE_REJECT                 2 /* Fail all calls with OPAL_BUSY */
> --
> 2.17.2

As a final comment, it seems like we might be better off making this a
multi-pass thing. Currently we walk the list of CPUs once and wait for
the purge to finish on each before continuing. Since we don't really
have a good error handling story (I guess you fall back to doing a
dcbf loop in linux?) it might make sense to do something like:

for_each_cpu()
   start_l2_purge()
for_each_cpu()
   wait_for_l2_purge()
for_each_cpu()
   start_l3_purge()
for_each_cpu()
   wait_for_l3_purge()

and bail out if we hit an error at any point. As a general rule we
want OPAL calls to be handled in microseconds so since opal runs in
real mode with interrupts disabled. I'm not sure how fast a purge is,
but if it's a significant fraction of the 2 millisecond timeout window
then we might end up in OPAL for dozens for milliseconds in the worst
case. This might just be micro-optimising though since I expect this
is only going to be called once in a blue moon, but it's something to
consider.

>
> _______________________________________________
> Skiboot mailing list
> Skiboot@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot
Alistair Popple Nov. 5, 2018, 11:30 p.m. | #9
> > I assumed that it would be fine to write the type of flush and trigger
> > it at the same time... But their suggestion did have them seperate, so
> > maybe I should do it like that just to be safe?
> 
> I suggest asking Alistair the expert  :)

Ha! We'd need to ask the HW folk to be sure. It seems unlikely we should have 
to do them seperately but we should follow their suggestions unless we have 
confirmed otherwise. Seems like a pretty minor comment though so not sure 
changing it really gets us much.

- Alistair
Rashmica Gupta Nov. 9, 2018, 2:44 a.m. | #10
On Mon, 2018-11-05 at 12:57 +1100, Oliver wrote:
> > 

...

> On Mon, Oct 29, 2018 at 5:19 PM Rashmica Gupta <rashmica.g@gmail.com>
> wrote:
> 
> As a final comment, it seems like we might be better off making this
> a
> multi-pass thing. Currently we walk the list of CPUs once and wait
> for
> the purge to finish on each before continuing. Since we don't really
> have a good error handling story (I guess you fall back to doing a
> dcbf loop in linux?) it might make sense to do something like:
> 
> for_each_cpu()
>    start_l2_purge()
> for_each_cpu()
>    wait_for_l2_purge()
> for_each_cpu()
>    start_l3_purge()
> for_each_cpu()
>    wait_for_l3_purge()
> 
> and bail out if we hit an error at any point. As a general rule we
> want OPAL calls to be handled in microseconds so since opal runs in
> real mode with interrupts disabled. I'm not sure how fast a purge is,
> but if it's a significant fraction of the 2 millisecond timeout
> window
> then we might end up in OPAL for dozens for milliseconds in the worst
> case. This might just be micro-optimising though since I expect this
> is only going to be called once in a blue moon, but it's something to
> consider.
> 

Good point... Even implemented as you suggested, i've seen it take 5-
6ms for the whole call... Perhaps I should split it up into 2 opal
calls: start_l2_l3_cache_purge() and poll_l2_l3_cache_purge()?

Or maybe with the format you suggested, start_cache_purge(pir,
cache_mask) and poll_cache_purge(pir, cache_mask).

> > 
> > _______________________________________________
> > Skiboot mailing list
> > Skiboot@lists.ozlabs.org
> > https://lists.ozlabs.org/listinfo/skiboot
Oliver Nov. 9, 2018, 4 a.m. | #11
On Fri, Nov 9, 2018 at 1:44 PM Rashmica Gupta <rashmica.g@gmail.com> wrote:
>
> On Mon, 2018-11-05 at 12:57 +1100, Oliver wrote:
> > >
>
> ...
>
> > On Mon, Oct 29, 2018 at 5:19 PM Rashmica Gupta <rashmica.g@gmail.com>
> > wrote:
> >
> > As a final comment, it seems like we might be better off making this
> > a
> > multi-pass thing. Currently we walk the list of CPUs once and wait
> > for
> > the purge to finish on each before continuing. Since we don't really
> > have a good error handling story (I guess you fall back to doing a
> > dcbf loop in linux?) it might make sense to do something like:
> >
> > for_each_cpu()
> >    start_l2_purge()
> > for_each_cpu()
> >    wait_for_l2_purge()
> > for_each_cpu()
> >    start_l3_purge()
> > for_each_cpu()
> >    wait_for_l3_purge()
> >
> > and bail out if we hit an error at any point. As a general rule we
> > want OPAL calls to be handled in microseconds so since opal runs in
> > real mode with interrupts disabled. I'm not sure how fast a purge is,
> > but if it's a significant fraction of the 2 millisecond timeout
> > window
> > then we might end up in OPAL for dozens for milliseconds in the worst
> > case. This might just be micro-optimising though since I expect this
> > is only going to be called once in a blue moon, but it's something to
> > consider.
> >
>
> Good point... Even implemented as you suggested, i've seen it take 5-
> 6ms for the whole call... Perhaps I should split it up into 2 opal
> calls: start_l2_l3_cache_purge() and poll_l2_l3_cache_purge()?
>
> Or maybe with the format you suggested, start_cache_purge(pir,
> cache_mask) and poll_cache_purge(pir, cache_mask).

An async completion might make more sense here. They're a little
convoluted but the basic process is:

1) pass an async_token (magic number) in the OPAL call
2) schedule a timer inside of OPAL to check on the async job
3) return from the opal call with OPAL_ASYNC_COMPLETION
4) check on the purge state in the timer's callback function
5) when the async job is done, return use opal_msg() to send an
OPAL_MSG_ASYNC_COMP message.

The calling thread in linux can sleep while waiting for the async
completion message. If you want an example have a look at
core/i2c.c:opal_i2c_request() in skiboot and the
drivers/i2c/busses/i2c-opal.c in linux.

> > > _______________________________________________
> > > Skiboot mailing list
> > > Skiboot@lists.ozlabs.org
> > > https://lists.ozlabs.org/listinfo/skiboot
>
Alistair Popple Nov. 13, 2018, 5:27 a.m. | #12
> 
> An async completion might make more sense here. They're a little
> convoluted but the basic process is:
> 
> 1) pass an async_token (magic number) in the OPAL call
> 2) schedule a timer inside of OPAL to check on the async job
> 3) return from the opal call with OPAL_ASYNC_COMPLETION
> 4) check on the purge state in the timer's callback function
> 5) when the async job is done, return use opal_msg() to send an
> OPAL_MSG_ASYNC_COMP message.
> 
> The calling thread in linux can sleep while waiting for the async
> completion message. If you want an example have a look at
> core/i2c.c:opal_i2c_request() in skiboot and the
> drivers/i2c/busses/i2c-opal.c in linux.

Both Nick and Alexey have asked offline if this could just be made part of the 
NPU reset sequence. Technically I think it could be but needing to do it async 
complicates things. Oliver do you know if the existing PCI slot reset code has 
any mechanism to return something similar to an async completion? 

There was talk of having the NVIDIA driver call be able to call this sequence 
directly but doing it directly as part of the reset may mitigate that.

- Alistair
 
> > > > _______________________________________________
> > > > Skiboot mailing list
> > > > Skiboot@lists.ozlabs.org
> > > > https://lists.ozlabs.org/listinfo/skiboot
> 
> _______________________________________________
> Skiboot mailing list
> Skiboot@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot
Oliver Nov. 13, 2018, 8:24 a.m. | #13
On Tue, Nov 13, 2018 at 4:28 PM Alistair Popple <alistair@popple.id.au> wrote:
>
> >
> > An async completion might make more sense here. They're a little
> > convoluted but the basic process is:
> >
> > 1) pass an async_token (magic number) in the OPAL call
> > 2) schedule a timer inside of OPAL to check on the async job
> > 3) return from the opal call with OPAL_ASYNC_COMPLETION
> > 4) check on the purge state in the timer's callback function
> > 5) when the async job is done, return use opal_msg() to send an
> > OPAL_MSG_ASYNC_COMP message.
> >
> > The calling thread in linux can sleep while waiting for the async
> > completion message. If you want an example have a look at
> > core/i2c.c:opal_i2c_request() in skiboot and the
> > drivers/i2c/busses/i2c-opal.c in linux.
>
> Both Nick and Alexey have asked offline if this could just be made part of the
> NPU reset sequence. Technically I think it could be but needing to do it async
> complicates things. Oliver do you know if the existing PCI slot reset code has
> any mechanism to return something similar to an async completion?
>
> There was talk of having the NVIDIA driver call be able to call this sequence
> directly but doing it directly as part of the reset may mitigate that.

You should be able to put it into the creset function for the npu2's
virtual PHB. For normal PHBs creset is a fairly long operation since
it requires asserting PERST for half a second or something. However
the interface for doing that is slightly different (not sure why,
might be a OPALv1 holdover). Rather than returning a async token it
returns a wait time (in ms) and expects you to call OPAL_PCI_POLL
repeatedly until you get OPAL_SUCCESS, at which point the reset is
done.

I'd say it should be fairly straightforward to do the purge in the
npu2 creset(). If the current NPU driver in linux expects to get
OPAL_SUCCESS when it does an NPU reset you might be screwed though.

>
> - Alistair
>
> > > > > _______________________________________________
> > > > > Skiboot mailing list
> > > > > Skiboot@lists.ozlabs.org
> > > > > https://lists.ozlabs.org/listinfo/skiboot
> >
> > _______________________________________________
> > Skiboot mailing list
> > Skiboot@lists.ozlabs.org
> > https://lists.ozlabs.org/listinfo/skiboot
>
>
Alistair Popple Nov. 14, 2018, 3:32 a.m. | #14
On Tuesday, 13 November 2018 7:24:28 PM AEDT Oliver wrote:
> On Tue, Nov 13, 2018 at 4:28 PM Alistair Popple <alistair@popple.id.au> 
wrote:
> > > An async completion might make more sense here. They're a little
> > > convoluted but the basic process is:
> > > 
> > > 1) pass an async_token (magic number) in the OPAL call
> > > 2) schedule a timer inside of OPAL to check on the async job
> > > 3) return from the opal call with OPAL_ASYNC_COMPLETION
> > > 4) check on the purge state in the timer's callback function
> > > 5) when the async job is done, return use opal_msg() to send an
> > > OPAL_MSG_ASYNC_COMP message.
> > > 
> > > The calling thread in linux can sleep while waiting for the async
> > > completion message. If you want an example have a look at
> > > core/i2c.c:opal_i2c_request() in skiboot and the
> > > drivers/i2c/busses/i2c-opal.c in linux.
> > 
> > Both Nick and Alexey have asked offline if this could just be made part of
> > the NPU reset sequence. Technically I think it could be but needing to do
> > it async complicates things. Oliver do you know if the existing PCI slot
> > reset code has any mechanism to return something similar to an async
> > completion?
> > 
> > There was talk of having the NVIDIA driver call be able to call this
> > sequence directly but doing it directly as part of the reset may mitigate
> > that.
> You should be able to put it into the creset function for the npu2's
> virtual PHB. For normal PHBs creset is a fairly long operation since
> it requires asserting PERST for half a second or something. However
> the interface for doing that is slightly different (not sure why,
> might be a OPALv1 holdover). Rather than returning a async token it
> returns a wait time (in ms) and expects you to call OPAL_PCI_POLL
> repeatedly until you get OPAL_SUCCESS, at which point the reset is
> done.

Thanks Oliver, makes sense.

> I'd say it should be fairly straightforward to do the purge in the
> npu2 creset(). If the current NPU driver in linux expects to get
> OPAL_SUCCESS when it does an NPU reset you might be screwed though.

We don't do anything special here in Linux for the NPU so I'd assume it would 
work as well as it does for everything else. Alexey you were hooking up these 
resets for pass-through do you know if returning OPAL_PCI_POLL would work in 
your call paths?

- Alistair

> > 
> > > > > > _______________________________________________
> > > > > > Skiboot mailing list
> > > > > > Skiboot@lists.ozlabs.org
> > > > > > https://lists.ozlabs.org/listinfo/skiboot
> > > 
> > > _______________________________________________
> > > Skiboot mailing list
> > > Skiboot@lists.ozlabs.org
> > > https://lists.ozlabs.org/listinfo/skiboot
> 
> _______________________________________________
> Skiboot mailing list
> Skiboot@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot
Alexey Kardashevskiy Nov. 14, 2018, 6:35 a.m. | #15
On 14/11/2018 14:32, Alistair Popple wrote:
> On Tuesday, 13 November 2018 7:24:28 PM AEDT Oliver wrote:
>> On Tue, Nov 13, 2018 at 4:28 PM Alistair Popple <alistair@popple.id.au> 
> wrote:
>>>> An async completion might make more sense here. They're a little
>>>> convoluted but the basic process is:
>>>>
>>>> 1) pass an async_token (magic number) in the OPAL call
>>>> 2) schedule a timer inside of OPAL to check on the async job
>>>> 3) return from the opal call with OPAL_ASYNC_COMPLETION
>>>> 4) check on the purge state in the timer's callback function
>>>> 5) when the async job is done, return use opal_msg() to send an
>>>> OPAL_MSG_ASYNC_COMP message.
>>>>
>>>> The calling thread in linux can sleep while waiting for the async
>>>> completion message. If you want an example have a look at
>>>> core/i2c.c:opal_i2c_request() in skiboot and the
>>>> drivers/i2c/busses/i2c-opal.c in linux.
>>>
>>> Both Nick and Alexey have asked offline if this could just be made part of
>>> the NPU reset sequence. Technically I think it could be but needing to do
>>> it async complicates things. Oliver do you know if the existing PCI slot
>>> reset code has any mechanism to return something similar to an async
>>> completion?
>>>
>>> There was talk of having the NVIDIA driver call be able to call this
>>> sequence directly but doing it directly as part of the reset may mitigate
>>> that.
>> You should be able to put it into the creset function for the npu2's
>> virtual PHB. For normal PHBs creset is a fairly long operation since
>> it requires asserting PERST for half a second or something. However
>> the interface for doing that is slightly different (not sure why,
>> might be a OPALv1 holdover). Rather than returning a async token it
>> returns a wait time (in ms) and expects you to call OPAL_PCI_POLL
>> repeatedly until you get OPAL_SUCCESS, at which point the reset is
>> done.
> 
> Thanks Oliver, makes sense.
> 
>> I'd say it should be fairly straightforward to do the purge in the
>> npu2 creset(). If the current NPU driver in linux expects to get
>> OPAL_SUCCESS when it does an NPU reset you might be screwed though.
> 
> We don't do anything special here in Linux for the NPU so I'd assume it would 
> work as well as it does for everything else. Alexey you were hooking up these 
> resets for pass-through do you know if returning OPAL_PCI_POLL would work in 
> your call paths?

GPUs are reset from pnv_pci_reset_secondary_bus()+pnv_eeh_bridge_reset()
which does wait for opal_pci_poll(); in skiboot it is phb4_hreset() and
this could be the place for the cache purge thingy but I cannot easily
figure out this state machine in skiboot :-/

NPUs are reset from pcie_flr() and wait 100ms (then linux waits for
PCI_COMMAND!=0xffff but this happens immediately).
Alistair Popple Nov. 14, 2018, 7:16 a.m. | #16
> >> I'd say it should be fairly straightforward to do the purge in the
> >> npu2 creset(). If the current NPU driver in linux expects to get
> >> OPAL_SUCCESS when it does an NPU reset you might be screwed though.
> > 
> > We don't do anything special here in Linux for the NPU so I'd assume it
> > would work as well as it does for everything else. Alexey you were
> > hooking up these resets for pass-through do you know if returning
> > OPAL_PCI_POLL would work in your call paths?
> 
> GPUs are reset from pnv_pci_reset_secondary_bus()+pnv_eeh_bridge_reset()
> which does wait for opal_pci_poll(); in skiboot it is phb4_hreset() and
> this could be the place for the cache purge thingy but I cannot easily
> figure out this state machine in skiboot :-/

Nobody can :-)

Anyway it looks like that deals with a return code of OPAL_PCI_POLL so we 
should be ok to do the cache purge as part of the reset.  I guess knowing this 
I don't feel too strongly either way with respect to implementing this as a 
standalone opal call vs. part of the GPU reset sequence. Alexey/Nick did 
either of you have a preference?

- Alistair
 
> NPUs are reset from pcie_flr() and wait 100ms (then linux waits for
> PCI_COMMAND!=0xffff but this happens immediately).
Nicholas Piggin Nov. 14, 2018, 10:13 a.m. | #17
On Wed, 14 Nov 2018 18:16:10 +1100
Alistair Popple <alistair@popple.id.au> wrote:

> > >> I'd say it should be fairly straightforward to do the purge in the
> > >> npu2 creset(). If the current NPU driver in linux expects to get
> > >> OPAL_SUCCESS when it does an NPU reset you might be screwed though.  
> > > 
> > > We don't do anything special here in Linux for the NPU so I'd assume it
> > > would work as well as it does for everything else. Alexey you were
> > > hooking up these resets for pass-through do you know if returning
> > > OPAL_PCI_POLL would work in your call paths?  
> > 
> > GPUs are reset from pnv_pci_reset_secondary_bus()+pnv_eeh_bridge_reset()
> > which does wait for opal_pci_poll(); in skiboot it is phb4_hreset() and
> > this could be the place for the cache purge thingy but I cannot easily
> > figure out this state machine in skiboot :-/  
> 
> Nobody can :-)
> 
> Anyway it looks like that deals with a return code of OPAL_PCI_POLL so we 
> should be ok to do the cache purge as part of the reset.  I guess knowing this 
> I don't feel too strongly either way with respect to implementing this as a 
> standalone opal call vs. part of the GPU reset sequence. Alexey/Nick did 
> either of you have a preference?

I would like to do it as part of the reset for this case, if
that doesn't cause anybody a whole lot more work.

I think having facility to flush caches and various other hardware
knobs is neat in its own right for testing and possibly debugging
or even workarounds. So I wouldn't say no to these types of OPAL
calls, but I feel like as a GPU memory coherency correctness step,
it fits in the GPU reset.

As the person not writing any code or using the APIs, I'm happy for
Rashmica and others to decide though :)

Thanks,
Nick

Patch

diff --git a/core/cpu.c b/core/cpu.c
index 4f518a4c..bc4fcaf8 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -1630,3 +1630,111 @@  static int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr)
 	return rc;
 }
 opal_call(OPAL_NMMU_SET_PTCR, opal_nmmu_set_ptcr, 2);
+
+
+#define L2_PRD_PURGE_CMD_REG 0x1080E
+#define L2_PRD_PURGE_CMD_REG_BUSY 0x0040000000000000
+#define L2_PRD_PURGE_CMD_TRIGGER 0x8000000000000000
+#define L3_PRD_PURGE_REG 0x1180E
+#define L3_PRD_PURGE_REQ 0x8000000000000000
+#define TIMEOUT_MS 2
+
+static inline bool time_expired(unsigned long start)
+{
+	unsigned long time = tb_to_msecs(mftb());
+
+	if (time - start > TIMEOUT_MS) {
+		return true;
+	}
+	return false;
+}
+
+static int flush_l2_caches(uint32_t chip_id, uint32_t core_id)
+{
+	int rc, timeout = 0;
+	unsigned long start_time;
+	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
+	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
+
+	rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
+			      L2_PRD_PURGE_CMD_TRIGGER);
+	if (rc) {
+		prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write_mask failed %i\n", core_id, rc);
+	}
+	start_time = tb_to_msecs(mftb());
+	while ((val & L2_PRD_PURGE_CMD_REG_BUSY) && !(timeout = time_expired(start_time))) {
+		rc = xscom_read(chip_id, addr, &val);
+		if (rc) {
+			prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM read failed %i\n", core_id, rc);
+			break;
+		}
+	}
+	if (timeout) {
+		prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n", core_id, rc);
+		return OPAL_BUSY;
+	}
+
+	/* We have to clear the trigger bit ourselves */
+	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
+	rc = xscom_write(chip_id, addr, val);
+	if (rc)
+		prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write failed %i\n", core_id, rc);
+	return 0;
+
+}
+
+static int flush_l3_caches(uint32_t chip_id, uint32_t core_id)
+{
+	int rc, timeout = 0;
+	unsigned long start_time;
+	uint64_t val = L3_PRD_PURGE_REQ;
+	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
+
+	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ, L3_PRD_PURGE_REQ);
+	if (rc) {
+		prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM write_mask failed %i\n", core_id, rc);
+	}
+
+	/* Trigger bit is automatically set to zero when flushing is done*/
+	start_time = tb_to_msecs(mftb());
+	while ((val & L3_PRD_PURGE_REQ) && !(timeout = time_expired(start_time) )) {
+		rc = xscom_read(chip_id, addr, &val);
+		if (rc) {
+			prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM read failed %i\n", core_id, rc);
+			break;
+		}
+	}
+	if (timeout) {
+		prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n", core_id, rc);
+		return OPAL_BUSY;
+	}
+
+	return 0;
+}
+
+int flush_caches(void)
+{
+	int rc = 0;
+	struct cpu_thread *t;
+	uint64_t chip_id, core_id, prev_core_id = 0xdeadbeef;
+
+	if ((mfspr(SPR_PVR) & PVR_TYPE_P9) != PVR_TYPE_P9)
+		return OPAL_UNSUPPORTED;
+
+	for_each_cpu(t) {
+		/* Only need to do it once per core chiplet */
+		core_id = pir_to_core_id(t->pir);
+		if (prev_core_id == core_id)
+			continue;
+		prev_core_id = core_id;
+		chip_id = t->chip_id;
+
+		rc |= flush_l2_caches(chip_id, core_id);
+		rc |= flush_l3_caches(chip_id, core_id);
+	}
+
+	return rc;
+}
+
+
+opal_call(OPAL_CLEAR_CACHE, flush_caches, 0);
diff --git a/include/cpu.h b/include/cpu.h
index 2fe47982..04c862c5 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -329,4 +329,6 @@  int dctl_set_special_wakeup(struct cpu_thread *t);
 int dctl_clear_special_wakeup(struct cpu_thread *t);
 int dctl_core_is_gated(struct cpu_thread *t);
 
+extern int flush_caches(void);
+
 #endif /* __CPU_H */
diff --git a/include/opal-api.h b/include/opal-api.h
index 5f397c8e..c24838d2 100644
--- a/include/opal-api.h
+++ b/include/opal-api.h
@@ -226,7 +226,8 @@ 
 #define OPAL_NX_COPROC_INIT			167
 #define OPAL_NPU_SET_RELAXED_ORDER		168
 #define OPAL_NPU_GET_RELAXED_ORDER		169
-#define OPAL_LAST				169
+#define OPAL_CLEAR_CACHE			170
+#define OPAL_LAST				170
 
 #define QUIESCE_HOLD			1 /* Spin all calls at entry */
 #define QUIESCE_REJECT			2 /* Fail all calls with OPAL_BUSY */