diff mbox series

[RFC] direct-controls: Use P8 sequence from workbook

Message ID 20190312003031.21367-1-stewart@linux.ibm.com
State New
Headers show
Series [RFC] direct-controls: Use P8 sequence from workbook | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch master (261ca8e779e5138869a45f174caa49be6a274501)
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot success Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot-dco success Signed-off-by present

Commit Message

Stewart Smith March 12, 2019, 12:30 a.m. UTC
From: Stewart Smith <stewart@linux.vnet.ibm.com>

When attempting to do SRESET under load, our existing implementation
fell down and would fail to SRESET some CPUs, causing the fast-reboot
to fail. This is reproduced by running the (new) op-test test
testcases.OpTestFastReboot.FastRebootHostStressTorture - and especially
so if you up the `stress` invocation to be 120 seconds.

See PR for op-test: https://github.com/open-power/op-test-framework/pull/437

Unfortunately, this patch itself isn't the whole story. It makes things
*better* but there's still a window. Mind you, with this patch I've
seen it pass 24 fast-reboots, which is around 22 more than previously.

The same problem has been observed with pdbg, see discussion there
https://lists.ozlabs.org/pipermail/pdbg/2019-March/001076.html

Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
---
 core/direct-controls.c | 44 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

Comments

Oliver O'Halloran March 12, 2019, 1:07 a.m. UTC | #1
On Tue, Mar 12, 2019 at 11:31 AM Stewart Smith <stewart@linux.ibm.com> wrote:
>
> From: Stewart Smith <stewart@linux.vnet.ibm.com>
>
> When attempting to do SRESET under load, our existing implementation
> fell down and would fail to SRESET some CPUs, causing the fast-reboot
> to fail. This is reproduced by running the (new) op-test test
> testcases.OpTestFastReboot.FastRebootHostStressTorture - and especially
> so if you up the `stress` invocation to be 120 seconds.
>
> See PR for op-test: https://github.com/open-power/op-test-framework/pull/437
>
> Unfortunately, this patch itself isn't the whole story. It makes things
> *better* but there's still a window. Mind you, with this patch I've
> seen it pass 24 fast-reboots, which is around 22 more than previously.

Damn, it's almost like fast-reboot is a dumb hack that should have
been opt-in to begin with ;)

>
> The same problem has been observed with pdbg, see discussion there
> https://lists.ozlabs.org/pipermail/pdbg/2019-March/001076.html
>
> Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
> ---
>  core/direct-controls.c | 44 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 42 insertions(+), 2 deletions(-)
>
> diff --git a/core/direct-controls.c b/core/direct-controls.c
> index 1d0f6818e2c8..8783a83a25ac 100644
> --- a/core/direct-controls.c
> +++ b/core/direct-controls.c
> @@ -1,4 +1,4 @@
> -/* Copyright 2017 IBM Corp.
> +/* Copyright 2017-2019 IBM Corp.
>   *
>   * Licensed under the Apache License, Version 2.0 (the "License");
>   * you may not use this file except in compliance with the License.
> @@ -59,6 +59,12 @@ static void mambo_stop_cpu(struct cpu_thread *cpu)
>  #define P8_DIRECT_CTL_STOP             PPC_BIT(63)
>  #define P8_DIRECT_CTL_PRENAP           PPC_BIT(47)
>  #define P8_DIRECT_CTL_SRESET           PPC_BIT(60)
> +#define P8_EX_TCTL_RAS_STATUS(t)       (0x10013002 + (t) * 0x10)
> +#define P8_RAS_STATUS_SRQ_EMPTY                PPC_BIT(8)
> +#define P8_RAS_STATUS_LSU_QUIESCED     PPC_BIT(9)
> +#define P8_RAS_STATUS_INST_COMPLETE    PPC_BIT(12)
> +#define P8_RAS_STATUS_THREAD_ACTIVE    PPC_BIT(48)
> +#define P8_RAS_STATUS_TS_QUIESCE       PPC_BIT(49)
>
>  static int p8_core_set_special_wakeup(struct cpu_thread *cpu)
>  {
> @@ -224,6 +230,9 @@ static int p8_stop_thread(struct cpu_thread *cpu)
>         uint32_t chip_id = pir_to_chip_id(cpu->pir);
>         uint32_t thread_id = pir_to_thread_id(cpu->pir);
>         uint32_t xscom_addr;
> +       uint64_t val;
> +       int i;
> +       int rc;
>
>         xscom_addr = XSCOM_ADDR_P8_EX(core_id,
>                                       P8_EX_TCTL_DIRECT_CONTROLS(thread_id));
> @@ -235,7 +244,38 @@ static int p8_stop_thread(struct cpu_thread *cpu)
>                 return OPAL_HARDWARE;
>         }
>
> -       return OPAL_SUCCESS;
> +       xscom_addr = XSCOM_ADDR_P8_EX(core_id,
> +                                     P8_EX_TCTL_RAS_STATUS(thread_id));
> +       for (i=0; i < 1000; i++) {
> +               rc = xscom_read(chip_id, xscom_addr, &val);
> +               if (rc) {
> +                       prlog(PR_ERR, "Could not check state of thread "
> +                             "%u:%u:%u\n", chip_id, core_id, thread_id);
> +               return OPAL_HARDWARE;
> +               }
> +               prlog(PR_INFO, "RAS_STATUS for %u:%u:%u = 0x%llx\n",
> +                     chip_id, core_id, thread_id, val);
If this needs to be PR_INFO then you should move it outside the loop
to where the final status is checked. Doing 1000s retries and printing
the status at PR_INFO on each retry is going to be a little spammy.

> +               if (val & P8_RAS_STATUS_INST_COMPLETE)
> +                       break;
> +               time_wait_ms(10);

1000 * 10e-3 = 10s timeout, seems a bit excessive?


> +       }
> +       if (!(val & P8_RAS_STATUS_INST_COMPLETE))
> +               return OPAL_HARDWARE;
> +
> +       for (i=0; i < 1000; i++) {
> +               rc = xscom_read(chip_id, xscom_addr, &val);
> +               if (rc) {
> +                       prlog(PR_ERR, "Could not check state of thread "
> +                             "%u:%u:%u\n", chip_id, core_id, thread_id);
> +               return OPAL_HARDWARE;
> +               }
> +               prlog(PR_INFO, "RAS_STATUS for %u:%u:%u = 0x%llx\n",
> +                     chip_id, core_id, thread_id, val);
> +               if (val & P8_RAS_STATUS_TS_QUIESCE)
> +                       return OPAL_SUCCESS;
> +               time_wait_ms(10);
> +       }
> +       return OPAL_HARDWARE;
>  }

It might be worth moving the retry logic out of p8_stop_thread() and
doing it at a higher level so we're not blocking on each thread.

>
>  static int p8_sreset_thread(struct cpu_thread *cpu)
> --
> 2.20.1
>
> _______________________________________________
> Skiboot mailing list
> Skiboot@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot
Stewart Smith March 12, 2019, 4:01 a.m. UTC | #2
Oliver <oohall@gmail.com> writes:
> On Tue, Mar 12, 2019 at 11:31 AM Stewart Smith <stewart@linux.ibm.com> wrote:
>>
>> From: Stewart Smith <stewart@linux.vnet.ibm.com>
>>
>> When attempting to do SRESET under load, our existing implementation
>> fell down and would fail to SRESET some CPUs, causing the fast-reboot
>> to fail. This is reproduced by running the (new) op-test test
>> testcases.OpTestFastReboot.FastRebootHostStressTorture - and especially
>> so if you up the `stress` invocation to be 120 seconds.
>>
>> See PR for op-test: https://github.com/open-power/op-test-framework/pull/437
>>
>> Unfortunately, this patch itself isn't the whole story. It makes things
>> *better* but there's still a window. Mind you, with this patch I've
>> seen it pass 24 fast-reboots, which is around 22 more than previously.
>
> Damn, it's almost like fast-reboot is a dumb hack that should have
> been opt-in to begin with ;)

:P

>> The same problem has been observed with pdbg, see discussion there
>> https://lists.ozlabs.org/pipermail/pdbg/2019-March/001076.html
>>
>> Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
>> ---
>>  core/direct-controls.c | 44 ++++++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 42 insertions(+), 2 deletions(-)
>>
>> diff --git a/core/direct-controls.c b/core/direct-controls.c
>> index 1d0f6818e2c8..8783a83a25ac 100644
>> --- a/core/direct-controls.c
>> +++ b/core/direct-controls.c
>> @@ -1,4 +1,4 @@
>> -/* Copyright 2017 IBM Corp.
>> +/* Copyright 2017-2019 IBM Corp.
>>   *
>>   * Licensed under the Apache License, Version 2.0 (the "License");
>>   * you may not use this file except in compliance with the License.
>> @@ -59,6 +59,12 @@ static void mambo_stop_cpu(struct cpu_thread *cpu)
>>  #define P8_DIRECT_CTL_STOP             PPC_BIT(63)
>>  #define P8_DIRECT_CTL_PRENAP           PPC_BIT(47)
>>  #define P8_DIRECT_CTL_SRESET           PPC_BIT(60)
>> +#define P8_EX_TCTL_RAS_STATUS(t)       (0x10013002 + (t) * 0x10)
>> +#define P8_RAS_STATUS_SRQ_EMPTY                PPC_BIT(8)
>> +#define P8_RAS_STATUS_LSU_QUIESCED     PPC_BIT(9)
>> +#define P8_RAS_STATUS_INST_COMPLETE    PPC_BIT(12)
>> +#define P8_RAS_STATUS_THREAD_ACTIVE    PPC_BIT(48)
>> +#define P8_RAS_STATUS_TS_QUIESCE       PPC_BIT(49)
>>
>>  static int p8_core_set_special_wakeup(struct cpu_thread *cpu)
>>  {
>> @@ -224,6 +230,9 @@ static int p8_stop_thread(struct cpu_thread *cpu)
>>         uint32_t chip_id = pir_to_chip_id(cpu->pir);
>>         uint32_t thread_id = pir_to_thread_id(cpu->pir);
>>         uint32_t xscom_addr;
>> +       uint64_t val;
>> +       int i;
>> +       int rc;
>>
>>         xscom_addr = XSCOM_ADDR_P8_EX(core_id,
>>                                       P8_EX_TCTL_DIRECT_CONTROLS(thread_id));
>> @@ -235,7 +244,38 @@ static int p8_stop_thread(struct cpu_thread *cpu)
>>                 return OPAL_HARDWARE;
>>         }
>>
>> -       return OPAL_SUCCESS;
>> +       xscom_addr = XSCOM_ADDR_P8_EX(core_id,
>> +                                     P8_EX_TCTL_RAS_STATUS(thread_id));
>> +       for (i=0; i < 1000; i++) {
>> +               rc = xscom_read(chip_id, xscom_addr, &val);
>> +               if (rc) {
>> +                       prlog(PR_ERR, "Could not check state of thread "
>> +                             "%u:%u:%u\n", chip_id, core_id, thread_id);
>> +               return OPAL_HARDWARE;
>> +               }
>> +               prlog(PR_INFO, "RAS_STATUS for %u:%u:%u = 0x%llx\n",
>> +                     chip_id, core_id, thread_id, val);
> If this needs to be PR_INFO then you should move it outside the loop
> to where the final status is checked. Doing 1000s retries and printing
> the status at PR_INFO on each retry is going to be a little spammy.

Totally agree. I possibly should have labeled WIP HACK rather than RFC :)

>
>> +               if (val & P8_RAS_STATUS_INST_COMPLETE)
>> +                       break;
>> +               time_wait_ms(10);
>
> 1000 * 10e-3 = 10s timeout, seems a bit excessive?

yeah, *really* excessive. So far, it seems it's really quick, so tempted
to just to 1ms delay and 100 iterations, if it's not done by then,
something is just boned. I honestly don't see it taking long at all, but
this is part of my Ultra Paranoid Assert All The Things work...

>> +       }
>> +       if (!(val & P8_RAS_STATUS_INST_COMPLETE))
>> +               return OPAL_HARDWARE;
>> +
>> +       for (i=0; i < 1000; i++) {
>> +               rc = xscom_read(chip_id, xscom_addr, &val);
>> +               if (rc) {
>> +                       prlog(PR_ERR, "Could not check state of thread "
>> +                             "%u:%u:%u\n", chip_id, core_id, thread_id);
>> +               return OPAL_HARDWARE;
>> +               }
>> +               prlog(PR_INFO, "RAS_STATUS for %u:%u:%u = 0x%llx\n",
>> +                     chip_id, core_id, thread_id, val);
>> +               if (val & P8_RAS_STATUS_TS_QUIESCE)
>> +                       return OPAL_SUCCESS;
>> +               time_wait_ms(10);
>> +       }
>> +       return OPAL_HARDWARE;
>>  }
>
> It might be worth moving the retry logic out of p8_stop_thread() and
> doing it at a higher level so we're not blocking on each thread.

you might be right there.
diff mbox series

Patch

diff --git a/core/direct-controls.c b/core/direct-controls.c
index 1d0f6818e2c8..8783a83a25ac 100644
--- a/core/direct-controls.c
+++ b/core/direct-controls.c
@@ -1,4 +1,4 @@ 
-/* Copyright 2017 IBM Corp.
+/* Copyright 2017-2019 IBM Corp.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -59,6 +59,12 @@  static void mambo_stop_cpu(struct cpu_thread *cpu)
 #define P8_DIRECT_CTL_STOP		PPC_BIT(63)
 #define P8_DIRECT_CTL_PRENAP		PPC_BIT(47)
 #define P8_DIRECT_CTL_SRESET		PPC_BIT(60)
+#define P8_EX_TCTL_RAS_STATUS(t)	(0x10013002 + (t) * 0x10)
+#define P8_RAS_STATUS_SRQ_EMPTY		PPC_BIT(8)
+#define P8_RAS_STATUS_LSU_QUIESCED	PPC_BIT(9)
+#define P8_RAS_STATUS_INST_COMPLETE	PPC_BIT(12)
+#define P8_RAS_STATUS_THREAD_ACTIVE	PPC_BIT(48)
+#define P8_RAS_STATUS_TS_QUIESCE	PPC_BIT(49)
 
 static int p8_core_set_special_wakeup(struct cpu_thread *cpu)
 {
@@ -224,6 +230,9 @@  static int p8_stop_thread(struct cpu_thread *cpu)
 	uint32_t chip_id = pir_to_chip_id(cpu->pir);
 	uint32_t thread_id = pir_to_thread_id(cpu->pir);
 	uint32_t xscom_addr;
+	uint64_t val;
+	int i;
+	int rc;
 
 	xscom_addr = XSCOM_ADDR_P8_EX(core_id,
 				      P8_EX_TCTL_DIRECT_CONTROLS(thread_id));
@@ -235,7 +244,38 @@  static int p8_stop_thread(struct cpu_thread *cpu)
 		return OPAL_HARDWARE;
 	}
 
-	return OPAL_SUCCESS;
+	xscom_addr = XSCOM_ADDR_P8_EX(core_id,
+				      P8_EX_TCTL_RAS_STATUS(thread_id));
+	for (i=0; i < 1000; i++) {
+		rc = xscom_read(chip_id, xscom_addr, &val);
+		if (rc) {
+			prlog(PR_ERR, "Could not check state of thread "
+			      "%u:%u:%u\n", chip_id, core_id, thread_id);
+		return OPAL_HARDWARE;
+		}
+		prlog(PR_INFO, "RAS_STATUS for %u:%u:%u = 0x%llx\n",
+		      chip_id, core_id, thread_id, val);
+		if (val & P8_RAS_STATUS_INST_COMPLETE)
+			break;
+		time_wait_ms(10);
+	}
+	if (!(val & P8_RAS_STATUS_INST_COMPLETE))
+		return OPAL_HARDWARE;
+
+	for (i=0; i < 1000; i++) {
+		rc = xscom_read(chip_id, xscom_addr, &val);
+		if (rc) {
+			prlog(PR_ERR, "Could not check state of thread "
+			      "%u:%u:%u\n", chip_id, core_id, thread_id);
+		return OPAL_HARDWARE;
+		}
+		prlog(PR_INFO, "RAS_STATUS for %u:%u:%u = 0x%llx\n",
+		      chip_id, core_id, thread_id, val);
+		if (val & P8_RAS_STATUS_TS_QUIESCE)
+			return OPAL_SUCCESS;
+		time_wait_ms(10);
+	}
+	return OPAL_HARDWARE;
 }
 
 static int p8_sreset_thread(struct cpu_thread *cpu)