diff mbox series

core/flash: Retry requests as necessary in flash_load_resource()

Message ID 20190325074558.22994-1-andrew@aj.id.au
State Accepted
Headers show
Series core/flash: Retry requests as necessary in flash_load_resource() | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch master (b392d785eb49630b9f00fef8d17944ed82b2c1fe)
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot success Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot-dco success Signed-off-by present

Commit Message

Andrew Jeffery March 25, 2019, 7:45 a.m. UTC
We would like to successfully boot if we have a dependency on the BMC
for flash even if the BMC is not currently ready to service flash
requests. On the assumption that it will become ready, retry for several
minutes to cover a BMC reboot cycle and *eventually* rather than
*immediately* crash out with:

    [  269.549748] reboot: Restarting system
    [  390.297462587,5] OPAL: Reboot request...
    [  390.297737995,5] RESET: Initiating fast reboot 1...
    [  391.074707590,5] Clearing unused memory:
    [  391.075198880,5] PCI: Clearing all devices...
    [  391.075201618,7] Clearing region 201ffe000000-201fff800000
    [  391.086235699,5] PCI: Resetting PHBs and training links...
    [  391.254089525,3] FFS: Error 17 reading flash header
    [  391.254159668,3] FLASH: Can't open ffs handle: 17
    [  392.307245135,5] PCI: Probing slots...
    [  392.363723191,5] PCI Summary:
    ...
    [  393.423255262,5] OCC: All Chip Rdy after 0 ms
    [  393.453092828,5] INIT: Starting kernel at 0x20000000, fdt at
    0x30800a88 390645 bytes
    [  393.453202605,0] FATAL: Kernel is zeros, can't execute!
    [  393.453247064,0] Assert fail: core/init.c:593:0
    [  393.453289682,0] Aborting!
    CPU 0040 Backtrace:
     S: 0000000031e03ca0 R: 000000003001af60   ._abort+0x4c
     S: 0000000031e03d20 R: 000000003001afdc   .assert_fail+0x34
     S: 0000000031e03da0 R: 00000000300146d8   .load_and_boot_kernel+0xb30
     S: 0000000031e03e70 R: 0000000030026cf0   .fast_reboot_entry+0x39c
     S: 0000000031e03f00 R: 0000000030002a4c   fast_reset_entry+0x2c
     --- OPAL boot ---

The OPAL flash API hooks directly into the blocklevel layer, so there's
no delay for e.g. the host kernel, just for asynchronously loaded
resources during boot.

Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
---
 core/flash.c | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

Comments

Stewart Smith March 29, 2019, 4:44 a.m. UTC | #1
Andrew Jeffery <andrew@aj.id.au> writes:
> We would like to successfully boot if we have a dependency on the BMC
> for flash even if the BMC is not currently ready to service flash
> requests. On the assumption that it will become ready, retry for several
> minutes to cover a BMC reboot cycle and *eventually* rather than
> *immediately* crash out with:
>
>     [  269.549748] reboot: Restarting system
>     [  390.297462587,5] OPAL: Reboot request...
>     [  390.297737995,5] RESET: Initiating fast reboot 1...
>     [  391.074707590,5] Clearing unused memory:
>     [  391.075198880,5] PCI: Clearing all devices...
>     [  391.075201618,7] Clearing region 201ffe000000-201fff800000
>     [  391.086235699,5] PCI: Resetting PHBs and training links...
>     [  391.254089525,3] FFS: Error 17 reading flash header
>     [  391.254159668,3] FLASH: Can't open ffs handle: 17
>     [  392.307245135,5] PCI: Probing slots...
>     [  392.363723191,5] PCI Summary:
>     ...
>     [  393.423255262,5] OCC: All Chip Rdy after 0 ms
>     [  393.453092828,5] INIT: Starting kernel at 0x20000000, fdt at
>     0x30800a88 390645 bytes
>     [  393.453202605,0] FATAL: Kernel is zeros, can't execute!
>     [  393.453247064,0] Assert fail: core/init.c:593:0
>     [  393.453289682,0] Aborting!
>     CPU 0040 Backtrace:
>      S: 0000000031e03ca0 R: 000000003001af60   ._abort+0x4c
>      S: 0000000031e03d20 R: 000000003001afdc   .assert_fail+0x34
>      S: 0000000031e03da0 R: 00000000300146d8   .load_and_boot_kernel+0xb30
>      S: 0000000031e03e70 R: 0000000030026cf0   .fast_reboot_entry+0x39c
>      S: 0000000031e03f00 R: 0000000030002a4c   fast_reset_entry+0x2c
>      --- OPAL boot ---
>
> The OPAL flash API hooks directly into the blocklevel layer, so there's
> no delay for e.g. the host kernel, just for asynchronously loaded
> resources during boot.

Merged to master as of cccf5d79de07844cf095b8f45146b92944d15c2e.

No doubt at some point somebody is going to go "huh" when running some
crazy simulator or broken machine, but I think I'm okay with that :)
diff mbox series

Patch

diff --git a/core/flash.c b/core/flash.c
index 5fae0f3f507d..1bfcb7fd9741 100644
--- a/core/flash.c
+++ b/core/flash.c
@@ -28,6 +28,7 @@ 
 #include <libstb/secureboot.h>
 #include <libstb/trustedboot.h>
 #include <elf.h>
+#include <timebase.h>
 
 struct flash {
 	struct list_node	list;
@@ -764,10 +765,18 @@  int flash_resource_loaded(enum resource_id id, uint32_t subid)
 	return rc;
 }
 
+/*
+ * Retry for 10 minutes in 5 second intervals: allow 5 minutes for a BMC reboot
+ * (need the BMC if we're using HIOMAP flash access), then 2x for some margin.
+ */
+#define FLASH_LOAD_WAIT_MS	5000
+#define FLASH_LOAD_RETRIES	(2 * 5 * (60 / (FLASH_LOAD_WAIT_MS / 1000)))
+
 static void flash_load_resources(void *data __unused)
 {
 	struct flash_load_resource_item *r;
-	int result;
+	int retries = FLASH_LOAD_RETRIES;
+	int result = OPAL_RESOURCE;
 
 	lock(&flash_load_resource_lock);
 	do {
@@ -782,11 +791,31 @@  static void flash_load_resources(void *data __unused)
 		r->result = OPAL_BUSY;
 		unlock(&flash_load_resource_lock);
 
-		result = flash_load_resource(r->id, r->subid, r->buf, r->len);
+		while (retries) {
+			result = flash_load_resource(r->id, r->subid, r->buf,
+						     r->len);
+			if (result == OPAL_SUCCESS) {
+				retries = FLASH_LOAD_RETRIES;
+				break;
+			}
+
+			if (result != FLASH_ERR_AGAIN &&
+					result != FLASH_ERR_DEVICE_GONE)
+				break;
+
+			time_wait_ms(FLASH_LOAD_WAIT_MS);
+
+			retries--;
+
+			prlog(PR_WARNING,
+			      "FLASH: Retrying load of %d:%d, %d attempts remain\n",
+			      r->id, r->subid, retries);
+		}
 
 		lock(&flash_load_resource_lock);
 		r = list_pop(&flash_load_resource_queue,
 			     struct flash_load_resource_item, link);
+		/* Will reuse the result from when we hit retries == 0 */
 		r->result = result;
 		list_add_tail(&flash_loaded_resources, &r->link);
 	} while(true);