diff mbox series

opal-prd: Have a worker process handle page offlining

Message ID 20200916004150.2312623-1-oohall@gmail.com
State New
Headers show
Series opal-prd: Have a worker process handle page offlining | expand

Commit Message

Oliver O'Halloran Sept. 16, 2020, 12:41 a.m. UTC
The memory_error() hservice interface expects the memory_error() call to
just accept the offline request and return without actually offlining the
memory. Currently we will attempt to offline the marked pages before
returning to HBRT which can result in an excessively long time spent in the
memory_error() hservice call which blocks HBRT from processing other
errors. Fix this by adding a worker process which performs the page
offlining via the sysfs memory error interfaces.

Cc: skiboot-stable@lists.ozlabs.org
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 external/opal-prd/opal-prd.c | 85 ++++++++++++++++++++++++++++--------
 1 file changed, 66 insertions(+), 19 deletions(-)

Comments

Mahesh J Salgaonkar Sept. 17, 2020, 5:36 a.m. UTC | #1
On 2020-09-16 10:41:50 Wed, Oliver O'Halloran wrote:
> The memory_error() hservice interface expects the memory_error() call to
> just accept the offline request and return without actually offlining the
> memory. Currently we will attempt to offline the marked pages before
> returning to HBRT which can result in an excessively long time spent in the
> memory_error() hservice call which blocks HBRT from processing other
> errors. Fix this by adding a worker process which performs the page
> offlining via the sysfs memory error interfaces.
> 
> Cc: skiboot-stable@lists.ozlabs.org
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
>  external/opal-prd/opal-prd.c | 85 ++++++++++++++++++++++++++++--------
>  1 file changed, 66 insertions(+), 19 deletions(-)
> 
> diff --git a/external/opal-prd/opal-prd.c b/external/opal-prd/opal-prd.c
> index 33ea5f5a8f6e..f2861611fe7a 100644
> --- a/external/opal-prd/opal-prd.c
> +++ b/external/opal-prd/opal-prd.c
> @@ -41,6 +41,7 @@
>  #include <sys/time.h>
>  #include <sys/socket.h>
>  #include <sys/un.h>
> +#include <sys/wait.h>
>  
>  #include <linux/ipmi.h>
>  #include <linux/limits.h>
> @@ -701,13 +702,40 @@ out:
>  	return rc;
>  }
>  
[...]
> @@ -727,26 +755,21 @@ int hservice_memory_error(uint64_t i_start_addr, uint64_t i_endAddr,
>  	pr_log(LOG_ERR, "MEM: Memory error: range %016lx-%016lx, type: %s",
>  			i_start_addr, i_endAddr, typestr);
>  
> +	/*
> +	 * HBRT expects the memory offlining process to happen in the background
> +	 * after the notification is delivered.
> +	 */
> +	pid = fork();
> +	if (pid > 0)
> +		exit(memory_error_worker(sysfsfile, typestr, i_start_addr, i_endAddr));
>  
> -	memfd = open(sysfsfile, O_WRONLY);
> -	if (memfd < 0) {
> -		pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
> -				"Unable to open sysfs node %s: %m", sysfsfile);
> +	if (pid < 0) {
> +		perror("MEM: unable to fork worker to offline memory!\n");
>  		return -1;
>  	}
>  
> -	for (addr = i_start_addr; addr <= i_endAddr; addr += ctx->page_size) {
> -		n = snprintf(buf, ADDR_STRING_SZ, "0x%lx", addr);
> -		rc = write(memfd, buf, n);
> -		if (rc != n) {
> -			pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
> -					"page addr: %016lx type: %d: %m",
> -				addr, i_errorType);
> -			ret = rc;
> -		}
> -	}
> -
> -	return ret;
> +	pr_log(LOG_INFO, "MEM: forked off %d to handle mem error\n", pid);
> +	return 0;
>  }
>  
>  uint64_t hservice_get_interface_capabilities(uint64_t set)
> @@ -2003,6 +2026,13 @@ out_send:
>  		free(send_msg);
>  }
>  
> +volatile bool worker_terminated;
> +
> +void signchild_handler(int sig)
> +{
> +	worker_terminated = true;
> +}

Should this handler be registered to catch SIGCHILD ? I don't
see it is being registered.

> +
>  static int run_attn_loop(struct opal_prd_ctx *ctx)
>  {
>  	struct pollfd pollfds[2];
> @@ -2049,6 +2079,23 @@ static int run_attn_loop(struct opal_prd_ctx *ctx)
>  		process_msgq(ctx);
>  
>  		rc = poll(pollfds, 2, -1);
> +
> +		if (worker_terminated) {
> +			pid_t pid;
> +
> +			worker_terminated = false;
> +			do {
> +				pid = waitpid(-1, NULL, WNOHANG);
> +				if (pid > 0) {
> +					pr_log(LOG_DEBUG, "reaped %d\n", pid);
> +				} else if (rc == -1 && errno != ECHILD) {

Shouldn't this be if (pid == -1 && ... ?

> +					pr_log(LOG_ERR, "error %m while reaping\n");
> +					break;
> +				}
> +			} while (pid > 0);
> +
> +			continue;
> +		}
>  		if (rc < 0) {
>  			pr_log(LOG_ERR, "FW: event poll failed: %m");
>  			exit(EXIT_FAILURE);

Rest looks good to me.

Thanks,
-Mahesh.

> -- 
> 2.26.2
> 
> _______________________________________________
> Skiboot mailing list
> Skiboot@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot
Oliver O'Halloran Sept. 17, 2020, 6:20 a.m. UTC | #2
On Thu, Sep 17, 2020 at 3:36 PM Mahesh J Salgaonkar
<mahesh@linux.ibm.com> wrote:
>
> > +volatile bool worker_terminated;
> > +
> > +void signchild_handler(int sig)
> > +{
> > +     worker_terminated = true;
> > +}
>
> Should this handler be registered to catch SIGCHILD ? I don't
> see it is being registered.

Yeah looks like the signal registration went AWOL. I remember writing
the code to do it at some point so I must have screwed up while
rebasing.

> >  static int run_attn_loop(struct opal_prd_ctx *ctx)
> >  {
> >       struct pollfd pollfds[2];
> > @@ -2049,6 +2079,23 @@ static int run_attn_loop(struct opal_prd_ctx *ctx)
> >               process_msgq(ctx);
> >
> >               rc = poll(pollfds, 2, -1);
> > +
> > +             if (worker_terminated) {
> > +                     pid_t pid;
> > +
> > +                     worker_terminated = false;
> > +                     do {
> > +                             pid = waitpid(-1, NULL, WNOHANG);
> > +                             if (pid > 0) {
> > +                                     pr_log(LOG_DEBUG, "reaped %d\n", pid);
> > +                             } else if (rc == -1 && errno != ECHILD) {
>
> Shouldn't this be if (pid == -1 && ... ?

Yep. Although, it might actually work anyway since rc would be -1 due
to poll() being interrupted by a signal. Probably best not to rely on
that though :)

Oliver
diff mbox series

Patch

diff --git a/external/opal-prd/opal-prd.c b/external/opal-prd/opal-prd.c
index 33ea5f5a8f6e..f2861611fe7a 100644
--- a/external/opal-prd/opal-prd.c
+++ b/external/opal-prd/opal-prd.c
@@ -41,6 +41,7 @@ 
 #include <sys/time.h>
 #include <sys/socket.h>
 #include <sys/un.h>
+#include <sys/wait.h>
 
 #include <linux/ipmi.h>
 #include <linux/limits.h>
@@ -701,13 +702,40 @@  out:
 	return rc;
 }
 
+static int memory_error_worker(const char *sysfsfile, const char *type,
+			       uint64_t i_start_addr, uint64_t i_endAddr)
+{
+	int memfd, rc, n, ret = 0;
+	char buf[ADDR_STRING_SZ];
+	uint64_t addr;
+
+	memfd = open(sysfsfile, O_WRONLY);
+	if (memfd < 0) {
+		pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
+				"Unable to open sysfs node %s: %m", sysfsfile);
+		return -1;
+	}
+
+	for (addr = i_start_addr; addr <= i_endAddr; addr += ctx->page_size) {
+		n = snprintf(buf, ADDR_STRING_SZ, "0x%lx", addr);
+		rc = write(memfd, buf, n);
+		if (rc != n) {
+			pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
+					"page addr: %016lx type: %s: %m",
+				addr, type);
+			ret = rc;
+		}
+	}
+
+	close(memfd);
+	return ret;
+}
+
 int hservice_memory_error(uint64_t i_start_addr, uint64_t i_endAddr,
 		enum MemoryError_t i_errorType)
 {
 	const char *sysfsfile, *typestr;
-	char buf[ADDR_STRING_SZ];
-	int memfd, rc, n, ret = 0;
-	uint64_t addr;
+	pid_t pid;
 
 	switch(i_errorType) {
 	case MEMORY_ERROR_CE:
@@ -727,26 +755,21 @@  int hservice_memory_error(uint64_t i_start_addr, uint64_t i_endAddr,
 	pr_log(LOG_ERR, "MEM: Memory error: range %016lx-%016lx, type: %s",
 			i_start_addr, i_endAddr, typestr);
 
+	/*
+	 * HBRT expects the memory offlining process to happen in the background
+	 * after the notification is delivered.
+	 */
+	pid = fork();
+	if (pid > 0)
+		exit(memory_error_worker(sysfsfile, typestr, i_start_addr, i_endAddr));
 
-	memfd = open(sysfsfile, O_WRONLY);
-	if (memfd < 0) {
-		pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
-				"Unable to open sysfs node %s: %m", sysfsfile);
+	if (pid < 0) {
+		perror("MEM: unable to fork worker to offline memory!\n");
 		return -1;
 	}
 
-	for (addr = i_start_addr; addr <= i_endAddr; addr += ctx->page_size) {
-		n = snprintf(buf, ADDR_STRING_SZ, "0x%lx", addr);
-		rc = write(memfd, buf, n);
-		if (rc != n) {
-			pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
-					"page addr: %016lx type: %d: %m",
-				addr, i_errorType);
-			ret = rc;
-		}
-	}
-
-	return ret;
+	pr_log(LOG_INFO, "MEM: forked off %d to handle mem error\n", pid);
+	return 0;
 }
 
 uint64_t hservice_get_interface_capabilities(uint64_t set)
@@ -2003,6 +2026,13 @@  out_send:
 		free(send_msg);
 }
 
+volatile bool worker_terminated;
+
+void signchild_handler(int sig)
+{
+	worker_terminated = true;
+}
+
 static int run_attn_loop(struct opal_prd_ctx *ctx)
 {
 	struct pollfd pollfds[2];
@@ -2049,6 +2079,23 @@  static int run_attn_loop(struct opal_prd_ctx *ctx)
 		process_msgq(ctx);
 
 		rc = poll(pollfds, 2, -1);
+
+		if (worker_terminated) {
+			pid_t pid;
+
+			worker_terminated = false;
+			do {
+				pid = waitpid(-1, NULL, WNOHANG);
+				if (pid > 0) {
+					pr_log(LOG_DEBUG, "reaped %d\n", pid);
+				} else if (rc == -1 && errno != ECHILD) {
+					pr_log(LOG_ERR, "error %m while reaping\n");
+					break;
+				}
+			} while (pid > 0);
+
+			continue;
+		}
 		if (rc < 0) {
 			pr_log(LOG_ERR, "FW: event poll failed: %m");
 			exit(EXIT_FAILURE);