diff mbox series

[v2,2/2] uart: Drop console write data if BMC becomes unresponsive

Message ID 20200521154948.13760-2-hegdevasant@linux.vnet.ibm.com
State Accepted
Headers show
Series [v2,1/2] xscom: Do not wait indefinitely for xscom operation to complete | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch master (0f1937ef40fca0c3212a9dff1010b832a24fb063)
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot success Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot-dco success Signed-off-by present

Commit Message

Vasant Hegde May 21, 2020, 3:49 p.m. UTC
If BMC becomes unresponsive (ex: during BMC reboot) during console write
then we may get stuck in uart_wait_tx_room(). This will result in CPU
to get stuck in OPAL. This will result in kernel lockups and in some
cases host becomes unresponsive.

This patch introduces timeout option. If UART operation doesn't complete
within predefined time then it will drop write data and comes out.

Note that this patch fixes both OPAL internal console as well as
console write APIs.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[Various fixes on top of Nick's proposal to have single timer - Vasant]
Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
---
Nick,
  I have added your s-o-b. Hope you are fine with that.

-Vasant

 hw/lpc-uart.c | 100 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 74 insertions(+), 26 deletions(-)

Comments

Oliver O'Halloran June 3, 2020, 7:07 a.m. UTC | #1
On Fri, May 22, 2020 at 1:50 AM Vasant Hegde
<hegdevasant@linux.vnet.ibm.com> wrote:
>
> If BMC becomes unresponsive (ex: during BMC reboot) during console write
> then we may get stuck in uart_wait_tx_room(). This will result in CPU
> to get stuck in OPAL. This will result in kernel lockups and in some
> cases host becomes unresponsive.
>
> This patch introduces timeout option. If UART operation doesn't complete
> within predefined time then it will drop write data and comes out.
>
> Note that this patch fixes both OPAL internal console as well as
> console write APIs.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> [Various fixes on top of Nick's proposal to have single timer - Vasant]
> Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>

Thanks, patch merged as 6bf21350da32776aac8ba75bf48933854647bd7e.

I've dropped Patch 1/2 since I don't think it's fixing a real problem,
just a red-herring we found along the way.

Oliver
diff mbox series

Patch

diff --git a/hw/lpc-uart.c b/hw/lpc-uart.c
index 979a617c3..10c5ed4fe 100644
--- a/hw/lpc-uart.c
+++ b/hw/lpc-uart.c
@@ -56,6 +56,7 @@  DEFINE_LOG_ENTRY(OPAL_RC_UART_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_UART,
 static struct lock uart_lock = LOCK_UNLOCKED;
 static struct dt_node *uart_node;
 static uint32_t uart_base;
+static uint64_t uart_tx_full_time;
 static bool has_irq = false, irq_ok, rx_full, tx_full;
 static uint8_t tx_room;
 static uint8_t cached_ier;
@@ -95,28 +96,59 @@  static inline void uart_write(unsigned int reg, uint8_t val)
 		lpc_outb(val, uart_base + reg);
 }
 
-static void uart_check_tx_room(void)
+static bool uart_check_tx_room(void)
 {
+	if (tx_room)
+		return true;
+
 	if (uart_read(REG_LSR) & LSR_THRE) {
 		/* FIFO is 16 entries */
 		tx_room = 16;
 		tx_full = false;
+		return true;
 	}
+
+	return false;
 }
 
-static void uart_wait_tx_room(void)
+/* Must be called with UART lock held */
+static void uart_write_thr(uint8_t val)
 {
-	while (!tx_room) {
-		uart_check_tx_room();
-		if (!tx_room) {
-			smt_lowest();
-			do {
-				barrier();
-				uart_check_tx_room();
-			} while (!tx_room);
+	uart_write(REG_THR, val);
+
+	tx_room--;
+	if (tx_room == 0) {
+		if (!uart_check_tx_room())
+			uart_tx_full_time = mftb();
+	}
+}
+
+static bool uart_timed_out(unsigned long msecs)
+{
+	if (uart_check_tx_room())
+		return false;
+
+	if (tb_compare(mftb(), uart_tx_full_time + msecs_to_tb(msecs)) == TB_AAFTERB)
+		return true;
+
+	return false;
+}
+
+static bool uart_wait_tx_room(void)
+{
+	if (uart_check_tx_room())
+		return true;
+
+	smt_lowest();
+	while (!uart_check_tx_room()) {
+		if (uart_timed_out(100)) {
 			smt_medium();
+			return false;
 		}
 	}
+	smt_medium();
+
+	return true;
 }
 
 static void uart_update_ier(void)
@@ -160,18 +192,20 @@  static size_t uart_con_write(const char *buf, size_t len)
 		return written;
 
 	lock(&uart_lock);
-	while(written < len) {
-		if (tx_room == 0) {
-			uart_wait_tx_room();
-			if (tx_room == 0)
-				goto bail;
-		} else {
-			uart_write(REG_THR, buf[written++]);
-			tx_room--;
-		}
+	while (written < len) {
+		if (!uart_wait_tx_room())
+			break;
+
+		uart_write_thr(buf[written++]);
+	}
+
+	if (!written && uart_timed_out(1000)) {
+		unlock(&uart_lock);
+		return len; /* swallow data */
 	}
- bail:
+
 	unlock(&uart_lock);
+
 	return written;
 }
 
@@ -232,16 +266,19 @@  static int64_t uart_con_flush(void)
 			tx_full = true;
 			break;
 		}
-		uart_write(REG_THR, out_buf[out_buf_cons++]);
+
+		uart_write_thr(out_buf[out_buf_cons++]);
 		out_buf_cons %= OUT_BUF_SIZE;
-		tx_room--;
 	}
 	if (tx_full != tx_was_full)
 		uart_update_ier();
 	if (out_buf_prod != out_buf_cons) {
 		/* Return busy if nothing was flushed this call */
-		if (out_buf_cons == out_buf_cons_initial)
+		if (out_buf_cons == out_buf_cons_initial) {
+			if (uart_timed_out(1000))
+				return OPAL_TIMEOUT;
 			return OPAL_BUSY;
+		}
 		/* Return partial if there's more to flush */
 		return OPAL_PARTIAL;
 	}
@@ -259,6 +296,7 @@  static int64_t uart_opal_write(int64_t term_number, __be64 *__length,
 			       const uint8_t *buffer)
 {
 	size_t written = 0, len = be64_to_cpu(*__length);
+	int64_t ret = OPAL_SUCCESS;
 
 	if (term_number != 0)
 		return OPAL_PARAMETER;
@@ -275,24 +313,34 @@  static int64_t uart_opal_write(int64_t term_number, __be64 *__length,
 	/* Flush out buffer again */
 	uart_con_flush();
 
+	if (!written && uart_timed_out(1000))
+		ret = OPAL_TIMEOUT;
 	unlock(&uart_lock);
 
 	*__length = cpu_to_be64(written);
 
-	return OPAL_SUCCESS;
+	return ret;
 }
 
 static int64_t uart_opal_write_buffer_space(int64_t term_number,
 					    __be64 *__length)
 {
+	int64_t ret = OPAL_SUCCESS;
+	int64_t tx_buf_len;
+
 	if (term_number != 0)
 		return OPAL_PARAMETER;
 
 	lock(&uart_lock);
-	*__length = cpu_to_be64(uart_tx_buf_space());
+	tx_buf_len = uart_tx_buf_space();
+
+	if ((tx_buf_len < be64_to_cpu(*__length)) && uart_timed_out(1000))
+		ret = OPAL_TIMEOUT;
+
+	*__length = cpu_to_be64(tx_buf_len);
 	unlock(&uart_lock);
 
-	return OPAL_SUCCESS;
+	return ret;
 }
 
 /* Must be called with UART lock held */