diff mbox

bt: Add a temporary workaround for bmc dropping messages

Message ID 1427088346-28806-1-git-send-email-alistair@popple.id.au
State Accepted
Headers show

Commit Message

Alistair Popple March 23, 2015, 5:25 a.m. UTC
There is a bug (most likely on the bmc) that causes some bt messages to be
ignored. The message data is still in the bt fifo and the message read
pointers still appear to be valid so we can attempt a resend by just
setting the appropriate status flag.

A single retry seems to fix the problem most of the time, however this
should be regarded as a temporary fix. If unlucky we could encounter
the same bug resetting the flag so the message could still get
dropped.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
---
 hw/bt.c | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

Comments

Jeremy Kerr April 9, 2015, 4:41 a.m. UTC | #1
Hi Alistair,

> There is a bug (most likely on the bmc) that causes some bt messages to be
> ignored. The message data is still in the bt fifo and the message read
> pointers still appear to be valid so we can attempt a resend by just
> setting the appropriate status flag.
> 
> A single retry seems to fix the problem most of the time, however this
> should be regarded as a temporary fix. If unlucky we could encounter
> the same bug resetting the flag so the message could still get
> dropped.

Looks good. We'll still try and chase this up on the BMC side though...

Acked-by: Jeremy Kerr <jk@ozlabs.org>

Cheers,


Jeremy
diff mbox

Patch

diff --git a/hw/bt.c b/hw/bt.c
index 4b6deb0..7bf1b2f 100644
--- a/hw/bt.c
+++ b/hw/bt.c
@@ -64,6 +64,11 @@ 
  */
 #define BT_MSG_TIMEOUT (secs_to_tb(3))
 
+/*
+ * Maximum number of times to attempt sending a message before giving up.
+ */
+#define BT_MAX_RETRY_COUNT 1
+
 #define BT_QUEUE_DEBUG 0
 
 #define BT_ERR(msg, fmt, args...) \
@@ -80,6 +85,7 @@  struct bt_msg {
 	struct list_node link;
 	unsigned long tb;
 	uint8_t seq;
+	uint8_t retry_count;
 	struct ipmi_msg ipmi_msg;
 };
 
@@ -298,14 +304,25 @@  static void bt_expire_old_msg(void)
 	bt_msg = list_top(&bt.msgq, struct bt_msg, link);
 
 	if (bt_msg && bt_msg->tb > 0 && (bt_msg->tb + BT_MSG_TIMEOUT) < tb) {
-		BT_ERR(bt_msg, "Timeout sending message");
-		bt_msg_del(bt_msg);
-
-		/* Timing out a message is inherently racy as the BMC
-		   may start writing just as we decide to kill the
-		   message. Hopefully resetting the interface is
-		   sufficient to guard against such things. */
-		   bt_reset_interface();
+		if (bt_msg->retry_count < BT_MAX_RETRY_COUNT) {
+			/* A message timeout is usually due to the BMC
+			clearing the H2B_ATN flag without actually
+			doing anything. The data will still be in the
+			FIFO so just reset the flag.*/
+			BT_ERR(bt_msg, "Retry sending message");
+			bt_msg->retry_count++;
+			bt_msg->tb = mftb();
+			bt_outb(BT_CTRL_H2B_ATN, BT_CTRL);
+		} else {
+			BT_ERR(bt_msg, "Timeout sending message");
+			bt_msg_del(bt_msg);
+
+			/* Timing out a message is inherently racy as the BMC
+			   may start writing just as we decide to kill the
+			   message. Hopefully resetting the interface is
+			   sufficient to guard against such things. */
+			bt_reset_interface();
+		}
 	}
 }
 
@@ -384,6 +401,7 @@  static void bt_add_msg(struct bt_msg *bt_msg)
 {
 	bt_msg->tb = 0;
 	bt_msg->seq = ipmi_seq++;
+	bt_msg->retry_count = 0;
 	bt.queue_len++;
 	if (bt.queue_len > BT_MAX_QUEUE_LEN) {
 		/* Maximum queue length exceeded - remove the oldest message