diff mbox series

[V2,01/15] core/pldm: Handle Watchdog timer.

Message ID 20220429094744.72855-2-clombard@linux.vnet.ibm.com
State Superseded
Headers show
Series Complete PLDM responder and enable PLDM support | expand

Commit Message

Christophe Lombard April 29, 2022, 9:47 a.m. UTC
Encode a PLDM platform event message to send the heartbeat to the BMC.
Watchdog is "armed" when a
PLDM_EVENT_MESSAGE_GLOBAL_ENABLE_ASYNC_KEEP_ALIVE is received.

Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
---
 core/pldm/Makefile.inc    |   2 +
 core/pldm/pldm-watchdog.c | 127 ++++++++++++++++++++++++++++++++++++++
 core/pldm/pldm.h          |   1 +
 include/pldm.h            |   5 ++
 4 files changed, 135 insertions(+)
 create mode 100644 core/pldm/pldm-watchdog.c

Comments

Frederic Barrat April 26, 2023, 12:23 p.m. UTC | #1
On 29/04/2022 11:47, Christophe Lombard wrote:
> Encode a PLDM platform event message to send the heartbeat to the BMC.
> Watchdog is "armed" when a
> PLDM_EVENT_MESSAGE_GLOBAL_ENABLE_ASYNC_KEEP_ALIVE is received.
> 
> Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
> ---
>   core/pldm/Makefile.inc    |   2 +
>   core/pldm/pldm-watchdog.c | 127 ++++++++++++++++++++++++++++++++++++++
>   core/pldm/pldm.h          |   1 +
>   include/pldm.h            |   5 ++
>   4 files changed, 135 insertions(+)
>   create mode 100644 core/pldm/pldm-watchdog.c
> 
> diff --git a/core/pldm/Makefile.inc b/core/pldm/Makefile.inc
> index 506bf5d7..c6f78822 100644
> --- a/core/pldm/Makefile.inc
> +++ b/core/pldm/Makefile.inc
> @@ -9,11 +9,13 @@ CPPFLAGS += -I$(SRC)/pldm/ibm/libpldm/
>   
>   CFLAGS_$(PLDM_DIR)/pldm-platform-requests.o = -Wno-strict-prototypes
>   CFLAGS_$(PLDM_DIR)/pldm-bios-requests.o = -Wno-strict-prototypes
> +CFLAGS_$(PLDM_DIR)/pldm-watchdog.o = -Wno-strict-prototypes


I know you're going to update to latest libpldm, but it seems that 
-Wno-strict-prototypes is no longer going to be needed (they fixed the 
prototypes with no arguments). Might also be true for the other 
file-specific CFLAGS, this is just a reminder that we should check it.


>   
>   PLDM_OBJS = pldm-common.o pldm-responder.o pldm-requester.o
>   PLDM_OBJS += pldm-base-requests.o pldm-platform-requests.o
>   PLDM_OBJS += pldm-bios-requests.o pldm-fru-requests.o
>   PLDM_OBJS += pldm-file-io-requests.o pldm-lid-files.o
> +PLDM_OBJS += pldm-watchdog.o
>   
>   PLDM = $(PLDM_DIR)/built-in.a
>   $(PLDM): $(PLDM_OBJS:%=$(PLDM_DIR)/%)
> diff --git a/core/pldm/pldm-watchdog.c b/core/pldm/pldm-watchdog.c
> new file mode 100644
> index 00000000..9d2aabe8
> --- /dev/null
> +++ b/core/pldm/pldm-watchdog.c
> @@ -0,0 +1,127 @@
> +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
> +// Copyright 2022 IBM Corp.
> +
> +#define pr_fmt(fmt) "PLDM: " fmt
> +
> +#include <lock.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <opal.h>
> +#include <timebase.h>
> +#include <timer.h>
> +#include <pldm/libpldm/platform.h>
> +#include "pldm.h"
> +
> +#define DEFAULT_WATCHDOG_TIMEOUT_SEC (10 * 60) /* 10 min */
> +
> +/* Whether the watchdog timer is armed and Skiboot should be sending
> + * regular heartbeats.
> + */
> +bool watchdog_armed;
> +
> +/* The period (in seconds) of the PLDM watchdog, as dictated by BMC */
> +int watchdog_period_sec = DEFAULT_WATCHDOG_TIMEOUT_SEC;
> +
> +static struct lock sequence_lock;
> +static uint8_t sequence_number;
> +
> +int pldm_watchdog_reset_timer(void)
> +{
> +	uint8_t heartbeat_elapsed_data[2];
> +	size_t response_len, payload_len;
> +	uint32_t request_length;
> +	void *response_msg;
> +	char *request_msg;
> +	int rc;
> +
> +	struct pldm_platform_event_message_req event_message_req = {
> +		.format_version = PLDM_PLATFORM_EVENT_MESSAGE_FORMAT_VERSION,
> +		.tid = HOST_TID,
> +		.event_class = PLDM_HEARTBEAT_TIMER_ELAPSED_EVENT,
> +	};
> +
> +	struct pldm_platform_event_message_resp response;
> +
> +	/* watchdog is not armed, so no need to send the heartbeat */
> +	if (!watchdog_armed) {
> +		prlog(PR_ERR, "%s - PLDM watchdog is not armed, not sending the heartbeat\n",
> +			      __func__);
> +		return OPAL_PARAMETER;
> +	}
> +
> +	prlog(PR_INFO, "%s - send the heartbeat to the BMC, sequence: %d, period: %d\n",
> +		       __func__, sequence_number, watchdog_period_sec);
> +
> +	/* Send the event request */
> +	heartbeat_elapsed_data[0] = PLDM_PLATFORM_EVENT_MESSAGE_FORMAT_VERSION;
> +
> +	/* We need to make sure that we send the BMC the correct
> +	 * sequence number. To prevent possible race conditions for the
> +	 * sequence number, lock it while we're incrementing and
> +	 * sending it down.
> +	 */
> +	lock(&sequence_lock);
> +	heartbeat_elapsed_data[1] = sequence_number++;
> +
> +	payload_len = PLDM_PLATFORM_EVENT_MESSAGE_MIN_REQ_BYTES + sizeof(heartbeat_elapsed_data);
> +
> +	request_length = sizeof(struct pldm_msg_hdr) +
> +			 sizeof(struct pldm_platform_event_message_req) +
> +			 sizeof(heartbeat_elapsed_data);
> +	request_msg = malloc(request_length);
> +
> +	/* Encode the platform event message request */
> +	rc = encode_platform_event_message_req(
> +			DEFAULT_INSTANCE_ID,
> +			event_message_req.format_version,
> +			event_message_req.tid,
> +			event_message_req.event_class,
> +			heartbeat_elapsed_data,
> +			sizeof(heartbeat_elapsed_data),
> +			(struct pldm_msg *)request_msg,
> +			payload_len);
> +	if (rc != PLDM_SUCCESS) {
> +		prlog(PR_ERR, "Encode PlatformEventMessage Error, rc: %d\n", rc);
> +		free(request_msg);
> +		sequence_number--;


We don't release the sequence_lock in this error path.

Do we really care about decrementing sequence_number in case of error? 
The spec says it allows the BMC to know if it misses one hearbeat. Which 
would be the case in case of error, so decrementing is actually 
misleading. And it would simplify the code.


> +		return OPAL_PARAMETER;
> +	}
> +	unlock(&sequence_lock);
> +
> +	/* Send and get the response message bytes */
> +	rc = pldm_do_request(BMC_EID, request_msg, request_length - 1,
> +			     &response_msg, &response_len);
> +	if (rc) {
> +		prlog(PR_ERR, "Communication Error, req: PlatformEventMessage, rc: %d\n", rc);
> +		free(request_msg);
> +		sequence_number--;


sequence_number is decremented with no lock held. But see previous 
comment, we may not need to decrement.


> +		return rc;
> +	}
> +	free(request_msg);
> +
> +	/* Decode the message */
> +	payload_len = response_len - sizeof(struct pldm_msg_hdr);
> +	rc = decode_platform_event_message_resp(
> +				response_msg,
> +				payload_len,
> +				&response.completion_code,
> +				&response.platform_event_status);
> +	if (rc != PLDM_SUCCESS || response.completion_code != PLDM_SUCCESS) {
> +		prlog(PR_ERR, "Decode PlatformEventMessage Error, rc: %d, cc: %d, pes: %d\n",
> +			      rc, response.completion_code,
> +			      response.platform_event_status);
> +		free(response_msg);
> +		return OPAL_PARAMETER;
> +	}
> +
> +	free(response_msg);
> +


So we only send one heartbeat message? Not emitted periodically?
 From the spec:
"heartbeatTimer
Amount of time in seconds after *each* elapsing of which the terminus 
shall emit a heartbeat event (the heartbeatTimerElapsedEvent) to the 
event receiver. If the terminus cannot produce heartbeat events at
the requested rate, it shall return completion code 
HEARTBEAT_FREQUENCY_TOO_HIGH."

So it seems to me that once a SetEventReceiver command is received 
requesting a heartbeat, we should send one periodically.


> +	return OPAL_SUCCESS;
> +}
> +
> +int pldm_watchdog_init(void)
> +{
> +	init_lock(&sequence_lock);


pldm_watchdog_reset_timer() is an exported API and a misbehaving caller 
might call it and use the lock before it's init. We should use a static 
initializer when declaring the global variable:

	static struct lock sequence_lock = LOCK_UNLOCKED



> +
> +	return pldm_watchdog_reset_timer();

Does it make sense to call pldm_watchdog_reset_timer() during init? Can 
watchdog_timer already be set?


   Fred
diff mbox series

Patch

diff --git a/core/pldm/Makefile.inc b/core/pldm/Makefile.inc
index 506bf5d7..c6f78822 100644
--- a/core/pldm/Makefile.inc
+++ b/core/pldm/Makefile.inc
@@ -9,11 +9,13 @@  CPPFLAGS += -I$(SRC)/pldm/ibm/libpldm/
 
 CFLAGS_$(PLDM_DIR)/pldm-platform-requests.o = -Wno-strict-prototypes
 CFLAGS_$(PLDM_DIR)/pldm-bios-requests.o = -Wno-strict-prototypes
+CFLAGS_$(PLDM_DIR)/pldm-watchdog.o = -Wno-strict-prototypes
 
 PLDM_OBJS = pldm-common.o pldm-responder.o pldm-requester.o
 PLDM_OBJS += pldm-base-requests.o pldm-platform-requests.o
 PLDM_OBJS += pldm-bios-requests.o pldm-fru-requests.o
 PLDM_OBJS += pldm-file-io-requests.o pldm-lid-files.o
+PLDM_OBJS += pldm-watchdog.o
 
 PLDM = $(PLDM_DIR)/built-in.a
 $(PLDM): $(PLDM_OBJS:%=$(PLDM_DIR)/%)
diff --git a/core/pldm/pldm-watchdog.c b/core/pldm/pldm-watchdog.c
new file mode 100644
index 00000000..9d2aabe8
--- /dev/null
+++ b/core/pldm/pldm-watchdog.c
@@ -0,0 +1,127 @@ 
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+// Copyright 2022 IBM Corp.
+
+#define pr_fmt(fmt) "PLDM: " fmt
+
+#include <lock.h>
+#include <stdlib.h>
+#include <string.h>
+#include <opal.h>
+#include <timebase.h>
+#include <timer.h>
+#include <pldm/libpldm/platform.h>
+#include "pldm.h"
+
+#define DEFAULT_WATCHDOG_TIMEOUT_SEC (10 * 60) /* 10 min */
+
+/* Whether the watchdog timer is armed and Skiboot should be sending
+ * regular heartbeats.
+ */
+bool watchdog_armed;
+
+/* The period (in seconds) of the PLDM watchdog, as dictated by BMC */
+int watchdog_period_sec = DEFAULT_WATCHDOG_TIMEOUT_SEC;
+
+static struct lock sequence_lock;
+static uint8_t sequence_number;
+
+int pldm_watchdog_reset_timer(void)
+{
+	uint8_t heartbeat_elapsed_data[2];
+	size_t response_len, payload_len;
+	uint32_t request_length;
+	void *response_msg;
+	char *request_msg;
+	int rc;
+
+	struct pldm_platform_event_message_req event_message_req = {
+		.format_version = PLDM_PLATFORM_EVENT_MESSAGE_FORMAT_VERSION,
+		.tid = HOST_TID,
+		.event_class = PLDM_HEARTBEAT_TIMER_ELAPSED_EVENT,
+	};
+
+	struct pldm_platform_event_message_resp response;
+
+	/* watchdog is not armed, so no need to send the heartbeat */
+	if (!watchdog_armed) {
+		prlog(PR_ERR, "%s - PLDM watchdog is not armed, not sending the heartbeat\n",
+			      __func__);
+		return OPAL_PARAMETER;
+	}
+
+	prlog(PR_INFO, "%s - send the heartbeat to the BMC, sequence: %d, period: %d\n",
+		       __func__, sequence_number, watchdog_period_sec);
+
+	/* Send the event request */
+	heartbeat_elapsed_data[0] = PLDM_PLATFORM_EVENT_MESSAGE_FORMAT_VERSION;
+
+	/* We need to make sure that we send the BMC the correct
+	 * sequence number. To prevent possible race conditions for the
+	 * sequence number, lock it while we're incrementing and
+	 * sending it down.
+	 */
+	lock(&sequence_lock);
+	heartbeat_elapsed_data[1] = sequence_number++;
+
+	payload_len = PLDM_PLATFORM_EVENT_MESSAGE_MIN_REQ_BYTES + sizeof(heartbeat_elapsed_data);
+
+	request_length = sizeof(struct pldm_msg_hdr) +
+			 sizeof(struct pldm_platform_event_message_req) +
+			 sizeof(heartbeat_elapsed_data);
+	request_msg = malloc(request_length);
+
+	/* Encode the platform event message request */
+	rc = encode_platform_event_message_req(
+			DEFAULT_INSTANCE_ID,
+			event_message_req.format_version,
+			event_message_req.tid,
+			event_message_req.event_class,
+			heartbeat_elapsed_data,
+			sizeof(heartbeat_elapsed_data),
+			(struct pldm_msg *)request_msg,
+			payload_len);
+	if (rc != PLDM_SUCCESS) {
+		prlog(PR_ERR, "Encode PlatformEventMessage Error, rc: %d\n", rc);
+		free(request_msg);
+		sequence_number--;
+		return OPAL_PARAMETER;
+	}
+	unlock(&sequence_lock);
+
+	/* Send and get the response message bytes */
+	rc = pldm_do_request(BMC_EID, request_msg, request_length - 1,
+			     &response_msg, &response_len);
+	if (rc) {
+		prlog(PR_ERR, "Communication Error, req: PlatformEventMessage, rc: %d\n", rc);
+		free(request_msg);
+		sequence_number--;
+		return rc;
+	}
+	free(request_msg);
+
+	/* Decode the message */
+	payload_len = response_len - sizeof(struct pldm_msg_hdr);
+	rc = decode_platform_event_message_resp(
+				response_msg,
+				payload_len,
+				&response.completion_code,
+				&response.platform_event_status);
+	if (rc != PLDM_SUCCESS || response.completion_code != PLDM_SUCCESS) {
+		prlog(PR_ERR, "Decode PlatformEventMessage Error, rc: %d, cc: %d, pes: %d\n",
+			      rc, response.completion_code,
+			      response.platform_event_status);
+		free(response_msg);
+		return OPAL_PARAMETER;
+	}
+
+	free(response_msg);
+
+	return OPAL_SUCCESS;
+}
+
+int pldm_watchdog_init(void)
+{
+	init_lock(&sequence_lock);
+
+	return pldm_watchdog_reset_timer();
+}
diff --git a/core/pldm/pldm.h b/core/pldm/pldm.h
index 7054ba54..a6de9ae8 100644
--- a/core/pldm/pldm.h
+++ b/core/pldm/pldm.h
@@ -50,6 +50,7 @@  struct pldm_type {
 };
 
 int pldm_send(uint8_t dest_id, uint8_t *buf, int len);
+int pldm_watchdog_reset_timer(void);
 
 /* Responder support */
 int pldm_rx_handle_request(struct pldm_rx_data *rx);
diff --git a/include/pldm.h b/include/pldm.h
index 5ad724af..01af9a33 100644
--- a/include/pldm.h
+++ b/include/pldm.h
@@ -42,4 +42,9 @@  int pldm_lid_files_init(struct blocklevel_device **bl);
  */
 bool pldm_lid_files_exit(struct blocklevel_device *bl);
 
+/**
+ * Initialize and reset the watchdog
+ */
+int pldm_watchdog_init(void);
+
 #endif /* __PLDM_H__ */