diff mbox series

[RFC,net-next,17/19] net/mlx5: Add FW fatal devlink_health_reporter

Message ID 1546266733-9512-18-git-send-email-eranbe@mellanox.com
State RFC, archived
Delegated to: David Miller
Headers show
Series Devlink health reporting and recovery system | expand

Commit Message

Eran Ben Elisha Dec. 31, 2018, 2:32 p.m. UTC
From: Moshe Shemesh <moshe@mellanox.com>

Create an mlx5 devlink_health_reporter for FW fatal errors.
The FW fatal reporter is added alongside the existing FW reporter and
implements only the recover callback.
The point of having two reporters for FW issues is that we don't want
to run FW recovery on every issue, only on fatal ones.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/devlink.c | 51 +++++++++++++++----
 .../net/ethernet/mellanox/mlx5/core/devlink.h |  4 +-
 .../net/ethernet/mellanox/mlx5/core/main.c    |  6 +--
 include/linux/mlx5/driver.h                   |  1 +
 4 files changed, 48 insertions(+), 14 deletions(-)
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 04c904214e4c..2e6c74ed1f04 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -234,21 +234,54 @@  static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
 		.diagnose = mlx5_fw_reporter_diagnose,
 };
 
-int mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
+static int
+mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter,
+			       void *priv_ctx)
 {
-	struct devlink *devlink = priv_to_devlink(dev);
+	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
+	u8 nic_state;
 
-	dev->fw_reporter = devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
-							  0, false, dev);
-	return PTR_ERR_OR_ZERO(dev->fw_reporter);
+	nic_state = mlx5_get_nic_state(dev);
+	if (nic_state == MLX5_NIC_IFC_INVALID) {
+		dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n");
+		return -ECANCELED;
+	}
+	dev_err(&dev->pdev->dev, "starting health recovery flow\n");
+
+	mlx5_recover_device(dev);
+
+	return 0;
 }
 
-void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev)
+static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
+		.name = "FW_fatal",
+		.recover = mlx5_fw_fatal_reporter_recover,
+};
+
+#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 120000
+int mlx5_fw_reporters_create(struct mlx5_core_dev *dev)
 {
-	if (!dev->fw_reporter)
-		return;
+	struct devlink *devlink = priv_to_devlink(dev);
 
-	devlink_health_reporter_destroy(dev->fw_reporter);
+	dev->fw_reporter =
+		devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
+					       0, false, dev);
+	if (IS_ERR(dev->fw_reporter))
+		return PTR_ERR(dev->fw_reporter);
+
+	dev->fw_fatal_reporter =
+		devlink_health_reporter_create(devlink, &mlx5_fw_fatal_reporter_ops,
+					       MLX5_REPORTER_FW_GRACEFUL_PERIOD,
+					       true, dev);
+	return PTR_ERR_OR_ZERO(dev->fw_fatal_reporter);
+}
+
+void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev)
+{
+	if (dev->fw_reporter)
+		devlink_health_reporter_destroy(dev->fw_reporter);
+	if (dev->fw_fatal_reporter)
+		devlink_health_reporter_destroy(dev->fw_fatal_reporter);
 }
 
 int mlx5_devlink_register(struct devlink *devlink, struct device *dev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
index 082a648a3af3..9b544f677aa7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
@@ -14,8 +14,8 @@  struct mlx5_fw_reporter_ctx {
 
 int mlx5_devlink_register(struct devlink *devlink, struct device *dev);
 void mlx5_devlink_unregister(struct devlink *devlink);
-int mlx5_fw_reporter_create(struct mlx5_core_dev *dev);
-void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev);
+int mlx5_fw_reporters_create(struct mlx5_core_dev *dev);
+void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev);
 void mlx5_fw_reporter_err_work(struct work_struct *work);
 
 #endif /* __MLX5_DEVLINK_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index ca5f4c661f6d..8f12c761a485 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -981,9 +981,9 @@  static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 		goto err_fw_tracer;
 	}
 
-	err = mlx5_fw_reporter_create(dev);
+	err = mlx5_fw_reporters_create(dev);
 	if (err)
-		dev_warn(&pdev->dev, "Failed to create FW reporter\n");
+		dev_warn(&pdev->dev, "Failed to create FW reporters\n");
 
 	err = mlx5_fpga_device_start(dev);
 	if (err) {
@@ -1117,7 +1117,7 @@  static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	mlx5_accel_ipsec_cleanup(dev);
 	mlx5_accel_tls_cleanup(dev);
 	mlx5_fpga_device_stop(dev);
-	mlx5_fw_reporter_destroy(dev);
+	mlx5_fw_reporters_destroy(dev);
 	mlx5_fw_tracer_cleanup(dev->tracer);
 	mlx5_eq_table_destroy(dev);
 	mlx5_pagealloc_stop(dev);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b2dc32b553b4..8ab66bb40a17 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -681,6 +681,7 @@  struct mlx5_core_dev {
 	struct page             *clock_info_page;
 	struct mlx5_fw_tracer   *tracer;
 	struct devlink_health_reporter *fw_reporter;
+	struct devlink_health_reporter *fw_fatal_reporter;
 };
 
 struct mlx5_db {