diff mbox series

[net-next,24/27] net/mlx5: Add FW fatal devlink_health_reporter

Message ID 1547737521-29888-25-git-send-email-eranbe@mellanox.com
State Changes Requested
Delegated to: David Miller
Headers show
Series Devlink health reporting and recovery system | expand

Commit Message

Eran Ben Elisha Jan. 17, 2019, 3:05 p.m. UTC
From: Moshe Shemesh <moshe@mellanox.com>

Create an mlx5 devlink_health_reporter for fatal FW errors.
The FW fatal reporter is added alongside the existing FW reporter and
implements only the recover callback.
The point of having two reporters for FW issues is that we don't want
to run FW recovery on every issue, only on fatal ones.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/devlink.c | 49 +++++++++++++++----
 .../net/ethernet/mellanox/mlx5/core/devlink.h |  4 +-
 .../net/ethernet/mellanox/mlx5/core/health.c  |  6 +--
 .../net/ethernet/mellanox/mlx5/core/main.c    |  6 +--
 .../ethernet/mellanox/mlx5/core/mlx5_core.h   |  1 +
 include/linux/mlx5/driver.h                   |  1 +
 6 files changed, 50 insertions(+), 17 deletions(-)
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 5713f89d9235..ae08af00b101 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -235,21 +235,52 @@  static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
 		.diagnose = mlx5_fw_reporter_diagnose,
 };
 
-int mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
+static int
+mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter,
+			       void *priv_ctx)
 {
-	struct devlink *devlink = priv_to_devlink(dev);
+	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
 
-	dev->fw_reporter = devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
-							  0, false, dev);
-	return PTR_ERR_OR_ZERO(dev->fw_reporter);
+	if (mlx5_sensor_pci_not_working(dev)) {
+		dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n");
+		return -ECANCELED;
+	}
+	dev_err(&dev->pdev->dev, "starting health recovery flow\n");
+
+	mlx5_recover_device(dev);
+
+	return 0;
 }
 
-void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev)
+static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
+		.name = "FW_fatal",
+		.recover = mlx5_fw_fatal_reporter_recover,
+};
+
+#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 1200000
+int mlx5_fw_reporters_create(struct mlx5_core_dev *dev)
 {
-	if (!dev->fw_reporter)
-		return;
+	struct devlink *devlink = priv_to_devlink(dev);
 
-	devlink_health_reporter_destroy(dev->fw_reporter);
+	dev->fw_reporter =
+		devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
+					       0, false, dev);
+	if (IS_ERR(dev->fw_reporter))
+		return PTR_ERR(dev->fw_reporter);
+
+	dev->fw_fatal_reporter =
+		devlink_health_reporter_create(devlink, &mlx5_fw_fatal_reporter_ops,
+					       MLX5_REPORTER_FW_GRACEFUL_PERIOD,
+					       true, dev);
+	return PTR_ERR_OR_ZERO(dev->fw_fatal_reporter);
+}
+
+void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev)
+{
+	if (dev->fw_reporter)
+		devlink_health_reporter_destroy(dev->fw_reporter);
+	if (dev->fw_fatal_reporter)
+		devlink_health_reporter_destroy(dev->fw_fatal_reporter);
 }
 
 static int mlx5_devlink_get_crdump_snapshot(struct devlink *devlink, u32 id,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
index 082a648a3af3..9b544f677aa7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
@@ -14,8 +14,8 @@  struct mlx5_fw_reporter_ctx {
 
 int mlx5_devlink_register(struct devlink *devlink, struct device *dev);
 void mlx5_devlink_unregister(struct devlink *devlink);
-int mlx5_fw_reporter_create(struct mlx5_core_dev *dev);
-void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev);
+int mlx5_fw_reporters_create(struct mlx5_core_dev *dev);
+void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev);
 void mlx5_fw_reporter_err_work(struct work_struct *work);
 
 #endif /* __MLX5_DEVLINK_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 61ff82380093..d3ea624bd03c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -91,7 +91,7 @@  void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
 		    &dev->iseg->cmdq_addr_l_sz);
 }
 
-static bool sensor_pci_not_working(struct mlx5_core_dev *dev)
+bool mlx5_sensor_pci_not_working(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 	struct health_buffer __iomem *h = health->health;
@@ -114,7 +114,7 @@  static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
 
 static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
 {
-	if (sensor_pci_not_working(dev))
+	if (mlx5_sensor_pci_not_working(dev))
 		return MLX5_SENSOR_PCI_COMM_ERR;
 	if (pci_channel_offline(dev->pdev))
 		return MLX5_SENSOR_PCI_ERR;
@@ -315,7 +315,7 @@  static void health_recover(struct work_struct *work)
 	priv = container_of(health, struct mlx5_priv, health);
 	dev = container_of(priv, struct mlx5_core_dev, priv);
 
-	if (sensor_pci_not_working(dev)) {
+	if (mlx5_sensor_pci_not_working(dev)) {
 		dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n");
 		return;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 010a6a28e08d..2e7c54975f47 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -980,9 +980,9 @@  static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 		goto err_fw_tracer;
 	}
 
-	err = mlx5_fw_reporter_create(dev);
+	err = mlx5_fw_reporters_create(dev);
 	if (err)
-		dev_warn(&pdev->dev, "Failed to create FW reporter\n");
+		dev_warn(&pdev->dev, "Failed to create FW reporters\n");
 
 	err = mlx5_fpga_device_start(dev);
 	if (err) {
@@ -1116,7 +1116,7 @@  static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	mlx5_accel_ipsec_cleanup(dev);
 	mlx5_accel_tls_cleanup(dev);
 	mlx5_fpga_device_stop(dev);
-	mlx5_fw_reporter_destroy(dev);
+	mlx5_fw_reporters_destroy(dev);
 	mlx5_fw_tracer_cleanup(dev->tracer);
 	mlx5_eq_table_destroy(dev);
 	mlx5_pagealloc_stop(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index af5cf19cc019..dca9a736c2cd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -204,6 +204,7 @@  enum {
 
 u8 mlx5_get_nic_state(struct mlx5_core_dev *dev);
 void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state);
+bool mlx5_sensor_pci_not_working(struct mlx5_core_dev *dev);
 
 #define HEALTH_INFO_MAX_LINE 64
 #define HEALTH_INFO_LINES (MLX5_FLD_SZ_DW(health_buffer, assert_var) + 8)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b5393684a6df..f2c6850b0ec4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -686,6 +686,7 @@  struct mlx5_core_dev {
 	struct page             *clock_info_page;
 	struct mlx5_fw_tracer   *tracer;
 	struct devlink_health_reporter *fw_reporter;
+	struct devlink_health_reporter *fw_fatal_reporter;
 	u32                      vsc_addr;
 };