@@ -327,19 +327,6 @@ static int mlx5_health_care(struct mlx5_core_dev *dev)
return 0;
}
-static void health_care_work(struct work_struct *work)
-{
- struct mlx5_core_health *health;
- struct mlx5_core_dev *dev;
- struct mlx5_priv *priv;
-
- health = container_of(work, struct mlx5_core_health, work);
- priv = container_of(health, struct mlx5_priv, health);
- dev = container_of(priv, struct mlx5_core_dev, priv);
-
- mlx5_health_care(dev);
-}
-
static const char *hsynd_str(u8 synd)
{
switch (synd) {
@@ -585,6 +572,29 @@ mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter,
return 0;
}
+static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work)
+{
+ struct mlx5_fw_reporter_ctx fw_reporter_ctx; /* context passed to devlink_health_report() */
+ struct mlx5_core_health *health;
+ struct mlx5_core_dev *dev;
+ struct mlx5_priv *priv;
+
+ health = container_of(work, struct mlx5_core_health, fatal_report_work);
+ priv = container_of(health, struct mlx5_priv, health);
+ dev = container_of(priv, struct mlx5_core_dev, priv);
+ /* Enter error state, then report through devlink, or call mlx5_health_care() directly if no reporter. */
+ mlx5_enter_error_state(dev, false);
+ if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) { /* reporter pointer is NULL or an ERR_PTR */
+ if (mlx5_health_care(dev))
+ mlx5_core_err(dev, "health recovery failed\n");
+ return;
+ }
+ fw_reporter_ctx.err_synd = health->synd; /* copy syndrome and miss counter into the report context */
+ fw_reporter_ctx.miss_counter = health->miss_counter;
+ devlink_health_report(health->fw_fatal_reporter,
+ "FW fatal error reported", &fw_reporter_ctx);
+}
+
static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
.name = "fw_fatal",
.recover = mlx5_fw_fatal_reporter_recover,
@@ -642,7 +652,7 @@ void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
spin_lock_irqsave(&health->wq_lock, flags);
if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
- queue_work(health->wq, &health->work);
+ queue_work(health->wq, &health->fatal_report_work);
else
mlx5_core_err(dev, "new health works are not permitted at this stage\n");
spin_unlock_irqrestore(&health->wq_lock, flags);
@@ -728,7 +738,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
spin_unlock_irqrestore(&health->wq_lock, flags);
cancel_work_sync(&health->report_work);
- cancel_work_sync(&health->work);
+ cancel_work_sync(&health->fatal_report_work);
}
void mlx5_health_flush(struct mlx5_core_dev *dev)
@@ -764,7 +774,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
if (!health->wq)
return -ENOMEM;
spin_lock_init(&health->wq_lock);
- INIT_WORK(&health->work, health_care_work);
+ INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work);
INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
health->crdump = NULL;
health->info_buf = kmalloc(HEALTH_INFO_MAX_BUFF, GFP_KERNEL);
@@ -1367,7 +1367,8 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
mlx5_core_info(dev, "%s was called\n", __func__);
- mlx5_enter_error_state(dev, false);
+ if (state)
+ mlx5_enter_error_state(dev, false);
mlx5_error_sw_reset(dev);
mlx5_unload_one(dev, false);
/* In case of kernel call drain the health wq */
@@ -443,7 +443,7 @@ struct mlx5_core_health {
spinlock_t wq_lock;
struct workqueue_struct *wq;
unsigned long flags;
- struct work_struct work;
+ struct work_struct fatal_report_work;
struct work_struct report_work;
struct delayed_work recover_work;
struct mlx5_fw_crdump *crdump;