diff mbox series

[net-next,v2,09/11] net/mlx5e: Add TX reporter support

Message ID 1547762360-7075-10-git-send-email-eranbe@mellanox.com
State Accepted
Delegated to: David Miller
Headers show
Series Devlink health reporting and recovery system | expand

Commit Message

Eran Ben Elisha Jan. 17, 2019, 9:59 p.m. UTC
Add mlx5e tx reporter to devlink health reporters. This reporter will be
responsible for diagnosing, reporting and recovering of TX errors.
This patch declares the TX reporter operations and allocate it using the
devlink health API. Currently, this reporter supports reporting and
recovering from send error CQE only. In addition, it adds diagnose
information for the open SQs.

For a local SQ recover (due to driver error report), in case of SQ recover
failure, the recover operation will be considered as a failure.
For a full TX recover, an attempt to close and open the channels will be
done. If this one passed successfully, it will be considered as a
successful recover.

The SQ recover from error CQE flow is not a new feature in the driver,
this patch re-organize the functions and adapt them for the devlink
health API. For this purpose, move code from en_main.c to a new file
named reporter_tx.c.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/Makefile  |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  18 +-
 .../ethernet/mellanox/mlx5/core/en/reporter.h |  14 +
 .../mellanox/mlx5/core/en/reporter_tx.c       | 321 ++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 135 +-------
 .../net/ethernet/mellanox/mlx5/core/en_tx.c   |   2 +-
 6 files changed, 367 insertions(+), 125 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c

Comments

Jiri Pirko Jan. 20, 2019, 11:06 a.m. UTC | #1
Thu, Jan 17, 2019 at 10:59:18PM CET, eranbe@mellanox.com wrote:
>Add mlx5e tx reporter to devlink health reporters. This reporter will be
>responsible for diagnosing, reporting and recovering of TX errors.
>This patch declares the TX reporter operations and allocate it using the
>devlink health API. Currently, this reporter supports reporting and
>recovering from send error CQE only. In addition, it adds diagnose
>information for the open SQs.
>
>For a local SQ recover (due to driver error report), in case of SQ recover
>failure, the recover operation will be considered as a failure.
>For a full TX recover, an attempt to close and open the channels will be
>done. If this one passed successfully, it will be considered as a
>successful recover.
>
>The SQ recover from error CQE flow is not a new feature in the driver,
>this patch re-organize the functions and adapt them for the devlink
>health API. For this purpose, move code from en_main.c to a new file
>named reporter_tx.c.
>
>Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
>Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
>---
> .../net/ethernet/mellanox/mlx5/core/Makefile  |   2 +-
> drivers/net/ethernet/mellanox/mlx5/core/en.h  |  18 +-
> .../ethernet/mellanox/mlx5/core/en/reporter.h |  14 +
> .../mellanox/mlx5/core/en/reporter_tx.c       | 321 ++++++++++++++++++
> .../net/ethernet/mellanox/mlx5/core/en_main.c | 135 +-------
> .../net/ethernet/mellanox/mlx5/core/en_tx.c   |   2 +-
> 6 files changed, 367 insertions(+), 125 deletions(-)
> create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h
> create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>index 9de9abacf7f6..6bb2a860b15b 100644
>--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>@@ -22,7 +22,7 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
> #
> mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
> 		en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
>-		en_selftest.o en/port.o en/monitor_stats.o
>+		en_selftest.o en/port.o en/monitor_stats.o en/reporter_tx.o
> 
> #
> # Netdev extra
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
>index 8fa8fdd30b85..27e276c9bf84 100644
>--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
>@@ -388,10 +388,7 @@ struct mlx5e_txqsq {
> 	struct mlx5e_channel      *channel;
> 	int                        txq_ix;
> 	u32                        rate_limit;
>-	struct mlx5e_txqsq_recover {
>-		struct work_struct         recover_work;
>-		u64                        last_recover;
>-	} recover;
>+	struct work_struct         recover_work;
> } ____cacheline_aligned_in_smp;
> 
> struct mlx5e_dma_info {
>@@ -682,6 +679,13 @@ struct mlx5e_rss_params {
> 	u8	hfunc;
> };
> 
>+struct mlx5e_modify_sq_param {
>+	int curr_state;
>+	int next_state;
>+	int rl_update;
>+	int rl_index;
>+};
>+
> struct mlx5e_priv {
> 	/* priv data path fields - start */
> 	struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC];
>@@ -737,6 +741,7 @@ struct mlx5e_priv {
> #ifdef CONFIG_MLX5_EN_TLS
> 	struct mlx5e_tls          *tls;
> #endif
>+	struct devlink_health_reporter *tx_reporter;
> };
> 
> struct mlx5e_profile {
>@@ -866,6 +871,11 @@ void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
> void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
> 			       struct mlx5e_params *params);
> 
>+int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
>+		    struct mlx5e_modify_sq_param *p);
>+void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq);
>+void mlx5e_tx_disable_queue(struct netdev_queue *txq);
>+
> static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
> {
> 	return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) &&
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h
>new file mode 100644
>index 000000000000..74083e120ab3
>--- /dev/null
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h
>@@ -0,0 +1,14 @@
>+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
>+/* Copyright (c) 2018 Mellanox Technologies. */
>+
>+#ifndef __MLX5E_EN_REPORTER_H
>+#define __MLX5E_EN_REPORTER_H
>+
>+#include <linux/mlx5/driver.h>
>+#include "en.h"
>+
>+int mlx5e_tx_reporter_create(struct mlx5e_priv *priv);
>+void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv);
>+void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq);
>+
>+#endif
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>new file mode 100644
>index 000000000000..9800df4909c2
>--- /dev/null
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>@@ -0,0 +1,321 @@
>+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
>+/* Copyright (c) 2018 Mellanox Technologies. */
>+
>+#include <net/devlink.h>
>+#include "reporter.h"
>+#include "lib/eq.h"
>+
>+#define MLX5E_TX_REPORTER_PER_SQ_MAX_LEN 256

Some made-up number. I don't like how this whole thing is do. I will
comment on it in the devlink part.


[...]


>+static int
>+mlx5e_tx_reporter_build_diagnose_output(struct devlink_health_buffer *buffer,
>+					u32 sqn, u8 state, u8 stopped)
>+{
>+	int err, i;
>+	int nest = 0;
>+	char name[20];
>+
>+	err = devlink_health_buffer_nest_start(buffer,
>+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
>+	if (err)
>+		goto buffer_error;
>+	nest++;
>+
>+	err = devlink_health_buffer_nest_start(buffer,
>+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
>+	if (err)
>+		goto buffer_error;
>+	nest++;
>+
>+	sprintf(name, "SQ 0x%x", sqn);

No. The whole idea of having the message build up with nested attributes
(json-like) was to avoid things like this. No sprintfs please. If you
want to do sprintf, most of the time you are doing something wrong.


>+	err = devlink_health_buffer_put_object_name(buffer, name);
>+	if (err)
>+		goto buffer_error;
>+
>+	err = devlink_health_buffer_nest_start(buffer,
>+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
>+	if (err)
>+		goto buffer_error;
>+	nest++;
>+
>+	err = devlink_health_buffer_nest_start(buffer,
>+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
>+	if (err)
>+		goto buffer_error;
>+	nest++;
>+
>+	err = devlink_health_buffer_nest_start(buffer,
>+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
>+	if (err)
>+		goto buffer_error;
>+	nest++;
>+
>+	err = devlink_health_buffer_put_object_name(buffer, "HW state");
>+	if (err)
>+		goto buffer_error;
>+
>+	err = devlink_health_buffer_nest_start(buffer,
>+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);

How could you put an object name and don't start nesting? It looks
implicit to me.



>+	if (err)
>+		goto buffer_error;
>+	nest++;
>+
>+	err = devlink_health_buffer_put_value_u8(buffer, state);
>+	if (err)
>+		goto buffer_error;
>+
>+	devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE */
>+	nest--;
>+
>+	devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR */
>+	nest--;
>+
>+	err = devlink_health_buffer_nest_start(buffer,
>+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
>+	if (err)
>+		goto buffer_error;
>+	nest++;
>+
>+	err = devlink_health_buffer_put_object_name(buffer, "stopped");
>+	if (err)
>+		goto buffer_error;
>+
>+	err = devlink_health_buffer_nest_start(buffer,
>+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
>+	if (err)
>+		goto buffer_error;
>+	nest++;

What's up with this nest++ nest--?
When you open a nest, you should have an error path inten to close the
nest and jump there in case of an error. This is hard to read and
understand.
	


>+
>+	err = devlink_health_buffer_put_value_u8(buffer, stopped);
>+	if (err)
>+		goto buffer_error;
>+
>+	for (i = 0; i < nest; i++)
>+		devlink_health_buffer_nest_end(buffer);
>+
>+	return 0;
>+
>+buffer_error:
>+	for (i = 0; i < nest; i++)
>+		devlink_health_buffer_nest_cancel(buffer);
>+	return err;
>+}
>+
>+static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
>+				      struct devlink_health_buffer **buffers_array,
>+				      unsigned int buffer_size,
>+				      unsigned int num_buffers)
>+{
>+	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>+	unsigned int buff = 0;
>+	int i = 0, err = 0;
>+
>+	if (buffer_size < MLX5E_TX_REPORTER_PER_SQ_MAX_LEN)
>+		return -ENOMEM;
>+
>+	mutex_lock(&priv->state_lock);
>+
>+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
>+		mutex_unlock(&priv->state_lock);
>+		return 0;
>+	}
>+
>+	while (i < priv->channels.num * priv->channels.params.num_tc) {
>+		struct mlx5e_txqsq *sq = priv->txq2sq[i];
>+		u8 state;
>+
>+		err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
>+		if (err)
>+			break;
>+
>+		err = mlx5e_tx_reporter_build_diagnose_output(buffers_array[buff],
>+							      sq->sqn, state,
>+							      netif_xmit_stopped(sq->txq));
>+		if (err) {
>+			if (++buff == num_buffers)
>+				break;
>+		} else {
>+			i++;
>+		}
>+	}
>+
>+	mutex_unlock(&priv->state_lock);
>+	return err;
>+}
>+
>+static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>+		.name = "TX",
>+		.recover = mlx5e_tx_reporter_recover,
>+		.diagnose_size = MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC *
>+				 MLX5E_TX_REPORTER_PER_SQ_MAX_LEN,
>+		.diagnose = mlx5e_tx_reporter_diagnose,
>+		.dump_size = 0,
>+		.dump = NULL,

Don't initialize 0 and NULL in static const.

[...]
Eran Ben Elisha Jan. 21, 2019, 11:32 a.m. UTC | #2
On 1/20/2019 1:06 PM, Jiri Pirko wrote:
> Thu, Jan 17, 2019 at 10:59:18PM CET, eranbe@mellanox.com wrote:
>> Add mlx5e tx reporter to devlink health reporters. This reporter will be
>> responsible for diagnosing, reporting and recovering of TX errors.
>> This patch declares the TX reporter operations and allocate it using the
>> devlink health API. Currently, this reporter supports reporting and
>> recovering from send error CQE only. In addition, it adds diagnose
>> information for the open SQs.
>>
>> For a local SQ recover (due to driver error report), in case of SQ recover
>> failure, the recover operation will be considered as a failure.
>> For a full TX recover, an attempt to close and open the channels will be
>> done. If this one passed successfully, it will be considered as a
>> successful recover.
>>
>> The SQ recover from error CQE flow is not a new feature in the driver,
>> this patch re-organize the functions and adapt them for the devlink
>> health API. For this purpose, move code from en_main.c to a new file
>> named reporter_tx.c.
>>
>> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
>> Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
>> ---
>> .../net/ethernet/mellanox/mlx5/core/Makefile  |   2 +-
>> drivers/net/ethernet/mellanox/mlx5/core/en.h  |  18 +-
>> .../ethernet/mellanox/mlx5/core/en/reporter.h |  14 +
>> .../mellanox/mlx5/core/en/reporter_tx.c       | 321 ++++++++++++++++++
>> .../net/ethernet/mellanox/mlx5/core/en_main.c | 135 +-------
>> .../net/ethernet/mellanox/mlx5/core/en_tx.c   |   2 +-
>> 6 files changed, 367 insertions(+), 125 deletions(-)
>> create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h
>> create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>> index 9de9abacf7f6..6bb2a860b15b 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>> @@ -22,7 +22,7 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
>> #
>> mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
>> 		en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
>> -		en_selftest.o en/port.o en/monitor_stats.o
>> +		en_selftest.o en/port.o en/monitor_stats.o en/reporter_tx.o
>>
>> #
>> # Netdev extra
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
>> index 8fa8fdd30b85..27e276c9bf84 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
>> @@ -388,10 +388,7 @@ struct mlx5e_txqsq {
>> 	struct mlx5e_channel      *channel;
>> 	int                        txq_ix;
>> 	u32                        rate_limit;
>> -	struct mlx5e_txqsq_recover {
>> -		struct work_struct         recover_work;
>> -		u64                        last_recover;
>> -	} recover;
>> +	struct work_struct         recover_work;
>> } ____cacheline_aligned_in_smp;
>>
>> struct mlx5e_dma_info {
>> @@ -682,6 +679,13 @@ struct mlx5e_rss_params {
>> 	u8	hfunc;
>> };
>>
>> +struct mlx5e_modify_sq_param {
>> +	int curr_state;
>> +	int next_state;
>> +	int rl_update;
>> +	int rl_index;
>> +};
>> +
>> struct mlx5e_priv {
>> 	/* priv data path fields - start */
>> 	struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC];
>> @@ -737,6 +741,7 @@ struct mlx5e_priv {
>> #ifdef CONFIG_MLX5_EN_TLS
>> 	struct mlx5e_tls          *tls;
>> #endif
>> +	struct devlink_health_reporter *tx_reporter;
>> };
>>
>> struct mlx5e_profile {
>> @@ -866,6 +871,11 @@ void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
>> void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
>> 			       struct mlx5e_params *params);
>>
>> +int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
>> +		    struct mlx5e_modify_sq_param *p);
>> +void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq);
>> +void mlx5e_tx_disable_queue(struct netdev_queue *txq);
>> +
>> static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
>> {
>> 	return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) &&
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h
>> new file mode 100644
>> index 000000000000..74083e120ab3
>> --- /dev/null
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h
>> @@ -0,0 +1,14 @@
>> +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
>> +/* Copyright (c) 2018 Mellanox Technologies. */
>> +
>> +#ifndef __MLX5E_EN_REPORTER_H
>> +#define __MLX5E_EN_REPORTER_H
>> +
>> +#include <linux/mlx5/driver.h>
>> +#include "en.h"
>> +
>> +int mlx5e_tx_reporter_create(struct mlx5e_priv *priv);
>> +void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv);
>> +void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq);
>> +
>> +#endif
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>> new file mode 100644
>> index 000000000000..9800df4909c2
>> --- /dev/null
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>> @@ -0,0 +1,321 @@
>> +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
>> +/* Copyright (c) 2018 Mellanox Technologies. */
>> +
>> +#include <net/devlink.h>
>> +#include "reporter.h"
>> +#include "lib/eq.h"
>> +
>> +#define MLX5E_TX_REPORTER_PER_SQ_MAX_LEN 256
> 
> Some made-up number. I don't like how this whole thing is do. I will
> comment on it in the devlink part.
> 
> 
> [...]
> 
> 
>> +static int
>> +mlx5e_tx_reporter_build_diagnose_output(struct devlink_health_buffer *buffer,
>> +					u32 sqn, u8 state, u8 stopped)
>> +{
>> +	int err, i;
>> +	int nest = 0;
>> +	char name[20];
>> +
>> +	err = devlink_health_buffer_nest_start(buffer,
>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
>> +	if (err)
>> +		goto buffer_error;
>> +	nest++;
>> +
>> +	err = devlink_health_buffer_nest_start(buffer,
>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
>> +	if (err)
>> +		goto buffer_error;
>> +	nest++;
>> +
>> +	sprintf(name, "SQ 0x%x", sqn);
> 
> No. The whole idea of having the message build up with nested attributes
> (json-like) was to avoid things like this. No sprintfs please. If you
> want to do sprintf, most of the time you are doing something wrong.

I wanted that each SQ object will have a unique name. But I can merge 
the sqn into its attributes instead.

> 
> 
>> +	err = devlink_health_buffer_put_object_name(buffer, name);
>> +	if (err)
>> +		goto buffer_error;
>> +
>> +	err = devlink_health_buffer_nest_start(buffer,
>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
>> +	if (err)
>> +		goto buffer_error;
>> +	nest++;
>> +
>> +	err = devlink_health_buffer_nest_start(buffer,
>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
>> +	if (err)
>> +		goto buffer_error;
>> +	nest++;
>> +
>> +	err = devlink_health_buffer_nest_start(buffer,
>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
>> +	if (err)
>> +		goto buffer_error;
>> +	nest++;
>> +
>> +	err = devlink_health_buffer_put_object_name(buffer, "HW state");
>> +	if (err)
>> +		goto buffer_error;
>> +
>> +	err = devlink_health_buffer_nest_start(buffer,
>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
> 
> How could you put an object name and don't start nesting? It looks
> implicit to me.

I will add some helper functions that you could review. Just keep in 
mind the implicit nest start must come with implicit nest end, so it 
won't fit into every use...
> 
> 
> 
>> +	if (err)
>> +		goto buffer_error;
>> +	nest++;
>> +
>> +	err = devlink_health_buffer_put_value_u8(buffer, state);
>> +	if (err)
>> +		goto buffer_error;
>> +
>> +	devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE */
>> +	nest--;
>> +
>> +	devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR */
>> +	nest--;
>> +
>> +	err = devlink_health_buffer_nest_start(buffer,
>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
>> +	if (err)
>> +		goto buffer_error;
>> +	nest++;
>> +
>> +	err = devlink_health_buffer_put_object_name(buffer, "stopped");
>> +	if (err)
>> +		goto buffer_error;
>> +
>> +	err = devlink_health_buffer_nest_start(buffer,
>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
>> +	if (err)
>> +		goto buffer_error;
>> +	nest++;
> 
> What's up with this nest++ nest--?
> When you open a nest, you should have an error path inten to close the
> nest and jump there in case of an error. This is hard to read and
> understand.
it was in order to have the loop for nest_end. I removed it.
> 	
> 
> 
>> +
>> +	err = devlink_health_buffer_put_value_u8(buffer, stopped);
>> +	if (err)
>> +		goto buffer_error;
>> +
>> +	for (i = 0; i < nest; i++)
>> +		devlink_health_buffer_nest_end(buffer);
>> +
>> +	return 0;
>> +
>> +buffer_error:
>> +	for (i = 0; i < nest; i++)
>> +		devlink_health_buffer_nest_cancel(buffer);
>> +	return err;
>> +}
>> +
>> +static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
>> +				      struct devlink_health_buffer **buffers_array,
>> +				      unsigned int buffer_size,
>> +				      unsigned int num_buffers)
>> +{
>> +	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>> +	unsigned int buff = 0;
>> +	int i = 0, err = 0;
>> +
>> +	if (buffer_size < MLX5E_TX_REPORTER_PER_SQ_MAX_LEN)
>> +		return -ENOMEM;
>> +
>> +	mutex_lock(&priv->state_lock);
>> +
>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
>> +		mutex_unlock(&priv->state_lock);
>> +		return 0;
>> +	}
>> +
>> +	while (i < priv->channels.num * priv->channels.params.num_tc) {
>> +		struct mlx5e_txqsq *sq = priv->txq2sq[i];
>> +		u8 state;
>> +
>> +		err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
>> +		if (err)
>> +			break;
>> +
>> +		err = mlx5e_tx_reporter_build_diagnose_output(buffers_array[buff],
>> +							      sq->sqn, state,
>> +							      netif_xmit_stopped(sq->txq));
>> +		if (err) {
>> +			if (++buff == num_buffers)
>> +				break;
>> +		} else {
>> +			i++;
>> +		}
>> +	}
>> +
>> +	mutex_unlock(&priv->state_lock);
>> +	return err;
>> +}
>> +
>> +static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>> +		.name = "TX",
>> +		.recover = mlx5e_tx_reporter_recover,
>> +		.diagnose_size = MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC *
>> +				 MLX5E_TX_REPORTER_PER_SQ_MAX_LEN,
>> +		.diagnose = mlx5e_tx_reporter_diagnose,
>> +		.dump_size = 0,
>> +		.dump = NULL,
> 
> Don't initialize 0 and NULL in static const.
Ack.
> 
> [...]
>
Jiri Pirko Jan. 21, 2019, 12:11 p.m. UTC | #3
Mon, Jan 21, 2019 at 12:32:07PM CET, eranbe@mellanox.com wrote:
>
>
>On 1/20/2019 1:06 PM, Jiri Pirko wrote:
>> Thu, Jan 17, 2019 at 10:59:18PM CET, eranbe@mellanox.com wrote:

[...]

	
>>> +static int
>>> +mlx5e_tx_reporter_build_diagnose_output(struct devlink_health_buffer *buffer,
>>> +					u32 sqn, u8 state, u8 stopped)
>>> +{
>>> +	int err, i;
>>> +	int nest = 0;
>>> +	char name[20];
>>> +
>>> +	err = devlink_health_buffer_nest_start(buffer,
>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
>>> +	if (err)
>>> +		goto buffer_error;
>>> +	nest++;
>>> +
>>> +	err = devlink_health_buffer_nest_start(buffer,
>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
>>> +	if (err)
>>> +		goto buffer_error;
>>> +	nest++;
>>> +
>>> +	sprintf(name, "SQ 0x%x", sqn);
>> 
>> No. The whole idea of having the message build up with nested attributes
>> (json-like) was to avoid things like this. No sprintfs please. If you
>> want to do sprintf, most of the time you are doing something wrong.
>
>I wanted that each SQ object will have a unique name. But I can merge 
>the sqn into its attributes instead.

Should be an array.


>
>> 
>> 
>>> +	err = devlink_health_buffer_put_object_name(buffer, name);
>>> +	if (err)
>>> +		goto buffer_error;
>>> +
>>> +	err = devlink_health_buffer_nest_start(buffer,
>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
>>> +	if (err)
>>> +		goto buffer_error;
>>> +	nest++;
>>> +
>>> +	err = devlink_health_buffer_nest_start(buffer,
>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
>>> +	if (err)
>>> +		goto buffer_error;
>>> +	nest++;
>>> +
>>> +	err = devlink_health_buffer_nest_start(buffer,
>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
>>> +	if (err)
>>> +		goto buffer_error;
>>> +	nest++;
>>> +
>>> +	err = devlink_health_buffer_put_object_name(buffer, "HW state");
>>> +	if (err)
>>> +		goto buffer_error;
>>> +
>>> +	err = devlink_health_buffer_nest_start(buffer,
>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
>> 
>> How could you put an object name and don't start nesting? It looks
>> implicit to me.
>
>I will add some helper functions that you could review. Just keep in 
>mind the implicit nest start must come with implicit nest end, so it 
>won't fit into every use...

You can do just object_start(), object_finish() or something like that.

[...]
Eran Ben Elisha Jan. 21, 2019, 1:06 p.m. UTC | #4
On 1/21/2019 2:11 PM, Jiri Pirko wrote:
> Mon, Jan 21, 2019 at 12:32:07PM CET, eranbe@mellanox.com wrote:
>>
>>
>> On 1/20/2019 1:06 PM, Jiri Pirko wrote:
>>> Thu, Jan 17, 2019 at 10:59:18PM CET, eranbe@mellanox.com wrote:
> 
> [...]
> 
> 	
>>>> +static int
>>>> +mlx5e_tx_reporter_build_diagnose_output(struct devlink_health_buffer *buffer,
>>>> +					u32 sqn, u8 state, u8 stopped)
>>>> +{
>>>> +	int err, i;
>>>> +	int nest = 0;
>>>> +	char name[20];
>>>> +
>>>> +	err = devlink_health_buffer_nest_start(buffer,
>>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
>>>> +	if (err)
>>>> +		goto buffer_error;
>>>> +	nest++;
>>>> +
>>>> +	err = devlink_health_buffer_nest_start(buffer,
>>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
>>>> +	if (err)
>>>> +		goto buffer_error;
>>>> +	nest++;
>>>> +
>>>> +	sprintf(name, "SQ 0x%x", sqn);
>>>
>>> No. The whole idea of having the message build up with nested attributes
>>> (json-like) was to avoid things like this. No sprintfs please. If you
>>> want to do sprintf, most of the time you are doing something wrong.
>>
>> I wanted that each SQ object will have a unique name. But I can merge
>> the sqn into its attributes instead.
> 
> Should be an array.

Every SQ shall hold it's own properties. I don't see why all SQs shall 
be held under one array, this array do not provide any additional 
info/order.

In addition, it is better to keep main preliminary objects up to the 
size of one netlink SKB, otherwise it will be impossible for the devlink 
to prepare this one big object as skb fragments, and also to re-assmble 
them in user space as driver intended them to be.

If all SWs will be under one big array, under one big object, we will 
hit this corner case.

> 
> 
>>
>>>
>>>
>>>> +	err = devlink_health_buffer_put_object_name(buffer, name);
>>>> +	if (err)
>>>> +		goto buffer_error;
>>>> +
>>>> +	err = devlink_health_buffer_nest_start(buffer,
>>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
>>>> +	if (err)
>>>> +		goto buffer_error;
>>>> +	nest++;
>>>> +
>>>> +	err = devlink_health_buffer_nest_start(buffer,
>>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
>>>> +	if (err)
>>>> +		goto buffer_error;
>>>> +	nest++;
>>>> +
>>>> +	err = devlink_health_buffer_nest_start(buffer,
>>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
>>>> +	if (err)
>>>> +		goto buffer_error;
>>>> +	nest++;
>>>> +
>>>> +	err = devlink_health_buffer_put_object_name(buffer, "HW state");
>>>> +	if (err)
>>>> +		goto buffer_error;
>>>> +
>>>> +	err = devlink_health_buffer_nest_start(buffer,
>>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
>>>
>>> How could you put an object name and don't start nesting? It looks
>>> implicit to me.
>>
>> I will add some helper functions that you could review. Just keep in
>> mind the implicit nest start must come with implicit nest end, so it
>> won't fit into every use...
> 
> You can do just object_start(), object_finish() or something like that.

ack.

Better to have also helpers that start and close their nests on one 
shot. like pair_name_value or pair_name_value_array.

Also, The json is too powerful to close it inside few wrappers, we must 
give the basic blocks as well for nontraditional structures.


> 
> [...]
>
Jiri Pirko Jan. 21, 2019, 1:45 p.m. UTC | #5
Mon, Jan 21, 2019 at 02:06:44PM CET, eranbe@mellanox.com wrote:
>
>
>On 1/21/2019 2:11 PM, Jiri Pirko wrote:
>> Mon, Jan 21, 2019 at 12:32:07PM CET, eranbe@mellanox.com wrote:
>>>
>>>
>>> On 1/20/2019 1:06 PM, Jiri Pirko wrote:
>>>> Thu, Jan 17, 2019 at 10:59:18PM CET, eranbe@mellanox.com wrote:
>> 
>> [...]
>> 
>> 	
>>>>> +static int
>>>>> +mlx5e_tx_reporter_build_diagnose_output(struct devlink_health_buffer *buffer,
>>>>> +					u32 sqn, u8 state, u8 stopped)
>>>>> +{
>>>>> +	int err, i;
>>>>> +	int nest = 0;
>>>>> +	char name[20];
>>>>> +
>>>>> +	err = devlink_health_buffer_nest_start(buffer,
>>>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
>>>>> +	if (err)
>>>>> +		goto buffer_error;
>>>>> +	nest++;
>>>>> +
>>>>> +	err = devlink_health_buffer_nest_start(buffer,
>>>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
>>>>> +	if (err)
>>>>> +		goto buffer_error;
>>>>> +	nest++;
>>>>> +
>>>>> +	sprintf(name, "SQ 0x%x", sqn);
>>>>
>>>> No. The whole idea of having the message build up with nested attributes
>>>> (json-like) was to avoid things like this. No sprintfs please. If you
>>>> want to do sprintf, most of the time you are doing something wrong.
>>>
>>> I wanted that each SQ object will have a unique name. But I can merge
>>> the sqn into its attributes instead.
>> 
>> Should be an array.
>
>Every SQ shall hold it's own properties. I don't see why all SQs shall 
>be held under one array, this array do not provide any additional 
>info/order.

If you have 8 SQs, you should have 8 objects (with subobjects) in an
array.


>
>In addition, it is better to keep main preliminary objects up to the 
>size of one netlink SKB, otherwise it will be impossible for the devlink 
>to prepare this one big object as skb fragments, and also to re-assmble 
>them in user space as driver intended them to be.
>
>If all SWs will be under one big array, under one big object, we will 
>hit this corner case.
>
>> 
>> 
>>>
>>>>
>>>>
>>>>> +	err = devlink_health_buffer_put_object_name(buffer, name);
>>>>> +	if (err)
>>>>> +		goto buffer_error;
>>>>> +
>>>>> +	err = devlink_health_buffer_nest_start(buffer,
>>>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
>>>>> +	if (err)
>>>>> +		goto buffer_error;
>>>>> +	nest++;
>>>>> +
>>>>> +	err = devlink_health_buffer_nest_start(buffer,
>>>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
>>>>> +	if (err)
>>>>> +		goto buffer_error;
>>>>> +	nest++;
>>>>> +
>>>>> +	err = devlink_health_buffer_nest_start(buffer,
>>>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
>>>>> +	if (err)
>>>>> +		goto buffer_error;
>>>>> +	nest++;
>>>>> +
>>>>> +	err = devlink_health_buffer_put_object_name(buffer, "HW state");
>>>>> +	if (err)
>>>>> +		goto buffer_error;
>>>>> +
>>>>> +	err = devlink_health_buffer_nest_start(buffer,
>>>>> +					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
>>>>
>>>> How could you put an object name and don't start nesting? It looks
>>>> implicit to me.
>>>
>>> I will add some helper functions that you could review. Just keep in
>>> mind the implicit nest start must come with implicit nest end, so it
>>> won't fit into every use...
>> 
>> You can do just object_start(), object_finish() or something like that.
>
>ack.
>
>Better to have also helpers that start and close their nests on one 
>shot. like pair_name_value or pair_name_value_array.
>
>Also, The json is too powerful to close it inside few wrappers, we must 
>give the basic blocks as well for nontraditional structures.
>
>
>> 
>> [...]
>>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 9de9abacf7f6..6bb2a860b15b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -22,7 +22,7 @@  mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 #
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
 		en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
-		en_selftest.o en/port.o en/monitor_stats.o
+		en_selftest.o en/port.o en/monitor_stats.o en/reporter_tx.o
 
 #
 # Netdev extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 8fa8fdd30b85..27e276c9bf84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -388,10 +388,7 @@  struct mlx5e_txqsq {
 	struct mlx5e_channel      *channel;
 	int                        txq_ix;
 	u32                        rate_limit;
-	struct mlx5e_txqsq_recover {
-		struct work_struct         recover_work;
-		u64                        last_recover;
-	} recover;
+	struct work_struct         recover_work;
 } ____cacheline_aligned_in_smp;
 
 struct mlx5e_dma_info {
@@ -682,6 +679,13 @@  struct mlx5e_rss_params {
 	u8	hfunc;
 };
 
+struct mlx5e_modify_sq_param {
+	int curr_state;
+	int next_state;
+	int rl_update;
+	int rl_index;
+};
+
 struct mlx5e_priv {
 	/* priv data path fields - start */
 	struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC];
@@ -737,6 +741,7 @@  struct mlx5e_priv {
 #ifdef CONFIG_MLX5_EN_TLS
 	struct mlx5e_tls          *tls;
 #endif
+	struct devlink_health_reporter *tx_reporter;
 };
 
 struct mlx5e_profile {
@@ -866,6 +871,11 @@  void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
 void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
 			       struct mlx5e_params *params);
 
+int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
+		    struct mlx5e_modify_sq_param *p);
+void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq);
+void mlx5e_tx_disable_queue(struct netdev_queue *txq);
+
 static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
 {
 	return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) &&
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h
new file mode 100644
index 000000000000..74083e120ab3
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h
@@ -0,0 +1,14 @@ 
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2018 Mellanox Technologies. */
+
+#ifndef __MLX5E_EN_REPORTER_H
+#define __MLX5E_EN_REPORTER_H
+
+#include <linux/mlx5/driver.h>
+#include "en.h"
+
+int mlx5e_tx_reporter_create(struct mlx5e_priv *priv);
+void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv);
+void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
new file mode 100644
index 000000000000..9800df4909c2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -0,0 +1,321 @@ 
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2018 Mellanox Technologies. */
+
+#include <net/devlink.h>
+#include "reporter.h"
+#include "lib/eq.h"
+
+#define MLX5E_TX_REPORTER_PER_SQ_MAX_LEN 256
+
+struct mlx5e_tx_err_ctx {
+	int (*recover)(struct mlx5e_txqsq *sq);
+	struct mlx5e_txqsq *sq;
+};
+
+static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
+{
+	unsigned long exp_time = jiffies + msecs_to_jiffies(2000);
+
+	while (time_before(jiffies, exp_time)) {
+		if (sq->cc == sq->pc)
+			return 0;
+
+		msleep(20);
+	}
+
+	netdev_err(sq->channel->netdev,
+		   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
+		   sq->sqn, sq->cc, sq->pc);
+
+	return -ETIMEDOUT;
+}
+
+static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
+{
+	WARN_ONCE(sq->cc != sq->pc,
+		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
+		  sq->sqn, sq->cc, sq->pc);
+	sq->cc = 0;
+	sq->dma_fifo_cc = 0;
+	sq->pc = 0;
+}
+
+static int mlx5e_sq_to_ready(struct mlx5e_txqsq *sq, int curr_state)
+{
+	struct mlx5_core_dev *mdev = sq->channel->mdev;
+	struct net_device *dev = sq->channel->netdev;
+	struct mlx5e_modify_sq_param msp = {0};
+	int err;
+
+	msp.curr_state = curr_state;
+	msp.next_state = MLX5_SQC_STATE_RST;
+
+	err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
+	if (err) {
+		netdev_err(dev, "Failed to move sq 0x%x to reset\n", sq->sqn);
+		return err;
+	}
+
+	memset(&msp, 0, sizeof(msp));
+	msp.curr_state = MLX5_SQC_STATE_RST;
+	msp.next_state = MLX5_SQC_STATE_RDY;
+
+	err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
+	if (err) {
+		netdev_err(dev, "Failed to move sq 0x%x to ready\n", sq->sqn);
+		return err;
+	}
+
+	return 0;
+}
+
+static int mlx5e_tx_reporter_err_cqe_recover(struct mlx5e_txqsq *sq)
+{
+	struct mlx5_core_dev *mdev = sq->channel->mdev;
+	struct net_device *dev = sq->channel->netdev;
+	u8 state;
+	int err;
+
+	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
+		return 0;
+
+	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
+	if (err) {
+		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
+			   sq->sqn, err);
+		return err;
+	}
+
+	if (state != MLX5_RQC_STATE_ERR) {
+		netdev_err(dev, "SQ 0x%x not in ERROR state\n", sq->sqn);
+		return -EINVAL;
+	}
+
+	mlx5e_tx_disable_queue(sq->txq);
+
+	err = mlx5e_wait_for_sq_flush(sq);
+	if (err)
+		return err;
+
+	/* At this point, no new packets will arrive from the stack as TXQ is
+	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
+	 * pending WQEs.  SQ can safely reset the SQ.
+	 */
+
+	err = mlx5e_sq_to_ready(sq, state);
+	if (err)
+		return err;
+
+	mlx5e_reset_txqsq_cc_pc(sq);
+	sq->stats->recover++;
+	mlx5e_activate_txqsq(sq);
+
+	return 0;
+}
+
+void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq)
+{
+	char err_str[MLX5E_TX_REPORTER_PER_SQ_MAX_LEN];
+	struct mlx5e_tx_err_ctx err_ctx = {0};
+
+	err_ctx.sq       = sq;
+	err_ctx.recover  = mlx5e_tx_reporter_err_cqe_recover;
+	sprintf(err_str, "ERR CQE on SQ: 0x%x", sq->sqn);
+
+	devlink_health_report(sq->channel->priv->tx_reporter, err_str,
+			      &err_ctx);
+}
+
+/* state lock cannot be grabbed within this function.
+ * It can cause a dead lock or a read-after-free.
+ */
+int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_tx_err_ctx *err_ctx)
+{
+	return err_ctx->recover(err_ctx->sq);
+}
+
+static int mlx5e_tx_reporter_recover_all(struct mlx5e_priv *priv)
+{
+	int err;
+
+	mutex_lock(&priv->state_lock);
+	mlx5e_close_locked(priv->netdev);
+	err = mlx5e_open_locked(priv->netdev);
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
+				     void *context)
+{
+	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
+	struct mlx5e_tx_err_ctx *err_ctx = context;
+
+	return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) :
+			 mlx5e_tx_reporter_recover_all(priv);
+}
+
+static int
+mlx5e_tx_reporter_build_diagnose_output(struct devlink_health_buffer *buffer,
+					u32 sqn, u8 state, u8 stopped)
+{
+	int err, i;
+	int nest = 0;
+	char name[20];
+
+	err = devlink_health_buffer_nest_start(buffer,
+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
+	if (err)
+		goto buffer_error;
+	nest++;
+
+	err = devlink_health_buffer_nest_start(buffer,
+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
+	if (err)
+		goto buffer_error;
+	nest++;
+
+	sprintf(name, "SQ 0x%x", sqn);
+	err = devlink_health_buffer_put_object_name(buffer, name);
+	if (err)
+		goto buffer_error;
+
+	err = devlink_health_buffer_nest_start(buffer,
+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
+	if (err)
+		goto buffer_error;
+	nest++;
+
+	err = devlink_health_buffer_nest_start(buffer,
+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT);
+	if (err)
+		goto buffer_error;
+	nest++;
+
+	err = devlink_health_buffer_nest_start(buffer,
+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
+	if (err)
+		goto buffer_error;
+	nest++;
+
+	err = devlink_health_buffer_put_object_name(buffer, "HW state");
+	if (err)
+		goto buffer_error;
+
+	err = devlink_health_buffer_nest_start(buffer,
+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
+	if (err)
+		goto buffer_error;
+	nest++;
+
+	err = devlink_health_buffer_put_value_u8(buffer, state);
+	if (err)
+		goto buffer_error;
+
+	devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE */
+	nest--;
+
+	devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR */
+	nest--;
+
+	err = devlink_health_buffer_nest_start(buffer,
+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR);
+	if (err)
+		goto buffer_error;
+	nest++;
+
+	err = devlink_health_buffer_put_object_name(buffer, "stopped");
+	if (err)
+		goto buffer_error;
+
+	err = devlink_health_buffer_nest_start(buffer,
+					       DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE);
+	if (err)
+		goto buffer_error;
+	nest++;
+
+	err = devlink_health_buffer_put_value_u8(buffer, stopped);
+	if (err)
+		goto buffer_error;
+
+	for (i = 0; i < nest; i++)
+		devlink_health_buffer_nest_end(buffer);
+
+	return 0;
+
+buffer_error:
+	for (i = 0; i < nest; i++)
+		devlink_health_buffer_nest_cancel(buffer);
+	return err;
+}
+
+static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
+				      struct devlink_health_buffer **buffers_array,
+				      unsigned int buffer_size,
+				      unsigned int num_buffers)
+{
+	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
+	unsigned int buff = 0;
+	int i = 0, err = 0;
+
+	if (buffer_size < MLX5E_TX_REPORTER_PER_SQ_MAX_LEN)
+		return -ENOMEM;
+
+	mutex_lock(&priv->state_lock);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		mutex_unlock(&priv->state_lock);
+		return 0;
+	}
+
+	while (i < priv->channels.num * priv->channels.params.num_tc) {
+		struct mlx5e_txqsq *sq = priv->txq2sq[i];
+		u8 state;
+
+		err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
+		if (err)
+			break;
+
+		err = mlx5e_tx_reporter_build_diagnose_output(buffers_array[buff],
+							      sq->sqn, state,
+							      netif_xmit_stopped(sq->txq));
+		if (err) {
+			if (++buff == num_buffers)
+				break;
+		} else {
+			i++;
+		}
+	}
+
+	mutex_unlock(&priv->state_lock);
+	return err;
+}
+
+static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
+		.name = "TX",
+		.recover = mlx5e_tx_reporter_recover,
+		.diagnose_size = MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC *
+				 MLX5E_TX_REPORTER_PER_SQ_MAX_LEN,
+		.diagnose = mlx5e_tx_reporter_diagnose,
+		.dump_size = 0,
+		.dump = NULL,
+};
+
+#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
+int mlx5e_tx_reporter_create(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct devlink *devlink = priv_to_devlink(mdev);
+
+	priv->tx_reporter =
+		devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops,
+					       MLX5_REPORTER_TX_GRACEFUL_PERIOD,
+					       true, priv);
+	return PTR_ERR_OR_ZERO(priv->tx_reporter);
+}
+
+void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv)
+{
+	devlink_health_reporter_destroy(priv->tx_reporter);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 8cfd2ec7c0a2..4c4507384fed 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -51,6 +51,7 @@ 
 #include "en/xdp.h"
 #include "lib/eq.h"
 #include "en/monitor_stats.h"
+#include "en/reporter.h"
 
 struct mlx5e_rq_param {
 	u32			rqc[MLX5_ST_SZ_DW(rqc)];
@@ -1160,7 +1161,7 @@  static int mlx5e_alloc_txqsq_db(struct mlx5e_txqsq *sq, int numa)
 	return 0;
 }
 
-static void mlx5e_sq_recover(struct work_struct *work);
+static void mlx5e_tx_err_cqe_work(struct work_struct *recover_work);
 static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 			     int txq_ix,
 			     struct mlx5e_params *params,
@@ -1182,7 +1183,7 @@  static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 	sq->min_inline_mode = params->tx_min_inline_mode;
 	sq->stats     = &c->priv->channel_stats[c->ix].sq[tc];
-	INIT_WORK(&sq->recover.recover_work, mlx5e_sq_recover);
+	INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work);
 	if (MLX5_IPSEC_DEV(c->priv->mdev))
 		set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
 	if (mlx5_accel_is_tls_device(c->priv->mdev))
@@ -1270,15 +1271,8 @@  static int mlx5e_create_sq(struct mlx5_core_dev *mdev,
 	return err;
 }
 
-struct mlx5e_modify_sq_param {
-	int curr_state;
-	int next_state;
-	bool rl_update;
-	int rl_index;
-};
-
-static int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
-			   struct mlx5e_modify_sq_param *p)
+int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
+		    struct mlx5e_modify_sq_param *p)
 {
 	void *in;
 	void *sqc;
@@ -1376,17 +1370,7 @@  static int mlx5e_open_txqsq(struct mlx5e_channel *c,
 	return err;
 }
 
-static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
-{
-	WARN_ONCE(sq->cc != sq->pc,
-		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
-		  sq->sqn, sq->cc, sq->pc);
-	sq->cc = 0;
-	sq->dma_fifo_cc = 0;
-	sq->pc = 0;
-}
-
-static void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq)
+void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq)
 {
 	sq->txq = netdev_get_tx_queue(sq->channel->netdev, sq->txq_ix);
 	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
@@ -1395,7 +1379,7 @@  static void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq)
 	netif_tx_start_queue(sq->txq);
 }
 
-static inline void netif_tx_disable_queue(struct netdev_queue *txq)
+void mlx5e_tx_disable_queue(struct netdev_queue *txq)
 {
 	__netif_tx_lock_bh(txq);
 	netif_tx_stop_queue(txq);
@@ -1411,7 +1395,7 @@  static void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq)
 	/* prevent netif_tx_wake_queue */
 	napi_synchronize(&c->napi);
 
-	netif_tx_disable_queue(sq->txq);
+	mlx5e_tx_disable_queue(sq->txq);
 
 	/* last doorbell out, godspeed .. */
 	if (mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1)) {
@@ -1431,6 +1415,7 @@  static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq)
 	struct mlx5_rate_limit rl = {0};
 
 	cancel_work_sync(&sq->dim.work);
+	cancel_work_sync(&sq->recover_work);
 	mlx5e_destroy_sq(mdev, sq->sqn);
 	if (sq->rate_limit) {
 		rl.rate = sq->rate_limit;
@@ -1440,105 +1425,15 @@  static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq)
 	mlx5e_free_txqsq(sq);
 }
 
-static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
+static void mlx5e_tx_err_cqe_work(struct work_struct *recover_work)
 {
-	unsigned long exp_time = jiffies + msecs_to_jiffies(2000);
-
-	while (time_before(jiffies, exp_time)) {
-		if (sq->cc == sq->pc)
-			return 0;
-
-		msleep(20);
-	}
-
-	netdev_err(sq->channel->netdev,
-		   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
-		   sq->sqn, sq->cc, sq->pc);
-
-	return -ETIMEDOUT;
-}
-
-static int mlx5e_sq_to_ready(struct mlx5e_txqsq *sq, int curr_state)
-{
-	struct mlx5_core_dev *mdev = sq->channel->mdev;
-	struct net_device *dev = sq->channel->netdev;
-	struct mlx5e_modify_sq_param msp = {0};
-	int err;
-
-	msp.curr_state = curr_state;
-	msp.next_state = MLX5_SQC_STATE_RST;
-
-	err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
-	if (err) {
-		netdev_err(dev, "Failed to move sq 0x%x to reset\n", sq->sqn);
-		return err;
-	}
-
-	memset(&msp, 0, sizeof(msp));
-	msp.curr_state = MLX5_SQC_STATE_RST;
-	msp.next_state = MLX5_SQC_STATE_RDY;
+	struct mlx5e_txqsq *sq = container_of(recover_work, struct mlx5e_txqsq,
+					      recover_work);
 
-	err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
-	if (err) {
-		netdev_err(dev, "Failed to move sq 0x%x to ready\n", sq->sqn);
-		return err;
-	}
-
-	return 0;
-}
-
-static void mlx5e_sq_recover(struct work_struct *work)
-{
-	struct mlx5e_txqsq_recover *recover =
-		container_of(work, struct mlx5e_txqsq_recover,
-			     recover_work);
-	struct mlx5e_txqsq *sq = container_of(recover, struct mlx5e_txqsq,
-					      recover);
-	struct mlx5_core_dev *mdev = sq->channel->mdev;
-	struct net_device *dev = sq->channel->netdev;
-	u8 state;
-	int err;
-
-	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
-	if (err) {
-		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
-			   sq->sqn, err);
-		return;
-	}
-
-	if (state != MLX5_RQC_STATE_ERR) {
-		netdev_err(dev, "SQ 0x%x not in ERROR state\n", sq->sqn);
-		return;
-	}
-
-	netif_tx_disable_queue(sq->txq);
-
-	if (mlx5e_wait_for_sq_flush(sq))
-		return;
-
-	/* If the interval between two consecutive recovers per SQ is too
-	 * short, don't recover to avoid infinite loop of ERR_CQE -> recover.
-	 * If we reached this state, there is probably a bug that needs to be
-	 * fixed. let's keep the queue close and let tx timeout cleanup.
-	 */
-	if (jiffies_to_msecs(jiffies - recover->last_recover) <
-	    MLX5E_SQ_RECOVER_MIN_INTERVAL) {
-		netdev_err(dev, "Recover SQ 0x%x canceled, too many error CQEs\n",
-			   sq->sqn);
-		return;
-	}
-
-	/* At this point, no new packets will arrive from the stack as TXQ is
-	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
-	 * pending WQEs.  SQ can safely reset the SQ.
-	 */
-	if (mlx5e_sq_to_ready(sq, state))
+	if (!sq->channel->priv->tx_reporter)
 		return;
 
-	mlx5e_reset_txqsq_cc_pc(sq);
-	sq->stats->recover++;
-	recover->last_recover = jiffies;
-	mlx5e_activate_txqsq(sq);
+	mlx5e_tx_reporter_err_cqe(sq);
 }
 
 static int mlx5e_open_icosq(struct mlx5e_channel *c,
@@ -3207,6 +3102,7 @@  static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv)
 {
 	int tc;
 
+	mlx5e_tx_reporter_destroy(priv);
 	for (tc = 0; tc < priv->profile->max_tc; tc++)
 		mlx5e_destroy_tis(priv->mdev, priv->tisn[tc]);
 }
@@ -4908,6 +4804,7 @@  static int mlx5e_init_nic_tx(struct mlx5e_priv *priv)
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 	mlx5e_dcbnl_initialize(priv);
 #endif
+	mlx5e_tx_reporter_create(priv);
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 598ad7e4d5c9..a8e052a5ce36 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -514,7 +514,7 @@  bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 				mlx5e_dump_error_cqe(sq,
 						     (struct mlx5_err_cqe *)cqe);
 				queue_work(cq->channel->priv->wq,
-					   &sq->recover.recover_work);
+					   &sq->recover_work);
 			}
 			stats->cqe_err++;
 		}