diff mbox series

[net-next,09/15] net/mlx5: Create FW devlink health reporter

Message ID 20190505003207.1353-10-saeedm@mellanox.com
State Changes Requested
Delegated to: David Miller
Headers show
Series [net-next,01/15] net/mlx5: Move all devlink related functions calls to devlink.c | expand

Commit Message

Saeed Mahameed May 5, 2019, 12:33 a.m. UTC
From: Moshe Shemesh <moshe@mellanox.com>

Create mlx5_devlink_health_reporter for FW reporter. The FW reporter
implements devlink_health_reporter diagnose callback.

The fw reporter diagnose command can be triggered any time by the user
to check current fw status.
In healthy status, it will return clear syndrome. Otherwise it will dump
the health info buffer.

Command example and output on healthy status:
$ devlink health diagnose pci/0000:82:00.0 reporter fw
Syndrome: 0

Command example and output on non healthy status:
$ devlink health diagnose pci/0000:82:00.0 reporter fw
diagnose data:
assert_var[0] 0xfc3fc043
assert_var[1] 0x0001b41c
assert_var[2] 0x00000000
assert_var[3] 0x00000000
assert_var[4] 0x00000000
assert_exit_ptr 0x008033b4
assert_callra 0x0080365c
fw_ver 16.24.1000
hw_id 0x0000020d
irisc_index 0
synd 0x8: unrecoverable hardware error
ext_synd 0x003d
raw fw_ver 0x101803e8

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/health.c  | 55 +++++++++++++++++++
 include/linux/mlx5/driver.h                   |  1 +
 2 files changed, 56 insertions(+)

Comments

Jiri Pirko May 5, 2019, 3:42 p.m. UTC | #1
Sun, May 05, 2019 at 02:33:23AM CEST, saeedm@mellanox.com wrote:
>From: Moshe Shemesh <moshe@mellanox.com>
>
>Create mlx5_devlink_health_reporter for FW reporter. The FW reporter
>implements devlink_health_reporter diagnose callback.
>
>The fw reporter diagnose command can be triggered any time by the user
>to check current fw status.
>In healthy status, it will return clear syndrome. Otherwise it will dump
>the health info buffer.
>
>Command example and output on healthy status:
>$ devlink health diagnose pci/0000:82:00.0 reporter fw
>Syndrome: 0
>
>Command example and output on non healthy status:
>$ devlink health diagnose pci/0000:82:00.0 reporter fw
>diagnose data:
>assert_var[0] 0xfc3fc043
>assert_var[1] 0x0001b41c
>assert_var[2] 0x00000000
>assert_var[3] 0x00000000
>assert_var[4] 0x00000000
>assert_exit_ptr 0x008033b4
>assert_callra 0x0080365c
>fw_ver 16.24.1000
>hw_id 0x0000020d
>irisc_index 0
>synd 0x8: unrecoverable hardware error
>ext_synd 0x003d
>raw fw_ver 0x101803e8
>
>Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
>Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
>Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>

	
[...]	
	
	
>+static int
>+mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
>+			  struct devlink_fmsg *fmsg)
>+{
>+	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
>+	struct mlx5_core_health *health = &dev->priv.health;
>+	u8 synd;
>+	int err;
>+
>+	mutex_lock(&health->info_buf_lock);
>+	mlx5_get_health_info(dev, &synd);
>+
>+	if (!synd) {
>+		mutex_unlock(&health->info_buf_lock);
>+		return devlink_fmsg_u8_pair_put(fmsg, "Syndrome", synd);
>+	}
>+
>+	err = devlink_fmsg_string_pair_put(fmsg, "diagnose data",
>+					   health->info_buf);

No! This is wrong! You are sneaking in text blob. Please put the info in
structured form using proper fmsg helpers.
Moshe Shemesh May 6, 2019, 10:45 a.m. UTC | #2
On 5/5/2019 6:42 PM, Jiri Pirko wrote:
> Sun, May 05, 2019 at 02:33:23AM CEST, saeedm@mellanox.com wrote:
>> From: Moshe Shemesh <moshe@mellanox.com>
>>
>> Create mlx5_devlink_health_reporter for FW reporter. The FW reporter
>> implements devlink_health_reporter diagnose callback.
>>
>> The fw reporter diagnose command can be triggered any time by the user
>> to check current fw status.
>> In healthy status, it will return clear syndrome. Otherwise it will dump
>> the health info buffer.
>>
>> Command example and output on healthy status:
>> $ devlink health diagnose pci/0000:82:00.0 reporter fw
>> Syndrome: 0
>>
>> Command example and output on non healthy status:
>> $ devlink health diagnose pci/0000:82:00.0 reporter fw
>> diagnose data:
>> assert_var[0] 0xfc3fc043
>> assert_var[1] 0x0001b41c
>> assert_var[2] 0x00000000
>> assert_var[3] 0x00000000
>> assert_var[4] 0x00000000
>> assert_exit_ptr 0x008033b4
>> assert_callra 0x0080365c
>> fw_ver 16.24.1000
>> hw_id 0x0000020d
>> irisc_index 0
>> synd 0x8: unrecoverable hardware error
>> ext_synd 0x003d
>> raw fw_ver 0x101803e8
>>
>> Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
>> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
>> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
> 
> 	
> [...]	
> 	
> 	
>> +static int
>> +mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
>> +			  struct devlink_fmsg *fmsg)
>> +{
>> +	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
>> +	struct mlx5_core_health *health = &dev->priv.health;
>> +	u8 synd;
>> +	int err;
>> +
>> +	mutex_lock(&health->info_buf_lock);
>> +	mlx5_get_health_info(dev, &synd);
>> +
>> +	if (!synd) {
>> +		mutex_unlock(&health->info_buf_lock);
>> +		return devlink_fmsg_u8_pair_put(fmsg, "Syndrome", synd);
>> +	}
>> +
>> +	err = devlink_fmsg_string_pair_put(fmsg, "diagnose data",
>> +					   health->info_buf);
> 
> No! This is wrong! You are sneaking in text blob. Please put the info in
> structured form using proper fmsg helpers.
> 
This is the fw output format, it is already in use, I don't want to 
change it, just have it here in the diagnose output too.
Jiri Pirko May 6, 2019, 11:38 a.m. UTC | #3
Mon, May 06, 2019 at 12:45:44PM CEST, moshe@mellanox.com wrote:
>
>
>On 5/5/2019 6:42 PM, Jiri Pirko wrote:
>> Sun, May 05, 2019 at 02:33:23AM CEST, saeedm@mellanox.com wrote:
>>> From: Moshe Shemesh <moshe@mellanox.com>
>>>
>>> Create mlx5_devlink_health_reporter for FW reporter. The FW reporter
>>> implements devlink_health_reporter diagnose callback.
>>>
>>> The fw reporter diagnose command can be triggered any time by the user
>>> to check current fw status.
>>> In healthy status, it will return clear syndrome. Otherwise it will dump
>>> the health info buffer.
>>>
>>> Command example and output on healthy status:
>>> $ devlink health diagnose pci/0000:82:00.0 reporter fw
>>> Syndrome: 0
>>>
>>> Command example and output on non healthy status:
>>> $ devlink health diagnose pci/0000:82:00.0 reporter fw
>>> diagnose data:
>>> assert_var[0] 0xfc3fc043
>>> assert_var[1] 0x0001b41c
>>> assert_var[2] 0x00000000
>>> assert_var[3] 0x00000000
>>> assert_var[4] 0x00000000
>>> assert_exit_ptr 0x008033b4
>>> assert_callra 0x0080365c
>>> fw_ver 16.24.1000
>>> hw_id 0x0000020d
>>> irisc_index 0
>>> synd 0x8: unrecoverable hardware error
>>> ext_synd 0x003d
>>> raw fw_ver 0x101803e8
>>>
>>> Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
>>> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
>>> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
>> 
>> 	
>> [...]	
>> 	
>> 	
>>> +static int
>>> +mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
>>> +			  struct devlink_fmsg *fmsg)
>>> +{
>>> +	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
>>> +	struct mlx5_core_health *health = &dev->priv.health;
>>> +	u8 synd;
>>> +	int err;
>>> +
>>> +	mutex_lock(&health->info_buf_lock);
>>> +	mlx5_get_health_info(dev, &synd);
>>> +
>>> +	if (!synd) {
>>> +		mutex_unlock(&health->info_buf_lock);
>>> +		return devlink_fmsg_u8_pair_put(fmsg, "Syndrome", synd);
>>> +	}
>>> +
>>> +	err = devlink_fmsg_string_pair_put(fmsg, "diagnose data",
>>> +					   health->info_buf);
>> 
>> No! This is wrong! You are sneaking in text blob. Please put the info in
>> structured form using proper fmsg helpers.
>> 
>This is the fw output format, it is already in use, I don't want to 
>change it, just have it here in the diagnose output too.

Already in use where? in dmesg? Sorry, but that is not an argument.
Please format the message properly.
Saeed Mahameed May 6, 2019, 7:52 p.m. UTC | #4
On Mon, 2019-05-06 at 13:38 +0200, Jiri Pirko wrote:
> Mon, May 06, 2019 at 12:45:44PM CEST, moshe@mellanox.com wrote:
> > 
> > On 5/5/2019 6:42 PM, Jiri Pirko wrote:
> > > Sun, May 05, 2019 at 02:33:23AM CEST, saeedm@mellanox.com wrote:
> > > > From: Moshe Shemesh <moshe@mellanox.com>
> > > > 
> > > > Create mlx5_devlink_health_reporter for FW reporter. The FW
> > > > reporter
> > > > implements devlink_health_reporter diagnose callback.
> > > > 
> > > > The fw reporter diagnose command can be triggered any time by
> > > > the user
> > > > to check current fw status.
> > > > In healthy status, it will return clear syndrome. Otherwise it
> > > > will dump
> > > > the health info buffer.
> > > > 
> > > > Command example and output on healthy status:
> > > > $ devlink health diagnose pci/0000:82:00.0 reporter fw
> > > > Syndrome: 0
> > > > 
> > > > Command example and output on non healthy status:
> > > > $ devlink health diagnose pci/0000:82:00.0 reporter fw
> > > > diagnose data:
> > > > assert_var[0] 0xfc3fc043
> > > > assert_var[1] 0x0001b41c
> > > > assert_var[2] 0x00000000
> > > > assert_var[3] 0x00000000
> > > > assert_var[4] 0x00000000
> > > > assert_exit_ptr 0x008033b4
> > > > assert_callra 0x0080365c
> > > > fw_ver 16.24.1000
> > > > hw_id 0x0000020d
> > > > irisc_index 0
> > > > synd 0x8: unrecoverable hardware error
> > > > ext_synd 0x003d
> > > > raw fw_ver 0x101803e8
> > > > 
> > > > Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
> > > > Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
> > > > Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
> > > 
> > > 	
> > > [...]	
> > > 	
> > > 	
> > > > +static int
> > > > +mlx5_fw_reporter_diagnose(struct devlink_health_reporter
> > > > *reporter,
> > > > +			  struct devlink_fmsg *fmsg)
> > > > +{
> > > > +	struct mlx5_core_dev *dev =
> > > > devlink_health_reporter_priv(reporter);
> > > > +	struct mlx5_core_health *health = &dev->priv.health;
> > > > +	u8 synd;
> > > > +	int err;
> > > > +
> > > > +	mutex_lock(&health->info_buf_lock);
> > > > +	mlx5_get_health_info(dev, &synd);
> > > > +
> > > > +	if (!synd) {
> > > > +		mutex_unlock(&health->info_buf_lock);
> > > > +		return devlink_fmsg_u8_pair_put(fmsg,
> > > > "Syndrome", synd);
> > > > +	}
> > > > +
> > > > +	err = devlink_fmsg_string_pair_put(fmsg, "diagnose
> > > > data",
> > > > +					   health->info_buf);
> > > 
> > > No! This is wrong! You are sneaking in text blob. Please put the
> > > info in
> > > structured form using proper fmsg helpers.
> > > 
> > This is the fw output format, it is already in use, I don't want
> > to 
> > change it, just have it here in the diagnose output too.
> 
> Already in use where? in dmesg? Sorry, but that is not an argument.
> Please format the message properly.
> 

What is wrong here ? 

Unlike binary dump data, I thought diagnose data is allowed to be
developer friendly free text format, if not then let's enforce it in
devlink API. Jiri,  you can't audit each and every use of
devlink_fmsg_string_pair_put, it must be done by design.

Thanks,
Saeed
Alexei Starovoitov May 6, 2019, 9:46 p.m. UTC | #5
On Mon, May 06, 2019 at 07:52:18PM +0000, Saeed Mahameed wrote:
> On Mon, 2019-05-06 at 13:38 +0200, Jiri Pirko wrote:
> > Mon, May 06, 2019 at 12:45:44PM CEST, moshe@mellanox.com wrote:
> > > 
> > > On 5/5/2019 6:42 PM, Jiri Pirko wrote:
> > > > Sun, May 05, 2019 at 02:33:23AM CEST, saeedm@mellanox.com wrote:
> > > > > From: Moshe Shemesh <moshe@mellanox.com>
> > > > > 
> > > > > Create mlx5_devlink_health_reporter for FW reporter. The FW
> > > > > reporter
> > > > > implements devlink_health_reporter diagnose callback.
> > > > > 
> > > > > The fw reporter diagnose command can be triggered any time by
> > > > > the user
> > > > > to check current fw status.
> > > > > In healthy status, it will return clear syndrome. Otherwise it
> > > > > will dump
> > > > > the health info buffer.
> > > > > 
> > > > > Command example and output on healthy status:
> > > > > $ devlink health diagnose pci/0000:82:00.0 reporter fw
> > > > > Syndrome: 0
> > > > > 
> > > > > Command example and output on non healthy status:
> > > > > $ devlink health diagnose pci/0000:82:00.0 reporter fw
> > > > > diagnose data:
> > > > > assert_var[0] 0xfc3fc043
> > > > > assert_var[1] 0x0001b41c
> > > > > assert_var[2] 0x00000000
> > > > > assert_var[3] 0x00000000
> > > > > assert_var[4] 0x00000000
> > > > > assert_exit_ptr 0x008033b4
> > > > > assert_callra 0x0080365c
> > > > > fw_ver 16.24.1000
> > > > > hw_id 0x0000020d
> > > > > irisc_index 0
> > > > > synd 0x8: unrecoverable hardware error
> > > > > ext_synd 0x003d
> > > > > raw fw_ver 0x101803e8
> > > > > 
> > > > > Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
> > > > > Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
> > > > > Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
> > > > 
> > > > 	
> > > > [...]	
> > > > 	
> > > > 	
> > > > > +static int
> > > > > +mlx5_fw_reporter_diagnose(struct devlink_health_reporter
> > > > > *reporter,
> > > > > +			  struct devlink_fmsg *fmsg)
> > > > > +{
> > > > > +	struct mlx5_core_dev *dev =
> > > > > devlink_health_reporter_priv(reporter);
> > > > > +	struct mlx5_core_health *health = &dev->priv.health;
> > > > > +	u8 synd;
> > > > > +	int err;
> > > > > +
> > > > > +	mutex_lock(&health->info_buf_lock);
> > > > > +	mlx5_get_health_info(dev, &synd);
> > > > > +
> > > > > +	if (!synd) {
> > > > > +		mutex_unlock(&health->info_buf_lock);
> > > > > +		return devlink_fmsg_u8_pair_put(fmsg,
> > > > > "Syndrome", synd);
> > > > > +	}
> > > > > +
> > > > > +	err = devlink_fmsg_string_pair_put(fmsg, "diagnose
> > > > > data",
> > > > > +					   health->info_buf);
> > > > 
> > > > No! This is wrong! You are sneaking in text blob. Please put the
> > > > info in
> > > > structured form using proper fmsg helpers.
> > > > 
> > > This is the fw output format, it is already in use, I don't want
> > > to 
> > > change it, just have it here in the diagnose output too.
> > 
> > Already in use where? in dmesg? Sorry, but that is not an argument.
> > Please format the message properly.
> > 
> 
> What is wrong here ? 
> 
> Unlike binary dump data, I thought diagnose data is allowed to be
> developer friendly free text format, if not then let's enforce it in
> devlink API. Jiri,  you can't audit each and every use of
> devlink_fmsg_string_pair_put, it must be done by design.

I agree with the purpose that Jiri is trying to get out of this infra,
but I disagree that strings should be disallowed.

The example messages from commit log are not useful at all:
$ devlink health diagnose pci/0000:82:00.0 reporter fw
diagnose data:
assert_var[0] 0xfc3fc043
assert_var[1] 0x0001b41c
assert_var[2] 0x00000000
assert_var[3] 0x00000000
assert_var[4] 0x00000000
assert_exit_ptr 0x008033b4
assert_callra 0x0080365c
fw_ver 16.24.1000
hw_id 0x0000020d
irisc_index 0
synd 0x8: unrecoverable hardware error
ext_synd 0x003d
raw fw_ver 0x101803e8

Firmware dumps some magic numbers. How this is any better than
what we have today?
Every bit has to have meaningful message.
The point of devlink diag is not to replicate dmesg, but to give
users a clean way to understand and debug the hw.

It seems mlx is adding a new interface and immediately misusing it.
This not an example to give to other vendors.
Jakub Kicinski May 7, 2019, 12:11 a.m. UTC | #6
On Sun, 5 May 2019 00:33:23 +0000, Saeed Mahameed wrote:
> From: Moshe Shemesh <moshe@mellanox.com>
> 
> Create mlx5_devlink_health_reporter for FW reporter. The FW reporter
> implements devlink_health_reporter diagnose callback.
> 
> The fw reporter diagnose command can be triggered any time by the user
> to check current fw status.
> In healthy status, it will return clear syndrome. Otherwise it will dump
> the health info buffer.
> 
> Command example and output on healthy status:
> $ devlink health diagnose pci/0000:82:00.0 reporter fw
> Syndrome: 0
> 
> Command example and output on non healthy status:
> $ devlink health diagnose pci/0000:82:00.0 reporter fw
> diagnose data:
> assert_var[0] 0xfc3fc043
> assert_var[1] 0x0001b41c
> assert_var[2] 0x00000000
> assert_var[3] 0x00000000
> assert_var[4] 0x00000000
> assert_exit_ptr 0x008033b4
> assert_callra 0x0080365c
> fw_ver 16.24.1000

Does FW version really have to be duplicated in this API?

> hw_id 0x0000020d
> irisc_index 0
> synd 0x8: unrecoverable hardware error
> ext_synd 0x003d
> raw fw_ver 0x101803e8
Jiri Pirko May 7, 2019, 5:59 a.m. UTC | #7
Mon, May 06, 2019 at 11:46:43PM CEST, alexei.starovoitov@gmail.com wrote:
>On Mon, May 06, 2019 at 07:52:18PM +0000, Saeed Mahameed wrote:
>> On Mon, 2019-05-06 at 13:38 +0200, Jiri Pirko wrote:
>> > Mon, May 06, 2019 at 12:45:44PM CEST, moshe@mellanox.com wrote:
>> > > 
>> > > On 5/5/2019 6:42 PM, Jiri Pirko wrote:
>> > > > Sun, May 05, 2019 at 02:33:23AM CEST, saeedm@mellanox.com wrote:
>> > > > > From: Moshe Shemesh <moshe@mellanox.com>
>> > > > > 
>> > > > > Create mlx5_devlink_health_reporter for FW reporter. The FW
>> > > > > reporter
>> > > > > implements devlink_health_reporter diagnose callback.
>> > > > > 
>> > > > > The fw reporter diagnose command can be triggered any time by
>> > > > > the user
>> > > > > to check current fw status.
>> > > > > In healthy status, it will return clear syndrome. Otherwise it
>> > > > > will dump
>> > > > > the health info buffer.
>> > > > > 
>> > > > > Command example and output on healthy status:
>> > > > > $ devlink health diagnose pci/0000:82:00.0 reporter fw
>> > > > > Syndrome: 0
>> > > > > 
>> > > > > Command example and output on non healthy status:
>> > > > > $ devlink health diagnose pci/0000:82:00.0 reporter fw
>> > > > > diagnose data:
>> > > > > assert_var[0] 0xfc3fc043
>> > > > > assert_var[1] 0x0001b41c
>> > > > > assert_var[2] 0x00000000
>> > > > > assert_var[3] 0x00000000
>> > > > > assert_var[4] 0x00000000
>> > > > > assert_exit_ptr 0x008033b4
>> > > > > assert_callra 0x0080365c
>> > > > > fw_ver 16.24.1000
>> > > > > hw_id 0x0000020d
>> > > > > irisc_index 0
>> > > > > synd 0x8: unrecoverable hardware error
>> > > > > ext_synd 0x003d
>> > > > > raw fw_ver 0x101803e8
>> > > > > 
>> > > > > Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
>> > > > > Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
>> > > > > Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
>> > > > 
>> > > > 	
>> > > > [...]	
>> > > > 	
>> > > > 	
>> > > > > +static int
>> > > > > +mlx5_fw_reporter_diagnose(struct devlink_health_reporter
>> > > > > *reporter,
>> > > > > +			  struct devlink_fmsg *fmsg)
>> > > > > +{
>> > > > > +	struct mlx5_core_dev *dev =
>> > > > > devlink_health_reporter_priv(reporter);
>> > > > > +	struct mlx5_core_health *health = &dev->priv.health;
>> > > > > +	u8 synd;
>> > > > > +	int err;
>> > > > > +
>> > > > > +	mutex_lock(&health->info_buf_lock);
>> > > > > +	mlx5_get_health_info(dev, &synd);
>> > > > > +
>> > > > > +	if (!synd) {
>> > > > > +		mutex_unlock(&health->info_buf_lock);
>> > > > > +		return devlink_fmsg_u8_pair_put(fmsg,
>> > > > > "Syndrome", synd);
>> > > > > +	}
>> > > > > +
>> > > > > +	err = devlink_fmsg_string_pair_put(fmsg, "diagnose
>> > > > > data",
>> > > > > +					   health->info_buf);
>> > > > 
>> > > > No! This is wrong! You are sneaking in text blob. Please put the
>> > > > info in
>> > > > structured form using proper fmsg helpers.
>> > > > 
>> > > This is the fw output format, it is already in use, I don't want
>> > > to 
>> > > change it, just have it here in the diagnose output too.
>> > 
>> > Already in use where? in dmesg? Sorry, but that is not an argument.
>> > Please format the message properly.
>> > 
>> 
>> What is wrong here ? 
>> 
>> Unlike binary dump data, I thought diagnose data is allowed to be
>> developer friendly free text format, if not then let's enforce it in
>> devlink API. Jiri,  you can't audit each and every use of
>> devlink_fmsg_string_pair_put, it must be done by design.
>
>I agree with the purpose that Jiri is trying to get out of this infra,
>but I disagree that strings should be disallowed.

Not strings in general, I have nothing against them. If they are plain.
But when you push structured data into strings, for example using
sprintf for ints etc, that is a problem I'm pointing at.
Instead of that, the structured data should be formatted using
proper fmsg helpers. We already have them in kernel, it is just about
using them and not taking shortcuts.


>
>The example messages from commit log are not useful at all:
>$ devlink health diagnose pci/0000:82:00.0 reporter fw
>diagnose data:
>assert_var[0] 0xfc3fc043
>assert_var[1] 0x0001b41c
>assert_var[2] 0x00000000
>assert_var[3] 0x00000000
>assert_var[4] 0x00000000
>assert_exit_ptr 0x008033b4
>assert_callra 0x0080365c
>fw_ver 16.24.1000
>hw_id 0x0000020d
>irisc_index 0
>synd 0x8: unrecoverable hardware error
>ext_synd 0x003d
>raw fw_ver 0x101803e8
>
>Firmware dumps some magic numbers. How this is any better than
>what we have today?
>Every bit has to have meaningful message.
>The point of devlink diag is not to replicate dmesg, but to give
>users a clean way to understand and debug the hw.
>
>It seems mlx is adding a new interface and immediately misusing it.
>This not an example to give to other vendors.

I agree.
Jiri Pirko May 7, 2019, 6:01 a.m. UTC | #8
Mon, May 06, 2019 at 09:52:18PM CEST, saeedm@mellanox.com wrote:
>On Mon, 2019-05-06 at 13:38 +0200, Jiri Pirko wrote:
>> Mon, May 06, 2019 at 12:45:44PM CEST, moshe@mellanox.com wrote:
>> > 
>> > On 5/5/2019 6:42 PM, Jiri Pirko wrote:
>> > > Sun, May 05, 2019 at 02:33:23AM CEST, saeedm@mellanox.com wrote:
>> > > > From: Moshe Shemesh <moshe@mellanox.com>
>> > > > 
>> > > > Create mlx5_devlink_health_reporter for FW reporter. The FW
>> > > > reporter
>> > > > implements devlink_health_reporter diagnose callback.
>> > > > 
>> > > > The fw reporter diagnose command can be triggered any time by
>> > > > the user
>> > > > to check current fw status.
>> > > > In healthy status, it will return clear syndrome. Otherwise it
>> > > > will dump
>> > > > the health info buffer.
>> > > > 
>> > > > Command example and output on healthy status:
>> > > > $ devlink health diagnose pci/0000:82:00.0 reporter fw
>> > > > Syndrome: 0
>> > > > 
>> > > > Command example and output on non healthy status:
>> > > > $ devlink health diagnose pci/0000:82:00.0 reporter fw
>> > > > diagnose data:
>> > > > assert_var[0] 0xfc3fc043
>> > > > assert_var[1] 0x0001b41c
>> > > > assert_var[2] 0x00000000
>> > > > assert_var[3] 0x00000000
>> > > > assert_var[4] 0x00000000
>> > > > assert_exit_ptr 0x008033b4
>> > > > assert_callra 0x0080365c
>> > > > fw_ver 16.24.1000
>> > > > hw_id 0x0000020d
>> > > > irisc_index 0
>> > > > synd 0x8: unrecoverable hardware error
>> > > > ext_synd 0x003d
>> > > > raw fw_ver 0x101803e8
>> > > > 
>> > > > Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
>> > > > Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
>> > > > Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
>> > > 
>> > > 	
>> > > [...]	
>> > > 	
>> > > 	
>> > > > +static int
>> > > > +mlx5_fw_reporter_diagnose(struct devlink_health_reporter
>> > > > *reporter,
>> > > > +			  struct devlink_fmsg *fmsg)
>> > > > +{
>> > > > +	struct mlx5_core_dev *dev =
>> > > > devlink_health_reporter_priv(reporter);
>> > > > +	struct mlx5_core_health *health = &dev->priv.health;
>> > > > +	u8 synd;
>> > > > +	int err;
>> > > > +
>> > > > +	mutex_lock(&health->info_buf_lock);
>> > > > +	mlx5_get_health_info(dev, &synd);
>> > > > +
>> > > > +	if (!synd) {
>> > > > +		mutex_unlock(&health->info_buf_lock);
>> > > > +		return devlink_fmsg_u8_pair_put(fmsg,
>> > > > "Syndrome", synd);
>> > > > +	}
>> > > > +
>> > > > +	err = devlink_fmsg_string_pair_put(fmsg, "diagnose
>> > > > data",
>> > > > +					   health->info_buf);
>> > > 
>> > > No! This is wrong! You are sneaking in text blob. Please put the
>> > > info in
>> > > structured form using proper fmsg helpers.
>> > > 
>> > This is the fw output format, it is already in use, I don't want
>> > to 
>> > change it, just have it here in the diagnose output too.
>> 
>> Already in use where? in dmesg? Sorry, but that is not an argument.
>> Please format the message properly.
>> 
>
>What is wrong here ? 
>
>Unlike binary dump data, I thought diagnose data is allowed to be
>developer friendly free text format, if not then let's enforce it in
>devlink API. Jiri,  you can't audit each and every use of
>devlink_fmsg_string_pair_put, it must be done by design.

No way to enforce it. Strings need to be there in general. But drivers
have to use them wisely, only for plain values. Unlike this patch, which
is pushing structured text blob in it.
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index a3c7e46aafd9..9ffa9c7f81a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -428,6 +428,58 @@  static void mlx5_print_health_info(struct mlx5_core_dev *dev)
 	mutex_unlock(&health->info_buf_lock);
 }
 
+static int
+mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
+			  struct devlink_fmsg *fmsg)
+{
+	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
+	struct mlx5_core_health *health = &dev->priv.health;
+	u8 synd;
+	int err;
+
+	mutex_lock(&health->info_buf_lock);
+	mlx5_get_health_info(dev, &synd);
+
+	if (!synd) {
+		mutex_unlock(&health->info_buf_lock);
+		return devlink_fmsg_u8_pair_put(fmsg, "Syndrome", synd);
+	}
+
+	err = devlink_fmsg_string_pair_put(fmsg, "diagnose data",
+					   health->info_buf);
+
+	mutex_unlock(&health->info_buf_lock);
+	return err;
+}
+
+static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
+		.name = "fw",
+		.diagnose = mlx5_fw_reporter_diagnose,
+};
+
+static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct devlink *devlink = priv_to_devlink(dev);
+
+	health->fw_reporter =
+		devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
+					       0, false, dev);
+	if (IS_ERR(health->fw_reporter))
+		mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n",
+			       PTR_ERR(health->fw_reporter));
+}
+
+static void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	if (IS_ERR_OR_NULL(health->fw_reporter))
+		return;
+
+	devlink_health_reporter_destroy(health->fw_reporter);
+}
+
 static unsigned long get_next_poll_jiffies(void)
 {
 	unsigned long next;
@@ -539,6 +591,7 @@  void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 
 	kfree(health->info_buf);
 	destroy_workqueue(health->wq);
+	mlx5_fw_reporter_destroy(dev);
 }
 
 int mlx5_health_init(struct mlx5_core_dev *dev)
@@ -565,6 +618,8 @@  int mlx5_health_init(struct mlx5_core_dev *dev)
 		goto err_alloc_buff;
 	mutex_init(&health->info_buf_lock);
 
+	mlx5_fw_reporter_create(dev);
+
 	return 0;
 
 err_alloc_buff:
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index df8f4c4e21c6..a362aa6c799c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -448,6 +448,7 @@  struct mlx5_core_health {
 	int				info_buf_len;
 	/* protect info buf access */
 	struct mutex			info_buf_lock;
+	struct devlink_health_reporter *fw_reporter;
 };
 
 struct mlx5_qp_table {