diff mbox series

[iproute2-next,v2] devlink: Add health command support

Message ID 1547976434-4142-1-git-send-email-ayal@mellanox.com
State Changes Requested
Delegated to: David Ahern
Headers show
Series [iproute2-next,v2] devlink: Add health command support | expand

Commit Message

Aya Levin Jan. 20, 2019, 9:27 a.m. UTC
This patch add support for the following commands:
Devlink health show [DEV reporter REPORTE_NAME],
Devlink health recover DEV reporter REPORTER_NAME,
Devlink health diagnose DEV reporter REPORTER_NAME,
Devlink health dump show DEV reporter REPORTER_NAME,
Devlink health dump clear DEV reporter REPORTER_NAME,
Devlink health set DEV reporter REPORTER_NAME NAME VALUE

 * Devlink health show command displays status and configuration info on
   specific reporter on a device or dump the info on all reporters on all
   devices.
 * Devlink health recover enables the user to initiate a
   recovery on a reporter. This operation will increment the recoveries
   counter displayed in the show command.
 * Devlink health diagnose enables the user to retrieve diagnostics data
   on a reporter on a device. The command's output is a free text defined
   by the reporter.
 * Devlink health dump show displays the last saved dump. Devlink
   health saves a single dump. If an dump is not already stored by
   the Devlink for this reporter, Devlink generates a new dump. The
   dump can be generated automatically when a reporter reports on an
   error or by the user's request.
   dump output is defined by the reporter.
 * Devlink health dump clear, deletes the last saved dump file.
 * Devlink health set, enables the user to configure:
	1) grace_period [msec] time interval between auto recoveries.
	2) auto_recover [true/false] whether the devlink should execute
	automatic recover on error.

Examples:
$devlink health show pci/0000:00:09.0 reporter TX
pci/0000:00:09.0:
  name TX 
  state healthy #err 0 #recover 1 last_dump_ts N/A dump_available false
  attributes:
    grace period 600 auto_recover true
$devlink health diagnose pci/0000:00:09.0 reporter TX
SQ 0x9b: HW state: 1 stopped: 0
SQ 0x9f: HW state: 1 stopped: 0
SQ 0xa3: HW state: 1 stopped: 0
SQ 0xa7: HW state: 1 stopped: 0
SQ 0xab: HW state: 1 stopped: 0
$devlink health dump show pci/0000:00:09.0 reporter TX
TX dump data
$devlink health dump clear pci/0000:00:09.0 reporter TX
$devlink health set pci/0000:00:09.0 reporter TX grace_period 3500
$devlink health set pci/0000:00:09.0 reporter TX auto_recover false

Signed-off-by: Aya Levin <ayal@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
---
 devlink/devlink.c            | 536 ++++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/devlink.h |  29 +++
 man/man8/devlink-health.8    | 175 ++++++++++++++
 man/man8/devlink.8           |   7 +-
 4 files changed, 743 insertions(+), 4 deletions(-)
 create mode 100644 man/man8/devlink-health.8

Comments

David Ahern Jan. 23, 2019, 3:37 a.m. UTC | #1
On 1/20/19 2:27 AM, Aya Levin wrote:
> diff --git a/devlink/devlink.c b/devlink/devlink.c
> index 3651e90c1159..9fc19668ccd0 100644
> --- a/devlink/devlink.c
> +++ b/devlink/devlink.c
> @@ -1,4 +1,5 @@
>  /*
> + *

extra newline

>   * devlink.c	Devlink tool
>   *
>   *              This program is free software; you can redistribute it and/or
> @@ -22,7 +23,7 @@
>  #include <linux/devlink.h>
>  #include <libmnl/libmnl.h>
>  #include <netinet/ether.h>
> -
> +#include <sys/sysinfo.h>
>  #include "SNAPSHOT.h"
>  #include "list.h"
>  #include "mnlg.h"
> @@ -40,6 +41,12 @@
>  #define PARAM_CMODE_DRIVERINIT_STR "driverinit"
>  #define PARAM_CMODE_PERMANENT_STR "permanent"
>  
> +#define HEALTH_REPORTER_STATE_HEALTHY_STR "healthy"
> +#define HEALTH_REPORTER_STATE_ERROR_STR "error"
> +#define HEALTH_REPORTER_GRACE_PERIOD_STR "grace_period"
> +#define HEALTH_REPORTER_AUTO_RECOVER_STR "auto_recover"
> +#define HEALTH_REPORTER_TS_LEN 80
> +
>  static int g_new_line_count;
>  
>  #define pr_err(args...) fprintf(stderr, ##args)
> @@ -199,6 +206,10 @@ static void ifname_map_free(struct ifname_map *ifname_map)
>  #define DL_OPT_REGION_SNAPSHOT_ID	BIT(22)
>  #define DL_OPT_REGION_ADDRESS		BIT(23)
>  #define DL_OPT_REGION_LENGTH		BIT(24)
> +#define DL_OPT_HANDLE_HEALTH		BIT(25)
> +#define DL_OPT_HEALTH_REPORTER_NAME	BIT(26)
> +#define DL_OPT_HEALTH_REPORTER_DEV	BIT(27)
> +
>  

extra newline


>  struct dl_opts {
>  	uint32_t present; /* flags of present items */
> @@ -230,6 +241,10 @@ struct dl_opts {
>  	uint32_t region_snapshot_id;
>  	uint64_t region_address;
>  	uint64_t region_length;
> +	const char *reporter_name;
> +	const char *reporter_param_name;
> +	const char *reporter_param_value;
> +
>  };
>  
>  struct dl {
> @@ -959,7 +974,7 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required,
>  		if (err)
>  			return err;
>  		o_found |= handle_bit;
> -	} else if (o_required & DL_OPT_HANDLE) {
> +	} else if (DL_OPT_HANDLE) {

typo? Seems like o_required is required.


>  		err = dl_argv_handle(dl, &opts->bus_name, &opts->dev_name);
>  		if (err)
>  			return err;
> @@ -1178,6 +1193,15 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required,
>  			if (err)
>  				return err;
>  			o_found |= DL_OPT_REGION_LENGTH;
> +		} else if (dl_argv_match(dl, "reporter") &&
> +			   (o_all & DL_OPT_HEALTH_REPORTER_NAME)) {
> +			dl_arg_inc(dl);
> +			err = dl_argv_str(dl, &opts->reporter_name);
> +			if (err)
> +				return err;
> +			o_found |= DL_OPT_HEALTH_REPORTER_NAME;
> +			o_found |= DL_OPT_HANDLE;
> +			break;

why the break? It is the only option that breaks after a match. If it is
required, please add a comment why for future readers.


>  		} else {
>  			pr_err("Unknown option \"%s\"\n", dl_argv(dl));
>  			return -EINVAL;
> @@ -1298,6 +1322,12 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required,
>  		return -EINVAL;
>  	}
>  
> +	if ((o_required & DL_OPT_HEALTH_REPORTER_NAME) &&
> +	    !(o_found & DL_OPT_HEALTH_REPORTER_NAME)) {
> +		pr_err("Reporter name expected.\n");
> +		return -EINVAL;
> +	}

I realize your are following suit with this change, but these checks for
'required and not found' are getting really long. There is a better way
to do this.


> +
>  	return 0;
>  }
>  
> @@ -1382,6 +1412,9 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl)
>  	if (opts->present & DL_OPT_REGION_LENGTH)
>  		mnl_attr_put_u64(nlh, DEVLINK_ATTR_REGION_CHUNK_LEN,
>  				 opts->region_length);
> +	if (opts->present & DL_OPT_HEALTH_REPORTER_NAME)
> +		mnl_attr_put_strz(nlh, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
> +				  opts->reporter_name);
>  }
>  
>  static int dl_argv_parse_put(struct nlmsghdr *nlh, struct dl *dl,
> @@ -1513,6 +1546,8 @@ static void __pr_out_handle_start(struct dl *dl, struct nlattr **tb,
>  				__pr_out_newline();
>  				__pr_out_indent_inc();
>  				arr_last_handle_set(dl, bus_name, dev_name);
> +			} else {
> +				__pr_out_indent_inc();
>  			}
>  		} else {
>  			pr_out("%s%s", buf, content ? ":" : "");
> @@ -5557,11 +5592,502 @@ static int cmd_region(struct dl *dl)
>  	return -ENOENT;
>  }
>  
> +static int cmd_health_set_params(struct dl *dl)
> +{
> +	struct nlmsghdr *nlh;
> +	uint64_t period;
> +	bool auto_recover;
> +	int err;
> +
> +	nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_HEALTH_REPORTER_SET,
> +			       NLM_F_REQUEST | NLM_F_ACK);
> +	err = dl_argv_parse(dl, DL_OPT_HANDLE |
> +			    DL_OPT_HEALTH_REPORTER_NAME, 0);
> +	if (err)
> +		return err;
> +
> +	err = dl_argv_str(dl, &dl->opts.reporter_param_name);
> +	if (err)
> +		return err;
> +	err = dl_argv_str(dl, &dl->opts.reporter_param_value);
> +	if (err)
> +		return err;
> +	dl_opts_put(nlh, dl);
> +
> +	if (!strncmp(dl->opts.reporter_param_name,
> +		     HEALTH_REPORTER_GRACE_PERIOD_STR, strlen("garce"))) {
> +		err = strtouint64_t(dl->opts.reporter_param_value, &period);
> +		if (err)
> +			goto err_param_value_parse;
> +		mnl_attr_put_u64(nlh, DEVLINK_ATTR_HEALTH_REPORTER_PERIOD,
> +				 period);
> +	} else if (!strncmp(dl->opts.reporter_param_name,
> +			    HEALTH_REPORTER_AUTO_RECOVER_STR,
> +			    strlen("auto"))) {
> +		err = strtobool(dl->opts.reporter_param_value, &auto_recover);
> +		if (err)
> +			goto err_param_value_parse;
> +		mnl_attr_put_u8(nlh, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_REC,
> +				(uint8_t)auto_recover);
> +	} else {
> +		printf("Parameter name: %s  is not supported\n",
> +		       dl->opts.reporter_param_name);
> +		return -ENOTSUP;
> +	}
> +
> +	return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL);
> +
> +err_param_value_parse:
> +	pr_err("Value \"%s\" is not a number or not within range\n",
> +	       dl->opts.param_value);
> +	return err;
> +}
> +
> +static int cmd_health_dump_clear(struct dl *dl)
> +{
> +	struct nlmsghdr *nlh;
> +	int err;
> +
> +	nlh = mnlg_msg_prepare(dl->nlg,
> +			       DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
> +			       NLM_F_REQUEST | NLM_F_ACK);
> +
> +	err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE |
> +				DL_OPT_HEALTH_REPORTER_NAME, 0);
> +	if (err)
> +		return err;
> +
> +	dl_opts_put(nlh, dl);
> +	return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL);
> +}
> +
> +static int health_value_show(struct dl *dl, int type, struct nlattr *nl_data)
> +{
> +	const char *str;
> +	uint8_t *data;
> +	uint32_t len;
> +	uint64_t u64;
> +	uint32_t u32;
> +	uint16_t u16;
> +	uint8_t u8;
> +	int i;
> +
> +	switch (type) {
> +	case MNL_TYPE_FLAG:
> +		if (dl->json_output)
> +			jsonw_string(dl->jw, nl_data ? "true" : "false");
> +		else
> +			pr_out("%s ", nl_data ? "true" : "false");
> +		break;
> +	case MNL_TYPE_U8:
> +		u8 = mnl_attr_get_u8(nl_data);
> +		if (dl->json_output)
> +			jsonw_uint(dl->jw, u8);
> +		else
> +			pr_out("%u ", u8);
> +		break;
> +	case MNL_TYPE_U16:
> +		u16 = mnl_attr_get_u16(nl_data);
> +		if (dl->json_output)
> +			jsonw_uint(dl->jw, u16);
> +		else
> +			pr_out("%u ", u16);
> +		break;
> +	case MNL_TYPE_U32:
> +		u32 = mnl_attr_get_u32(nl_data);
> +		if (dl->json_output)
> +			jsonw_uint(dl->jw, u32);
> +		else
> +			pr_out("%u ", u32);
> +		break;
> +	case MNL_TYPE_U64:
> +		u64 = mnl_attr_get_u64(nl_data);
> +		if (dl->json_output)
> +			jsonw_u64(dl->jw, u64);
> +		else
> +			pr_out("%lu ", u64);
> +		break;
> +	case MNL_TYPE_STRING:
> +	case MNL_TYPE_NUL_STRING:
> +		str = mnl_attr_get_str(nl_data);
> +		if (dl->json_output)
> +			jsonw_string(dl->jw, str);
> +		else
> +			pr_out("%s ", str);
> +		break;
> +	case MNL_TYPE_BINARY:
> +		len = mnl_attr_get_payload_len(nl_data);
> +		data = mnl_attr_get_payload(nl_data);
> +		i = 0;
> +		while (i < len) {
> +			if (dl->json_output)
> +				jsonw_printf(dl->jw, "%d", data[i]);
> +			else
> +				pr_out("%02x ", data[i]);
> +			i++;
> +		}

If 'len' is an arbitrary size then line lengths can be ridiculous here.


> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +	return MNL_CB_OK;
> +}
> +

...

> +static int jiffies_to_logtime(uint64_t jiffies, char *output)
> +{
> +	struct sysinfo s_info;
> +	struct tm *info;
> +	time_t now, sec;
> +	int err;
> +
> +	time(&now);
> +	info = localtime(&now);
> +	strftime(output, HEALTH_REPORTER_TS_LEN, "%Y-%b-%d %H:%M:%S", info);

This strftime should really be done in the error path of sysinfo as
opposed to every call to this function calling strftime twice.

> +	err = sysinfo(&s_info);
> +	if (err)
> +		return err;
> +	/*substruct uptime in sec from now
> +	 * add jiffies and 5 minutes factor*/

fix the comment style.


> +	sec = now - (s_info.uptime - jiffies/1000) + 300;
> +	info = localtime(&sec);
> +	strftime(output, HEALTH_REPORTER_TS_LEN, "%Y-%b-%d %H:%M:%S", info);
> +
> +	return 0;
> +}
> +
Aya Levin Jan. 23, 2019, 11:27 a.m. UTC | #2
נכתב על ידי David Ahern, ב־1/23/2019 בשעה 5:37 AM:
> On 1/20/19 2:27 AM, Aya Levin wrote:
>> diff --git a/devlink/devlink.c b/devlink/devlink.c
>> index 3651e90c1159..9fc19668ccd0 100644
>> --- a/devlink/devlink.c
>> +++ b/devlink/devlink.c
>> @@ -1,4 +1,5 @@
>>   /*
>> + *
> 
> extra newline
> 
>>    * devlink.c	Devlink tool
>>    *
>>    *              This program is free software; you can redistribute it and/or
>> @@ -22,7 +23,7 @@
>>   #include <linux/devlink.h>
>>   #include <libmnl/libmnl.h>
>>   #include <netinet/ether.h>
>> -
>> +#include <sys/sysinfo.h>
>>   #include "SNAPSHOT.h"
>>   #include "list.h"
>>   #include "mnlg.h"
>> @@ -40,6 +41,12 @@
>>   #define PARAM_CMODE_DRIVERINIT_STR "driverinit"
>>   #define PARAM_CMODE_PERMANENT_STR "permanent"
>>   
>> +#define HEALTH_REPORTER_STATE_HEALTHY_STR "healthy"
>> +#define HEALTH_REPORTER_STATE_ERROR_STR "error"
>> +#define HEALTH_REPORTER_GRACE_PERIOD_STR "grace_period"
>> +#define HEALTH_REPORTER_AUTO_RECOVER_STR "auto_recover"
>> +#define HEALTH_REPORTER_TS_LEN 80
>> +
>>   static int g_new_line_count;
>>   
>>   #define pr_err(args...) fprintf(stderr, ##args)
>> @@ -199,6 +206,10 @@ static void ifname_map_free(struct ifname_map *ifname_map)
>>   #define DL_OPT_REGION_SNAPSHOT_ID	BIT(22)
>>   #define DL_OPT_REGION_ADDRESS		BIT(23)
>>   #define DL_OPT_REGION_LENGTH		BIT(24)
>> +#define DL_OPT_HANDLE_HEALTH		BIT(25)
>> +#define DL_OPT_HEALTH_REPORTER_NAME	BIT(26)
>> +#define DL_OPT_HEALTH_REPORTER_DEV	BIT(27)
>> +
>>   
> 
> extra newline
> 
> 
>>   struct dl_opts {
>>   	uint32_t present; /* flags of present items */
>> @@ -230,6 +241,10 @@ struct dl_opts {
>>   	uint32_t region_snapshot_id;
>>   	uint64_t region_address;
>>   	uint64_t region_length;
>> +	const char *reporter_name;
>> +	const char *reporter_param_name;
>> +	const char *reporter_param_value;
>> +
>>   };
>>   
>>   struct dl {
>> @@ -959,7 +974,7 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required,
>>   		if (err)
>>   			return err;
>>   		o_found |= handle_bit;
>> -	} else if (o_required & DL_OPT_HANDLE) {
>> +	} else if (DL_OPT_HANDLE) {
> 
> typo? Seems like o_required is required.
typo should have been o_all
> 
> 
>>   		err = dl_argv_handle(dl, &opts->bus_name, &opts->dev_name);
>>   		if (err)
>>   			return err;
>> @@ -1178,6 +1193,15 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required,
>>   			if (err)
>>   				return err;
>>   			o_found |= DL_OPT_REGION_LENGTH;
>> +		} else if (dl_argv_match(dl, "reporter") &&
>> +			   (o_all & DL_OPT_HEALTH_REPORTER_NAME)) {
>> +			dl_arg_inc(dl);
>> +			err = dl_argv_str(dl, &opts->reporter_name);
>> +			if (err)
>> +				return err;
>> +			o_found |= DL_OPT_HEALTH_REPORTER_NAME;
>> +			o_found |= DL_OPT_HANDLE;
>> +			break;
> 
> why the break? It is the only option that breaks after a match. If it is
> required, please add a comment why for future readers.
> 
> 
>>   		} else {
>>   			pr_err("Unknown option \"%s\"\n", dl_argv(dl));
>>   			return -EINVAL;
>> @@ -1298,6 +1322,12 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required,
>>   		return -EINVAL;
>>   	}
>>   
>> +	if ((o_required & DL_OPT_HEALTH_REPORTER_NAME) &&
>> +	    !(o_found & DL_OPT_HEALTH_REPORTER_NAME)) {
>> +		pr_err("Reporter name expected.\n");
>> +		return -EINVAL;
>> +	}
> 
> I realize your are following suit with this change, but these checks for
> 'required and not found' are getting really long. There is a better way
> to do this.
Do you mean somthing like:
#define requiered_not_found(o_found, o_required, flag, msg)            \
           ({                                                           \
                   if ((o_required & flag) && !(o_found & flag)) {      \
                           pr_err("%s \n", msg);                        \
                           return -EINVAL;                              \
                   }                                                    \
           })

> 
> 
>> +
>>   	return 0;
>>   }
>>   
>> @@ -1382,6 +1412,9 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl)
>>   	if (opts->present & DL_OPT_REGION_LENGTH)
>>   		mnl_attr_put_u64(nlh, DEVLINK_ATTR_REGION_CHUNK_LEN,
>>   				 opts->region_length);
>> +	if (opts->present & DL_OPT_HEALTH_REPORTER_NAME)
>> +		mnl_attr_put_strz(nlh, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
>> +				  opts->reporter_name);
>>   }
>>   
>>   static int dl_argv_parse_put(struct nlmsghdr *nlh, struct dl *dl,
>> @@ -1513,6 +1546,8 @@ static void __pr_out_handle_start(struct dl *dl, struct nlattr **tb,
>>   				__pr_out_newline();
>>   				__pr_out_indent_inc();
>>   				arr_last_handle_set(dl, bus_name, dev_name);
>> +			} else {
>> +				__pr_out_indent_inc();
>>   			}
>>   		} else {
>>   			pr_out("%s%s", buf, content ? ":" : "");
>> @@ -5557,11 +5592,502 @@ static int cmd_region(struct dl *dl)
>>   	return -ENOENT;
>>   }
>>   
>> +static int cmd_health_set_params(struct dl *dl)
>> +{
>> +	struct nlmsghdr *nlh;
>> +	uint64_t period;
>> +	bool auto_recover;
>> +	int err;
>> +
>> +	nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_HEALTH_REPORTER_SET,
>> +			       NLM_F_REQUEST | NLM_F_ACK);
>> +	err = dl_argv_parse(dl, DL_OPT_HANDLE |
>> +			    DL_OPT_HEALTH_REPORTER_NAME, 0);
>> +	if (err)
>> +		return err;
>> +
>> +	err = dl_argv_str(dl, &dl->opts.reporter_param_name);
>> +	if (err)
>> +		return err;
>> +	err = dl_argv_str(dl, &dl->opts.reporter_param_value);
>> +	if (err)
>> +		return err;
>> +	dl_opts_put(nlh, dl);
>> +
>> +	if (!strncmp(dl->opts.reporter_param_name,
>> +		     HEALTH_REPORTER_GRACE_PERIOD_STR, strlen("garce"))) {
>> +		err = strtouint64_t(dl->opts.reporter_param_value, &period);
>> +		if (err)
>> +			goto err_param_value_parse;
>> +		mnl_attr_put_u64(nlh, DEVLINK_ATTR_HEALTH_REPORTER_PERIOD,
>> +				 period);
>> +	} else if (!strncmp(dl->opts.reporter_param_name,
>> +			    HEALTH_REPORTER_AUTO_RECOVER_STR,
>> +			    strlen("auto"))) {
>> +		err = strtobool(dl->opts.reporter_param_value, &auto_recover);
>> +		if (err)
>> +			goto err_param_value_parse;
>> +		mnl_attr_put_u8(nlh, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_REC,
>> +				(uint8_t)auto_recover);
>> +	} else {
>> +		printf("Parameter name: %s  is not supported\n",
>> +		       dl->opts.reporter_param_name);
>> +		return -ENOTSUP;
>> +	}
>> +
>> +	return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL);
>> +
>> +err_param_value_parse:
>> +	pr_err("Value \"%s\" is not a number or not within range\n",
>> +	       dl->opts.param_value);
>> +	return err;
>> +}
>> +
>> +static int cmd_health_dump_clear(struct dl *dl)
>> +{
>> +	struct nlmsghdr *nlh;
>> +	int err;
>> +
>> +	nlh = mnlg_msg_prepare(dl->nlg,
>> +			       DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
>> +			       NLM_F_REQUEST | NLM_F_ACK);
>> +
>> +	err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE |
>> +				DL_OPT_HEALTH_REPORTER_NAME, 0);
>> +	if (err)
>> +		return err;
>> +
>> +	dl_opts_put(nlh, dl);
>> +	return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL);
>> +}
>> +
>> +static int health_value_show(struct dl *dl, int type, struct nlattr *nl_data)
>> +{
>> +	const char *str;
>> +	uint8_t *data;
>> +	uint32_t len;
>> +	uint64_t u64;
>> +	uint32_t u32;
>> +	uint16_t u16;
>> +	uint8_t u8;
>> +	int i;
>> +
>> +	switch (type) {
>> +	case MNL_TYPE_FLAG:
>> +		if (dl->json_output)
>> +			jsonw_string(dl->jw, nl_data ? "true" : "false");
>> +		else
>> +			pr_out("%s ", nl_data ? "true" : "false");
>> +		break;
>> +	case MNL_TYPE_U8:
>> +		u8 = mnl_attr_get_u8(nl_data);
>> +		if (dl->json_output)
>> +			jsonw_uint(dl->jw, u8);
>> +		else
>> +			pr_out("%u ", u8);
>> +		break;
>> +	case MNL_TYPE_U16:
>> +		u16 = mnl_attr_get_u16(nl_data);
>> +		if (dl->json_output)
>> +			jsonw_uint(dl->jw, u16);
>> +		else
>> +			pr_out("%u ", u16);
>> +		break;
>> +	case MNL_TYPE_U32:
>> +		u32 = mnl_attr_get_u32(nl_data);
>> +		if (dl->json_output)
>> +			jsonw_uint(dl->jw, u32);
>> +		else
>> +			pr_out("%u ", u32);
>> +		break;
>> +	case MNL_TYPE_U64:
>> +		u64 = mnl_attr_get_u64(nl_data);
>> +		if (dl->json_output)
>> +			jsonw_u64(dl->jw, u64);
>> +		else
>> +			pr_out("%lu ", u64);
>> +		break;
>> +	case MNL_TYPE_STRING:
>> +	case MNL_TYPE_NUL_STRING:
>> +		str = mnl_attr_get_str(nl_data);
>> +		if (dl->json_output)
>> +			jsonw_string(dl->jw, str);
>> +		else
>> +			pr_out("%s ", str);
>> +		break;
>> +	case MNL_TYPE_BINARY:
>> +		len = mnl_attr_get_payload_len(nl_data);
>> +		data = mnl_attr_get_payload(nl_data);
>> +		i = 0;
>> +		while (i < len) {
>> +			if (dl->json_output)
>> +				jsonw_printf(dl->jw, "%d", data[i]);
>> +			else
>> +				pr_out("%02x ", data[i]);
>> +			i++;
>> +		}
> 
> If 'len' is an arbitrary size then line lengths can be ridiculous here.
> 
> 
>> +		break;
>> +	default:
>> +		return -EINVAL;
>> +	}
>> +	return MNL_CB_OK;
>> +}
>> +
> 
> ...
> 
>> +static int jiffies_to_logtime(uint64_t jiffies, char *output)
>> +{
>> +	struct sysinfo s_info;
>> +	struct tm *info;
>> +	time_t now, sec;
>> +	int err;
>> +
>> +	time(&now);
>> +	info = localtime(&now);
>> +	strftime(output, HEALTH_REPORTER_TS_LEN, "%Y-%b-%d %H:%M:%S", info);
> 
> This strftime should really be done in the error path of sysinfo as
> opposed to every call to this function calling strftime twice.
> 
>> +	err = sysinfo(&s_info);
>> +	if (err)
>> +		return err;
>> +	/*substruct uptime in sec from now
>> +	 * add jiffies and 5 minutes factor*/
> 
> fix the comment style.
> 
> 
>> +	sec = now - (s_info.uptime - jiffies/1000) + 300;
>> +	info = localtime(&sec);
>> +	strftime(output, HEALTH_REPORTER_TS_LEN, "%Y-%b-%d %H:%M:%S", info);
>> +
>> +	return 0;
>> +}
>> +
>
David Ahern Jan. 24, 2019, 12:27 a.m. UTC | #3
On 1/23/19 4:27 AM, Aya Levin wrote:
>>> @@ -1298,6 +1322,12 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required,
>>>   		return -EINVAL;
>>>   	}
>>>   
>>> +	if ((o_required & DL_OPT_HEALTH_REPORTER_NAME) &&
>>> +	    !(o_found & DL_OPT_HEALTH_REPORTER_NAME)) {
>>> +		pr_err("Reporter name expected.\n");
>>> +		return -EINVAL;
>>> +	}
>>
>> I realize your are following suit with this change, but these checks for
>> 'required and not found' are getting really long. There is a better way
>> to do this.
> Do you mean somthing like:
> #define requiered_not_found(o_found, o_required, flag, msg)            \
>            ({                                                           \
>                    if ((o_required & flag) && !(o_found & flag)) {      \
>                            pr_err("%s \n", msg);                        \
>                            return -EINVAL;                              \
>                    }                                                    \
>            })
> 

That's one way.

I was also thinking a helper that does a single loop with an array of
bits and messages to print if required and not found.

That parse function is currently 355 lines long with a lot of repetition.
diff mbox series

Patch

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 3651e90c1159..9fc19668ccd0 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -1,4 +1,5 @@ 
 /*
+ *
  * devlink.c	Devlink tool
  *
  *              This program is free software; you can redistribute it and/or
@@ -22,7 +23,7 @@ 
 #include <linux/devlink.h>
 #include <libmnl/libmnl.h>
 #include <netinet/ether.h>
-
+#include <sys/sysinfo.h>
 #include "SNAPSHOT.h"
 #include "list.h"
 #include "mnlg.h"
@@ -40,6 +41,12 @@ 
 #define PARAM_CMODE_DRIVERINIT_STR "driverinit"
 #define PARAM_CMODE_PERMANENT_STR "permanent"
 
+#define HEALTH_REPORTER_STATE_HEALTHY_STR "healthy"
+#define HEALTH_REPORTER_STATE_ERROR_STR "error"
+#define HEALTH_REPORTER_GRACE_PERIOD_STR "grace_period"
+#define HEALTH_REPORTER_AUTO_RECOVER_STR "auto_recover"
+#define HEALTH_REPORTER_TS_LEN 80
+
 static int g_new_line_count;
 
 #define pr_err(args...) fprintf(stderr, ##args)
@@ -199,6 +206,10 @@  static void ifname_map_free(struct ifname_map *ifname_map)
 #define DL_OPT_REGION_SNAPSHOT_ID	BIT(22)
 #define DL_OPT_REGION_ADDRESS		BIT(23)
 #define DL_OPT_REGION_LENGTH		BIT(24)
+#define DL_OPT_HANDLE_HEALTH		BIT(25)
+#define DL_OPT_HEALTH_REPORTER_NAME	BIT(26)
+#define DL_OPT_HEALTH_REPORTER_DEV	BIT(27)
+
 
 struct dl_opts {
 	uint32_t present; /* flags of present items */
@@ -230,6 +241,10 @@  struct dl_opts {
 	uint32_t region_snapshot_id;
 	uint64_t region_address;
 	uint64_t region_length;
+	const char *reporter_name;
+	const char *reporter_param_name;
+	const char *reporter_param_value;
+
 };
 
 struct dl {
@@ -959,7 +974,7 @@  static int dl_argv_parse(struct dl *dl, uint32_t o_required,
 		if (err)
 			return err;
 		o_found |= handle_bit;
-	} else if (o_required & DL_OPT_HANDLE) {
+	} else if (DL_OPT_HANDLE) {
 		err = dl_argv_handle(dl, &opts->bus_name, &opts->dev_name);
 		if (err)
 			return err;
@@ -1178,6 +1193,15 @@  static int dl_argv_parse(struct dl *dl, uint32_t o_required,
 			if (err)
 				return err;
 			o_found |= DL_OPT_REGION_LENGTH;
+		} else if (dl_argv_match(dl, "reporter") &&
+			   (o_all & DL_OPT_HEALTH_REPORTER_NAME)) {
+			dl_arg_inc(dl);
+			err = dl_argv_str(dl, &opts->reporter_name);
+			if (err)
+				return err;
+			o_found |= DL_OPT_HEALTH_REPORTER_NAME;
+			o_found |= DL_OPT_HANDLE;
+			break;
 		} else {
 			pr_err("Unknown option \"%s\"\n", dl_argv(dl));
 			return -EINVAL;
@@ -1298,6 +1322,12 @@  static int dl_argv_parse(struct dl *dl, uint32_t o_required,
 		return -EINVAL;
 	}
 
+	if ((o_required & DL_OPT_HEALTH_REPORTER_NAME) &&
+	    !(o_found & DL_OPT_HEALTH_REPORTER_NAME)) {
+		pr_err("Reporter name expected.\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -1382,6 +1412,9 @@  static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl)
 	if (opts->present & DL_OPT_REGION_LENGTH)
 		mnl_attr_put_u64(nlh, DEVLINK_ATTR_REGION_CHUNK_LEN,
 				 opts->region_length);
+	if (opts->present & DL_OPT_HEALTH_REPORTER_NAME)
+		mnl_attr_put_strz(nlh, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+				  opts->reporter_name);
 }
 
 static int dl_argv_parse_put(struct nlmsghdr *nlh, struct dl *dl,
@@ -1513,6 +1546,8 @@  static void __pr_out_handle_start(struct dl *dl, struct nlattr **tb,
 				__pr_out_newline();
 				__pr_out_indent_inc();
 				arr_last_handle_set(dl, bus_name, dev_name);
+			} else {
+				__pr_out_indent_inc();
 			}
 		} else {
 			pr_out("%s%s", buf, content ? ":" : "");
@@ -5557,11 +5592,502 @@  static int cmd_region(struct dl *dl)
 	return -ENOENT;
 }
 
+static int cmd_health_set_params(struct dl *dl)
+{
+	struct nlmsghdr *nlh;
+	uint64_t period;
+	bool auto_recover;
+	int err;
+
+	nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_HEALTH_REPORTER_SET,
+			       NLM_F_REQUEST | NLM_F_ACK);
+	err = dl_argv_parse(dl, DL_OPT_HANDLE |
+			    DL_OPT_HEALTH_REPORTER_NAME, 0);
+	if (err)
+		return err;
+
+	err = dl_argv_str(dl, &dl->opts.reporter_param_name);
+	if (err)
+		return err;
+	err = dl_argv_str(dl, &dl->opts.reporter_param_value);
+	if (err)
+		return err;
+	dl_opts_put(nlh, dl);
+
+	if (!strncmp(dl->opts.reporter_param_name,
+		     HEALTH_REPORTER_GRACE_PERIOD_STR, strlen("garce"))) {
+		err = strtouint64_t(dl->opts.reporter_param_value, &period);
+		if (err)
+			goto err_param_value_parse;
+		mnl_attr_put_u64(nlh, DEVLINK_ATTR_HEALTH_REPORTER_PERIOD,
+				 period);
+	} else if (!strncmp(dl->opts.reporter_param_name,
+			    HEALTH_REPORTER_AUTO_RECOVER_STR,
+			    strlen("auto"))) {
+		err = strtobool(dl->opts.reporter_param_value, &auto_recover);
+		if (err)
+			goto err_param_value_parse;
+		mnl_attr_put_u8(nlh, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_REC,
+				(uint8_t)auto_recover);
+	} else {
+		printf("Parameter name: %s  is not supported\n",
+		       dl->opts.reporter_param_name);
+		return -ENOTSUP;
+	}
+
+	return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL);
+
+err_param_value_parse:
+	pr_err("Value \"%s\" is not a number or not within range\n",
+	       dl->opts.param_value);
+	return err;
+}
+
+static int cmd_health_dump_clear(struct dl *dl)
+{
+	struct nlmsghdr *nlh;
+	int err;
+
+	nlh = mnlg_msg_prepare(dl->nlg,
+			       DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
+			       NLM_F_REQUEST | NLM_F_ACK);
+
+	err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE |
+				DL_OPT_HEALTH_REPORTER_NAME, 0);
+	if (err)
+		return err;
+
+	dl_opts_put(nlh, dl);
+	return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL);
+}
+
+static int health_value_show(struct dl *dl, int type, struct nlattr *nl_data)
+{
+	const char *str;
+	uint8_t *data;
+	uint32_t len;
+	uint64_t u64;
+	uint32_t u32;
+	uint16_t u16;
+	uint8_t u8;
+	int i;
+
+	switch (type) {
+	case MNL_TYPE_FLAG:
+		if (dl->json_output)
+			jsonw_string(dl->jw, nl_data ? "true" : "false");
+		else
+			pr_out("%s ", nl_data ? "true" : "false");
+		break;
+	case MNL_TYPE_U8:
+		u8 = mnl_attr_get_u8(nl_data);
+		if (dl->json_output)
+			jsonw_uint(dl->jw, u8);
+		else
+			pr_out("%u ", u8);
+		break;
+	case MNL_TYPE_U16:
+		u16 = mnl_attr_get_u16(nl_data);
+		if (dl->json_output)
+			jsonw_uint(dl->jw, u16);
+		else
+			pr_out("%u ", u16);
+		break;
+	case MNL_TYPE_U32:
+		u32 = mnl_attr_get_u32(nl_data);
+		if (dl->json_output)
+			jsonw_uint(dl->jw, u32);
+		else
+			pr_out("%u ", u32);
+		break;
+	case MNL_TYPE_U64:
+		u64 = mnl_attr_get_u64(nl_data);
+		if (dl->json_output)
+			jsonw_u64(dl->jw, u64);
+		else
+			pr_out("%lu ", u64);
+		break;
+	case MNL_TYPE_STRING:
+	case MNL_TYPE_NUL_STRING:
+		str = mnl_attr_get_str(nl_data);
+		if (dl->json_output)
+			jsonw_string(dl->jw, str);
+		else
+			pr_out("%s ", str);
+		break;
+	case MNL_TYPE_BINARY:
+		len = mnl_attr_get_payload_len(nl_data);
+		data = mnl_attr_get_payload(nl_data);
+		i = 0;
+		while (i < len) {
+			if (dl->json_output)
+				jsonw_printf(dl->jw, "%d", data[i]);
+			else
+				pr_out("%02x ", data[i]);
+			i++;
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+	return MNL_CB_OK;
+}
+
+static int health_object_pair_show(struct dl *dl, struct nlattr *nl)
+{
+	struct nlattr *nla_pair[DEVLINK_ATTR_MAX + 1] = {};
+	struct nlattr *nla_value[DEVLINK_ATTR_MAX + 1] = {};
+	struct nlattr *nla_value_data;
+	struct nlattr *nla_obj_pair;
+	struct nlattr *nla_object;
+	struct nlattr *nla_array;
+	struct nlattr *nla_val;
+	int err, type;
+	const char *name;
+
+	err = mnl_attr_parse_nested(nl, attr_cb, nla_pair);
+	if (err != MNL_CB_OK)
+		return -EINVAL;
+
+	if (!nla_pair[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME] ||
+	    !nla_pair[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE])
+		return -EINVAL;
+
+	name = mnl_attr_get_str(nla_pair[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME]);
+	if (dl->json_output)
+		jsonw_name(dl->jw, name);
+	else
+		pr_out("%s: ", name);
+
+	nla_val = nla_pair[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE];
+	err = mnl_attr_parse_nested(nla_val, attr_cb, nla_value);
+	if (err != MNL_CB_OK)
+		return -EINVAL;
+
+	if (!nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT] &&
+	    !nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY] &&
+	    !nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE])
+		return -EINVAL;
+
+	if (nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT]) {
+		nla_object = nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT];
+		/*inside must be an object pair*/
+		if (dl->json_output)
+			jsonw_start_object(dl->jw);
+
+		mnl_attr_for_each_nested(nla_obj_pair, nla_object) {
+			err = health_object_pair_show(dl, nla_obj_pair);
+			if (err != MNL_CB_OK)
+				break;
+		}
+		if (dl->json_output)
+			jsonw_end_object(dl->jw);
+	}
+	if (nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE]) {
+		type = mnl_attr_get_u8(nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE]);
+		if (!nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA])
+			return -EINVAL;
+		nla_value_data = nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA];
+		health_value_show(dl, type, nla_value_data);
+	}
+	if (nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY]) {
+		if (dl->json_output)
+			jsonw_start_array(dl->jw);
+		nla_array = nla_value[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY];
+		mnl_attr_for_each_nested(nla_object, nla_array) {
+			mnl_attr_for_each_nested(nla_obj_pair, nla_object) {
+				err = health_object_pair_show(dl, nla_obj_pair);
+				if (err != MNL_CB_OK)
+					break;
+			}
+		}
+		if (dl->json_output)
+			jsonw_end_array(dl->jw);
+	}
+	return err;
+}
+
+static int cmd_health_object_cb(const struct nlmsghdr *nlh, void *data)
+{
+	struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh);
+	struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {};
+	struct nlattr *nla_object;
+	struct nlattr *nla_object_pair;
+	struct dl *dl = data;
+	int err = MNL_CB_OK;
+
+	mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb);
+	if (!tb[DEVLINK_ATTR_HEALTH_BUFFER_OBJECT])
+		return MNL_CB_ERROR;
+
+	mnl_attr_for_each(nla_object, nlh, sizeof(*genl)) {
+		mnl_attr_for_each_nested(nla_object_pair, nla_object) {
+			err = health_object_pair_show(dl, nla_object_pair);
+			if (err != MNL_CB_OK)
+				break;
+		}
+		if (!dl->json_output)
+			pr_out("\n");
+	}
+
+	return err;
+}
+
+static int cmd_health_object_common(struct dl *dl, uint8_t cmd)
+{
+	struct nlmsghdr *nlh;
+	int err;
+
+	nlh = mnlg_msg_prepare(dl->nlg, cmd,
+			       NLM_F_REQUEST | NLM_F_ACK);
+
+	err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE |
+				DL_OPT_HEALTH_REPORTER_NAME, 0);
+	if (err)
+		return err;
+
+	if (dl->json_output)
+		jsonw_start_object(dl->jw);
+
+	err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_health_object_cb, dl);
+
+	if (dl->json_output)
+		jsonw_end_object(dl->jw);
+
+	return err;
+}
+
+static int cmd_health_diagnose(struct dl *dl)
+{
+	return cmd_health_object_common(dl,
+					DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE);
+}
+
+static int cmd_health_dump_show(struct dl *dl)
+{
+	return cmd_health_object_common(dl,
+					DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET);
+}
+
+static int cmd_health_recover(struct dl *dl)
+{
+	struct nlmsghdr *nlh;
+	int err;
+
+	nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
+			       NLM_F_REQUEST | NLM_F_ACK);
+
+	err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE |
+				DL_OPT_HEALTH_REPORTER_NAME, 0);
+	if (err)
+		return err;
+
+	dl_opts_put(nlh, dl);
+	return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL);
+}
+
+static void pr_out_array_parameters_start(struct dl *dl, const char *name)
+{
+	if (dl->json_output) {
+		jsonw_name(dl->jw, name);
+		jsonw_start_array(dl->jw);
+	} else {
+		__pr_out_newline();
+		pr_out("%s:", name);
+		 __pr_out_indent_inc();
+		__pr_out_newline();
+	}
+}
+
+static const char *health_state_name(uint8_t state)
+{
+	switch (state) {
+	case DEVLINK_HEALTH_REPORTER_STATE_HEALTHY:
+		return HEALTH_REPORTER_STATE_HEALTHY_STR;
+	case DEVLINK_HEALTH_REPORTER_STATE_ERROR:
+		return HEALTH_REPORTER_STATE_ERROR_STR;
+	default: return "<unknown state>";
+	}
+}
+
+static int jiffies_to_logtime(uint64_t jiffies, char *output)
+{
+	struct sysinfo s_info;
+	struct tm *info;
+	time_t now, sec;
+	int err;
+
+	time(&now);
+	info = localtime(&now);
+	strftime(output, HEALTH_REPORTER_TS_LEN, "%Y-%b-%d %H:%M:%S", info);
+	err = sysinfo(&s_info);
+	if (err)
+		return err;
+	/*substruct uptime in sec from now
+	 * add jiffies and 5 minutes factor*/
+	sec = now - (s_info.uptime - jiffies/1000) + 300;
+	info = localtime(&sec);
+	strftime(output, HEALTH_REPORTER_TS_LEN, "%Y-%b-%d %H:%M:%S", info);
+
+	return 0;
+}
+
+static void pr_out_health(struct dl *dl, struct nlattr **tb)
+{
+	struct nlattr *hlt[DEVLINK_ATTR_MAX + 1] = {};
+	enum devlink_health_reporter_state state;
+	char dump_time_date[HEALTH_REPORTER_TS_LEN] = "N/A";
+	bool auto_recover = false;
+	const struct nlattr *attr;
+	bool dmp = false;
+	uint64_t jiffies;
+	int err;
+
+	state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
+
+	err = mnl_attr_parse_nested(tb[DEVLINK_ATTR_HEALTH_REPORTER], attr_cb,
+				    hlt);
+	if (err != MNL_CB_OK)
+		return;
+
+	if (!hlt[DEVLINK_ATTR_HEALTH_REPORTER_NAME]		||
+	    !hlt[DEVLINK_ATTR_HEALTH_REPORTER_ERR]		||
+	    !hlt[DEVLINK_ATTR_HEALTH_REPORTER_RECOVER]		||
+	    !hlt[DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL]	||
+	    !hlt[DEVLINK_ATTR_HEALTH_REPORTER_STATE]		||
+	    !hlt[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_REC]		||
+	    !hlt[DEVLINK_ATTR_HEALTH_REPORTER_PERIOD])
+		return;
+
+	dmp = mnl_attr_get_u8(hlt[DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL]);
+	if (dmp) {
+		if (hlt[DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS]) {
+			attr = hlt[DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS];
+			jiffies = mnl_attr_get_u64(attr);
+			err = jiffies_to_logtime(jiffies, dump_time_date);
+			if (err)
+				sprintf(dump_time_date,
+					"Failed to retrieve dump's timestamp");
+		}
+	}
+
+	pr_out_handle_start_arr(dl, tb);
+
+	pr_out_str(dl, "name",
+		   mnl_attr_get_str(hlt[DEVLINK_ATTR_HEALTH_REPORTER_NAME]));
+	__pr_out_newline();
+	__pr_out_indent_inc();
+	state = mnl_attr_get_u8(hlt[DEVLINK_ATTR_HEALTH_REPORTER_STATE]);
+	pr_out_str(dl, "state", health_state_name(state));
+	pr_out_u64(dl, "#err",
+		   mnl_attr_get_u64(hlt[DEVLINK_ATTR_HEALTH_REPORTER_ERR]));
+	pr_out_u64(dl, "#recover",
+		   mnl_attr_get_u64(hlt[DEVLINK_ATTR_HEALTH_REPORTER_RECOVER]));
+	pr_out_str(dl, "last_dump_ts", dump_time_date);
+	pr_out_bool(dl, "dump_available", dmp);
+	pr_out_array_parameters_start(dl, "parameters");
+	pr_out_entry_start(dl);
+	pr_out_u64(dl, "grace_period",
+		   mnl_attr_get_u64(hlt[DEVLINK_ATTR_HEALTH_REPORTER_PERIOD]));
+	auto_recover = mnl_attr_get_u8(hlt[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_REC]);
+	pr_out_bool(dl, "auto_recover", auto_recover);
+	pr_out_entry_end(dl);
+	pr_out_array_end(dl);
+	__pr_out_indent_dec();
+	pr_out_handle_end(dl);
+}
+
+static int cmd_health_show_cb(const struct nlmsghdr *nlh, void *data)
+{
+	struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh);
+	struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {};
+	struct dl *dl = data;
+
+	mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb);
+	if (!tb[DEVLINK_ATTR_BUS_NAME] || !tb[DEVLINK_ATTR_DEV_NAME] ||
+	    !tb[DEVLINK_ATTR_HEALTH_REPORTER])
+		return MNL_CB_ERROR;
+
+	pr_out_health(dl, tb);
+
+	return MNL_CB_OK;
+}
+
+static int cmd_health_show(struct dl *dl)
+{
+	struct nlmsghdr *nlh;
+	uint16_t flags = NLM_F_REQUEST | NLM_F_ACK;
+	int err;
+
+	if (dl_argc(dl) == 0)
+		flags |= NLM_F_DUMP;
+	nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_HEALTH_REPORTER_GET,
+			       flags);
+
+	if (dl_argc(dl) > 0) {
+		err = dl_argv_parse_put(nlh, dl, 0, DL_OPT_HANDLE |
+					DL_OPT_HEALTH_REPORTER_NAME);
+		if (err)
+			return err;
+	}
+	pr_out_section_start(dl, "health");
+
+	err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_health_show_cb, dl);
+	if (err)
+		return err;
+	pr_out_section_end(dl);
+	return err;
+}
+
+static void cmd_health_help(void)
+{
+	pr_err("Usage: devlink health show [ dev DEV reporter REPORTER_NAME ]\n");
+	pr_err("Usage: devlink health recover DEV reporter REPORTER_NAME\n");
+	pr_err("Usage: devlink health diagnose DEV reporter REPORTER_NAME\n");
+	pr_err("Usage: devlink health dump show DEV reporter REPORTER_NAME\n");
+	pr_err("Usage: devlink health dump clear DEV reporter REPORTER_NAME\n");
+	pr_err("Usage: devlink health set DEV reporter REPORTER_NAME NAME VALUE\n");
+}
+
+static int cmd_health(struct dl *dl)
+{
+	if (dl_no_arg(dl)) {
+		return cmd_health_show(dl);
+	} else if (dl_argv_match(dl, "help")) {
+		cmd_health_help();
+		return 0;
+	} else if (dl_argv_match(dl, "show")) {
+		dl_arg_inc(dl);
+		return cmd_health_show(dl);
+	} else if (dl_argv_match(dl, "recover")) {
+		dl_arg_inc(dl);
+		return cmd_health_recover(dl);
+	} else if (dl_argv_match(dl, "diagnose")) {
+		dl_arg_inc(dl);
+		return cmd_health_diagnose(dl);
+	} else if (dl_argv_match(dl, "dump")) {
+		dl_arg_inc(dl);
+		if (dl_argv_match(dl, "show")) {
+			dl_arg_inc(dl);
+			return cmd_health_dump_show(dl);
+		} else if (dl_argv_match(dl, "clear")) {
+			dl_arg_inc(dl);
+			return cmd_health_dump_clear(dl);
+		}
+	} else if (dl_argv_match(dl, "set")) {
+		dl_arg_inc(dl);
+		return cmd_health_set_params(dl);
+	}
+
+	pr_err("Command \"%s\" not found\n", dl_argv(dl));
+	return -ENOENT;
+}
+
 static void help(void)
 {
 	pr_err("Usage: devlink [ OPTIONS ] OBJECT { COMMAND | help }\n"
 	       "       devlink [ -f[orce] ] -b[atch] filename\n"
-	       "where  OBJECT := { dev | port | sb | monitor | dpipe | resource | region }\n"
+	       "where  OBJECT := { dev | port | sb | monitor | dpipe | resource | region | health }\n"
 	       "       OPTIONS := { -V[ersion] | -n[o-nice-names] | -j[son] | -p[retty] | -v[erbose] }\n");
 }
 
@@ -5594,7 +6120,11 @@  static int dl_cmd(struct dl *dl, int argc, char **argv)
 	} else if (dl_argv_match(dl, "region")) {
 		dl_arg_inc(dl);
 		return cmd_region(dl);
+	} else if (dl_argv_match(dl, "health")) {
+		dl_arg_inc(dl);
+		return cmd_health(dl);
 	}
+
 	pr_err("Object \"%s\" not found\n", dl_argv(dl));
 	return -ENOENT;
 }
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index d0a33d79dc22..90c6ebb4829d 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -89,6 +89,12 @@  enum devlink_command {
 	DEVLINK_CMD_REGION_DEL,
 	DEVLINK_CMD_REGION_READ,
 
+	DEVLINK_CMD_HEALTH_REPORTER_GET,
+	DEVLINK_CMD_HEALTH_REPORTER_SET,
+	DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
+	DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
+	DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
+	DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
@@ -168,6 +174,11 @@  enum devlink_param_fw_load_policy_value {
 	DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH,
 };
 
+enum devlink_health_reporter_state {
+	DEVLINK_HEALTH_REPORTER_STATE_HEALTHY,
+	DEVLINK_HEALTH_REPORTER_STATE_ERROR,
+};
+
 enum devlink_attr {
 	/* don't change the order or add anything between, this is ABI! */
 	DEVLINK_ATTR_UNSPEC,
@@ -285,6 +296,24 @@  enum devlink_attr {
 	DEVLINK_ATTR_REGION_CHUNK_ADDR,         /* u64 */
 	DEVLINK_ATTR_REGION_CHUNK_LEN,          /* u64 */
 
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT,		/* nested */
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR,		/* nested */
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME,		/* string */
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE,	/* nested */
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY,	/* nested */
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE,	/* u8 */
+	DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA,	/* dynamic */
+
+	DEVLINK_ATTR_HEALTH_REPORTER,			/* nested */
+	DEVLINK_ATTR_HEALTH_REPORTER_NAME,		/* string */
+	DEVLINK_ATTR_HEALTH_REPORTER_STATE,		/* u8 */
+	DEVLINK_ATTR_HEALTH_REPORTER_ERR,		/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_RECOVER,		/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL,	/* u8 */
+	DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,		/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_PERIOD,		/* u64 */
+	DEVLINK_ATTR_HEALTH_REPORTER_AUTO_REC,		/* u8 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/man/man8/devlink-health.8 b/man/man8/devlink-health.8
new file mode 100644
index 000000000000..880672290d36
--- /dev/null
+++ b/man/man8/devlink-health.8
@@ -0,0 +1,175 @@ 
+.TH DEVLINK\-HEALTH 8 "27 Dec 2018" "iproute2" "Linux"
+.SH NAME
+devlink-health \- devlink health reporting and recovery
+.SH SYNOPSIS
+.sp
+.ad l
+.in +8
+.ti -8
+.B devlink
+.RI "[ " OPTIONS " ]"
+.B health
+.RI  " { " COMMAND " | "
+.BR help " }"
+.sp
+.ti -8
+.IR OPTIONS " := { "
+\fB\-V\fR[\fIersion\fR] }
+.ti -8
+.BR "devlink health show"
+.RI "[ "
+.RI "" DEV ""
+.BR " reporter "
+.RI ""REPORTER " ] "
+.ti -8
+.BR "devlink health recover"
+.RI "" DEV ""
+.BR "reporter"
+.RI "" REPORTER ""
+.ti -8
+.BR "devlink health diagnose"
+.RI "" DEV ""
+.BR "reporter"
+.RI "" REPORTER ""
+.ti -8
+.BR "devlink health dump show"
+.RI "" DEV ""
+.BR "reporter"
+.RI "" REPORTER ""
+.ti -8
+.BR "devlink health dump clear"
+.RI "" DEV ""
+.BR "reporter"
+.RI "" REPORTER ""
+.ti -8
+.BR "devlink health set"
+.RI "" DEV ""
+.BR "reporter"
+.RI "" REPORTER ""
+.RI "" NAME ""
+.RI "" VALUE ""
+.ti -8
+.B devlink health help
+.SH "DESCRIPTION"
+.SS devlink health show - Show status and configuration on all supported reporters on all devlink devices.
+.PP
+.I "DEV"
+- specifies the devlink device.
+.PP
+.I "REPORTER"
+- specifies the reporter's name registered on the devlink device.
+.SS devlink health recover - Initiate a recovery operation on a reporter.
+This action performs a recovery and increases the recoveries counter on success.
+.PP
+.I "DEV"
+- specifies the devlink device.
+.PP
+.I "REPORTER"
+- specifies the reporter's name registered on the devlink device.
+.SS devlink health diagnose - Retrieve diagnostics data on a reporter.
+.PP
+.I "DEV"
+- specifies the devlink device.
+.PP
+.I "REPORTER"
+- specifies the reporter's name registered on the devlink device.
+.SS devlink health dump show - Display the last saved dump.
+.PD 0
+.P
+Devlink health saves a single dump per reporter. If an dump is
+.P
+not already stored by the Devlink, this command will generate a new
+.P
+dump. The dump can be generated either automatically when a
+.P
+reporter reports on an error or manually at the user's request.
+.PD
+.PP
+.I "DEV"
+- specifies the devlink device.
+.PP
+.I "REPORTER"
+- specifies the reporter's name registered on the devlink device.
+.SS devlink health dump clear - Delete the saved dump.
+Deleting the save dump enables a generation of a new dump on
+.PD 0
+.P
+the next "devlink health dump show" command.
+.PD
+.PP
+.I "DEV"
+- specifies the devlink device.
+.PP
+.I "REPORTER"
+- specifies the reporter's name registered on the devlink device.
+.SS devlink health set - Enable the user to configure:
+.PD 0
+1) grace_period [msec] - Time interval between auto recoveries.
+.P
+2) auto_recover [true/false] - Indicates whether the devlink should execute automatic recover on error.
+.PD
+.PP
+.I "DEV"
+- specifies the devlink device.
+.PP
+.I "REPORTER"
+- specifies the reporter's name registered on the devlink device.
+.SH "EXAMPLES"
+.PP
+devlink health show
+.RS 4
+pci/0000:00:09.0:
+  name TX state healthy #err 1 #recover 1 last_dump_ts N/A dump_available false
+  attributes:
+    grace period 600 auto_recover true
+.RE
+.PP
+devlink health recover pci/0000:00:09.0 reporter TX
+.RS 4
+Initiate recovery on TX reporter registered on pci/0000:00:09.0.
+.RE
+.PP
+devlink health diagnose pci/0000:00:09.0 reporter TX
+.RS 4
+.PD 0
+SQ 0x9b: HW state: 1 stopped: 0
+.P
+SQ 0x9f: HW state: 1 stopped: 0
+.P
+SQ 0xa3: HW state: 1 stopped: 0
+.P
+SQ 0xa7: HW state: 1 stopped: 0
+.P
+SQ 0xab: HW state: 1 stopped: 0
+.PD
+.RE
+.PP
+devlink health dump show pci/0000:00:09.0 reporter TX
+.RS 4
+Display the last saved dump on TX reporter registered on pci/0000:00:09.0.
+.RE
+.PP
+devlink health dump clear pci/0000:00:09.0 reporter TX
+.RS 4
+Delete saved dump on TX reporter registered on pci/0000:00:09.0.
+.RE
+.PP
+devlink health set pci/0000:00:09.0 reporter TX grace_period 3500
+.RS 4
+Set time interval between auto recoveries to minimum of 3500 mSec on
+TX reporter registered on pci/0000:00:09.0.
+.RE
+.PP
+devlink health set pci/0000:00:09.0 reporter TX auto_recover false
+.RS 4
+Turn off auto recovery on TX reporter registered on pci/0000:00:09.0.
+.RE
+.SH SEE ALSO
+.BR devlink (8),
+.BR devlink-dev (8),
+.BR devlink-port (8),
+.BR devlink-region (8),
+.br
+
+.SH AUTHOR
+Aya Levin <ayal@mellanox.com>
diff --git a/man/man8/devlink.8 b/man/man8/devlink.8
index 8d527e7e1d60..13d4dcd908b3 100644
--- a/man/man8/devlink.8
+++ b/man/man8/devlink.8
@@ -7,7 +7,7 @@  devlink \- Devlink tool
 .in +8
 .ti -8
 .B devlink
-.RI "[ " OPTIONS " ] { " dev | port | monitor | sb | resource | region " } { " COMMAND " | "
+.RI "[ " OPTIONS " ] { " dev | port | monitor | sb | resource | region | health " } { " COMMAND " | "
 .BR help " }"
 .sp
 
@@ -78,6 +78,10 @@  Turn on verbose output.
 .B region
 - devlink address region access
 
+.TP
+.B health
+- devlink reporting and recovery
+
 .SS
 .I COMMAND
 
@@ -109,6 +113,7 @@  Exit status is 0 if command was successful or a positive integer upon failure.
 .BR devlink-sb (8),
 .BR devlink-resource (8),
 .BR devlink-region (8),
+.BR devlink-health (8),
 .br
 
 .SH REPORTING BUGS