diff mbox series

[net-next,RFC] Dump SW SQ context as part of tx reporter

Message ID 1556547459-7756-1-git-send-email-ayal@mellanox.com
State RFC
Delegated to: David Miller
Headers show
Series [net-next,RFC] Dump SW SQ context as part of tx reporter | expand

Commit Message

Aya Levin April 29, 2019, 2:17 p.m. UTC
The TX reporter reports an error in two scenarios:
- TX timeout on a specific tx queue
- TX completion error on a specific send queue
Prior to this patch, no dump data was supported by the tx reporter. This
patch adds support for a SW data dump of the related SQ context. The dump
is simply the SQ's raw memory snapshot taken right after the error was
reported, before any recovery procedure was launched. With this
approach, no maintenance is needed, as the driver fetches the actual data
according to the layout with which the SQ was compiled. By providing
a SW context, one can easily debug an error on a given SQ.

In order to translate the raw memory offline into a human-readable
format, the user can use out-of-kernel scripts which receive the
following as input:
- Object raw memory
- Driver object compiled with debug info (can be taken/generated at any time from the machine)
- Object name

An example of such a script's output can be seen below.
Note: the script is not offered as part of this patch, as it does not
belong in the kernel; I only describe it here to convey the general
idea of how/what can be fetched from the SW dump via devlink health.

The output of the SW dump can be extracted by devlink health command:
$ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
 mlx5e_txqsq: sqn: 6336
 memory:
   00 00 00 00 00 00 00 00
   01 00 00 00 00 00 00 00
   00 00 00 00 00 00 00 00
   45 f4 88 cb 09 00 00 00
   00 00 00 00 00 00 00 00
   00 00 00 00 00 00 00 00
   c0 ff ff ff 1f 00 00 00
   f8 18 1e 89 81 88 ff ff
   ...

Script output below, with struct member names and actual values:

struct  mlx5e_txqsq {
	short unsigned int         cc 	 0x5 ;
	unsigned int               dma_fifo_cc 	 0x5 ;
	struct  net_dim {
		unsigned char      state 	 0x1 ;
		struct  net_dim_stats {
			int        ppms 	 0x0 ;
			int        bpms 	 0x0 ;
			int        epms 	 0x0 ;
		} prev_stats;
		struct  net_dim_sample {
			long long int time 	 0x90766ef9d ;
			unsigned int pkt_ctr 	 0x0 ;
			unsigned int byte_ctr 	 0x0 ;
			short unsigned int event_ctr 	 0x0 ;
		} start_sample;
		struct  work_struct {
			struct   {
				long int counter 	 0x1fffffffc0 ;
			} data;
			struct  list_head {
				struct list_head * next 	 0xffff8881b08998f8 ;
				struct list_head * prev 	 0xffff8881b08998f8 ;
			} entry;
			void       (*func)(struct work_struct *) 	 0xffffffffa02d0e30 ;
		} work;
		unsigned char      profile_ix 	 0x60 ;
		unsigned char      mode 	 0x72 ;
		unsigned char      tune_state 	 0x35 ;
		unsigned char      steps_right 	 0xa0 ;
		unsigned char      steps_left 	 0xff ;
		unsigned char      tired 	 0xff ;
	} dim;
	short unsigned int         pc 	 0x0 ;
	unsigned int               dma_fifo_pc 	 0x0 ;
	struct  mlx5e_cq {
		struct  mlx5_cqwq {
			struct  mlx5_frag_buf_ctrl {
				struct mlx5_buf_list * frags 	 0x500000005 ;
				unsigned int sz_m1 	 0x0 ;
				short unsigned int frag_sz_m1 	 0x0 ;
				short unsigned int strides_offset 	 0x0 ;
				unsigned char log_sz 	 0x0 ;
				unsigned char log_stride 	 0x0 ;
				unsigned char log_frag_strides 	 0x0 ;
			} fbc;
			__be32 *   db 	 0x0 ;
			unsigned int cc 	 0x0 ;
		} wq;
		short unsigned int event_ctr 	 0x0 ;
		struct napi_struct * napi 	 0x0 ;
		struct  mlx5_core_cq {
			unsigned int cqn 	 0x0 ;
			int        cqe_sz 	 0x0 ;
			__be32 *   set_ci_db 	 0xffff8881b1aa4988 ;
			__be32 *   arm_db 	 0x3f000003ff ;
			struct mlx5_uars_page * uar 	 0x6060a ;
			struct  refcount_struct {
				struct   {
					int    counter 	 0xa1814500 ;
				} refs;
			} refcount;
			struct  completion {
				unsigned int done 	 0x5 ;
				struct  wait_queue_head {
					struct  spinlock {
						union   {
							struct  raw_spinlock {
								struct  qspinlock {
									union   {
										struct   {
											int                                                    counter 	 0x5 ;
										} val;
										struct   {
											unsigned char                                          locked 	 0x5 ;
											unsigned char                                          pending 	 0x0 ;
										} ;
										struct   {
											short unsigned int                                     locked_pending 	 0x5 ;
											short unsigned int                                     tail 	 0x0 ;
										} ;
									} ;
								} raw_lock;
							} rlock;
						} ;
					} lock;
					struct  list_head {
						struct list_head * next 	 0xffff8881b089bb88 ;
						struct list_head * prev 	 0x4000000c0a ;
					} head;
				} wait;
			} free;
			unsigned int vector 	 0xa1814500 ;
			unsigned int irqn 	 0xffff8881 ;
			void       (*comp)(struct mlx5_core_cq *) 	 0xffff8881a1814504 ;
			void       (*event)(struct mlx5_core_cq *, enum mlx5_event) 	 0xffff8881a2cdea08 ;
			unsigned int cons_index 	 0x1 ;
			unsigned int arm_sn 	 0x0 ;
			struct mlx5_rsc_debug * dbg 	 0x0 ;
			int        pid 	 0x0 ;
			struct   {
				struct  list_head {
					struct list_head * next 	 0xffffffff ;
					struct list_head * prev 	 0xffffffffffffffff ;
				} list;
				void (*comp)(struct mlx5_core_cq *) 	 0xffffffffa0356940 ;
				void * priv 	 0x0 ;
			} tasklet_ctx;
			int        reset_notify_added 	 0x0 ;
			struct  list_head {
				struct list_head * next 	 0xffffffffa0300700 ;
				struct list_head * prev 	 0xd ;
			} reset_notify;
			struct mlx5_eq_comp * eq 	 0x0 ;
			short unsigned int uid 	 0x9a70 ;
		} mcq;
		struct mlx5e_channel * channel 	 0xffff8881b0899a70 ;
		struct mlx5_core_dev * mdev 	 0x4800000001 ;
		struct  mlx5_wq_ctrl {
			struct mlx5_core_dev * mdev 	 0xffffffffa02d5350 ;
			struct  mlx5_frag_buf {
				struct mlx5_buf_list * frags 	 0xffffffffa02d5460 ;
				int npages 	 0x0 ;
				int size 	 0x5 ;
				unsigned char page_shift 	 0x8 ;
			} buf;
			struct  mlx5_db {
				__be32 * db 	 0x1c6 ;
				union   {
					struct mlx5_db_pgdir * pgdir 	 0x0 ;
					struct mlx5_ib_user_db_page * user_page 	 0x0 ;
				} u;
				long long unsigned int dma 	 0xffff8881b0899ab0 ;
				int index 	 0x0 ;
			} db;
		} wq_ctrl;
	} cq;
	struct  mlx5_wq_cyc {
		struct  mlx5_frag_buf_ctrl {
			struct mlx5_buf_list * frags 	 0xffff8881a7600160 ;
			unsigned int sz_m1 	 0xa7600160 ;
			short unsigned int frag_sz_m1 	 0x8881 ;
			short unsigned int strides_offset 	 0xffff ;
			unsigned char log_sz 	 0x88 ;
			unsigned char log_stride 	 0x49 ;
			unsigned char log_frag_strides 	 0xaa ;
		} fbc;
		__be32 *           db 	 0x1000000000010 ;
		short unsigned int sz 	 0xc ;
		short unsigned int wqe_ctr 	 0x0 ;
		short unsigned int cur_sz 	 0x0 ;
	} wq;
	unsigned int               dma_fifo_mask 	 0xa1814500 ;
	struct mlx5e_sq_stats *    stats 	 0xffff8881a33a0348 ;
	struct   {
		struct mlx5e_sq_dma * dma_fifo 	 0x1a1814500 ;
		struct mlx5e_tx_wqe_info * wqe_info 	 0x14 ;
	} db;
	void *                     uar_map 	 0x0 ;
	struct netdev_queue *      txq 	 0x0 ;
	unsigned int               sqn 	 0x18c0 ;
	unsigned char              min_inline_mode 	 0x0 ;
	struct device *            pdev 	 0x0 ;
	unsigned int               mkey_be 	 0x0 ;
	long unsigned int          state 	 0x0 ;
	struct hwtstamp_config *   tstamp 	 0x0 ;
	struct mlx5_clock *        clock 	 0xffff8881b1aa6f88 ;
	struct  mlx5_wq_ctrl {
		struct mlx5_core_dev * mdev 	 0x3f000003ff ;
		struct  mlx5_frag_buf {
			struct mlx5_buf_list * frags 	 0x6060a ;
			int        npages 	 0xa1814604 ;
			int        size 	 0xffff8881 ;
			unsigned char page_shift 	 0x0 ;
		} buf;
		struct  mlx5_db {
			__be32 *   db 	 0xfff ;
			union   {
				struct mlx5_db_pgdir * pgdir 	 0x0 ;
				struct mlx5_ib_user_db_page * user_page 	 0x0 ;
			} u;
			long long unsigned int dma 	 0xffff888188440000 ;
			int        index 	 0x8b074000 ;
		} db;
	} wq_ctrl;
	struct mlx5e_channel *     channel 	 0xffffc9000010d800 ;
	int                        txq_ix 	 0xa0020180 ;
	unsigned int               rate_limit 	 0xffff8881 ;
	struct  work_struct {
		struct   {
			long int   counter 	 0x1000018c0 ;
		} data;
		struct  list_head {
			struct list_head * next 	 0xffff8881c32b68e8 ;
			struct list_head * prev 	 0x800 ;
		} entry;
		void               (*func)(struct work_struct *) 	 0x9 ;
	} recover_work;
} ;

Signed-off-by: Aya Levin <ayal@mellanox.com>
---
 .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 +++++++++++++++++++++
 1 file changed, 100 insertions(+)

Comments

Saeed Mahameed April 29, 2019, 6:32 p.m. UTC | #1
On Mon, 2019-04-29 at 17:17 +0300, Aya Levin wrote:
> TX reporter reports an error on two scenarios:
> - TX timeout on a specific tx queue
> - TX completion error on a specific send queue
> Prior to this patch, no dump data was supported by the tx reporter.
> This
> patch adds support for SW data dump of the related SQ context. The
> dump
> is simply the SQ's raw memory snapshot taken right after the error
> was
> reported, before any recovery procedure was launched. With this
> approach, no maintenance is needed as the driver fetch the actual
> data
> according to the layout on which the SQ was compiled with.  By
> providing
> a SW context, one can easily debug error on a given SQ.
> 
> In order to offline translate the raw memory into a human readable
> format, the user can use some out-of-kernel scripts which receives as
> an
> input the following:
> - Object raw memory
> - Driver object compiled with debug info (can be taken/generated at
> any time from the machine)
> - Object name
> 
> An example of such script output can be seen below.
> Note: the script is not offered as part of this patch as it do not
> belong to the kernel, I just described it in order to grasp the
> general
> idea of how/what can be fetched from SW dump via devlink health.
> 

What was the script? Given a raw dump, how do you find which
version it is, and the object name?

Did you use any well-known raw debug format, like DWARF?

> The output of the SW dump can be extracted by devlink health command:
> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>  mlx5e_txqsq: sqn: 6336
>  memory:
>    00 00 00 00 00 00 00 00
>    01 00 00 00 00 00 00 00
>    00 00 00 00 00 00 00 00
>    45 f4 88 cb 09 00 00 00
>    00 00 00 00 00 00 00 00
>    00 00 00 00 00 00 00 00
>    c0 ff ff ff 1f 00 00 00
>    f8 18 1e 89 81 88 ff ff
>    ...
> 
> script output below, with struct members names and actual values:
> 
> struct  mlx5e_txqsq {
> 	short unsigned int         cc 	 0x5 ;
> 	unsigned int               dma_fifo_cc 	 0x5 ;
> 	struct  net_dim {
> 		unsigned char      state 	 0x1 ;
> 		struct  net_dim_stats {
> 			int        ppms 	 0x0 ;
> 			int        bpms 	 0x0 ;
> 			int        epms 	 0x0 ;
> 		} prev_stats;
> 		struct  net_dim_sample {
> 			long long int time 	 0x90766ef9d ;
> 			unsigned int pkt_ctr 	 0x0 ;
> 			unsigned int byte_ctr 	 0x0 ;
> 			short unsigned int event_ctr 	 0x0 ;
> 		} start_sample;
> 		struct  work_struct {
> 			struct   {
> 				long int counter 	 0x1fffffffc0 ;
> 			} data;
> 			struct  list_head {
> 				struct list_head * next 	 0xffff8881b0
> 8998f8 ;
> 				struct list_head * prev 	 0xffff8881b0
> 8998f8 ;
> 			} entry;
> 			void       (*func)(struct work_struct *) 	 0xff
> ffffffa02d0e30 ;
> 		} work;
> 		unsigned char      profile_ix 	 0x60 ;
> 		unsigned char      mode 	 0x72 ;
> 		unsigned char      tune_state 	 0x35 ;
> 		unsigned char      steps_right 	 0xa0 ;
> 		unsigned char      steps_left 	 0xff ;
> 		unsigned char      tired 	 0xff ;
> 	} dim;
> 	short unsigned int         pc 	 0x0 ;
> 	unsigned int               dma_fifo_pc 	 0x0 ;
> 	struct  mlx5e_cq {
> 		struct  mlx5_cqwq {
> 			struct  mlx5_frag_buf_ctrl {
> 				struct mlx5_buf_list * frags 	 0x50
> 0000005 ;
> 				unsigned int sz_m1 	 0x0 ;
> 				short unsigned int frag_sz_m1 	 0x0
> ;
> 				short unsigned int strides_offset 	
>  0x0 ;
> 				unsigned char log_sz 	 0x0 ;
> 				unsigned char log_stride 	 0x0 ;
> 				unsigned char log_frag_strides 	 0x0
> ;
> 			} fbc;
> 			__be32 *   db 	 0x0 ;
> 			unsigned int cc 	 0x0 ;
> 		} wq;
> 		short unsigned int event_ctr 	 0x0 ;
> 		struct napi_struct * napi 	 0x0 ;
> 		struct  mlx5_core_cq {
> 			unsigned int cqn 	 0x0 ;
> 			int        cqe_sz 	 0x0 ;
> 			__be32 *   set_ci_db 	 0xffff8881b1aa4988 ;
> 			__be32 *   arm_db 	 0x3f000003ff ;
> 			struct mlx5_uars_page * uar 	 0x6060a ;
> 			struct  refcount_struct {
> 				struct   {
> 					int    counter 	 0xa1814500 ;
> 				} refs;
> 			} refcount;
> 			struct  completion {
> 				unsigned int done 	 0x5 ;
> 				struct  wait_queue_head {
> 					struct  spinlock {
> 						union   {
> 							struct  raw_spi
> nlock {
> 								struct 
>  qspinlock {
> 									
> union   {
> 									
> 	struct   {
> 									
> 		int                                                    
> counter 	 0x5 ;
> 									
> 	} val;
> 									
> 	struct   {
> 									
> 		unsigned
> char                                          locked 	 0x5 ;
> 									
> 		unsigned
> char                                          pending 	 0x0 ;
> 									
> 	} ;
> 									
> 	struct   {
> 									
> 		short unsigned
> int                                     locked_pending 	 0x5 ;
> 									
> 		short unsigned
> int                                     tail 	 0x0 ;
> 									
> 	} ;
> 									
> } ;
> 								}
> raw_lock;
> 							} rlock;
> 						} ;
> 					} lock;
> 					struct  list_head {
> 						struct list_head *
> next 	 0xffff8881b089bb88 ;
> 						struct list_head *
> prev 	 0x4000000c0a ;
> 					} head;
> 				} wait;
> 			} free;
> 			unsigned int vector 	 0xa1814500 ;
> 			unsigned int irqn 	 0xffff8881 ;
> 			void       (*comp)(struct mlx5_core_cq *) 	
>  0xffff8881a1814504 ;
> 			void       (*event)(struct mlx5_core_cq *, enum
> mlx5_event) 	 0xffff8881a2cdea08 ;
> 			unsigned int cons_index 	 0x1 ;
> 			unsigned int arm_sn 	 0x0 ;
> 			struct mlx5_rsc_debug * dbg 	 0x0 ;
> 			int        pid 	 0x0 ;
> 			struct   {
> 				struct  list_head {
> 					struct list_head * next 	 0xff
> ffffff ;
> 					struct list_head * prev 	 0xff
> ffffffffffffff ;
> 				} list;
> 				void (*comp)(struct mlx5_core_cq *) 	
>  0xffffffffa0356940 ;
> 				void * priv 	 0x0 ;
> 			} tasklet_ctx;
> 			int        reset_notify_added 	 0x0 ;
> 			struct  list_head {
> 				struct list_head * next 	 0xffffffffa0
> 300700 ;
> 				struct list_head * prev 	 0xd ;
> 			} reset_notify;
> 			struct mlx5_eq_comp * eq 	 0x0 ;
> 			short unsigned int uid 	 0x9a70 ;
> 		} mcq;
> 		struct mlx5e_channel * channel 	 0xffff8881b0899a70 ;
> 		struct mlx5_core_dev * mdev 	 0x4800000001 ;
> 		struct  mlx5_wq_ctrl {
> 			struct mlx5_core_dev * mdev 	 0xffffffffa0
> 2d5350 ;
> 			struct  mlx5_frag_buf {
> 				struct mlx5_buf_list * frags 	 0xff
> ffffffa02d5460 ;
> 				int npages 	 0x0 ;
> 				int size 	 0x5 ;
> 				unsigned char page_shift 	 0x8 ;
> 			} buf;
> 			struct  mlx5_db {
> 				__be32 * db 	 0x1c6 ;
> 				union   {
> 					struct mlx5_db_pgdir * pgdir 	
>  0x0 ;
> 					struct mlx5_ib_user_db_page *
> user_page 	 0x0 ;
> 				} u;
> 				long long unsigned int dma 	 0xff
> ff8881b0899ab0 ;
> 				int index 	 0x0 ;
> 			} db;
> 		} wq_ctrl;
> 	} cq;
> 	struct  mlx5_wq_cyc {
> 		struct  mlx5_frag_buf_ctrl {
> 			struct mlx5_buf_list * frags 	 0xffff8881a7
> 600160 ;
> 			unsigned int sz_m1 	 0xa7600160 ;
> 			short unsigned int frag_sz_m1 	 0x8881 ;
> 			short unsigned int strides_offset 	 0xff
> ff ;
> 			unsigned char log_sz 	 0x88 ;
> 			unsigned char log_stride 	 0x49 ;
> 			unsigned char log_frag_strides 	 0xaa ;
> 		} fbc;
> 		__be32 *           db 	 0x1000000000010 ;
> 		short unsigned int sz 	 0xc ;
> 		short unsigned int wqe_ctr 	 0x0 ;
> 		short unsigned int cur_sz 	 0x0 ;
> 	} wq;
> 	unsigned int               dma_fifo_mask 	 0xa1814500 ;
> 	struct mlx5e_sq_stats *    stats 	 0xffff8881a33a0348 ;
> 	struct   {
> 		struct mlx5e_sq_dma * dma_fifo 	 0x1a1814500 ;
> 		struct mlx5e_tx_wqe_info * wqe_info 	 0x14 ;
> 	} db;
> 	void *                     uar_map 	 0x0 ;
> 	struct netdev_queue *      txq 	 0x0 ;
> 	unsigned int               sqn 	 0x18c0 ;
> 	unsigned char              min_inline_mode 	 0x0 ;
> 	struct device *            pdev 	 0x0 ;
> 	unsigned int               mkey_be 	 0x0 ;
> 	long unsigned int          state 	 0x0 ;
> 	struct hwtstamp_config *   tstamp 	 0x0 ;
> 	struct mlx5_clock *        clock 	 0xffff8881b1aa6f88 ;
> 	struct  mlx5_wq_ctrl {
> 		struct mlx5_core_dev * mdev 	 0x3f000003ff ;
> 		struct  mlx5_frag_buf {
> 			struct mlx5_buf_list * frags 	 0x6060a ;
> 			int        npages 	 0xa1814604 ;
> 			int        size 	 0xffff8881 ;
> 			unsigned char page_shift 	 0x0 ;
> 		} buf;
> 		struct  mlx5_db {
> 			__be32 *   db 	 0xfff ;
> 			union   {
> 				struct mlx5_db_pgdir * pgdir 	 0x0
> ;
> 				struct mlx5_ib_user_db_page *
> user_page 	 0x0 ;
> 			} u;
> 			long long unsigned int dma 	 0xffff888188
> 440000 ;
> 			int        index 	 0x8b074000 ;
> 		} db;
> 	} wq_ctrl;
> 	struct mlx5e_channel *     channel 	 0xffffc9000010d800 ;
> 	int                        txq_ix 	 0xa0020180 ;

txq_ix is too high to make any sense here.


> 	unsigned int               rate_limit 	 0xffff8881 ;
> 	struct  work_struct {
> 		struct   {
> 			long int   counter 	 0x1000018c0 ;
> 		} data;
> 		struct  list_head {
> 			struct list_head * next 	 0xffff8881c32b68e8 ;
> 			struct list_head * prev 	 0x800 ;
> 		} entry;
> 		void               (*func)(struct work_struct *) 	 0x9
> ;
> 	} recover_work;
> } ;
> 
> Signed-off-by: Aya Levin <ayal@mellanox.com>
> ---
>  .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100
> +++++++++++++++++++++
>  1 file changed, 100 insertions(+)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
> b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
> index 476dd97f7f2f..8a39f5525e57 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
> @@ -9,6 +9,7 @@
>  
>  struct mlx5e_tx_err_ctx {
>  	int (*recover)(struct mlx5e_txqsq *sq);
> +	int (*dump)(struct mlx5e_txqsq *sq);
>  	struct mlx5e_txqsq *sq;
>  };
>  
> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct
> devlink_health_reporter *reporter,
>  	return err;
>  }
>  
> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv
> *priv,
> +					      struct mlx5e_txqsq *sq,
> +					      struct devlink_fmsg
> *fmsg)
> +{
> +	u64 *ptr = (u64 *)sq;
> +	int copy, err;
> +	int i = 0;
> +
> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
> +		return 0;
> +
> +	err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
> +	if (err)
> +		return err;
> +
> +	err = devlink_fmsg_obj_nest_start(fmsg);
> +	if (err)
> +		return err;
> +
> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
> +	if (err)
> +		return err;
> +
> +	while (i < sizeof(struct mlx5e_txqsq)) {
> +		copy = sizeof(u64);
> +
> +		if (i + copy > sizeof(struct mlx5e_txqsq))
> +			copy = sizeof(struct mlx5e_txqsq) - i;
> +
> +		err = devlink_fmsg_binary_put(fmsg, ptr, copy);
> +		if (err)
> +			return err;
> +		ptr++;
> +		i += copy;
> +	}
> +
> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
> +	if (err)
> +		return err;
> +
> +	err = devlink_fmsg_obj_nest_end(fmsg);
> +	if (err)
> +		return err;
> +
> +	err = devlink_fmsg_pair_nest_end(fmsg);
> +
> +	return err;
> +}
> +
> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
> +					 struct devlink_fmsg *fmsg)
> +{
> +	int i, err = 0;
> +
> +	mutex_lock(&priv->state_lock);
> +
> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
> +		goto unlock;
> +
> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
> +	if (err)
> +		goto unlock;
> +
> +	for (i = 0; i < priv->channels.num * priv-
> >channels.params.num_tc;
> +	     i++) {
> +		err = devlink_fmsg_obj_nest_start(fmsg);
> +		if (err)
> +			goto unlock;
> +
> +		err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv-
> >txq2sq[i],
> +							 fmsg);
> +		if (err)
> +			goto unlock;
> +
> +		err = devlink_fmsg_pair_nest_end(fmsg);
> +		if (err)
> +			goto unlock;
> +	}
> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
> +	if (err)
> +		goto unlock;
> +
> +unlock:
> +	mutex_unlock(&priv->state_lock);
> +	return err;
> +}
> +
> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter
> *reporter,
> +				     struct devlink_fmsg *fmsg, void
> *context)
> +{
> +	struct mlx5e_priv *priv =
> devlink_health_reporter_priv(reporter);
> +	struct mlx5e_tx_err_ctx *err_ctx = context;
> +
> +	return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv,
> err_ctx->sq,
> +							    fmsg) :
> +			 mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
> +}
> +
>  static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops
> = {
>  		.name = "tx",
>  		.recover = mlx5e_tx_reporter_recover,
>  		.diagnose = mlx5e_tx_reporter_diagnose,
> +		.dump = mlx5e_tx_reporter_sw_dump,
>  };
>  
>  #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
Jakub Kicinski April 30, 2019, 12:54 a.m. UTC | #2
On Mon, 29 Apr 2019 17:17:39 +0300, Aya Levin wrote:
> In order to offline translate the raw memory into a human readable
> format, the user can use some out-of-kernel scripts which receives as an
> input the following:
> - Object raw memory
> - Driver object compiled with debug info (can be taken/generated at any time from the machine)
> - Object name

Nice!  IMHO this is cleaner, more precise and more scalable than the fmsg
stuff that we have now.

Would you mind taking the string identifiers down a little bit more?
"memory" could just have a first-class netlink attribute, it doesn't
have to be this fake JSON string pair..
Aya Levin April 30, 2019, 11:13 a.m. UTC | #3
On 4/29/2019 9:32 PM, Saeed Mahameed wrote:
> On Mon, 2019-04-29 at 17:17 +0300, Aya Levin wrote:
>> TX reporter reports an error on two scenarios:
>> - TX timeout on a specific tx queue
>> - TX completion error on a specific send queue
>> Prior to this patch, no dump data was supported by the tx reporter.
>> This
>> patch adds support for SW data dump of the related SQ context. The
>> dump
>> is simply the SQ's raw memory snapshot taken right after the error
>> was
>> reported, before any recovery procedure was launched. With this
>> approach, no maintenance is needed as the driver fetch the actual
>> data
>> according to the layout on which the SQ was compiled with.  By
>> providing
>> a SW context, one can easily debug error on a given SQ.
>>
>> In order to offline translate the raw memory into a human readable
>> format, the user can use some out-of-kernel scripts which receives as
>> an
>> input the following:
>> - Object raw memory
>> - Driver object compiled with debug info (can be taken/generated at
>> any time from the machine)
>> - Object name
>>
>> An example of such script output can be seen below.
>> Note: the script is not offered as part of this patch as it do not
>> belong to the kernel, I just described it in order to grasp the
>> general
>> idea of how/what can be fetched from SW dump via devlink health.
>>
> 
> What was the script ? provided a given raw dump how do you find which
> version is it, object name ?

The script parses the pahole output of an object with debug info; from
it, the script extracts the struct's layout and member offsets. In
addition, it merges this with the raw memory given by devlink.
Since the analysis of the dump is done offline, the customer/vendor
should use the object file of the corresponding version which generated
the report.
> 
> did you use any well known raw debug format, like DWARF ?
Yes, I use a standard debug format supported by pahole (in my case 
DWARF). In addition to debug info I also set CONFIG_DEBUG_INFO_REDUCED=no
> 
>> The output of the SW dump can be extracted by devlink health command:
>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>>   mlx5e_txqsq: sqn: 6336
>>   memory:
>>     00 00 00 00 00 00 00 00
>>     01 00 00 00 00 00 00 00
>>     00 00 00 00 00 00 00 00
>>     45 f4 88 cb 09 00 00 00
>>     00 00 00 00 00 00 00 00
>>     00 00 00 00 00 00 00 00
>>     c0 ff ff ff 1f 00 00 00
>>     f8 18 1e 89 81 88 ff ff
>>     ...
>>
>> script output below, with struct members names and actual values:
>>
>> struct  mlx5e_txqsq {
>> 	short unsigned int         cc 	 0x5 ;
>> 	unsigned int               dma_fifo_cc 	 0x5 ;
>> 	struct  net_dim {
>> 		unsigned char      state 	 0x1 ;
>> 		struct  net_dim_stats {
>> 			int        ppms 	 0x0 ;
>> 			int        bpms 	 0x0 ;
>> 			int        epms 	 0x0 ;
>> 		} prev_stats;
>> 		struct  net_dim_sample {
>> 			long long int time 	 0x90766ef9d ;
>> 			unsigned int pkt_ctr 	 0x0 ;
>> 			unsigned int byte_ctr 	 0x0 ;
>> 			short unsigned int event_ctr 	 0x0 ;
>> 		} start_sample;
>> 		struct  work_struct {
>> 			struct   {
>> 				long int counter 	 0x1fffffffc0 ;
>> 			} data;
>> 			struct  list_head {
>> 				struct list_head * next 	 0xffff8881b0
>> 8998f8 ;
>> 				struct list_head * prev 	 0xffff8881b0
>> 8998f8 ;
>> 			} entry;
>> 			void       (*func)(struct work_struct *) 	 0xff
>> ffffffa02d0e30 ;
>> 		} work;
>> 		unsigned char      profile_ix 	 0x60 ;
>> 		unsigned char      mode 	 0x72 ;
>> 		unsigned char      tune_state 	 0x35 ;
>> 		unsigned char      steps_right 	 0xa0 ;
>> 		unsigned char      steps_left 	 0xff ;
>> 		unsigned char      tired 	 0xff ;
>> 	} dim;
>> 	short unsigned int         pc 	 0x0 ;
>> 	unsigned int               dma_fifo_pc 	 0x0 ;
>> 	struct  mlx5e_cq {
>> 		struct  mlx5_cqwq {
>> 			struct  mlx5_frag_buf_ctrl {
>> 				struct mlx5_buf_list * frags 	 0x50
>> 0000005 ;
>> 				unsigned int sz_m1 	 0x0 ;
>> 				short unsigned int frag_sz_m1 	 0x0
>> ;
>> 				short unsigned int strides_offset 	
>>   0x0 ;
>> 				unsigned char log_sz 	 0x0 ;
>> 				unsigned char log_stride 	 0x0 ;
>> 				unsigned char log_frag_strides 	 0x0
>> ;
>> 			} fbc;
>> 			__be32 *   db 	 0x0 ;
>> 			unsigned int cc 	 0x0 ;
>> 		} wq;
>> 		short unsigned int event_ctr 	 0x0 ;
>> 		struct napi_struct * napi 	 0x0 ;
>> 		struct  mlx5_core_cq {
>> 			unsigned int cqn 	 0x0 ;
>> 			int        cqe_sz 	 0x0 ;
>> 			__be32 *   set_ci_db 	 0xffff8881b1aa4988 ;
>> 			__be32 *   arm_db 	 0x3f000003ff ;
>> 			struct mlx5_uars_page * uar 	 0x6060a ;
>> 			struct  refcount_struct {
>> 				struct   {
>> 					int    counter 	 0xa1814500 ;
>> 				} refs;
>> 			} refcount;
>> 			struct  completion {
>> 				unsigned int done 	 0x5 ;
>> 				struct  wait_queue_head {
>> 					struct  spinlock {
>> 						union   {
>> 							struct  raw_spi
>> nlock {
>> 								struct
>>   qspinlock {
>> 									
>> union   {
>> 									
>> 	struct   {
>> 									
>> 		int
>> counter 	 0x5 ;
>> 									
>> 	} val;
>> 									
>> 	struct   {
>> 									
>> 		unsigned
>> char                                          locked 	 0x5 ;
>> 									
>> 		unsigned
>> char                                          pending 	 0x0 ;
>> 									
>> 	} ;
>> 									
>> 	struct   {
>> 									
>> 		short unsigned
>> int                                     locked_pending 	 0x5 ;
>> 									
>> 		short unsigned
>> int                                     tail 	 0x0 ;
>> 									
>> 	} ;
>> 									
>> } ;
>> 								}
>> raw_lock;
>> 							} rlock;
>> 						} ;
>> 					} lock;
>> 					struct  list_head {
>> 						struct list_head *
>> next 	 0xffff8881b089bb88 ;
>> 						struct list_head *
>> prev 	 0x4000000c0a ;
>> 					} head;
>> 				} wait;
>> 			} free;
>> 			unsigned int vector 	 0xa1814500 ;
>> 			unsigned int irqn 	 0xffff8881 ;
>> 			void       (*comp)(struct mlx5_core_cq *) 	
>>   0xffff8881a1814504 ;
>> 			void       (*event)(struct mlx5_core_cq *, enum
>> mlx5_event) 	 0xffff8881a2cdea08 ;
>> 			unsigned int cons_index 	 0x1 ;
>> 			unsigned int arm_sn 	 0x0 ;
>> 			struct mlx5_rsc_debug * dbg 	 0x0 ;
>> 			int        pid 	 0x0 ;
>> 			struct   {
>> 				struct  list_head {
>> 					struct list_head * next 	 0xff
>> ffffff ;
>> 					struct list_head * prev 	 0xff
>> ffffffffffffff ;
>> 				} list;
>> 				void (*comp)(struct mlx5_core_cq *) 	
>>   0xffffffffa0356940 ;
>> 				void * priv 	 0x0 ;
>> 			} tasklet_ctx;
>> 			int        reset_notify_added 	 0x0 ;
>> 			struct  list_head {
>> 				struct list_head * next 	 0xffffffffa0
>> 300700 ;
>> 				struct list_head * prev 	 0xd ;
>> 			} reset_notify;
>> 			struct mlx5_eq_comp * eq 	 0x0 ;
>> 			short unsigned int uid 	 0x9a70 ;
>> 		} mcq;
>> 		struct mlx5e_channel * channel 	 0xffff8881b0899a70 ;
>> 		struct mlx5_core_dev * mdev 	 0x4800000001 ;
>> 		struct  mlx5_wq_ctrl {
>> 			struct mlx5_core_dev * mdev 	 0xffffffffa0
>> 2d5350 ;
>> 			struct  mlx5_frag_buf {
>> 				struct mlx5_buf_list * frags 	 0xff
>> ffffffa02d5460 ;
>> 				int npages 	 0x0 ;
>> 				int size 	 0x5 ;
>> 				unsigned char page_shift 	 0x8 ;
>> 			} buf;
>> 			struct  mlx5_db {
>> 				__be32 * db 	 0x1c6 ;
>> 				union   {
>> 					struct mlx5_db_pgdir * pgdir 	
>>   0x0 ;
>> 					struct mlx5_ib_user_db_page *
>> user_page 	 0x0 ;
>> 				} u;
>> 				long long unsigned int dma 	 0xff
>> ff8881b0899ab0 ;
>> 				int index 	 0x0 ;
>> 			} db;
>> 		} wq_ctrl;
>> 	} cq;
>> 	struct  mlx5_wq_cyc {
>> 		struct  mlx5_frag_buf_ctrl {
>> 			struct mlx5_buf_list * frags 	 0xffff8881a7
>> 600160 ;
>> 			unsigned int sz_m1 	 0xa7600160 ;
>> 			short unsigned int frag_sz_m1 	 0x8881 ;
>> 			short unsigned int strides_offset 	 0xff
>> ff ;
>> 			unsigned char log_sz 	 0x88 ;
>> 			unsigned char log_stride 	 0x49 ;
>> 			unsigned char log_frag_strides 	 0xaa ;
>> 		} fbc;
>> 		__be32 *           db 	 0x1000000000010 ;
>> 		short unsigned int sz 	 0xc ;
>> 		short unsigned int wqe_ctr 	 0x0 ;
>> 		short unsigned int cur_sz 	 0x0 ;
>> 	} wq;
>> 	unsigned int               dma_fifo_mask 	 0xa1814500 ;
>> 	struct mlx5e_sq_stats *    stats 	 0xffff8881a33a0348 ;
>> 	struct   {
>> 		struct mlx5e_sq_dma * dma_fifo 	 0x1a1814500 ;
>> 		struct mlx5e_tx_wqe_info * wqe_info 	 0x14 ;
>> 	} db;
>> 	void *                     uar_map 	 0x0 ;
>> 	struct netdev_queue *      txq 	 0x0 ;
>> 	unsigned int               sqn 	 0x18c0 ;
>> 	unsigned char              min_inline_mode 	 0x0 ;
>> 	struct device *            pdev 	 0x0 ;
>> 	unsigned int               mkey_be 	 0x0 ;
>> 	long unsigned int          state 	 0x0 ;
>> 	struct hwtstamp_config *   tstamp 	 0x0 ;
>> 	struct mlx5_clock *        clock 	 0xffff8881b1aa6f88 ;
>> 	struct  mlx5_wq_ctrl {
>> 		struct mlx5_core_dev * mdev 	 0x3f000003ff ;
>> 		struct  mlx5_frag_buf {
>> 			struct mlx5_buf_list * frags 	 0x6060a ;
>> 			int        npages 	 0xa1814604 ;
>> 			int        size 	 0xffff8881 ;
>> 			unsigned char page_shift 	 0x0 ;
>> 		} buf;
>> 		struct  mlx5_db {
>> 			__be32 *   db 	 0xfff ;
>> 			union   {
>> 				struct mlx5_db_pgdir * pgdir 	 0x0
>> ;
>> 				struct mlx5_ib_user_db_page *
>> user_page 	 0x0 ;
>> 			} u;
>> 			long long unsigned int dma 	 0xffff888188
>> 440000 ;
>> 			int        index 	 0x8b074000 ;
>> 		} db;
>> 	} wq_ctrl;
>> 	struct mlx5e_channel *     channel 	 0xffffc9000010d800 ;
>> 	int                        txq_ix 	 0xa0020180 ;
> 
> txq_ix is too hight to make any sense here.
Thank you for noticing this - I found the bug in the script and fixed it.
> 
> 
>> 	unsigned int               rate_limit 	 0xffff8881 ;
>> 	struct  work_struct {
>> 		struct   {
>> 			long int   counter 	 0x1000018c0 ;
>> 		} data;
>> 		struct  list_head {
>> 			struct list_head * next 	 0xffff8881c32b68e8 ;
>> 			struct list_head * prev 	 0x800 ;
>> 		} entry;
>> 		void               (*func)(struct work_struct *) 	 0x9
>> ;
>> 	} recover_work;
>> } ;
>>
>> Signed-off-by: Aya Levin <ayal@mellanox.com>
>> ---
>>   .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100
>> +++++++++++++++++++++
>>   1 file changed, 100 insertions(+)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>> b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>> index 476dd97f7f2f..8a39f5525e57 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>> @@ -9,6 +9,7 @@
>>   
>>   struct mlx5e_tx_err_ctx {
>>   	int (*recover)(struct mlx5e_txqsq *sq);
>> +	int (*dump)(struct mlx5e_txqsq *sq);
>>   	struct mlx5e_txqsq *sq;
>>   };
>>   
>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct
>> devlink_health_reporter *reporter,
>>   	return err;
>>   }
>>   
>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv
>> *priv,
>> +					      struct mlx5e_txqsq *sq,
>> +					      struct devlink_fmsg
>> *fmsg)
>> +{
>> +	u64 *ptr = (u64 *)sq;
>> +	int copy, err;
>> +	int i = 0;
>> +
>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>> +		return 0;
>> +
>> +	err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>> +	if (err)
>> +		return err;
>> +
>> +	err = devlink_fmsg_obj_nest_start(fmsg);
>> +	if (err)
>> +		return err;
>> +
>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>> +	if (err)
>> +		return err;
>> +
>> +	while (i < sizeof(struct mlx5e_txqsq)) {
>> +		copy = sizeof(u64);
>> +
>> +		if (i + copy > sizeof(struct mlx5e_txqsq))
>> +			copy = sizeof(struct mlx5e_txqsq) - i;
>> +
>> +		err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>> +		if (err)
>> +			return err;
>> +		ptr++;
>> +		i += copy;
>> +	}
>> +
>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>> +	if (err)
>> +		return err;
>> +
>> +	err = devlink_fmsg_obj_nest_end(fmsg);
>> +	if (err)
>> +		return err;
>> +
>> +	err = devlink_fmsg_pair_nest_end(fmsg);
>> +
>> +	return err;
>> +}
>> +
>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>> +					 struct devlink_fmsg *fmsg)
>> +{
>> +	int i, err = 0;
>> +
>> +	mutex_lock(&priv->state_lock);
>> +
>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>> +		goto unlock;
>> +
>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>> +	if (err)
>> +		goto unlock;
>> +
>> +	for (i = 0; i < priv->channels.num * priv-
>>> channels.params.num_tc;
>> +	     i++) {
>> +		err = devlink_fmsg_obj_nest_start(fmsg);
>> +		if (err)
>> +			goto unlock;
>> +
>> +		err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv-
>>> txq2sq[i],
>> +							 fmsg);
>> +		if (err)
>> +			goto unlock;
>> +
>> +		err = devlink_fmsg_pair_nest_end(fmsg);
>> +		if (err)
>> +			goto unlock;
>> +	}
>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>> +	if (err)
>> +		goto unlock;
>> +
>> +unlock:
>> +	mutex_unlock(&priv->state_lock);
>> +	return err;
>> +}
>> +
>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter
>> *reporter,
>> +				     struct devlink_fmsg *fmsg, void
>> *context)
>> +{
>> +	struct mlx5e_priv *priv =
>> devlink_health_reporter_priv(reporter);
>> +	struct mlx5e_tx_err_ctx *err_ctx = context;
>> +
>> +	return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv,
>> err_ctx->sq,
>> +							    fmsg) :
>> +			 mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>> +}
>> +
>>   static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops
>> = {
>>   		.name = "tx",
>>   		.recover = mlx5e_tx_reporter_recover,
>>   		.diagnose = mlx5e_tx_reporter_diagnose,
>> +		.dump = mlx5e_tx_reporter_sw_dump,
>>   };
>>   
>>   #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
Aya Levin April 30, 2019, 11:26 a.m. UTC | #4
On 4/30/2019 3:54 AM, Jakub Kicinski wrote:
> On Mon, 29 Apr 2019 17:17:39 +0300, Aya Levin wrote:
>> In order to offline translate the raw memory into a human readable
>> format, the user can use some out-of-kernel scripts which receives as an
>> input the following:
>> - Object raw memory
>> - Driver object compiled with debug info (can be taken/generated at any time from the machine)
>> - Object name
> 
> Nice!  IMHO this is more clean, precise and scalable than the fmsg stuff
> that we have now.
> 
> Would you mind taking the string identifiers down a little bit more?
> "memory" could just have a first-class netlink attribute, it doesn't
> have to be this fake JSON string pair..
> 
Thanks!
I am still using fmsg, which is the API for constructing JSON-like output.
The fmsg object contains the raw data.
I like your idea to remove the "memory" tag and just leave the array of
byte arrays. The SW dump devlink output is meaningless without scripting.
Jiri Pirko May 7, 2019, 12:41 p.m. UTC | #5
Mon, Apr 29, 2019 at 04:17:39PM CEST, ayal@mellanox.com wrote:
>TX reporter reports an error on two scenarios:
>- TX timeout on a specific tx queue
>- TX completion error on a specific send queue
>Prior to this patch, no dump data was supported by the tx reporter. This
>patch adds support for SW data dump of the related SQ context. The dump
>is simply the SQ's raw memory snapshot taken right after the error was
>reported, before any recovery procedure was launched. With this
>approach, no maintenance is needed as the driver fetch the actual data
>according to the layout on which the SQ was compiled with.  By providing
>a SW context, one can easily debug error on a given SQ.
>
>In order to offline translate the raw memory into a human readable
>format, the user can use some out-of-kernel scripts which receives as an
>input the following:
>- Object raw memory
>- Driver object compiled with debug info (can be taken/generated at any time from the machine)
>- Object name
>
>An example of such script output can be seen below.
>Note: the script is not offered as part of this patch as it do not
>belong to the kernel, I just described it in order to grasp the general
>idea of how/what can be fetched from SW dump via devlink health.
>
>The output of the SW dump can be extracted by devlink health command:
>$ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
> mlx5e_txqsq: sqn: 6336
> memory:
>   00 00 00 00 00 00 00 00
>   01 00 00 00 00 00 00 00
>   00 00 00 00 00 00 00 00
>   45 f4 88 cb 09 00 00 00
>   00 00 00 00 00 00 00 00
>   00 00 00 00 00 00 00 00
>   c0 ff ff ff 1f 00 00 00
>   f8 18 1e 89 81 88 ff ff
>   ...
>
>script output below, with struct members names and actual values:
>
>struct  mlx5e_txqsq {
>	short unsigned int         cc 	 0x5 ;
>	unsigned int               dma_fifo_cc 	 0x5 ;
>	struct  net_dim {
>		unsigned char      state 	 0x1 ;
>		struct  net_dim_stats {
>			int        ppms 	 0x0 ;
>			int        bpms 	 0x0 ;
>			int        epms 	 0x0 ;
>		} prev_stats;
>		struct  net_dim_sample {
>			long long int time 	 0x90766ef9d ;
>			unsigned int pkt_ctr 	 0x0 ;
>			unsigned int byte_ctr 	 0x0 ;
>			short unsigned int event_ctr 	 0x0 ;
>		} start_sample;
>		struct  work_struct {
>			struct   {
>				long int counter 	 0x1fffffffc0 ;
>			} data;
>			struct  list_head {
>				struct list_head * next 	 0xffff8881b08998f8 ;
>				struct list_head * prev 	 0xffff8881b08998f8 ;
>			} entry;
>			void       (*func)(struct work_struct *) 	 0xffffffffa02d0e30 ;
>		} work;
>		unsigned char      profile_ix 	 0x60 ;
>		unsigned char      mode 	 0x72 ;
>		unsigned char      tune_state 	 0x35 ;
>		unsigned char      steps_right 	 0xa0 ;
>		unsigned char      steps_left 	 0xff ;
>		unsigned char      tired 	 0xff ;
>	} dim;
>	short unsigned int         pc 	 0x0 ;
>	unsigned int               dma_fifo_pc 	 0x0 ;
>	struct  mlx5e_cq {
>		struct  mlx5_cqwq {
>			struct  mlx5_frag_buf_ctrl {
>				struct mlx5_buf_list * frags 	 0x500000005 ;
>				unsigned int sz_m1 	 0x0 ;
>				short unsigned int frag_sz_m1 	 0x0 ;
>				short unsigned int strides_offset 	 0x0 ;
>				unsigned char log_sz 	 0x0 ;
>				unsigned char log_stride 	 0x0 ;
>				unsigned char log_frag_strides 	 0x0 ;
>			} fbc;
>			__be32 *   db 	 0x0 ;
>			unsigned int cc 	 0x0 ;
>		} wq;
>		short unsigned int event_ctr 	 0x0 ;
>		struct napi_struct * napi 	 0x0 ;
>		struct  mlx5_core_cq {
>			unsigned int cqn 	 0x0 ;
>			int        cqe_sz 	 0x0 ;
>			__be32 *   set_ci_db 	 0xffff8881b1aa4988 ;
>			__be32 *   arm_db 	 0x3f000003ff ;
>			struct mlx5_uars_page * uar 	 0x6060a ;
>			struct  refcount_struct {
>				struct   {
>					int    counter 	 0xa1814500 ;
>				} refs;
>			} refcount;
>			struct  completion {
>				unsigned int done 	 0x5 ;
>				struct  wait_queue_head {
>					struct  spinlock {
>						union   {
>							struct  raw_spinlock {
>								struct  qspinlock {
>									union   {
>										struct   {
>											int                                                    counter 	 0x5 ;
>										} val;
>										struct   {
>											unsigned char                                          locked 	 0x5 ;
>											unsigned char                                          pending 	 0x0 ;
>										} ;
>										struct   {
>											short unsigned int                                     locked_pending 	 0x5 ;
>											short unsigned int                                     tail 	 0x0 ;
>										} ;
>									} ;
>								} raw_lock;
>							} rlock;
>						} ;
>					} lock;
>					struct  list_head {
>						struct list_head * next 	 0xffff8881b089bb88 ;
>						struct list_head * prev 	 0x4000000c0a ;
>					} head;
>				} wait;
>			} free;
>			unsigned int vector 	 0xa1814500 ;
>			unsigned int irqn 	 0xffff8881 ;
>			void       (*comp)(struct mlx5_core_cq *) 	 0xffff8881a1814504 ;
>			void       (*event)(struct mlx5_core_cq *, enum mlx5_event) 	 0xffff8881a2cdea08 ;
>			unsigned int cons_index 	 0x1 ;
>			unsigned int arm_sn 	 0x0 ;
>			struct mlx5_rsc_debug * dbg 	 0x0 ;
>			int        pid 	 0x0 ;
>			struct   {
>				struct  list_head {
>					struct list_head * next 	 0xffffffff ;
>					struct list_head * prev 	 0xffffffffffffffff ;
>				} list;
>				void (*comp)(struct mlx5_core_cq *) 	 0xffffffffa0356940 ;
>				void * priv 	 0x0 ;
>			} tasklet_ctx;
>			int        reset_notify_added 	 0x0 ;
>			struct  list_head {
>				struct list_head * next 	 0xffffffffa0300700 ;
>				struct list_head * prev 	 0xd ;
>			} reset_notify;
>			struct mlx5_eq_comp * eq 	 0x0 ;
>			short unsigned int uid 	 0x9a70 ;
>		} mcq;
>		struct mlx5e_channel * channel 	 0xffff8881b0899a70 ;
>		struct mlx5_core_dev * mdev 	 0x4800000001 ;
>		struct  mlx5_wq_ctrl {
>			struct mlx5_core_dev * mdev 	 0xffffffffa02d5350 ;
>			struct  mlx5_frag_buf {
>				struct mlx5_buf_list * frags 	 0xffffffffa02d5460 ;
>				int npages 	 0x0 ;
>				int size 	 0x5 ;
>				unsigned char page_shift 	 0x8 ;
>			} buf;
>			struct  mlx5_db {
>				__be32 * db 	 0x1c6 ;
>				union   {
>					struct mlx5_db_pgdir * pgdir 	 0x0 ;
>					struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>				} u;
>				long long unsigned int dma 	 0xffff8881b0899ab0 ;
>				int index 	 0x0 ;
>			} db;
>		} wq_ctrl;
>	} cq;
>	struct  mlx5_wq_cyc {
>		struct  mlx5_frag_buf_ctrl {
>			struct mlx5_buf_list * frags 	 0xffff8881a7600160 ;
>			unsigned int sz_m1 	 0xa7600160 ;
>			short unsigned int frag_sz_m1 	 0x8881 ;
>			short unsigned int strides_offset 	 0xffff ;
>			unsigned char log_sz 	 0x88 ;
>			unsigned char log_stride 	 0x49 ;
>			unsigned char log_frag_strides 	 0xaa ;
>		} fbc;
>		__be32 *           db 	 0x1000000000010 ;
>		short unsigned int sz 	 0xc ;
>		short unsigned int wqe_ctr 	 0x0 ;
>		short unsigned int cur_sz 	 0x0 ;
>	} wq;
>	unsigned int               dma_fifo_mask 	 0xa1814500 ;
>	struct mlx5e_sq_stats *    stats 	 0xffff8881a33a0348 ;
>	struct   {
>		struct mlx5e_sq_dma * dma_fifo 	 0x1a1814500 ;
>		struct mlx5e_tx_wqe_info * wqe_info 	 0x14 ;
>	} db;
>	void *                     uar_map 	 0x0 ;
>	struct netdev_queue *      txq 	 0x0 ;
>	unsigned int               sqn 	 0x18c0 ;
>	unsigned char              min_inline_mode 	 0x0 ;
>	struct device *            pdev 	 0x0 ;
>	unsigned int               mkey_be 	 0x0 ;
>	long unsigned int          state 	 0x0 ;
>	struct hwtstamp_config *   tstamp 	 0x0 ;
>	struct mlx5_clock *        clock 	 0xffff8881b1aa6f88 ;
>	struct  mlx5_wq_ctrl {
>		struct mlx5_core_dev * mdev 	 0x3f000003ff ;
>		struct  mlx5_frag_buf {
>			struct mlx5_buf_list * frags 	 0x6060a ;
>			int        npages 	 0xa1814604 ;
>			int        size 	 0xffff8881 ;
>			unsigned char page_shift 	 0x0 ;
>		} buf;
>		struct  mlx5_db {
>			__be32 *   db 	 0xfff ;
>			union   {
>				struct mlx5_db_pgdir * pgdir 	 0x0 ;
>				struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>			} u;
>			long long unsigned int dma 	 0xffff888188440000 ;
>			int        index 	 0x8b074000 ;
>		} db;
>	} wq_ctrl;
>	struct mlx5e_channel *     channel 	 0xffffc9000010d800 ;
>	int                        txq_ix 	 0xa0020180 ;
>	unsigned int               rate_limit 	 0xffff8881 ;
>	struct  work_struct {
>		struct   {
>			long int   counter 	 0x1000018c0 ;
>		} data;
>		struct  list_head {
>			struct list_head * next 	 0xffff8881c32b68e8 ;
>			struct list_head * prev 	 0x800 ;
>		} entry;
>		void               (*func)(struct work_struct *) 	 0x9 ;
>	} recover_work;
>} ;

I don't get it. You are dumping live kernel memory? There are already
facilities in place to do that. Why replicate it?


>
>Signed-off-by: Aya Levin <ayal@mellanox.com>
>---
> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 +++++++++++++++++++++
> 1 file changed, 100 insertions(+)
>
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>index 476dd97f7f2f..8a39f5525e57 100644
>--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>@@ -9,6 +9,7 @@
> 
> struct mlx5e_tx_err_ctx {
> 	int (*recover)(struct mlx5e_txqsq *sq);
>+	int (*dump)(struct mlx5e_txqsq *sq);
> 	struct mlx5e_txqsq *sq;
> };
> 
>@@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
> 	return err;
> }
> 
>+static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>+					      struct mlx5e_txqsq *sq,
>+					      struct devlink_fmsg *fmsg)
>+{
>+	u64 *ptr = (u64 *)sq;
>+	int copy, err;
>+	int i = 0;
>+
>+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>+		return 0;
>+
>+	err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>+	if (err)
>+		return err;
>+
>+	err = devlink_fmsg_obj_nest_start(fmsg);
>+	if (err)
>+		return err;
>+
>+	err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>+	if (err)
>+		return err;
>+
>+	while (i < sizeof(struct mlx5e_txqsq)) {
>+		copy = sizeof(u64);
>+
>+		if (i + copy > sizeof(struct mlx5e_txqsq))
>+			copy = sizeof(struct mlx5e_txqsq) - i;
>+
>+		err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>+		if (err)
>+			return err;
>+		ptr++;
>+		i += copy;
>+	}
>+
>+	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>+	if (err)
>+		return err;
>+
>+	err = devlink_fmsg_obj_nest_end(fmsg);
>+	if (err)
>+		return err;
>+
>+	err = devlink_fmsg_pair_nest_end(fmsg);
>+
>+	return err;
>+}
>+
>+static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>+					 struct devlink_fmsg *fmsg)
>+{
>+	int i, err = 0;
>+
>+	mutex_lock(&priv->state_lock);
>+
>+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>+		goto unlock;
>+
>+	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>+	if (err)
>+		goto unlock;
>+
>+	for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
>+	     i++) {
>+		err = devlink_fmsg_obj_nest_start(fmsg);
>+		if (err)
>+			goto unlock;
>+
>+		err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
>+							 fmsg);
>+		if (err)
>+			goto unlock;
>+
>+		err = devlink_fmsg_pair_nest_end(fmsg);
>+		if (err)
>+			goto unlock;
>+	}
>+	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>+	if (err)
>+		goto unlock;
>+
>+unlock:
>+	mutex_unlock(&priv->state_lock);
>+	return err;
>+}
>+
>+static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter *reporter,
>+				     struct devlink_fmsg *fmsg, void *context)
>+{
>+	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>+	struct mlx5e_tx_err_ctx *err_ctx = context;
>+
>+	return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
>+							    fmsg) :
>+			 mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>+}
>+
> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
> 		.name = "tx",
> 		.recover = mlx5e_tx_reporter_recover,
> 		.diagnose = mlx5e_tx_reporter_diagnose,
>+		.dump = mlx5e_tx_reporter_sw_dump,
> };
> 
> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>-- 
>2.14.1
>
Aya Levin May 7, 2019, 12:58 p.m. UTC | #6
On 5/7/2019 3:41 PM, Jiri Pirko wrote:
> Mon, Apr 29, 2019 at 04:17:39PM CEST, ayal@mellanox.com wrote:
>> TX reporter reports an error on two scenarios:
>> - TX timeout on a specific tx queue
>> - TX completion error on a specific send queue
>> Prior to this patch, no dump data was supported by the tx reporter. This
>> patch adds support for SW data dump of the related SQ context. The dump
>> is simply the SQ's raw memory snapshot taken right after the error was
>> reported, before any recovery procedure was launched. With this
>> approach, no maintenance is needed as the driver fetch the actual data
>> according to the layout on which the SQ was compiled with.  By providing
>> a SW context, one can easily debug error on a given SQ.
>>
>> In order to offline translate the raw memory into a human readable
>> format, the user can use some out-of-kernel scripts which receives as an
>> input the following:
>> - Object raw memory
>> - Driver object compiled with debug info (can be taken/generated at any time from the machine)
>> - Object name
>>
>> An example of such script output can be seen below.
>> Note: the script is not offered as part of this patch as it do not
>> belong to the kernel, I just described it in order to grasp the general
>> idea of how/what can be fetched from SW dump via devlink health.
>>
>> The output of the SW dump can be extracted by devlink health command:
>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>> mlx5e_txqsq: sqn: 6336
>> memory:
>>    00 00 00 00 00 00 00 00
>>    01 00 00 00 00 00 00 00
>>    00 00 00 00 00 00 00 00
>>    45 f4 88 cb 09 00 00 00
>>    00 00 00 00 00 00 00 00
>>    00 00 00 00 00 00 00 00
>>    c0 ff ff ff 1f 00 00 00
>>    f8 18 1e 89 81 88 ff ff
>>    ...
>>
>> script output below, with struct members names and actual values:
>>
>> struct  mlx5e_txqsq {
>> 	short unsigned int         cc 	 0x5 ;
>> 	unsigned int               dma_fifo_cc 	 0x5 ;
>> 	struct  net_dim {
>> 		unsigned char      state 	 0x1 ;
>> 		struct  net_dim_stats {
>> 			int        ppms 	 0x0 ;
>> 			int        bpms 	 0x0 ;
>> 			int        epms 	 0x0 ;
>> 		} prev_stats;
>> 		struct  net_dim_sample {
>> 			long long int time 	 0x90766ef9d ;
>> 			unsigned int pkt_ctr 	 0x0 ;
>> 			unsigned int byte_ctr 	 0x0 ;
>> 			short unsigned int event_ctr 	 0x0 ;
>> 		} start_sample;
>> 		struct  work_struct {
>> 			struct   {
>> 				long int counter 	 0x1fffffffc0 ;
>> 			} data;
>> 			struct  list_head {
>> 				struct list_head * next 	 0xffff8881b08998f8 ;
>> 				struct list_head * prev 	 0xffff8881b08998f8 ;
>> 			} entry;
>> 			void       (*func)(struct work_struct *) 	 0xffffffffa02d0e30 ;
>> 		} work;
>> 		unsigned char      profile_ix 	 0x60 ;
>> 		unsigned char      mode 	 0x72 ;
>> 		unsigned char      tune_state 	 0x35 ;
>> 		unsigned char      steps_right 	 0xa0 ;
>> 		unsigned char      steps_left 	 0xff ;
>> 		unsigned char      tired 	 0xff ;
>> 	} dim;
>> 	short unsigned int         pc 	 0x0 ;
>> 	unsigned int               dma_fifo_pc 	 0x0 ;
>> 	struct  mlx5e_cq {
>> 		struct  mlx5_cqwq {
>> 			struct  mlx5_frag_buf_ctrl {
>> 				struct mlx5_buf_list * frags 	 0x500000005 ;
>> 				unsigned int sz_m1 	 0x0 ;
>> 				short unsigned int frag_sz_m1 	 0x0 ;
>> 				short unsigned int strides_offset 	 0x0 ;
>> 				unsigned char log_sz 	 0x0 ;
>> 				unsigned char log_stride 	 0x0 ;
>> 				unsigned char log_frag_strides 	 0x0 ;
>> 			} fbc;
>> 			__be32 *   db 	 0x0 ;
>> 			unsigned int cc 	 0x0 ;
>> 		} wq;
>> 		short unsigned int event_ctr 	 0x0 ;
>> 		struct napi_struct * napi 	 0x0 ;
>> 		struct  mlx5_core_cq {
>> 			unsigned int cqn 	 0x0 ;
>> 			int        cqe_sz 	 0x0 ;
>> 			__be32 *   set_ci_db 	 0xffff8881b1aa4988 ;
>> 			__be32 *   arm_db 	 0x3f000003ff ;
>> 			struct mlx5_uars_page * uar 	 0x6060a ;
>> 			struct  refcount_struct {
>> 				struct   {
>> 					int    counter 	 0xa1814500 ;
>> 				} refs;
>> 			} refcount;
>> 			struct  completion {
>> 				unsigned int done 	 0x5 ;
>> 				struct  wait_queue_head {
>> 					struct  spinlock {
>> 						union   {
>> 							struct  raw_spinlock {
>> 								struct  qspinlock {
>> 									union   {
>> 										struct   {
>> 											int                                                    counter 	 0x5 ;
>> 										} val;
>> 										struct   {
>> 											unsigned char                                          locked 	 0x5 ;
>> 											unsigned char                                          pending 	 0x0 ;
>> 										} ;
>> 										struct   {
>> 											short unsigned int                                     locked_pending 	 0x5 ;
>> 											short unsigned int                                     tail 	 0x0 ;
>> 										} ;
>> 									} ;
>> 								} raw_lock;
>> 							} rlock;
>> 						} ;
>> 					} lock;
>> 					struct  list_head {
>> 						struct list_head * next 	 0xffff8881b089bb88 ;
>> 						struct list_head * prev 	 0x4000000c0a ;
>> 					} head;
>> 				} wait;
>> 			} free;
>> 			unsigned int vector 	 0xa1814500 ;
>> 			unsigned int irqn 	 0xffff8881 ;
>> 			void       (*comp)(struct mlx5_core_cq *) 	 0xffff8881a1814504 ;
>> 			void       (*event)(struct mlx5_core_cq *, enum mlx5_event) 	 0xffff8881a2cdea08 ;
>> 			unsigned int cons_index 	 0x1 ;
>> 			unsigned int arm_sn 	 0x0 ;
>> 			struct mlx5_rsc_debug * dbg 	 0x0 ;
>> 			int        pid 	 0x0 ;
>> 			struct   {
>> 				struct  list_head {
>> 					struct list_head * next 	 0xffffffff ;
>> 					struct list_head * prev 	 0xffffffffffffffff ;
>> 				} list;
>> 				void (*comp)(struct mlx5_core_cq *) 	 0xffffffffa0356940 ;
>> 				void * priv 	 0x0 ;
>> 			} tasklet_ctx;
>> 			int        reset_notify_added 	 0x0 ;
>> 			struct  list_head {
>> 				struct list_head * next 	 0xffffffffa0300700 ;
>> 				struct list_head * prev 	 0xd ;
>> 			} reset_notify;
>> 			struct mlx5_eq_comp * eq 	 0x0 ;
>> 			short unsigned int uid 	 0x9a70 ;
>> 		} mcq;
>> 		struct mlx5e_channel * channel 	 0xffff8881b0899a70 ;
>> 		struct mlx5_core_dev * mdev 	 0x4800000001 ;
>> 		struct  mlx5_wq_ctrl {
>> 			struct mlx5_core_dev * mdev 	 0xffffffffa02d5350 ;
>> 			struct  mlx5_frag_buf {
>> 				struct mlx5_buf_list * frags 	 0xffffffffa02d5460 ;
>> 				int npages 	 0x0 ;
>> 				int size 	 0x5 ;
>> 				unsigned char page_shift 	 0x8 ;
>> 			} buf;
>> 			struct  mlx5_db {
>> 				__be32 * db 	 0x1c6 ;
>> 				union   {
>> 					struct mlx5_db_pgdir * pgdir 	 0x0 ;
>> 					struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>> 				} u;
>> 				long long unsigned int dma 	 0xffff8881b0899ab0 ;
>> 				int index 	 0x0 ;
>> 			} db;
>> 		} wq_ctrl;
>> 	} cq;
>> 	struct  mlx5_wq_cyc {
>> 		struct  mlx5_frag_buf_ctrl {
>> 			struct mlx5_buf_list * frags 	 0xffff8881a7600160 ;
>> 			unsigned int sz_m1 	 0xa7600160 ;
>> 			short unsigned int frag_sz_m1 	 0x8881 ;
>> 			short unsigned int strides_offset 	 0xffff ;
>> 			unsigned char log_sz 	 0x88 ;
>> 			unsigned char log_stride 	 0x49 ;
>> 			unsigned char log_frag_strides 	 0xaa ;
>> 		} fbc;
>> 		__be32 *           db 	 0x1000000000010 ;
>> 		short unsigned int sz 	 0xc ;
>> 		short unsigned int wqe_ctr 	 0x0 ;
>> 		short unsigned int cur_sz 	 0x0 ;
>> 	} wq;
>> 	unsigned int               dma_fifo_mask 	 0xa1814500 ;
>> 	struct mlx5e_sq_stats *    stats 	 0xffff8881a33a0348 ;
>> 	struct   {
>> 		struct mlx5e_sq_dma * dma_fifo 	 0x1a1814500 ;
>> 		struct mlx5e_tx_wqe_info * wqe_info 	 0x14 ;
>> 	} db;
>> 	void *                     uar_map 	 0x0 ;
>> 	struct netdev_queue *      txq 	 0x0 ;
>> 	unsigned int               sqn 	 0x18c0 ;
>> 	unsigned char              min_inline_mode 	 0x0 ;
>> 	struct device *            pdev 	 0x0 ;
>> 	unsigned int               mkey_be 	 0x0 ;
>> 	long unsigned int          state 	 0x0 ;
>> 	struct hwtstamp_config *   tstamp 	 0x0 ;
>> 	struct mlx5_clock *        clock 	 0xffff8881b1aa6f88 ;
>> 	struct  mlx5_wq_ctrl {
>> 		struct mlx5_core_dev * mdev 	 0x3f000003ff ;
>> 		struct  mlx5_frag_buf {
>> 			struct mlx5_buf_list * frags 	 0x6060a ;
>> 			int        npages 	 0xa1814604 ;
>> 			int        size 	 0xffff8881 ;
>> 			unsigned char page_shift 	 0x0 ;
>> 		} buf;
>> 		struct  mlx5_db {
>> 			__be32 *   db 	 0xfff ;
>> 			union   {
>> 				struct mlx5_db_pgdir * pgdir 	 0x0 ;
>> 				struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>> 			} u;
>> 			long long unsigned int dma 	 0xffff888188440000 ;
>> 			int        index 	 0x8b074000 ;
>> 		} db;
>> 	} wq_ctrl;
>> 	struct mlx5e_channel *     channel 	 0xffffc9000010d800 ;
>> 	int                        txq_ix 	 0xa0020180 ;
>> 	unsigned int               rate_limit 	 0xffff8881 ;
>> 	struct  work_struct {
>> 		struct   {
>> 			long int   counter 	 0x1000018c0 ;
>> 		} data;
>> 		struct  list_head {
>> 			struct list_head * next 	 0xffff8881c32b68e8 ;
>> 			struct list_head * prev 	 0x800 ;
>> 		} entry;
>> 		void               (*func)(struct work_struct *) 	 0x9 ;
>> 	} recover_work;
>> } ;
> 
> I don't get it. You are dumping live kernel memory? There are already
> facilities to do that in place. Why to replicate it?
I am dumping the driver's memory under a lock so I can ensure its
consistency (as opposed to /dev/mem).
A vmcore cannot be taken from a live kernel (without crashing it).
I need the memory's snapshot right after the error from the driver's
context.
Which other tools do you mean?
> 
> 
>>
>> Signed-off-by: Aya Levin <ayal@mellanox.com>
>> ---
>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 +++++++++++++++++++++
>> 1 file changed, 100 insertions(+)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>> index 476dd97f7f2f..8a39f5525e57 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>> @@ -9,6 +9,7 @@
>>
>> struct mlx5e_tx_err_ctx {
>> 	int (*recover)(struct mlx5e_txqsq *sq);
>> +	int (*dump)(struct mlx5e_txqsq *sq);
>> 	struct mlx5e_txqsq *sq;
>> };
>>
>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
>> 	return err;
>> }
>>
>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>> +					      struct mlx5e_txqsq *sq,
>> +					      struct devlink_fmsg *fmsg)
>> +{
>> +	u64 *ptr = (u64 *)sq;
>> +	int copy, err;
>> +	int i = 0;
>> +
>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>> +		return 0;
>> +
>> +	err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>> +	if (err)
>> +		return err;
>> +
>> +	err = devlink_fmsg_obj_nest_start(fmsg);
>> +	if (err)
>> +		return err;
>> +
>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>> +	if (err)
>> +		return err;
>> +
>> +	while (i < sizeof(struct mlx5e_txqsq)) {
>> +		copy = sizeof(u64);
>> +
>> +		if (i + copy > sizeof(struct mlx5e_txqsq))
>> +			copy = sizeof(struct mlx5e_txqsq) - i;
>> +
>> +		err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>> +		if (err)
>> +			return err;
>> +		ptr++;
>> +		i += copy;
>> +	}
>> +
>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>> +	if (err)
>> +		return err;
>> +
>> +	err = devlink_fmsg_obj_nest_end(fmsg);
>> +	if (err)
>> +		return err;
>> +
>> +	err = devlink_fmsg_pair_nest_end(fmsg);
>> +
>> +	return err;
>> +}
>> +
>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>> +					 struct devlink_fmsg *fmsg)
>> +{
>> +	int i, err = 0;
>> +
>> +	mutex_lock(&priv->state_lock);
>> +
>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>> +		goto unlock;
>> +
>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>> +	if (err)
>> +		goto unlock;
>> +
>> +	for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
>> +	     i++) {
>> +		err = devlink_fmsg_obj_nest_start(fmsg);
>> +		if (err)
>> +			goto unlock;
>> +
>> +		err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
>> +							 fmsg);
>> +		if (err)
>> +			goto unlock;
>> +
>> +		err = devlink_fmsg_pair_nest_end(fmsg);
>> +		if (err)
>> +			goto unlock;
>> +	}
>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>> +	if (err)
>> +		goto unlock;
>> +
>> +unlock:
>> +	mutex_unlock(&priv->state_lock);
>> +	return err;
>> +}
>> +
>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter *reporter,
>> +				     struct devlink_fmsg *fmsg, void *context)
>> +{
>> +	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>> +	struct mlx5e_tx_err_ctx *err_ctx = context;
>> +
>> +	return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
>> +							    fmsg) :
>> +			 mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>> +}
>> +
>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>> 		.name = "tx",
>> 		.recover = mlx5e_tx_reporter_recover,
>> 		.diagnose = mlx5e_tx_reporter_diagnose,
>> +		.dump = mlx5e_tx_reporter_sw_dump,
>> };
>>
>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>> -- 
>> 2.14.1
>>
Jiri Pirko May 9, 2019, 8:23 a.m. UTC | #7
Tue, May 07, 2019 at 02:58:32PM CEST, ayal@mellanox.com wrote:
>
>
>On 5/7/2019 3:41 PM, Jiri Pirko wrote:
>> Mon, Apr 29, 2019 at 04:17:39PM CEST, ayal@mellanox.com wrote:
>>> TX reporter reports an error on two scenarios:
>>> - TX timeout on a specific tx queue
>>> - TX completion error on a specific send queue
>>> Prior to this patch, no dump data was supported by the tx reporter. This
>>> patch adds support for SW data dump of the related SQ context. The dump
>>> is simply the SQ's raw memory snapshot taken right after the error was
>>> reported, before any recovery procedure was launched. With this
>>> approach, no maintenance is needed as the driver fetch the actual data
>>> according to the layout on which the SQ was compiled with.  By providing
>>> a SW context, one can easily debug error on a given SQ.
>>>
>>> In order to offline translate the raw memory into a human readable
>>> format, the user can use some out-of-kernel scripts which receives as an
>>> input the following:
>>> - Object raw memory
>>> - Driver object compiled with debug info (can be taken/generated at any time from the machine)
>>> - Object name
>>>
>>> An example of such script output can be seen below.
>>> Note: the script is not offered as part of this patch as it do not
>>> belong to the kernel, I just described it in order to grasp the general
>>> idea of how/what can be fetched from SW dump via devlink health.
>>>
>>> The output of the SW dump can be extracted by devlink health command:
>>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>>> mlx5e_txqsq: sqn: 6336
>>> memory:
>>>    00 00 00 00 00 00 00 00
>>>    01 00 00 00 00 00 00 00
>>>    00 00 00 00 00 00 00 00
>>>    45 f4 88 cb 09 00 00 00
>>>    00 00 00 00 00 00 00 00
>>>    00 00 00 00 00 00 00 00
>>>    c0 ff ff ff 1f 00 00 00
>>>    f8 18 1e 89 81 88 ff ff
>>>    ...
>>>
>>> script output below, with struct members names and actual values:
>>>
>>> struct  mlx5e_txqsq {
>>> 	short unsigned int         cc 	 0x5 ;
>>> 	unsigned int               dma_fifo_cc 	 0x5 ;
>>> 	struct  net_dim {
>>> 		unsigned char      state 	 0x1 ;
>>> 		struct  net_dim_stats {
>>> 			int        ppms 	 0x0 ;
>>> 			int        bpms 	 0x0 ;
>>> 			int        epms 	 0x0 ;
>>> 		} prev_stats;
>>> 		struct  net_dim_sample {
>>> 			long long int time 	 0x90766ef9d ;
>>> 			unsigned int pkt_ctr 	 0x0 ;
>>> 			unsigned int byte_ctr 	 0x0 ;
>>> 			short unsigned int event_ctr 	 0x0 ;
>>> 		} start_sample;
>>> 		struct  work_struct {
>>> 			struct   {
>>> 				long int counter 	 0x1fffffffc0 ;
>>> 			} data;
>>> 			struct  list_head {
>>> 				struct list_head * next 	 0xffff8881b08998f8 ;
>>> 				struct list_head * prev 	 0xffff8881b08998f8 ;
>>> 			} entry;
>>> 			void       (*func)(struct work_struct *) 	 0xffffffffa02d0e30 ;
>>> 		} work;
>>> 		unsigned char      profile_ix 	 0x60 ;
>>> 		unsigned char      mode 	 0x72 ;
>>> 		unsigned char      tune_state 	 0x35 ;
>>> 		unsigned char      steps_right 	 0xa0 ;
>>> 		unsigned char      steps_left 	 0xff ;
>>> 		unsigned char      tired 	 0xff ;
>>> 	} dim;
>>> 	short unsigned int         pc 	 0x0 ;
>>> 	unsigned int               dma_fifo_pc 	 0x0 ;
>>> 	struct  mlx5e_cq {
>>> 		struct  mlx5_cqwq {
>>> 			struct  mlx5_frag_buf_ctrl {
>>> 				struct mlx5_buf_list * frags 	 0x500000005 ;
>>> 				unsigned int sz_m1 	 0x0 ;
>>> 				short unsigned int frag_sz_m1 	 0x0 ;
>>> 				short unsigned int strides_offset 	 0x0 ;
>>> 				unsigned char log_sz 	 0x0 ;
>>> 				unsigned char log_stride 	 0x0 ;
>>> 				unsigned char log_frag_strides 	 0x0 ;
>>> 			} fbc;
>>> 			__be32 *   db 	 0x0 ;
>>> 			unsigned int cc 	 0x0 ;
>>> 		} wq;
>>> 		short unsigned int event_ctr 	 0x0 ;
>>> 		struct napi_struct * napi 	 0x0 ;
>>> 		struct  mlx5_core_cq {
>>> 			unsigned int cqn 	 0x0 ;
>>> 			int        cqe_sz 	 0x0 ;
>>> 			__be32 *   set_ci_db 	 0xffff8881b1aa4988 ;
>>> 			__be32 *   arm_db 	 0x3f000003ff ;
>>> 			struct mlx5_uars_page * uar 	 0x6060a ;
>>> 			struct  refcount_struct {
>>> 				struct   {
>>> 					int    counter 	 0xa1814500 ;
>>> 				} refs;
>>> 			} refcount;
>>> 			struct  completion {
>>> 				unsigned int done 	 0x5 ;
>>> 				struct  wait_queue_head {
>>> 					struct  spinlock {
>>> 						union   {
>>> 							struct  raw_spinlock {
>>> 								struct  qspinlock {
>>> 									union   {
>>> 										struct   {
>>> 											int                                                    counter 	 0x5 ;
>>> 										} val;
>>> 										struct   {
>>> 											unsigned char                                          locked 	 0x5 ;
>>> 											unsigned char                                          pending 	 0x0 ;
>>> 										} ;
>>> 										struct   {
>>> 											short unsigned int                                     locked_pending 	 0x5 ;
>>> 											short unsigned int                                     tail 	 0x0 ;
>>> 										} ;
>>> 									} ;
>>> 								} raw_lock;
>>> 							} rlock;
>>> 						} ;
>>> 					} lock;
>>> 					struct  list_head {
>>> 						struct list_head * next 	 0xffff8881b089bb88 ;
>>> 						struct list_head * prev 	 0x4000000c0a ;
>>> 					} head;
>>> 				} wait;
>>> 			} free;
>>> 			unsigned int vector 	 0xa1814500 ;
>>> 			unsigned int irqn 	 0xffff8881 ;
>>> 			void       (*comp)(struct mlx5_core_cq *) 	 0xffff8881a1814504 ;
>>> 			void       (*event)(struct mlx5_core_cq *, enum mlx5_event) 	 0xffff8881a2cdea08 ;
>>> 			unsigned int cons_index 	 0x1 ;
>>> 			unsigned int arm_sn 	 0x0 ;
>>> 			struct mlx5_rsc_debug * dbg 	 0x0 ;
>>> 			int        pid 	 0x0 ;
>>> 			struct   {
>>> 				struct  list_head {
>>> 					struct list_head * next 	 0xffffffff ;
>>> 					struct list_head * prev 	 0xffffffffffffffff ;
>>> 				} list;
>>> 				void (*comp)(struct mlx5_core_cq *) 	 0xffffffffa0356940 ;
>>> 				void * priv 	 0x0 ;
>>> 			} tasklet_ctx;
>>> 			int        reset_notify_added 	 0x0 ;
>>> 			struct  list_head {
>>> 				struct list_head * next 	 0xffffffffa0300700 ;
>>> 				struct list_head * prev 	 0xd ;
>>> 			} reset_notify;
>>> 			struct mlx5_eq_comp * eq 	 0x0 ;
>>> 			short unsigned int uid 	 0x9a70 ;
>>> 		} mcq;
>>> 		struct mlx5e_channel * channel 	 0xffff8881b0899a70 ;
>>> 		struct mlx5_core_dev * mdev 	 0x4800000001 ;
>>> 		struct  mlx5_wq_ctrl {
>>> 			struct mlx5_core_dev * mdev 	 0xffffffffa02d5350 ;
>>> 			struct  mlx5_frag_buf {
>>> 				struct mlx5_buf_list * frags 	 0xffffffffa02d5460 ;
>>> 				int npages 	 0x0 ;
>>> 				int size 	 0x5 ;
>>> 				unsigned char page_shift 	 0x8 ;
>>> 			} buf;
>>> 			struct  mlx5_db {
>>> 				__be32 * db 	 0x1c6 ;
>>> 				union   {
>>> 					struct mlx5_db_pgdir * pgdir 	 0x0 ;
>>> 					struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>>> 				} u;
>>> 				long long unsigned int dma 	 0xffff8881b0899ab0 ;
>>> 				int index 	 0x0 ;
>>> 			} db;
>>> 		} wq_ctrl;
>>> 	} cq;
>>> 	struct  mlx5_wq_cyc {
>>> 		struct  mlx5_frag_buf_ctrl {
>>> 			struct mlx5_buf_list * frags 	 0xffff8881a7600160 ;
>>> 			unsigned int sz_m1 	 0xa7600160 ;
>>> 			short unsigned int frag_sz_m1 	 0x8881 ;
>>> 			short unsigned int strides_offset 	 0xffff ;
>>> 			unsigned char log_sz 	 0x88 ;
>>> 			unsigned char log_stride 	 0x49 ;
>>> 			unsigned char log_frag_strides 	 0xaa ;
>>> 		} fbc;
>>> 		__be32 *           db 	 0x1000000000010 ;
>>> 		short unsigned int sz 	 0xc ;
>>> 		short unsigned int wqe_ctr 	 0x0 ;
>>> 		short unsigned int cur_sz 	 0x0 ;
>>> 	} wq;
>>> 	unsigned int               dma_fifo_mask 	 0xa1814500 ;
>>> 	struct mlx5e_sq_stats *    stats 	 0xffff8881a33a0348 ;
>>> 	struct   {
>>> 		struct mlx5e_sq_dma * dma_fifo 	 0x1a1814500 ;
>>> 		struct mlx5e_tx_wqe_info * wqe_info 	 0x14 ;
>>> 	} db;
>>> 	void *                     uar_map 	 0x0 ;
>>> 	struct netdev_queue *      txq 	 0x0 ;
>>> 	unsigned int               sqn 	 0x18c0 ;
>>> 	unsigned char              min_inline_mode 	 0x0 ;
>>> 	struct device *            pdev 	 0x0 ;
>>> 	unsigned int               mkey_be 	 0x0 ;
>>> 	long unsigned int          state 	 0x0 ;
>>> 	struct hwtstamp_config *   tstamp 	 0x0 ;
>>> 	struct mlx5_clock *        clock 	 0xffff8881b1aa6f88 ;
>>> 	struct  mlx5_wq_ctrl {
>>> 		struct mlx5_core_dev * mdev 	 0x3f000003ff ;
>>> 		struct  mlx5_frag_buf {
>>> 			struct mlx5_buf_list * frags 	 0x6060a ;
>>> 			int        npages 	 0xa1814604 ;
>>> 			int        size 	 0xffff8881 ;
>>> 			unsigned char page_shift 	 0x0 ;
>>> 		} buf;
>>> 		struct  mlx5_db {
>>> 			__be32 *   db 	 0xfff ;
>>> 			union   {
>>> 				struct mlx5_db_pgdir * pgdir 	 0x0 ;
>>> 				struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>>> 			} u;
>>> 			long long unsigned int dma 	 0xffff888188440000 ;
>>> 			int        index 	 0x8b074000 ;
>>> 		} db;
>>> 	} wq_ctrl;
>>> 	struct mlx5e_channel *     channel 	 0xffffc9000010d800 ;
>>> 	int                        txq_ix 	 0xa0020180 ;
>>> 	unsigned int               rate_limit 	 0xffff8881 ;
>>> 	struct  work_struct {
>>> 		struct   {
>>> 			long int   counter 	 0x1000018c0 ;
>>> 		} data;
>>> 		struct  list_head {
>>> 			struct list_head * next 	 0xffff8881c32b68e8 ;
>>> 			struct list_head * prev 	 0x800 ;
>>> 		} entry;
>>> 		void               (*func)(struct work_struct *) 	 0x9 ;
>>> 	} recover_work;
>>> } ;
>> 
>> I don't get it. You are dumping live kernel memory? There are already
>> facilities to do that in place. Why to replicate it?
>I am dumping the driver's memory under a lock so I can ensure its 
>consistency (as opposed to /dev/mem)
>vmcore cannot be taken from a live kernel (without crashing).
>I need the memory's snapshot right after the error from the driver's 
>context.

Got it. However, this sounds like a generic problem, not specific to
NIC drivers. How do other subsystems resolve this (if they do at all)?



>Which other tools do you mean?
>> 
>> 
>>>
>>> Signed-off-by: Aya Levin <ayal@mellanox.com>
>>> ---
>>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 +++++++++++++++++++++
>>> 1 file changed, 100 insertions(+)
>>>
>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>> index 476dd97f7f2f..8a39f5525e57 100644
>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>> @@ -9,6 +9,7 @@
>>>
>>> struct mlx5e_tx_err_ctx {
>>> 	int (*recover)(struct mlx5e_txqsq *sq);
>>> +	int (*dump)(struct mlx5e_txqsq *sq);
>>> 	struct mlx5e_txqsq *sq;
>>> };
>>>
>>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
>>> 	return err;
>>> }
>>>
>>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>>> +					      struct mlx5e_txqsq *sq,
>>> +					      struct devlink_fmsg *fmsg)
>>> +{
>>> +	u64 *ptr = (u64 *)sq;
>>> +	int copy, err;
>>> +	int i = 0;
>>> +
>>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>> +		return 0;
>>> +
>>> +	err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>>> +	if (err)
>>> +		return err;
>>> +
>>> +	err = devlink_fmsg_obj_nest_start(fmsg);
>>> +	if (err)
>>> +		return err;
>>> +
>>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>>> +	if (err)
>>> +		return err;
>>> +
>>> +	while (i < sizeof(struct mlx5e_txqsq)) {
>>> +		copy = sizeof(u64);
>>> +
>>> +		if (i + copy > sizeof(struct mlx5e_txqsq))
>>> +			copy = sizeof(struct mlx5e_txqsq) - i;
>>> +
>>> +		err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>>> +		if (err)
>>> +			return err;
>>> +		ptr++;
>>> +		i += copy;
>>> +	}
>>> +
>>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>> +	if (err)
>>> +		return err;
>>> +
>>> +	err = devlink_fmsg_obj_nest_end(fmsg);
>>> +	if (err)
>>> +		return err;
>>> +
>>> +	err = devlink_fmsg_pair_nest_end(fmsg);
>>> +
>>> +	return err;
>>> +}
>>> +
>>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>>> +					 struct devlink_fmsg *fmsg)
>>> +{
>>> +	int i, err = 0;
>>> +
>>> +	mutex_lock(&priv->state_lock);
>>> +
>>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>> +		goto unlock;
>>> +
>>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>>> +	if (err)
>>> +		goto unlock;
>>> +
>>> +	for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
>>> +	     i++) {
>>> +		err = devlink_fmsg_obj_nest_start(fmsg);
>>> +		if (err)
>>> +			goto unlock;
>>> +
>>> +		err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
>>> +							 fmsg);
>>> +		if (err)
>>> +			goto unlock;
>>> +
>>> +		err = devlink_fmsg_pair_nest_end(fmsg);
>>> +		if (err)
>>> +			goto unlock;
>>> +	}
>>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>> +	if (err)
>>> +		goto unlock;
>>> +
>>> +unlock:
>>> +	mutex_unlock(&priv->state_lock);
>>> +	return err;
>>> +}
>>> +
>>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter *reporter,
>>> +				     struct devlink_fmsg *fmsg, void *context)
>>> +{
>>> +	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>>> +	struct mlx5e_tx_err_ctx *err_ctx = context;
>>> +
>>> +	return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
>>> +							    fmsg) :
>>> +			 mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>>> +}
>>> +
>>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>>> 		.name = "tx",
>>> 		.recover = mlx5e_tx_reporter_recover,
>>> 		.diagnose = mlx5e_tx_reporter_diagnose,
>>> +		.dump = mlx5e_tx_reporter_sw_dump,
>>> };
>>>
>>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>>> -- 
>>> 2.14.1
>>>
Aya Levin May 12, 2019, 8:37 a.m. UTC | #8
On 5/9/2019 11:23 AM, Jiri Pirko wrote:
> Tue, May 07, 2019 at 02:58:32PM CEST, ayal@mellanox.com wrote:
>>
>>
>> On 5/7/2019 3:41 PM, Jiri Pirko wrote:
>>> Mon, Apr 29, 2019 at 04:17:39PM CEST, ayal@mellanox.com wrote:
>>>> TX reporter reports an error on two scenarios:
>>>> - TX timeout on a specific tx queue
>>>> - TX completion error on a specific send queue
>>>> Prior to this patch, no dump data was supported by the tx reporter. This
>>>> patch adds support for SW data dump of the related SQ context. The dump
>>>> is simply the SQ's raw memory snapshot taken right after the error was
>>>> reported, before any recovery procedure was launched. With this
>>>> approach, no maintenance is needed as the driver fetch the actual data
>>>> according to the layout on which the SQ was compiled with.  By providing
>>>> a SW context, one can easily debug error on a given SQ.
>>>>
>>>> In order to offline translate the raw memory into a human readable
>>>> format, the user can use some out-of-kernel scripts which receives as an
>>>> input the following:
>>>> - Object raw memory
>>>> - Driver object compiled with debug info (can be taken/generated at any time from the machine)
>>>> - Object name
>>>>
>>>> An example of such script output can be seen below.
>>>> Note: the script is not offered as part of this patch as it does not
>>>> belong to the kernel; I only describe it to convey the general idea
>>>> of how/what can be fetched from the SW dump via devlink health.
>>>>
>>>> The output of the SW dump can be extracted by devlink health command:
>>>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>>>> mlx5e_txqsq: sqn: 6336
>>>> memory:
>>>>     00 00 00 00 00 00 00 00
>>>>     01 00 00 00 00 00 00 00
>>>>     00 00 00 00 00 00 00 00
>>>>     45 f4 88 cb 09 00 00 00
>>>>     00 00 00 00 00 00 00 00
>>>>     00 00 00 00 00 00 00 00
>>>>     c0 ff ff ff 1f 00 00 00
>>>>     f8 18 1e 89 81 88 ff ff
>>>>     ...
>>>>
>>>> script output below, with struct members names and actual values:
>>>>
>>>> struct  mlx5e_txqsq {
>>>> 	short unsigned int         cc 	 0x5 ;
>>>> 	unsigned int               dma_fifo_cc 	 0x5 ;
>>>> 	struct  net_dim {
>>>> 		unsigned char      state 	 0x1 ;
>>>> 		struct  net_dim_stats {
>>>> 			int        ppms 	 0x0 ;
>>>> 			int        bpms 	 0x0 ;
>>>> 			int        epms 	 0x0 ;
>>>> 		} prev_stats;
>>>> 		struct  net_dim_sample {
>>>> 			long long int time 	 0x90766ef9d ;
>>>> 			unsigned int pkt_ctr 	 0x0 ;
>>>> 			unsigned int byte_ctr 	 0x0 ;
>>>> 			short unsigned int event_ctr 	 0x0 ;
>>>> 		} start_sample;
>>>> 		struct  work_struct {
>>>> 			struct   {
>>>> 				long int counter 	 0x1fffffffc0 ;
>>>> 			} data;
>>>> 			struct  list_head {
>>>> 				struct list_head * next 	 0xffff8881b08998f8 ;
>>>> 				struct list_head * prev 	 0xffff8881b08998f8 ;
>>>> 			} entry;
>>>> 			void       (*func)(struct work_struct *) 	 0xffffffffa02d0e30 ;
>>>> 		} work;
>>>> 		unsigned char      profile_ix 	 0x60 ;
>>>> 		unsigned char      mode 	 0x72 ;
>>>> 		unsigned char      tune_state 	 0x35 ;
>>>> 		unsigned char      steps_right 	 0xa0 ;
>>>> 		unsigned char      steps_left 	 0xff ;
>>>> 		unsigned char      tired 	 0xff ;
>>>> 	} dim;
>>>> 	short unsigned int         pc 	 0x0 ;
>>>> 	unsigned int               dma_fifo_pc 	 0x0 ;
>>>> 	struct  mlx5e_cq {
>>>> 		struct  mlx5_cqwq {
>>>> 			struct  mlx5_frag_buf_ctrl {
>>>> 				struct mlx5_buf_list * frags 	 0x500000005 ;
>>>> 				unsigned int sz_m1 	 0x0 ;
>>>> 				short unsigned int frag_sz_m1 	 0x0 ;
>>>> 				short unsigned int strides_offset 	 0x0 ;
>>>> 				unsigned char log_sz 	 0x0 ;
>>>> 				unsigned char log_stride 	 0x0 ;
>>>> 				unsigned char log_frag_strides 	 0x0 ;
>>>> 			} fbc;
>>>> 			__be32 *   db 	 0x0 ;
>>>> 			unsigned int cc 	 0x0 ;
>>>> 		} wq;
>>>> 		short unsigned int event_ctr 	 0x0 ;
>>>> 		struct napi_struct * napi 	 0x0 ;
>>>> 		struct  mlx5_core_cq {
>>>> 			unsigned int cqn 	 0x0 ;
>>>> 			int        cqe_sz 	 0x0 ;
>>>> 			__be32 *   set_ci_db 	 0xffff8881b1aa4988 ;
>>>> 			__be32 *   arm_db 	 0x3f000003ff ;
>>>> 			struct mlx5_uars_page * uar 	 0x6060a ;
>>>> 			struct  refcount_struct {
>>>> 				struct   {
>>>> 					int    counter 	 0xa1814500 ;
>>>> 				} refs;
>>>> 			} refcount;
>>>> 			struct  completion {
>>>> 				unsigned int done 	 0x5 ;
>>>> 				struct  wait_queue_head {
>>>> 					struct  spinlock {
>>>> 						union   {
>>>> 							struct  raw_spinlock {
>>>> 								struct  qspinlock {
>>>> 									union   {
>>>> 										struct   {
>>>> 											int                                                    counter 	 0x5 ;
>>>> 										} val;
>>>> 										struct   {
>>>> 											unsigned char                                          locked 	 0x5 ;
>>>> 											unsigned char                                          pending 	 0x0 ;
>>>> 										} ;
>>>> 										struct   {
>>>> 											short unsigned int                                     locked_pending 	 0x5 ;
>>>> 											short unsigned int                                     tail 	 0x0 ;
>>>> 										} ;
>>>> 									} ;
>>>> 								} raw_lock;
>>>> 							} rlock;
>>>> 						} ;
>>>> 					} lock;
>>>> 					struct  list_head {
>>>> 						struct list_head * next 	 0xffff8881b089bb88 ;
>>>> 						struct list_head * prev 	 0x4000000c0a ;
>>>> 					} head;
>>>> 				} wait;
>>>> 			} free;
>>>> 			unsigned int vector 	 0xa1814500 ;
>>>> 			unsigned int irqn 	 0xffff8881 ;
>>>> 			void       (*comp)(struct mlx5_core_cq *) 	 0xffff8881a1814504 ;
>>>> 			void       (*event)(struct mlx5_core_cq *, enum mlx5_event) 	 0xffff8881a2cdea08 ;
>>>> 			unsigned int cons_index 	 0x1 ;
>>>> 			unsigned int arm_sn 	 0x0 ;
>>>> 			struct mlx5_rsc_debug * dbg 	 0x0 ;
>>>> 			int        pid 	 0x0 ;
>>>> 			struct   {
>>>> 				struct  list_head {
>>>> 					struct list_head * next 	 0xffffffff ;
>>>> 					struct list_head * prev 	 0xffffffffffffffff ;
>>>> 				} list;
>>>> 				void (*comp)(struct mlx5_core_cq *) 	 0xffffffffa0356940 ;
>>>> 				void * priv 	 0x0 ;
>>>> 			} tasklet_ctx;
>>>> 			int        reset_notify_added 	 0x0 ;
>>>> 			struct  list_head {
>>>> 				struct list_head * next 	 0xffffffffa0300700 ;
>>>> 				struct list_head * prev 	 0xd ;
>>>> 			} reset_notify;
>>>> 			struct mlx5_eq_comp * eq 	 0x0 ;
>>>> 			short unsigned int uid 	 0x9a70 ;
>>>> 		} mcq;
>>>> 		struct mlx5e_channel * channel 	 0xffff8881b0899a70 ;
>>>> 		struct mlx5_core_dev * mdev 	 0x4800000001 ;
>>>> 		struct  mlx5_wq_ctrl {
>>>> 			struct mlx5_core_dev * mdev 	 0xffffffffa02d5350 ;
>>>> 			struct  mlx5_frag_buf {
>>>> 				struct mlx5_buf_list * frags 	 0xffffffffa02d5460 ;
>>>> 				int npages 	 0x0 ;
>>>> 				int size 	 0x5 ;
>>>> 				unsigned char page_shift 	 0x8 ;
>>>> 			} buf;
>>>> 			struct  mlx5_db {
>>>> 				__be32 * db 	 0x1c6 ;
>>>> 				union   {
>>>> 					struct mlx5_db_pgdir * pgdir 	 0x0 ;
>>>> 					struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>>>> 				} u;
>>>> 				long long unsigned int dma 	 0xffff8881b0899ab0 ;
>>>> 				int index 	 0x0 ;
>>>> 			} db;
>>>> 		} wq_ctrl;
>>>> 	} cq;
>>>> 	struct  mlx5_wq_cyc {
>>>> 		struct  mlx5_frag_buf_ctrl {
>>>> 			struct mlx5_buf_list * frags 	 0xffff8881a7600160 ;
>>>> 			unsigned int sz_m1 	 0xa7600160 ;
>>>> 			short unsigned int frag_sz_m1 	 0x8881 ;
>>>> 			short unsigned int strides_offset 	 0xffff ;
>>>> 			unsigned char log_sz 	 0x88 ;
>>>> 			unsigned char log_stride 	 0x49 ;
>>>> 			unsigned char log_frag_strides 	 0xaa ;
>>>> 		} fbc;
>>>> 		__be32 *           db 	 0x1000000000010 ;
>>>> 		short unsigned int sz 	 0xc ;
>>>> 		short unsigned int wqe_ctr 	 0x0 ;
>>>> 		short unsigned int cur_sz 	 0x0 ;
>>>> 	} wq;
>>>> 	unsigned int               dma_fifo_mask 	 0xa1814500 ;
>>>> 	struct mlx5e_sq_stats *    stats 	 0xffff8881a33a0348 ;
>>>> 	struct   {
>>>> 		struct mlx5e_sq_dma * dma_fifo 	 0x1a1814500 ;
>>>> 		struct mlx5e_tx_wqe_info * wqe_info 	 0x14 ;
>>>> 	} db;
>>>> 	void *                     uar_map 	 0x0 ;
>>>> 	struct netdev_queue *      txq 	 0x0 ;
>>>> 	unsigned int               sqn 	 0x18c0 ;
>>>> 	unsigned char              min_inline_mode 	 0x0 ;
>>>> 	struct device *            pdev 	 0x0 ;
>>>> 	unsigned int               mkey_be 	 0x0 ;
>>>> 	long unsigned int          state 	 0x0 ;
>>>> 	struct hwtstamp_config *   tstamp 	 0x0 ;
>>>> 	struct mlx5_clock *        clock 	 0xffff8881b1aa6f88 ;
>>>> 	struct  mlx5_wq_ctrl {
>>>> 		struct mlx5_core_dev * mdev 	 0x3f000003ff ;
>>>> 		struct  mlx5_frag_buf {
>>>> 			struct mlx5_buf_list * frags 	 0x6060a ;
>>>> 			int        npages 	 0xa1814604 ;
>>>> 			int        size 	 0xffff8881 ;
>>>> 			unsigned char page_shift 	 0x0 ;
>>>> 		} buf;
>>>> 		struct  mlx5_db {
>>>> 			__be32 *   db 	 0xfff ;
>>>> 			union   {
>>>> 				struct mlx5_db_pgdir * pgdir 	 0x0 ;
>>>> 				struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>>>> 			} u;
>>>> 			long long unsigned int dma 	 0xffff888188440000 ;
>>>> 			int        index 	 0x8b074000 ;
>>>> 		} db;
>>>> 	} wq_ctrl;
>>>> 	struct mlx5e_channel *     channel 	 0xffffc9000010d800 ;
>>>> 	int                        txq_ix 	 0xa0020180 ;
>>>> 	unsigned int               rate_limit 	 0xffff8881 ;
>>>> 	struct  work_struct {
>>>> 		struct   {
>>>> 			long int   counter 	 0x1000018c0 ;
>>>> 		} data;
>>>> 		struct  list_head {
>>>> 			struct list_head * next 	 0xffff8881c32b68e8 ;
>>>> 			struct list_head * prev 	 0x800 ;
>>>> 		} entry;
>>>> 		void               (*func)(struct work_struct *) 	 0x9 ;
>>>> 	} recover_work;
>>>> } ;
>>>
>>> I don't get it. You are dumping live kernel memory? There are already
>>> facilities to do that in place. Why to replicate it?
>> I am dumping the driver's memory under a lock so I can ensure its
>> consistency (as opposed to /dev/mem)
>> vmcore cannot be taken from a live kernel (without crashing).
>> I need the memory's snapshot right after the error from the driver's
>> context.
> 
> Got it. However, this sounds like a generic problem, not specific to
> NIC drivers. How do other subsystems resolve this (if they do at all)?
> 
> 
Correct, this is a suggested debugging solution for a generic problem: 
enabling the use of a run-time memory snapshot for kernel modules (at a 
given error event). My research shows that other subsystems deal with 
errors either by panicking (too much) or by debug/log prints (too little).
This solution (a) is low in maintenance, (b) is consistent in memory, 
(c) has a small performance impact, and (d) uses an existing 
infrastructure between the kernel module and user space.
It might be ported to other subsystems using their own user-space vs. 
kernel tools. Regardless of how the memory output was generated for the 
user, the parsing script can work on it.

> 
>> Which other tools do you mean?
>>>
>>>
>>>>
>>>> Signed-off-by: Aya Levin <ayal@mellanox.com>
>>>> ---
>>>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 +++++++++++++++++++++
>>>> 1 file changed, 100 insertions(+)
>>>>
>>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>> index 476dd97f7f2f..8a39f5525e57 100644
>>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>> @@ -9,6 +9,7 @@
>>>>
>>>> struct mlx5e_tx_err_ctx {
>>>> 	int (*recover)(struct mlx5e_txqsq *sq);
>>>> +	int (*dump)(struct mlx5e_txqsq *sq);
>>>> 	struct mlx5e_txqsq *sq;
>>>> };
>>>>
>>>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
>>>> 	return err;
>>>> }
>>>>
>>>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>>>> +					      struct mlx5e_txqsq *sq,
>>>> +					      struct devlink_fmsg *fmsg)
>>>> +{
>>>> +	u64 *ptr = (u64 *)sq;
>>>> +	int copy, err;
>>>> +	int i = 0;
>>>> +
>>>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>> +		return 0;
>>>> +
>>>> +	err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>> +	err = devlink_fmsg_obj_nest_start(fmsg);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>> +	while (i < sizeof(struct mlx5e_txqsq)) {
>>>> +		copy = sizeof(u64);
>>>> +
>>>> +		if (i + copy > sizeof(struct mlx5e_txqsq))
>>>> +			copy = sizeof(struct mlx5e_txqsq) - i;
>>>> +
>>>> +		err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>>>> +		if (err)
>>>> +			return err;
>>>> +		ptr++;
>>>> +		i += copy;
>>>> +	}
>>>> +
>>>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>> +	err = devlink_fmsg_obj_nest_end(fmsg);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>> +	err = devlink_fmsg_pair_nest_end(fmsg);
>>>> +
>>>> +	return err;
>>>> +}
>>>> +
>>>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>>>> +					 struct devlink_fmsg *fmsg)
>>>> +{
>>>> +	int i, err = 0;
>>>> +
>>>> +	mutex_lock(&priv->state_lock);
>>>> +
>>>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>> +		goto unlock;
>>>> +
>>>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>>>> +	if (err)
>>>> +		goto unlock;
>>>> +
>>>> +	for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
>>>> +	     i++) {
>>>> +		err = devlink_fmsg_obj_nest_start(fmsg);
>>>> +		if (err)
>>>> +			goto unlock;
>>>> +
>>>> +		err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
>>>> +							 fmsg);
>>>> +		if (err)
>>>> +			goto unlock;
>>>> +
>>>> +		err = devlink_fmsg_pair_nest_end(fmsg);
>>>> +		if (err)
>>>> +			goto unlock;
>>>> +	}
>>>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>> +	if (err)
>>>> +		goto unlock;
>>>> +
>>>> +unlock:
>>>> +	mutex_unlock(&priv->state_lock);
>>>> +	return err;
>>>> +}
>>>> +
>>>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter *reporter,
>>>> +				     struct devlink_fmsg *fmsg, void *context)
>>>> +{
>>>> +	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>>>> +	struct mlx5e_tx_err_ctx *err_ctx = context;
>>>> +
>>>> +	return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
>>>> +							    fmsg) :
>>>> +			 mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>>>> +}
>>>> +
>>>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>>>> 		.name = "tx",
>>>> 		.recover = mlx5e_tx_reporter_recover,
>>>> 		.diagnose = mlx5e_tx_reporter_diagnose,
>>>> +		.dump = mlx5e_tx_reporter_sw_dump,
>>>> };
>>>>
>>>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>>>> -- 
>>>> 2.14.1
>>>>
Jiri Pirko May 14, 2019, 12:07 p.m. UTC | #9
Sun, May 12, 2019 at 10:37:35AM CEST, ayal@mellanox.com wrote:
>
>
>On 5/9/2019 11:23 AM, Jiri Pirko wrote:
>> Tue, May 07, 2019 at 02:58:32PM CEST, ayal@mellanox.com wrote:
>>>
>>>
>>> On 5/7/2019 3:41 PM, Jiri Pirko wrote:
>>>> Mon, Apr 29, 2019 at 04:17:39PM CEST, ayal@mellanox.com wrote:
>>>>> TX reporter reports an error on two scenarios:
>>>>> - TX timeout on a specific tx queue
>>>>> - TX completion error on a specific send queue
>>>>> Prior to this patch, no dump data was supported by the tx reporter. This
>>>>> patch adds support for SW data dump of the related SQ context. The dump
>>>>> is simply the SQ's raw memory snapshot taken right after the error was
>>>>> reported, before any recovery procedure was launched. With this
>>>>> approach, no maintenance is needed as the driver fetch the actual data
>>>>> according to the layout on which the SQ was compiled with.  By providing
>>>>> a SW context, one can easily debug error on a given SQ.
>>>>>
>>>>> In order to offline translate the raw memory into a human readable
>>>>> format, the user can use some out-of-kernel scripts which receives as an
>>>>> input the following:
>>>>> - Object raw memory
>>>>> - Driver object compiled with debug info (can be taken/generated at any time from the machine)
>>>>> - Object name
>>>>>
>>>>> An example of such script output can be seen below.
>>>>> Note: the script is not offered as part of this patch as it does not
>>>>> belong to the kernel; I only describe it to convey the general idea
>>>>> of how/what can be fetched from the SW dump via devlink health.
>>>>>
>>>>> The output of the SW dump can be extracted by devlink health command:
>>>>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>>>>> mlx5e_txqsq: sqn: 6336
>>>>> memory:
>>>>>     00 00 00 00 00 00 00 00
>>>>>     01 00 00 00 00 00 00 00
>>>>>     00 00 00 00 00 00 00 00
>>>>>     45 f4 88 cb 09 00 00 00
>>>>>     00 00 00 00 00 00 00 00
>>>>>     00 00 00 00 00 00 00 00
>>>>>     c0 ff ff ff 1f 00 00 00
>>>>>     f8 18 1e 89 81 88 ff ff
>>>>>     ...
>>>>>
>>>>> script output below, with struct members names and actual values:
>>>>>
>>>>> struct  mlx5e_txqsq {
>>>>> 	short unsigned int         cc 	 0x5 ;
>>>>> 	unsigned int               dma_fifo_cc 	 0x5 ;
>>>>> 	struct  net_dim {
>>>>> 		unsigned char      state 	 0x1 ;
>>>>> 		struct  net_dim_stats {
>>>>> 			int        ppms 	 0x0 ;
>>>>> 			int        bpms 	 0x0 ;
>>>>> 			int        epms 	 0x0 ;
>>>>> 		} prev_stats;
>>>>> 		struct  net_dim_sample {
>>>>> 			long long int time 	 0x90766ef9d ;
>>>>> 			unsigned int pkt_ctr 	 0x0 ;
>>>>> 			unsigned int byte_ctr 	 0x0 ;
>>>>> 			short unsigned int event_ctr 	 0x0 ;
>>>>> 		} start_sample;
>>>>> 		struct  work_struct {
>>>>> 			struct   {
>>>>> 				long int counter 	 0x1fffffffc0 ;
>>>>> 			} data;
>>>>> 			struct  list_head {
>>>>> 				struct list_head * next 	 0xffff8881b08998f8 ;
>>>>> 				struct list_head * prev 	 0xffff8881b08998f8 ;
>>>>> 			} entry;
>>>>> 			void       (*func)(struct work_struct *) 	 0xffffffffa02d0e30 ;
>>>>> 		} work;
>>>>> 		unsigned char      profile_ix 	 0x60 ;
>>>>> 		unsigned char      mode 	 0x72 ;
>>>>> 		unsigned char      tune_state 	 0x35 ;
>>>>> 		unsigned char      steps_right 	 0xa0 ;
>>>>> 		unsigned char      steps_left 	 0xff ;
>>>>> 		unsigned char      tired 	 0xff ;
>>>>> 	} dim;
>>>>> 	short unsigned int         pc 	 0x0 ;
>>>>> 	unsigned int               dma_fifo_pc 	 0x0 ;
>>>>> 	struct  mlx5e_cq {
>>>>> 		struct  mlx5_cqwq {
>>>>> 			struct  mlx5_frag_buf_ctrl {
>>>>> 				struct mlx5_buf_list * frags 	 0x500000005 ;
>>>>> 				unsigned int sz_m1 	 0x0 ;
>>>>> 				short unsigned int frag_sz_m1 	 0x0 ;
>>>>> 				short unsigned int strides_offset 	 0x0 ;
>>>>> 				unsigned char log_sz 	 0x0 ;
>>>>> 				unsigned char log_stride 	 0x0 ;
>>>>> 				unsigned char log_frag_strides 	 0x0 ;
>>>>> 			} fbc;
>>>>> 			__be32 *   db 	 0x0 ;
>>>>> 			unsigned int cc 	 0x0 ;
>>>>> 		} wq;
>>>>> 		short unsigned int event_ctr 	 0x0 ;
>>>>> 		struct napi_struct * napi 	 0x0 ;
>>>>> 		struct  mlx5_core_cq {
>>>>> 			unsigned int cqn 	 0x0 ;
>>>>> 			int        cqe_sz 	 0x0 ;
>>>>> 			__be32 *   set_ci_db 	 0xffff8881b1aa4988 ;
>>>>> 			__be32 *   arm_db 	 0x3f000003ff ;
>>>>> 			struct mlx5_uars_page * uar 	 0x6060a ;
>>>>> 			struct  refcount_struct {
>>>>> 				struct   {
>>>>> 					int    counter 	 0xa1814500 ;
>>>>> 				} refs;
>>>>> 			} refcount;
>>>>> 			struct  completion {
>>>>> 				unsigned int done 	 0x5 ;
>>>>> 				struct  wait_queue_head {
>>>>> 					struct  spinlock {
>>>>> 						union   {
>>>>> 							struct  raw_spinlock {
>>>>> 								struct  qspinlock {
>>>>> 									union   {
>>>>> 										struct   {
>>>>> 											int                                                    counter 	 0x5 ;
>>>>> 										} val;
>>>>> 										struct   {
>>>>> 											unsigned char                                          locked 	 0x5 ;
>>>>> 											unsigned char                                          pending 	 0x0 ;
>>>>> 										} ;
>>>>> 										struct   {
>>>>> 											short unsigned int                                     locked_pending 	 0x5 ;
>>>>> 											short unsigned int                                     tail 	 0x0 ;
>>>>> 										} ;
>>>>> 									} ;
>>>>> 								} raw_lock;
>>>>> 							} rlock;
>>>>> 						} ;
>>>>> 					} lock;
>>>>> 					struct  list_head {
>>>>> 						struct list_head * next 	 0xffff8881b089bb88 ;
>>>>> 						struct list_head * prev 	 0x4000000c0a ;
>>>>> 					} head;
>>>>> 				} wait;
>>>>> 			} free;
>>>>> 			unsigned int vector 	 0xa1814500 ;
>>>>> 			unsigned int irqn 	 0xffff8881 ;
>>>>> 			void       (*comp)(struct mlx5_core_cq *) 	 0xffff8881a1814504 ;
>>>>> 			void       (*event)(struct mlx5_core_cq *, enum mlx5_event) 	 0xffff8881a2cdea08 ;
>>>>> 			unsigned int cons_index 	 0x1 ;
>>>>> 			unsigned int arm_sn 	 0x0 ;
>>>>> 			struct mlx5_rsc_debug * dbg 	 0x0 ;
>>>>> 			int        pid 	 0x0 ;
>>>>> 			struct   {
>>>>> 				struct  list_head {
>>>>> 					struct list_head * next 	 0xffffffff ;
>>>>> 					struct list_head * prev 	 0xffffffffffffffff ;
>>>>> 				} list;
>>>>> 				void (*comp)(struct mlx5_core_cq *) 	 0xffffffffa0356940 ;
>>>>> 				void * priv 	 0x0 ;
>>>>> 			} tasklet_ctx;
>>>>> 			int        reset_notify_added 	 0x0 ;
>>>>> 			struct  list_head {
>>>>> 				struct list_head * next 	 0xffffffffa0300700 ;
>>>>> 				struct list_head * prev 	 0xd ;
>>>>> 			} reset_notify;
>>>>> 			struct mlx5_eq_comp * eq 	 0x0 ;
>>>>> 			short unsigned int uid 	 0x9a70 ;
>>>>> 		} mcq;
>>>>> 		struct mlx5e_channel * channel 	 0xffff8881b0899a70 ;
>>>>> 		struct mlx5_core_dev * mdev 	 0x4800000001 ;
>>>>> 		struct  mlx5_wq_ctrl {
>>>>> 			struct mlx5_core_dev * mdev 	 0xffffffffa02d5350 ;
>>>>> 			struct  mlx5_frag_buf {
>>>>> 				struct mlx5_buf_list * frags 	 0xffffffffa02d5460 ;
>>>>> 				int npages 	 0x0 ;
>>>>> 				int size 	 0x5 ;
>>>>> 				unsigned char page_shift 	 0x8 ;
>>>>> 			} buf;
>>>>> 			struct  mlx5_db {
>>>>> 				__be32 * db 	 0x1c6 ;
>>>>> 				union   {
>>>>> 					struct mlx5_db_pgdir * pgdir 	 0x0 ;
>>>>> 					struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>>>>> 				} u;
>>>>> 				long long unsigned int dma 	 0xffff8881b0899ab0 ;
>>>>> 				int index 	 0x0 ;
>>>>> 			} db;
>>>>> 		} wq_ctrl;
>>>>> 	} cq;
>>>>> 	struct  mlx5_wq_cyc {
>>>>> 		struct  mlx5_frag_buf_ctrl {
>>>>> 			struct mlx5_buf_list * frags 	 0xffff8881a7600160 ;
>>>>> 			unsigned int sz_m1 	 0xa7600160 ;
>>>>> 			short unsigned int frag_sz_m1 	 0x8881 ;
>>>>> 			short unsigned int strides_offset 	 0xffff ;
>>>>> 			unsigned char log_sz 	 0x88 ;
>>>>> 			unsigned char log_stride 	 0x49 ;
>>>>> 			unsigned char log_frag_strides 	 0xaa ;
>>>>> 		} fbc;
>>>>> 		__be32 *           db 	 0x1000000000010 ;
>>>>> 		short unsigned int sz 	 0xc ;
>>>>> 		short unsigned int wqe_ctr 	 0x0 ;
>>>>> 		short unsigned int cur_sz 	 0x0 ;
>>>>> 	} wq;
>>>>> 	unsigned int               dma_fifo_mask 	 0xa1814500 ;
>>>>> 	struct mlx5e_sq_stats *    stats 	 0xffff8881a33a0348 ;
>>>>> 	struct   {
>>>>> 		struct mlx5e_sq_dma * dma_fifo 	 0x1a1814500 ;
>>>>> 		struct mlx5e_tx_wqe_info * wqe_info 	 0x14 ;
>>>>> 	} db;
>>>>> 	void *                     uar_map 	 0x0 ;
>>>>> 	struct netdev_queue *      txq 	 0x0 ;
>>>>> 	unsigned int               sqn 	 0x18c0 ;
>>>>> 	unsigned char              min_inline_mode 	 0x0 ;
>>>>> 	struct device *            pdev 	 0x0 ;
>>>>> 	unsigned int               mkey_be 	 0x0 ;
>>>>> 	long unsigned int          state 	 0x0 ;
>>>>> 	struct hwtstamp_config *   tstamp 	 0x0 ;
>>>>> 	struct mlx5_clock *        clock 	 0xffff8881b1aa6f88 ;
>>>>> 	struct  mlx5_wq_ctrl {
>>>>> 		struct mlx5_core_dev * mdev 	 0x3f000003ff ;
>>>>> 		struct  mlx5_frag_buf {
>>>>> 			struct mlx5_buf_list * frags 	 0x6060a ;
>>>>> 			int        npages 	 0xa1814604 ;
>>>>> 			int        size 	 0xffff8881 ;
>>>>> 			unsigned char page_shift 	 0x0 ;
>>>>> 		} buf;
>>>>> 		struct  mlx5_db {
>>>>> 			__be32 *   db 	 0xfff ;
>>>>> 			union   {
>>>>> 				struct mlx5_db_pgdir * pgdir 	 0x0 ;
>>>>> 				struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>>>>> 			} u;
>>>>> 			long long unsigned int dma 	 0xffff888188440000 ;
>>>>> 			int        index 	 0x8b074000 ;
>>>>> 		} db;
>>>>> 	} wq_ctrl;
>>>>> 	struct mlx5e_channel *     channel 	 0xffffc9000010d800 ;
>>>>> 	int                        txq_ix 	 0xa0020180 ;
>>>>> 	unsigned int               rate_limit 	 0xffff8881 ;
>>>>> 	struct  work_struct {
>>>>> 		struct   {
>>>>> 			long int   counter 	 0x1000018c0 ;
>>>>> 		} data;
>>>>> 		struct  list_head {
>>>>> 			struct list_head * next 	 0xffff8881c32b68e8 ;
>>>>> 			struct list_head * prev 	 0x800 ;
>>>>> 		} entry;
>>>>> 		void               (*func)(struct work_struct *) 	 0x9 ;
>>>>> 	} recover_work;
>>>>> } ;
>>>>
>>>> I don't get it. You are dumping live kernel memory? There are already
>>>> facilities to do that in place. Why to replicate it?
>>> I am dumping the driver's memory under a lock so I can ensure it's
>>> consistency (as appose to /dev/mem)
>>> vmcore cannot be taken from a live kernel (without crashing).
>>> I need the memory's snapshot right after the error from the driver's
>>> context.
>> 
>> Got it. However, this sounds like a generic problem not specific to
>> nic drivers. How other subsystems resolve this (if they do at all)?
>> 
>> 
>Correct, this is a suggested debugging solution for a generic problem: 
>enabling the user of a run time memory snapshot for kernel modules (at a 
>given error event). My research shows that other subsystems deal with 
>errors either by panicking (too much) or by debug/log prints (too little).
>This solution is (a) low in maintenance (b) consistent in memory (c) has 
>small performance impact (d) use an existing infra-structure between the 
>kernel module and the user space.

I'm still not convinced that dumping kernel memory over devlink health dump
is a good idea :/


>It might be ported to other subsystems using their own user-space vs. 
>kernel tools. Regardless of how the memory output was generated to the 
>user, the parsing script can work on it.

Could you share the script? How is it going to be distributed?


>
>> 
>>> Which other tools do you mean?
>>>>
>>>>
>>>>>
>>>>> Signed-off-by: Aya Levin <ayal@mellanox.com>
>>>>> ---
>>>>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 +++++++++++++++++++++
>>>>> 1 file changed, 100 insertions(+)
>>>>>
>>>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>> index 476dd97f7f2f..8a39f5525e57 100644
>>>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>> @@ -9,6 +9,7 @@
>>>>>
>>>>> struct mlx5e_tx_err_ctx {
>>>>> 	int (*recover)(struct mlx5e_txqsq *sq);
>>>>> +	int (*dump)(struct mlx5e_txqsq *sq);
>>>>> 	struct mlx5e_txqsq *sq;
>>>>> };
>>>>>
>>>>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
>>>>> 	return err;
>>>>> }
>>>>>
>>>>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>>>>> +					      struct mlx5e_txqsq *sq,
>>>>> +					      struct devlink_fmsg *fmsg)
>>>>> +{
>>>>> +	u64 *ptr = (u64 *)sq;
>>>>> +	int copy, err;
>>>>> +	int i = 0;
>>>>> +
>>>>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>> +		return 0;
>>>>> +
>>>>> +	err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>>>>> +	if (err)
>>>>> +		return err;
>>>>> +
>>>>> +	err = devlink_fmsg_obj_nest_start(fmsg);
>>>>> +	if (err)
>>>>> +		return err;
>>>>> +
>>>>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>>>>> +	if (err)
>>>>> +		return err;
>>>>> +
>>>>> +	while (i < sizeof(struct mlx5e_txqsq)) {
>>>>> +		copy = sizeof(u64);
>>>>> +
>>>>> +		if (i + copy > sizeof(struct mlx5e_txqsq))
>>>>> +			copy = sizeof(struct mlx5e_txqsq) - i;
>>>>> +
>>>>> +		err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>>>>> +		if (err)
>>>>> +			return err;
>>>>> +		ptr++;
>>>>> +		i += copy;
>>>>> +	}
>>>>> +
>>>>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>> +	if (err)
>>>>> +		return err;
>>>>> +
>>>>> +	err = devlink_fmsg_obj_nest_end(fmsg);
>>>>> +	if (err)
>>>>> +		return err;
>>>>> +
>>>>> +	err = devlink_fmsg_pair_nest_end(fmsg);
>>>>> +
>>>>> +	return err;
>>>>> +}
>>>>> +
>>>>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>>>>> +					 struct devlink_fmsg *fmsg)
>>>>> +{
>>>>> +	int i, err = 0;
>>>>> +
>>>>> +	mutex_lock(&priv->state_lock);
>>>>> +
>>>>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>> +		goto unlock;
>>>>> +
>>>>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>>>>> +	if (err)
>>>>> +		goto unlock;
>>>>> +
>>>>> +	for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
>>>>> +	     i++) {
>>>>> +		err = devlink_fmsg_obj_nest_start(fmsg);
>>>>> +		if (err)
>>>>> +			goto unlock;
>>>>> +
>>>>> +		err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
>>>>> +							 fmsg);
>>>>> +		if (err)
>>>>> +			goto unlock;
>>>>> +
>>>>> +		err = devlink_fmsg_pair_nest_end(fmsg);
>>>>> +		if (err)
>>>>> +			goto unlock;
>>>>> +	}
>>>>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>> +	if (err)
>>>>> +		goto unlock;
>>>>> +
>>>>> +unlock:
>>>>> +	mutex_unlock(&priv->state_lock);
>>>>> +	return err;
>>>>> +}
>>>>> +
>>>>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter *reporter,
>>>>> +				     struct devlink_fmsg *fmsg, void *context)
>>>>> +{
>>>>> +	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>>>>> +	struct mlx5e_tx_err_ctx *err_ctx = context;
>>>>> +
>>>>> +	return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
>>>>> +							    fmsg) :
>>>>> +			 mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>>>>> +}
>>>>> +
>>>>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>>>>> 		.name = "tx",
>>>>> 		.recover = mlx5e_tx_reporter_recover,
>>>>> 		.diagnose = mlx5e_tx_reporter_diagnose,
>>>>> +		.dump = mlx5e_tx_reporter_sw_dump,
>>>>> };
>>>>>
>>>>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>>>>> -- 
>>>>> 2.14.1
>>>>>
Aya Levin May 16, 2019, 8:49 a.m. UTC | #10
On 5/14/2019 3:07 PM, Jiri Pirko wrote:
> Sun, May 12, 2019 at 10:37:35AM CEST, ayal@mellanox.com wrote:
>>
>>
>> On 5/9/2019 11:23 AM, Jiri Pirko wrote:
>>> Tue, May 07, 2019 at 02:58:32PM CEST, ayal@mellanox.com wrote:
>>>>
>>>>
>>>> On 5/7/2019 3:41 PM, Jiri Pirko wrote:
>>>>> Mon, Apr 29, 2019 at 04:17:39PM CEST, ayal@mellanox.com wrote:
>>>>>> TX reporter reports an error on two scenarios:
>>>>>> - TX timeout on a specific tx queue
>>>>>> - TX completion error on a specific send queue
>>>>>> Prior to this patch, no dump data was supported by the tx reporter. This
>>>>>> patch adds support for SW data dump of the related SQ context. The dump
>>>>>> is simply the SQ's raw memory snapshot taken right after the error was
>>>>>> reported, before any recovery procedure was launched. With this
>>>>>> approach, no maintenance is needed as the driver fetch the actual data
>>>>>> according to the layout on which the SQ was compiled with.  By providing
>>>>>> a SW context, one can easily debug error on a given SQ.
>>>>>>
>>>>>> In order to offline translate the raw memory into a human readable
>>>>>> format, the user can use some out-of-kernel scripts which receives as an
>>>>>> input the following:
>>>>>> - Object raw memory
>>>>>> - Driver object compiled with debug info (can be taken/generated at any time from the machine)
>>>>>> - Object name
>>>>>>
>>>>>> An example of such script output can be seen below.
>>>>>> Note: the script is not offered as part of this patch as it do not
>>>>>> belong to the kernel, I just described it in order to grasp the general
>>>>>> idea of how/what can be fetched from SW dump via devlink health.
>>>>>>
>>>>>> The output of the SW dump can be extracted by devlink health command:
>>>>>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>>>>>> mlx5e_txqsq: sqn: 6336
>>>>>> memory:
>>>>>>      00 00 00 00 00 00 00 00
>>>>>>      01 00 00 00 00 00 00 00
>>>>>>      00 00 00 00 00 00 00 00
>>>>>>      45 f4 88 cb 09 00 00 00
>>>>>>      00 00 00 00 00 00 00 00
>>>>>>      00 00 00 00 00 00 00 00
>>>>>>      c0 ff ff ff 1f 00 00 00
>>>>>>      f8 18 1e 89 81 88 ff ff
>>>>>>      ...
>>>>>>
>>>>>> script output below, with struct members names and actual values:
>>>>>>
>>>>>> struct  mlx5e_txqsq {
>>>>>> 	short unsigned int         cc 	 0x5 ;
>>>>>> 	unsigned int               dma_fifo_cc 	 0x5 ;
>>>>>> 	struct  net_dim {
>>>>>> 		unsigned char      state 	 0x1 ;
>>>>>> 		struct  net_dim_stats {
>>>>>> 			int        ppms 	 0x0 ;
>>>>>> 			int        bpms 	 0x0 ;
>>>>>> 			int        epms 	 0x0 ;
>>>>>> 		} prev_stats;
>>>>>> 		struct  net_dim_sample {
>>>>>> 			long long int time 	 0x90766ef9d ;
>>>>>> 			unsigned int pkt_ctr 	 0x0 ;
>>>>>> 			unsigned int byte_ctr 	 0x0 ;
>>>>>> 			short unsigned int event_ctr 	 0x0 ;
>>>>>> 		} start_sample;
>>>>>> 		struct  work_struct {
>>>>>> 			struct   {
>>>>>> 				long int counter 	 0x1fffffffc0 ;
>>>>>> 			} data;
>>>>>> 			struct  list_head {
>>>>>> 				struct list_head * next 	 0xffff8881b08998f8 ;
>>>>>> 				struct list_head * prev 	 0xffff8881b08998f8 ;
>>>>>> 			} entry;
>>>>>> 			void       (*func)(struct work_struct *) 	 0xffffffffa02d0e30 ;
>>>>>> 		} work;
>>>>>> 		unsigned char      profile_ix 	 0x60 ;
>>>>>> 		unsigned char      mode 	 0x72 ;
>>>>>> 		unsigned char      tune_state 	 0x35 ;
>>>>>> 		unsigned char      steps_right 	 0xa0 ;
>>>>>> 		unsigned char      steps_left 	 0xff ;
>>>>>> 		unsigned char      tired 	 0xff ;
>>>>>> 	} dim;
>>>>>> 	short unsigned int         pc 	 0x0 ;
>>>>>> 	unsigned int               dma_fifo_pc 	 0x0 ;
>>>>>> 	struct  mlx5e_cq {
>>>>>> 		struct  mlx5_cqwq {
>>>>>> 			struct  mlx5_frag_buf_ctrl {
>>>>>> 				struct mlx5_buf_list * frags 	 0x500000005 ;
>>>>>> 				unsigned int sz_m1 	 0x0 ;
>>>>>> 				short unsigned int frag_sz_m1 	 0x0 ;
>>>>>> 				short unsigned int strides_offset 	 0x0 ;
>>>>>> 				unsigned char log_sz 	 0x0 ;
>>>>>> 				unsigned char log_stride 	 0x0 ;
>>>>>> 				unsigned char log_frag_strides 	 0x0 ;
>>>>>> 			} fbc;
>>>>>> 			__be32 *   db 	 0x0 ;
>>>>>> 			unsigned int cc 	 0x0 ;
>>>>>> 		} wq;
>>>>>> 		short unsigned int event_ctr 	 0x0 ;
>>>>>> 		struct napi_struct * napi 	 0x0 ;
>>>>>> 		struct  mlx5_core_cq {
>>>>>> 			unsigned int cqn 	 0x0 ;
>>>>>> 			int        cqe_sz 	 0x0 ;
>>>>>> 			__be32 *   set_ci_db 	 0xffff8881b1aa4988 ;
>>>>>> 			__be32 *   arm_db 	 0x3f000003ff ;
>>>>>> 			struct mlx5_uars_page * uar 	 0x6060a ;
>>>>>> 			struct  refcount_struct {
>>>>>> 				struct   {
>>>>>> 					int    counter 	 0xa1814500 ;
>>>>>> 				} refs;
>>>>>> 			} refcount;
>>>>>> 			struct  completion {
>>>>>> 				unsigned int done 	 0x5 ;
>>>>>> 				struct  wait_queue_head {
>>>>>> 					struct  spinlock {
>>>>>> 						union   {
>>>>>> 							struct  raw_spinlock {
>>>>>> 								struct  qspinlock {
>>>>>> 									union   {
>>>>>> 										struct   {
>>>>>> 											int                                                    counter 	 0x5 ;
>>>>>> 										} val;
>>>>>> 										struct   {
>>>>>> 											unsigned char                                          locked 	 0x5 ;
>>>>>> 											unsigned char                                          pending 	 0x0 ;
>>>>>> 										} ;
>>>>>> 										struct   {
>>>>>> 											short unsigned int                                     locked_pending 	 0x5 ;
>>>>>> 											short unsigned int                                     tail 	 0x0 ;
>>>>>> 										} ;
>>>>>> 									} ;
>>>>>> 								} raw_lock;
>>>>>> 							} rlock;
>>>>>> 						} ;
>>>>>> 					} lock;
>>>>>> 					struct  list_head {
>>>>>> 						struct list_head * next 	 0xffff8881b089bb88 ;
>>>>>> 						struct list_head * prev 	 0x4000000c0a ;
>>>>>> 					} head;
>>>>>> 				} wait;
>>>>>> 			} free;
>>>>>> 			unsigned int vector 	 0xa1814500 ;
>>>>>> 			unsigned int irqn 	 0xffff8881 ;
>>>>>> 			void       (*comp)(struct mlx5_core_cq *) 	 0xffff8881a1814504 ;
>>>>>> 			void       (*event)(struct mlx5_core_cq *, enum mlx5_event) 	 0xffff8881a2cdea08 ;
>>>>>> 			unsigned int cons_index 	 0x1 ;
>>>>>> 			unsigned int arm_sn 	 0x0 ;
>>>>>> 			struct mlx5_rsc_debug * dbg 	 0x0 ;
>>>>>> 			int        pid 	 0x0 ;
>>>>>> 			struct   {
>>>>>> 				struct  list_head {
>>>>>> 					struct list_head * next 	 0xffffffff ;
>>>>>> 					struct list_head * prev 	 0xffffffffffffffff ;
>>>>>> 				} list;
>>>>>> 				void (*comp)(struct mlx5_core_cq *) 	 0xffffffffa0356940 ;
>>>>>> 				void * priv 	 0x0 ;
>>>>>> 			} tasklet_ctx;
>>>>>> 			int        reset_notify_added 	 0x0 ;
>>>>>> 			struct  list_head {
>>>>>> 				struct list_head * next 	 0xffffffffa0300700 ;
>>>>>> 				struct list_head * prev 	 0xd ;
>>>>>> 			} reset_notify;
>>>>>> 			struct mlx5_eq_comp * eq 	 0x0 ;
>>>>>> 			short unsigned int uid 	 0x9a70 ;
>>>>>> 		} mcq;
>>>>>> 		struct mlx5e_channel * channel 	 0xffff8881b0899a70 ;
>>>>>> 		struct mlx5_core_dev * mdev 	 0x4800000001 ;
>>>>>> 		struct  mlx5_wq_ctrl {
>>>>>> 			struct mlx5_core_dev * mdev 	 0xffffffffa02d5350 ;
>>>>>> 			struct  mlx5_frag_buf {
>>>>>> 				struct mlx5_buf_list * frags 	 0xffffffffa02d5460 ;
>>>>>> 				int npages 	 0x0 ;
>>>>>> 				int size 	 0x5 ;
>>>>>> 				unsigned char page_shift 	 0x8 ;
>>>>>> 			} buf;
>>>>>> 			struct  mlx5_db {
>>>>>> 				__be32 * db 	 0x1c6 ;
>>>>>> 				union   {
>>>>>> 					struct mlx5_db_pgdir * pgdir 	 0x0 ;
>>>>>> 					struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>>>>>> 				} u;
>>>>>> 				long long unsigned int dma 	 0xffff8881b0899ab0 ;
>>>>>> 				int index 	 0x0 ;
>>>>>> 			} db;
>>>>>> 		} wq_ctrl;
>>>>>> 	} cq;
>>>>>> 	struct  mlx5_wq_cyc {
>>>>>> 		struct  mlx5_frag_buf_ctrl {
>>>>>> 			struct mlx5_buf_list * frags 	 0xffff8881a7600160 ;
>>>>>> 			unsigned int sz_m1 	 0xa7600160 ;
>>>>>> 			short unsigned int frag_sz_m1 	 0x8881 ;
>>>>>> 			short unsigned int strides_offset 	 0xffff ;
>>>>>> 			unsigned char log_sz 	 0x88 ;
>>>>>> 			unsigned char log_stride 	 0x49 ;
>>>>>> 			unsigned char log_frag_strides 	 0xaa ;
>>>>>> 		} fbc;
>>>>>> 		__be32 *           db 	 0x1000000000010 ;
>>>>>> 		short unsigned int sz 	 0xc ;
>>>>>> 		short unsigned int wqe_ctr 	 0x0 ;
>>>>>> 		short unsigned int cur_sz 	 0x0 ;
>>>>>> 	} wq;
>>>>>> 	unsigned int               dma_fifo_mask 	 0xa1814500 ;
>>>>>> 	struct mlx5e_sq_stats *    stats 	 0xffff8881a33a0348 ;
>>>>>> 	struct   {
>>>>>> 		struct mlx5e_sq_dma * dma_fifo 	 0x1a1814500 ;
>>>>>> 		struct mlx5e_tx_wqe_info * wqe_info 	 0x14 ;
>>>>>> 	} db;
>>>>>> 	void *                     uar_map 	 0x0 ;
>>>>>> 	struct netdev_queue *      txq 	 0x0 ;
>>>>>> 	unsigned int               sqn 	 0x18c0 ;
>>>>>> 	unsigned char              min_inline_mode 	 0x0 ;
>>>>>> 	struct device *            pdev 	 0x0 ;
>>>>>> 	unsigned int               mkey_be 	 0x0 ;
>>>>>> 	long unsigned int          state 	 0x0 ;
>>>>>> 	struct hwtstamp_config *   tstamp 	 0x0 ;
>>>>>> 	struct mlx5_clock *        clock 	 0xffff8881b1aa6f88 ;
>>>>>> 	struct  mlx5_wq_ctrl {
>>>>>> 		struct mlx5_core_dev * mdev 	 0x3f000003ff ;
>>>>>> 		struct  mlx5_frag_buf {
>>>>>> 			struct mlx5_buf_list * frags 	 0x6060a ;
>>>>>> 			int        npages 	 0xa1814604 ;
>>>>>> 			int        size 	 0xffff8881 ;
>>>>>> 			unsigned char page_shift 	 0x0 ;
>>>>>> 		} buf;
>>>>>> 		struct  mlx5_db {
>>>>>> 			__be32 *   db 	 0xfff ;
>>>>>> 			union   {
>>>>>> 				struct mlx5_db_pgdir * pgdir 	 0x0 ;
>>>>>> 				struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>>>>>> 			} u;
>>>>>> 			long long unsigned int dma 	 0xffff888188440000 ;
>>>>>> 			int        index 	 0x8b074000 ;
>>>>>> 		} db;
>>>>>> 	} wq_ctrl;
>>>>>> 	struct mlx5e_channel *     channel 	 0xffffc9000010d800 ;
>>>>>> 	int                        txq_ix 	 0xa0020180 ;
>>>>>> 	unsigned int               rate_limit 	 0xffff8881 ;
>>>>>> 	struct  work_struct {
>>>>>> 		struct   {
>>>>>> 			long int   counter 	 0x1000018c0 ;
>>>>>> 		} data;
>>>>>> 		struct  list_head {
>>>>>> 			struct list_head * next 	 0xffff8881c32b68e8 ;
>>>>>> 			struct list_head * prev 	 0x800 ;
>>>>>> 		} entry;
>>>>>> 		void               (*func)(struct work_struct *) 	 0x9 ;
>>>>>> 	} recover_work;
>>>>>> } ;
>>>>>
>>>>> I don't get it. You are dumping live kernel memory? There are already
>>>>> facilities to do that in place. Why to replicate it?
>>>> I am dumping the driver's memory under a lock so I can ensure it's
>>>> consistency (as appose to /dev/mem)
>>>> vmcore cannot be taken from a live kernel (without crashing).
>>>> I need the memory's snapshot right after the error from the driver's
>>>> context.
>>>
>>> Got it. However, this sounds like a generic problem not specific to
>>> nic drivers. How other subsystems resolve this (if they do at all)?
>>>
>>>
>> Correct, this is a suggested debugging solution for a generic problem:
>> enabling the user of a run time memory snapshot for kernel modules (at a
>> given error event). My research shows that other subsystems deal with
>> errors either by panicking (too much) or by debug/log prints (too little).
>> This solution is (a) low in maintenance (b) consistent in memory (c) has
>> small performance impact (d) use an existing infra-structure between the
>> kernel module and the user space.
> 
> I'm still convinced that dumping kernel memory over devlink health dump
> is a good idea :/
> 
> 
>> It might be ported to other subsystems using their own user-space vs.
>> kernel tools. Regardless of how the memory output was generated to the
>> user, the parsing script can work on it.
> 
> Could you share the script? How is it going to be distributed?
I thought that the script should be available on the Mellanox website.
The script is still pending review, but I will be happy to share it when
it's ready.
> 
> 
>>
>>>
>>>> Which other tools do you mean?
>>>>>
>>>>>
>>>>>>
>>>>>> Signed-off-by: Aya Levin <ayal@mellanox.com>
>>>>>> ---
>>>>>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 +++++++++++++++++++++
>>>>>> 1 file changed, 100 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>> index 476dd97f7f2f..8a39f5525e57 100644
>>>>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>> @@ -9,6 +9,7 @@
>>>>>>
>>>>>> struct mlx5e_tx_err_ctx {
>>>>>> 	int (*recover)(struct mlx5e_txqsq *sq);
>>>>>> +	int (*dump)(struct mlx5e_txqsq *sq);
>>>>>> 	struct mlx5e_txqsq *sq;
>>>>>> };
>>>>>>
>>>>>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
>>>>>> 	return err;
>>>>>> }
>>>>>>
>>>>>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>>>>>> +					      struct mlx5e_txqsq *sq,
>>>>>> +					      struct devlink_fmsg *fmsg)
>>>>>> +{
>>>>>> +	u64 *ptr = (u64 *)sq;
>>>>>> +	int copy, err;
>>>>>> +	int i = 0;
>>>>>> +
>>>>>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>>> +		return 0;
>>>>>> +
>>>>>> +	err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>>>>>> +	if (err)
>>>>>> +		return err;
>>>>>> +
>>>>>> +	err = devlink_fmsg_obj_nest_start(fmsg);
>>>>>> +	if (err)
>>>>>> +		return err;
>>>>>> +
>>>>>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>>>>>> +	if (err)
>>>>>> +		return err;
>>>>>> +
>>>>>> +	while (i < sizeof(struct mlx5e_txqsq)) {
>>>>>> +		copy = sizeof(u64);
>>>>>> +
>>>>>> +		if (i + copy > sizeof(struct mlx5e_txqsq))
>>>>>> +			copy = sizeof(struct mlx5e_txqsq) - i;
>>>>>> +
>>>>>> +		err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>>>>>> +		if (err)
>>>>>> +			return err;
>>>>>> +		ptr++;
>>>>>> +		i += copy;
>>>>>> +	}
>>>>>> +
>>>>>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>>> +	if (err)
>>>>>> +		return err;
>>>>>> +
>>>>>> +	err = devlink_fmsg_obj_nest_end(fmsg);
>>>>>> +	if (err)
>>>>>> +		return err;
>>>>>> +
>>>>>> +	err = devlink_fmsg_pair_nest_end(fmsg);
>>>>>> +
>>>>>> +	return err;
>>>>>> +}
>>>>>> +
>>>>>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>>>>>> +					 struct devlink_fmsg *fmsg)
>>>>>> +{
>>>>>> +	int i, err = 0;
>>>>>> +
>>>>>> +	mutex_lock(&priv->state_lock);
>>>>>> +
>>>>>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>>> +		goto unlock;
>>>>>> +
>>>>>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>>>>>> +	if (err)
>>>>>> +		goto unlock;
>>>>>> +
>>>>>> +	for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
>>>>>> +	     i++) {
>>>>>> +		err = devlink_fmsg_obj_nest_start(fmsg);
>>>>>> +		if (err)
>>>>>> +			goto unlock;
>>>>>> +
>>>>>> +		err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
>>>>>> +							 fmsg);
>>>>>> +		if (err)
>>>>>> +			goto unlock;
>>>>>> +
>>>>>> +		err = devlink_fmsg_pair_nest_end(fmsg);
>>>>>> +		if (err)
>>>>>> +			goto unlock;
>>>>>> +	}
>>>>>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>>> +	if (err)
>>>>>> +		goto unlock;
>>>>>> +
>>>>>> +unlock:
>>>>>> +	mutex_unlock(&priv->state_lock);
>>>>>> +	return err;
>>>>>> +}
>>>>>> +
>>>>>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter *reporter,
>>>>>> +				     struct devlink_fmsg *fmsg, void *context)
>>>>>> +{
>>>>>> +	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>>>>>> +	struct mlx5e_tx_err_ctx *err_ctx = context;
>>>>>> +
>>>>>> +	return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
>>>>>> +							    fmsg) :
>>>>>> +			 mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>>>>>> +}
>>>>>> +
>>>>>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>>>>>> 		.name = "tx",
>>>>>> 		.recover = mlx5e_tx_reporter_recover,
>>>>>> 		.diagnose = mlx5e_tx_reporter_diagnose,
>>>>>> +		.dump = mlx5e_tx_reporter_sw_dump,
>>>>>> };
>>>>>>
>>>>>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>>>>>> -- 
>>>>>> 2.14.1
>>>>>>
Jiri Pirko May 16, 2019, 11:53 a.m. UTC | #11
Thu, May 16, 2019 at 10:49:54AM CEST, ayal@mellanox.com wrote:
>
>
>On 5/14/2019 3:07 PM, Jiri Pirko wrote:
>> Sun, May 12, 2019 at 10:37:35AM CEST, ayal@mellanox.com wrote:
>>>
>>>
>>> On 5/9/2019 11:23 AM, Jiri Pirko wrote:
>>>> Tue, May 07, 2019 at 02:58:32PM CEST, ayal@mellanox.com wrote:
>>>>>
>>>>>
>>>>> On 5/7/2019 3:41 PM, Jiri Pirko wrote:
>>>>>> Mon, Apr 29, 2019 at 04:17:39PM CEST, ayal@mellanox.com wrote:
>>>>>>> TX reporter reports an error on two scenarios:
>>>>>>> - TX timeout on a specific tx queue
>>>>>>> - TX completion error on a specific send queue
>>>>>>> Prior to this patch, no dump data was supported by the tx reporter. This
>>>>>>> patch adds support for SW data dump of the related SQ context. The dump
>>>>>>> is simply the SQ's raw memory snapshot taken right after the error was
>>>>>>> reported, before any recovery procedure was launched. With this
>>>>>>> approach, no maintenance is needed as the driver fetch the actual data
>>>>>>> according to the layout on which the SQ was compiled with.  By providing
>>>>>>> a SW context, one can easily debug error on a given SQ.
>>>>>>>
>>>>>>> In order to offline translate the raw memory into a human readable
>>>>>>> format, the user can use some out-of-kernel scripts which receives as an
>>>>>>> input the following:
>>>>>>> - Object raw memory
>>>>>>> - Driver object compiled with debug info (can be taken/generated at any time from the machine)
>>>>>>> - Object name
>>>>>>>
>>>>>>> An example of such script output can be seen below.
>>>>>>> Note: the script is not offered as part of this patch as it do not
>>>>>>> belong to the kernel, I just described it in order to grasp the general
>>>>>>> idea of how/what can be fetched from SW dump via devlink health.
>>>>>>>
>>>>>>> The output of the SW dump can be extracted by devlink health command:
>>>>>>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>>>>>>> mlx5e_txqsq: sqn: 6336
>>>>>>> memory:
>>>>>>>      00 00 00 00 00 00 00 00
>>>>>>>      01 00 00 00 00 00 00 00
>>>>>>>      00 00 00 00 00 00 00 00
>>>>>>>      45 f4 88 cb 09 00 00 00
>>>>>>>      00 00 00 00 00 00 00 00
>>>>>>>      00 00 00 00 00 00 00 00
>>>>>>>      c0 ff ff ff 1f 00 00 00
>>>>>>>      f8 18 1e 89 81 88 ff ff
>>>>>>>      ...
>>>>>>>
>>>>>>> script output below, with struct members names and actual values:
>>>>>>>
>>>>>>> struct  mlx5e_txqsq {
>>>>>>> 	short unsigned int         cc 	 0x5 ;
>>>>>>> 	unsigned int               dma_fifo_cc 	 0x5 ;
>>>>>>> 	struct  net_dim {
>>>>>>> 		unsigned char      state 	 0x1 ;
>>>>>>> 		struct  net_dim_stats {
>>>>>>> 			int        ppms 	 0x0 ;
>>>>>>> 			int        bpms 	 0x0 ;
>>>>>>> 			int        epms 	 0x0 ;
>>>>>>> 		} prev_stats;
>>>>>>> 		struct  net_dim_sample {
>>>>>>> 			long long int time 	 0x90766ef9d ;
>>>>>>> 			unsigned int pkt_ctr 	 0x0 ;
>>>>>>> 			unsigned int byte_ctr 	 0x0 ;
>>>>>>> 			short unsigned int event_ctr 	 0x0 ;
>>>>>>> 		} start_sample;
>>>>>>> 		struct  work_struct {
>>>>>>> 			struct   {
>>>>>>> 				long int counter 	 0x1fffffffc0 ;
>>>>>>> 			} data;
>>>>>>> 			struct  list_head {
>>>>>>> 				struct list_head * next 	 0xffff8881b08998f8 ;
>>>>>>> 				struct list_head * prev 	 0xffff8881b08998f8 ;
>>>>>>> 			} entry;
>>>>>>> 			void       (*func)(struct work_struct *) 	 0xffffffffa02d0e30 ;
>>>>>>> 		} work;
>>>>>>> 		unsigned char      profile_ix 	 0x60 ;
>>>>>>> 		unsigned char      mode 	 0x72 ;
>>>>>>> 		unsigned char      tune_state 	 0x35 ;
>>>>>>> 		unsigned char      steps_right 	 0xa0 ;
>>>>>>> 		unsigned char      steps_left 	 0xff ;
>>>>>>> 		unsigned char      tired 	 0xff ;
>>>>>>> 	} dim;
>>>>>>> 	short unsigned int         pc 	 0x0 ;
>>>>>>> 	unsigned int               dma_fifo_pc 	 0x0 ;
>>>>>>> 	struct  mlx5e_cq {
>>>>>>> 		struct  mlx5_cqwq {
>>>>>>> 			struct  mlx5_frag_buf_ctrl {
>>>>>>> 				struct mlx5_buf_list * frags 	 0x500000005 ;
>>>>>>> 				unsigned int sz_m1 	 0x0 ;
>>>>>>> 				short unsigned int frag_sz_m1 	 0x0 ;
>>>>>>> 				short unsigned int strides_offset 	 0x0 ;
>>>>>>> 				unsigned char log_sz 	 0x0 ;
>>>>>>> 				unsigned char log_stride 	 0x0 ;
>>>>>>> 				unsigned char log_frag_strides 	 0x0 ;
>>>>>>> 			} fbc;
>>>>>>> 			__be32 *   db 	 0x0 ;
>>>>>>> 			unsigned int cc 	 0x0 ;
>>>>>>> 		} wq;
>>>>>>> 		short unsigned int event_ctr 	 0x0 ;
>>>>>>> 		struct napi_struct * napi 	 0x0 ;
>>>>>>> 		struct  mlx5_core_cq {
>>>>>>> 			unsigned int cqn 	 0x0 ;
>>>>>>> 			int        cqe_sz 	 0x0 ;
>>>>>>> 			__be32 *   set_ci_db 	 0xffff8881b1aa4988 ;
>>>>>>> 			__be32 *   arm_db 	 0x3f000003ff ;
>>>>>>> 			struct mlx5_uars_page * uar 	 0x6060a ;
>>>>>>> 			struct  refcount_struct {
>>>>>>> 				struct   {
>>>>>>> 					int    counter 	 0xa1814500 ;
>>>>>>> 				} refs;
>>>>>>> 			} refcount;
>>>>>>> 			struct  completion {
>>>>>>> 				unsigned int done 	 0x5 ;
>>>>>>> 				struct  wait_queue_head {
>>>>>>> 					struct  spinlock {
>>>>>>> 						union   {
>>>>>>> 							struct  raw_spinlock {
>>>>>>> 								struct  qspinlock {
>>>>>>> 									union   {
>>>>>>> 										struct   {
>>>>>>> 											int                                                    counter 	 0x5 ;
>>>>>>> 										} val;
>>>>>>> 										struct   {
>>>>>>> 											unsigned char                                          locked 	 0x5 ;
>>>>>>> 											unsigned char                                          pending 	 0x0 ;
>>>>>>> 										} ;
>>>>>>> 										struct   {
>>>>>>> 											short unsigned int                                     locked_pending 	 0x5 ;
>>>>>>> 											short unsigned int                                     tail 	 0x0 ;
>>>>>>> 										} ;
>>>>>>> 									} ;
>>>>>>> 								} raw_lock;
>>>>>>> 							} rlock;
>>>>>>> 						} ;
>>>>>>> 					} lock;
>>>>>>> 					struct  list_head {
>>>>>>> 						struct list_head * next 	 0xffff8881b089bb88 ;
>>>>>>> 						struct list_head * prev 	 0x4000000c0a ;
>>>>>>> 					} head;
>>>>>>> 				} wait;
>>>>>>> 			} free;
>>>>>>> 			unsigned int vector 	 0xa1814500 ;
>>>>>>> 			unsigned int irqn 	 0xffff8881 ;
>>>>>>> 			void       (*comp)(struct mlx5_core_cq *) 	 0xffff8881a1814504 ;
>>>>>>> 			void       (*event)(struct mlx5_core_cq *, enum mlx5_event) 	 0xffff8881a2cdea08 ;
>>>>>>> 			unsigned int cons_index 	 0x1 ;
>>>>>>> 			unsigned int arm_sn 	 0x0 ;
>>>>>>> 			struct mlx5_rsc_debug * dbg 	 0x0 ;
>>>>>>> 			int        pid 	 0x0 ;
>>>>>>> 			struct   {
>>>>>>> 				struct  list_head {
>>>>>>> 					struct list_head * next 	 0xffffffff ;
>>>>>>> 					struct list_head * prev 	 0xffffffffffffffff ;
>>>>>>> 				} list;
>>>>>>> 				void (*comp)(struct mlx5_core_cq *) 	 0xffffffffa0356940 ;
>>>>>>> 				void * priv 	 0x0 ;
>>>>>>> 			} tasklet_ctx;
>>>>>>> 			int        reset_notify_added 	 0x0 ;
>>>>>>> 			struct  list_head {
>>>>>>> 				struct list_head * next 	 0xffffffffa0300700 ;
>>>>>>> 				struct list_head * prev 	 0xd ;
>>>>>>> 			} reset_notify;
>>>>>>> 			struct mlx5_eq_comp * eq 	 0x0 ;
>>>>>>> 			short unsigned int uid 	 0x9a70 ;
>>>>>>> 		} mcq;
>>>>>>> 		struct mlx5e_channel * channel 	 0xffff8881b0899a70 ;
>>>>>>> 		struct mlx5_core_dev * mdev 	 0x4800000001 ;
>>>>>>> 		struct  mlx5_wq_ctrl {
>>>>>>> 			struct mlx5_core_dev * mdev 	 0xffffffffa02d5350 ;
>>>>>>> 			struct  mlx5_frag_buf {
>>>>>>> 				struct mlx5_buf_list * frags 	 0xffffffffa02d5460 ;
>>>>>>> 				int npages 	 0x0 ;
>>>>>>> 				int size 	 0x5 ;
>>>>>>> 				unsigned char page_shift 	 0x8 ;
>>>>>>> 			} buf;
>>>>>>> 			struct  mlx5_db {
>>>>>>> 				__be32 * db 	 0x1c6 ;
>>>>>>> 				union   {
>>>>>>> 					struct mlx5_db_pgdir * pgdir 	 0x0 ;
>>>>>>> 					struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>>>>>>> 				} u;
>>>>>>> 				long long unsigned int dma 	 0xffff8881b0899ab0 ;
>>>>>>> 				int index 	 0x0 ;
>>>>>>> 			} db;
>>>>>>> 		} wq_ctrl;
>>>>>>> 	} cq;
>>>>>>> 	struct  mlx5_wq_cyc {
>>>>>>> 		struct  mlx5_frag_buf_ctrl {
>>>>>>> 			struct mlx5_buf_list * frags 	 0xffff8881a7600160 ;
>>>>>>> 			unsigned int sz_m1 	 0xa7600160 ;
>>>>>>> 			short unsigned int frag_sz_m1 	 0x8881 ;
>>>>>>> 			short unsigned int strides_offset 	 0xffff ;
>>>>>>> 			unsigned char log_sz 	 0x88 ;
>>>>>>> 			unsigned char log_stride 	 0x49 ;
>>>>>>> 			unsigned char log_frag_strides 	 0xaa ;
>>>>>>> 		} fbc;
>>>>>>> 		__be32 *           db 	 0x1000000000010 ;
>>>>>>> 		short unsigned int sz 	 0xc ;
>>>>>>> 		short unsigned int wqe_ctr 	 0x0 ;
>>>>>>> 		short unsigned int cur_sz 	 0x0 ;
>>>>>>> 	} wq;
>>>>>>> 	unsigned int               dma_fifo_mask 	 0xa1814500 ;
>>>>>>> 	struct mlx5e_sq_stats *    stats 	 0xffff8881a33a0348 ;
>>>>>>> 	struct   {
>>>>>>> 		struct mlx5e_sq_dma * dma_fifo 	 0x1a1814500 ;
>>>>>>> 		struct mlx5e_tx_wqe_info * wqe_info 	 0x14 ;
>>>>>>> 	} db;
>>>>>>> 	void *                     uar_map 	 0x0 ;
>>>>>>> 	struct netdev_queue *      txq 	 0x0 ;
>>>>>>> 	unsigned int               sqn 	 0x18c0 ;
>>>>>>> 	unsigned char              min_inline_mode 	 0x0 ;
>>>>>>> 	struct device *            pdev 	 0x0 ;
>>>>>>> 	unsigned int               mkey_be 	 0x0 ;
>>>>>>> 	long unsigned int          state 	 0x0 ;
>>>>>>> 	struct hwtstamp_config *   tstamp 	 0x0 ;
>>>>>>> 	struct mlx5_clock *        clock 	 0xffff8881b1aa6f88 ;
>>>>>>> 	struct  mlx5_wq_ctrl {
>>>>>>> 		struct mlx5_core_dev * mdev 	 0x3f000003ff ;
>>>>>>> 		struct  mlx5_frag_buf {
>>>>>>> 			struct mlx5_buf_list * frags 	 0x6060a ;
>>>>>>> 			int        npages 	 0xa1814604 ;
>>>>>>> 			int        size 	 0xffff8881 ;
>>>>>>> 			unsigned char page_shift 	 0x0 ;
>>>>>>> 		} buf;
>>>>>>> 		struct  mlx5_db {
>>>>>>> 			__be32 *   db 	 0xfff ;
>>>>>>> 			union   {
>>>>>>> 				struct mlx5_db_pgdir * pgdir 	 0x0 ;
>>>>>>> 				struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>>>>>>> 			} u;
>>>>>>> 			long long unsigned int dma 	 0xffff888188440000 ;
>>>>>>> 			int        index 	 0x8b074000 ;
>>>>>>> 		} db;
>>>>>>> 	} wq_ctrl;
>>>>>>> 	struct mlx5e_channel *     channel 	 0xffffc9000010d800 ;
>>>>>>> 	int                        txq_ix 	 0xa0020180 ;
>>>>>>> 	unsigned int               rate_limit 	 0xffff8881 ;
>>>>>>> 	struct  work_struct {
>>>>>>> 		struct   {
>>>>>>> 			long int   counter 	 0x1000018c0 ;
>>>>>>> 		} data;
>>>>>>> 		struct  list_head {
>>>>>>> 			struct list_head * next 	 0xffff8881c32b68e8 ;
>>>>>>> 			struct list_head * prev 	 0x800 ;
>>>>>>> 		} entry;
>>>>>>> 		void               (*func)(struct work_struct *) 	 0x9 ;
>>>>>>> 	} recover_work;
>>>>>>> } ;
>>>>>>
>>>>>> I don't get it. You are dumping live kernel memory? There are already
>>>>>> facilities to do that in place. Why to replicate it?
>>>>> I am dumping the driver's memory under a lock so I can ensure its
>>>>> consistency (as opposed to /dev/mem).
>>>>> vmcore cannot be taken from a live kernel (without crashing).
>>>>> I need the memory's snapshot right after the error from the driver's
>>>>> context.
>>>>
>>>> Got it. However, this sounds like a generic problem not specific to
>>>> nic drivers. How other subsystems resolve this (if they do at all)?
>>>>
>>>>
>>> Correct, this is a suggested debugging solution for a generic problem:
>>> enabling the user of a run time memory snapshot for kernel modules (at a
>>> given error event). My research shows that other subsystems deal with
>>> errors either by panicking (too much) or by debug/log prints (too little).
>>> This solution (a) is low in maintenance, (b) is consistent in memory, (c) has
>>> a small performance impact, and (d) uses existing infrastructure between the
>>> kernel module and user space.
>> 
>> I'm still not convinced that dumping kernel memory over devlink health dump
>> is a good idea :/
>> 
>> 
>>> It might be ported to other subsystems using their own user-space vs.
>>> kernel tools. Regardless of how the memory output was generated to the
>>> user, the parsing script can work on it.
>> 
>> Could you share the script? How is it going to be distributed?
>I thought that the script should be available on the Mellanox website.

:(


>The script is still pending review but I will be happy to share it when
>it's ready.
>> 
>> 
>>>
>>>>
>>>>> Which other tools do you mean?
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> Signed-off-by: Aya Levin <ayal@mellanox.com>
>>>>>>> ---
>>>>>>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 +++++++++++++++++++++
>>>>>>> 1 file changed, 100 insertions(+)
>>>>>>>
>>>>>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>>> index 476dd97f7f2f..8a39f5525e57 100644
>>>>>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>>> @@ -9,6 +9,7 @@
>>>>>>>
>>>>>>> struct mlx5e_tx_err_ctx {
>>>>>>> 	int (*recover)(struct mlx5e_txqsq *sq);
>>>>>>> +	int (*dump)(struct mlx5e_txqsq *sq);
>>>>>>> 	struct mlx5e_txqsq *sq;
>>>>>>> };
>>>>>>>
>>>>>>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
>>>>>>> 	return err;
>>>>>>> }
>>>>>>>
>>>>>>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>>>>>>> +					      struct mlx5e_txqsq *sq,
>>>>>>> +					      struct devlink_fmsg *fmsg)
>>>>>>> +{
>>>>>>> +	u64 *ptr = (u64 *)sq;
>>>>>>> +	int copy, err;
>>>>>>> +	int i = 0;
>>>>>>> +
>>>>>>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>>>> +		return 0;
>>>>>>> +
>>>>>>> +	err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>>>>>>> +	if (err)
>>>>>>> +		return err;
>>>>>>> +
>>>>>>> +	err = devlink_fmsg_obj_nest_start(fmsg);
>>>>>>> +	if (err)
>>>>>>> +		return err;
>>>>>>> +
>>>>>>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>>>>>>> +	if (err)
>>>>>>> +		return err;
>>>>>>> +
>>>>>>> +	while (i < sizeof(struct mlx5e_txqsq)) {
>>>>>>> +		copy = sizeof(u64);
>>>>>>> +
>>>>>>> +		if (i + copy > sizeof(struct mlx5e_txqsq))
>>>>>>> +			copy = sizeof(struct mlx5e_txqsq) - i;
>>>>>>> +
>>>>>>> +		err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>>>>>>> +		if (err)
>>>>>>> +			return err;
>>>>>>> +		ptr++;
>>>>>>> +		i += copy;
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>>>> +	if (err)
>>>>>>> +		return err;
>>>>>>> +
>>>>>>> +	err = devlink_fmsg_obj_nest_end(fmsg);
>>>>>>> +	if (err)
>>>>>>> +		return err;
>>>>>>> +
>>>>>>> +	err = devlink_fmsg_pair_nest_end(fmsg);
>>>>>>> +
>>>>>>> +	return err;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>>>>>>> +					 struct devlink_fmsg *fmsg)
>>>>>>> +{
>>>>>>> +	int i, err = 0;
>>>>>>> +
>>>>>>> +	mutex_lock(&priv->state_lock);
>>>>>>> +
>>>>>>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>>>> +		goto unlock;
>>>>>>> +
>>>>>>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>>>>>>> +	if (err)
>>>>>>> +		goto unlock;
>>>>>>> +
>>>>>>> +	for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
>>>>>>> +	     i++) {
>>>>>>> +		err = devlink_fmsg_obj_nest_start(fmsg);
>>>>>>> +		if (err)
>>>>>>> +			goto unlock;
>>>>>>> +
>>>>>>> +		err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
>>>>>>> +							 fmsg);
>>>>>>> +		if (err)
>>>>>>> +			goto unlock;
>>>>>>> +
>>>>>>> +		err = devlink_fmsg_pair_nest_end(fmsg);
>>>>>>> +		if (err)
>>>>>>> +			goto unlock;
>>>>>>> +	}
>>>>>>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>>>> +	if (err)
>>>>>>> +		goto unlock;
>>>>>>> +
>>>>>>> +unlock:
>>>>>>> +	mutex_unlock(&priv->state_lock);
>>>>>>> +	return err;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter *reporter,
>>>>>>> +				     struct devlink_fmsg *fmsg, void *context)
>>>>>>> +{
>>>>>>> +	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>>>>>>> +	struct mlx5e_tx_err_ctx *err_ctx = context;
>>>>>>> +
>>>>>>> +	return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
>>>>>>> +							    fmsg) :
>>>>>>> +			 mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>>>>>>> +}
>>>>>>> +
>>>>>>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>>>>>>> 		.name = "tx",
>>>>>>> 		.recover = mlx5e_tx_reporter_recover,
>>>>>>> 		.diagnose = mlx5e_tx_reporter_diagnose,
>>>>>>> +		.dump = mlx5e_tx_reporter_sw_dump,
>>>>>>> };
>>>>>>>
>>>>>>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>>>>>>> -- 
>>>>>>> 2.14.1
>>>>>>>
Aya Levin May 16, 2019, 12:02 p.m. UTC | #12
On 5/16/2019 2:53 PM, Jiri Pirko wrote:
> Thu, May 16, 2019 at 10:49:54AM CEST, ayal@mellanox.com wrote:
>>
>>
>> On 5/14/2019 3:07 PM, Jiri Pirko wrote:
>>> Sun, May 12, 2019 at 10:37:35AM CEST, ayal@mellanox.com wrote:
>>>>
>>>>
>>>> On 5/9/2019 11:23 AM, Jiri Pirko wrote:
>>>>> Tue, May 07, 2019 at 02:58:32PM CEST, ayal@mellanox.com wrote:
>>>>>>
>>>>>>
>>>>>> On 5/7/2019 3:41 PM, Jiri Pirko wrote:
>>>>>>> Mon, Apr 29, 2019 at 04:17:39PM CEST, ayal@mellanox.com wrote:
>>>>>>>> TX reporter reports an error on two scenarios:
>>>>>>>> - TX timeout on a specific tx queue
>>>>>>>> - TX completion error on a specific send queue
>>>>>>>> Prior to this patch, no dump data was supported by the tx reporter. This
>>>>>>>> patch adds support for SW data dump of the related SQ context. The dump
>>>>>>>> is simply the SQ's raw memory snapshot taken right after the error was
>>>>>>>> reported, before any recovery procedure was launched. With this
>>>>>>>> approach, no maintenance is needed, as the driver fetches the actual data
>>>>>>>> according to the layout with which the SQ was compiled. By providing
>>>>>>>> a SW context, one can easily debug an error on a given SQ.
>>>>>>>>
>>>>>>>> In order to translate the raw memory offline into a human-readable
>>>>>>>> format, the user can use an out-of-kernel script which receives the
>>>>>>>> following as input:
>>>>>>>> - Object raw memory
>>>>>>>> - Driver object compiled with debug info (can be taken/generated at any time from the machine)
>>>>>>>> - Object name
>>>>>>>>
>>>>>>>> An example of such script output can be seen below.
>>>>>>>> Note: the script is not offered as part of this patch as it does not
>>>>>>>> belong to the kernel; I describe it only to convey the general
>>>>>>>> idea of how/what can be fetched from a SW dump via devlink health.
>>>>>>>>
>>>>>>>> The output of the SW dump can be extracted by devlink health command:
>>>>>>>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>>>>>>>> mlx5e_txqsq: sqn: 6336
>>>>>>>> memory:
>>>>>>>>       00 00 00 00 00 00 00 00
>>>>>>>>       01 00 00 00 00 00 00 00
>>>>>>>>       00 00 00 00 00 00 00 00
>>>>>>>>       45 f4 88 cb 09 00 00 00
>>>>>>>>       00 00 00 00 00 00 00 00
>>>>>>>>       00 00 00 00 00 00 00 00
>>>>>>>>       c0 ff ff ff 1f 00 00 00
>>>>>>>>       f8 18 1e 89 81 88 ff ff
>>>>>>>>       ...
>>>>>>>>
>>>>>>>> script output below, with struct members names and actual values:
>>>>>>>>
>>>>>>>> struct  mlx5e_txqsq {
>>>>>>>> 	short unsigned int         cc 	 0x5 ;
>>>>>>>> 	unsigned int               dma_fifo_cc 	 0x5 ;
>>>>>>>> 	struct  net_dim {
>>>>>>>> 		unsigned char      state 	 0x1 ;
>>>>>>>> 		struct  net_dim_stats {
>>>>>>>> 			int        ppms 	 0x0 ;
>>>>>>>> 			int        bpms 	 0x0 ;
>>>>>>>> 			int        epms 	 0x0 ;
>>>>>>>> 		} prev_stats;
>>>>>>>> 		struct  net_dim_sample {
>>>>>>>> 			long long int time 	 0x90766ef9d ;
>>>>>>>> 			unsigned int pkt_ctr 	 0x0 ;
>>>>>>>> 			unsigned int byte_ctr 	 0x0 ;
>>>>>>>> 			short unsigned int event_ctr 	 0x0 ;
>>>>>>>> 		} start_sample;
>>>>>>>> 		struct  work_struct {
>>>>>>>> 			struct   {
>>>>>>>> 				long int counter 	 0x1fffffffc0 ;
>>>>>>>> 			} data;
>>>>>>>> 			struct  list_head {
>>>>>>>> 				struct list_head * next 	 0xffff8881b08998f8 ;
>>>>>>>> 				struct list_head * prev 	 0xffff8881b08998f8 ;
>>>>>>>> 			} entry;
>>>>>>>> 			void       (*func)(struct work_struct *) 	 0xffffffffa02d0e30 ;
>>>>>>>> 		} work;
>>>>>>>> 		unsigned char      profile_ix 	 0x60 ;
>>>>>>>> 		unsigned char      mode 	 0x72 ;
>>>>>>>> 		unsigned char      tune_state 	 0x35 ;
>>>>>>>> 		unsigned char      steps_right 	 0xa0 ;
>>>>>>>> 		unsigned char      steps_left 	 0xff ;
>>>>>>>> 		unsigned char      tired 	 0xff ;
>>>>>>>> 	} dim;
>>>>>>>> 	short unsigned int         pc 	 0x0 ;
>>>>>>>> 	unsigned int               dma_fifo_pc 	 0x0 ;
>>>>>>>> 	struct  mlx5e_cq {
>>>>>>>> 		struct  mlx5_cqwq {
>>>>>>>> 			struct  mlx5_frag_buf_ctrl {
>>>>>>>> 				struct mlx5_buf_list * frags 	 0x500000005 ;
>>>>>>>> 				unsigned int sz_m1 	 0x0 ;
>>>>>>>> 				short unsigned int frag_sz_m1 	 0x0 ;
>>>>>>>> 				short unsigned int strides_offset 	 0x0 ;
>>>>>>>> 				unsigned char log_sz 	 0x0 ;
>>>>>>>> 				unsigned char log_stride 	 0x0 ;
>>>>>>>> 				unsigned char log_frag_strides 	 0x0 ;
>>>>>>>> 			} fbc;
>>>>>>>> 			__be32 *   db 	 0x0 ;
>>>>>>>> 			unsigned int cc 	 0x0 ;
>>>>>>>> 		} wq;
>>>>>>>> 		short unsigned int event_ctr 	 0x0 ;
>>>>>>>> 		struct napi_struct * napi 	 0x0 ;
>>>>>>>> 		struct  mlx5_core_cq {
>>>>>>>> 			unsigned int cqn 	 0x0 ;
>>>>>>>> 			int        cqe_sz 	 0x0 ;
>>>>>>>> 			__be32 *   set_ci_db 	 0xffff8881b1aa4988 ;
>>>>>>>> 			__be32 *   arm_db 	 0x3f000003ff ;
>>>>>>>> 			struct mlx5_uars_page * uar 	 0x6060a ;
>>>>>>>> 			struct  refcount_struct {
>>>>>>>> 				struct   {
>>>>>>>> 					int    counter 	 0xa1814500 ;
>>>>>>>> 				} refs;
>>>>>>>> 			} refcount;
>>>>>>>> 			struct  completion {
>>>>>>>> 				unsigned int done 	 0x5 ;
>>>>>>>> 				struct  wait_queue_head {
>>>>>>>> 					struct  spinlock {
>>>>>>>> 						union   {
>>>>>>>> 							struct  raw_spinlock {
>>>>>>>> 								struct  qspinlock {
>>>>>>>> 									union   {
>>>>>>>> 										struct   {
>>>>>>>> 											int                                                    counter 	 0x5 ;
>>>>>>>> 										} val;
>>>>>>>> 										struct   {
>>>>>>>> 											unsigned char                                          locked 	 0x5 ;
>>>>>>>> 											unsigned char                                          pending 	 0x0 ;
>>>>>>>> 										} ;
>>>>>>>> 										struct   {
>>>>>>>> 											short unsigned int                                     locked_pending 	 0x5 ;
>>>>>>>> 											short unsigned int                                     tail 	 0x0 ;
>>>>>>>> 										} ;
>>>>>>>> 									} ;
>>>>>>>> 								} raw_lock;
>>>>>>>> 							} rlock;
>>>>>>>> 						} ;
>>>>>>>> 					} lock;
>>>>>>>> 					struct  list_head {
>>>>>>>> 						struct list_head * next 	 0xffff8881b089bb88 ;
>>>>>>>> 						struct list_head * prev 	 0x4000000c0a ;
>>>>>>>> 					} head;
>>>>>>>> 				} wait;
>>>>>>>> 			} free;
>>>>>>>> 			unsigned int vector 	 0xa1814500 ;
>>>>>>>> 			unsigned int irqn 	 0xffff8881 ;
>>>>>>>> 			void       (*comp)(struct mlx5_core_cq *) 	 0xffff8881a1814504 ;
>>>>>>>> 			void       (*event)(struct mlx5_core_cq *, enum mlx5_event) 	 0xffff8881a2cdea08 ;
>>>>>>>> 			unsigned int cons_index 	 0x1 ;
>>>>>>>> 			unsigned int arm_sn 	 0x0 ;
>>>>>>>> 			struct mlx5_rsc_debug * dbg 	 0x0 ;
>>>>>>>> 			int        pid 	 0x0 ;
>>>>>>>> 			struct   {
>>>>>>>> 				struct  list_head {
>>>>>>>> 					struct list_head * next 	 0xffffffff ;
>>>>>>>> 					struct list_head * prev 	 0xffffffffffffffff ;
>>>>>>>> 				} list;
>>>>>>>> 				void (*comp)(struct mlx5_core_cq *) 	 0xffffffffa0356940 ;
>>>>>>>> 				void * priv 	 0x0 ;
>>>>>>>> 			} tasklet_ctx;
>>>>>>>> 			int        reset_notify_added 	 0x0 ;
>>>>>>>> 			struct  list_head {
>>>>>>>> 				struct list_head * next 	 0xffffffffa0300700 ;
>>>>>>>> 				struct list_head * prev 	 0xd ;
>>>>>>>> 			} reset_notify;
>>>>>>>> 			struct mlx5_eq_comp * eq 	 0x0 ;
>>>>>>>> 			short unsigned int uid 	 0x9a70 ;
>>>>>>>> 		} mcq;
>>>>>>>> 		struct mlx5e_channel * channel 	 0xffff8881b0899a70 ;
>>>>>>>> 		struct mlx5_core_dev * mdev 	 0x4800000001 ;
>>>>>>>> 		struct  mlx5_wq_ctrl {
>>>>>>>> 			struct mlx5_core_dev * mdev 	 0xffffffffa02d5350 ;
>>>>>>>> 			struct  mlx5_frag_buf {
>>>>>>>> 				struct mlx5_buf_list * frags 	 0xffffffffa02d5460 ;
>>>>>>>> 				int npages 	 0x0 ;
>>>>>>>> 				int size 	 0x5 ;
>>>>>>>> 				unsigned char page_shift 	 0x8 ;
>>>>>>>> 			} buf;
>>>>>>>> 			struct  mlx5_db {
>>>>>>>> 				__be32 * db 	 0x1c6 ;
>>>>>>>> 				union   {
>>>>>>>> 					struct mlx5_db_pgdir * pgdir 	 0x0 ;
>>>>>>>> 					struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>>>>>>>> 				} u;
>>>>>>>> 				long long unsigned int dma 	 0xffff8881b0899ab0 ;
>>>>>>>> 				int index 	 0x0 ;
>>>>>>>> 			} db;
>>>>>>>> 		} wq_ctrl;
>>>>>>>> 	} cq;
>>>>>>>> 	struct  mlx5_wq_cyc {
>>>>>>>> 		struct  mlx5_frag_buf_ctrl {
>>>>>>>> 			struct mlx5_buf_list * frags 	 0xffff8881a7600160 ;
>>>>>>>> 			unsigned int sz_m1 	 0xa7600160 ;
>>>>>>>> 			short unsigned int frag_sz_m1 	 0x8881 ;
>>>>>>>> 			short unsigned int strides_offset 	 0xffff ;
>>>>>>>> 			unsigned char log_sz 	 0x88 ;
>>>>>>>> 			unsigned char log_stride 	 0x49 ;
>>>>>>>> 			unsigned char log_frag_strides 	 0xaa ;
>>>>>>>> 		} fbc;
>>>>>>>> 		__be32 *           db 	 0x1000000000010 ;
>>>>>>>> 		short unsigned int sz 	 0xc ;
>>>>>>>> 		short unsigned int wqe_ctr 	 0x0 ;
>>>>>>>> 		short unsigned int cur_sz 	 0x0 ;
>>>>>>>> 	} wq;
>>>>>>>> 	unsigned int               dma_fifo_mask 	 0xa1814500 ;
>>>>>>>> 	struct mlx5e_sq_stats *    stats 	 0xffff8881a33a0348 ;
>>>>>>>> 	struct   {
>>>>>>>> 		struct mlx5e_sq_dma * dma_fifo 	 0x1a1814500 ;
>>>>>>>> 		struct mlx5e_tx_wqe_info * wqe_info 	 0x14 ;
>>>>>>>> 	} db;
>>>>>>>> 	void *                     uar_map 	 0x0 ;
>>>>>>>> 	struct netdev_queue *      txq 	 0x0 ;
>>>>>>>> 	unsigned int               sqn 	 0x18c0 ;
>>>>>>>> 	unsigned char              min_inline_mode 	 0x0 ;
>>>>>>>> 	struct device *            pdev 	 0x0 ;
>>>>>>>> 	unsigned int               mkey_be 	 0x0 ;
>>>>>>>> 	long unsigned int          state 	 0x0 ;
>>>>>>>> 	struct hwtstamp_config *   tstamp 	 0x0 ;
>>>>>>>> 	struct mlx5_clock *        clock 	 0xffff8881b1aa6f88 ;
>>>>>>>> 	struct  mlx5_wq_ctrl {
>>>>>>>> 		struct mlx5_core_dev * mdev 	 0x3f000003ff ;
>>>>>>>> 		struct  mlx5_frag_buf {
>>>>>>>> 			struct mlx5_buf_list * frags 	 0x6060a ;
>>>>>>>> 			int        npages 	 0xa1814604 ;
>>>>>>>> 			int        size 	 0xffff8881 ;
>>>>>>>> 			unsigned char page_shift 	 0x0 ;
>>>>>>>> 		} buf;
>>>>>>>> 		struct  mlx5_db {
>>>>>>>> 			__be32 *   db 	 0xfff ;
>>>>>>>> 			union   {
>>>>>>>> 				struct mlx5_db_pgdir * pgdir 	 0x0 ;
>>>>>>>> 				struct mlx5_ib_user_db_page * user_page 	 0x0 ;
>>>>>>>> 			} u;
>>>>>>>> 			long long unsigned int dma 	 0xffff888188440000 ;
>>>>>>>> 			int        index 	 0x8b074000 ;
>>>>>>>> 		} db;
>>>>>>>> 	} wq_ctrl;
>>>>>>>> 	struct mlx5e_channel *     channel 	 0xffffc9000010d800 ;
>>>>>>>> 	int                        txq_ix 	 0xa0020180 ;
>>>>>>>> 	unsigned int               rate_limit 	 0xffff8881 ;
>>>>>>>> 	struct  work_struct {
>>>>>>>> 		struct   {
>>>>>>>> 			long int   counter 	 0x1000018c0 ;
>>>>>>>> 		} data;
>>>>>>>> 		struct  list_head {
>>>>>>>> 			struct list_head * next 	 0xffff8881c32b68e8 ;
>>>>>>>> 			struct list_head * prev 	 0x800 ;
>>>>>>>> 		} entry;
>>>>>>>> 		void               (*func)(struct work_struct *) 	 0x9 ;
>>>>>>>> 	} recover_work;
>>>>>>>> } ;
>>>>>>>
>>>>>>> I don't get it. You are dumping live kernel memory? There are already
>>>>>>> facilities to do that in place. Why to replicate it?
>>>>>> I am dumping the driver's memory under a lock so I can ensure its
>>>>>> consistency (as opposed to /dev/mem).
>>>>>> vmcore cannot be taken from a live kernel (without crashing).
>>>>>> I need the memory's snapshot right after the error from the driver's
>>>>>> context.
>>>>>
>>>>> Got it. However, this sounds like a generic problem not specific to
>>>>> nic drivers. How other subsystems resolve this (if they do at all)?
>>>>>
>>>>>
>>>> Correct, this is a suggested debugging solution for a generic problem:
>>>> enabling the user of a run time memory snapshot for kernel modules (at a
>>>> given error event). My research shows that other subsystems deal with
>>>> errors either by panicking (too much) or by debug/log prints (too little).
>>>> This solution (a) is low in maintenance, (b) is consistent in memory, (c) has
>>>> a small performance impact, and (d) uses existing infrastructure between the
>>>> kernel module and user space.
>>>
>>> I'm still not convinced that dumping kernel memory over devlink health dump
>>> is a good idea :/
>>>
>>>
>>>> It might be ported to other subsystems using their own user-space vs.
>>>> kernel tools. Regardless of how the memory output was generated to the
>>>> user, the parsing script can work on it.
>>>
>>> Could you share the script? How is it going to be distributed?
>> I thought that the script should be available on the Mellanox website.
> 
> :(
Do you think it belongs under kernel/scripts?
> 
> 
>> The script is still pending review but I will be happy to share it when
>> it's ready.
>>>
>>>
>>>>
>>>>>
>>>>>> Which other tools do you mean?
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> Signed-off-by: Aya Levin <ayal@mellanox.com>
>>>>>>>> ---
>>>>>>>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 +++++++++++++++++++++
>>>>>>>> 1 file changed, 100 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>>>> index 476dd97f7f2f..8a39f5525e57 100644
>>>>>>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>>>> @@ -9,6 +9,7 @@
>>>>>>>>
>>>>>>>> struct mlx5e_tx_err_ctx {
>>>>>>>> 	int (*recover)(struct mlx5e_txqsq *sq);
>>>>>>>> +	int (*dump)(struct mlx5e_txqsq *sq);
>>>>>>>> 	struct mlx5e_txqsq *sq;
>>>>>>>> };
>>>>>>>>
>>>>>>>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
>>>>>>>> 	return err;
>>>>>>>> }
>>>>>>>>
>>>>>>>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>>>>>>>> +					      struct mlx5e_txqsq *sq,
>>>>>>>> +					      struct devlink_fmsg *fmsg)
>>>>>>>> +{
>>>>>>>> +	u64 *ptr = (u64 *)sq;
>>>>>>>> +	int copy, err;
>>>>>>>> +	int i = 0;
>>>>>>>> +
>>>>>>>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>>>>> +		return 0;
>>>>>>>> +
>>>>>>>> +	err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>>>>>>>> +	if (err)
>>>>>>>> +		return err;
>>>>>>>> +
>>>>>>>> +	err = devlink_fmsg_obj_nest_start(fmsg);
>>>>>>>> +	if (err)
>>>>>>>> +		return err;
>>>>>>>> +
>>>>>>>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>>>>>>>> +	if (err)
>>>>>>>> +		return err;
>>>>>>>> +
>>>>>>>> +	while (i < sizeof(struct mlx5e_txqsq)) {
>>>>>>>> +		copy = sizeof(u64);
>>>>>>>> +
>>>>>>>> +		if (i + copy > sizeof(struct mlx5e_txqsq))
>>>>>>>> +			copy = sizeof(struct mlx5e_txqsq) - i;
>>>>>>>> +
>>>>>>>> +		err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>>>>>>>> +		if (err)
>>>>>>>> +			return err;
>>>>>>>> +		ptr++;
>>>>>>>> +		i += copy;
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>>>>> +	if (err)
>>>>>>>> +		return err;
>>>>>>>> +
>>>>>>>> +	err = devlink_fmsg_obj_nest_end(fmsg);
>>>>>>>> +	if (err)
>>>>>>>> +		return err;
>>>>>>>> +
>>>>>>>> +	err = devlink_fmsg_pair_nest_end(fmsg);
>>>>>>>> +
>>>>>>>> +	return err;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>>>>>>>> +					 struct devlink_fmsg *fmsg)
>>>>>>>> +{
>>>>>>>> +	int i, err = 0;
>>>>>>>> +
>>>>>>>> +	mutex_lock(&priv->state_lock);
>>>>>>>> +
>>>>>>>> +	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>>>>> +		goto unlock;
>>>>>>>> +
>>>>>>>> +	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>>>>>>>> +	if (err)
>>>>>>>> +		goto unlock;
>>>>>>>> +
>>>>>>>> +	for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
>>>>>>>> +	     i++) {
>>>>>>>> +		err = devlink_fmsg_obj_nest_start(fmsg);
>>>>>>>> +		if (err)
>>>>>>>> +			goto unlock;
>>>>>>>> +
>>>>>>>> +		err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
>>>>>>>> +							 fmsg);
>>>>>>>> +		if (err)
>>>>>>>> +			goto unlock;
>>>>>>>> +
>>>>>>>> +		err = devlink_fmsg_pair_nest_end(fmsg);
>>>>>>>> +		if (err)
>>>>>>>> +			goto unlock;
>>>>>>>> +	}
>>>>>>>> +	err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>>>>> +	if (err)
>>>>>>>> +		goto unlock;
>>>>>>>> +
>>>>>>>> +unlock:
>>>>>>>> +	mutex_unlock(&priv->state_lock);
>>>>>>>> +	return err;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter *reporter,
>>>>>>>> +				     struct devlink_fmsg *fmsg, void *context)
>>>>>>>> +{
>>>>>>>> +	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>>>>>>>> +	struct mlx5e_tx_err_ctx *err_ctx = context;
>>>>>>>> +
>>>>>>>> +	return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
>>>>>>>> +							    fmsg) :
>>>>>>>> +			 mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>>>>>>>> 		.name = "tx",
>>>>>>>> 		.recover = mlx5e_tx_reporter_recover,
>>>>>>>> 		.diagnose = mlx5e_tx_reporter_diagnose,
>>>>>>>> +		.dump = mlx5e_tx_reporter_sw_dump,
>>>>>>>> };
>>>>>>>>
>>>>>>>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>>>>>>>> -- 
>>>>>>>> 2.14.1
>>>>>>>>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index 476dd97f7f2f..8a39f5525e57 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -9,6 +9,7 @@ 
 
 struct mlx5e_tx_err_ctx {
 	int (*recover)(struct mlx5e_txqsq *sq);
+	int (*dump)(struct mlx5e_txqsq *sq);
 	struct mlx5e_txqsq *sq;
 };
 
@@ -281,10 +282,109 @@  static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
 	return err;
 }
 
+static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
+					      struct mlx5e_txqsq *sq,
+					      struct devlink_fmsg *fmsg)
+{
+	u64 *ptr = (u64 *)sq;
+	int copy, err;
+	int i = 0;
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		return 0;
+
+	err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
+	if (err)
+		return err;
+
+	err = devlink_fmsg_obj_nest_start(fmsg);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
+	if (err)
+		return err;
+
+	while (i < sizeof(struct mlx5e_txqsq)) {
+		copy = sizeof(u64);
+
+		if (i + copy > sizeof(struct mlx5e_txqsq))
+			copy = sizeof(struct mlx5e_txqsq) - i;
+
+		err = devlink_fmsg_binary_put(fmsg, ptr, copy);
+		if (err)
+			return err;
+		ptr++;
+		i += copy;
+	}
+
+	err = devlink_fmsg_arr_pair_nest_end(fmsg);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_obj_nest_end(fmsg);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_pair_nest_end(fmsg);
+
+	return err;
+}
+
+static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
+					 struct devlink_fmsg *fmsg)
+{
+	int i, err = 0;
+
+	mutex_lock(&priv->state_lock);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		goto unlock;
+
+	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
+	if (err)
+		goto unlock;
+
+	for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
+	     i++) {
+		err = devlink_fmsg_obj_nest_start(fmsg);
+		if (err)
+			goto unlock;
+
+		err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
+							 fmsg);
+		if (err)
+			goto unlock;
+
+		err = devlink_fmsg_pair_nest_end(fmsg);
+		if (err)
+			goto unlock;
+	}
+	err = devlink_fmsg_arr_pair_nest_end(fmsg);
+	if (err)
+		goto unlock;
+
+unlock:
+	mutex_unlock(&priv->state_lock);
+	return err;
+}
+
+static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter *reporter,
+				     struct devlink_fmsg *fmsg, void *context)
+{
+	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
+	struct mlx5e_tx_err_ctx *err_ctx = context;
+
+	return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
+							    fmsg) :
+			 mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
+}
+
 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
 		.name = "tx",
 		.recover = mlx5e_tx_reporter_recover,
 		.diagnose = mlx5e_tx_reporter_diagnose,
+		.dump = mlx5e_tx_reporter_sw_dump,
 };
 
 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500