diff mbox series

[v2,6/7] powerpc/eeh: Allow disabling recovery

Message ID 20190215004817.19961-6-oohall@gmail.com (mailing list archive)
State Accepted
Commit 6b493f6079a430fd41f66933b68d1bb1ad37ca8c
Headers show
Series [v2,1/7] powerpc/eeh: Use debugfs_create_u32 for eeh_max_freezes | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success next/apply_patch Successfully applied
snowpatch_ozlabs/checkpatch warning total: 0 errors, 0 warnings, 1 checks, 44 lines checked

Commit Message

Oliver O'Halloran Feb. 15, 2019, 12:48 a.m. UTC
Currently when we detect an error we automatically invoke the EEH recovery
handler. This can be annoying when debugging EEH problems, or when working
on EEH itself so this patch adds a debugfs knob that will prevent a
recovery event from being queued up when an issue is detected.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/include/asm/eeh.h  |  1 +
 arch/powerpc/kernel/eeh.c       | 10 ++++++++++
 arch/powerpc/kernel/eeh_event.c |  9 +++++++++
 3 files changed, 20 insertions(+)

Comments

Sam Bobroff Feb. 15, 2019, 5:58 a.m. UTC | #1
On Fri, Feb 15, 2019 at 11:48:16AM +1100, Oliver O'Halloran wrote:
> Currently when we detect an error we automatically invoke the EEH recovery
> handler. This can be annoying when debugging EEH problems, or when working
> on EEH itself so this patch adds a debugfs knob that will prevent a
> recovery event from being queued up when an issue is detected.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
>  arch/powerpc/include/asm/eeh.h  |  1 +
>  arch/powerpc/kernel/eeh.c       | 10 ++++++++++
>  arch/powerpc/kernel/eeh_event.c |  9 +++++++++
>  3 files changed, 20 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index 478f199d5663..810e05273ad3 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -220,6 +220,7 @@ struct eeh_ops {
>  
>  extern int eeh_subsystem_flags;
>  extern u32 eeh_max_freezes;
> +extern bool eeh_debugfs_no_recover;
>  extern struct eeh_ops *eeh_ops;
>  extern raw_spinlock_t confirm_error_lock;
>  
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 82d22c671c0e..9f20099ce2d9 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -111,6 +111,13 @@ EXPORT_SYMBOL(eeh_subsystem_flags);
>   */
>  u32 eeh_max_freezes = 5;
>  
> +/*
> + * Controls whether a recovery event should be scheduled when an
> + * isolated device is discovered. This is only really useful for
> + * debugging problems with the EEH core.
> + */
> +bool eeh_debugfs_no_recover;
> +
>  /* Platform dependent EEH operations */
>  struct eeh_ops *eeh_ops = NULL;
>  
> @@ -1810,6 +1817,9 @@ static int __init eeh_init_proc(void)
>  					   &eeh_enable_dbgfs_ops);
>  		debugfs_create_u32("eeh_max_freezes", 0600,
>  				powerpc_debugfs_root, &eeh_max_freezes);
> +		debugfs_create_bool("eeh_disable_recovery", 0600,
> +				powerpc_debugfs_root,
> +				&eeh_debugfs_no_recover);
>  		eeh_cache_debugfs_init();
>  #endif
>  	}
> diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
> index 227e57f980df..19837798bb1d 100644
> --- a/arch/powerpc/kernel/eeh_event.c
> +++ b/arch/powerpc/kernel/eeh_event.c
> @@ -126,6 +126,15 @@ int eeh_send_failure_event(struct eeh_pe *pe)
>  	unsigned long flags;
>  	struct eeh_event *event;
>  
> +	/*
> +	 * If we've manually supressed recovery events via debugfs
> +	 * then just drop it on the floor.
> +	 */
> +	if (eeh_debugfs_no_recover) {
> +		pr_err("EEH: Event dropped due to no_recover setting\n");
> +		return 0;
> +	}
> +

I think it might be clearer if you did the 'no recovery' test at the
call sites (I think there are only a few), instead of inside the
function (and then the next patch wouldn't need to add a wrapper).

>  	event = kzalloc(sizeof(*event), GFP_ATOMIC);
>  	if (!event) {
>  		pr_err("EEH: out of memory, event not handled\n");
> -- 
> 2.20.1
>
Oliver O'Halloran Feb. 17, 2019, 11:19 p.m. UTC | #2
On Fri, Feb 15, 2019 at 4:58 PM Sam Bobroff <sbobroff@linux.ibm.com> wrote:
>
> On Fri, Feb 15, 2019 at 11:48:16AM +1100, Oliver O'Halloran wrote:
> > Currently when we detect an error we automatically invoke the EEH recovery
> > handler. This can be annoying when debugging EEH problems, or when working
> > on EEH itself so this patch adds a debugfs knob that will prevent a
> > recovery event from being queued up when an issue is detected.
> >
> > Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> > ---
> >  arch/powerpc/include/asm/eeh.h  |  1 +
> >  arch/powerpc/kernel/eeh.c       | 10 ++++++++++
> >  arch/powerpc/kernel/eeh_event.c |  9 +++++++++
> >  3 files changed, 20 insertions(+)
> >
> > diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> > index 478f199d5663..810e05273ad3 100644
> > --- a/arch/powerpc/include/asm/eeh.h
> > +++ b/arch/powerpc/include/asm/eeh.h
> > @@ -220,6 +220,7 @@ struct eeh_ops {
> >
> >  extern int eeh_subsystem_flags;
> >  extern u32 eeh_max_freezes;
> > +extern bool eeh_debugfs_no_recover;
> >  extern struct eeh_ops *eeh_ops;
> >  extern raw_spinlock_t confirm_error_lock;
> >
> > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> > index 82d22c671c0e..9f20099ce2d9 100644
> > --- a/arch/powerpc/kernel/eeh.c
> > +++ b/arch/powerpc/kernel/eeh.c
> > @@ -111,6 +111,13 @@ EXPORT_SYMBOL(eeh_subsystem_flags);
> >   */
> >  u32 eeh_max_freezes = 5;
> >
> > +/*
> > + * Controls whether a recovery event should be scheduled when an
> > + * isolated device is discovered. This is only really useful for
> > + * debugging problems with the EEH core.
> > + */
> > +bool eeh_debugfs_no_recover;
> > +
> >  /* Platform dependent EEH operations */
> >  struct eeh_ops *eeh_ops = NULL;
> >
> > @@ -1810,6 +1817,9 @@ static int __init eeh_init_proc(void)
> >                                          &eeh_enable_dbgfs_ops);
> >               debugfs_create_u32("eeh_max_freezes", 0600,
> >                               powerpc_debugfs_root, &eeh_max_freezes);
> > +             debugfs_create_bool("eeh_disable_recovery", 0600,
> > +                             powerpc_debugfs_root,
> > +                             &eeh_debugfs_no_recover);
> >               eeh_cache_debugfs_init();
> >  #endif
> >       }
> > diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
> > index 227e57f980df..19837798bb1d 100644
> > --- a/arch/powerpc/kernel/eeh_event.c
> > +++ b/arch/powerpc/kernel/eeh_event.c
> > @@ -126,6 +126,15 @@ int eeh_send_failure_event(struct eeh_pe *pe)
> >       unsigned long flags;
> >       struct eeh_event *event;
> >
> > +     /*
> > +      * If we've manually supressed recovery events via debugfs
> > +      * then just drop it on the floor.
> > +      */
> > +     if (eeh_debugfs_no_recover) {
> > +             pr_err("EEH: Event dropped due to no_recover setting\n");
> > +             return 0;
> > +     }
> > +
>
> I think it might be clearer if you did the 'no recovery' test at the
> call sites (I think there are only a few), instead of inside the
> function (and then the next patch wouldn't need to add a wrapper).

I don't think adding boilerplate at all the call sites is an
improvement. It's just a recipe for adding bugs.

>
> >       event = kzalloc(sizeof(*event), GFP_ATOMIC);
> >       if (!event) {
> >               pr_err("EEH: out of memory, event not handled\n");
> > --
> > 2.20.1
> >
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 478f199d5663..810e05273ad3 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -220,6 +220,7 @@  struct eeh_ops {
 
 extern int eeh_subsystem_flags;
 extern u32 eeh_max_freezes;
+extern bool eeh_debugfs_no_recover;
 extern struct eeh_ops *eeh_ops;
 extern raw_spinlock_t confirm_error_lock;
 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 82d22c671c0e..9f20099ce2d9 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -111,6 +111,13 @@  EXPORT_SYMBOL(eeh_subsystem_flags);
  */
 u32 eeh_max_freezes = 5;
 
+/*
+ * Controls whether a recovery event should be scheduled when an
+ * isolated device is discovered. This is only really useful for
+ * debugging problems with the EEH core.
+ */
+bool eeh_debugfs_no_recover;
+
 /* Platform dependent EEH operations */
 struct eeh_ops *eeh_ops = NULL;
 
@@ -1810,6 +1817,9 @@  static int __init eeh_init_proc(void)
 					   &eeh_enable_dbgfs_ops);
 		debugfs_create_u32("eeh_max_freezes", 0600,
 				powerpc_debugfs_root, &eeh_max_freezes);
+		debugfs_create_bool("eeh_disable_recovery", 0600,
+				powerpc_debugfs_root,
+				&eeh_debugfs_no_recover);
 		eeh_cache_debugfs_init();
 #endif
 	}
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 227e57f980df..19837798bb1d 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -126,6 +126,15 @@  int eeh_send_failure_event(struct eeh_pe *pe)
 	unsigned long flags;
 	struct eeh_event *event;
 
+	/*
+	 * If we've manually supressed recovery events via debugfs
+	 * then just drop it on the floor.
+	 */
+	if (eeh_debugfs_no_recover) {
+		pr_err("EEH: Event dropped due to no_recover setting\n");
+		return 0;
+	}
+
 	event = kzalloc(sizeof(*event), GFP_ATOMIC);
 	if (!event) {
 		pr_err("EEH: out of memory, event not handled\n");