diff mbox

[RESEND-RFC,v2,1/3] powerpc/eeh: Refactor eeh_pe_update_time_stamp() to update freeze_count

Message ID 20170301112452.15798-2-vaibhav@linux.vnet.ibm.com (mailing list archive)
State Changes Requested
Headers show

Commit Message

Vaibhav Jain March 1, 2017, 11:24 a.m. UTC
This patch introduces a new function eeh_pe_update_freeze_counter()
replacing existing function eeh_pe_update_time_stamp(). The new
function also manages the value of reeze_count along with tstamp to
track the number of times the PE roze in last one hour and if the
freeze_count > eeh_max_freezes then eports an error(-ENOTRECOVERABLE)
to indicate that the PE should be ermanently disabled.

This patch should not introduce any behavioral change.

Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
---
Change-log:

v1 -> v2
Changes as suggested by Russell Currey:
- Suffixed function names with '()'
- Dropped '<0' conditional check inside eeh_handle_normal_event()
- Rephrased the comment for function eeh_pe_update_freeze_counter()
- Brace-wrapped a single line statement at end of
  eeh_pe_update_freeze_counter()
---
 arch/powerpc/include/asm/eeh.h   |  2 +-
 arch/powerpc/kernel/eeh_driver.c | 20 +++--------------
 arch/powerpc/kernel/eeh_pe.c     | 47 +++++++++++++++++++++++++++-------------
 3 files changed, 36 insertions(+), 33 deletions(-)

Comments

Guilherme G. Piccoli March 1, 2017, 2:58 p.m. UTC | #1
Hi Vaibhav, nice patch! Some comments below:

On 03/01/2017 08:24 AM, Vaibhav Jain wrote:
> This patch introduces a new function eeh_pe_update_freeze_counter()
> replacing existing function eeh_pe_update_time_stamp(). The new
> function also manages the value of reeze_count along with tstamp to
> track the number of times the PE roze in last one hour and if the
> freeze_count > eeh_max_freezes then eports an error(-ENOTRECOVERABLE)
> to indicate that the PE should be ermanently disabled.

Not sure why, but many of the words in commit message are missing their
first letter. See, for example:
reeze_count,  roze,  eports,  ermanently


> 
> This patch should not introduce any behavioral change.
> 
> Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
> ---
> Change-log:
> 
> v1 -> v2
> Changes as suggested by Russell Currey:
> - Suffixed function names with '()'
> - Dropped '<0' conditional check inside eeh_handle_normal_event()
> - Rephrased the comment for function eeh_pe_update_freeze_counter()
> - Brace-wrapped a single line statement at end of
>   eeh_pe_update_freeze_counter()
> ---
>  arch/powerpc/include/asm/eeh.h   |  2 +-
>  arch/powerpc/kernel/eeh_driver.c | 20 +++--------------
>  arch/powerpc/kernel/eeh_pe.c     | 47 +++++++++++++++++++++++++++-------------
>  3 files changed, 36 insertions(+), 33 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index 8e37b71..68806be 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -265,7 +265,7 @@ struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
>  struct eeh_pe *eeh_pe_get(struct eeh_dev *edev);
>  int eeh_add_to_parent_pe(struct eeh_dev *edev);
>  int eeh_rmv_from_parent_pe(struct eeh_dev *edev);
> -void eeh_pe_update_time_stamp(struct eeh_pe *pe);
> +int eeh_pe_update_freeze_counter(struct eeh_pe *pe);
>  void *eeh_pe_traverse(struct eeh_pe *root,
>  		eeh_traverse_func fn, void *flag);
>  void *eeh_pe_dev_traverse(struct eeh_pe *root,
> diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
> index b948871..8a088ea 100644
> --- a/arch/powerpc/kernel/eeh_driver.c
> +++ b/arch/powerpc/kernel/eeh_driver.c
> @@ -739,10 +739,9 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
>  		return;
>  	}
> 
> -	eeh_pe_update_time_stamp(pe);
> -	pe->freeze_count++;
> -	if (pe->freeze_count > eeh_max_freezes)
> -		goto excess_failures;
> +	/* Update freeze counters and see if we have tripped max-freeze limit */
> +	if (eeh_pe_update_freeze_counter(pe))
> +		goto perm_error;
>  	pr_warn("EEH: This PCI device has failed %d times in the last hour\n",
>  		pe->freeze_count);
> 
> @@ -872,19 +871,6 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
> 
>  	return;
> 
> -excess_failures:
> -	/*
> -	 * About 90% of all real-life EEH failures in the field
> -	 * are due to poorly seated PCI cards. Only 10% or so are
> -	 * due to actual, failed cards.
> -	 */
> -	pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
> -	       "last hour and has been permanently disabled.\n"
> -	       "Please try reseating or replacing it.\n",
> -		pe->phb->global_number, pe->addr,
> -		pe->freeze_count);
> -	goto perm_error;
> -
>  hard_fail:
>  	pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
>  	       "Please try reseating or replacing it\n",
> diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
> index cc4b206..d367c16 100644
> --- a/arch/powerpc/kernel/eeh_pe.c
> +++ b/arch/powerpc/kernel/eeh_pe.c
> @@ -504,30 +504,47 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
>  }
> 
>  /**
> - * eeh_pe_update_time_stamp - Update PE's frozen time stamp
> + * eeh_pe_update_freeze_counter - Update PE's frozen time stamp
> + * and freeze counter
>   * @pe: EEH PE
>   *
> - * We have time stamp for each PE to trace its time of getting
> - * frozen in last hour. The function should be called to update
> - * the time stamp on first error of the specific PE. On the other
> - * handle, we needn't account for errors happened in last hour.

s/handle/hand? "On the other hand..."

Thanks,


Guilherme


> + * We have a freeze counter and time stamp for each PE to trace
> + * number of times the PE was frozen in the last hour. This function
> + * updates the PE's freeze counter and returns an error if its greater
> + * than eeh_max_freezes. The function should be called to once every
> + * time a specific PE freezes.
>   */
> -void eeh_pe_update_time_stamp(struct eeh_pe *pe)
> +int eeh_pe_update_freeze_counter(struct eeh_pe *pe)
>  {
>  	struct timeval tstamp;
> 
> -	if (!pe) return;
> +	if (!pe)
> +		return -EINVAL;
> +
> +	do_gettimeofday(&tstamp);
> +	if (pe->freeze_count <= 0 || tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
> +		pe->tstamp = tstamp;
> +		pe->freeze_count = 1;
> +
> +	} else if (pe->freeze_count >= eeh_max_freezes) {
> +		pe->freeze_count++;
> +		/*
> +		 * About 90% of all real-life EEH failures in the field
> +		 * are due to poorly seated PCI cards. Only 10% or so are
> +		 * due to actual, failed cards.
> +		 */
> +		pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
> +		       "last hour and has been permanently disabled.\n"
> +		       "Please try reseating or replacing it.\n",
> +		       pe->phb->global_number, pe->addr,
> +		       pe->freeze_count);
> +		return -ENOTRECOVERABLE;
> 
> -	if (pe->freeze_count <= 0) {
> -		pe->freeze_count = 0;
> -		do_gettimeofday(&pe->tstamp);
>  	} else {
> -		do_gettimeofday(&tstamp);
> -		if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
> -			pe->tstamp = tstamp;
> -			pe->freeze_count = 0;
> -		}
> +		pe->freeze_count++;
>  	}
> +
> +	return 0;
>  }
> 
>  /**
>
Vaibhav Jain March 1, 2017, 4:26 p.m. UTC | #2
Thanks for reviewing the patch !!

"Guilherme G. Piccoli" <gpiccoli@linux.vnet.ibm.com> writes:
>
> Not sure why, but many of the words in commit message are missing their
> first letter. See, for example:
> reeze_count,  roze,  eports,  ermanently
Thanks for pointing this out. Will fix this in the subsequent patch
revision.

>>  /**
>> - * eeh_pe_update_time_stamp - Update PE's frozen time stamp
>> + * eeh_pe_update_freeze_counter - Update PE's frozen time stamp
>> + * and freeze counter
>>   * @pe: EEH PE
>>   *
>> - * We have time stamp for each PE to trace its time of getting
>> - * frozen in last hour. The function should be called to update
>> - * the time stamp on first error of the specific PE. On the other
>> - * handle, we needn't account for errors happened in last hour.
>
> s/handle/hand? "On the other hand..."
Thats part of original comment which this patch is replacing.
Guilherme G. Piccoli March 1, 2017, 5:39 p.m. UTC | #3
On 03/01/2017 01:26 PM, Vaibhav Jain wrote:
> 
> Thanks for reviewing the patch !!

Yw =)


> 
> "Guilherme G. Piccoli" <gpiccoli@linux.vnet.ibm.com> writes:
>>
>> Not sure why, but many of the words in commit message are missing their
>> first letter. See, for example:
>> reeze_count,  roze,  eports,  ermanently
> Thanks for pointing this out. Will fix this in the subsequent patch
> revision.
> 
>>>  /**
>>> - * eeh_pe_update_time_stamp - Update PE's frozen time stamp
>>> + * eeh_pe_update_freeze_counter - Update PE's frozen time stamp
>>> + * and freeze counter
>>>   * @pe: EEH PE
>>>   *
>>> - * We have time stamp for each PE to trace its time of getting
>>> - * frozen in last hour. The function should be called to update
>>> - * the time stamp on first error of the specific PE. On the other
>>> - * handle, we needn't account for errors happened in last hour.
>>
>> s/handle/hand? "On the other hand..."
> Thats part of original comment which this patch is replacing.

.....hehehe
How didn't I notice?
Sorry, my bad!
Russell Currey March 1, 2017, 11:52 p.m. UTC | #4
On Wed, 2017-03-01 at 21:56 +0530, Vaibhav Jain wrote:
> Thanks for reviewing the patch !!
> 
> "Guilherme G. Piccoli" <gpiccoli@linux.vnet.ibm.com> writes:
> > 
> > Not sure why, but many of the words in commit message are missing their
> > first letter. See, for example:
> > reeze_count,  roze,  eports,  ermanently
> 
> Thanks for pointing this out. Will fix this in the subsequent patch
> revision.
> 
> > >  /**
> > > - * eeh_pe_update_time_stamp - Update PE's frozen time stamp
> > > + * eeh_pe_update_freeze_counter - Update PE's frozen time stamp
> > > + * and freeze counter
> > >   * @pe: EEH PE
> > >   *
> > > - * We have time stamp for each PE to trace its time of getting
> > > - * frozen in last hour. The function should be called to update
> > > - * the time stamp on first error of the specific PE. On the other
> > > - * handle, we needn't account for errors happened in last hour.
> > 
> > s/handle/hand? "On the other hand..."
> 
> Thats part of original comment which this patch is replacing.

It's still worth fixing. Maybe "On the other hand, we don't need to account for
errors that happened in the last hour."

I really need to get around to fixing up a lot of these old comments...
Andrew Donnellan March 2, 2017, 1:03 a.m. UTC | #5
On 01/03/17 22:24, Vaibhav Jain wrote:
> This patch introduces a new function eeh_pe_update_freeze_counter()
> replacing existing function eeh_pe_update_time_stamp(). The new
> function also manages the value of reeze_count along with tstamp to
> track the number of times the PE roze in last one hour and if the
> freeze_count > eeh_max_freezes then eports an error(-ENOTRECOVERABLE)
> to indicate that the PE should be ermanently disabled.
>
> This patch should not introduce any behavioral change.
>
> Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>

Thanks for addressing Russell's comments. Per Guilherme, your commit 
message is missing a few letters, a couple of minor style points below, 
otherwise:

Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

>  /**
> - * eeh_pe_update_time_stamp - Update PE's frozen time stamp
> + * eeh_pe_update_freeze_counter - Update PE's frozen time stamp
> + * and freeze counter
>   * @pe: EEH PE
>   *
> - * We have time stamp for each PE to trace its time of getting
> - * frozen in last hour. The function should be called to update
> - * the time stamp on first error of the specific PE. On the other
> - * handle, we needn't account for errors happened in last hour.
> + * We have a freeze counter and time stamp for each PE to trace
> + * number of times the PE was frozen in the last hour. This function
> + * updates the PE's freeze counter and returns an error if its greater

it's

> + * than eeh_max_freezes. The function should be called to once every
> + * time a specific PE freezes.

"The function should be called every time the PE freezes"
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 8e37b71..68806be 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -265,7 +265,7 @@  struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
 struct eeh_pe *eeh_pe_get(struct eeh_dev *edev);
 int eeh_add_to_parent_pe(struct eeh_dev *edev);
 int eeh_rmv_from_parent_pe(struct eeh_dev *edev);
-void eeh_pe_update_time_stamp(struct eeh_pe *pe);
+int eeh_pe_update_freeze_counter(struct eeh_pe *pe);
 void *eeh_pe_traverse(struct eeh_pe *root,
 		eeh_traverse_func fn, void *flag);
 void *eeh_pe_dev_traverse(struct eeh_pe *root,
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index b948871..8a088ea 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -739,10 +739,9 @@  static void eeh_handle_normal_event(struct eeh_pe *pe)
 		return;
 	}
 
-	eeh_pe_update_time_stamp(pe);
-	pe->freeze_count++;
-	if (pe->freeze_count > eeh_max_freezes)
-		goto excess_failures;
+	/* Update freeze counters and see if we have tripped max-freeze limit */
+	if (eeh_pe_update_freeze_counter(pe))
+		goto perm_error;
 	pr_warn("EEH: This PCI device has failed %d times in the last hour\n",
 		pe->freeze_count);
 
@@ -872,19 +871,6 @@  static void eeh_handle_normal_event(struct eeh_pe *pe)
 
 	return;
 
-excess_failures:
-	/*
-	 * About 90% of all real-life EEH failures in the field
-	 * are due to poorly seated PCI cards. Only 10% or so are
-	 * due to actual, failed cards.
-	 */
-	pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
-	       "last hour and has been permanently disabled.\n"
-	       "Please try reseating or replacing it.\n",
-		pe->phb->global_number, pe->addr,
-		pe->freeze_count);
-	goto perm_error;
-
 hard_fail:
 	pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
 	       "Please try reseating or replacing it\n",
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index cc4b206..d367c16 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -504,30 +504,47 @@  int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 }
 
 /**
- * eeh_pe_update_time_stamp - Update PE's frozen time stamp
+ * eeh_pe_update_freeze_counter - Update PE's frozen time stamp
+ * and freeze counter
  * @pe: EEH PE
  *
- * We have time stamp for each PE to trace its time of getting
- * frozen in last hour. The function should be called to update
- * the time stamp on first error of the specific PE. On the other
- * handle, we needn't account for errors happened in last hour.
+ * We have a freeze counter and time stamp for each PE to trace
+ * number of times the PE was frozen in the last hour. This function
+ * updates the PE's freeze counter and returns an error if its greater
+ * than eeh_max_freezes. The function should be called to once every
+ * time a specific PE freezes.
  */
-void eeh_pe_update_time_stamp(struct eeh_pe *pe)
+int eeh_pe_update_freeze_counter(struct eeh_pe *pe)
 {
 	struct timeval tstamp;
 
-	if (!pe) return;
+	if (!pe)
+		return -EINVAL;
+
+	do_gettimeofday(&tstamp);
+	if (pe->freeze_count <= 0 || tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
+		pe->tstamp = tstamp;
+		pe->freeze_count = 1;
+
+	} else if (pe->freeze_count >= eeh_max_freezes) {
+		pe->freeze_count++;
+		/*
+		 * About 90% of all real-life EEH failures in the field
+		 * are due to poorly seated PCI cards. Only 10% or so are
+		 * due to actual, failed cards.
+		 */
+		pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
+		       "last hour and has been permanently disabled.\n"
+		       "Please try reseating or replacing it.\n",
+		       pe->phb->global_number, pe->addr,
+		       pe->freeze_count);
+		return -ENOTRECOVERABLE;
 
-	if (pe->freeze_count <= 0) {
-		pe->freeze_count = 0;
-		do_gettimeofday(&pe->tstamp);
 	} else {
-		do_gettimeofday(&tstamp);
-		if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
-			pe->tstamp = tstamp;
-			pe->freeze_count = 0;
-		}
+		pe->freeze_count++;
 	}
+
+	return 0;
 }
 
 /**