diff mbox series

[RFC] core/lock: Add debug options to store backtrace of where lock was taken

Message ID 20190221034859.14029-1-andrew.donnellan@au1.ibm.com
State RFC
Headers show
Series [RFC] core/lock: Add debug options to store backtrace of where lock was taken | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success master/apply_patch Successfully applied
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot fail Test snowpatch/job/snowpatch-skiboot on branch master

Commit Message

Andrew Donnellan Feb. 21, 2019, 3:48 a.m. UTC
Contrary to popular belief, skiboot developers are imperfect and
occasionally write locking bugs. When we exit skiboot, we check if we're
still holding any locks, and if so, we print an error with a list of the
locks currently held and the locations where they were taken.

However, this only tells us the location where lock() was called, which may
not be enough to work out what's going on. To give us more to go on with,
we can store backtrace data in the lock and print that out when we
unexpectedly still hold locks.

Because the backtrace data is rather big, we only enable this if
DEBUG_LOCKS_BACKTRACE is defined.

Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

---

RFC. IDK whether anyone else might find this useful, it helped me work out
what was going on when I was hitting a SIGNAL_SYSTEM_RESET and it was
complaining I was still holding a lock.

This uses the backtrace functions which have more leading underscores than
I'm comfortable with, and the printout is ugly.

I've hidden this behind a new config ifdef for now, so that I can switch it
on without activating all of DEBUG which occasionally causes other weird
problems elsewhere. Maybe not the best way?
---
 core/lock.c      | 25 +++++++++++++++++++++++--
 include/config.h |  3 +++
 include/lock.h   | 12 ++++++++++++
 3 files changed, 38 insertions(+), 2 deletions(-)

Comments

Stewart Smith Feb. 25, 2019, 6:11 a.m. UTC | #1
Andrew Donnellan <andrew.donnellan@au1.ibm.com> writes:
> Contrary to popular belief, skiboot developers are imperfect and

Say it isn't so!

> occasionally write locking bugs. When we exit skiboot, we check if we're
> still holding any locks, and if so, we print an error with a list of the
> locks currently held and the locations where they were taken.
>
> However, this only tells us the location where lock() was called, which may
> not be enough to work out what's going on. To give us more to go on with,
> we can store backtrace data in the lock and print that out when we
> unexpectedly still hold locks.
>
> Because the backtrace data is rather big, we only enable this if
> DEBUG_LOCKS_BACKTRACE is defined.

Okay, so, I like this as a feature. I'm tempted to say it should be
enabled always when we have DEBUG=1 yet have the option to *just* enable
it. Which I think is what you've done here?

> Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
>
> ---
>
> RFC. IDK whether anyone else might find this useful, it helped me work out
> what was going on when I was hitting a SIGNAL_SYSTEM_RESET and it was
> complaining I was still holding a lock.
>
> This uses the backtrace functions which have more leading underscores than
> I'm comfortable with, and the printout is ugly.

It's better than nothing, and whenever I've hit something like this I've
immediately gone and put in backtrace()s around the place to find wtf I
got wrong.

> I've hidden this behind a new config ifdef for now, so that I can switch it
> on without activating all of DEBUG which occasionally causes other weird
> problems elsewhere. Maybe not the best way?
> ---
>  core/lock.c      | 25 +++++++++++++++++++++++--
>  include/config.h |  3 +++
>  include/lock.h   | 12 ++++++++++++
>  3 files changed, 38 insertions(+), 2 deletions(-)
>
> diff --git a/core/lock.c b/core/lock.c
> index fbf0b21ed5c6..38545784e916 100644
> --- a/core/lock.c
> +++ b/core/lock.c
> @@ -208,6 +208,15 @@ bool try_lock_caller(struct lock *l, const char *owner)
>  		cpu->con_suspend++;
>  	if (__try_lock(cpu, l)) {
>  		l->owner = owner;
> +
> +#ifdef DEBUG_LOCKS_BACKTRACE
> +		l->bt_ents = 60;
> +		___backtrace(l->bt_buf, &l->bt_ents,
> +			     (unsigned long)__builtin_frame_address(0),
> +			     &l->bt_token, &l->bt_r1_caller);
> +		l->bt_pir = mfspr(SPR_PIR);
> +#endif
> +
>  		list_add(&cpu->locks_held, &l->list);
>  		return true;
>  	}
> @@ -312,8 +321,14 @@ void dump_locks_list(void)
>  	struct lock *l;
>  
>  	prlog(PR_ERR, "Locks held:\n");
> -	list_for_each(&this_cpu()->locks_held, l, list)
> +	list_for_each(&this_cpu()->locks_held, l, list) {
>  		prlog(PR_ERR, "  %s\n", l->owner);
> +#ifdef DEBUG_LOCKS_BACKTRACE
> +		___print_backtrace(l->bt_pir, l->bt_buf, l->bt_ents,
> +				   l->bt_token, l->bt_r1_caller,
> +				   NULL, NULL, true);
> +#endif
> +	}
>  }
>  
>  void drop_my_locks(bool warn)
> @@ -322,8 +337,14 @@ void drop_my_locks(bool warn)
>  
>  	disable_fast_reboot("Lock corruption");
>  	while((l = list_top(&this_cpu()->locks_held, struct lock, list)) != NULL) {
> -		if (warn)
> +		if (warn) {
>  			prlog(PR_ERR, "  %s\n", l->owner);
> +#ifdef DEBUG_LOCKS_BACKTRACE
> +			___print_backtrace(l->bt_pir, l->bt_buf, l->bt_ents,
> +					   l->bt_token, l->bt_r1_caller,
> +					   NULL, NULL, true);
> +#endif
> +		}
>  		unlock(l);
>  	}
>  }
> diff --git a/include/config.h b/include/config.h
> index 6b36590c39f0..c285d549ccab 100644
> --- a/include/config.h
> +++ b/include/config.h
> @@ -39,6 +39,9 @@
>  /* Enable lock debugging */
>  #define DEBUG_LOCKS		1
>  
> +/* Enable printing of backtraces when locks not released */
> +#define DEBUG_LOCKS_BACKTRACE  1

this is behind a #if for DEBUG=1, right?

> +
>  /* Enable lock dependency checker */
>  #define DEADLOCK_CHECKER	1
>  
> diff --git a/include/lock.h b/include/lock.h
> index b18757394dbb..b7d47be83ba9 100644
> --- a/include/lock.h
> +++ b/include/lock.h
> @@ -23,6 +23,10 @@
>  #include <ccan/list/list.h>
>  #include <ccan/str/str.h>
>  
> +#ifdef DEBUG_LOCKS_BACKTRACE
> +#include <stack.h>
> +#endif
> +
>  struct lock {
>  	/* Lock value has bit 63 as lock bit and the PIR of the owner
>  	 * in the top 32-bit
> @@ -38,6 +42,14 @@ struct lock {
>  	/* file/line of lock owner */
>  	const char *owner;
>  
> +#ifdef DEBUG_LOCKS_BACKTRACE
> +	struct bt_entry bt_buf[60];
> +	unsigned int bt_ents;
> +	unsigned long bt_token;
> +	unsigned long bt_r1_caller;
> +	unsigned long bt_pir;
> +#endif

almost feels like it should be a struct.

> +
>  	/* linkage in per-cpu list of owned locks */
>  	struct list_node list;
>  };
> -- 
> 2.11.0
>
> _______________________________________________
> Skiboot mailing list
> Skiboot@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot
>
Andrew Donnellan Feb. 25, 2019, 6:25 a.m. UTC | #2
On 25/2/19 5:11 pm, Stewart Smith wrote:
>> +/* Enable printing of backtraces when locks not released */
>> +#define DEBUG_LOCKS_BACKTRACE  1
> 
> this is behind a #if for DEBUG=1, right?

it is not, but I can fix that...

> 
>> +
>>   /* Enable lock dependency checker */
>>   #define DEADLOCK_CHECKER	1
>>   
>> diff --git a/include/lock.h b/include/lock.h
>> index b18757394dbb..b7d47be83ba9 100644
>> --- a/include/lock.h
>> +++ b/include/lock.h
>> @@ -23,6 +23,10 @@
>>   #include <ccan/list/list.h>
>>   #include <ccan/str/str.h>
>>   
>> +#ifdef DEBUG_LOCKS_BACKTRACE
>> +#include <stack.h>
>> +#endif
>> +
>>   struct lock {
>>   	/* Lock value has bit 63 as lock bit and the PIR of the owner
>>   	 * in the top 32-bit
>> @@ -38,6 +42,14 @@ struct lock {
>>   	/* file/line of lock owner */
>>   	const char *owner;
>>   
>> +#ifdef DEBUG_LOCKS_BACKTRACE
>> +	struct bt_entry bt_buf[60];
>> +	unsigned int bt_ents;
>> +	unsigned long bt_token;
>> +	unsigned long bt_r1_caller;
>> +	unsigned long bt_pir;
>> +#endif
> 
> almost feels like it should be a struct.

And if we made it a struct, that would make it easier to clean up the 
API and make stuff like this easier in future... hmm...

> 
>> +
>>   	/* linkage in per-cpu list of owned locks */
>>   	struct list_node list;
>>   };
>> -- 
>> 2.11.0
>>
>> _______________________________________________
>> Skiboot mailing list
>> Skiboot@lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/skiboot
>>
>
Andrew Donnellan Feb. 26, 2019, 3:45 a.m. UTC | #3
On 25/2/19 5:25 pm, Andrew Donnellan wrote:
>> this is behind a #if for DEBUG=1, right?
> 
> it is not, but I can fix that...
> 
>>> +#ifdef DEBUG_LOCKS_BACKTRACE
>>> +    struct bt_entry bt_buf[60];
>>> +    unsigned int bt_ents;
>>> +    unsigned long bt_token;
>>> +    unsigned long bt_r1_caller;
>>> +    unsigned long bt_pir;
>>> +#endif
>>
>> almost feels like it should be a struct.
> 
> And if we made it a struct, that would make it easier to clean up the 
> API and make stuff like this easier in future... hmm...

I'll send another version which addresses both these points and does 
some refactoring around the backtrace API.
diff mbox series

Patch

diff --git a/core/lock.c b/core/lock.c
index fbf0b21ed5c6..38545784e916 100644
--- a/core/lock.c
+++ b/core/lock.c
@@ -208,6 +208,15 @@  bool try_lock_caller(struct lock *l, const char *owner)
 		cpu->con_suspend++;
 	if (__try_lock(cpu, l)) {
 		l->owner = owner;
+
+#ifdef DEBUG_LOCKS_BACKTRACE
+		l->bt_ents = 60;
+		___backtrace(l->bt_buf, &l->bt_ents,
+			     (unsigned long)__builtin_frame_address(0),
+			     &l->bt_token, &l->bt_r1_caller);
+		l->bt_pir = mfspr(SPR_PIR);
+#endif
+
 		list_add(&cpu->locks_held, &l->list);
 		return true;
 	}
@@ -312,8 +321,14 @@  void dump_locks_list(void)
 	struct lock *l;
 
 	prlog(PR_ERR, "Locks held:\n");
-	list_for_each(&this_cpu()->locks_held, l, list)
+	list_for_each(&this_cpu()->locks_held, l, list) {
 		prlog(PR_ERR, "  %s\n", l->owner);
+#ifdef DEBUG_LOCKS_BACKTRACE
+		___print_backtrace(l->bt_pir, l->bt_buf, l->bt_ents,
+				   l->bt_token, l->bt_r1_caller,
+				   NULL, NULL, true);
+#endif
+	}
 }
 
 void drop_my_locks(bool warn)
@@ -322,8 +337,14 @@  void drop_my_locks(bool warn)
 
 	disable_fast_reboot("Lock corruption");
 	while((l = list_top(&this_cpu()->locks_held, struct lock, list)) != NULL) {
-		if (warn)
+		if (warn) {
 			prlog(PR_ERR, "  %s\n", l->owner);
+#ifdef DEBUG_LOCKS_BACKTRACE
+			___print_backtrace(l->bt_pir, l->bt_buf, l->bt_ents,
+					   l->bt_token, l->bt_r1_caller,
+					   NULL, NULL, true);
+#endif
+		}
 		unlock(l);
 	}
 }
diff --git a/include/config.h b/include/config.h
index 6b36590c39f0..c285d549ccab 100644
--- a/include/config.h
+++ b/include/config.h
@@ -39,6 +39,9 @@ 
 /* Enable lock debugging */
 #define DEBUG_LOCKS		1
 
+/* Enable printing of backtraces when locks not released */
+#define DEBUG_LOCKS_BACKTRACE  1
+
 /* Enable lock dependency checker */
 #define DEADLOCK_CHECKER	1
 
diff --git a/include/lock.h b/include/lock.h
index b18757394dbb..b7d47be83ba9 100644
--- a/include/lock.h
+++ b/include/lock.h
@@ -23,6 +23,10 @@ 
 #include <ccan/list/list.h>
 #include <ccan/str/str.h>
 
+#ifdef DEBUG_LOCKS_BACKTRACE
+#include <stack.h>
+#endif
+
 struct lock {
 	/* Lock value has bit 63 as lock bit and the PIR of the owner
 	 * in the top 32-bit
@@ -38,6 +42,14 @@  struct lock {
 	/* file/line of lock owner */
 	const char *owner;
 
+#ifdef DEBUG_LOCKS_BACKTRACE
+	struct bt_entry bt_buf[60];
+	unsigned int bt_ents;
+	unsigned long bt_token;
+	unsigned long bt_r1_caller;
+	unsigned long bt_pir;
+#endif
+
 	/* linkage in per-cpu list of owned locks */
 	struct list_node list;
 };