diff mbox series

[7/7] core/lock: Add debug options to store backtrace of where lock was taken

Message ID 20190318042900.32558-7-andrew.donnellan@au1.ibm.com
State Superseded
Headers show
Series [1/7] core/stack: Remove r1 argument from ___backtrace() | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch master (2ba5ce84a197ee61423355f443a3ff3eea185ff1)
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot fail Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot-dco success Signed-off-by present

Commit Message

Andrew Donnellan March 18, 2019, 4:29 a.m. UTC
Contrary to popular belief, skiboot developers are imperfect and
occasionally write locking bugs. When we exit skiboot, we check if we're
still holding any locks, and if so, we print an error with a list of the
locks currently held and the locations where they were taken.

However, this only tells us the location where lock() was called, which may
not be enough to work out what's going on. To give us more to go on with,
we can store backtrace data in the lock and print that out when we
unexpectedly still hold locks.

Because the backtrace data is rather big, we only enable this if
DEBUG_LOCKS_BACKTRACE is defined, which in turn is switched on when
DEBUG=1.

Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

---

RFC->v1:
- Switch this on when DEBUG=1 (Stewart)
- Rebase on top of some backtrace API changes
---
 core/lock.c      | 19 +++++++++++++++++--
 include/config.h |  7 +++++++
 include/lock.h   | 11 +++++++++++
 3 files changed, 35 insertions(+), 2 deletions(-)

Comments

Andrew Donnellan March 18, 2019, 4:43 a.m. UTC | #1
On 18/3/19 3:29 pm, Andrew Donnellan wrote:
> Contrary to popular belief, skiboot developers are imperfect and
> occasionally write locking bugs. When we exit skiboot, we check if we're
> still holding any locks, and if so, we print an error with a list of the
> locks currently held and the locations where they were taken.
> 
> However, this only tells us the location where lock() was called, which may
> not be enough to work out what's going on. To give us more to go on with,
> we can store backtrace data in the lock and print that out when we
> unexpectedly still hold locks.
> 
> Because the backtrace data is rather big, we only enable this if
> DEBUG_LOCKS_BACKTRACE is defined, which in turn is switched on when
> DEBUG=1.
> 
> Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

Apparently this causes a test failure in core/test/run-mem_region, which 
is interesting!

> 
> ---
> 
> RFC->v1:
> - Switch this on when DEBUG=1 (Stewart)
> - Rebase on top of some backtrace API changes
> ---
>   core/lock.c      | 19 +++++++++++++++++--
>   include/config.h |  7 +++++++
>   include/lock.h   | 11 +++++++++++
>   3 files changed, 35 insertions(+), 2 deletions(-)
> 
> diff --git a/core/lock.c b/core/lock.c
> index fbf0b21ed5c6..6fbb5eb9bf5e 100644
> --- a/core/lock.c
> +++ b/core/lock.c
> @@ -208,6 +208,12 @@ bool try_lock_caller(struct lock *l, const char *owner)
>   		cpu->con_suspend++;
>   	if (__try_lock(cpu, l)) {
>   		l->owner = owner;
> +
> +#ifdef DEBUG_LOCKS_BACKTRACE
> +		backtrace_create(l->bt_buf, LOCKS_BACKTRACE_MAX_ENTS,
> +				 &l->bt_metadata);
> +#endif
> +
>   		list_add(&cpu->locks_held, &l->list);
>   		return true;
>   	}
> @@ -312,8 +318,12 @@ void dump_locks_list(void)
>   	struct lock *l;
>   
>   	prlog(PR_ERR, "Locks held:\n");
> -	list_for_each(&this_cpu()->locks_held, l, list)
> +	list_for_each(&this_cpu()->locks_held, l, list) {
>   		prlog(PR_ERR, "  %s\n", l->owner);
> +#ifdef DEBUG_LOCKS_BACKTRACE
> +		backtrace_print(l->bt_buf, &l->bt_metadata, NULL, NULL, true);
> +#endif
> +	}
>   }
>   
>   void drop_my_locks(bool warn)
> @@ -322,8 +332,13 @@ void drop_my_locks(bool warn)
>   
>   	disable_fast_reboot("Lock corruption");
>   	while((l = list_top(&this_cpu()->locks_held, struct lock, list)) != NULL) {
> -		if (warn)
> +		if (warn) {
>   			prlog(PR_ERR, "  %s\n", l->owner);
> +#ifdef DEBUG_LOCKS_BACKTRACE
> +			backtrace_print(l->bt_buf, &l->bt_metadata, NULL, NULL,
> +					true);
> +#endif
> +		}
>   		unlock(l);
>   	}
>   }
> diff --git a/include/config.h b/include/config.h
> index 438f1b0f3373..7059e39ddb91 100644
> --- a/include/config.h
> +++ b/include/config.h
> @@ -39,6 +39,13 @@
>   /* Enable lock debugging */
>   #define DEBUG_LOCKS		1
>   
> +/* Enable printing of backtraces when locks not released */
> +#ifdef DEBUG
> +#define DEBUG_LOCKS_BACKTRACE  1
> +#else
> +//#define DEBUG_LOCKS_BACKTRACE	1
> +#endif
> +
>   /* Enable lock dependency checker */
>   #define DEADLOCK_CHECKER	1
>   
> diff --git a/include/lock.h b/include/lock.h
> index b18757394dbb..e08ec08fcdfb 100644
> --- a/include/lock.h
> +++ b/include/lock.h
> @@ -23,6 +23,12 @@
>   #include <ccan/list/list.h>
>   #include <ccan/str/str.h>
>   
> +#ifdef DEBUG_LOCKS_BACKTRACE
> +#include <stack.h>
> +
> +#define LOCKS_BACKTRACE_MAX_ENTS	60
> +#endif
> +
>   struct lock {
>   	/* Lock value has bit 63 as lock bit and the PIR of the owner
>   	 * in the top 32-bit
> @@ -38,6 +44,11 @@ struct lock {
>   	/* file/line of lock owner */
>   	const char *owner;
>   
> +#ifdef DEBUG_LOCKS_BACKTRACE
> +	struct bt_entry bt_buf[LOCKS_BACKTRACE_MAX_ENTS];
> +	struct bt_metadata bt_metadata;
> +#endif
> +
>   	/* linkage in per-cpu list of owned locks */
>   	struct list_node list;
>   };
>
Stewart Smith March 29, 2019, 4:23 a.m. UTC | #2
Andrew Donnellan <andrew.donnellan@au1.ibm.com> writes:
> On 18/3/19 3:29 pm, Andrew Donnellan wrote:
>> Contrary to popular belief, skiboot developers are imperfect and
>> occasionally write locking bugs. When we exit skiboot, we check if we're
>> still holding any locks, and if so, we print an error with a list of the
>> locks currently held and the locations where they were taken.
>> 
>> However, this only tells us the location where lock() was called, which may
>> not be enough to work out what's going on. To give us more to go on with,
>> we can store backtrace data in the lock and print that out when we
>> unexpectedly still hold locks.
>> 
>> Because the backtrace data is rather big, we only enable this if
>> DEBUG_LOCKS_BACKTRACE is defined, which in turn is switched on when
>> DEBUG=1.
>> 
>> Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
>
> Apparently this causes a test failure in core/test/run-mem_region, which 
> is interesting!

It is! I bet it's something to do with size of the structures and the
hairy duck tape (or is it duct, I've watched whole youtube videos on this and
still can't remember) holding those tests together.

Anyway, I've taken patches 1-6 as of
b965b9de25d2270c4ba65b3dc31fd9649a8c922f as they're good cleanup
anyway. I think I went and ensured everything looks correct... I
possibly need to context switch less :)
diff mbox series

Patch

diff --git a/core/lock.c b/core/lock.c
index fbf0b21ed5c6..6fbb5eb9bf5e 100644
--- a/core/lock.c
+++ b/core/lock.c
@@ -208,6 +208,12 @@  bool try_lock_caller(struct lock *l, const char *owner)
 		cpu->con_suspend++;
 	if (__try_lock(cpu, l)) {
 		l->owner = owner;
+
+#ifdef DEBUG_LOCKS_BACKTRACE
+		backtrace_create(l->bt_buf, LOCKS_BACKTRACE_MAX_ENTS,
+				 &l->bt_metadata);
+#endif
+
 		list_add(&cpu->locks_held, &l->list);
 		return true;
 	}
@@ -312,8 +318,12 @@  void dump_locks_list(void)
 	struct lock *l;
 
 	prlog(PR_ERR, "Locks held:\n");
-	list_for_each(&this_cpu()->locks_held, l, list)
+	list_for_each(&this_cpu()->locks_held, l, list) {
 		prlog(PR_ERR, "  %s\n", l->owner);
+#ifdef DEBUG_LOCKS_BACKTRACE
+		backtrace_print(l->bt_buf, &l->bt_metadata, NULL, NULL, true);
+#endif
+	}
 }
 
 void drop_my_locks(bool warn)
@@ -322,8 +332,13 @@  void drop_my_locks(bool warn)
 
 	disable_fast_reboot("Lock corruption");
 	while((l = list_top(&this_cpu()->locks_held, struct lock, list)) != NULL) {
-		if (warn)
+		if (warn) {
 			prlog(PR_ERR, "  %s\n", l->owner);
+#ifdef DEBUG_LOCKS_BACKTRACE
+			backtrace_print(l->bt_buf, &l->bt_metadata, NULL, NULL,
+					true);
+#endif
+		}
 		unlock(l);
 	}
 }
diff --git a/include/config.h b/include/config.h
index 438f1b0f3373..7059e39ddb91 100644
--- a/include/config.h
+++ b/include/config.h
@@ -39,6 +39,13 @@ 
 /* Enable lock debugging */
 #define DEBUG_LOCKS		1
 
+/* Enable printing of backtraces when locks not released */
+#ifdef DEBUG
+#define DEBUG_LOCKS_BACKTRACE  1
+#else
+//#define DEBUG_LOCKS_BACKTRACE	1
+#endif
+
 /* Enable lock dependency checker */
 #define DEADLOCK_CHECKER	1
 
diff --git a/include/lock.h b/include/lock.h
index b18757394dbb..e08ec08fcdfb 100644
--- a/include/lock.h
+++ b/include/lock.h
@@ -23,6 +23,12 @@ 
 #include <ccan/list/list.h>
 #include <ccan/str/str.h>
 
+#ifdef DEBUG_LOCKS_BACKTRACE
+#include <stack.h>
+
+#define LOCKS_BACKTRACE_MAX_ENTS	60
+#endif
+
 struct lock {
 	/* Lock value has bit 63 as lock bit and the PIR of the owner
 	 * in the top 32-bit
@@ -38,6 +44,11 @@  struct lock {
 	/* file/line of lock owner */
 	const char *owner;
 
+#ifdef DEBUG_LOCKS_BACKTRACE
+	struct bt_entry bt_buf[LOCKS_BACKTRACE_MAX_ENTS];
+	struct bt_metadata bt_metadata;
+#endif
+
 	/* linkage in per-cpu list of owned locks */
 	struct list_node list;
 };