[v3,1/2] core/lock: Add deadlock detection

Message ID 20170707015244.9260-1-matthew.brown.dev@gmail.com
State Accepted
Headers show

Commit Message

Matt Brown July 7, 2017, 1:52 a.m.
This adds simple deadlock detection. The detection looks for circular
dependencies in the lock requests. It will abort and display a stack trace
when a deadlock occurs.
The detection is enabled by DEBUG_LOCKS (enabled by default).
While the detection may have a slight performance overhead, as there are
not a huge number of locks in skiboot this overhead isn't significant.

Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com>
---
Changelog:
v3: 
	- added mutual exclusion for deadlock check
	- updated commit message
v2:
	- changed global lock table to be field in the respective 
	  cpu_thread struct
	- tidied the lock function
	
---
 core/cpu.c    |  1 +
 core/lock.c   | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/cpu.h |  5 ++++
 3 files changed, 86 insertions(+)

Comments

Stewart Smith March 7, 2018, 7:04 a.m. | #1
Matt Brown <matthew.brown.dev@gmail.com> writes:
> This adds simple deadlock detection. The detection looks for circular
> dependencies in the lock requests. It will abort and display a stack trace
> when a deadlock occurs.
> The detection is enabled by DEBUG_LOCKS (enabled by default).
> While the detection may have a slight performance overhead, as there are
> not a huge number of locks in skiboot this overhead isn't significant.
>
> Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com>

In my quest to go through the backlog of patches to skiboot, I *finally*
got to these lock-hardening patches.

I'm understandably slightly concerned about adding in overhead to the
taking of locks... but I'm not so concerned as to go and immediately put
this all behind another #ifdef and limit to debug builds (at least yet).

So, I've taken the series into master as of 84186ef0944c and let's see
how it goes! Worst case we may hide behind a DEBUG build if we notice
them having a too large impact.

Thanks heaps for this and apologies for taking so long to get this far
down the todo list!

Patch

diff --git a/core/cpu.c b/core/cpu.c
index 75c7008..8498011 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -855,6 +855,7 @@  void init_all_cpus(void)
 		t->node = cpu;
 		t->chip_id = chip_id;
 		t->icp_regs = NULL; /* Will be set later */
+		t->requested_lock = NULL;
 		t->core_hmi_state = 0;
 		t->core_hmi_state_ptr = &t->core_hmi_state;
 		t->thread_mask = 1;
diff --git a/core/lock.c b/core/lock.c
index 0868f2b..a9d1894 100644
--- a/core/lock.c
+++ b/core/lock.c
@@ -28,6 +28,7 @@ 
 bool bust_locks = true;
 
 #ifdef DEBUG_LOCKS
+static struct lock dl_lock = LOCK_UNLOCKED;
 
 static void lock_error(struct lock *l, const char *reason, uint16_t err)
 {
@@ -61,9 +62,81 @@  static void unlock_check(struct lock *l)
 		lock_error(l, "Releasing lock with 0 depth", 4);
 }
 
+/* Find circular dependencies in the lock requests. */
+static bool check_deadlock(void)
+{
+	uint32_t lock_owner, start, i;
+	struct cpu_thread *next_cpu;
+	struct lock *next;
+
+	next  = this_cpu()->requested_lock;
+	start = this_cpu()->pir;
+	i = 0;
+
+	while (i < cpu_max_pir) {
+
+		if (!next)
+			return false;
+
+		if (!(next->lock_val & 1) || next->in_con_path)
+			return false;
+
+		lock_owner = next->lock_val >> 32;
+
+		if (lock_owner == start)
+			return true;
+
+		next_cpu = find_cpu_by_pir(lock_owner);
+
+		if (!next_cpu)
+			return false;
+
+		next = next_cpu->requested_lock;
+		i++;
+	}
+
+	return false;
+}
+
+static void add_lock_request(struct lock *l)
+{
+	struct cpu_thread *curr = this_cpu();
+
+	if (curr->state != cpu_state_active &&
+	    curr->state != cpu_state_os)
+		return;
+
+	/*
+	 * For deadlock detection we must keep the lock states constant
+	 * while doing the deadlock check.
+	 */
+	for (;;) {
+		if (try_lock(&dl_lock))
+			break;
+		smt_lowest();
+		while (dl_lock.lock_val)
+			barrier();
+		smt_medium();
+	}
+
+	curr->requested_lock = l;
+
+	if (check_deadlock())
+		lock_error(l, "Deadlock detected", 0);
+
+	unlock(&dl_lock);
+}
+
+static void remove_lock_request(void)
+{
+	this_cpu()->requested_lock = NULL;
+}
+
 #else
 static inline void lock_check(struct lock *l) { };
 static inline void unlock_check(struct lock *l) { };
+static inline void add_lock_request(struct lock *l) { };
+static inline void remove_lock_request(void) { };
 #endif /* DEBUG_LOCKS */
 
 bool lock_held_by_me(struct lock *l)
@@ -90,6 +163,11 @@  void lock(struct lock *l)
 		return;
 
 	lock_check(l);
+
+	if (try_lock(l))
+		return;
+	add_lock_request(l);
+
 	for (;;) {
 		if (try_lock(l))
 			break;
@@ -98,6 +176,8 @@  void lock(struct lock *l)
 			barrier();
 		smt_medium();
 	}
+
+	remove_lock_request();
 }
 
 void unlock(struct lock *l)
diff --git a/include/cpu.h b/include/cpu.h
index fd3acf7..ab4e4fe 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -95,6 +95,11 @@  struct cpu_thread {
 
 	/* For use by XICS emulation on XIVE */
 	struct xive_cpu_state		*xstate;
+
+#ifdef DEBUG_LOCKS
+	/* The lock requested by this cpu, used for deadlock detection */
+	struct lock			*requested_lock;
+#endif
 };
 
 /* This global is set to 1 to allow secondaries to callin,