
[1/2] UBUNTU: SAUCE: Update spinlock handling code

Message ID 1340618133-10755-2-git-send-email-stefan.bader@canonical.com
State New

Commit Message

Stefan Bader June 25, 2012, 9:55 a.m. UTC
The current spinlock handling code seems to have a rare race
that causes complete guest hangs: all VCPUs end up waiting for
a spinlock wakeup IRQ which apparently never arrives.

The newer code de facto disables the use of ticket spinlocks
and host-assisted waiting when compiled to be compatible with
Xen hosts older than 3.2 (those are claimed to have a race in
the hypervisor call).

Should we at some point compile only for Xen 3.2 and newer
(which requires being sure EC2 no longer has any older hosts),
then the new code will (see the simplified sketch below):

- use an event channel for the host-assisted wakeup instead
  of an IPI. As a result, xen_.*_irq_pending and xen_poll_irq
  defined in evtchn.c become useless and are therefore removed
  (the spinlock code was their only user).
- the waiting code gets more complicated, as it tries to
  handle the special case of VCPUs getting interrupted while
  already spinning on another lock.
- the kicker will also wake up a VCPU if it was interrupted
  while spinning on a lock that became unlocked.
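
Roughly, the new wait/kick flow then looks like this (a
simplified sketch only; the actual code in
drivers/xen/core/spinlock.c below additionally handles nested
spinning and interrupt re-enabling):

    /* waiter (xen_spin_wait): block on a per-CPU poll event channel */
    clear_evtchn(percpu_read(poll_evtchn));
    if (lock->cur == spinning.ticket)   /* lock became free meanwhile */
            return true;
    if (HYPERVISOR_poll_no_timeout(&__get_cpu_var(poll_evtchn), 1))
            BUG();

    /* unlocker (xen_spin_kick): wake the VCPU holding the next ticket */
    notify_remote_via_evtchn(per_cpu(poll_evtchn, cpu));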

BugLink: http://bugs.launchpad.net/bugs/929941

(backported from a newer SUSE patchset)
Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
---
 arch/x86/include/asm/hardirq.h                 |    4 +
 arch/x86/include/mach-xen/asm/hypervisor.h     |    2 +
 arch/x86/include/mach-xen/asm/irq_vectors.h    |    5 +-
 arch/x86/include/mach-xen/asm/spinlock.h       |  209 +++++++++++++------
 arch/x86/include/mach-xen/asm/spinlock_types.h |   60 ++++++
 arch/x86/kernel/irq-xen.c                      |    5 +
 arch/x86/kernel/time-xen.c                     |   29 ++-
 drivers/xen/core/evtchn.c                      |   37 ----
 drivers/xen/core/spinlock.c                    |  265 ++++++++++++++++++------
 include/xen/events.h                           |    9 -
 10 files changed, 436 insertions(+), 189 deletions(-)
 create mode 100644 arch/x86/include/mach-xen/asm/spinlock_types.h

Patch

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 82e3e8f..e85d015 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -18,7 +18,11 @@  typedef struct {
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
+#ifndef CONFIG_XEN
 	unsigned int irq_tlb_count;
+#else
+	unsigned int irq_lock_count;
+#endif
 #endif
 #ifdef CONFIG_X86_MCE
 	unsigned int irq_thermal_count;
diff --git a/arch/x86/include/mach-xen/asm/hypervisor.h b/arch/x86/include/mach-xen/asm/hypervisor.h
index edd1b82..fb728c6 100644
--- a/arch/x86/include/mach-xen/asm/hypervisor.h
+++ b/arch/x86/include/mach-xen/asm/hypervisor.h
@@ -86,6 +86,8 @@  extern start_info_t *xen_start_info;
 #define init_hypervisor(c) ((void)((c)->x86_hyper_vendor = X86_HYPER_VENDOR_XEN))
 #define init_hypervisor_platform() init_hypervisor(&boot_cpu_data)
 
+struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu);
+
 /* arch/xen/kernel/evtchn.c */
 /* Force a proper event-channel callback from Xen. */
 void force_evtchn_callback(void);
diff --git a/arch/x86/include/mach-xen/asm/irq_vectors.h b/arch/x86/include/mach-xen/asm/irq_vectors.h
index 8d696f5..8e18c9a 100644
--- a/arch/x86/include/mach-xen/asm/irq_vectors.h
+++ b/arch/x86/include/mach-xen/asm/irq_vectors.h
@@ -13,9 +13,8 @@ 
 #define RESCHEDULE_VECTOR		0
 #define CALL_FUNCTION_VECTOR		1
 #define CALL_FUNC_SINGLE_VECTOR		2
-#define SPIN_UNLOCK_VECTOR		3
-#define REBOOT_VECTOR			4
-#define NR_IPIS				5
+#define REBOOT_VECTOR			3
+#define NR_IPIS				4
 
 /*
  * The maximum number of vectors supported by i386 processors
diff --git a/arch/x86/include/mach-xen/asm/spinlock.h b/arch/x86/include/mach-xen/asm/spinlock.h
index 336b1a6..7b8548d 100644
--- a/arch/x86/include/mach-xen/asm/spinlock.h
+++ b/arch/x86/include/mach-xen/asm/spinlock.h
@@ -38,13 +38,20 @@ 
 # define UNLOCK_LOCK_PREFIX
 #endif
 
+#ifdef TICKET_SHIFT
+
+#include <asm/irqflags.h>
+#include <asm/smp-processor-id.h>
+#include <xen/interface/vcpu.h>
+
+DECLARE_PER_CPU(struct vcpu_runstate_info, runstate);
+
 int xen_spinlock_init(unsigned int cpu);
 void xen_spinlock_cleanup(unsigned int cpu);
-extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
-extern int xen_spin_wait_flags(raw_spinlock_t *, unsigned int *token,
-			       unsigned int flags);
-extern unsigned int xen_spin_adjust(raw_spinlock_t *, unsigned int token);
-extern void xen_spin_kick(raw_spinlock_t *, unsigned int token);
+bool xen_spin_wait(raw_spinlock_t *, unsigned int *token,
+		   unsigned int flags);
+unsigned int xen_spin_adjust(const raw_spinlock_t *, unsigned int token);
+void xen_spin_kick(raw_spinlock_t *, unsigned int token);
 
 /*
  * Ticket locks are conceptually two parts, one indicating the current head of
@@ -63,8 +70,7 @@  extern void xen_spin_kick(raw_spinlock_t *, unsigned int token);
  * save some instructions and make the code more elegant. There really isn't
  * much between them in performance though, especially as locks are out of line.
  */
-#if (NR_CPUS < 256)
-#define TICKET_SHIFT 8
+#if TICKET_SHIFT == 8
 #define __ticket_spin_lock_preamble \
 	asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
 	    "cmpb %h0, %b0\n\t" \
@@ -86,7 +92,14 @@  extern void xen_spin_kick(raw_spinlock_t *, unsigned int token);
 	    : "+Q" (token), "+g" (count) \
 	    : "m" (lock->slock) \
 	    : "memory", "cc")
-
+#define __ticket_spin_unlock_body \
+	asm(UNLOCK_LOCK_PREFIX "incb %2\n\t" \
+	    "movzwl %2, %0\n\t" \
+	    "cmpb %h0, %b0\n\t" \
+	    "setne %1" \
+	    : "=&Q" (token), "=qm" (kick), "+m" (lock->slock) \
+	    : \
+	    : "memory", "cc")
 
 static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
 {
@@ -104,26 +117,12 @@  static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
 	    :
 	    : "memory", "cc");
 
-	return tmp;
-}
-
-static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
-{
-	unsigned int token;
-	unsigned char kick;
+	if (tmp)
+		lock->owner = raw_smp_processor_id();
 
-	asm(UNLOCK_LOCK_PREFIX "incb %2\n\t"
-	    "movzwl %2, %0\n\t"
-	    "cmpb %h0, %b0\n\t"
-	    "setne %1"
-	    : "=&Q" (token), "=qm" (kick), "+m" (lock->slock)
-	    :
-	    : "memory", "cc");
-	if (kick)
-		xen_spin_kick(lock, token);
+	return tmp;
 }
-#else
-#define TICKET_SHIFT 16
+#elif TICKET_SHIFT == 16
 #define __ticket_spin_lock_preamble \
 	do { \
 		unsigned int tmp; \
@@ -154,6 +153,19 @@  static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
 		    : "m" (lock->slock) \
 		    : "memory", "cc"); \
 	} while (0)
+#define __ticket_spin_unlock_body \
+	do { \
+		unsigned int tmp; \
+		asm(UNLOCK_LOCK_PREFIX "incw %2\n\t" \
+		    "movl %2, %0\n\t" \
+		    "shldl $16, %0, %3\n\t" \
+		    "cmpw %w3, %w0\n\t" \
+		    "setne %1" \
+		    : "=&r" (token), "=qm" (kick), "+m" (lock->slock), \
+		      "=&r" (tmp) \
+		    : \
+		    : "memory", "cc"); \
+	} while (0)
 
 static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
 {
@@ -174,27 +186,17 @@  static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
 	    :
 	    : "memory", "cc");
 
-	return tmp;
-}
-
-static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
-{
-	unsigned int token, tmp;
-	bool kick;
+	if (tmp)
+		lock->owner = raw_smp_processor_id();
 
-	asm(UNLOCK_LOCK_PREFIX "incw %2\n\t"
-	    "movl %2, %0\n\t"
-	    "shldl $16, %0, %3\n\t"
-	    "cmpw %w3, %w0\n\t"
-	    "setne %1"
-	    : "=&r" (token), "=qm" (kick), "+m" (lock->slock), "=&r" (tmp)
-	    :
-	    : "memory", "cc");
-	if (kick)
-		xen_spin_kick(lock, token);
+	return tmp;
 }
 #endif
 
+#define __ticket_spin_count(lock) \
+	(per_cpu(runstate.state, (lock)->owner) == RUNSTATE_running \
+	 ? 1 << 10 : 2)
+
 static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
 {
 	int tmp = ACCESS_ONCE(lock->slock);
@@ -212,16 +214,22 @@  static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
 static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
 {
 	unsigned int token, count;
+	unsigned int flags = __raw_local_irq_save();
 	bool free;
 
 	__ticket_spin_lock_preamble;
 	if (likely(free))
-		return;
-	token = xen_spin_adjust(lock, token);
-	do {
-		count = 1 << 10;
-		__ticket_spin_lock_body;
-	} while (unlikely(!count) && !xen_spin_wait(lock, token));
+		raw_local_irq_restore(flags);
+	else {
+		token = xen_spin_adjust(lock, token);
+		raw_local_irq_restore(flags);
+		do {
+			count = __ticket_spin_count(lock);
+			__ticket_spin_lock_body;
+		} while (unlikely(!count)
+			 && !xen_spin_wait(lock, &token, flags));
+	}
+	lock->owner = raw_smp_processor_id();
 }
 
 static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock,
@@ -231,50 +239,123 @@  static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock,
 	bool free;
 
 	__ticket_spin_lock_preamble;
-	if (likely(free))
-		return;
-	token = xen_spin_adjust(lock, token);
-	do {
-		count = 1 << 10;
-		__ticket_spin_lock_body;
-	} while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
+	if (unlikely(!free)) {
+		token = xen_spin_adjust(lock, token);
+		do {
+			count = __ticket_spin_count(lock);
+			__ticket_spin_lock_body;
+		} while (unlikely(!count)
+			 && !xen_spin_wait(lock, &token, flags));
+	}
+	lock->owner = raw_smp_processor_id();
+}
+
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
+{
+	unsigned int token;
+	bool kick;
+
+	__ticket_spin_unlock_body;
+	if (kick)
+		xen_spin_kick(lock, token);
 }
 
-#ifndef CONFIG_PARAVIRT_SPINLOCKS
+#ifndef XEN_SPINLOCK_SOURCE
+#undef __ticket_spin_lock_preamble
+#undef __ticket_spin_lock_body
+#undef __ticket_spin_unlock_body
+#undef __ticket_spin_count
+#endif
+
+#define __raw_spin(n) __ticket_spin_##n
+
+#else /* TICKET_SHIFT */
+
+static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
+static inline void xen_spinlock_cleanup(unsigned int cpu) {}
+
+static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
+{
+	return lock->lock != 0;
+}
+
+static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
+{
+	return lock->spinners != 0;
+}
+
+static inline void __byte_spin_lock(raw_spinlock_t *lock)
+{
+	s8 val = 1;
+
+	asm("1: xchgb %1, %0\n"
+	    "   test %1,%1\n"
+	    "   jz 3f\n"
+	    "   " LOCK_PREFIX "incb %2\n"
+	    "2: rep;nop\n"
+	    "   cmpb $1, %0\n"
+	    "   je 2b\n"
+	    "   " LOCK_PREFIX "decb %2\n"
+	    "   jmp 1b\n"
+	    "3:"
+	    : "+m" (lock->lock), "+q" (val), "+m" (lock->spinners): : "memory");
+}
+
+#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
+
+static inline int __byte_spin_trylock(raw_spinlock_t *lock)
+{
+	u8 old = 1;
+
+	asm("xchgb %1,%0"
+	    : "+m" (lock->lock), "+q" (old) : : "memory");
+
+	return old == 0;
+}
+
+static inline void __byte_spin_unlock(raw_spinlock_t *lock)
+{
+	smp_wmb();
+	lock->lock = 0;
+}
+
+#define __raw_spin(n) __byte_spin_##n
+
+#endif /* TICKET_SHIFT */
 
 static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
 {
-	return __ticket_spin_is_locked(lock);
+	return __raw_spin(is_locked)(lock);
 }
 
 static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
 {
-	return __ticket_spin_is_contended(lock);
+	return __raw_spin(is_contended)(lock);
 }
 #define __raw_spin_is_contended	__raw_spin_is_contended
 
 static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
-	__ticket_spin_lock(lock);
+	__raw_spin(lock)(lock);
 }
 
 static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
-	return __ticket_spin_trylock(lock);
+	return __raw_spin(trylock)(lock);
 }
 
 static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
-	__ticket_spin_unlock(lock);
+	__raw_spin(unlock)(lock);
 }
 
 static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
 						  unsigned long flags)
 {
-	__ticket_spin_lock_flags(lock, flags);
+	__raw_spin(lock_flags)(lock, flags);
 }
 
-#endif	/* CONFIG_PARAVIRT_SPINLOCKS */
+#undef __raw_spin
 
 static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
 {
diff --git a/arch/x86/include/mach-xen/asm/spinlock_types.h b/arch/x86/include/mach-xen/asm/spinlock_types.h
new file mode 100644
index 0000000..7d01ae1
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/spinlock_types.h
@@ -0,0 +1,60 @@ 
+#ifndef _ASM_X86_SPINLOCK_TYPES_H
+#define _ASM_X86_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+#include <asm/types.h>
+
+typedef union {
+	unsigned int slock;
+	struct {
+/*
+ * Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll().
+ */
+#if CONFIG_XEN_COMPAT >= 0x030200
+/*
+ * On Xen we support a single level of interrupt re-enabling per lock. Hence
+ * we can have twice as many outstanding tickets. Thus the cut-off for using
+ * byte register pairs must be at half the number of CPUs.
+ */
+#if 2 * CONFIG_NR_CPUS < 256
+# define TICKET_SHIFT 8
+		u8 cur, seq;
+#else
+# define TICKET_SHIFT 16
+		u16 cur, seq;
+#endif
+#if CONFIG_NR_CPUS <= 256
+		u8 owner;
+#else
+		u16 owner;
+#endif
+#else
+/*
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure.  It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+		u8 lock;
+#if CONFIG_NR_CPUS < 256
+		u8 spinners;
+#else
+# error NR_CPUS >= 256 not implemented
+#endif
+#endif
+	};
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
+
+typedef struct {
+	unsigned int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
+
+#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/kernel/irq-xen.c b/arch/x86/kernel/irq-xen.c
index cc77647..a3dbf16 100644
--- a/arch/x86/kernel/irq-xen.c
+++ b/arch/x86/kernel/irq-xen.c
@@ -96,6 +96,11 @@  static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
 	seq_printf(p, "  TLB shootdowns\n");
+#else
+	seq_printf(p, "LCK: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->irq_lock_count);
+	seq_printf(p, "  Spinlock wakeups\n");
 #endif
 #endif
 #ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/kernel/time-xen.c b/arch/x86/kernel/time-xen.c
index 7bab3e4..3e9b37b 100644
--- a/arch/x86/kernel/time-xen.c
+++ b/arch/x86/kernel/time-xen.c
@@ -65,7 +65,7 @@  static DEFINE_PER_CPU(u64, processed_stolen_time);
 static DEFINE_PER_CPU(u64, processed_blocked_time);
 
 /* Current runstate of each CPU (updated automatically by the hypervisor). */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
 
 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
 #define NS_PER_TICK (1000000000LL/HZ)
@@ -550,15 +550,7 @@  EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 
 static void init_missing_ticks_accounting(unsigned int cpu)
 {
-	struct vcpu_register_runstate_memory_area area;
-	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
-	int rc;
-
-	memset(runstate, 0, sizeof(*runstate));
-
-	area.addr.v = runstate;
-	rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
-	WARN_ON(rc && rc != -ENOSYS);
+	struct vcpu_runstate_info *runstate = setup_runstate_area(cpu);
 
 	per_cpu(processed_blocked_time, cpu) =
 		runstate->time[RUNSTATE_blocked];
@@ -643,6 +635,23 @@  static struct clocksource clocksource_xen = {
 	.resume			= xen_clocksource_resume,
 };
 
+struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu)
+{
+	struct vcpu_register_runstate_memory_area area;
+	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
+	int rc;
+
+	set_xen_guest_handle(area.addr.h, runstate);
+	rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
+	if (rc) {
+		BUILD_BUG_ON(RUNSTATE_running);
+		memset(runstate, 0, sizeof(*runstate));
+		WARN_ON(rc != -ENOSYS);
+	}
+
+	return runstate;
+}
+
 void xen_read_persistent_clock(struct timespec *ts)
 {
 	const shared_info_t *s = HYPERVISOR_shared_info;
diff --git a/drivers/xen/core/evtchn.c b/drivers/xen/core/evtchn.c
index 421aa86..25e1199 100644
--- a/drivers/xen/core/evtchn.c
+++ b/drivers/xen/core/evtchn.c
@@ -1414,43 +1414,6 @@  void disable_all_local_evtchn(void)
 			synch_set_bit(i, &s->evtchn_mask[0]);
 }
 
-/* Clear an irq's pending state, in preparation for polling on it. */
-void xen_clear_irq_pending(int irq)
-{
-	int evtchn = evtchn_from_irq(irq);
-
-	if (VALID_EVTCHN(evtchn))
-		clear_evtchn(evtchn);
-}
-
-/* Set an irq's pending state, to avoid blocking on it. */
-void xen_set_irq_pending(int irq)
-{
-	int evtchn = evtchn_from_irq(irq);
-
-	if (VALID_EVTCHN(evtchn))
-		set_evtchn(evtchn);
-}
-
-/* Test an irq's pending state. */
-int xen_test_irq_pending(int irq)
-{
-	int evtchn = evtchn_from_irq(irq);
-
-	return VALID_EVTCHN(evtchn) && test_evtchn(evtchn);
-}
-
-/* Poll waiting for an irq to become pending.  In the usual case, the
-   irq will be disabled so it won't deliver an interrupt. */
-void xen_poll_irq(int irq)
-{
-	evtchn_port_t evtchn = evtchn_from_irq(irq);
-
-	if (VALID_EVTCHN(evtchn)
-	    && HYPERVISOR_poll_no_timeout(&evtchn, 1))
-		BUG();
-}
-
 #ifdef CONFIG_PM_SLEEP
 static void restore_cpu_virqs(unsigned int cpu)
 {
diff --git a/drivers/xen/core/spinlock.c b/drivers/xen/core/spinlock.c
index 07dda38..fc343a4 100644
--- a/drivers/xen/core/spinlock.c
+++ b/drivers/xen/core/spinlock.c
@@ -4,24 +4,24 @@ 
  *	See arch/x86/xen/smp.c for copyright and credits for derived
  *	portions of this file.
  */
+#define XEN_SPINLOCK_SOURCE
+#include <linux/spinlock_types.h>
+
+#ifdef TICKET_SHIFT
 
 #include <linux/init.h>
-#include <linux/irq.h>
 #include <linux/kernel.h>
-#include <linux/kernel_stat.h>
 #include <linux/module.h>
+#include <asm/hardirq.h>
 #include <xen/evtchn.h>
 
-#ifdef TICKET_SHIFT
-
-static int __read_mostly spinlock_irq = -1;
-
 struct spinning {
 	raw_spinlock_t *lock;
 	unsigned int ticket;
 	struct spinning *prev;
 };
 static DEFINE_PER_CPU(struct spinning *, spinning);
+static DEFINE_PER_CPU(evtchn_port_t, poll_evtchn);
 /*
  * Protect removal of objects: Addition can be done lockless, and even
  * removal itself doesn't need protection - what needs to be prevented is
@@ -31,98 +31,229 @@  static DEFINE_PER_CPU(raw_rwlock_t, spinning_rm_lock) = __RAW_RW_LOCK_UNLOCKED;
 
 int __cpuinit xen_spinlock_init(unsigned int cpu)
 {
-	static struct irqaction spinlock_action = {
-		.handler = smp_reschedule_interrupt,
-		.flags   = IRQF_DISABLED,
-		.name    = "spinlock"
-	};
+	struct evtchn_bind_ipi bind_ipi;
 	int rc;
 
-	rc = bind_ipi_to_irqaction(SPIN_UNLOCK_VECTOR,
-				   cpu,
-				   &spinlock_action);
- 	if (rc < 0)
- 		return rc;
+	setup_runstate_area(cpu);
 
-	if (spinlock_irq < 0) {
-		disable_irq(rc); /* make sure it's never delivered */
-		spinlock_irq = rc;
-	} else
-		BUG_ON(spinlock_irq != rc);
+ 	WARN_ON(per_cpu(poll_evtchn, cpu));
+	bind_ipi.vcpu = cpu;
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi);
+	if (!rc)
+	 	per_cpu(poll_evtchn, cpu) = bind_ipi.port;
+	else
+		printk(KERN_WARNING
+		       "No spinlock poll event channel for CPU#%u (%d)\n",
+		       cpu, rc);
 
 	return 0;
 }
 
 void __cpuinit xen_spinlock_cleanup(unsigned int cpu)
 {
-	unbind_from_per_cpu_irq(spinlock_irq, cpu, NULL);
+	struct evtchn_close close;
+
+	close.port = per_cpu(poll_evtchn, cpu);
+ 	per_cpu(poll_evtchn, cpu) = 0;
+	WARN_ON(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close));
 }
 
-int xen_spin_wait(raw_spinlock_t *lock, unsigned int token)
+#ifdef CONFIG_PM_SLEEP
+#include <linux/sysdev.h>
+
+static int __cpuinit spinlock_resume(struct sys_device *dev)
 {
-	int rc = 0, irq = spinlock_irq;
-	raw_rwlock_t *rm_lock;
-	unsigned long flags;
-	struct spinning spinning;
+	unsigned int cpu;
 
-	/* If kicker interrupt not initialized yet, just spin. */
-	if (unlikely(irq < 0) || unlikely(!cpu_online(raw_smp_processor_id())))
+	for_each_online_cpu(cpu) {
+		per_cpu(poll_evtchn, cpu) = 0;
+		xen_spinlock_init(cpu);
+	}
+
+	return 0;
+}
+
+static struct sysdev_class __cpuinitdata spinlock_sysclass = {
+	.name	= "spinlock",
+	.resume	= spinlock_resume
+};
+
+static struct sys_device __cpuinitdata device_spinlock = {
+	.id		= 0,
+	.cls		= &spinlock_sysclass
+};
+
+static int __init spinlock_register(void)
+{
+	int rc;
+
+	if (is_initial_xendomain())
 		return 0;
 
-	token >>= TICKET_SHIFT;
+	rc = sysdev_class_register(&spinlock_sysclass);
+	if (!rc)
+		rc = sysdev_register(&device_spinlock);
+	return rc;
+}
+core_initcall(spinlock_register);
+#endif
+
+static unsigned int spin_adjust(struct spinning *spinning,
+				const raw_spinlock_t *lock,
+				unsigned int token)
+{
+	for (; spinning; spinning = spinning->prev)
+		if (spinning->lock == lock) {
+			unsigned int ticket = spinning->ticket;
+
+			if (unlikely(!(ticket + 1)))
+				break;
+			spinning->ticket = token >> TICKET_SHIFT;
+			token = (token & ((1 << TICKET_SHIFT) - 1))
+				| (ticket << TICKET_SHIFT);
+			break;
+		}
+
+	return token;
+}
+
+unsigned int xen_spin_adjust(const raw_spinlock_t *lock, unsigned int token)
+{
+	return spin_adjust(percpu_read(spinning), lock, token);
+}
+
+bool xen_spin_wait(raw_spinlock_t *lock, unsigned int *ptok,
+                   unsigned int flags)
+{
+	unsigned int cpu = raw_smp_processor_id();
+	bool rc;
+	typeof(vcpu_info(0)->evtchn_upcall_mask) upcall_mask;
+	raw_rwlock_t *rm_lock;
+	struct spinning spinning, *other;
+
+	/* If kicker interrupt not initialized yet, just spin. */
+	if (unlikely(!cpu_online(cpu)) || unlikely(!percpu_read(poll_evtchn)))
+		return false;
 
 	/* announce we're spinning */
-	spinning.ticket = token;
+	spinning.ticket = *ptok >> TICKET_SHIFT;
 	spinning.lock = lock;
 	spinning.prev = percpu_read(spinning);
 	smp_wmb();
 	percpu_write(spinning, &spinning);
-
-	/* clear pending */
-	xen_clear_irq_pending(irq);
+	upcall_mask = vcpu_info_read(evtchn_upcall_mask);
 
 	do {
-		/* Check again to make sure it didn't become free while
-		 * we weren't looking. */
-		if ((lock->slock & ((1U << TICKET_SHIFT) - 1)) == token) {
-			/* If we interrupted another spinlock while it was
+		bool nested = false;
+
+		clear_evtchn(percpu_read(poll_evtchn));
+
+		/*
+		 * Check again to make sure it didn't become free while
+		 * we weren't looking.
+		 */
+		if (lock->cur == spinning.ticket) {
+			lock->owner = cpu;
+			/*
+			 * If we interrupted another spinlock while it was
 			 * blocking, make sure it doesn't block (again)
-			 * without rechecking the lock. */
+			 * without rechecking the lock.
+			 */
 			if (spinning.prev)
-				xen_set_irq_pending(irq);
-			rc = 1;
+				set_evtchn(percpu_read(poll_evtchn));
+			rc = true;
 			break;
 		}
 
-		/* block until irq becomes pending */
-		xen_poll_irq(irq);
-	} while (!xen_test_irq_pending(irq));
+		for (other = spinning.prev; other; other = other->prev) {
+			if (other->lock == lock)
+				nested = true;
+			else {
+				/*
+				 * Return the ticket if we now own the lock.
+				 * While just being desirable generally (to
+				 * reduce latency on other CPUs), this is
+				 * essential in the case where interrupts
+				 * get re-enabled below.
+				 * Try to get a new ticket right away (to
+				 * reduce latency after the current lock was
+				 * released), but don't acquire the lock.
+				 */
+				raw_spinlock_t *lock = other->lock;
 
-	/* Leave the irq pending so that any interrupted blocker will
-	 * re-check. */
-	if (!rc)
-		kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
+				raw_local_irq_disable();
+				while (lock->cur == other->ticket) {
+					unsigned int token;
+					bool kick, free;
+
+					other->ticket = -1;
+					__ticket_spin_unlock_body;
+					if (!kick)
+						break;
+					xen_spin_kick(lock, token);
+					__ticket_spin_lock_preamble;
+					if (!free)
+						token = spin_adjust(
+							other->prev, lock,
+							token);
+					other->ticket = token >> TICKET_SHIFT;
+					smp_mb();
+				}
+			}
+		}
+
+		/*
+		 * No need to use raw_local_irq_restore() here, as the
+		 * intended event processing will happen with the poll
+		 * call.
+		 */
+		vcpu_info_write(evtchn_upcall_mask,
+				nested ? upcall_mask : flags);
+
+		if (HYPERVISOR_poll_no_timeout(&__get_cpu_var(poll_evtchn), 1))
+			BUG();
+
+		vcpu_info_write(evtchn_upcall_mask, upcall_mask);
+
+		rc = !test_evtchn(percpu_read(poll_evtchn));
+		if (!rc)
+			inc_irq_stat(irq_lock_count);
+	} while (spinning.prev || rc);
+
+	/*
+	 * Leave the irq pending so that any interrupted blocker will
+	 * re-check.
+	 */
 
 	/* announce we're done */
-	percpu_write(spinning, spinning.prev);
+	other = spinning.prev;
+	percpu_write(spinning, other);
 	rm_lock = &__get_cpu_var(spinning_rm_lock);
-	raw_local_irq_save(flags);
+	raw_local_irq_disable();
 	__raw_write_lock(rm_lock);
 	__raw_write_unlock(rm_lock);
-	raw_local_irq_restore(flags);
+	*ptok = lock->cur | (spinning.ticket << TICKET_SHIFT);
 
-	return rc;
-}
+	/*
+	 * Obtain new tickets for (or acquire) all those locks where
+	 * above we avoided acquiring them.
+	 */
+	for (; other; other = other->prev)
+		if (!(other->ticket + 1)) {
+			unsigned int token;
+			bool free;
 
-unsigned int xen_spin_adjust(raw_spinlock_t *lock, unsigned int token)
-{
-	return token;//todo
-}
+			lock = other->lock;
+			__ticket_spin_lock_preamble;
+			if (!free)
+				token = spin_adjust(other->prev, lock, token);
+			other->ticket = token >> TICKET_SHIFT;
+			if (lock->cur == other->ticket)
+				lock->owner = cpu;
+		}
+	raw_local_irq_restore(upcall_mask);
 
-int xen_spin_wait_flags(raw_spinlock_t *lock, unsigned int *token,
-			  unsigned int flags)
-{
-	return xen_spin_wait(lock, *token);//todo
+	return rc;
 }
 
 void xen_spin_kick(raw_spinlock_t *lock, unsigned int token)
@@ -135,7 +266,7 @@  void xen_spin_kick(raw_spinlock_t *lock, unsigned int token)
 		unsigned long flags;
 		struct spinning *spinning;
 
-		if (cpu == raw_smp_processor_id() || !per_cpu(spinning, cpu))
+		if (cpu == raw_smp_processor_id())
 			continue;
 
 		rm_lock = &per_cpu(spinning_rm_lock, cpu);
@@ -144,15 +275,17 @@  void xen_spin_kick(raw_spinlock_t *lock, unsigned int token)
 
 		spinning = per_cpu(spinning, cpu);
 		smp_rmb();
-		if (spinning
-		    && (spinning->lock != lock || spinning->ticket != token))
-			spinning = NULL;
+		while (spinning) {
+			if (spinning->lock == lock && spinning->ticket == token)
+				break;
+			spinning = spinning->prev;
+		}
 
 		__raw_read_unlock(rm_lock);
 		raw_local_irq_restore(flags);
 
 		if (unlikely(spinning)) {
-			notify_remote_via_ipi(SPIN_UNLOCK_VECTOR, cpu);
+			notify_remote_via_evtchn(per_cpu(poll_evtchn, cpu));
 			return;
 		}
 	}
diff --git a/include/xen/events.h b/include/xen/events.h
index e68d59a..1812ea6 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -44,15 +44,6 @@  extern void notify_remote_via_irq(int irq);
 
 extern void xen_irq_resume(void);
 
-/* Clear an irq's pending state, in preparation for polling on it */
-void xen_clear_irq_pending(int irq);
-void xen_set_irq_pending(int irq);
-bool xen_test_irq_pending(int irq);
-
-/* Poll waiting for an irq to become pending.  In the usual case, the
-   irq will be disabled so it won't deliver an interrupt. */
-void xen_poll_irq(int irq);
-
 /* Determine the IRQ which is bound to an event channel */
 unsigned irq_from_evtchn(unsigned int evtchn);