[RFC,v2] futex: extend set_robust_list to allow 2 locking ABIs at the same time.

Message ID 20191104002909.25783-1-shawn@git.icu
State New

Commit Message

Shawn Landden Nov. 4, 2019, 12:29 a.m. UTC
The robust futexes ABI was designed to be flexible to changing ABIs in
glibc; however, it did not take into consideration that this ABI is
particularly sticky and suffers from lock-step problems, because the
ABI is shared between processes. This introduces a new size in
set_robust_list that takes an additional futex_offset2 value so that
two locking ABIs can be used at the same time.

If this new ABI is used, then bit 1 of the *next pointer of the
user-space robust_list indicates that the futex_offset2 value should
be used in place of the existing futex_offset.

The use case for this is sharing locks between 32-bit and 64-bit
processes, which Linux supports but glibc does not, and which is
difficult to implement with the current Linux support because of
mismatched ABIs. Keith Packard has complained about this:
https://keithp.com/blogs/Shared_Memory_Fences/

This can also be used to add a new ABI that uses smaller structs,
as the existing ABI on x86_64 is a minimum of 32 bytes, and 20 bytes
would suffice.
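
As an illustration of the proposed usage, a user-space registration
could look like the following sketch (hypothetical code: struct
extended_robust_list_head and the length-based selection are as defined
in this patch, and the offset values are made up):

#include <linux/futex.h>
#include <stddef.h>
#include <sys/syscall.h>
#include <unistd.h>

static struct extended_robust_list_head head;

static long register_two_abis(void)
{
	/* Empty circular list: next points back at the list head. */
	head.list_head.list.next = &head.list_head.list;
	head.list_head.list_op_pending = NULL;
	head.list_head.futex_offset = 16;	/* made up: native lock layout */
	head.futex_offset2 = 8;			/* made up: second-ABI lock layout */

	/*
	 * Passing the larger size selects the extended layout; a lock
	 * whose *next pointer has bit 1 set is then resolved with
	 * futex_offset2 instead of futex_offset.
	 */
	return syscall(SYS_set_robust_list, &head, sizeof(head));
}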

v2: fix size of compat_extended_robust_list_head
    fix some issues with number literals being implicitly ints
---
 include/linux/compat.h     |   5 +
 include/linux/sched.h      |   6 ++
 include/uapi/linux/futex.h |  31 +++++++
 kernel/futex.c             | 182 ++++++++++++++++++++++++-------------
 4 files changed, 160 insertions(+), 64 deletions(-)

Comments

Shawn Landden Nov. 4, 2019, 12:51 a.m. UTC | #1
I am sorry, I will fix this and resubmit.

03.11.2019, 19:29, "Shawn Landden" <shawn@git.icu>:
> The robust futexes ABI was designed to be flexible to changing ABIs in
> glibc, however it did not take into consideration that this ABI is
> particularly sticky, and suffers from lock-step problems, due to the
> fact that the ABI is shared between processes. This introduces a new
> size in set_robust_list that takes an additional futex_offset2 value
> so that two locking ABIs can be used at the same time.
>
> If this new ABI is used, then bit 1 of the *next pointer of the
> user-space robust_list indicates that the futex_offset2 value should
> be used in place of the existing futex_offset.
>
> The use case for this is sharing locks between 32-bit and 64-bit
> processes, which Linux supports, but glibc does not, and is difficult
> to implement with the current Linux support because of mix-matched
> ABIs. Keith Packard has complained about this:
> https://keithp.com/blogs/Shared_Memory_Fences/
>
> This can also be used to add a new ABI that uses smaller structs,
> as the existing ABI on x86_64 is a minimum of 32 bytes, and 20 bytes
> would suffice.
>
> v2: fix size of compat_extended_robust_list_head
>     fix some issues with number literals being implicitly ints
> ---
>  include/linux/compat.h | 5 +
>  include/linux/sched.h | 6 ++
>  include/uapi/linux/futex.h | 31 +++++++
>  kernel/futex.c | 182 ++++++++++++++++++++++++-------------
>  4 files changed, 160 insertions(+), 64 deletions(-)
> [...]
Thomas Gleixner Nov. 4, 2019, 3:37 p.m. UTC | #2
On Sun, 3 Nov 2019, Shawn Landden wrote:

While you Cc'ed various people for whatever reasons, you neglected to Cc 3/4
of the FUTEX maintainers/reviewers from the MAINTAINERS file:

FUTEX SUBSYSTEM
M:      Thomas Gleixner <tglx@linutronix.de>
M:      Ingo Molnar <mingo@redhat.com>
R:      Peter Zijlstra <peterz@infradead.org>
R:      Darren Hart <dvhart@infradead.org>

> The robust futexes ABI was designed to be flexible to changing ABIs in
> glibc, however it did not take into consideration that this ABI is
> particularly sticky, and suffers from lock-step problems, due to the
> fact that the ABI is shared between processes. This introduces a new
> size in set_robust_list that takes an additional futex_offset2 value
> so that two locking ABIs can be used at the same time.

This text does not really explain the problem you are trying to solve. I
can crystalball something out of it, but that's not what a changelog should
force people to do.

> If this new ABI is used, then bit 1 of the *next pointer of the
> user-space robust_list indicates that the futex_offset2 value should
> be used in place of the existing futex_offset.
> 
> The use case for this is sharing locks between 32-bit and 64-bit
> processes, which Linux supports, but glibc does not, and is difficult
> to implement with the current Linux support because of mix-matched
> ABIs.

The problem with mixed processes is that the *next pointer in the robust
list points to some random data structure, but not to the actual futex
value itself. The pointer to the underlying futex value is calculated from
there with 'futex_offset'.
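
In other words (a sketch restating that calculation; the helper name is
made up, but the arithmetic is exactly what exit_robust_list() feeds to
handle_futex_death() further down):

static void __user *robust_futex_uaddr(struct robust_list __user *entry,
				       long futex_offset)
{
	/*
	 * entry points at the robust_list node embedded in a user-space
	 * lock (e.g. a pthread_mutex_t); futex_offset locates the 32-bit
	 * futex word relative to that node.
	 */
	return (void __user *)entry + futex_offset;
}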

I assume that pthread_mutex and friends have different layouts on 32 and 64
bit and therefore futex_offset is different, right?

But even if you manage to keep track of the offsets per futex, I don't see
how sharing the actual pthread_mutex between a 32bit and a 64bit process
would work at all if the underlying data structures are not fundamentally
the same. They contain more than the robust list.

> Keith Packard has complained about this:
> https://keithp.com/blogs/Shared_Memory_Fences/

That complaint is important because the info there is missing from the
changelog, right? Such links are mostly useless, as they become outdated
sooner rather than later.

Aside from that, the blog entry is absolutely not useful for filling in the
blanks in your changelog.

> This can also be used to add a new ABI that uses smaller structs, as the
> existing ABI on x86_64 is a minimum of 32 bytes, and 20 bytes would
> suffice.

I have no idea how you arrive at a minimum of 32 bytes. It has a fixed size
of 24 bytes on x86_64:

struct robust_list_head {
	struct robust_list         list;                 /*     0     8 */
	long int                   futex_offset;         /*     8     8 */
	struct robust_list *       list_op_pending;      /*    16     8 */

	/* size: 24, cachelines: 1, members: 3 */
};

>

This lacks a Signed-off-by: ....

> 
> v2: fix size of compat_extended_robust_list_head
>     fix some issues with number literals being implicitly ints

Please move the vN change history after a --- separator so tools can
discard it. It's not part of the changelog. Additionally a link to the
previous submission would be helpful for keeping track.

i.e.

---
v2: Fix ....

v1: https://lore.kernel.org/r/$MESSAGE_ID
---
diffstat
....

> +struct compat_extended_robust_list_head {
> +	struct compat_robust_list_head list_head;
> +	compat_long_t			futex_offset2;

Please align the names of the members properly. list_head should be
separated from the type by a tab, not by a space.

>  #ifdef CONFIG_FUTEX
> +    /*
> +     * bottom two bits are masked
> +     * 0: struct extended_robust_list_head
> +     * 1: struct robust_list_head
> +     * same for compat_robust_list

What? That's not what the code does. It merely sets bit 0.

> +/*
> + * Extensible per-thread list head:
> + *
> + * As locks are shared between processes, the futex_offset field
> + * has ABI lock-stepping issues, which the original robust_list_head
> + * structure did not anticipate. (And which prevents 32-bit/64-bit
> + * interoperability, as well as shrinking of mutex structures).
> + * This new extensible_robust_list_head allows multiple
> + * concurrent futex_offset values, chosen using the bottom 2 bits of the
> + * robust_list *next pointer, which are now masked in BOTH the old and
> + * new ABI.
> + *
> + * Note: this structure is part of the syscall ABI like
> + * robust_list_head above, and must have a different size than
> + * robust_list_head.

Versioning an ABI by different sizes is a horrible idea.

> + *
> + */
> +struct extended_robust_list_head {
> +	struct robust_list_head list_head;
> +
> +	/*
> +	 * These relative offsets are set by user-space. They give the kernel
> +	 * the relative position of the futex field to examine, based on the
> +	 * bit 1 *next pointer.

What on earth is a 'bit 1 *next pointer'?

> +	 * The original version was insufficiently flexible. Locks are held
> +	 * in shared memory between processes, and a process might want to hold
> +	 * locks of two ABIs at the same time.
> +	 */
> +	long futex_offset2;

Where has *list_op_pending gone, and how is that supposed to work? Are you
now enqueueing the next lock into the list_head list right away?

Please provide proper design information about this.

> +};
> +
>  /*
>   * Are there any waiters for this robust futex:
>   */
>  #define FUTEX_WAITERS		0x80000000
>  
> diff --git a/kernel/futex.c b/kernel/futex.c
> index 6d50728ef2e7..3a17d2d63178 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -3396,17 +3396,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
>  SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
>  		size_t, len)
>  {
>  	if (!futex_cmpxchg_enabled)
>  		return -ENOSYS;
> -	/*
> -	 * The kernel knows only one size for now:
> -	 */
> -	if (unlikely(len != sizeof(*head)))
> +
> +	if (unlikely(len != sizeof(struct robust_list_head) &&
> +		     len != sizeof(struct extended_robust_list_head)))
>  		return -EINVAL;
>  
> -	current->robust_list = head;
> +	current->robust_list = head & 0b11UL;

What? This results in:

	0 <= current->robust_list <= 3

How is that supposed to work and why would the 'head' argument ever have
any of those bits set in the first place?
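
For comparison, a sketch of what the assignment presumably intended,
i.e. keep the aligned pointer and use bit 0 purely as a kernel-internal
tag for the old layout (a reconstruction, not code from the patch):

	unsigned long val = (unsigned long)head & ~0b11UL;

	if (len == sizeof(struct robust_list_head))
		val |= 1;	/* tag: old, non-extended layout */
	current->robust_list = (struct robust_list_head __user *)val;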

>  void exit_robust_list(struct task_struct *curr)
>  {
> -	struct robust_list_head __user *head = curr->robust_list;
> -	struct robust_list __user *entry, *next_entry, *pending;
> -	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
> -	unsigned int uninitialized_var(next_pi);
> -	unsigned long futex_offset;
> +	struct robust_list_head __user *head_ptr = curr->robust_list & ~1UL;
> +	unsigned int is_extended_list = curr->robust_list & 1 == 0;
> +	struct extended_robust_list_head head;
> +	struct robust_list __user *entry = &head->list_head.list.next,
> +					   *next_entry, *pending;
> +	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip, second_abi,
> +			     second_abip;
> +	unsigned int uninitialized_var(next_pi),
> +		     uninitialized_var(next_second_abi);
>  	int rc;
>  
>  	if (!futex_cmpxchg_enabled)
>  		return;
>  
>  	/*
> -	 * Fetch the list head (which was registered earlier, via
> -	 * sys_set_robust_list()):
> +	 * fetch_robust_entry code is duplicated here to avoid excessive calls
> +	 * to get_user()
>  	 */
> -	if (fetch_robust_entry(&entry, &head->list.next, &pi))
> -		return;
> -	/*
> -	 * Fetch the relative futex offset:
> -	 */
> -	if (get_user(futex_offset, &head->futex_offset))
> -		return;
> -	/*
> -	 * Fetch any possibly pending lock-add first, and handle it
> -	 * if it exists:
> -	 */
> -	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
> -		return;
> +	if (is_extended_list) {
> +		if (get_user(head, (struct extended_robust_list_head *)
> +			     head_ptr))
> +			return;
> +	} else {
> +		if (get_user(head.list_head, head_ptr))
> +			return;
> +	}
> +
> +	pi = head.list_head.list.next & 1;
> +	second_abi = head.list_head.list.next & 0b10;

That's truly consistent. For current->robust_list you use bit 0 to
determine whether it is the extended one, but in user space you use bit 1
because bit 0 is already occupied for PI.

> +	head.list_head.list.next &= ~0b11UL;
> +	pip = head.list_head.list_op_pending & 1;
> +	second_abip = head.list_head.list_op_pending & 0b10;
> +	head.list_head.list_op_pending &= ~0b11UL;

So you are seriously trying to mix both ABIs in one robust list? Why on
earth?

One thread can hardly use two different libraries accessing the robust list
with two different versions of the ABI. That's just a recipe for
disaster. Futexes are complicated enough and the robust list is a fragile
beast to begin with.

To be honest, the result of this approach is just unreadable garbage, and
this combo patch does too many things in one go to be reviewable at all.

Please provide a proper design and explain how this should work together
with existing robust list using code first.

Thanks,

	tglx
Thomas Gleixner Nov. 5, 2019, 12:10 a.m. UTC | #3
On Mon, 4 Nov 2019, Thomas Gleixner wrote:
> Please provide a proper design and explain how this should work together
> with existing robust list using code first.

Just for clarification:

Any change to that interface needs to be discussed with the *libc people
first. Your way of overriding the robust list interface with some ad hoc
solution will break existing users, which is not acceptable.

So either the problem is solved with the *libc people in a way which allows
them to build upon it, or, if your intention is solely to solve the problem
Keith described, none of this is required at all, as user space can handle
the different layouts for that magic fence implementation perfectly fine.

Thanks,

	tglx
Florian Weimer Nov. 5, 2019, 9:48 a.m. UTC | #4
* Shawn Landden:

> If this new ABI is used, then bit 1 of the *next pointer of the
> user-space robust_list indicates that the futex_offset2 value should
> be used in place of the existing futex_offset.

The futex interface currently has some races which can only be fixed by
API changes.  I'm concerned that we sacrifice the last bit for some
rather obscure feature.  What if we need that bit for fixing the
correctness issues?

Thanks,
Florian
Thomas Gleixner Nov. 5, 2019, 9:59 a.m. UTC | #5
On Tue, 5 Nov 2019, Florian Weimer wrote:
> * Shawn Landden:
> > If this new ABI is used, then bit 1 of the *next pointer of the
> > user-space robust_list indicates that the futex_offset2 value should
> > be used in place of the existing futex_offset.
> 
> The futex interface currently has some races which can only be fixed by
> API changes.  I'm concerned that we sacrifice the last bit for some
> rather obscure feature.  What if we need that bit for fixing the
> correctness issues?

That current approach is going nowhere and if we change the ABI ever then
this needs to happen with all *libc folks involved and agreeing.

Out of curiosity, what's the race issue vs. robust list which you are
trying to solve?

Thanks,

	tglx
Florian Weimer Nov. 5, 2019, 10:06 a.m. UTC | #6
* Thomas Gleixner:

> On Tue, 5 Nov 2019, Florian Weimer wrote:
>> * Shawn Landden:
>> > If this new ABI is used, then bit 1 of the *next pointer of the
>> > user-space robust_list indicates that the futex_offset2 value should
>> > be used in place of the existing futex_offset.
>> 
>> The futex interface currently has some races which can only be fixed by
>> API changes.  I'm concerned that we sacrifice the last bit for some
>> rather obscure feature.  What if we need that bit for fixing the
>> correctness issues?
>
> That current approach is going nowhere and if we change the ABI ever then
> this needs to happen with all *libc folks involved and agreeing.
>
> Out of curiosity, what's the race issue vs. robust list which you are
> trying to solve?

Sadly I'm not trying to solve them.  Here's one of the issues:

  <https://sourceware.org/bugzilla/show_bug.cgi?id=14485>

I think there are others, but I can't find a reference to them.  If
anyone wants to work on this, I can dig out the details and ask some
folks who have looked at this in the past.

Thanks,
Florian
Thomas Gleixner Nov. 5, 2019, 11:56 a.m. UTC | #7
On Tue, 5 Nov 2019, Florian Weimer wrote:
> * Thomas Gleixner:
> > On Tue, 5 Nov 2019, Florian Weimer wrote:
> >> * Shawn Landden:
> >> > If this new ABI is used, then bit 1 of the *next pointer of the
> >> > user-space robust_list indicates that the futex_offset2 value should
> >> > be used in place of the existing futex_offset.
> >> 
> >> The futex interface currently has some races which can only be fixed by
> >> API changes.  I'm concerned that we sacrifice the last bit for some
> >> rather obscure feature.  What if we need that bit for fixing the
> >> correctness issues?
> >
> > That current approach is going nowhere and if we change the ABI ever then
> > this needs to happen with all *libc folks involved and agreeing.
> >
> > Out of curiosity, what's the race issue vs. robust list which you are
> > trying to solve?
> 
> Sadly I'm not trying to solve them.  Here's one of the issues:
> 
>   <https://sourceware.org/bugzilla/show_bug.cgi?id=14485>

That one seems more like a lifetime problem, i.e. the mutex is destroyed,
the memory freed and the map address reused while another thread was not
yet out of the mutex_unlock() call. Nasty.
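
Roughly this pattern (a hypothetical, condensed reproducer; assume m
lives in its own page-aligned shared mapping so the munmap() is
meaningful):

#include <pthread.h>
#include <sys/mman.h>

static void *last_user(void *arg)
{
	pthread_mutex_t *m = arg;

	pthread_mutex_lock(m);
	/*
	 * The atomic release inside unlock wakes the peer, but the
	 * implementation may still touch *m afterwards...
	 */
	pthread_mutex_unlock(m);
	return NULL;
}

static void *destroyer(void *arg)
{
	pthread_mutex_t *m = arg;

	/*
	 * ...while this thread, which POSIX permits to do so, takes the
	 * mutex, destroys it and unmaps the memory underneath the
	 * still-running unlock.
	 */
	pthread_mutex_lock(m);
	pthread_mutex_unlock(m);
	pthread_mutex_destroy(m);
	munmap(m, sizeof(*m));
	return NULL;
}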

Thanks,

	tglx
Carlos O'Donell Nov. 5, 2019, 2:10 p.m. UTC | #8
On 11/5/19 6:56 AM, Thomas Gleixner wrote:
> On Tue, 5 Nov 2019, Florian Weimer wrote:
>> * Thomas Gleixner:
>>> On Tue, 5 Nov 2019, Florian Weimer wrote:
>>>> * Shawn Landden:
>>>>> If this new ABI is used, then bit 1 of the *next pointer of the
>>>>> user-space robust_list indicates that the futex_offset2 value should
>>>>> be used in place of the existing futex_offset.
>>>>
>>>> The futex interface currently has some races which can only be fixed by
>>>> API changes.  I'm concerned that we sacrifice the last bit for some
>>>> rather obscure feature.  What if we need that bit for fixing the
>>>> correctness issues?
>>>
>>> That current approach is going nowhere and if we change the ABI ever then
>>> this needs to happen with all *libc folks involved and agreeing.
>>>
>>> Out of curiosity, what's the race issue vs. robust list which you are
>>> trying to solve?
>>
>> Sadly I'm not trying to solve them.  Here's one of the issues:
>>
>>   <https://sourceware.org/bugzilla/show_bug.cgi?id=14485>
> 
> That one seems more a life time problem, i.e. the mutex is destroyed,
> memory freed and map address reused while another thread was not yet out of
> the mutex_unlock() call. Nasty.

It is difficult to fix.

The other issue is this:

"Robust mutexes do not take ROBUST_LIST_LIMIT into account"
https://sourceware.org/bugzilla/show_bug.cgi?id=19089
Florian Weimer Nov. 5, 2019, 2:27 p.m. UTC | #9
* Carlos O'Donell:

> On 11/5/19 6:56 AM, Thomas Gleixner wrote:
>> On Tue, 5 Nov 2019, Florian Weimer wrote:
>>> * Thomas Gleixner:
>>>> On Tue, 5 Nov 2019, Florian Weimer wrote:
>>>>> * Shawn Landden:
>>>>>> If this new ABI is used, then bit 1 of the *next pointer of the
>>>>>> user-space robust_list indicates that the futex_offset2 value should
>>>>>> be used in place of the existing futex_offset.
>>>>>
>>>>> The futex interface currently has some races which can only be fixed by
>>>>> API changes.  I'm concerned that we sacrifice the last bit for some
>>>>> rather obscure feature.  What if we need that bit for fixing the
>>>>> correctness issues?
>>>>
>>>> That current approach is going nowhere and if we change the ABI ever then
>>>> this needs to happen with all *libc folks involved and agreeing.
>>>>
>>>> Out of curiosity, what's the race issue vs. robust list which you are
>>>> trying to solve?
>>>
>>> Sadly I'm not trying to solve them.  Here's one of the issues:
>>>
>>>   <https://sourceware.org/bugzilla/show_bug.cgi?id=14485>
>> 
>> That one seems more a life time problem, i.e. the mutex is destroyed,
>> memory freed and map address reused while another thread was not yet out of
>> the mutex_unlock() call. Nasty.
>
> It is difficult to fix.
>
> The other issue is this:
>
> "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
> https://sourceware.org/bugzilla/show_bug.cgi?id=19089

That's just a missing check in our implementation and something that few
applications will encounter, if any.  There is this one here:

  <https://sourceware.org/bugzilla/show_bug.cgi?id=19004>

It contains a kernel patch.

I thought that there were more issues in the current implementation, but
I can't find a record of them. 8-(

Thanks,
Florian
Thomas Gleixner Nov. 5, 2019, 2:27 p.m. UTC | #10
On Tue, 5 Nov 2019, Carlos O'Donell wrote:
> On 11/5/19 6:56 AM, Thomas Gleixner wrote:
> The other issue is this:
> 
> "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
> https://sourceware.org/bugzilla/show_bug.cgi?id=19089

  "The kernel limits the length of the robust mutex list to 2048 entries.
   This constant does not seem to be exported to user space."

FWIW, the constant is defined in the UAPI futex header.

The main concern here is not the actual number of futexes held by a task.

The real issue is that the robust list could be circular, by accident or
malice, and there is no way for the kernel to figure that out. That would
prevent the task from exiting and make it iterate over the list until
doomsday, i.e. a nice unprivileged DoS.

So I fear the kernel cannot really help with this one.
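
The only mitigation in place is the hard cap on the walk, condensed here
from exit_robust_list() (the full loop is in the patch below):

unsigned int limit = ROBUST_LIST_LIMIT;	/* 2048 in the UAPI header */

while (entry != &head->list) {
	/* ... fetch the next entry, handle a dead futex ... */

	/* Avoid excessively long or circular lists: */
	if (!--limit)
		break;
	cond_resched();
}

So a circular list costs at most ROBUST_LIST_LIMIT iterations at exit,
but any locks held beyond the limit are silently not recovered.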

Thanks,

	tglx
Florian Weimer Nov. 5, 2019, 2:33 p.m. UTC | #11
* Thomas Gleixner:

> On Tue, 5 Nov 2019, Carlos O'Donell wrote:
>> On 11/5/19 6:56 AM, Thomas Gleixner wrote:
>> The other issue is this:
>> 
>> "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
>> https://sourceware.org/bugzilla/show_bug.cgi?id=19089
>
>   "The kernel limits the length of the robust mutex list to 2048 entries.
>    This constant does not seem to be exported to user space."
>
> FWIW, the constant is defined in the UAPI futex header.
>
> The main concern here is not the actual number of futexes held by a task.
>
> The real issue is that the robust list could be circular, by accident or
> malice, and there is no way for the kernel to figure that out. That would
> prevent the task from exiting and make it iterate over the list until
> doomsday, i.e. a nice unprivileged DoS.
>
> So I fear the kernel cannot really help with this one.

I'm actually fine with treating ROBUST_LIST_LIMIT as an ABI constant.
It's just not clear to me if the constant has this status today.  I
suspect it was just split from the implementation headers at one point.

Thanks,
Florian
Thomas Gleixner Nov. 5, 2019, 2:48 p.m. UTC | #12
On Tue, 5 Nov 2019, Florian Weimer wrote:
> * Thomas Gleixner:
> 
> > On Tue, 5 Nov 2019, Carlos O'Donell wrote:
> >> On 11/5/19 6:56 AM, Thomas Gleixner wrote:
> >> The other issue is this:
> >> 
> >> "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
> >> https://sourceware.org/bugzilla/show_bug.cgi?id=19089
> >
> >   "The kernel limits the length of the robust mutex list to 2048 entries.
> >    This constant does not seem to be exported to user space."
> >
> > FWIW, the constant is defined in the UAPI futex header.
> >
> > The main concern here is not the actual number of futexes held by a task.
> >
> > The real issue is that the robust list could be circular, by accident or
> > malice, and there is no way for the kernel to figure that out. That would
> > prevent the task from exiting and make it iterate over the list until
> > doomsday, i.e. a nice unprivileged DoS.
> >
> > So I fear the kernel cannot really help with this one.
> 
> I'm actually fine with treating ROBUST_LIST_LIMIT as an ABI constant.
> It's just not clear to me if the constant has this status today.  I
> suspect it was just split from the implementation headers at one point.

Yes, but we really can declare it as an ABI constant.

I think the limit is reasonably sized. But I'm not familiar with the lock
nesting expectations of insanely big enterprise applications.

Thanks,

	tglx
Thomas Gleixner Nov. 5, 2019, 2:53 p.m. UTC | #13
On Tue, 5 Nov 2019, Florian Weimer wrote:
> * Carlos O'Donell:
> > "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
> > https://sourceware.org/bugzilla/show_bug.cgi?id=19089
> 
> That's just a missing check in our implementation and something that few
> applications will encounter, if any.  There is this one here:
> 
>   <https://sourceware.org/bugzilla/show_bug.cgi?id=19004>
> 
> It contains a kernel patch.
> 
> I thought that there were more issues in the current implementation, but
> I can't find a record of them. 8-(

There is a nasty one in my inbox with a kernel patch fixing it, which I
still need to review with all futex brain cells activated:

  https://lore.kernel.org/r/1572573789-16557-1-git-send-email-wang.yi59@zte.com.cn

Thanks,

	tglx
Zack Weinberg Nov. 6, 2019, 2 p.m. UTC | #14
On Tue, Nov 5, 2019 at 9:28 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> The real issue is that the robust list could be circular, by accident or
> malice, and there is no way for the kernel to figure that out. That would
> prevent the task from exiting and make it iterate over the list until
> doomsday, i.e. a nice unprivileged DoS.

Why can't the kernel use the standard tortoise-and-hare algorithm for
detecting circular linked lists here?

zw
Florian Weimer Nov. 6, 2019, 2:04 p.m. UTC | #15
* Zack Weinberg:

> On Tue, Nov 5, 2019 at 9:28 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>>
>> The real issue is that the robust list could be circular, by accident or
>> malice, and there is no way for the kernel to figure that out. That would
>> prevent the task from exiting and make it iterate over the list until
>> doomsday, i.e. a nice unprivileged DoS.
>
> Why can't the kernel use the standard tortoise-and-hare algorithm for
> detecting circular linked lists here?

It's not guaranteed to terminate if the list is in shared memory.
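
For reference, the textbook detector (a hypothetical sketch): its
termination argument assumes the links do not change during the walk,
and a writer in shared memory can violate that indefinitely:

/*
 * Floyd's tortoise-and-hare over a NULL-terminated list. Correct on a
 * private, quiescent list; in shared memory another process can keep
 * rewriting ->next so that neither the meeting point nor the NULL
 * terminator is ever reached.
 */
static int robust_list_has_cycle(struct robust_list *head)
{
	struct robust_list *slow = head, *fast = head;

	while (fast && fast->next) {
		slow = slow->next;
		fast = fast->next->next;
		if (slow == fast)
			return 1;	/* cycle detected */
	}
	return 0;			/* reached the end: no cycle */
}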

Thanks,
Florian

Patch

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 16dafd9f4b86..00a0741bf658 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -379,10 +379,15 @@  struct compat_robust_list_head {
 	struct compat_robust_list	list;
 	compat_long_t			futex_offset;
 	compat_uptr_t			list_op_pending;
 };
 
+struct compat_extended_robust_list_head {
+	struct compat_robust_list_head list_head;
+	compat_long_t			futex_offset2;
+};
+
 #ifdef CONFIG_COMPAT_OLD_SIGACTION
 struct compat_old_sigaction {
 	compat_uptr_t			sa_handler;
 	compat_old_sigset_t		sa_mask;
 	compat_ulong_t			sa_flags;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9f51932bd543..894258fd44ac 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1057,10 +1057,16 @@  struct task_struct {
 #ifdef CONFIG_X86_CPU_RESCTRL
 	u32				closid;
 	u32				rmid;
 #endif
 #ifdef CONFIG_FUTEX
+    /*
+     * bottom two bits are masked
+     * 0: struct extended_robust_list_head
+     * 1: struct robust_list_head
+     * same for compat_robust_list
+     */
 	struct robust_list_head __user	*robust_list;
 #ifdef CONFIG_COMPAT
 	struct compat_robust_list_head __user *compat_robust_list;
 #endif
 	struct list_head		pi_state_list;
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index a89eb0accd5e..30c08e07f26b 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -92,10 +92,41 @@  struct robust_list_head {
 	 * so only truly owned locks will be handled.
 	 */
 	struct robust_list __user *list_op_pending;
 };
 
+/*
+ * Extensible per-thread list head:
+ *
+ * As locks are shared between processes, the futex_offset field
+ * has ABI lock-stepping issues, which the original robust_list_head
+ * structure did not anticipate. (And which prevents 32-bit/64-bit
+ * interoperability, as well as shrinking of mutex structures).
+ * This new extensible_robust_list_head allows multiple
+ * concurrent futex_offset values, chosen using the bottom 2 bits of the
+ * robust_list *next pointer, which are now masked in BOTH the old and
+ * new ABI.
+ *
+ * Note: this structure is part of the syscall ABI like
+ * robust_list_head above, and must have a different size than
+ * robust_list_head.
+ *
+ */
+struct extended_robust_list_head {
+	struct robust_list_head list_head;
+
+	/*
+	 * These relative offsets are set by user-space. They give the kernel
+	 * the relative position of the futex field to examine, based on the
+	 * bit 1 *next pointer.
+	 * The original version was insufficiently flexible. Locks are held
+	 * in shared memory between processes, and a process might want to hold
+	 * locks of two ABIs at the same time.
+	 */
+	long futex_offset2;
+};
+
 /*
  * Are there any waiters for this robust futex:
  */
 #define FUTEX_WAITERS		0x80000000
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 6d50728ef2e7..3a17d2d63178 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3396,17 +3396,20 @@  static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
 		size_t, len)
 {
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
-	/*
-	 * The kernel knows only one size for now:
-	 */
-	if (unlikely(len != sizeof(*head)))
+
+	if (unlikely(len != sizeof(struct robust_list_head) &&
+		     len != sizeof(struct extended_robust_list_head)))
 		return -EINVAL;
 
-	current->robust_list = head;
+	current->robust_list = head & 0b11UL;
+	BUILD_BUG_ON(sizeof(struct robust_list_head) ==
+		     sizeof(struct extended_robust_list_head));
+	if (len == sizeof(struct robust_list_head))
+		current->robust_list |= 1;
 
 	return 0;
 }
 
 /**
@@ -3419,10 +3422,11 @@  SYSCALL_DEFINE3(get_robust_list, int, pid,
 		struct robust_list_head __user * __user *, head_ptr,
 		size_t __user *, len_ptr)
 {
 	struct robust_list_head __user *head;
 	unsigned long ret;
+	size_t len;
 	struct task_struct *p;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
 
@@ -3439,14 +3443,18 @@  SYSCALL_DEFINE3(get_robust_list, int, pid,
 
 	ret = -EPERM;
 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
 		goto err_unlock;
 
-	head = p->robust_list;
+	head = p->robust_list & ~0b11UL;
+	if (p->robust_list & 0b11 == 0b1)
+		len = sizeof(struct robust_list_head);
+	else
+		len = sizeof(struct extended_robust_list_head);
 	rcu_read_unlock();
 
-	if (put_user(sizeof(*head), len_ptr))
+	if (put_user(len, len_ptr))
 		return -EFAULT;
 	return put_user(head, head_ptr);
 
 err_unlock:
 	rcu_read_unlock();
@@ -3524,23 +3532,26 @@  static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int p
 
 	return 0;
 }
 
 /*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes. Bit 1 chooses which
+ * futex_offset to use:
  */
 static inline int fetch_robust_entry(struct robust_list __user **entry,
 				     struct robust_list __user * __user *head,
-				     unsigned int *pi)
+				     unsigned int *pi,
+				     unsigned int *second_abi)
 {
 	unsigned long uentry;
 
 	if (get_user(uentry, (unsigned long __user *)head))
 		return -EFAULT;
 
-	*entry = (void __user *)(uentry & ~1UL);
+	*entry = (void __user *)(uentry & ~0b11UL);
 	*pi = uentry & 1;
+	*second_abi = uentry & 0b10;
 
 	return 0;
 }
 
 /*
@@ -3549,69 +3560,84 @@  static inline int fetch_robust_entry(struct robust_list __user **entry,
  *
  * We silently return on any sign of list-walking problem.
  */
 void exit_robust_list(struct task_struct *curr)
 {
-	struct robust_list_head __user *head = curr->robust_list;
-	struct robust_list __user *entry, *next_entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	unsigned int uninitialized_var(next_pi);
-	unsigned long futex_offset;
+	struct robust_list_head __user *head_ptr = curr->robust_list & ~1UL;
+	unsigned int is_extended_list = curr->robust_list & 1 == 0;
+	struct extended_robust_list_head head;
+	struct robust_list __user *entry = &head->list_head.list.next,
+					   *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip, second_abi,
+			     second_abip;
+	unsigned int uninitialized_var(next_pi),
+		     uninitialized_var(next_second_abi);
 	int rc;
 
 	if (!futex_cmpxchg_enabled)
 		return;
 
 	/*
-	 * Fetch the list head (which was registered earlier, via
-	 * sys_set_robust_list()):
+	 * fetch_robust_entry code is duplicated here to avoid excessive calls
+	 * to get_user()
 	 */
-	if (fetch_robust_entry(&entry, &head->list.next, &pi))
-		return;
-	/*
-	 * Fetch the relative futex offset:
-	 */
-	if (get_user(futex_offset, &head->futex_offset))
-		return;
-	/*
-	 * Fetch any possibly pending lock-add first, and handle it
-	 * if it exists:
-	 */
-	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
-		return;
+	if (is_extended_list) {
+		if (get_user(head, (struct extended_robust_list_head *)
+			     head_ptr))
+			return;
+	} else {
+		if (get_user(head.list_head, head_ptr))
+			return;
+	}
+
+	pi = head.list_head.list.next & 1;
+	second_abi = head.list_head.list.next & 0b10;
+	head.list_head.list.next &= ~0b11UL;
+	pip = head.list_head.list_op_pending & 1;
+	second_abip = head.list_head.list_op_pending & 0b10;
+	head.list_head.list_op_pending &= ~0b11UL;
 
 	next_entry = NULL;	/* avoid warning with gcc */
-	while (entry != &head->list) {
+	while (entry != &head->list_head.list) {
 		/*
 		 * Fetch the next entry in the list before calling
 		 * handle_futex_death:
 		 */
-		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi,
+					&next_second_abi);
 		/*
 		 * A pending lock might already be on the list, so
 		 * don't process it twice:
 		 */
-		if (entry != pending)
+		if (entry != pending) {
+			long futex_offset = second_abi ?
+					    head.futex_offset2 :
+					    head.list_head.futex_offset;
 			if (handle_futex_death((void __user *)entry + futex_offset,
 						curr, pi))
 				return;
+		}
 		if (rc)
 			return;
 		entry = next_entry;
 		pi = next_pi;
+		second_abi = next_second_abi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
 		if (!--limit)
 			break;
 
 		cond_resched();
 	}
 
-	if (pending)
+	if (pending) {
+		long futex_offset = second_abip ? head.futex_offset2 :
+						  head.list_head.futex_offset;
 		handle_futex_death((void __user *)pending + futex_offset,
 				   curr, pip);
+	}
 }
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
@@ -3707,21 +3733,25 @@  SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
 #ifdef CONFIG_COMPAT
 /*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes.
+ * Bit 1 chooses which futex_offset to use:
  */
 static inline int
-compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
-		   compat_uptr_t __user *head, unsigned int *pi)
+compat_fetch_robust_entry(compat_uptr_t *uentry,
+			  struct robust_list __user **entry,
+			  compat_uptr_t __user *head, unsigned int *pi,
+			  unsigned int *second_abi)
 {
 	if (get_user(*uentry, head))
 		return -EFAULT;
 
-	*entry = compat_ptr((*uentry) & ~1);
+	*entry = compat_ptr((*uentry) & ~0b11);
 	*pi = (unsigned int)(*uentry) & 1;
+	*second_abi = (unsigned int)(*uentry) & 0b10;
 
 	return 0;
 }
 
 static void __user *futex_uaddr(struct robust_list __user *entry,
@@ -3739,72 +3769,86 @@  static void __user *futex_uaddr(struct robust_list __user *entry,
  *
  * We silently return on any sign of list-walking problem.
  */
 void compat_exit_robust_list(struct task_struct *curr)
 {
-	struct compat_robust_list_head __user *head = curr->compat_robust_list;
-	struct robust_list __user *entry, *next_entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	unsigned int uninitialized_var(next_pi);
+	struct compat_robust_list_head __user *head = curr->compat_robust_list &
+						      ~1UL;
+	unsigned int is_extended_list = curr->compat_robust_list & 1 == 0;
+	struct compat_extended_robust_list_head head;
+	struct robust_list __user *entry = &head->list_head.list.next,
+					   *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip, second_abi,
+			     second_abip;
+	unsigned int uninitialized_var(next_pi),
+		     uninitialized_var(next_second_abi);
 	compat_uptr_t uentry, next_uentry, upending;
-	compat_long_t futex_offset;
 	int rc;
 
 	if (!futex_cmpxchg_enabled)
 		return;
 
 	/*
-	 * Fetch the list head (which was registered earlier, via
-	 * sys_set_robust_list()):
-	 */
-	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
-		return;
-	/*
-	 * Fetch the relative futex offset:
-	 */
-	if (get_user(futex_offset, &head->futex_offset))
-		return;
-	/*
-	 * Fetch any possibly pending lock-add first, and handle it
-	 * if it exists:
+	 * compat_fetch_robust_entry code is duplicated here to avoid excessive
+	 * calls to get_user()
 	 */
-	if (compat_fetch_robust_entry(&upending, &pending,
-			       &head->list_op_pending, &pip))
-		return;
+	if (is_extended_list) {
+		if (get_user(head, (struct compat_extended_robust_list_head *)
+			     head_ptr))
+			return;
+	} else {
+		if (get_user(head.list_head, head_ptr))
+			return;
+	}
+
+	pi = head.list_head.list.next & 1;
+	second_abi = head.list_head.list.next & 0b10;
+	head.list_head.list.next &= ~0b11UL;
+	pip = head.list_head.list_op_pending & 1;
+	second_abip = head.list_head.list_op_pending & 0b10;
+	head.list_head.list_op_pending &= ~0b11UL;
 
 	next_entry = NULL;	/* avoid warning with gcc */
 	while (entry != (struct robust_list __user *) &head->list) {
 		/*
 		 * Fetch the next entry in the list before calling
 		 * handle_futex_death:
 		 */
 		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
-			(compat_uptr_t __user *)&entry->next, &next_pi);
+			(compat_uptr_t __user *)&entry->next, &next_pi,
+			&next_second_abi);
 		/*
 		 * A pending lock might already be on the list, so
 		 * dont process it twice:
 		 */
 		if (entry != pending) {
+			compat_long_t futex_offset = second_abi ?
+				head.futex_offset2 :
+				head.list_head.futex_offset;
 			void __user *uaddr = futex_uaddr(entry, futex_offset);
 
 			if (handle_futex_death(uaddr, curr, pi))
 				return;
 		}
 		if (rc)
 			return;
 		uentry = next_uentry;
 		entry = next_entry;
 		pi = next_pi;
+		second_abi = next_second_abi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
 		if (!--limit)
 			break;
 
 		cond_resched();
 	}
 	if (pending) {
+		compat_long_t futex_offset = second_abip ?
+					     head.futex_offset2 :
+					     head.list_head.futex_offset;
 		void __user *uaddr = futex_uaddr(pending, futex_offset);
 
 		handle_futex_death(uaddr, curr, pip);
 	}
 }
@@ -3814,23 +3858,29 @@  COMPAT_SYSCALL_DEFINE2(set_robust_list,
 		compat_size_t, len)
 {
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
 
-	if (unlikely(len != sizeof(*head)))
+	if (unlikely(len != sizeof(struct compat_robust_list_head) &&
+		     len != sizeof(struct compat_extended_robust_list_head)))
 		return -EINVAL;
 
-	current->compat_robust_list = head;
+	current->compat_robust_list = head & ~0b11;
+	BUILD_BUG_ON(sizeof(struct compat_robust_list_head) ==
+		     sizeof(struct compat_extended_robust_list_head));
+	if (len == sizeof(struct compat_robust_list_head))
+		current->compat_robust_list |= 0b1;
 
 	return 0;
 }
 
 COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
 			compat_uptr_t __user *, head_ptr,
 			compat_size_t __user *, len_ptr)
 {
 	struct compat_robust_list_head __user *head;
+	size_t len;
 	unsigned long ret;
 	struct task_struct *p;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
@@ -3848,14 +3898,18 @@  COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
 
 	ret = -EPERM;
 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
 		goto err_unlock;
 
-	head = p->compat_robust_list;
+	head = p->compat_robust_list & ~0b11;
+	if (p->compat_robust_list & 0b11 == 0b1)
+		len = sizeof(struct compat_robust_list_head);
+	else
+		len = sizeof(struct compat_extended_robust_list_head);
 	rcu_read_unlock();
 
-	if (put_user(sizeof(*head), len_ptr))
+	if (put_user(len, len_ptr))
 		return -EFAULT;
 	return put_user(ptr_to_compat(head), head_ptr);
 
 err_unlock:
 	rcu_read_unlock();