diff mbox series

[RFC,4/6] mm: provide generic compat_sys_readahead() implementation

Message ID 20180318161056.5377-5-linux@dominikbrodowski.net (mailing list archive)
State Not Applicable
Headers show
Series [RFC,1/6] fs: provide a generic compat_sys_fallocate() implementation | expand

Commit Message

Dominik Brodowski March 18, 2018, 4:10 p.m. UTC
The compat_sys_readahead() implementations in mips, powerpc, s390, sparc
and x86 only differed based on whether the u64 parameter needed padding
and on their endianness.

Oh, and some defined the parameters as u64 or "unsigned long" which
expanded to u64, though it only expected u32 in these parameters.

This patch is part of a series which tries to remove in-kernel calls to
syscalls. On this basis, the syscall entry path can be streamlined.

Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: James Hogan <jhogan@kernel.org>
Cc: linux-mips@linux-mips.org
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: linux-s390@vger.kernel.org
Cc: David S. Miller <davem@davemloft.net>
Cc: sparclinux@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Slaby <jslaby@suse.com>
Cc: x86@kernel.org
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 arch/mips/include/asm/unistd.h         |  1 +
 arch/mips/kernel/linux32.c             |  6 ---
 arch/mips/kernel/scall64-o32.S         |  2 +-
 arch/powerpc/include/asm/unistd.h      |  1 +
 arch/powerpc/kernel/sys_ppc32.c        |  5 ---
 arch/s390/include/asm/unistd.h         |  1 +
 arch/s390/kernel/compat_linux.c        |  5 ---
 arch/s390/kernel/compat_linux.h        |  1 -
 arch/s390/kernel/syscalls/syscall.tbl  |  2 +-
 arch/sparc/include/asm/unistd.h        |  1 +
 arch/sparc/kernel/sys_sparc32.c        |  8 ----
 arch/sparc/kernel/systbls.h            |  4 --
 arch/x86/entry/syscalls/syscall_32.tbl |  2 +-
 arch/x86/ia32/sys_ia32.c               |  6 ---
 arch/x86/include/asm/sys_ia32.h        |  2 -
 arch/x86/include/asm/unistd.h          |  1 +
 include/linux/compat.h                 | 10 +++++
 mm/readahead.c                         | 81 ++++++++++++++++++++++++----------
 18 files changed, 76 insertions(+), 63 deletions(-)

Comments

Al Viro March 18, 2018, 5:40 p.m. UTC | #1
On Sun, Mar 18, 2018 at 05:10:54PM +0100, Dominik Brodowski wrote:

> +#ifdef __ARCH_WANT_COMPAT_SYS_READAHEAD
> +#if defined(__ARCH_WANT_COMPAT_SYS_WITH_PADDING) && \
> +	defined(__ARCH_WANT_LE_COMPAT_SYS)
> +COMPAT_SYSCALL_DEFINE5(readahead, int, fd, unsigned int, padding,
> +		       unsigned int, off_lo, unsigned int, off_hi,
> +		       size_t, count)
> +#elif defined(__ARCH_WANT_COMPAT_SYS_WITH_PADDING) && \
> +	!defined(__ARCH_WANT_LE_COMPAT_SYS)
> +COMPAT_SYSCALL_DEFINE5(readahead, int, fd, unsigned int, padding,
> +		       unsigned int, off_hi, unsigned int, off_lo,
> +		       size_t, count)
> +#elif !defined(__ARCH_WANT_COMPAT_SYS_WITH_PADDING) && \
> +	defined(__ARCH_WANT_LE_COMPAT_SYS)
> +COMPAT_SYSCALL_DEFINE4(readahead, int, fd,
> +		       unsigned int, off_lo, unsigned int, off_hi,
> +		       size_t, count)
> +#else /* no padding, big endian */
> +COMPAT_SYSCALL_DEFINE4(readahead, int, fd,
> +		       unsigned int, off_hi, unsigned int, off_lo,
> +		       size_t, count)
> +#endif
> +{
> +	return do_readahead(fd, ((u64) off_hi << 32) | off_lo, count);
>  }

*UGH*

static inline compat_to_u64(u32 w0, u32 w1)
{
#ifdef __BIG_ENDIAN
	return ((u64)w0 << 32) | w1;
#else
	return ((u64)w1 << 32) | w0;
#endif
}

in compat.h, then this turns into

#ifdef __ARCH_WANT_COMPAT_SYS_WITH_PADDING
COMPAT_SYSCALL_DEFINE5(readahead, int, fd, unsigned int, padding,
		       u32, off0, u32 off1,
		       compat_size_t, count)
#else
COMPAT_SYSCALL_DEFINE4(readahead, int, fd,
		       u32, off0, u32 off1,
		       compat_size_t, count)
#endif
{
	return do_readahead(fd, compat_to_u64(off0, off1), count);
}
Linus Torvalds March 18, 2018, 6:06 p.m. UTC | #2
On Sun, Mar 18, 2018 at 10:40 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> *UGH*
>
> static inline compat_to_u64(u32 w0, u32 w1)
> {
> #ifdef __BIG_ENDIAN
>         return ((u64)w0 << 32) | w1;
> #else
>         return ((u64)w1 << 32) | w0;
> #endif
> }
>
> in compat.h, then this turns into
>
> #ifdef __ARCH_WANT_COMPAT_SYS_WITH_PADDING
> COMPAT_SYSCALL_DEFINE5(readahead, int, fd, unsigned int, padding,
>                        u32, off0, u32 off1,
>                        compat_size_t, count)
> #else
> COMPAT_SYSCALL_DEFINE4(readahead, int, fd,
>                        u32, off0, u32 off1,
>                        compat_size_t, count)
> #endif

No. This is still too ugly to live.

What *may* be acceptable is if architectures defined something like this:

x86:

    /* Little endian registers - low bits first, no padding for odd
register numbers necessary */
    #define COMPAT_ARG_64BIT(x) unsigned int x##_lo, unsigned int x##_hi
    #define COMPAT_ARG_64BIT_ODD(x) COMPAT_ARG_64BIT(x)

ppc BE:

    /* Big-endian registers - high bits first, odd argument pairs
padded up to the next even register */
    #define COMPAT_ARG_64BIT(x) unsigned int x##_hi, unsigned int x##_lo
    #define COMPAT_ARG_64BIT_ODD(x) unsigned int  x##_padding,
COMPAT_ARG_64BIT(x)

and then we can do

  COMPAT_SYSCALL_DEFINE5(readahead, int, fd,
COMPAT_ARG_64BIT_ODD(off), compat_size_t, count)
  {
      return do_readahead(fd, off_lo + ((u64)off_hi << 64), count);
  }

which at least looks reasonably legible, and has *zero* ifdef's anywhere.

I do *not* want to see those disgusting __ARCH_WANT_LE_COMPAT_SYS
things and crazy #ifdef's in code.

So either let the architectures do their own trivial wrappers
entirely, or do something clean like the above. Do *not* do
#ifdef'fery at the system call declaration time.

Also note that the "ODD" arguments may not be the ones that need
padding. I could easily see a system call argument numbering scheme
like

   r0 - system call number
   r1 - first argument
   r2 - second argument
   ...

and then it's the *EVEN* 64-bit arguments that would need the padding
(because they are actually odd in the register numbers). The above
COMPAT_ARG_64BIT[_ODD]() model allows for that too.

Of course, if some architecture then has some other arbitrary rules (I
could see register pairing rules that aren't the usual "even register"
ones), then such an architecture would really have to have its own
wrapper, but the above at least would handle the simple cases, and
doesn't look disgusting to use.

                   Linus

PS. It is possible that we should then add a

   #define COMPAT_ARG_64BIT_VAL(x) (x_##lo + ((u64)x_##hi << 32))

and then do

  COMPAT_SYSCALL_DEFINE5(readahead, int, fd,
COMPAT_ARG_64BIT_ODD(off), compat_size_t, count)
  {
      return do_readahead(fd, COMPAT_ARG_64BIT_VAL(off), count);
  }

because then we could perhaps generate the *non*compat system calls
this way too, just using

    #define COMPAT_ARG_64BIT(x) unsigned long x
    #define COMPAT_ARG_64BIT_VAL(x) (x)

instead (there might also be a "compat" mode that actually has access
to 64-bit registers, like x32 does, but I suspect it would have other
issues).
Al Viro March 18, 2018, 6:18 p.m. UTC | #3
On Sun, Mar 18, 2018 at 11:06:42AM -0700, Linus Torvalds wrote:

> and then we can do
> 
>   COMPAT_SYSCALL_DEFINE5(readahead, int, fd,
> COMPAT_ARG_64BIT_ODD(off), compat_size_t, count)
>   {
>       return do_readahead(fd, off_lo + ((u64)off_hi << 64), count);
>   }
> 
> which at least looks reasonably legible, and has *zero* ifdef's anywhere.

It's a bit more complicated, but...

> I do *not* want to see those disgusting __ARCH_WANT_LE_COMPAT_SYS
> things and crazy #ifdef's in code.

Absolutely.  Those piles of ifdefs are unreadable garbage.

> So either let the architectures do their own trivial wrappers
> entirely, or do something clean like the above. Do *not* do
> #ifdef'fery at the system call declaration time.
> 
> Also note that the "ODD" arguments may not be the ones that need
> padding. I could easily see a system call argument numbering scheme
> like
> 
>    r0 - system call number
>    r1 - first argument
>    r2 - second argument
>    ...
> 
> and then it's the *EVEN* 64-bit arguments that would need the padding
> (because they are actually odd in the register numbers). The above
> COMPAT_ARG_64BIT[_ODD]() model allows for that too.
> 
> Of course, if some architecture then has some other arbitrary rules (I
> could see register pairing rules that aren't the usual "even register"
> ones), then such an architecture would really have to have its own
> wrapper, but the above at least would handle the simple cases, and
> doesn't look disgusting to use.

I'd done some digging in that area, will find the notes and post.
Basically, we can even avoid the odd/even annotations and have
COMPAT_SYSCALL_DEFINE... sort it out.  It's a bit more hairy than
I would like at this stage in the cycle, though.  I'll see if it can
be done without too much PITA.

However, there still are genuinely speci^Wfucked in head cases - see
e.g. this sad story:
commit ab8a261ba5e2dd9206da640de5870cc31d568a7c
Author: Helge Deller <deller@gmx.de>
Date:   Thu Jul 10 18:07:17 2014 +0200

    parisc: fix fanotify_mark() syscall on 32bit compat kernel

Those certainly ought to stay in arch/*
Al Viro March 19, 2018, 4:23 a.m. UTC | #4
On Sun, Mar 18, 2018 at 06:18:48PM +0000, Al Viro wrote:

> I'd done some digging in that area, will find the notes and post.

OK, found:

We have two ABIs in the game - syscall and normal C.  The latter
(for all targets we support) can be described in the following
terms:
	* there is a series of word-sized objects used to pass
arguments, some in registers, some on stack.  Arguments are
mapped on that sequence.  Anything up to word size simply takes
the next available word, so on 64bit it's all there is - nth
argument goes into the nth object, types simply do not matter.
On 32bit it's not that simple - there 64bit arguments occupy
two objects.  They are still picked from the same sequence;
on little-endian the lower half goes first, on big-endian -
the higher one.  For some architectures that's all there is to it.
However, on quite a few there's an extra complication - not every
pair can be used for 64bit value.  Rules for those are arch-dependent.
One variety is "pairs should be aligned", i.e. "if we'd consumed
an odd number of slots, add a dummy one before eating a pair".
Another is "don't let a pair span the registers/stack boundary";
surprisingly enough, that's not universal.

The syscall ABI can considerably differ from C one.  First of all,
we really do *not* want to pass anything on stack - it's a major
headache at syscall entry.  See mips/o32 for the scale of that headache.
Not fun.  OTOH, the registers that can be used are a limited resource.
i386 can't pass more than 6 words and that has served as an upper
limit.  If we encode the argument sizes (W - word, D - 64bit) we
have the following variants among the syscalls:
	* no arguments at all (SYSCALL_DEFINE0)
	* W (SYSCALL_DEFINE1)
	* WW (SYSCALL_DEFINE2)
	* WWW (SYSCALL_DEFINE3)
	* WWWW (SYSCALL_DEFINE4)
	* WWWWW (SYSCALL_DEFINE5)
	* WWWWWW (SYSCALL_DEFINE6)
	* WD (SYSCALL_DEFINE2, truncate64, ftruncate64)
	* DWW (SYSCALL_DEFINE3, lookup_dcookie)
	* WDW (SYSCALL_DEFINE3, readahead)
	* WWWD (SYSCALL_DEFINE4, pread64, pwrite64)
	* WWDD (SYSCALL_DEFINE4, fallocate, sync_file_range2)
	* WDDW (SYSCALL_DEFINE4, sync_file_range, fadvise64_64)
	* WDWW (SYSCALL_DEFINE4, fadvise64)
	* WWDWW (SYSCALL_DEFINE5, fanotify_mark)

The list for each long long-passing variant might be incomplete, but
they really are rare.  And a source of headache; not just for biarch
architectures - e.g. c6x and metag are not biarch and these syscalls
are exactly what stinks them up.

One thing we *really* don't want is syscall-dependent mapping from
syscall ABI registers to C ABI sequence.  Inside a syscall (or in
per-syscall glue) - sure, we can do it (if not happily).  In the
syscall dispatcher - fuck no, too much PITA.  mips/o32 used to be
a horrible example of why not, then they went for "copy from userland
stack whether we need it or not"...

For simple syscalls (first 7 classes in the above, each argument fits
into word) we simply map registers to the first 6 slots of the sequence
and we are done.  It's bloody tempting to use the same mapping for
the rest - use the same code that calls simple syscalls and have it
call sys_readahead() instead of sys_mkdir(), etc.  For something like
x86 or sparc that works perfectly - these guys have no padding for 64bit
arguments, so we are good (provided that userland sets those registers
sanely, that is).

OTOH, consider arm.  There we have
	* r0, r1, r2, r3, [sp,#8], [sp,#12], [sp,#16]... is the sequence
of objects used to pass arguments
	* 32bit and less - pick the next available slot
	* 64bit - skip a slot if we'd already taken an odd number, then use
the next two slots for lower and upper 32 bits of the argument.

So our classes take
simple n-argument:	0 to 6 slots
WD			4 slots
DWW			4 slots
WDW			5 slots
WWDD			6 slots
WDWW			5 slots
WWWD			6 slots
WWDWW			6 slots
WDDW			7 slots (!)  Also ****, !!!!, !@#!@#!@#!# and other nice
and well-deserved comments from arch maintainers, some of them even printable:
/* It would be nice if people remember that not all the world's an i386
   when they introduce new system calls */
SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
                                 loff_t, offset, loff_t, nbytes)

No idea why that hadn't been done to fadvise64_64() - that got
/*
 * Since loff_t is a 64 bit type we avoid a lot of ABI hassle
 * with a different argument ordering.
 */
asmlinkage long sys_arm_fadvise64_64(int fd, int advice,
                                     loff_t offset, loff_t len)
{
        return sys_fadvise64_64(fd, offset, len, advice);
}
long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low,
                      u32 len_high, u32 len_low)
{
        return sys_fadvise64(fd, (u64)offset_high << 32 | offset_low,
                             (u64)len_high << 32 | len_low, advice);
}
and
asmlinkage long parisc_fadvise64_64(int fd,
                        unsigned int high_off, unsigned int low_off,
                        unsigned int high_len, unsigned int low_len, int advice)
{
        return sys_fadvise64_64(fd, (loff_t)high_off << 32 | low_off,
                        (loff_t)high_len << 32 | low_len, advice);
}
consistency, shmonsistency...  And yes, glibc has
#ifdef __ASSUME_FADVISE64_64_6ARG
  int ret = INTERNAL_SYSCALL_CALL (fadvise64_64, err, fd, advise,
                                   SYSCALL_LL64 (offset), SYSCALL_LL64 (len));
#else
  int ret = INTERNAL_SYSCALL_CALL (fadvise64_64, err, fd,
                                   __ALIGNMENT_ARG SYSCALL_LL64 (offset),
                                   SYSCALL_LL64 (len), advise);
#endif
with __ASSUME_FADVISE64_64_6ARG defined for arm and ppc...
xtensa, BTW, has
asmlinkage long xtensa_fadvise64_64(int fd, int advice,
                unsigned long long offset, unsigned long long len)
{
        return sys_fadvise64_64(fd, offset, len, advice);
}
In other words, same solution as arm and ppc.  No xtensa support in
glibc, AFAICS...

Anyway, padding rules for 32bit ones, with not too many arguments:

* arc cris frv h8300 i386 m32r m68k microblaze mn10300 nios2 openrisc
riscv sparc - no padding.
Note that e.g. frv *does* have a "don't let it cross regs/stack
boundary" rule, but it has 6 register slots, so for syscalls it
doesn't matter.  OTOH, sparc (and m32r, and...) really won't
care about regs/stack boundary.

* arm mips parisc ppc xtensa - pad to even number of words consumed;
since the number of registers in the sequence is even for all those,
there is nothing special going on at the edge.  WDDW is a bitch for
that group; mips goes for "fuck it, we copy userland stack anyway",
the rest try to cope in various ways.

* s390 - 5 register slots, padding on the crossing the regs/stack
boundary and nowhere else (i.e. pad if exactly 4 words had been
consumed).  WWDD runs afoul of that one.

* bfin - similar, except that we pad on exactly 2 words.  Same
story as with s390, only we have 3 register slots.  Headache
on: WWDWW and WWDD.

* c6x - strange beast, that; registers are 32bit, but for the
argument passing purposes it's using 64bit register pairs, filling
only the lower half when passing a 32bit argument.

* sh - really weird.  If 64bit would span the registers/stack
boundary (4 register slots there), it's done on stack... and
the first 32bit argument after that will eat the remaining
register.  Out of our classes it affects WWWD (as if there
had been a padding) and WDDW (as if the last two arguments had
switched places).

FWIW, it looks like we have several different issues mixed here

1) teaching COMPAT_SYSCALL_DEFINE to do the right thing in case
when 64bit arguments are present.  It's not terribly hard (sh
is not biarch, thankfully), but we'd better watch out carefully
when unifying - "fucker eats 7 slots" cases are irregular by
definition and solutions used for those differ between the
architectures.  It won't be pretty, though - we can get something
like
asmlinkage long compat_sys_truncate64(long a0, long a1, long a2, long a3, long a4, long a5)
{
	CS_truncate64((const char __user *)a0, ((u64)a2<<32)|(u64)a3);
}
static inline long CS_truncate64(const char __user *path, loff_t length)
{
	return sys_truncate64(path, length);
}
out of
COMPAT_SYSCALL_DEFINE2(trucate64, const char __user *, path, loff_t, length)
{
	return sys_truncate(path, length);
}
but the damn thing will be 6-argument and expressions for arguments will
expand to something equal to the forms above, but they'll be choke-full
of conditional expressions with constant 0 or 1 for selectors.  What we can't
do is evaluation of "is it long long" at macro expansion time - we can't tell
int, loff_t from int, pid_t until typedefs had been handled, at which point
the parse tree is cast in stone.  It will have to be 6-argument.
Alternatively, we could do COMPAT_SYSCALL_DEFINE_WD et.al. so that preprocessor
would get that information directly from us.

[snip the preprocessor horrors - the sketches I've got there are downright obscene]

2) s390 compat wrappers ;-/  I still wonder if we would be better
off with SYSCALL_DEFINE on s390 defining a wrapper with COMPAT_SYSCALL_DEFINE
(if present) redefining it, throwing the original away.  That could be done
with some amount of linker trickery.

3) sparc wrappers in sys32.S.  Most of those got killed off back in 2012,
what remained was
SIGN1(sys32_getrusage, compat_sys_getrusage, %o0)
SIGN1(sys32_readahead, compat_sys_readahead, %o0)
SIGN2(sys32_fadvise64, compat_sys_fadvise64, %o0, %o4)
SIGN2(sys32_fadvise64_64, compat_sys_fadvise64_64, %o0, %o5)
SIGN1(sys32_clock_nanosleep, compat_sys_clock_nanosleep, %o1)
SIGN1(sys32_timer_settime, compat_sys_timer_settime, %o1)
SIGN1(sys32_io_submit, compat_sys_io_submit, %o1)
SIGN1(sys32_mq_open, compat_sys_mq_open, %o1)
SIGN1(sys32_select, compat_sys_select, %o0)
SIGN3(sys32_futex, compat_sys_futex, %o1, %o2, %o5)
SIGN2(sys32_sendfile, compat_sys_sendfile, %o0, %o1)
SIGN1(sys32_recvfrom, compat_sys_recvfrom, %o0)
SIGN1(sys32_recvmsg, compat_sys_recvmsg, %o0)
SIGN1(sys32_sendmsg, compat_sys_sendmsg, %o0)
SIGN2(sys32_sync_file_range, compat_sync_file_range, %o0, %o5)
SIGN1(sys32_vmsplice, compat_sys_vmsplice, %o0)
with some of those killed off later and (completely pointless)
SIGN2(sys32_renameat2, sys_renameat2, %o0, %o2)
added.

Since then clock_nanosleep(), timer_settime(), io_submit(), mq_open(),
select(), recvfrom(), recvmsg(), sendmsg(), getrusage(), sync_file_range(),
futex() and vmsplice() got switched to COMPAT_SYSCALL_DEFINE, making those
SIGN... wrappers pointless.  That leaves
SIGN1(sys32_readahead, compat_sys_readahead, %o0)
SIGN2(sys32_fadvise64, compat_sys_fadvise64, %o0, %o4)
SIGN2(sys32_fadvise64_64, compat_sys_fadvise64_64, %o0, %o5)
all of which are right there in arch/sparc/kernel/sys_sparc32.c and
trivially converted to COMPAT_SYSCALL_DEFINE.  Which would kill all the
SIGN... bunch off, leaving there sys_mmap() and sys_socketcall()

4) stuff in arch really needs a good look.  And not just biarch -
I could've easily missed some "takes a 64bit argument" cases in
that area.  Moreover, it might be a good idea to have all syscall
tables generated a-la x86 and s390 ones are, ideally with the same
format of input data for all of them...
Ingo Molnar March 19, 2018, 9:29 a.m. UTC | #5
* Al Viro <viro@ZenIV.linux.org.uk> wrote:

> On Sun, Mar 18, 2018 at 06:18:48PM +0000, Al Viro wrote:
> 
> > I'd done some digging in that area, will find the notes and post.
> 
> OK, found:

Very nice writeup - IMHO this should go into Documentation/!

> OTOH, consider arm.  There we have
> 	* r0, r1, r2, r3, [sp,#8], [sp,#12], [sp,#16]... is the sequence
> of objects used to pass arguments
> 	* 32bit and less - pick the next available slot
> 	* 64bit - skip a slot if we'd already taken an odd number, then use
> the next two slots for lower and upper 32 bits of the argument.
> 
> So our classes take
> simple n-argument:	0 to 6 slots
> WD			4 slots
> DWW			4 slots
> WDW			5 slots
> WWDD			6 slots
> WDWW			5 slots
> WWWD			6 slots
> WWDWW			6 slots
> WDDW			7 slots (!)  Also ****, !!!!, !@#!@#!@#!# and other nice
> and well-deserved comments from arch maintainers, some of them even printable:
> /* It would be nice if people remember that not all the world's an i386
>    when they introduce new system calls */
> SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
>                                  loff_t, offset, loff_t, nbytes)

Such idiosyncratic platform quirks that have an impact on generic code should be 
as self-maintaining as possible: i.e. there should be a build time warning even on 
x86 if someone introduces a new, suboptimally packed system call.

Otherwise we'll have such incidents again and again as new system calls get added.

> [snip the preprocessor horrors - the sketches I've got there are downright obscene]

I still think we should consider creating a generic facility and a tool: which 
would immediately and automatically add new system calls to *every* architecture - 
or which would initially at least check these syscall ABI constraints.

I.e. this would start with a new generic kernel facility that warns about 
suboptimal new system call argument layouts on every architecture, not just on the 
affected ones.

That's a significant undertaking but should be possible to do.

Once such a facility is in place all the existing old mess is still a PITA, but 
should be manageable eventually - as no new mess is added to it.

IMHO that's the only thing that could break the somewhat deadly current dynamic of 
system call mappings mess. Complaining about people not knowing about quirks won't 
help.

One way to implement this would be to put the argument chain types (string) and 
sizes (int) into a special debug section which isn't included in the final kernel 
image but which can be checked at link time.

For example this attempt at creating a new system call:

  SYSCALL_DEFINE3(moron, int, fd, loff_t, offset, size_t, count)

... would translate into something like:

	.name = "moron", .pattern = "WWW", .type = "int",    .size = 4,
	.name = NULL,                      .type = "loff_t", .size = 8,
	.name = NULL,                      .type = "size_t", .size = 4,
	.name = NULL,                      .type = NULL,     .size = 0,     /* end of parameter list */

i.e. "WDW". The build-time constraint checker could then warn about:

  # error: System call "moron" uses invalid 'WWW' argument mapping for a 'WDW' sequence
  #        please avoid long-long arguments or use 'SYSCALL_DEFINE3_WDW()' instead

Each architecture can provide its own syscall parameter checking logic. Both 
'stack boundary' and parameter packing rules would be straightforward to express 
if we had such a data structure.

Also note that this tool could also check for optimum packing, i.e. if the new 
system call is defined as:

  SYSCALL_DEFINE3_WDW(moron, int, fd, loff_t, offset, size_t, count)

... would translate to something like:

	.name = "moron", .pattern = "WDW", .type = "int",    .size = 4,
	.name = NULL,                      .type = "loff_t", .size = 8,
	.name = NULL,                      .type = "size_t", .size = 4,
	.name = NULL,                      .type = NULL,     .size = 0,     /* end of parameter list */

where the tool would print out this error:

  # error: System call "moron" uses suboptimal 'WDW' argument mapping instead of 'WWD'

there would be a whitelist of existing system calls that are already using an 
suboptimal argument order - but the warnings/errors would trigger for all new 
system calls.

But adding non-straight-mapped system calls would be the exception in any case.

Such tooling could also do other things, such as limit the C types used for system 
call defines to a well-chosen set of ABI-safe types, such as:

      3  key_t
      3  uint32_t
      4  aio_context_t
      4  mqd_t
      4  timer_t
     10  clockid_t
     10  gid_t
     10  loff_t
     10  long
     10  old_gid_t
     10  old_uid_t
     10  umode_t
     11  uid_t
     31  pid_t
     34  size_t
     69  unsigned int
    130  unsigned long
    226  int

This would also allow us some cleanups as well, such as dropping the pointless 
'const' from arithmetic types in syscall definitions for example.

etc.

Basically this tool would be a secondary parser of the syscall arguments, with 
most of the parsing and type sizing difficulties solved by the C parser already.

I think this problem could be much more sanely solved via annotations and a bit of 
tooling, than trying to trick CPP into doing this for us (which won't really work 
in any case).

Thanks,

	Ingo
Al Viro March 19, 2018, 11:23 p.m. UTC | #6
On Mon, Mar 19, 2018 at 10:29:20AM +0100, Ingo Molnar wrote:
> 
> * Al Viro <viro@ZenIV.linux.org.uk> wrote:
> 
> > On Sun, Mar 18, 2018 at 06:18:48PM +0000, Al Viro wrote:
> > 
> > > I'd done some digging in that area, will find the notes and post.
> > 
> > OK, found:
> 
> Very nice writeup - IMHO this should go into Documentation/!

If you want to turn that into something printable - more power to you...
FWIW, I think we need to require per-architecture descriptions of ABI
for all architectures.  Something along the lines of

alpha:
C ABI: 64bit, location sequence is ($16, $17, $18, $19, $20, $21, stack)
No arg padding (as for all 64bit).  Stack pointer in $30, return value
in $0.
Syscall ABI: syscall number in $0, arg slots filled from $16, $17, $18, $19,
$20, $21.  Return value in $0, error is reported as 1 in $19.  Saved
syscall number is used as a flag for __force_successful_syscall_return()
purposes - sticking 0 there inhibits the effect of negative return value.

arm:
C ABI: 32bit, location sequence is (r0, r1, r2, r3, stack).  Arg padding
for 64bit args: to even slot.  Stack pointer in sp, return value in r0
Syscall ABI, EABI variant: syscall number in r7.
Syscall ABI, OABI variant: syscall number encoded into insn.
Syscall ABI (both variants): arg slots filled from r0, r1, r2, r3, r4, r5.
Return value in r0.  It's not quite a biarch (support of e.g. ioctl
handling is absent, etc.; basic syscalls are handled, but that's it).

etc.  Ideally the information about callee-saved registers, syscall restart
logics, etc. should also go there.  I'm sick and tired of digging though
the asm glue of unfamiliar architectures ;-/

Another relevant piece of information (especially for biarch) is how
should sub-word arguments be normalized.  E.g. on amd64 both int and long
are passed in 64bit words and function that expects an int does *not*
care about the upper 32 bits.  If you have long f(int a) {return a;},
it will sign-extend the argument.  On ppc, OTOH, it won't - the caller
is responsible for having the bits 31..63 all equal.

That used to be a source of considerable PITA - e.g. kill(2) used to
require a compat wrapper on ppc.  These days SYSCALL_DEFINE and
COMPAT_SYSCALL_DEFINE glue takes care of normalizations.  However
it doesn't apply for the stuff that does *not* use ...DEFINE and
for use of native syscalls on biarch we need a bit more.  Consider
e.g. 32bit syscall on sparc64 wanting to use the native counterpart.
Arguments that are <= 32bit in both ABIs are fine - normalizations
will take care of them.  Anything that is 64bit in both ABIs means
that we will need compat anyway - the argument needs to be recombined
from two registers into one.  The headache comes from
	* signed long
	* unsigned long
	* pointers
Those are word-sized and we need to normalize.  Solution before
SYSCALL_DEFINE glue: have upper halves forcibly zeroed on entry (which
normalizes unsigned long and pointers) and then sign-extend every
signed int and signed long in per-syscall glue (that zeroing is
guaranteed to denormalize int arguments).  Once SYSCALL_DEFINE started
to do normalization we disposed on the need to do separate wrappers
for int arguments; that still leaves us with signed long ones, but
	* they are very rare
	* most of the syscalls passing them need compat for more
serious reasons anyway.
There are only two exceptions - bdflush(2) and pciconfig_iobase(2).
The latter doesn't exist on sparc, the former ignores its signed long
argument completely.  So we are left with "zero upper halves of all
argument-bearing registers upon the entry and have per-syscall glue
take care of the rest".

For s390 the situation is nastier - normalization for signed and
unsigned long is the same as usual, but pointers might have junk
in bit 31.  IOW, for anything with pointer in arguments we can't
just use the native syscall.  As the result, s390 doesn't bother
with zeroing upper halves in syscall dispatcher and does private
mini-wrappers for native syscalls with pointer/long/ulong arguments.

That kind of crap really needs to be documented - RTFS becomes
somewhat painful when it involves tens of assemblers *and*
missing ABI documents (try to locate one for something embedded -
great motivation for expanding vocabulary, that) ;-/

FWIW, SYSCALL_DEFINE and its ilk (COMPAT_SYSCALL_DEFINE,
s390 COMPAT_SYSCALL_WRAP, etc.) are all about stepping over the
ABI gap - we've got some values from userland caller and we
need to turn that into a valid C function call that would
satisfy C ABI constraints.  Some amount of normalization might've
been done by syscall dispatcher; this stuff does the rest on
per-function basis.

> One way to implement this would be to put the argument chain types (string) and 
> sizes (int) into a special debug section which isn't included in the final kernel 
> image but which can be checked at link time.

Umm...  Possible, but I actually believe that we can do that without
debug info.

WARNING: AVERT YOUR EYES IF YOU HAVE A WEAK STOMACH.  I'm _not_ saying that
the trickery below is a good idea, no need to break out a straightjacket.
It does have some merits, but in the current form it's ugly as hell and
almost certainly not fit for inclusion.  Consider that as a theoretical
exercise, please.

Again, don't read further if you are easily squicked.  You've been warned.
===============================================================================
Look:
* we can generate
	enum {S0 = __TYPE_IS_LL(int),
	      S1 = __TYPE_IS_LL(loff_t),
	      S2 = __TYPE_IS_LL(size_t)};
at preprocessor level (inside the compat wrapper being built).  Calculations
will be done at compile time, of course, but those _are_ constants (0, 1, 0 in
our case).
* we can generate
	enum {OFF0 = 0, OFF1 = STEP(OFF0 + S0, S1), OFF2 = STEP(OFF1 + S1, S2)};
at the same time, with STEP(base, big) defined as base + 1 for something like
x86, (big ? ((base) | 1) + 1 : (base) + 1) for arm and friends,
base + 1 + (big && base == 3) for s390.  Again, compile-time calculations.
For x86 - (0, 1, 3), for arm - (0, 2, 4), etc.
* we can generate
	C_S_moron((__force int)(S0 ? PAIR(OFF0) : ARG(OFF0)),
		  (__force loff_t)(S1 ? PAIR(OFF1) : ARG(OFF1)),
		  (__force size_t)(S2 ? PAIR(OFF2) : ARG(OFF2)));
in the body of wrapper.  With PAIR(n) defined as either ((u64)ARG(n) << 32) | ARG(n+1)
or ((u64)ARG(n+1) << 32) | ARG(n) (depending upon endianness) and ARG(n)
defined as (n == 0 ? a0 : n == 1 ? a1 : n == 2 ? a2 : n == 3 ? a3 :
	    n == 4 ? a4 : n == 5 ? a5 : a6)
Note that each of those suckers expands to a cascade of conditional
expressions with all conditions being integer constant expressions,
no more than one of those being true.  In our case that crap will fold into
	C_S_moron((__force int)a0,
		  (__force loff_t)(((u64)a2 << 32)|a1),
		  (__force size_t)a3);
as soon as optimizations at the level of 0 ? x : y => y and 1 ? x : y => x
are done.  And we have, in effect,

static inline long C_S_moron(int, loff_t, size_t);
long compat_SyS_moron(long a0, long a1, long a2, long a3, long a4, long a5, long a6)
{
	return C_S_moron((__force int)a0,
		  (__force loff_t)(((u64)a2 << 32)|a1),
		  (__force size_t)a3);
}
static inline long C_S_moron(int fd, loff_t offset, size_t count)
{
	whatever body you had for it
}

That - from
COMPAT_SYSCALL_DEFINE3(moron, int, fd, loff_t, offset, size_t, count)
{
	whatever body you had for it
}

We can use similar machinery for SYSCALL_DEFINE itself, so that
SyS_moron() would be defined with (long, long, long, long, long, long)
as arguments and not (long, long long, long) as we have now.

It's not impossible to do.  It won't be pretty, but that use of local
enums allows to avoid unbearably long expansions.

Benefits:
	* all SyS... wrappers (i.e. the thing that really ought to
go into syscall tables) have the same type.
	* we could have SYSCALL_DEFINE produce a trivial compat
wrapper, have explicit COMPAT_SYSCALL_DEFINE discard that thing
and populate the compat syscall table *entirely* with compat_SyS_...,
letting the linker sort it out.  That way we don't need to keep
track of what can use native and what needs compat in each compat
table on biarch.
	* s390 compat wrappers would disappear with that approach.
	* we could even stop generating sys_... aliases - if
syscall table is generated by slapping SyS_... or compat_SyS_...
on the name given there, we don't need to _have_ those sys_...
things at all.  All SyS_... would have the same type, so the pile
in syscalls.h would not be needed - we could generate the externs
at the same time we generate the syscall table.

And yes, it's a high-squick approach.  I know and I'm not saying
it's a good idea.  OTOH, to quote the motto of philosophers and
shell game operators, "there's something in it"...

> For example this attempt at creating a new system call:
> 
>   SYSCALL_DEFINE3(moron, int, fd, loff_t, offset, size_t, count)
> 
> ... would translate into something like:
> 
> 	.name = "moron", .pattern = "WWW", .type = "int",    .size = 4,
> 	.name = NULL,                      .type = "loff_t", .size = 8,
> 	.name = NULL,                      .type = "size_t", .size = 4,
> 	.name = NULL,                      .type = NULL,     .size = 0,     /* end of parameter list */
> 
> i.e. "WDW". The build-time constraint checker could then warn about:
> 
>   # error: System call "moron" uses invalid 'WWW' argument mapping for a 'WDW' sequence
>   #        please avoid long-long arguments or use 'SYSCALL_DEFINE3_WDW()' instead

... if you do 32bit build.

> Each architecture can provide its own syscall parameter checking logic. Both 
> 'stack boundary' and parameter packing rules would be straightforward to express 
> if we had such a data structure.
> 
> Also note that this tool could also check for optimum packing, i.e. if the new 
> system call is defined as:
> 
>   SYSCALL_DEFINE3_WDW(moron, int, fd, loff_t, offset, size_t, count)

> Such tooling could also do other things, such as limit the C types used for system 
> call defines to a well-chosen set of ABI-safe types, such as:
> 
>       3  key_t
>       3  uint32_t
>       4  aio_context_t
>       4  mqd_t
>       4  timer_t
>      10  clockid_t
>      10  gid_t
>      10  loff_t
>      10  long
Uhhuh - ABI-safe is not how I would describe that.  Sodding PITA, fortunately
rare, is more like it...

>      10  old_gid_t
>      10  old_uid_t
>      10  umode_t
>      11  uid_t
>      31  pid_t
>      34  size_t
>      69  unsigned int
>     130  unsigned long
>     226  int
> 
> This would also allow us some cleanups as well, such as dropping the pointless 
> 'const' from arithmetic types in syscall definitions for example.
> 
> etc.
> 
> Basically this tool would be a secondary parser of the syscall arguments, with 
> most of the parsing and type sizing difficulties solved by the C parser already.
> 
> I think this problem could be much more sanely solved via annotations and a bit of 
> tooling, than trying to trick CPP into doing this for us (which won't really work 
> in any case).

See above.  It *can* be done.  Not by cpp alone, of course - you need compiler
involvement.
Dominik Brodowski March 20, 2018, 8:56 a.m. UTC | #7
On Mon, Mar 19, 2018 at 11:23:42PM +0000, Al Viro wrote:
> static inline long C_S_moron(int, loff_t, size_t);
> long compat_SyS_moron(long a0, long a1, long a2, long a3, long a4, long a5, long a6)
> {
> 	return C_S_moron((__force int)a0,
> 		  (__force loff_t)(((u64)a2 << 32)|a1),
> 		  (__force size_t)a3);
> }
> static inline long C_S_moron(int fd, loff_t offset, size_t count)
> {
> 	whatever body you had for it
> }
> 
> That - from
> COMPAT_SYSCALL_DEFINE3(moron, int, fd, loff_t, offset, size_t, count)
> {
> 	whatever body you had for it
> }
> 
> We can use similar machinery for SYSCALL_DEFINE itself, so that
> SyS_moron() would be defined with (long, long, long, long, long, long)
> as arguments and not (long, long long, long) as we have now.

That would be great, as it would allow to use a struct pt_regs * based
syscall calling convention on i386 as well, and not only on x86-64, right?

> It's not impossible to do.  It won't be pretty, but that use of local
> enums allows to avoid unbearably long expansions.
> 
> Benefits:
> 	* all SyS... wrappers (i.e. the thing that really ought to
> go into syscall tables) have the same type.
> 	* we could have SYSCALL_DEFINE produce a trivial compat
> wrapper, have explicit COMPAT_SYSCALL_DEFINE discard that thing
> and populate the compat syscall table *entirely* with compat_SyS_...,
> letting the linker sort it out.  That way we don't need to keep
> track of what can use native and what needs compat in each compat
> table on biarch.
> 	* s390 compat wrappers would disappear with that approach.
> 	* we could even stop generating sys_... aliases - if
> syscall table is generated by slapping SyS_... or compat_SyS_...
> on the name given there, we don't need to _have_ those sys_...
> things at all.  All SyS_... would have the same type, so the pile
> in syscalls.h would not be needed - we could generate the externs
> at the same time we generate the syscall table.
> 
> And yes, it's a high-squick approach.  I know and I'm not saying
> it's a good idea.  OTOH, to quote the motto of philosophers and
> shell game operators, "there's something in it"...

... and getting rid of all in-kernel calls to sys_*() is needed as
groundwork for that. So I'll continue to do that "mindless" conversion
first. On top of that, three things (which are mostly orthogonal to each
other) can be done:

1) ptregs system call conversion for x86-64

   Original implementation by Linus exists; needs a bit of tweaking
   but should be doable soon. Need to double-check it does the right
   thing for IA32_EMULATION, though.

2) re-work initramfs etc. code to not use in-kernel equivalents of
   syscalls, but operate on the VFS level instead.

3) re-work SYSCALL_DEFINEx() / COMPAT_SYSCALL_DEFINEx() based on
   your suggestions.

Does that sound sensible?

Thanks,
	Dominik
Ingo Molnar March 20, 2018, 8:59 a.m. UTC | #8
* Al Viro <viro@ZenIV.linux.org.uk> wrote:

> > For example this attempt at creating a new system call:
> > 
> >   SYSCALL_DEFINE3(moron, int, fd, loff_t, offset, size_t, count)
> > 
> > ... would translate into something like:
> > 
> > 	.name = "moron", .pattern = "WWW", .type = "int",    .size = 4,
> > 	.name = NULL,                      .type = "loff_t", .size = 8,
> > 	.name = NULL,                      .type = "size_t", .size = 4,
> > 	.name = NULL,                      .type = NULL,     .size = 0,     /* end of parameter list */
> > 
> > i.e. "WDW". The build-time constraint checker could then warn about:
> > 
> >   # error: System call "moron" uses invalid 'WWW' argument mapping for a 'WDW' sequence
> >   #        please avoid long-long arguments or use 'SYSCALL_DEFINE3_WDW()' instead
> 
> ... if you do 32bit build.

Yeah - but the checking tool could do a 32-bit sizing of the types and thus the 
checks would work on all arches and on all bitness settings.

I don't think doing part of this in CPP is a good idea:

 - It won't be able to do the full range of checks

 - Wrappers should IMHO be trivial and open coded as much as possible - not hidden
   inside several layers of macros.

 - There should be a penalty for newly introduced, badly designed system call
   ABIs, while most CPP variants I can think of will just make bad but solvable 
   decisions palatable, AFAICS.

I.e. I think the way out of this would be two steps:

 1) for new system calls: hard-enforce the highest quality at the development
    stage and hard-reject crap. No new 6-parameter system calls or badly ordered
    arguments. The tool would also check new extensions to existing system calls, 
    i.e. no more "add a crappy 4th argument to an existing system call that works 
    on x86 but hurts MIPS".

 2) for old legacies: cleanly open code all our existing legacies and weird
    wrappers. No new muck will be added to it so the line count does not matter.

... is there anything I'm missing?

Thanks,

	Ingo
Al Viro March 22, 2018, 12:15 a.m. UTC | #9
On Mon, Mar 19, 2018 at 11:23:42PM +0000, Al Viro wrote:
> Benefits:
> 	* all SyS... wrappers (i.e. the thing that really ought to
> go into syscall tables) have the same type.
> 	* we could have SYSCALL_DEFINE produce a trivial compat
> wrapper, have explicit COMPAT_SYSCALL_DEFINE discard that thing
> and populate the compat syscall table *entirely* with compat_SyS_...,
> letting the linker sort it out.  That way we don't need to keep
> track of what can use native and what needs compat in each compat
> table on biarch.
> 	* s390 compat wrappers would disappear with that approach.
> 	* we could even stop generating sys_... aliases - if
> syscall table is generated by slapping SyS_... or compat_SyS_...
> on the name given there, we don't need to _have_ those sys_...
> things at all.  All SyS_... would have the same type, so the pile
> in syscalls.h would not be needed - we could generate the externs
> at the same time we generate the syscall table.
> 
> And yes, it's a high-squick approach.  I know and I'm not saying
> it's a good idea.  OTOH, to quote the motto of philosophers and
> shell game operators, "there's something in it"...

FWIW, I have something that is almost reasonable on preprocessor side;
however, that has uncovered the following fun:
void f(unsigned long long);
void g(unsigned a, unsigned b)
{
        f((((unsigned long long)b)<<32)|a);
}

which does compile to "jump to f" on i386, ends up with the following
joy on arm:
        mov     r3, r1
        mov     r2, #0
        push    {r4, lr}
        orr     r2, r2, r0
        mov     r0, r2
        mov     r1, r3
        bl      f
        pop     {r4, lr}
        bx      lr
with gcc6; gcc7 is saner - there we have just
        mov     r2, #0
        orr     r0, r2, r0
        b       f

The former is
	r3 = r1
	r2 = 0
	r2 |= r0
	r0 = r2
	r1 = r3
The latter -
	r2 = 0
	r0 |= r2
which is better, but still bloody odd

And I'm afraid to check what e.g. 4.4 will do with that testcase...
Al Viro March 26, 2018, 12:40 a.m. UTC | #10
On Thu, Mar 22, 2018 at 12:15:32AM +0000, Al Viro wrote:

> FWIW, I have something that is almost reasonable on preprocessor side;
> however, that has uncovered the following fun:
[snip]

According to gcc folks, the right way to do it is type-punning via
union.  Anyway, below is something that kinda-sorta works as a
replacement of macrology in compat.h and syscalls.h.

Kinda-sorta part:
	* asmlinkage_protect is taken out for now, so m68k has problems.
	* syscalls that run out of 6 slots barf violently.  For mips it's
wrong (there we have 8 slots); for stuff like arm and ppc it's right, but
it means that things like e.g. compat sync_file_range() should not even
be compiled on those.  __ARCH_WANT_SYS_SYNC_FILE_RANGE, presumably...
In any case, we *can't* do pt_regs-based wrappers for those syscalls on
such architectures, so ifdefs around those puppies are probably the right
thing to do.
	* s390 macrology in compat_wrapper.c not even touched; it needs
a trivial update to keep working (__MAP callbacks take an extra argument,
unused for those users).
	* sys_... and compat_sys_... aliases are unchanged; if we kill
direct callers, we can trivially rename SyS##name and compat_SyS##name
to sys##name and compat_sys##name and get rid of aliases.

It allows an architecture to decide whether it wants 6-argument wrappers
or pt_regs ones - that's encapsulated into several macros.  Padding is
done properly; things like truncate64() et.al. can be done as single
COMPAT_SYSCALL_DEFINE.  Moreover, AFAICS the same wrappers work just fine
for arm64 - no worse than asm glue currently used there.

I've tried to put the sane amount of comments in there, hopefully
explaining what the hell is that nest of horrors doing.  It's ugly,
but less eye-bleeding that I thought it would be.

I'm not sure if it's a good idea; something of that sort would probably be
useful for any kind of pt_regs-passing wrappers, though.  Review would be
welcome...

#include <linux/compiler.h>
#include <linux/linkage.h>
#include <linux/build_bug.h>
#include <trace/syscall.h>

/*
 * __MAP - apply a macro to syscall arguments
 * __MAP(n, m, t1, a1, t2, a2, ..., tn, an) will expand to
 *    m(, t1, a1), m(_, t2, a2), ..., m(_..._, tn, an)
 * The first argument must be equal to the amount of type/name
 * pairs given.  Note that this list of pairs (i.e. the arguments
 * of __MAP starting at the third one) is in the same format as
 * for SYSCALL_DEFINE<n>/COMPAT_SYSCALL_DEFINE<n>
 */
#define __MAP0(m,p,...)
#define __MAP1(m,p,t,a) m(p,t,a)
#define __MAP2(m,p,t,a,...) m(p,t,a), __MAP1(m,p##_,__VA_ARGS__)
#define __MAP3(m,p,t,a,...) m(p,t,a), __MAP2(m,p##_,__VA_ARGS__)
#define __MAP4(m,p,t,a,...) m(p,t,a), __MAP3(m,p##_,__VA_ARGS__)
#define __MAP5(m,p,t,a,...) m(p,t,a), __MAP4(m,p##_,__VA_ARGS__)
#define __MAP6(m,p,t,a,...) m(p,t,a), __MAP5(m,p##_,__VA_ARGS__)
#define __MAP(n,m,...) __MAP##n(m,,__VA_ARGS__)

#define __TYPE_AS(t, v)	__same_type((__force t)0, v)
#define __TYPE_IS_L(t)	(__TYPE_AS(t, 0L))
#define __TYPE_IS_UL(t)	(__TYPE_AS(t, 0UL))
#define __TYPE_IS_LL(t) (__TYPE_AS(t, 0LL) || __TYPE_AS(t, 0ULL))

/*
 * __SC_SIZES32(n, t1, a1, ..., tn, an)
 * __SC_SIZES64(n, t1, a1, ..., tn, an)
 * Build an enum describing the number of slots taken by syscall arguments.
 * Example: __SC_SIZES32(3, int, a, loff_t, b, void *, c) ends up with
 * enum {__SC_BIG_ = 0, __SC_BIG__ = 1, __SC_BIG___ = 0};
 * __SC_BIG<k underscores> => does k-th argument take two slots.
 * Note that __SC_SIZES64 will always define them as zeroes.
 */
#define __SC_SIZE32(p,t,a) __SC_BIG_##p = __TYPE_IS_LL(t)
#define __SC_SIZE64(p,t,a) __SC_BIG_##p = 0
#define __SC_SIZES32(n,...) enum{__MAP(n,__SC_SIZE32,__VA_ARGS__)}
#define __SC_SIZES64(n,...) enum{__MAP(n,__SC_SIZE64,__VA_ARGS__)}


/*
 * Distributing the argument slots.  That depends upon the C ABI for target
 * architecture.  Background: there is an arch-specific sequence of word-sized
 * objects and function arguments are mapped on that sequence.  Each argument
 * takes one or two slots (we are dealing only with integer and pointer types;
 * the rules for floating point, structs, etc. can be a lot more complex).
 * 
 * Syscall ABI marshals the arguments across the userland boundary.  In almost
 * all cases it is done by having 6 registers, each assigned to a slot in
 * the sequence.  The value of those registers are stored in the corresponding
 * slots and control is passed to a wrapper C function implementing the syscall.
 * There the values are normalized and passed to the syscall itself.
 *
 * In case of biarch architectures we need to know the allocation rules for
 * 32bit counterpart - the 64bit ones are trivial (each argument takes exactly
 * one slot, the nth argument goes into nth slot and that's it).
 *
 * Rules are encoded in a pair of macros - __SC_FIRST_ARG and __SC_NEXT_ARG.
 * For the absolute majority of architectures it is simply "take the next
 * one or two slots, depending upon the argument size", and those are fine
 * with defaults.  Something trickier needs to override those.
 */
#ifndef __SC_FIRST_ARG
/*
 * Unless the ABI is really pathological, default will do.
 */
#define __SC_FIRST_ARG 0
#endif

#ifndef __SC_NEXT_ARG
/* For architectures with no alignment requirements.  Most of our targets are
 * such: alpha arc cris frv h8300 ia64 m32r m68k microblaze mn10300 nios2
 * openrisc riscv sparc x86.
*/
#define __SC_NEXT_ARG(p, next, big) next
#endif

#if 0
// Those should live in the corresponding arch/.../include/asm/ABI.h:

// arm, mips, parisc, powerpc, xtensa: double-slot arguments are aligned
// to even slot there.

#define __SC_NEXT_ARG(p, next, big) next + (big & next)

// s390: double-slot can't occupy slots 4 and 5.

#define __SC_NEXT_ARG(p, next, big) next + (big && next == 4)

// blackfin: double-slot can't occupy slots 2 and 3.

#define __SC_NEXT_ARG(p, next, big) next + (big && next == 2)

// sh: this is really nasty.
// Double-slot ones can't occupy slots 3 and 4; if slot 3 had been skipped due
// to that, the next single-slot argument goes there.  That's where we need
// to keep track of more than just the size of the argument and the slots occupied
// by the previous one.


#define __SC_FIRST_ARG 0, __SC_S_ = 0
#define __SC_NEXT_ARG(p, next, big) \
		(big && next == 3) ? 4 : \
		(!big && next == __SC_S##p) ? 3 : \
		(next < __SC_S##p) ? __SC_S##p : next, \
	__SC_S_##p = \
		(big && next == 3) ? 6 : \
 		(next != __SC_S##p) ? 0 : \
		__SC_S##p + (big ? 2 : 0)
#endif

/*
 * __SC_SLOTS(n, t1, a1, ..., tn, an)
 * Build an enum describing the slots taken by each argument.
 * Assumes __SC_BIG_... already in the scope and __SC_{FIRST,NEXT}_ARG
 * defined according to the architecture.
 */
#define __SC_IF(x,y) y
#define __SC_IF_(x,y) x
#define __SC_IF__ __SC_IF_
#define __SC_IF___ __SC_IF_
#define __SC_IF____ __SC_IF_
#define __SC_IF_____ __SC_IF_
#define __SC_SLOT(p,t,a) __SC_OFF_##p = \
	__SC_IF##p(__SC_NEXT_ARG(p, (__SC_OFF##p + __SC_BIG##p + 1), __SC_BIG_##p),\
		   __SC_FIRST_ARG)
#define __SC_SLOTS(n,...) enum{ __MAP(n,__SC_SLOT,__VA_ARGS__) }

/*
 * Syscall ABI marshalling. Syscall dispatcher can either arrange loading
 * of registers into slots on its own and have the syscall wrapper called
 * (in that case we want the wrapper looking like a 6-argument function,
 * with each argument taking a slot) or pass a pointer to struct pt_regs
 * and have wrapper fetch the individual registers from there.  In the
 * former case all knowledge of the mapping from registers to slots sits
 * in the caller, in the latter - in wrappers.  Other strategies are
 * also possible; depends upon the architecture.
 *
 * For syscall wrappers we want the prototype to be used for wrapper and
 * macros for accessing individual slots.  Default is 6-argument wrappers;
 * if architecture wants something different, it should supply
 * __SC_{32,64}PROTO for the prototype and __SC_{32,64}ARG{1..6} for
 * arguments.  Example: for struct pt_regs on x86 we want,
 * in arch/x86/include/asm/ABI.h,
 * #define __SC_64PROTO struct pt_regs *regs
 * #define __SC_64ARG1 regs->di
 * #define __SC_64ARG2 regs->si
 * #define __SC_64ARG3 regs->dx
 * #define __SC_64ARG4 regs->r10
 * #define __SC_64ARG5 regs->r8
 * #define __SC_64ARG6 regs->r9
 * #define __SC_32PROTO struct pt_regs *regs
 * #define __SC_32ARG1 regs->bx
 * #define __SC_32ARG2 regs->cx
 * #define __SC_32ARG3 regs->dx
 * #define __SC_32ARG4 regs->si
 * #define __SC_32ARG5 regs->di
 * #define __SC_32ARG6 regs->bp
 */
#ifndef __SC_64PROTO
#define __SC_64PROTO u64 __SC_64ARG1, u64 __SC_64ARG2, u64 __SC_64ARG3,\
		    u64 __SC_64ARG4, u64 __SC_64ARG5, u64 __SC_64ARG6
#endif
#ifndef __SC_32PROTO
#define __SC_32PROTO u32 __SC_32ARG1, u32 __SC_32ARG2, u32 __SC_32ARG3,\
		    u32 __SC_32ARG4, u32 __SC_32ARG5, u32 __SC_32ARG6
#endif

static __always_inline __nostackprotector u64 combine_long_long(u32 a, u32 b)
{
	union { u32 c[2]; u64 d; } x = {.c={a,b}};
	return x.d;
}
extern u32 __SC_32_ARG7(void) __compiletime_error("too many argument slots");
extern u64 __SC_64_ARG7(void) __compiletime_error("too many argument slots");
#define __SC_32ARG7 __SC_64_ARG7()
#define __SC_64ARG7 __SC_32_ARG7()
#define __SC_W(s,n) (n == 0 ? __SC_##s##ARG1 : \
		   n == 1 ? __SC_##s##ARG2 : \
		   n == 2 ? __SC_##s##ARG3 : \
		   n == 3 ? __SC_##s##ARG4 : \
		   n == 4 ? __SC_##s##ARG5 : \
		   n == 5 ? __SC_##s##ARG6 : \
		   __SC_##s##ARG7)
#define __SC_D(s,n) \
	(n == 0 ? combine_long_long(__SC_##s##ARG1,__SC_##s##ARG2) : \
	 n == 1 ? combine_long_long(__SC_##s##ARG2,__SC_##s##ARG3) : \
	 n == 2 ? combine_long_long(__SC_##s##ARG3,__SC_##s##ARG4) : \
	 n == 3 ? combine_long_long(__SC_##s##ARG4,__SC_##s##ARG5) : \
	 n == 4 ? combine_long_long(__SC_##s##ARG5,__SC_##s##ARG6) : \
	 combine_long_long(__SC_##s##ARG6,__SC_##s##ARG7))
#define __SC_ARG32(p, t, a)	\
	(__force t)__builtin_choose_expr(__SC_BIG_##p, \
		__SC_D(32, __SC_OFF_##p), __SC_W(32, __SC_OFF_##p))
#define __SC_ARG64(p, t, a)	\
	(__force t)__builtin_choose_expr(__SC_BIG_##p, \
		__SC_D(64, __SC_OFF_##p), __SC_W(64, __SC_OFF_##p))
#define __SC_ARGS32(n, ...) __MAP(n, __SC_ARG32, __VA_ARGS__)
#define __SC_ARGS64(n, ...) __MAP(n, __SC_ARG64, __VA_ARGS__)

#ifdef CONFIG_64BIT
#define __SC_PROTO __SC_64PROTO
#define __SC_ARGS __SC_ARGS64
#define __SC_SIZES __SC_SIZES64
#else
#define __SC_PROTO __SC_32PROTO
#define __SC_ARGS __SC_ARGS32
#define __SC_SIZES __SC_SIZES32
#endif

/* The following is ftrace glue, almost verbatim from mainline */

#ifdef CONFIG_FTRACE_SYSCALLS
#define __SC_STR_ADECL(p, t, a)	#a
#define __SC_STR_TDECL(p, t, a)	#t

extern struct trace_event_class event_class_syscall_enter;
extern struct trace_event_class event_class_syscall_exit;
extern struct trace_event_functions enter_syscall_print_funcs;
extern struct trace_event_functions exit_syscall_print_funcs;

#define SYSCALL_TRACE_ENTER_EVENT(sname)				\
	static struct syscall_metadata __syscall_meta_##sname;		\
	static struct trace_event_call __used				\
	  event_enter_##sname = {					\
		.class			= &event_class_syscall_enter,	\
		{							\
			.name                   = "sys_enter"#sname,	\
		},							\
		.event.funcs            = &enter_syscall_print_funcs,	\
		.data			= (void *)&__syscall_meta_##sname,\
		.flags                  = TRACE_EVENT_FL_CAP_ANY,	\
	};								\
	static struct trace_event_call __used				\
	  __attribute__((section("_ftrace_events")))			\
	 *__event_enter_##sname = &event_enter_##sname;

#define SYSCALL_TRACE_EXIT_EVENT(sname)					\
	static struct syscall_metadata __syscall_meta_##sname;		\
	static struct trace_event_call __used				\
	  event_exit_##sname = {					\
		.class			= &event_class_syscall_exit,	\
		{							\
			.name                   = "sys_exit"#sname,	\
		},							\
		.event.funcs		= &exit_syscall_print_funcs,	\
		.data			= (void *)&__syscall_meta_##sname,\
		.flags                  = TRACE_EVENT_FL_CAP_ANY,	\
	};								\
	static struct trace_event_call __used				\
	  __attribute__((section("_ftrace_events")))			\
	*__event_exit_##sname = &event_exit_##sname;

#define SYSCALL_METADATA(sname, nb, ...)			\
	static const char *types_##sname[] = {			\
		__MAP(nb,__SC_STR_TDECL,__VA_ARGS__)		\
	};							\
	static const char *args_##sname[] = {			\
		__MAP(nb,__SC_STR_ADECL,__VA_ARGS__)		\
	};							\
	SYSCALL_TRACE_ENTER_EVENT(sname);			\
	SYSCALL_TRACE_EXIT_EVENT(sname);			\
	static struct syscall_metadata __used			\
	  __syscall_meta_##sname = {				\
		.name 		= "sys"#sname,			\
		.syscall_nr	= -1,	/* Filled in at boot */	\
		.nb_args 	= nb,				\
		.types		= nb ? types_##sname : NULL,	\
		.args		= nb ? args_##sname : NULL,	\
		.enter_event	= &event_enter_##sname,		\
		.exit_event	= &event_exit_##sname,		\
		.enter_fields	= LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
	};							\
	static struct syscall_metadata __used			\
	  __attribute__((section("__syscalls_metadata")))	\
	 *__p_syscall_meta_##sname = &__syscall_meta_##sname;

static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
{
	return tp_event->class == &event_class_syscall_enter ||
	       tp_event->class == &event_class_syscall_exit;
}

#else
#define SYSCALL_METADATA(sname, nb, ...)

static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
{
	return 0;
}
#endif

#define SYSCALL_DEFINE0(sname)					\
	SYSCALL_METADATA(_##sname, 0);				\
	asmlinkage long sys_##sname(void)

#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

#define SYSCALL_DEFINEx(x, sname, ...)				\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

#define __SC_DECL(p, t, a)	t a
#define __SC_TEST(p, t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long))
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
		__attribute__((alias(__stringify(SyS##name))));		\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	asmlinkage long SyS##name(__SC_PROTO);				\
	asmlinkage long SyS##name(__SC_PROTO)				\
	{								\
		__SC_SIZES(x,__VA_ARGS__);				\
		__SC_SLOTS(x,__VA_ARGS__);				\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		return SYSC##name(__SC_ARGS(x,__VA_ARGS__));		\
	}								\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

#ifdef CONFIG_COMPAT
#define __SC_COMPAT_PROTO __SC_32PROTO
#define __SC_COMPAT_ARGS __SC_ARGS32
#define __SC_COMPAT_SIZES __SC_SIZES32

#define COMPAT_SYSCALL_DEFINE0(name) \
	asmlinkage long compat_sys_##name(void)

#define COMPAT_SYSCALL_DEFINE1(name, ...) \
        COMPAT_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE2(name, ...) \
	COMPAT_SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE3(name, ...) \
	COMPAT_SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE4(name, ...) \
	COMPAT_SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE5(name, ...) \
	COMPAT_SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE6(name, ...) \
	COMPAT_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

#define COMPAT_SYSCALL_DEFINEx(x, name, ...)				\
	asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))\
		__attribute__((alias(__stringify(compat_SyS##name))));  \
	static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
	asmlinkage long compat_SyS##name(__SC_COMPAT_PROTO);		\
	asmlinkage long compat_SyS##name(__SC_COMPAT_PROTO)		\
	{								\
		__SC_COMPAT_SIZES(x,__VA_ARGS__);			\
		__SC_SLOTS(x,__VA_ARGS__);				\
		return C_SYSC##name(__SC_COMPAT_ARGS(x,__VA_ARGS__));	\
	}								\
	static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
#endif
Al Viro March 26, 2018, 3:47 a.m. UTC | #11
On Mon, Mar 26, 2018 at 01:40:17AM +0100, Al Viro wrote:

> Kinda-sorta part:
> 	* asmlinkage_protect is taken out for now, so m68k has problems.
> 	* syscalls that run out of 6 slots barf violently.  For mips it's
> wrong (there we have 8 slots); for stuff like arm and ppc it's right, but
> it means that things like e.g. compat sync_file_range() should not even
> be compiled on those.  __ARCH_WANT_SYS_SYNC_FILE_RANGE, presumably...
> In any case, we *can't* do pt_regs-based wrappers for those syscalls on
> such architectures, so ifdefs around those puppies are probably the right
> thing to do.
> 	* s390 macrology in compat_wrapper.c not even touched; it needs
> a trivial update to keep working (__MAP callbacks take an extra argument,
> unused for those users).
> 	* sys_... and compat_sys_... aliases are unchanged; if we kill
> direct callers, we can trivially rename SyS##name and compat_SyS##name
> to sys##name and compat_sys##name and get rid of aliases.

	* mips n32 and x86 x32 can become an extra source of headache.
That actually applies to any plans of passing struct pt_regs *.  As it
is, e.g. syscall 515 on amd64 is compat_sys_readv().  Dispatched via
this:
        /*
         * NB: Native and x32 syscalls are dispatched from the same
         * table.  The only functional difference is the x32 bit in
         * regs->orig_ax, which changes the behavior of some syscalls.
         */
        if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
                nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls);
                regs->ax = sys_call_table[nr](
                        regs->di, regs->si, regs->dx,
                        regs->r10, regs->r8, regs->r9);
        }
Now, syscall 145 via 32bit call is *also* compat_sys_readv(), dispatched
via
                nr = array_index_nospec(nr, IA32_NR_syscalls);
                /*
                 * It's possible that a 32-bit syscall implementation
                 * takes a 64-bit parameter but nonetheless assumes that
                 * the high bits are zero.  Make sure we zero-extend all
                 * of the args.
                 */
                regs->ax = ia32_sys_call_table[nr](
                        (unsigned int)regs->bx, (unsigned int)regs->cx,
                        (unsigned int)regs->dx, (unsigned int)regs->si,
                        (unsigned int)regs->di, (unsigned int)regs->bp);
Right now it works - we call the same function, passing it arguments picked
from different set of registers (di/si/dx in x32 case, bx/cx/dx in i386 one).
But if we switch to passing struct pt_regs * and have the wrapper fetch
regs->{bx,cx,dx}, we have a problem.  It won't work for both entry points.

IMO it's a good reason to have dispatcher(s) handle extraction from pt_regs
and let the wrapper deal with the resulting 6 u64 or 6 u32, normalizing
them and arranging them into arguments expected by syscall body.

Linus, Dominik - how do you plan dealing with that fun?  Regardless of the
way we generate the glue, the issue remains.  We can't get the same
struct pt_regs *-taking function for both; we either need to produce
a separate chunk of glue for each compat_sys_... involved (either making
COMPAT_SYSCALL_DEFINE generate both, or having duplicate X32_SYSCALL_DEFINE
for each of those COMPAT_SYSCALL_DEFINE - with identical body, at that)
or we need to have the registers-to-slots mapping done in dispatcher...
Linus Torvalds March 26, 2018, 6:15 a.m. UTC | #12
On Sun, Mar 25, 2018 at 5:47 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> Linus, Dominik - how do you plan dealing with that fun?

Secretly, I was hoping to kill x32, because it's not being used afaik.

More realistically, I was thinking we'd just use a separate table or
system calls, and generate different versions.

In fact, you can see exactly that in my WIP branch, except it uses the
wrong name.

So see the "WIP-syscall" branch in my normal git kernel repo, and in
particular the patch to <linux/syscalls.h>, which generates
"sys_x64##name" and "sys_i86##name()" inline functions that do that
mapping correcty for native x86-64, and for the (misnamed) x32 cases.

So there are three different cases:

 - native: sys_x64_name() generated by SYSCALL_DEFINEx()

 - compat -bit: compat_sys_i86_name() generated by COMPAT_SYSCALL_DEFINEx()

 - x32: sys_i86_name() generated by SYSCALL_DEFINEx().

and then I actually changed the names in the tables (ie in
arch/x86/entry/syscalls/syscall_64.tbl etc).

HOWEVER.

I didn't actually test any of the compat or x32 ones, and the way I
did it there also was no type-checking or other automated catching of
getting it wrong. So it's almost certainly completely buggy, but the
_intent_ is there and there is a remote possibility that it might even
work.

              Linus
Linus Torvalds March 26, 2018, 6:20 a.m. UTC | #13
On Sun, Mar 25, 2018 at 8:15 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> HOWEVER.
>
> I didn't actually test any of the compat or x32 ones, and the way I
> did it there also was no type-checking or other automated catching of
> getting it wrong. So it's almost certainly completely buggy, but the
> _intent_ is there and there is a remote possibility that it might even
> work.

Note: the commit message is "broken, but working, ptregs system call
conversion for x86-64".

The "but working" is not because it would be right, but because it
booted a real 64-bit distribution successfully, and I actually used
that kernel.

But that only tests the native 64-bit case, it in no way tests the
compat or x32 cases what-so-ever. That part was literally a "it
compiles - it must be perfect" with absolutely zero testing of even
the most trivial kind.

And it really would have been most trivial to just do a "Hello world"
and build it with "-m32". I didn't do even that, and if I had, I would
have been honestly surprised had it worked.

But there was a _theoretical_ chance that it could have worked. Maybe.

      Linus
Dominik Brodowski March 26, 2018, 6:24 a.m. UTC | #14
On Mon, Mar 26, 2018 at 04:47:50AM +0100, Al Viro wrote:
> 	* mips n32 and x86 x32 can become an extra source of headache.
> That actually applies to any plans of passing struct pt_regs *.  As it
> is, e.g. syscall 515 on amd64 is compat_sys_readv().  Dispatched via
> this:
>         /*
>          * NB: Native and x32 syscalls are dispatched from the same
>          * table.  The only functional difference is the x32 bit in
>          * regs->orig_ax, which changes the behavior of some syscalls.
>          */
>         if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
>                 nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls);
>                 regs->ax = sys_call_table[nr](
>                         regs->di, regs->si, regs->dx,
>                         regs->r10, regs->r8, regs->r9);
>         }
> Now, syscall 145 via 32bit call is *also* compat_sys_readv(), dispatched
> via
>                 nr = array_index_nospec(nr, IA32_NR_syscalls);
>                 /*
>                  * It's possible that a 32-bit syscall implementation
>                  * takes a 64-bit parameter but nonetheless assumes that
>                  * the high bits are zero.  Make sure we zero-extend all
>                  * of the args.
>                  */
>                 regs->ax = ia32_sys_call_table[nr](
>                         (unsigned int)regs->bx, (unsigned int)regs->cx,
>                         (unsigned int)regs->dx, (unsigned int)regs->si,
>                         (unsigned int)regs->di, (unsigned int)regs->bp);
> Right now it works - we call the same function, passing it arguments picked
> from different set of registers (di/si/dx in x32 case, bx/cx/dx in i386 one).
> But if we switch to passing struct pt_regs * and have the wrapper fetch
> regs->{bx,cx,dx}, we have a problem.  It won't work for both entry points.
> 
> IMO it's a good reason to have dispatcher(s) handle extraction from pt_regs
> and let the wrapper deal with the resulting 6 u64 or 6 u32, normalizing
> them and arranging them into arguments expected by syscall body.
> 
> Linus, Dominik - how do you plan dealing with that fun?  Regardless of the
> way we generate the glue, the issue remains.  We can't get the same
> struct pt_regs *-taking function for both; we either need to produce
> a separate chunk of glue for each compat_sys_... involved (either making
> COMPAT_SYSCALL_DEFINE generate both, or having duplicate X32_SYSCALL_DEFINE
> for each of those COMPAT_SYSCALL_DEFINE - with identical body, at that)
> or we need to have the registers-to-slots mapping done in dispatcher...

Nice catch. A similar thing is needed already for non-compat syscalls like
sys_close(), which takes pt_regs->bx on IA32_EMULATION and pt_regs->di on
native x86-64. Therefore, I propose to generate all the stubs we need within
SYSCALL_DEFINEx() and COMPAT_SYSCALL_DEFINEx() (actually, within the
arch-provided version of these macros). See

	https://git.kernel.org/pub/scm/linux/kernel/git/brodo/linux.git	syscalls-WIP

for details on my current plans.

Thanks,
	Dominik
John Paul Adrian Glaubitz March 26, 2018, 6:44 a.m. UTC | #15
Hi Linus!

On 03/26/2018 03:15 PM, Linus Torvalds wrote:
> Secretly, I was hoping to kill x32, because it's not being used afaik.
FWIW, we are maintaining an x32 port in Debian and there are some people
actually using it [1]. There is one build instance running on VMWare that
I am hosting [2] and around 10800 out of 12900 source packages build fine
on x32 (12900 being the number for x86_64).

The port is mostly stable but the fact that it's a 32-bit target with a
64-bit kernel API often blows up in people's faces because they don't
expect things like a 64-bit time_t on a 32-bit system.

The port also has some issues with software unaware of the x32 and builds
fail when they just test for __x86_64__ but not for __ILP32__, but overall
it's usable and stable and has some performance advantages over x86_64.

Adrian

> [1] http://debian-x32.org/
> [2] https://buildd.debian.org/status/architecture.php?a=x32&suite=sid
Linus Torvalds March 27, 2018, 1:03 a.m. UTC | #16
On Sun, Mar 25, 2018 at 8:44 PM, John Paul Adrian Glaubitz
<glaubitz@physik.fu-berlin.de> wrote:
<
> FWIW, we are maintaining an x32 port in Debian and there are some people
> actually using it [1]. There is one build instance running on VMWare that
> I am hosting [2] and around 10800 out of 12900 source packages build fine
> on x32 (12900 being the number for x86_64).

Hmm. Do you have a few statically built binaries that could be tested
without installing a whole distribution? Something real and meaningful
enough that it actually exercised a few real system calls, but not
something that needs to bring in 50 different shared libraries?

Something in /sbin, perhaps, that is still runnable by a regular user
and doesn't require some distro-specific /etc layout etc, so that it
would work even if you don't run Debian at all? Maybe some shell
binary or something?

      Linus
John Paul Adrian Glaubitz March 27, 2018, 2:37 a.m. UTC | #17
On 03/27/2018 10:03 AM, Linus Torvalds wrote:
> Hmm. Do you have a few statically built binaries that could be tested
> without installing a whole distribution? Something real and meaningful
> enough that it actually exercised a few real system calls, but not
> something that needs to bring in 50 different shared libraries?
> 
> Something in /sbin, perhaps, that is still runnable by a regular user
> and doesn't require some distro-specific /etc layout etc, so that it
> would work even if you don't run Debian at all? Maybe some shell
> binary or something?

What about a tarball with a minimal Debian x32 chroot? Then you can
install interesting packages you would like to test yourself.

Adrian
Linus Torvalds March 27, 2018, 3:40 a.m. UTC | #18
On Mon, Mar 26, 2018 at 4:37 PM, John Paul Adrian Glaubitz
<glaubitz@physik.fu-berlin.de> wrote:
>
> What about a tarball with a minimal Debian x32 chroot? Then you can
> install interesting packages you would like to test yourself.

That probably works fine.

          Linus
John Paul Adrian Glaubitz March 27, 2018, 4:58 a.m. UTC | #19
On 03/27/2018 12:40 PM, Linus Torvalds wrote:
> On Mon, Mar 26, 2018 at 4:37 PM, John Paul Adrian Glaubitz
> <glaubitz@physik.fu-berlin.de> wrote:
>>
>> What about a tarball with a minimal Debian x32 chroot? Then you can
>> install interesting packages you would like to test yourself.
> 
> That probably works fine.

I just created a fresh Debian x32 unstable chroot using this command:

$ debootstrap --no-check-gpg --variant=minbase --arch=x32 unstable debian-x32-unstable http://ftp.ports.debian.org/debian-ports

It can be downloaded from my Debian webspace along checksum files for
verification:

> https://people.debian.org/~glaubitz/chroots/

Let me know if you run into any issues.

Adrian
Ingo Molnar March 30, 2018, 10:58 a.m. UTC | #20
* John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> wrote:

> On 03/27/2018 12:40 PM, Linus Torvalds wrote:
> > On Mon, Mar 26, 2018 at 4:37 PM, John Paul Adrian Glaubitz
> > <glaubitz@physik.fu-berlin.de> wrote:
> >>
> >> What about a tarball with a minimal Debian x32 chroot? Then you can
> >> install interesting packages you would like to test yourself.
> > 
> > That probably works fine.
> 
> I just created a fresh Debian x32 unstable chroot using this command:
> 
> $ debootstrap --no-check-gpg --variant=minbase --arch=x32 unstable debian-x32-unstable http://ftp.ports.debian.org/debian-ports
> 
> It can be downloaded from my Debian webspace along checksum files for
> verification:
> 
> > https://people.debian.org/~glaubitz/chroots/
> 
> Let me know if you run into any issues.

Here's the direct download link:

  $ wget https://people.debian.org/~glaubitz/chroots/debian-x32-unstable.tar.gz

Checksum should be:

  $ sha256sum debian-x32-unstable.tar.gz
  010844bcc76bd1a3b7a20fe47f7067ed8e429a84fa60030a2868626e8fa7ec3b  debian-x32-unstable.tar.gz

Seems to work fine here (on a distro kernel) even if I extract all the files as a 
non-root user and do:

  ~/s/debian-x32-unstable> fakechroot /usr/sbin/chroot . /usr/bin/dpkg -l  | tail -2

  ERROR: ld.so: object 'libfakechroot.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
  ii  util-linux:x32         2.31.1-0.5           x32          miscellaneous system utilities
  ii  zlib1g:x32             1:1.2.8.dfsg-5       x32          compression library - runtime

So that 'dpkg' instance appears to be running inside the chroot environment and is 
listing x32 installed packages.

Although I did get this warning:

  ERROR: ld.so: object 'libfakechroot.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.

Even with that warning, is still still a sufficiently complex test of x32 syscall 
code paths?

BTW., "fakechroot /usr/sbin/chroot ." crashes instead of giving me a bash shell.

Thanks,

	Ingo
Adam Borowski March 30, 2018, 3:54 p.m. UTC | #21
On Fri, Mar 30, 2018 at 12:58:02PM +0200, Ingo Molnar wrote:
> * John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> wrote:
> 
> > On 03/27/2018 12:40 PM, Linus Torvalds wrote:
> > > On Mon, Mar 26, 2018 at 4:37 PM, John Paul Adrian Glaubitz
> > > <glaubitz@physik.fu-berlin.de> wrote:
> > >>
> > >> What about a tarball with a minimal Debian x32 chroot? Then you can
> > >> install interesting packages you would like to test yourself.

> Here's the direct download link:
>   $ wget https://people.debian.org/~glaubitz/chroots/debian-x32-unstable.tar.gz

> Seems to work fine here (on a distro kernel) even if I extract all the files as a 
> non-root user and do:
> 
>   ~/s/debian-x32-unstable> fakechroot /usr/sbin/chroot . /usr/bin/dpkg -l  | tail -2
> 
>   ERROR: ld.so: object 'libfakechroot.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
>   ii  util-linux:x32         2.31.1-0.5           x32          miscellaneous system utilities
>   ii  zlib1g:x32             1:1.2.8.dfsg-5       x32          compression library - runtime

> So that 'dpkg' instance appears to be running inside the chroot environment and is 
> listing x32 installed packages.

> Although I did get this warning:
>   ERROR: ld.so: object 'libfakechroot.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
> Even with that warning, is still still a sufficiently complex test of x32 syscall 
> code paths?

Instead of mucking with fakechroot which would require installing its :x32
part inside the guest, or running the test as root, what about using any
random static binary?  For example, a shell like sash or bash-static would
have a decentish syscall coverage even by itself.

I've extracted sash from
http://ftp.ports.debian.org/debian-ports//pool-x32/main/s/sash/sash_3.8-4_x32.deb
and placed at https://angband.pl/tmp/sash.x32

$ sha256sum sash.x32 
de24097c859b313fa422af742b648c9d731de6b33b98cb995658d1da16398456  sash.x32

Obviously, you can compile a static binary that uses whatever syscalls you
want.  Without a native chroot, you can "gcc -mx32" although you'd need some
kind of libc unless your program is stand-alone.


It might be worth mentioning my "arch-test:
https://github.com/kilobyte/arch-test
Because of many toolchain pieces it needs, you want a prebuilt copy:
https://github.com/kilobyte/arch-test/releases/download/v0.10/arch-test_prebuilt_0.10.tar.xz
https://github.com/kilobyte/arch-test/releases/download/v0.10/arch-test_prebuilt_0.10.tar.xz.asc
-- while it has _extremely_ small coverage of syscalls (just write() and
_exit(), enough to check endianness and pointer width), concentrating on
instruction set inadequacies (broken SWP on arm, POWER7 vs POWER8, powerpc
vs powerpcspe, etc), it provides minimal test binaries for a wide range of
architectures.


Meow!
diff mbox series

Patch

diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h
index 3ddc271ad77b..f8f9046164ae 100644
--- a/arch/mips/include/asm/unistd.h
+++ b/arch/mips/include/asm/unistd.h
@@ -49,6 +49,7 @@ 
 #  define __ARCH_WANT_COMPAT_SYS_FALLOCATE
 #  define __ARCH_WANT_COMPAT_SYS_TRUNCATE64
 #  define __ARCH_WANT_COMPAT_SYS_PREADWRITE64
+#  define __ARCH_WANT_COMPAT_SYS_READAHEAD
 #  ifdef __MIPSEL__
 #    define __ARCH_WANT_LE_COMPAT_SYS
 #  endif
diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c
index 871cda53a915..c40ce08be17d 100644
--- a/arch/mips/kernel/linux32.c
+++ b/arch/mips/kernel/linux32.c
@@ -106,12 +106,6 @@  SYSCALL_DEFINE1(32_personality, unsigned long, personality)
 	return ret;
 }
 
-asmlinkage ssize_t sys32_readahead(int fd, u32 pad0, u64 a2, u64 a3,
-				   size_t count)
-{
-	return sys_readahead(fd, merge_64(a2, a3), count);
-}
-
 asmlinkage long sys32_sync_file_range(int fd, int __pad,
 	unsigned long a2, unsigned long a3,
 	unsigned long a4, unsigned long a5,
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index fbc463b234a1..eb4e66ba025a 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -439,7 +439,7 @@  EXPORT(sys32_call_table)
 	PTR	compat_sys_fcntl64		/* 4220 */
 	PTR	sys_ni_syscall
 	PTR	sys_gettid
-	PTR	sys32_readahead
+	PTR	compat_sys_readahead
 	PTR	sys_setxattr
 	PTR	sys_lsetxattr			/* 4225 */
 	PTR	sys_fsetxattr
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 704f2413ac30..870317a35763 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -53,6 +53,7 @@ 
 #define __ARCH_WANT_COMPAT_SYS_FALLOCATE
 #define __ARCH_WANT_COMPAT_SYS_TRUNCATE64
 #define __ARCH_WANT_COMPAT_SYS_PREADWRITE64
+#define __ARCH_WANT_COMPAT_SYS_READAHEAD
 #endif
 #define __ARCH_WANT_SYS_FORK
 #define __ARCH_WANT_SYS_VFORK
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index ec896c8df968..289ae55bb4b5 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -74,11 +74,6 @@  unsigned long compat_sys_mmap2(unsigned long addr, size_t len,
  * The 32 bit ABI passes long longs in an odd even register pair.
  */
 
-compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offhi, u32 offlo, u32 count)
-{
-	return sys_readahead(fd, ((loff_t)offhi << 32) | offlo, count);
-}
-
 asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long high,
 				 unsigned long low)
 {
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
index 71e6f7d65762..685ad7944850 100644
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -37,6 +37,7 @@ 
 #   define __ARCH_WANT_COMPAT_SYS_FALLOCATE
 #   define __ARCH_WANT_COMPAT_SYS_TRUNCATE64
 #   define __ARCH_WANT_COMPAT_SYS_PREADWRITE64
+#   define __ARCH_WANT_COMPAT_SYS_READAHEAD
 # endif
 #define __ARCH_WANT_SYS_FORK
 #define __ARCH_WANT_SYS_VFORK
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 8dd12a9d99a3..9f111d1f1ec2 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -305,11 +305,6 @@  COMPAT_SYSCALL_DEFINE3(s390_ftruncate64, unsigned int, fd, u32, high, u32, low)
 	return sys_ftruncate(fd, (unsigned long)high << 32 | low);
 }
 
-COMPAT_SYSCALL_DEFINE4(s390_readahead, int, fd, u32, high, u32, low, s32, count)
-{
-	return sys_readahead(fd, (unsigned long)high << 32 | low, count);
-}
-
 struct stat64_emu31 {
 	unsigned long long  st_dev;
 	unsigned int    __pad1;
diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h
index 35fe45225185..cac6dd7bced0 100644
--- a/arch/s390/kernel/compat_linux.h
+++ b/arch/s390/kernel/compat_linux.h
@@ -108,7 +108,6 @@  long compat_sys_s390_geteuid16(void);
 long compat_sys_s390_getgid16(void);
 long compat_sys_s390_getegid16(void);
 long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low);
-long compat_sys_s390_readahead(int fd, u32 high, u32 low, s32 count);
 long compat_sys_s390_stat64(const char __user *filename, struct stat64_emu31 __user *statbuf);
 long compat_sys_s390_lstat64(const char __user *filename, struct stat64_emu31 __user *statbuf);
 long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbuf);
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index ceaf5ab6ac47..a7d989a292f7 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -230,7 +230,7 @@ 
 219  common	madvise			sys_madvise			compat_sys_madvise
 220  common	getdents64		sys_getdents64			compat_sys_getdents64
 221  32		fcntl64			-				compat_sys_fcntl64
-222  common	readahead		sys_readahead			compat_sys_s390_readahead
+222  common	readahead		sys_readahead			compat_sys_readahead
 223  32		sendfile64		-				compat_sys_sendfile64
 224  common	setxattr		sys_setxattr			compat_sys_setxattr
 225  common	lsetxattr		sys_lsetxattr			compat_sys_lsetxattr
diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h
index e04452be8db4..85579de64f60 100644
--- a/arch/sparc/include/asm/unistd.h
+++ b/arch/sparc/include/asm/unistd.h
@@ -46,6 +46,7 @@ 
 #define __ARCH_WANT_COMPAT_SYS_FALLOCATE
 #define __ARCH_WANT_COMPAT_SYS_TRUNCATE64
 #define __ARCH_WANT_COMPAT_SYS_PREADWRITE64
+#define __ARCH_WANT_COMPAT_SYS_READAHEAD
 #endif
 
 #endif /* _SPARC_UNISTD_H */
diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index 8a4f5accf6be..cade09deff8d 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -186,14 +186,6 @@  COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig,
         return ret;
 }
 
-asmlinkage long compat_sys_readahead(int fd,
-				     unsigned long offhi,
-				     unsigned long offlo,
-				     compat_size_t count)
-{
-	return sys_readahead(fd, (offhi << 32) | offlo, count);
-}
-
 long compat_sys_fadvise64(int fd,
 			  unsigned long offhi,
 			  unsigned long offlo,
diff --git a/arch/sparc/kernel/systbls.h b/arch/sparc/kernel/systbls.h
index 6b5fd12e821d..aad40627ba52 100644
--- a/arch/sparc/kernel/systbls.h
+++ b/arch/sparc/kernel/systbls.h
@@ -63,10 +63,6 @@  asmlinkage long compat_sys_fstat64(unsigned int fd,
 asmlinkage long compat_sys_fstatat64(unsigned int dfd,
 				     const char __user *filename,
 				     struct compat_stat64 __user * statbuf, int flag);
-asmlinkage long compat_sys_readahead(int fd,
-				     unsigned long offhi,
-				     unsigned long offlo,
-				     compat_size_t count);
 long compat_sys_fadvise64(int fd,
 			  unsigned long offhi,
 			  unsigned long offlo,
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2f39235785d8..17b8dc7130f5 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -231,7 +231,7 @@ 
 # 222 is unused
 # 223 is unused
 224	i386	gettid			sys_gettid
-225	i386	readahead		sys_readahead			compat_sys_x86_readahead
+225	i386	readahead		sys_readahead			compat_sys_readahead
 226	i386	setxattr		sys_setxattr
 227	i386	lsetxattr		sys_lsetxattr
 228	i386	fsetxattr		sys_fsetxattr
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index eae207229a93..89dcb36d19da 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -181,12 +181,6 @@  COMPAT_SYSCALL_DEFINE6(x86_fadvise64_64, int, fd, __u32, offset_low,
 				advice);
 }
 
-COMPAT_SYSCALL_DEFINE4(x86_readahead, int, fd, unsigned int, off_lo,
-		       unsigned int, off_hi, size_t, count)
-{
-	return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count);
-}
-
 COMPAT_SYSCALL_DEFINE6(x86_sync_file_range, int, fd, unsigned int, off_low,
 		       unsigned int, off_hi, unsigned int, n_low,
 		       unsigned int, n_hi, int, flags)
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index ded631bb33de..6a78bee5a314 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -39,8 +39,6 @@  asmlinkage long compat_sys_x86_waitpid(compat_pid_t, unsigned int __user *,
 asmlinkage long compat_sys_x86_fadvise64_64(int, __u32, __u32, __u32, __u32,
 					    int);
 
-asmlinkage ssize_t compat_sys_x86_readahead(int, unsigned int, unsigned int,
-					    size_t);
 asmlinkage long compat_sys_x86_sync_file_range(int, unsigned int, unsigned int,
 					       unsigned int, unsigned int,
 					       int);
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index be8f52494ee3..8d56e1dd71d6 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -32,6 +32,7 @@ 
 #  define __ARCH_WANT_COMPAT_SYS_FALLOCATE
 #  define __ARCH_WANT_COMPAT_SYS_TRUNCATE64
 #  define __ARCH_WANT_COMPAT_SYS_PREADWRITE64
+#  define __ARCH_WANT_COMPAT_SYS_READAHEAD
 
 # endif
 
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 95301d1a6793..96d1244b332e 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -872,6 +872,16 @@  asmlinkage long compat_sys_pread64(unsigned int, char __user *,
 #endif /* __ARCH_WANT_COMPAT_SYS_WITH_PADDING */
 #endif /* __ARCH_WANT_COMPAT_SYS_PREADWRITE64 */
 
+#ifdef __ARCH_WANT_COMPAT_SYS_READAHEAD
+/* __ARCH_WANT_LE_COMPAT_SYS determines order of lo and hi */
+#ifdef __ARCH_WANT_COMPAT_SYS_WITH_PADDING
+asmlinkage long compat_sys_readahead(int, unsigned int, unsigned int,
+				     unsigned int, size_t);
+#else /* __ARCH_WANT_COMPAT_SYS_WITH_PADDING */
+asmlinkage long compat_sys_readahead(int, unsigned int, unsigned int, size_t);
+#endif /* __ARCH_WANT_COMPAT_SYS_WITH_PADDING */
+#endif /* __ARCH_WANT_COMPAT_SYS_READAHEAD */
+
 int compat_restore_altstack(const compat_stack_t __user *uss);
 int __compat_save_altstack(compat_stack_t __user *, unsigned long);
 #define compat_save_altstack_ex(uss, sp) do { \
diff --git a/mm/readahead.c b/mm/readahead.c
index c4ca70239233..092866309593 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,7 @@ 
 #include <linux/pagevec.h>
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
+#include <linux/compat.h>
 #include <linux/file.h>
 #include <linux/mm_inline.h>
 
@@ -555,40 +556,74 @@  page_cache_async_readahead(struct address_space *mapping,
 }
 EXPORT_SYMBOL_GPL(page_cache_async_readahead);
 
-static ssize_t
-do_readahead(struct address_space *mapping, struct file *filp,
-	     pgoff_t index, unsigned long nr)
+static ssize_t do_readahead(int fd, loff_t offset, size_t count)
 {
-	if (!mapping || !mapping->a_ops)
-		return -EINVAL;
+	ssize_t ret = -EBADF;
+	struct fd f;
+	struct address_space *mapping;
+	pgoff_t start = offset >> PAGE_SHIFT;
+	pgoff_t end = (offset + count - 1) >> PAGE_SHIFT;
+	unsigned long len = end - start + 1;
+
+	f = fdget(fd);
+	if (!f.file)
+		goto out;
+
+	if (!(f.file->f_mode & FMODE_READ))
+		goto put_out;
+
+	mapping = f.file->f_mapping;
+	if (!mapping || !mapping->a_ops) {
+		ret = -EINVAL;
+		goto put_out;
+	}
 
 	/*
 	 * Readahead doesn't make sense for DAX inodes, but we don't want it
 	 * to report a failure either.  Instead, we just return success and
 	 * don't do any work.
 	 */
-	if (dax_mapping(mapping))
-		return 0;
+	if (dax_mapping(mapping)) {
+		ret = 0;
+		goto put_out;
+	}
+
+	ret = force_page_cache_readahead(mapping, f.file, start, len);
 
-	return force_page_cache_readahead(mapping, filp, index, nr);
+put_out:
+	fdput(f);
+
+out:
+	return ret;
 }
 
 SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
 {
-	ssize_t ret;
-	struct fd f;
+	return do_readahead(fd, offset, count);
+}
 
-	ret = -EBADF;
-	f = fdget(fd);
-	if (f.file) {
-		if (f.file->f_mode & FMODE_READ) {
-			struct address_space *mapping = f.file->f_mapping;
-			pgoff_t start = offset >> PAGE_SHIFT;
-			pgoff_t end = (offset + count - 1) >> PAGE_SHIFT;
-			unsigned long len = end - start + 1;
-			ret = do_readahead(mapping, f.file, start, len);
-		}
-		fdput(f);
-	}
-	return ret;
+#ifdef __ARCH_WANT_COMPAT_SYS_READAHEAD
+#if defined(__ARCH_WANT_COMPAT_SYS_WITH_PADDING) && \
+	defined(__ARCH_WANT_LE_COMPAT_SYS)
+COMPAT_SYSCALL_DEFINE5(readahead, int, fd, unsigned int, padding,
+		       unsigned int, off_lo, unsigned int, off_hi,
+		       size_t, count)
+#elif defined(__ARCH_WANT_COMPAT_SYS_WITH_PADDING) && \
+	!defined(__ARCH_WANT_LE_COMPAT_SYS)
+COMPAT_SYSCALL_DEFINE5(readahead, int, fd, unsigned int, padding,
+		       unsigned int, off_hi, unsigned int, off_lo,
+		       size_t, count)
+#elif !defined(__ARCH_WANT_COMPAT_SYS_WITH_PADDING) && \
+	defined(__ARCH_WANT_LE_COMPAT_SYS)
+COMPAT_SYSCALL_DEFINE4(readahead, int, fd,
+		       unsigned int, off_lo, unsigned int, off_hi,
+		       size_t, count)
+#else /* no padding, big endian */
+COMPAT_SYSCALL_DEFINE4(readahead, int, fd,
+		       unsigned int, off_hi, unsigned int, off_lo,
+		       size_t, count)
+#endif
+{
+	return do_readahead(fd, ((u64) off_hi << 32) | off_lo, count);
 }
+#endif /* __ARCH_WANT_COMPAT_SYS_READAHEAD */