[v2,3/3] move unaligned memory access functions to bswap.h

Message ID 1307370348-28400-4-git-send-email-pbonzini@redhat.com
State: New

Commit Message

Paolo Bonzini June 6, 2011, 2:25 p.m. UTC
This is just code movement, plus moving the fpu/ include path from
the target-dependent to the target-independent Make variables.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Makefile.hw |    2 +-
 bswap.h     |  474 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 configure   |    3 +-
 cpu-all.h   |  446 +-------------------------------------------------------
 4 files changed, 477 insertions(+), 448 deletions(-)

Comments

Richard Henderson June 6, 2011, 7:56 p.m. UTC | #1
Patches 1-3:
Reviewed-by: Richard Henderson <rth@twiddle.net>

That said,

On 06/06/2011 09:25 AM, Paolo Bonzini wrote:
> +/* conservative code for little endian unaligned accesses */
> +static inline int lduw_le_p(const void *ptr)
> +{
> +#ifdef _ARCH_PPC
> +    int val;
> +    __asm__ __volatile__ ("lhbrx %0,0,%1" : "=r" (val) : "r" (ptr));
> +    return val;
> +#else
> +    const uint8_t *p = ptr;
> +    return p[0] | (p[1] << 8);
> +#endif
> +}

Can we add a patch 4/3 that removes this sort of hard-coded
assembly stuff in favour of generic gcc code?  E.g.

static inline int lduw_le_p(const void *ptr)
{
  struct pack { uint16_t x __attribute__((packed)); };
  const struct pack *p = ptr;
  uint16_t ret;

  ret = p->x;
  ret = le_bswap(ret, 16);

  return ret;
}

One could fairly well macroize all 6 instances.
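
A macroized version might look like this (a sketch only: the
LD_UNALIGNED name is made up, and le_bswap/be_bswap are assumed to
expand to a byte swap or a no-op depending on host endianness, as in
the example above; return types are simplified to uintN_t, with the
signed and narrower-int variants derived by casting):

/* six instances: lduw/ldl/ldq x le/be */
#define LD_UNALIGNED(name, sz, endian)                          \
static inline uint##sz##_t name(const void *ptr)                \
{                                                               \
    struct pack { uint##sz##_t x __attribute__((packed)); };    \
    const struct pack *p = ptr;                                 \
    return endian##_bswap(p->x, sz);                            \
}

LD_UNALIGNED(lduw_le_p, 16, le)
LD_UNALIGNED(ldl_le_p,  32, le)
LD_UNALIGNED(ldq_le_p,  64, le)
LD_UNALIGNED(lduw_be_p, 16, be)
LD_UNALIGNED(ldl_be_p,  32, be)
LD_UNALIGNED(ldq_be_p,  64, be)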

The compiler knows that ppc and i386 can do the unaligned loads.
The compiler *should* be able to match the bswap_N patterns, and
thus also fold the unaligned load with the bswap for ppc.

I can confirm that gcc head does in fact do the right thing for
ppc32/64 here, as well as for i386/x86_64.  I don't have previous
versions of gcc checked out atm...

I haven't checked, but this *ought* to enable the load-asi bswap
instruction for sparcv9.  I also haven't checked what happens for
a target like sparcv8 that lacks both unaligned load and bswap,
to see that we don't simply double the number of shifts.  However,
I'd be tempted to file that as a gcc missed optimization bug.


r~
Paolo Bonzini June 6, 2011, 8:27 p.m. UTC | #2
On 06/06/2011 09:56 PM, Richard Henderson wrote:
> Can we add a patch 4/3 that removes this sort of hard-coded
> assembly stuff in favour of generic gcc code?

Then it's also possible to commit patch 2 from this submission, and wait 
while I rework the other two to use generic GCC code.

Paolo
Richard Henderson June 6, 2011, 10:05 p.m. UTC | #3
On 06/06/2011 01:27 PM, Paolo Bonzini wrote:
> On 06/06/2011 09:56 PM, Richard Henderson wrote:
>> Can we add a patch 4/3 that removes this sort of hard-coded
>> assembly stuff in favour of generic gcc code?
> 
> Then it's also possible to commit patch 2 from this submission, and
> wait while I rework the other two to use generic GCC code.

I began the message by acking all of 1 through 3.

The re-work for GCC code ought to be a follow-on patch.


r~
malc June 6, 2011, 11:07 p.m. UTC | #4
On Mon, 6 Jun 2011, Richard Henderson wrote:

> Patches 1-3:
> Reviewed-by: Richard Henderson <rth@twiddle.net>
> 
> That said,
> 
> On 06/06/2011 09:25 AM, Paolo Bonzini wrote:
> > +/* conservative code for little endian unaligned accesses */
> > +static inline int lduw_le_p(const void *ptr)
> > +{
> > +#ifdef _ARCH_PPC
> > +    int val;
> > +    __asm__ __volatile__ ("lhbrx %0,0,%1" : "=r" (val) : "r" (ptr));
> > +    return val;
> > +#else
> > +    const uint8_t *p = ptr;
> > +    return p[0] | (p[1] << 8);
> > +#endif
> > +}
> 
> Can we add a patch 4/3 that removes this sort of hard-coded
> assembly stuff in favour of generic gcc code?  E.g.
> 
> static inline int lduw_le_p(const void *ptr)
> {
>   struct pack { uint16_t x __attribute__((packed)); };
>   const struct pack *p = ptr;
>   uint16_t ret;
> 
>   ret = p->x;
>   ret = le_bswap(ret, 16);
> 
>   return ret;
> }
> 
> One could fairly well macroize all 6 instances.
> 
> The compiler knows that ppc and i386 can do the unaligned loads.
> The compiler *should* be able to match the bswap_N patterns, and
> thus also fold the unaligned load with the bswap for ppc.
> 
> I can confirm that gcc head does in fact do the right thing for
> ppc32/64 here, as well as for i386/x86_64.  I don't have previous
> versions of gcc checked out atm...

Depends on how bswap_16 is defined. If it is __builtin_bswap16
then 4.5.0 and 4.6.0 generate byte-reversed loads, and previous
versions lack that builtin, so I don't think this generic code
should go in.
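
(To make the concern concrete, a guard along these lines would be
needed. The version test below is illustrative only: the builtin's
availability also varies by target, not just by GCC release.)

#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
#define bswap_16(x) __builtin_bswap16(x)
#else
static inline uint16_t bswap_16(uint16_t x)
{
    /* open-coded fallback for compilers without the builtin */
    return (uint16_t)((x >> 8) | (x << 8));
}
#endif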

> 
> I haven't checked, but this *ought* to enable the load-asi bswap
> instruction for sparcv9.  I also haven't checked what happens for
> a target like sparcv8 that lacks both unaligned load and bswap,
> to see that we don't simply double the number of shifts.  However,
> I'd be tempted to file that as a gcc missed optimization bug.
> 
> 
> r~
>
Richard Henderson June 7, 2011, 4:54 p.m. UTC | #5
On 06/06/2011 04:07 PM, malc wrote:
> Depends on how bswap_16 is defined. If it is __builtin_bswap16
> then 4.5.0 and 4.6.0 generate byte-reversed loads, and previous
> versions lack that builtin, so I don't think this generic code
> should go in.

It would continue to be defined as-is, without direct reference
to the __builtin_bswap functions.  But you're right that the 
generic code would depend on the bswap optimization pass that
recognizes the mask/shift/or pattern and converts it to the
builtins internally.
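
The pattern in question is the plain mask/shift/or form, for example
(a sketch; bswap.h's existing fallback is along the same lines):

static inline uint16_t bswap16_generic(uint16_t x)
{
    /* GCC's bswap pass can turn this into a bswap insn or a
       byte-reversed load when the operand comes from memory */
    return ((x & 0x00ff) << 8) | ((x & 0xff00) >> 8);
}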

What if we kept the ppc ifdefs, but converted the rest to the
gcc generic code?
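
(A sketch of that compromise for one function, keeping the lhbrx asm
on ppc and using the packed-struct form elsewhere; le_bswap is assumed
as in the earlier example:)

static inline int lduw_le_p(const void *ptr)
{
#ifdef _ARCH_PPC
    /* keep the hand-written byte-reversed load on ppc */
    int val;
    __asm__ __volatile__ ("lhbrx %0,0,%1" : "=r" (val) : "r" (ptr));
    return val;
#else
    /* generic: unaligned load via packed struct, swapped if the
       host is big-endian */
    struct pack { uint16_t x __attribute__((packed)); };
    const struct pack *p = ptr;
    return le_bswap(p->x, 16);
#endif
}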


r~
malc June 7, 2011, 5:29 p.m. UTC | #6
On Tue, 7 Jun 2011, Richard Henderson wrote:

> On 06/06/2011 04:07 PM, malc wrote:
> > Depends on how bswap_16 is defined. If it is __builtin_bswap16
> > then 4.5.0 and 4.6.0 generate byte-reversed loads, and previous
> > versions lack that builtin, so I don't think this generic code
> > should go in.
> 
> It would continue to be defined as-is, without direct reference
> to the __builtin_bswap functions.  But you're right that the 
> generic code would depend on the bswap optimization pass that
> recognizes the mask/shift/or pattern and converts it to the
> builtins internally.
> 
> What if we kept the ppc ifdefs, but converted the rest to the
> gcc generic code?
> 

No objections.

Patch

diff --git a/Makefile.hw b/Makefile.hw
index b9181ab..659e441 100644
--- a/Makefile.hw
+++ b/Makefile.hw
@@ -9,7 +9,7 @@  include $(SRC_PATH)/rules.mak
 
 $(call set-vpath, $(SRC_PATH):$(SRC_PATH)/hw)
 
-QEMU_CFLAGS+=-I.. -I$(SRC_PATH)/fpu
+QEMU_CFLAGS+=-I..
 
 include $(SRC_PATH)/Makefile.objs
 
diff --git a/bswap.h b/bswap.h
index 82a7951..f41bebe 100644
--- a/bswap.h
+++ b/bswap.h
@@ -11,6 +11,8 @@ 
 #include <machine/bswap.h>
 #else
 
+#include "softfloat.h"
+
 #ifdef CONFIG_BYTESWAP_H
 #include <byteswap.h>
 #else
@@ -237,4 +239,476 @@  static inline uint32_t qemu_bswap_len(uint32_t value, int len)
     return bswap32(value) >> (32 - 8 * len);
 }
 
+typedef union {
+    float32 f;
+    uint32_t l;
+} CPU_FloatU;
+
+typedef union {
+    float64 d;
+#if defined(HOST_WORDS_BIGENDIAN)
+    struct {
+        uint32_t upper;
+        uint32_t lower;
+    } l;
+#else
+    struct {
+        uint32_t lower;
+        uint32_t upper;
+    } l;
+#endif
+    uint64_t ll;
+} CPU_DoubleU;
+
+typedef union {
+     floatx80 d;
+     struct {
+         uint64_t lower;
+         uint16_t upper;
+     } l;
+} CPU_LDoubleU;
+
+typedef union {
+    float128 q;
+#if defined(HOST_WORDS_BIGENDIAN)
+    struct {
+        uint32_t upmost;
+        uint32_t upper;
+        uint32_t lower;
+        uint32_t lowest;
+    } l;
+    struct {
+        uint64_t upper;
+        uint64_t lower;
+    } ll;
+#else
+    struct {
+        uint32_t lowest;
+        uint32_t lower;
+        uint32_t upper;
+        uint32_t upmost;
+    } l;
+    struct {
+        uint64_t lower;
+        uint64_t upper;
+    } ll;
+#endif
+} CPU_QuadU;
+
+/* unaligned/endian-independent pointer access */
+
+/*
+ * the generic syntax is:
+ *
+ * load: ld{type}{sign}{size}{endian}_p(ptr)
+ *
+ * store: st{type}{size}{endian}_p(ptr, val)
+ *
+ * Note there are small differences with the softmmu access API!
+ *
+ * type is:
+ * (empty): integer access
+ *   f    : float access
+ *
+ * sign is:
+ * (empty): for floats or 32 bit size
+ *   u    : unsigned
+ *   s    : signed
+ *
+ * size is:
+ *   b: 8 bits
+ *   w: 16 bits
+ *   l: 32 bits
+ *   q: 64 bits
+ *
+ * endian is:
+ * (empty): 8 bit access
+ *   be   : big endian
+ *   le   : little endian
+ */
+static inline int ldub_p(const void *ptr)
+{
+    return *(uint8_t *)ptr;
+}
+
+static inline int ldsb_p(const void *ptr)
+{
+    return *(int8_t *)ptr;
+}
+
+static inline void stb_p(void *ptr, int v)
+{
+    *(uint8_t *)ptr = v;
+}
+
+/* NOTE: on arm, putting 2 in /proc/sys/debug/alignment so that the
+   kernel handles unaligned load/stores may give better results, but
+   it is a system wide setting : bad */
+#if defined(HOST_WORDS_BIGENDIAN) || defined(WORDS_ALIGNED)
+
+/* conservative code for little endian unaligned accesses */
+static inline int lduw_le_p(const void *ptr)
+{
+#ifdef _ARCH_PPC
+    int val;
+    __asm__ __volatile__ ("lhbrx %0,0,%1" : "=r" (val) : "r" (ptr));
+    return val;
+#else
+    const uint8_t *p = ptr;
+    return p[0] | (p[1] << 8);
+#endif
+}
+
+static inline int ldsw_le_p(const void *ptr)
+{
+#ifdef _ARCH_PPC
+    int val;
+    __asm__ __volatile__ ("lhbrx %0,0,%1" : "=r" (val) : "r" (ptr));
+    return (int16_t)val;
+#else
+    const uint8_t *p = ptr;
+    return (int16_t)(p[0] | (p[1] << 8));
+#endif
+}
+
+static inline int ldl_le_p(const void *ptr)
+{
+#ifdef _ARCH_PPC
+    int val;
+    __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (val) : "r" (ptr));
+    return val;
+#else
+    const uint8_t *p = ptr;
+    return p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24);
+#endif
+}
+
+static inline uint64_t ldq_le_p(const void *ptr)
+{
+    const uint8_t *p = ptr;
+    uint32_t v1, v2;
+    v1 = ldl_le_p(p);
+    v2 = ldl_le_p(p + 4);
+    return v1 | ((uint64_t)v2 << 32);
+}
+
+static inline void stw_le_p(void *ptr, int v)
+{
+#ifdef _ARCH_PPC
+    __asm__ __volatile__ ("sthbrx %1,0,%2" : "=m" (*(uint16_t *)ptr) : "r" (v), "r" (ptr));
+#else
+    uint8_t *p = ptr;
+    p[0] = v;
+    p[1] = v >> 8;
+#endif
+}
+
+static inline void stl_le_p(void *ptr, int v)
+{
+#ifdef _ARCH_PPC
+    __asm__ __volatile__ ("stwbrx %1,0,%2" : "=m" (*(uint32_t *)ptr) : "r" (v), "r" (ptr));
+#else
+    uint8_t *p = ptr;
+    p[0] = v;
+    p[1] = v >> 8;
+    p[2] = v >> 16;
+    p[3] = v >> 24;
+#endif
+}
+
+static inline void stq_le_p(void *ptr, uint64_t v)
+{
+    uint8_t *p = ptr;
+    stl_le_p(p, (uint32_t)v);
+    stl_le_p(p + 4, v >> 32);
+}
+
+/* float access */
+
+static inline float32 ldfl_le_p(const void *ptr)
+{
+    union {
+        float32 f;
+        uint32_t i;
+    } u;
+    u.i = ldl_le_p(ptr);
+    return u.f;
+}
+
+static inline void stfl_le_p(void *ptr, float32 v)
+{
+    union {
+        float32 f;
+        uint32_t i;
+    } u;
+    u.f = v;
+    stl_le_p(ptr, u.i);
+}
+
+static inline float64 ldfq_le_p(const void *ptr)
+{
+    CPU_DoubleU u;
+    u.l.lower = ldl_le_p(ptr);
+    u.l.upper = ldl_le_p(ptr + 4);
+    return u.d;
+}
+
+static inline void stfq_le_p(void *ptr, float64 v)
+{
+    CPU_DoubleU u;
+    u.d = v;
+    stl_le_p(ptr, u.l.lower);
+    stl_le_p(ptr + 4, u.l.upper);
+}
+
+#else
+
+static inline int lduw_le_p(const void *ptr)
+{
+    return *(uint16_t *)ptr;
+}
+
+static inline int ldsw_le_p(const void *ptr)
+{
+    return *(int16_t *)ptr;
+}
+
+static inline int ldl_le_p(const void *ptr)
+{
+    return *(uint32_t *)ptr;
+}
+
+static inline uint64_t ldq_le_p(const void *ptr)
+{
+    return *(uint64_t *)ptr;
+}
+
+static inline void stw_le_p(void *ptr, int v)
+{
+    *(uint16_t *)ptr = v;
+}
+
+static inline void stl_le_p(void *ptr, int v)
+{
+    *(uint32_t *)ptr = v;
+}
+
+static inline void stq_le_p(void *ptr, uint64_t v)
+{
+    *(uint64_t *)ptr = v;
+}
+
+/* float access */
+
+static inline float32 ldfl_le_p(const void *ptr)
+{
+    return *(float32 *)ptr;
+}
+
+static inline float64 ldfq_le_p(const void *ptr)
+{
+    return *(float64 *)ptr;
+}
+
+static inline void stfl_le_p(void *ptr, float32 v)
+{
+    *(float32 *)ptr = v;
+}
+
+static inline void stfq_le_p(void *ptr, float64 v)
+{
+    *(float64 *)ptr = v;
+}
+#endif
+
+#if !defined(HOST_WORDS_BIGENDIAN) || defined(WORDS_ALIGNED)
+
+static inline int lduw_be_p(const void *ptr)
+{
+#if defined(__i386__)
+    int val;
+    asm volatile ("movzwl %1, %0\n"
+                  "xchgb %b0, %h0\n"
+                  : "=q" (val)
+                  : "m" (*(uint16_t *)ptr));
+    return val;
+#else
+    const uint8_t *b = ptr;
+    return ((b[0] << 8) | b[1]);
+#endif
+}
+
+static inline int ldsw_be_p(const void *ptr)
+{
+#if defined(__i386__)
+    int val;
+    asm volatile ("movzwl %1, %0\n"
+                  "xchgb %b0, %h0\n"
+                  : "=q" (val)
+                  : "m" (*(uint16_t *)ptr));
+    return (int16_t)val;
+#else
+    const uint8_t *b = ptr;
+    return (int16_t)((b[0] << 8) | b[1]);
+#endif
+}
+
+static inline int ldl_be_p(const void *ptr)
+{
+#if defined(__i386__) || defined(__x86_64__)
+    int val;
+    asm volatile ("movl %1, %0\n"
+                  "bswap %0\n"
+                  : "=r" (val)
+                  : "m" (*(uint32_t *)ptr));
+    return val;
+#else
+    const uint8_t *b = ptr;
+    return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
+#endif
+}
+
+static inline uint64_t ldq_be_p(const void *ptr)
+{
+    uint32_t a,b;
+    a = ldl_be_p(ptr);
+    b = ldl_be_p((uint8_t *)ptr + 4);
+    return (((uint64_t)a<<32)|b);
+}
+
+static inline void stw_be_p(void *ptr, int v)
+{
+#if defined(__i386__)
+    asm volatile ("xchgb %b0, %h0\n"
+                  "movw %w0, %1\n"
+                  : "=q" (v)
+                  : "m" (*(uint16_t *)ptr), "0" (v));
+#else
+    uint8_t *d = (uint8_t *) ptr;
+    d[0] = v >> 8;
+    d[1] = v;
+#endif
+}
+
+static inline void stl_be_p(void *ptr, int v)
+{
+#if defined(__i386__) || defined(__x86_64__)
+    asm volatile ("bswap %0\n"
+                  "movl %0, %1\n"
+                  : "=r" (v)
+                  : "m" (*(uint32_t *)ptr), "0" (v));
+#else
+    uint8_t *d = (uint8_t *) ptr;
+    d[0] = v >> 24;
+    d[1] = v >> 16;
+    d[2] = v >> 8;
+    d[3] = v;
+#endif
+}
+
+static inline void stq_be_p(void *ptr, uint64_t v)
+{
+    stl_be_p(ptr, v >> 32);
+    stl_be_p((uint8_t *)ptr + 4, v);
+}
+
+/* float access */
+
+static inline float32 ldfl_be_p(const void *ptr)
+{
+    union {
+        float32 f;
+        uint32_t i;
+    } u;
+    u.i = ldl_be_p(ptr);
+    return u.f;
+}
+
+static inline void stfl_be_p(void *ptr, float32 v)
+{
+    union {
+        float32 f;
+        uint32_t i;
+    } u;
+    u.f = v;
+    stl_be_p(ptr, u.i);
+}
+
+static inline float64 ldfq_be_p(const void *ptr)
+{
+    CPU_DoubleU u;
+    u.l.upper = ldl_be_p(ptr);
+    u.l.lower = ldl_be_p((uint8_t *)ptr + 4);
+    return u.d;
+}
+
+static inline void stfq_be_p(void *ptr, float64 v)
+{
+    CPU_DoubleU u;
+    u.d = v;
+    stl_be_p(ptr, u.l.upper);
+    stl_be_p((uint8_t *)ptr + 4, u.l.lower);
+}
+
+#else
+
+static inline int lduw_be_p(const void *ptr)
+{
+    return *(uint16_t *)ptr;
+}
+
+static inline int ldsw_be_p(const void *ptr)
+{
+    return *(int16_t *)ptr;
+}
+
+static inline int ldl_be_p(const void *ptr)
+{
+    return *(uint32_t *)ptr;
+}
+
+static inline uint64_t ldq_be_p(const void *ptr)
+{
+    return *(uint64_t *)ptr;
+}
+
+static inline void stw_be_p(void *ptr, int v)
+{
+    *(uint16_t *)ptr = v;
+}
+
+static inline void stl_be_p(void *ptr, int v)
+{
+    *(uint32_t *)ptr = v;
+}
+
+static inline void stq_be_p(void *ptr, uint64_t v)
+{
+    *(uint64_t *)ptr = v;
+}
+
+/* float access */
+
+static inline float32 ldfl_be_p(const void *ptr)
+{
+    return *(float32 *)ptr;
+}
+
+static inline float64 ldfq_be_p(const void *ptr)
+{
+    return *(float64 *)ptr;
+}
+
+static inline void stfl_be_p(void *ptr, float32 v)
+{
+    *(float32 *)ptr = v;
+}
+
+static inline void stfq_be_p(void *ptr, float64 v)
+{
+    *(float64 *)ptr = v;
+}
+
+#endif
+
 #endif /* BSWAP_H */
diff --git a/configure b/configure
index d38b952..29c5bcc 100755
--- a/configure
+++ b/configure
@@ -233,7 +233,7 @@  QEMU_CFLAGS="-Wall -Wundef -Wwrite-strings -Wmissing-prototypes $QEMU_CFLAGS"
 QEMU_CFLAGS="-Wstrict-prototypes -Wredundant-decls $QEMU_CFLAGS"
 QEMU_CFLAGS="-D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE $QEMU_CFLAGS"
 QEMU_CFLAGS="-D_FORTIFY_SOURCE=2 $QEMU_CFLAGS"
-QEMU_INCLUDES="-I. -I\$(SRC_PATH)"
+QEMU_INCLUDES="-I. -I\$(SRC_PATH) -I\$(SRC_PATH)/fpu"
 LDFLAGS="-g $LDFLAGS"
 
 # make source path absolute
@@ -3415,7 +3415,6 @@  else
   includes="-I\$(SRC_PATH)/tcg/\$(ARCH) $includes"
 fi
 includes="-I\$(SRC_PATH)/tcg $includes"
-includes="-I\$(SRC_PATH)/fpu $includes"
 
 if test "$target_user_only" = "yes" ; then
     libdis_config_mak=libdis-user/config.mak
diff --git a/cpu-all.h b/cpu-all.h
index 880f570..bdf1f08 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -35,8 +35,6 @@ 
  * TARGET_WORDS_BIGENDIAN : same for target cpu
  */
 
-#include "softfloat.h"
-
 #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
 #define BSWAP_NEEDED
 #endif
@@ -114,64 +112,6 @@  static inline void tswap64s(uint64_t *s)
 #define bswaptls(s) bswap64s(s)
 #endif
 
-typedef union {
-    float32 f;
-    uint32_t l;
-} CPU_FloatU;
-
-/* NOTE: arm FPA is horrible as double 32 bit words are stored in big
-   endian ! */
-typedef union {
-    float64 d;
-#if defined(HOST_WORDS_BIGENDIAN)
-    struct {
-        uint32_t upper;
-        uint32_t lower;
-    } l;
-#else
-    struct {
-        uint32_t lower;
-        uint32_t upper;
-    } l;
-#endif
-    uint64_t ll;
-} CPU_DoubleU;
-
-typedef union {
-     floatx80 d;
-     struct {
-         uint64_t lower;
-         uint16_t upper;
-     } l;
-} CPU_LDoubleU;
-
-typedef union {
-    float128 q;
-#if defined(HOST_WORDS_BIGENDIAN)
-    struct {
-        uint32_t upmost;
-        uint32_t upper;
-        uint32_t lower;
-        uint32_t lowest;
-    } l;
-    struct {
-        uint64_t upper;
-        uint64_t lower;
-    } ll;
-#else
-    struct {
-        uint32_t lowest;
-        uint32_t lower;
-        uint32_t upper;
-        uint32_t upmost;
-    } l;
-    struct {
-        uint64_t lower;
-        uint64_t upper;
-    } ll;
-#endif
-} CPU_QuadU;
-
 /* CPU memory access without any memory or io remapping */
 
 /*
@@ -207,392 +147,8 @@  typedef union {
  *   user   : user mode access using soft MMU
  *   kernel : kernel mode access using soft MMU
  */
-static inline int ldub_p(const void *ptr)
-{
-    return *(uint8_t *)ptr;
-}
-
-static inline int ldsb_p(const void *ptr)
-{
-    return *(int8_t *)ptr;
-}
-
-static inline void stb_p(void *ptr, int v)
-{
-    *(uint8_t *)ptr = v;
-}
-
-/* NOTE: on arm, putting 2 in /proc/sys/debug/alignment so that the
-   kernel handles unaligned load/stores may give better results, but
-   it is a system wide setting : bad */
-#if defined(HOST_WORDS_BIGENDIAN) || defined(WORDS_ALIGNED)
-
-/* conservative code for little endian unaligned accesses */
-static inline int lduw_le_p(const void *ptr)
-{
-#ifdef _ARCH_PPC
-    int val;
-    __asm__ __volatile__ ("lhbrx %0,0,%1" : "=r" (val) : "r" (ptr));
-    return val;
-#else
-    const uint8_t *p = ptr;
-    return p[0] | (p[1] << 8);
-#endif
-}
-
-static inline int ldsw_le_p(const void *ptr)
-{
-#ifdef _ARCH_PPC
-    int val;
-    __asm__ __volatile__ ("lhbrx %0,0,%1" : "=r" (val) : "r" (ptr));
-    return (int16_t)val;
-#else
-    const uint8_t *p = ptr;
-    return (int16_t)(p[0] | (p[1] << 8));
-#endif
-}
-
-static inline int ldl_le_p(const void *ptr)
-{
-#ifdef _ARCH_PPC
-    int val;
-    __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (val) : "r" (ptr));
-    return val;
-#else
-    const uint8_t *p = ptr;
-    return p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24);
-#endif
-}
-
-static inline uint64_t ldq_le_p(const void *ptr)
-{
-    const uint8_t *p = ptr;
-    uint32_t v1, v2;
-    v1 = ldl_le_p(p);
-    v2 = ldl_le_p(p + 4);
-    return v1 | ((uint64_t)v2 << 32);
-}
-
-static inline void stw_le_p(void *ptr, int v)
-{
-#ifdef _ARCH_PPC
-    __asm__ __volatile__ ("sthbrx %1,0,%2" : "=m" (*(uint16_t *)ptr) : "r" (v), "r" (ptr));
-#else
-    uint8_t *p = ptr;
-    p[0] = v;
-    p[1] = v >> 8;
-#endif
-}
-
-static inline void stl_le_p(void *ptr, int v)
-{
-#ifdef _ARCH_PPC
-    __asm__ __volatile__ ("stwbrx %1,0,%2" : "=m" (*(uint32_t *)ptr) : "r" (v), "r" (ptr));
-#else
-    uint8_t *p = ptr;
-    p[0] = v;
-    p[1] = v >> 8;
-    p[2] = v >> 16;
-    p[3] = v >> 24;
-#endif
-}
-
-static inline void stq_le_p(void *ptr, uint64_t v)
-{
-    uint8_t *p = ptr;
-    stl_le_p(p, (uint32_t)v);
-    stl_le_p(p + 4, v >> 32);
-}
-
-/* float access */
-
-static inline float32 ldfl_le_p(const void *ptr)
-{
-    union {
-        float32 f;
-        uint32_t i;
-    } u;
-    u.i = ldl_le_p(ptr);
-    return u.f;
-}
-
-static inline void stfl_le_p(void *ptr, float32 v)
-{
-    union {
-        float32 f;
-        uint32_t i;
-    } u;
-    u.f = v;
-    stl_le_p(ptr, u.i);
-}
-
-static inline float64 ldfq_le_p(const void *ptr)
-{
-    CPU_DoubleU u;
-    u.l.lower = ldl_le_p(ptr);
-    u.l.upper = ldl_le_p(ptr + 4);
-    return u.d;
-}
-
-static inline void stfq_le_p(void *ptr, float64 v)
-{
-    CPU_DoubleU u;
-    u.d = v;
-    stl_le_p(ptr, u.l.lower);
-    stl_le_p(ptr + 4, u.l.upper);
-}
-
-#else
-
-static inline int lduw_le_p(const void *ptr)
-{
-    return *(uint16_t *)ptr;
-}
-
-static inline int ldsw_le_p(const void *ptr)
-{
-    return *(int16_t *)ptr;
-}
-
-static inline int ldl_le_p(const void *ptr)
-{
-    return *(uint32_t *)ptr;
-}
-
-static inline uint64_t ldq_le_p(const void *ptr)
-{
-    return *(uint64_t *)ptr;
-}
-
-static inline void stw_le_p(void *ptr, int v)
-{
-    *(uint16_t *)ptr = v;
-}
-
-static inline void stl_le_p(void *ptr, int v)
-{
-    *(uint32_t *)ptr = v;
-}
-
-static inline void stq_le_p(void *ptr, uint64_t v)
-{
-    *(uint64_t *)ptr = v;
-}
-
-/* float access */
-
-static inline float32 ldfl_le_p(const void *ptr)
-{
-    return *(float32 *)ptr;
-}
-
-static inline float64 ldfq_le_p(const void *ptr)
-{
-    return *(float64 *)ptr;
-}
-
-static inline void stfl_le_p(void *ptr, float32 v)
-{
-    *(float32 *)ptr = v;
-}
-
-static inline void stfq_le_p(void *ptr, float64 v)
-{
-    *(float64 *)ptr = v;
-}
-#endif
-
-#if !defined(HOST_WORDS_BIGENDIAN) || defined(WORDS_ALIGNED)
-
-static inline int lduw_be_p(const void *ptr)
-{
-#if defined(__i386__)
-    int val;
-    asm volatile ("movzwl %1, %0\n"
-                  "xchgb %b0, %h0\n"
-                  : "=q" (val)
-                  : "m" (*(uint16_t *)ptr));
-    return val;
-#else
-    const uint8_t *b = ptr;
-    return ((b[0] << 8) | b[1]);
-#endif
-}
-
-static inline int ldsw_be_p(const void *ptr)
-{
-#if defined(__i386__)
-    int val;
-    asm volatile ("movzwl %1, %0\n"
-                  "xchgb %b0, %h0\n"
-                  : "=q" (val)
-                  : "m" (*(uint16_t *)ptr));
-    return (int16_t)val;
-#else
-    const uint8_t *b = ptr;
-    return (int16_t)((b[0] << 8) | b[1]);
-#endif
-}
-
-static inline int ldl_be_p(const void *ptr)
-{
-#if defined(__i386__) || defined(__x86_64__)
-    int val;
-    asm volatile ("movl %1, %0\n"
-                  "bswap %0\n"
-                  : "=r" (val)
-                  : "m" (*(uint32_t *)ptr));
-    return val;
-#else
-    const uint8_t *b = ptr;
-    return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
-#endif
-}
-
-static inline uint64_t ldq_be_p(const void *ptr)
-{
-    uint32_t a,b;
-    a = ldl_be_p(ptr);
-    b = ldl_be_p((uint8_t *)ptr + 4);
-    return (((uint64_t)a<<32)|b);
-}
-
-static inline void stw_be_p(void *ptr, int v)
-{
-#if defined(__i386__)
-    asm volatile ("xchgb %b0, %h0\n"
-                  "movw %w0, %1\n"
-                  : "=q" (v)
-                  : "m" (*(uint16_t *)ptr), "0" (v));
-#else
-    uint8_t *d = (uint8_t *) ptr;
-    d[0] = v >> 8;
-    d[1] = v;
-#endif
-}
-
-static inline void stl_be_p(void *ptr, int v)
-{
-#if defined(__i386__) || defined(__x86_64__)
-    asm volatile ("bswap %0\n"
-                  "movl %0, %1\n"
-                  : "=r" (v)
-                  : "m" (*(uint32_t *)ptr), "0" (v));
-#else
-    uint8_t *d = (uint8_t *) ptr;
-    d[0] = v >> 24;
-    d[1] = v >> 16;
-    d[2] = v >> 8;
-    d[3] = v;
-#endif
-}
-
-static inline void stq_be_p(void *ptr, uint64_t v)
-{
-    stl_be_p(ptr, v >> 32);
-    stl_be_p((uint8_t *)ptr + 4, v);
-}
-
-/* float access */
-
-static inline float32 ldfl_be_p(const void *ptr)
-{
-    union {
-        float32 f;
-        uint32_t i;
-    } u;
-    u.i = ldl_be_p(ptr);
-    return u.f;
-}
-
-static inline void stfl_be_p(void *ptr, float32 v)
-{
-    union {
-        float32 f;
-        uint32_t i;
-    } u;
-    u.f = v;
-    stl_be_p(ptr, u.i);
-}
-
-static inline float64 ldfq_be_p(const void *ptr)
-{
-    CPU_DoubleU u;
-    u.l.upper = ldl_be_p(ptr);
-    u.l.lower = ldl_be_p((uint8_t *)ptr + 4);
-    return u.d;
-}
-
-static inline void stfq_be_p(void *ptr, float64 v)
-{
-    CPU_DoubleU u;
-    u.d = v;
-    stl_be_p(ptr, u.l.upper);
-    stl_be_p((uint8_t *)ptr + 4, u.l.lower);
-}
-
-#else
-
-static inline int lduw_be_p(const void *ptr)
-{
-    return *(uint16_t *)ptr;
-}
-
-static inline int ldsw_be_p(const void *ptr)
-{
-    return *(int16_t *)ptr;
-}
-
-static inline int ldl_be_p(const void *ptr)
-{
-    return *(uint32_t *)ptr;
-}
-
-static inline uint64_t ldq_be_p(const void *ptr)
-{
-    return *(uint64_t *)ptr;
-}
-
-static inline void stw_be_p(void *ptr, int v)
-{
-    *(uint16_t *)ptr = v;
-}
-
-static inline void stl_be_p(void *ptr, int v)
-{
-    *(uint32_t *)ptr = v;
-}
-
-static inline void stq_be_p(void *ptr, uint64_t v)
-{
-    *(uint64_t *)ptr = v;
-}
-
-/* float access */
-
-static inline float32 ldfl_be_p(const void *ptr)
-{
-    return *(float32 *)ptr;
-}
-
-static inline float64 ldfq_be_p(const void *ptr)
-{
-    return *(float64 *)ptr;
-}
-
-static inline void stfl_be_p(void *ptr, float32 v)
-{
-    *(float32 *)ptr = v;
-}
-
-static inline void stfq_be_p(void *ptr, float64 v)
-{
-    *(float64 *)ptr = v;
-}
-
-#endif
 
-/* target CPU memory access functions */
+/* target-endianness CPU memory access functions */
 #if defined(TARGET_WORDS_BIGENDIAN)
 #define lduw_p(p) lduw_be_p(p)
 #define ldsw_p(p) ldsw_be_p(p)