diff mbox series

[v2,37/55] target/mips: Use 8-byte memory ops for msa load/store

Message ID 20210803041443.55452-38-richard.henderson@linaro.org
State New
Headers show
Series Unaligned access for user-only | expand

Commit Message

Richard Henderson Aug. 3, 2021, 4:14 a.m. UTC
Rather than use 4-16 separate operations, use 2 operations
plus some byte reordering as necessary.

Cc: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/mips/tcg/msa_helper.c | 201 +++++++++++++----------------------
 1 file changed, 71 insertions(+), 130 deletions(-)

Comments

Philippe Mathieu-Daudé Aug. 18, 2021, 9:21 a.m. UTC | #1
On 8/3/21 6:14 AM, Richard Henderson wrote:
> Rather than use 4-16 separate operations, use 2 operations
> plus some byte reordering as necessary.
> 
> Cc: Philippe Mathieu-Daudé <f4bug@amsat.org>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/mips/tcg/msa_helper.c | 201 +++++++++++++----------------------
>  1 file changed, 71 insertions(+), 130 deletions(-)
> 
> diff --git a/target/mips/tcg/msa_helper.c b/target/mips/tcg/msa_helper.c
> index a8880ce81c..e40c1b7057 100644
> --- a/target/mips/tcg/msa_helper.c
> +++ b/target/mips/tcg/msa_helper.c
> @@ -8218,47 +8218,31 @@ void helper_msa_ffint_u_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
>  #define MEMOP_IDX(DF)
>  #endif
>  
> +#ifdef TARGET_WORDS_BIGENDIAN
> +static inline uint64_t bswap16x4(uint64_t x)
> +{
> +    uint64_t m = 0x00ff00ff00ff00ffull;
> +    return ((x & m) << 8) | ((x >> 8) & m);
> +}
> +
> +static inline uint64_t bswap32x2(uint64_t x)
> +{
> +    return ror64(bswap64(x), 32);
> +}
> +#endif

I'm trying to remove TARGET_WORDS_BIGENDIAN uses,
so this would become:

static inline bool is_cpu_bigendian(CPUMIPSState *)
{
    return extract32(env->CP0_Config0, CP0C0_BE, 1);
}

static inline uint64_t bswap16x4(CPUMIPSState *env, uint64_t x)
{
    if (is_cpu_bigendian(env)) {
        uint64_t m = 0x00ff00ff00ff00ffull;
        return ((x & m) << 8) | ((x >> 8) & m);
    } else {
        return x;
    }
}

static inline uint64_t bswap32x2(CPUMIPSState *env, uint64_t x)
{
    if (is_cpu_bigendian(env)) {
        return ror64(bswap64(x), 32);
    } else {
        return x;
    }
}

And we can remove the other TARGET_WORDS_BIGENDIAN uses:

>  void helper_msa_ld_w(CPUMIPSState *env, uint32_t wd,
> @@ -8293,18 +8271,20 @@ void helper_msa_ld_w(CPUMIPSState *env, uint32_t wd,
>  {
>      wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
>      uintptr_t ra = GETPC();
> +    uint64_t d0, d1;
>  
> -#if !defined(HOST_WORDS_BIGENDIAN)
> -    pwd->w[0] = cpu_ldl_data_ra(env, addr + (0 << DF_WORD), ra);
> -    pwd->w[1] = cpu_ldl_data_ra(env, addr + (1 << DF_WORD), ra);
> -    pwd->w[2] = cpu_ldl_data_ra(env, addr + (2 << DF_WORD), ra);
> -    pwd->w[3] = cpu_ldl_data_ra(env, addr + (3 << DF_WORD), ra);
> -#else
> -    pwd->w[0] = cpu_ldl_data_ra(env, addr + (1 << DF_WORD), ra);
> -    pwd->w[1] = cpu_ldl_data_ra(env, addr + (0 << DF_WORD), ra);
> -    pwd->w[2] = cpu_ldl_data_ra(env, addr + (3 << DF_WORD), ra);
> -    pwd->w[3] = cpu_ldl_data_ra(env, addr + (2 << DF_WORD), ra);
> +    /*
> +     * Load 8 bytes at a time.  Use little-endian load, then for
> +     * big-endian target, we must then bswap the two words.
> +     */
> +    d0 = cpu_ldq_le_data_ra(env, addr + 0, ra);
> +    d1 = cpu_ldq_le_data_ra(env, addr + 8, ra);
> +#ifdef TARGET_WORDS_BIGENDIAN
> +    d0 = bswap32x2(d0);
> +    d1 = bswap32x2(d1);
>  #endif
> +    pwd->d[0] = d0;
> +    pwd->d[1] = d1;
>  }

But can be done later, so:

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>

(nice simplification BTW!).
Richard Henderson Aug. 18, 2021, 5:55 p.m. UTC | #2
On 8/17/21 11:21 PM, Philippe Mathieu-Daudé wrote:
>> +#ifdef TARGET_WORDS_BIGENDIAN
>> +static inline uint64_t bswap16x4(uint64_t x)
>> +{
>> +    uint64_t m = 0x00ff00ff00ff00ffull;
>> +    return ((x & m) << 8) | ((x >> 8) & m);
>> +}
>> +
>> +static inline uint64_t bswap32x2(uint64_t x)
>> +{
>> +    return ror64(bswap64(x), 32);
>> +}
>> +#endif
> 
> I'm trying to remove TARGET_WORDS_BIGENDIAN uses,
> so this would become:
> 
> static inline bool is_cpu_bigendian(CPUMIPSState *)
> {
>      return extract32(env->CP0_Config0, CP0C0_BE, 1);
> }
> 
> static inline uint64_t bswap16x4(CPUMIPSState *env, uint64_t x)
> {
>      if (is_cpu_bigendian(env)) {
>          uint64_t m = 0x00ff00ff00ff00ffull;
>          return ((x & m) << 8) | ((x >> 8) & m);
>      } else {
>          return x;
>      }
> }
> 
> static inline uint64_t bswap32x2(CPUMIPSState *env, uint64_t x)
> {
>      if (is_cpu_bigendian(env)) {
>          return ror64(bswap64(x), 32);
>      } else {
>          return x;
>      }
> }

I would not put the bigendian test in here...

> 
> And we can remove the other TARGET_WORDS_BIGENDIAN uses:
> 
>>   void helper_msa_ld_w(CPUMIPSState *env, uint32_t wd,
>> @@ -8293,18 +8271,20 @@ void helper_msa_ld_w(CPUMIPSState *env, uint32_t wd,
>>   {
>>       wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
>>       uintptr_t ra = GETPC();
>> +    uint64_t d0, d1;
>>   
>> -#if !defined(HOST_WORDS_BIGENDIAN)
>> -    pwd->w[0] = cpu_ldl_data_ra(env, addr + (0 << DF_WORD), ra);
>> -    pwd->w[1] = cpu_ldl_data_ra(env, addr + (1 << DF_WORD), ra);
>> -    pwd->w[2] = cpu_ldl_data_ra(env, addr + (2 << DF_WORD), ra);
>> -    pwd->w[3] = cpu_ldl_data_ra(env, addr + (3 << DF_WORD), ra);
>> -#else
>> -    pwd->w[0] = cpu_ldl_data_ra(env, addr + (1 << DF_WORD), ra);
>> -    pwd->w[1] = cpu_ldl_data_ra(env, addr + (0 << DF_WORD), ra);
>> -    pwd->w[2] = cpu_ldl_data_ra(env, addr + (3 << DF_WORD), ra);
>> -    pwd->w[3] = cpu_ldl_data_ra(env, addr + (2 << DF_WORD), ra);
>> +    /*
>> +     * Load 8 bytes at a time.  Use little-endian load, then for
>> +     * big-endian target, we must then bswap the two words.
>> +     */
>> +    d0 = cpu_ldq_le_data_ra(env, addr + 0, ra);
>> +    d1 = cpu_ldq_le_data_ra(env, addr + 8, ra);
>> +#ifdef TARGET_WORDS_BIGENDIAN
>> +    d0 = bswap32x2(d0);
>> +    d1 = bswap32x2(d1);
>>   #endif

... I would leave it here.


r~
diff mbox series

Patch

diff --git a/target/mips/tcg/msa_helper.c b/target/mips/tcg/msa_helper.c
index a8880ce81c..e40c1b7057 100644
--- a/target/mips/tcg/msa_helper.c
+++ b/target/mips/tcg/msa_helper.c
@@ -8218,47 +8218,31 @@  void helper_msa_ffint_u_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
 #define MEMOP_IDX(DF)
 #endif
 
+#ifdef TARGET_WORDS_BIGENDIAN
+static inline uint64_t bswap16x4(uint64_t x)
+{
+    uint64_t m = 0x00ff00ff00ff00ffull;
+    return ((x & m) << 8) | ((x >> 8) & m);
+}
+
+static inline uint64_t bswap32x2(uint64_t x)
+{
+    return ror64(bswap64(x), 32);
+}
+#endif
+
 void helper_msa_ld_b(CPUMIPSState *env, uint32_t wd,
                      target_ulong addr)
 {
     wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
     uintptr_t ra = GETPC();
+    uint64_t d0, d1;
 
-#if !defined(HOST_WORDS_BIGENDIAN)
-    pwd->b[0]  = cpu_ldub_data_ra(env, addr + (0  << DF_BYTE), ra);
-    pwd->b[1]  = cpu_ldub_data_ra(env, addr + (1  << DF_BYTE), ra);
-    pwd->b[2]  = cpu_ldub_data_ra(env, addr + (2  << DF_BYTE), ra);
-    pwd->b[3]  = cpu_ldub_data_ra(env, addr + (3  << DF_BYTE), ra);
-    pwd->b[4]  = cpu_ldub_data_ra(env, addr + (4  << DF_BYTE), ra);
-    pwd->b[5]  = cpu_ldub_data_ra(env, addr + (5  << DF_BYTE), ra);
-    pwd->b[6]  = cpu_ldub_data_ra(env, addr + (6  << DF_BYTE), ra);
-    pwd->b[7]  = cpu_ldub_data_ra(env, addr + (7  << DF_BYTE), ra);
-    pwd->b[8]  = cpu_ldub_data_ra(env, addr + (8  << DF_BYTE), ra);
-    pwd->b[9]  = cpu_ldub_data_ra(env, addr + (9  << DF_BYTE), ra);
-    pwd->b[10] = cpu_ldub_data_ra(env, addr + (10 << DF_BYTE), ra);
-    pwd->b[11] = cpu_ldub_data_ra(env, addr + (11 << DF_BYTE), ra);
-    pwd->b[12] = cpu_ldub_data_ra(env, addr + (12 << DF_BYTE), ra);
-    pwd->b[13] = cpu_ldub_data_ra(env, addr + (13 << DF_BYTE), ra);
-    pwd->b[14] = cpu_ldub_data_ra(env, addr + (14 << DF_BYTE), ra);
-    pwd->b[15] = cpu_ldub_data_ra(env, addr + (15 << DF_BYTE), ra);
-#else
-    pwd->b[0]  = cpu_ldub_data_ra(env, addr + (7  << DF_BYTE), ra);
-    pwd->b[1]  = cpu_ldub_data_ra(env, addr + (6  << DF_BYTE), ra);
-    pwd->b[2]  = cpu_ldub_data_ra(env, addr + (5  << DF_BYTE), ra);
-    pwd->b[3]  = cpu_ldub_data_ra(env, addr + (4  << DF_BYTE), ra);
-    pwd->b[4]  = cpu_ldub_data_ra(env, addr + (3  << DF_BYTE), ra);
-    pwd->b[5]  = cpu_ldub_data_ra(env, addr + (2  << DF_BYTE), ra);
-    pwd->b[6]  = cpu_ldub_data_ra(env, addr + (1  << DF_BYTE), ra);
-    pwd->b[7]  = cpu_ldub_data_ra(env, addr + (0  << DF_BYTE), ra);
-    pwd->b[8]  = cpu_ldub_data_ra(env, addr + (15 << DF_BYTE), ra);
-    pwd->b[9]  = cpu_ldub_data_ra(env, addr + (14 << DF_BYTE), ra);
-    pwd->b[10] = cpu_ldub_data_ra(env, addr + (13 << DF_BYTE), ra);
-    pwd->b[11] = cpu_ldub_data_ra(env, addr + (12 << DF_BYTE), ra);
-    pwd->b[12] = cpu_ldub_data_ra(env, addr + (11 << DF_BYTE), ra);
-    pwd->b[13] = cpu_ldub_data_ra(env, addr + (10 << DF_BYTE), ra);
-    pwd->b[14] = cpu_ldub_data_ra(env, addr + (9 << DF_BYTE), ra);
-    pwd->b[15] = cpu_ldub_data_ra(env, addr + (8 << DF_BYTE), ra);
-#endif
+    /* Load 8 bytes at a time.  Vector element ordering makes this LE.  */
+    d0 = cpu_ldq_le_data_ra(env, addr + 0, ra);
+    d1 = cpu_ldq_le_data_ra(env, addr + 8, ra);
+    pwd->d[0] = d0;
+    pwd->d[1] = d1;
 }
 
 void helper_msa_ld_h(CPUMIPSState *env, uint32_t wd,
@@ -8266,26 +8250,20 @@  void helper_msa_ld_h(CPUMIPSState *env, uint32_t wd,
 {
     wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
     uintptr_t ra = GETPC();
+    uint64_t d0, d1;
 
-#if !defined(HOST_WORDS_BIGENDIAN)
-    pwd->h[0] = cpu_lduw_data_ra(env, addr + (0 << DF_HALF), ra);
-    pwd->h[1] = cpu_lduw_data_ra(env, addr + (1 << DF_HALF), ra);
-    pwd->h[2] = cpu_lduw_data_ra(env, addr + (2 << DF_HALF), ra);
-    pwd->h[3] = cpu_lduw_data_ra(env, addr + (3 << DF_HALF), ra);
-    pwd->h[4] = cpu_lduw_data_ra(env, addr + (4 << DF_HALF), ra);
-    pwd->h[5] = cpu_lduw_data_ra(env, addr + (5 << DF_HALF), ra);
-    pwd->h[6] = cpu_lduw_data_ra(env, addr + (6 << DF_HALF), ra);
-    pwd->h[7] = cpu_lduw_data_ra(env, addr + (7 << DF_HALF), ra);
-#else
-    pwd->h[0] = cpu_lduw_data_ra(env, addr + (3 << DF_HALF), ra);
-    pwd->h[1] = cpu_lduw_data_ra(env, addr + (2 << DF_HALF), ra);
-    pwd->h[2] = cpu_lduw_data_ra(env, addr + (1 << DF_HALF), ra);
-    pwd->h[3] = cpu_lduw_data_ra(env, addr + (0 << DF_HALF), ra);
-    pwd->h[4] = cpu_lduw_data_ra(env, addr + (7 << DF_HALF), ra);
-    pwd->h[5] = cpu_lduw_data_ra(env, addr + (6 << DF_HALF), ra);
-    pwd->h[6] = cpu_lduw_data_ra(env, addr + (5 << DF_HALF), ra);
-    pwd->h[7] = cpu_lduw_data_ra(env, addr + (4 << DF_HALF), ra);
+    /*
+     * Load 8 bytes at a time.  Use little-endian load, then for
+     * big-endian target, we must then swap the four halfwords.
+     */
+    d0 = cpu_ldq_le_data_ra(env, addr + 0, ra);
+    d1 = cpu_ldq_le_data_ra(env, addr + 8, ra);
+#ifdef TARGET_WORDS_BIGENDIAN
+    d0 = bswap16x4(d0);
+    d1 = bswap16x4(d1);
 #endif
+    pwd->d[0] = d0;
+    pwd->d[1] = d1;
 }
 
 void helper_msa_ld_w(CPUMIPSState *env, uint32_t wd,
@@ -8293,18 +8271,20 @@  void helper_msa_ld_w(CPUMIPSState *env, uint32_t wd,
 {
     wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
     uintptr_t ra = GETPC();
+    uint64_t d0, d1;
 
-#if !defined(HOST_WORDS_BIGENDIAN)
-    pwd->w[0] = cpu_ldl_data_ra(env, addr + (0 << DF_WORD), ra);
-    pwd->w[1] = cpu_ldl_data_ra(env, addr + (1 << DF_WORD), ra);
-    pwd->w[2] = cpu_ldl_data_ra(env, addr + (2 << DF_WORD), ra);
-    pwd->w[3] = cpu_ldl_data_ra(env, addr + (3 << DF_WORD), ra);
-#else
-    pwd->w[0] = cpu_ldl_data_ra(env, addr + (1 << DF_WORD), ra);
-    pwd->w[1] = cpu_ldl_data_ra(env, addr + (0 << DF_WORD), ra);
-    pwd->w[2] = cpu_ldl_data_ra(env, addr + (3 << DF_WORD), ra);
-    pwd->w[3] = cpu_ldl_data_ra(env, addr + (2 << DF_WORD), ra);
+    /*
+     * Load 8 bytes at a time.  Use little-endian load, then for
+     * big-endian target, we must then bswap the two words.
+     */
+    d0 = cpu_ldq_le_data_ra(env, addr + 0, ra);
+    d1 = cpu_ldq_le_data_ra(env, addr + 8, ra);
+#ifdef TARGET_WORDS_BIGENDIAN
+    d0 = bswap32x2(d0);
+    d1 = bswap32x2(d1);
 #endif
+    pwd->d[0] = d0;
+    pwd->d[1] = d1;
 }
 
 void helper_msa_ld_d(CPUMIPSState *env, uint32_t wd,
@@ -8312,9 +8292,12 @@  void helper_msa_ld_d(CPUMIPSState *env, uint32_t wd,
 {
     wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
     uintptr_t ra = GETPC();
+    uint64_t d0, d1;
 
-    pwd->d[0] = cpu_ldq_data_ra(env, addr + (0 << DF_DOUBLE), ra);
-    pwd->d[1] = cpu_ldq_data_ra(env, addr + (1 << DF_DOUBLE), ra);
+    d0 = cpu_ldq_data_ra(env, addr + 0, ra);
+    d1 = cpu_ldq_data_ra(env, addr + 8, ra);
+    pwd->d[0] = d0;
+    pwd->d[1] = d1;
 }
 
 #define MSA_PAGESPAN(x) \
@@ -8344,41 +8327,9 @@  void helper_msa_st_b(CPUMIPSState *env, uint32_t wd,
 
     ensure_writable_pages(env, addr, mmu_idx, ra);
 
-#if !defined(HOST_WORDS_BIGENDIAN)
-    cpu_stb_data_ra(env, addr + (0  << DF_BYTE), pwd->b[0], ra);
-    cpu_stb_data_ra(env, addr + (1  << DF_BYTE), pwd->b[1], ra);
-    cpu_stb_data_ra(env, addr + (2  << DF_BYTE), pwd->b[2], ra);
-    cpu_stb_data_ra(env, addr + (3  << DF_BYTE), pwd->b[3], ra);
-    cpu_stb_data_ra(env, addr + (4  << DF_BYTE), pwd->b[4], ra);
-    cpu_stb_data_ra(env, addr + (5  << DF_BYTE), pwd->b[5], ra);
-    cpu_stb_data_ra(env, addr + (6  << DF_BYTE), pwd->b[6], ra);
-    cpu_stb_data_ra(env, addr + (7  << DF_BYTE), pwd->b[7], ra);
-    cpu_stb_data_ra(env, addr + (8  << DF_BYTE), pwd->b[8], ra);
-    cpu_stb_data_ra(env, addr + (9  << DF_BYTE), pwd->b[9], ra);
-    cpu_stb_data_ra(env, addr + (10 << DF_BYTE), pwd->b[10], ra);
-    cpu_stb_data_ra(env, addr + (11 << DF_BYTE), pwd->b[11], ra);
-    cpu_stb_data_ra(env, addr + (12 << DF_BYTE), pwd->b[12], ra);
-    cpu_stb_data_ra(env, addr + (13 << DF_BYTE), pwd->b[13], ra);
-    cpu_stb_data_ra(env, addr + (14 << DF_BYTE), pwd->b[14], ra);
-    cpu_stb_data_ra(env, addr + (15 << DF_BYTE), pwd->b[15], ra);
-#else
-    cpu_stb_data_ra(env, addr + (7  << DF_BYTE), pwd->b[0], ra);
-    cpu_stb_data_ra(env, addr + (6  << DF_BYTE), pwd->b[1], ra);
-    cpu_stb_data_ra(env, addr + (5  << DF_BYTE), pwd->b[2], ra);
-    cpu_stb_data_ra(env, addr + (4  << DF_BYTE), pwd->b[3], ra);
-    cpu_stb_data_ra(env, addr + (3  << DF_BYTE), pwd->b[4], ra);
-    cpu_stb_data_ra(env, addr + (2  << DF_BYTE), pwd->b[5], ra);
-    cpu_stb_data_ra(env, addr + (1  << DF_BYTE), pwd->b[6], ra);
-    cpu_stb_data_ra(env, addr + (0  << DF_BYTE), pwd->b[7], ra);
-    cpu_stb_data_ra(env, addr + (15 << DF_BYTE), pwd->b[8], ra);
-    cpu_stb_data_ra(env, addr + (14 << DF_BYTE), pwd->b[9], ra);
-    cpu_stb_data_ra(env, addr + (13 << DF_BYTE), pwd->b[10], ra);
-    cpu_stb_data_ra(env, addr + (12 << DF_BYTE), pwd->b[11], ra);
-    cpu_stb_data_ra(env, addr + (11 << DF_BYTE), pwd->b[12], ra);
-    cpu_stb_data_ra(env, addr + (10 << DF_BYTE), pwd->b[13], ra);
-    cpu_stb_data_ra(env, addr + (9  << DF_BYTE), pwd->b[14], ra);
-    cpu_stb_data_ra(env, addr + (8  << DF_BYTE), pwd->b[15], ra);
-#endif
+    /* Store 8 bytes at a time.  Vector element ordering makes this LE.  */
+    cpu_stq_le_data_ra(env, addr + 0, pwd->d[0], ra);
+    cpu_stq_le_data_ra(env, addr + 0, pwd->d[1], ra);
 }
 
 void helper_msa_st_h(CPUMIPSState *env, uint32_t wd,
@@ -8387,28 +8338,19 @@  void helper_msa_st_h(CPUMIPSState *env, uint32_t wd,
     wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
     int mmu_idx = cpu_mmu_index(env, false);
     uintptr_t ra = GETPC();
+    uint64_t d0, d1;
 
     ensure_writable_pages(env, addr, mmu_idx, ra);
 
-#if !defined(HOST_WORDS_BIGENDIAN)
-    cpu_stw_data_ra(env, addr + (0 << DF_HALF), pwd->h[0], ra);
-    cpu_stw_data_ra(env, addr + (1 << DF_HALF), pwd->h[1], ra);
-    cpu_stw_data_ra(env, addr + (2 << DF_HALF), pwd->h[2], ra);
-    cpu_stw_data_ra(env, addr + (3 << DF_HALF), pwd->h[3], ra);
-    cpu_stw_data_ra(env, addr + (4 << DF_HALF), pwd->h[4], ra);
-    cpu_stw_data_ra(env, addr + (5 << DF_HALF), pwd->h[5], ra);
-    cpu_stw_data_ra(env, addr + (6 << DF_HALF), pwd->h[6], ra);
-    cpu_stw_data_ra(env, addr + (7 << DF_HALF), pwd->h[7], ra);
-#else
-    cpu_stw_data_ra(env, addr + (3 << DF_HALF), pwd->h[0], ra);
-    cpu_stw_data_ra(env, addr + (2 << DF_HALF), pwd->h[1], ra);
-    cpu_stw_data_ra(env, addr + (1 << DF_HALF), pwd->h[2], ra);
-    cpu_stw_data_ra(env, addr + (0 << DF_HALF), pwd->h[3], ra);
-    cpu_stw_data_ra(env, addr + (7 << DF_HALF), pwd->h[4], ra);
-    cpu_stw_data_ra(env, addr + (6 << DF_HALF), pwd->h[5], ra);
-    cpu_stw_data_ra(env, addr + (5 << DF_HALF), pwd->h[6], ra);
-    cpu_stw_data_ra(env, addr + (4 << DF_HALF), pwd->h[7], ra);
+    /* Store 8 bytes at a time.  See helper_msa_ld_h. */
+    d0 = pwd->d[0];
+    d1 = pwd->d[1];
+#ifdef TARGET_WORDS_BIGENDIAN
+    d0 = bswap16x4(d0);
+    d1 = bswap16x4(d1);
 #endif
+    cpu_stq_le_data_ra(env, addr + 0, d0, ra);
+    cpu_stq_le_data_ra(env, addr + 8, d1, ra);
 }
 
 void helper_msa_st_w(CPUMIPSState *env, uint32_t wd,
@@ -8417,20 +8359,19 @@  void helper_msa_st_w(CPUMIPSState *env, uint32_t wd,
     wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
     int mmu_idx = cpu_mmu_index(env, false);
     uintptr_t ra = GETPC();
+    uint64_t d0, d1;
 
     ensure_writable_pages(env, addr, mmu_idx, ra);
 
-#if !defined(HOST_WORDS_BIGENDIAN)
-    cpu_stl_data_ra(env, addr + (0 << DF_WORD), pwd->w[0], ra);
-    cpu_stl_data_ra(env, addr + (1 << DF_WORD), pwd->w[1], ra);
-    cpu_stl_data_ra(env, addr + (2 << DF_WORD), pwd->w[2], ra);
-    cpu_stl_data_ra(env, addr + (3 << DF_WORD), pwd->w[3], ra);
-#else
-    cpu_stl_data_ra(env, addr + (1 << DF_WORD), pwd->w[0], ra);
-    cpu_stl_data_ra(env, addr + (0 << DF_WORD), pwd->w[1], ra);
-    cpu_stl_data_ra(env, addr + (3 << DF_WORD), pwd->w[2], ra);
-    cpu_stl_data_ra(env, addr + (2 << DF_WORD), pwd->w[3], ra);
+    /* Store 8 bytes at a time.  See helper_msa_ld_w. */
+    d0 = pwd->d[0];
+    d1 = pwd->d[1];
+#ifdef TARGET_WORDS_BIGENDIAN
+    d0 = bswap32x2(d0);
+    d1 = bswap32x2(d1);
 #endif
+    cpu_stq_le_data_ra(env, addr + 0, d0, ra);
+    cpu_stq_le_data_ra(env, addr + 8, d1, ra);
 }
 
 void helper_msa_st_d(CPUMIPSState *env, uint32_t wd,
@@ -8442,6 +8383,6 @@  void helper_msa_st_d(CPUMIPSState *env, uint32_t wd,
 
     ensure_writable_pages(env, addr, mmu_idx, GETPC());
 
-    cpu_stq_data_ra(env, addr + (0 << DF_DOUBLE), pwd->d[0], ra);
-    cpu_stq_data_ra(env, addr + (1 << DF_DOUBLE), pwd->d[1], ra);
+    cpu_stq_data_ra(env, addr + 0, pwd->d[0], ra);
+    cpu_stq_data_ra(env, addr + 8, pwd->d[1], ra);
 }