diff mbox

[RESEND,v2,15/17] target-ppc: add lxvb16x and lxvh8x

Message ID 1473662506-27441-16-git-send-email-nikunj@linux.vnet.ibm.com
State New
Headers show

Commit Message

Nikunj A Dadhania Sept. 12, 2016, 6:41 a.m. UTC
lxvb16x: Load VSX Vector Byte*16
lxvh8x:  Load VSX Vector Halfword*8

Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
---
 target-ppc/helper.h                 |  1 +
 target-ppc/mem_helper.c             |  6 ++++
 target-ppc/translate/vsx-impl.inc.c | 57 +++++++++++++++++++++++++++++++++++++
 target-ppc/translate/vsx-ops.inc.c  |  2 ++
 4 files changed, 66 insertions(+)

Comments

David Gibson Sept. 15, 2016, 1:41 a.m. UTC | #1
On Mon, Sep 12, 2016 at 12:11:44PM +0530, Nikunj A Dadhania wrote:
> lxvb16x: Load VSX Vector Byte*16
> lxvh8x:  Load VSX Vector Halfword*8
> 
> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> ---
>  target-ppc/helper.h                 |  1 +
>  target-ppc/mem_helper.c             |  6 ++++
>  target-ppc/translate/vsx-impl.inc.c | 57 +++++++++++++++++++++++++++++++++++++
>  target-ppc/translate/vsx-ops.inc.c  |  2 ++
>  4 files changed, 66 insertions(+)
> 
> diff --git a/target-ppc/helper.h b/target-ppc/helper.h
> index 1bbeac4..6de0db7 100644
> --- a/target-ppc/helper.h
> +++ b/target-ppc/helper.h
> @@ -298,6 +298,7 @@ DEF_HELPER_3(lvebx, void, env, avr, tl)
>  DEF_HELPER_3(lvehx, void, env, avr, tl)
>  DEF_HELPER_3(lvewx, void, env, avr, tl)
>  DEF_HELPER_1(bswap32x2, i64, i64)
> +DEF_HELPER_1(bswap16x4, i64, i64)
>  DEF_HELPER_3(stvebx, void, env, avr, tl)
>  DEF_HELPER_3(stvehx, void, env, avr, tl)
>  DEF_HELPER_3(stvewx, void, env, avr, tl)
> diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c
> index a56051a..608803f 100644
> --- a/target-ppc/mem_helper.c
> +++ b/target-ppc/mem_helper.c
> @@ -290,6 +290,12 @@ uint64_t helper_bswap32x2(uint64_t x)
>      return deposit64((x >> 32), 32, 32, (x));
>  }
>  
> +uint64_t helper_bswap16x4(uint64_t x)
> +{
> +    uint64_t m = 0x00ff00ff00ff00ffull;
> +    return ((x & m) << 8) | ((x >> 8) & m);
> +}

This doesn't seem to match the bswap32x2 function above.  bswap32x2
just swaps the two 32-bit words in the 64-bit word.  This one swaps
the bytes in each individual 16 bit work in the 64-bit word.

I suspect the bswap32x2 is wrong, which would explain why the previous
patch didn't seem to make sense.

> +
>  #undef HI_IDX
>  #undef LO_IDX
>  
> diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c
> index e3374df..caa6660 100644
> --- a/target-ppc/translate/vsx-impl.inc.c
> +++ b/target-ppc/translate/vsx-impl.inc.c
> @@ -108,6 +108,63 @@ static void gen_lxvw4x(DisasContext *ctx)
>      tcg_temp_free(EA);
>  }
>  
> +static void gen_lxvb16x(DisasContext *ctx)
> +{
> +    TCGv EA;
> +    TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode));
> +    TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode));
> +
> +    if (unlikely(!ctx->vsx_enabled)) {
> +        gen_exception(ctx, POWERPC_EXCP_VSXU);
> +        return;
> +    }
> +    gen_set_access_type(ctx, ACCESS_INT);
> +    EA = tcg_temp_new();
> +    gen_addr_reg_index(ctx, EA);
> +    if (ctx->le_mode) {
> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
> +        tcg_gen_addi_tl(EA, EA, 8);
> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
> +    } else {
> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ);
> +        gen_helper_bswap32x2(xth, xth);

I really don't understand how a bswap32x2 helps here, either as it's
defined now, or as I suspect it should be defined by analogy with
bswap16x4.

> +        tcg_gen_addi_tl(EA, EA, 8);
> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ);
> +        gen_helper_bswap32x2(xtl, xtl);

Also.. if I'm understanding the ISA correctly, this instruction loads
subsequent higher-address bytes into subsequent (i.e. less signficant,
since IBM uses BE bit/byte numbering) bytes in the vector.  Doesn't
that mean you want a BE load in all cases, not just the LE guest case?

> +    }
> +    tcg_temp_free(EA);
> +}
> +
> +static void gen_lxvh8x(DisasContext *ctx)
> +{
> +    TCGv EA;
> +    TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode));
> +    TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode));
> +
> +    if (unlikely(!ctx->vsx_enabled)) {
> +        gen_exception(ctx, POWERPC_EXCP_VSXU);
> +        return;
> +    }
> +    gen_set_access_type(ctx, ACCESS_INT);
> +    EA = tcg_temp_new();
> +    gen_addr_reg_index(ctx, EA);
> +
> +    if (ctx->le_mode) {
> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
> +        gen_helper_bswap16x4(xth, xth);
> +        tcg_gen_addi_tl(EA, EA, 8);
> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
> +        gen_helper_bswap16x4(xtl, xtl);
> +    } else {
> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ);
> +        gen_helper_bswap32x2(xth, xth);
> +        tcg_gen_addi_tl(EA, EA, 8);
> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ);
> +        gen_helper_bswap32x2(xtl, xtl);

Again, I think you want a BE load in both cases, and the bswap32x2
makes no sense to me.

> +    }
> +    tcg_temp_free(EA);
> +}
> +
>  #define VSX_STORE_SCALAR(name, operation)                     \
>  static void gen_##name(DisasContext *ctx)                     \
>  {                                                             \
> diff --git a/target-ppc/translate/vsx-ops.inc.c b/target-ppc/translate/vsx-ops.inc.c
> index 414b73b..598b349 100644
> --- a/target-ppc/translate/vsx-ops.inc.c
> +++ b/target-ppc/translate/vsx-ops.inc.c
> @@ -7,6 +7,8 @@ GEN_HANDLER_E(lxsspx, 0x1F, 0x0C, 0x10, 0, PPC_NONE, PPC2_VSX207),
>  GEN_HANDLER_E(lxvd2x, 0x1F, 0x0C, 0x1A, 0, PPC_NONE, PPC2_VSX),
>  GEN_HANDLER_E(lxvdsx, 0x1F, 0x0C, 0x0A, 0, PPC_NONE, PPC2_VSX),
>  GEN_HANDLER_E(lxvw4x, 0x1F, 0x0C, 0x18, 0, PPC_NONE, PPC2_VSX),
> +GEN_HANDLER_E(lxvh8x, 0x1F, 0x0C, 0x19, 0, PPC_NONE,  PPC2_ISA300),
> +GEN_HANDLER_E(lxvb16x, 0x1F, 0x0C, 0x1B, 0, PPC_NONE, PPC2_ISA300),
>  
>  GEN_HANDLER_E(stxsdx, 0x1F, 0xC, 0x16, 0, PPC_NONE, PPC2_VSX),
>  GEN_HANDLER_E(stxsibx, 0x1F, 0xD, 0x1C, 0, PPC_NONE, PPC2_ISA300),
Nikunj A Dadhania Sept. 16, 2016, 8:26 a.m. UTC | #2
David Gibson <david@gibson.dropbear.id.au> writes:

> [ Unknown signature status ]
> On Mon, Sep 12, 2016 at 12:11:44PM +0530, Nikunj A Dadhania wrote:
>> lxvb16x: Load VSX Vector Byte*16
>> lxvh8x:  Load VSX Vector Halfword*8
>> 
>> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
>> ---
>>  target-ppc/helper.h                 |  1 +
>>  target-ppc/mem_helper.c             |  6 ++++
>>  target-ppc/translate/vsx-impl.inc.c | 57 +++++++++++++++++++++++++++++++++++++
>>  target-ppc/translate/vsx-ops.inc.c  |  2 ++
>>  4 files changed, 66 insertions(+)
>> 
>> diff --git a/target-ppc/helper.h b/target-ppc/helper.h
>> index 1bbeac4..6de0db7 100644
>> --- a/target-ppc/helper.h
>> +++ b/target-ppc/helper.h
>> @@ -298,6 +298,7 @@ DEF_HELPER_3(lvebx, void, env, avr, tl)
>>  DEF_HELPER_3(lvehx, void, env, avr, tl)
>>  DEF_HELPER_3(lvewx, void, env, avr, tl)
>>  DEF_HELPER_1(bswap32x2, i64, i64)
>> +DEF_HELPER_1(bswap16x4, i64, i64)
>>  DEF_HELPER_3(stvebx, void, env, avr, tl)
>>  DEF_HELPER_3(stvehx, void, env, avr, tl)
>>  DEF_HELPER_3(stvewx, void, env, avr, tl)
>> diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c
>> index a56051a..608803f 100644
>> --- a/target-ppc/mem_helper.c
>> +++ b/target-ppc/mem_helper.c
>> @@ -290,6 +290,12 @@ uint64_t helper_bswap32x2(uint64_t x)
>>      return deposit64((x >> 32), 32, 32, (x));
>>  }
>>  
>> +uint64_t helper_bswap16x4(uint64_t x)
>> +{
>> +    uint64_t m = 0x00ff00ff00ff00ffull;
>> +    return ((x & m) << 8) | ((x >> 8) & m);
>> +}
>
> This doesn't seem to match the bswap32x2 function above.  bswap32x2
> just swaps the two 32-bit words in the 64-bit word.  This one swaps
> the bytes in each individual 16 bit work in the 64-bit word.
>
> I suspect the bswap32x2 is wrong, which would explain why the previous
> patch didn't seem to make sense.

The confusion is because of the name(bswap32x2), I will rename it as
deposit32x2. And let bswap16x4 as is, as that is a required operation in
certain cases to get the right order expected by the instruction.

>
>> +
>>  #undef HI_IDX
>>  #undef LO_IDX
>>  
>> diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c
>> index e3374df..caa6660 100644
>> --- a/target-ppc/translate/vsx-impl.inc.c
>> +++ b/target-ppc/translate/vsx-impl.inc.c
>> @@ -108,6 +108,63 @@ static void gen_lxvw4x(DisasContext *ctx)
>>      tcg_temp_free(EA);
>>  }
>>  
>> +static void gen_lxvb16x(DisasContext *ctx)
>> +{
>> +    TCGv EA;
>> +    TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode));
>> +    TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode));
>> +
>> +    if (unlikely(!ctx->vsx_enabled)) {
>> +        gen_exception(ctx, POWERPC_EXCP_VSXU);
>> +        return;
>> +    }
>> +    gen_set_access_type(ctx, ACCESS_INT);
>> +    EA = tcg_temp_new();
>> +    gen_addr_reg_index(ctx, EA);
>> +    if (ctx->le_mode) {
>> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
>> +        tcg_gen_addi_tl(EA, EA, 8);
>> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
>> +    } else {
>> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ);
>> +        gen_helper_bswap32x2(xth, xth);
>
> I really don't understand how a bswap32x2 helps here, either as it's
> defined now, or as I suspect it should be defined by analogy with
> bswap16x4.
>
>> +        tcg_gen_addi_tl(EA, EA, 8);
>> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ);
>> +        gen_helper_bswap32x2(xtl, xtl);
>
> Also.. if I'm understanding the ISA correctly, this instruction loads
> subsequent higher-address bytes into subsequent (i.e. less signficant,
> since IBM uses BE bit/byte numbering) bytes in the vector.  Doesn't
> that mean you want a BE load in all cases, not just the LE guest case?
>
>> +    }
>> +    tcg_temp_free(EA);
>> +}
>> +
>> +static void gen_lxvh8x(DisasContext *ctx)
>> +{
>> +    TCGv EA;
>> +    TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode));
>> +    TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode));
>> +
>> +    if (unlikely(!ctx->vsx_enabled)) {
>> +        gen_exception(ctx, POWERPC_EXCP_VSXU);
>> +        return;
>> +    }
>> +    gen_set_access_type(ctx, ACCESS_INT);
>> +    EA = tcg_temp_new();
>> +    gen_addr_reg_index(ctx, EA);
>> +
>> +    if (ctx->le_mode) {
>> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
>> +        gen_helper_bswap16x4(xth, xth);
>> +        tcg_gen_addi_tl(EA, EA, 8);
>> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
>> +        gen_helper_bswap16x4(xtl, xtl);
>> +    } else {
>> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ);
>> +        gen_helper_bswap32x2(xth, xth);
>> +        tcg_gen_addi_tl(EA, EA, 8);
>> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ);
>> +        gen_helper_bswap32x2(xtl, xtl);
>
> Again, I think you want a BE load in both cases, and the bswap32x2
> makes no sense to me.

BTW, MO_BEQ(big-endian load) and MO_LEQ(little-endian) also does magic
during loads. Now when I have 8bytes loaded, i do the manipulation
within to get the right order.

For an input of:
uint16_t rb16[8] = {0x0001, 0x1011, 0x2021, 0x3031, 0x4041, 0x5051, 0x6061, 0x7071};

For LE I should get this result:
7071 6061 5051 4041 3031 2021 1011 0001

For BE it should be:
0001 1011 2021 3031 4041 5051 6061 7071

Thats what the tcg operations achieves. I guess renaming bswap32x2 will
clear the confusion.

Regards
Nikunj
diff mbox

Patch

diff --git a/target-ppc/helper.h b/target-ppc/helper.h
index 1bbeac4..6de0db7 100644
--- a/target-ppc/helper.h
+++ b/target-ppc/helper.h
@@ -298,6 +298,7 @@  DEF_HELPER_3(lvebx, void, env, avr, tl)
 DEF_HELPER_3(lvehx, void, env, avr, tl)
 DEF_HELPER_3(lvewx, void, env, avr, tl)
 DEF_HELPER_1(bswap32x2, i64, i64)
+DEF_HELPER_1(bswap16x4, i64, i64)
 DEF_HELPER_3(stvebx, void, env, avr, tl)
 DEF_HELPER_3(stvehx, void, env, avr, tl)
 DEF_HELPER_3(stvewx, void, env, avr, tl)
diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c
index a56051a..608803f 100644
--- a/target-ppc/mem_helper.c
+++ b/target-ppc/mem_helper.c
@@ -290,6 +290,12 @@  uint64_t helper_bswap32x2(uint64_t x)
     return deposit64((x >> 32), 32, 32, (x));
 }
 
+uint64_t helper_bswap16x4(uint64_t x)
+{
+    uint64_t m = 0x00ff00ff00ff00ffull;
+    return ((x & m) << 8) | ((x >> 8) & m);
+}
+
 #undef HI_IDX
 #undef LO_IDX
 
diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c
index e3374df..caa6660 100644
--- a/target-ppc/translate/vsx-impl.inc.c
+++ b/target-ppc/translate/vsx-impl.inc.c
@@ -108,6 +108,63 @@  static void gen_lxvw4x(DisasContext *ctx)
     tcg_temp_free(EA);
 }
 
+static void gen_lxvb16x(DisasContext *ctx)
+{
+    TCGv EA;
+    TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode));
+    TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode));
+
+    if (unlikely(!ctx->vsx_enabled)) {
+        gen_exception(ctx, POWERPC_EXCP_VSXU);
+        return;
+    }
+    gen_set_access_type(ctx, ACCESS_INT);
+    EA = tcg_temp_new();
+    gen_addr_reg_index(ctx, EA);
+    if (ctx->le_mode) {
+        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
+        tcg_gen_addi_tl(EA, EA, 8);
+        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
+    } else {
+        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ);
+        gen_helper_bswap32x2(xth, xth);
+        tcg_gen_addi_tl(EA, EA, 8);
+        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ);
+        gen_helper_bswap32x2(xtl, xtl);
+    }
+    tcg_temp_free(EA);
+}
+
+static void gen_lxvh8x(DisasContext *ctx)
+{
+    TCGv EA;
+    TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode));
+    TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode));
+
+    if (unlikely(!ctx->vsx_enabled)) {
+        gen_exception(ctx, POWERPC_EXCP_VSXU);
+        return;
+    }
+    gen_set_access_type(ctx, ACCESS_INT);
+    EA = tcg_temp_new();
+    gen_addr_reg_index(ctx, EA);
+
+    if (ctx->le_mode) {
+        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
+        gen_helper_bswap16x4(xth, xth);
+        tcg_gen_addi_tl(EA, EA, 8);
+        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
+        gen_helper_bswap16x4(xtl, xtl);
+    } else {
+        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ);
+        gen_helper_bswap32x2(xth, xth);
+        tcg_gen_addi_tl(EA, EA, 8);
+        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ);
+        gen_helper_bswap32x2(xtl, xtl);
+    }
+    tcg_temp_free(EA);
+}
+
 #define VSX_STORE_SCALAR(name, operation)                     \
 static void gen_##name(DisasContext *ctx)                     \
 {                                                             \
diff --git a/target-ppc/translate/vsx-ops.inc.c b/target-ppc/translate/vsx-ops.inc.c
index 414b73b..598b349 100644
--- a/target-ppc/translate/vsx-ops.inc.c
+++ b/target-ppc/translate/vsx-ops.inc.c
@@ -7,6 +7,8 @@  GEN_HANDLER_E(lxsspx, 0x1F, 0x0C, 0x10, 0, PPC_NONE, PPC2_VSX207),
 GEN_HANDLER_E(lxvd2x, 0x1F, 0x0C, 0x1A, 0, PPC_NONE, PPC2_VSX),
 GEN_HANDLER_E(lxvdsx, 0x1F, 0x0C, 0x0A, 0, PPC_NONE, PPC2_VSX),
 GEN_HANDLER_E(lxvw4x, 0x1F, 0x0C, 0x18, 0, PPC_NONE, PPC2_VSX),
+GEN_HANDLER_E(lxvh8x, 0x1F, 0x0C, 0x19, 0, PPC_NONE,  PPC2_ISA300),
+GEN_HANDLER_E(lxvb16x, 0x1F, 0x0C, 0x1B, 0, PPC_NONE, PPC2_ISA300),
 
 GEN_HANDLER_E(stxsdx, 0x1F, 0xC, 0x16, 0, PPC_NONE, PPC2_VSX),
 GEN_HANDLER_E(stxsibx, 0x1F, 0xD, 0x1C, 0, PPC_NONE, PPC2_ISA300),