diff mbox

[3/4] TCG x86: add SSE4a instruction support

Message ID 1253313049-25874-4-git-send-email-andre.przywara@amd.com
State Superseded
Headers show

Commit Message

Andre Przywara Sept. 18, 2009, 10:30 p.m. UTC
This adds support for the SSE4a instructions of AMD's Phenom/Barcelona
processors. These include insertq and extrq, which shift and mask bit
fields in XMM registers and come in two versions (shift/length values
given as immediates or stored in another XMM register).
Additionally, it implements movntss and movntsd, which are scalar
non-temporal stores (avoiding cache thrashing). These are implemented
as normal stores, though.
SSE4a is guarded by the SSE4A CPUID bit (Fn8000_0001:ECX[6]).

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
 target-i386/ops_sse.h        |   44 ++++++++++++++++++++++++++++++++++++++++++
 target-i386/ops_sse_header.h |    4 +++
 target-i386/translate.c      |   39 +++++++++++++++++++++++++++++++++++-
 3 files changed, 85 insertions(+), 2 deletions(-)

Comments

Aurelien Jarno Oct. 4, 2009, 12:10 p.m. UTC | #1
On Sat, Sep 19, 2009 at 12:30:48AM +0200, Andre Przywara wrote:
> This adds support for the SSE4a instructions of AMD's Phenom/Barcelona
> processors. These include insertq and extrq, which shift and mask bit
> fields in XMM registers and come in two versions (shift/length values
> given as immediates or stored in another XMM register).
> Additionally, it implements movntss and movntsd, which are scalar
> non-temporal stores (avoiding cache thrashing). These are implemented
> as normal stores, though.
> SSE4a is guarded by the SSE4A CPUID bit (Fn8000_0001:ECX[6]).

Thanks, applied.

> Signed-off-by: Andre Przywara <andre.przywara@amd.com>
> ---
>  target-i386/ops_sse.h        |   44 ++++++++++++++++++++++++++++++++++++++++++
>  target-i386/ops_sse_header.h |    4 +++
>  target-i386/translate.c      |   39 +++++++++++++++++++++++++++++++++++-
>  3 files changed, 85 insertions(+), 2 deletions(-)
> 
> diff --git a/target-i386/ops_sse.h b/target-i386/ops_sse.h
> index 709732a..3232abd 100644
> --- a/target-i386/ops_sse.h
> +++ b/target-i386/ops_sse.h
> @@ -802,6 +802,50 @@ void helper_rcpss(XMMReg *d, XMMReg *s)
>      d->XMM_S(0) = approx_rcp(s->XMM_S(0));
>  }
>  
> +/* EXTRQ core: return the len-bit field of src that starts at bit shift;
> + * len == 0 means a full 64-bit mask.  shift and len must be in 0..63. */
> +static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
> +{
> +    uint64_t mask = len ? (1ULL << len) - 1 : ~0ULL;
> +    return (src >> shift) & mask;
> +}
> +
> +/* EXTRQ, register form: s byte 0 holds the length, s byte 1 the index. */
> +void helper_extrq_r(XMMReg *d, XMMReg *s)
> +{
> +    /* Only the low 6 bits count; a shift count >= 64 is undefined in C. */
> +    d->XMM_Q(0) = helper_extrq(d->XMM_Q(0), s->XMM_B(1) & 63,
> +                               s->XMM_B(0) & 63);
> +}
> +
> +/* EXTRQ, immediate form: index and length arrive as plain integers. */
> +void helper_extrq_i(XMMReg *d, int index, int length)
> +{
> +    d->XMM_Q(0) = helper_extrq(d->XMM_Q(0), index, length);
> +}
> +
> +/* INSERTQ core: copy the low len bits of src into dest at bit shift and
> + * keep every other bit of dest; len == 0 means a full 64-bit mask. */
> +static inline uint64_t helper_insertq(uint64_t dest, uint64_t src,
> +                                      int shift, int len)
> +{
> +    uint64_t mask = len ? (1ULL << len) - 1 : ~0ULL;
> +    return (dest & ~(mask << shift)) | ((src & mask) << shift);
> +}
> +
> +/* INSERTQ, register form: s byte 8 holds the length, s byte 9 the index. */
> +void helper_insertq_r(XMMReg *d, XMMReg *s)
> +{
> +    d->XMM_Q(0) = helper_insertq(d->XMM_Q(0), s->XMM_Q(0),
> +                                 s->XMM_B(9) & 63, s->XMM_B(8) & 63);
> +}
> +
> +/* INSERTQ, immediate form: only d is passed, so the field comes from d. */
> +void helper_insertq_i(XMMReg *d, int index, int length)
> +{
> +    d->XMM_Q(0) = helper_insertq(d->XMM_Q(0), d->XMM_Q(0), index, length);
> +}
> +
>  void helper_haddps(XMMReg *d, XMMReg *s)
>  {
>      XMMReg r;
> diff --git a/target-i386/ops_sse_header.h b/target-i386/ops_sse_header.h
> index 53add99..a0a6361 100644
> --- a/target-i386/ops_sse_header.h
> +++ b/target-i386/ops_sse_header.h
> @@ -187,6 +187,10 @@ DEF_HELPER_2(rsqrtps, void, XMMReg, XMMReg)
>  DEF_HELPER_2(rsqrtss, void, XMMReg, XMMReg)
>  DEF_HELPER_2(rcpps, void, XMMReg, XMMReg)
>  DEF_HELPER_2(rcpss, void, XMMReg, XMMReg)
> +DEF_HELPER_2(extrq_r, void, XMMReg, XMMReg)
> +DEF_HELPER_3(extrq_i, void, XMMReg, int, int)
> +DEF_HELPER_2(insertq_r, void, XMMReg, XMMReg)
> +DEF_HELPER_3(insertq_i, void, XMMReg, int, int)
>  DEF_HELPER_2(haddps, void, XMMReg, XMMReg)
>  DEF_HELPER_2(haddpd, void, XMMReg, XMMReg)
>  DEF_HELPER_2(hsubps, void, XMMReg, XMMReg)
> diff --git a/target-i386/translate.c b/target-i386/translate.c
> index 134c870..5cbcf07 100644
> --- a/target-i386/translate.c
> +++ b/target-i386/translate.c
> @@ -2822,7 +2822,7 @@ static void *sse_op_table1[256][4] = {
>      [0x28] = { SSE_SPECIAL, SSE_SPECIAL },  /* movaps, movapd */
>      [0x29] = { SSE_SPECIAL, SSE_SPECIAL },  /* movaps, movapd */
>      [0x2a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */
> -    [0x2b] = { SSE_SPECIAL, SSE_SPECIAL },  /* movntps, movntpd */
> +    [0x2b] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movntps, movntpd, movntss, movntsd */
>      [0x2c] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si */
>      [0x2d] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */
>      [0x2e] = { gen_helper_ucomiss, gen_helper_ucomisd },
> @@ -2879,6 +2879,8 @@ static void *sse_op_table1[256][4] = {
>      [0x75] = MMX_OP2(pcmpeqw),
>      [0x76] = MMX_OP2(pcmpeql),
>      [0x77] = { SSE_DUMMY }, /* emms */
> +    [0x78] = { NULL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* extrq_i, insertq_i */
> +    [0x79] = { NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r },
>      [0x7c] = { NULL, gen_helper_haddpd, NULL, gen_helper_haddps },
>      [0x7d] = { NULL, gen_helper_hsubpd, NULL, gen_helper_hsubps },
>      [0x7e] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movd, movd, , movq */
> @@ -3165,6 +3167,20 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
>              gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
>              gen_sto_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg]));
>              break;
> +        case 0x22b: /* movntss */
> +        case 0x32b: /* movntsd */
> +            if (mod == 3)
> +                goto illegal_op;
> +            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
> +            if (b1 & 1) {
> +                gen_stq_env_A0(s->mem_index, offsetof(CPUX86State,
> +                    xmm_regs[reg]));
> +            } else {
> +                tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
> +                    xmm_regs[reg].XMM_L(0)));
> +                gen_op_st_T0_A0(OT_LONG + s->mem_index);
> +            }
> +            break;
>          case 0x6e: /* movd mm, ea */
>  #ifdef TARGET_X86_64
>              if (s->dflag == 2) {
> @@ -3320,6 +3336,25 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
>              gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)),
>                          offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)));
>              break;
> +        case 0x178:
> +        case 0x378:
> +            {
> +                int bit_index, field_length;
> +
> +                if (b1 == 1 && reg != 0)
> +                    goto illegal_op;
> +                field_length = ldub_code(s->pc++) & 0x3F;
> +                bit_index = ldub_code(s->pc++) & 0x3F;
> +                tcg_gen_addi_ptr(cpu_ptr0, cpu_env,
> +                    offsetof(CPUX86State,xmm_regs[reg]));
> +                if (b1 == 1)
> +                    gen_helper_extrq_i(cpu_ptr0, tcg_const_i32(bit_index),
> +                        tcg_const_i32(field_length));
> +                else
> +                    gen_helper_insertq_i(cpu_ptr0, tcg_const_i32(bit_index),
> +                        tcg_const_i32(field_length));
> +            }
> +            break;
>          case 0x7e: /* movd ea, mm */
>  #ifdef TARGET_X86_64
>              if (s->dflag == 2) {
> @@ -7566,7 +7601,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
>      case 0x110 ... 0x117:
>      case 0x128 ... 0x12f:
>      case 0x138 ... 0x13a:
> -    case 0x150 ... 0x177:
> +    case 0x150 ... 0x179:
>      case 0x17c ... 0x17f:
>      case 0x1c2:
>      case 0x1c4 ... 0x1c6:
> -- 
> 1.6.1.3
> 
> 
> 
> 
>
diff mbox

Patch

diff --git a/target-i386/ops_sse.h b/target-i386/ops_sse.h
index 709732a..3232abd 100644
--- a/target-i386/ops_sse.h
+++ b/target-i386/ops_sse.h
@@ -802,6 +802,50 @@  void helper_rcpss(XMMReg *d, XMMReg *s)
     d->XMM_S(0) = approx_rcp(s->XMM_S(0));
 }
 
+/* EXTRQ core: return the len-bit field of src that starts at bit shift;
+ * len == 0 means a full 64-bit mask.  shift and len must be in 0..63. */
+static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
+{
+    uint64_t mask = len ? (1ULL << len) - 1 : ~0ULL;
+    return (src >> shift) & mask;
+}
+
+/* EXTRQ, register form: s byte 0 holds the length, s byte 1 the index. */
+void helper_extrq_r(XMMReg *d, XMMReg *s)
+{
+    /* Only the low 6 bits count; a shift count >= 64 is undefined in C. */
+    d->XMM_Q(0) = helper_extrq(d->XMM_Q(0), s->XMM_B(1) & 63,
+                               s->XMM_B(0) & 63);
+}
+
+/* EXTRQ, immediate form: index and length arrive as plain integers. */
+void helper_extrq_i(XMMReg *d, int index, int length)
+{
+    d->XMM_Q(0) = helper_extrq(d->XMM_Q(0), index, length);
+}
+
+/* INSERTQ core: copy the low len bits of src into dest at bit shift and
+ * keep every other bit of dest; len == 0 means a full 64-bit mask. */
+static inline uint64_t helper_insertq(uint64_t dest, uint64_t src,
+                                      int shift, int len)
+{
+    uint64_t mask = len ? (1ULL << len) - 1 : ~0ULL;
+    return (dest & ~(mask << shift)) | ((src & mask) << shift);
+}
+
+/* INSERTQ, register form: s byte 8 holds the length, s byte 9 the index. */
+void helper_insertq_r(XMMReg *d, XMMReg *s)
+{
+    d->XMM_Q(0) = helper_insertq(d->XMM_Q(0), s->XMM_Q(0),
+                                 s->XMM_B(9) & 63, s->XMM_B(8) & 63);
+}
+
+/* INSERTQ, immediate form: only d is passed, so the field comes from d. */
+void helper_insertq_i(XMMReg *d, int index, int length)
+{
+    d->XMM_Q(0) = helper_insertq(d->XMM_Q(0), d->XMM_Q(0), index, length);
+}
+
 void helper_haddps(XMMReg *d, XMMReg *s)
 {
     XMMReg r;
diff --git a/target-i386/ops_sse_header.h b/target-i386/ops_sse_header.h
index 53add99..a0a6361 100644
--- a/target-i386/ops_sse_header.h
+++ b/target-i386/ops_sse_header.h
@@ -187,6 +187,10 @@  DEF_HELPER_2(rsqrtps, void, XMMReg, XMMReg)
 DEF_HELPER_2(rsqrtss, void, XMMReg, XMMReg)
 DEF_HELPER_2(rcpps, void, XMMReg, XMMReg)
 DEF_HELPER_2(rcpss, void, XMMReg, XMMReg)
+DEF_HELPER_2(extrq_r, void, XMMReg, XMMReg)
+DEF_HELPER_3(extrq_i, void, XMMReg, int, int)
+DEF_HELPER_2(insertq_r, void, XMMReg, XMMReg)
+DEF_HELPER_3(insertq_i, void, XMMReg, int, int)
 DEF_HELPER_2(haddps, void, XMMReg, XMMReg)
 DEF_HELPER_2(haddpd, void, XMMReg, XMMReg)
 DEF_HELPER_2(hsubps, void, XMMReg, XMMReg)
diff --git a/target-i386/translate.c b/target-i386/translate.c
index 134c870..5cbcf07 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -2822,7 +2822,7 @@  static void *sse_op_table1[256][4] = {
     [0x28] = { SSE_SPECIAL, SSE_SPECIAL },  /* movaps, movapd */
     [0x29] = { SSE_SPECIAL, SSE_SPECIAL },  /* movaps, movapd */
     [0x2a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */
-    [0x2b] = { SSE_SPECIAL, SSE_SPECIAL },  /* movntps, movntpd */
+    [0x2b] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movntps, movntpd, movntss, movntsd */
     [0x2c] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si */
     [0x2d] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */
     [0x2e] = { gen_helper_ucomiss, gen_helper_ucomisd },
@@ -2879,6 +2879,8 @@  static void *sse_op_table1[256][4] = {
     [0x75] = MMX_OP2(pcmpeqw),
     [0x76] = MMX_OP2(pcmpeql),
     [0x77] = { SSE_DUMMY }, /* emms */
+    [0x78] = { NULL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* extrq_i, insertq_i */
+    [0x79] = { NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r },
     [0x7c] = { NULL, gen_helper_haddpd, NULL, gen_helper_haddps },
     [0x7d] = { NULL, gen_helper_hsubpd, NULL, gen_helper_hsubps },
     [0x7e] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movd, movd, , movq */
@@ -3165,6 +3167,20 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
             gen_sto_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg]));
             break;
+        case 0x22b: /* movntss */
+        case 0x32b: /* movntsd */
+            if (mod == 3)
+                goto illegal_op;
+            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+            if (b1 & 1) {
+                gen_stq_env_A0(s->mem_index, offsetof(CPUX86State,
+                    xmm_regs[reg]));
+            } else {
+                tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
+                    xmm_regs[reg].XMM_L(0)));
+                gen_op_st_T0_A0(OT_LONG + s->mem_index);
+            }
+            break;
         case 0x6e: /* movd mm, ea */
 #ifdef TARGET_X86_64
             if (s->dflag == 2) {
@@ -3320,6 +3336,25 @@  static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
             gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)),
                         offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)));
             break;
+        case 0x178:
+        case 0x378:
+            {
+                int bit_index, field_length;
+
+                if (b1 == 1 && reg != 0)
+                    goto illegal_op;
+                field_length = ldub_code(s->pc++) & 0x3F;
+                bit_index = ldub_code(s->pc++) & 0x3F;
+                tcg_gen_addi_ptr(cpu_ptr0, cpu_env,
+                    offsetof(CPUX86State,xmm_regs[reg]));
+                if (b1 == 1)
+                    gen_helper_extrq_i(cpu_ptr0, tcg_const_i32(bit_index),
+                        tcg_const_i32(field_length));
+                else
+                    gen_helper_insertq_i(cpu_ptr0, tcg_const_i32(bit_index),
+                        tcg_const_i32(field_length));
+            }
+            break;
         case 0x7e: /* movd ea, mm */
 #ifdef TARGET_X86_64
             if (s->dflag == 2) {
@@ -7566,7 +7601,7 @@  static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
     case 0x110 ... 0x117:
     case 0x128 ... 0x12f:
     case 0x138 ... 0x13a:
-    case 0x150 ... 0x177:
+    case 0x150 ... 0x179:
     case 0x17c ... 0x17f:
     case 0x1c2:
     case 0x1c4 ... 0x1c6: