[RFC,2/7] target/ppc: Implemented xvi*ger* instructions

Message ID 20220426125028.18844-3-lucas.araujo@eldorado.org.br
State New
Series VSX MMA Implementation

Commit Message

Lucas Mateus Martins Araujo e Castro April 26, 2022, 12:50 p.m. UTC
From: "Lucas Mateus Castro (alqotel)" <lucas.araujo@eldorado.org.br>

Implement the following PowerISA v3.1 instructions:
xvi4ger8:     VSX Vector 4-bit Signed Integer GER (rank-8 update)
xvi4ger8pp:   VSX Vector 4-bit Signed Integer GER (rank-8 update)
              Positive multiply, Positive accumulate
xvi8ger4:     VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update)
xvi8ger4pp:   VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update)
              Positive multiply, Positive accumulate
xvi8ger4spp:  VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update)
              with Saturate Positive multiply, Positive accumulate
xvi16ger2:    VSX Vector 16-bit Signed Integer GER (rank-2 update)
xvi16ger2pp:  VSX Vector 16-bit Signed Integer GER (rank-2 update)
              Positive multiply, Positive accumulate
xvi16ger2s:   VSX Vector 16-bit Signed Integer GER (rank-2 update)
              with Saturation
xvi16ger2spp: VSX Vector 16-bit Signed Integer GER (rank-2 update)
              with Saturation, Positive multiply, Positive accumulate
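
These compute, for each enabled element of the 4x4 accumulator AT, a
rank-R sum of products (a paraphrase of the helper below, not the ISA
pseudocode):

    AT[i][j] = [AT[i][j] +] sum(k = 0..R-1) a[i*R + k] * b[j*R + k]

where a and b hold the packed (32/R)-bit integer elements of XA and XB,
the pp/spp forms add into the previous AT[i][j], and the s forms
saturate the 32-bit result.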

Signed-off-by: Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br>
---
 target/ppc/cpu.h                    |  5 ++
 target/ppc/helper.h                 |  3 +
 target/ppc/insn32.decode            | 15 +++++
 target/ppc/int_helper.c             | 85 +++++++++++++++++++++++++++++
 target/ppc/internal.h               | 28 ++++++++++
 target/ppc/translate/vsx-impl.c.inc | 50 +++++++++++++++++
 6 files changed, 186 insertions(+)

Comments

Richard Henderson April 26, 2022, 11:40 p.m. UTC | #1
On 4/26/22 05:50, Lucas Mateus Castro(alqotel) wrote:
> +%xx_at          23:3 !function=times_4
> +@XX3_at         ...... ... .. ..... ..... ........ ...          &XX3 xt=%xx_at xb=%xx_xb

Hmm.  Depends, I suppose on whether you want acc[0-7] or vsr[0-28]

> +/*
> + * Packed VSX Integer GER Flags
> + * 00 - no accumulation no saturation
> + * 01 - accumulate but no saturation
> + * 10 - no accumulation but with saturation
> + * 11 - accumulate with saturation
> + */
> +static inline bool get_sat(uint32_t flags)
> +{
> +    return flags & 0x2;
> +}
> +
> +static inline bool get_acc(uint32_t flags)
> +{
> +    return flags & 0x1;
> +}

Better to have separate helpers for these?  They'd be immediate operands to the function 
replacing XVIGER (see below) and thus optimize well.

> +#define GET_VsrN(a, i) (extract32(a->VsrB((i) / 2), (i) % 2 ? 4 : 0, 4))
> +#define GET_VsrB(a, i) a->VsrB(i)
> +#define GET_VsrH(a, i) a->VsrH(i)
> +
> +#define GET_VsrSN(a, i) (sextract32(a->VsrSB((i) / 2), (i) % 2 ? 4 : 0, 4))
> +#define GET_VsrSB(a, i) a->VsrSB(i)
> +#define GET_VsrSH(a, i) a->VsrSH(i)

These can be made into functions of the form

     typedef int32_t xviger_extract(ppc_vsr_t *a, int i);
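
For instance (a sketch, one extractor per element access in the macros
above; the names are illustrative):

     static int32_t ger_extract_ub(ppc_vsr_t *a, int i)
     {
         return a->VsrB(i);                  /* unsigned 8-bit element */
     }

     static int32_t ger_extract_sb(ppc_vsr_t *a, int i)
     {
         return a->VsrSB(i);                 /* signed 8-bit element */
     }

     static int32_t ger_extract_sn(ppc_vsr_t *a, int i)
     {
         /* signed 4-bit nibble, two per byte */
         return sextract32(a->VsrSB(i / 2), i % 2 ? 4 : 0, 4);
     }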


> +#define XVIGER(NAME, RANK, EL)                                                 \
> +    void NAME(CPUPPCState *env, uint32_t a_r, uint32_t b_r,                    \
> +              uint32_t  at_r, uint32_t mask, uint32_t packed_flags)            \
> +    {                                                                          \
> +        ppc_vsr_t *a = cpu_vsr_ptr(env, a_r), *b = cpu_vsr_ptr(env, b_r), *at; \
> +        bool sat = get_sat(packed_flags), acc = get_acc(packed_flags);         \
> +        uint8_t pmsk = ger_get_pmsk(mask), xmsk = ger_get_xmsk(mask),          \
> +                ymsk = ger_get_ymsk(mask);                                     \
> +        uint8_t pmsk_bit, xmsk_bit, ymsk_bit;                                  \
> +        int64_t psum;                                                          \
> +        int32_t va, vb;                                                        \
> +        int i, j, k;                                                           \
> +        for (i = 0, xmsk_bit = 1 << 3; i < 4; i++, xmsk_bit >>= 1) {           \
> +            at = cpu_vsr_ptr(env, at_r + i);                                   \
> +            for (j = 0, ymsk_bit = 1 << 3; j < 4; j++, ymsk_bit >>= 1) {       \
> +                if ((xmsk_bit & xmsk) && (ymsk_bit & ymsk)) {                  \
> +                    psum = 0;                                                  \
> +                    for (k = 0, pmsk_bit = 1 << (RANK - 1); k < RANK;          \
> +                         k++, pmsk_bit >>= 1) {                                \
> +                        if (pmsk_bit & pmsk) {                                 \
> +                            va = (int32_t)GET_VsrS##EL(a, RANK * i + k);       \
> +                            vb = (int32_t) ((RANK == 4) ?                      \
> +                                                GET_Vsr##EL(b, RANK * j + k) : \
> +                                                GET_VsrS##EL(b, RANK * j + k));\
> +                            psum += va * vb;                                   \
> +                        }                                                      \
> +                    }                                                          \
> +                    if (acc) {                                                 \
> +                        psum += at->VsrSW(j);                                  \
> +                    }                                                          \
> +                    if (sat && psum > INT32_MAX) {                             \
> +                        set_vscr_sat(env);                                     \
> +                        at->VsrSW(j) = INT32_MAX;                              \
> +                    } else if (sat && psum < INT32_MIN) {                      \
> +                        set_vscr_sat(env);                                     \
> +                        at->VsrSW(j) = INT32_MIN;                              \
> +                    } else {                                                   \
> +                        at->VsrSW(j) = (int32_t) psum;                         \
> +                    }                                                          \
> +                } else {                                                       \
> +                    at->VsrSW(j) = 0;                                          \
> +                }                                                              \
> +            }                                                                  \
> +        }                                                                      \
> +    }

... which means that this monster can be a function instead of a non-debuggable macro.
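
E.g. something like this sketch (callback type as suggested above,
pointer arguments as discussed below; names invented, not the patch as
posted):

     static void xviger(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
                        ppc_vsr_t *at, uint32_t mask, bool sat, bool acc,
                        int rank, xviger_extract *extract_a,
                        xviger_extract *extract_b)
     {
         uint8_t pmsk = ger_get_pmsk(mask), xmsk = ger_get_xmsk(mask),
                 ymsk = ger_get_ymsk(mask);
         int i, j, k;

         /* Assumes AT names four consecutive VSRs, one per row. */
         for (i = 0; i < 4; i++, at++) {
             for (j = 0; j < 4; j++) {
                 if ((xmsk & (8 >> i)) && (ymsk & (8 >> j))) {
                     int64_t psum = acc ? at->VsrSW(j) : 0;
                     for (k = 0; k < rank; k++) {
                         if (pmsk & (1 << (rank - 1 - k))) {
                             psum += (int64_t)extract_a(a, rank * i + k) *
                                     extract_b(b, rank * j + k);
                         }
                     }
                     if (sat && psum > INT32_MAX) {
                         set_vscr_sat(env);
                         psum = INT32_MAX;
                     } else if (sat && psum < INT32_MIN) {
                         set_vscr_sat(env);
                         psum = INT32_MIN;
                     }
                     at->VsrSW(j) = (int32_t)psum;
                 } else {
                     at->VsrSW(j) = 0;
                 }
             }
         }
     }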

> diff --git a/target/ppc/internal.h b/target/ppc/internal.h
> index 8094e0b033..a994d98238 100644
> --- a/target/ppc/internal.h
> +++ b/target/ppc/internal.h
> @@ -291,4 +291,32 @@ G_NORETURN void ppc_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
>                                               uintptr_t retaddr);
>   #endif
>   
> +/*
> + * Auxiliary functions to pack/unpack masks for GER instructions.
> + *
> + * Packed format:
> + *  Bits 0-3: xmsk
> + *  Bits 4-7: ymsk
> + *  Bits 8-15: pmsk
> + */
> +static inline uint8_t ger_get_xmsk(uint32_t packed_masks)
> +{
> +    return packed_masks & 0xF;
> +}
> +
> +static inline uint8_t ger_get_ymsk(uint32_t packed_masks)
> +{
> +    return (packed_masks >> 4) & 0xF;
> +}
> +
> +static inline uint8_t ger_get_pmsk(uint32_t packed_masks)
> +{
> +    return (packed_masks >> 8) & 0xFF;
> +}
> +
> +static inline int ger_pack_masks(int pmsk, int ymsk, int xmsk)
> +{
> +    return (pmsk & 0xFF) << 8 | (ymsk & 0xF) << 4 | (xmsk & 0xF);
> +}

Use hw/registerfields.h.  C.f. PREDDESC in target/arm/internals.h.
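
I.e. something like (field layout from the comment above, following the
PREDDESC pattern):

     FIELD(GER_MSK, XMSK, 0, 4)
     FIELD(GER_MSK, YMSK, 4, 4)
     FIELD(GER_MSK, PMSK, 8, 8)

     static inline int ger_pack_masks(int pmsk, int ymsk, int xmsk)
     {
         int msk = 0;
         msk = FIELD_DP32(msk, GER_MSK, XMSK, xmsk);
         msk = FIELD_DP32(msk, GER_MSK, YMSK, ymsk);
         msk = FIELD_DP32(msk, GER_MSK, PMSK, pmsk);
         return msk;
     }

with ger_get_xmsk() et al becoming FIELD_EX32(mask, GER_MSK, XMSK) etc.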

> +static bool do_ger_XX3(DisasContext *ctx, arg_XX3 *a, uint32_t op,
> +                             void (*helper)(TCGv_env, TCGv_i32, TCGv_i32,
> +                                            TCGv_i32, TCGv_i32, TCGv_i32))
> +{
> +    uint32_t mask;
> +    REQUIRE_INSNS_FLAGS2(ctx, ISA310);
> +    REQUIRE_VSX(ctx);
> +    if (unlikely((a->xa / 4 == a->xt / 4) || (a->xb / 4 == a->xt / 4))) {
> +        gen_invalid(ctx);
> +        return true;
> +    }
> +
> +    mask = 0xFFFFFFFF;
> +    helper(cpu_env, tcg_constant_i32(a->xa), tcg_constant_i32(a->xb),
> +           tcg_constant_i32(a->xt), tcg_constant_i32(mask),
> +           tcg_constant_i32(op));
> +    return true;
> +}

Why are you passing register numbers instead of pointers, like everywhere else?


r~
Lucas Mateus Martins Araujo e Castro April 27, 2022, 8:24 p.m. UTC | #2
On 26/04/2022 20:40, Richard Henderson wrote:
>
> On 4/26/22 05:50, Lucas Mateus Castro(alqotel) wrote:
>> +%xx_at          23:3 !function=times_4
>> +@XX3_at         ...... ... .. ..... ..... ........ ...          &XX3 xt=%xx_at xb=%xx_xb
>
> Hmm.  Depends, I suppose on whether you want acc[0-7] or vsr[0-28]
I mostly used the VSR functions here, but since I'll change patch 1 to
your suggestion (which will require creating acc_full_offset), I'll make
a few changes to create some functions for the accumulator.
>
>> +/*
>> + * Packed VSX Integer GER Flags
>> + * 00 - no accumulation no saturation
>> + * 01 - accumulate but no saturation
>> + * 10 - no accumulation but with saturation
>> + * 11 - accumulate with saturation
>> + */
>> +static inline bool get_sat(uint32_t flags)
>> +{
>> +    return flags & 0x2;
>> +}
>> +
>> +static inline bool get_acc(uint32_t flags)
>> +{
>> +    return flags & 0x1;
>> +}
>
> Better to have separate helpers for these?  They'd be immediate 
> operands to the function
> replacing XVIGER (see below) and thus optimize well.
Do you mean different functions or a function that receives packed_flags 
along with the callback functions?
>
>> +#define GET_VsrN(a, i) (extract32(a->VsrB((i) / 2), (i) % 2 ? 4 : 0, 4))
>> +#define GET_VsrB(a, i) a->VsrB(i)
>> +#define GET_VsrH(a, i) a->VsrH(i)
>> +
>> +#define GET_VsrSN(a, i) (sextract32(a->VsrSB((i) / 2), (i) % 2 ? 4 : 0, 4))
>> +#define GET_VsrSB(a, i) a->VsrSB(i)
>> +#define GET_VsrSH(a, i) a->VsrSH(i)
>
> These can be made into functions of the form
>
>     typedef int32_t xviger_extract(ppc_vsr_t *a, int i);
>
In this case it'd be necessary to receive 2 xviger_extract functions,
since XVI8GER4* multiply one value as signed and the other as unsigned
(the other integer GER instructions treat both as signed).

An alternative would be to isolate the innermost loop into a different 
function, like:

     typedef int64_t do_ger(int32_t a, int32_t b, int32_t at, int32_t pmsk);

     static int64_t ger_rank4(int32_t a, int32_t b, int32_t at, int32_t mask)
     {
         int64_t psum = 0, i;
         for (i = 0; i < 4; i++, mask >>= 1) {
             if (mask & 1) {
                 psum += (sextract32(a, i * 8, 8)) * (extract32(b, i * 8, 8));
             }
         }
         return psum;
     }

That way we could avoid having 'rank' as a parameter. What do you think?
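
For instance a rank-2 version would be (a sketch; 'at' is unused here
but kept to match the proposed do_ger typedef):

     static int64_t ger_rank2(int32_t a, int32_t b, int32_t at, int32_t mask)
     {
         int64_t psum = 0;
         int i;

         for (i = 0; i < 2; i++, mask >>= 1) {
             if (mask & 1) {
                 psum += sextract32(a, i * 16, 16) * sextract32(b, i * 16, 16);
             }
         }
         return psum;
     }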

>
>
>> diff --git a/target/ppc/internal.h b/target/ppc/internal.h
>> index 8094e0b033..a994d98238 100644
>> --- a/target/ppc/internal.h
>> +++ b/target/ppc/internal.h
>> @@ -291,4 +291,32 @@ G_NORETURN void ppc_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
>>                                               uintptr_t retaddr);
>>   #endif
>>
>> +/*
>> + * Auxiliary functions to pack/unpack masks for GER instructions.
>> + *
>> + * Packed format:
>> + *  Bits 0-3: xmsk
>> + *  Bits 4-7: ymsk
>> + *  Bits 8-15: pmsk
>> + */
>> +static inline uint8_t ger_get_xmsk(uint32_t packed_masks)
>> +{
>> +    return packed_masks & 0xF;
>> +}
>> +
>> +static inline uint8_t ger_get_ymsk(uint32_t packed_masks)
>> +{
>> +    return (packed_masks >> 4) & 0xF;
>> +}
>> +
>> +static inline uint8_t ger_get_pmsk(uint32_t packed_masks)
>> +{
>> +    return (packed_masks >> 8) & 0xFF;
>> +}
>> +
>> +static inline int ger_pack_masks(int pmsk, int ymsk, int xmsk)
>> +{
>> +    return (pmsk & 0xFF) << 8 | (ymsk & 0xF) << 4 | (xmsk & 0xF);
>> +}
>
> Use hw/registerfields.h.  C.f. PREDDESC in target/arm/internals.h.
Ok, will do
>
>> +static bool do_ger_XX3(DisasContext *ctx, arg_XX3 *a, uint32_t op,
>> +                             void (*helper)(TCGv_env, TCGv_i32, TCGv_i32,
>> +                                            TCGv_i32, TCGv_i32, TCGv_i32))
>> +{
>> +    uint32_t mask;
>> +    REQUIRE_INSNS_FLAGS2(ctx, ISA310);
>> +    REQUIRE_VSX(ctx);
>> +    if (unlikely((a->xa / 4 == a->xt / 4) || (a->xb / 4 == a->xt / 4))) {
>> +        gen_invalid(ctx);
>> +        return true;
>> +    }
>> +
>> +    mask = 0xFFFFFFFF;
>> +    helper(cpu_env, tcg_constant_i32(a->xa), tcg_constant_i32(a->xb),
>> +           tcg_constant_i32(a->xt), tcg_constant_i32(mask),
>> +           tcg_constant_i32(op));
>> +    return true;
>> +}
>
> Why are you passing register numbers instead of pointers, like 
> everywhere else?
Because here we are not working with only one register per register
number: the ACC uses 4, and XVF64GER* needs to use XA and XA+1. While
VSR is an array, so I could do ppc_vsr_ptr + 1, I thought it was better
not to access memory I was not given a pointer to, so I passed XA so I
can request cpu_vsr_ptr(env, xa) and cpu_vsr_ptr(env, xa + 1).
>
>
> r~
Richard Henderson April 27, 2022, 10:28 p.m. UTC | #3
On 4/27/22 13:24, Lucas Mateus Martins Araujo e Castro wrote:
> 
> On 26/04/2022 20:40, Richard Henderson wrote:
>>
>> On 4/26/22 05:50, Lucas Mateus Castro(alqotel) wrote:
>>> +%xx_at          23:3 !function=times_4
>>> +@XX3_at         ...... ... .. ..... ..... ........ ... &XX3 xt=%xx_at xb=%xx_xb
>>
>> Hmm.  Depends, I suppose on whether you want acc[0-7] or vsr[0-28]
> I mostly used the VSR functions here, but since I'll change patch 1 to your suggestion
> (which will require creating acc_full_offset), I'll make a few changes to create some
> functions for the accumulator.
>>
>>> +/*
>>> + * Packed VSX Integer GER Flags
>>> + * 00 - no accumulation no saturation
>>> + * 01 - accumulate but no saturation
>>> + * 10 - no accumulation but with saturation
>>> + * 11 - accumulate with saturation
>>> + */
>>> +static inline bool get_sat(uint32_t flags)
>>> +{
>>> +    return flags & 0x2;
>>> +}
>>> +
>>> +static inline bool get_acc(uint32_t flags)
>>> +{
>>> +    return flags & 0x1;
>>> +}
>>
>> Better to have separate helpers for these?  They'd be immediate operands to the function
>> replacing XVIGER (see below) and thus optimize well.
> Do you mean different functions or a function that receives packed_flags along with the 
> callback functions?

I mean separate helper entry points, which use a common function that receives these as 
separate boolean arguments, along with the callbacks.  Use QEMU_FLATTEN on the helper 
entry points to ensure that everything is inlined and the constant args are optimized.
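
Something like this, assuming the callback-based common function and
pointer arguments (sketch only):

     QEMU_FLATTEN
     void helper_XVI8GER4(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
                          ppc_vsr_t *at, uint32_t mask)
     {
         /* sat/acc are constant here and fold away after inlining */
         xviger(env, a, b, at, mask, false, false, 4,
                ger_extract_sb, ger_extract_ub);
     }

     QEMU_FLATTEN
     void helper_XVI8GER4PP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
                            ppc_vsr_t *at, uint32_t mask)
     {
         xviger(env, a, b, at, mask, false, true, 4,
                ger_extract_sb, ger_extract_ub);
     }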

> In this case it'd be necessary to receive 2 xviger_extract functions, since XVI8GER4*
> multiply one value as signed and the other as unsigned (the other integer GER
> instructions treat both as signed).

Certainly.

> 
> An alternative would be to isolate the innermost loop into a different function, like:
> 
>      typedef int64_t do_ger(int32_t a, int32_t b, int32_t at, int32_t pmsk);
> 
>      static int64_t ger_rank4(int32_t a, int32_t b, int32_t at, int32_t mask)
>      {
>          int64_t psum = 0, i;
>          for (i = 0; i < 4; i++, mask >>= 1) {
>              if (mask & 1) {
>                  psum += (sextract32(a, i * 8, 8)) * (extract32(b, i * 8, 8));
>             }
>          }
>          return psum;
>      }
> 
> That way we could avoid having 'rank' as a parameter. What do you think?

Reasonable.  I certainly like extracting uint32_t from the vector generically and not 
having to pass that on further.

>> Why are you passing register numbers instead of pointers, like everywhere else?
> Because here we are not working with only one register per register number: the ACC uses
> 4, and XVF64GER* needs to use XA and XA+1. While VSR is an array, so I could do
> ppc_vsr_ptr + 1, I thought it was better not to access memory I was not given a pointer
> to, so I passed XA so I can request cpu_vsr_ptr(env, xa) and cpu_vsr_ptr(env, xa + 1).

I think using cpu_vsr_ptr is the mistake.

It might be clarifying to define a ppc_acc_t, if only as a typedef of ppc_vsr_t.  The 
acc_full_offset function will compute the offset for this pointer and, importantly, will 
be the place to modify if and when the architecture changes to allow or require separate 
storage for the ACC registers.
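
E.g. (a sketch, assuming ACC[i] keeps aliasing VSR[4*i]..VSR[4*i+3]):

     typedef ppc_vsr_t ppc_acc_t;

     static inline long acc_full_offset(int i)
     {
         /* ACC[i] currently overlaps VSR[4 * i] .. VSR[4 * i + 3]. */
         return vsr_full_offset(i * 4);
     }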


r~

Patch

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index c2b6c987c0..ee55c6cfa2 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -2688,6 +2688,11 @@  static inline uint64_t *cpu_vsrl_ptr(CPUPPCState *env, int i)
     return (uint64_t *)((uintptr_t)env + vsr64_offset(i, false));
 }
 
+static inline ppc_vsr_t *cpu_vsr_ptr(CPUPPCState *env, int i)
+{
+    return (ppc_vsr_t *)((uintptr_t)env + vsr_full_offset(i));
+}
+
 static inline long avr64_offset(int i, bool high)
 {
     return vsr64_offset(i + 32, high);
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index aa6773c4a5..06553517de 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -537,6 +537,9 @@  DEF_HELPER_5(XXBLENDVB, void, vsr, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XXBLENDVH, void, vsr, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XXBLENDVW, void, vsr, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XXBLENDVD, void, vsr, vsr, vsr, vsr, i32)
+DEF_HELPER_6(XVI4GER8, void, env, i32, i32, i32, i32, i32)
+DEF_HELPER_6(XVI8GER4, void, env, i32, i32, i32, i32, i32)
+DEF_HELPER_6(XVI16GER2, void, env, i32, i32, i32, i32, i32)
 
 DEF_HELPER_2(efscfsi, i32, env, i32)
 DEF_HELPER_2(efscfui, i32, env, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 7a76bedfa6..653f50db93 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -170,6 +170,9 @@ 
 &XX3            xt xa xb
 @XX3            ...... ..... ..... ..... ........ ...           &XX3 xt=%xx_xt xa=%xx_xa xb=%xx_xb
 
+%xx_at          23:3 !function=times_4
+@XX3_at         ...... ... .. ..... ..... ........ ...          &XX3 xt=%xx_at xb=%xx_xb
+
 &XX3_dm         xt xa xb dm
 @XX3_dm         ...... ..... ..... ..... . dm:2 ..... ...       &XX3_dm xt=%xx_xt xa=%xx_xa xb=%xx_xb
 
@@ -719,3 +722,15 @@  RFEBB           010011-------------- .   0010010010 -   @XL_s
 XXMFACC         011111 ... -- 00000 ----- 0010110001 -   @X_a
 XXMTACC         011111 ... -- 00001 ----- 0010110001 -   @X_a
 XXSETACCZ       011111 ... -- 00011 ----- 0010110001 -   @X_a
+
+## Vector GER instruction
+
+XVI4GER8        111011 ... -- ..... ..... 00100011 ..-  @XX3_at xa=%xx_xa
+XVI4GER8PP      111011 ... -- ..... ..... 00100010 ..-  @XX3_at xa=%xx_xa
+XVI8GER4        111011 ... -- ..... ..... 00000011 ..-  @XX3_at xa=%xx_xa
+XVI8GER4PP      111011 ... -- ..... ..... 00000010 ..-  @XX3_at xa=%xx_xa
+XVI16GER2       111011 ... -- ..... ..... 01001011 ..-  @XX3_at xa=%xx_xa
+XVI16GER2PP     111011 ... -- ..... ..... 01101011 ..-  @XX3_at xa=%xx_xa
+XVI8GER4SPP     111011 ... -- ..... ..... 01100011 ..-  @XX3_at xa=%xx_xa
+XVI16GER2S      111011 ... -- ..... ..... 00101011 ..-  @XX3_at xa=%xx_xa
+XVI16GER2SPP    111011 ... -- ..... ..... 00101010 ..-  @XX3_at xa=%xx_xa
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 8c1674510b..bd2f1a7c2a 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -782,6 +782,91 @@  VCT(uxs, cvtsduw, u32)
 VCT(sxs, cvtsdsw, s32)
 #undef VCT
 
+/*
+ * Packed VSX Integer GER Flags
+ * 00 - no accumulation no saturation
+ * 01 - accumulate but no saturation
+ * 10 - no accumulation but with saturation
+ * 11 - accumulate with saturation
+ */
+static inline bool get_sat(uint32_t flags)
+{
+    return flags & 0x2;
+}
+
+static inline bool get_acc(uint32_t flags)
+{
+    return flags & 0x1;
+}
+
+#define GET_VsrN(a, i) (extract32(a->VsrB((i) / 2), (i) % 2 ? 4 : 0, 4))
+#define GET_VsrB(a, i) a->VsrB(i)
+#define GET_VsrH(a, i) a->VsrH(i)
+
+#define GET_VsrSN(a, i) (sextract32(a->VsrSB((i) / 2), (i) % 2 ? 4 : 0, 4))
+#define GET_VsrSB(a, i) a->VsrSB(i)
+#define GET_VsrSH(a, i) a->VsrSH(i)
+
+#define XVIGER(NAME, RANK, EL)                                                 \
+    void NAME(CPUPPCState *env, uint32_t a_r, uint32_t b_r,                    \
+              uint32_t  at_r, uint32_t mask, uint32_t packed_flags)            \
+    {                                                                          \
+        ppc_vsr_t *a = cpu_vsr_ptr(env, a_r), *b = cpu_vsr_ptr(env, b_r), *at; \
+        bool sat = get_sat(packed_flags), acc = get_acc(packed_flags);         \
+        uint8_t pmsk = ger_get_pmsk(mask), xmsk = ger_get_xmsk(mask),          \
+                ymsk = ger_get_ymsk(mask);                                     \
+        uint8_t pmsk_bit, xmsk_bit, ymsk_bit;                                  \
+        int64_t psum;                                                          \
+        int32_t va, vb;                                                        \
+        int i, j, k;                                                           \
+        for (i = 0, xmsk_bit = 1 << 3; i < 4; i++, xmsk_bit >>= 1) {           \
+            at = cpu_vsr_ptr(env, at_r + i);                                   \
+            for (j = 0, ymsk_bit = 1 << 3; j < 4; j++, ymsk_bit >>= 1) {       \
+                if ((xmsk_bit & xmsk) && (ymsk_bit & ymsk)) {                  \
+                    psum = 0;                                                  \
+                    for (k = 0, pmsk_bit = 1 << (RANK - 1); k < RANK;          \
+                         k++, pmsk_bit >>= 1) {                                \
+                        if (pmsk_bit & pmsk) {                                 \
+                            va = (int32_t)GET_VsrS##EL(a, RANK * i + k);       \
+                            vb = (int32_t) ((RANK == 4) ?                      \
+                                                GET_Vsr##EL(b, RANK * j + k) : \
+                                                GET_VsrS##EL(b, RANK * j + k));\
+                            psum += va * vb;                                   \
+                        }                                                      \
+                    }                                                          \
+                    if (acc) {                                                 \
+                        psum += at->VsrSW(j);                                  \
+                    }                                                          \
+                    if (sat && psum > INT32_MAX) {                             \
+                        set_vscr_sat(env);                                     \
+                        at->VsrSW(j) = INT32_MAX;                              \
+                    } else if (sat && psum < INT32_MIN) {                      \
+                        set_vscr_sat(env);                                     \
+                        at->VsrSW(j) = INT32_MIN;                              \
+                    } else {                                                   \
+                        at->VsrSW(j) = (int32_t) psum;                         \
+                    }                                                          \
+                } else {                                                       \
+                    at->VsrSW(j) = 0;                                          \
+                }                                                              \
+            }                                                                  \
+        }                                                                      \
+    }
+
+XVIGER(helper_XVI4GER8, 8, N)
+XVIGER(helper_XVI8GER4, 4, B)
+XVIGER(helper_XVI16GER2, 2, H)
+
+#undef GER_MULT
+#undef XVIGER_NAME
+#undef XVIGER
+#undef GET_VsrN
+#undef GET_VsrB
+#undef GET_VsrH
+#undef GET_VsrSN
+#undef GET_VsrSB
+#undef GET_VsrSH
+
 target_ulong helper_vclzlsbb(ppc_avr_t *r)
 {
     target_ulong count = 0;
diff --git a/target/ppc/internal.h b/target/ppc/internal.h
index 8094e0b033..a994d98238 100644
--- a/target/ppc/internal.h
+++ b/target/ppc/internal.h
@@ -291,4 +291,32 @@  G_NORETURN void ppc_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
                                             uintptr_t retaddr);
 #endif
 
+/*
+ * Auxiliary functions to pack/unpack masks for GER instructions.
+ *
+ * Packed format:
+ *  Bits 0-3: xmsk
+ *  Bits 4-7: ymsk
+ *  Bits 8-15: pmsk
+ */
+static inline uint8_t ger_get_xmsk(uint32_t packed_masks)
+{
+    return packed_masks & 0xF;
+}
+
+static inline uint8_t ger_get_ymsk(uint32_t packed_masks)
+{
+    return (packed_masks >> 4) & 0xF;
+}
+
+static inline uint8_t ger_get_pmsk(uint32_t packed_masks)
+{
+    return (packed_masks >> 8) & 0xFF;
+}
+
+static inline int ger_pack_masks(int pmsk, int ymsk, int xmsk)
+{
+    return (pmsk & 0xFF) << 8 | (ymsk & 0xF) << 4 | (xmsk & 0xF);
+}
+
 #endif /* PPC_INTERNAL_H */
diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc
index 919b889c40..1eb68c7081 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -2823,6 +2823,56 @@  static bool trans_XXSETACCZ(DisasContext *ctx, arg_X_a *a)
     return true;
 }
 
+/*
+ * Packed VSX Integer GER Flags
+ * 00 - no accumulation no saturation
+ * 01 - accumulate but no saturation
+ * 10 - no accumulation but with saturation
+ * 11 - accumulate with saturation
+ */
+static uint32_t pack_flags_xvi(int acc, int sat)
+{
+    return (sat << 1) | acc;
+}
+
+static bool do_ger_XX3(DisasContext *ctx, arg_XX3 *a, uint32_t op,
+                             void (*helper)(TCGv_env, TCGv_i32, TCGv_i32,
+                                            TCGv_i32, TCGv_i32, TCGv_i32))
+{
+    uint32_t mask;
+    REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+    REQUIRE_VSX(ctx);
+    if (unlikely((a->xa / 4 == a->xt / 4) || (a->xb / 4 == a->xt / 4))) {
+        gen_invalid(ctx);
+        return true;
+    }
+
+    mask = 0xFFFFFFFF;
+    helper(cpu_env, tcg_constant_i32(a->xa), tcg_constant_i32(a->xb),
+           tcg_constant_i32(a->xt), tcg_constant_i32(mask),
+           tcg_constant_i32(op));
+    return true;
+}
+
+/* Used to keep line length < 80 */
+#define GER_NOP pack_flags_xvi(0, 0)
+#define GER_PP  pack_flags_xvi(1, 0)
+#define GER_SAT pack_flags_xvi(0, 1)
+#define GER_SPP pack_flags_xvi(1, 1)
+TRANS(XVI4GER8, do_ger_XX3, GER_NOP, gen_helper_XVI4GER8)
+TRANS(XVI4GER8PP, do_ger_XX3, GER_PP, gen_helper_XVI4GER8)
+TRANS(XVI8GER4, do_ger_XX3, GER_NOP, gen_helper_XVI8GER4)
+TRANS(XVI8GER4PP, do_ger_XX3, GER_PP, gen_helper_XVI8GER4)
+TRANS(XVI8GER4SPP, do_ger_XX3, GER_SPP, gen_helper_XVI8GER4)
+TRANS(XVI16GER2, do_ger_XX3, GER_NOP, gen_helper_XVI16GER2)
+TRANS(XVI16GER2PP, do_ger_XX3, GER_PP, gen_helper_XVI16GER2)
+TRANS(XVI16GER2S, do_ger_XX3, GER_SAT, gen_helper_XVI16GER2)
+TRANS(XVI16GER2SPP, do_ger_XX3, GER_SPP, gen_helper_XVI16GER2)
+#undef GER_NOP
+#undef GER_PP
+#undef GER_SAT
+#undef GER_SPP
+
 #undef GEN_XX2FORM
 #undef GEN_XX3FORM
 #undef GEN_XX2IFORM