@@ -1856,3 +1856,45 @@ (define_predicate "prefixed_memory"
{
return address_is_prefixed (XEXP (op, 0), mode, NON_PREFIXED_DEFAULT);
})
+
+;; Return true if the operand is a valid memory operand with an offsettable
+;; address that can be split into 2 sub-addresses, each of which is a valid
+;; DS-form address (bottom 2 bits of the offset are 0).  This is used to
+;; optimize creating a vector of two DImode elements and then storing the
+;; vector.  We want to eliminate the direct moves from GPRs to form the vector
+;; and do the store directly from the GPRs.
+
+(define_predicate "ds_form_memory"
+  (match_code "mem")
+{
+  if (!memory_operand (op, mode))
+    return false;
+
+  rtx mem_addr = XEXP (op, 0);
+
+  /* An address that is just a register (or a subreg) is accepted without
+     any offset check.  */
+  if (REG_P (mem_addr) || SUBREG_P (mem_addr))
+    return true;
+
+  /* Everything else must be a base register plus a constant offset.  */
+  if (GET_CODE (mem_addr) != PLUS
+      || !base_reg_operand (XEXP (mem_addr, 0), Pmode)
+      || !CONST_INT_P (XEXP (mem_addr, 1)))
+    return false;
+
+  HOST_WIDE_INT disp = INTVAL (XEXP (mem_addr, 1));
+
+  /* With prefixed addressing only the 34-bit offset range is checked; the
+     DImode size is passed as the extra amount for the second address.  */
+  if (TARGET_PREFIXED)
+    return SIGNED_34BIT_OFFSET_EXTRA_P (disp, GET_MODE_SIZE (DImode));
+
+  /* Without prefixed addressing, each of the two addresses created must be
+     valid for a STD instruction, which is a DS-form instruction that requires
+     the bottom 2 bits of the offset to be 0.  */
+  if ((disp & 0x3) != 0)
+    return false;
+
+  return SIGNED_16BIT_OFFSET_EXTRA_P (disp, GET_MODE_SIZE (DImode));
+})
@@ -2896,6 +2896,90 @@ (define_insn "*vsx_concat_<mode>_3"
}
[(set_attr "type" "vecperm")])
+;; If the only use for a VEC_CONCAT is to store 2 64-bit values, replace it
+;; with two stores. Only do this on DImode, since it saves doing 1 direct move
+;; on power9, and 2 direct moves + XXPERMDI on power8 to form the vector so we
+;; can do a vector store. This typically shows up with -O3 where two stores
+;; are combined into a vector.
+;;
+;; Typically DFmode would generate XXPERMDI and a vector store.  Benchmarks
+;; like SPEC show that this is typically the same speed or faster than doing
+;; the two scalar DFmode stores.
+(define_insn_and_split "*concatv2di_store"
+  [(set (match_operand:V2DI 0 "memory_operand" "=m,m,m,m")
+        (vec_concat:V2DI
+         (match_operand:DI 1 "gpc_reg_operand" "r,wa,r,wa")
+         (match_operand:DI 2 "gpc_reg_operand" "r,wa,wa,r")))
+   (clobber (match_scratch:DI 3 "=&b,&b,&b,&b"))]
+  "TARGET_DIRECT_MOVE_64BIT"
+  "#"
+  "&& 1"
+  [(set (match_dup 4)
+        (match_dup 5))
+   (set (match_dup 6)
+        (match_dup 7))]
+{
+  rtx dest = operands[0];
+
+  /* Both scalar stores have to address their half of the vector.  When the
+     original address does not allow that, load it into the scratch base
+     register (a fresh pseudo if none was allocated yet) and store via it.  */
+  if (!ds_form_memory (dest, V2DImode))
+    {
+      rtx base = operands[3];
+      if (GET_CODE (base) == SCRATCH)
+        base = gen_reg_rtx (Pmode);
+
+      emit_move_insn (base, XEXP (dest, 0));
+      dest = change_address (dest, VOIDmode, base);
+    }
+
+  /* The stores are scalar, so the elements are written in operand order
+     and no swapping of the elements or of the stores is needed on little
+     endian systems.  */
+  operands[5] = operands[1];
+  operands[7] = operands[2];
+  operands[4] = adjust_address (dest, DImode, 0);
+  operands[6] = adjust_address (dest, DImode, 8);
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "store,fpstore,fpstore,store")])
+
+;; Optimize creating a vector with 2 duplicate DImode elements and storing it.
+(define_insn_and_split "*dupv2di_store"
+  [(set (match_operand:V2DI 0 "memory_operand" "=m,m")
+        (vec_duplicate:V2DI
+         (match_operand:DI 1 "gpc_reg_operand" "r,wa")))
+   (clobber (match_scratch:DI 2 "=&b,&b"))]
+  "TARGET_DIRECT_MOVE_64BIT"
+  "#"
+  "&& 1"
+  [(set (match_dup 3)
+        (match_dup 1))
+   (set (match_dup 4)
+        (match_dup 1))]
+{
+  rtx dest = operands[0];
+
+  /* The same register is stored to both halves of the vector.  If the
+     address cannot serve both stores directly, copy it into the scratch
+     base register, using a new pseudo while operand 2 is still a SCRATCH.  */
+  if (!ds_form_memory (dest, V2DImode))
+    {
+      rtx tmp_base = (GET_CODE (operands[2]) == SCRATCH
+                      ? gen_reg_rtx (Pmode)
+                      : operands[2]);
+
+      emit_move_insn (tmp_base, XEXP (dest, 0));
+      dest = change_address (dest, VOIDmode, tmp_base);
+    }
+
+  operands[3] = adjust_address (dest, DImode, 0);
+  operands[4] = adjust_address (dest, DImode, 8);
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "store,fpstore")])
+
;; Special purpose concat using xxpermdi to glue two single precision values
;; together, relying on the fact that internally scalar floats are represented
;; as doubles. This is used to initialize a V4SF vector with 4 floats
new file mode 100644
@@ -0,0 +1,61 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
+
+/* PR target/81594.  Optimize creating a vector of 2 64-bit elements and then
+   storing it, by issuing two separate scalar stores instead.  */
+
+/* Store the vector { first, second } at offset 0 from P.  */
+void
+store_v2di_0 (vector unsigned long long *p, unsigned long long first,
+              unsigned long long second)
+{
+  p[0] = (vector unsigned long long) { first, second };
+}
+
+/* Store the vector { first, second } to element 4 of P (byte offset 64).  */
+void
+store_v2di_4 (vector unsigned long long *p, unsigned long long first,
+              unsigned long long second)
+{
+  p[4] = (vector unsigned long long) { first, second };
+}
+
+void
+store_v2di_splat_0 (vector unsigned long long *p, unsigned long long a)
+{
+  *p = (vector unsigned long long) { a, a };  /* Splat A at offset 0.  */
+}
+
+void
+store_v2di_splat_8 (vector unsigned long long *p, unsigned long long value)
+{
+  *(p + 8) = (vector unsigned long long) { value, value };  /* p[8] */
+}
+
+/* Index 2047 (byte offset 32752) is the largest index for which both stores
+   can still use DS-form instructions.  */
+void
+store_v2di_2047 (vector unsigned long long *p, unsigned long long first,
+                 unsigned long long second)
+{
+  p[2047] = (vector unsigned long long) { first, second };
+}
+
+/* Index 2048 (byte offset 32768) is out of range for a pair of DS-form
+   instructions, so the constant has to be loaded into a register, or, with
+   prefixed addressing, a prefixed form is generated instead.  Either way two
+   separate stores should still be issued.  */
+void
+store_v2di_2048 (vector unsigned long long *p, unsigned long long first,
+                 unsigned long long second)
+{
+  p[2048] = (vector unsigned long long) { first, second };
+}
+
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxvx\M} } } */
+/* { dg-final { scan-assembler-not {\mmfvsrd\M} } } */
+/* { dg-final { scan-assembler-not {\mmtvsrd\M} } } */
+/* { dg-final { scan-assembler-not {\mmtvsrdd\M} } } */
+/* { dg-final { scan-assembler-not {\mxxpermdi\M} } } */