Message ID | 20131120084245.GE892@tucnak.redhat.com |
---|---|
State | New |
Headers | show |
On Wed, 20 Nov 2013, Jakub Jelinek wrote: > Hi! > > I've noticed we generate terrible code for the testcase below. > E.g. with -mavx2 it is: > leal 6(%rdi), %edx > leal 12(%rdi), %ecx > leal 18(%rdi), %esi > leal 3(%rdi), %eax > movl %edx, -20(%rsp) > movl %ecx, -24(%rsp) > leal 9(%rdi), %edx > movl %esi, -28(%rsp) > vmovd -20(%rsp), %xmm6 > leal 15(%rdi), %ecx > movl %edi, -20(%rsp) > leal 21(%rdi), %esi > vmovd -28(%rsp), %xmm4 > vmovd -24(%rsp), %xmm5 > vpinsrd $1, %edx, %xmm6, %xmm3 > vmovd -20(%rsp), %xmm7 > vpinsrd $1, %esi, %xmm4, %xmm2 > vpinsrd $1, %ecx, %xmm5, %xmm1 > vpinsrd $1, %eax, %xmm7, %xmm0 > vpunpcklqdq %xmm2, %xmm1, %xmm1 > vpunpcklqdq %xmm3, %xmm0, %xmm0 > vinserti128 $0x1, %xmm1, %ymm0, %ymm0 > to load the { x, x + 3, x + 6, x + 9, x + 12, x + 15, x + 18, x + 21 } > CONSTRUCTOR into a vector register. This patch expands it as: > movl %edi, -20(%rsp) > vbroadcastss -20(%rsp), %ymm0 > vpaddd .LC0(%rip), %ymm0, %ymm0 > instead. Similarly for SSE2: > leal 3(%rdi), %eax > movl %eax, -20(%rsp) > leal 6(%rdi), %eax > movd -20(%rsp), %xmm4 > movl %eax, -16(%rsp) > leal 9(%rdi), %eax > movd -16(%rsp), %xmm1 > movl %edi, -16(%rsp) > movl %eax, -12(%rsp) > movd -16(%rsp), %xmm0 > movd -12(%rsp), %xmm3 > punpckldq %xmm4, %xmm0 > punpckldq %xmm3, %xmm1 > punpcklqdq %xmm1, %xmm0 > instead of: > movl %edi, -12(%rsp) > movd -12(%rsp), %xmm3 > pshufd $0, %xmm3, %xmm0 > paddd .LC0(%rip), %xmm0 > > The patch leaves that decision to the target (in it's > *_expand_vector_init). > > Ok? Aww ;) Nice improvement. Generally when I see this I always wonder whether we want to do this kind of stuff pre RTL expansion. 1st to not rely on being able to TER, 2nd to finally eventually get rid of TER. These patches are unfortunately a step backward for #2. As of the patch, do we have a way to query whether the target can efficiently broadcast? If so this IMHO belongs in generic code, not in ix86_expand_vector_init - passing down plus_constant results may pessimize other targets that don't understand this trick, no? (OTOH, which vector ISA does _not_ know how to broadcast ...?) Thanks, Richard. > 2013-11-20 Jakub Jelinek <jakub@redhat.com> > > * expr.c (store_constructor): If all CONSTRUCTOR values are > the same SSA_NAME plus optional constant, pass plus_constant results > down to vec_init. > * config/i386/i386.c (ix86_expand_vector_init): Optimize it using > broadcast followed by addition of a constant vector. > > * gcc.dg/vect/vect-124.c: New test. > > --- gcc/expr.c.jj 2013-11-19 21:56:27.000000000 +0100 > +++ gcc/expr.c 2013-11-20 09:24:14.435860130 +0100 > @@ -6296,6 +6296,8 @@ store_constructor (tree exp, rtx target, > rtvec vector = NULL; > unsigned n_elts; > alias_set_type alias; > + tree base = NULL_TREE; > + rtx base_rtx = NULL_RTX; > > gcc_assert (eltmode != BLKmode); > > @@ -6334,6 +6336,31 @@ store_constructor (tree exp, rtx target, > TYPE_SIZE (TREE_TYPE (value)), > TYPE_SIZE (elttype))); > > + /* Try to detect fairly common pattern of > + { a_5, a_5 + 1, a_5 + 2, a_5 + 3 }. */ > + if (vector > + && (count == 0 || base) > + && TREE_CODE (value) == SSA_NAME) > + { > + gimple g = get_gimple_for_ssa_name (value); > + tree this_base = value; > + if (g && is_gimple_assign (g)) > + switch (gimple_assign_rhs_code (g)) > + { > + case PLUS_EXPR: > + case POINTER_PLUS_EXPR: > + if (TREE_CODE (gimple_assign_rhs1 (g)) == SSA_NAME > + && tree_fits_shwi_p (gimple_assign_rhs2 (g))) > + this_base = gimple_assign_rhs1 (g); > + break; > + default: > + break; > + } > + if (count == 0) > + base = this_base; > + else if (base != this_base) > + base = NULL_TREE; > + } > count += n_elts_here; > if (mostly_zeros_p (value)) > zero_count += n_elts_here; > @@ -6342,6 +6369,9 @@ store_constructor (tree exp, rtx target, > /* Clear the entire vector first if there are any missing elements, > or if the incidence of zero elements is >= 75%. */ > need_to_clear = (count < n_elts || 4 * zero_count >= 3 * count); > + > + if (count < n_elts) > + base = NULL_TREE; > } > > if (need_to_clear && size > 0 && !vector) > @@ -6362,6 +6392,9 @@ store_constructor (tree exp, rtx target, > else > alias = get_alias_set (elttype); > > + if (base) > + base_rtx = expand_normal (base); > + > /* Store each element of the constructor into the corresponding > element of TARGET, determined by counting the elements. */ > for (idx = 0, i = 0; > @@ -6385,8 +6418,21 @@ store_constructor (tree exp, rtx target, > /* Vector CONSTRUCTORs should only be built from smaller > vectors in the case of BLKmode vectors. */ > gcc_assert (TREE_CODE (TREE_TYPE (value)) != VECTOR_TYPE); > - RTVEC_ELT (vector, eltpos) > - = expand_normal (value); > + if (base) > + { > + if (value == base) > + RTVEC_ELT (vector, eltpos) = base_rtx; > + else > + { > + gimple g = get_gimple_for_ssa_name (value); > + rtx cst = expand_normal (gimple_assign_rhs2 (g)); > + RTVEC_ELT (vector, eltpos) > + = plus_constant (TYPE_MODE (TREE_TYPE (value)), > + base_rtx, INTVAL (cst)); > + } > + } > + else > + RTVEC_ELT (vector, eltpos) = expand_normal (value); > } > else > { > --- gcc/config/i386/i386.c.jj 2013-11-19 21:56:41.000000000 +0100 > +++ gcc/config/i386/i386.c 2013-11-20 09:20:16.837649403 +0100 > @@ -37736,9 +37736,10 @@ ix86_expand_vector_init (bool mmx_ok, rt > enum machine_mode inner_mode = GET_MODE_INNER (mode); > int n_elts = GET_MODE_NUNITS (mode); > int n_var = 0, one_var = -1; > - bool all_same = true, all_const_zero = true; > + bool all_same = true, all_const_zero = true, need_force_reg = false; > int i; > rtx x; > + rtx base = NULL_RTX; > > for (i = 0; i < n_elts; ++i) > { > @@ -37746,11 +37747,25 @@ ix86_expand_vector_init (bool mmx_ok, rt > if (!(CONST_INT_P (x) > || GET_CODE (x) == CONST_DOUBLE > || GET_CODE (x) == CONST_FIXED)) > - n_var++, one_var = i; > + { > + n_var++; > + one_var = i; > + if (!REG_P (x) && !MEM_P (x)) > + need_force_reg = true; > + } > else if (x != CONST0_RTX (inner_mode)) > all_const_zero = false; > if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) > all_same = false; > + if (i == 0 || base) > + { > + if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1))) > + x = XEXP (x, 0); > + if (i == 0) > + base = x; > + else if (!rtx_equal_p (x, base)) > + base = NULL_RTX; > + } > } > > /* Constants are best loaded from the constant pool. */ > @@ -37760,6 +37775,49 @@ ix86_expand_vector_init (bool mmx_ok, rt > return; > } > > + /* For { a_5, a_5 + 4, a_5 + 2, a_5 + 7 } generate a broadcast > + followed by addition of constant. */ > + if (!all_same > + && base > + && n_elts >= 4 > + && (REG_P (base) || MEM_P (base))) > + { > + rtx subtarget = gen_reg_rtx (mode); > + if (ix86_expand_vector_init_duplicate (mmx_ok, mode, subtarget, base)) > + { > + rtvec cstvals = rtvec_alloc (n_elts); > + rtx cstsubtarget = gen_reg_rtx (mode); > + for (i = 0; i < n_elts; i++) > + { > + x = XVECEXP (vals, 0, i); > + if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1))) > + RTVEC_ELT (cstvals, i) = XEXP (x, 1); > + else > + RTVEC_ELT (cstvals, i) = const0_rtx; > + } > + ix86_expand_vector_init (mmx_ok, cstsubtarget, > + gen_rtx_PARALLEL (mode, cstvals)); > + subtarget = expand_simple_binop (mode, PLUS, subtarget, > + cstsubtarget, target, 1, > + OPTAB_DIRECT); > + if (subtarget != target) > + emit_move_insn (target, subtarget); > + return; > + } > + } > + > + if (need_force_reg) > + for (i = 0; i < n_elts; ++i) > + { > + x = XVECEXP (vals, 0, i); > + if (!(CONST_INT_P (x) > + || GET_CODE (x) == CONST_DOUBLE > + || GET_CODE (x) == CONST_FIXED > + || REG_P (x) > + || MEM_P (x))) > + XVECEXP (vals, 0, i) = force_reg (GET_MODE (x), x); > + } > + > /* If all values are identical, broadcast the value. */ > if (all_same > && ix86_expand_vector_init_duplicate (mmx_ok, mode, target, > --- gcc/testsuite/gcc.dg/vect/vect-124.c.jj 2013-11-20 09:27:35.760798649 +0100 > +++ gcc/testsuite/gcc.dg/vect/vect-124.c 2013-11-20 09:29:32.219199794 +0100 > @@ -0,0 +1,28 @@ > +#include "tree-vect.h" > + > +#ifndef N > +#define N 64 > +#endif > + > +int a[N]; > + > +__attribute__((noinline, noclone)) void > +foo (int x) > +{ > + int i; > + for (i = 0; i < N; i++, x += 3) > + a[i] = x; > +} > + > +int > +main () > +{ > + int i; > + > + check_vect (); > + foo (6); > + for (i = 0; i < N; i++) > + if (a[i] != i * 3 + 6) > + abort (); > + return 0; > +} > > Jakub > >
On Wed, Nov 20, 2013 at 10:31:38AM +0100, Richard Biener wrote: > Aww ;) Nice improvement. Generally when I see this I always wonder > whether we want to do this kind of stuff pre RTL expansion. > 1st to not rely on being able to TER, 2nd to finally eventually > get rid of TER. > > These patches are unfortunately a step backward for #2. > > As of the patch, do we have a way to query whether the target > can efficiently broadcast? If so this IMHO belongs in generic We don't. Perhaps if we'd add optab for vec_dup<mode> and mentioned clearly in the documentation that it should be used only if it is reasonably efficient. But still, even with optab, it would probably better to do it in the veclower* passes than in the vectorizer itself. Jakub
On Wed, 20 Nov 2013, Jakub Jelinek wrote: > On Wed, Nov 20, 2013 at 10:31:38AM +0100, Richard Biener wrote: > > Aww ;) Nice improvement. Generally when I see this I always wonder > > whether we want to do this kind of stuff pre RTL expansion. > > 1st to not rely on being able to TER, 2nd to finally eventually > > get rid of TER. > > > > These patches are unfortunately a step backward for #2. > > > > As of the patch, do we have a way to query whether the target > > can efficiently broadcast? If so this IMHO belongs in generic > > We don't. Perhaps if we'd add optab for vec_dup<mode> and mentioned > clearly in the documentation that it should be used only if it is reasonably > efficient. But still, even with optab, it would probably better to do it > in the veclower* passes than in the vectorizer itself. Yes, veclower would be a good fit. I was of course thinking of a new "apply TER" pass right before cfgexpand (for the cases where TER ends up doing some kind of "folding"). Richard.
On 11/20/2013 07:44 PM, Jakub Jelinek wrote: > On Wed, Nov 20, 2013 at 10:31:38AM +0100, Richard Biener wrote: >> Aww ;) Nice improvement. Generally when I see this I always wonder >> whether we want to do this kind of stuff pre RTL expansion. >> 1st to not rely on being able to TER, 2nd to finally eventually >> get rid of TER. >> >> These patches are unfortunately a step backward for #2. >> >> As of the patch, do we have a way to query whether the target >> can efficiently broadcast? If so this IMHO belongs in generic > > We don't. Perhaps if we'd add optab for vec_dup<mode> and mentioned > clearly in the documentation that it should be used only if it is reasonably > efficient. But still, even with optab, it would probably better to do it > in the veclower* passes than in the vectorizer itself. I think we can assume that broadcast is relatively efficient, whether or not vec_dup is present. I'd lean to making the transformation generic to start with, so that you don't need extra handling in the i386 backend. Surely any operation with a common operand and a constant operand should be applicable for this transformation. r~
--- gcc/expr.c.jj 2013-11-19 21:56:27.000000000 +0100 +++ gcc/expr.c 2013-11-20 09:24:14.435860130 +0100 @@ -6296,6 +6296,8 @@ store_constructor (tree exp, rtx target, rtvec vector = NULL; unsigned n_elts; alias_set_type alias; + tree base = NULL_TREE; + rtx base_rtx = NULL_RTX; gcc_assert (eltmode != BLKmode); @@ -6334,6 +6336,31 @@ store_constructor (tree exp, rtx target, TYPE_SIZE (TREE_TYPE (value)), TYPE_SIZE (elttype))); + /* Try to detect fairly common pattern of + { a_5, a_5 + 1, a_5 + 2, a_5 + 3 }. */ + if (vector + && (count == 0 || base) + && TREE_CODE (value) == SSA_NAME) + { + gimple g = get_gimple_for_ssa_name (value); + tree this_base = value; + if (g && is_gimple_assign (g)) + switch (gimple_assign_rhs_code (g)) + { + case PLUS_EXPR: + case POINTER_PLUS_EXPR: + if (TREE_CODE (gimple_assign_rhs1 (g)) == SSA_NAME + && tree_fits_shwi_p (gimple_assign_rhs2 (g))) + this_base = gimple_assign_rhs1 (g); + break; + default: + break; + } + if (count == 0) + base = this_base; + else if (base != this_base) + base = NULL_TREE; + } count += n_elts_here; if (mostly_zeros_p (value)) zero_count += n_elts_here; @@ -6342,6 +6369,9 @@ store_constructor (tree exp, rtx target, /* Clear the entire vector first if there are any missing elements, or if the incidence of zero elements is >= 75%. */ need_to_clear = (count < n_elts || 4 * zero_count >= 3 * count); + + if (count < n_elts) + base = NULL_TREE; } if (need_to_clear && size > 0 && !vector) @@ -6362,6 +6392,9 @@ store_constructor (tree exp, rtx target, else alias = get_alias_set (elttype); + if (base) + base_rtx = expand_normal (base); + /* Store each element of the constructor into the corresponding element of TARGET, determined by counting the elements. */ for (idx = 0, i = 0; @@ -6385,8 +6418,21 @@ store_constructor (tree exp, rtx target, /* Vector CONSTRUCTORs should only be built from smaller vectors in the case of BLKmode vectors. */ gcc_assert (TREE_CODE (TREE_TYPE (value)) != VECTOR_TYPE); - RTVEC_ELT (vector, eltpos) - = expand_normal (value); + if (base) + { + if (value == base) + RTVEC_ELT (vector, eltpos) = base_rtx; + else + { + gimple g = get_gimple_for_ssa_name (value); + rtx cst = expand_normal (gimple_assign_rhs2 (g)); + RTVEC_ELT (vector, eltpos) + = plus_constant (TYPE_MODE (TREE_TYPE (value)), + base_rtx, INTVAL (cst)); + } + } + else + RTVEC_ELT (vector, eltpos) = expand_normal (value); } else { --- gcc/config/i386/i386.c.jj 2013-11-19 21:56:41.000000000 +0100 +++ gcc/config/i386/i386.c 2013-11-20 09:20:16.837649403 +0100 @@ -37736,9 +37736,10 @@ ix86_expand_vector_init (bool mmx_ok, rt enum machine_mode inner_mode = GET_MODE_INNER (mode); int n_elts = GET_MODE_NUNITS (mode); int n_var = 0, one_var = -1; - bool all_same = true, all_const_zero = true; + bool all_same = true, all_const_zero = true, need_force_reg = false; int i; rtx x; + rtx base = NULL_RTX; for (i = 0; i < n_elts; ++i) { @@ -37746,11 +37747,25 @@ ix86_expand_vector_init (bool mmx_ok, rt if (!(CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE || GET_CODE (x) == CONST_FIXED)) - n_var++, one_var = i; + { + n_var++; + one_var = i; + if (!REG_P (x) && !MEM_P (x)) + need_force_reg = true; + } else if (x != CONST0_RTX (inner_mode)) all_const_zero = false; if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) all_same = false; + if (i == 0 || base) + { + if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1))) + x = XEXP (x, 0); + if (i == 0) + base = x; + else if (!rtx_equal_p (x, base)) + base = NULL_RTX; + } } /* Constants are best loaded from the constant pool. */ @@ -37760,6 +37775,49 @@ ix86_expand_vector_init (bool mmx_ok, rt return; } + /* For { a_5, a_5 + 4, a_5 + 2, a_5 + 7 } generate a broadcast + followed by addition of constant. */ + if (!all_same + && base + && n_elts >= 4 + && (REG_P (base) || MEM_P (base))) + { + rtx subtarget = gen_reg_rtx (mode); + if (ix86_expand_vector_init_duplicate (mmx_ok, mode, subtarget, base)) + { + rtvec cstvals = rtvec_alloc (n_elts); + rtx cstsubtarget = gen_reg_rtx (mode); + for (i = 0; i < n_elts; i++) + { + x = XVECEXP (vals, 0, i); + if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1))) + RTVEC_ELT (cstvals, i) = XEXP (x, 1); + else + RTVEC_ELT (cstvals, i) = const0_rtx; + } + ix86_expand_vector_init (mmx_ok, cstsubtarget, + gen_rtx_PARALLEL (mode, cstvals)); + subtarget = expand_simple_binop (mode, PLUS, subtarget, + cstsubtarget, target, 1, + OPTAB_DIRECT); + if (subtarget != target) + emit_move_insn (target, subtarget); + return; + } + } + + if (need_force_reg) + for (i = 0; i < n_elts; ++i) + { + x = XVECEXP (vals, 0, i); + if (!(CONST_INT_P (x) + || GET_CODE (x) == CONST_DOUBLE + || GET_CODE (x) == CONST_FIXED + || REG_P (x) + || MEM_P (x))) + XVECEXP (vals, 0, i) = force_reg (GET_MODE (x), x); + } + /* If all values are identical, broadcast the value. */ if (all_same && ix86_expand_vector_init_duplicate (mmx_ok, mode, target, --- gcc/testsuite/gcc.dg/vect/vect-124.c.jj 2013-11-20 09:27:35.760798649 +0100 +++ gcc/testsuite/gcc.dg/vect/vect-124.c 2013-11-20 09:29:32.219199794 +0100 @@ -0,0 +1,28 @@ +#include "tree-vect.h" + +#ifndef N +#define N 64 +#endif + +int a[N]; + +__attribute__((noinline, noclone)) void +foo (int x) +{ + int i; + for (i = 0; i < N; i++, x += 3) + a[i] = x; +} + +int +main () +{ + int i; + + check_vect (); + foo (6); + for (i = 0; i < N; i++) + if (a[i] != i * 3 + 6) + abort (); + return 0; +}