Message ID | 20190303143230.19742-1-hjl.tools@gmail.com |
---|---|
State | New |
Headers | show |
Series | Optimize vector init constructor | expand |
) ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > For vector init constructor: > > --- > typedef float __v4sf __attribute__ ((__vector_size__ (16))); > > __v4sf > foo (__v4sf x, float f) > { > __v4sf y = { f, x[1], x[2], x[3] }; > return y; > } > --- > > we can optimize vector init constructor with vector copy or permute > followed by a single scalar insert: > > __v4sf D.1912; > __v4sf D.1913; > __v4sf D.1914; > __v4sf y; > > x.0_1 = x; > D.1912 = x.0_1; > _2 = D.1912; > D.1913 = _2; > BIT_FIELD_REF <D.1913, 32, 0> = f; > y = D.1913; > D.1914 = y; > return D.1914; > > instead of > > __v4sf D.1962; > __v4sf y; > > _1 = BIT_FIELD_REF <x, 32, 32>; > _2 = BIT_FIELD_REF <x, 32, 64>; > _3 = BIT_FIELD_REF <x, 32, 96>; > y = {f, _1, _2, _3}; > D.1962 = y; > return D.1962; > > gcc/ > > PR tree-optimization/88828 > * gimplify.c (gimplify_init_constructor): Optimize vector init > constructor with vector copy or permute followed by a single > scalar insert. Doing this here does not catch things like: typedef float __v4sf __attribute__ ((__vector_size__ (16))); __v4sf vector_init (float f0,float f1, float f2,float f3) { __v4sf y = { f, x[1], x[2], x[3] }; return y; } __v4sf foo (__v4sf x, float f) { return vector_init (f, x[1], x[2], x[3]) ; } > > gcc/testsuite/ > > PR tree-optimization/88828 > * gcc.target/i386/pr88828-1.c: New test. > * gcc.target/i386/pr88828-2.c: Likewise. > * gcc.target/i386/pr88828-3a.c: Likewise. > * gcc.target/i386/pr88828-3b.c: Likewise. > * gcc.target/i386/pr88828-4a.c: Likewise. > * gcc.target/i386/pr88828-4b.c: Likewise. > * gcc.target/i386/pr88828-5a.c: Likewise. > * gcc.target/i386/pr88828-5b.c: Likewise. > * gcc.target/i386/pr88828-6a.c: Likewise. > * gcc.target/i386/pr88828-6b.c: Likewise. 
> --- > gcc/gimplify.c | 176 +++++++++++++++++++-- > gcc/testsuite/gcc.target/i386/pr88828-1.c | 16 ++ > gcc/testsuite/gcc.target/i386/pr88828-2.c | 17 ++ > gcc/testsuite/gcc.target/i386/pr88828-3a.c | 16 ++ > gcc/testsuite/gcc.target/i386/pr88828-3b.c | 18 +++ > gcc/testsuite/gcc.target/i386/pr88828-4a.c | 17 ++ > gcc/testsuite/gcc.target/i386/pr88828-4b.c | 20 +++ > gcc/testsuite/gcc.target/i386/pr88828-5a.c | 16 ++ > gcc/testsuite/gcc.target/i386/pr88828-5b.c | 18 +++ > gcc/testsuite/gcc.target/i386/pr88828-6a.c | 17 ++ > gcc/testsuite/gcc.target/i386/pr88828-6b.c | 19 +++ > 11 files changed, 336 insertions(+), 14 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c > > diff --git a/gcc/gimplify.c b/gcc/gimplify.c > index 983635ba21f..893a4311f9e 100644 > --- a/gcc/gimplify.c > +++ b/gcc/gimplify.c > @@ -5082,22 +5082,170 @@ gimplify_init_constructor (tree *expr_p, gimple_seq *pre_p, gimple_seq *post_p, > TREE_CONSTANT (ctor) = 0; > } > > - /* Vector types use CONSTRUCTOR all the way through gimple > - compilation as a general initializer. */ > - FOR_EACH_VEC_SAFE_ELT (elts, ix, ce) > + tree rhs_vector = NULL; > + /* The vector element to replace scalar elements, which > + will be overridden by scalar insert. */ > + tree vector_element = NULL; > + /* The single scalar element. 
*/ > + tree scalar_element = NULL; > + unsigned int scalar_idx = 0; > + enum { unknown, copy, permute, init } operation = unknown; > + bool insert = false; > + > + /* Check if we can generate vector copy or permute followed by > + a single scalar insert. */ > + if (TYPE_VECTOR_SUBPARTS (type).is_constant ()) > { > - enum gimplify_status tret; > - tret = gimplify_expr (&ce->value, pre_p, post_p, is_gimple_val, > - fb_rvalue); > - if (tret == GS_ERROR) > - ret = GS_ERROR; > - else if (TREE_STATIC (ctor) > - && !initializer_constant_valid_p (ce->value, > - TREE_TYPE (ce->value))) > - TREE_STATIC (ctor) = 0; > + /* If all RHS vector elements come from the same vector, > + we can use permute. If all RHS vector elements come > + from the same vector in the same order, we can use > + copy. */ > + unsigned int nunits > + = TYPE_VECTOR_SUBPARTS (type).to_constant (); > + unsigned int nscalars = 0; > + unsigned int nvectors = 0; > + operation = unknown; > + FOR_EACH_VEC_SAFE_ELT (elts, ix, ce) > + if (TREE_CODE (ce->value) == ARRAY_REF > + || TREE_CODE (ce->value) == ARRAY_RANGE_REF) > + { > + if (!vector_element) > + vector_element = ce->value; > + /* Get the vector index. */ > + tree idx = TREE_OPERAND (ce->value, 1); > + if (TREE_CODE (idx) == INTEGER_CST) > + { > + /* Get the RHS vector. */ > + tree r = ce->value; > + while (handled_component_p (r)) > + r = TREE_OPERAND (r, 0); > + if (type == TREE_TYPE (r)) > + { > + /* The RHS vector has the same type as > + LHS. */ > + if (rhs_vector == NULL) > + rhs_vector = r; > + > + /* Check if all RHS vector elements come > + fome the same vector. */ > + if (rhs_vector == r) > + { > + nvectors++; > + if (TREE_INT_CST_LOW (idx) == ix > + && (operation == unknown > + || operation == copy)) > + operation = copy; > + else > + operation = permute; > + continue; > + } > + } > + } > + > + /* Otherwise, use vector init. 
*/ > + break; > + } > + else if (TREE_CODE (TYPE_SIZE (TREE_TYPE (ce->value))) > + == INTEGER_CST) > + { > + /* Only allow one single scalar insert. */ > + if (nscalars != 0) > + break; > + nscalars = 1; > + insert = true; > + scalar_idx = ix; > + scalar_element = ce->value; > + } > + > + /* Allow a single scalar insert with vector copy or > + vector permute. Vector copy without insert is OK. */ > + if (nunits != (nscalars + nvectors) > + || (nscalars == 0 && operation != copy)) > + operation = unknown; > + } > + > + if (operation == unknown) > + { > + /* Default to the regular vector init constructor. */ > + operation = init; > + insert = false; > + } > + > + if (operation == copy) > + { > + /* Generate a vector copy. */ > + tree var = create_tmp_var (type); > + if (gimplify_expr (&rhs_vector, pre_p, post_p, > + is_gimple_val, fb_rvalue) == GS_ERROR) > + { > + ret = GS_ERROR; > + break; > + } > + gassign *init = gimple_build_assign (var, rhs_vector); > + gimple_seq_add_stmt (pre_p, init); > + if (gimplify_expr (&var, pre_p, post_p, is_gimple_val, > + fb_rvalue) == GS_ERROR) > + { > + ret = GS_ERROR; > + break; > + } > + /* Replace RHS with the vector copy. */ > + if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0))) > + TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (var, pre_p); > + else > + TREE_OPERAND (*expr_p, 1) = var; > + } > + else > + { > + /* Prepare for vector permute by replacing the scalar > + element with the vector one. */ > + if (operation == permute) > + (elts->address())[scalar_idx].value = vector_element; > + > + /* Vector types use CONSTRUCTOR all the way through gimple > + compilation as a general initializer. 
*/ > + FOR_EACH_VEC_SAFE_ELT (elts, ix, ce) > + { > + enum gimplify_status tret; > + tret = gimplify_expr (&ce->value, pre_p, post_p, > + is_gimple_val, > + fb_rvalue); > + if (tret == GS_ERROR) > + ret = GS_ERROR; > + else if (TREE_STATIC (ctor) > + && !initializer_constant_valid_p (ce->value, > + TREE_TYPE (ce->value))) > + TREE_STATIC (ctor) = 0; > + } > + if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0))) > + TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p); > + } > + > + if (insert) > + { > + /* Generate a single scalar insert after vector copy or > + permute. */ > + tree rhs = TREE_OPERAND (*expr_p, 1); > + tree var = create_tmp_var (type); > + gassign *init = gimple_build_assign (var, rhs); > + gimple_seq_add_stmt (pre_p, init); > + if (gimplify_expr (&scalar_element, pre_p, post_p, > + is_gimple_val, fb_rvalue) == GS_ERROR) > + { > + ret = GS_ERROR; > + break; > + } > + tree scalar_type = TREE_TYPE (scalar_element); > + tree scalar_size = TYPE_SIZE (scalar_type); > + tree bitpos = bitsize_int (scalar_idx > + * TREE_INT_CST_LOW (scalar_size)); > + tree ref = build3_loc (EXPR_LOCATION (rhs), BIT_FIELD_REF, > + scalar_type, var, scalar_size, > + bitpos); > + init = gimple_build_assign (ref, scalar_element); > + gimplify_seq_add_stmt (pre_p, init); > + TREE_OPERAND (*expr_p, 1) = var; > } > - if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0))) > - TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p); > } > break; > > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c > new file mode 100644 > index 00000000000..4ef1feab389 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse4" } */ > +/* { dg-final { scan-assembler "movss" } } */ > +/* { dg-final { scan-assembler-not "movaps" } } */ > +/* { dg-final { scan-assembler-not "movlhps" } } */ > +/* { dg-final { scan-assembler-not "unpcklps" } } */ > +/* { 
dg-final { scan-assembler-not "shufps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { f, x[1], x[2], x[3] }; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c > new file mode 100644 > index 00000000000..6dc482b6f4b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse4" } */ > +/* { dg-final { scan-assembler "movss" } } */ > +/* { dg-final { scan-assembler-not "movaps" } } */ > +/* { dg-final { scan-assembler-not "movlhps" } } */ > +/* { dg-final { scan-assembler-not "unpcklps" } } */ > +/* { dg-final { scan-assembler-not "shufps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = x; > + y[0] = f; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c > new file mode 100644 > index 00000000000..97eb8e7162a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse4" } */ > +/* { dg-final { scan-assembler "movss" } } */ > +/* { dg-final { scan-assembler-times "shufps" 1 } } */ > +/* { dg-final { scan-assembler-not "movaps" } } */ > +/* { dg-final { scan-assembler-not "movlhps" } } */ > +/* { dg-final { scan-assembler-not "unpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { f, x[0], x[2], x[3] }; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c > new file mode 100644 > index 00000000000..ab2ba730716 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } 
*/ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ > +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ > +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ > +/* { dg-final { scan-assembler-not "vmovaps" } } */ > +/* { dg-final { scan-assembler-not "vmovlhps" } } */ > +/* { dg-final { scan-assembler-not "vunpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { f, x[0], x[2], x[3] }; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c > new file mode 100644 > index 00000000000..a54689be701 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse4" } */ > +/* { dg-final { scan-assembler "movss" } } */ > +/* { dg-final { scan-assembler-times "shufps" 1 } } */ > +/* { dg-final { scan-assembler-not "movaps" } } */ > +/* { dg-final { scan-assembler-not "movlhps" } } */ > +/* { dg-final { scan-assembler-not "unpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { x[0], x[2], x[3], x[1] }; > + y[0] = f; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c > new file mode 100644 > index 00000000000..0c3a1024d93 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c > @@ -0,0 +1,20 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ > +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! 
ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ > +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ > +/* { dg-final { scan-assembler-not "vshufps" } } */ > +/* { dg-final { scan-assembler-not "vmovaps" } } */ > +/* { dg-final { scan-assembler-not "vmovlhps" } } */ > +/* { dg-final { scan-assembler-not "vunpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { x[0], x[2], x[3], x[1] }; > + y[0] = f; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c > new file mode 100644 > index 00000000000..534808d3cd1 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse4" } */ > +/* { dg-final { scan-assembler "movss" } } */ > +/* { dg-final { scan-assembler-times "shufps" 2 } } */ > +/* { dg-final { scan-assembler-times "movaps" 1 } } */ > +/* { dg-final { scan-assembler-not "movlhps" } } */ > +/* { dg-final { scan-assembler-not "unpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { x[0], x[2], x[3], f }; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c > new file mode 100644 > index 00000000000..aebea790979 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ > +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ > +/* { dg-final { scan-assembler-not "vmovss" } } */ > +/* { dg-final { scan-assembler-not "vshufps" } } */ > +/* { dg-final { scan-assembler-not "vmovaps" } } */ > +/* { dg-final { scan-assembler-not "vmovlhps" } } */ > +/* { 
dg-final { scan-assembler-not "vunpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { x[0], x[2], x[3], f }; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c > new file mode 100644 > index 00000000000..d43a36d9137 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse4" } */ > +/* { dg-final { scan-assembler "movss" } } */ > +/* { dg-final { scan-assembler-times "shufps" 2 } } */ > +/* { dg-final { scan-assembler-times "movaps" 1 } } */ > +/* { dg-final { scan-assembler-not "movlhps" } } */ > +/* { dg-final { scan-assembler-not "unpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { x[0], x[2], x[3], x[0] }; > + y[3] = f; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c > new file mode 100644 > index 00000000000..6856fe6500e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c > @@ -0,0 +1,19 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ > +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ > +/* { dg-final { scan-assembler-not "vshufps" } } */ > +/* { dg-final { scan-assembler-not "vmovss" } } */ > +/* { dg-final { scan-assembler-not "vmovaps" } } */ > +/* { dg-final { scan-assembler-not "vmovlhps" } } */ > +/* { dg-final { scan-assembler-not "vunpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { x[0], x[2], x[3], x[0] }; > + y[3] = f; > + return y; > +} > -- > 2.20.1 >
On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote: > ) > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > For vector init constructor: > > > > --- > > typedef float __v4sf __attribute__ ((__vector_size__ (16))); > > > > __v4sf > > foo (__v4sf x, float f) > > { > > __v4sf y = { f, x[1], x[2], x[3] }; > > return y; > > } > > --- > > > > we can optimize vector init constructor with vector copy or permute > > followed by a single scalar insert: > > > > __v4sf D.1912; > > __v4sf D.1913; > > __v4sf D.1914; > > __v4sf y; > > > > x.0_1 = x; > > D.1912 = x.0_1; > > _2 = D.1912; > > D.1913 = _2; > > BIT_FIELD_REF <D.1913, 32, 0> = f; > > y = D.1913; > > D.1914 = y; > > return D.1914; > > > > instead of > > > > __v4sf D.1962; > > __v4sf y; > > > > _1 = BIT_FIELD_REF <x, 32, 32>; > > _2 = BIT_FIELD_REF <x, 32, 64>; > > _3 = BIT_FIELD_REF <x, 32, 96>; > > y = {f, _1, _2, _3}; > > D.1962 = y; > > return D.1962; > > > > gcc/ > > > > PR tree-optimization/88828 > > * gimplify.c (gimplify_init_constructor): Optimize vector init > > constructor with vector copy or permute followed by a single > > scalar insert. > > > Doing this here does not catch things like: > typedef float __v4sf __attribute__ ((__vector_size__ (16))); > > > __v4sf > vector_init (float f0,float f1, float f2,float f3) > { > __v4sf y = { f, x[1], x[2], x[3] }; > return y; > } > > __v4sf > foo (__v4sf x, float f) > { > return vector_init (f, x[1], x[2], x[3]) ; > } > Here is a patch for simplify_vector_constructor to optimize vector init constructor with vector copy or permute followed by a single scalar insert. 
But this doesn't work correctly: [hjl@gnu-cfl-2 pr88828]$ cat bar.i typedef float __v4sf __attribute__ ((__vector_size__ (16))); static __v4sf vector_init (float f0,float f1, float f2,float f3) { __v4sf y = { f0, f1, f2, f3 }; return y; } __v4sf foo (__v4sf x, float f) { return vector_init (f, x[1], x[2], x[3]) ; } [hjl@gnu-cfl-2 pr88828]$ make bar.s /export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/ -O2 -S bar.i [hjl@gnu-cfl-2 pr88828]$ cat bar.s .file "bar.i" .text .p2align 4 .globl foo .type foo, @function foo: .LFB1: .cfi_startproc ret .cfi_endproc .LFE1: .size foo, .-foo .ident "GCC: (GNU) 9.0.1 20190303 (experimental)" .section .note.GNU-stack,"",@progbits [hjl@gnu-cfl-2 pr88828]$ Scalar insert is missing. --- gcc/tree-ssa-forwprop.c | 77 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 69 insertions(+), 8 deletions(-) diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c index eeb6281c652..b10cfccf7b8 100644 --- a/gcc/tree-ssa-forwprop.c +++ b/gcc/tree-ssa-forwprop.c @@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) unsigned elem_size, i; unsigned HOST_WIDE_INT nelts; enum tree_code code, conv_code; - constructor_elt *elt; + constructor_elt *ce; bool maybe_ident; gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR); @@ -2027,18 +2027,41 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) orig[1] = NULL; conv_code = ERROR_MARK; maybe_ident = true; - FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt) + + tree rhs_vector = NULL; + /* The single scalar element. 
*/ + tree scalar_element = NULL; + unsigned int scalar_idx = 0; + bool insert = false; + unsigned int nscalars = 0; + unsigned int nvectors = 0; + FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce) { tree ref, op1; if (i >= nelts) return false; - if (TREE_CODE (elt->value) != SSA_NAME) + if (TREE_CODE (ce->value) != SSA_NAME) return false; - def_stmt = get_prop_source_stmt (elt->value, false, NULL); + def_stmt = get_prop_source_stmt (ce->value, false, NULL); if (!def_stmt) - return false; + { + if ( gimple_nop_p (SSA_NAME_DEF_STMT (ce->value))) + { + /* Only allow one single scalar insert. */ + if (nscalars != 0) + return false; + + nscalars = 1; + insert = true; + scalar_idx = i; + scalar_element = ce->value; + continue; + } + else + return false; + } code = gimple_assign_rhs_code (def_stmt); if (code == FLOAT_EXPR || code == FIX_TRUNC_EXPR) @@ -2046,7 +2069,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) op1 = gimple_assign_rhs1 (def_stmt); if (conv_code == ERROR_MARK) { - if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))), + if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))), GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1))))) return false; conv_code = code; @@ -2095,6 +2118,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) elt += nelts; if (elt != i) maybe_ident = false; + + if (type == TREE_TYPE (ref)) + { + /* The RHS vector has the same type as LHS. */ + if (rhs_vector == NULL) + rhs_vector = ref; + /* Check if all RHS vector elements come fome the same + vector. */ + if (rhs_vector == ref) + nvectors++; + } + sel.quick_push (elt); } if (i < nelts) @@ -2113,6 +2148,12 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) || conv_code == CALL_EXPR)) return false; + /* Replace the scalar element with the vector element. 
*/ + if (insert + && (TYPE_VECTOR_SUBPARTS (type).to_constant () + == (nscalars + nvectors))) + sel.quick_push (scalar_idx); + if (maybe_ident) { if (conv_code == ERROR_MARK) @@ -2127,14 +2168,22 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts); if (!can_vec_perm_const_p (TYPE_MODE (type), indices)) - return false; + { + if (insert) + gcc_unreachable (); + return false; + } mask_type = build_vector_type (build_nonstandard_integer_type (elem_size, 1), nelts); if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)), GET_MODE_SIZE (TYPE_MODE (type)))) - return false; + { + if (insert) + gcc_unreachable (); + return false; + } op2 = vec_perm_indices_to_tree (mask_type, indices); if (!orig[1]) orig[1] = orig[0]; @@ -2153,6 +2202,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) } } update_stmt (gsi_stmt (*gsi)); + if (insert) + { + /* Generate a single scalar insert. */ + /* FIXME: This doesn't work correctly. */ + tree lhs = gimple_assign_lhs (stmt); + tree bitfield = build3 (BIT_FIELD_REF, elem_type, lhs, + bitsize_int (elem_size), + bitsize_int (scalar_idx * elem_size)); + gimple *new_stmt = gimple_build_assign (bitfield, scalar_element); + gsi_insert_after (gsi, new_stmt, GSI_SAME_STMT); + update_stmt (gsi_stmt (*gsi)); + } return true; }
On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote: > > ) > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > For vector init constructor: > > > > > > --- > > > typedef float __v4sf __attribute__ ((__vector_size__ (16))); > > > > > > __v4sf > > > foo (__v4sf x, float f) > > > { > > > __v4sf y = { f, x[1], x[2], x[3] }; > > > return y; > > > } > > > --- > > > > > > we can optimize vector init constructor with vector copy or permute > > > followed by a single scalar insert: > > > > > > __v4sf D.1912; > > > __v4sf D.1913; > > > __v4sf D.1914; > > > __v4sf y; > > > > > > x.0_1 = x; > > > D.1912 = x.0_1; > > > _2 = D.1912; > > > D.1913 = _2; > > > BIT_FIELD_REF <D.1913, 32, 0> = f; > > > y = D.1913; > > > D.1914 = y; > > > return D.1914; > > > > > > instead of > > > > > > __v4sf D.1962; > > > __v4sf y; > > > > > > _1 = BIT_FIELD_REF <x, 32, 32>; > > > _2 = BIT_FIELD_REF <x, 32, 64>; > > > _3 = BIT_FIELD_REF <x, 32, 96>; > > > y = {f, _1, _2, _3}; > > > D.1962 = y; > > > return D.1962; > > > > > > gcc/ > > > > > > PR tree-optimization/88828 > > > * gimplify.c (gimplify_init_constructor): Optimize vector init > > > constructor with vector copy or permute followed by a single > > > scalar insert. > > > > > > Doing this here does not catch things like: > > typedef float __v4sf __attribute__ ((__vector_size__ (16))); > > > > > > __v4sf > > vector_init (float f0,float f1, float f2,float f3) > > { > > __v4sf y = { f, x[1], x[2], x[3] }; > > return y; > > } > > > > __v4sf > > foo (__v4sf x, float f) > > { > > return vector_init (f, x[1], x[2], x[3]) ; > > } > > > > Here is a patch for simplify_vector_constructor to optimize vector init > constructor with vector copy or permute followed by a single scalar > insert. That's the correct place to fix this indeed. 
But this doesn't work correctly: > > [hjl@gnu-cfl-2 pr88828]$ cat bar.i > typedef float __v4sf __attribute__ ((__vector_size__ (16))); > > static __v4sf > vector_init (float f0,float f1, float f2,float f3) > { > __v4sf y = { f0, f1, f2, f3 }; > return y; > } > > __v4sf > foo (__v4sf x, float f) > { > return vector_init (f, x[1], x[2], x[3]) ; > } > [hjl@gnu-cfl-2 pr88828]$ make bar.s > /export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/ -O2 -S bar.i > [hjl@gnu-cfl-2 pr88828]$ cat bar.s > .file "bar.i" > .text > .p2align 4 > .globl foo > .type foo, @function > foo: > .LFB1: > .cfi_startproc > ret > .cfi_endproc > .LFE1: > .size foo, .-foo > .ident "GCC: (GNU) 9.0.1 20190303 (experimental)" > .section .note.GNU-stack,"",@progbits > [hjl@gnu-cfl-2 pr88828]$ > > Scalar insert is missing. > --- > gcc/tree-ssa-forwprop.c | 77 ++++++++++++++++++++++++++++++++++++----- > 1 file changed, 69 insertions(+), 8 deletions(-) > > diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c > index eeb6281c652..b10cfccf7b8 100644 > --- a/gcc/tree-ssa-forwprop.c > +++ b/gcc/tree-ssa-forwprop.c > @@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) > unsigned elem_size, i; > unsigned HOST_WIDE_INT nelts; > enum tree_code code, conv_code; > - constructor_elt *elt; > + constructor_elt *ce; > bool maybe_ident; > > gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR); > @@ -2027,18 +2027,41 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) > orig[1] = NULL; > conv_code = ERROR_MARK; > maybe_ident = true; > - FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt) > + > + tree rhs_vector = NULL; > + /* The single scalar element. 
*/ > + tree scalar_element = NULL; > + unsigned int scalar_idx = 0; > + bool insert = false; > + unsigned int nscalars = 0; > + unsigned int nvectors = 0; > + FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce) > { > tree ref, op1; > > if (i >= nelts) > return false; > > - if (TREE_CODE (elt->value) != SSA_NAME) > + if (TREE_CODE (ce->value) != SSA_NAME) > return false; > - def_stmt = get_prop_source_stmt (elt->value, false, NULL); > + def_stmt = get_prop_source_stmt (ce->value, false, NULL); > if (!def_stmt) > - return false; > + { > + if ( gimple_nop_p (SSA_NAME_DEF_STMT (ce->value))) > + { > + /* Only allow one single scalar insert. */ > + if (nscalars != 0) > + return false; > + > + nscalars = 1; > + insert = true; > + scalar_idx = i; > + scalar_element = ce->value; > + continue; > + } > + else > + return false; > + } > code = gimple_assign_rhs_code (def_stmt); > if (code == FLOAT_EXPR > || code == FIX_TRUNC_EXPR) > @@ -2046,7 +2069,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) > op1 = gimple_assign_rhs1 (def_stmt); > if (conv_code == ERROR_MARK) > { > - if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))), > + if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))), > GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1))))) > return false; > conv_code = code; > @@ -2095,6 +2118,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) > elt += nelts; > if (elt != i) > maybe_ident = false; > + > + if (type == TREE_TYPE (ref)) > + { > + /* The RHS vector has the same type as LHS. */ > + if (rhs_vector == NULL) > + rhs_vector = ref; > + /* Check if all RHS vector elements come fome the same > + vector. */ > + if (rhs_vector == ref) > + nvectors++; > + } > + > sel.quick_push (elt); > } > if (i < nelts) > @@ -2113,6 +2148,12 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) > || conv_code == CALL_EXPR)) > return false; > > + /* Replace the scalar element with the vector element. 
*/ > + if (insert > + && (TYPE_VECTOR_SUBPARTS (type).to_constant () > + == (nscalars + nvectors))) > + sel.quick_push (scalar_idx); > + > if (maybe_ident) > { > if (conv_code == ERROR_MARK) > @@ -2127,14 +2168,22 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) > > vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts); > if (!can_vec_perm_const_p (TYPE_MODE (type), indices)) > - return false; > + { > + if (insert) > + gcc_unreachable (); > + return false; > + } > mask_type > = build_vector_type (build_nonstandard_integer_type (elem_size, 1), > nelts); > if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT > || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)), > GET_MODE_SIZE (TYPE_MODE (type)))) > - return false; > + { > + if (insert) > + gcc_unreachable (); > + return false; > + } > op2 = vec_perm_indices_to_tree (mask_type, indices); > if (!orig[1]) > orig[1] = orig[0]; > @@ -2153,6 +2202,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) > } > } > update_stmt (gsi_stmt (*gsi)); > + if (insert) > + { > + /* Generate a single scalar insert. */ > + /* FIXME: This doesn't work correctly. */ > + tree lhs = gimple_assign_lhs (stmt); > + tree bitfield = build3 (BIT_FIELD_REF, elem_type, lhs, > + bitsize_int (elem_size), > + bitsize_int (scalar_idx * elem_size)); > + gimple *new_stmt = gimple_build_assign (bitfield, scalar_element); I think you want to generate from the original _1 = { .... }; the new _2 = copy or permute to _new_ LHS SSA name _1 = BIT_INSERT_EXPR <_2, scalar_element, scalar_idx * elem_size>; > + gsi_insert_after (gsi, new_stmt, GSI_SAME_STMT); and you want to advance to the _1 = BIT_INSERT_EXPR here. The easiest way is to emit a new stmt for _2 = copy ...; and do the set_rhs with the BIT_INSERT_EXPR. > + update_stmt (gsi_stmt (*gsi)); > + } > return true; > } > > -- > 2.20.1 >
diff --git a/gcc/gimplify.c b/gcc/gimplify.c index 983635ba21f..893a4311f9e 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -5082,22 +5082,170 @@ gimplify_init_constructor (tree *expr_p, gimple_seq *pre_p, gimple_seq *post_p, TREE_CONSTANT (ctor) = 0; } - /* Vector types use CONSTRUCTOR all the way through gimple - compilation as a general initializer. */ - FOR_EACH_VEC_SAFE_ELT (elts, ix, ce) + tree rhs_vector = NULL; + /* The vector element to replace scalar elements, which + will be overridden by scalar insert. */ + tree vector_element = NULL; + /* The single scalar element. */ + tree scalar_element = NULL; + unsigned int scalar_idx = 0; + enum { unknown, copy, permute, init } operation = unknown; + bool insert = false; + + /* Check if we can generate vector copy or permute followed by + a single scalar insert. */ + if (TYPE_VECTOR_SUBPARTS (type).is_constant ()) { - enum gimplify_status tret; - tret = gimplify_expr (&ce->value, pre_p, post_p, is_gimple_val, - fb_rvalue); - if (tret == GS_ERROR) - ret = GS_ERROR; - else if (TREE_STATIC (ctor) - && !initializer_constant_valid_p (ce->value, - TREE_TYPE (ce->value))) - TREE_STATIC (ctor) = 0; + /* If all RHS vector elements come from the same vector, + we can use permute. If all RHS vector elements come + from the same vector in the same order, we can use + copy. */ + unsigned int nunits + = TYPE_VECTOR_SUBPARTS (type).to_constant (); + unsigned int nscalars = 0; + unsigned int nvectors = 0; + operation = unknown; + FOR_EACH_VEC_SAFE_ELT (elts, ix, ce) + if (TREE_CODE (ce->value) == ARRAY_REF + || TREE_CODE (ce->value) == ARRAY_RANGE_REF) + { + if (!vector_element) + vector_element = ce->value; + /* Get the vector index. */ + tree idx = TREE_OPERAND (ce->value, 1); + if (TREE_CODE (idx) == INTEGER_CST) + { + /* Get the RHS vector. */ + tree r = ce->value; + while (handled_component_p (r)) + r = TREE_OPERAND (r, 0); + if (type == TREE_TYPE (r)) + { + /* The RHS vector has the same type as + LHS. 
*/ + if (rhs_vector == NULL) + rhs_vector = r; + + /* Check if all RHS vector elements come + from the same vector. */ + if (rhs_vector == r) + { + nvectors++; + if (TREE_INT_CST_LOW (idx) == ix + && (operation == unknown + || operation == copy)) + operation = copy; + else + operation = permute; + continue; + } + } + } + + /* Otherwise, use vector init. */ + break; + } + else if (TREE_CODE (TYPE_SIZE (TREE_TYPE (ce->value))) + == INTEGER_CST) + { + /* Only allow one single scalar insert. */ + if (nscalars != 0) + break; + nscalars = 1; + insert = true; + scalar_idx = ix; + scalar_element = ce->value; + } + + /* Allow a single scalar insert with vector copy or + vector permute. Vector copy without insert is OK. */ + if (nunits != (nscalars + nvectors) + || (nscalars == 0 && operation != copy)) + operation = unknown; + } + + if (operation == unknown) + { + /* Default to the regular vector init constructor. */ + operation = init; + insert = false; + } + + if (operation == copy) + { + /* Generate a vector copy. */ + tree var = create_tmp_var (type); + if (gimplify_expr (&rhs_vector, pre_p, post_p, + is_gimple_val, fb_rvalue) == GS_ERROR) + { + ret = GS_ERROR; + break; + } + gassign *init = gimple_build_assign (var, rhs_vector); + gimple_seq_add_stmt (pre_p, init); + if (gimplify_expr (&var, pre_p, post_p, is_gimple_val, + fb_rvalue) == GS_ERROR) + { + ret = GS_ERROR; + break; + } + /* Replace RHS with the vector copy. */ + if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0))) + TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (var, pre_p); + else + TREE_OPERAND (*expr_p, 1) = var; + } + else + { + /* Prepare for vector permute by replacing the scalar + element with the vector one. */ + if (operation == permute) + (elts->address())[scalar_idx].value = vector_element; + + /* Vector types use CONSTRUCTOR all the way through gimple + compilation as a general initializer. 
*/ + FOR_EACH_VEC_SAFE_ELT (elts, ix, ce) + { + enum gimplify_status tret; + tret = gimplify_expr (&ce->value, pre_p, post_p, + is_gimple_val, + fb_rvalue); + if (tret == GS_ERROR) + ret = GS_ERROR; + else if (TREE_STATIC (ctor) + && !initializer_constant_valid_p (ce->value, + TREE_TYPE (ce->value))) + TREE_STATIC (ctor) = 0; + } + if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0))) + TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p); + } + + if (insert) + { + /* Generate a single scalar insert after vector copy or + permute. */ + tree rhs = TREE_OPERAND (*expr_p, 1); + tree var = create_tmp_var (type); + gassign *init = gimple_build_assign (var, rhs); + gimple_seq_add_stmt (pre_p, init); + if (gimplify_expr (&scalar_element, pre_p, post_p, + is_gimple_val, fb_rvalue) == GS_ERROR) + { + ret = GS_ERROR; + break; + } + tree scalar_type = TREE_TYPE (scalar_element); + tree scalar_size = TYPE_SIZE (scalar_type); + tree bitpos = bitsize_int (scalar_idx + * TREE_INT_CST_LOW (scalar_size)); + tree ref = build3_loc (EXPR_LOCATION (rhs), BIT_FIELD_REF, + scalar_type, var, scalar_size, + bitpos); + init = gimple_build_assign (ref, scalar_element); + gimplify_seq_add_stmt (pre_p, init); + TREE_OPERAND (*expr_p, 1) = var; } - if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0))) - TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p); } break; diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c new file mode 100644 index 00000000000..4ef1feab389 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo 
(__v4sf x, float f) +{ + __v4sf y = { f, x[1], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c new file mode 100644 index 00000000000..6dc482b6f4b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = x; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c new file mode 100644 index 00000000000..97eb8e7162a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 1 } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { f, x[0], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c new file mode 100644 index 00000000000..ab2ba730716 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { f, x[0], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c new file mode 100644 index 00000000000..a54689be701 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 1 } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[1] }; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c new file mode 100644 index 00000000000..0c3a1024d93 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[1] }; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c new file mode 100644 index 00000000000..534808d3cd1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 2 } } */ +/* { dg-final { scan-assembler-times "movaps" 1 } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], f }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c new file mode 100644 index 00000000000..aebea790979 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ +/* { dg-final { scan-assembler-not "vmovss" } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ 
(16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], f }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c new file mode 100644 index 00000000000..d43a36d9137 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 2 } } */ +/* { dg-final { scan-assembler-times "movaps" 1 } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[0] }; + y[3] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c new file mode 100644 index 00000000000..6856fe6500e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovss" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[0] }; + y[3] = f; + return y; +}