diff mbox series

Optimize vector init constructor

Message ID 20190303143230.19742-1-hjl.tools@gmail.com
State New
Headers show
Series Optimize vector init constructor | expand

Commit Message

H.J. Lu March 3, 2019, 2:32 p.m. UTC
For vector init constructor:

---
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

__v4sf
foo (__v4sf x, float f)
{
  __v4sf y = { f, x[1], x[2], x[3] };
  return y;
}
---

we can optimize the vector init constructor into a vector copy or permute
followed by a single scalar insert:

  __v4sf D.1912;
  __v4sf D.1913;
  __v4sf D.1914;
  __v4sf y;

  x.0_1 = x;
  D.1912 = x.0_1;
  _2 = D.1912;
  D.1913 = _2;
  BIT_FIELD_REF <D.1913, 32, 0> = f;
  y = D.1913;
  D.1914 = y;
  return D.1914;

instead of

  __v4sf D.1962;
  __v4sf y;

  _1 = BIT_FIELD_REF <x, 32, 32>;
  _2 = BIT_FIELD_REF <x, 32, 64>;
  _3 = BIT_FIELD_REF <x, 32, 96>;
  y = {f, _1, _2, _3};
  D.1962 = y;
  return D.1962;

gcc/

	PR tree-optimization/88828
	* gimplify.c (gimplify_init_constructor): Optimize vector init
	constructor with vector copy or permute followed by a single
	scalar insert.

gcc/testsuite/

	PR tree-optimization/88828
	* gcc.target/i386/pr88828-1.c: New test.
	* gcc.target/i386/pr88828-2.c: Likewise.
	* gcc.target/i386/pr88828-3a.c: Likewise.
	* gcc.target/i386/pr88828-3b.c: Likewise.
	* gcc.target/i386/pr88828-4a.c: Likewise.
	* gcc.target/i386/pr88828-4b.c: Likewise.
	* gcc.target/i386/pr88828-5a.c: Likewise.
	* gcc.target/i386/pr88828-5b.c: Likewise.
	* gcc.target/i386/pr88828-6a.c: Likewise.
	* gcc.target/i386/pr88828-6b.c: Likewise.
---
 gcc/gimplify.c                             | 176 +++++++++++++++++++--
 gcc/testsuite/gcc.target/i386/pr88828-1.c  |  16 ++
 gcc/testsuite/gcc.target/i386/pr88828-2.c  |  17 ++
 gcc/testsuite/gcc.target/i386/pr88828-3a.c |  16 ++
 gcc/testsuite/gcc.target/i386/pr88828-3b.c |  18 +++
 gcc/testsuite/gcc.target/i386/pr88828-4a.c |  17 ++
 gcc/testsuite/gcc.target/i386/pr88828-4b.c |  20 +++
 gcc/testsuite/gcc.target/i386/pr88828-5a.c |  16 ++
 gcc/testsuite/gcc.target/i386/pr88828-5b.c |  18 +++
 gcc/testsuite/gcc.target/i386/pr88828-6a.c |  17 ++
 gcc/testsuite/gcc.target/i386/pr88828-6b.c |  19 +++
 11 files changed, 336 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c

Comments

Andrew Pinski March 3, 2019, 2:40 p.m. UTC | #1
On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> For vector init constructor:
>
> ---
> typedef float __v4sf __attribute__ ((__vector_size__ (16)));
>
> __v4sf
> foo (__v4sf x, float f)
> {
>   __v4sf y = { f, x[1], x[2], x[3] };
>   return y;
> }
> ---
>
> we can optimize vector init constructor with vector copy or permute
> followed by a single scalar insert:
>
>   __v4sf D.1912;
>   __v4sf D.1913;
>   __v4sf D.1914;
>   __v4sf y;
>
>   x.0_1 = x;
>   D.1912 = x.0_1;
>   _2 = D.1912;
>   D.1913 = _2;
>   BIT_FIELD_REF <D.1913, 32, 0> = f;
>   y = D.1913;
>   D.1914 = y;
>   return D.1914;
>
> instead of
>
>   __v4sf D.1962;
>   __v4sf y;
>
>   _1 = BIT_FIELD_REF <x, 32, 32>;
>   _2 = BIT_FIELD_REF <x, 32, 64>;
>   _3 = BIT_FIELD_REF <x, 32, 96>;
>   y = {f, _1, _2, _3};
>   D.1962 = y;
>   return D.1962;
>
> gcc/
>
>         PR tree-optimization/88828
>         * gimplify.c (gimplify_init_constructor): Optimize vector init
>         constructor with vector copy or permute followed by a single
>         scalar insert.


Doing this here does not catch things like:
typedef float __v4sf __attribute__ ((__vector_size__ (16)));


__v4sf
vector_init (float f0,float f1, float f2,float f3)
{
  __v4sf y = { f0, f1, f2, f3 };
   return y;
}

__v4sf
foo (__v4sf x, float f)
{
  return vector_init (f, x[1], x[2], x[3]) ;
}

>
> gcc/testsuite/
>
>         PR tree-optimization/88828
>         * gcc.target/i386/pr88828-1.c: New test.
>         * gcc.target/i386/pr88828-2.c: Likewise.
>         * gcc.target/i386/pr88828-3a.c: Likewise.
>         * gcc.target/i386/pr88828-3b.c: Likewise.
>         * gcc.target/i386/pr88828-4a.c: Likewise.
>         * gcc.target/i386/pr88828-4b.c: Likewise.
>         * gcc.target/i386/pr88828-5a.c: Likewise.
>         * gcc.target/i386/pr88828-5b.c: Likewise.
>         * gcc.target/i386/pr88828-6a.c: Likewise.
>         * gcc.target/i386/pr88828-6b.c: Likewise.
> ---
>  gcc/gimplify.c                             | 176 +++++++++++++++++++--
>  gcc/testsuite/gcc.target/i386/pr88828-1.c  |  16 ++
>  gcc/testsuite/gcc.target/i386/pr88828-2.c  |  17 ++
>  gcc/testsuite/gcc.target/i386/pr88828-3a.c |  16 ++
>  gcc/testsuite/gcc.target/i386/pr88828-3b.c |  18 +++
>  gcc/testsuite/gcc.target/i386/pr88828-4a.c |  17 ++
>  gcc/testsuite/gcc.target/i386/pr88828-4b.c |  20 +++
>  gcc/testsuite/gcc.target/i386/pr88828-5a.c |  16 ++
>  gcc/testsuite/gcc.target/i386/pr88828-5b.c |  18 +++
>  gcc/testsuite/gcc.target/i386/pr88828-6a.c |  17 ++
>  gcc/testsuite/gcc.target/i386/pr88828-6b.c |  19 +++
>  11 files changed, 336 insertions(+), 14 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c
>
> diff --git a/gcc/gimplify.c b/gcc/gimplify.c
> index 983635ba21f..893a4311f9e 100644
> --- a/gcc/gimplify.c
> +++ b/gcc/gimplify.c
> @@ -5082,22 +5082,170 @@ gimplify_init_constructor (tree *expr_p, gimple_seq *pre_p, gimple_seq *post_p,
>             TREE_CONSTANT (ctor) = 0;
>           }
>
> -       /* Vector types use CONSTRUCTOR all the way through gimple
> -          compilation as a general initializer.  */
> -       FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
> +       tree rhs_vector = NULL;
> +       /* The vector element to replace scalar elements, which
> +          will be overridden by scalar insert.  */
> +       tree vector_element = NULL;
> +       /* The single scalar element.  */
> +       tree scalar_element = NULL;
> +       unsigned int scalar_idx = 0;
> +       enum { unknown, copy, permute, init } operation = unknown;
> +       bool insert = false;
> +
> +       /* Check if we can generate vector copy or permute followed by
> +          a single scalar insert.  */
> +       if (TYPE_VECTOR_SUBPARTS (type).is_constant ())
>           {
> -           enum gimplify_status tret;
> -           tret = gimplify_expr (&ce->value, pre_p, post_p, is_gimple_val,
> -                                 fb_rvalue);
> -           if (tret == GS_ERROR)
> -             ret = GS_ERROR;
> -           else if (TREE_STATIC (ctor)
> -                    && !initializer_constant_valid_p (ce->value,
> -                                                      TREE_TYPE (ce->value)))
> -             TREE_STATIC (ctor) = 0;
> +           /* If all RHS vector elements come from the same vector,
> +              we can use permute.  If all RHS vector elements come
> +              from the same vector in the same order, we can use
> +              copy.  */
> +           unsigned int nunits
> +             = TYPE_VECTOR_SUBPARTS (type).to_constant ();
> +           unsigned int nscalars = 0;
> +           unsigned int nvectors = 0;
> +           operation = unknown;
> +           FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
> +             if (TREE_CODE (ce->value) == ARRAY_REF
> +                 || TREE_CODE (ce->value) == ARRAY_RANGE_REF)
> +               {
> +                 if (!vector_element)
> +                   vector_element = ce->value;
> +                 /* Get the vector index.  */
> +                 tree idx = TREE_OPERAND (ce->value, 1);
> +                 if (TREE_CODE (idx) == INTEGER_CST)
> +                   {
> +                     /* Get the RHS vector.  */
> +                     tree r = ce->value;
> +                     while (handled_component_p (r))
> +                       r = TREE_OPERAND (r, 0);
> +                     if (type == TREE_TYPE (r))
> +                       {
> +                         /* The RHS vector has the same type as
> +                            LHS.  */
> +                         if (rhs_vector == NULL)
> +                           rhs_vector = r;
> +
> +                         /* Check if all RHS vector elements come
> +                            from the same vector.  */
> +                         if (rhs_vector == r)
> +                           {
> +                             nvectors++;
> +                             if (TREE_INT_CST_LOW (idx) == ix
> +                                 && (operation == unknown
> +                                     || operation == copy))
> +                               operation = copy;
> +                             else
> +                               operation = permute;
> +                             continue;
> +                           }
> +                       }
> +                   }
> +
> +                 /* Otherwise, use vector init.  */
> +                 break;
> +               }
> +             else if (TREE_CODE (TYPE_SIZE (TREE_TYPE (ce->value)))
> +                      == INTEGER_CST)
> +               {
> +                 /* Only allow one single scalar insert.  */
> +                 if (nscalars != 0)
> +                   break;
> +                 nscalars = 1;
> +                 insert = true;
> +                 scalar_idx = ix;
> +                 scalar_element = ce->value;
> +               }
> +
> +           /* Allow a single scalar insert with vector copy or
> +              vector permute.  Vector copy without insert is OK.  */
> +           if (nunits != (nscalars + nvectors)
> +               || (nscalars == 0 && operation != copy))
> +             operation = unknown;
> +         }
> +
> +       if (operation == unknown)
> +         {
> +           /* Default to the regular vector init constructor.  */
> +           operation = init;
> +           insert = false;
> +         }
> +
> +       if (operation == copy)
> +         {
> +           /* Generate a vector copy.  */
> +           tree var = create_tmp_var (type);
> +           if (gimplify_expr (&rhs_vector, pre_p, post_p,
> +                              is_gimple_val, fb_rvalue) == GS_ERROR)
> +             {
> +               ret = GS_ERROR;
> +               break;
> +             }
> +           gassign *init = gimple_build_assign (var, rhs_vector);
> +           gimple_seq_add_stmt (pre_p, init);
> +           if (gimplify_expr (&var, pre_p, post_p, is_gimple_val,
> +                              fb_rvalue) == GS_ERROR)
> +             {
> +               ret = GS_ERROR;
> +               break;
> +             }
> +           /* Replace RHS with the vector copy.  */
> +           if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
> +             TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (var, pre_p);
> +           else
> +             TREE_OPERAND (*expr_p, 1) = var;
> +         }
> +       else
> +         {
> +           /* Prepare for vector permute by replacing the scalar
> +              element with the vector one.  */
> +           if (operation == permute)
> +             (elts->address())[scalar_idx].value = vector_element;
> +
> +           /* Vector types use CONSTRUCTOR all the way through gimple
> +              compilation as a general initializer.  */
> +           FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
> +             {
> +               enum gimplify_status tret;
> +               tret = gimplify_expr (&ce->value, pre_p, post_p,
> +                                     is_gimple_val,
> +                                     fb_rvalue);
> +               if (tret == GS_ERROR)
> +                 ret = GS_ERROR;
> +               else if (TREE_STATIC (ctor)
> +                        && !initializer_constant_valid_p (ce->value,
> +                                                          TREE_TYPE (ce->value)))
> +                 TREE_STATIC (ctor) = 0;
> +             }
> +           if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
> +             TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p);
> +         }
> +
> +       if (insert)
> +         {
> +           /* Generate a single scalar insert after vector copy or
> +              permute.  */
> +           tree rhs = TREE_OPERAND (*expr_p, 1);
> +           tree var = create_tmp_var (type);
> +           gassign *init = gimple_build_assign (var, rhs);
> +           gimple_seq_add_stmt (pre_p, init);
> +           if (gimplify_expr (&scalar_element, pre_p, post_p,
> +                              is_gimple_val, fb_rvalue) == GS_ERROR)
> +             {
> +               ret = GS_ERROR;
> +               break;
> +             }
> +           tree scalar_type = TREE_TYPE (scalar_element);
> +           tree scalar_size = TYPE_SIZE (scalar_type);
> +           tree bitpos = bitsize_int (scalar_idx
> +                                      * TREE_INT_CST_LOW (scalar_size));
> +           tree ref = build3_loc (EXPR_LOCATION (rhs), BIT_FIELD_REF,
> +                                  scalar_type, var, scalar_size,
> +                                  bitpos);
> +           init = gimple_build_assign (ref, scalar_element);
> +           gimplify_seq_add_stmt (pre_p, init);
> +           TREE_OPERAND (*expr_p, 1) = var;
>           }
> -       if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
> -         TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p);
>        }
>        break;
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c
> new file mode 100644
> index 00000000000..4ef1feab389
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-not "movaps" } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +/* { dg-final { scan-assembler-not "shufps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { f, x[1], x[2], x[3] };
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
> new file mode 100644
> index 00000000000..6dc482b6f4b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-not "movaps" } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +/* { dg-final { scan-assembler-not "shufps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = x;
> +  y[0] = f;
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
> new file mode 100644
> index 00000000000..97eb8e7162a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-times "shufps" 1 } } */
> +/* { dg-final { scan-assembler-not "movaps" } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { f, x[0], x[2], x[3] };
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
> new file mode 100644
> index 00000000000..ab2ba730716
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovlhps" } } */
> +/* { dg-final { scan-assembler-not "vunpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { f, x[0], x[2], x[3] };
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
> new file mode 100644
> index 00000000000..a54689be701
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-times "shufps" 1 } } */
> +/* { dg-final { scan-assembler-not "movaps" } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { x[0], x[2], x[3], x[1] };
> +  y[0] = f;
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
> new file mode 100644
> index 00000000000..0c3a1024d93
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vshufps" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovlhps" } } */
> +/* { dg-final { scan-assembler-not "vunpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { x[0], x[2], x[3], x[1] };
> +  y[0] = f;
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
> new file mode 100644
> index 00000000000..534808d3cd1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-times "shufps" 2 } } */
> +/* { dg-final { scan-assembler-times "movaps" 1 } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { x[0], x[2], x[3], f };
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
> new file mode 100644
> index 00000000000..aebea790979
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
> +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
> +/* { dg-final { scan-assembler-not "vmovss" } } */
> +/* { dg-final { scan-assembler-not "vshufps" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovlhps" } } */
> +/* { dg-final { scan-assembler-not "vunpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { x[0], x[2], x[3], f };
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
> new file mode 100644
> index 00000000000..d43a36d9137
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-times "shufps" 2 } } */
> +/* { dg-final { scan-assembler-times "movaps" 1 } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { x[0], x[2], x[3], x[0] };
> +  y[3] = f;
> +  return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
> new file mode 100644
> index 00000000000..6856fe6500e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
> +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
> +/* { dg-final { scan-assembler-not "vshufps" } } */
> +/* { dg-final { scan-assembler-not "vmovss" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovlhps" } } */
> +/* { dg-final { scan-assembler-not "vunpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> +  __v4sf y = { x[0], x[2], x[3], x[0] };
> +  y[3] = f;
> +  return y;
> +}
> --
> 2.20.1
>
H.J. Lu March 3, 2019, 9:13 p.m. UTC | #2
On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> )
> ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > For vector init constructor:
> >
> > ---
> > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> >
> > __v4sf
> > foo (__v4sf x, float f)
> > {
> >   __v4sf y = { f, x[1], x[2], x[3] };
> >   return y;
> > }
> > ---
> >
> > we can optimize vector init constructor with vector copy or permute
> > followed by a single scalar insert:
> >
> >   __v4sf D.1912;
> >   __v4sf D.1913;
> >   __v4sf D.1914;
> >   __v4sf y;
> >
> >   x.0_1 = x;
> >   D.1912 = x.0_1;
> >   _2 = D.1912;
> >   D.1913 = _2;
> >   BIT_FIELD_REF <D.1913, 32, 0> = f;
> >   y = D.1913;
> >   D.1914 = y;
> >   return D.1914;
> >
> > instead of
> >
> >   __v4sf D.1962;
> >   __v4sf y;
> >
> >   _1 = BIT_FIELD_REF <x, 32, 32>;
> >   _2 = BIT_FIELD_REF <x, 32, 64>;
> >   _3 = BIT_FIELD_REF <x, 32, 96>;
> >   y = {f, _1, _2, _3};
> >   D.1962 = y;
> >   return D.1962;
> >
> > gcc/
> >
> >         PR tree-optimization/88828
> >         * gimplify.c (gimplify_init_constructor): Optimize vector init
> >         constructor with vector copy or permute followed by a single
> >         scalar insert.
> 
> 
> Doing this here does not catch things like:
> typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> 
> 
> __v4sf
> vector_init (float f0,float f1, float f2,float f3)
> {
>   __v4sf y = { f, x[1], x[2], x[3] };
>    return y;
> }
> 
> __v4sf
> foo (__v4sf x, float f)
> {
>   return vector_init (f, x[1], x[2], x[3]) ;
> }
> 

Here is a patch for simplify_vector_constructor that optimizes a vector init
constructor into a vector copy or permute followed by a single scalar
insert.  But it doesn't work correctly:

[hjl@gnu-cfl-2 pr88828]$ cat bar.i
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

static __v4sf
vector_init (float f0,float f1, float f2,float f3)
{
  __v4sf y = { f0, f1, f2, f3 };
   return y;
}

__v4sf
foo (__v4sf x, float f)
{
  return vector_init (f, x[1], x[2], x[3]) ;
}
[hjl@gnu-cfl-2 pr88828]$ make bar.s
/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/ -O2 -S bar.i
[hjl@gnu-cfl-2 pr88828]$ cat bar.s
	.file	"bar.i"
	.text
	.p2align 4
	.globl	foo
	.type	foo, @function
foo:
.LFB1:
	.cfi_startproc
	ret
	.cfi_endproc
.LFE1:
	.size	foo, .-foo
	.ident	"GCC: (GNU) 9.0.1 20190303 (experimental)"
	.section	.note.GNU-stack,"",@progbits
[hjl@gnu-cfl-2 pr88828]$

The scalar insert is missing.
---
 gcc/tree-ssa-forwprop.c | 77 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 69 insertions(+), 8 deletions(-)

diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index eeb6281c652..b10cfccf7b8 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   unsigned elem_size, i;
   unsigned HOST_WIDE_INT nelts;
   enum tree_code code, conv_code;
-  constructor_elt *elt;
+  constructor_elt *ce;
   bool maybe_ident;
 
   gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR);
@@ -2027,18 +2027,41 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   orig[1] = NULL;
   conv_code = ERROR_MARK;
   maybe_ident = true;
-  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
+
+  tree rhs_vector = NULL;
+  /* The single scalar element.  */
+  tree scalar_element = NULL;
+  unsigned int scalar_idx = 0;
+  bool insert = false;
+  unsigned int nscalars = 0;
+  unsigned int nvectors = 0;
+  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce)
     {
       tree ref, op1;
 
       if (i >= nelts)
 	return false;
 
-      if (TREE_CODE (elt->value) != SSA_NAME)
+      if (TREE_CODE (ce->value) != SSA_NAME)
 	return false;
-      def_stmt = get_prop_source_stmt (elt->value, false, NULL);
+      def_stmt = get_prop_source_stmt (ce->value, false, NULL);
       if (!def_stmt)
-	return false;
+	{
+	  if ( gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
+	    {
+	      /* Only allow one single scalar insert.  */
+	      if (nscalars != 0)
+		return false;
+
+	      nscalars = 1;
+	      insert = true;
+	      scalar_idx = i;
+	      scalar_element = ce->value;
+	      continue;
+	    }
+	  else
+	    return false;
+	}
       code = gimple_assign_rhs_code (def_stmt);
       if (code == FLOAT_EXPR
 	  || code == FIX_TRUNC_EXPR)
@@ -2046,7 +2069,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  op1 = gimple_assign_rhs1 (def_stmt);
 	  if (conv_code == ERROR_MARK)
 	    {
-	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))),
+	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))),
 			    GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
 		return false;
 	      conv_code = code;
@@ -2095,6 +2118,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	elt += nelts;
       if (elt != i)
 	maybe_ident = false;
+
+       if (type == TREE_TYPE (ref))
+	 {
+	   /* The RHS vector has the same type as LHS.  */
+	   if (rhs_vector == NULL)
+	     rhs_vector = ref;
+	   /* Check if all RHS vector elements come from the same
+	      vector.  */
+	   if (rhs_vector == ref)
+	     nvectors++;
+	 }
+
       sel.quick_push (elt);
     }
   if (i < nelts)
@@ -2113,6 +2148,12 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  || conv_code == CALL_EXPR))
     return false;
 
+  /* Replace the scalar element with the vector element.  */
+  if (insert
+      && (TYPE_VECTOR_SUBPARTS (type).to_constant ()
+	  == (nscalars + nvectors)))
+    sel.quick_push (scalar_idx);
+
   if (maybe_ident)
     {
       if (conv_code == ERROR_MARK)
@@ -2127,14 +2168,22 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 
       vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts);
       if (!can_vec_perm_const_p (TYPE_MODE (type), indices))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       mask_type
 	= build_vector_type (build_nonstandard_integer_type (elem_size, 1),
 			     nelts);
       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT
 	  || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)),
 		       GET_MODE_SIZE (TYPE_MODE (type))))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       op2 = vec_perm_indices_to_tree (mask_type, indices);
       if (!orig[1])
 	orig[1] = orig[0];
@@ -2153,6 +2202,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	}
     }
   update_stmt (gsi_stmt (*gsi));
+  if (insert)
+    {
+      /* Generate a single scalar insert.  */
+      /* FIXME: This doesn't work correctly.  */
+      tree lhs = gimple_assign_lhs (stmt);
+      tree bitfield = build3 (BIT_FIELD_REF, elem_type, lhs,
+			      bitsize_int (elem_size),
+			      bitsize_int (scalar_idx * elem_size));
+      gimple *new_stmt = gimple_build_assign (bitfield, scalar_element);
+      gsi_insert_after (gsi, new_stmt, GSI_SAME_STMT);
+      update_stmt (gsi_stmt (*gsi));
+    }
   return true;
 }
Richard Biener March 4, 2019, 11:55 a.m. UTC | #3
On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > For vector init constructor:
> > >
> > > ---
> > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > >
> > > __v4sf
> > > foo (__v4sf x, float f)
> > > {
> > >   __v4sf y = { f, x[1], x[2], x[3] };
> > >   return y;
> > > }
> > > ---
> > >
> > > we can optimize vector init constructor with vector copy or permute
> > > followed by a single scalar insert:
> > >
> > >   __v4sf D.1912;
> > >   __v4sf D.1913;
> > >   __v4sf D.1914;
> > >   __v4sf y;
> > >
> > >   x.0_1 = x;
> > >   D.1912 = x.0_1;
> > >   _2 = D.1912;
> > >   D.1913 = _2;
> > >   BIT_FIELD_REF <D.1913, 32, 0> = f;
> > >   y = D.1913;
> > >   D.1914 = y;
> > >   return D.1914;
> > >
> > > instead of
> > >
> > >   __v4sf D.1962;
> > >   __v4sf y;
> > >
> > >   _1 = BIT_FIELD_REF <x, 32, 32>;
> > >   _2 = BIT_FIELD_REF <x, 32, 64>;
> > >   _3 = BIT_FIELD_REF <x, 32, 96>;
> > >   y = {f, _1, _2, _3};
> > >   D.1962 = y;
> > >   return D.1962;
> > >
> > > gcc/
> > >
> > >         PR tree-optimization/88828
> > >         * gimplify.c (gimplify_init_constructor): Optimize vector init
> > >         constructor with vector copy or permute followed by a single
> > >         scalar insert.
> >
> >
> > Doing this here does not catch things like:
> > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> >
> >
> > __v4sf
> > vector_init (float f0,float f1, float f2,float f3)
> > {
> > >   __v4sf y = { f0, f1, f2, f3 };
> >    return y;
> > }
> >
> > __v4sf
> > foo (__v4sf x, float f)
> > {
> >   return vector_init (f, x[1], x[2], x[3]) ;
> > }
> >
>
> Here is a patch for simplify_vector_constructor to optimize vector init
> constructor with vector copy or permute followed by a single scalar
> insert.

That's the correct place to fix this indeed.

> But this doesn't work correctly:
>
> [hjl@gnu-cfl-2 pr88828]$ cat bar.i
> typedef float __v4sf __attribute__ ((__vector_size__ (16)));
>
> static __v4sf
> vector_init (float f0,float f1, float f2,float f3)
> {
>   __v4sf y = { f0, f1, f2, f3 };
>    return y;
> }
>
> __v4sf
> foo (__v4sf x, float f)
> {
>   return vector_init (f, x[1], x[2], x[3]) ;
> }
> [hjl@gnu-cfl-2 pr88828]$ make bar.s
> /export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/ -O2 -S bar.i
> [hjl@gnu-cfl-2 pr88828]$ cat bar.s
>         .file   "bar.i"
>         .text
>         .p2align 4
>         .globl  foo
>         .type   foo, @function
> foo:
> .LFB1:
>         .cfi_startproc
>         ret
>         .cfi_endproc
> .LFE1:
>         .size   foo, .-foo
>         .ident  "GCC: (GNU) 9.0.1 20190303 (experimental)"
>         .section        .note.GNU-stack,"",@progbits
> [hjl@gnu-cfl-2 pr88828]$
>
> Scalar insert is missing.
> ---
>  gcc/tree-ssa-forwprop.c | 77 ++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 69 insertions(+), 8 deletions(-)
>
> diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
> index eeb6281c652..b10cfccf7b8 100644
> --- a/gcc/tree-ssa-forwprop.c
> +++ b/gcc/tree-ssa-forwprop.c
> @@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>    unsigned elem_size, i;
>    unsigned HOST_WIDE_INT nelts;
>    enum tree_code code, conv_code;
> -  constructor_elt *elt;
> +  constructor_elt *ce;
>    bool maybe_ident;
>
>    gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR);
> @@ -2027,18 +2027,41 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>    orig[1] = NULL;
>    conv_code = ERROR_MARK;
>    maybe_ident = true;
> -  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
> +
> +  tree rhs_vector = NULL;
> +  /* The single scalar element.  */
> +  tree scalar_element = NULL;
> +  unsigned int scalar_idx = 0;
> +  bool insert = false;
> +  unsigned int nscalars = 0;
> +  unsigned int nvectors = 0;
> +  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce)
>      {
>        tree ref, op1;
>
>        if (i >= nelts)
>         return false;
>
> -      if (TREE_CODE (elt->value) != SSA_NAME)
> +      if (TREE_CODE (ce->value) != SSA_NAME)
>         return false;
> -      def_stmt = get_prop_source_stmt (elt->value, false, NULL);
> +      def_stmt = get_prop_source_stmt (ce->value, false, NULL);
>        if (!def_stmt)
> -       return false;
> +       {
> +         if ( gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> +           {
> +             /* Only allow one single scalar insert.  */
> +             if (nscalars != 0)
> +               return false;
> +
> +             nscalars = 1;
> +             insert = true;
> +             scalar_idx = i;
> +             scalar_element = ce->value;
> +             continue;
> +           }
> +         else
> +           return false;
> +       }
>        code = gimple_assign_rhs_code (def_stmt);
>        if (code == FLOAT_EXPR
>           || code == FIX_TRUNC_EXPR)
> @@ -2046,7 +2069,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>           op1 = gimple_assign_rhs1 (def_stmt);
>           if (conv_code == ERROR_MARK)
>             {
> -             if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))),
> +             if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))),
>                             GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
>                 return false;
>               conv_code = code;
> @@ -2095,6 +2118,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>         elt += nelts;
>        if (elt != i)
>         maybe_ident = false;
> +
> +       if (type == TREE_TYPE (ref))
> +        {
> +          /* The RHS vector has the same type as LHS.  */
> +          if (rhs_vector == NULL)
> +            rhs_vector = ref;
> +          /* Check if all RHS vector elements come from the same
> +             vector.  */
> +          if (rhs_vector == ref)
> +            nvectors++;
> +        }
> +
>        sel.quick_push (elt);
>      }
>    if (i < nelts)
> @@ -2113,6 +2148,12 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>           || conv_code == CALL_EXPR))
>      return false;
>
> +  /* Replace the scalar element with the vector element.  */
> +  if (insert
> +      && (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> +         == (nscalars + nvectors)))
> +    sel.quick_push (scalar_idx);
> +
>    if (maybe_ident)
>      {
>        if (conv_code == ERROR_MARK)
> @@ -2127,14 +2168,22 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>
>        vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts);
>        if (!can_vec_perm_const_p (TYPE_MODE (type), indices))
> -       return false;
> +       {
> +         if (insert)
> +           gcc_unreachable ();
> +         return false;
> +       }
>        mask_type
>         = build_vector_type (build_nonstandard_integer_type (elem_size, 1),
>                              nelts);
>        if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT
>           || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)),
>                        GET_MODE_SIZE (TYPE_MODE (type))))
> -       return false;
> +       {
> +         if (insert)
> +           gcc_unreachable ();
> +         return false;
> +       }
>        op2 = vec_perm_indices_to_tree (mask_type, indices);
>        if (!orig[1])
>         orig[1] = orig[0];
> @@ -2153,6 +2202,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>         }
>      }
>    update_stmt (gsi_stmt (*gsi));
> +  if (insert)
> +    {
> +      /* Generate a single scalar insert.  */
> +      /* FIXME: This doesn't work correctly.  */
> +      tree lhs = gimple_assign_lhs (stmt);
> +      tree bitfield = build3 (BIT_FIELD_REF, elem_type, lhs,
> +                             bitsize_int (elem_size),
> +                             bitsize_int (scalar_idx * elem_size));
> +      gimple *new_stmt = gimple_build_assign (bitfield, scalar_element);

I think you want to generate from the original

    _1 = { .... };

the new

    _2 = copy or permute to _new_ LHS SSA name
    _1 = BIT_INSERT_EXPR <_2, scalar_element, scalar_idx * elem_size>;

> +      gsi_insert_after (gsi, new_stmt, GSI_SAME_STMT);

and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
BIT_INSERT_EXPR.

> +      update_stmt (gsi_stmt (*gsi));
> +    }
>    return true;
>  }
>
> --
> 2.20.1
>
diff mbox series

Patch

diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 983635ba21f..893a4311f9e 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -5082,22 +5082,170 @@  gimplify_init_constructor (tree *expr_p, gimple_seq *pre_p, gimple_seq *post_p,
 	    TREE_CONSTANT (ctor) = 0;
 	  }
 
-	/* Vector types use CONSTRUCTOR all the way through gimple
-	   compilation as a general initializer.  */
-	FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
+	tree rhs_vector = NULL;
+	/* The vector element to replace scalar elements, which
+	   will be overridden by scalar insert.  */
+	tree vector_element = NULL;
+	/* The single scalar element.  */
+	tree scalar_element = NULL;
+	unsigned int scalar_idx = 0;
+	enum { unknown, copy, permute, init } operation = unknown;
+	bool insert = false;
+
+	/* Check if we can generate vector copy or permute followed by
+	   a single scalar insert.  */
+	if (TYPE_VECTOR_SUBPARTS (type).is_constant ())
 	  {
-	    enum gimplify_status tret;
-	    tret = gimplify_expr (&ce->value, pre_p, post_p, is_gimple_val,
-				  fb_rvalue);
-	    if (tret == GS_ERROR)
-	      ret = GS_ERROR;
-	    else if (TREE_STATIC (ctor)
-		     && !initializer_constant_valid_p (ce->value,
-						       TREE_TYPE (ce->value)))
-	      TREE_STATIC (ctor) = 0;
+	    /* If all RHS vector elements come from the same vector,
+	       we can use permute.  If all RHS vector elements come
+	       from the same vector in the same order, we can use
+	       copy.  */
+	    unsigned int nunits
+	      = TYPE_VECTOR_SUBPARTS (type).to_constant ();
+	    unsigned int nscalars = 0;
+	    unsigned int nvectors = 0;
+	    operation = unknown;
+	    FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
+	      if (TREE_CODE (ce->value) == ARRAY_REF
+		  || TREE_CODE (ce->value) == ARRAY_RANGE_REF)
+		{
+		  if (!vector_element)
+		    vector_element = ce->value;
+		  /* Get the vector index.  */
+		  tree idx = TREE_OPERAND (ce->value, 1);
+		  if (TREE_CODE (idx) == INTEGER_CST)
+		    {
+		      /* Get the RHS vector.  */
+		      tree r = ce->value;
+		      while (handled_component_p (r))
+			r = TREE_OPERAND (r, 0);
+		      if (type == TREE_TYPE (r))
+			{
+			  /* The RHS vector has the same type as
+			     LHS.  */
+			  if (rhs_vector == NULL)
+			    rhs_vector = r;
+
+			  /* Check if all RHS vector elements come
+			     from the same vector.  */
+			  if (rhs_vector == r)
+			    {
+			      nvectors++;
+			      if (TREE_INT_CST_LOW (idx) == ix
+				  && (operation == unknown
+				      || operation == copy))
+				operation = copy;
+			      else
+				operation = permute;
+			      continue;
+			    }
+			}
+		    }
+
+		  /* Otherwise, use vector init.  */
+		  break;
+		}
+	      else if (TREE_CODE (TYPE_SIZE (TREE_TYPE (ce->value)))
+		       == INTEGER_CST)
+		{
+		  /* Only allow one single scalar insert.  */
+		  if (nscalars != 0)
+		    break;
+		  nscalars = 1;
+		  insert = true;
+		  scalar_idx = ix;
+		  scalar_element = ce->value;
+		}
+
+	    /* Allow a single scalar insert with vector copy or
+	       vector permute.  Vector copy without insert is OK.  */
+	    if (nunits != (nscalars + nvectors)
+		|| (nscalars == 0 && operation != copy))
+	      operation = unknown;
+	  }
+
+	if (operation == unknown)
+	  {
+	    /* Default to the regular vector init constructor.  */
+	    operation = init;
+	    insert = false;
+	  }
+
+	if (operation == copy)
+	  {
+	    /* Generate a vector copy.  */
+	    tree var = create_tmp_var (type);
+	    if (gimplify_expr (&rhs_vector, pre_p, post_p,
+			       is_gimple_val, fb_rvalue) == GS_ERROR)
+	      {
+		ret = GS_ERROR;
+		break;
+	      }
+	    gassign *init = gimple_build_assign (var, rhs_vector);
+	    gimple_seq_add_stmt (pre_p, init);
+	    if (gimplify_expr (&var, pre_p, post_p, is_gimple_val,
+			       fb_rvalue) == GS_ERROR)
+	      {
+		ret = GS_ERROR;
+		break;
+	      }
+	    /* Replace RHS with the vector copy.  */
+	    if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
+	      TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (var, pre_p);
+	    else
+	      TREE_OPERAND (*expr_p, 1) = var;
+	  }
+	else
+	  {
+	    /* Prepare for vector permute by replacing the scalar
+	       element with the vector one.  */
+	    if (operation == permute)
+	      (elts->address())[scalar_idx].value = vector_element;
+
+	    /* Vector types use CONSTRUCTOR all the way through gimple
+	       compilation as a general initializer.  */
+	    FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
+	      {
+		enum gimplify_status tret;
+		tret = gimplify_expr (&ce->value, pre_p, post_p,
+				      is_gimple_val,
+				      fb_rvalue);
+		if (tret == GS_ERROR)
+		  ret = GS_ERROR;
+		else if (TREE_STATIC (ctor)
+			 && !initializer_constant_valid_p (ce->value,
+							   TREE_TYPE (ce->value)))
+		  TREE_STATIC (ctor) = 0;
+	      }
+	    if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
+	      TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p);
+	  }
+
+	if (insert)
+	  {
+	    /* Generate a single scalar insert after vector copy or
+	       permute.  */
+	    tree rhs = TREE_OPERAND (*expr_p, 1);
+	    tree var = create_tmp_var (type);
+	    gassign *init = gimple_build_assign (var, rhs);
+	    gimple_seq_add_stmt (pre_p, init);
+	    if (gimplify_expr (&scalar_element, pre_p, post_p,
+			       is_gimple_val, fb_rvalue) == GS_ERROR)
+	      {
+		ret = GS_ERROR;
+		break;
+	      }
+	    tree scalar_type = TREE_TYPE (scalar_element);
+	    tree scalar_size = TYPE_SIZE (scalar_type);
+	    tree bitpos = bitsize_int (scalar_idx
+				       * TREE_INT_CST_LOW (scalar_size));
+	    tree ref = build3_loc (EXPR_LOCATION (rhs), BIT_FIELD_REF,
+				   scalar_type, var, scalar_size,
+				   bitpos);
+	    init = gimple_build_assign (ref, scalar_element);
+	    gimplify_seq_add_stmt (pre_p, init);
+	    TREE_OPERAND (*expr_p, 1) = var;
 	  }
-	if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
-	  TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p);
       }
       break;
 
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c
new file mode 100644
index 00000000000..4ef1feab389
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { f, x[1], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
new file mode 100644
index 00000000000..6dc482b6f4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = x;
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
new file mode 100644
index 00000000000..97eb8e7162a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
new file mode 100644
index 00000000000..ab2ba730716
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
new file mode 100644
index 00000000000..a54689be701
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
new file mode 100644
index 00000000000..0c3a1024d93
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
new file mode 100644
index 00000000000..534808d3cd1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
new file mode 100644
index 00000000000..aebea790979
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
new file mode 100644
index 00000000000..d43a36d9137
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
new file mode 100644
index 00000000000..6856fe6500e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}