diff mbox series

V3 [PATCH] Optimize vector constructor

Message ID CAMe9rOrCAJv663ontj0sBtn7p1vuH0i-+mmcLyzzFv0xszfLJA@mail.gmail.com
State New
Headers show
Series V3 [PATCH] Optimize vector constructor | expand

Commit Message

H.J. Lu March 6, 2019, 7:46 a.m. UTC
On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > >
> > > > On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > For vector init constructor:
> > > > >
> > > > > ---
> > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > >
> > > > > __v4sf
> > > > > foo (__v4sf x, float f)
> > > > > {
> > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > >   return y;
> > > > > }
> > > > > ---
> > > > >
> > > > > we can optimize vector init constructor with vector copy or permute
> > > > > followed by a single scalar insert:
>
> > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > BIT_INSERT_EXPR.
>
> Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
>
>
> H.J.
> ---
> We can optimize vector constructor with vector copy or permute followed
> by a single scalar insert:
>
>   __v4sf y;
>   __v4sf D.1930;
>   float _1;
>   float _2;
>   float _3;
>
>   <bb 2> :
>   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
>   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
>   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
>   y_6 = {f_5(D), _3, _2, _1};
>   return y_6;
>
> with
>
>  __v4sf y;
>   __v4sf D.1930;
>   float _1;
>   float _2;
>   float _3;
>   vector(4) float _8;
>
>   <bb 2> :
>   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
>   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
>   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
>   _8 = x_9(D);
>   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
>   return y_6;
>
> gcc/
>
>         PR tree-optimization/88828
>         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
>         vector init constructor with vector copy or permute followed
>         by a single scalar insert.
>
> gcc/testsuite/
>
>         PR tree-optimization/88828
>         * gcc.target/i386/pr88828-1a.c: New test.
>         * gcc.target/i386/pr88828-2b.c: Likewise.
>         * gcc.target/i386/pr88828-2.c: Likewise.
>         * gcc.target/i386/pr88828-3a.c: Likewise.
>         * gcc.target/i386/pr88828-3b.c: Likewise.
>         * gcc.target/i386/pr88828-3c.c: Likewise.
>         * gcc.target/i386/pr88828-3d.c: Likewise.
>         * gcc.target/i386/pr88828-4a.c: Likewise.
>         * gcc.target/i386/pr88828-4b.c: Likewise.
>         * gcc.target/i386/pr88828-5a.c: Likewise.
>         * gcc.target/i386/pr88828-5b.c: Likewise.
>         * gcc.target/i386/pr88828-6a.c: Likewise.
>         * gcc.target/i386/pr88828-6b.c: Likewise.

Here is the updated patch with run-time tests.

Comments

Richard Biener March 6, 2019, 12:33 p.m. UTC | #1
On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > >
> > > > > On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > For vector init constructor:
> > > > > >
> > > > > > ---
> > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > >
> > > > > > __v4sf
> > > > > > foo (__v4sf x, float f)
> > > > > > {
> > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > >   return y;
> > > > > > }
> > > > > > ---
> > > > > >
> > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > followed by a single scalar insert:
> >
> > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > BIT_INSERT_EXPR.
> >
> > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> >
> >
> > H.J.
> > ---
> > We can optimize vector constructor with vector copy or permute followed
> > by a single scalar insert:
> >
> >   __v4sf y;
> >   __v4sf D.1930;
> >   float _1;
> >   float _2;
> >   float _3;
> >
> >   <bb 2> :
> >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> >   y_6 = {f_5(D), _3, _2, _1};
> >   return y_6;
> >
> > with
> >
> >  __v4sf y;
> >   __v4sf D.1930;
> >   float _1;
> >   float _2;
> >   float _3;
> >   vector(4) float _8;
> >
> >   <bb 2> :
> >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> >   _8 = x_9(D);
> >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> >   return y_6;
> >
> > gcc/
> >
> >         PR tree-optimization/88828
> >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> >         vector init constructor with vector copy or permute followed
> >         by a single scalar insert.
> >
> > gcc/testsuite/
> >
> >         PR tree-optimization/88828
> >         * gcc.target/i386/pr88828-1a.c: New test.
> >         * gcc.target/i386/pr88828-2b.c: Likewise.
> >         * gcc.target/i386/pr88828-2.c: Likewise.
> >         * gcc.target/i386/pr88828-3a.c: Likewise.
> >         * gcc.target/i386/pr88828-3b.c: Likewise.
> >         * gcc.target/i386/pr88828-3c.c: Likewise.
> >         * gcc.target/i386/pr88828-3d.c: Likewise.
> >         * gcc.target/i386/pr88828-4a.c: Likewise.
> >         * gcc.target/i386/pr88828-4b.c: Likewise.
> >         * gcc.target/i386/pr88828-5a.c: Likewise.
> >         * gcc.target/i386/pr88828-5b.c: Likewise.
> >         * gcc.target/i386/pr88828-6a.c: Likewise.
> >         * gcc.target/i386/pr88828-6b.c: Likewise.
>
> Here is the updated patch with run-time tests.

-      if (TREE_CODE (elt->value) != SSA_NAME)
+      if (TREE_CODE (ce->value) != SSA_NAME)
        return false;

hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
scalar value can be a constant as well.

       if (!def_stmt)
-       return false;
+       {
+         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))

if (SSA_NAME_IS_DEFAULT_DEF (ce->value))

+           {

also you seem to disallow

  { i + 1, v[1], v[2], v[3] }

because get_prop_source_stmt will return the definition computing
i + 1 in this case and your code will be skipped?

I think you can simplify the code by treating scalar_element != NULL
as nscalars == 1 and eliding nscalars.

-      if (conv_code == ERROR_MARK)
+      if (conv_code == ERROR_MARK && !insert)
        gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
                                        orig[1], op2);
       else
@@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
                                   VEC_PERM_EXPR, orig[0], orig[1], op2);
          orig[0] = gimple_assign_lhs (perm);
          gsi_insert_before (gsi, perm, GSI_SAME_STMT);
-         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
+         gimple_assign_set_rhs_with_ops (gsi,
+                                         (conv_code != ERROR_MARK
+                                          ? conv_code
+                                          : NOP_EXPR),
+                                         orig[0],
                                          NULL_TREE, NULL_TREE);

I believe you should elide the last stmt for conv_code == ERROR_MARK,
that is, why did you need to add the && !insert check in the guarding condition
(this path should already do the correct thing?).  Note that in all
cases it looks like, with conv_code != ERROR_MARK, you may end up doing
a float->int or int->float conversion on a value it wasn't done on
before, which might raise exceptions.  That is, do we need to make sure
we permute a value we already convert into the place we're going to
insert to?

+  if (insert)
+    {
+      /* Generate a single scalar insert.  */
+      tree var = make_ssa_name (type);
+      tree val = gimple_assign_rhs1 (stmt);
+      gimple *copy = gimple_build_assign (var, val);

I believe this doesn't properly copy the stmt in case it is a permute.
You can use (note the use of gsi_stmt - gimple_assign_set_rhs_with_ops
can re-allocate the stmt)

        gimple *copy = gimple_copy (gsi_stmt (*gsi));
        gimple_assign_set_lhs (copy, var);

+      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
+      tree bitpos = bitsize_int (scalar_idx * elem_size);
+      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
+                                     scalar_element, bitpos);
+    }

Otherwise looks OK to me.

As a separate follow-up patch it might be interesting to support

 { 0, a[1], a[2], 3 }

kinds of constructors as well, thus combining a VECTOR_CST (which is
reasonably cheap to create) with another vector.  That should
maybe be done as a first patch, given this is just a two-vector
permute which the code already handles apart from not
recognizing the implicit constant vector participating.

Similar

 { 0, a[1], b[2], 3 }

where the combination of a and b is blended with another
constant vector.  I'm not sure if handling an arbitrary number
of scalar elements should be done in a similar way, that is,
implementing

 { s1, a[1], a[2], s2, s3, b[0], b[1], b[2] }

as

  tem = VEC_PERM <a, b, { ... }>
  tem2 = { s1, 0, 0, s2, s3, 0, 0, 0 }
  res = VEC_PERM <tem, tem2, { blend-mask }>

where constructing tem2 should take at most
N-1 inserts (the first element to insert into tem2
can use a splat or, if it is element zero, a zero-extending move).

Doing this effectively lifts the restriction of only
handling two vectors - we'd incrementally do
two-vector permute plus blend of the rest which has
its constructor re-processed.

But as said - the code is already a bit awkward so changing
this in multiple revisions is preferred and the single-element
case is certainly something to do via a BIT_INSERT_EXPR.

Thanks,
Richard.

> --
> H.J.
diff mbox series

Patch

From b2bc0bf3a8ee17d53bf39f0aeabe7025b33e9c96 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 5 Feb 2019 15:39:27 -0800
Subject: [PATCH] Optimize vector constructor

We can optimize vector constructor with vector copy or permute followed
by a single scalar insert:

  __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  y_6 = {f_5(D), _3, _2, _1};
  return y_6;

with

 __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;
  vector(4) float _8;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  _8 = x_9(D);
  y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
  return y_6;

gcc/

	PR tree-optimization/88828
	* tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
	vector init constructor with vector copy or permute followed
	by a single scalar insert.

gcc/testsuite/

	PR tree-optimization/88828
	* gcc.target/i386/pr88828-1.c: New test.
	* gcc.target/i386/pr88828-1a.c: Likewise.
	* gcc.target/i386/pr88828-1b.c: Likewise.
	* gcc.target/i386/pr88828-1c.c: Likewise.
	* gcc.target/i386/pr88828-2.c: Likewise.
	* gcc.target/i386/pr88828-2a.c: Likewise.
	* gcc.target/i386/pr88828-2b.c: Likewise.
	* gcc.target/i386/pr88828-2c.c: Likewise.
	* gcc.target/i386/pr88828-2d.c: Likewise.
	* gcc.target/i386/pr88828-3.c: Likewise.
	* gcc.target/i386/pr88828-3a.c: Likewise.
	* gcc.target/i386/pr88828-3b.c: Likewise.
	* gcc.target/i386/pr88828-3c.c: Likewise.
	* gcc.target/i386/pr88828-3d.c: Likewise.
	* gcc.target/i386/pr88828-4a.c: Likewise.
	* gcc.target/i386/pr88828-4b.c: Likewise.
	* gcc.target/i386/pr88828-5a.c: Likewise.
	* gcc.target/i386/pr88828-5b.c: Likewise.
---
 gcc/testsuite/gcc.target/i386/pr88828-1.c  | 49 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-1a.c | 17 +++++
 gcc/testsuite/gcc.target/i386/pr88828-1b.c | 23 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-1c.c | 18 +++++
 gcc/testsuite/gcc.target/i386/pr88828-2.c  | 51 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-2a.c | 17 +++++
 gcc/testsuite/gcc.target/i386/pr88828-2b.c | 19 +++++
 gcc/testsuite/gcc.target/i386/pr88828-2c.c | 23 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-2d.c | 25 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-3.c  | 54 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-3a.c | 17 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3b.c | 19 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3c.c | 25 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-4a.c | 18 +++++
 gcc/testsuite/gcc.target/i386/pr88828-4b.c | 21 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-5a.c | 18 +++++
 gcc/testsuite/gcc.target/i386/pr88828-5b.c | 20 +++++
 gcc/tree-ssa-forwprop.c                    | 85 +++++++++++++++++++---
 18 files changed, 509 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2d.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c

diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c
new file mode 100644
index 00000000000..a15d1fea3f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c
@@ -0,0 +1,49 @@ 
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-1a.c"
+#include "pr88828-1b.c"
+#include "pr88828-1c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+  y = foo3 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1a.c b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
new file mode 100644
index 00000000000..d37b24c6661
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[1], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1b.c b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
new file mode 100644
index 00000000000..af4aced65f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  return vector_init (f, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1c.c b/gcc/testsuite/gcc.target/i386/pr88828-1c.c
new file mode 100644
index 00000000000..a117f3ec7b1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1c.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo3 (__v4sf x, float f)
+{
+  __v4sf y = x;
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
new file mode 100644
index 00000000000..011fd486bb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
@@ -0,0 +1,51 @@ 
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-2a.c"
+#include "pr88828-2c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else if (i == 1)
+      {
+	if (y[i] != f[0])
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2a.c b/gcc/testsuite/gcc.target/i386/pr88828-2a.c
new file mode 100644
index 00000000000..85e49535ebd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2a.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2b.c b/gcc/testsuite/gcc.target/i386/pr88828-2b.c
new file mode 100644
index 00000000000..adfd7002a4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2b.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2c.c b/gcc/testsuite/gcc.target/i386/pr88828-2c.c
new file mode 100644
index 00000000000..149967ea0b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2c.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2d.c b/gcc/testsuite/gcc.target/i386/pr88828-2d.c
new file mode 100644
index 00000000000..21088496730
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2d.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3.c b/gcc/testsuite/gcc.target/i386/pr88828-3.c
new file mode 100644
index 00000000000..adbc46dbf3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3.c
@@ -0,0 +1,54 @@ 
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-3a.c"
+#include "pr88828-3b.c"
+#include "pr88828-3c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 3)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else if (i == 0)
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i + 1])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+  y = foo3 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
new file mode 100644
index 00000000000..e5cb95c1275
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
new file mode 100644
index 00000000000..0349f35b08a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3c.c b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
new file mode 100644
index 00000000000..fb668a55f1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo3 (__v4sf x, float f)
+{
+  return vector_init (x[0], x[2], x[3], f);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
new file mode 100644
index 00000000000..64043b9855f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
new file mode 100644
index 00000000000..ad8d2b985d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
new file mode 100644
index 00000000000..5e908faef5c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
new file mode 100644
index 00000000000..988a48823e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index eeb6281c652..85d9f86288b 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2008,7 +2008,7 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
   unsigned elem_size, i;
   unsigned HOST_WIDE_INT nelts;
   enum tree_code code, conv_code;
-  constructor_elt *elt;
+  constructor_elt *ce;
   bool maybe_ident;
 
   gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR);
@@ -2027,18 +2027,42 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
   orig[1] = NULL;
   conv_code = ERROR_MARK;
   maybe_ident = true;
-  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
+
+  tree rhs_vector = NULL;
+  /* The single scalar element.  */
+  tree scalar_element = NULL;
+  unsigned int scalar_idx = 0;
+  bool insert = false;
+  unsigned int nscalars = 0;
+  unsigned int nvectors = 0;
+  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce)
     {
       tree ref, op1;
 
       if (i >= nelts)
 	return false;
 
-      if (TREE_CODE (elt->value) != SSA_NAME)
+      if (TREE_CODE (ce->value) != SSA_NAME)
 	return false;
-      def_stmt = get_prop_source_stmt (elt->value, false, NULL);
+      def_stmt = get_prop_source_stmt (ce->value, false, NULL);
       if (!def_stmt)
-	return false;
+	{
+	  if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
+	    {
+	      /* Only allow one scalar insert.  */
+	      if (nscalars != 0)
+		return false;
+
+	      nscalars = 1;
+	      insert = true;
+	      scalar_idx = i;
+	      sel.quick_push (i);
+	      scalar_element = ce->value;
+	      continue;
+	    }
+	  else
+	    return false;
+	}
       code = gimple_assign_rhs_code (def_stmt);
       if (code == FLOAT_EXPR
 	  || code == FIX_TRUNC_EXPR)
@@ -2046,7 +2070,7 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  op1 = gimple_assign_rhs1 (def_stmt);
 	  if (conv_code == ERROR_MARK)
 	    {
-	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))),
+	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))),
 			    GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
 		return false;
 	      conv_code = code;
@@ -2095,11 +2119,29 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	elt += nelts;
       if (elt != i)
 	maybe_ident = false;
+
+       if (type == TREE_TYPE (ref))
+	 {
+	   /* The RHS vector has the same type as LHS.  */
+	   if (rhs_vector == NULL)
+	     rhs_vector = ref;
+	   /* Check if all RHS vector elements come from the same
+	      vector.  */
+	   if (rhs_vector == ref)
+	     nvectors++;
+	 }
+
       sel.quick_push (elt);
     }
   if (i < nelts)
     return false;
 
+  if (insert
+      && (nvectors == 0
+	  || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
+	      != (nscalars + nvectors))))
+    return false;
+
   if (! VECTOR_TYPE_P (TREE_TYPE (orig[0]))
       || maybe_ne (TYPE_VECTOR_SUBPARTS (type),
 		   TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0]))))
@@ -2127,18 +2169,26 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 
       vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts);
       if (!can_vec_perm_const_p (TYPE_MODE (type), indices))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       mask_type
 	= build_vector_type (build_nonstandard_integer_type (elem_size, 1),
 			     nelts);
       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT
 	  || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)),
 		       GET_MODE_SIZE (TYPE_MODE (type))))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       op2 = vec_perm_indices_to_tree (mask_type, indices);
       if (!orig[1])
 	orig[1] = orig[0];
-      if (conv_code == ERROR_MARK)
+      if (conv_code == ERROR_MARK && !insert)
 	gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
 					orig[1], op2);
       else
@@ -2148,10 +2198,25 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 				   VEC_PERM_EXPR, orig[0], orig[1], op2);
 	  orig[0] = gimple_assign_lhs (perm);
 	  gsi_insert_before (gsi, perm, GSI_SAME_STMT);
-	  gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
+	  gimple_assign_set_rhs_with_ops (gsi,
+					  (conv_code != ERROR_MARK
+					   ? conv_code
+					   : NOP_EXPR),
+					  orig[0],
 					  NULL_TREE, NULL_TREE);
 	}
     }
+  if (insert)
+    {
+      /* Generate a single scalar insert.  */
+      tree var = make_ssa_name (type);
+      tree val = gimple_assign_rhs1 (stmt);
+      gimple *copy = gimple_build_assign (var, val);
+      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
+      tree bitpos = bitsize_int (scalar_idx * elem_size);
+      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
+				      scalar_element, bitpos);
+    }
   update_stmt (gsi_stmt (*gsi));
   return true;
 }
-- 
2.20.1