diff mbox series

V5 [PATCH] Optimize vector constructor

Message ID CAMe9rOpQehpx=sxjvE1wgZYd+NrQUaVn0eEPgDcr5TuPTEjj5w@mail.gmail.com
State New
Headers show
Series V5 [PATCH] Optimize vector constructor | expand

Commit Message

H.J. Lu March 8, 2019, 8:48 a.m. UTC
On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > )
> > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > For vector init constructor:
> > > > > > > >
> > > > > > > > ---
> > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > >
> > > > > > > > __v4sf
> > > > > > > > foo (__v4sf x, float f)
> > > > > > > > {
> > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > >   return y;
> > > > > > > > }
> > > > > > > > ---
> > > > > > > >
> > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > followed by a single scalar insert:
> > > >
> > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > BIT_INSERT_EXPR.
> > > >
> > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > >
> > > >
> > > > H.J.
> > > > ---
> > > > We can optimize vector constructor with vector copy or permute followed
> > > > by a single scalar insert:
> > > >
> > > >   __v4sf y;
> > > >   __v4sf D.1930;
> > > >   float _1;
> > > >   float _2;
> > > >   float _3;
> > > >
> > > >   <bb 2> :
> > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > >   y_6 = {f_5(D), _3, _2, _1};
> > > >   return y_6;
> > > >
> > > > with
> > > >
> > > >  __v4sf y;
> > > >   __v4sf D.1930;
> > > >   float _1;
> > > >   float _2;
> > > >   float _3;
> > > >   vector(4) float _8;
> > > >
> > > >   <bb 2> :
> > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > >   _8 = x_9(D);
> > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > >   return y_6;
> > > >
> > > > gcc/
> > > >
> > > >         PR tree-optimization/88828
> > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > >         vector init constructor with vector copy or permute followed
> > > >         by a single scalar insert.
> > > >
> > > > gcc/testsuite/
> > > >
> > > >         PR tree-optimization/88828
> > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > >
> > > Here is the updated patch with run-time tests.
> >
> > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > +      if (TREE_CODE (ce->value) != SSA_NAME)
> >         return false;
> >
> > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > scalar value can be a constant as well.
>
> Fixed.
>
> >        if (!def_stmt)
> > -       return false;
> > +       {
> > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> >
> > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> >
> > +           {
> >
> > also you seem to disallow
> >
> >   { i + 1, v[1], v[2], v[3] }
>
> Fixed by
>
>      if (code != BIT_FIELD_REF)
>         {
>           /* Only allow one scalar insert.  */
>           if (nscalars != 0)
>             return false;
>
>           nscalars = 1;
>           insert = true;
>           scalar_idx = i;
>           sel.quick_push (i);
>           scalar_element = ce->value;
>           continue;
>         }
>
> > because get_prop_source_stmt will return the definition computing
> > i + 1 in this case and your code will be skipped?
> >
> > I think you can simplify the code by treating scalar_element != NULL
> > as nscalars == 1 and eliding nscalars.
>
> It works only if
>
> TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
>
> We need to check both nscalars and nvectors.  Elide nscalar
> check doesn't help much here.
>
> > -      if (conv_code == ERROR_MARK)
> > +      if (conv_code == ERROR_MARK && !insert)
> >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> >                                         orig[1], op2);
> >        else
> > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> >           orig[0] = gimple_assign_lhs (perm);
> >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > +         gimple_assign_set_rhs_with_ops (gsi,
> > +                                         (conv_code != ERROR_MARK
> > +                                          ? conv_code
> > +                                          : NOP_EXPR),
> > +                                         orig[0],
> >                                           NULL_TREE, NULL_TREE);
> >
> > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > that is, why did you need to add the && !insert check in the guarding condition
>
> When conv_code == ERROR_MARK, we still need
>
>        gimple *perm
>             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
>                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
>           orig[0] = gimple_assign_lhs (perm);
>           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
>           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
>                                           orig[0],
>                                           NULL_TREE, NULL_TREE);
>
> Otherwise, scalar insert won't work.
>
> > (this path should already do the correct thing?).  Note that in all
> > cases it looks
> > that with conv_code != ERROR_MARK you may end up doing a float->int
> > or int->float conversion on a value it wasn't done on before which might
> > raise exceptions?  That is, do we need to make sure we permute a
> > value we already do convert into the place we're going to insert to?
>
> This couldn't happen:
>
>       if (type == TREE_TYPE (ref))
>          {
>            /* The RHS vector has the same type as LHS.  */
>            if (rhs_vector == NULL)
>              rhs_vector = ref;
>            /* Check if all RHS vector elements come fome the same
>               vector.  */
>            if (rhs_vector == ref)
>              nvectors++;
>          }
> ...
>   if (insert
>       && (nvectors == 0
>           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
>               != (nscalars + nvectors))))
>     return false;
>
> > +  if (insert)
> > +    {
> > +      /* Generate a single scalar insert.  */
> > +      tree var = make_ssa_name (type);
> > +      tree val = gimple_assign_rhs1 (stmt);
> > +      gimple *copy = gimple_build_assign (var, val);
> >
> > I believe this doesn't properly copy the stmt in case it is a permute.
> > You can use (note the use of gsi_stmt - gimple_assign_set_rhs_with_ops
> > can re-allocate the stmt)
> >
> >         gimple *copy = gimple_copy (gsi_stmt (*gsi));
> >         gimple_assign_set_lhs (copy, var);
>
> Fixed.
>
> > +      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
> > +      tree bitpos = bitsize_int (scalar_idx * elem_size);
> > +      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
> > +                                     scalar_element, bitpos);
> > +    }
> >
> > Otherwise looks OK to me.
> >
> > As separate followup patch it might be interesting to support
> >
> >  { 0, a[1], a[2], 3 }
> >
> > kinds as well, thus combining a VECTOR_CST (which is
> > reasonably cheap to create) with another vector.  That should
> > be maybe done as a first patch given this is just a two-vector
> > permute which the code already handles apart from not
> > recognizing the implicit constant vector participating.
> >
> > Similar
> >
> >  { 0, a[1], b[2], 3 }
> >
> > where the combination of a and b is blended with another
> > constant vector.  I'm not sure if handling an arbitrary number
> > of scalar elements should be done in a similar way, that is,
> > implementing
> >
> >  { s1, a[1], a[2], s2, s3, b[0], b[1], b[2] }
> >
> > as
> >
> >   tem = VEC_PERM <a, b, { ... }>
> >   tem2 = { s1, 0, 0, s2, s3, 0, 0, 0 }
> >   res = VEC_PERM <tem, tem2, { blend-mask }>
> >
> > where constructing tem2 should take at most
> > N-1 inserts (the first element to insert into tem2
> > can use a splat or if element zero a zero-extending move).
> >
> > Doing this effectively lifts the restriction of only
> > handling two vectors - we'd incrementally do
> > two-vector permute plus blend of the rest which has
> > its constructor re-processed.
> >
> > But as said - the code is already a bit awkward so changing
> > this in multiple reivisions is preferred and the single-element
> > case is certainly sth to do via a BIT_INSERT_EXPR.
>
> Agreed.
>
> I am testing this updated patch.
>

This is the fully tested patch.  I used

+
+      if (useless_type_conversion_p (type, TREE_TYPE (ref)))
+ {
+    /* The RHS vector has the same type as LHS.  */
+    if (rhs_vector == NULL)
+      rhs_vector = ref;
+    /* Check if all RHS vector elements come fome the same
+       vector.  */
+    if (rhs_vector == ref)
+      nvectors++;
+ }
+

to support cast between __v4sf and __m128.

Comments

Richard Biener March 8, 2019, 11:03 a.m. UTC | #1
On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > )
> > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > For vector init constructor:
> > > > > > > > >
> > > > > > > > > ---
> > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > >
> > > > > > > > > __v4sf
> > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > {
> > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > >   return y;
> > > > > > > > > }
> > > > > > > > > ---
> > > > > > > > >
> > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > followed by a single scalar insert:
> > > > >
> > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > BIT_INSERT_EXPR.
> > > > >
> > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > >
> > > > >
> > > > > H.J.
> > > > > ---
> > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > by a single scalar insert:
> > > > >
> > > > >   __v4sf y;
> > > > >   __v4sf D.1930;
> > > > >   float _1;
> > > > >   float _2;
> > > > >   float _3;
> > > > >
> > > > >   <bb 2> :
> > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > >   return y_6;
> > > > >
> > > > > with
> > > > >
> > > > >  __v4sf y;
> > > > >   __v4sf D.1930;
> > > > >   float _1;
> > > > >   float _2;
> > > > >   float _3;
> > > > >   vector(4) float _8;
> > > > >
> > > > >   <bb 2> :
> > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > >   _8 = x_9(D);
> > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > >   return y_6;
> > > > >
> > > > > gcc/
> > > > >
> > > > >         PR tree-optimization/88828
> > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > >         vector init constructor with vector copy or permute followed
> > > > >         by a single scalar insert.
> > > > >
> > > > > gcc/testsuite/
> > > > >
> > > > >         PR tree-optimization/88828
> > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > >
> > > > Here is the updated patch with run-time tests.
> > >
> > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > >         return false;
> > >
> > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > scalar value can be a constant as well.
> >
> > Fixed.
> >
> > >        if (!def_stmt)
> > > -       return false;
> > > +       {
> > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > >
> > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > >
> > > +           {
> > >
> > > also you seem to disallow
> > >
> > >   { i + 1, v[1], v[2], v[3] }
> >
> > Fixed by
> >
> >      if (code != BIT_FIELD_REF)
> >         {
> >           /* Only allow one scalar insert.  */
> >           if (nscalars != 0)
> >             return false;
> >
> >           nscalars = 1;
> >           insert = true;
> >           scalar_idx = i;
> >           sel.quick_push (i);
> >           scalar_element = ce->value;
> >           continue;
> >         }
> >
> > > because get_prop_source_stmt will return the definition computing
> > > i + 1 in this case and your code will be skipped?
> > >
> > > I think you can simplify the code by treating scalar_element != NULL
> > > as nscalars == 1 and eliding nscalars.
> >
> > It works only if
> >
> > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> >
> > We need to check both nscalars and nvectors.  Elide nscalar
> > check doesn't help much here.
> >
> > > -      if (conv_code == ERROR_MARK)
> > > +      if (conv_code == ERROR_MARK && !insert)
> > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > >                                         orig[1], op2);
> > >        else
> > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > >           orig[0] = gimple_assign_lhs (perm);
> > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > +                                         (conv_code != ERROR_MARK
> > > +                                          ? conv_code
> > > +                                          : NOP_EXPR),
> > > +                                         orig[0],
> > >                                           NULL_TREE, NULL_TREE);
> > >
> > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > that is, why did you need to add the && !insert check in the guarding condition
> >
> > When conv_code == ERROR_MARK, we still need
> >
> >        gimple *perm
> >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> >           orig[0] = gimple_assign_lhs (perm);
> >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> >                                           orig[0],
> >                                           NULL_TREE, NULL_TREE);
> >
> > Otherwise, scalar insert won't work.
> >
> > > (this path should already do the correct thing?).  Note that in all
> > > cases it looks
> > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > or int->float conversion on a value it wasn't done on before which might
> > > raise exceptions?  That is, do we need to make sure we permute a
> > > value we already do convert into the place we're going to insert to?
> >
> > This couldn't happen:
> >
> >       if (type == TREE_TYPE (ref))
> >          {
> >            /* The RHS vector has the same type as LHS.  */
> >            if (rhs_vector == NULL)
> >              rhs_vector = ref;
> >            /* Check if all RHS vector elements come fome the same
> >               vector.  */
> >            if (rhs_vector == ref)
> >              nvectors++;
> >          }
> > ...
> >   if (insert
> >       && (nvectors == 0
> >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> >               != (nscalars + nvectors))))
> >     return false;

I see - that looks like a missed case then?

 { 1., (float)v[1], (float)v[2], (float)v[3] }

with integer vector v?

I'll have a look at the full patch next week (it's GCC 10 material in any case).

Richard.

> > > +  if (insert)
> > > +    {
> > > +      /* Generate a single scalar insert.  */
> > > +      tree var = make_ssa_name (type);
> > > +      tree val = gimple_assign_rhs1 (stmt);
> > > +      gimple *copy = gimple_build_assign (var, val);
> > >
> > > I believe this doesn't properly copy the stmt in case it is a permute.
> > > You can use (note the use of gsi_stmt - gimple_assign_set_rhs_with_ops
> > > can re-allocate the stmt)
> > >
> > >         gimple *copy = gimple_copy (gsi_stmt (*gsi));
> > >         gimple_assign_set_lhs (copy, var);
> >
> > Fixed.
> >
> > > +      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
> > > +      tree bitpos = bitsize_int (scalar_idx * elem_size);
> > > +      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
> > > +                                     scalar_element, bitpos);
> > > +    }
> > >
> > > Otherwise looks OK to me.
> > >
> > > As separate followup patch it might be interesting to support
> > >
> > >  { 0, a[1], a[2], 3 }
> > >
> > > kinds as well, thus combining a VECTOR_CST (which is
> > > reasonably cheap to create) with another vector.  That should
> > > be maybe done as a first patch given this is just a two-vector
> > > permute which the code already handles apart from not
> > > recognizing the implicit constant vector participating.
> > >
> > > Similar
> > >
> > >  { 0, a[1], b[2], 3 }
> > >
> > > where the combination of a and b is blended with another
> > > constant vector.  I'm not sure if handling an arbitrary number
> > > of scalar elements should be done in a similar way, that is,
> > > implementing
> > >
> > >  { s1, a[1], a[2], s2, s3, b[0], b[1], b[2] }
> > >
> > > as
> > >
> > >   tem = VEC_PERM <a, b, { ... }>
> > >   tem2 = { s1, 0, 0, s2, s3, 0, 0, 0 }
> > >   res = VEC_PERM <tem, tem2, { blend-mask }>
> > >
> > > where constructing tem2 should take at most
> > > N-1 inserts (the first element to insert into tem2
> > > can use a splat or if element zero a zero-extending move).
> > >
> > > Doing this effectively lifts the restriction of only
> > > handling two vectors - we'd incrementally do
> > > two-vector permute plus blend of the rest which has
> > > its constructor re-processed.
> > >
> > > But as said - the code is already a bit awkward so changing
> > > this in multiple reivisions is preferred and the single-element
> > > case is certainly sth to do via a BIT_INSERT_EXPR.
> >
> > Agreed.
> >
> > I am testing this updated patch.
> >
>
> This is the fully tested patch.  I used
>
> +
> +      if (useless_type_conversion_p (type, TREE_TYPE (ref)))
> + {
> +    /* The RHS vector has the same type as LHS.  */
> +    if (rhs_vector == NULL)
> +      rhs_vector = ref;
> +    /* Check if all RHS vector elements come fome the same
> +       vector.  */
> +    if (rhs_vector == ref)
> +      nvectors++;
> + }
> +
>
> to support cast between __v4sf and __m128.
>
> --
> H.J.
H.J. Lu March 11, 2019, 7:02 a.m. UTC | #2
On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > <richard.guenther@gmail.com> wrote:
> > > >
> > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > )
> > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > >
> > > > > > > > > > For vector init constructor:
> > > > > > > > > >
> > > > > > > > > > ---
> > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > >
> > > > > > > > > > __v4sf
> > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > {
> > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > >   return y;
> > > > > > > > > > }
> > > > > > > > > > ---
> > > > > > > > > >
> > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > followed by a single scalar insert:
> > > > > >
> > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > BIT_INSERT_EXPR.
> > > > > >
> > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > >
> > > > > >
> > > > > > H.J.
> > > > > > ---
> > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > by a single scalar insert:
> > > > > >
> > > > > >   __v4sf y;
> > > > > >   __v4sf D.1930;
> > > > > >   float _1;
> > > > > >   float _2;
> > > > > >   float _3;
> > > > > >
> > > > > >   <bb 2> :
> > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > >   return y_6;
> > > > > >
> > > > > > with
> > > > > >
> > > > > >  __v4sf y;
> > > > > >   __v4sf D.1930;
> > > > > >   float _1;
> > > > > >   float _2;
> > > > > >   float _3;
> > > > > >   vector(4) float _8;
> > > > > >
> > > > > >   <bb 2> :
> > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > >   _8 = x_9(D);
> > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > >   return y_6;
> > > > > >
> > > > > > gcc/
> > > > > >
> > > > > >         PR tree-optimization/88828
> > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > >         vector init constructor with vector copy or permute followed
> > > > > >         by a single scalar insert.
> > > > > >
> > > > > > gcc/testsuite/
> > > > > >
> > > > > >         PR tree-optimization/88828
> > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > >
> > > > > Here is the updated patch with run-time tests.
> > > >
> > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > >         return false;
> > > >
> > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > scalar value can be a constant as well.
> > >
> > > Fixed.
> > >
> > > >        if (!def_stmt)
> > > > -       return false;
> > > > +       {
> > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > >
> > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > >
> > > > +           {
> > > >
> > > > also you seem to disallow
> > > >
> > > >   { i + 1, v[1], v[2], v[3] }
> > >
> > > Fixed by
> > >
> > >      if (code != BIT_FIELD_REF)
> > >         {
> > >           /* Only allow one scalar insert.  */
> > >           if (nscalars != 0)
> > >             return false;
> > >
> > >           nscalars = 1;
> > >           insert = true;
> > >           scalar_idx = i;
> > >           sel.quick_push (i);
> > >           scalar_element = ce->value;
> > >           continue;
> > >         }
> > >
> > > > because get_prop_source_stmt will return the definition computing
> > > > i + 1 in this case and your code will be skipped?
> > > >
> > > > I think you can simplify the code by treating scalar_element != NULL
> > > > as nscalars == 1 and eliding nscalars.
> > >
> > > It works only if
> > >
> > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > >
> > > We need to check both nscalars and nvectors.  Elide nscalar
> > > check doesn't help much here.
> > >
> > > > -      if (conv_code == ERROR_MARK)
> > > > +      if (conv_code == ERROR_MARK && !insert)
> > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > >                                         orig[1], op2);
> > > >        else
> > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > >           orig[0] = gimple_assign_lhs (perm);
> > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > +                                         (conv_code != ERROR_MARK
> > > > +                                          ? conv_code
> > > > +                                          : NOP_EXPR),
> > > > +                                         orig[0],
> > > >                                           NULL_TREE, NULL_TREE);
> > > >
> > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > that is, why did you need to add the && !insert check in the guarding condition
> > >
> > > When conv_code == ERROR_MARK, we still need
> > >
> > >        gimple *perm
> > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > >           orig[0] = gimple_assign_lhs (perm);
> > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > >                                           orig[0],
> > >                                           NULL_TREE, NULL_TREE);
> > >
> > > Otherwise, scalar insert won't work.
> > >
> > > > (this path should already do the correct thing?).  Note that in all
> > > > cases it looks
> > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > or int->float conversion on a value it wasn't done on before which might
> > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > value we already do convert into the place we're going to insert to?
> > >
> > > This couldn't happen:
> > >
> > >       if (type == TREE_TYPE (ref))
> > >          {
> > >            /* The RHS vector has the same type as LHS.  */
> > >            if (rhs_vector == NULL)
> > >              rhs_vector = ref;
> > >            /* Check if all RHS vector elements come fome the same
> > >               vector.  */
> > >            if (rhs_vector == ref)
> > >              nvectors++;
> > >          }
> > > ...
> > >   if (insert
> > >       && (nvectors == 0
> > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > >               != (nscalars + nvectors))))
> > >     return false;
>
> I see - that looks like a missed case then?
>
>  { 1., (float)v[1], (float)v[2], (float)v[3] }
>
> with integer vector v?

True.

> I'll have a look at the full patch next week (it's GCC 10 material in any case).
>

Thanks.
Richard Biener May 2, 2019, 2:54 p.m. UTC | #3
On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > <richard.guenther@gmail.com> wrote:
> > > > >
> > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > > )
> > > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > For vector init constructor:
> > > > > > > > > > >
> > > > > > > > > > > ---
> > > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > > >
> > > > > > > > > > > __v4sf
> > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > {
> > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > >   return y;
> > > > > > > > > > > }
> > > > > > > > > > > ---
> > > > > > > > > > >
> > > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > > followed by a single scalar insert:
> > > > > > >
> > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > > BIT_INSERT_EXPR.
> > > > > > >
> > > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > > >
> > > > > > >
> > > > > > > H.J.
> > > > > > > ---
> > > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > > by a single scalar insert:
> > > > > > >
> > > > > > >   __v4sf y;
> > > > > > >   __v4sf D.1930;
> > > > > > >   float _1;
> > > > > > >   float _2;
> > > > > > >   float _3;
> > > > > > >
> > > > > > >   <bb 2> :
> > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > >   return y_6;
> > > > > > >
> > > > > > > with
> > > > > > >
> > > > > > >  __v4sf y;
> > > > > > >   __v4sf D.1930;
> > > > > > >   float _1;
> > > > > > >   float _2;
> > > > > > >   float _3;
> > > > > > >   vector(4) float _8;
> > > > > > >
> > > > > > >   <bb 2> :
> > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > >   _8 = x_9(D);
> > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > >   return y_6;
> > > > > > >
> > > > > > > gcc/
> > > > > > >
> > > > > > >         PR tree-optimization/88828
> > > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > > >         vector init constructor with vector copy or permute followed
> > > > > > >         by a single scalar insert.
> > > > > > >
> > > > > > > gcc/testsuite/
> > > > > > >
> > > > > > >         PR tree-optimization/88828
> > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > >
> > > > > > Here is the updated patch with run-time tests.
> > > > >
> > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > >         return false;
> > > > >
> > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > > scalar value can be a constant as well.
> > > >
> > > > Fixed.
> > > >
> > > > >        if (!def_stmt)
> > > > > -       return false;
> > > > > +       {
> > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > >
> > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > >
> > > > > +           {
> > > > >
> > > > > also you seem to disallow
> > > > >
> > > > >   { i + 1, v[1], v[2], v[3] }
> > > >
> > > > Fixed by
> > > >
> > > >      if (code != BIT_FIELD_REF)
> > > >         {
> > > >           /* Only allow one scalar insert.  */
> > > >           if (nscalars != 0)
> > > >             return false;
> > > >
> > > >           nscalars = 1;
> > > >           insert = true;
> > > >           scalar_idx = i;
> > > >           sel.quick_push (i);
> > > >           scalar_element = ce->value;
> > > >           continue;
> > > >         }
> > > >
> > > > > because get_prop_source_stmt will return the definition computing
> > > > > i + 1 in this case and your code will be skipped?
> > > > >
> > > > > I think you can simplify the code by treating scalar_element != NULL
> > > > > as nscalars == 1 and eliding nscalars.
> > > >
> > > > It works only if
> > > >
> > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > > >
> > > > We need to check both nscalars and nvectors.  Elide nscalar
> > > > check doesn't help much here.
> > > >
> > > > > -      if (conv_code == ERROR_MARK)
> > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > > >                                         orig[1], op2);
> > > > >        else
> > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > +                                         (conv_code != ERROR_MARK
> > > > > +                                          ? conv_code
> > > > > +                                          : NOP_EXPR),
> > > > > +                                         orig[0],
> > > > >                                           NULL_TREE, NULL_TREE);
> > > > >
> > > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > > that is, why did you need to add the && !insert check in the guarding condition
> > > >
> > > > When conv_code == ERROR_MARK, we still need
> > > >
> > > >        gimple *perm
> > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > >           orig[0] = gimple_assign_lhs (perm);
> > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > >                                           orig[0],
> > > >                                           NULL_TREE, NULL_TREE);
> > > >
> > > > Otherwise, scalar insert won't work.
> > > >
> > > > > (this path should already do the correct thing?).  Note that in all
> > > > > cases it looks
> > > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > > or int->float conversion on a value it wasn't done on before which might
> > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > value we already do convert into the place we're going to insert to?
> > > >
> > > > This couldn't happen:
> > > >
> > > >       if (type == TREE_TYPE (ref))
> > > >          {
> > > >            /* The RHS vector has the same type as LHS.  */
> > > >            if (rhs_vector == NULL)
> > > >              rhs_vector = ref;
> > > >            /* Check if all RHS vector elements come fome the same
> > > >               vector.  */
> > > >            if (rhs_vector == ref)
> > > >              nvectors++;
> > > >          }
> > > > ...
> > > >   if (insert
> > > >       && (nvectors == 0
> > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > >               != (nscalars + nvectors))))
> > > >     return false;
> >
> > I see - that looks like a missed case then?
> >
> >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> >
> > with integer vector v?
>
> True.
>
> > I'll have a look at the full patch next week (it's GCC 10 material in any case).
> >

Now looking again.  I still don't like the new "structure" of the loop
very much.
A refactoring like the attached should make it easier to clearly separate the
cases where we reach a vector def and where not.

Do you want me to take over the patch?

Thanks,
Richard.
Richard Biener May 2, 2019, 2:55 p.m. UTC | #4
On Thu, May 2, 2019 at 4:54 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > > <richard.guenther@gmail.com> wrote:
> > > > > >
> > > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > > > )
> > > > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > For vector init constructor:
> > > > > > > > > > > >
> > > > > > > > > > > > ---
> > > > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > > > >
> > > > > > > > > > > > __v4sf
> > > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > > {
> > > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > > >   return y;
> > > > > > > > > > > > }
> > > > > > > > > > > > ---
> > > > > > > > > > > >
> > > > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > > > followed by a single scalar insert:
> > > > > > > >
> > > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > > > BIT_INSERT_EXPR.
> > > > > > > >
> > > > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > > > >
> > > > > > > >
> > > > > > > > H.J.
> > > > > > > > ---
> > > > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > > > by a single scalar insert:
> > > > > > > >
> > > > > > > >   __v4sf y;
> > > > > > > >   __v4sf D.1930;
> > > > > > > >   float _1;
> > > > > > > >   float _2;
> > > > > > > >   float _3;
> > > > > > > >
> > > > > > > >   <bb 2> :
> > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > > >   return y_6;
> > > > > > > >
> > > > > > > > with
> > > > > > > >
> > > > > > > >  __v4sf y;
> > > > > > > >   __v4sf D.1930;
> > > > > > > >   float _1;
> > > > > > > >   float _2;
> > > > > > > >   float _3;
> > > > > > > >   vector(4) float _8;
> > > > > > > >
> > > > > > > >   <bb 2> :
> > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > >   _8 = x_9(D);
> > > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > > >   return y_6;
> > > > > > > >
> > > > > > > > gcc/
> > > > > > > >
> > > > > > > >         PR tree-optimization/88828
> > > > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > > > >         vector init constructor with vector copy or permute followed
> > > > > > > >         by a single scalar insert.
> > > > > > > >
> > > > > > > > gcc/testsuite/
> > > > > > > >
> > > > > > > >         PR tree-optimization/88828
> > > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > > >
> > > > > > > Here is the updated patch with run-time tests.
> > > > > >
> > > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > > >         return false;
> > > > > >
> > > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > > > scalar value can be a constant as well.
> > > > >
> > > > > Fixed.
> > > > >
> > > > > >        if (!def_stmt)
> > > > > > -       return false;
> > > > > > +       {
> > > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > > >
> > > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > > >
> > > > > > +           {
> > > > > >
> > > > > > also you seem to disallow
> > > > > >
> > > > > >   { i + 1, v[1], v[2], v[3] }
> > > > >
> > > > > Fixed by
> > > > >
> > > > >      if (code != BIT_FIELD_REF)
> > > > >         {
> > > > >           /* Only allow one scalar insert.  */
> > > > >           if (nscalars != 0)
> > > > >             return false;
> > > > >
> > > > >           nscalars = 1;
> > > > >           insert = true;
> > > > >           scalar_idx = i;
> > > > >           sel.quick_push (i);
> > > > >           scalar_element = ce->value;
> > > > >           continue;
> > > > >         }
> > > > >
> > > > > > because get_prop_source_stmt will return the definition computing
> > > > > > i + 1 in this case and your code will be skipped?
> > > > > >
> > > > > > I think you can simplify the code by treating scalar_element != NULL
> > > > > > as nscalars == 1 and eliding nscalars.
> > > > >
> > > > > It works only if
> > > > >
> > > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > > > >
> > > > > We need to check both nscalars and nvectors.  Elide nscalar
> > > > > check doesn't help much here.
> > > > >
> > > > > > -      if (conv_code == ERROR_MARK)
> > > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > > > >                                         orig[1], op2);
> > > > > >        else
> > > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > > +                                         (conv_code != ERROR_MARK
> > > > > > +                                          ? conv_code
> > > > > > +                                          : NOP_EXPR),
> > > > > > +                                         orig[0],
> > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > >
> > > > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > > > that is, why did you need to add the && !insert check in the guarding condition
> > > > >
> > > > > When conv_code == ERROR_MARK, we still need
> > > > >
> > > > >        gimple *perm
> > > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > > >                                           orig[0],
> > > > >                                           NULL_TREE, NULL_TREE);
> > > > >
> > > > > Otherwise, scalar insert won't work.
> > > > >
> > > > > > (this path should already do the correct thing?).  Note that in all
> > > > > > cases it looks
> > > > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > > > or int->float conversion on a value it wasn't done on before which might
> > > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > > value we already do convert into the place we're going to insert to?
> > > > >
> > > > > This couldn't happen:
> > > > >
> > > > >       if (type == TREE_TYPE (ref))
> > > > >          {
> > > > >            /* The RHS vector has the same type as LHS.  */
> > > > >            if (rhs_vector == NULL)
> > > > >              rhs_vector = ref;
> > > > >            /* Check if all RHS vector elements come fome the same
> > > > >               vector.  */
> > > > >            if (rhs_vector == ref)
> > > > >              nvectors++;
> > > > >          }
> > > > > ...
> > > > >   if (insert
> > > > >       && (nvectors == 0
> > > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > > >               != (nscalars + nvectors))))
> > > > >     return false;
> > >
> > > I see - that looks like a missed case then?
> > >
> > >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> > >
> > > with integer vector v?
> >
> > True.
> >
> > > I'll have a look at the full patch next week (it's GCC 10 material in any case).
> > >
>
> Now looking again.  I still don't like the new "structure" of the loop
> very much.
> A refactoring like the attached should make it easier to clearly separate the
> cases where we reach a vector def and where not.

Now attached.

> Do you want me to take over the patch?
>
> Thanks,
> Richard.
H.J. Lu May 2, 2019, 5:53 p.m. UTC | #5
On Thu, May 2, 2019 at 7:55 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Thu, May 2, 2019 at 4:54 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> > > <richard.guenther@gmail.com> wrote:
> > > >
> > > > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > > > <richard.guenther@gmail.com> wrote:
> > > > > > >
> > > > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > > > > )
> > > > > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > For vector init constructor:
> > > > > > > > > > > > >
> > > > > > > > > > > > > ---
> > > > > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > > > > >
> > > > > > > > > > > > > __v4sf
> > > > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > > > {
> > > > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > > > >   return y;
> > > > > > > > > > > > > }
> > > > > > > > > > > > > ---
> > > > > > > > > > > > >
> > > > > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > > > > followed by a single scalar insert:
> > > > > > > > >
> > > > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > > > > BIT_INSERT_EXPR.
> > > > > > > > >
> > > > > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > H.J.
> > > > > > > > > ---
> > > > > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > > > > by a single scalar insert:
> > > > > > > > >
> > > > > > > > >   __v4sf y;
> > > > > > > > >   __v4sf D.1930;
> > > > > > > > >   float _1;
> > > > > > > > >   float _2;
> > > > > > > > >   float _3;
> > > > > > > > >
> > > > > > > > >   <bb 2> :
> > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > > > >   return y_6;
> > > > > > > > >
> > > > > > > > > with
> > > > > > > > >
> > > > > > > > >  __v4sf y;
> > > > > > > > >   __v4sf D.1930;
> > > > > > > > >   float _1;
> > > > > > > > >   float _2;
> > > > > > > > >   float _3;
> > > > > > > > >   vector(4) float _8;
> > > > > > > > >
> > > > > > > > >   <bb 2> :
> > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > >   _8 = x_9(D);
> > > > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > > > >   return y_6;
> > > > > > > > >
> > > > > > > > > gcc/
> > > > > > > > >
> > > > > > > > >         PR tree-optimization/88828
> > > > > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > > > > >         vector init constructor with vector copy or permute followed
> > > > > > > > >         by a single scalar insert.
> > > > > > > > >
> > > > > > > > > gcc/testsuite/
> > > > > > > > >
> > > > > > > > >         PR tree-optimization/88828
> > > > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > > > >
> > > > > > > > Here is the updated patch with run-time tests.
> > > > > > >
> > > > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > > > >         return false;
> > > > > > >
> > > > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > > > > scalar value can be a constant as well.
> > > > > >
> > > > > > Fixed.
> > > > > >
> > > > > > >        if (!def_stmt)
> > > > > > > -       return false;
> > > > > > > +       {
> > > > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > > > >
> > > > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > > > >
> > > > > > > +           {
> > > > > > >
> > > > > > > also you seem to disallow
> > > > > > >
> > > > > > >   { i + 1, v[1], v[2], v[3] }
> > > > > >
> > > > > > Fixed by
> > > > > >
> > > > > >      if (code != BIT_FIELD_REF)
> > > > > >         {
> > > > > >           /* Only allow one scalar insert.  */
> > > > > >           if (nscalars != 0)
> > > > > >             return false;
> > > > > >
> > > > > >           nscalars = 1;
> > > > > >           insert = true;
> > > > > >           scalar_idx = i;
> > > > > >           sel.quick_push (i);
> > > > > >           scalar_element = ce->value;
> > > > > >           continue;
> > > > > >         }
> > > > > >
> > > > > > > because get_prop_source_stmt will return the definition computing
> > > > > > > i + 1 in this case and your code will be skipped?
> > > > > > >
> > > > > > > I think you can simplify the code by treating scalar_element != NULL
> > > > > > > as nscalars == 1 and eliding nscalars.
> > > > > >
> > > > > > It works only if
> > > > > >
> > > > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > > > > >
> > > > > > We need to check both nscalars and nvectors.  Elide nscalar
> > > > > > check doesn't help much here.
> > > > > >
> > > > > > > -      if (conv_code == ERROR_MARK)
> > > > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > > > > >                                         orig[1], op2);
> > > > > > >        else
> > > > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > > > +                                         (conv_code != ERROR_MARK
> > > > > > > +                                          ? conv_code
> > > > > > > +                                          : NOP_EXPR),
> > > > > > > +                                         orig[0],
> > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > >
> > > > > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > > > > that is, why did you need to add the && !insert check in the guarding condition
> > > > > >
> > > > > > When conv_code == ERROR_MARK, we still need
> > > > > >
> > > > > >        gimple *perm
> > > > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > > > >                                           orig[0],
> > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > >
> > > > > > Otherwise, scalar insert won't work.
> > > > > >
> > > > > > > (this path should already do the correct thing?).  Note that in all
> > > > > > > cases it looks
> > > > > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > > > > or int->float conversion on a value it wasn't done on before which might
> > > > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > > > value we already do convert into the place we're going to insert to?
> > > > > >
> > > > > > This couldn't happen:
> > > > > >
> > > > > >       if (type == TREE_TYPE (ref))
> > > > > >          {
> > > > > >            /* The RHS vector has the same type as LHS.  */
> > > > > >            if (rhs_vector == NULL)
> > > > > >              rhs_vector = ref;
> > > > > >            /* Check if all RHS vector elements come fome the same
> > > > > >               vector.  */
> > > > > >            if (rhs_vector == ref)
> > > > > >              nvectors++;
> > > > > >          }
> > > > > > ...
> > > > > >   if (insert
> > > > > >       && (nvectors == 0
> > > > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > > > >               != (nscalars + nvectors))))
> > > > > >     return false;
> > > >
> > > > I see - that looks like a missed case then?
> > > >
> > > >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> > > >
> > > > with integer vector v?
> > >
> > > True.
> > >
> > > > I'll have a look at the full patch next week (it's GCC 10 material in any case).
> > > >
> >
> > Now looking again.  I still don't like the new "structure" of the loop
> > very much.
> > A refactoring like the attached should make it easier to clearly separate the
> > cases where we reach a vector def and where not.
>
> Now attached.
>
> > Do you want me to take over the patch?
> >

Sure.

Thanks.
H.J. Lu May 3, 2019, 4:53 p.m. UTC | #6
On Thu, May 2, 2019 at 10:53 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, May 2, 2019 at 7:55 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Thu, May 2, 2019 at 4:54 PM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> > > > <richard.guenther@gmail.com> wrote:
> > > > >
> > > > > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > > > > <richard.guenther@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > > > > > )
> > > > > > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > For vector init constructor:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > __v4sf
> > > > > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > > > > {
> > > > > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > > > > >   return y;
> > > > > > > > > > > > > > }
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > > > > > followed by a single scalar insert:
> > > > > > > > > >
> > > > > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > > > > > BIT_INSERT_EXPR.
> > > > > > > > > >
> > > > > > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > H.J.
> > > > > > > > > > ---
> > > > > > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > > > > > by a single scalar insert:
> > > > > > > > > >
> > > > > > > > > >   __v4sf y;
> > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > >   float _1;
> > > > > > > > > >   float _2;
> > > > > > > > > >   float _3;
> > > > > > > > > >
> > > > > > > > > >   <bb 2> :
> > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > > > > >   return y_6;
> > > > > > > > > >
> > > > > > > > > > with
> > > > > > > > > >
> > > > > > > > > >  __v4sf y;
> > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > >   float _1;
> > > > > > > > > >   float _2;
> > > > > > > > > >   float _3;
> > > > > > > > > >   vector(4) float _8;
> > > > > > > > > >
> > > > > > > > > >   <bb 2> :
> > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > >   _8 = x_9(D);
> > > > > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > > > > >   return y_6;
> > > > > > > > > >
> > > > > > > > > > gcc/
> > > > > > > > > >
> > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > > > > > >         vector init constructor with vector copy or permute followed
> > > > > > > > > >         by a single scalar insert.
> > > > > > > > > >
> > > > > > > > > > gcc/testsuite/
> > > > > > > > > >
> > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > > > > >
> > > > > > > > > Here is the updated patch with run-time tests.
> > > > > > > >
> > > > > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > > > > >         return false;
> > > > > > > >
> > > > > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > > > > > scalar value can be a constant as well.
> > > > > > >
> > > > > > > Fixed.
> > > > > > >
> > > > > > > >        if (!def_stmt)
> > > > > > > > -       return false;
> > > > > > > > +       {
> > > > > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > > > > >
> > > > > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > > > > >
> > > > > > > > +           {
> > > > > > > >
> > > > > > > > also you seem to disallow
> > > > > > > >
> > > > > > > >   { i + 1, v[1], v[2], v[3] }
> > > > > > >
> > > > > > > Fixed by
> > > > > > >
> > > > > > >      if (code != BIT_FIELD_REF)
> > > > > > >         {
> > > > > > >           /* Only allow one scalar insert.  */
> > > > > > >           if (nscalars != 0)
> > > > > > >             return false;
> > > > > > >
> > > > > > >           nscalars = 1;
> > > > > > >           insert = true;
> > > > > > >           scalar_idx = i;
> > > > > > >           sel.quick_push (i);
> > > > > > >           scalar_element = ce->value;
> > > > > > >           continue;
> > > > > > >         }
> > > > > > >
> > > > > > > > because get_prop_source_stmt will return the definition computing
> > > > > > > > i + 1 in this case and your code will be skipped?
> > > > > > > >
> > > > > > > > I think you can simplify the code by treating scalar_element != NULL
> > > > > > > > as nscalars == 1 and eliding nscalars.
> > > > > > >
> > > > > > > It works only if
> > > > > > >
> > > > > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > > > > > >
> > > > > > > We need to check both nscalars and nvectors.  Elide nscalar
> > > > > > > check doesn't help much here.
> > > > > > >
> > > > > > > > -      if (conv_code == ERROR_MARK)
> > > > > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > > > > > >                                         orig[1], op2);
> > > > > > > >        else
> > > > > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > > > > +                                         (conv_code != ERROR_MARK
> > > > > > > > +                                          ? conv_code
> > > > > > > > +                                          : NOP_EXPR),
> > > > > > > > +                                         orig[0],
> > > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > > >
> > > > > > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > > > > > that is, why did you need to add the && !insert check in the guarding condition
> > > > > > >
> > > > > > > When conv_code == ERROR_MARK, we still need
> > > > > > >
> > > > > > >        gimple *perm
> > > > > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > > > > >                                           orig[0],
> > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > >
> > > > > > > Otherwise, scalar insert won't work.
> > > > > > >
> > > > > > > > (this path should already do the correct thing?).  Note that in all
> > > > > > > > cases it looks
> > > > > > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > > > > > or int->float conversion on a value it wasn't done on before which might
> > > > > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > > > > value we already do convert into the place we're going to insert to?
> > > > > > >
> > > > > > > This couldn't happen:
> > > > > > >
> > > > > > >       if (type == TREE_TYPE (ref))
> > > > > > >          {
> > > > > > >            /* The RHS vector has the same type as LHS.  */
> > > > > > >            if (rhs_vector == NULL)
> > > > > > >              rhs_vector = ref;
> > > > > > >            /* Check if all RHS vector elements come fome the same
> > > > > > >               vector.  */
> > > > > > >            if (rhs_vector == ref)
> > > > > > >              nvectors++;
> > > > > > >          }
> > > > > > > ...
> > > > > > >   if (insert
> > > > > > >       && (nvectors == 0
> > > > > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > > > > >               != (nscalars + nvectors))))
> > > > > > >     return false;
> > > > >
> > > > > I see - that looks like a missed case then?
> > > > >
> > > > >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> > > > >
> > > > > with integer vector v?
> > > >
> > > > True.
> > > >
> > > > > I'll have a look at the full patch next week (it's GCC 10 material in any case).
> > > > >
> > >
> > > Now looking again.  I still don't like the new "structure" of the loop
> > > very much.
> > > A refactoring like the attached should make it easier to clearly separate the
> > > cases where we reach a vector def and where not.
> >
> > Now attached.
> >
> > > Do you want me to take over the patch?
> > >
>

Here is the updated patch on top of your patch plus my fix.
Richard Biener May 8, 2019, 12:04 p.m. UTC | #7
On Fri, May 3, 2019 at 6:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, May 2, 2019 at 10:53 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, May 2, 2019 at 7:55 AM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Thu, May 2, 2019 at 4:54 PM Richard Biener
> > > <richard.guenther@gmail.com> wrote:
> > > >
> > > > On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> > > > > <richard.guenther@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > > > > > <richard.guenther@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > > > > > > )
> > > > > > > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > For vector init constructor:
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > __v4sf
> > > > > > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > > > > > {
> > > > > > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > > > > > >   return y;
> > > > > > > > > > > > > > > }
> > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > > > > > > followed by a single scalar insert:
> > > > > > > > > > >
> > > > > > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > > > > > > BIT_INSERT_EXPR.
> > > > > > > > > > >
> > > > > > > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > H.J.
> > > > > > > > > > > ---
> > > > > > > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > > > > > > by a single scalar insert:
> > > > > > > > > > >
> > > > > > > > > > >   __v4sf y;
> > > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > > >   float _1;
> > > > > > > > > > >   float _2;
> > > > > > > > > > >   float _3;
> > > > > > > > > > >
> > > > > > > > > > >   <bb 2> :
> > > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > > > > > >   return y_6;
> > > > > > > > > > >
> > > > > > > > > > > with
> > > > > > > > > > >
> > > > > > > > > > >  __v4sf y;
> > > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > > >   float _1;
> > > > > > > > > > >   float _2;
> > > > > > > > > > >   float _3;
> > > > > > > > > > >   vector(4) float _8;
> > > > > > > > > > >
> > > > > > > > > > >   <bb 2> :
> > > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > > >   _8 = x_9(D);
> > > > > > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > > > > > >   return y_6;
> > > > > > > > > > >
> > > > > > > > > > > gcc/
> > > > > > > > > > >
> > > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > > > > > > >         vector init constructor with vector copy or permute followed
> > > > > > > > > > >         by a single scalar insert.
> > > > > > > > > > >
> > > > > > > > > > > gcc/testsuite/
> > > > > > > > > > >
> > > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > > > > > >
> > > > > > > > > > Here is the updated patch with run-time tests.
> > > > > > > > >
> > > > > > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > > > > > >         return false;
> > > > > > > > >
> > > > > > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > > > > > > scalar value can be a constant as well.
> > > > > > > >
> > > > > > > > Fixed.
> > > > > > > >
> > > > > > > > >        if (!def_stmt)
> > > > > > > > > -       return false;
> > > > > > > > > +       {
> > > > > > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > > > > > >
> > > > > > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > > > > > >
> > > > > > > > > +           {
> > > > > > > > >
> > > > > > > > > also you seem to disallow
> > > > > > > > >
> > > > > > > > >   { i + 1, v[1], v[2], v[3] }
> > > > > > > >
> > > > > > > > Fixed by
> > > > > > > >
> > > > > > > >      if (code != BIT_FIELD_REF)
> > > > > > > >         {
> > > > > > > >           /* Only allow one scalar insert.  */
> > > > > > > >           if (nscalars != 0)
> > > > > > > >             return false;
> > > > > > > >
> > > > > > > >           nscalars = 1;
> > > > > > > >           insert = true;
> > > > > > > >           scalar_idx = i;
> > > > > > > >           sel.quick_push (i);
> > > > > > > >           scalar_element = ce->value;
> > > > > > > >           continue;
> > > > > > > >         }
> > > > > > > >
> > > > > > > > > because get_prop_source_stmt will return the definition computing
> > > > > > > > > i + 1 in this case and your code will be skipped?
> > > > > > > > >
> > > > > > > > > I think you can simplify the code by treating scalar_element != NULL
> > > > > > > > > as nscalars == 1 and eliding nscalars.
> > > > > > > >
> > > > > > > > It works only if
> > > > > > > >
> > > > > > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > > > > > > >
> > > > > > > > We need to check both nscalars and nvectors.  Elide nscalar
> > > > > > > > check doesn't help much here.
> > > > > > > >
> > > > > > > > > -      if (conv_code == ERROR_MARK)
> > > > > > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > > > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > > > > > > >                                         orig[1], op2);
> > > > > > > > >        else
> > > > > > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > > > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > > > > > +                                         (conv_code != ERROR_MARK
> > > > > > > > > +                                          ? conv_code
> > > > > > > > > +                                          : NOP_EXPR),
> > > > > > > > > +                                         orig[0],
> > > > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > > > >
> > > > > > > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > > > > > > that is, why did you need to add the && !insert check in the guarding condition
> > > > > > > >
> > > > > > > > When conv_code == ERROR_MARK, we still need
> > > > > > > >
> > > > > > > >        gimple *perm
> > > > > > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > > > > > >                                           orig[0],
> > > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > > >
> > > > > > > > Otherwise, scalar insert won't work.
> > > > > > > >
> > > > > > > > > (this path should already do the correct thing?).  Note that in all
> > > > > > > > > cases it looks
> > > > > > > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > > > > > > or int->float conversion on a value it wasn't done on before which might
> > > > > > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > > > > > value we already do convert into the place we're going to insert to?
> > > > > > > >
> > > > > > > > This couldn't happen:
> > > > > > > >
> > > > > > > >       if (type == TREE_TYPE (ref))
> > > > > > > >          {
> > > > > > > >            /* The RHS vector has the same type as LHS.  */
> > > > > > > >            if (rhs_vector == NULL)
> > > > > > > >              rhs_vector = ref;
> > > > > > > >            /* Check if all RHS vector elements come fome the same
> > > > > > > >               vector.  */
> > > > > > > >            if (rhs_vector == ref)
> > > > > > > >              nvectors++;
> > > > > > > >          }
> > > > > > > > ...
> > > > > > > >   if (insert
> > > > > > > >       && (nvectors == 0
> > > > > > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > > > > > >               != (nscalars + nvectors))))
> > > > > > > >     return false;
> > > > > >
> > > > > > I see - that looks like a missed case then?
> > > > > >
> > > > > >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> > > > > >
> > > > > > with integer vector v?
> > > > >
> > > > > True.
> > > > >
> > > > > > I'll have a look at the full patch next week (it's GCC 10 material in any case).
> > > > > >
> > > >
> > > > Now looking again.  I still don't like the new "structure" of the loop
> > > > very much.
> > > > A refactoring like the attached should make it easier to clearly separate the
> > > > cases where we reach a vector def and where not.
> > >
> > > Now attached.
> > >
> > > > Do you want me to take over the patch?
> > > >
> >
>
> Here is the updated patch on top of your patch plus my fix.

Thanks - when doing the constant vector I was thinking of the following patch
to handle your cases.  It doesn't use insertion but eventually leaves that
to a separate transform.  Instead it handles non-constants similar to constants
by permuting from a uniform vector.  Thus

__attribute__((noinline, noclone))
__v4sf
foo2 (__v4sf x, float f)
{
  __v4sf y = { f, x[1], x[2], x[3] };
  return y;
}

becomes

  _4 = {f_2(D), f_2(D), f_2(D), f_2(D)};
  y_3 = VEC_PERM_EXPR <x_5(D), _4, { 4, 1, 2, 3 }>;
  return y_3;

this allows us to handle an arbitrary number of inserts of this
single value.  It also ensures we can actually perform the
permutation while for the insertion we currently do not have
a convenient way to query whether the target can perform
it efficiently (IIRC x86 needs AVX to insert to arbitrary lanes
with a single instruction?).  Similarly if the user writes the above
in source using __builtin_shuffle we'd want to optimize it as well.

The patch as attached only passes some of your testcases,
the following FAIL:

FAIL: gcc.target/i386/pr88828-2a.c scan-assembler movss
FAIL: gcc.target/i386/pr88828-2a.c scan-assembler-not movaps
FAIL: gcc.target/i386/pr88828-2a.c scan-assembler-not movlhps
FAIL: gcc.target/i386/pr88828-2a.c scan-assembler-not unpcklps
FAIL: gcc.target/i386/pr88828-2b.c scan-assembler-times vpermilps 1
FAIL: gcc.target/i386/pr88828-2b.c scan-assembler-times vmovss 1
FAIL: gcc.target/i386/pr88828-2c.c scan-assembler movss
FAIL: gcc.target/i386/pr88828-2c.c scan-assembler-not movaps
FAIL: gcc.target/i386/pr88828-2c.c scan-assembler-not movlhps
FAIL: gcc.target/i386/pr88828-2c.c scan-assembler-not unpcklps
FAIL: gcc.target/i386/pr88828-2d.c scan-assembler-times vpermilps 1
FAIL: gcc.target/i386/pr88828-2d.c scan-assembler-times vmovss 1
FAIL: gcc.target/i386/pr88828-3a.c scan-assembler movss
FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-times shufps 2
FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-times movaps 1
FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-not movlhps
FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-not unpcklps
FAIL: gcc.target/i386/pr88828-3b.c scan-assembler-times vpermilps 1
FAIL: gcc.target/i386/pr88828-3b.c scan-assembler-times vinsertps 1
FAIL: gcc.target/i386/pr88828-3b.c scan-assembler-not vshufps
FAIL: gcc.target/i386/pr88828-3c.c scan-assembler-times vpermilps 1
FAIL: gcc.target/i386/pr88828-3c.c scan-assembler-times vinsertps 1
FAIL: gcc.target/i386/pr88828-3c.c scan-assembler-not vshufps

Making the patch emit inserts for single insert locations is of course
still possible but you get to arrive at heuristics like your choice
of permuting the original lane into the later overwritten lane which
might be a choice making the permute impossible or more expensive?

The original purpose of simplify_vector_constructor was to simplify
the IL, not so much optimal code-generation in the end but I wonder
if we can rely on RTL expansion or later RTL optimization to do
the optimal choices here?

I guess simplify_permutation could perform a VEC_PERM
into an insert if the remaining permutation would be a no-op
but RTL optimization handles this case well already.

Whether code-generation for a one vector permute plus insert or
a two-vector permute is better in the end I don't know - at least
the permute expansion has a chance to see the combined
instruction.

Do you think the remaining cases above can be handled in the
backend?

Comments?

Thanks,
Richard.

2019-05-08  Richard Biener  <rguenther@suse.de>

        PR tree-optimization/88828
        * tree-ssa-forwprop.c (simplify_vector_constructor): Handle
        permuting in a single non-constant element not extracted
        from a vector.


> --
> H.J.
Richard Biener May 14, 2019, 9:13 a.m. UTC | #8
On Wed, May 8, 2019 at 2:04 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Fri, May 3, 2019 at 6:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, May 2, 2019 at 10:53 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Thu, May 2, 2019 at 7:55 AM Richard Biener
> > > <richard.guenther@gmail.com> wrote:
> > > >
> > > > On Thu, May 2, 2019 at 4:54 PM Richard Biener
> > > > <richard.guenther@gmail.com> wrote:
> > > > >
> > > > > On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> > > > > > <richard.guenther@gmail.com> wrote:
> > > > > > >
> > > > > > > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > > > > > > <richard.guenther@gmail.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote:
> > > > > > > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote:
> > > > > > > > > > > > > > > )
> > > > > > > > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > For vector init constructor:
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > __v4sf
> > > > > > > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > > > > > > {
> > > > > > > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > > > > > > >   return y;
> > > > > > > > > > > > > > > > }
> > > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > we can optimize vector init constructor with vector copy or permute
> > > > > > > > > > > > > > > > followed by a single scalar insert:
> > > > > > > > > > > >
> > > > > > > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here.  The easiest way
> > > > > > > > > > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the
> > > > > > > > > > > > > BIT_INSERT_EXPR.
> > > > > > > > > > > >
> > > > > > > > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this patch.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > H.J.
> > > > > > > > > > > > ---
> > > > > > > > > > > > We can optimize vector constructor with vector copy or permute followed
> > > > > > > > > > > > by a single scalar insert:
> > > > > > > > > > > >
> > > > > > > > > > > >   __v4sf y;
> > > > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > > > >   float _1;
> > > > > > > > > > > >   float _2;
> > > > > > > > > > > >   float _3;
> > > > > > > > > > > >
> > > > > > > > > > > >   <bb 2> :
> > > > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > > > > > > >   return y_6;
> > > > > > > > > > > >
> > > > > > > > > > > > with
> > > > > > > > > > > >
> > > > > > > > > > > >  __v4sf y;
> > > > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > > > >   float _1;
> > > > > > > > > > > >   float _2;
> > > > > > > > > > > >   float _3;
> > > > > > > > > > > >   vector(4) float _8;
> > > > > > > > > > > >
> > > > > > > > > > > >   <bb 2> :
> > > > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > > > >   _8 = x_9(D);
> > > > > > > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > > > > > > >   return y_6;
> > > > > > > > > > > >
> > > > > > > > > > > > gcc/
> > > > > > > > > > > >
> > > > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > > > >         * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
> > > > > > > > > > > >         vector init constructor with vector copy or permute followed
> > > > > > > > > > > >         by a single scalar insert.
> > > > > > > > > > > >
> > > > > > > > > > > > gcc/testsuite/
> > > > > > > > > > > >
> > > > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > > > > > > >
> > > > > > > > > > > Here is the updated patch with run-time tests.
> > > > > > > > > >
> > > > > > > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > > > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > > > > > > >         return false;
> > > > > > > > > >
> > > > > > > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the single
> > > > > > > > > > scalar value can be a constant as well.
> > > > > > > > >
> > > > > > > > > Fixed.
> > > > > > > > >
> > > > > > > > > >        if (!def_stmt)
> > > > > > > > > > -       return false;
> > > > > > > > > > +       {
> > > > > > > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > > > > > > >
> > > > > > > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > > > > > > >
> > > > > > > > > > +           {
> > > > > > > > > >
> > > > > > > > > > also you seem to disallow
> > > > > > > > > >
> > > > > > > > > >   { i + 1, v[1], v[2], v[3] }
> > > > > > > > >
> > > > > > > > > Fixed by
> > > > > > > > >
> > > > > > > > >      if (code != BIT_FIELD_REF)
> > > > > > > > >         {
> > > > > > > > >           /* Only allow one scalar insert.  */
> > > > > > > > >           if (nscalars != 0)
> > > > > > > > >             return false;
> > > > > > > > >
> > > > > > > > >           nscalars = 1;
> > > > > > > > >           insert = true;
> > > > > > > > >           scalar_idx = i;
> > > > > > > > >           sel.quick_push (i);
> > > > > > > > >           scalar_element = ce->value;
> > > > > > > > >           continue;
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > > > because get_prop_source_stmt will return the definition computing
> > > > > > > > > > i + 1 in this case and your code will be skipped?
> > > > > > > > > >
> > > > > > > > > > I think you can simplify the code by treating scalar_element != NULL
> > > > > > > > > > as nscalars == 1 and eliding nscalars.
> > > > > > > > >
> > > > > > > > > It works only if
> > > > > > > > >
> > > > > > > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + nvectors)
> > > > > > > > >
> > > > > > > > > We need to check both nscalars and nvectors.  Elide nscalar
> > > > > > > > > check doesn't help much here.
> > > > > > > > >
> > > > > > > > > > -      if (conv_code == ERROR_MARK)
> > > > > > > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > > > > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
> > > > > > > > > >                                         orig[1], op2);
> > > > > > > > > >        else
> > > > > > > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> > > > > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
> > > > > > > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > > > > > > +                                         (conv_code != ERROR_MARK
> > > > > > > > > > +                                          ? conv_code
> > > > > > > > > > +                                          : NOP_EXPR),
> > > > > > > > > > +                                         orig[0],
> > > > > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > > > > >
> > > > > > > > > > I believe you should elide the last stmt for conv_code == ERROR_MARK,
> > > > > > > > > > that is, why did you need to add the && !insert check in the guarding condition
> > > > > > > > >
> > > > > > > > > When conv_code == ERROR_MARK, we still need
> > > > > > > > >
> > > > > > > > >        gimple *perm
> > > > > > > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])),
> > > > > > > > >                                    VEC_PERM_EXPR, orig[0], orig[1], op2);
> > > > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > > > > > > >                                           orig[0],
> > > > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > > > >
> > > > > > > > > Otherwise, scalar insert won't work.
> > > > > > > > >
> > > > > > > > > > (this path should already do the correct thing?).  Note that in all
> > > > > > > > > > cases it looks
> > > > > > > > > > that with conv_code != ERROR_MARK you may end up doing a float->int
> > > > > > > > > > or int->float conversion on a value it wasn't done on before which might
> > > > > > > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > > > > > > value we already do convert into the place we're going to insert to?
> > > > > > > > >
> > > > > > > > > This couldn't happen:
> > > > > > > > >
> > > > > > > > >       if (type == TREE_TYPE (ref))
> > > > > > > > >          {
> > > > > > > > >            /* The RHS vector has the same type as LHS.  */
> > > > > > > > >            if (rhs_vector == NULL)
> > > > > > > > >              rhs_vector = ref;
> > > > > > > > >            /* Check if all RHS vector elements come fome the same
> > > > > > > > >               vector.  */
> > > > > > > > >            if (rhs_vector == ref)
> > > > > > > > >              nvectors++;
> > > > > > > > >          }
> > > > > > > > > ...
> > > > > > > > >   if (insert
> > > > > > > > >       && (nvectors == 0
> > > > > > > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > > > > > > >               != (nscalars + nvectors))))
> > > > > > > > >     return false;
> > > > > > >
> > > > > > > I see - that looks like a missed case then?
> > > > > > >
> > > > > > >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> > > > > > >
> > > > > > > with integer vector v?
> > > > > >
> > > > > > True.
> > > > > >
> > > > > > > I'll have a look at the full patch next week (it's GCC 10 material in any case).
> > > > > > >
> > > > >
> > > > > Now looking again.  I still don't like the new "structure" of the loop
> > > > > very much.
> > > > > A refactoring like the attached should make it easier to clearly separate the
> > > > > cases where we reach a vector def and where not.
> > > >
> > > > Now attached.
> > > >
> > > > > Do you want me to take over the patch?
> > > > >
> > >
> >
> > Here is the updated patch on top of your patch plus my fix.
>
> Thanks - when doing the constant vector I was thinking of the following patch
> to handle your cases.  It doesn't use insertion but eventually leaves that
> to a separate transform.  Instead it handles non-constants similar to constants
> by permuting from a uniform vector.  Thus
>
> __attribute__((noinline, noclone))
> __v4sf
> foo2 (__v4sf x, float f)
> {
>   __v4sf y = { f, x[1], x[2], x[3] };
>   return y;
> }
>
> becomes
>
>   _4 = {f_2(D), f_2(D), f_2(D), f_2(D)};
>   y_3 = VEC_PERM_EXPR <x_5(D), _4, { 4, 1, 2, 3 }>;
>   return y_3;
>
> this allows us to handle an arbitrary number of inserts of this
> single value.  It also ensures we can actually perform the
> permutation while for the insertion we currently do not have
> a convenient way to query whether the target can perform
> it efficiently (IIRC x86 needs AVX to insert to arbitrary lanes
> with a single instruction?).  Similarly if the user writes the above
> in source using __builtin_shuffle we'd want to optimize it as well.
>
> The patch as attached only passes some of your testcases,
> the following FAIL:
>
> FAIL: gcc.target/i386/pr88828-2a.c scan-assembler movss
> FAIL: gcc.target/i386/pr88828-2a.c scan-assembler-not movaps
> FAIL: gcc.target/i386/pr88828-2a.c scan-assembler-not movlhps
> FAIL: gcc.target/i386/pr88828-2a.c scan-assembler-not unpcklps
> FAIL: gcc.target/i386/pr88828-2b.c scan-assembler-times vpermilps 1
> FAIL: gcc.target/i386/pr88828-2b.c scan-assembler-times vmovss 1
> FAIL: gcc.target/i386/pr88828-2c.c scan-assembler movss
> FAIL: gcc.target/i386/pr88828-2c.c scan-assembler-not movaps
> FAIL: gcc.target/i386/pr88828-2c.c scan-assembler-not movlhps
> FAIL: gcc.target/i386/pr88828-2c.c scan-assembler-not unpcklps
> FAIL: gcc.target/i386/pr88828-2d.c scan-assembler-times vpermilps 1
> FAIL: gcc.target/i386/pr88828-2d.c scan-assembler-times vmovss 1
> FAIL: gcc.target/i386/pr88828-3a.c scan-assembler movss
> FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-times shufps 2
> FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-times movaps 1
> FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-not movlhps
> FAIL: gcc.target/i386/pr88828-3a.c scan-assembler-not unpcklps
> FAIL: gcc.target/i386/pr88828-3b.c scan-assembler-times vpermilps 1
> FAIL: gcc.target/i386/pr88828-3b.c scan-assembler-times vinsertps 1
> FAIL: gcc.target/i386/pr88828-3b.c scan-assembler-not vshufps
> FAIL: gcc.target/i386/pr88828-3c.c scan-assembler-times vpermilps 1
> FAIL: gcc.target/i386/pr88828-3c.c scan-assembler-times vinsertps 1
> FAIL: gcc.target/i386/pr88828-3c.c scan-assembler-not vshufps
>
> Making the patch emit inserts for single insert locations is of course
> still possible but you get to arrive at heuristics like your choice
> of permuting the original lane into the later overwritten lane which
> might be a choice making the permute impossible or more expensive?
>
> The original purpose of simplify_vector_constructor was to simplify
> the IL, not so much optimal code-generation in the end but I wonder
> if we can rely on RTL expansion or later RTL optimization to do
> the optimal choices here?
>
> I guess simplify_permutation could perform a VEC_PERM
> into an insert if the remaining permutation would be a no-op
> but RTL optimization handles this case well already.
>
> Whether code-generation for a one vector permute plus insert or
> a two-vector permute is better in the end I don't know - at least
> the permute expansion has a chance to see the combined
> instruction.
>
> Do you think the remaining cases above can be handled in the
> backend?
>
> Comments?

I have now applied this after bootstrap / regtest on x86_64-unknown-linux-gnu
together with the part of the testcases that PASS.  Note I had to
add -fexcess-precision=standard to the -7.c one as it otherwise fails to
execute both patched and unpatched.

r271153.

I didn't check whether the remaining testcases simply need adjustments
(thus their code-gen is OK) or if there's something to do on the target
or in a GIMPLE transform.  That needs to be evaluated still.

There's also still the missed optimization of using VEC_UNPACK/PACK
codes for conversions.

Richard.

> Thanks,
> Richard.
>
> 2019-05-08  Richard Biener  <rguenther@suse.de>
>
>         PR tree-optimization/88828
>         * tree-ssa-forwprop.c (simplify_vector_constructor): Handle
>         permuting in a single non-constant element not extracted
>         from a vector.
>
>
> > --
> > H.J.
diff mbox series

Patch

From 65c61509182afceb4f1b35839e16adcf5c3503d8 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 5 Feb 2019 15:39:27 -0800
Subject: [PATCH] Optimize vector constructor

We can optimize vector constructor with vector copy or permute followed
by a single scalar insert:

  __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  y_6 = {f_5(D), _3, _2, _1};
  return y_6;

with

 __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;
  vector(4) float _8;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  _8 = x_9(D);
  y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
  return y_6;

gcc/

	PR tree-optimization/88828
	* tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
	vector init constructor with vector copy or permute followed
	by a single scalar insert.

gcc/testsuite/

	PR tree-optimization/88828
	* gcc.target/i386/pr88828-1.c: New test.
	* gcc.target/i386/pr88828-1a.c: Likewise.
	* gcc.target/i386/pr88828-1b.c: Likewise.
	* gcc.target/i386/pr88828-1c.c: Likewise.
	* gcc.target/i386/pr88828-2.c: Likewise.
	* gcc.target/i386/pr88828-2a.c: Likewise.
	* gcc.target/i386/pr88828-2b.c: Likewise.
	* gcc.target/i386/pr88828-2c.c: Likewise.
	* gcc.target/i386/pr88828-2d.c: Likewise.
	* gcc.target/i386/pr88828-3.c: Likewise.
	* gcc.target/i386/pr88828-3a.c: Likewise.
	* gcc.target/i386/pr88828-3b.c: Likewise.
	* gcc.target/i386/pr88828-3c.c: Likewise.
	* gcc.target/i386/pr88828-3d.c: Likewise.
	* gcc.target/i386/pr88828-4a.c: Likewise.
	* gcc.target/i386/pr88828-4b.c: Likewise.
	* gcc.target/i386/pr88828-5a.c: Likewise.
	* gcc.target/i386/pr88828-5b.c: Likewise.
	* gcc.target/i386/pr88828-6.c: Likewise.
	* gcc.target/i386/pr88828-6a.c: Likewise.
	* gcc.target/i386/pr88828-6b.c: Likewise.
	* gcc.target/i386/pr88828-7.c: Likewise.
	* gcc.target/i386/pr88828-7a.c: Likewise.
	* gcc.target/i386/pr88828-7b.c: Likewise.
	* gcc.target/i386/pr88828-8.c: Likewise.
	* gcc.target/i386/pr88828-8a.c: Likewise.
	* gcc.target/i386/pr88828-8b.c: Likewise.
	* gcc.target/i386/pr88828-9.c: Likewise.
	* gcc.target/i386/pr88828-9a.c: Likewise.
	* gcc.target/i386/pr88828-9b.c: Likewise.
---
 gcc/testsuite/gcc.target/i386/pr88828-1.c  | 49 +++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-1a.c | 17 ++++
 gcc/testsuite/gcc.target/i386/pr88828-1b.c | 23 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-1c.c | 18 ++++
 gcc/testsuite/gcc.target/i386/pr88828-2.c  | 51 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-2a.c | 17 ++++
 gcc/testsuite/gcc.target/i386/pr88828-2b.c | 19 +++++
 gcc/testsuite/gcc.target/i386/pr88828-2c.c | 23 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-2d.c | 25 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-3.c  | 54 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-3a.c | 17 ++++
 gcc/testsuite/gcc.target/i386/pr88828-3b.c | 19 +++++
 gcc/testsuite/gcc.target/i386/pr88828-3c.c | 25 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-4a.c | 18 ++++
 gcc/testsuite/gcc.target/i386/pr88828-4b.c | 21 +++++
 gcc/testsuite/gcc.target/i386/pr88828-5a.c | 18 ++++
 gcc/testsuite/gcc.target/i386/pr88828-5b.c | 20 +++++
 gcc/testsuite/gcc.target/i386/pr88828-6.c  | 47 +++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-6a.c | 16 ++++
 gcc/testsuite/gcc.target/i386/pr88828-6b.c | 22 +++++
 gcc/testsuite/gcc.target/i386/pr88828-7.c  | 53 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-7a.c | 16 ++++
 gcc/testsuite/gcc.target/i386/pr88828-7b.c | 22 +++++
 gcc/testsuite/gcc.target/i386/pr88828-8.c  | 46 +++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-8a.c | 15 ++++
 gcc/testsuite/gcc.target/i386/pr88828-8b.c | 21 +++++
 gcc/testsuite/gcc.target/i386/pr88828-9.c  | 46 +++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-9a.c | 16 ++++
 gcc/testsuite/gcc.target/i386/pr88828-9b.c | 23 ++++++
 gcc/tree-ssa-forwprop.c                    | 96 +++++++++++++++++++---
 30 files changed, 861 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2d.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-9a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-9b.c

diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c
new file mode 100644
index 00000000000..a15d1fea3f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c
@@ -0,0 +1,49 @@ 
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-1a.c"
+#include "pr88828-1b.c"
+#include "pr88828-1c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+  y = foo3 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1a.c b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
new file mode 100644
index 00000000000..d37b24c6661
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[1], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1b.c b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
new file mode 100644
index 00000000000..af4aced65f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  return vector_init (f, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1c.c b/gcc/testsuite/gcc.target/i386/pr88828-1c.c
new file mode 100644
index 00000000000..a117f3ec7b1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1c.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo3 (__v4sf x, float f)
+{
+  __v4sf y = x;
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
new file mode 100644
index 00000000000..011fd486bb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
@@ -0,0 +1,51 @@ 
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-2a.c"
+#include "pr88828-2c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else if (i == 1)
+      {
+	if (y[i] != f[0])
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2a.c b/gcc/testsuite/gcc.target/i386/pr88828-2a.c
new file mode 100644
index 00000000000..85e49535ebd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2a.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2b.c b/gcc/testsuite/gcc.target/i386/pr88828-2b.c
new file mode 100644
index 00000000000..adfd7002a4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2b.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2c.c b/gcc/testsuite/gcc.target/i386/pr88828-2c.c
new file mode 100644
index 00000000000..149967ea0b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2c.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2d.c b/gcc/testsuite/gcc.target/i386/pr88828-2d.c
new file mode 100644
index 00000000000..21088496730
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2d.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3.c b/gcc/testsuite/gcc.target/i386/pr88828-3.c
new file mode 100644
index 00000000000..adbc46dbf3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3.c
@@ -0,0 +1,54 @@ 
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-3a.c"
+#include "pr88828-3b.c"
+#include "pr88828-3c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 3)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else if (i == 0)
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i + 1])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+  y = foo3 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
new file mode 100644
index 00000000000..e5cb95c1275
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
new file mode 100644
index 00000000000..0349f35b08a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3c.c b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
new file mode 100644
index 00000000000..fb668a55f1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo3 (__v4sf x, float f)
+{
+  return vector_init (x[0], x[2], x[3], f);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
new file mode 100644
index 00000000000..64043b9855f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
new file mode 100644
index 00000000000..ad8d2b985d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
new file mode 100644
index 00000000000..5e908faef5c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
new file mode 100644
index 00000000000..988a48823e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6.c b/gcc/testsuite/gcc.target/i386/pr88828-6.c
new file mode 100644
index 00000000000..8d920396896
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6.c
@@ -0,0 +1,47 @@ 
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-6a.c"
+#include "pr88828-6b.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf x, float f1[4], float f2[4])
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (x[i] != (f1[i] + f2[i]))
+	  abort ();
+      }
+    else
+      {
+	if (x[i] != f1[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f1[4] = { -11, 2, 55553, -4 };
+  float f2[4] = { 111, 3.3, -55.553, 4.8 };
+  __v4sf x = { f1[0], f1[1], f1[2], f1[3] };
+  __v4sf y = { f2[0], f2[1], f2[2], f2[3] };
+  __v4sf z;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f1[i] || y[i] != f2[i] )
+      abort ();
+
+  z = foo1 (x, y);
+  do_check (z, f1, f2);
+  x = foo2 (x, y);
+  do_check (z, f1, f2);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
new file mode 100644
index 00000000000..4094f25a1fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "addss" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo1 (__v4sf x, __v4sf y)
+{
+  __v4sf z = { x[0] + y[0], x[1], x[2], x[3] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
new file mode 100644
index 00000000000..a423a089963
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "addss" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo2 (__v4sf x, __v4sf y)
+{
+  return vector_init (x[0] + y[0], x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7.c b/gcc/testsuite/gcc.target/i386/pr88828-7.c
new file mode 100644
index 00000000000..471028d417d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7.c
@@ -0,0 +1,53 @@ 
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-7a.c"
+#include "pr88828-7b.c"
+
+extern void abort ();
+
+float
+bar (float x, float y)
+{
+  return x / y - y * x;
+}
+
+void
+do_check (__v4sf x, float f1[4], float f2[4])
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (x[i] != bar (f1[i], f2[i]))
+	  abort ();
+      }
+    else
+      {
+	if (x[i] != f1[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f1[4] = { -11, 2, 55553, -4 };
+  float f2[4] = { 111, 3.3, -55.553, 4.8 };
+  __v4sf x = { f1[0], f1[1], f1[2], f1[3] };
+  __v4sf y = { f2[0], f2[1], f2[2], f2[3] };
+  __v4sf z;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f1[i] || y[i] != f2[i] )
+      abort ();
+
+  z = foo1 (x, y);
+  do_check (z, f1, f2);
+  x = foo2 (x, y);
+  do_check (z, f1, f2);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7a.c b/gcc/testsuite/gcc.target/i386/pr88828-7a.c
new file mode 100644
index 00000000000..f1ae57422d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7a.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+extern float bar (float, float);
+
+__v4sf
+foo1 (__v4sf x, __v4sf y)
+{
+  __v4sf z = { bar (x[0], y[0]), x[1], x[2], x[3] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7b.c b/gcc/testsuite/gcc.target/i386/pr88828-7b.c
new file mode 100644
index 00000000000..c027c56948d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7b.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+extern float bar (float, float);
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo2 (__v4sf x, __v4sf y)
+{
+  return vector_init (bar (x[0], y[0]), x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8.c b/gcc/testsuite/gcc.target/i386/pr88828-8.c
new file mode 100644
index 00000000000..3b8eabd225f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-8.c
@@ -0,0 +1,46 @@ 
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-8a.c"
+#include "pr88828-8b.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 11.4;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x);
+  do_check (y, f, z);
+  y = foo2 (x);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8a.c b/gcc/testsuite/gcc.target/i386/pr88828-8a.c
new file mode 100644
index 00000000000..5d383dfd081
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-8a.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo1 (__v4sf x)
+{
+  __v4sf z = { 11.4, x[1], x[2], x[3] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8b.c b/gcc/testsuite/gcc.target/i386/pr88828-8b.c
new file mode 100644
index 00000000000..5ffbc9c3103
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-8b.c
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo2 (__v4sf x)
+{
+  return vector_init (11.4, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9.c b/gcc/testsuite/gcc.target/i386/pr88828-9.c
new file mode 100644
index 00000000000..c33907b4a6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-9.c
@@ -0,0 +1,46 @@ 
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-9a.c"
+#include "pr88828-9b.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 11.4;
+  __m128 x = (__m128) (__v4sf) { f[0], f[1], f[2], f[3] };
+  __m128 y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x);
+  do_check (y, f, z);
+  y = foo2 (x);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9a.c b/gcc/testsuite/gcc.target/i386/pr88828-9a.c
new file mode 100644
index 00000000000..7f830657732
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-9a.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+__m128
+foo1 (__m128 x)
+{
+  __v4sf z = { 11.4, ((__v4sf) x)[1], ((__v4sf) x)[2], ((__v4sf) x) [3] };
+  return (__m128) z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9b.c b/gcc/testsuite/gcc.target/i386/pr88828-9b.c
new file mode 100644
index 00000000000..6588ad15a9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-9b.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+static __m128
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return (__m128) y;
+}
+
+__m128
+foo2 (__m128 x)
+{
+  return vector_init (11.4, ((__v4sf) x)[1], ((__v4sf) x)[2],
+		      ((__v4sf) x) [3]);
+}
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index eeb6281c652..32a3af5687e 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2008,7 +2008,7 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
   unsigned elem_size, i;
   unsigned HOST_WIDE_INT nelts;
   enum tree_code code, conv_code;
-  constructor_elt *elt;
+  constructor_elt *ce;
   bool maybe_ident;
 
   gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR);
@@ -2027,18 +2027,38 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
   orig[1] = NULL;
   conv_code = ERROR_MARK;
   maybe_ident = true;
-  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
+
+  tree rhs_vector = NULL;
+  /* The single scalar element.  */
+  tree scalar_element = NULL;
+  unsigned int scalar_idx = 0;
+  bool insert = false;
+  unsigned int nscalars = 0;
+  unsigned int nvectors = 0;
+  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce)
     {
       tree ref, op1;
 
       if (i >= nelts)
 	return false;
 
-      if (TREE_CODE (elt->value) != SSA_NAME)
-	return false;
-      def_stmt = get_prop_source_stmt (elt->value, false, NULL);
+      if (TREE_CODE (ce->value) == SSA_NAME)
+	def_stmt = get_prop_source_stmt (ce->value, false, NULL);
+      else
+	def_stmt = NULL;
       if (!def_stmt)
-	return false;
+	{
+	  /* Only allow one scalar insert.  */
+	  if (nscalars != 0)
+	    return false;
+
+	  nscalars = 1;
+	  insert = true;
+	  scalar_idx = i;
+	  sel.quick_push (i);
+	  scalar_element = ce->value;
+	  continue;
+	}
       code = gimple_assign_rhs_code (def_stmt);
       if (code == FLOAT_EXPR
 	  || code == FIX_TRUNC_EXPR)
@@ -2046,7 +2066,7 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  op1 = gimple_assign_rhs1 (def_stmt);
 	  if (conv_code == ERROR_MARK)
 	    {
-	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))),
+	      if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))),
 			    GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
 		return false;
 	      conv_code = code;
@@ -2061,7 +2081,18 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  code = gimple_assign_rhs_code (def_stmt);
 	}
       if (code != BIT_FIELD_REF)
-	return false;
+	{
+	  /* Only allow one scalar insert.  */
+	  if (nscalars != 0)
+	    return false;
+
+	  nscalars = 1;
+	  insert = true;
+	  scalar_idx = i;
+	  sel.quick_push (i);
+	  scalar_element = ce->value;
+	  continue;
+	}
       op1 = gimple_assign_rhs1 (def_stmt);
       ref = TREE_OPERAND (op1, 0);
       unsigned int j;
@@ -2095,11 +2126,29 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	elt += nelts;
       if (elt != i)
 	maybe_ident = false;
+
+      if (useless_type_conversion_p (type, TREE_TYPE (ref)))
+	 {
+	   /* The RHS vector has the same type as LHS.  */
+	   if (rhs_vector == NULL)
+	     rhs_vector = ref;
+	   /* Check if all RHS vector elements come fome the same
+	      vector.  */
+	   if (rhs_vector == ref)
+	     nvectors++;
+	 }
+
       sel.quick_push (elt);
     }
   if (i < nelts)
     return false;
 
+  if (insert
+      && (nvectors == 0
+	  || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
+	      != (nscalars + nvectors))))
+    return false;
+
   if (! VECTOR_TYPE_P (TREE_TYPE (orig[0]))
       || maybe_ne (TYPE_VECTOR_SUBPARTS (type),
 		   TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0]))))
@@ -2127,18 +2176,26 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 
       vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts);
       if (!can_vec_perm_const_p (TYPE_MODE (type), indices))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       mask_type
 	= build_vector_type (build_nonstandard_integer_type (elem_size, 1),
 			     nelts);
       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT
 	  || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)),
 		       GET_MODE_SIZE (TYPE_MODE (type))))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       op2 = vec_perm_indices_to_tree (mask_type, indices);
       if (!orig[1])
 	orig[1] = orig[0];
-      if (conv_code == ERROR_MARK)
+      if (conv_code == ERROR_MARK && !insert)
 	gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0],
 					orig[1], op2);
       else
@@ -2148,10 +2205,25 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
 				   VEC_PERM_EXPR, orig[0], orig[1], op2);
 	  orig[0] = gimple_assign_lhs (perm);
 	  gsi_insert_before (gsi, perm, GSI_SAME_STMT);
-	  gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
+	  gimple_assign_set_rhs_with_ops (gsi,
+					  (conv_code != ERROR_MARK
+					   ? conv_code
+					   : NOP_EXPR),
+					  orig[0],
 					  NULL_TREE, NULL_TREE);
 	}
     }
+  if (insert)
+    {
+      /* Generate a single scalar insert.  */
+      tree var = make_ssa_name (type);
+      gimple *copy = gimple_copy (gsi_stmt (*gsi));
+      gimple_assign_set_lhs (copy, var);
+      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
+      tree bitpos = bitsize_int (scalar_idx * elem_size);
+      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
+				      scalar_element, bitpos);
+    }
   update_stmt (gsi_stmt (*gsi));
   return true;
 }
-- 
2.20.1