
Strip of a vector load which is only used partially.

Message ID 20220505050437.86261-1-hongtao.liu@intel.com
State New
Series Strip of a vector load which is only used partially.

Commit Message

Liu, Hongtao May 5, 2022, 5:04 a.m. UTC
Optimize

  _1 = *srcp_3(D);
  _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
  _5 = BIT_FIELD_REF <_4, 128, 0>;

to

  _1 = *srcp_3(D);
  _5 = BIT_FIELD_REF <_1, 128, 128>;

The above will eventually be optimized to

  _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
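
For reference, roughly the kind of C source that produces the GIMPLE above
(a minimal sketch, not the actual testcase, which is
gcc.target/i386/pr102583.c below):

  typedef int v8si __attribute__((vector_size (32)));
  typedef int v4si __attribute__((vector_size (16)));

  v4si
  upper_half (v8si *srcp)
  {
    v8si src = *srcp;
    /* Elements 4-7 are the upper 128 bits of the 256-bit load.  */
    return (v4si) { src[4], src[5], src[6], src[7] };
  }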

Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR tree-optimization/102583
	* gimple.h (gate_optimize_vector_load): Declare.
	* match.pd: Simplify (BIT_FIELD_REF (vec_perm *p *p { 4, 5, 6,
	7, 4, 5, 6, 7 }) 128 0) to (BIT_FIELD_REF *p 128 128).
	* tree-ssa-forwprop.cc (gate_optimize_vector_load): New
	function.
	(pass_forwprop::execute): Move the condition checks into the
	new function above.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr102583.c: New test.
---
 gcc/gimple.h                             |  1 +
 gcc/match.pd                             | 56 ++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr102583.c | 30 +++++++++++++
 gcc/tree-ssa-forwprop.cc                 | 32 +++++++++-----
 4 files changed, 109 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c

Comments

Richard Biener May 5, 2022, 8:26 a.m. UTC | #1
On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> Optimize
>
>   _1 = *srcp_3(D);
>   _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
>   _5 = BIT_FIELD_REF <_4, 128, 0>;
>
> to
>
>   _1 = *srcp_3(D);
>   _5 = BIT_FIELD_REF <_1, 128, 128>;
>
> the upper will finally be optimized to
>
> _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
> Ok for trunk?

Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already
handle this in the

  if (code == VEC_PERM_EXPR
      && constant_multiple_p (bit_field_offset (op), size, &idx))
    {

part of the code - maybe that needs to be enhanced to cover
a contiguous stride in the VEC_PERM_EXPR.  I see
we have

  size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
  if (maybe_ne (bit_field_size (op), size))
    return false;

where it will currently bail, so adjust that to check for a
constant multiple.  I also think we should only handle the
case where the new bit_field_offset alignment is not
worse than the original one.
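
Concretely, the contiguity check amounts to something like the following
standalone sketch (illustrative only, not existing GCC code; sel stands for
the VEC_PERM_EXPR selector, first for the element index derived from the
BIT_FIELD_REF offset (idx in the snippet above), nelts for the number of
extracted elements; the alignment comparison is left out):

  /* Return true if sel[first] .. sel[first + nelts - 1] pick consecutive
     source elements, so the extraction can be rewritten as a single
     BIT_FIELD_REF on the permute input starting at element sel[first].  */
  static bool
  contiguous_selection_p (const unsigned *sel, unsigned first, unsigned nelts)
  {
    for (unsigned i = 1; i < nelts; ++i)
      if (sel[first + i] != sel[first] + i)
        return false;
    return true;
  }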

That said, I'd prefer if you integrate this transform with
simplify_bitfield_ref.

Richard.

>
> gcc/ChangeLog:
>
>         PR tree-optimization/102583
>         * gimple.h (gate_optimize_vector_load): Declare.
>         * match.pd: Simplify (BIT_FIELD_REF (vec_perm *p *p { 4, 5, 6,
>         7, 4, 5, 6, 7 }) 128 0) to (BIT_FIELD_REF *p 128 128).
>         * tree-ssa-forwprop.cc (gate_optimize_vector_load): New
>         function.
>         (pass_forwprop::execute): Put condition codes in the upper new
>         function.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr102583.c: New test.
> ---
>  gcc/gimple.h                             |  1 +
>  gcc/match.pd                             | 56 ++++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr102583.c | 30 +++++++++++++
>  gcc/tree-ssa-forwprop.cc                 | 32 +++++++++-----
>  4 files changed, 109 insertions(+), 10 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c
>
> diff --git a/gcc/gimple.h b/gcc/gimple.h
> index 6b1e89ad74e..1747dae1193 100644
> --- a/gcc/gimple.h
> +++ b/gcc/gimple.h
> @@ -1638,6 +1638,7 @@ extern void maybe_remove_unused_call_args (struct function *, gimple *);
>  extern bool gimple_inexpensive_call_p (gcall *);
>  extern bool stmt_can_terminate_bb_p (gimple *);
>  extern location_t gimple_or_expr_nonartificial_location (gimple *, tree);
> +extern bool gate_optimize_vector_load (gimple *);
>
>  /* Return the disposition for a warning (or all warnings by default)
>     for a statement.  */
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 6d691d302b3..ac214310251 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -6832,6 +6832,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>         }
>         (cmp @0 { res; })))))))))
>
> +#if GIMPLE
> +/* Simplify partial vector access, transform
> +
> +   V8SI A;
> +   V4SI B;
> +   A = *PA;
> +   B = VEC_PERM_EXPR (A, A, { 4, 5, 6, 7, 4, 5, 6, 7 });
> +   C = BIT_FIELD_REF (B, 128, 0)
> +
> +to
> +
> +   A = *PA;
> +   C = BIT_FIELD_REF (A, 128, 128);
> +
> +optimize_vector_load will eventually optimize the above to
> +
> +   C = BIT_FIELD_REF (*PA, 128, 128);  */
> +
> +(simplify
> + (BIT_FIELD_REF (vec_perm@2 SSA_NAME@0 @0 VECTOR_CST@1) @rsize @rpos)
> + (if (VECTOR_TYPE_P (type)
> +     && TYPE_MODE (type) != BLKmode
> +     && single_use (@2)
> +     && gate_optimize_vector_load (SSA_NAME_DEF_STMT (@0))
> +     && types_match (TREE_TYPE (type), TREE_TYPE (TREE_TYPE (@0))))
> +  (with
> +   {
> +     unsigned HOST_WIDE_INT nelts = -1;
> +     if (!VECTOR_CST_NELTS (@1).is_constant (&nelts))
> +       return NULL_TREE;
> +     tree inner_type = TREE_TYPE (type);
> +     unsigned HOST_WIDE_INT elt_w = tree_to_uhwi (TYPE_SIZE (inner_type));
> +     unsigned HOST_WIDE_INT pos = tree_to_uhwi (@rpos);
> +     unsigned HOST_WIDE_INT size = tree_to_uhwi (@rsize);
> +     unsigned HOST_WIDE_INT start
> +       = tree_to_uhwi (vector_cst_elt (@1, pos / elt_w));
> +
> +     for (unsigned HOST_WIDE_INT i  = pos / elt_w + 1; i != size / elt_w; i++)
> +       {
> +        /* Continuous area.  */
> +        if (tree_to_uhwi (vector_cst_elt (@1, i)) - 1
> +            != tree_to_uhwi (vector_cst_elt (@1, i - 1)))
> +          return NULL_TREE;
> +       }
> +
> +     /* Aligned or support movmisalign_optab.  */
> +     unsigned HOST_WIDE_INT dest_align = tree_to_uhwi (TYPE_SIZE (type));
> +     if ((TYPE_ALIGN (TREE_TYPE (@0)) % dest_align
> +         || start * elt_w % dest_align)
> +       && (optab_handler (movmisalign_optab, TYPE_MODE (type))
> +           == CODE_FOR_nothing))
> +       return NULL_TREE;
> +   }
> +   (BIT_FIELD_REF @0 @rsize { bitsize_int (start * elt_w); }))))
> +#endif
> +
>  /* Canonicalizations of BIT_FIELD_REFs.  */
>
>  (simplify
> diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c
> new file mode 100644
> index 00000000000..ff2ffb5e671
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102583.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+20\(%.*%ymm} 1 } } */
> +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+8\(%.*%xmm} 1 } } */
> +/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */
> +
> +typedef int v16si __attribute__((vector_size(64)));
> +typedef float v8sf __attribute__((vector_size(32)));
> +typedef float v4sf __attribute__((vector_size(16)));
> +typedef float v2sf __attribute__((vector_size(8)));
> +
> +v8sf part (v16si *srcp)
> +{
> +  v16si src = *srcp;
> +  return (v8sf) { (float)src[5], (float)src[6], (float)src[7], (float)src[8],
> +      (float)src[9], (float)src[10], (float)src[11], (float)src[12] };
> +}
> +
> +v4sf part1 (v16si *srcp)
> +{
> +  v16si src = *srcp;
> +  return (v4sf) { (float)src[2], (float)src[3], (float)src[4], (float)src[5] };
> +}
> +
> +v2sf part2 (v16si *srcp)
> +{
> +  v16si src = *srcp;
> +  return (v2sf) { (float)src[4], (float)src[5] };
> +}
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 484491fa1c5..2c8d8bc6dce 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -3074,6 +3074,27 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>    return true;
>  }
>
> +/* Gate for optimize_vector_load.  */
> +bool
> +gate_optimize_vector_load (gimple* stmt)
> +{
> +  if (!is_gimple_assign (stmt))
> +    return false;
> +
> +  tree lhs = gimple_assign_lhs (stmt);
> +  tree rhs = gimple_assign_rhs1 (stmt);
> +  return (cfun
> +         && TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
> +         && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
> +             /* After vector lowering rewrite all loads, but
> +                initially do not since this conflicts with
> +                vector CONSTRUCTOR to shuffle optimization.  */
> +             || (cfun->curr_properties & PROP_gimple_lvec))
> +         && gimple_assign_load_p (stmt)
> +         && !gimple_has_volatile_ops (stmt)
> +         && !stmt_can_throw_internal (cfun, stmt)
> +         && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs)));
> +}
>
>  /* Rewrite the vector load at *GSI to component-wise loads if the load
>     is only used in BIT_FIELD_REF extractions with eventual intermediate
> @@ -3500,16 +3521,7 @@ pass_forwprop::execute (function *fun)
>               else
>                 gsi_next (&gsi);
>             }
> -         else if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
> -                  && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
> -                      /* After vector lowering rewrite all loads, but
> -                         initially do not since this conflicts with
> -                         vector CONSTRUCTOR to shuffle optimization.  */
> -                      || (fun->curr_properties & PROP_gimple_lvec))
> -                  && gimple_assign_load_p (stmt)
> -                  && !gimple_has_volatile_ops (stmt)
> -                  && !stmt_can_throw_internal (cfun, stmt)
> -                  && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs)))
> +         else if (gate_optimize_vector_load (stmt))
>             optimize_vector_load (&gsi);
>
>           else if (code == COMPLEX_EXPR)
> --
> 2.18.1
>
Jeff Law May 9, 2022, 10:57 p.m. UTC | #2
On 5/5/2022 2:26 AM, Richard Biener via Gcc-patches wrote:
> On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao.liu@intel.com> wrote:
>> Optimize
>>
>>    _1 = *srcp_3(D);
>>    _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
>>    _5 = BIT_FIELD_REF <_4, 128, 0>;
>>
>> to
>>
>>    _1 = *srcp_3(D);
>>    _5 = BIT_FIELD_REF <_1, 128, 128>;
>>
>> the upper will finally be optimized to
>>
>> _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
>>
>> Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
>> Ok for trunk?
> Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already
> handle this in the
>
>    if (code == VEC_PERM_EXPR
>        && constant_multiple_p (bit_field_offset (op), size, &idx))
>      {
>
> part of the code - maybe that needs to be enhanced to cover
> a contiguous stride in the VEC_PERM_EXPR.  I see
> we have
>
>    size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
>    if (maybe_ne (bit_field_size (op), size))
>      return false;
>
> where it will currently bail, so adjust that to check for a
> constant multiple.  I also think we should only handle the
> case where the new bit_field_offset alignment is not
> worse than the original one.
>
> That said, I'd prefer if you integrate this transform with
> simplify_bitfield_ref.
I've got a hack here that tries to do something similar, but it's trying
to catch the case where a CONSTRUCTOR feeds the BIT_FIELD_REF.  It walks
the CONSTRUCTOR elements to see if an element has the right offset/size
to satisfy the BIT_FIELD_REF.  For x264 we're often able to eliminate the
VEC_PERM_EXPR entirely and just forward operands into the BIT_FIELD_REF.
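
(For illustration, the shape being caught is roughly

   _5 = {a_1(D), b_2(D), c_3(D), d_4(D)};
   _6 = BIT_FIELD_REF <_5, 32, 64>;

where element c_3(D) has exactly the offset and size the BIT_FIELD_REF
asks for, so _6 can be forwarded to c_3(D) directly; illustrative GIMPLE,
not taken from that patch.)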

I was leaning towards moving those bits into match.pd before submitting, 
but if you'd prefer them in tree-ssa-forwprop, that's even easier.

Jeff
Richard Biener May 10, 2022, 6:30 a.m. UTC | #3
On Tue, May 10, 2022 at 12:58 AM Jeff Law via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
>
> On 5/5/2022 2:26 AM, Richard Biener via Gcc-patches wrote:
> > On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao.liu@intel.com> wrote:
> >> Optimize
> >>
> >>    _1 = *srcp_3(D);
> >>    _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
> >>    _5 = BIT_FIELD_REF <_4, 128, 0>;
> >>
> >> to
> >>
> >>    _1 = *srcp_3(D);
> >>    _5 = BIT_FIELD_REF <_1, 128, 128>;
> >>
> >> the upper will finally be optimized to
> >>
> >> _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
> >>
> >> Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
> >> Ok for trunk?
> > Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already
> > handle this in the
> >
> >    if (code == VEC_PERM_EXPR
> >        && constant_multiple_p (bit_field_offset (op), size, &idx))
> >      {
> >
> > part of the code - maybe that needs to be enhanced to cover
> > a contiguous stride in the VEC_PERM_EXPR.  I see
> > we have
> >
> >    size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> >    if (maybe_ne (bit_field_size (op), size))
> >      return false;
> >
> > where it will currently bail, so adjust that to check for a
> > constant multiple.  I also think we should only handle the
> > case where the new bit_field_offset alignment is not
> > worse than the original one.
> >
> > That said, I'd prefer if you integrate this transform with
> > simplify_bitfield_ref.
> I've got a hack here that tries to do something similar, but it's trying
> to catch the case where we CONSTRUCTOR feeds the BIT_FIELD_REF.  It
> walks the CONSTRUCTOR elements to see if an element has the right
> offset/size to satisify the BIT_FIELD_REF. For x264 we're often able to
> eliminate the VEC_PERMUTE entirely and just forward operands into the
> BIT_FIELD_REF.
>
> I was leaning towards moving those bits into match.pd before submitting,
> but if you'd prefer them in tree-ssa-forwprop, that's even easier.

I think when deciding where to put things it's important to look at where
related transforms reside.  We already have a
(simplify (BIT_FIELD_REF CONSTRUCTOR@ ...)) pattern which should handle
your case as well.  So instead of adding something new it would be nice to
figure out why it doesn't handle the case you are interested in and
eventually just adjust the existing pattern.

In the case of the above patch there isn't a match.pd pattern for this yet but
forwprop already has code to match bit-field-refs with vec-perms, so that's the
reason I preferred extending that.  But of course the whole thing could live in
match.pd as well.

Richard.

> Jeff
>
>
Jeff Law June 26, 2022, 6:59 p.m. UTC | #4
On 5/10/2022 12:30 AM, Richard Biener wrote:
> On Tue, May 10, 2022 at 12:58 AM Jeff Law via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>>
>> On 5/5/2022 2:26 AM, Richard Biener via Gcc-patches wrote:
>>> On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao.liu@intel.com> wrote:
>>>> Optimize
>>>>
>>>>     _1 = *srcp_3(D);
>>>>     _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
>>>>     _5 = BIT_FIELD_REF <_4, 128, 0>;
>>>>
>>>> to
>>>>
>>>>     _1 = *srcp_3(D);
>>>>     _5 = BIT_FIELD_REF <_1, 128, 128>;
>>>>
>>>> the upper will finally be optimized to
>>>>
>>>> _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
>>>>
>>>> Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
>>>> Ok for trunk?
>>> Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already
>>> handle this in the
>>>
>>>     if (code == VEC_PERM_EXPR
>>>         && constant_multiple_p (bit_field_offset (op), size, &idx))
>>>       {
>>>
>>> part of the code - maybe that needs to be enhanced to cover
>>> a contiguous stride in the VEC_PERM_EXPR.  I see
>>> we have
>>>
>>>     size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
>>>     if (maybe_ne (bit_field_size (op), size))
>>>       return false;
>>>
>>> where it will currently bail, so adjust that to check for a
>>> constant multiple.  I also think we should only handle the
>>> case where the new bit_field_offset alignment is not
>>> worse than the original one.
>>>
>>> That said, I'd prefer if you integrate this transform with
>>> simplify_bitfield_ref.
>> I've got a hack here that tries to do something similar, but it's trying
>> to catch the case where we CONSTRUCTOR feeds the BIT_FIELD_REF.  It
>> walks the CONSTRUCTOR elements to see if an element has the right
>> offset/size to satisify the BIT_FIELD_REF. For x264 we're often able to
>> eliminate the VEC_PERMUTE entirely and just forward operands into the
>> BIT_FIELD_REF.
>>
>> I was leaning towards moving those bits into match.pd before submitting,
>> but if you'd prefer them in tree-ssa-forwprop, that's even easier.
> I think when deciding where to put things it's important to look where related
> transforms reside.  We already do have a (simplify (BIT_FIELD_REF
> CONSTRUCTOR@ ...))
> pattern which should also handle your case already.  So instead of
> adding something
> new it would be nice to figure why it doesn't handle the case you are
> interested in and
> eventually just adjust the existing pattern.
I'm aware of that pattern.  I've found it painfully inadequate in every 
case I've looked at.    In general I've found tree-ssa-forwprop is a 
reasonable place to prototype a lot of stuff to see how it works in 
practice, but I think match.pd is better for most of the transformations 
in the long term.

It sounds like you'd prefer this particular case to move into match.pd.  
Fine.  That's what I'd originally planned to do.  It's pretty simple 
support code, so doing it in match.pd shouldn't be too hard.

Jeff

Patch

diff --git a/gcc/gimple.h b/gcc/gimple.h
index 6b1e89ad74e..1747dae1193 100644
--- a/gcc/gimple.h
+++ b/gcc/gimple.h
@@ -1638,6 +1638,7 @@  extern void maybe_remove_unused_call_args (struct function *, gimple *);
 extern bool gimple_inexpensive_call_p (gcall *);
 extern bool stmt_can_terminate_bb_p (gimple *);
 extern location_t gimple_or_expr_nonartificial_location (gimple *, tree);
+extern bool gate_optimize_vector_load (gimple *);
 
 /* Return the disposition for a warning (or all warnings by default)
    for a statement.  */
diff --git a/gcc/match.pd b/gcc/match.pd
index 6d691d302b3..ac214310251 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6832,6 +6832,62 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 	}
 	(cmp @0 { res; })))))))))
 
+#if GIMPLE
+/* Simplify partial vector access, transform
+
+   V8SI A;
+   V4SI B;
+   A = *PA;
+   B = VEC_PERM_EXPR (A, A, { 4, 5, 6, 7, 4, 5, 6, 7 });
+   C = BIT_FIELD_REF (B, 128, 0)
+
+to
+
+   A = *PA;
+   C = BIT_FIELD_REF (A, 128, 128);
+
+optimize_vector_load will eventually optimize the above to
+
+   C = BIT_FIELD_REF (*PA, 128, 128);  */
+
+(simplify
+ (BIT_FIELD_REF (vec_perm@2 SSA_NAME@0 @0 VECTOR_CST@1) @rsize @rpos)
+ (if (VECTOR_TYPE_P (type)
+     && TYPE_MODE (type) != BLKmode
+     && single_use (@2)
+     && gate_optimize_vector_load (SSA_NAME_DEF_STMT (@0))
+     && types_match (TREE_TYPE (type), TREE_TYPE (TREE_TYPE (@0))))
+  (with
+   {
+     unsigned HOST_WIDE_INT nelts = -1;
+     if (!VECTOR_CST_NELTS (@1).is_constant (&nelts))
+       return NULL_TREE;
+     tree inner_type = TREE_TYPE (type);
+     unsigned HOST_WIDE_INT elt_w = tree_to_uhwi (TYPE_SIZE (inner_type));
+     unsigned HOST_WIDE_INT pos = tree_to_uhwi (@rpos);
+     unsigned HOST_WIDE_INT size = tree_to_uhwi (@rsize);
+     unsigned HOST_WIDE_INT start
+       = tree_to_uhwi (vector_cst_elt (@1, pos / elt_w));
+
+     for (unsigned HOST_WIDE_INT i  = pos / elt_w + 1; i != size / elt_w; i++)
+       {
+	 /* Continuous area.  */
+	 if (tree_to_uhwi (vector_cst_elt (@1, i)) - 1
+	     != tree_to_uhwi (vector_cst_elt (@1, i - 1)))
+	   return NULL_TREE;
+       }
+
+     /* Aligned or support movmisalign_optab.  */
+     unsigned HOST_WIDE_INT dest_align = tree_to_uhwi (TYPE_SIZE (type));
+     if ((TYPE_ALIGN (TREE_TYPE (@0)) % dest_align
+	  || start * elt_w % dest_align)
+	&& (optab_handler (movmisalign_optab, TYPE_MODE (type))
+	    == CODE_FOR_nothing))
+       return NULL_TREE;
+   }
+   (BIT_FIELD_REF @0 @rsize { bitsize_int (start * elt_w); }))))
+#endif
+
 /* Canonicalizations of BIT_FIELD_REFs.  */
 
 (simplify
diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c
new file mode 100644
index 00000000000..ff2ffb5e671
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102583.c
@@ -0,0 +1,30 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+20\(%.*%ymm} 1 } } */
+/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+8\(%.*%xmm} 1 } } */
+/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v2sf __attribute__((vector_size(8)));
+
+v8sf part (v16si *srcp)
+{
+  v16si src = *srcp;
+  return (v8sf) { (float)src[5], (float)src[6], (float)src[7], (float)src[8],
+      (float)src[9], (float)src[10], (float)src[11], (float)src[12] };
+}
+
+v4sf part1 (v16si *srcp)
+{
+  v16si src = *srcp;
+  return (v4sf) { (float)src[2], (float)src[3], (float)src[4], (float)src[5] };
+}
+
+v2sf part2 (v16si *srcp)
+{
+  v16si src = *srcp;
+  return (v2sf) { (float)src[4], (float)src[5] };
+}
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 484491fa1c5..2c8d8bc6dce 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -3074,6 +3074,27 @@  simplify_vector_constructor (gimple_stmt_iterator *gsi)
   return true;
 }
 
+/* Gate for optimize_vector_load.  */
+bool
+gate_optimize_vector_load (gimple* stmt)
+{
+  if (!is_gimple_assign (stmt))
+    return false;
+
+  tree lhs = gimple_assign_lhs (stmt);
+  tree rhs = gimple_assign_rhs1 (stmt);
+  return (cfun
+	  && TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
+	  && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
+	      /* After vector lowering rewrite all loads, but
+		 initially do not since this conflicts with
+		 vector CONSTRUCTOR to shuffle optimization.  */
+	      || (cfun->curr_properties & PROP_gimple_lvec))
+	  && gimple_assign_load_p (stmt)
+	  && !gimple_has_volatile_ops (stmt)
+	  && !stmt_can_throw_internal (cfun, stmt)
+	  && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs)));
+}
 
 /* Rewrite the vector load at *GSI to component-wise loads if the load
    is only used in BIT_FIELD_REF extractions with eventual intermediate
@@ -3500,16 +3521,7 @@  pass_forwprop::execute (function *fun)
 	      else
 		gsi_next (&gsi);
 	    }
-	  else if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
-		   && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
-		       /* After vector lowering rewrite all loads, but
-			  initially do not since this conflicts with
-			  vector CONSTRUCTOR to shuffle optimization.  */
-		       || (fun->curr_properties & PROP_gimple_lvec))
-		   && gimple_assign_load_p (stmt)
-		   && !gimple_has_volatile_ops (stmt)
-		   && !stmt_can_throw_internal (cfun, stmt)
-		   && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs)))
+	  else if (gate_optimize_vector_load (stmt))
 	    optimize_vector_load (&gsi);
 
 	  else if (code == COMPLEX_EXPR)