diff mbox series

store-merging: Handle vector CONSTRUCTORs using bswap [PR96239]

Message ID 20201217145117.GS3788@tucnak
State New
Headers show
Series store-merging: Handle vector CONSTRUCTORs using bswap [PR96239] | expand

Commit Message

Jakub Jelinek Dec. 17, 2020, 2:51 p.m. UTC
On Wed, Dec 16, 2020 at 09:29:31AM +0100, Richard Biener wrote:
> I think it probably makes sense to have some helper split out that
> collects & classifies vector constructor components we can use from
> both forwprop (where matching the V_C_E from integer could be done
> as well IMHO) and bswap (when a permute is involved) and store-merging.

I've tried to add such helper, but handling over just analysis and letting
each pass handle it differently seems complicated given the limitations of
the bswap infrastructure.

So, this patch just hooks the optimization also into store-merging so that
the original testcase from the PR can be fixed.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Note, I have no idea why the bswap code needs TODO_update_ssa if it changed
things, for the vuses it copies them from the surrounding vuses, which looks
correct to me.  Perhaps because it uses force_gimple_operand_gsi* in a few
spots in bswap_replace?  Confused...

2020-12-17  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/96239
	* gimple-ssa-store-merging.c (maybe_optimize_vector_constructor): New
	function.
	(get_status_for_store_merging): Don't return BB_INVALID for blocks
	with potential bswap optimizable CONSTRUCTORs.
	(pass_store_merging::execute): Optimize vector CONSTRUCTORs with bswap
	if possible.

	* gcc.dg/tree-ssa/pr96239.c: New test.



	Jakub

Comments

Richard Biener Jan. 5, 2021, 12:55 p.m. UTC | #1
On Thu, 17 Dec 2020, Jakub Jelinek wrote:

> On Wed, Dec 16, 2020 at 09:29:31AM +0100, Richard Biener wrote:
> > I think it probably makes sense to have some helper split out that
> > collects & classifies vector constructor components we can use from
> > both forwprop (where matching the V_C_E from integer could be done
> > as well IMHO) and bswap (when a permute is involved) and store-merging.
> 
> I've tried to add such helper, but handling over just analysis and letting
> each pass handle it differently seems complicated given the limitations of
> the bswap infrastructure.
> 
> So, this patch just hooks the optimization also into store-merging so that
> the original testcase from the PR can be fixed.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK, but ...

> Note, I have no idea why the bswap code needs TODO_update_ssa if it changed
> things, for the vuses it copies them from the surrounding vuses, which looks
> correct to me.  Perhaps because it uses force_gimple_operand_gsi* in a few
> spots in bswap_replace?  Confused...

.. that shouldn't cause updating SSA to be necessary.  Maybe it at some
point did not update virtual operands appropriately.

Richard.

> 2020-12-17  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR tree-optimization/96239
> 	* gimple-ssa-store-merging.c (maybe_optimize_vector_constructor): New
> 	function.
> 	(get_status_for_store_merging): Don't return BB_INVALID for blocks
> 	with potential bswap optimizable CONSTRUCTORs.
> 	(pass_store_merging::execute): Optimize vector CONSTRUCTORs with bswap
> 	if possible.
> 
> 	* gcc.dg/tree-ssa/pr96239.c: New test.
> 
> --- gcc/gimple-ssa-store-merging.c.jj	2020-12-16 13:07:51.729733816 +0100
> +++ gcc/gimple-ssa-store-merging.c	2020-12-16 16:02:06.238868137 +0100
> @@ -1255,6 +1255,75 @@ bswap_replace (gimple_stmt_iterator gsi,
>    return tgt;
>  }
>  
> +/* Try to optimize an assignment CUR_STMT with CONSTRUCTOR on the rhs
> +   using bswap optimizations.  CDI_DOMINATORS need to be
> +   computed on entry.  Return true if it has been optimized and
> +   TODO_update_ssa is needed.  */
> +
> +static bool
> +maybe_optimize_vector_constructor (gimple *cur_stmt)
> +{
> +  tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
> +  struct symbolic_number n;
> +  bool bswap;
> +
> +  gcc_assert (is_gimple_assign (cur_stmt)
> +	      && gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR);
> +
> +  tree rhs = gimple_assign_rhs1 (cur_stmt);
> +  if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
> +      || !INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs)))
> +      || gimple_assign_lhs (cur_stmt) == NULL_TREE)
> +    return false;
> +
> +  HOST_WIDE_INT sz = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT;
> +  switch (sz)
> +    {
> +    case 16:
> +      load_type = bswap_type = uint16_type_node;
> +      break;
> +    case 32:
> +      if (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
> +	  && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing)
> +	{
> +	  load_type = uint32_type_node;
> +	  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
> +	  bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
> +	}
> +      else
> +	return false;
> +      break;
> +    case 64:
> +      if (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
> +	  && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
> +	      || (word_mode == SImode
> +		  && builtin_decl_explicit_p (BUILT_IN_BSWAP32)
> +		  && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing)))
> +	{
> +	  load_type = uint64_type_node;
> +	  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
> +	  bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
> +	}
> +      else
> +	return false;
> +      break;
> +    default:
> +      return false;
> +    }
> +
> +  gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
> +  if (!ins_stmt || n.range != (unsigned HOST_WIDE_INT) sz)
> +    return false;
> +
> +  if (bswap && !fndecl && n.range != 16)
> +    return false;
> +
> +  memset (&nop_stats, 0, sizeof (nop_stats));
> +  memset (&bswap_stats, 0, sizeof (bswap_stats));
> +  return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
> +			bswap_type, load_type, &n, bswap) != NULL_TREE;
> +}
> +
>  /* Find manual byte swap implementations as well as load in a given
>     endianness. Byte swaps are turned into a bswap builtin invokation
>     while endian loads are converted to bswap builtin invokation or
> @@ -5126,6 +5195,7 @@ static enum basic_block_status
>  get_status_for_store_merging (basic_block bb)
>  {
>    unsigned int num_statements = 0;
> +  unsigned int num_constructors = 0;
>    gimple_stmt_iterator gsi;
>    edge e;
>  
> @@ -5138,9 +5208,27 @@ get_status_for_store_merging (basic_bloc
>  
>        if (store_valid_for_store_merging_p (stmt) && ++num_statements >= 2)
>  	break;
> +
> +      if (is_gimple_assign (stmt)
> +	  && gimple_assign_rhs_code (stmt) == CONSTRUCTOR)
> +	{
> +	  tree rhs = gimple_assign_rhs1 (stmt);
> +	  if (VECTOR_TYPE_P (TREE_TYPE (rhs))
> +	      && INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs)))
> +	      && gimple_assign_lhs (stmt) != NULL_TREE)
> +	    {
> +	      HOST_WIDE_INT sz
> +		= int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT;
> +	      if (sz == 16 || sz == 32 || sz == 64)
> +		{
> +		  num_constructors = 1;
> +		  break;
> +		}
> +	    }
> +	}
>      }
>  
> -  if (num_statements == 0)
> +  if (num_statements == 0 && num_constructors == 0)
>      return BB_INVALID;
>  
>    if (cfun->can_throw_non_call_exceptions && cfun->eh
> @@ -5149,7 +5237,7 @@ get_status_for_store_merging (basic_bloc
>        && e->dest == bb->next_bb)
>      return BB_EXTENDED_VALID;
>  
> -  return num_statements >= 2 ? BB_VALID : BB_INVALID;
> +  return (num_statements >= 2 || num_constructors) ? BB_VALID : BB_INVALID;
>  }
>  
>  /* Entry point for the pass.  Go over each basic block recording chains of
> @@ -5163,6 +5251,7 @@ pass_store_merging::execute (function *f
>    basic_block bb;
>    hash_set<gimple *> orig_stmts;
>    bool changed = false, open_chains = false;
> +  unsigned todo = 0;
>  
>    /* If the function can throw and catch non-call exceptions, we'll be trying
>       to merge stores across different basic blocks so we need to first unsplit
> @@ -5189,9 +5278,10 @@ pass_store_merging::execute (function *f
>        if (dump_file && (dump_flags & TDF_DETAILS))
>  	fprintf (dump_file, "Processing basic block <%d>:\n", bb->index);
>  
> -      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); )
>  	{
>  	  gimple *stmt = gsi_stmt (gsi);
> +	  gsi_next (&gsi);
>  
>  	  if (is_gimple_debug (stmt))
>  	    continue;
> @@ -5207,6 +5297,14 @@ pass_store_merging::execute (function *f
>  	      continue;
>  	    }
>  
> +	  if (is_gimple_assign (stmt)
> +	      && gimple_assign_rhs_code (stmt) == CONSTRUCTOR
> +	      && maybe_optimize_vector_constructor (stmt))
> +	    {
> +	      todo = TODO_update_ssa;
> +	      continue;
> +	    }
> +
>  	  if (store_valid_for_store_merging_p (stmt))
>  	    changed |= process_store (stmt);
>  	  else
> @@ -5230,10 +5328,10 @@ pass_store_merging::execute (function *f
>    if (cfun->can_throw_non_call_exceptions && cfun->eh && changed)
>      {
>        free_dominance_info (CDI_DOMINATORS);
> -      return TODO_cleanup_cfg;
> +      return TODO_cleanup_cfg | todo;
>      }
>  
> -  return 0;
> +  return todo;
>  }
>  
>  } // anon namespace
> --- gcc/testsuite/gcc.dg/tree-ssa/pr96239.c.jj	2020-12-16 16:10:30.013256862 +0100
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr96239.c	2020-12-16 16:11:08.802822537 +0100
> @@ -0,0 +1,18 @@
> +/* PR tree-optimization/96239 */
> +/* { dg-do compile { target { ilp32 || lp64 } } } */
> +/* { dg-options "-O3 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump " r>> 8;" "optimized" { target bswap } } } */
> +
> +union U { unsigned char c[2]; unsigned short s; };
> +
> +unsigned short
> +foo (unsigned short x)
> +{
> +  union U u;
> +  u.s = x;
> +  unsigned char v = u.c[0];
> +  unsigned char w = u.c[1];
> +  u.c[0] = w;
> +  u.c[1] = v;
> +  return u.s;
> +}
> 
> 
> 	Jakub
> 
>
Jakub Jelinek Jan. 5, 2021, 3:18 p.m. UTC | #2
On Tue, Jan 05, 2021 at 01:55:21PM +0100, Richard Biener wrote:
> > Note, I have no idea why the bswap code needs TODO_update_ssa if it changed
> > things, for the vuses it copies them from the surrounding vuses, which looks
> > correct to me.  Perhaps because it uses force_gimple_operand_gsi* in a few
> > spots in bswap_replace?  Confused...
> 
> .. that shouldn't cause updating SSA to be necessary.  Maybe it at some
> point did not update virtual operands appropriately.

Ok, I've committed the following version without the TODO_update_ssa, which
passed another bootstrap/regtest on x86_64-linux and i686-linux.

2021-01-05  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/96239
	* gimple-ssa-store-merging.c (maybe_optimize_vector_constructor): New
	function.
	(get_status_for_store_merging): Don't return BB_INVALID for blocks
	with potential bswap optimizable CONSTRUCTORs.
	(pass_store_merging::execute): Optimize vector CONSTRUCTORs with bswap
	if possible.

	* gcc.dg/tree-ssa/pr96239.c: New test.

--- gcc/gimple-ssa-store-merging.c.jj	2020-12-16 13:07:51.729733816 +0100
+++ gcc/gimple-ssa-store-merging.c	2020-12-16 16:02:06.238868137 +0100
@@ -1255,6 +1255,75 @@ bswap_replace (gimple_stmt_iterator gsi,
   return tgt;
 }
 
+/* Try to optimize an assignment CUR_STMT with CONSTRUCTOR on the rhs
+   using bswap optimizations.  CDI_DOMINATORS need to be
+   computed on entry.  Return true if it has been optimized and
+   TODO_update_ssa is needed.  */
+
+static bool
+maybe_optimize_vector_constructor (gimple *cur_stmt)
+{
+  tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
+  struct symbolic_number n;
+  bool bswap;
+
+  gcc_assert (is_gimple_assign (cur_stmt)
+	      && gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR);
+
+  tree rhs = gimple_assign_rhs1 (cur_stmt);
+  if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
+      || !INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs)))
+      || gimple_assign_lhs (cur_stmt) == NULL_TREE)
+    return false;
+
+  HOST_WIDE_INT sz = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT;
+  switch (sz)
+    {
+    case 16:
+      load_type = bswap_type = uint16_type_node;
+      break;
+    case 32:
+      if (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
+	  && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing)
+	{
+	  load_type = uint32_type_node;
+	  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
+	  bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
+	}
+      else
+	return false;
+      break;
+    case 64:
+      if (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
+	  && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
+	      || (word_mode == SImode
+		  && builtin_decl_explicit_p (BUILT_IN_BSWAP32)
+		  && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing)))
+	{
+	  load_type = uint64_type_node;
+	  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
+	  bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
+	}
+      else
+	return false;
+      break;
+    default:
+      return false;
+    }
+
+  gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
+  if (!ins_stmt || n.range != (unsigned HOST_WIDE_INT) sz)
+    return false;
+
+  if (bswap && !fndecl && n.range != 16)
+    return false;
+
+  memset (&nop_stats, 0, sizeof (nop_stats));
+  memset (&bswap_stats, 0, sizeof (bswap_stats));
+  return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
+			bswap_type, load_type, &n, bswap) != NULL_TREE;
+}
+
 /* Find manual byte swap implementations as well as load in a given
    endianness. Byte swaps are turned into a bswap builtin invokation
    while endian loads are converted to bswap builtin invokation or
@@ -5126,6 +5195,7 @@ static enum basic_block_status
 get_status_for_store_merging (basic_block bb)
 {
   unsigned int num_statements = 0;
+  unsigned int num_constructors = 0;
   gimple_stmt_iterator gsi;
   edge e;
 
@@ -5138,9 +5208,27 @@ get_status_for_store_merging (basic_bloc
 
       if (store_valid_for_store_merging_p (stmt) && ++num_statements >= 2)
 	break;
+
+      if (is_gimple_assign (stmt)
+	  && gimple_assign_rhs_code (stmt) == CONSTRUCTOR)
+	{
+	  tree rhs = gimple_assign_rhs1 (stmt);
+	  if (VECTOR_TYPE_P (TREE_TYPE (rhs))
+	      && INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs)))
+	      && gimple_assign_lhs (stmt) != NULL_TREE)
+	    {
+	      HOST_WIDE_INT sz
+		= int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT;
+	      if (sz == 16 || sz == 32 || sz == 64)
+		{
+		  num_constructors = 1;
+		  break;
+		}
+	    }
+	}
     }
 
-  if (num_statements == 0)
+  if (num_statements == 0 && num_constructors == 0)
     return BB_INVALID;
 
   if (cfun->can_throw_non_call_exceptions && cfun->eh
@@ -5149,7 +5237,7 @@ get_status_for_store_merging (basic_bloc
       && e->dest == bb->next_bb)
     return BB_EXTENDED_VALID;
 
-  return num_statements >= 2 ? BB_VALID : BB_INVALID;
+  return (num_statements >= 2 || num_constructors) ? BB_VALID : BB_INVALID;
 }
 
 /* Entry point for the pass.  Go over each basic block recording chains of
@@ -5189,9 +5278,10 @@ pass_store_merging::execute (function *f
       if (dump_file && (dump_flags & TDF_DETAILS))
 	fprintf (dump_file, "Processing basic block <%d>:\n", bb->index);
 
-      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); )
 	{
 	  gimple *stmt = gsi_stmt (gsi);
+	  gsi_next (&gsi);
 
 	  if (is_gimple_debug (stmt))
 	    continue;
@@ -5207,6 +5297,11 @@ pass_store_merging::execute (function *f
 	      continue;
 	    }
 
+	  if (is_gimple_assign (stmt)
+	      && gimple_assign_rhs_code (stmt) == CONSTRUCTOR
+	      && maybe_optimize_vector_constructor (stmt))
+	    continue;
+
 	  if (store_valid_for_store_merging_p (stmt))
 	    changed |= process_store (stmt);
 	  else
--- gcc/testsuite/gcc.dg/tree-ssa/pr96239.c.jj	2020-12-16 16:10:30.013256862 +0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr96239.c	2020-12-16 16:11:08.802822537 +0100
@@ -0,0 +1,18 @@
+/* PR tree-optimization/96239 */
+/* { dg-do compile { target { ilp32 || lp64 } } } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump " r>> 8;" "optimized" { target bswap } } } */
+
+union U { unsigned char c[2]; unsigned short s; };
+
+unsigned short
+foo (unsigned short x)
+{
+  union U u;
+  u.s = x;
+  unsigned char v = u.c[0];
+  unsigned char w = u.c[1];
+  u.c[0] = w;
+  u.c[1] = v;
+  return u.s;
+}


	Jakub
diff mbox series

Patch

--- gcc/gimple-ssa-store-merging.c.jj	2020-12-16 13:07:51.729733816 +0100
+++ gcc/gimple-ssa-store-merging.c	2020-12-16 16:02:06.238868137 +0100
@@ -1255,6 +1255,75 @@  bswap_replace (gimple_stmt_iterator gsi,
   return tgt;
 }
 
+/* Try to optimize an assignment CUR_STMT with CONSTRUCTOR on the rhs
+   using bswap optimizations.  CDI_DOMINATORS need to be
+   computed on entry.  Return true if it has been optimized and
+   TODO_update_ssa is needed.  */
+
+static bool
+maybe_optimize_vector_constructor (gimple *cur_stmt)
+{
+  tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
+  struct symbolic_number n;
+  bool bswap;
+
+  gcc_assert (is_gimple_assign (cur_stmt)
+	      && gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR);
+
+  tree rhs = gimple_assign_rhs1 (cur_stmt);
+  if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
+      || !INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs)))
+      || gimple_assign_lhs (cur_stmt) == NULL_TREE)
+    return false;
+
+  HOST_WIDE_INT sz = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT;
+  switch (sz)
+    {
+    case 16:
+      load_type = bswap_type = uint16_type_node;
+      break;
+    case 32:
+      if (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
+	  && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing)
+	{
+	  load_type = uint32_type_node;
+	  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
+	  bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
+	}
+      else
+	return false;
+      break;
+    case 64:
+      if (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
+	  && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
+	      || (word_mode == SImode
+		  && builtin_decl_explicit_p (BUILT_IN_BSWAP32)
+		  && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing)))
+	{
+	  load_type = uint64_type_node;
+	  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
+	  bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
+	}
+      else
+	return false;
+      break;
+    default:
+      return false;
+    }
+
+  gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
+  if (!ins_stmt || n.range != (unsigned HOST_WIDE_INT) sz)
+    return false;
+
+  if (bswap && !fndecl && n.range != 16)
+    return false;
+
+  memset (&nop_stats, 0, sizeof (nop_stats));
+  memset (&bswap_stats, 0, sizeof (bswap_stats));
+  return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
+			bswap_type, load_type, &n, bswap) != NULL_TREE;
+}
+
 /* Find manual byte swap implementations as well as load in a given
    endianness. Byte swaps are turned into a bswap builtin invokation
    while endian loads are converted to bswap builtin invokation or
@@ -5126,6 +5195,7 @@  static enum basic_block_status
 get_status_for_store_merging (basic_block bb)
 {
   unsigned int num_statements = 0;
+  unsigned int num_constructors = 0;
   gimple_stmt_iterator gsi;
   edge e;
 
@@ -5138,9 +5208,27 @@  get_status_for_store_merging (basic_bloc
 
       if (store_valid_for_store_merging_p (stmt) && ++num_statements >= 2)
 	break;
+
+      if (is_gimple_assign (stmt)
+	  && gimple_assign_rhs_code (stmt) == CONSTRUCTOR)
+	{
+	  tree rhs = gimple_assign_rhs1 (stmt);
+	  if (VECTOR_TYPE_P (TREE_TYPE (rhs))
+	      && INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs)))
+	      && gimple_assign_lhs (stmt) != NULL_TREE)
+	    {
+	      HOST_WIDE_INT sz
+		= int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT;
+	      if (sz == 16 || sz == 32 || sz == 64)
+		{
+		  num_constructors = 1;
+		  break;
+		}
+	    }
+	}
     }
 
-  if (num_statements == 0)
+  if (num_statements == 0 && num_constructors == 0)
     return BB_INVALID;
 
   if (cfun->can_throw_non_call_exceptions && cfun->eh
@@ -5149,7 +5237,7 @@  get_status_for_store_merging (basic_bloc
       && e->dest == bb->next_bb)
     return BB_EXTENDED_VALID;
 
-  return num_statements >= 2 ? BB_VALID : BB_INVALID;
+  return (num_statements >= 2 || num_constructors) ? BB_VALID : BB_INVALID;
 }
 
 /* Entry point for the pass.  Go over each basic block recording chains of
@@ -5163,6 +5251,7 @@  pass_store_merging::execute (function *f
   basic_block bb;
   hash_set<gimple *> orig_stmts;
   bool changed = false, open_chains = false;
+  unsigned todo = 0;
 
   /* If the function can throw and catch non-call exceptions, we'll be trying
      to merge stores across different basic blocks so we need to first unsplit
@@ -5189,9 +5278,10 @@  pass_store_merging::execute (function *f
       if (dump_file && (dump_flags & TDF_DETAILS))
 	fprintf (dump_file, "Processing basic block <%d>:\n", bb->index);
 
-      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); )
 	{
 	  gimple *stmt = gsi_stmt (gsi);
+	  gsi_next (&gsi);
 
 	  if (is_gimple_debug (stmt))
 	    continue;
@@ -5207,6 +5297,14 @@  pass_store_merging::execute (function *f
 	      continue;
 	    }
 
+	  if (is_gimple_assign (stmt)
+	      && gimple_assign_rhs_code (stmt) == CONSTRUCTOR
+	      && maybe_optimize_vector_constructor (stmt))
+	    {
+	      todo = TODO_update_ssa;
+	      continue;
+	    }
+
 	  if (store_valid_for_store_merging_p (stmt))
 	    changed |= process_store (stmt);
 	  else
@@ -5230,10 +5328,10 @@  pass_store_merging::execute (function *f
   if (cfun->can_throw_non_call_exceptions && cfun->eh && changed)
     {
       free_dominance_info (CDI_DOMINATORS);
-      return TODO_cleanup_cfg;
+      return TODO_cleanup_cfg | todo;
     }
 
-  return 0;
+  return todo;
 }
 
 } // anon namespace
--- gcc/testsuite/gcc.dg/tree-ssa/pr96239.c.jj	2020-12-16 16:10:30.013256862 +0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr96239.c	2020-12-16 16:11:08.802822537 +0100
@@ -0,0 +1,18 @@ 
+/* PR tree-optimization/96239 */
+/* { dg-do compile { target { ilp32 || lp64 } } } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump " r>> 8;" "optimized" { target bswap } } } */
+
+union U { unsigned char c[2]; unsigned short s; };
+
+unsigned short
+foo (unsigned short x)
+{
+  union U u;
+  u.s = x;
+  unsigned char v = u.c[0];
+  unsigned char w = u.c[1];
+  u.c[0] = w;
+  u.c[1] = v;
+  return u.s;
+}