diff mbox series

[committed] openmp: Optimize for OpenMP atomics 2x__builtin_clear_padding+__builtin_memcmp if possible [PR102571]

Message ID 20211006084712.GI920483@tucnak
State New
Headers show
Series [committed] openmp: Optimize for OpenMP atomics 2x__builtin_clear_padding+__builtin_memcmp if possible [PR102571] | expand

Commit Message

Jakub Jelinek Oct. 6, 2021, 8:47 a.m. UTC
Hi!

For the few long double types that do have padding bits, e.g. on x86
the clear_type_padding_in_mask computed mask is
ff ff ff ff ff ff ff ff ff ff 00 00 for 32-bit and
ff ff ff ff ff ff ff ff ff ff 00 00 00 00 00 00 for 64-bit.
Instead of calling __builtin_clear_padding on both operands, which would clear
the last 2 or 6 bytes, and then doing memcmp on the whole 12/16 bytes, we can
just memcmp 10 bytes.  The code also handles padding at the start, or at both
the start and the end, but only when everything is on byte boundaries and the
non-padding bits are contiguous.
This works around a tree-ssa-dse.c bug (but we need to fix that bug anyway:
libstdc++ won't do this optimization and, since it has to deal with arbitrary
types, it can't even do it in general).

Bootstrapped/regtested on x86_64-linux and i686-linux, committed to trunk.

2021-10-06  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/102571
	* c-omp.c (c_finish_omp_atomic): Optimize the case where type has
	padding, but the non-padding bits are a contiguous set of bytes
	by adjusting the memcmp call arguments instead of emitting
	__builtin_clear_padding and then comparing all the type's bytes.


	Jakub
diff mbox series

Patch

--- gcc/c-family/c-omp.c.jj	2021-10-01 10:45:37.909412708 +0200
+++ gcc/c-family/c-omp.c	2021-10-05 15:19:36.853387522 +0200
@@ -379,6 +379,8 @@  c_finish_omp_atomic (location_t loc, enu
       if (SCALAR_FLOAT_TYPE_P (cmptype) && !test)
 	{
 	  bool clear_padding = false;
+	  HOST_WIDE_INT non_padding_start = 0;
+	  HOST_WIDE_INT non_padding_end = 0;
 	  if (BITS_PER_UNIT == 8 && CHAR_BIT == 8)
 	    {
 	      HOST_WIDE_INT sz = int_size_in_bytes (cmptype), i;
@@ -392,6 +394,40 @@  c_finish_omp_atomic (location_t loc, enu
 		    clear_padding = true;
 		    break;
 		  }
+	      if (clear_padding && buf[i] == 0)
+		{
+		  /* Try to optimize.  In the common case where
+		     non-padding bits are all continuous and start
+		     and end at a byte boundary, we can just adjust
+		     the memcmp call arguments and don't need to
+		     emit __builtin_clear_padding calls.  */
+		  if (i == 0)
+		    {
+		      for (i = 0; i < sz; i++)
+			if (buf[i] != 0)
+			  break;
+		      if (i < sz && buf[i] == (unsigned char) ~0)
+			{
+			  non_padding_start = i;
+			  for (; i < sz; i++)
+			    if (buf[i] != (unsigned char) ~0)
+			      break;
+			}
+		      else
+			i = 0;
+		    }
+		  if (i != 0)
+		    {
+		      non_padding_end = i;
+		      for (; i < sz; i++)
+			if (buf[i] != 0)
+			  {
+			    non_padding_start = 0;
+			    non_padding_end = 0;
+			    break;
+			  }
+		    }
+		}
 	    }
 	  tree inttype = NULL_TREE;
 	  if (!clear_padding && tree_fits_uhwi_p (TYPE_SIZE (cmptype)))
@@ -428,12 +464,22 @@  c_finish_omp_atomic (location_t loc, enu
 	      tmp2 = build4 (TARGET_EXPR, cmptype, tmp2,
 			     TREE_OPERAND (rhs1, 1), NULL, NULL);
 	      tmp2 = build1 (ADDR_EXPR, pcmptype, tmp2);
+	      if (non_padding_start)
+		{
+		  tmp1 = build2 (POINTER_PLUS_EXPR, pcmptype, tmp1,
+				 size_int (non_padding_start));
+		  tmp2 = build2 (POINTER_PLUS_EXPR, pcmptype, tmp2,
+				 size_int (non_padding_start));
+		}
 	      tree fndecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
 	      rhs1 = build_call_expr_loc (loc, fndecl, 3, tmp1, tmp2,
-					  TYPE_SIZE_UNIT (cmptype));
+					  non_padding_end
+					  ? size_int (non_padding_end
+						      - non_padding_start)
+					  : TYPE_SIZE_UNIT (cmptype));
 	      rhs1 = build2 (EQ_EXPR, boolean_type_node, rhs1,
 			     integer_zero_node);
-	      if (clear_padding)
+	      if (clear_padding && non_padding_end == 0)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_CLEAR_PADDING);
 		  tree cp1 = build_call_expr_loc (loc, fndecl, 1, tmp1);