diff mbox

C PATCH to add __atomic_fetch_* optimization for _Atomics (PR c/68908)

Message ID 20151218171511.GK3296@redhat.com
State New
Headers show

Commit Message

Marek Polacek Dec. 18, 2015, 5:15 p.m. UTC
As outlined in the PR, build_atomic_assign can do a better job.  It seems
profitable to use __atomic_fetch_* built-ins when possible rather than the
generic CAS loop.  This patch attempts to implement that.

There are two ???s in the patch.  For the first one, I think let's just
ignore enums, the potential trouble doesn't seem to be worth it.  And wrt
the second ???, well, here I was unsure, so I decided to go the generic way
for restrict-qualified pointers.

The two new tests should ensure that things work sanely even when this
optimization kicks in.

Bootstrapped/regtested on x86_64-linux and powerpc64le-linux, ok for trunk?

2015-12-18  Marek Polacek  <polacek@redhat.com>

	PR c/68908
	* c-typeck.c (build_atomic_assign): Improve commentary.  Add
	optimization to use __atomic_fetch_* built-in if possible.

	* gcc.dg/atomic/c11-atomic-exec-6.c: New test.
	* gcc.dg/atomic/stdatomic-op-5.c: New test.


	Marek

Comments

Marek Polacek Dec. 18, 2015, 5:19 p.m. UTC | #1
On Fri, Dec 18, 2015 at 06:15:11PM +0100, Marek Polacek wrote:
> As outlined in the PR, build_atomic_assign can do a better job.  It seems
> profitable to use __atomic_fetch_* built-ins when possible rather than the
> generic CAS loop.  This patch attempts to implement that.
> 
> There are two ???s in the patch.  For the first one, I think let's just
> ignore enums, the potential trouble doesn't seem to be worth it.  And wrt
> the second ???, well, here I was unsure, so I decided to go the generic way
> for restrict-qualified pointers.
> 
> The two new tests should insure that things work sane even when this
> optimization kicks in.
> 
> Bootstrapped/regtested on x86_64-linux and powerpc64le-linux, ok for trunk?
> 
> 2015-12-18  Marek Polacek  <polacek@redhat.com>
> 
> 	PR c/68908
> 	* c-typeck.c (build_atomic_assign): Improve commentary.  Add
> 	optimization to use __atomic_fetch_* built-in if possible.
> 
> 	* gcc.dg/atomic/c11-atomic-exec-6.c: New test.
> 	* gcc.dg/atomic/stdatomic-op-5.c: New test.
> 
> diff --git gcc/c/c-typeck.c gcc/c/c-typeck.c
> index b605f81..9b6febb 100644
> --- gcc/c/c-typeck.c
> +++ gcc/c/c-typeck.c
> @@ -3685,9 +3685,9 @@ pointer_diff (location_t loc, tree op0, tree op1)
>    return convert (restype, result);
>  }
>  
> -/* Expand atomic compound assignments into an approriate sequence as
> -   specified by the C11 standard section 6.5.16.2.   
> -    given 
> +/* Expand atomic compound assignments into an appropriate sequence as
> +   specified by the C11 standard section 6.5.16.2.
> +
>         _Atomic T1 E1
>         T2 E2
>         E1 op= E2
> @@ -3719,26 +3719,25 @@ loop:
>  done:
>    feupdateenv (&fenv);
>  
> -  Also note that the compiler is simply issuing the generic form of
> -  the atomic operations.  This requires temp(s) and has their address
> -  taken.  The atomic processing is smart enough to figure out when the
> -  size of an object can utilize a lock-free version, and convert the
> -  built-in call to the appropriate lock-free routine.  The optimizers
> -  will then dispose of any temps that are no longer required, and
> -  lock-free implementations are utilized as long as there is target
> -  support for the required size.
> +  The compiler will issue the __atomic_fetch_* built-in when possible,
> +  otherwise it will generate the generic form of the atomic operations.
> +  This requires temp(s) and has their address taken.  The atomic processing
> +  is smart enough to figure out when the size of an object can utilize
> +  a lock-free version, and convert the built-in call to the appropriate
> +  lock-free routine.  The optimizers will then dispose of any temps that
> +  are no longer required, and lock-free implementations are utilized as
> +  long as there is target support for the required size.
>  
>    If the operator is NOP_EXPR, then this is a simple assignment, and
>    an __atomic_store is issued to perform the assignment rather than
> -  the above loop.
> -
> -*/
> +  the above loop.  */
>  
>  /* Build an atomic assignment at LOC, expanding into the proper
>     sequence to store LHS MODIFYCODE= RHS.  Return a value representing
> -   the result of the operation, unless RETURN_OLD_P in which case
> +   the result of the operation, unless RETURN_OLD_P, in which case
>     return the old value of LHS (this is only for postincrement and
>     postdecrement).  */
> +
>  static tree
>  build_atomic_assign (location_t loc, tree lhs, enum tree_code modifycode,
>  		     tree rhs, bool return_old_p)
> @@ -3805,6 +3804,86 @@ build_atomic_assign (location_t loc, tree lhs, enum tree_code modifycode,
>        return build2 (COMPOUND_EXPR, nonatomic_lhs_type, compound_stmt, val);
>      }
>  
> +  /* Attempt to implement the atomic operation as an __atomic_fetch_* or
> +     __atomic_*_fetch built-in rather than a CAS loop.  atomic_bool type
> +     isn't applicable for such builtins.  ??? Do we want to handle enums?  */
> +  if ((TREE_CODE (lhs_type) == INTEGER_TYPE || POINTER_TYPE_P (lhs_type))
> +      && TREE_CODE (rhs_type) == INTEGER_TYPE)
> +    {
> +      built_in_function fncode;
> +      switch (modifycode)
> +	{
> +	case PLUS_EXPR:
> +	case POINTER_PLUS_EXPR:
> +	  fncode = (return_old_p
> +		    ? BUILT_IN_ATOMIC_FETCH_ADD_N
> +		    : BUILT_IN_ATOMIC_ADD_FETCH_N);
> +	  break;
> +	case MINUS_EXPR:
> +	  fncode = (return_old_p
> +		    ? BUILT_IN_ATOMIC_FETCH_SUB_N
> +		    : BUILT_IN_ATOMIC_SUB_FETCH_N);
> +	  break;
> +	case BIT_AND_EXPR:
> +	  fncode = (return_old_p
> +		    ? BUILT_IN_ATOMIC_FETCH_AND_N
> +		    : BUILT_IN_ATOMIC_AND_FETCH_N);
> +	  break;
> +	case BIT_IOR_EXPR:
> +	  fncode = (return_old_p
> +		    ? BUILT_IN_ATOMIC_FETCH_OR_N
> +		    : BUILT_IN_ATOMIC_OR_FETCH_N);
> +	  break;
> +	case BIT_XOR_EXPR:
> +	  fncode = (return_old_p
> +		    ? BUILT_IN_ATOMIC_FETCH_XOR_N
> +		    : BUILT_IN_ATOMIC_XOR_FETCH_N);
> +	  break;
> +	default:
> +	  goto cas_loop;
> +	}
> +
> +      /* If this is a pointer type, we need to multiplicate by the size of

This should read "multiply".  Consider it fixed.

	Marek
Joseph Myers Dec. 18, 2015, 9:10 p.m. UTC | #2
On Fri, 18 Dec 2015, Marek Polacek wrote:

> +	  tree sz = TYPE_SIZE_UNIT (TREE_TYPE (lhs_type));
> +	  rhs = fold_build2_loc (loc, MULT_EXPR, rhs_type, rhs,
> +				 convert (rhs_type, sz));

Converting the size to rhs_type seems wrong, as does multiplying in 
rhs_type.  In a test like

struct s { char a[256]; } *_Atomic p;
char c;
void f (void) { p += c; }

rhs_type is char and the conversion of the size to char will result in 0.  
(Of course this needs expanding to produce an executable test that fails 
with the present patch.)  Instead, both size and rhs should be converted 
to ptrdiff_t and the multiplication done in ptrdiff_t.
diff mbox

Patch

diff --git gcc/c/c-typeck.c gcc/c/c-typeck.c
index b605f81..9b6febb 100644
--- gcc/c/c-typeck.c
+++ gcc/c/c-typeck.c
@@ -3685,9 +3685,9 @@  pointer_diff (location_t loc, tree op0, tree op1)
   return convert (restype, result);
 }
 
-/* Expand atomic compound assignments into an approriate sequence as
-   specified by the C11 standard section 6.5.16.2.   
-    given 
+/* Expand atomic compound assignments into an appropriate sequence as
+   specified by the C11 standard section 6.5.16.2.
+
        _Atomic T1 E1
        T2 E2
        E1 op= E2
@@ -3719,26 +3719,25 @@  loop:
 done:
   feupdateenv (&fenv);
 
-  Also note that the compiler is simply issuing the generic form of
-  the atomic operations.  This requires temp(s) and has their address
-  taken.  The atomic processing is smart enough to figure out when the
-  size of an object can utilize a lock-free version, and convert the
-  built-in call to the appropriate lock-free routine.  The optimizers
-  will then dispose of any temps that are no longer required, and
-  lock-free implementations are utilized as long as there is target
-  support for the required size.
+  The compiler will issue the __atomic_fetch_* built-in when possible,
+  otherwise it will generate the generic form of the atomic operations.
+  This requires temp(s) and has their address taken.  The atomic processing
+  is smart enough to figure out when the size of an object can utilize
+  a lock-free version, and convert the built-in call to the appropriate
+  lock-free routine.  The optimizers will then dispose of any temps that
+  are no longer required, and lock-free implementations are utilized as
+  long as there is target support for the required size.
 
   If the operator is NOP_EXPR, then this is a simple assignment, and
   an __atomic_store is issued to perform the assignment rather than
-  the above loop.
-
-*/
+  the above loop.  */
 
 /* Build an atomic assignment at LOC, expanding into the proper
    sequence to store LHS MODIFYCODE= RHS.  Return a value representing
-   the result of the operation, unless RETURN_OLD_P in which case
+   the result of the operation, unless RETURN_OLD_P, in which case
    return the old value of LHS (this is only for postincrement and
    postdecrement).  */
+
 static tree
 build_atomic_assign (location_t loc, tree lhs, enum tree_code modifycode,
 		     tree rhs, bool return_old_p)
@@ -3805,6 +3804,86 @@  build_atomic_assign (location_t loc, tree lhs, enum tree_code modifycode,
       return build2 (COMPOUND_EXPR, nonatomic_lhs_type, compound_stmt, val);
     }
 
+  /* Attempt to implement the atomic operation as an __atomic_fetch_* or
+     __atomic_*_fetch built-in rather than a CAS loop.  atomic_bool type
+     isn't applicable for such builtins.  ??? Do we want to handle enums?  */
+  if ((TREE_CODE (lhs_type) == INTEGER_TYPE || POINTER_TYPE_P (lhs_type))
+      && TREE_CODE (rhs_type) == INTEGER_TYPE)
+    {
+      built_in_function fncode;
+      switch (modifycode)
+	{
+	case PLUS_EXPR:
+	case POINTER_PLUS_EXPR:
+	  fncode = (return_old_p
+		    ? BUILT_IN_ATOMIC_FETCH_ADD_N
+		    : BUILT_IN_ATOMIC_ADD_FETCH_N);
+	  break;
+	case MINUS_EXPR:
+	  fncode = (return_old_p
+		    ? BUILT_IN_ATOMIC_FETCH_SUB_N
+		    : BUILT_IN_ATOMIC_SUB_FETCH_N);
+	  break;
+	case BIT_AND_EXPR:
+	  fncode = (return_old_p
+		    ? BUILT_IN_ATOMIC_FETCH_AND_N
+		    : BUILT_IN_ATOMIC_AND_FETCH_N);
+	  break;
+	case BIT_IOR_EXPR:
+	  fncode = (return_old_p
+		    ? BUILT_IN_ATOMIC_FETCH_OR_N
+		    : BUILT_IN_ATOMIC_OR_FETCH_N);
+	  break;
+	case BIT_XOR_EXPR:
+	  fncode = (return_old_p
+		    ? BUILT_IN_ATOMIC_FETCH_XOR_N
+		    : BUILT_IN_ATOMIC_XOR_FETCH_N);
+	  break;
+	default:
+	  goto cas_loop;
+	}
+
> +      /* If this is a pointer type, we need to multiply by the size of
> +	 the pointer target type.  */
+      if (POINTER_TYPE_P (lhs_type))
+	{
+	  if (!COMPLETE_TYPE_P (TREE_TYPE (lhs_type))
+	      /* ??? This would introduce -Wdiscarded-qualifiers
+		 warning: __atomic_fetch_* expect volatile void *
+		 type as the first argument.  (Assignments between
+		 atomic and non-atomic objects are OK.) */
+	      || TYPE_RESTRICT (lhs_type))
+	    goto cas_loop;
+	  tree sz = TYPE_SIZE_UNIT (TREE_TYPE (lhs_type));
+	  rhs = fold_build2_loc (loc, MULT_EXPR, rhs_type, rhs,
+				 convert (rhs_type, sz));
+	}
+
+      /* Build __atomic_fetch_* (&lhs, &val, SEQ_CST), or
+	 __atomic_*_fetch (&lhs, &val, SEQ_CST).  */
+      fndecl = builtin_decl_explicit (fncode);
+      params->quick_push (lhs_addr);
+      params->quick_push (rhs);
+      params->quick_push (seq_cst);
+      func_call = c_build_function_call_vec (loc, vNULL, fndecl, params, NULL);
+
+      newval = create_tmp_var_raw (nonatomic_lhs_type);
+      TREE_ADDRESSABLE (newval) = 1;
+      TREE_NO_WARNING (newval) = 1;
+      rhs = build4 (TARGET_EXPR, nonatomic_lhs_type, newval, func_call,
+		    NULL_TREE, NULL_TREE);
+      SET_EXPR_LOCATION (rhs, loc);
+      add_stmt (rhs);
+
+      /* Finish the compound statement.  */
+      compound_stmt = c_end_compound_stmt (loc, compound_stmt, false);
+
+      /* NEWVAL is the value which was stored, return a COMPOUND_STMT of
+	 the statement and that value.  */
+      return build2 (COMPOUND_EXPR, nonatomic_lhs_type, compound_stmt, newval);
+    }
+
+cas_loop:
   /* Create the variables and labels required for the op= form.  */
   old = create_tmp_var_raw (nonatomic_lhs_type);
   old_addr = build_unary_op (loc, ADDR_EXPR, old, 0);
diff --git gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-6.c gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-6.c
index e69de29..2dc91c5 100644
--- gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-6.c
+++ gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-6.c
@@ -0,0 +1,54 @@ 
+/* Test we do correct thing for adding to / subtracting from a pointer,
+   i.e. that the multiplication by the size of the pointer target type
+   still occurs.  */
+/* { dg-do run } */
+/* { dg-options "-std=c11 -pedantic-errors" } */
+
+#define TEST_POINTER_ADD_SUB(TYPE)			\
+  do							\
+    {							\
+      TYPE a[3][3];					\
+      TYPE (*_Atomic q)[3] = &a[0];			\
+      ++q;						\
+      if (q != &a[1])					\
+	__builtin_abort ();				\
+      q++;						\
+      if (q != &a[2])					\
+	__builtin_abort ();				\
+      --q;						\
+      if (q != &a[1])					\
+	__builtin_abort ();				\
+      q--;						\
+      if (q != &a[0])					\
+	__builtin_abort ();				\
+      q += 2;						\
+      if (q != &a[2])					\
+	__builtin_abort ();				\
+      q -= 2;						\
+      if (q != &a[0])					\
+	__builtin_abort ();				\
+    }							\
+  while (0)
+
+int
+main (void)
+{
+  TEST_POINTER_ADD_SUB (_Bool);
+  TEST_POINTER_ADD_SUB (char);
+  TEST_POINTER_ADD_SUB (signed char);
+  TEST_POINTER_ADD_SUB (unsigned char);
+  TEST_POINTER_ADD_SUB (signed short);
+  TEST_POINTER_ADD_SUB (unsigned short);
+  TEST_POINTER_ADD_SUB (signed int);
+  TEST_POINTER_ADD_SUB (unsigned int);
+  TEST_POINTER_ADD_SUB (signed long);
+  TEST_POINTER_ADD_SUB (unsigned long);
+  TEST_POINTER_ADD_SUB (signed long long);
+  TEST_POINTER_ADD_SUB (unsigned long long);
+  TEST_POINTER_ADD_SUB (float);
+  TEST_POINTER_ADD_SUB (double);
+  TEST_POINTER_ADD_SUB (long double);
+  TEST_POINTER_ADD_SUB (_Complex float);
+  TEST_POINTER_ADD_SUB (_Complex double);
+  TEST_POINTER_ADD_SUB (_Complex long double);
+}
diff --git gcc/testsuite/gcc.dg/atomic/stdatomic-op-5.c gcc/testsuite/gcc.dg/atomic/stdatomic-op-5.c
index e69de29..daba8ec 100644
--- gcc/testsuite/gcc.dg/atomic/stdatomic-op-5.c
+++ gcc/testsuite/gcc.dg/atomic/stdatomic-op-5.c
@@ -0,0 +1,141 @@ 
+/* Test we're able to use __atomic_fetch_* where possible and verify
+   we generate correct code.  */
+/* { dg-do run } */
+/* { dg-options "-std=c11 -pedantic-errors -fdump-tree-original" } */
+
+#include <stdatomic.h>
+
+extern void abort (void);
+
+static void
+test_inc_dec (void)
+{
+  atomic_int i = ATOMIC_VAR_INIT (1);
+
+  i++;
+  if (i != 2)
+    abort ();
+  i--;
+  if (i != 1)
+    abort ();
+  ++i;
+  if (i != 2)
+    abort ();
+  --i;
+  if (i != 1)
+    abort ();
+  if (++i != 2)
+    abort ();
+  if (i++ != 2)
+    abort ();
+  if (i != 3)
+    abort ();
+  if (i-- != 3)
+    abort ();
+  if (i != 2)
+    abort ();
+}
+
+static void
+test_add_sub (void)
+{
+  atomic_int i = ATOMIC_VAR_INIT (1);
+
+  i += 2;
+  if (i != 3)
+    abort ();
+  i -= 2;
+  if (i != 1)
+    abort ();
+  if ((i += 2) != 3)
+    abort ();
+  if ((i -= 2) != 1)
+    abort ();
+}
+
+static void
+test_and (void)
+{
+  atomic_int i = ATOMIC_VAR_INIT (5);
+
+  i &= 4;
+  if (i != 4)
+    abort ();
+  if ((i &= 4) != 4)
+    abort ();
+}
+
+static void
+test_xor (void)
+{
+  atomic_int i = ATOMIC_VAR_INIT (5);
+
+  i ^= 2;
+  if (i != 7)
+    abort ();
+  if ((i ^= 4) != 3)
+    abort ();
+}
+
+static void
+test_or (void)
+{
+  atomic_int i = ATOMIC_VAR_INIT (5);
+
+  i |= 2;
+  if (i != 7)
+    abort ();
+  if ((i |= 8) != 15)
+    abort ();
+}
+
+static void
+test_ptr (atomic_int *p)
+{
+  ++*p;
+  if (*p != 2)
+    abort ();
+
+  *p += 2;
+  if (*p != 4)
+    abort ();
+
+  (*p)++;
+  if (*p != 5)
+    abort ();
+
+  --*p;
+  if (*p != 4)
+    abort ();
+
+  (*p)--;
+  if (*p != 3)
+    abort ();
+
+  *p -= 2;
+  if (*p != 1)
+    abort ();
+
+  atomic_int j = ATOMIC_VAR_INIT (0);
+  j += *p;
+  if (j != 1)
+    abort ();
+
+  j -= *p;
+  if (j != 0)
+    abort ();
+}
+
+int
+main (void)
+{
+  atomic_int i = ATOMIC_VAR_INIT (1);
+  test_inc_dec ();
+  test_add_sub ();
+  test_and ();
+  test_xor ();
+  test_or ();
+  test_ptr (&i);
+}
+
+/* { dg-final { scan-tree-dump-not "__atomic_compare_exchange" "original" } } */