diff mbox

[nvptx] complex vector reductions

Message ID 5645FDB1.4060400@acm.org
State New
Headers show

Commit Message

Nathan Sidwell Nov. 13, 2015, 3:11 p.m. UTC
I noticed that we weren't supporting reductions of complex type, particularly 
complex double.

I've committed this patch to add support for vector reductions.  We split the 
complex type apart and process each half before sticking it back together again. 
  As it happens only 32-bit shuffles exist on hardware, so splitting (say) 
complex float this way doesn't result in worse code that punning the  whole 
thing to a 64-bit int.  I suspect it might play better, by keeping the real and 
imaginary parts separate.

Worker and gang-level reductions of such types will be a different patch, which 
I need to think about.

nathan

Comments

Bernd Schmidt Nov. 13, 2015, 4:46 p.m. UTC | #1
On 11/13/2015 04:11 PM, Nathan Sidwell wrote:
> I noticed that we weren't supporting reductions of complex type,
> particularly complex double.
>
> I've committed this patch to add support for vector reductions.  We
> split the complex type apart and process each half before sticking it
> back together again.  As it happens only 32-bit shuffles exist on
> hardware, so splitting (say) complex float this way doesn't result in
> worse code that punning the  whole thing to a 64-bit int.  I suspect it
> might play better, by keeping the real and imaginary parts separate.
>
> Worker and gang-level reductions of such types will be a different
> patch, which I need to think about.

LGTM.


Bernd
diff mbox

Patch

2015-11-13  Nathan Sidwell  <nathan@codesourcery.com>

	gcc/
	* config/nvptx/nvptx.c (nvptx_generate_vector_shuffle): Deal with
	complex types.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: New.
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: New.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 230320)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -3634,26 +3634,51 @@  nvptx_generate_vector_shuffle (location_
 {
   unsigned fn = NVPTX_BUILTIN_SHUFFLE;
   tree_code code = NOP_EXPR;
-  tree type = unsigned_type_node;
-  enum machine_mode mode = TYPE_MODE (TREE_TYPE (var));
+  tree arg_type = unsigned_type_node;
+  tree var_type = TREE_TYPE (var);
+  tree dest_type = var_type;
 
-  if (!INTEGRAL_MODE_P (mode))
+  if (TREE_CODE (var_type) == COMPLEX_TYPE)
+    var_type = TREE_TYPE (var_type);
+
+  if (TREE_CODE (var_type) == REAL_TYPE)
     code = VIEW_CONVERT_EXPR;
-  if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode))
+
+  if (TYPE_SIZE (var_type)
+      == TYPE_SIZE (long_long_unsigned_type_node))
     {
       fn = NVPTX_BUILTIN_SHUFFLELL;
-      type = long_long_unsigned_type_node;
+      arg_type = long_long_unsigned_type_node;
     }
-
+  
   tree call = nvptx_builtin_decl (fn, true);
-  call = build_call_expr_loc
-    (loc, call, 3, fold_build1 (code, type, var),
-     build_int_cst (unsigned_type_node, shift),
-     build_int_cst (unsigned_type_node, SHUFFLE_DOWN));
+  tree bits = build_int_cst (unsigned_type_node, shift);
+  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
+  tree expr;
+
+  if (var_type != dest_type)
+    {
+      /* Do real and imaginary parts separately.  */
+      tree real = fold_build1 (REALPART_EXPR, var_type, var);
+      real = fold_build1 (code, arg_type, real);
+      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
+      real = fold_build1 (code, var_type, real);
 
-  call = fold_build1 (code, TREE_TYPE (dest_var), call);
+      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
+      imag = fold_build1 (code, arg_type, imag);
+      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
+      imag = fold_build1 (code, var_type, imag);
+
+      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
+    }
+  else
+    {
+      expr = fold_build1 (code, arg_type, var);
+      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
+      expr = fold_build1 (code, dest_type, expr);
+    }
 
-  gimplify_assign (dest_var, call, seq);
+  gimplify_assign (dest_var, expr, seq);
 }
 
 /* Insert code to locklessly update  *PTR with *PTR OP VAR just before
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c	(working copy)
@@ -0,0 +1,52 @@ 
+
+#include <complex.h>
+
+/* Double float has 53 bits of fraction. */
+#define FRAC (1.0 / (1LL << 48))
+
+int close_enough (double _Complex a, double _Complex b)
+{
+  double _Complex diff = a - b;
+  double mag2_a = __real__(a) * __real__ (a) + __imag__ (a) * __imag__ (a);
+  double mag2_diff = (__real__(diff) * __real__ (diff)
+		     + __imag__ (diff) * __imag__ (diff));
+
+  return mag2_diff / mag2_a < (FRAC * FRAC);
+}
+
+int main (void)
+{
+#define N 100
+  double _Complex ary[N], sum, prod, tsum, tprod;
+  int ix;
+
+  sum = tsum = 0;
+  prod = tprod = 1;
+  
+  for (ix = 0; ix < N;  ix++)
+    {
+      double frac = ix * (1.0 / 1024) + 1.0;
+      
+      ary[ix] = frac + frac * 2.0i - 1.0i;
+      sum += ary[ix];
+      prod *= ary[ix];
+    }
+
+#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
+  {
+#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
+    for (ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+
+  if (!close_enough (sum, tsum))
+    return 1;
+
+  if (!close_enough (prod, tprod))
+    return 1;
+
+  return 0;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c	(working copy)
@@ -0,0 +1,52 @@ 
+
+#include <complex.h>
+
+/* Single float has 23 bits of fraction. */
+#define FRAC (1.0f / (1 << 20))
+
+int close_enough (float _Complex a, float _Complex b)
+{
+  float _Complex diff = a - b;
+  float mag2_a = __real__(a) * __real__ (a) + __imag__ (a) * __imag__ (a);
+  float mag2_diff = (__real__(diff) * __real__ (diff)
+		     + __imag__ (diff) * __imag__ (diff));
+
+  return mag2_diff / mag2_a < (FRAC * FRAC);
+}
+
+int main (void)
+{
+#define N 100
+  float _Complex ary[N], sum, prod, tsum, tprod;
+  int ix;
+
+  sum = tsum = 0;
+  prod = tprod = 1;
+  
+  for (ix = 0; ix < N;  ix++)
+    {
+      float frac = ix * (1.0f / 1024) + 1.0f;
+      
+      ary[ix] = frac + frac * 2.0i - 1.0i;
+      sum += ary[ix];
+      prod *= ary[ix];
+    }
+
+#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
+  {
+#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
+    for (ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+
+  if (!close_enough (sum, tsum))
+    return 1;
+
+  if (!close_enough (prod, tprod))
+    return 1;
+
+  return 0;
+}