[og10] openacc: Adjust loop lowering for AMD GCN

Message ID 20210113234842.71133-1-julian@codesourcery.com
State New
Series [og10] openacc: Adjust loop lowering for AMD GCN

Commit Message

Julian Brown Jan. 13, 2021, 11:48 p.m. UTC
This patch adjusts OpenACC loop lowering in the AMD GCN target compiler
so that the autovectorizer can vectorize the "vector" dimension of
OpenACC loops in more cases.

Rather than generating "SIMT" code that executes a scalar instruction
stream for each lane of a vector in lockstep, for GCN we model the GPU
like a typical CPU, with separate instructions to operate on scalar and
vector data. That means that, unlike other offload targets, we rely on
the autovectorizer to handle the innermost OpenACC parallelism level,
which is "vector".

Because of this, the OpenACC builtin functions that return the current
vector lane and the vector width return 0 and 1 respectively, despite
the native vector width being 64 elements.
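
For illustration, here is a minimal sketch (not part of this patch) of
what that means for user code on AMD GCN; the literal dimension
argument 2 is GOMP_DIM_VECTOR from gomp-constants.h:

  #pragma acc parallel loop gang worker vector
  for (int i = 0; i < n; i++)
    {
      /* On AMD GCN these return 0 and 1 respectively, even though a
         wavefront is 64 lanes wide: the "vector" level is realised by
         the autovectorizer, not by SIMT lanes.  */
      int lane = __builtin_goacc_parlevel_id (2);    /* GOMP_DIM_VECTOR */
      int width = __builtin_goacc_parlevel_size (2); /* GOMP_DIM_VECTOR */
      /* ... use lane/width ... */
    }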

This allows generated code to work with our chosen compilation model,
but the way loops are lowered in omp-offload.c:oacc_xform_loop does not
correctly account for the discrepancy between the logical (OpenACC) and
physical vector sizes. That means that if a loop is partitioned over
e.g. the worker AND vector dimensions, we actually lower with a unit
vector size -- so if we then autovectorize, we end up trying to
vectorize over the "worker" dimension rather than the vector one!
Because the number of workers is not fixed at compile time, the
autovectorizer has a hard time analysing the loop, and vectorization
often fails entirely.
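
Schematically -- a hand-written pseudo-C sketch with invented names
(worker_id, num_workers and so on; the real lowering builds GIMPLE) --
the old striding scheme for a worker+vector loop on GCN degenerates to:

  /* The logical vector size is 1, so the vector lane contributes
     nothing and the only remaining stride comes from the workers.  */
  int tid = worker_id;       /* vector lane is always 0 */
  int volume = num_workers;  /* runtime value; vector size is 1 */
  for (i = lower + tid * step; i < bound; i += volume * step)
    body (i);

The innermost loop steps by num_workers * step, a value unknown at
compile time, which is what defeats the vectorizer.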

We can fix this by deducing the true vector width in oacc_xform_loop,
and using that when we are on a "non-SIMT" offload target. We can then
rearrange how loops are lowered in that function so that the loop form
fed to the autovectorizer is more amenable to vectorization -- namely,
the innermost loop now steps through consecutive iterations.
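
Roughly, and with the same caveat that this is an invented pseudo-C
sketch of the generated GIMPLE (integer types and the loop direction
are simplified), the new non-chunking lowering looks like:

  int vf = omp_max_vf ();    /* e.g. 64 for AMD GCN */
  int threads = num_gangs * num_workers * vf;
  int chunk_size = (range + threads * step - 1) / (threads * step);

  /* Each (gang, worker) pair owns one contiguous block of
     vf * chunk_size iterations.  */
  int offset = (gang_id * num_workers + worker_id) * vf * chunk_size * step;
  int bound = offset + vf * chunk_size * step;
  if (bound > range) bound = range;

  for (i = lower + offset; i < bound; i += step)  /* unit step */
    body (i);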

For some benchmarks, allowing vectorization to succeed leads to quite
impressive performance improvements -- I've observed speedups of
between 2.5x and 40x on one machine/GPU combination.

The low-level builtins available to user code (__builtin_goacc_parlevel_id
and __builtin_goacc_parlevel_size) continue to return 0/1 respectively
for the vector dimension on AMD GCN, even if their containing loop is
vectorized -- a quirk we may want to address at a later date.

Only non-"chunking" loops are handled at present. "Chunking" loops are
still lowered as before.

Tested with offloading to AMD GCN. I will apply to the og10 branch
shortly.

Julian

2021-01-13  Julian Brown  <julian@codesourcery.com>

gcc/
	* omp-offload.c (oacc_thread_numbers): Add VF_BY_VECTORIZER parameter.
	Add overloaded wrapper with the previous arguments and behaviour.
	(oacc_xform_loop): Lower vector loops to iterate a multiple of
	omp_max_vf times over contiguous steps on non-SIMT targets.

libgomp/testsuite/
	* libgomp.oacc-c-c++-common/loop-gwv-1.c: Adjust for loop lowering
	changes.
	* libgomp.oacc-c-c++-common/loop-wv-1.c: Likewise.
	* libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise.
	* libgomp.oacc-c-c++-common/loop-red-wv-1.c: Likewise.
	* libgomp.oacc-c-c++-common/routine-gwv-1.c: Likewise.
	* libgomp.oacc-c-c++-common/routine-wv-1.c: Likewise.
---
 gcc/omp-offload.c                             | 160 ++++++++++++++----
 .../libgomp.oacc-c-c++-common/loop-gwv-1.c    |  15 +-
 .../loop-red-gwv-1.c                          |  17 +-
 .../libgomp.oacc-c-c++-common/loop-red-wv-1.c |  16 ++
 .../libgomp.oacc-c-c++-common/loop-wv-1.c     |  16 ++
 .../libgomp.oacc-c-c++-common/routine-gwv-1.c |  17 +-
 .../libgomp.oacc-c-c++-common/routine-wv-1.c  |  16 ++
 7 files changed, 214 insertions(+), 43 deletions(-)

Patch

diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index f5ce34d3bdd8..bb3bfd130ee4 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -470,11 +470,13 @@  oacc_dim_call (bool pos, int dim, gimple_seq *seq)
 }
 
 /* Find the number of threads (POS = false), or thread number (POS =
-   true) for an OpenACC region partitioned as MASK.  Setup code
+   true) for an OpenACC region partitioned as MASK.  If VF_BY_VECTORIZER is
+   true, use that as the vectorization factor for the auto-vectorized
+   dimension size, instead of calling the builtin function.  Setup code
    required for the calculation is added to SEQ.  */
 
 static tree
-oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
+oacc_thread_numbers (bool pos, int mask, tree vf_by_vectorizer, gimple_seq *seq)
 {
   tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
   unsigned ix;
@@ -487,13 +489,15 @@  oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
 	  {
 	    /* We had an outer index, so scale that by the size of
 	       this dimension.  */
-	    tree n = oacc_dim_call (false, ix, seq);
+	    tree n = (ix == GOMP_DIM_VECTOR && vf_by_vectorizer)
+		     ? vf_by_vectorizer : oacc_dim_call (false, ix, seq);
 	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
 	  }
 	if (pos)
 	  {
 	    /* Determine index in this dimension.  */
-	    tree id = oacc_dim_call (true, ix, seq);
+	    tree id = (ix == GOMP_DIM_VECTOR && vf_by_vectorizer)
+		      ? integer_zero_node : oacc_dim_call (true, ix, seq);
 	    if (res)
 	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
 	    else
@@ -507,6 +511,12 @@  oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
   return res;
 }
 
+static tree
+oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
+{
+  return oacc_thread_numbers (pos, mask, NULL_TREE, seq);
+}
+
 /* Transform IFN_GOACC_LOOP calls to actual code.  See
    expand_oacc_for for where these are generated.  At the vector
    level, we stride loops, such that each member of a warp will
@@ -534,6 +544,8 @@  oacc_xform_loop (gcall *call)
   bool chunking = false, striding = true;
   unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
   unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
+  bool vec_tiles = true;
+  tree vf_by_vectorizer = NULL_TREE;
 
   /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
   if (!lhs)
@@ -561,16 +573,39 @@  oacc_xform_loop (gcall *call)
       striding = integer_onep (chunk_size);
       chunking = !striding;
     }
+
+  if (!chunking
+      && !targetm.simt.vf
+      && (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
+    {
+      poly_uint64 max_vf = omp_max_vf ();
+      vf_by_vectorizer = build_int_cst (integer_type_node, max_vf);
+    }
+
 #endif
 
-  /* striding=true, chunking=true
+  /* For SIMT targets:
+
+     striding=true, chunking=true
        -> invalid.
      striding=true, chunking=false
        -> chunks=1
      striding=false,chunking=true
        -> chunks=ceil (range/(chunksize*threads*step))
      striding=false,chunking=false
-       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
+       -> chunk_size=ceil(range/(threads*step)),chunks=1
+
+     For non-SIMT targets:
+
+      striding=N/A, chunking=true
+	-> as above, for now.
+      striding=N/A, chunking=false
+	-> chunks=1
+	   threads=gangs*workers*vf
+	   chunk_size=ceil(range/(threads*step))
+	   inner chunking loop steps by "step", vf*chunk_size times.
+  */
+
   push_gimplify_context (true);
 
   switch (code)
@@ -589,49 +624,83 @@  oacc_xform_loop (gcall *call)
 	  chunk_size = fold_convert (type, chunk_size);
 	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
 	  per = fold_build2 (MULT_EXPR, type, per, step);
-	  r = build2 (MINUS_EXPR, type, range, dir);
-	  r = build2 (PLUS_EXPR, type, r, per);
+	  r = fold_build2 (MINUS_EXPR, type, range, dir);
+	  r = fold_build2 (PLUS_EXPR, type, r, per);
 	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
 	}
       break;
 
     case IFN_GOACC_LOOP_STEP:
       {
-	/* If striding, step by the entire compute volume, otherwise
-	   step by the inner volume.  */
-	unsigned volume = striding ? mask : inner_mask;
+	if (vf_by_vectorizer)
+	  r = step;
+	else
+	  {
+	    /* If striding, step by the entire compute volume, otherwise
+	       step by the inner volume.  */
+	    unsigned volume = striding ? mask : inner_mask;
 
-	r = oacc_thread_numbers (false, volume, &seq);
-	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
+	    r = oacc_thread_numbers (false, volume, &seq);
+	    r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
+	  }
       }
       break;
 
     case IFN_GOACC_LOOP_OFFSET:
-      /* Enable vectorization on non-SIMT targets.  */
-      if (!targetm.simt.vf
-	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
+      if (vf_by_vectorizer)
+	{
 	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
 	     the loop.  */
-	  && (flag_tree_loop_vectorize
-	      || !global_options_set.x_flag_tree_loop_vectorize))
-	{
-	  basic_block bb = gsi_bb (gsi);
-	  class loop *parent = bb->loop_father;
-	  class loop *body = parent->inner;
-
-	  parent->force_vectorize = true;
-	  parent->safelen = INT_MAX;
-
-	  /* "Chunking loops" may have inner loops.  */
-	  if (parent->inner)
+	  if (flag_tree_loop_vectorize
+	      || !global_options_set.x_flag_tree_loop_vectorize)
 	    {
-	      body->force_vectorize = true;
-	      body->safelen = INT_MAX;
+	      /* Enable vectorization on non-SIMT targets.  */
+	      basic_block bb = gsi_bb (gsi);
+	      class loop *chunk_loop = bb->loop_father;
+	      class loop *inner_loop = chunk_loop->inner;
+
+	      /* Chunking isn't supported for VF_BY_VECTORIZER loops yet,
+		 so we know that the outer chunking loop will be executed just
+		 once and the inner loop is the one which must be
+		 vectorized (unless it has been optimized out for some
+		 reason).  */
+	      gcc_assert (!chunking);
+
+	      if (inner_loop)
+		{
+		  inner_loop->force_vectorize = true;
+		  inner_loop->safelen = INT_MAX;
+
+		  cfun->has_force_vectorize_loops = true;
+		}
 	    }
 
-	  cfun->has_force_vectorize_loops = true;
+	  /* ...and expand the abstract loops such that the vectorizer can
+	     work on them more effectively.
+
+	     It might be nicer to merge this code with the "!striding" case
+	     below, particularly if chunking support is added.  */
+	  tree warppos
+	    = oacc_thread_numbers (true, mask, vf_by_vectorizer, &seq);
+	  warppos = fold_convert (diff_type, warppos);
+
+	  tree volume
+	    = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
+	  volume = fold_convert (diff_type, volume);
+
+	  tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+	  chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
+	  chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
+	  chunk_size = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size,
+				    per);
+
+	  warppos = fold_build2 (MULT_EXPR, diff_type, warppos, chunk_size);
+
+	  tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
+	  chunk = fold_build2 (MULT_EXPR, diff_type, chunk, volume);
+	  r = fold_build2 (PLUS_EXPR, diff_type, chunk, warppos);
 	}
-      if (striding)
+      else if (striding)
 	{
 	  r = oacc_thread_numbers (true, mask, &seq);
 	  r = fold_convert (diff_type, r);
@@ -649,7 +718,7 @@  oacc_xform_loop (gcall *call)
 	  else
 	    {
 	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
-
+	      /* chunk_size = (range + per - 1) / per.  */
 	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
 	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
 	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
@@ -681,7 +750,28 @@  oacc_xform_loop (gcall *call)
       break;
 
     case IFN_GOACC_LOOP_BOUND:
-      if (striding)
+      if (vf_by_vectorizer)
+	{
+	  tree volume
+	    = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
+	  volume = fold_convert (diff_type, volume);
+
+	  tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+	  chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
+	  chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
+	  chunk_size = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size,
+				    per);
+
+	  vf_by_vectorizer = fold_convert (diff_type, vf_by_vectorizer);
+	  tree vecsize = fold_build2 (MULT_EXPR, diff_type, chunk_size,
+				      vf_by_vectorizer);
+	  vecsize = fold_build2 (MULT_EXPR, diff_type, vecsize, step);
+	  tree vecend = fold_convert (diff_type, gimple_call_arg (call, 6));
+	  vecend = fold_build2 (PLUS_EXPR, diff_type, vecend, vecsize);
+	  r = fold_build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, diff_type,
+			   range, vecend);
+	}
+      else if (striding)
 	r = range;
       else
 	{
@@ -696,7 +786,7 @@  oacc_xform_loop (gcall *call)
 	  else
 	    {
 	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
-
+	      /* chunk_size = (range + per - 1) / per.  */
 	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
 	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
 	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
index 5c8430120618..c444543586fc 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
@@ -45,12 +45,23 @@  int main ()
       int expected = ix;
       if(ondev)
 	{
-	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
-			   / (gangsize * workersize * vectorsize);
+#if defined (ACC_DEVICE_TYPE_radeon) && defined (__OPTIMIZE__)
+	  int use_vectorsize = 64;
+#else
+	  int use_vectorsize = vectorsize;
+#endif
+	  int chunk_size = (N + gangsize * workersize * use_vectorsize - 1)
+			   / (gangsize * workersize * use_vectorsize);
 	  
+#ifdef ACC_DEVICE_TYPE_radeon
+	  int g = ix / (chunk_size * workersize * use_vectorsize);
+	  int w = (ix / (chunk_size * use_vectorsize)) % workersize;
+	  int v = 0;
+#else
 	  int g = ix / (chunk_size * workersize * vectorsize);
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
index 9c4a85f7b16b..1571236bfb49 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
@@ -40,12 +40,23 @@  int main ()
       int val = ix;
       if (ondev)
 	{
-	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
-			   / (gangsize * workersize * vectorsize);
-	  
+#if defined (ACC_DEVICE_TYPE_radeon) && defined (__OPTIMIZE__)
+	  int use_vectorsize = 64;
+#else
+	  int use_vectorsize = vectorsize;
+#endif
+	  int chunk_size = (N + gangsize * workersize * use_vectorsize - 1)
+			   / (gangsize * workersize * use_vectorsize);
+
+#ifdef ACC_DEVICE_TYPE_radeon
+	  int g = ix / (chunk_size * workersize * use_vectorsize);
+	  int w = (ix / (chunk_size * use_vectorsize)) % workersize;
+	  int v = 0;
+#else
 	  int g = ix / (chunk_size * vectorsize * workersize);
 	  int w = ix / vectorsize % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
index c360ad11e7cb..423fbf442a29 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
@@ -40,8 +40,24 @@  int main ()
       if(ondev)
 	{
 	  int g = 0;
+#ifdef ACC_DEVICE_TYPE_radeon
+#  ifdef __OPTIMIZE__
+	  int use_vecsize = 64;
+#  else
+	  int use_vecsize = vectorsize;
+#  endif
+	  /* For Radeon, the loop is split into contiguous blocks of
+	     chunk_size * vector_size, with chunk_size selected to cover the
+	     whole iteration space.  Each block is then autovectorized where
+	     possible.  */
+	  int chunk_size = (N + workersize * use_vecsize - 1)
+			   / (workersize * use_vecsize);
+	  int w = ix / (chunk_size * use_vecsize);
+	  int v = 0;
+#else
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
index fd4e4cf5ea9c..e4ffe3931bbe 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
@@ -43,8 +43,24 @@  int main ()
       if(ondev)
 	{
 	  int g = 0;
+#ifdef ACC_DEVICE_TYPE_radeon
+#  ifdef __OPTIMIZE__
+	  int use_vecsize = 64;
+#  else
+	  int use_vecsize = vectorsize;
+#  endif
+	  /* For Radeon, the loop is split into contiguous blocks of
+	     chunk_size * vector_size, with chunk_size selected to cover the
+	     whole iteration space.  Each block is then autovectorized where
+	     possible.  */
+	  int chunk_size = (N + workersize * use_vecsize - 1)
+			   / (workersize * use_vecsize);
+	  int w = ix / (chunk_size * use_vecsize);
+	  int v = 0;
+#else
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
index da13d84908a8..a4af1902d3d7 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
@@ -49,12 +49,23 @@  int main ()
       int expected = ix;
       if(ondev)
 	{
-	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
-			   / (gangsize * workersize * vectorsize);
+#if defined (ACC_DEVICE_TYPE_radeon) && defined (__OPTIMIZE__)
+	  int use_vectorsize = 64;
+#else
+	  int use_vectorsize = vectorsize;
+#endif
+	  int chunk_size = (N + gangsize * workersize * use_vectorsize - 1)
+			   / (gangsize * workersize * use_vectorsize);
 	  
-	  int g = ix / (chunk_size * vectorsize * workersize);
+#ifdef ACC_DEVICE_TYPE_radeon
+	  int g = ix / (chunk_size * workersize * use_vectorsize);
+	  int w = (ix / (chunk_size * use_vectorsize)) % workersize;
+	  int v = 0;
+#else
+	  int g = ix / (chunk_size * workersize * vectorsize);
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
index 73696e4e59a3..091ef0682499 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
@@ -50,8 +50,24 @@  int main ()
       if(ondev)
 	{
 	  int g = 0;
+#ifdef ACC_DEVICE_TYPE_radeon
+#  ifdef __OPTIMIZE__
+	  int use_vecsize = 64;
+#  else
+	  int use_vecsize = vectorsize;
+#  endif
+	  /* For Radeon, the loop is split into contiguous blocks of
+	     chunk_size * vector_size, with chunk_size selected to cover the
+	     whole iteration space.  Each block is then autovectorized where
+	     possible.  */
+	  int chunk_size = (N + workersize * use_vecsize - 1)
+			   / (workersize * use_vecsize);
+	  int w = ix / (chunk_size * use_vecsize);
+	  int v = 0;
+#else
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  expected = (g << 16) | (w << 8) | v;
 	}