Message ID | 20181212224310.GP12380@tucnak |
---|---|
State | New |
Headers | show |
Series | Support AVX512F masked gather loads (PR tree-optimization/88464) | expand |
On December 12, 2018 11:43:10 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote: >Hi! > >We support either the AVX2 gather loads (128-bit or 256-bit, using >masks in >vector registers), both conditional and unconditional, but AVX512F >gather >loads only unconditional. The problem was that tree-vect-stmts.c >didn't >have code to deal with the integral masktype argument the builtins >have, >where we need to compute it as vector bool first and then VCE. > >Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok >for >trunk? OK. Is there any chance you could look at replacing the i386 code to work with the new IFN style used by Aarch64? >2018-12-12 Jakub Jelinek <jakub@redhat.com> > > PR tree-optimization/88464 > * tree-vect-stmts.c (vect_build_gather_load_calls): Handle >INTEGER_TYPE > masktype if mask is non-NULL. > (vectorizable_load): Don't reject masked gather loads if masktype > in the decl is INTEGER_TYPE. > > * gcc.target/i386/avx512f-pr88462-1.c: New test. > * gcc.target/i386/avx512f-pr88462-2.c: New test. > >--- gcc/tree-vect-stmts.c.jj 2018-11-20 21:39:06.983478940 +0100 >+++ gcc/tree-vect-stmts.c 2018-12-12 20:05:42.980019685 +0100 >@@ -2647,8 +2647,13 @@ vect_build_gather_load_calls (stmt_vec_i > tree idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); > tree masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); > tree scaletype = TREE_VALUE (arglist); >+ tree real_masktype = masktype; > gcc_checking_assert (types_compatible_p (srctype, rettype) >- && (!mask || types_compatible_p (srctype, masktype))); >+ && (!mask >+ || TREE_CODE (masktype) == INTEGER_TYPE >+ || types_compatible_p (srctype, masktype))); >+ if (mask && TREE_CODE (masktype) == INTEGER_TYPE) >+ masktype = build_same_sized_truth_vector_type (srctype); > > tree perm_mask = NULL_TREE; > tree mask_perm_mask = NULL_TREE; >@@ -2763,9 +2768,9 @@ vect_build_gather_load_calls (stmt_vec_i > mask_op = vec_mask; > if (!useless_type_conversion_p (masktype, TREE_TYPE (vec_mask))) > { >- gcc_assert >- (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op)), >- TYPE_VECTOR_SUBPARTS (masktype))); >+ poly_uint64 sub1 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op)); >+ poly_uint64 sub2 = TYPE_VECTOR_SUBPARTS (masktype); >+ gcc_assert (known_eq (sub1, sub2)); > var = vect_get_new_ssa_name (masktype, vect_simple_var); > mask_op = build1 (VIEW_CONVERT_EXPR, masktype, mask_op); > gassign *new_stmt >@@ -2777,8 +2782,33 @@ vect_build_gather_load_calls (stmt_vec_i > src_op = mask_op; > } > >+ tree mask_arg = mask_op; >+ if (masktype != real_masktype) >+ { >+ tree utype; >+ if (TYPE_MODE (real_masktype) == TYPE_MODE (masktype)) >+ utype = real_masktype; >+ else >+ utype = lang_hooks.types.type_for_mode (TYPE_MODE (masktype), 1); >+ var = vect_get_new_ssa_name (utype, vect_scalar_var); >+ mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_op); >+ gassign *new_stmt >+ = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg); >+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi); >+ mask_arg = var; >+ if (!useless_type_conversion_p (real_masktype, utype)) >+ { >+ gcc_assert (TYPE_PRECISION (utype) >+ <= TYPE_PRECISION (real_masktype)); >+ var = vect_get_new_ssa_name (real_masktype, vect_scalar_var); >+ new_stmt = gimple_build_assign (var, NOP_EXPR, utype); >+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi); >+ mask_arg = var; >+ } >+ src_op = build_zero_cst (srctype); >+ } >gcall *new_call = gimple_build_call (gs_info->decl, 5, src_op, ptr, op, >- mask_op, scale); >+ mask_arg, scale); > > stmt_vec_info new_stmt_info; > if (!useless_type_conversion_p (vectype, rettype)) >@@ -7555,20 +7585,6 @@ vectorizable_load (stmt_vec_info stmt_in > TYPE_MODE (mask_vectype), true)) > return false; > } >- else if (memory_access_type == VMAT_GATHER_SCATTER && >gs_info.decl) >- { >- tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); >- tree masktype >- = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist)))); >- if (TREE_CODE (masktype) == INTEGER_TYPE) >- { >- if (dump_enabled_p ()) >- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >- "masked gather with integer mask not" >- " supported."); >- return false; >- } >- } > else if (memory_access_type != VMAT_LOAD_STORE_LANES > && memory_access_type != VMAT_GATHER_SCATTER) > { >--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c.jj 2018-12-12 >20:39:23.207028399 +0100 >+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c 2018-12-12 >20:44:22.195148359 +0100 >@@ -0,0 +1,35 @@ >+/* PR tree-optimization/88462 */ >+/* { dg-do compile } */ >+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 >-mtune=skylake-avx512 -fdump-tree-vect-details" } */ >+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte >vectors" 3 "vect" } } */ >+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" >3 "vect" } } */ >+ >+__attribute__((noipa)) void >+f1 (double * __restrict__ a, const double * __restrict__ b, const int >* __restrict__ c, int n) >+{ >+ int i; >+#pragma GCC ivdep >+ for (i = 0; i < n; ++i) >+ if (a[i] > 10.0) >+ a[i] = b[c[i]]; >+} >+ >+__attribute__((noipa)) void >+f2 (double * __restrict__ a, const double * __restrict__ b, const long >* __restrict__ c, int n) >+{ >+ int i; >+#pragma GCC ivdep >+ for (i = 0; i < n; ++i) >+ if (a[i] > 10.0) >+ a[i] = b[c[i]]; >+} >+ >+__attribute__((noipa)) void >+f3 (float * __restrict__ a, const float * __restrict__ b, const int * >__restrict__ c, int n) >+{ >+ int i; >+#pragma GCC ivdep >+ for (i = 0; i < n; ++i) >+ if (a[i] > 10.0f) >+ a[i] = b[c[i]]; >+} >--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c.jj 2018-12-12 >20:58:36.540211660 +0100 >+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c 2018-12-12 >21:00:26.182423054 +0100 >@@ -0,0 +1,51 @@ >+/* PR tree-optimization/88462 */ >+/* { dg-do run { target { avx512f } } } */ >+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 >-mtune=skylake-avx512" } */ >+ >+#include "avx512f-check.h" >+ >+#include "avx512f-pr88462-1.c" >+ >+static void >+avx512f_test (void) >+{ >+ double a[1024], b[1024]; >+ float c[1024], f[1024]; >+ int d[1024]; >+ long e[1024]; >+ int i; >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ a[i] = (i % 3) != 0 ? 15.0 : -5.0; >+ b[i] = 2 * i; >+ d[i] = (i % 3) ? 1023 - i : __INT_MAX__; >+ } >+ f1 (a, b, d, 1024); >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) >+ abort (); >+ a[i] = (i % 3) != 1 ? 15.0 : -5.0; >+ b[i] = 3 * i; >+ e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; >+ } >+ f2 (a, b, e, 1024); >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0)) >+ abort (); >+ c[i] = (i % 3) != 2 ? 15.0f : -5.0f; >+ d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; >+ f[i] = 4 * i; >+ } >+ f3 (c, f, d, 1024); >+ for (i = 0; i < 1024; i++) >+ { >+ asm volatile ("" : "+g" (i)); >+ if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f)) >+ abort (); >+ } >+} > > Jakub
On Thu, Dec 13, 2018 at 05:09:17PM +0100, Richard Biener wrote: > Is there any chance you could look at replacing the i386 code to work with > the new IFN style used by Aarch64? Maybe for GCC 10, I'm afraid it is a lot of work mainly due to the weirdnesses of x86 mixed index vs. data sized types, that would be non-fun to represent in the optabs. Jakub
--- gcc/tree-vect-stmts.c.jj 2018-11-20 21:39:06.983478940 +0100 +++ gcc/tree-vect-stmts.c 2018-12-12 20:05:42.980019685 +0100 @@ -2647,8 +2647,13 @@ vect_build_gather_load_calls (stmt_vec_i tree idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); tree masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); tree scaletype = TREE_VALUE (arglist); + tree real_masktype = masktype; gcc_checking_assert (types_compatible_p (srctype, rettype) - && (!mask || types_compatible_p (srctype, masktype))); + && (!mask + || TREE_CODE (masktype) == INTEGER_TYPE + || types_compatible_p (srctype, masktype))); + if (mask && TREE_CODE (masktype) == INTEGER_TYPE) + masktype = build_same_sized_truth_vector_type (srctype); tree perm_mask = NULL_TREE; tree mask_perm_mask = NULL_TREE; @@ -2763,9 +2768,9 @@ vect_build_gather_load_calls (stmt_vec_i mask_op = vec_mask; if (!useless_type_conversion_p (masktype, TREE_TYPE (vec_mask))) { - gcc_assert - (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op)), - TYPE_VECTOR_SUBPARTS (masktype))); + poly_uint64 sub1 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op)); + poly_uint64 sub2 = TYPE_VECTOR_SUBPARTS (masktype); + gcc_assert (known_eq (sub1, sub2)); var = vect_get_new_ssa_name (masktype, vect_simple_var); mask_op = build1 (VIEW_CONVERT_EXPR, masktype, mask_op); gassign *new_stmt @@ -2777,8 +2782,33 @@ vect_build_gather_load_calls (stmt_vec_i src_op = mask_op; } + tree mask_arg = mask_op; + if (masktype != real_masktype) + { + tree utype; + if (TYPE_MODE (real_masktype) == TYPE_MODE (masktype)) + utype = real_masktype; + else + utype = lang_hooks.types.type_for_mode (TYPE_MODE (masktype), 1); + var = vect_get_new_ssa_name (utype, vect_scalar_var); + mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_op); + gassign *new_stmt + = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + mask_arg = var; + if (!useless_type_conversion_p (real_masktype, utype)) + { + gcc_assert (TYPE_PRECISION (utype) + <= TYPE_PRECISION (real_masktype)); + var = vect_get_new_ssa_name (real_masktype, vect_scalar_var); + new_stmt = gimple_build_assign (var, NOP_EXPR, utype); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + mask_arg = var; + } + src_op = build_zero_cst (srctype); + } gcall *new_call = gimple_build_call (gs_info->decl, 5, src_op, ptr, op, - mask_op, scale); + mask_arg, scale); stmt_vec_info new_stmt_info; if (!useless_type_conversion_p (vectype, rettype)) @@ -7555,20 +7585,6 @@ vectorizable_load (stmt_vec_info stmt_in TYPE_MODE (mask_vectype), true)) return false; } - else if (memory_access_type == VMAT_GATHER_SCATTER && gs_info.decl) - { - tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); - tree masktype - = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist)))); - if (TREE_CODE (masktype) == INTEGER_TYPE) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "masked gather with integer mask not" - " supported."); - return false; - } - } else if (memory_access_type != VMAT_LOAD_STORE_LANES && memory_access_type != VMAT_GATHER_SCATTER) { --- gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c.jj 2018-12-12 20:39:23.207028399 +0100 +++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c 2018-12-12 20:44:22.195148359 +0100 @@ -0,0 +1,35 @@ +/* PR tree-optimization/88462 */ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */ +/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 3 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 "vect" } } */ + +__attribute__((noipa)) void +f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (a[i] > 10.0) + a[i] = b[c[i]]; +} + +__attribute__((noipa)) void +f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (a[i] > 10.0) + a[i] = b[c[i]]; +} + +__attribute__((noipa)) void +f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (a[i] > 10.0f) + a[i] = b[c[i]]; +} --- gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c.jj 2018-12-12 20:58:36.540211660 +0100 +++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c 2018-12-12 21:00:26.182423054 +0100 @@ -0,0 +1,51 @@ +/* PR tree-optimization/88462 */ +/* { dg-do run { target { avx512f } } } */ +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */ + +#include "avx512f-check.h" + +#include "avx512f-pr88462-1.c" + +static void +avx512f_test (void) +{ + double a[1024], b[1024]; + float c[1024], f[1024]; + int d[1024]; + long e[1024]; + int i; + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + a[i] = (i % 3) != 0 ? 15.0 : -5.0; + b[i] = 2 * i; + d[i] = (i % 3) ? 1023 - i : __INT_MAX__; + } + f1 (a, b, d, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) + abort (); + a[i] = (i % 3) != 1 ? 15.0 : -5.0; + b[i] = 3 * i; + e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; + } + f2 (a, b, e, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0)) + abort (); + c[i] = (i % 3) != 2 ? 15.0f : -5.0f; + d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; + f[i] = 4 * i; + } + f3 (c, f, d, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f)) + abort (); + } +}