Message ID | 20230324130404.2C4ED138ED@imap2.suse-dmz.suse.de |
---|---|
State | New |
Headers | show |
Series | [1/2] Add emulated scatter capability to the vectorizer | expand |
> Emulated gather/scatter behave similarly to strided elementwise > accesses in that they need to decompose the offset vector > and construct or decompose the data vector so handle them > the same way, pessimizing the cases with many elements. > > For pr88531-2c.c instead of > > .L4: > leaq (%r15,%rcx), %rdx > incl %edi > movl 16(%rdx), %r13d > movl 24(%rdx), %r14d > movl (%rdx), %r10d > movl 4(%rdx), %r9d > movl 8(%rdx), %ebx > movl 12(%rdx), %r11d > movl 20(%rdx), %r12d > vmovss (%rax,%r14,4), %xmm2 > movl 28(%rdx), %edx > vmovss (%rax,%r13,4), %xmm1 > vmovss (%rax,%r10,4), %xmm0 > vinsertps $0x10, (%rax,%rdx,4), %xmm2, %xmm2 > vinsertps $0x10, (%rax,%r12,4), %xmm1, %xmm1 > vinsertps $0x10, (%rax,%r9,4), %xmm0, %xmm0 > vmovlhps %xmm2, %xmm1, %xmm1 > vmovss (%rax,%rbx,4), %xmm2 > vinsertps $0x10, (%rax,%r11,4), %xmm2, %xmm2 > vmovlhps %xmm2, %xmm0, %xmm0 > vinsertf128 $0x1, %xmm1, %ymm0, %ymm0 > vmulps %ymm3, %ymm0, %ymm0 > vmovups %ymm0, (%r8,%rcx) > addq $32, %rcx > cmpl %esi, %edi > jb .L4 > > we now prefer > > .L4: > leaq 0(%rbp,%rdx,8), %rcx > movl (%rcx), %r10d > movl 4(%rcx), %ecx > vmovss (%rsi,%r10,4), %xmm0 > vinsertps $0x10, (%rsi,%rcx,4), %xmm0, %xmm0 > vmulps %xmm1, %xmm0, %xmm0 > vmovlps %xmm0, (%rbx,%rdx,8) > incq %rdx > cmpl %edi, %edx > jb .L4 > > which vectorizes with SSE instead of AVX2, which looks like an > improvement. > > When testing this on SPEC CPU 2017 with -Ofast -flto -march=znver4 > there are quite a few cases where we now prefer SSE vectorization > over AVX512 + AVX2 epilogue and some cases where we now reject > vectorization. Runtime-wise the changes are noise, with the off-noise > candidates better after the patch. > > Bootstrapped and tested on x86_64-unknown-linux-gnu. > > OK for stage1? > > Thanks, > Richard. > > * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): > Tame down element extracts and scalar loads for gather/scatter > similar to elementwise strided accesses. > > * gcc.target/i386/pr89618-2.c: New testcase. 
> * gcc.target/i386/pr88531-2b.c: Adjust. > * gcc.target/i386/pr88531-2c.c: Likewise. OK. Honza
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 6a8734c2346..7a0b48c62c5 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -23555,8 +23555,10 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, && stmt_info && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type) - && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE - && TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) != INTEGER_CST) + && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE + && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) + != INTEGER_CST)) + || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)) { stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); diff --git a/gcc/testsuite/gcc.target/i386/pr88531-2b.c b/gcc/testsuite/gcc.target/i386/pr88531-2b.c index 011607c3d54..cdefff2ce8e 100644 --- a/gcc/testsuite/gcc.target/i386/pr88531-2b.c +++ b/gcc/testsuite/gcc.target/i386/pr88531-2b.c @@ -3,4 +3,4 @@ #include "pr88531-2a.c" -/* { dg-final { scan-assembler-times "vmulps" 2 } } */ +/* { dg-final { scan-assembler-times "vmulps" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr88531-2c.c b/gcc/testsuite/gcc.target/i386/pr88531-2c.c index 0f7ec3832f8..17b24c0dacc 100644 --- a/gcc/testsuite/gcc.target/i386/pr88531-2c.c +++ b/gcc/testsuite/gcc.target/i386/pr88531-2c.c @@ -3,4 +3,4 @@ #include "pr88531-2a.c" -/* { dg-final { scan-assembler-times "vmulps" 2 } } */ +/* { dg-final { scan-assembler-times "vmulps" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr89618-2.c b/gcc/testsuite/gcc.target/i386/pr89618-2.c new file mode 100644 index 00000000000..0b7dcfd8806 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr89618-2.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */ + +void foo (int n, int *off, double *a) 
+{ + const int m = 32; + + for (int j = 0; j < n/m; ++j) + { + int const start = j*m; + int const end = (j+1)*m; + +#pragma GCC ivdep + for (int i = start; i < end; ++i) + { + a[off[i]] = a[i] < 0 ? a[i] : 0; + } + } +} + +/* Make sure the cost model selects SSE vectors rather than AVX to avoid + too many scalar ops for the address computes in the loop. */ +/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */