diff mbox series

znver3 tuning part 2

Message ID 20210317214524.GA55027@kam.mff.cuni.cz
State New
Headers show
Series znver3 tuning part 2 | expand

Commit Message

Jan Hubicka March 17, 2021, 9:45 p.m. UTC
Hi,
this patch enables gather on zen3 hardware.  For TSVC it gets used by 6
benchmarks with the following runtime improvements:

s4114: 1.424 -> 1.209  (84.9017%)
s4115: 2.021 -> 1.065  (52.6967%)
s4116: 1.549 -> 0.854  (55.1323%)
s4117: 1.386 -> 1.193  (86.075%)
vag: 2.741 -> 1.940  (70.7771%)

and one regression:

s4112: 1.115 -> 1.184  (106.188%)

In s4112 the internal loop is:

        for (int i = 0; i < LEN_1D; i++) {
            a[i] += b[ip[i]] * s;
        }

(so a standard accumulate and add with indirect addressing)

  40a400:       c5 fe 6f 24 03          vmovdqu (%rbx,%rax,1),%ymm4
  40a405:       c5 fc 28 da             vmovaps %ymm2,%ymm3
  40a409:       48 83 c0 20             add    $0x20,%rax
  40a40d:       c4 e2 65 92 04 a5 00    vgatherdps %ymm3,0x594100(,%ymm4,4),%ymm0
  40a414:       41 59 00 
  40a417:       c4 e2 75 a8 80 e0 34    vfmadd213ps 0x5b34e0(%rax),%ymm1,%ymm0
  40a41e:       5b 00 
  40a420:       c5 fc 29 80 e0 34 5b    vmovaps %ymm0,0x5b34e0(%rax)
  40a427:       00 
  40a428:       48 3d 00 f4 01 00       cmp    $0x1f400,%rax
  40a42e:       75 d0                   jne    40a400 <s4112+0x60>

compared to:

  40a280:       49 63 14 04             movslq (%r12,%rax,1),%rdx
  40a284:       48 83 c0 04             add    $0x4,%rax
  40a288:       c5 fa 10 04 95 00 41    vmovss 0x594100(,%rdx,4),%xmm0
  40a28f:       59 00 
  40a291:       c4 e2 71 a9 80 fc 34    vfmadd213ss 0x5b34fc(%rax),%xmm1,%xmm0
  40a298:       5b 00 
  40a29a:       c5 fa 11 80 fc 34 5b    vmovss %xmm0,0x5b34fc(%rax)
  40a2a1:       00 
  40a2a2:       48 3d 00 f4 01 00       cmp    $0x1f400,%rax
  40a2a8:       75 d6                   jne    40a280 <s4112+0x40>

Looking at instruction latencies

 - fmadd is 4 cycles
 - vgatherdps is 39

So vgather itself takes 4.8 cycles per iteration, and the CPU is probably able
to execute the rest out of order, getting close to 4 cycles per iteration (it
can do 2 loads in parallel, one store, and the rest fits easily into the
execution resources). That would explain the 20% slowdown.

gimple internal loop is:
  _2 = a[i_38];
  _3 = (long unsigned int) i_38;
  _4 = _3 * 4;
  _5 = ip_18 + _4;
  _6 = *_5;
  _7 = b[_6];
  _8 = _7 * s_19;
  _9 = _2 + _8;
  a[i_38] = _9;
  i_28 = i_38 + 1;
  ivtmp_52 = ivtmp_53 - 1;
  if (ivtmp_52 != 0)
    goto <bb 8>; [98.99%]
  else
    goto <bb 4>; [1.01%]

0x25bac30 a[i_38] 1 times scalar_load costs 12 in body
0x25bac30 *_5 1 times scalar_load costs 12 in body
0x25bac30 b[_6] 1 times scalar_load costs 12 in body
0x25bac30 _7 * s_19 1 times scalar_stmt costs 12 in body
0x25bac30 _2 + _8 1 times scalar_stmt costs 12 in body
0x25bac30 _9 1 times scalar_store costs 16 in body

so 19 cycles estimate of scalar load

0x2668630 a[i_38] 1 times vector_load costs 12 in body
0x2668630 *_5 1 times unaligned_load (misalign -1) costs 12 in body
0x2668630 b[_6] 8 times scalar_load costs 96 in body
0x2668630 _7 * s_19 1 times scalar_to_vec costs 4 in prologue
0x2668630 _7 * s_19 1 times vector_stmt costs 12 in body
0x2668630 _2 + _8 1 times vector_stmt costs 12 in body
0x2668630 _9 1 times vector_store costs 16 in body

so 40 cycles per 8x vectorized body

tsvc.c:3450:27: note:  operating only on full vectors.
tsvc.c:3450:27: note:  Cost model analysis:
  Vector inside of loop cost: 160
  Vector prologue cost: 4
  Vector epilogue cost: 0
  Scalar iteration cost: 76
  Scalar outside cost: 0
  Vector outside cost: 4
  prologue iterations: 0
  epilogue iterations: 0
  Calculated minimum iters for profitability: 1

I think this generally suffers from GIGO principle.
One problem seems to be that we do not know about fmadd yet and compute it as
two instructions (6 cycles instead of 4). A more important problem is that we
do not account for the parallelism at all.  I do not see how to disable the
vectorization here without bumping gather costs noticeably off reality, and thus
we probably can try to experiment with this if more similar problems are found.

Icc is also using gather in s1115 and s128.
For s1115 the vectorization does not seem to help and s128 gets slower.

Neither clang nor aocc uses gathers.

Honza

	* x86-tune-costs.h (znver3_cost): Update costs of gather to match reality.
	* x86-tune.def (X86_TUNE_USE_GATHER): Enable for znver3.

Comments

Richard Biener March 18, 2021, 12:34 p.m. UTC | #1
On Wed, Mar 17, 2021 at 10:46 PM Jan Hubicka <hubicka@ucw.cz> wrote:
>
> Hi,
> this patch enables gather on zen3 hardware.  For TSVC it gets used by 6
> benchmarks with the following runtime improvements:
>
> s4114: 1.424 -> 1.209  (84.9017%)
> s4115: 2.021 -> 1.065  (52.6967%)
> s4116: 1.549 -> 0.854  (55.1323%)
> s4117: 1.386 -> 1.193  (86.075%)
> vag: 2.741 -> 1.940  (70.7771%)
>
> and one regression:
>
> s4112: 1.115 -> 1.184  (106.188%)
>
> In s4112 the internal loop is:
>
>         for (int i = 0; i < LEN_1D; i++) {
>             a[i] += b[ip[i]] * s;
>         }
>
> (so a standard accumulate and add with indirect addressing)
>
>   40a400:       c5 fe 6f 24 03          vmovdqu (%rbx,%rax,1),%ymm4
>   40a405:       c5 fc 28 da             vmovaps %ymm2,%ymm3
>   40a409:       48 83 c0 20             add    $0x20,%rax
>   40a40d:       c4 e2 65 92 04 a5 00    vgatherdps %ymm3,0x594100(,%ymm4,4),%ymm0
>   40a414:       41 59 00
>   40a417:       c4 e2 75 a8 80 e0 34    vfmadd213ps 0x5b34e0(%rax),%ymm1,%ymm0
>   40a41e:       5b 00
>   40a420:       c5 fc 29 80 e0 34 5b    vmovaps %ymm0,0x5b34e0(%rax)
>   40a427:       00
>   40a428:       48 3d 00 f4 01 00       cmp    $0x1f400,%rax
>   40a42e:       75 d0                   jne    40a400 <s4112+0x60>
>
> compared to:
>
>   40a280:       49 63 14 04             movslq (%r12,%rax,1),%rdx
>   40a284:       48 83 c0 04             add    $0x4,%rax
>   40a288:       c5 fa 10 04 95 00 41    vmovss 0x594100(,%rdx,4),%xmm0
>   40a28f:       59 00
>   40a291:       c4 e2 71 a9 80 fc 34    vfmadd213ss 0x5b34fc(%rax),%xmm1,%xmm0
>   40a298:       5b 00
>   40a29a:       c5 fa 11 80 fc 34 5b    vmovss %xmm0,0x5b34fc(%rax)
>   40a2a1:       00
>   40a2a2:       48 3d 00 f4 01 00       cmp    $0x1f400,%rax
>   40a2a8:       75 d6                   jne    40a280 <s4112+0x40>
>
> Looking at instruction latencies
>
>  - fmadd is 4 cycles
>  - vgatherdps is 39
>
> So vgather itself takes 4.8 cycles per iteration, and the CPU is probably able
> to execute the rest out of order, getting close to 4 cycles per iteration (it
> can do 2 loads in parallel, one store, and the rest fits easily into the
> execution resources). That would explain the 20% slowdown.
>
> gimple internal loop is:
>   _2 = a[i_38];
>   _3 = (long unsigned int) i_38;
>   _4 = _3 * 4;
>   _5 = ip_18 + _4;
>   _6 = *_5;
>   _7 = b[_6];
>   _8 = _7 * s_19;
>   _9 = _2 + _8;
>   a[i_38] = _9;
>   i_28 = i_38 + 1;
>   ivtmp_52 = ivtmp_53 - 1;
>   if (ivtmp_52 != 0)
>     goto <bb 8>; [98.99%]
>   else
>     goto <bb 4>; [1.01%]
>
> 0x25bac30 a[i_38] 1 times scalar_load costs 12 in body
> 0x25bac30 *_5 1 times scalar_load costs 12 in body
> 0x25bac30 b[_6] 1 times scalar_load costs 12 in body
> 0x25bac30 _7 * s_19 1 times scalar_stmt costs 12 in body
> 0x25bac30 _2 + _8 1 times scalar_stmt costs 12 in body
> 0x25bac30 _9 1 times scalar_store costs 16 in body
>
> so 19 cycles estimate of scalar load
>
> 0x2668630 a[i_38] 1 times vector_load costs 12 in body
> 0x2668630 *_5 1 times unaligned_load (misalign -1) costs 12 in body
> 0x2668630 b[_6] 8 times scalar_load costs 96 in body
> 0x2668630 _7 * s_19 1 times scalar_to_vec costs 4 in prologue
> 0x2668630 _7 * s_19 1 times vector_stmt costs 12 in body
> 0x2668630 _2 + _8 1 times vector_stmt costs 12 in body
> 0x2668630 _9 1 times vector_store costs 16 in body
>
> so 40 cycles per 8x vectorized body
>
> tsvc.c:3450:27: note:  operating only on full vectors.
> tsvc.c:3450:27: note:  Cost model analysis:
>   Vector inside of loop cost: 160
>   Vector prologue cost: 4
>   Vector epilogue cost: 0
>   Scalar iteration cost: 76
>   Scalar outside cost: 0
>   Vector outside cost: 4
>   prologue iterations: 0
>   epilogue iterations: 0
>   Calculated minimum iters for profitability: 1
>
> I think this generally suffers from GIGO principle.
> One problem seems to be that we do not know about fmadd yet and compute it as
> two instructions (6 cycles instead of 4). A more important problem is that we
> do not account for the parallelism at all.  I do not see how to disable the
> vectorization here without bumping gather costs noticeably off reality, and thus
> we probably can try to experiment with this if more similar problems are found.

Yep.  Vectorizer costing is really hard w/o modeling the CPU pipeline more
accurately.  Esp. for the scalar side of the code where modern CPUs often
can effectively do two-lane "vectorization" by executing two lanes in parallel.
At the moment we simply assume a single-issue pipeline.  But doing better
requires tracking dependences but the current vectorizer costing API does
not expose dependencies to the target so even rough estimates are hard to
come by (like assuming an issue width of two).  My current plan is not to
revisit this as long as we have both SLP and non-SLP data structures.

> Icc is also using gather in s1115 and s128.
> For s1115 the vectorization does not seem to help and s128 gets slower.
>
> Neither clang nor aocc uses gathers.
>
> Honza
>
>         * x86-tune-costs.h (znver3_cost): Update costs of gather to match reality.
>         * x86-tune.def (X86_TUNE_USE_GATHER): Enable for znver3.
>
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> index e655e668c7a..db03738313e 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -1767,11 +1767,11 @@ struct processor_costs znver3_cost = {
>    2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
>                                            register.  */
>    6,                                   /* cost of moving SSE register to integer.  */
> -  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> -     throughput 12.  Approx 9 uops do not depend on vector size and every load
> -     is 7 uops.  */
> -  18, 8,                               /* Gather load static, per_elt.  */
> -  18, 10,                              /* Gather store static, per_elt.  */
> +  /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
> +     throughput 9.  Approx 7 uops do not depend on vector size and every load
> +     is 4 uops.  */
> +  14, 8,                               /* Gather load static, per_elt.  */
> +  14, 10,                              /* Gather store static, per_elt.  */
>    32,                                  /* size of l1 cache.  */
>    512,                                 /* size of l2 cache.  */
>    64,                                  /* size of prefetch block.  */
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index 140ccb3d921..caebf76736e 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -436,7 +436,7 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
>
>  /* X86_TUNE_USE_GATHER: Use gather instructions.  */
>  DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
> -         ~(m_ZNVER | m_GENERIC))
> +         ~(m_ZNVER1 | m_ZNVER2 | m_GENERIC))
>
>  /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
>     smaller FMA chain.  */
diff mbox series

Patch

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index e655e668c7a..db03738313e 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1767,11 +1767,11 @@  struct processor_costs znver3_cost = {
   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
-  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
-     throughput 12.  Approx 9 uops do not depend on vector size and every load
-     is 7 uops.  */
-  18, 8,				/* Gather load static, per_elt.  */
-  18, 10,				/* Gather store static, per_elt.  */
+  /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
+     throughput 9.  Approx 7 uops do not depend on vector size and every load
+     is 4 uops.  */
+  14, 8,				/* Gather load static, per_elt.  */
+  14, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block.  */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 140ccb3d921..caebf76736e 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -436,7 +436,7 @@  DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
 
 /* X86_TUNE_USE_GATHER: Use gather instructions.  */
 DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
-	  ~(m_ZNVER | m_GENERIC))
+	  ~(m_ZNVER1 | m_ZNVER2 | m_GENERIC))
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
    smaller FMA chain.  */