diff mbox

[fortran] Enable FMA for AVX2 and AVX512F for matmul

Message ID 6ecdb93d-3aef-746c-8ca0-ed6b78eb12d4@netcologne.de
State New
Headers show

Commit Message

Thomas Koenig March 1, 2017, 9 p.m. UTC
Hello world,

the attached patch enables FMA for the AVX2 and AVX512F variants of
matmul.  This should bring a very nice speedup (although I have
been unable to run benchmarks due to lack of a suitable machine).

Question: Is this still appropriate for the current state of trunk?
Or rather, OK for when gcc 8 opens (which might still be some time
in the future)?

2017-03-01  Thomas Koenig  <tkoenig@gcc.gnu.org>

         PR fortran/78379
         * m4/matmul.m4: (matmul_'rtype_code`_avx2): Also generate for
         reals.  Add fma to target options.
         (matmul_'rtype_code`_avx512f): Add fma to target options.
         (matmul_'rtype_code`):  Call AVX2 and AVX512F only if
         FMA is available.
         * generated/matmul_c10.c: Regenerated.
         * generated/matmul_c16.c: Regenerated.
         * generated/matmul_c4.c: Regenerated.
         * generated/matmul_c8.c: Regenerated.
         * generated/matmul_i1.c: Regenerated.
         * generated/matmul_i16.c: Regenerated.
         * generated/matmul_i2.c: Regenerated.
         * generated/matmul_i4.c: Regenerated.
         * generated/matmul_i8.c: Regenerated.
         * generated/matmul_r10.c: Regenerated.
         * generated/matmul_r16.c: Regenerated.
         * generated/matmul_r4.c: Regenerated.
         * generated/matmul_r8.c: Regenerated.

Regards

	Thomas

Comments

Jerry DeLisle March 2, 2017, 3:20 a.m. UTC | #1
On 03/01/2017 01:00 PM, Thomas Koenig wrote:
> Hello world,
>
> the attached patch enables FMA for the AVX2 and AVX512F variants of
> matmul.  This should bring a very nice speedup (although I have
> been unable to run benchmarks due to lack of a suitable machine).
>
> Question: Is this still appropriate for the current state of trunk?
> Or rather, OK for when gcc 8 opens (which might still be some time
> in the future)?

I think it may be appropriate now because you are making an adjustment to the 
just-added new feature.

I would prefer that it was tested on the actual expected platform. Does anyone 
anywhere on this list have access to one of these machines to test?

Jerry
Janne Blomqvist March 2, 2017, 7:32 a.m. UTC | #2
On Wed, Mar 1, 2017 at 11:00 PM, Thomas Koenig <tkoenig@netcologne.de> wrote:
> Hello world,
>
> the attached patch enables FMA for the AVX2 and AVX512F variants of
> matmul.  This should bring a very nice speedup (although I have
> been unable to run benchmarks due to lack of a suitable machine).

In lieu of benchmarks, have you looked at the generated asm to verify
that fma is actually used?

> Question: Is this still appropriate for the current state of trunk?

Yes, looks pretty safe.
Thomas Koenig March 2, 2017, 7:50 a.m. UTC | #3
Am 02.03.2017 um 08:32 schrieb Janne Blomqvist:
> On Wed, Mar 1, 2017 at 11:00 PM, Thomas Koenig <tkoenig@netcologne.de> wrote:
>> Hello world,
>>
>> the attached patch enables FMA for the AVX2 and AVX512F variants of
>> matmul.  This should bring a very nice speedup (although I have
>> been unable to run benchmarks due to lack of a suitable machine).
>
> In lieu of benchmarks, have you looked at the generated asm to verify
> that fma is actually used?

Yes, I did.

Here's something from the new matmul_r8_avx2:

     156c:       c4 62 e5 b8 fd          vfmadd231pd %ymm5,%ymm3,%ymm15
     1571:       c4 c1 79 10 04 06       vmovupd (%r14,%rax,1),%xmm0
     1577:       c4 62 dd b8 db          vfmadd231pd %ymm3,%ymm4,%ymm11
     157c:       c4 c3 7d 18 44 06 10    vinsertf128 
$0x1,0x10(%r14,%rax,1),%ymm0,%ymm0
     1583:       01
     1584:       c4 62 ed b8 ed          vfmadd231pd %ymm5,%ymm2,%ymm13
     1589:       c4 e2 ed b8 fc          vfmadd231pd %ymm4,%ymm2,%ymm7
     158e:       c4 e2 fd a8 ad 30 ff    vfmadd213pd 
-0x800d0(%rbp),%ymm0,%ymm5

... and here from matmul_r8_avx512f:

     1da8:       c4 a1 7b 10 14 d6       vmovsd (%rsi,%r10,8),%xmm2
     1dae:       c4 c2 b1 b9 f0          vfmadd231sd %xmm8,%xmm9,%xmm6
     1db3:       62 62 ed 08 b9 e5       vfmadd231sd %xmm5,%xmm2,%xmm28
     1db9:       62 62 ed 08 b9 ec       vfmadd231sd %xmm4,%xmm2,%xmm29
     1dbf:       62 62 ed 08 b9 f3       vfmadd231sd %xmm3,%xmm2,%xmm30
     1dc5:       c4 e2 91 99 e8          vfmadd132sd %xmm0,%xmm13,%xmm5
     1dca:       c4 e2 99 99 e0          vfmadd132sd %xmm0,%xmm12,%xmm4
     1dcf:       c4 e2 a1 99 d8          vfmadd132sd %xmm0,%xmm11,%xmm3
     1dd4:       c4 c2 a9 99 d1          vfmadd132sd %xmm9,%xmm10,%xmm2
     1dd9:       c4 c2 89 99 c1          vfmadd132sd %xmm9,%xmm14,%xmm0
     1dde:       0f 8e d3 fe ff ff       jle    1cb7 
<matmul_r8_avx512f+0x1cb7>

... so this is looking pretty good.

Regards

	Thomas
Janne Blomqvist March 2, 2017, 8:09 a.m. UTC | #4
On Thu, Mar 2, 2017 at 9:50 AM, Thomas Koenig <tkoenig@netcologne.de> wrote:
> Am 02.03.2017 um 08:32 schrieb Janne Blomqvist:
>>
>> On Wed, Mar 1, 2017 at 11:00 PM, Thomas Koenig <tkoenig@netcologne.de>
>> wrote:
>>>
>>> Hello world,
>>>
>>> the attached patch enables FMA for the AVX2 and AVX512F variants of
>>> matmul.  This should bring a very nice speedup (although I have
>>> been unable to run benchmarks due to lack of a suitable machine).
>>
>>
>> In lieu of benchmarks, have you looked at the generated asm to verify
>> that fma is actually used?
>
>
> Yes, I did.
>
> Here's something from the new matmul_r8_avx2:
>
>     156c:       c4 62 e5 b8 fd          vfmadd231pd %ymm5,%ymm3,%ymm15
>     1571:       c4 c1 79 10 04 06       vmovupd (%r14,%rax,1),%xmm0
>     1577:       c4 62 dd b8 db          vfmadd231pd %ymm3,%ymm4,%ymm11
>     157c:       c4 c3 7d 18 44 06 10    vinsertf128
> $0x1,0x10(%r14,%rax,1),%ymm0,%ymm0
>     1583:       01
>     1584:       c4 62 ed b8 ed          vfmadd231pd %ymm5,%ymm2,%ymm13
>     1589:       c4 e2 ed b8 fc          vfmadd231pd %ymm4,%ymm2,%ymm7
>     158e:       c4 e2 fd a8 ad 30 ff    vfmadd213pd
> -0x800d0(%rbp),%ymm0,%ymm5

Great, looks good!

> ... and here from matmul_r8_avx512f:
>
>     1da8:       c4 a1 7b 10 14 d6       vmovsd (%rsi,%r10,8),%xmm2
>     1dae:       c4 c2 b1 b9 f0          vfmadd231sd %xmm8,%xmm9,%xmm6
>     1db3:       62 62 ed 08 b9 e5       vfmadd231sd %xmm5,%xmm2,%xmm28
>     1db9:       62 62 ed 08 b9 ec       vfmadd231sd %xmm4,%xmm2,%xmm29
>     1dbf:       62 62 ed 08 b9 f3       vfmadd231sd %xmm3,%xmm2,%xmm30
>     1dc5:       c4 e2 91 99 e8          vfmadd132sd %xmm0,%xmm13,%xmm5
>     1dca:       c4 e2 99 99 e0          vfmadd132sd %xmm0,%xmm12,%xmm4
>     1dcf:       c4 e2 a1 99 d8          vfmadd132sd %xmm0,%xmm11,%xmm3
>     1dd4:       c4 c2 a9 99 d1          vfmadd132sd %xmm9,%xmm10,%xmm2
>     1dd9:       c4 c2 89 99 c1          vfmadd132sd %xmm9,%xmm14,%xmm0
>     1dde:       0f 8e d3 fe ff ff       jle    1cb7
> <matmul_r8_avx512f+0x1cb7>

Good, it's using fma, but why is this using xmm registers? That would
mean it's operating only on 128 bit blocks at a time so no better than
plain AVX. AFAIU avx512 should use zmm registers to operate on 512 bit
chunks.

I guess this is not due to your patch, but some other issue.
Richard Biener March 2, 2017, 8:14 a.m. UTC | #5
On Thu, Mar 2, 2017 at 9:09 AM, Janne Blomqvist
<blomqvist.janne@gmail.com> wrote:
> On Thu, Mar 2, 2017 at 9:50 AM, Thomas Koenig <tkoenig@netcologne.de> wrote:
>> Am 02.03.2017 um 08:32 schrieb Janne Blomqvist:
>>>
>>> On Wed, Mar 1, 2017 at 11:00 PM, Thomas Koenig <tkoenig@netcologne.de>
>>> wrote:
>>>>
>>>> Hello world,
>>>>
>>>> the attached patch enables FMA for the AVX2 and AVX512F variants of
>>>> matmul.  This should bring a very nice speedup (although I have
>>>> been unable to run benchmarks due to lack of a suitable machine).
>>>
>>>
>>> In lieu of benchmarks, have you looked at the generated asm to verify
>>> that fma is actually used?
>>
>>
>> Yes, I did.
>>
>> Here's something from the new matmul_r8_avx2:
>>
>>     156c:       c4 62 e5 b8 fd          vfmadd231pd %ymm5,%ymm3,%ymm15
>>     1571:       c4 c1 79 10 04 06       vmovupd (%r14,%rax,1),%xmm0
>>     1577:       c4 62 dd b8 db          vfmadd231pd %ymm3,%ymm4,%ymm11
>>     157c:       c4 c3 7d 18 44 06 10    vinsertf128
>> $0x1,0x10(%r14,%rax,1),%ymm0,%ymm0
>>     1583:       01
>>     1584:       c4 62 ed b8 ed          vfmadd231pd %ymm5,%ymm2,%ymm13
>>     1589:       c4 e2 ed b8 fc          vfmadd231pd %ymm4,%ymm2,%ymm7
>>     158e:       c4 e2 fd a8 ad 30 ff    vfmadd213pd
>> -0x800d0(%rbp),%ymm0,%ymm5
>
> Great, looks good!
>
>> ... and here from matmul_r8_avx512f:
>>
>>     1da8:       c4 a1 7b 10 14 d6       vmovsd (%rsi,%r10,8),%xmm2
>>     1dae:       c4 c2 b1 b9 f0          vfmadd231sd %xmm8,%xmm9,%xmm6
>>     1db3:       62 62 ed 08 b9 e5       vfmadd231sd %xmm5,%xmm2,%xmm28
>>     1db9:       62 62 ed 08 b9 ec       vfmadd231sd %xmm4,%xmm2,%xmm29
>>     1dbf:       62 62 ed 08 b9 f3       vfmadd231sd %xmm3,%xmm2,%xmm30
>>     1dc5:       c4 e2 91 99 e8          vfmadd132sd %xmm0,%xmm13,%xmm5
>>     1dca:       c4 e2 99 99 e0          vfmadd132sd %xmm0,%xmm12,%xmm4
>>     1dcf:       c4 e2 a1 99 d8          vfmadd132sd %xmm0,%xmm11,%xmm3
>>     1dd4:       c4 c2 a9 99 d1          vfmadd132sd %xmm9,%xmm10,%xmm2
>>     1dd9:       c4 c2 89 99 c1          vfmadd132sd %xmm9,%xmm14,%xmm0
>>     1dde:       0f 8e d3 fe ff ff       jle    1cb7
>> <matmul_r8_avx512f+0x1cb7>
>
> Good, it's using fma, but why is this using xmm registers? That would
> mean it's operating only on 128 bit blocks at a time so no better than
> plain AVX. AFAIU avx512 should use zmm registers to operate on 512 bit
> chunks.
>
> I guess this is not due to your patch, but some other issue.

The question is, was it using %zmm before the patch?

>
> --
> Janne Blomqvist
Jakub Jelinek March 2, 2017, 8:16 a.m. UTC | #6
On Thu, Mar 02, 2017 at 10:09:31AM +0200, Janne Blomqvist wrote:
> > Here's something from the new matmul_r8_avx2:
> >
> >     156c:       c4 62 e5 b8 fd          vfmadd231pd %ymm5,%ymm3,%ymm15
> >     1571:       c4 c1 79 10 04 06       vmovupd (%r14,%rax,1),%xmm0
> >     1577:       c4 62 dd b8 db          vfmadd231pd %ymm3,%ymm4,%ymm11
> >     157c:       c4 c3 7d 18 44 06 10    vinsertf128
> > $0x1,0x10(%r14,%rax,1),%ymm0,%ymm0
> >     1583:       01
> >     1584:       c4 62 ed b8 ed          vfmadd231pd %ymm5,%ymm2,%ymm13
> >     1589:       c4 e2 ed b8 fc          vfmadd231pd %ymm4,%ymm2,%ymm7
> >     158e:       c4 e2 fd a8 ad 30 ff    vfmadd213pd
> > -0x800d0(%rbp),%ymm0,%ymm5
> 
> Great, looks good!
> 
> > ... and here from matmul_r8_avx512f:
> >
> >     1da8:       c4 a1 7b 10 14 d6       vmovsd (%rsi,%r10,8),%xmm2
> >     1dae:       c4 c2 b1 b9 f0          vfmadd231sd %xmm8,%xmm9,%xmm6
> >     1db3:       62 62 ed 08 b9 e5       vfmadd231sd %xmm5,%xmm2,%xmm28
> >     1db9:       62 62 ed 08 b9 ec       vfmadd231sd %xmm4,%xmm2,%xmm29
> >     1dbf:       62 62 ed 08 b9 f3       vfmadd231sd %xmm3,%xmm2,%xmm30
> >     1dc5:       c4 e2 91 99 e8          vfmadd132sd %xmm0,%xmm13,%xmm5
> >     1dca:       c4 e2 99 99 e0          vfmadd132sd %xmm0,%xmm12,%xmm4
> >     1dcf:       c4 e2 a1 99 d8          vfmadd132sd %xmm0,%xmm11,%xmm3
> >     1dd4:       c4 c2 a9 99 d1          vfmadd132sd %xmm9,%xmm10,%xmm2
> >     1dd9:       c4 c2 89 99 c1          vfmadd132sd %xmm9,%xmm14,%xmm0
> >     1dde:       0f 8e d3 fe ff ff       jle    1cb7
> > <matmul_r8_avx512f+0x1cb7>
> 
> Good, it's using fma, but why is this using xmm registers? That would
> mean it's operating only on 128 bit blocks at a time so no better than
> plain AVX. AFAIU avx512 should use zmm registers to operate on 512 bit
> chunks.

Well, it uses sd, i.e. the scalar fma, not pd, so those are always xmm regs
and only a single double in them, this must be some scalar epilogue loop or
whatever; but matmul_r8_avx512f also has:
    140c:       62 72 e5 40 98 c1       vfmadd132pd %zmm1,%zmm19,%zmm8
    1412:       62 72 e5 40 98 cd       vfmadd132pd %zmm5,%zmm19,%zmm9
    1418:       62 72 e5 40 98 d1       vfmadd132pd %zmm1,%zmm19,%zmm10
    141e:       62 72 e5 40 98 de       vfmadd132pd %zmm6,%zmm19,%zmm11
    1424:       62 72 e5 40 98 e1       vfmadd132pd %zmm1,%zmm19,%zmm12
    142a:       62 e2 e5 40 98 c6       vfmadd132pd %zmm6,%zmm19,%zmm16
    1430:       62 f2 e5 40 98 c8       vfmadd132pd %zmm0,%zmm19,%zmm1
    1436:       62 f2 e5 40 98 f0       vfmadd132pd %zmm0,%zmm19,%zmm6
    143c:       62 72 e5 40 98 fd       vfmadd132pd %zmm5,%zmm19,%zmm15
    1442:       62 72 e5 40 98 f4       vfmadd132pd %zmm4,%zmm19,%zmm14
    1448:       62 72 e5 40 98 eb       vfmadd132pd %zmm3,%zmm19,%zmm13
    144e:       62 f2 e5 40 98 d0       vfmadd132pd %zmm0,%zmm19,%zmm2
    1454:       62 b2 e5 40 98 ec       vfmadd132pd %zmm20,%zmm19,%zmm5
    145a:       62 b2 e5 40 98 e4       vfmadd132pd %zmm20,%zmm19,%zmm4
    1460:       62 b2 e5 40 98 dc       vfmadd132pd %zmm20,%zmm19,%zmm3
    1466:       62 b2 e5 40 98 c4       vfmadd132pd %zmm20,%zmm19,%zmm0
etc. where 8 doubles in zmm regs are processed together.

	Jakub
Jakub Jelinek March 2, 2017, 8:43 a.m. UTC | #7
On Wed, Mar 01, 2017 at 10:00:08PM +0100, Thomas Koenig wrote:
> @@ -101,7 +93,7 @@
>  `static void
>  'matmul_name` ('rtype` * const restrict retarray, 
>  	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
> -	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
> +	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
>  static' include(matmul_internal.m4)dnl
>  `#endif /* HAVE_AVX2 */
>  

I guess the question here is if there are any CPUs that have AVX2 but don't
have FMA3.  If there are none, then this is not controversial, if there are
some, it depends on how widely they are used compared to ones that have both
AVX2 and FMA3.  Going just from our -march= bitsets, it seems if there is
PTA_AVX2, then there is also PTA_FMA: haswell, broadwell, skylake, skylake-avx512, knl,
bdver4, znver1, there are CPUs that have just PTA_AVX and not PTA_AVX2 and
still have PTA_FMA: bdver2, bdver3 (but that is not relevant to this patch).

> @@ -110,7 +102,7 @@
>  `static void
>  'matmul_name` ('rtype` * const restrict retarray, 
>  	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
> -	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
> +	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f,fma")));
>  static' include(matmul_internal.m4)dnl
>  `#endif  /* HAVE_AVX512F */
>  

I think this change is not needed, because the EVEX encoded
VFMADD???[SP][DS] instructions etc. are in AVX512F ISA, not in FMA3 ISA
(which has just the VEX encoded ones).
Which is why I'm seeing the fmas in my libgfortran even without your patch.
Thus I think you should remove this from your patch.

> @@ -147,7 +141,8 @@
>  #endif  /* HAVE_AVX512F */
>  
>  #ifdef HAVE_AVX2
> -      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
> +      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
> +	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
>  	    {
>  	      matmul_p = matmul_'rtype_code`_avx2;
>  	      goto tailcall;

and this too.

	Jakub
Thomas Koenig March 2, 2017, 9:03 a.m. UTC | #8
Am 02.03.2017 um 09:43 schrieb Jakub Jelinek:
> On Wed, Mar 01, 2017 at 10:00:08PM +0100, Thomas Koenig wrote:
>> @@ -101,7 +93,7 @@
>>  `static void
>>  'matmul_name` ('rtype` * const restrict retarray,
>>  	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
>> -	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
>> +	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
>>  static' include(matmul_internal.m4)dnl
>>  `#endif /* HAVE_AVX2 */
>>
>
> I guess the question here is if there are any CPUs that have AVX2 but don't
> have FMA3.  If there are none, then this is not controversial, if there are
> some, it depends on how widely they are used compared to ones that have both
> AVX2 and FMA3.  Going just from our -march= bitsets, it seems if there is
> PTA_AVX2, then there is also PTA_FMA: haswell, broadwell, skylake, skylake-avx512, knl,
> bdver4, znver1, there are CPUs that have just PTA_AVX and not PTA_AVX2 and
> still have PTA_FMA: bdver2, bdver3 (but that is not relevant to this patch).

In a previous incantation of the patch, I saw that the compiler
generated the same floating point code for AVX and AVX2 (which is why
there currently is no AVX2 floating point version).  I could also
generate an AVX+FMA version for floating point and an AVX2 version
for integer (if anybody cares about integer matmul).

Or I could just leave it as it is.

>> @@ -110,7 +102,7 @@
>>  `static void
>>  'matmul_name` ('rtype` * const restrict retarray,
>>  	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
>> -	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
>> +	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f,fma")));
>>  static' include(matmul_internal.m4)dnl
>>  `#endif  /* HAVE_AVX512F */
>>
>
> I think this change is not needed, because the EVEX encoded
> VFMADD???[SP][DS] instructions etc. are in AVX512F ISA, not in FMA3 ISA
> (which has just the VEX encoded ones).
> Which is why I'm seeing the fmas in my libgfortran even without your patch.
> Thus I think you should remove this from your patch.

OK, I'll remove it.

>
>> @@ -147,7 +141,8 @@
>>  #endif  /* HAVE_AVX512F */
>>
>>  #ifdef HAVE_AVX2
>> -      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
>> +      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
>> +	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
>>  	    {
>>  	      matmul_p = matmul_'rtype_code`_avx2;
>>  	      goto tailcall;
>
> and this too.

Will do.
Jakub Jelinek March 2, 2017, 9:08 a.m. UTC | #9
On Thu, Mar 02, 2017 at 10:03:28AM +0100, Thomas Koenig wrote:
> Am 02.03.2017 um 09:43 schrieb Jakub Jelinek:
> > On Wed, Mar 01, 2017 at 10:00:08PM +0100, Thomas Koenig wrote:
> > > @@ -101,7 +93,7 @@
> > >  `static void
> > >  'matmul_name` ('rtype` * const restrict retarray,
> > >  	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
> > > -	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
> > > +	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
> > >  static' include(matmul_internal.m4)dnl
> > >  `#endif /* HAVE_AVX2 */
> > > 
> > 
> > I guess the question here is if there are any CPUs that have AVX2 but don't
> > have FMA3.  If there are none, then this is not controversial, if there are
> > some, it depends on how widely they are used compared to ones that have both
> > AVX2 and FMA3.  Going just from our -march= bitsets, it seems if there is
> > PTA_AVX2, then there is also PTA_FMA: haswell, broadwell, skylake, skylake-avx512, knl,
> > bdver4, znver1, there are CPUs that have just PTA_AVX and not PTA_AVX2 and
> > still have PTA_FMA: bdver2, bdver3 (but that is not relevant to this patch).
> 
> In a previous incantation of the patch, I saw that the compiler
> generated the same floating point code for AVX and AVX2 (which is why
> there currently is no AVX2 floating point version).  I could also
> generate an AVX+FMA version for floating point and an AVX2 version
> for integer (if anybody cares about integer matmul).

I think having another avx,fma version is not worth it, avx+fma is far less
common than avx without fma.

> > > @@ -147,7 +141,8 @@
> > >  #endif  /* HAVE_AVX512F */
> > > 
> > >  #ifdef HAVE_AVX2
> > > -      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
> > > +      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
> > > +	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
> > >  	    {
> > >  	      matmul_p = matmul_'rtype_code`_avx2;
> > >  	      goto tailcall;
> > 
> > and this too.
> 
> Will do.

Note I meant obviously the FEATURE_AVX512F related hunk, not this one,
sorry.

	Jakub
diff mbox

Patch

Index: m4/matmul.m4
===================================================================
--- m4/matmul.m4	(Revision 245760)
+++ m4/matmul.m4	(Arbeitskopie)
@@ -75,14 +75,6 @@ 
 	int blas_limit, blas_call gemm);
 export_proto(matmul_'rtype_code`);
 
-'ifelse(rtype_letter,`r',dnl
-`#if defined(HAVE_AVX) && defined(HAVE_AVX2)
-/* REAL types generate identical code for AVX and AVX2.  Only generate
-   an AVX2 function if we are dealing with integer.  */
-#undef HAVE_AVX2
-#endif')
-`
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -101,7 +93,7 @@ 
 `static void
 'matmul_name` ('rtype` * const restrict retarray, 
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static' include(matmul_internal.m4)dnl
 `#endif /* HAVE_AVX2 */
 
@@ -110,7 +102,7 @@ 
 `static void
 'matmul_name` ('rtype` * const restrict retarray, 
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f,fma")));
 static' include(matmul_internal.m4)dnl
 `#endif  /* HAVE_AVX512F */
 
@@ -138,7 +130,9 @@ 
 	{
           /* Run down the available processors in order of preference.  */
 #ifdef HAVE_AVX512F
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
+	      && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+
 	    {
 	      matmul_p = matmul_'rtype_code`_avx512f;
 	      goto tailcall;
@@ -147,7 +141,8 @@ 
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_'rtype_code`_avx2;
 	      goto tailcall;