diff mbox

[fortran] Enable FMA for AVX2 and AVX512F for matmul

Message ID c93b99b7-08a4-2d43-8587-24bc820c5090@netcologne.de
State New
Headers show

Commit Message

Thomas Koenig March 2, 2017, 10:45 a.m. UTC
Here's the updated version, which just uses FMA for AVX2.

OK for trunk?

Regards

	Thomas

2017-03-01  Thomas Koenig  <tkoenig@gcc.gnu.org>

         PR fortran/78379
         * m4/matmul.m4 (matmul_'rtype_code`_avx2): Also generate for
         reals.  Add fma to target options.
         (matmul_'rtype_code`): Call AVX2 only if FMA is available.
         * generated/matmul_c10.c: Regenerated.
         * generated/matmul_c16.c: Regenerated.
         * generated/matmul_c4.c: Regenerated.
         * generated/matmul_c8.c: Regenerated.
         * generated/matmul_i1.c: Regenerated.
         * generated/matmul_i16.c: Regenerated.
         * generated/matmul_i2.c: Regenerated.
         * generated/matmul_i4.c: Regenerated.
         * generated/matmul_i8.c: Regenerated.
         * generated/matmul_r10.c: Regenerated.
         * generated/matmul_r16.c: Regenerated.
         * generated/matmul_r4.c: Regenerated.
         * generated/matmul_r8.c: Regenerated.

Comments

Jakub Jelinek March 2, 2017, 10:48 a.m. UTC | #1
On Thu, Mar 02, 2017 at 11:45:59AM +0100, Thomas Koenig wrote:
> Here's the updated version, which just uses FMA for AVX2.
> 
> OK for trunk?
> 
> Regards
> 
> 	Thomas
> 
> 2017-03-01  Thomas Koenig  <tkoenig@gcc.gnu.org>
> 
>         PR fortran/78379
>         * m4/matmul.m4: (matmul_'rtype_code`_avx2): Also generate for
>         reals.  Add fma to target options.
>         (matmul_'rtype_code`):  Call AVX2 only if FMA is available.
>         * generated/matmul_c10.c: Regenerated.
>         * generated/matmul_c16.c: Regenerated.
>         * generated/matmul_c4.c: Regenerated.
>         * generated/matmul_c8.c: Regenerated.
>         * generated/matmul_i1.c: Regenerated.
>         * generated/matmul_i16.c: Regenerated.
>         * generated/matmul_i2.c: Regenerated.
>         * generated/matmul_i4.c: Regenerated.
>         * generated/matmul_i8.c: Regenerated.
>         * generated/matmul_r10.c: Regenerated.
>         * generated/matmul_r16.c: Regenerated.
>         * generated/matmul_r4.c: Regenerated.
>         * generated/matmul_r8.c: Regenerated.

Ok, thanks.

	Jakub
Jakub Jelinek March 2, 2017, 11:01 a.m. UTC | #2
On Thu, Mar 02, 2017 at 11:45:59AM +0100, Thomas Koenig wrote:
> Here's the updated version, which just uses FMA for AVX2.
> 
> OK for trunk?
> 
> Regards
> 
> 	Thomas
> 
> 2017-03-01  Thomas Koenig  <tkoenig@gcc.gnu.org>
> 
>         PR fortran/78379
>         * m4/matmul.m4: (matmul_'rtype_code`_avx2): Also generate for
>         reals.  Add fma to target options.
>         (matmul_'rtype_code`):  Call AVX2 only if FMA is available.
>         * generated/matmul_c10.c: Regenerated.
>         * generated/matmul_c16.c: Regenerated.
>         * generated/matmul_c4.c: Regenerated.
>         * generated/matmul_c8.c: Regenerated.
>         * generated/matmul_i1.c: Regenerated.
>         * generated/matmul_i16.c: Regenerated.
>         * generated/matmul_i2.c: Regenerated.
>         * generated/matmul_i4.c: Regenerated.
>         * generated/matmul_i8.c: Regenerated.
>         * generated/matmul_r10.c: Regenerated.
>         * generated/matmul_r16.c: Regenerated.
>         * generated/matmul_r4.c: Regenerated.
>         * generated/matmul_r8.c: Regenerated.

Actually, I see a problem, but not related to this patch.
I bet e.g. tsan would complain heavily on the wrappers, because the code
is racy:
  static void (*matmul_p) ('rtype` * const restrict retarray,
        'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
        int blas_limit, blas_call gemm) = NULL;

  if (matmul_p == NULL)
    {
      matmul_p = matmul_'rtype_code`_vanilla;
      if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
        {
          /* Run down the available processors in order of preference.  */
#ifdef HAVE_AVX512F
          if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
            {
              matmul_p = matmul_'rtype_code`_avx512f;
              goto tailcall;
            }
            
#endif  /* HAVE_AVX512F */
...
    }

tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);

So, even when assuming all matmul_p = stores are atomic, e.g. if you call
matmul from 2 or more threads about the same time for the first time,
it could be that the first one sets matmul_p to vanilla and then another
thread runs it (uselessly slow), etc.

As you don't care about the if (matmul_p == NULL) part being done in
multiple threads concurrently, I guess you could e.g. do:
  static void (*matmul_p) ('rtype` * const restrict retarray,
        'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
        int blas_limit, blas_call gemm); //  <--- No need for NULL initializer for static var
  void (*matmul_fn) ('rtype` * const restrict retarray,
        'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
        int blas_limit, blas_call gemm);

  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
  if (matmul_fn == NULL)
    {
      matmul_fn = matmul_'rtype_code`_vanilla;
      if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
        {
          /* Run down the available processors in order of preference.  */
#ifdef HAVE_AVX512F
          if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
            {
              matmul_fn = matmul_'rtype_code`_avx512f;
              goto finish;
            }
            
#endif  /* HAVE_AVX512F */
...
  finish:
      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
    }
  (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);

(i.e. make sure you read matmul_p in each call exactly once and store at
most once per thread).

	Jakub
diff mbox

Patch

Index: generated/matmul_c10.c
===================================================================
--- generated/matmul_c10.c	(Revision 245760)
+++ generated/matmul_c10.c	(Arbeitskopie)
@@ -74,9 +74,6 @@  extern void matmul_c10 (gfc_array_c10 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c10);
 
-
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -628,7 +625,7 @@  matmul_c10_avx (gfc_array_c10 * const restrict ret
 static void
 matmul_c10_avx2 (gfc_array_c10 * const restrict retarray, 
 	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_c10_avx2 (gfc_array_c10 * const restrict retarray, 
 	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
@@ -2277,7 +2274,8 @@  void matmul_c10 (gfc_array_c10 * const restrict re
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_c10_avx2;
 	      goto tailcall;
Index: generated/matmul_c16.c
===================================================================
--- generated/matmul_c16.c	(Revision 245760)
+++ generated/matmul_c16.c	(Arbeitskopie)
@@ -74,9 +74,6 @@  extern void matmul_c16 (gfc_array_c16 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c16);
 
-
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -628,7 +625,7 @@  matmul_c16_avx (gfc_array_c16 * const restrict ret
 static void
 matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, 
 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, 
 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
@@ -2277,7 +2274,8 @@  void matmul_c16 (gfc_array_c16 * const restrict re
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_c16_avx2;
 	      goto tailcall;
Index: generated/matmul_c4.c
===================================================================
--- generated/matmul_c4.c	(Revision 245760)
+++ generated/matmul_c4.c	(Arbeitskopie)
@@ -74,9 +74,6 @@  extern void matmul_c4 (gfc_array_c4 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c4);
 
-
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -628,7 +625,7 @@  matmul_c4_avx (gfc_array_c4 * const restrict retar
 static void
 matmul_c4_avx2 (gfc_array_c4 * const restrict retarray, 
 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_c4_avx2 (gfc_array_c4 * const restrict retarray, 
 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
@@ -2277,7 +2274,8 @@  void matmul_c4 (gfc_array_c4 * const restrict reta
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_c4_avx2;
 	      goto tailcall;
Index: generated/matmul_c8.c
===================================================================
--- generated/matmul_c8.c	(Revision 245760)
+++ generated/matmul_c8.c	(Arbeitskopie)
@@ -74,9 +74,6 @@  extern void matmul_c8 (gfc_array_c8 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c8);
 
-
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -628,7 +625,7 @@  matmul_c8_avx (gfc_array_c8 * const restrict retar
 static void
 matmul_c8_avx2 (gfc_array_c8 * const restrict retarray, 
 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_c8_avx2 (gfc_array_c8 * const restrict retarray, 
 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
@@ -2277,7 +2274,8 @@  void matmul_c8 (gfc_array_c8 * const restrict reta
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_c8_avx2;
 	      goto tailcall;
Index: generated/matmul_i1.c
===================================================================
--- generated/matmul_i1.c	(Revision 245760)
+++ generated/matmul_i1.c	(Arbeitskopie)
@@ -74,9 +74,6 @@  extern void matmul_i1 (gfc_array_i1 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i1);
 
-
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -628,7 +625,7 @@  matmul_i1_avx (gfc_array_i1 * const restrict retar
 static void
 matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, 
 	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, 
 	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
@@ -2277,7 +2274,8 @@  void matmul_i1 (gfc_array_i1 * const restrict reta
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_i1_avx2;
 	      goto tailcall;
Index: generated/matmul_i16.c
===================================================================
--- generated/matmul_i16.c	(Revision 245760)
+++ generated/matmul_i16.c	(Arbeitskopie)
@@ -74,9 +74,6 @@  extern void matmul_i16 (gfc_array_i16 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i16);
 
-
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -628,7 +625,7 @@  matmul_i16_avx (gfc_array_i16 * const restrict ret
 static void
 matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, 
 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, 
 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
@@ -2277,7 +2274,8 @@  void matmul_i16 (gfc_array_i16 * const restrict re
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_i16_avx2;
 	      goto tailcall;
Index: generated/matmul_i2.c
===================================================================
--- generated/matmul_i2.c	(Revision 245760)
+++ generated/matmul_i2.c	(Arbeitskopie)
@@ -74,9 +74,6 @@  extern void matmul_i2 (gfc_array_i2 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i2);
 
-
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -628,7 +625,7 @@  matmul_i2_avx (gfc_array_i2 * const restrict retar
 static void
 matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, 
 	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, 
 	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
@@ -2277,7 +2274,8 @@  void matmul_i2 (gfc_array_i2 * const restrict reta
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_i2_avx2;
 	      goto tailcall;
Index: generated/matmul_i4.c
===================================================================
--- generated/matmul_i4.c	(Revision 245760)
+++ generated/matmul_i4.c	(Arbeitskopie)
@@ -74,9 +74,6 @@  extern void matmul_i4 (gfc_array_i4 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i4);
 
-
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -628,7 +625,7 @@  matmul_i4_avx (gfc_array_i4 * const restrict retar
 static void
 matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, 
 	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, 
 	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
@@ -2277,7 +2274,8 @@  void matmul_i4 (gfc_array_i4 * const restrict reta
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_i4_avx2;
 	      goto tailcall;
Index: generated/matmul_i8.c
===================================================================
--- generated/matmul_i8.c	(Revision 245760)
+++ generated/matmul_i8.c	(Arbeitskopie)
@@ -74,9 +74,6 @@  extern void matmul_i8 (gfc_array_i8 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i8);
 
-
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -628,7 +625,7 @@  matmul_i8_avx (gfc_array_i8 * const restrict retar
 static void
 matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, 
 	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, 
 	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
@@ -2277,7 +2274,8 @@  void matmul_i8 (gfc_array_i8 * const restrict reta
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_i8_avx2;
 	      goto tailcall;
Index: generated/matmul_r10.c
===================================================================
--- generated/matmul_r10.c	(Revision 245760)
+++ generated/matmul_r10.c	(Arbeitskopie)
@@ -74,13 +74,6 @@  extern void matmul_r10 (gfc_array_r10 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r10);
 
-#if defined(HAVE_AVX) && defined(HAVE_AVX2)
-/* REAL types generate identical code for AVX and AVX2.  Only generate
-   an AVX2 function if we are dealing with integer.  */
-#undef HAVE_AVX2
-#endif
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -632,7 +625,7 @@  matmul_r10_avx (gfc_array_r10 * const restrict ret
 static void
 matmul_r10_avx2 (gfc_array_r10 * const restrict retarray, 
 	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_r10_avx2 (gfc_array_r10 * const restrict retarray, 
 	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
@@ -2281,7 +2274,8 @@  void matmul_r10 (gfc_array_r10 * const restrict re
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_r10_avx2;
 	      goto tailcall;
Index: generated/matmul_r16.c
===================================================================
--- generated/matmul_r16.c	(Revision 245760)
+++ generated/matmul_r16.c	(Arbeitskopie)
@@ -74,13 +74,6 @@  extern void matmul_r16 (gfc_array_r16 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r16);
 
-#if defined(HAVE_AVX) && defined(HAVE_AVX2)
-/* REAL types generate identical code for AVX and AVX2.  Only generate
-   an AVX2 function if we are dealing with integer.  */
-#undef HAVE_AVX2
-#endif
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -632,7 +625,7 @@  matmul_r16_avx (gfc_array_r16 * const restrict ret
 static void
 matmul_r16_avx2 (gfc_array_r16 * const restrict retarray, 
 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_r16_avx2 (gfc_array_r16 * const restrict retarray, 
 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
@@ -2281,7 +2274,8 @@  void matmul_r16 (gfc_array_r16 * const restrict re
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_r16_avx2;
 	      goto tailcall;
Index: generated/matmul_r4.c
===================================================================
--- generated/matmul_r4.c	(Revision 245760)
+++ generated/matmul_r4.c	(Arbeitskopie)
@@ -74,13 +74,6 @@  extern void matmul_r4 (gfc_array_r4 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r4);
 
-#if defined(HAVE_AVX) && defined(HAVE_AVX2)
-/* REAL types generate identical code for AVX and AVX2.  Only generate
-   an AVX2 function if we are dealing with integer.  */
-#undef HAVE_AVX2
-#endif
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -632,7 +625,7 @@  matmul_r4_avx (gfc_array_r4 * const restrict retar
 static void
 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, 
 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, 
 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
@@ -2281,7 +2274,8 @@  void matmul_r4 (gfc_array_r4 * const restrict reta
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_r4_avx2;
 	      goto tailcall;
Index: generated/matmul_r8.c
===================================================================
--- generated/matmul_r8.c	(Revision 245760)
+++ generated/matmul_r8.c	(Arbeitskopie)
@@ -74,13 +74,6 @@  extern void matmul_r8 (gfc_array_r8 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r8);
 
-#if defined(HAVE_AVX) && defined(HAVE_AVX2)
-/* REAL types generate identical code for AVX and AVX2.  Only generate
-   an AVX2 function if we are dealing with integer.  */
-#undef HAVE_AVX2
-#endif
-
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -632,7 +625,7 @@  matmul_r8_avx (gfc_array_r8 * const restrict retar
 static void
 matmul_r8_avx2 (gfc_array_r8 * const restrict retarray, 
 	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static void
 matmul_r8_avx2 (gfc_array_r8 * const restrict retarray, 
 	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
@@ -2281,7 +2274,8 @@  void matmul_r8 (gfc_array_r8 * const restrict reta
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_r8_avx2;
 	      goto tailcall;
Index: m4/matmul.m4
===================================================================
--- m4/matmul.m4	(Revision 245760)
+++ m4/matmul.m4	(Arbeitskopie)
@@ -75,14 +75,6 @@  extern void matmul_'rtype_code` ('rtype` * const r
 	int blas_limit, blas_call gemm);
 export_proto(matmul_'rtype_code`);
 
-'ifelse(rtype_letter,`r',dnl
-`#if defined(HAVE_AVX) && defined(HAVE_AVX2)
-/* REAL types generate identical code for AVX and AVX2.  Only generate
-   an AVX2 function if we are dealing with integer.  */
-#undef HAVE_AVX2
-#endif')
-`
-
 /* Put exhaustive list of possible architectures here here, ORed together.  */
 
 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -101,7 +93,7 @@  static' include(matmul_internal.m4)dnl
 `static void
 'matmul_name` ('rtype` * const restrict retarray, 
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
-	int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static' include(matmul_internal.m4)dnl
 `#endif /* HAVE_AVX2 */
 
@@ -147,7 +139,8 @@  void matmul_'rtype_code` ('rtype` * const restrict
 #endif  /* HAVE_AVX512F */
 
 #ifdef HAVE_AVX2
-      	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+      	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
 	      matmul_p = matmul_'rtype_code`_avx2;
 	      goto tailcall;