diff mbox

[libfortran] Fix matmul result zeroing for empty arguments

Message ID 7854e3d2-87c0-c466-5f63-9a19349de15e@netcologne.de
State New
Headers show

Commit Message

Thomas Koenig June 5, 2017, 9:52 a.m. UTC
Hello world,

the attached patch moves the zeroing of the result variable before the
early return. This is done so that the result for zero-sized arguments
is still valid.

The bug was only in the library version, but I have also added a test
case for the inline version to make sure the bug does not suddenly
appear there.

OK for trunk?

Regards

	Thomas

2017-06-05  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/80975
	* m4/matmul_internal.m4: Move zeroing before early return.
	* generated/matmul_c10.c: Regenerated.
	* generated/matmul_c16.c: Regenerated.
	* generated/matmul_c4.c: Regenerated.
	* generated/matmul_c8.c: Regenerated.
	* generated/matmul_i1.c: Regenerated.
	* generated/matmul_i16.c: Regenerated.
	* generated/matmul_i2.c: Regenerated.
	* generated/matmul_i4.c: Regenerated.
	* generated/matmul_i8.c: Regenerated.
	* generated/matmul_r10.c: Regenerated.
	* generated/matmul_r16.c: Regenerated.
	* generated/matmul_r4.c: Regenerated.
	* generated/matmul_r8.c: Regenerated.
	* generated/matmulavx128_c10.c: Regenerated.
	* generated/matmulavx128_c16.c: Regenerated.
	* generated/matmulavx128_c4.c: Regenerated.
	* generated/matmulavx128_c8.c: Regenerated.
	* generated/matmulavx128_i1.c: Regenerated.
	* generated/matmulavx128_i16.c: Regenerated.
	* generated/matmulavx128_i2.c: Regenerated.
	* generated/matmulavx128_i4.c: Regenerated.
	* generated/matmulavx128_i8.c: Regenerated.
	* generated/matmulavx128_r10.c: Regenerated.
	* generated/matmulavx128_r16.c: Regenerated.
	* generated/matmulavx128_r4.c: Regenerated.

Comments

Thomas Koenig June 5, 2017, 9:53 a.m. UTC | #1
Am 05.06.2017 um 11:52 schrieb Thomas Koenig:
> 
> OK for trunk?

... and 7.1, where this bug was actually discovered?  This is
a regression.

Regards

	Thomas
Jerry DeLisle June 5, 2017, 9:24 p.m. UTC | #2
On 06/05/2017 02:52 AM, Thomas Koenig wrote:
> Hello world,
> 
> the attached patch moves the zeroing of the result variable before the
> early return. This is done so that the result for zero-sized arguments
> is still valid.
> 
> The bug was only in the library version, but I have also added a test
> case for the inline version to make sure the bug does not suddenly
> appear there.
> 
> OK for trunk?


Yes OK for Trunk and 7

Thanks,

Jerry
diff mbox

Patch

Index: generated/matmul_c10.c
===================================================================
--- generated/matmul_c10.c	(Revision 248472)
+++ generated/matmul_c10.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmul_c16.c
===================================================================
--- generated/matmul_c16.c	(Revision 248472)
+++ generated/matmul_c16.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmul_c4.c
===================================================================
--- generated/matmul_c4.c	(Revision 248472)
+++ generated/matmul_c4.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmul_c8.c
===================================================================
--- generated/matmul_c8.c	(Revision 248472)
+++ generated/matmul_c8.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmul_i1.c
===================================================================
--- generated/matmul_i1.c	(Revision 248472)
+++ generated/matmul_i1.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmul_i16.c
===================================================================
--- generated/matmul_i16.c	(Revision 248472)
+++ generated/matmul_i16.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmul_i2.c
===================================================================
--- generated/matmul_i2.c	(Revision 248472)
+++ generated/matmul_i2.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmul_i4.c
===================================================================
--- generated/matmul_i4.c	(Revision 248472)
+++ generated/matmul_i4.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmul_i8.c
===================================================================
--- generated/matmul_i8.c	(Revision 248472)
+++ generated/matmul_i8.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmul_r10.c
===================================================================
--- generated/matmul_r10.c	(Revision 248472)
+++ generated/matmul_r10.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmul_r16.c
===================================================================
--- generated/matmul_r16.c	(Revision 248472)
+++ generated/matmul_r16.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmul_r4.c
===================================================================
--- generated/matmul_r4.c	(Revision 248472)
+++ generated/matmul_r4.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmul_r8.c
===================================================================
--- generated/matmul_r8.c	(Revision 248472)
+++ generated/matmul_r8.c	(Arbeitskopie)
@@ -307,6 +307,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -319,11 +324,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -859,6 +859,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -871,11 +876,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1411,6 +1411,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1423,11 +1428,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -1977,6 +1977,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -1989,11 +1994,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -2603,6 +2603,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -2615,11 +2620,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_c10.c
===================================================================
--- generated/matmulavx128_c10.c	(Revision 248472)
+++ generated/matmulavx128_c10.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_c16.c
===================================================================
--- generated/matmulavx128_c16.c	(Revision 248472)
+++ generated/matmulavx128_c16.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_c4.c
===================================================================
--- generated/matmulavx128_c4.c	(Revision 248472)
+++ generated/matmulavx128_c4.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_c8.c
===================================================================
--- generated/matmulavx128_c8.c	(Revision 248472)
+++ generated/matmulavx128_c8.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_i1.c
===================================================================
--- generated/matmulavx128_i1.c	(Revision 248472)
+++ generated/matmulavx128_i1.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_i16.c
===================================================================
--- generated/matmulavx128_i16.c	(Revision 248472)
+++ generated/matmulavx128_i16.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_i2.c
===================================================================
--- generated/matmulavx128_i2.c	(Revision 248472)
+++ generated/matmulavx128_i2.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_2)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_i4.c
===================================================================
--- generated/matmulavx128_i4.c	(Revision 248472)
+++ generated/matmulavx128_i4.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_i8.c
===================================================================
--- generated/matmulavx128_i8.c	(Revision 248472)
+++ generated/matmulavx128_i8.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_INTEGER_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_r10.c
===================================================================
--- generated/matmulavx128_r10.c	(Revision 248472)
+++ generated/matmulavx128_r10.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_10)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_10));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_10)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_r16.c
===================================================================
--- generated/matmulavx128_r16.c	(Revision 248472)
+++ generated/matmulavx128_r16.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_16)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_16));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_16)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_r4.c
===================================================================
--- generated/matmulavx128_r4.c	(Revision 248472)
+++ generated/matmulavx128_r4.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_4)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_4)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: generated/matmulavx128_r8.c
===================================================================
--- generated/matmulavx128_r8.c	(Revision 248472)
+++ generated/matmulavx128_r8.c	(Arbeitskopie)
@@ -272,6 +272,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -284,11 +289,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
@@ -825,6 +825,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = (GFC_REAL_8)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -837,11 +842,6 @@ 
 
       t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = (GFC_REAL_8)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)
Index: m4/matmul_internal.m4
===================================================================
--- m4/matmul_internal.m4	(Revision 248467)
+++ m4/matmul_internal.m4	(Arbeitskopie)
@@ -223,6 +223,11 @@ 
       b_offset = 1 + b_dim1;
       b -= b_offset;
 
+      /* Empty c first.  */
+      for (j=1; j<=n; j++)
+	for (i=1; i<=m; i++)
+	  c[i + j * c_dim1] = ('rtype_name`)0;
+
       /* Early exit if possible */
       if (m == 0 || n == 0 || k == 0)
 	return;
@@ -235,11 +240,6 @@ 
 
       t1 = malloc (t1_dim * sizeof('rtype_name`));
 
-      /* Empty c first.  */
-      for (j=1; j<=n; j++)
-	for (i=1; i<=m; i++)
-	  c[i + j * c_dim1] = ('rtype_name`)0;
-
       /* Start turning the crank. */
       i1 = n;
       for (jj = 1; jj <= i1; jj += 512)