[libfortran] Add AVX-specific matmul

Message ID 05fbb04a-f4c1-cb61-9baa-7a86ea673784@netcologne.de
State New

Commit Message

Thomas Koenig Nov. 16, 2016, 9:30 p.m. UTC
Hello world,

the attached patch adds an AVX-specific version of the matmul
intrinsic to the Fortran library.  This works by using the target_clones
attribute.
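
For illustration, here is a minimal stand-alone use of the attribute
(a sketch with hypothetical names, not part of the patch).  GCC emits
one clone per listed target plus a resolver, and the dynamic linker
picks the right clone at load time:

#include <stdio.h>

/* "default" must always be part of the clone list.  */
__attribute__ ((target_clones ("avx,default")))
static void
axpy (double *restrict y, const double *restrict x, double a, int n)
{
  for (int i = 0; i < n; i++)
    y[i] += a * x[i];
}

int
main (void)
{
  double x[4] = { 1.0, 2.0, 3.0, 4.0 };
  double y[4] = { 0.0 };

  axpy (y, x, 2.0, 4);
  printf ("%g\n", y[3]);  /* prints 8 */
  return 0;
}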

For testing, I compiled this on powerpc64-unknown-linux-gnu,
without any ill effects.

Also, a resulting binary reached around 15 GFlops for larger matrices
on a 3.4 GHz i7-2600 CPU.  I am currently building/regtesting on
that machine. This can give another 40% speed increase for large
matrices on AVX.

OK for trunk?

Regards

	Thomas

2016-11-16  Thomas Koenig  <tkoenig@gcc.gnu.org>

         PR fortran/78379
         * m4/matmul.m4:  For x86_64, make the work function for matmul
         static with target_clones for AVX and default, and create
         a wrapper function to call it.
         * generated/matmul_c10.c
         * generated/matmul_c16.c: Regenerated.
         * generated/matmul_c4.c: Regenerated.
         * generated/matmul_c8.c: Regenerated.
         * generated/matmul_i1.c: Regenerated.
         * generated/matmul_i16.c: Regenerated.
         * generated/matmul_i2.c: Regenerated.
         * generated/matmul_i4.c: Regenerated.
         * generated/matmul_i8.c: Regenerated.
         * generated/matmul_r10.c: Regenerated.
         * generated/matmul_r16.c: Regenerated.
         * generated/matmul_r4.c: Regenerated.
         * generated/matmul_r8.c: Regenerated.

Comments

Jakub Jelinek Nov. 16, 2016, 10:01 p.m. UTC | #1
On Wed, Nov 16, 2016 at 10:30:03PM +0100, Thomas Koenig wrote:
> the attached patch adds an AVX-specific version of the matmul
> intrinsic to the Fortran library.  This works by using the target_clones
> attribute.

Don't you need to test in configure if the assembler supports AVX?
Otherwise if somebody is bootstrapping gcc with older assembler, it will
just fail to bootstrap.
For matmul_i*, wouldn't it make more sense to use avx2 instead of avx,
or both avx and avx2 and maybe avx512f?

> 2016-11-16  Thomas Koenig  <tkoenig@gcc.gnu.org>
> 
>         PR fortran/78379
>         * m4/matmul.m4:  For x86_64, make the work function for matmul

Why the extra space before For?

>         static with target_clones for AVX and default, and create
>         a wrapper function to call it.
>         * generated/matmul_c10.c

Missing : Regenerated.

	Jakub
Thomas Koenig Nov. 16, 2016, 11:03 p.m. UTC | #2
Am 16.11.2016 um 23:01 schrieb Jakub Jelinek:
> On Wed, Nov 16, 2016 at 10:30:03PM +0100, Thomas Koenig wrote:
>> the attached patch adds an AVX-specific version of the matmul
>> intrinsic to the Fortran library.  This works by using the target_clones
>> attribute.
>
> Don't you need to test in configure if the assembler supports AVX?
> Otherwise if somebody is bootstrapping gcc with older assembler, it will
> just fail to bootstrap.

That's a good point.  The AVX instructions were added in binutils 2.19,
which was released in 2011. This could be put in the prerequisites.

What should the test do?  Fail with an error message "you need newer
binutils" or simply (and silently) not compile the AVX vesion?

> For matmul_i*, wouldn't it make more sense to use avx2 instead of avx,
> or both avx and avx2 and maybe avx512f?

I did a vdiff of the disassembled code generated for avx and avx2, and
(somewhat to my surprise) there was no difference.  Maybe, with more
unrolling, something more might have happened. I didn't check for
AVX512f, but I can do that.

>> 2016-11-16  Thomas Koenig  <tkoenig@gcc.gnu.org>
>>
>>         PR fortran/78379
>>         * m4/matmul.m4:  For x86_64, make the work function for matmul
>
> Why the extra space before For?

Will be removed.

>>         static with target_clones for AVX and default, and create
>>         a wrapper function to call it.
>>         * generated/matmul_c10.c
>
> Missing : Regenerated.

Will be added.

Regards

	Thomas
Jerry DeLisle Nov. 16, 2016, 11:06 p.m. UTC | #3
On 11/16/2016 01:30 PM, Thomas Koenig wrote:
> Hello world,
>
> the attached patch adds an AVX-specific version of the matmul
> intrinsic to the Fortran library.  This works by using the target_clones
> attribute.
>
> For testing, I compiled this on powerpc64-unknown-linux-gnu,
> without any ill effects.
>
> Also, a resulting binary reached around 15 GFlops for larger matrices
> on a 3.4 GHz i7-2600 CPU.  I am currently building/regtesting on
> that machine. This can give another 40% speed increase for large
> matrices on AVX.
>
> OK for trunk?
>

Did you intend to name it avx_matmul and not aux_matmul?

Are the compiler flags for avx handled automatically by the gcc attributes, so 
there is no need to edit the Makefile.am?

Fix the first, and if the answer to the second question is yes, OK.

Jerry
Jakub Jelinek Nov. 16, 2016, 11:20 p.m. UTC | #4
On Thu, Nov 17, 2016 at 12:03:18AM +0100, Thomas Koenig wrote:
> >Don't you need to test in configure if the assembler supports AVX?
> >Otherwise if somebody is bootstrapping gcc with older assembler, it will
> >just fail to bootstrap.
> 
> That's a good point.  The AVX instructions were added in binutils 2.19,
> which was released in 2011. This could be put in the prerequisites.
> 
> What should the test do?  Fail with an error message "you need newer
> binutils" or simply (and silently) not compile the AVX vesion?

From what I understood, you want those functions just to be implementation
details, not exported from libgfortran.so*.  Thus the test would do
something similar to what gcc/testsuite/lib/target-supports.exp (check_effective_target_avx)
does, but of course in autoconf way, not in tcl.
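
The tcl check just compiles a tiny AVX function; in autoconf, a probe
along these lines (a sketch modeled on check_effective_target_avx, not
the actual configure.ac change) wrapped in AC_COMPILE_IFELSE with -mavx
added to the flags would exercise both the compiler and the assembler:

/* If the toolchain handles AVX, this compiles and assembles.  */
void
probe_avx (void)
{
  __builtin_ia32_vzeroall ();
}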
Also, from what I see, target_clones just uses IFUNCs, so you probably also
need some configure test whether ifuncs are supported (the
gcc.target/i386/mvc* tests use dg-require-ifunc, so you'd need something
similar again in configure).  But if so, then I have no idea why you use
a wrapper around the function, instead of using it on the exported APIs.

> >For matmul_i*, wouldn't it make more sense to use avx2 instead of avx,
> >or both avx and avx2 and maybe avx512f?
> 
> I did a vdiff of the disassembled code generated for avx and avx2, and
> (somewhat to my surprise) there was no difference.  Maybe, with more
> unrolling, something more might have happened. I didn't check for
> AVX512f, but I can do that.

For the float/double code it wouldn't surprise me (assuming you don't need
gather insns and similar stuff).  But for integers generally most of the
avx instructions can only handle 128-bit vectors, while avx2 has 256-bit
ones.
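
A one-line kernel makes the difference visible (an illustration,
assuming GCC's autovectorizer kicks in at -O3; compare the assembly
from -O3 -mavx against -O3 -mavx2): the loop below stays on 128-bit
xmm registers with -mavx, but moves to 256-bit ymm vpaddd with -mavx2.

void
vadd (int *restrict c, const int *restrict a,
      const int *restrict b, int n)
{
  for (int i = 0; i < n; i++)
    c[i] = a[i] + b[i];
}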

	Jakub
Thomas Koenig Nov. 17, 2016, 7:41 a.m. UTC | #5
Am 17.11.2016 um 00:20 schrieb Jakub Jelinek:
> On Thu, Nov 17, 2016 at 12:03:18AM +0100, Thomas Koenig wrote:
>>> Don't you need to test in configure if the assembler supports AVX?
>>> Otherwise if somebody is bootstrapping gcc with older assembler, it will
>>> just fail to bootstrap.
>>
>> That's a good point.  The AVX instructions were added in binutils 2.19,
>> which was released in 2011. This could be put in the prerequisites.
>>
>> What should the test do?  Fail with an error message "you need newer
>> binutils" or simply (and silently) not compile the AVX vesion?
>
> From what I understood, you want those functions just to be implementation
> details, not exported from libgfortran.so*.  Thus the test would do
> something similar to what gcc/testsuite/lib/target-supports.exp (check_effective_target_avx)
> does, but of course in autoconf way, not in tcl.

OK, that looks straightforward enough. I'll give it a shot.

> Also, from what I see, target_clones just uses IFUNCs, so you probably also
> need some configure test whether ifuncs are supported (the
> gcc.target/i386/mvc* tests use dg-require-ifunc, so you'd need something
> similar again in configure).  But if so, then I have no idea why you use
> a wrapper around the function, instead of using it on the exported APIs.

As you wrote above, I wanted this as an implementation detail. I also
wanted to be able to add new instruction sets without
breaking the ABI.

Because the caller generates the ifunc, using a wrapper function seemed
like the best way to do it.  The overhead is negligible (the function
is one simple jump), especially considering that we only call the
library function for larger matrices.

>>> For matmul_i*, wouldn't it make more sense to use avx2 instead of avx,
>>> or both avx and avx2 and maybe avx512f?
>>
>> I did a vdiff of the disassembled code generated for avx and avx2, and
>> (somewhat to my surprise) there was no difference.  Maybe, with more
>> unrolling, something more might have happened. I didn't check for
>> AVX512f, but I can do that.
>
> For the float/double code it wouldn't surprise me (assuming you don't need
> gather insns and similar stuff).  But for integers generally most of the
> avx instructions can only handle 128-bit vectors, while avx2 has 256-bit
> ones.

You're right - integer multiplication looks different.

Nobody I know cares about integer matrix multiplication
speed, whereas real has gotten a _lot_ of attention over
the decades.  So, putting in AVX will make the code run
faster on more machines, while putting in AVX2 will
(IMHO) bloat the library for no good reason.  However,
I am willing to stand corrected on this. Putting in AVX512f
makes sense.

I have also been trying to get target_clones to work on POWER
to get Altivec instructions, but to no avail. I also cannot
find any examples in the testsuite.

Since a lot of supercomputers use POWER nodes, that might also
be attractive.

Regards

	Thomas
Janne Blomqvist Nov. 17, 2016, 7:57 a.m. UTC | #6
On Thu, Nov 17, 2016 at 9:41 AM, Thomas Koenig <tkoenig@netcologne.de> wrote:
> Am 17.11.2016 um 00:20 schrieb Jakub Jelinek:
>>
>> On Thu, Nov 17, 2016 at 12:03:18AM +0100, Thomas Koenig wrote:
>>>>
>>>> Don't you need to test in configure if the assembler supports AVX?
>>>> Otherwise if somebody is bootstrapping gcc with older assembler, it will
>>>> just fail to bootstrap.
>>>
>>>
>>> That's a good point.  The AVX instructions were added in binutils 2.19,
>>> which was released in 2011. This could be put in the prerequisites.
>>>
>>> What should the test do?  Fail with an error message "you need newer
>>> binutils" or simply (and silently) not compile the AVX vesion?
>>
>> From what I understood, you want those functions just to be implementation
>> details, not exported from libgfortran.so*.  Thus the test would do
>> something similar to what gcc/testsuite/lib/target-supports.exp
>> (check_effective_target_avx)
>> does, but of course in autoconf way, not in tcl.
>
>
> OK, that looks straightforward enough. I'll give it a shot.
>
>> Also, from what I see, target_clones just uses IFUNCs, so you probably also
>> need some configure test whether ifuncs are supported (the
>> gcc.target/i386/mvc* tests use dg-require-ifunc, so you'd need something
>> similar again in configure).  But if so, then I have no idea why you use
>> a wrapper around the function, instead of using it on the exported APIs.
>
>
> As you wrote above, I wanted this as an implementation detail. I also
> wanted to be able to add new instruction sets without
> breaking the ABI.
>
> Because the caller generates the ifunc, using a wrapper function seemed
> like the best way to do it.  The overhead is negligible (the function
> is one simple jump), especially considering that we only call the
> library function for larger matrices.
>
>>>> For matmul_i*, wouldn't it make more sense to use avx2 instead of avx,
>>>> or both avx and avx2 and maybe avx512f?
>>>
>>>
>>> I did a vdiff of the disassembled code generated for avx and avx2, and
>>> (somewhat to my surprise) there was no difference.  Maybe, with more
>>> unrolling, something more might have happened. I didn't check for
>>> AVX512f, but I can do that.
>>
>>
>> For the float/double code it wouldn't surprise me (assuming you don't need
>> gather insns and similar stuff).  But for integers generally most of the
>> avx instructions can only handle 128-bit vectors, while avx2 has 256-bit
>> ones.
>
>
> You're right - integer multiplication looks different.
>
> Nobody I know cares about integer matrix multiplication
> speed, whereas real has gotten a _lot_ of attention over
> the decades.  So, putting in AVX will make the code run
> faster on more machines, while putting in AVX2 will
> (IMHO) bloat the library for no good reason.  However,
> I am willing to stand corrected on this. Putting in AVX512f
> makes sense.
>
> I have also been trying to get target_clones to work on POWER
> to get Altivec instructions, but to no avail. I also cannot
> find any examples in the testsuite.
>
> Since a lot of supercomputers use POWER nodes, that might also
> be attractive.
>
> Regards
>
>         Thomas

Hi,

In order to reduce bloat, might it make sense to make the core blocked
gemm algorithm that Jerry committed a few days ago into a separate
static function, and then only do the target_clone stuff for that one?
The rest of the matmul function deals with all kinds of stuff like
setup, handling non-stride-1 cases, calling the external gemm function
for -fexternal-blas etc., none of which vectorizes anyway, so
generating different versions of that code with different vector
instructions looks like a waste.
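
In outline, the split would look like this (a sketch with hypothetical
names; a naive triple loop stands in for the blocked kernel):

/* Only the hot kernel is cloned; the wrapper with the setup and
   dispatch logic is compiled exactly once.  */
__attribute__ ((target_clones ("avx,avx2,default")))
static void
matmul_kernel (double *restrict c, const double *restrict a,
	       const double *restrict b, int n)
{
  for (int j = 0; j < n; j++)
    for (int k = 0; k < n; k++)
      for (int i = 0; i < n; i++)
	c[i + j * n] += a[i + k * n] * b[k + j * n];
}

void
matmul_wrapper (double *restrict c, const double *restrict a,
		const double *restrict b, int n)
{
  /* Argument checking, non-stride-1 handling, -fexternal-blas
     dispatch etc. would go here, unduplicated.  */
  matmul_kernel (c, a, b, n);
}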

In that case I guess one could add the avx2 variant as well on the odd
chance that somebody for some reason cares about integer matmul.
Jakub Jelinek Nov. 17, 2016, 4:22 p.m. UTC | #7
On Thu, Nov 17, 2016 at 08:41:48AM +0100, Thomas Koenig wrote:
> Am 17.11.2016 um 00:20 schrieb Jakub Jelinek:
> >On Thu, Nov 17, 2016 at 12:03:18AM +0100, Thomas Koenig wrote:
> >>>Don't you need to test in configure if the assembler supports AVX?
> >>>Otherwise if somebody is bootstrapping gcc with older assembler, it will
> >>>just fail to bootstrap.
> >>
> >>That's a good point.  The AVX instructions were added in binutils 2.19,
> >>which was released in 2011. This could be put in the prerequisites.
> >>
> >>What should the test do?  Fail with an error message "you need newer
> >>binutils" or simply (and silently) not compile the AVX vesion?
> >
> >From what I understood, you want those functions just to be implementation
> >details, not exported from libgfortran.so*.  Thus the test would do
> >something similar to what gcc/testsuite/lib/target-supports.exp (check_effective_target_avx)
> >does, but of course in autoconf way, not in tcl.
> 
> OK, that looks straightforward enough. I'll give it a shot.
> 
> >Also, from what I see, target_clones just uses IFUNCs, so you probably also
> >need some configure test whether ifuncs are supported (the
> >gcc.target/i386/mvc* tests use dg-require-ifunc, so you'd need something
> >similar again in configure).  But if so, then I have no idea why you use
> >a wrapper around the function, instead of using it on the exported APIs.
> 
> As you wrote above, I wanted this as an implementation detail. I also
> wanted to be able to add new instruction sets without
> breaking the ABI.

But even an exported IFUNC is an implementation detail.  To other
libraries/binaries an IFUNC symbol is like any other symbol: they will have
a SHN_UNDEF symbol pointing to it, and it matters only to the dynamic
linker during relocation processing.  Whether some function is IFUNC or not
is not an ABI change; you can change a normal function into an IFUNC, or
vice versa, at any time without breaking the ABI.

> You're right - integer multiplication looks different.
> 
> Nobody I know cares about integer matrix multiplication
> speed, whereas real has gotten a _lot_ of attention over
> the decades.  So, putting in AVX will make the code run
> faster on more machines, while putting in AVX2 will
> (IMHO) bloat the library for no good reason.  However,
> I am willing to stand corrected on this. Putting in AVX512f
> makes sense.

Which is why I've been proposing to use avx2,default for the
matmul_i* files and avx,default for the others.
avx will not buy much for matmul_i*, while avx2 will.

> I have also been trying to get target_clones to work on POWER
> to get Altivec instructions, but to no avail. I also cannot
> find any examples in the testsuite.

Haven't checked, but maybe the target_clones attribute has only been
implemented for x86_64/i686 and not for other targets.
But power supports the target attribute, so you e.g. have the option of
#including the routine multiple times in one TU, each time with a different
name and target attribute, and then writing the IFUNC routine for it by hand.
Or attempt to support target_clones on power, or ask the power maintainers
to do that.
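
Spelled out for x86_64 (a sketch with hypothetical names; on power the
resolver would need a different way to query the CPU):

/* Two builds of one kernel plus a hand-written ifunc resolver.  */
__attribute__ ((target ("avx")))
static void
kernel_avx (double *restrict y, int n)
{
  for (int i = 0; i < n; i++)
    y[i] *= 2.0;
}

static void
kernel_default (double *restrict y, int n)
{
  for (int i = 0; i < n; i++)
    y[i] *= 2.0;
}

/* The resolver runs during relocation processing, before main.  */
static void (*resolve_kernel (void)) (double *restrict, int)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("avx") ? kernel_avx : kernel_default;
}

void kernel (double *restrict, int)
     __attribute__ ((ifunc ("resolve_kernel")));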

	Jakub
Thomas Koenig Nov. 27, 2016, 5:07 p.m. UTC | #8
I wrote:

> As an added bonus, I added some m4 hacks to disable both
> AVX and AVX2 code generation for REAL.

This should have read "I added some m4 hacks to disable
the AVX2 code generation for REAL."

Regards

	Thomas
Jerry DeLisle Nov. 27, 2016, 8:53 p.m. UTC | #9
On 11/27/2016 08:50 AM, Thomas Koenig wrote:
> Hello world,
>
> here is another, much revised, update of the AVX-specific matmul patch.
>
> The processor-specific switching is now done directly, using the

--- snip ---

This comment is not right:

+/* Put exhaustive list of possible architectures here here, ORed together.  */

Performs as expected on my AMD machines. We can still improve peak performance 
on these by about 7%. To clarify, these chips require -mavx -mprefer-avx128. So 
what we need to do is sort out which AMD CPUs need this adjustment with AVX 
registers. (A later patch)

I would like to suggest naming this file matmul_base.m4 rather than
matmul_internal.m4, but that's not critical.

Need a libgcc person for the changes to the cpuinfo items.

The libgfortran portions look OK.

Jerry

Patch

Index: generated/matmul_c10.c
===================================================================
--- generated/matmul_c10.c	(Revision 242477)
+++ generated/matmul_c10.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_c10 (gfc_array_c10 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c10);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c10 (gfc_array_c10 * const restrict retarray, 
+	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_c10 (gfc_array_c10 * const restrict retarray, 
 	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_c10 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c10 (gfc_array_c10 * const restrict retarray, 
+	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_c10 (gfc_array_c10 * const restrict retarray, 
+	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_COMPLEX_10 * restrict abase;
   const GFC_COMPLEX_10 * restrict bbase;
   GFC_COMPLEX_10 * restrict dest;
Index: generated/matmul_c16.c
===================================================================
--- generated/matmul_c16.c	(Revision 242477)
+++ generated/matmul_c16.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_c16 (gfc_array_c16 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c16);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c16 (gfc_array_c16 * const restrict retarray, 
+	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_c16 (gfc_array_c16 * const restrict retarray, 
 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_c16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c16 (gfc_array_c16 * const restrict retarray, 
+	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_c16 (gfc_array_c16 * const restrict retarray, 
+	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_COMPLEX_16 * restrict abase;
   const GFC_COMPLEX_16 * restrict bbase;
   GFC_COMPLEX_16 * restrict dest;
Index: generated/matmul_c4.c
===================================================================
--- generated/matmul_c4.c	(Revision 242477)
+++ generated/matmul_c4.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_c4 (gfc_array_c4 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c4);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c4 (gfc_array_c4 * const restrict retarray, 
+	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_c4 (gfc_array_c4 * const restrict retarray, 
 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_c4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c4 (gfc_array_c4 * const restrict retarray, 
+	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_c4 (gfc_array_c4 * const restrict retarray, 
+	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_COMPLEX_4 * restrict abase;
   const GFC_COMPLEX_4 * restrict bbase;
   GFC_COMPLEX_4 * restrict dest;
Index: generated/matmul_c8.c
===================================================================
--- generated/matmul_c8.c	(Revision 242477)
+++ generated/matmul_c8.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_c8 (gfc_array_c8 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c8);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c8 (gfc_array_c8 * const restrict retarray, 
+	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_c8 (gfc_array_c8 * const restrict retarray, 
 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_c8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c8 (gfc_array_c8 * const restrict retarray, 
+	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_c8 (gfc_array_c8 * const restrict retarray, 
+	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_COMPLEX_8 * restrict abase;
   const GFC_COMPLEX_8 * restrict bbase;
   GFC_COMPLEX_8 * restrict dest;
Index: generated/matmul_i1.c
===================================================================
--- generated/matmul_i1.c	(Revision 242477)
+++ generated/matmul_i1.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_i1 (gfc_array_i1 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i1);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i1 (gfc_array_i1 * const restrict retarray, 
+	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_i1 (gfc_array_i1 * const restrict retarray, 
 	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_i1 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i1 (gfc_array_i1 * const restrict retarray, 
+	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_i1 (gfc_array_i1 * const restrict retarray, 
+	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_INTEGER_1 * restrict abase;
   const GFC_INTEGER_1 * restrict bbase;
   GFC_INTEGER_1 * restrict dest;
Index: generated/matmul_i16.c
===================================================================
--- generated/matmul_i16.c	(Revision 242477)
+++ generated/matmul_i16.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_i16 (gfc_array_i16 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i16);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i16 (gfc_array_i16 * const restrict retarray, 
+	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_i16 (gfc_array_i16 * const restrict retarray, 
 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_i16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i16 (gfc_array_i16 * const restrict retarray, 
+	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_i16 (gfc_array_i16 * const restrict retarray, 
+	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_INTEGER_16 * restrict abase;
   const GFC_INTEGER_16 * restrict bbase;
   GFC_INTEGER_16 * restrict dest;
Index: generated/matmul_i2.c
===================================================================
--- generated/matmul_i2.c	(Revision 242477)
+++ generated/matmul_i2.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_i2 (gfc_array_i2 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i2);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i2 (gfc_array_i2 * const restrict retarray, 
+	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_i2 (gfc_array_i2 * const restrict retarray, 
 	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_i2 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i2 (gfc_array_i2 * const restrict retarray, 
+	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_i2 (gfc_array_i2 * const restrict retarray, 
+	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_INTEGER_2 * restrict abase;
   const GFC_INTEGER_2 * restrict bbase;
   GFC_INTEGER_2 * restrict dest;
Index: generated/matmul_i4.c
===================================================================
--- generated/matmul_i4.c	(Revision 242477)
+++ generated/matmul_i4.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_i4 (gfc_array_i4 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i4);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i4 (gfc_array_i4 * const restrict retarray, 
+	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_i4 (gfc_array_i4 * const restrict retarray, 
 	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_i4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i4 (gfc_array_i4 * const restrict retarray, 
+	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_i4 (gfc_array_i4 * const restrict retarray, 
+	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_INTEGER_4 * restrict abase;
   const GFC_INTEGER_4 * restrict bbase;
   GFC_INTEGER_4 * restrict dest;
Index: generated/matmul_i8.c
===================================================================
--- generated/matmul_i8.c	(Revision 242477)
+++ generated/matmul_i8.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_i8 (gfc_array_i8 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i8);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i8 (gfc_array_i8 * const restrict retarray, 
+	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_i8 (gfc_array_i8 * const restrict retarray, 
 	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_i8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i8 (gfc_array_i8 * const restrict retarray, 
+	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_i8 (gfc_array_i8 * const restrict retarray, 
+	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_INTEGER_8 * restrict abase;
   const GFC_INTEGER_8 * restrict bbase;
   GFC_INTEGER_8 * restrict dest;
Index: generated/matmul_r10.c
===================================================================
--- generated/matmul_r10.c	(Revision 242477)
+++ generated/matmul_r10.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_r10 (gfc_array_r10 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r10);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r10 (gfc_array_r10 * const restrict retarray, 
+	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_r10 (gfc_array_r10 * const restrict retarray, 
 	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_r10 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r10 (gfc_array_r10 * const restrict retarray, 
+	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_r10 (gfc_array_r10 * const restrict retarray, 
+	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_REAL_10 * restrict abase;
   const GFC_REAL_10 * restrict bbase;
   GFC_REAL_10 * restrict dest;
Index: generated/matmul_r16.c
===================================================================
--- generated/matmul_r16.c	(Revision 242477)
+++ generated/matmul_r16.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_r16 (gfc_array_r16 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r16);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r16 (gfc_array_r16 * const restrict retarray, 
+	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_r16 (gfc_array_r16 * const restrict retarray, 
 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_r16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r16 (gfc_array_r16 * const restrict retarray, 
+	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_r16 (gfc_array_r16 * const restrict retarray, 
+	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_REAL_16 * restrict abase;
   const GFC_REAL_16 * restrict bbase;
   GFC_REAL_16 * restrict dest;
Index: generated/matmul_r4.c
===================================================================
--- generated/matmul_r4.c	(Revision 242477)
+++ generated/matmul_r4.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_r4 (gfc_array_r4 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r4);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r4 (gfc_array_r4 * const restrict retarray, 
+	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_r4 (gfc_array_r4 * const restrict retarray, 
 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_r4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r4 (gfc_array_r4 * const restrict retarray, 
+	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_r4 (gfc_array_r4 * const restrict retarray, 
+	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_REAL_4 * restrict abase;
   const GFC_REAL_4 * restrict bbase;
   GFC_REAL_4 * restrict dest;
Index: generated/matmul_r8.c
===================================================================
--- generated/matmul_r8.c	(Revision 242477)
+++ generated/matmul_r8.c	(Arbeitskopie)
@@ -75,11 +75,37 @@  extern void matmul_r8 (gfc_array_r8 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r8);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r8 (gfc_array_r8 * const restrict retarray, 
+	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_r8 (gfc_array_r8 * const restrict retarray, 
 	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_r8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r8 (gfc_array_r8 * const restrict retarray, 
+	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_r8 (gfc_array_r8 * const restrict retarray, 
+	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_REAL_8 * restrict abase;
   const GFC_REAL_8 * restrict bbase;
   GFC_REAL_8 * restrict dest;
Index: m4/matmul.m4
===================================================================
--- m4/matmul.m4	(Revision 242477)
+++ m4/matmul.m4	(Arbeitskopie)
@@ -76,11 +76,37 @@  extern void matmul_'rtype_code` ('rtype` * const r
 	int blas_limit, blas_call gemm);
 export_proto(matmul_'rtype_code`);
 
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul - function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_'rtype_code` ('rtype` * const restrict retarray, 
+	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones("avx,default")));
+
 void
 matmul_'rtype_code` ('rtype` * const restrict retarray, 
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_'rtype_code` (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_'rtype_code` ('rtype` * const restrict retarray, 
+	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+matmul_'rtype_code` ('rtype` * const restrict retarray, 
+	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const 'rtype_name` * restrict abase;
   const 'rtype_name` * restrict bbase;
   'rtype_name` * restrict dest;