From patchwork Thu Nov 19 19:02:36 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Nathan Sidwell X-Patchwork-Id: 546625 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 2AB5814141C for ; Fri, 20 Nov 2015 06:02:52 +1100 (AEDT) Authentication-Results: ozlabs.org; dkim=pass (1024-bit key; unprotected) header.d=gcc.gnu.org header.i=@gcc.gnu.org header.b=b0kLJlka; dkim-atps=neutral DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:to :from:subject:message-id:date:mime-version:content-type; q=dns; s=default; b=E1Hcw9s3m8ciuAp37Q74lkHc7k31k7K/uDlRR0C9vGt4UoYq/Z vokBroB1i2mRJLorMcRBm+IWViDRcEccoROO/W7Qa/WQhCUS1vk017BgXwPOMim3 KtaB1T6Sj4GpHEW/7xTyGOy4ts/ZTmQ7Y61aXSBwXoRikaNneDzDvKYMs= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:to :from:subject:message-id:date:mime-version:content-type; s= default; bh=2oelbbHmGuB8Vep4VeXUePbu98U=; b=b0kLJlkapjDyZbFlgoKD stKDd7xIELWtf8h2yu/yNMY7qVaIbRWnq5tx4V0nMrp/7EKfC1On/I2cOn7cx6K2 WqC++htIFU865NAZV3BDDKVcuE/KmgpjN9zp1PDQD6m7E8mTXd+K3OBoh9Xwhlct qI8eK+/4cysSjzLVgg3iowg= Received: (qmail 54727 invoked by alias); 19 Nov 2015 19:02:43 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 54711 invoked by uid 89); 19 Nov 2015 19:02:42 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.8 
required=5.0 tests=BAYES_00, FREEMAIL_FROM, KAM_ASCII_DIVIDERS, RCVD_IN_DNSWL_LOW, SPF_PASS autolearn=no version=3.3.2 X-HELO: mail-yk0-f170.google.com Received: from mail-yk0-f170.google.com (HELO mail-yk0-f170.google.com) (209.85.160.170) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with (AES128-GCM-SHA256 encrypted) ESMTPS; Thu, 19 Nov 2015 19:02:40 +0000 Received: by ykdv3 with SMTP id v3so121737178ykd.0 for ; Thu, 19 Nov 2015 11:02:38 -0800 (PST) X-Received: by 10.129.32.213 with SMTP id g204mr8295879ywg.234.1447959758010; Thu, 19 Nov 2015 11:02:38 -0800 (PST) Received: from ?IPv6:2601:181:c000:c497:a2a8:cdff:fe3e:b48? ([2601:181:c000:c497:a2a8:cdff:fe3e:b48]) by smtp.googlemail.com with ESMTPSA id f129sm1476045ywd.10.2015.11.19.11.02.37 (version=TLSv1/SSLv3 cipher=OTHER); Thu, 19 Nov 2015 11:02:37 -0800 (PST) To: GCC Patches From: Nathan Sidwell Subject: Openacc reduction tests Message-ID: <564E1CCC.9040609@acm.org> Date: Thu, 19 Nov 2015 14:02:36 -0500 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.3.0 MIME-Version: 1.0 In adding the complex double support, I noticed that some existing tests had commented-out sequences. These were broken tests: a) The multiplication reduction assumed that 0 * 99! was numerically stable in the face of arbitrary re-association. While it did guess that results might vary, it used an absolute, rather than relative, epsilon value. b) A min/max test assumed that the min/max operator is well defined on complex numbers -- it's not. c) Another min/max test failed because the reduction was specified as '+'. Fixed thusly, nathan 2015-11-19 Nathan Sidwell * libgomp.oacc-c-c++-common/reduction-dbl.c: New. * libgomp.oacc-c-c++-common/reduction-flt.c: New. * libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Use typedef. * libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Use typedef. * libgomp.oacc-c-c++-common/reduction-2.c: Uncomment broken tests and fix. * libgomp.oacc-c-c++-common/reduction-3.c: Likewise.
* libgomp.oacc-c-c++-common/reduction-4.c: Likewise. Index: testsuite/libgomp.oacc-c-c++-common/reduction-2.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-2.c (revision 230605) +++ testsuite/libgomp.oacc-c-c++-common/reduction-2.c (working copy) @@ -50,39 +50,37 @@ main(void) if (fabs(result - vresult) > .0001) abort (); -// result = 0; -// vresult = 0; -// -// /* 'max' reductions. */ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (+:result) -// for (i = 0; i < n; i++) -// result = result > array[i] ? result : array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult = vresult > array[i] ? vresult : array[i]; -// -// printf("%d != %d\n", result, vresult); -// if (result != vresult) -// abort (); -// -// result = 0; -// vresult = 0; -// -// /* 'min' reductions. */ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (+:result) -// for (i = 0; i < n; i++) -// result = result < array[i] ? result : array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult = vresult < array[i] ? vresult : array[i]; -// -// printf("%d != %d\n", result, vresult); -// if (result != vresult) -// abort (); + result = 0; + vresult = 0; + + /* 'max' reductions. */ +#pragma acc parallel vector_length (vl) copy(result) +#pragma acc loop reduction (max:result) + for (i = 0; i < n; i++) + result = result > array[i] ? result : array[i]; + + /* Verify the reduction. */ + for (i = 0; i < n; i++) + vresult = vresult > array[i] ? vresult : array[i]; + + if (result != vresult) + abort (); + + result = 0; + vresult = 0; + + /* 'min' reductions. */ +#pragma acc parallel vector_length (vl) copy(result) +#pragma acc loop reduction (min:result) + for (i = 0; i < n; i++) + result = result < array[i] ? result : array[i]; + + /* Verify the reduction. */ + for (i = 0; i < n; i++) + vresult = vresult < array[i] ? 
vresult : array[i]; + + if (result != vresult) + abort (); result = 5; vresult = 5; Index: testsuite/libgomp.oacc-c-c++-common/reduction-3.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-3.c (revision 230605) +++ testsuite/libgomp.oacc-c-c++-common/reduction-3.c (working copy) @@ -22,15 +22,15 @@ main(void) result = 0; vresult = 0; - /* '+' reductions. */ + /* 'max' reductions. */ #pragma acc parallel vector_length (vl) copy(result) -#pragma acc loop reduction (+:result) +#pragma acc loop reduction (max:result) for (i = 0; i < n; i++) - result += array[i]; + result = result > array[i] ? result : array[i]; /* Verify the reduction. */ for (i = 0; i < n; i++) - vresult += array[i]; + vresult = vresult > array[i] ? vresult : array[i]; if (result != vresult) abort (); @@ -38,51 +38,18 @@ main(void) result = 0; vresult = 0; - /* '*' reductions. */ + /* 'min' reductions. */ #pragma acc parallel vector_length (vl) copy(result) -#pragma acc loop reduction (*:result) +#pragma acc loop reduction (min:result) for (i = 0; i < n; i++) - result *= array[i]; + result = result < array[i] ? result : array[i]; /* Verify the reduction. */ for (i = 0; i < n; i++) - vresult *= array[i]; + vresult = vresult < array[i] ? vresult : array[i]; - if (fabs(result - vresult) > .0001) + if (result != vresult) abort (); -// result = 0; -// vresult = 0; -// -// /* 'max' reductions. */ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (+:result) -// for (i = 0; i < n; i++) -// result = result > array[i] ? result : array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult = vresult > array[i] ? vresult : array[i]; -// -// printf("%d != %d\n", result, vresult); -// if (result != vresult) -// abort (); -// -// result = 0; -// vresult = 0; -// -// /* 'min' reductions. 
*/ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (+:result) -// for (i = 0; i < n; i++) -// result = result < array[i] ? result : array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult = vresult < array[i] ? vresult : array[i]; -// -// printf("%d != %d\n", result, vresult); -// if (result != vresult) -// abort (); result = 5; vresult = 5; Index: testsuite/libgomp.oacc-c-c++-common/reduction-4.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-4.c (revision 230605) +++ testsuite/libgomp.oacc-c-c++-common/reduction-4.c (working copy) @@ -23,76 +23,6 @@ main(void) result = 0; vresult = 0; - /* '+' reductions. */ -#pragma acc parallel vector_length (vl) copy(result) -#pragma acc loop reduction (+:result) - for (i = 0; i < n; i++) - result += array[i]; - - /* Verify the reduction. */ - for (i = 0; i < n; i++) - vresult += array[i]; - - if (result != vresult) - abort (); - - result = 0; - vresult = 0; - - /* Needs support for complex multiplication. */ - -// /* '*' reductions. */ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (*:result) -// for (i = 0; i < n; i++) -// result *= array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult *= array[i]; -// -// if (fabs(result - vresult) > .0001) -// abort (); -// result = 0; -// vresult = 0; - -// /* 'max' reductions. */ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (+:result) -// for (i = 0; i < n; i++) -// result = result > array[i] ? result : array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult = vresult > array[i] ? vresult : array[i]; -// -// printf("%d != %d\n", result, vresult); -// if (result != vresult) -// abort (); -// -// result = 0; -// vresult = 0; -// -// /* 'min' reductions. 
*/ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (+:result) -// for (i = 0; i < n; i++) -// result = result < array[i] ? result : array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult = vresult < array[i] ? vresult : array[i]; -// -// printf("%d != %d\n", result, vresult); -// if (result != vresult) -// abort (); - - result = 5; - vresult = 5; - - lresult = false; - lvresult = false; - /* '&&' reductions. */ #pragma acc parallel vector_length (vl) copy(lresult) #pragma acc loop reduction (&&:lresult) Index: testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c (revision 230605) +++ testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c (working copy) @@ -3,10 +3,11 @@ /* Double float has 53 bits of fraction. */ #define FRAC (1.0 / (1LL << 48)) +typedef double _Complex Type; -int close_enough (double _Complex a, double _Complex b) +int close_enough (Type a, Type b) { - double _Complex diff = a - b; + Type diff = a - b; double mag2_a = __real__(a) * __real__ (a) + __imag__ (a) * __imag__ (a); double mag2_diff = (__real__(diff) * __real__ (diff) + __imag__ (diff) * __imag__ (diff)); @@ -17,9 +18,9 @@ int close_enough (double _Complex a, dou #define N 100 static int __attribute__ ((noinline)) -vector (double _Complex ary[N], double _Complex sum, double _Complex prod) +vector (Type ary[N], Type sum, Type prod) { - double _Complex tsum = 0, tprod = 1; + Type tsum = 0, tprod = 1; #pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod) { @@ -41,9 +42,9 @@ vector (double _Complex ary[N], double _ } static int __attribute__ ((noinline)) -worker (double _Complex ary[N], double _Complex sum, double _Complex prod) +worker (Type ary[N], Type sum, Type prod) { - double _Complex tsum = 0, tprod = 1; + Type tsum = 0, tprod = 1; #pragma acc parallel num_workers(32) 
copyin(ary[0:N]) copy (tsum, tprod) { @@ -65,9 +66,9 @@ worker (double _Complex ary[N], double _ } static int __attribute__ ((noinline)) -gang (double _Complex ary[N], double _Complex sum, double _Complex prod) +gang (Type ary[N], Type sum, Type prod) { - double _Complex tsum = 0, tprod = 1; + Type tsum = 0, tprod = 1; #pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod) { @@ -90,7 +91,7 @@ gang (double _Complex ary[N], double _Co int main (void) { - double _Complex ary[N], sum = 0, prod = 1; + Type ary[N], sum = 0, prod = 1; for (int ix = 0; ix < N; ix++) { Index: testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c (revision 230605) +++ testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c (working copy) @@ -3,10 +3,11 @@ /* Single float has 23 bits of fraction. */ #define FRAC (1.0f / (1 << 20)) +typedef float _Complex Type; -int close_enough (float _Complex a, float _Complex b) +int close_enough (Type a, Type b) { - float _Complex diff = a - b; + Type diff = a - b; float mag2_a = __real__(a) * __real__ (a) + __imag__ (a) * __imag__ (a); float mag2_diff = (__real__(diff) * __real__ (diff) + __imag__ (diff) * __imag__ (diff)); @@ -17,9 +18,9 @@ int close_enough (float _Complex a, floa #define N 100 static int __attribute__ ((noinline)) -vector (float _Complex ary[N], float _Complex sum, float _Complex prod) +vector (Type ary[N], Type sum, Type prod) { - float _Complex tsum = 0, tprod = 1; + Type tsum = 0, tprod = 1; #pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod) { @@ -41,9 +42,9 @@ vector (float _Complex ary[N], float _Co } static int __attribute__ ((noinline)) -worker (float _Complex ary[N], float _Complex sum, float _Complex prod) +worker (Type ary[N], Type sum, Type prod) { - float _Complex tsum = 0, tprod = 1; + Type tsum = 0, tprod = 1; #pragma acc parallel num_workers(32) 
copyin(ary[0:N]) copy (tsum, tprod) { @@ -65,9 +66,9 @@ worker (float _Complex ary[N], float _Co } static int __attribute__ ((noinline)) -gang (float _Complex ary[N], float _Complex sum, float _Complex prod) +gang (Type ary[N], Type sum, Type prod) { - float _Complex tsum = 0, tprod = 1; + Type tsum = 0, tprod = 1; #pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod) { @@ -90,7 +91,7 @@ gang (float _Complex ary[N], float _Comp int main (void) { - float _Complex ary[N], sum = 0, prod = 1; + Type ary[N], sum = 0, prod = 1; for (int ix = 0; ix < N; ix++) { Index: testsuite/libgomp.oacc-c-c++-common/reduction-dbl.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-dbl.c (revision 0) +++ testsuite/libgomp.oacc-c-c++-common/reduction-dbl.c (working copy) @@ -0,0 +1,112 @@ + +/* Double float has 53 bits of fraction. */ +#define FRAC (1.0 / (1LL << 48)) +typedef double Type; + +int close_enough (Type a, Type b) +{ + Type diff = a - b; + if (diff < 0) + diff = -diff; + + return diff / a < FRAC; +} + +#define N 100 + +static int __attribute__ ((noinline)) +vector (Type ary[N], Type sum, Type prod) +{ + Type tsum = 0, tprod = 1; + +#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop vector reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +static int __attribute__ ((noinline)) +worker (Type ary[N], Type sum, Type prod) +{ + Type tsum = 0, tprod = 1; + +#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop worker reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 
1; + + return 0; +} + +static int __attribute__ ((noinline)) +gang (Type ary[N], Type sum, Type prod) +{ + Type tsum = 0, tprod = 1; + +#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop gang reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +int main (void) +{ + Type ary[N], sum = 0, prod = 1; + + for (int ix = 0; ix < N; ix++) + { + float frac = ix * (1.0f / 1024) + 1.0f; + + ary[ix] = frac; + sum += ary[ix]; + prod *= ary[ix]; + } + + if (vector (ary, sum, prod)) + return 1; + + if (worker (ary, sum, prod)) + return 1; + + if (gang (ary, sum, prod)) + return 1; + + return 0; +} Index: testsuite/libgomp.oacc-c-c++-common/reduction-flt.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-flt.c (revision 0) +++ testsuite/libgomp.oacc-c-c++-common/reduction-flt.c (working copy) @@ -0,0 +1,112 @@ + +/* Single float has 23 bits of fraction. 
*/ +#define FRAC (1.0f / (1 << 20)) +typedef float Type; + +int close_enough (Type a, Type b) +{ + Type diff = a - b; + if (diff < 0) + diff = -diff; + + return diff / a < FRAC; +} + +#define N 100 + +static int __attribute__ ((noinline)) +vector (Type ary[N], Type sum, Type prod) +{ + Type tsum = 0, tprod = 1; + +#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop vector reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +static int __attribute__ ((noinline)) +worker (Type ary[N], Type sum, Type prod) +{ + Type tsum = 0, tprod = 1; + +#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop worker reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +static int __attribute__ ((noinline)) +gang (Type ary[N], Type sum, Type prod) +{ + Type tsum = 0, tprod = 1; + +#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop gang reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +int main (void) +{ + Type ary[N], sum = 0, prod = 1; + + for (int ix = 0; ix < N; ix++) + { + float frac = ix * (1.0f / 1024) + 1.0f; + + ary[ix] = frac; + sum += ary[ix]; + prod *= ary[ix]; + } + + if (vector (ary, sum, prod)) + return 1; + + if (worker (ary, sum, prod)) + return 1; + + if (gang (ary, sum, prod)) + return 1; + + return 0; +}