===================================================================
@@ -0,0 +1,138 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+ This file is part of the UPC runtime Library.
+ Written by Gary Funck <gary@intrepid.com>
+ and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+/**
+ * @file gupcr_clock.c
+ * GUPC Clock routines.
+ */
+
+/**
+ * @addtogroup UPCCLOCK UPC Clock Functions
+ * @{
+ */
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_utils.h"
+
+static double gupcr_clock_rez;
+static double gupcr_clock_base;
+
+#if HAVE_CLOCK_GETTIME
+
+#if defined (CLOCK_MONOTONIC_RAW) && defined (CLOCK_MONOTONIC)
+#define GUPCR_CLOCK_ID CLOCK_MONOTONIC_RAW
+/* On some RHEL/CentOS systems, the timer resolution returned for
+ CLOCK_MONOTONIC_RAW is incorrect; query the resolution via
+ CLOCK_MONOTONIC instead. */
+#define GUPCR_CLOCK_REZ_ID CLOCK_MONOTONIC
+#elif defined (CLOCK_MONOTONIC)
+#define GUPCR_CLOCK_ID CLOCK_MONOTONIC
+#define GUPCR_CLOCK_REZ_ID CLOCK_MONOTONIC
+#else
+#error missing system clock name definition.
+#endif
+
+double
+gupcr_clock (void)
+{
+ struct timespec ts;
+ double t;
+ gupcr_syscall (clock_gettime, (GUPCR_CLOCK_ID, &ts));
+ t = (double) ts.tv_sec + (double) ts.tv_nsec * 1.0e-9;
+ t -= gupcr_clock_base;
+ return t;
+}
+
+double
+gupcr_clock_resolution (void)
+{
+ return gupcr_clock_rez;
+}
+
+void
+gupcr_clock_init (void)
+{
+ struct timespec clock_rez;
+ gupcr_syscall (clock_getres, (GUPCR_CLOCK_REZ_ID, &clock_rez));
+ gupcr_assert (clock_rez.tv_sec == 0);
+ gupcr_clock_rez = clock_rez.tv_nsec * 1.0e-9;
+ gupcr_clock_base = gupcr_clock ();
+}
+
+#else /* Use gettimeofday(). */
+
+double
+gupcr_clock (void)
+{
+ struct timeval tv;
+ double t;
+ gupcr_syscall (gettimeofday, (&tv, NULL));
+ t = (double) tv.tv_sec + (double) tv.tv_usec * 1.0e-6;
+ t -= gupcr_clock_base;
+ return t;
+}
+
+double
+gupcr_clock_resolution (void)
+{
+ return gupcr_clock_rez;
+}
+
+void
+gupcr_clock_init (void)
+{
+ int i;
+ gupcr_clock_base = gupcr_clock ();
+ gupcr_clock_rez = 1.0;
+ for (i = 1; i <= 10; ++i)
+ {
+ double t1, t2, diff;
+ t1 = gupcr_clock ();
+ do
+ {
+ t2 = gupcr_clock ();
+ }
+ while (t1 == t2);
+ diff = t2 - t1;
+ if (diff < gupcr_clock_rez)
+ gupcr_clock_rez = diff;
+ }
+ /* Round the clock resolution to a common value (1 us, 1 ms, 10 ms,
+ 1/60 s, or 1/50 s) when the measured resolution is close to one of them. */
+ if (gupcr_clock_rez > 0.9e-6 && gupcr_clock_rez < 1.1e-6)
+ gupcr_clock_rez = 1.0e-6;
+ else if (gupcr_clock_rez > 0.9e-3 && gupcr_clock_rez < 1.1e-3)
+ gupcr_clock_rez = 1.0e-3;
+ else if (gupcr_clock_rez > 0.9e-2 && gupcr_clock_rez < 1.1e-2)
+ gupcr_clock_rez = 1.0e-2;
+ else if (gupcr_clock_rez > 1.63e-2 && gupcr_clock_rez < 1.69e-2)
+ gupcr_clock_rez = 1.0 / 60.0;
+ else if (gupcr_clock_rez > 1.95e-2 && gupcr_clock_rez < 2.05e-2)
+ gupcr_clock_rez = 1.0 / 50.0;
+}
+
+#endif
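+
+/* A minimal usage sketch (illustration only; "work_to_time" is a
+   hypothetical function, not part of the runtime):
+
+     double t_start, t_elapsed;
+     t_start = gupcr_clock ();
+     work_to_time ();
+     t_elapsed = gupcr_clock () - t_start;
+
+   The elapsed time is meaningful only down to gupcr_clock_resolution (),
+   and gupcr_clock_init () is expected to have run first so that
+   gupcr_clock_base and gupcr_clock_rez are set.  */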
+/** @} */
===================================================================
@@ -0,0 +1,129 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+ This file is part of the UPC runtime Library.
+ Written by Gary Funck <gary@intrepid.com>
+ and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include <upc.h>
+#include <upc_collective.h>
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+#include "gupcr_coll_sup.h"
+
+/**
+ * @file gupcr_coll_broadcast.upc
+ * GUPC Portals4 collectives broadcast implementation.
+ *
+ * @addtogroup COLLECTIVES GUPCR Collectives Functions
+ * @{
+ */
+
+/**
+ * @fn upc_all_broadcast (shared void *dst,
+ * shared const void *src,
+ * size_t nbytes, upc_flag_t sync_mode)
+ * Broadcast data referenced by the src pointer.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] nbytes Number of bytes to broadcast
+ * @param [in] sync_mode Synchronization mode
+ * @ingroup COLLECTIVES
+ */
+
+void
+upc_all_broadcast (shared void *dst, shared const void *src,
+ size_t nbytes, upc_flag_t sync_mode)
+{
+ size_t src_thread = upc_threadof ((shared void *) src);
+ size_t send_cnt = nbytes;
+ int i, blk_cnt;
+
+ gupcr_trace (FC_COLL, "COLL ALL_BROADCAST ENTER %lu %lu",
+ (long unsigned) src_thread, (long unsigned) nbytes);
+#ifdef _UPC_COLL_CHECK_ARGS
+ upc_coll_err (dst, src, NULL, nbytes, sync_mode, 0, 0, 0, UPC_BRDCST);
+#endif
+
+ /* Initialize the collectives broadcast tree. */
+ gupcr_coll_tree_setup (src_thread, 0, THREADS);
+
+ /* Optional IN synchronization mode. */
+ if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+ upc_barrier;
+
+ blk_cnt = 0;
+ while (send_cnt)
+ {
+ size_t blk_size = (send_cnt > GUPCR_PORTALS_MAX_MSG_SIZE) ?
+ GUPCR_PORTALS_MAX_MSG_SIZE : send_cnt;
+ send_cnt -= blk_size;
+
+ if (MYTHREAD != (int) src_thread)
+ {
+ /* Wait for parent to deliver data. */
+ gupcr_coll_signal_wait (1);
+ }
+ else
+ {
+ /* Copy data into the thread's own memory. */
+ size_t doffset = upc_addrfield ((shared char *) dst + MYTHREAD);
+ size_t soffset = upc_addrfield ((shared void *) src);
+ doffset += blk_cnt * GUPCR_PORTALS_MAX_MSG_SIZE;
+ soffset += blk_cnt * GUPCR_PORTALS_MAX_MSG_SIZE;
+ gupcr_debug (FC_COLL,
+ "Local copy - doffset: %lld soffset: %lld nbytes: %lld",
+ (long long int) doffset, (long long int) soffset,
+ (long long int) nbytes);
+ memcpy ((char *) gupcr_gmem_base + doffset,
+ (char *) gupcr_gmem_base + soffset, blk_size);
+ }
+
+ /* Send data to all children. */
+ if (gupcr_coll_child_cnt)
+ {
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ int dthread = gupcr_coll_child[i];
+ size_t doffset = upc_addrfield ((shared char *) dst + dthread);
+ size_t soffset = upc_addrfield ((shared char *) dst + MYTHREAD);
+ doffset += blk_cnt * GUPCR_PORTALS_MAX_MSG_SIZE;
+ soffset += blk_cnt * GUPCR_PORTALS_MAX_MSG_SIZE;
+ gupcr_coll_put (dthread, doffset, soffset, blk_size);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+ }
+ ++blk_cnt;
+ }
+
+ /* Optional OUT synchronization mode. */
+ if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+ upc_barrier;
+ gupcr_trace (FC_COLL, "COLL ALL_BROADCAST EXIT");
+}
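+
+/* A minimal usage sketch (illustration only; the arrays and NELEMS
+   below are hypothetical, not part of this file).  All source data
+   has affinity to a single thread, and one block of NELEMS elements
+   is delivered to each thread:
+
+     #define NELEMS 16
+     shared [] int src_buf[NELEMS];
+     shared [NELEMS] int dst_buf[NELEMS * THREADS];
+
+     upc_all_broadcast (dst_buf, src_buf, NELEMS * sizeof (int),
+                        UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC);  */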
+
+/** @} */
===================================================================
@@ -0,0 +1,67 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+ This file is part of the UPC runtime Library.
+ Written by Gary Funck <gary@intrepid.com>
+ and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include <upc.h>
+#include <upc_collective.h>
+#include <upc_coll.h>
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+#include "gupcr_coll_sup.h"
+
+/**
+ * @file gupcr_coll_init.upc
+ * GUPC Portals4 collectives initialization.
+ *
+ * @addtogroup COLLECTIVES GUPCR Collectives Functions
+ * @{
+ */
+int upc_coll_init_flag = 0;
+
+/**
+ * Collectives initialization function.
+ *
+ * Initialize the storage areas needed for the broadcast/reduce
+ * thread trees.
+ */
+void
+upc_coll_init ()
+{
+ if (upc_coll_init_flag)
+ gupcr_fatal_error ("multiple attempts to initialize collectives");
+ upc_coll_init_flag = 1;
+
+ /* Allocate the "all reduce" storage area. */
+ gupcr_reduce_storage = (gupcr_reduce_str_t)
+ upc_all_alloc (THREADS, sizeof (struct gupcr_reduce_str));
+ if (gupcr_reduce_storage == NULL)
+ gupcr_fatal_error ("cannot allocate collectives reduce shared storage");
+}
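+
+/* A usage sketch: the reduce entry points in this library initialize
+   the collectives support lazily by guarding the call with the flag:
+
+     if (!upc_coll_init_flag)
+       upc_coll_init ();  */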
+
+/** @} */
===================================================================
@@ -0,0 +1,4978 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+ This file is part of the UPC runtime library.
+ Written by Gary Funck <gary@intrepid.com>
+ and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+/*****************************************************************************/
+/* */
+/* Copyright (c) 2004, Michigan Technological University */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or without */
+/* modification, are permitted provided that the following conditions */
+/* are met: */
+/* */
+/* * Redistributions of source code must retain the above copyright */
+/* notice, this list of conditions and the following disclaimer. */
+/* * Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials provided */
+/* with the distribution. */
+/* * Neither the name of the Michigan Technological University */
+/* nor the names of its contributors may be used to endorse or promote */
+/* products derived from this software without specific prior written */
+/* permission. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS */
+/* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT */
+/* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A */
+/* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER */
+/* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
+/* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, */
+/* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR */
+/* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING */
+/* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS */
+/* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/*****************************************************************************/
+
+#include <stdlib.h>
+#include <upc.h>
+#include <upc_collective.h>
+#include <upc_coll.h>
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+#include "gupcr_coll_sup.h"
+
+/*****************************************************************************/
+/* */
+/* UPC collective function library, reference implementation */
+/* */
+/* Steve Seidel, Dept. of Computer Science, Michigan Technological Univ. */
+/* steve@mtu.edu March 1, 2004 */
+/* */
+/*****************************************************************************/
+
+/**
+ * @file gupcr_coll_reduce.upc
+ * GUPC Portals4 reduce collectives implementation.
+ *
+ * @addtogroup COLLECTIVES GUPCR Collectives Functions
+ * @{
+ */
+
+/** Collectives reduce storage pointer */
+gupcr_reduce_str_t gupcr_reduce_storage;
+
+/**
+ * Convert from UPC reduce to Portals atomic operation.
+ *
+ * @param [in] op UPC reduce operation
+ * @return Portals atomic operation
+ */
+ptl_op_t
+gupcr_portals_reduce_op (upc_op_t op)
+{
+ switch (op)
+ {
+ case UPC_ADD:
+ return PTL_SUM;
+ case UPC_MULT:
+ return PTL_PROD;
+ case UPC_AND:
+ return PTL_BAND;
+ case UPC_OR:
+ return PTL_BOR;
+ case UPC_XOR:
+ return PTL_BXOR;
+ case UPC_LOGAND:
+ return PTL_LAND;
+ case UPC_LOGOR:
+ return PTL_LOR;
+ case UPC_MIN:
+ return PTL_MIN;
+ case UPC_MAX:
+ return PTL_MAX;
+ default:
+ gupcr_fatal_error ("cannot convert UPC reduce operation 0x%lx.", op);
+ }
+}
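+
+/* For example, gupcr_portals_reduce_op (UPC_ADD) yields PTL_SUM and
+   gupcr_portals_reduce_op (UPC_MIN) yields PTL_MIN; an operation with
+   no Portals equivalent (e.g. UPC_FUNC) falls through to the
+   fatal-error default above.  */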
+
+
+
+/**
+ * Collectives reduce (C) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ * some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ * for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ * parent shared space. The reduced values are propagated to the
+ * top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceC
+ (shared void *dst,
+ shared const void *src,
+ upc_op_t op,
+ size_t nelems,
+ size_t blk_size,
+ signed char (*func) (signed char, signed char), upc_flag_t sync_mode)
+{
+ int i, n_local, full_rows, last_row;
+ int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+ signed char local_result = 0;
+ signed char *l_src;
+
+ if (!upc_coll_init_flag)
+ upc_coll_init ();
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER signed char %lu %lu",
+ (long unsigned) nelems, (long unsigned) blk_size);
+
+ if (blk_size == 0)
+ blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+ upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+ upc_barrier;
+
+ /* Compute n_local, the number of elements local to this thread. */
+ n_local = 0;
+
+ /* Also compute start, the starting index of src for each thread. */
+
+ src_thr = upc_threadof ((shared void *) src);
+ dst_thr = upc_threadof ((shared void *) dst);
+ ph = upc_phaseof ((shared void *) src);
+
+ /* nelems plus the number of virtual elements in first row. */
+ velems = nelems + src_thr * blk_size + ph;
+
+ /* Include virtual elements when computing number of local elements. */
+ full_rows = velems / (blk_size * THREADS);
+ last_row = velems % (blk_size * THREADS);
+ tail_thr = last_row / blk_size;
+
+ /* Calculate number of participating threads. */
+ num_thr = (nelems + ph + blk_size - 1) / blk_size;
+ if (num_thr > THREADS)
+ num_thr = THREADS;
+
+ gupcr_debug (FC_COLL,
+ "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+ src_thr, tail_thr, ph, num_thr, full_rows);
+
+ /* Calculate number of local elements. */
+ if (blk_size > 0)
+ {
+ if (MYTHREAD <= tail_thr)
+ if (MYTHREAD == tail_thr)
+ extras = last_row % blk_size;
+ else
+ extras = blk_size;
+ else
+ extras = 0;
+
+ n_local = blk_size * full_rows + extras;
+
+ /* Adjust the number of elements in this thread, if necessary. */
+ if (MYTHREAD < src_thr)
+ n_local -= blk_size;
+ else if (MYTHREAD == src_thr)
+ n_local -= ph;
+ }
+ else
+ {
+ n_local = 0;
+ if (src_thr == MYTHREAD) /* Revise the number of local elements. */
+ n_local = nelems;
+ }
+
+ /* Starting index for this thread
+ Note: start is sometimes negative because src is
+ addressed here as if its block size is 1. */
+
+ if (blk_size > 0)
+ if (MYTHREAD > src_thr)
+ start = MYTHREAD - src_thr - ph * THREADS;
+ else if (MYTHREAD < src_thr)
+ start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+ else /* This is the source thread. */
+ start = 0;
+ else
+ start = 0;
+
+
+ /* Reduce the elements local to this thread. */
+
+ if (n_local > 0)
+ {
+ int loop_cnt = n_local - 1;
+
+ l_src = (signed char *) ((shared const signed char *) src + start);
+ local_result = *l_src++;
+
+ switch (op)
+ {
+ case UPC_ADD:
+ while (loop_cnt--)
+ local_result += *l_src++;
+ break;
+ case UPC_MULT:
+ while (loop_cnt--)
+ local_result *= *l_src++;
+ break;
+ /* Skip if not integral type, per spec 4.3.1.1
+ (See additional comments in upc_collective.c) */
+ case UPC_AND:
+ while (loop_cnt--)
+ local_result &= *l_src++;
+ break;
+ case UPC_OR:
+ while (loop_cnt--)
+ local_result |= *l_src++;
+ break;
+ case UPC_XOR:
+ while (loop_cnt--)
+ local_result ^= *l_src++;
+ break;
+ case UPC_LOGAND:
+ while (loop_cnt--)
+ local_result = local_result && *l_src++;
+ break;
+ case UPC_LOGOR:
+ while (loop_cnt--)
+ local_result = local_result || *l_src++;
+ break;
+ case UPC_MIN:
+ while (loop_cnt--)
+ {
+ if (local_result > *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_MAX:
+ while (loop_cnt--)
+ {
+ if (local_result < *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ case UPC_NONCOMM_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ default:
+ gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+ }
+ }
+
+ /* Note: local_result is undefined if n_local == 0.
+ Note: Only a proper subset of threads has a meaningful local_result.
+ Note: the thread with affinity to dst might not have a local result. */
+
+ /* Global reduce on only participating threads. */
+ if (n_local)
+ {
+ /* Local pointer where reduced values are written to. */
+ signed char *t_result =
+ (signed char *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+ /* Initialize collectives reduce tree. */
+ gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+ /* Copy the local result into the area used for the reduce operation.
+ NOTE: Not needed in the case of collective functions; however,
+ this covers the case of only one thread. */
+ *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions. */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+ /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
+ to use triggered operations on inner nodes. In that case, inner
+ nodes must calculate reduced value by calling the specified
+ function. */
+ if (gupcr_coll_child_cnt)
+ {
+ if (IS_ROOT_THREAD)
+ {
+ /* ROOT THREAD */
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for children to report their values. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+ /* Reduce local values with those of children if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result =
+ func (local_result, *(signed char *)
+ & gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ }
+ }
+ else
+ {
+ /* INNER THREAD */
+ /* Prepare triggered atomic function. */
+ if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+ {
+ /* Use triggered atomic operations once the children have sent
+ their results and the parent is ready to receive them. */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (signed char),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_CHAR,
+ gupcr_coll_child_cnt + 1);
+ }
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait until the children have sent their values and the parent is ready. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+ /* Execute reduce functions if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ size_t doffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value
+ [gupcr_coll_child_index]));
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result = func (local_result, *(signed char *)
+ &
+ gupcr_reduce_storage
+ [MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+ sizeof (signed char));
+ }
+ /* Wait for our value to go up the tree. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+ else
+ {
+ /* Avoid the case where only one thread is available. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ switch (op)
+ {
+ case UPC_FUNC:
+ case UPC_NONCOMM_FUNC:
+ {
+ /* Schedule a triggered put once signal is received. */
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].
+ value
+ [gupcr_coll_child_index]));
+ gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+ offset, sizeof (signed char), 1);
+ }
+ break;
+ default:
+ /* Schedule a triggered atomic put once parent is ready. */
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (signed char),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_CHAR, 1);
+ break;
+ }
+ /* Wait for parent to be ready. */
+ gupcr_coll_signal_wait (1);
+ /* Wait for our value to leave. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+#else /* NO TRIGGERED OPS */
+ /* Send signal to all children. */
+ if (gupcr_coll_child_cnt)
+ {
+ /* ROOT OR INNER THREAD */
+ int wait_cnt = gupcr_coll_child_cnt;
+
+ /* Signal that the parent is ready to receive the locally reduced
+ values from its children. The value sent does not matter. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (wait_cnt);
+
+ /* Wait for children to report their local reduced values and
+ parent to report it is ready to receive the reduced value. */
+ if (!IS_ROOT_THREAD)
+ ++wait_cnt;
+ gupcr_coll_signal_wait (wait_cnt);
+
+ /* Compute result if reduce functions are used. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result = func (local_result,
+ *(signed char *) &
+ gupcr_reduce_storage[MYTHREAD].value
+ [i]);
+ }
+ /* Prepare reduced value for going up the tree. */
+ *t_result = local_result;
+ }
+ }
+ else if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ gupcr_coll_signal_wait (1);
+ }
+
+ /* Send reduced value to the parent. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF OR INNER THREAD */
+ /* Each child places its result into the parent memory slot
+ dedicated for the child. The parent is responsible
+ for creating the reduced result for itself and its
+ children. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value
+ [gupcr_coll_child_index]));
+ size_t soffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value[0]));
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+ sizeof (signed char));
+ }
+ else
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+ sizeof (signed char),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_CHAR);
+ }
+ gupcr_coll_ack_wait (1);
+ }
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+ /* Copy result into the caller's specified destination. */
+ if (IS_ROOT_THREAD)
+ {
+ *(shared signed char *) dst = *t_result;
+ }
+ }
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+ upc_barrier;
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
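+
+/* A minimal usage sketch for this signed char variant (illustration
+   only; the arrays and NELEMS below are hypothetical, not part of
+   this file).  Each thread contributes NELEMS elements and the sum
+   is written to a single shared location:
+
+     #define NELEMS 8
+     shared [NELEMS] signed char A[NELEMS * THREADS];
+     shared signed char sum;
+
+     upc_all_reduceC (&sum, A, UPC_ADD, NELEMS * THREADS, NELEMS,
+                      NULL, UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC);  */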
+
+
+/**
+ * Collectives reduce (UC) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ * some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ * for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ * parent shared space. The reduced values are propagated to the
+ * top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceUC
+ (shared void *dst,
+ shared const void *src,
+ upc_op_t op,
+ size_t nelems,
+ size_t blk_size,
+ unsigned char (*func) (unsigned char, unsigned char), upc_flag_t sync_mode)
+{
+ int i, n_local, full_rows, last_row;
+ int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+ unsigned char local_result = 0;
+ unsigned char *l_src;
+
+ if (!upc_coll_init_flag)
+ upc_coll_init ();
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER unsigned char %lu %lu",
+ (long unsigned) nelems, (long unsigned) blk_size);
+
+ if (blk_size == 0)
+ blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+ upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+ upc_barrier;
+
+ /* Compute n_local, the number of elements local to this thread. */
+ n_local = 0;
+
+ /* Also compute start, the starting index of src for each thread. */
+
+ src_thr = upc_threadof ((shared void *) src);
+ dst_thr = upc_threadof ((shared void *) dst);
+ ph = upc_phaseof ((shared void *) src);
+
+ /* nelems plus the number of virtual elements in first row. */
+ velems = nelems + src_thr * blk_size + ph;
+
+ /* Include virtual elements when computing number of local elements. */
+ full_rows = velems / (blk_size * THREADS);
+ last_row = velems % (blk_size * THREADS);
+ tail_thr = last_row / blk_size;
+
+ /* Calculate number of participating threads. */
+ num_thr = (nelems + ph + blk_size - 1) / blk_size;
+ if (num_thr > THREADS)
+ num_thr = THREADS;
+
+ gupcr_debug (FC_COLL,
+ "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+ src_thr, tail_thr, ph, num_thr, full_rows);
+
+ /* Calculate number of local elements. */
+ if (blk_size > 0)
+ {
+ if (MYTHREAD <= tail_thr)
+ if (MYTHREAD == tail_thr)
+ extras = last_row % blk_size;
+ else
+ extras = blk_size;
+ else
+ extras = 0;
+
+ n_local = blk_size * full_rows + extras;
+
+ /* Adjust the number of elements in this thread, if necessary. */
+ if (MYTHREAD < src_thr)
+ n_local -= blk_size;
+ else if (MYTHREAD == src_thr)
+ n_local -= ph;
+ }
+ else
+ {
+ n_local = 0;
+ if (src_thr == MYTHREAD) /* Revise the number of local elements. */
+ n_local = nelems;
+ }
+
+ /* Starting index for this thread
+ Note: start is sometimes negative because src is
+ addressed here as if its block size is 1. */
+
+ if (blk_size > 0)
+ if (MYTHREAD > src_thr)
+ start = MYTHREAD - src_thr - ph * THREADS;
+ else if (MYTHREAD < src_thr)
+ start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+ else /* This is the source thread. */
+ start = 0;
+ else
+ start = 0;
+
+
+ /* Reduce the elements local to this thread. */
+
+ if (n_local > 0)
+ {
+ int loop_cnt = n_local - 1;
+
+ l_src = (unsigned char *) ((shared const unsigned char *) src + start);
+ local_result = *l_src++;
+
+ switch (op)
+ {
+ case UPC_ADD:
+ while (loop_cnt--)
+ local_result += *l_src++;
+ break;
+ case UPC_MULT:
+ while (loop_cnt--)
+ local_result *= *l_src++;
+ break;
+ /* Skip if not integral type, per spec 4.3.1.1
+ (See additional comments in upc_collective.c) */
+ case UPC_AND:
+ while (loop_cnt--)
+ local_result &= *l_src++;
+ break;
+ case UPC_OR:
+ while (loop_cnt--)
+ local_result |= *l_src++;
+ break;
+ case UPC_XOR:
+ while (loop_cnt--)
+ local_result ^= *l_src++;
+ break;
+ case UPC_LOGAND:
+ while (loop_cnt--)
+ local_result = local_result && *l_src++;
+ break;
+ case UPC_LOGOR:
+ while (loop_cnt--)
+ local_result = local_result || *l_src++;
+ break;
+ case UPC_MIN:
+ while (loop_cnt--)
+ {
+ if (local_result > *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_MAX:
+ while (loop_cnt--)
+ {
+ if (local_result < *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ case UPC_NONCOMM_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ default:
+ gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+ }
+ }
+
+ /* Note: local_result is undefined if n_local == 0.
+ Note: Only a proper subset of threads has a meaningful local_result.
+ Note: the thread with affinity to dst might not have a local result. */
+
+ /* Global reduce on only participating threads. */
+ if (n_local)
+ {
+ /* Local pointer where reduced values are written to. */
+ unsigned char *t_result =
+ (unsigned char *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+ /* Initialize collectives reduce tree. */
+ gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+ /* Copy the local result into the area used for the reduce operation.
+ NOTE: Not needed in the case of collective functions; however,
+ this covers the case of only one thread. */
+ *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions. */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+ /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
+ to use triggered operations on inner nodes. In that case, inner
+ nodes must calculate reduced value by calling the specified
+ function. */
+ if (gupcr_coll_child_cnt)
+ {
+ if (IS_ROOT_THREAD)
+ {
+ /* ROOT THREAD */
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for children to report their values. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+ /* Reduce local values with those of children if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result =
+ func (local_result, *(unsigned char *)
+ & gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ }
+ }
+ else
+ {
+ /* INNER THREAD */
+ /* Prepare triggered atomic function. */
+ if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+ {
+ /* Use triggered atomic operations once the children have sent
+ their results and the parent is ready to receive them. */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (unsigned char),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_UCHAR,
+ gupcr_coll_child_cnt + 1);
+ }
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait until the children have sent their values and the parent is ready. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+ /* Execute reduce functions if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ size_t doffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value
+ [gupcr_coll_child_index]));
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result = func (local_result, *(unsigned char *)
+ &
+ gupcr_reduce_storage
+ [MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+ sizeof (unsigned char));
+ }
+ /* Wait for our value to go up the tree. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+ else
+ {
+ /* Avoid the case where only one thread is available. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ switch (op)
+ {
+ case UPC_FUNC:
+ case UPC_NONCOMM_FUNC:
+ {
+ /* Schedule a triggered put once signal is received. */
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].
+ value
+ [gupcr_coll_child_index]));
+ gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+ offset, sizeof (unsigned char), 1);
+ }
+ break;
+ default:
+ /* Schedule a triggered atomic put once parent is ready. */
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (unsigned char),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_UCHAR, 1);
+ break;
+ }
+ /* Wait for parent to be ready. */
+ gupcr_coll_signal_wait (1);
+ /* Wait for our value to leave. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+#else /* NO TRIGGERED OPS */
+ /* Send signal to all children. */
+ if (gupcr_coll_child_cnt)
+ {
+ /* ROOT OR INNER THREAD */
+ int wait_cnt = gupcr_coll_child_cnt;
+
+ /* Signal that the parent is ready to receive the locally reduced
+ values from its children. The value sent does not matter. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (wait_cnt);
+
+ /* Wait for children to report their local reduced values and
+ parent to report it is ready to receive the reduced value. */
+ if (!IS_ROOT_THREAD)
+ ++wait_cnt;
+ gupcr_coll_signal_wait (wait_cnt);
+
+ /* Compute result if reduce functions are used. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result = func (local_result,
+ *(unsigned char *) &
+ gupcr_reduce_storage[MYTHREAD].value
+ [i]);
+ }
+ /* Prepare reduced value for going up the tree. */
+ *t_result = local_result;
+ }
+ }
+ else if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ gupcr_coll_signal_wait (1);
+ }
+
+ /* Send reduced value to the parent. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF OR INNER THREAD */
+ /* Each child places its result into the parent memory slot
+ dedicated for the child. The parent is responsible
+ for creating the reduced result for itself and its
+ children. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value
+ [gupcr_coll_child_index]));
+ size_t soffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value[0]));
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+ sizeof (unsigned char));
+ }
+ else
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+ sizeof (unsigned char),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_UCHAR);
+ }
+ gupcr_coll_ack_wait (1);
+ }
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+ /* Copy result into the caller's specified destination. */
+ if (IS_ROOT_THREAD)
+ {
+ *(shared unsigned char *) dst = *t_result;
+ }
+ }
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+ upc_barrier;
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (S) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ * some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ * for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ * parent shared space. The reduced values are propagated to the
+ * top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceS
+ (shared void *dst,
+ shared const void *src,
+ upc_op_t op,
+ size_t nelems,
+ size_t blk_size,
+ signed short (*func) (signed short, signed short), upc_flag_t sync_mode)
+{
+ int i, n_local, full_rows, last_row;
+ int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+ signed short local_result = 0;
+ signed short *l_src;
+
+ if (!upc_coll_init_flag)
+ upc_coll_init ();
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER signed short %lu %lu",
+ (long unsigned) nelems, (long unsigned) blk_size);
+
+ if (blk_size == 0)
+ blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+ upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+ upc_barrier;
+
+ /* Compute n_local, the number of elements local to this thread. */
+ n_local = 0;
+
+ /* Also compute start, the starting index of src for each thread. */
+
+ src_thr = upc_threadof ((shared void *) src);
+ dst_thr = upc_threadof ((shared void *) dst);
+ ph = upc_phaseof ((shared void *) src);
+
+ /* nelems plus the number of virtual elements in first row. */
+ velems = nelems + src_thr * blk_size + ph;
+
+ /* Include virtual elements when computing number of local elements. */
+ full_rows = velems / (blk_size * THREADS);
+ last_row = velems % (blk_size * THREADS);
+ tail_thr = last_row / blk_size;
+
+ /* Calculate number of participating threads. */
+ num_thr = (nelems + ph + blk_size - 1) / blk_size;
+ if (num_thr > THREADS)
+ num_thr = THREADS;
+
+ gupcr_debug (FC_COLL,
+ "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+ src_thr, tail_thr, ph, num_thr, full_rows);
+
+ /* Calculate number of local elements. */
+ if (blk_size > 0)
+ {
+ if (MYTHREAD <= tail_thr)
+ if (MYTHREAD == tail_thr)
+ extras = last_row % blk_size;
+ else
+ extras = blk_size;
+ else
+ extras = 0;
+
+ n_local = blk_size * full_rows + extras;
+
+ /* Adjust the number of elements in this thread, if necessary. */
+ if (MYTHREAD < src_thr)
+ n_local -= blk_size;
+ else if (MYTHREAD == src_thr)
+ n_local -= ph;
+ }
+ else
+ {
+ n_local = 0;
+ if (src_thr == MYTHREAD) /* Revise the number of local elements. */
+ n_local = nelems;
+ }
+
+ /* Starting index for this thread
+ Note: start is sometimes negative because src is
+ addressed here as if its block size is 1. */
+
+ if (blk_size > 0)
+ if (MYTHREAD > src_thr)
+ start = MYTHREAD - src_thr - ph * THREADS;
+ else if (MYTHREAD < src_thr)
+ start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+ else /* This is the source thread. */
+ start = 0;
+ else
+ start = 0;
+
+
+ /* Reduce the elements local to this thread. */
+
+ if (n_local > 0)
+ {
+ int loop_cnt = n_local - 1;
+
+ l_src = (signed short *) ((shared const signed short *) src + start);
+ local_result = *l_src++;
+
+ switch (op)
+ {
+ case UPC_ADD:
+ while (loop_cnt--)
+ local_result += *l_src++;
+ break;
+ case UPC_MULT:
+ while (loop_cnt--)
+ local_result *= *l_src++;
+ break;
+ /* Skip if not integral type, per spec 4.3.1.1
+ (See additional comments in upc_collective.c) */
+ case UPC_AND:
+ while (loop_cnt--)
+ local_result &= *l_src++;
+ break;
+ case UPC_OR:
+ while (loop_cnt--)
+ local_result |= *l_src++;
+ break;
+ case UPC_XOR:
+ while (loop_cnt--)
+ local_result ^= *l_src++;
+ break;
+ case UPC_LOGAND:
+ while (loop_cnt--)
+ local_result = local_result && *l_src++;
+ break;
+ case UPC_LOGOR:
+ while (loop_cnt--)
+ local_result = local_result || *l_src++;
+ break;
+ case UPC_MIN:
+ while (loop_cnt--)
+ {
+ if (local_result > *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_MAX:
+ while (loop_cnt--)
+ {
+ if (local_result < *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ case UPC_NONCOMM_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ default:
+ gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+ }
+ }
+
+ /* Note: local_result is undefined if n_local == 0.
+ Note: Only a proper subset of threads has a meaningful local_result.
+ Note: the thread with affinity to dst might not have a local result. */
+
+ /* Global reduce on only participating threads. */
+ if (n_local)
+ {
+ /* Local pointer where reduced values are written to. */
+ signed short *t_result =
+ (signed short *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+ /* Initialize collectives reduce tree. */
+ gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+ /* Copy the local result into the area used for the reduce operation.
+ NOTE: Not needed in the case of collective functions; however,
+ this covers the case of only one thread. */
+ *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions. */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+ /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
+ to use triggered operations on inner nodes. In that case, inner
+ nodes must calculate reduced value by calling the specified
+ function. */
+ if (gupcr_coll_child_cnt)
+ {
+ if (IS_ROOT_THREAD)
+ {
+ /* ROOT THREAD */
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for children to report their values. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+ /* Reduce local values with those of children if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result =
+ func (local_result, *(signed short *)
+ & gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ }
+ }
+ else
+ {
+ /* INNER THREAD */
+ /* Prepare triggered atomic function. */
+ if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+ {
+ /* Use triggered atomic operations once the children have sent
+ their results and the parent is ready to receive them. */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (signed short),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_SHORT,
+ gupcr_coll_child_cnt + 1);
+ }
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait until the children have sent their values and the parent is ready. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+ /* Execute reduce functions if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ size_t doffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value
+ [gupcr_coll_child_index]));
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result = func (local_result, *(signed short *)
+ &
+ gupcr_reduce_storage
+ [MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+ sizeof (signed short));
+ }
+ /* Wait for our value to go up the tree. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+ else
+ {
+ /* Avoid the case where only one thread is available. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ switch (op)
+ {
+ case UPC_FUNC:
+ case UPC_NONCOMM_FUNC:
+ {
+ /* Schedule a triggered put once signal is received. */
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].
+ value
+ [gupcr_coll_child_index]));
+ gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+ offset, sizeof (signed short), 1);
+ }
+ break;
+ default:
+ /* Schedule a triggered atomic put once parent is ready. */
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (signed short),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_SHORT, 1);
+ break;
+ }
+ /* Wait for parent to be ready. */
+ gupcr_coll_signal_wait (1);
+ /* Wait for our value to leave. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+#else /* NO TRIGGERED OPS */
+ /* Send signal to all children. */
+ if (gupcr_coll_child_cnt)
+ {
+ /* ROOT OR INNER THREAD */
+ int wait_cnt = gupcr_coll_child_cnt;
+
+ /* Signal that the parent is ready to receive the locally reduced
+ values from its children. The value sent does not matter. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (wait_cnt);
+
+ /* Wait for children to report their local reduced values and
+ parent to report it is ready to receive the reduced value. */
+ if (!IS_ROOT_THREAD)
+ ++wait_cnt;
+ gupcr_coll_signal_wait (wait_cnt);
+
+ /* Compute result if reduce functions are used. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result = func (local_result,
+ *(signed short *) &
+ gupcr_reduce_storage[MYTHREAD].value
+ [i]);
+ }
+ /* Prepare reduced value for going up the tree. */
+ *t_result = local_result;
+ }
+ }
+ else if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ gupcr_coll_signal_wait (1);
+ }
+
+ /* Send reduced value to the parent. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF OR INNER THREAD */
+ /* Each child places its result into the parent memory slot
+ dedicated for the child. The parent is responsible
+ for creating the reduced result for itself and its
+ children. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value
+ [gupcr_coll_child_index]));
+ size_t soffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value[0]));
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+ sizeof (signed short));
+ }
+ else
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+ sizeof (signed short),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_SHORT);
+ }
+ gupcr_coll_ack_wait (1);
+ }
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+ /* Copy result into the caller's specified destination. */
+ if (IS_ROOT_THREAD)
+ {
+ *(shared signed short *) dst = *t_result;
+ }
+ }
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+ upc_barrier;
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (US) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ * some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ * for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ * parent shared space. The reduced values are propagated to the
+ * top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceUS
+ (shared void *dst,
+ shared const void *src,
+ upc_op_t op,
+ size_t nelems,
+ size_t blk_size,
+ unsigned short (*func) (unsigned short, unsigned short), upc_flag_t sync_mode)
+{
+ int i, n_local, full_rows, last_row;
+ int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+ unsigned short local_result = 0;
+ unsigned short *l_src;
+
+ if (!upc_coll_init_flag)
+ upc_coll_init ();
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER unsigned short %lu %lu",
+ (long unsigned) nelems, (long unsigned) blk_size);
+
+ if (blk_size == 0)
+ blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+ upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+ upc_barrier;
+
+ /* Compute n_local, the number of elements local to this thread. */
+ n_local = 0;
+
+ /* Also compute start, the starting index of src for each thread. */
+
+ src_thr = upc_threadof ((shared void *) src);
+ dst_thr = upc_threadof ((shared void *) dst);
+ ph = upc_phaseof ((shared void *) src);
+
+ /* nelems plus the number of virtual elements in first row. */
+ velems = nelems + src_thr * blk_size + ph;
+
+ /* Include virtual elements when computing number of local elements. */
+ full_rows = velems / (blk_size * THREADS);
+ last_row = velems % (blk_size * THREADS);
+ tail_thr = last_row / blk_size;
+
+ /* Calculate number of participating threads. */
+ num_thr = (nelems + ph + blk_size - 1) / blk_size;
+ if (num_thr > THREADS)
+ num_thr = THREADS;
+
+ gupcr_debug (FC_COLL,
+ "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+ src_thr, tail_thr, ph, num_thr, full_rows);
+
+ /* Calculate number of local elements. */
+ if (blk_size > 0)
+ {
+ if (MYTHREAD <= tail_thr)
+ if (MYTHREAD == tail_thr)
+ extras = last_row % blk_size;
+ else
+ extras = blk_size;
+ else
+ extras = 0;
+
+ n_local = blk_size * full_rows + extras;
+
+ /* Adjust the number of elements in this thread, if necessary. */
+ if (MYTHREAD < src_thr)
+ n_local -= blk_size;
+ else if (MYTHREAD == src_thr)
+ n_local -= ph;
+ }
+ else
+ {
+ n_local = 0;
+ if (src_thr == MYTHREAD) /* Revise the number of local elements. */
+ n_local = nelems;
+ }
+
+ /* Starting index for this thread
+ Note: start is sometimes negative because src is
+ addressed here as if its block size is 1. */
+
+ if (blk_size > 0)
+ if (MYTHREAD > src_thr)
+ start = MYTHREAD - src_thr - ph * THREADS;
+ else if (MYTHREAD < src_thr)
+ start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+ else /* This is the source thread. */
+ start = 0;
+ else
+ start = 0;
+
+
+ /* Reduce the elements local to this thread. */
+
+ if (n_local > 0)
+ {
+ int loop_cnt = n_local - 1;
+
+ l_src = (unsigned short *) ((shared const unsigned short *) src + start);
+ local_result = *l_src++;
+
+ switch (op)
+ {
+ case UPC_ADD:
+ while (loop_cnt--)
+ local_result += *l_src++;
+ break;
+ case UPC_MULT:
+ while (loop_cnt--)
+ local_result *= *l_src++;
+ break;
+ /* Skip if not integral type, per spec 4.3.1.1
+ (See additional comments in upc_collective.c) */
+ case UPC_AND:
+ while (loop_cnt--)
+ local_result &= *l_src++;
+ break;
+ case UPC_OR:
+ while (loop_cnt--)
+ local_result |= *l_src++;
+ break;
+ case UPC_XOR:
+ while (loop_cnt--)
+ local_result ^= *l_src++;
+ break;
+ case UPC_LOGAND:
+ while (loop_cnt--)
+ local_result = local_result && *l_src++;
+ break;
+ case UPC_LOGOR:
+ while (loop_cnt--)
+ local_result = local_result || *l_src++;
+ break;
+ case UPC_MIN:
+ while (loop_cnt--)
+ {
+ if (local_result > *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_MAX:
+ while (loop_cnt--)
+ {
+ if (local_result < *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ case UPC_NONCOMM_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ default:
+ gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+ }
+ }
+
+ /* Note: local_result is undefined if n_local == 0.
+ Note: Only a proper subset of threads has a meaningful local_result.
+ Note: the thread with affinity to dst might not have a local result. */
+
+ /* Global reduce on only participating threads. */
+ if (n_local)
+ {
+ /* Local pointer where reduced values are written to. */
+ unsigned short *t_result =
+ (unsigned short *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+ /* Initialize collectives reduce tree. */
+ gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy the local result into the area used for the reduce operation.
+ NOTE: Not needed for the case of collective functions. However,
+ this covers the case of only one thread. */
+ *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run the reduce operation without triggered operations. */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM_FUNC, it is not
+         possible to use triggered operations on inner nodes.  In that case,
+         inner nodes must calculate the reduced value by calling the
+         specified function. */
+ if (gupcr_coll_child_cnt)
+ {
+ if (IS_ROOT_THREAD)
+ {
+ /* ROOT THREAD */
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for children to report their values. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+ /* Reduce local values with those of children if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result =
+ func (local_result, *(unsigned short *)
+ & gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ }
+ }
+ else
+ {
+ /* INNER THREAD */
+ /* Prepare triggered atomic function. */
+ if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+ {
+                  /* Use triggered atomic operations once the children have
+                     sent their results and the parent is ready to receive
+                     them. */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (unsigned short),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_USHORT,
+ gupcr_coll_child_cnt + 1);
+ }
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+              /* Wait until all children have reported their values and the
+                 parent is ready. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+ /* Execute reduce functions if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ size_t doffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value
+ [gupcr_coll_child_index]));
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+                      local_result =
+                        func (local_result, *(unsigned short *)
+                              &gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+ sizeof (unsigned short));
+ }
+ /* Wait for our value to go up the tree. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+ else
+ {
+          /* A root thread with no children means only one thread
+             participates; there is nothing to send or to wait for. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ switch (op)
+ {
+ case UPC_FUNC:
+ case UPC_NONCOMM_FUNC:
+ {
+ /* Schedule a triggered put once signal is received. */
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].
+ value
+ [gupcr_coll_child_index]));
+ gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+ offset, sizeof (unsigned short), 1);
+ }
+ break;
+ default:
+ /* Schedule a triggered atomic put once parent is ready. */
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (unsigned short),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_USHORT, 1);
+ break;
+ }
+ /* Wait for parent to be ready. */
+ gupcr_coll_signal_wait (1);
+ /* Wait for our value to leave. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+#else /* NO TRIGGERED OPS */
+ /* Send signal to all children. */
+ if (gupcr_coll_child_cnt)
+ {
+ /* ROOT OR INNER THREAD */
+ int wait_cnt = gupcr_coll_child_cnt;
+
+          /* Signal that the parent is ready to receive the locally reduced
+             values from its children.  The value sent does not matter. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (wait_cnt);
+
+ /* Wait for children to report their local reduced values and
+ parent to report it is ready to receive the reduced value. */
+ if (!IS_ROOT_THREAD)
+ ++wait_cnt;
+ gupcr_coll_signal_wait (wait_cnt);
+
+ /* Compute result if reduce functions are used. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+                  local_result =
+                    func (local_result, *(unsigned short *)
+                          &gupcr_reduce_storage[MYTHREAD].value[i]);
+ [i]);
+ }
+ /* Prepare reduced value for going up the tree. */
+ *t_result = local_result;
+ }
+ }
+ else if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ gupcr_coll_signal_wait (1);
+ }
+
+ /* Send reduced value to the parent. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF OR INNER THREAD */
+ /* Each child places its result into the parent memory slot
+ dedicated for the child. The parent is responsible
+ for creating the reduced result for itself and its
+ children. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value
+ [gupcr_coll_child_index]));
+ size_t soffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value[0]));
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+ sizeof (unsigned short));
+ }
+ else
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+ sizeof (unsigned short),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_USHORT);
+ }
+ gupcr_coll_ack_wait (1);
+ }
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+ /* Copy result into the caller's specified destination. */
+ if (IS_ROOT_THREAD)
+ {
+ *(shared unsigned short *) dst = *t_result;
+ }
+ }
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+ upc_barrier;
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (I) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ * some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ * for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ * parent shared space. The reduced values are propagated to the
+ * top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
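+ * A minimal call sketch (illustrative only; the array shape and the
+ * synchronization flags are assumptions, not requirements of this
+ * implementation):
+ *
+ * @code
+ * #include <upc.h>
+ * #include <upc_collective.h>
+ *
+ * #define BLK 4
+ * shared [BLK] signed int A[BLK * THREADS];
+ * shared signed int sum;
+ *
+ * // Every thread fills its own block of A, then all threads call:
+ * upc_all_reduceI (&sum, A, UPC_ADD, BLK * THREADS, BLK, NULL,
+ *                  UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC);
+ * // sum now holds the sum of all BLK * THREADS elements of A.
+ * @endcode
+ *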
+ */
+void upc_all_reduceI
+ (shared void *dst,
+ shared const void *src,
+ upc_op_t op,
+ size_t nelems,
+ size_t blk_size,
+ signed int (*func) (signed int, signed int), upc_flag_t sync_mode)
+{
+ int i, n_local, full_rows, last_row;
+ int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+ signed int local_result = 0;
+ signed int *l_src;
+
+ if (!upc_coll_init_flag)
+ upc_coll_init ();
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER signed int %lu %lu",
+ (long unsigned) nelems, (long unsigned) blk_size);
+
+ if (blk_size == 0)
+ blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+ upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+ upc_barrier;
+
+ /* Compute n_local, the number of elements local to this thread. */
+ n_local = 0;
+
+ /* Also compute start, the starting index of src for each thread. */
+
+ src_thr = upc_threadof ((shared void *) src);
+ dst_thr = upc_threadof ((shared void *) dst);
+ ph = upc_phaseof ((shared void *) src);
+
+ /* nelems plus the number of virtual elements in first row. */
+ velems = nelems + src_thr * blk_size + ph;
+
+ /* Include virtual elements when computing number of local elements. */
+ full_rows = velems / (blk_size * THREADS);
+ last_row = velems % (blk_size * THREADS);
+ tail_thr = last_row / blk_size;
+
+ /* Calculate number of participating threads. */
+ num_thr = (nelems + ph + blk_size - 1) / blk_size;
+ if (num_thr > THREADS)
+ num_thr = THREADS;
+
+ gupcr_debug (FC_COLL,
+ "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+ src_thr, tail_thr, ph, num_thr, full_rows);
+
+ /* Calculate number of local elements. */
+ if (blk_size > 0)
+ {
+ if (MYTHREAD <= tail_thr)
+ if (MYTHREAD == tail_thr)
+ extras = last_row % blk_size;
+ else
+ extras = blk_size;
+ else
+ extras = 0;
+
+ n_local = blk_size * full_rows + extras;
+
+ /* Adjust the number of elements in this thread, if necessary. */
+ if (MYTHREAD < src_thr)
+ n_local -= blk_size;
+ else if (MYTHREAD == src_thr)
+ n_local -= ph;
+ }
+ else
+ {
+ n_local = 0;
+ if (src_thr == MYTHREAD) /* Revise the number of local elements. */
+ n_local = nelems;
+ }
+
+ /* Starting index for this thread
+ Note: start is sometimes negative because src is
+ addressed here as if its block size is 1. */
+
+ if (blk_size > 0)
+ if (MYTHREAD > src_thr)
+ start = MYTHREAD - src_thr - ph * THREADS;
+ else if (MYTHREAD < src_thr)
+ start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+ else /* This is the source thread. */
+ start = 0;
+ else
+ start = 0;
+
+
+ /* Reduce the elements local to this thread. */
+
+ if (n_local > 0)
+ {
+ int loop_cnt = n_local - 1;
+
+ l_src = (signed int *) ((shared const signed int *) src + start);
+ local_result = *l_src++;
+
+ switch (op)
+ {
+ case UPC_ADD:
+ while (loop_cnt--)
+ local_result += *l_src++;
+ break;
+ case UPC_MULT:
+ while (loop_cnt--)
+ local_result *= *l_src++;
+ break;
+ /* Skip if not integral type, per spec 4.3.1.1
+ (See additional comments in upc_collective.c) */
+ case UPC_AND:
+ while (loop_cnt--)
+ local_result &= *l_src++;
+ break;
+ case UPC_OR:
+ while (loop_cnt--)
+ local_result |= *l_src++;
+ break;
+ case UPC_XOR:
+ while (loop_cnt--)
+ local_result ^= *l_src++;
+ break;
+ case UPC_LOGAND:
+ while (loop_cnt--)
+ local_result = local_result && *l_src++;
+ break;
+ case UPC_LOGOR:
+ while (loop_cnt--)
+ local_result = local_result || *l_src++;
+ break;
+ case UPC_MIN:
+ while (loop_cnt--)
+ {
+ if (local_result > *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_MAX:
+ while (loop_cnt--)
+ {
+ if (local_result < *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ case UPC_NONCOMM_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ default:
+ gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+ }
+ }
+
+ /* Note: local_result is undefined if n_local == 0.
+ Note: Only a proper subset of threads have a meaningful local_result.
+ Note: dst might be a thread that does not have a local result. */
+
+ /* Global reduce on only participating threads. */
+ if (n_local)
+ {
+      /* Local pointer where the reduced values are written. */
+ signed int *t_result =
+ (signed int *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+ /* Initialize collectives reduce tree. */
+ gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy the local result into the area used for the reduce operation.
+ NOTE: Not needed for the case of collective functions. However,
+ this covers the case of only one thread. */
+ *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run the reduce operation without triggered operations. */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM_FUNC, it is not
+         possible to use triggered operations on inner nodes.  In that case,
+         inner nodes must calculate the reduced value by calling the
+         specified function. */
+ if (gupcr_coll_child_cnt)
+ {
+ if (IS_ROOT_THREAD)
+ {
+ /* ROOT THREAD */
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for children to report their values. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+ /* Reduce local values with those of children if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result =
+ func (local_result, *(signed int *)
+ & gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ }
+ }
+ else
+ {
+ /* INNER THREAD */
+ /* Prepare triggered atomic function. */
+ if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+ {
+                  /* Use triggered atomic operations once the children have
+                     sent their results and the parent is ready to receive
+                     them. */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (signed int),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_INT,
+ gupcr_coll_child_cnt + 1);
+ }
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+              /* Wait until all children have reported their values and the
+                 parent is ready. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+ /* Execute reduce functions if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ size_t doffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value
+ [gupcr_coll_child_index]));
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+                      local_result =
+                        func (local_result, *(signed int *)
+                              &gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+ sizeof (signed int));
+ }
+ /* Wait for our value to go up the tree. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+ else
+ {
+          /* A root thread with no children means only one thread
+             participates; there is nothing to send or to wait for. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ switch (op)
+ {
+ case UPC_FUNC:
+ case UPC_NONCOMM_FUNC:
+ {
+ /* Schedule a triggered put once signal is received. */
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].
+ value
+ [gupcr_coll_child_index]));
+ gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+ offset, sizeof (signed int), 1);
+ }
+ break;
+ default:
+ /* Schedule a triggered atomic put once parent is ready. */
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (signed int),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_INT, 1);
+ break;
+ }
+ /* Wait for parent to be ready. */
+ gupcr_coll_signal_wait (1);
+ /* Wait for our value to leave. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+#else /* NO TRIGGERED OPS */
+ /* Send signal to all children. */
+ if (gupcr_coll_child_cnt)
+ {
+ /* ROOT OR INNER THREAD */
+ int wait_cnt = gupcr_coll_child_cnt;
+
+          /* Signal that the parent is ready to receive the locally reduced
+             values from its children.  The value sent does not matter. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (wait_cnt);
+
+ /* Wait for children to report their local reduced values and
+ parent to report it is ready to receive the reduced value. */
+ if (!IS_ROOT_THREAD)
+ ++wait_cnt;
+ gupcr_coll_signal_wait (wait_cnt);
+
+ /* Compute result if reduce functions are used. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+                  local_result =
+                    func (local_result, *(signed int *)
+                          &gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ /* Prepare reduced value for going up the tree. */
+ *t_result = local_result;
+ }
+ }
+ else if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ gupcr_coll_signal_wait (1);
+ }
+
+ /* Send reduced value to the parent. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF OR INNER THREAD */
+ /* Each child places its result into the parent memory slot
+ dedicated for the child. The parent is responsible
+ for creating the reduced result for itself and its
+ children. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value
+ [gupcr_coll_child_index]));
+ size_t soffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value[0]));
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+ sizeof (signed int));
+ }
+ else
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+ sizeof (signed int),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_INT);
+ }
+ gupcr_coll_ack_wait (1);
+ }
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+ /* Copy result into the caller's specified destination. */
+ if (IS_ROOT_THREAD)
+ {
+ *(shared signed int *) dst = *t_result;
+ }
+ }
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+ upc_barrier;
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (UI) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ * some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ * for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ * parent shared space. The reduced values are propagated to the
+ * top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceUI
+ (shared void *dst,
+ shared const void *src,
+ upc_op_t op,
+ size_t nelems,
+ size_t blk_size,
+ unsigned int (*func) (unsigned int, unsigned int), upc_flag_t sync_mode)
+{
+ int i, n_local, full_rows, last_row;
+ int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+ unsigned int local_result = 0;
+ unsigned int *l_src;
+
+ if (!upc_coll_init_flag)
+ upc_coll_init ();
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER unsigned int %lu %lu",
+ (long unsigned) nelems, (long unsigned) blk_size);
+
+ if (blk_size == 0)
+ blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+ upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+ upc_barrier;
+
+ /* Compute n_local, the number of elements local to this thread. */
+ n_local = 0;
+
+ /* Also compute start, the starting index of src for each thread. */
+
+ src_thr = upc_threadof ((shared void *) src);
+ dst_thr = upc_threadof ((shared void *) dst);
+ ph = upc_phaseof ((shared void *) src);
+
+ /* nelems plus the number of virtual elements in first row. */
+ velems = nelems + src_thr * blk_size + ph;
+
+ /* Include virtual elements when computing number of local elements. */
+ full_rows = velems / (blk_size * THREADS);
+ last_row = velems % (blk_size * THREADS);
+ tail_thr = last_row / blk_size;
+
+ /* Calculate number of participating threads. */
+ num_thr = (nelems + ph + blk_size - 1) / blk_size;
+ if (num_thr > THREADS)
+ num_thr = THREADS;
+
+ gupcr_debug (FC_COLL,
+ "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+ src_thr, tail_thr, ph, num_thr, full_rows);
+
+ /* Calculate number of local elements. */
+ if (blk_size > 0)
+ {
+ if (MYTHREAD <= tail_thr)
+ if (MYTHREAD == tail_thr)
+ extras = last_row % blk_size;
+ else
+ extras = blk_size;
+ else
+ extras = 0;
+
+ n_local = blk_size * full_rows + extras;
+
+ /* Adjust the number of elements in this thread, if necessary. */
+ if (MYTHREAD < src_thr)
+ n_local -= blk_size;
+ else if (MYTHREAD == src_thr)
+ n_local -= ph;
+ }
+ else
+ {
+ n_local = 0;
+ if (src_thr == MYTHREAD) /* Revise the number of local elements. */
+ n_local = nelems;
+ }
+
+ /* Starting index for this thread
+ Note: start is sometimes negative because src is
+ addressed here as if its block size is 1. */
+
+ if (blk_size > 0)
+ if (MYTHREAD > src_thr)
+ start = MYTHREAD - src_thr - ph * THREADS;
+ else if (MYTHREAD < src_thr)
+ start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+ else /* This is the source thread. */
+ start = 0;
+ else
+ start = 0;
+
+
+ /* Reduce the elements local to this thread. */
+
+ if (n_local > 0)
+ {
+ int loop_cnt = n_local - 1;
+
+ l_src = (unsigned int *) ((shared const unsigned int *) src + start);
+ local_result = *l_src++;
+
+ switch (op)
+ {
+ case UPC_ADD:
+ while (loop_cnt--)
+ local_result += *l_src++;
+ break;
+ case UPC_MULT:
+ while (loop_cnt--)
+ local_result *= *l_src++;
+ break;
+ /* Skip if not integral type, per spec 4.3.1.1
+ (See additional comments in upc_collective.c) */
+ case UPC_AND:
+ while (loop_cnt--)
+ local_result &= *l_src++;
+ break;
+ case UPC_OR:
+ while (loop_cnt--)
+ local_result |= *l_src++;
+ break;
+ case UPC_XOR:
+ while (loop_cnt--)
+ local_result ^= *l_src++;
+ break;
+ case UPC_LOGAND:
+ while (loop_cnt--)
+ local_result = local_result && *l_src++;
+ break;
+ case UPC_LOGOR:
+ while (loop_cnt--)
+ local_result = local_result || *l_src++;
+ break;
+ case UPC_MIN:
+ while (loop_cnt--)
+ {
+ if (local_result > *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_MAX:
+ while (loop_cnt--)
+ {
+ if (local_result < *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ case UPC_NONCOMM_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ default:
+ gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+ }
+ }
+
+ /* Note: local_result is undefined if n_local == 0.
+ Note: Only a proper subset of threads have a meaningful local_result.
+ Note: dst might be a thread that does not have a local result. */
+
+ /* Global reduce on only participating threads. */
+ if (n_local)
+ {
+      /* Local pointer where the reduced values are written. */
+ unsigned int *t_result =
+ (unsigned int *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+ /* Initialize collectives reduce tree. */
+ gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy the local result into the area used for the reduce operation.
+ NOTE: Not needed for the case of collective functions. However,
+ this covers the case of only one thread. */
+ *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run the reduce operation without triggered operations. */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM_FUNC, it is not
+         possible to use triggered operations on inner nodes.  In that case,
+         inner nodes must calculate the reduced value by calling the
+         specified function. */
+ if (gupcr_coll_child_cnt)
+ {
+ if (IS_ROOT_THREAD)
+ {
+ /* ROOT THREAD */
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for children to report their values. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+ /* Reduce local values with those of children if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result =
+ func (local_result, *(unsigned int *)
+ & gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ }
+ }
+ else
+ {
+ /* INNER THREAD */
+ /* Prepare triggered atomic function. */
+ if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+ {
+                  /* Use triggered atomic operations once the children have
+                     sent their results and the parent is ready to receive
+                     them. */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (unsigned int),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_UINT,
+ gupcr_coll_child_cnt + 1);
+ }
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+              /* Wait until all children have reported their values and the
+                 parent is ready. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+ /* Execute reduce functions if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ size_t doffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value
+ [gupcr_coll_child_index]));
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+                      local_result =
+                        func (local_result, *(unsigned int *)
+                              &gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+ sizeof (unsigned int));
+ }
+ /* Wait for our value to go up the tree. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+ else
+ {
+          /* A root thread with no children means only one thread
+             participates; there is nothing to send or to wait for. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ switch (op)
+ {
+ case UPC_FUNC:
+ case UPC_NONCOMM_FUNC:
+ {
+ /* Schedule a triggered put once signal is received. */
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].
+ value
+ [gupcr_coll_child_index]));
+ gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+ offset, sizeof (unsigned int), 1);
+ }
+ break;
+ default:
+ /* Schedule a triggered atomic put once parent is ready. */
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (unsigned int),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_UINT, 1);
+ break;
+ }
+ /* Wait for parent to be ready. */
+ gupcr_coll_signal_wait (1);
+ /* Wait for our value to leave. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+#else /* NO TRIGGERED OPS */
+ /* Send signal to all children. */
+ if (gupcr_coll_child_cnt)
+ {
+ /* ROOT OR INNER THREAD */
+ int wait_cnt = gupcr_coll_child_cnt;
+
+          /* Signal that the parent is ready to receive the locally reduced
+             values from its children.  The value sent does not matter. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (wait_cnt);
+
+ /* Wait for children to report their local reduced values and
+ parent to report it is ready to receive the reduced value. */
+ if (!IS_ROOT_THREAD)
+ ++wait_cnt;
+ gupcr_coll_signal_wait (wait_cnt);
+
+ /* Compute result if reduce functions are used. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+                  local_result =
+                    func (local_result, *(unsigned int *)
+                          &gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ /* Prepare reduced value for going up the tree. */
+ *t_result = local_result;
+ }
+ }
+ else if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ gupcr_coll_signal_wait (1);
+ }
+
+ /* Send reduced value to the parent. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF OR INNER THREAD */
+ /* Each child places its result into the parent memory slot
+ dedicated for the child. The parent is responsible
+ for creating the reduced result for itself and its
+ children. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value
+ [gupcr_coll_child_index]));
+ size_t soffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value[0]));
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+ sizeof (unsigned int));
+ }
+ else
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+ sizeof (unsigned int),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_UINT);
+ }
+ gupcr_coll_ack_wait (1);
+ }
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+ /* Copy result into the caller's specified destination. */
+ if (IS_ROOT_THREAD)
+ {
+ *(shared unsigned int *) dst = *t_result;
+ }
+ }
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+ upc_barrier;
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (L) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ * some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ * for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ * parent shared space. The reduced values are propagated to the
+ * top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceL
+ (shared void *dst,
+ shared const void *src,
+ upc_op_t op,
+ size_t nelems,
+ size_t blk_size,
+ signed long (*func) (signed long, signed long), upc_flag_t sync_mode)
+{
+ int i, n_local, full_rows, last_row;
+ int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+ signed long local_result = 0;
+ signed long *l_src;
+
+ if (!upc_coll_init_flag)
+ upc_coll_init ();
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER signed long %lu %lu",
+ (long unsigned) nelems, (long unsigned) blk_size);
+
+ if (blk_size == 0)
+ blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+ upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+ upc_barrier;
+
+ /* Compute n_local, the number of elements local to this thread. */
+ n_local = 0;
+
+ /* Also compute start, the starting index of src for each thread. */
+
+ src_thr = upc_threadof ((shared void *) src);
+ dst_thr = upc_threadof ((shared void *) dst);
+ ph = upc_phaseof ((shared void *) src);
+
+ /* nelems plus the number of virtual elements in first row. */
+ velems = nelems + src_thr * blk_size + ph;
+
+ /* Include virtual elements when computing number of local elements. */
+ full_rows = velems / (blk_size * THREADS);
+ last_row = velems % (blk_size * THREADS);
+ tail_thr = last_row / blk_size;
+
+ /* Calculate number of participating threads. */
+ num_thr = (nelems + ph + blk_size - 1) / blk_size;
+ if (num_thr > THREADS)
+ num_thr = THREADS;
+
+ gupcr_debug (FC_COLL,
+ "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+ src_thr, tail_thr, ph, num_thr, full_rows);
+
+ /* Calculate number of local elements. */
+ if (blk_size > 0)
+ {
+ if (MYTHREAD <= tail_thr)
+ if (MYTHREAD == tail_thr)
+ extras = last_row % blk_size;
+ else
+ extras = blk_size;
+ else
+ extras = 0;
+
+ n_local = blk_size * full_rows + extras;
+
+ /* Adjust the number of elements in this thread, if necessary. */
+ if (MYTHREAD < src_thr)
+ n_local -= blk_size;
+ else if (MYTHREAD == src_thr)
+ n_local -= ph;
+ }
+ else
+ {
+ n_local = 0;
+ if (src_thr == MYTHREAD) /* Revise the number of local elements. */
+ n_local = nelems;
+ }
+
+ /* Starting index for this thread
+ Note: start is sometimes negative because src is
+ addressed here as if its block size is 1. */
+
+ if (blk_size > 0)
+ if (MYTHREAD > src_thr)
+ start = MYTHREAD - src_thr - ph * THREADS;
+ else if (MYTHREAD < src_thr)
+ start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+ else /* This is the source thread. */
+ start = 0;
+ else
+ start = 0;
+
+
+ /* Reduce the elements local to this thread. */
+
+ if (n_local > 0)
+ {
+ int loop_cnt = n_local - 1;
+
+ l_src = (signed long *) ((shared const signed long *) src + start);
+ local_result = *l_src++;
+
+ switch (op)
+ {
+ case UPC_ADD:
+ while (loop_cnt--)
+ local_result += *l_src++;
+ break;
+ case UPC_MULT:
+ while (loop_cnt--)
+ local_result *= *l_src++;
+ break;
+ /* Skip if not integral type, per spec 4.3.1.1
+ (See additional comments in upc_collective.c) */
+ case UPC_AND:
+ while (loop_cnt--)
+ local_result &= *l_src++;
+ break;
+ case UPC_OR:
+ while (loop_cnt--)
+ local_result |= *l_src++;
+ break;
+ case UPC_XOR:
+ while (loop_cnt--)
+ local_result ^= *l_src++;
+ break;
+ case UPC_LOGAND:
+ while (loop_cnt--)
+ local_result = local_result && *l_src++;
+ break;
+ case UPC_LOGOR:
+ while (loop_cnt--)
+ local_result = local_result || *l_src++;
+ break;
+ case UPC_MIN:
+ while (loop_cnt--)
+ {
+ if (local_result > *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_MAX:
+ while (loop_cnt--)
+ {
+ if (local_result < *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ case UPC_NONCOMM_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ default:
+ gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+ }
+ }
+
+ /* Note: local_result is undefined if n_local == 0.
+ Note: Only a proper subset of threads have a meaningful local_result.
+ Note: dst might be a thread that does not have a local result. */
+
+ /* Global reduce on only participating threads. */
+ if (n_local)
+ {
+      /* Local pointer where the reduced values are written. */
+ signed long *t_result =
+ (signed long *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+ /* Initialize collectives reduce tree. */
+ gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy the local result into the area used for the reduce operation.
+ NOTE: Not needed for the case of collective functions. However,
+ this covers the case of only one thread. */
+ *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run the reduce operation without triggered operations. */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM_FUNC, it is not
+         possible to use triggered operations on inner nodes.  In that case,
+         inner nodes must calculate the reduced value by calling the
+         specified function. */
+ if (gupcr_coll_child_cnt)
+ {
+ if (IS_ROOT_THREAD)
+ {
+ /* ROOT THREAD */
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for children to report their values. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+ /* Reduce local values with those of children if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result =
+ func (local_result, *(signed long *)
+ & gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ }
+ }
+ else
+ {
+ /* INNER THREAD */
+ /* Prepare triggered atomic function. */
+ if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+ {
+                  /* Use triggered atomic operations once the children have
+                     sent their results and the parent is ready to receive
+                     them. */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (signed long),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_LONG,
+ gupcr_coll_child_cnt + 1);
+ }
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+              /* Wait until all children have reported their values and the
+                 parent is ready. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+ /* Execute reduce functions if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ size_t doffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value
+ [gupcr_coll_child_index]));
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+                      local_result =
+                        func (local_result, *(signed long *)
+                              &gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+ sizeof (signed long));
+ }
+ /* Wait for our value to go up the tree. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+ else
+ {
+          /* A root thread with no children means only one thread
+             participates; there is nothing to send or to wait for. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ switch (op)
+ {
+ case UPC_FUNC:
+ case UPC_NONCOMM_FUNC:
+ {
+ /* Schedule a triggered put once signal is received. */
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].
+ value
+ [gupcr_coll_child_index]));
+ gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+ offset, sizeof (signed long), 1);
+ }
+ break;
+ default:
+ /* Schedule a triggered atomic put once parent is ready. */
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (signed long),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_LONG, 1);
+ break;
+ }
+ /* Wait for parent to be ready. */
+ gupcr_coll_signal_wait (1);
+ /* Wait for our value to leave. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+#else /* NO TRIGGERED OPS */
+ /* Send signal to all children. */
+ if (gupcr_coll_child_cnt)
+ {
+ /* ROOT OR INNER THREAD */
+ int wait_cnt = gupcr_coll_child_cnt;
+
+          /* Signal that the parent is ready to receive the locally reduced
+             values from its children.  The value sent does not matter. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (wait_cnt);
+
+ /* Wait for children to report their local reduced values and
+ parent to report it is ready to receive the reduced value. */
+ if (!IS_ROOT_THREAD)
+ ++wait_cnt;
+ gupcr_coll_signal_wait (wait_cnt);
+
+ /* Compute result if reduce functions are used. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+                  local_result =
+                    func (local_result, *(signed long *)
+                          &gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ /* Prepare reduced value for going up the tree. */
+ *t_result = local_result;
+ }
+ }
+ else if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ gupcr_coll_signal_wait (1);
+ }
+
+ /* Send reduced value to the parent. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF OR INNER THREAD */
+ /* Each child places its result into the parent memory slot
+ dedicated for the child. The parent is responsible
+ for creating the reduced result for itself and its
+ children. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value
+ [gupcr_coll_child_index]));
+ size_t soffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value[0]));
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+ sizeof (signed long));
+ }
+ else
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+ sizeof (signed long),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_LONG);
+ }
+ gupcr_coll_ack_wait (1);
+ }
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+ /* Copy result into the caller's specified destination. */
+ if (IS_ROOT_THREAD)
+ {
+ *(shared signed long *) dst = *t_result;
+ }
+ }
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+ upc_barrier;
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (UL) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ * some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ * for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ * parent shared space. The reduced values are propagated to the
+ * top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceUL
+ (shared void *dst,
+ shared const void *src,
+ upc_op_t op,
+ size_t nelems,
+ size_t blk_size,
+ unsigned long (*func) (unsigned long, unsigned long), upc_flag_t sync_mode)
+{
+ int i, n_local, full_rows, last_row;
+ int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+ unsigned long local_result = 0;
+ unsigned long *l_src;
+
+ if (!upc_coll_init_flag)
+ upc_coll_init ();
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER unsigned long %lu %lu",
+ (long unsigned) nelems, (long unsigned) blk_size);
+
+ if (blk_size == 0)
+ blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+ upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+ upc_barrier;
+
+ /* Compute n_local, the number of elements local to this thread. */
+ n_local = 0;
+
+ /* Also compute start, the starting index of src for each thread. */
+
+ src_thr = upc_threadof ((shared void *) src);
+ dst_thr = upc_threadof ((shared void *) dst);
+ ph = upc_phaseof ((shared void *) src);
+
+ /* nelems plus the number of virtual elements in first row. */
+ velems = nelems + src_thr * blk_size + ph;
+
+ /* Include virtual elements when computing number of local elements. */
+ full_rows = velems / (blk_size * THREADS);
+ last_row = velems % (blk_size * THREADS);
+ tail_thr = last_row / blk_size;
+
+ /* Calculate number of participating threads. */
+ num_thr = (nelems + ph + blk_size - 1) / blk_size;
+ if (num_thr > THREADS)
+ num_thr = THREADS;
+
+ gupcr_debug (FC_COLL,
+ "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+ src_thr, tail_thr, ph, num_thr, full_rows);
+
+ /* Calculate number of local elements. */
+ if (blk_size > 0)
+ {
+ if (MYTHREAD <= tail_thr)
+ if (MYTHREAD == tail_thr)
+ extras = last_row % blk_size;
+ else
+ extras = blk_size;
+ else
+ extras = 0;
+
+ n_local = blk_size * full_rows + extras;
+
+ /* Adjust the number of elements in this thread, if necessary. */
+ if (MYTHREAD < src_thr)
+ n_local -= blk_size;
+ else if (MYTHREAD == src_thr)
+ n_local -= ph;
+ }
+ else
+ {
+ n_local = 0;
+ if (src_thr == MYTHREAD) /* Revise the number of local elements. */
+ n_local = nelems;
+ }
+
+ /* Starting index for this thread
+ Note: start is sometimes negative because src is
+ addressed here as if its block size is 1. */
+
+ if (blk_size > 0)
+ if (MYTHREAD > src_thr)
+ start = MYTHREAD - src_thr - ph * THREADS;
+ else if (MYTHREAD < src_thr)
+ start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+ else /* This is the source thread. */
+ start = 0;
+ else
+ start = 0;
+
+
+ /* Reduce the elements local to this thread. */
+
+ if (n_local > 0)
+ {
+ int loop_cnt = n_local - 1;
+
+ l_src = (unsigned long *) ((shared const unsigned long *) src + start);
+ local_result = *l_src++;
+
+ switch (op)
+ {
+ case UPC_ADD:
+ while (loop_cnt--)
+ local_result += *l_src++;
+ break;
+ case UPC_MULT:
+ while (loop_cnt--)
+ local_result *= *l_src++;
+ break;
+ /* Skip if not integral type, per spec 4.3.1.1
+ (See additional comments in upc_collective.c) */
+ case UPC_AND:
+ while (loop_cnt--)
+ local_result &= *l_src++;
+ break;
+ case UPC_OR:
+ while (loop_cnt--)
+ local_result |= *l_src++;
+ break;
+ case UPC_XOR:
+ while (loop_cnt--)
+ local_result ^= *l_src++;
+ break;
+ case UPC_LOGAND:
+ while (loop_cnt--)
+ local_result = local_result && *l_src++;
+ break;
+ case UPC_LOGOR:
+ while (loop_cnt--)
+ local_result = local_result || *l_src++;
+ break;
+ case UPC_MIN:
+ while (loop_cnt--)
+ {
+ if (local_result > *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_MAX:
+ while (loop_cnt--)
+ {
+ if (local_result < *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ case UPC_NONCOMM_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ default:
+ gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+ }
+ }
+
+ /* Note: local_result is undefined if n_local == 0.
+ Note: Only a proper subset of threads have a meaningful local_result.
+ Note: dst might be a thread that does not have a local result. */
+
+ /* Global reduce on only participating threads. */
+ if (n_local)
+ {
+      /* Local pointer where the reduced values are written. */
+ unsigned long *t_result =
+ (unsigned long *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+ /* Initialize collectives reduce tree. */
+ gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy the local result into the area used for the reduce operation.
+ NOTE: Not needed for the case of collective functions. However,
+ this covers the case of only one thread. */
+ *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run the reduce operation without triggered operations. */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM_FUNC, it is not
+         possible to use triggered operations on inner nodes.  In that case,
+         inner nodes must calculate the reduced value by calling the
+         specified function. */
+ if (gupcr_coll_child_cnt)
+ {
+ if (IS_ROOT_THREAD)
+ {
+ /* ROOT THREAD */
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for children to report their values. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+ /* Reduce local values with those of children if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result =
+ func (local_result, *(unsigned long *)
+ & gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ }
+ }
+ else
+ {
+ /* INNER THREAD */
+ /* Prepare triggered atomic function. */
+ if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+ {
+                  /* Use triggered atomic operations once the children have
+                     sent their results and the parent is ready to receive
+                     them. */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (unsigned long),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_ULONG,
+ gupcr_coll_child_cnt + 1);
+ }
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+              /* Wait until all children have reported their values and the
+                 parent is ready. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+ /* Execute reduce functions if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ size_t doffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value
+ [gupcr_coll_child_index]));
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+                      local_result =
+                        func (local_result, *(unsigned long *)
+                              &gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+ sizeof (unsigned long));
+ }
+ /* Wait for our value to go up the tree. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+ else
+ {
+          /* A root thread with no children means only one thread
+             participates; there is nothing to send or to wait for. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ switch (op)
+ {
+ case UPC_FUNC:
+ case UPC_NONCOMM_FUNC:
+ {
+ /* Schedule a triggered put once signal is received. */
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].
+ value
+ [gupcr_coll_child_index]));
+ gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+ offset, sizeof (unsigned long), 1);
+ }
+ break;
+ default:
+ /* Schedule a triggered atomic put once parent is ready. */
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (unsigned long),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_ULONG, 1);
+ break;
+ }
+ /* Wait for parent to be ready. */
+ gupcr_coll_signal_wait (1);
+ /* Wait for our value to leave. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+#else /* NO TRIGGERED OPS */
+ /* Send signal to all children. */
+ if (gupcr_coll_child_cnt)
+ {
+ /* ROOT OR INNER THREAD */
+ int wait_cnt = gupcr_coll_child_cnt;
+
+          /* Signal that the parent is ready to receive the locally reduced
+             values from its children.  The value sent does not matter. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (wait_cnt);
+
+ /* Wait for children to report their local reduced values and
+ parent to report it is ready to receive the reduced value. */
+ if (!IS_ROOT_THREAD)
+ ++wait_cnt;
+ gupcr_coll_signal_wait (wait_cnt);
+
+ /* Compute result if reduce functions are used. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+                  local_result =
+                    func (local_result, *(unsigned long *)
+                          &gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ /* Prepare reduced value for going up the tree. */
+ *t_result = local_result;
+ }
+ }
+ else if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ gupcr_coll_signal_wait (1);
+ }
+
+ /* Send reduced value to the parent. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF OR INNER THREAD */
+ /* Each child places its result into the parent memory slot
+ dedicated for the child. The parent is responsible
+ for creating the reduced result for itself and its
+ children. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value
+ [gupcr_coll_child_index]));
+ size_t soffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value[0]));
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+ sizeof (unsigned long));
+ }
+ else
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+ sizeof (unsigned long),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_ULONG);
+ }
+ gupcr_coll_ack_wait (1);
+ }
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+ /* Copy result into the caller's specified destination. */
+ if (IS_ROOT_THREAD)
+ {
+ *(shared unsigned long *) dst = *t_result;
+ }
+ }
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+ upc_barrier;
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+/**
+ * Collectives UPC_LOGAND function for float types
+ *
+ * Portals4 does not define a logical AND atomic operation,
+ * so it is implemented here as a reduce function.
+ */
+float
+gupcr_coll_logandF (float a, float b)
+{
+ return a && b;
+}
+
+/**
+ * Collectives UPC_LOGOR function for float types
+ *
+ * Portals4 does not define a logical OR atomic operation,
+ * so it is implemented here as a reduce function.
+ */
+float
+gupcr_coll_logorF (float a, float b)
+{
+ return a || b;
+}
+
+/**
+ * Collectives reduce (F) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ * some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ * for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ * parent shared space. The reduced values are propagated to the
+ * top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
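+ * A minimal usage sketch (illustrative only; the array shape, the
+ * UPC_ADD operation, and the ALLSYNC flags are assumptions, not
+ * requirements of this implementation):
+ * @code
+ * shared [100] float A[100 * THREADS];
+ * shared float result;
+ * upc_all_reduceF (&result, A, UPC_ADD, 100 * THREADS, 100,
+ *                  NULL, UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC);
+ * @endcode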
+ */
+void upc_all_reduceF
+ (shared void *dst,
+ shared const void *src,
+ upc_op_t op,
+ size_t nelems,
+ size_t blk_size,
+ float (*func) (float, float), upc_flag_t sync_mode)
+{
+ int i, n_local, full_rows, last_row;
+ int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+ float local_result = 0;
+ float *l_src;
+
+ if (!upc_coll_init_flag)
+ upc_coll_init ();
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER float %lu %lu",
+ (long unsigned) nelems, (long unsigned) blk_size);
+
+ if (blk_size == 0)
+ blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+ upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+ upc_barrier;
+
+ /* Compute n_local, the number of elements local to this thread. */
+ n_local = 0;
+
+ /* Also compute start, the starting index of src for each thread. */
+
+ src_thr = upc_threadof ((shared void *) src);
+ dst_thr = upc_threadof ((shared void *) dst);
+ ph = upc_phaseof ((shared void *) src);
+
+ /* nelems plus the number of virtual elements in first row. */
+ velems = nelems + src_thr * blk_size + ph;
+
+ /* Include virtual elements when computing number of local elements. */
+ full_rows = velems / (blk_size * THREADS);
+ last_row = velems % (blk_size * THREADS);
+ tail_thr = last_row / blk_size;
+
+ /* Calculate number of participating threads. */
+ num_thr = (nelems + ph + blk_size - 1) / blk_size;
+ if (num_thr > THREADS)
+ num_thr = THREADS;
+
+ gupcr_debug (FC_COLL,
+ "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+ src_thr, tail_thr, ph, num_thr, full_rows);
+
+ /* Calculate number of local elements. */
+ if (blk_size > 0)
+ {
+ if (MYTHREAD <= tail_thr)
+ if (MYTHREAD == tail_thr)
+ extras = last_row % blk_size;
+ else
+ extras = blk_size;
+ else
+ extras = 0;
+
+ n_local = blk_size * full_rows + extras;
+
+ /* Adjust the number of elements in this thread, if necessary. */
+ if (MYTHREAD < src_thr)
+ n_local -= blk_size;
+ else if (MYTHREAD == src_thr)
+ n_local -= ph;
+ }
+ else
+ {
+ n_local = 0;
+ if (src_thr == MYTHREAD) /* Revise the number of local elements. */
+ n_local = nelems;
+ }
+
+ /* Starting index for this thread
+ Note: start is sometimes negative because src is
+ addressed here as if its block size is 1. */
+
+ if (blk_size > 0)
+ if (MYTHREAD > src_thr)
+ start = MYTHREAD - src_thr - ph * THREADS;
+ else if (MYTHREAD < src_thr)
+ start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+ else /* This is the source thread. */
+ start = 0;
+ else
+ start = 0;
+
+ /* Logical operations on floating point types must execute as
+ functions as Portals4 does not have support for them. */
+ switch (op)
+ {
+ case UPC_LOGAND:
+ func = &gupcr_coll_logandF;
+ op = UPC_FUNC;
+ break;
+ case UPC_LOGOR:
+ func = &gupcr_coll_logorF;
+ op = UPC_FUNC;
+ break;
+ }
+
+ /* Reduce the elements local to this thread. */
+
+ if (n_local > 0)
+ {
+ int loop_cnt = n_local - 1;
+
+ l_src = (float *) ((shared const float *) src + start);
+ local_result = *l_src++;
+
+ switch (op)
+ {
+ case UPC_ADD:
+ while (loop_cnt--)
+ local_result += *l_src++;
+ break;
+ case UPC_MULT:
+ while (loop_cnt--)
+ local_result *= *l_src++;
+ break;
+ case UPC_MIN:
+ while (loop_cnt--)
+ {
+ if (local_result > *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_MAX:
+ while (loop_cnt--)
+ {
+ if (local_result < *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ case UPC_NONCOMM_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ default:
+ gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+ }
+ }
+
+ /* Note: local_result is not meaningful if n_local == 0.
+ Note: Only a proper subset of the threads has a meaningful local_result.
+ Note: dst may have affinity to a thread that does not have a local result. */
+
+ /* Global reduce on only participating threads. */
+ if (n_local)
+ {
+ /* Local pointer to which the reduced values are written. */
+ float *t_result =
+ (float *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+ /* Initialize collectives reduce tree. */
+ gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+ /* Copy the local result into the area used for the reduce operation.
+ NOTE: Not needed when collective functions are used; however,
+ this covers the case where only one thread participates. */
+ *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions. */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+ /* Note: In the case of UPC_FUNC and UPC_NONCOMM_FUNC, it is not
+ possible to use triggered operations on inner nodes. In that case,
+ inner nodes must calculate the reduced value by calling the
+ specified function. */
+ if (gupcr_coll_child_cnt)
+ {
+ if (IS_ROOT_THREAD)
+ {
+ /* ROOT THREAD */
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for children to report their values. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+ /* Reduce local values with those of children if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result =
+ func (local_result, *(float *)
+ & gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ }
+ }
+ else
+ {
+ /* INNER THREAD */
+ /* Prepare triggered atomic function. */
+ if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+ {
+ /* Use triggered atomic operations once the children have sent
+ their results and the parent is ready to receive them. */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (float),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_FLOAT,
+ gupcr_coll_child_cnt + 1);
+ }
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for completion, children and parent are ready. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+ /* Execute reduce functions if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ size_t doffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value
+ [gupcr_coll_child_index]));
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result = func (local_result, *(float *)
+ &
+ gupcr_reduce_storage
+ [MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+ sizeof (float));
+ }
+ /* Wait for our value to go up the tree. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+ else
+ {
+ /* Avoid the case where only one thread is available. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ switch (op)
+ {
+ case UPC_FUNC:
+ case UPC_NONCOMM_FUNC:
+ {
+ /* Schedule a triggered put once signal is received. */
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].
+ value
+ [gupcr_coll_child_index]));
+ gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+ offset, sizeof (float), 1);
+ }
+ break;
+ default:
+ /* Schedule a triggered atomic put once parent is ready. */
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (float),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_FLOAT, 1);
+ break;
+ }
+ /* Wait for parent to be ready. */
+ gupcr_coll_signal_wait (1);
+ /* Wait for our value to leave. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+#else /* NO TRIGGERED OPS */
+ /* Send signal to all children. */
+ if (gupcr_coll_child_cnt)
+ {
+ /* ROOT OR INNER THREAD */
+ int wait_cnt = gupcr_coll_child_cnt;
+
+ /* Signal that parent is ready to receive the locally reduced
+ values from its children. Value that we send does not matter. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (wait_cnt);
+
+ /* Wait for the children to report their locally reduced values and,
+ if this is not the root, for the parent to signal that it is ready
+ to receive the reduced value. */
+ if (!IS_ROOT_THREAD)
+ ++wait_cnt;
+ gupcr_coll_signal_wait (wait_cnt);
+
+ /* Compute result if reduce functions are used. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result = func (local_result,
+ *(float *) &
+ gupcr_reduce_storage[MYTHREAD].value
+ [i]);
+ }
+ /* Prepare reduced value for going up the tree. */
+ *t_result = local_result;
+ }
+ }
+ else if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ gupcr_coll_signal_wait (1);
+ }
+
+ /* Send reduced value to the parent. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF OR INNER THREAD */
+ /* Each child places its result into the parent memory slot
+ dedicated for the child. The parent is responsible
+ for creating the reduced result for itself and its
+ children. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value
+ [gupcr_coll_child_index]));
+ size_t soffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value[0]));
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+ sizeof (float));
+ }
+ else
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+ sizeof (float),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_FLOAT);
+ }
+ gupcr_coll_ack_wait (1);
+ }
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+ /* Copy result into the caller's specified destination. */
+ if (IS_ROOT_THREAD)
+ {
+ *(shared float *) dst = *t_result;
+ }
+ }
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+ upc_barrier;
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+/**
+ * Collectives UPC_LOGAND function for double types
+ *
+ * Portals4 does not define logical AND atomic operations
+ * and they will be executed as functions.
+ */
+double
+gupcr_coll_logandD (double a, double b)
+{
+ return a && b;
+}
+
+/**
+ * Collectives UPC_LOGOR function for double types
+ *
+ * Portals4 does not define logical OR atomic operations
+ * and they will be executed as functions.
+ */
+
+double
+gupcr_coll_logorD (double a, double b)
+{
+ return a || b;
+}
+
+/**
+ * Collectives reduce (D) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ * some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ * for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ * parent shared space. The reduced values are propagated to the
+ * top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
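+ * A minimal usage sketch (illustrative only; the array shape, the
+ * UPC_ADD operation, and the ALLSYNC flags are assumptions, not
+ * requirements of this implementation):
+ * @code
+ * shared [100] double A[100 * THREADS];
+ * shared double result;
+ * upc_all_reduceD (&result, A, UPC_ADD, 100 * THREADS, 100,
+ *                  NULL, UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC);
+ * @endcode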
+ */
+void upc_all_reduceD
+ (shared void *dst,
+ shared const void *src,
+ upc_op_t op,
+ size_t nelems,
+ size_t blk_size,
+ double (*func) (double, double), upc_flag_t sync_mode)
+{
+ int i, n_local, full_rows, last_row;
+ int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+ double local_result = 0;
+ double *l_src;
+
+ if (!upc_coll_init_flag)
+ upc_coll_init ();
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER double %lu %lu",
+ (long unsigned) nelems, (long unsigned) blk_size);
+
+ if (blk_size == 0)
+ blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+ upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+ upc_barrier;
+
+ /* Compute n_local, the number of elements local to this thread. */
+ n_local = 0;
+
+ /* Also compute start, the starting index of src for each thread. */
+
+ src_thr = upc_threadof ((shared void *) src);
+ dst_thr = upc_threadof ((shared void *) dst);
+ ph = upc_phaseof ((shared void *) src);
+
+ /* nelems plus the number of virtual elements in first row. */
+ velems = nelems + src_thr * blk_size + ph;
+
+ /* Include virtual elements when computing number of local elements. */
+ full_rows = velems / (blk_size * THREADS);
+ last_row = velems % (blk_size * THREADS);
+ tail_thr = last_row / blk_size;
+
+ /* Calculate number of participating threads. */
+ num_thr = (nelems + ph + blk_size - 1) / blk_size;
+ if (num_thr > THREADS)
+ num_thr = THREADS;
+
+ gupcr_debug (FC_COLL,
+ "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+ src_thr, tail_thr, ph, num_thr, full_rows);
+
+ /* Calculate number of local elements. */
+ if (blk_size > 0)
+ {
+ if (MYTHREAD <= tail_thr)
+ if (MYTHREAD == tail_thr)
+ extras = last_row % blk_size;
+ else
+ extras = blk_size;
+ else
+ extras = 0;
+
+ n_local = blk_size * full_rows + extras;
+
+ /* Adjust the number of elements in this thread, if necessary. */
+ if (MYTHREAD < src_thr)
+ n_local -= blk_size;
+ else if (MYTHREAD == src_thr)
+ n_local -= ph;
+ }
+ else
+ {
+ n_local = 0;
+ if (src_thr == MYTHREAD) /* Revise the number of local elements. */
+ n_local = nelems;
+ }
+
+ /* Starting index for this thread
+ Note: start is sometimes negative because src is
+ addressed here as if its block size is 1. */
+
+ if (blk_size > 0)
+ if (MYTHREAD > src_thr)
+ start = MYTHREAD - src_thr - ph * THREADS;
+ else if (MYTHREAD < src_thr)
+ start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+ else /* This is the source thread. */
+ start = 0;
+ else
+ start = 0;
+
+ /* Logical operations on floating point types must execute as
+ functions as Portals4 does not have support for them. */
+ switch (op)
+ {
+ case UPC_LOGAND:
+ func = &gupcr_coll_logandD;
+ op = UPC_FUNC;
+ break;
+ case UPC_LOGOR:
+ func = &gupcr_coll_logorD;
+ op = UPC_FUNC;
+ break;
+ }
+
+ /* Reduce the elements local to this thread. */
+
+ if (n_local > 0)
+ {
+ int loop_cnt = n_local - 1;
+
+ l_src = (double *) ((shared const double *) src + start);
+ local_result = *l_src++;
+
+ switch (op)
+ {
+ case UPC_ADD:
+ while (loop_cnt--)
+ local_result += *l_src++;
+ break;
+ case UPC_MULT:
+ while (loop_cnt--)
+ local_result *= *l_src++;
+ break;
+ case UPC_MIN:
+ while (loop_cnt--)
+ {
+ if (local_result > *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_MAX:
+ while (loop_cnt--)
+ {
+ if (local_result < *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ case UPC_NONCOMM_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ default:
+ gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+ }
+ }
+
+ /* Note: local_result is not meaningful if n_local == 0.
+ Note: Only a proper subset of the threads has a meaningful local_result.
+ Note: dst may have affinity to a thread that does not have a local result. */
+
+ /* Global reduce on only participating threads. */
+ if (n_local)
+ {
+ /* Local pointer to which the reduced values are written. */
+ double *t_result =
+ (double *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+ /* Initialize collectives reduce tree. */
+ gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+ /* Copy the local result into the area used for the reduce operation.
+ NOTE: Not needed when collective functions are used; however,
+ this covers the case where only one thread participates. */
+ *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions. */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+ /* Note: In the case of UPC_FUNC and UPC_NONCOMM_FUNC, it is not
+ possible to use triggered operations on inner nodes. In that case,
+ inner nodes must calculate the reduced value by calling the
+ specified function. */
+ if (gupcr_coll_child_cnt)
+ {
+ if (IS_ROOT_THREAD)
+ {
+ /* ROOT THREAD */
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for children to report their values. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+ /* Reduce local values with those of children if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result =
+ func (local_result, *(double *)
+ & gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ }
+ }
+ else
+ {
+ /* INNER THREAD */
+ /* Prepare triggered atomic function. */
+ if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+ {
+ /* Use triggered atomic operations once the children have sent
+ their results and the parent is ready to receive them. */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (double),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_DOUBLE,
+ gupcr_coll_child_cnt + 1);
+ }
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for completion, children and parent are ready. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+ /* Execute reduce functions if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ size_t doffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value
+ [gupcr_coll_child_index]));
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result = func (local_result, *(double *)
+ &
+ gupcr_reduce_storage
+ [MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+ sizeof (double));
+ }
+ /* Wait for our value to go up the tree. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+ else
+ {
+ /* Avoid the case where only one thread is available. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ switch (op)
+ {
+ case UPC_FUNC:
+ case UPC_NONCOMM_FUNC:
+ {
+ /* Schedule a triggered put once signal is received. */
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].
+ value
+ [gupcr_coll_child_index]));
+ gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+ offset, sizeof (double), 1);
+ }
+ break;
+ default:
+ /* Schedule a triggered atomic put once parent is ready. */
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (double),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_DOUBLE, 1);
+ break;
+ }
+ /* Wait for parent to be ready. */
+ gupcr_coll_signal_wait (1);
+ /* Wait for our value to leave. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+#else /* NO TRIGGERED OPS */
+ /* Send signal to all children. */
+ if (gupcr_coll_child_cnt)
+ {
+ /* ROOT OR INNER THREAD */
+ int wait_cnt = gupcr_coll_child_cnt;
+
+ /* Signal that parent is ready to receive the locally reduced
+ values from its children. Value that we send does not matter. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (wait_cnt);
+
+ /* Wait for the children to report their locally reduced values and,
+ if this is not the root, for the parent to signal that it is ready
+ to receive the reduced value. */
+ if (!IS_ROOT_THREAD)
+ ++wait_cnt;
+ gupcr_coll_signal_wait (wait_cnt);
+
+ /* Compute result if reduce functions are used. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result = func (local_result,
+ *(double *) &
+ gupcr_reduce_storage[MYTHREAD].value
+ [i]);
+ }
+ /* Prepare reduced value for going up the tree. */
+ *t_result = local_result;
+ }
+ }
+ else if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ gupcr_coll_signal_wait (1);
+ }
+
+ /* Send reduced value to the parent. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF OR INNER THREAD */
+ /* Each child places its result into the parent memory slot
+ dedicated for the child. The parent is responsible
+ for creating the reduced result for itself and its
+ children. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value
+ [gupcr_coll_child_index]));
+ size_t soffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value[0]));
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+ sizeof (double));
+ }
+ else
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+ sizeof (double),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_DOUBLE);
+ }
+ gupcr_coll_ack_wait (1);
+ }
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+ /* Copy result into the caller's specified destination. */
+ if (IS_ROOT_THREAD)
+ {
+ *(shared double *) dst = *t_result;
+ }
+ }
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+ upc_barrier;
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+/**
+ * Collectives UPC_LOGAND function for long double types
+ *
+ * Portals4 does not define logical AND atomic operations
+ * and they will be executed as functions.
+ */
+long double
+gupcr_coll_logandLD (long double a, long double b)
+{
+ return a && b;
+}
+
+/**
+ * Collectives UPC_LOGOR function for long double types
+ *
+ * Portals4 does not define logical OR atomic operations
+ * and they will be executed as functions.
+ */
+
+long double
+gupcr_coll_logorLD (long double a, long double b)
+{
+ return a || b;
+}
+
+/**
+ * Collectives reduce (LD) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ * some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ * for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ * parent shared space. The reduced values are propagated to the
+ * top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
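+ * A minimal usage sketch (illustrative only; the array shape, the
+ * UPC_ADD operation, and the ALLSYNC flags are assumptions, not
+ * requirements of this implementation):
+ * @code
+ * shared [100] long double A[100 * THREADS];
+ * shared long double result;
+ * upc_all_reduceLD (&result, A, UPC_ADD, 100 * THREADS, 100,
+ *                   NULL, UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC);
+ * @endcode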
+ */
+void upc_all_reduceLD
+ (shared void *dst,
+ shared const void *src,
+ upc_op_t op,
+ size_t nelems,
+ size_t blk_size,
+ long double (*func) (long double, long double), upc_flag_t sync_mode)
+{
+ int i, n_local, full_rows, last_row;
+ int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+ long double local_result = 0;
+ long double *l_src;
+
+ if (!upc_coll_init_flag)
+ upc_coll_init ();
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER long double %lu %lu",
+ (long unsigned) nelems, (long unsigned) blk_size);
+
+ if (blk_size == 0)
+ blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+ upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+ upc_barrier;
+
+ /* Compute n_local, the number of elements local to this thread. */
+ n_local = 0;
+
+ /* Also compute start, the starting index of src for each thread. */
+
+ src_thr = upc_threadof ((shared void *) src);
+ dst_thr = upc_threadof ((shared void *) dst);
+ ph = upc_phaseof ((shared void *) src);
+
+ /* nelems plus the number of virtual elements in first row. */
+ velems = nelems + src_thr * blk_size + ph;
+
+ /* Include virtual elements when computing number of local elements. */
+ full_rows = velems / (blk_size * THREADS);
+ last_row = velems % (blk_size * THREADS);
+ tail_thr = last_row / blk_size;
+
+ /* Calculate number of participating threads. */
+ num_thr = (nelems + ph + blk_size - 1) / blk_size;
+ if (num_thr > THREADS)
+ num_thr = THREADS;
+
+ gupcr_debug (FC_COLL,
+ "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+ src_thr, tail_thr, ph, num_thr, full_rows);
+
+ /* Calculate number of local elements. */
+ if (blk_size > 0)
+ {
+ if (MYTHREAD <= tail_thr)
+ if (MYTHREAD == tail_thr)
+ extras = last_row % blk_size;
+ else
+ extras = blk_size;
+ else
+ extras = 0;
+
+ n_local = blk_size * full_rows + extras;
+
+ /* Adjust the number of elements in this thread, if necessary. */
+ if (MYTHREAD < src_thr)
+ n_local -= blk_size;
+ else if (MYTHREAD == src_thr)
+ n_local -= ph;
+ }
+ else
+ {
+ n_local = 0;
+ if (src_thr == MYTHREAD) /* Revise the number of local elements. */
+ n_local = nelems;
+ }
+
+ /* Starting index for this thread
+ Note: start is sometimes negative because src is
+ addressed here as if its block size is 1. */
+
+ if (blk_size > 0)
+ if (MYTHREAD > src_thr)
+ start = MYTHREAD - src_thr - ph * THREADS;
+ else if (MYTHREAD < src_thr)
+ start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+ else /* This is the source thread. */
+ start = 0;
+ else
+ start = 0;
+
+ /* Logical operations on floating point types must execute as
+ functions as Portals4 does not have support for them. */
+ switch (op)
+ {
+ case UPC_LOGAND:
+ func = &gupcr_coll_logandLD;
+ op = UPC_FUNC;
+ break;
+ case UPC_LOGOR:
+ func = &gupcr_coll_logorLD;
+ op = UPC_FUNC;
+ break;
+ }
+
+ /* Reduce the elements local to this thread. */
+
+ if (n_local > 0)
+ {
+ int loop_cnt = n_local - 1;
+
+ l_src = (long double *) ((shared const long double *) src + start);
+ local_result = *l_src++;
+
+ switch (op)
+ {
+ case UPC_ADD:
+ while (loop_cnt--)
+ local_result += *l_src++;
+ break;
+ case UPC_MULT:
+ while (loop_cnt--)
+ local_result *= *l_src++;
+ break;
+ case UPC_MIN:
+ while (loop_cnt--)
+ {
+ if (local_result > *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_MAX:
+ while (loop_cnt--)
+ {
+ if (local_result < *l_src)
+ local_result = *l_src;
+ ++l_src;
+ }
+ break;
+ case UPC_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ case UPC_NONCOMM_FUNC:
+ while (loop_cnt--)
+ local_result = func (local_result, *l_src++);
+ break;
+ default:
+ gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+ }
+ }
+
+ /* Note: local_result is not meaningful if n_local == 0.
+ Note: Only a proper subset of the threads has a meaningful local_result.
+ Note: dst may have affinity to a thread that does not have a local result. */
+
+ /* Global reduce on only participating threads. */
+ if (n_local)
+ {
+ /* Local pointer to which the reduced values are written. */
+ long double *t_result =
+ (long double *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+ /* Initialize collectives reduce tree. */
+ gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+ /* Copy the local result into the area used for the reduce operation.
+ NOTE: Not needed when collective functions are used; however,
+ this covers the case where only one thread participates. */
+ *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions. */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+ /* Note: In the case of UPC_FUNC and UPC_NONCOMM_FUNC, it is not
+ possible to use triggered operations on inner nodes. In that case,
+ inner nodes must calculate the reduced value by calling the
+ specified function. */
+ if (gupcr_coll_child_cnt)
+ {
+ if (IS_ROOT_THREAD)
+ {
+ /* ROOT THREAD */
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for children to report their values. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+ /* Reduce local values with those of children if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result =
+ func (local_result, *(long double *)
+ & gupcr_reduce_storage[MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ }
+ }
+ else
+ {
+ /* INNER THREAD */
+ /* Prepare triggered atomic function. */
+ if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+ {
+ /* Use triggered atomic operations once the children have sent
+ their results and the parent is ready to receive them. */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (long double),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_LONG_DOUBLE,
+ gupcr_coll_child_cnt + 1);
+ }
+ /* Let children know that parent is ready. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+ /* Wait for completion, children and parent are ready. */
+ gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+ /* Execute reduce functions if necessary. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ size_t doffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value
+ [gupcr_coll_child_index]));
+ /* Reduce local result with those of children. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result = func (local_result, *(long double *)
+ &
+ gupcr_reduce_storage
+ [MYTHREAD].value[i]);
+ }
+ *t_result = local_result;
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+ sizeof (long double));
+ }
+ /* Wait for our value to go up the tree. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+ else
+ {
+ /* Avoid the case where only one thread is available. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ switch (op)
+ {
+ case UPC_FUNC:
+ case UPC_NONCOMM_FUNC:
+ {
+ /* Schedule a triggered put once signal is received. */
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].
+ value
+ [gupcr_coll_child_index]));
+ gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+ offset, sizeof (long double), 1);
+ }
+ break;
+ default:
+ /* Schedule a triggered atomic put once parent is ready. */
+ gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+ offset, sizeof (long double),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_LONG_DOUBLE, 1);
+ break;
+ }
+ /* Wait for parent to be ready. */
+ gupcr_coll_signal_wait (1);
+ /* Wait for our value to leave. */
+ gupcr_coll_ack_wait (1);
+ }
+ }
+#else /* NO TRIGGERED OPS */
+ /* Send signal to all children. */
+ if (gupcr_coll_child_cnt)
+ {
+ /* ROOT OR INNER THREAD */
+ int wait_cnt = gupcr_coll_child_cnt;
+
+ /* Signal that parent is ready to receive the locally reduced
+ values from its children. Value that we send does not matter. */
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].signal));
+ gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+ }
+ gupcr_coll_ack_wait (wait_cnt);
+
+ /* Wait for the children to report their locally reduced values and,
+ if this is not the root, for the parent to signal that it is ready
+ to receive the reduced value. */
+ if (!IS_ROOT_THREAD)
+ ++wait_cnt;
+ gupcr_coll_signal_wait (wait_cnt);
+
+ /* Compute result if reduce functions are used. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ for (i = 0; i < gupcr_coll_child_cnt; i++)
+ {
+ local_result = func (local_result,
+ *(long double *) &
+ gupcr_reduce_storage[MYTHREAD].value
+ [i]);
+ }
+ /* Prepare reduced value for going up the tree. */
+ *t_result = local_result;
+ }
+ }
+ else if (!IS_ROOT_THREAD)
+ {
+ /* LEAF THREAD */
+ gupcr_coll_signal_wait (1);
+ }
+
+ /* Send reduced value to the parent. */
+ if (!IS_ROOT_THREAD)
+ {
+ /* LEAF OR INNER THREAD */
+ /* Each child places its result into the parent memory slot
+ dedicated for the child. The parent is responsible
+ for creating the reduced result for itself and its
+ children. */
+ if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+ {
+ size_t doffset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value
+ [gupcr_coll_child_index]));
+ size_t soffset =
+ upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage[MYTHREAD].value[0]));
+ gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+ sizeof (long double));
+ }
+ else
+ {
+ size_t offset = upc_addrfield ((shared void *)
+ &(gupcr_reduce_storage
+ [MYTHREAD].value[0]));
+ gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+ sizeof (long double),
+ gupcr_portals_reduce_op (op),
+ UPC_COLL_TO_PTL_LONG_DOUBLE);
+ }
+ gupcr_coll_ack_wait (1);
+ }
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+ /* Copy result into the caller's specified destination. */
+ if (IS_ROOT_THREAD)
+ {
+ *(shared long double *) dst = *t_result;
+ }
+ }
+
+ /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC. */
+ if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+ upc_barrier;
+
+ gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
===================================================================
@@ -0,0 +1,393 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+ This file is part of the UPC runtime Library.
+ Written by Gary Funck <gary@intrepid.com>
+ and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_lib.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+#include "gupcr_coll_sup.h"
+
+/**
+ * @file gupcr_coll_sup.c
+ * GUPC Portals4 collectives implementation support routines.
+ *
+ * @addtogroup COLLECTIVES GUPCR Collectives Functions
+ * @{
+ */
+
+/** Collectives shared access LE handle */
+static ptl_handle_le_t gupcr_coll_le;
+/** Collectives shared access LE counting events handle */
+static ptl_handle_ct_t gupcr_coll_le_ct;
+/** Collectives shared access LE events queue handle */
+static ptl_handle_eq_t gupcr_coll_le_eq;
+/** Collectives number of received signals (PUT/ATOMIC) through LE */
+static ptl_size_t gupcr_coll_signal_cnt;
+
+/** Collectives local access MD handle */
+static ptl_handle_md_t gupcr_coll_md;
+/** Collectives local access MD counting events handle */
+static ptl_handle_ct_t gupcr_coll_md_ct;
+/** Collectives local access MD event queue handle */
+static ptl_handle_eq_t gupcr_coll_md_eq;
+/** Collectives number of received ACKs on local md */
+static ptl_size_t gupcr_coll_ack_cnt;
+
+/* Collectives thread tree. */
+/** Collectives tree parent thread */
+int gupcr_coll_parent_thread;
+/** Collectives tree number of children */
+int gupcr_coll_child_cnt;
+/** Collectives tree child's index */
+int gupcr_coll_child_index;
+/** Collectives tree children threads */
+int gupcr_coll_child[GUPCR_TREE_FANOUT];
+
+/**
+ * Initialize collectives thread tree.
+ *
+ * A collectives tree starts at the "start" thread number and
+ * includes only "nthreads" threads (i.e. the threads involved
+ * in the reduce operation). The simplest case is when all the
+ * threads are involved, in which case start=0 and
+ * nthreads=THREADS (e.g. as used for broadcast).
+ *
+ * The collectives thread tree can be reorganized so that the
+ * "newroot" value identifies the root thread, but only if the
+ * "newroot" thread is participating in the operation.
+ * @param [in] newroot A hint for the tree root thread.
+ * @param [in] start Start thread for reduce
+ * @param [in] nthreads Number of threads participating
+ *
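+ * For illustration, assuming a fanout of two (the actual fanout is
+ * GUPCR_TREE_FANOUT), start=0, newroot=0, and nthreads=7: thread 0
+ * is the root with children 1 and 2, thread 1 has children 3 and 4,
+ * thread 2 has children 5 and 6, and threads 3-6 are leaves.
+ *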
+ */
+void
+gupcr_coll_tree_setup (size_t newroot, size_t start, int nthreads)
+{
+/* Convert a thread number between the global 0..(THREADS-1) range
+   and the 0..(nthreads-1) range that is relative to "start". */
+#define NEWID(id,first) ((id - first + THREADS) % THREADS)
+#define OLDID(nid,first) ((nid + first) % THREADS)
+
+/* Remap into the new root (from root 0 to "root"). */
+#define NEWIDROOT(id,top,cnt) ((cnt + id - top) % cnt)
+#define OLDIDROOT(nid,top,cnt) ((nid + top) % cnt)
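+/* For example, with THREADS == 8 and first == 3, NEWID (5, 3) is 2 and
+   OLDID (2, 3) is 5 (illustrative values only); NEWIDROOT/OLDIDROOT
+   perform the analogous remapping relative to the root thread.  */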
+ int i;
+ int ok_to_root = 0;
+ int myid;
+ int root = NEWID (newroot, start);
+
+ gupcr_debug (FC_COLL, "newroot: %lu, start: %lu nthreads: %d",
+ (long unsigned) newroot, (long unsigned) start, nthreads);
+
+ /* Check if root node is participating. If yes, use that for the
+ root, otherwise 0. */
+ if (root < nthreads)
+ ok_to_root = 1;
+
+ /* Get myid - first convert into the new range (0-nthreads),
+ then, if needed and possible, into the range where newroot becomes 0. */
+ myid = NEWID (MYTHREAD, start);
+ if (ok_to_root)
+ myid = NEWIDROOT (myid, root, nthreads);
+
+ /* Calculate the thread id's of the children and parent. */
+ gupcr_coll_child_cnt = 0;
+ for (i = 0; i < GUPCR_TREE_FANOUT; i++)
+ {
+ int child = (GUPCR_TREE_FANOUT * myid + i + 1);
+ if (child < nthreads)
+ {
+ ++gupcr_coll_child_cnt;
+ if (ok_to_root)
+ child = OLDIDROOT (child, root, nthreads);
+ gupcr_coll_child[i] = OLDID (child, start);
+ }
+ }
+ if (myid)
+ {
+ gupcr_coll_parent_thread = (myid - 1) / GUPCR_TREE_FANOUT;
+ gupcr_coll_child_index =
+ myid - gupcr_coll_parent_thread * GUPCR_TREE_FANOUT - 1;
+ if (ok_to_root)
+ gupcr_coll_parent_thread =
+ OLDIDROOT (gupcr_coll_parent_thread, root, nthreads);
+ gupcr_coll_parent_thread = OLDID (gupcr_coll_parent_thread, start);
+ }
+ else
+ gupcr_coll_parent_thread = ROOT_PARENT;
+}
+
+/**
+ * Collective PUT operation
+ *
+ * @param [in] dthread Destination thread
+ * @param [in] doffset Destination offset in the shared space
+ * @param [in] soffset Source offset in the shared space
+ * @param [in] nbytes Number of bytes to copy
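+ *
+ * A typical call (a sketch based on the reduce code; the destination
+ * thread and the one-byte length are illustrative) passes the same
+ * upc_addrfield() offset for the source and the destination:
+ * @code
+ * size_t offset = upc_addrfield ((shared void *)
+ *                                &gupcr_reduce_storage[MYTHREAD].signal);
+ * gupcr_coll_put (gupcr_coll_child[0], offset, offset, 1);
+ * @endcode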
+ */
+
+void
+gupcr_coll_put (size_t dthread, size_t doffset, size_t soffset, size_t nbytes)
+{
+ ptl_process_t rpid;
+
+ gupcr_debug (FC_COLL, "%d:0x%lx %lu:0x%lx %lu",
+ MYTHREAD, (long unsigned) soffset,
+ (long unsigned) dthread, (long unsigned) doffset,
+ (long unsigned) nbytes);
+ rpid.rank = dthread;
+ gupcr_portals_call (PtlPut,
+ (gupcr_coll_md, soffset, nbytes, PTL_ACK_REQ, rpid,
+ GUPCR_PTL_PTE_COLL, PTL_NO_MATCH_BITS, doffset,
+ PTL_NULL_USER_PTR, PTL_NULL_HDR_DATA));
+}
+
+/**
+ * Collective triggered PUT operation
+ *
+ * Schedule a put operation once the number of signals reaches
+ * the specified value.
+ *
+ * @param [in] dthread Destination thread
+ * @param [in] doffset Destination offset in the shared space
+ * @param [in] soffset Source offset in the shared space
+ * @param [in] nbytes Number of bytes to copy
+ * @param [in] cnt Trigger count
+ */
+void
+gupcr_coll_trigput (size_t dthread, size_t doffset, size_t soffset,
+ size_t nbytes, size_t cnt)
+{
+ ptl_process_t rpid;
+
+ gupcr_debug (FC_COLL, "%d:0x%lx -> %lu:0x%lx %lu trig %lu",
+ MYTHREAD, (long unsigned) soffset,
+ (long unsigned) dthread, (long unsigned) doffset,
+ (long unsigned) nbytes, (long unsigned) cnt);
+ rpid.rank = dthread;
+ gupcr_portals_call (PtlTriggeredPut,
+ (gupcr_coll_md, soffset, nbytes, PTL_ACK_REQ, rpid,
+ GUPCR_PTL_PTE_COLL, PTL_NO_MATCH_BITS, doffset,
+ PTL_NULL_USER_PTR, PTL_NULL_HDR_DATA, gupcr_coll_le_ct,
+ gupcr_coll_signal_cnt + cnt));
+}
+
+/**
+ * Collective atomic PUT operation.
+ *
+ * @param [in] dthread Destination thread
+ * @param [in] doffset Destination offset in the shared space
+ * @param [in] soffset Source offset in the shared space
+ * @param [in] nbytes Number of bytes to copy
+ * @param [in] op Portals atomic operation
+ * @param [in] datatype Portals atomic data type
+ */
+
+void
+gupcr_coll_put_atomic (size_t dthread, size_t doffset, size_t soffset,
+ size_t nbytes, ptl_op_t op, ptl_datatype_t datatype)
+{
+ ptl_process_t rpid;
+
+ gupcr_debug (FC_COLL, "%d:0x%lx %lu:0x%lx %lu %s %s",
+ MYTHREAD, (long unsigned) soffset,
+ (long unsigned) dthread, (long unsigned) doffset,
+ (long unsigned) nbytes,
+ gupcr_strptlop (op), gupcr_strptldatatype (datatype));
+ rpid.rank = dthread;
+ gupcr_portals_call (PtlAtomic,
+ (gupcr_coll_md, soffset, nbytes, PTL_ACK_REQ, rpid,
+ GUPCR_PTL_PTE_COLL, PTL_NO_MATCH_BITS, doffset,
+ PTL_NULL_USER_PTR, PTL_NULL_HDR_DATA, op, datatype));
+}
+
+/**
+ * Collective triggered atomic PUT operation.
+ *
+ * Schedule an atomic put operation once the number of signals reaches
+ * the specified value.
+ *
+ * @param [in] dthread Destination thread
+ * @param [in] doffset Destination offset in the shared space
+ * @param [in] soffset Source offset in the shared space
+ * @param [in] nbytes Number of bytes to copy
+ * @param [in] op Portals atomic operation
+ * @param [in] datatype Portals atomic data type
+ * @param [in] cnt Number of signals that triggers
+ */
+void
+gupcr_coll_trigput_atomic (size_t dthread, size_t doffset, size_t soffset,
+ size_t nbytes, ptl_op_t op,
+ ptl_datatype_t datatype, size_t cnt)
+{
+ ptl_process_t rpid;
+
+ gupcr_debug (FC_COLL, "%d:0x%lx %lu:0x%lx %lu %s %s trig %lu",
+ MYTHREAD, (long unsigned) soffset,
+ (long unsigned) dthread, (long unsigned) doffset,
+ (long unsigned) nbytes,
+ gupcr_strptlop (op), gupcr_strptldatatype (datatype),
+ (long unsigned) cnt);
+ rpid.rank = dthread;
+ gupcr_portals_call (PtlTriggeredAtomic,
+ (gupcr_coll_md, soffset,
+ nbytes, PTL_ACK_REQ, rpid, GUPCR_PTL_PTE_COLL,
+ PTL_NO_MATCH_BITS, doffset, PTL_NULL_USER_PTR,
+ PTL_NULL_HDR_DATA, op, datatype, gupcr_coll_le_ct,
+ cnt));
+}
+
+/**
+ * Collectives wait for operation completion
+ * This function is used in cases where a thread needs to wait
+ * for the completion of remote operations.
+ *
+ * @param [in] cnt Wait count
+ */
+void
+gupcr_coll_ack_wait (size_t cnt)
+{
+ ptl_ct_event_t ct;
+ gupcr_debug (FC_COLL, "wait for %lu (%lu)",
+ (long unsigned) cnt,
+ (long unsigned) (gupcr_coll_ack_cnt + cnt));
+ gupcr_portals_call (PtlCTWait,
+ (gupcr_coll_md_ct, gupcr_coll_ack_cnt + cnt, &ct));
+ if (ct.failure)
+ {
+ gupcr_process_fail_events (gupcr_coll_md_eq);
+ gupcr_fatal_error ("received an error on collective MD");
+ }
+ gupcr_coll_ack_cnt += cnt;
+}
+
+/**
+ * Collectives wait for signaling events
+ * This function is used to wait for other threads to complete
+ * operations in the thread's shared space (e.g. children performing
+ * atomic ops in the parent's shared space).
+ *
+ * @param [in] cnt Wait count
+ */
+void
+gupcr_coll_signal_wait (size_t cnt)
+{
+ ptl_ct_event_t ct;
+
+ gupcr_debug (FC_COLL, "wait for %lu (%lu)",
+ (long unsigned) cnt,
+ (long unsigned) (gupcr_coll_signal_cnt + cnt));
+ gupcr_portals_call (PtlCTWait,
+ (gupcr_coll_le_ct, gupcr_coll_signal_cnt + cnt, &ct));
+ if (ct.failure)
+ {
+ gupcr_process_fail_events (gupcr_coll_le_eq);
+ gupcr_fatal_error ("received an error on collective LE");
+ }
+ gupcr_coll_signal_cnt += cnt;
+}
+
+/**
+ * Initialize collectives resources.
+ * @ingroup INIT
+ *
+ * A thread's shared space is mapped via a Portals LE for other
+ * threads to write to, and an MD as a source for remote
+ * operations. In this way, the address field of the shared pointer
+ * can be used as an offset into LE/MD.
+ */
+void
+gupcr_coll_init (void)
+{
+ ptl_md_t md;
+ ptl_pt_index_t pte;
+ ptl_le_t le;
+
+ gupcr_log (FC_COLL, "coll init called");
+
+ /* Allocate the Portals PTE that is used for collectives. */
+ gupcr_portals_call (PtlEQAlloc, (gupcr_ptl_ni, 1, &gupcr_coll_le_eq));
+ gupcr_portals_call (PtlPTAlloc, (gupcr_ptl_ni, 0,
+ gupcr_coll_le_eq, GUPCR_PTL_PTE_COLL,
+ &pte));
+ if (pte != GUPCR_PTL_PTE_COLL)
+ gupcr_fatal_error ("cannot allocate PTE GUPCR_PTL_PTE_COLL.");
+ gupcr_debug (FC_COLL, "Collectives PTE allocated: %d", GUPCR_PTL_PTE_COLL);
+
+ /* Allocate the Portals LE that is used for collectives. */
+ gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_coll_le_ct));
+ le.start = gupcr_gmem_base;
+ le.length = gupcr_gmem_size;
+ le.ct_handle = gupcr_coll_le_ct;
+ le.uid = PTL_UID_ANY;
+ le.options = PTL_LE_OP_PUT | PTL_LE_OP_GET | PTL_LE_EVENT_CT_COMM |
+ PTL_LE_EVENT_SUCCESS_DISABLE | PTL_LE_EVENT_LINK_DISABLE;
+ gupcr_portals_call (PtlLEAppend, (gupcr_ptl_ni, GUPCR_PTL_PTE_COLL, &le,
+ PTL_PRIORITY_LIST, NULL, &gupcr_coll_le));
+ gupcr_debug (FC_COLL, "Collectives LE created at 0x%lx size 0x%lx",
+ (long unsigned) gupcr_gmem_base,
+ (long unsigned) gupcr_gmem_size);
+
+ /* Setup the Portals MD for local source/destination copying.
+ We need to map only the shared memory space. */
+ gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_coll_md_ct));
+ gupcr_portals_call (PtlEQAlloc, (gupcr_ptl_ni, 1, &gupcr_coll_md_eq));
+ md.start = gupcr_gmem_base;
+ md.length = gupcr_gmem_size;
+ md.options = PTL_MD_EVENT_CT_ACK | PTL_MD_EVENT_CT_REPLY |
+ PTL_MD_EVENT_SUCCESS_DISABLE;
+ md.eq_handle = gupcr_coll_md_eq;
+ md.ct_handle = gupcr_coll_md_ct;
+ gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_coll_md));
+
+ /* Reset the number of signals/acks. */
+ gupcr_coll_signal_cnt = 0;
+ gupcr_coll_ack_cnt = 0;
+}
+
+/**
+ * Release collectives resources.
+ * @ingroup INIT
+ */
+void
+gupcr_coll_fini (void)
+{
+ gupcr_log (FC_COLL, "coll fini called");
+ /* Release the collectives MD. */
+ gupcr_portals_call (PtlMDRelease, (gupcr_coll_md));
+ gupcr_portals_call (PtlCTFree, (gupcr_coll_md_ct));
+ gupcr_portals_call (PtlEQFree, (gupcr_coll_md_eq));
+ /* Release the collectives LE and PTE. */
+ gupcr_portals_call (PtlLEUnlink, (gupcr_coll_le));
+ gupcr_portals_call (PtlCTFree, (gupcr_coll_le_ct));
+ gupcr_portals_call (PtlEQFree, (gupcr_coll_le_eq));
+ gupcr_portals_call (PtlPTFree, (gupcr_ptl_ni, GUPCR_PTL_PTE_COLL));
+}
+
+/** @} */
===================================================================
@@ -0,0 +1,106 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+ This file is part of the UPC runtime Library.
+ Written by Gary Funck <gary@intrepid.com>
+ and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#ifndef _GUPCR_COLL_SUP_H_
+#define _GUPCR_COLL_SUP_H_ 1
+
+/**
+ * @file gupcr_coll_sup.h
+ * GUPC Portals4 collectives implementation support routines.
+ *
+ * @addtogroup COLLECTIVES GUPCR Collectives Functions
+ * @{
+ */
+
+/** Convert from UPC collectives char to Portals atomic type. */
+#define UPC_COLL_TO_PTL_CHAR PTL_INT8_T
+#define UPC_COLL_TO_PTL_UCHAR PTL_UINT8_T
+/** Convert from UPC collectives short to Portals atomic type. */
+#if __SIZEOF_SHORT__ == 2
+#define UPC_COLL_TO_PTL_SHORT PTL_INT16_T
+#define UPC_COLL_TO_PTL_USHORT PTL_UINT16_T
+#elif __SIZEOF_SHORT__ == 4
+#define UPC_COLL_TO_PTL_SHORT PTL_INT32_T
+#define UPC_COLL_TO_PTL_USHORT PTL_UINT32_T
+#else
+#error "Size of short not supported"
+#endif
+/** Convert from UPC collectives int to Portals atomic type. */
+#if __SIZEOF_INT__ == 4
+#define UPC_COLL_TO_PTL_INT PTL_INT32_T
+#define UPC_COLL_TO_PTL_UINT PTL_UINT32_T
+#elif __SIZEOF_INT__ == 8
+#define UPC_COLL_TO_PTL_INT PTL_INT64_T
+#define UPC_COLL_TO_PTL_UINT PTL_UINT64_T
+#else
+#error "Size of int not supported"
+#endif
+/** Convert from UPC collectives long to Portals atomic type. */
+#if __SIZEOF_LONG__ == 4
+#define UPC_COLL_TO_PTL_LONG PTL_INT32_T
+#define UPC_COLL_TO_PTL_ULONG PTL_UINT32_T
+#elif __SIZEOF_LONG__ == 8
+#define UPC_COLL_TO_PTL_LONG PTL_INT64_T
+#define UPC_COLL_TO_PTL_ULONG PTL_UINT64_T
+#else
+#error "Size of long not supported"
+#endif
+/** Convert from UPC collectives float to Portals atomic type. */
+#define UPC_COLL_TO_PTL_FLOAT PTL_FLOAT
+/** Convert from UPC collectives double to Portals atomic type. */
+#define UPC_COLL_TO_PTL_DOUBLE PTL_DOUBLE
+/** Convert from UPC collectives long double to Portals atomic type. */
+#define UPC_COLL_TO_PTL_LONG_DOUBLE PTL_LONG_DOUBLE
+
+extern int gupcr_coll_parent_thread;
+extern int gupcr_coll_child_cnt;
+extern int gupcr_coll_child_index;
+extern int gupcr_coll_child[GUPCR_TREE_FANOUT];
+
+/** Check if thread is the root thread by checking its parent. */
+#define IS_ROOT_THREAD (gupcr_coll_parent_thread == ROOT_PARENT)
+
+void gupcr_coll_tree_setup (size_t newroot, size_t start, int nthreads);
+void gupcr_coll_put (size_t dthread,
+ size_t doffset, size_t soffset, size_t nbytes);
+void gupcr_coll_trigput (size_t dthread,
+ size_t doffset, size_t soffset, size_t nbytes,
+ size_t cnt);
+void gupcr_coll_put_atomic (size_t dthread, size_t doffset, size_t soffset,
+ size_t nbytes, ptl_op_t op,
+ ptl_datatype_t datatype);
+void gupcr_coll_trigput_atomic (size_t dthread, size_t doffset,
+ size_t soffset, size_t nbytes, ptl_op_t op,
+ ptl_datatype_t datatype, size_t cnt);
+void gupcr_coll_ack_wait (size_t cnt);
+void gupcr_coll_signal_wait (size_t cnt);
+
+void gupcr_coll_init (void);
+void gupcr_coll_fini (void);
+
+/** @} */
+
+#endif /* gupcr_coll_sup.h */
===================================================================
@@ -0,0 +1,180 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+ This file is part of the UPC runtime Library.
+ Written by Gary Funck <gary@intrepid.com>
+ and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+/**
+ * @file gupcr_config.h
+ * GUPC Runtime configuration
+ */
+
+#ifndef _GUPCR_CONFIG_H_
+#define _GUPCR_CONFIG_H_
+
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <time.h>
+#if TIME_WITH_SYS_TIME
+#include <sys/time.h>
+#endif
+#include <sys/time.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+#ifdef _POSIX_PRIORITY_SCHEDULING
+#define __USE_GNU
+#include <sched.h>
+#endif
+
+#include "config.h"
+
+#define DEV_ZERO "/dev/zero"
+#define OFFSET_ZERO ((off_t) 0)
+/* Darwin has MAP_ANON defined for anonymous memory map. */
+#if !MAP_ANONYMOUS && MAP_ANON
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+#define MAP_ERROR ((void *) -1)
+
+#define GUPCR_SPIN_THREAD_SLOTS 4
+#define GUPCR_SPIN_SLOT_COUNT 64
+#define GUPCR_SPIN_MAX_MULT 1024
+
+#define KILOBYTE 1024
+#define C64K (64*KILOBYTE)
+#define MEGABYTE (KILOBYTE*KILOBYTE)
+#ifndef INT_MIN
+/** __INT_MAX__ is predefined by the gcc compiler. */
+#define INT_MIN (-__INT_MAX__ - 1)
+#endif
+
+//begin detect_target64
+#if (defined (_LP64) && _LP64)
+#define GUPCR_TARGET64 1
+#else
+#define GUPCR_TARGET64 0
+#endif
+//end detect_target64
+
+//begin mode_types
+typedef unsigned int u_intQI_t __attribute__ ((__mode__ (__QI__)));
+typedef unsigned int u_intHI_t __attribute__ ((__mode__ (__HI__)));
+typedef unsigned int u_intSI_t __attribute__ ((__mode__ (__SI__)));
+typedef unsigned int u_intDI_t __attribute__ ((__mode__ (__DI__)));
+#if GUPCR_TARGET64
+typedef unsigned int u_intTI_t __attribute__ ((__mode__ (__TI__)));
+#endif /* GUPCR_TARGET64 */
+//end mode_types
+
+//begin lib_min_max
+
+/* Helper functions. */
+#define GUPCR_MIN(x,y) (((x) < (y)) ? (x): (y))
+#define GUPCR_MAX(x,y) (((x) > (y)) ? (x): (y))
+#define GUPCR_ABS(x) (((x) > 0) ? (x): -(x))
+#define GUPCR_ROUND(x, r) (((x) + (r) - 1)/(r)*(r))
+//end lib_min_max
+
+//begin lib_config_heap
+
+/** Maximum heap size
+ Set here as 64 gigabytes on a 64-bit implementation
+ and 1 gigabyte on other (e.g., 32-bit) implementations. */
+#define GUPCR_MAX_HEAP_SIZE (((sizeof (void *)*8) == 64) \
+ ? (64L * KILOBYTE * MEGABYTE) \
+ : ( 1L * KILOBYTE * MEGABYTE))
+
+/** Default per thread UPC shared heap size. */
+#define GUPCR_DEFAULT_PER_THREAD_HEAP_SIZE (256*MEGABYTE)
+
+/** The minimum number of bytes to allocate (128 bytes).
+
+ This allows for 64 bytes of heap management overhead and 64
+ bytes of allocation. The allocation will be aligned to a 64
+ byte boundary. This is not space efficient, but is intended to
+ provide a minimal alignment that agrees with most CPU cache line
+ size requirements. */
+#define GUPCR_HEAP_ALLOC_MIN 128
+
+/** The minimum number of bytes to allocate (in bits). */
+#define GUPCR_HEAP_ALLOC_MIN_BITS 7
+
+/** The size of the heap management header block. */
+#define GUPCR_HEAP_ALLOC_OVERHEAD 64
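+
+/* Note: GUPCR_HEAP_ALLOC_MIN is (1 << GUPCR_HEAP_ALLOC_MIN_BITS),
+   i.e. GUPCR_HEAP_ALLOC_OVERHEAD plus 64 bytes of user data.  */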
+
+/** The number of allocation pools per heap. */
+#define GUPCR_HEAP_NUM_POOLS (SIZE_T_BITS - GUPCR_HEAP_ALLOC_MIN_BITS)
+
+/** An unlikely barrier id to be used for runtime synchronization */
+#define GUPCR_RUNTIME_BARRIER_ID 0xBADF00D
+
+/** A value used to tag each heap allocated item, checked by upc_free */
+#define GUPCR_HEAP_ALLOC_TAG 0x0DDF00D
+
+//end lib_config_heap
+
+/*
+ * Main entry for UPC programs.
+ * The runtime executes first, before calling the user's main
+ * program.  Thus, the user's main program is renamed to 'upc_main'
+ * inside the <upc.h> file.
+ */
+#define GUPCR_START main
+#define GUPCR_MAIN upc_main
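+
+/* Illustrative only: the runtime therefore provides GUPCR_START, i.e.
+   main(), which performs runtime initialization and then calls
+   GUPCR_MAIN, i.e. the user's original main() renamed to 'upc_main'
+   by <upc.h>.  */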
+
+//begin lib_config_shared_section
+
+/** The base address of the UPC shared section */
+#define GUPCR_SHARED_SECTION_START __upc_shared_start
+/** The ending address (plus one) of the UPC shared section */
+#define GUPCR_SHARED_SECTION_END __upc_shared_end
+
+/** The base address of the UPC compiled program info section */
+#define GUPCR_PGM_INFO_SECTION_START __upc_pgm_info_start
+/** The ending address (plus one) of the UPC compiled program info section */
+#define GUPCR_PGM_INFO_SECTION_END __upc_pgm_info_end
+
+/** The base address of an array of pointers to UPC initialization routines */
+#define GUPCR_INIT_ARRAY_START __upc_init_array_start
+/** The ending address (plus one) of pointers to UPC initialization routines */
+#define GUPCR_INIT_ARRAY_END __upc_init_array_end
+
+//end lib_config_shared_section
+
+#endif /* gupcr_config.h */
===================================================================
@@ -0,0 +1,98 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+ This file is part of the UPC runtime Library.
+ Written by Gary Funck <gary@intrepid.com>
+ and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+/**
+ * @file gupcr_defs.h
+ * GUPC Runtime definitions
+ */
+
+#ifndef _GUPCR_DEFS_H_
+#define _GUPCR_DEFS_H_
+
+#include "gupcr_pts.h"
+
+//begin lib_max_threads_def
+
+/* Maximum number of THREADS supported in this implementation */
+#define GUPCR_THREAD_SIZE 12
+#define GUPCR_THREADS_MAX (1 << GUPCR_THREAD_SIZE)
+//end lib_max_threads_def
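+/* With GUPCR_THREAD_SIZE of 12, GUPCR_THREADS_MAX is (1 << 12) = 4096.  */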
+
+#if GUPCR_PTS_PACKED_REP && (GUPCR_THREAD_SIZE > GUPCR_PTS_THREAD_SIZE)
+#error GUPCR_THREADS_MAX exceeds the size of the packed sptr threads field.
+#endif
+
+/* The filename of the location where a runtime
+ error was detected. This is set by the various
+ debug-enabled ('g') UPC runtime library routines. */
+extern const char *gupcr_err_filename;
+
+/* The line number of the location where a runtime
+ error was detected. This is set by the various
+ debug-enabled ('g') UPC runtime library routines. */
+extern unsigned int gupcr_err_linenum;
+
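+/* Note: these macros reference local `filename' and `linenum'
+   variables, so the enclosing debug-enabled runtime routine is
+   expected to declare them.  */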
+#define GUPCR_SET_ERR_LOC() \
+ do \
+ { \
+ gupcr_err_filename = filename; \
+ gupcr_err_linenum = linenum; \
+ } while (0)
+
+#define GUPCR_CLEAR_ERR_LOC() \
+ do \
+ { \
+ gupcr_err_filename = NULL; \
+ gupcr_err_linenum = 0; \
+ } while (0)
+
+/* The base address of the UPC shared section. */
+extern char GUPCR_SHARED_SECTION_START[1];
+
+/* The ending address (plus one) of the UPC shared section. */
+extern char GUPCR_SHARED_SECTION_END[1];
+
+/* The base address of the UPC program information section. */
+extern char GUPCR_PGM_INFO_SECTION_START[1];
+
+/* The ending address (plus one) of the UPC program information section. */
+extern char GUPCR_PGM_INFO_SECTION_END[1];
+
+#ifndef __UPC__
+/* The value of THREADS when defined at run time. */
+extern int THREADS;
+
+/* Current thread id. */
+extern int MYTHREAD;
+#endif /* !__UPC__ */
+
+/* OK to call finalize routines. */
+extern int gupcr_finalize_ok;
+
+/* Prototype for the main finalize routine. */
+extern void gupcr_fini (void);
+
+#endif /* gupcr_defs.h */
===================================================================
@@ -0,0 +1,515 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+ This file is part of the UPC runtime Library.
+ Written by Gary Funck <gary@intrepid.com>
+ and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+/**
+ * @file gupcr_env.c
+ * GUPC Runtime environment variables handling
+ */
+
+/**
+ * @addtogroup GUPCUTILS GUPCR Utility Functions
+ * @{
+ */
+
+/**
+
+ UPC_BACKTRACE
+
+ If set, enable backtrace for runtime fatal events. By
+ default, backtrace logging on fatal events is disabled (even though
+ it may be configured).
+
+ UPC_DEBUG
+
+ If set, specifies a list of "facilities" that
+ will have debugging output logged.
+
+ UPC_DEBUGFILE
+
+ Path of log file where UPC runtime debug logs are written.
+
+ UPC_FIRSTTOUCH
+
+ Not used. Reserved for future use.
+
+ UPC_FORCETOUCH
+
+ Force the thread to touch every memory page in its own shared
+ memory space on startup. This ensures the correct NUMA memory
+ allocation. By default it is "YES".
+
+ UPC_LOG
+
+ If set, specifies a list of "facilities" that
+ will be logged.
+
+ UPC_LOGFILE
+
+ Path of log file where UPC runtime logs are written.
+
+ UPC_NO_WARN
+
+ If set, the UPC_NO_WARN variable causes startup warnings (such as
+ those displayed when debugging or tracing is enabled) to be omitted.
+
+ UPC_NODE_LOCAL_MEM
+
+ If set to "NO", then disable node local memory optimization.
+
+ UPC_NODES
+
+ Not used. Reserved for future use.
+
+ UPC_QUIET
+
+ UPC_QUIET causes all non-application-generated output to be omitted
+ (including both warnings and the initial display of UPC thread
+ layout).
+
+ UPC_POLITE
+
+ Yield the processor frequently while spin-locking.
+
+ UPC_SHARED_HEAP_SIZE
+
+ UPC_SHARED_HEAP_SIZE sets the amount of shared heap (per UPC thread)
+ for your program.
+
+ UPC_STATS
+
+ If set, specifies a list of "facilities" for
+ which UPC runtime statistics will be collected.
+
+ UPC_STATSFILE
+
+ Path of log file where UPC runtime statistics are written.
+
+ UPC_TRACE
+
+ If set, specifies a list of "facilities" that
+ will be traced.
+
+ UPC_TRACEFILE
+
+ Path of log file where UPC trace logs are written.
+
+ The supported facilities are:
+
+ ADDR UPC casts to local pointers and accesses to pointers-to-shared (PTSs).
+ ALL All the facilities
+ ALLOC UPC dynamic memory allocation
+ ATOMIC UPC atomic operations
+ BARRIER UPC barrier/notify/wait operations
+ BROADCAST UPC runtime internal broadcast operations
+ COLL UPC collectives
+ INFO General information, program info.
+ LOCKS UPC lock operations
+ MEM UPC shared memory accesses
+ MISC Miscellaneous functions
+ PORTALS Portals operations
+ SYSTEM System calls
+
+ For all environment variables above that set a filename path,
+ each appearance of a single '%' will be substituted with the process
+ pid. Two '%'s together escape a single %. Non-existent intermediate
+ directories will be created. As a special case, if the filename
+ is "stdout" or "stderr", then output will be directed to the
+ specified file descriptor. A filename with no '%' indicates
+ that the file will be shared across all processes.
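+
+ As an example (illustrative): UPC_LOGFILE=/tmp/upc-%.log directs each
+ process to write its log to /tmp/upc-<pid>.log, while
+ UPC_LOGFILE=upc%%.log names the single shared file "upc%.log".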
+
+*/
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_utils.h"
+
+static const struct gupcr_fc_tbl_struct
+{
+ const char *name;
+ gupcr_facility_t mask;
+}
+gupcr_facility_table[] =
+{
+ {"addr", FC_ADDR},
+ {"all", FC_ALL},
+ {"alloc", FC_ALLOC},
+ {"atomic", FC_ATOMIC},
+ {"barrier", FC_BARRIER},
+ {"broadcast", FC_BROADCAST},
+ {"coll", FC_COLL},
+ {"info", FC_INFO},
+ {"locks", FC_LOCK},
+ {"mem", FC_MEM},
+ {"misc", FC_MISC},
+ {"nb", FC_NB},
+ {"portals", FC_PORTALS},
+ {"system", FC_SYSTEM}
+};
+
+#define GUPCR_FC_TBL_SIZE (sizeof (gupcr_facility_table) \
+ / sizeof (struct gupcr_fc_tbl_struct))
+typedef enum
+{
+ ENV_NONE = 0,
+ ENV_UPC_BACKTRACE,
+ ENV_UPC_DEBUG,
+ ENV_UPC_DEBUGFILE,
+ ENV_UPC_FIRSTTOUCH,
+ ENV_UPC_FORCETOUCH,
+ ENV_UPC_LOG,
+ ENV_UPC_LOGFILE,
+ ENV_UPC_NO_WARN,
+ ENV_UPC_NODE_LOCAL_MEM,
+ ENV_UPC_NODES,
+ ENV_UPC_POLITE,
+ ENV_UPC_REQUIRE_SHARED_SIZE,
+ ENV_UPC_QUIET,
+ ENV_UPC_SHARED_HEAP_SIZE,
+ ENV_UPC_STATS,
+ ENV_UPC_STATSFILE,
+ ENV_UPC_TRACE,
+ ENV_UPC_TRACEFILE
+} gupcr_env_kind;
+
+static const struct gupcr_env_var_struct
+{
+ const char *name;
+ gupcr_env_kind kind;
+}
+gupcr_env_var_table[] =
+{
+ {"UPC_BACKTRACE", ENV_UPC_BACKTRACE},
+ {"UPC_DEBUG", ENV_UPC_DEBUG},
+ {"UPC_DEBUGFILE", ENV_UPC_DEBUGFILE},
+ {"UPC_FIRSTTOUCH", ENV_UPC_FIRSTTOUCH},
+ {"UPC_FORCETOUCH", ENV_UPC_FORCETOUCH},
+ {"UPC_LOG", ENV_UPC_LOG},
+ {"UPC_LOGFILE", ENV_UPC_LOGFILE},
+ {"UPC_NO_WARN", ENV_UPC_NO_WARN},
+ {"UPC_NODE_LOCAL_MEM", ENV_UPC_NODE_LOCAL_MEM},
+ {"UPC_NODES", ENV_UPC_NODES},
+ {"UPC_POLITE", ENV_UPC_POLITE},
+ {"UPC_REQUIRE_SHARED_SIZE", ENV_UPC_REQUIRE_SHARED_SIZE},
+ {"UPC_QUIET", ENV_UPC_QUIET},
+ {"UPC_SHARED_HEAP_SIZE", ENV_UPC_SHARED_HEAP_SIZE},
+ {"UPC_STATS", ENV_UPC_STATS},
+ {"UPC_STATSFILE", ENV_UPC_STATSFILE},
+ {"UPC_TRACE", ENV_UPC_TRACE},
+ {"UPC_TRACEFILE", ENV_UPC_TRACEFILE}
+};
+
+#define GUPCR_ENV_VAR_TBL_SIZE (sizeof (gupcr_env_var_table) \
+ / sizeof (struct gupcr_env_var_struct))
+
+/* Look up the name given by FACILITY and return the facility mask value
+ associated with that name. */
+
+static gupcr_facility_t
+gupcr_facility_mask_for_name (const char *const facility)
+{
+ unsigned i;
+ for (i = 0; i < GUPCR_FC_TBL_SIZE; ++i)
+ {
+ if (!strcasecmp (gupcr_facility_table[i].name, facility))
+ return gupcr_facility_table[i].mask;
+ }
+ return FC_NONE;
+}
+
+/* Extract the environment variable name appearing before the
+ first '=' sign in ENV_VAR_ARG; look it up in the list of
+ known "UPC_" environment variables and return an
+ integer value that is used to identify this particular
+ environment variable name. */
+
+static gupcr_env_kind
+gupcr_env_kind_for_var (const char *const env_var_arg)
+{
+ gupcr_env_kind env_kind = ENV_NONE;
+ unsigned i;
+ char *env_var_dup, *env_var;
+ gupcr_strdup (env_var_dup, env_var_arg);
+ env_var = strtok (env_var_dup, "=");
+ gupcr_assert (env_var != NULL);
+ for (i = 0; i < GUPCR_ENV_VAR_TBL_SIZE; ++i)
+ {
+ if (!strcmp (gupcr_env_var_table[i].name, env_var))
+        {
+          env_kind = gupcr_env_var_table[i].kind;
+          break;
+        }
+ }
+ gupcr_free (env_var_dup);
+ return env_kind;
+}
+
+/* Process the comma separated list of facility names that
+ appear after the '=' sign. Return a mask value indicating
+ which facility names were specified. */
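+/* For example (illustrative), parsing "UPC_TRACE=barrier,locks"
+   returns FC_BARRIER | FC_LOCK.  */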
+
+gupcr_facility_t
+gupcr_env_facility_list (const char *const env_var_arg)
+{
+ gupcr_facility_t facility_mask = FC_NONE;
+ char *env_var_dup, *env_var, *facility_name;
+ gupcr_strdup (env_var_dup, env_var_arg);
+ if ((env_var = strtok (env_var_dup, "=")))
+ {
+ while ((facility_name = strtok (NULL, ",")))
+ {
+ gupcr_facility_t facility;
+ facility = gupcr_facility_mask_for_name (facility_name);
+ if (!facility)
+ gupcr_error ("invalid facility name `%s' found in "
+ "environment variable: `%s'",
+ facility_name, env_var_arg);
+ facility_mask |= facility;
+ }
+ }
+ else
+ gupcr_error ("invalid UPC environment variable syntax: `%s'", env_var);
+ gupcr_free (env_var_dup);
+ return facility_mask;
+}
+
+/* Return a malloc'd copy of ENV_VAR_STR_ARG with
+ the current pid substituted for each occurrence of a '%'.
+ Two '%'s next to each other are equivalent to a single '%'. */
+
+const char *
+gupcr_env_filename (const char *const env_var_arg)
+{
+ char *env_var_dup, *env_var, *filename_arg;
+ char *filename = NULL;
+ gupcr_strdup (env_var_dup, env_var_arg);
+ if ((env_var = strtok (env_var_dup, "=")))
+ {
+ if ((filename_arg = strtok (NULL, "")))
+ {
+ const char *const pid = gupcr_get_pid_as_string ();
+ const char *cp;
+ char *fp;
+ size_t filename_len;
+ size_t pid_len = strlen (pid);
+ /* Calculate the required string size. */
+ for (cp = filename_arg, filename_len = 0; *cp; ++cp)
+ {
+ if (cp[0] == '%' && cp[1] == '%')
+ cp += 1, ++filename_len;
+ else if (cp[0] == '%')
+ filename_len += pid_len;
+ else
+ ++filename_len;
+ }
+ /* Allocate the string; copy ENV_VAR_STR_ARG and
+ make '%' substitutions. */
+ gupcr_malloc (filename, filename_len + 1);
+ for (fp = filename, cp = filename_arg; *cp; ++cp)
+ {
+ if (cp[0] == '%' && cp[1] == '%')
+ cp += 1, *fp++ = '%';
+ else if (cp[0] == '%')
+ strcpy (fp, pid), fp += pid_len;
+ else
+ *fp++ = *cp;
+ }
+ *fp = '\0';
+ }
+ else
+ gupcr_error ("missing file name in UPC environment "
+ "variable: `%s'", env_var_arg);
+ }
+ else
+ gupcr_error ("invalid UPC environment variable syntax: `%s'",
+ env_var_arg);
+ gupcr_free (env_var_dup);
+ return filename;
+}
+
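+/* Parse the size value that appears after the '=' sign in ENV_VAR_ARG,
+   bounded by VAL_MAX.  Syntax and range errors are reported via
+   gupcr_error.  */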
+static long long
+gupcr_env_size (const char *const env_var_arg, long long int val_max)
+{
+ long long size = 0;
+ char *env_var, *env_var_name, *size_str;
+ gupcr_strdup (env_var, env_var_arg);
+ if ((env_var_name = strtok (env_var, "=")))
+ {
+ if ((size_str = strtok (NULL, "")))
+ {
+ int status;
+ size = gupcr_strtoll (size_str, 0, val_max, &status);
+ if (status)
+ {
+ gupcr_error ("invalid size specifier in UPC environment "
+ "variable: `%s'", env_var_arg);
+ gupcr_strtoll_error (size_str, 0, val_max, status);
+ }
+ }
+ else
+ gupcr_error ("missing size specifier in UPC environment "
+ "variable: `%s'", env_var_arg);
+ }
+ else
+ gupcr_error ("invalid UPC environment variable syntax: `%s'",
+ env_var_arg);
+ gupcr_free (env_var);
+ return size;
+}
+
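+/* Parse the value after the '=' sign in ENV_VAR_ARG as a boolean
+   switch: "YES"/"yes"/"1" yield 1 and "NO"/"no"/"0" yield 0; any other
+   value is reported via gupcr_error.  */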
+static int
+gupcr_env_boolean (const char *const env_var_arg)
+{
+ int value = 0;
+ char *env_var, *env_var_name, *switch_str;
+ gupcr_strdup (env_var, env_var_arg);
+ if ((env_var_name = strtok (env_var, "=")))
+ {
+ if ((switch_str = strtok (NULL, "")))
+ {
+ if (!strcmp (switch_str, "NO") || \
+ !strcmp (switch_str, "no") || \
+ !strcmp (switch_str, "0"))
+ value = 0;
+ else if (!strcmp (switch_str, "YES") || \
+ !strcmp (switch_str, "yes") || \
+ !strcmp (switch_str, "1"))
+ value = 1;
+ else
+ {
+ gupcr_error ("invalid value specifier in UPC environment "
+ "variable: `%s'", env_var_arg);
+ }
+ }
+ else
+ gupcr_error ("missing value specifier in UPC environment "
+ "variable: `%s'", env_var_arg);
+ }
+ else
+ gupcr_error ("invalid UPC environment variable syntax: `%s'",
+ env_var_arg);
+ gupcr_free (env_var);
+ return value;
+}
+
+/* Process all variables in the environment that begin with "UPC_".
+ Make various calls back into "gupcr_utils.c" to implement
+ the actions associated with each given environment variable. */
+
+void
+gupcr_env_init (void)
+{
+ /* System environment, see: environ (7). */
+ extern char **environ;
+ const char *env_var;
+ unsigned i;
+ for (i = 0; (env_var = environ[i]); ++i)
+ {
+ if (!strncmp (env_var, "UPC_", 4))
+ {
+ const int env_kind = gupcr_env_kind_for_var (env_var);
+ gupcr_facility_t facility_mask;
+ const char *filename;
+ size_t heap_size;
+ switch (env_kind)
+ {
+ case ENV_UPC_BACKTRACE:
+ gupcr_set_backtrace (gupcr_env_boolean (env_var));
+ break;
+ case ENV_UPC_DEBUG:
+ facility_mask = gupcr_env_facility_list (env_var);
+ if (facility_mask)
+ gupcr_set_debug_facility (facility_mask);
+ break;
+ case ENV_UPC_DEBUGFILE:
+ filename = gupcr_env_filename (env_var);
+ if (filename)
+ gupcr_set_debug_filename (filename);
+ break;
+ case ENV_UPC_FIRSTTOUCH:
+ /* no-op */
+ break;
+ case ENV_UPC_FORCETOUCH:
+ gupcr_set_forcetouch (gupcr_env_boolean (env_var));
+ break;
+ case ENV_UPC_LOG:
+ facility_mask = gupcr_env_facility_list (env_var);
+ if (facility_mask)
+ gupcr_set_log_facility (facility_mask);
+ break;
+ case ENV_UPC_LOGFILE:
+ filename = gupcr_env_filename (env_var);
+ if (filename)
+ gupcr_set_log_filename (filename);
+ break;
+ case ENV_UPC_NO_WARN:
+ gupcr_no_warn ();
+ break;
+ case ENV_UPC_NODE_LOCAL_MEM:
+ gupcr_set_node_local_memory (gupcr_env_boolean (env_var));
+ break;
+ case ENV_UPC_NODES:
+ /* no-op */
+ break;
+ case ENV_UPC_POLITE:
+ /* no-op */
+ break;
+ case ENV_UPC_QUIET:
+ gupcr_be_quiet ();
+ break;
+ case ENV_UPC_SHARED_HEAP_SIZE:
+ heap_size = (size_t) gupcr_env_size (env_var,
+ GUPCR_MAX_HEAP_SIZE);
+ gupcr_set_shared_heap_size (heap_size);
+ break;
+ case ENV_UPC_STATS:
+ facility_mask = gupcr_env_facility_list (env_var);
+ gupcr_set_stats_facility (facility_mask);
+ break;
+ case ENV_UPC_STATSFILE:
+ filename = gupcr_env_filename (env_var);
+ if (filename)
+ gupcr_set_stats_filename (filename);
+ break;
+ case ENV_UPC_TRACE:
+ facility_mask = gupcr_env_facility_list (env_var);
+ gupcr_set_trace_facility (facility_mask);
+ break;
+ case ENV_UPC_TRACEFILE:
+ filename = gupcr_env_filename (env_var);
+ if (filename)
+ gupcr_set_trace_filename (filename);
+ break;
+ case ENV_UPC_REQUIRE_SHARED_SIZE:
+ /* no-op */
+ break;
+ case ENV_NONE:
+ gupcr_warn ("unknown UPC environment variable: %s", env_var);
+ break;
+ default:
+ gupcr_fatal_error ("env variable case value out of range");
+ }
+ }
+ }
+}
+
+/** @} */
===================================================================
@@ -0,0 +1,521 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+ This file is part of the UPC runtime Library.
+ Written by Gary Funck <gary@intrepid.com>
+ and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+/**
+ * @file gupcr_gmem.c
+ * GUPC Portals4 shared memory interface.
+ */
+
+/**
+ * @addtogroup GMEM GUPCR Shared Memory Access
+ * @{
+ */
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_node.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+#include "gupcr_sync.h"
+
+/** GMEM LE handle */
+static ptl_handle_le_t gupcr_gmem_le;
+
+/** Thread's default shared heap size */
+#define GUPCR_GMEM_DEFAULT_HEAP_SIZE 256*1024*1024
+
+/** Shared memory base and size */
+void *gupcr_gmem_base;
+size_t gupcr_gmem_size;
+
+/** GET event tracking */
+gupcr_gmem_xfer_info_t gupcr_gmem_gets;
+/** PUT event tracking */
+gupcr_gmem_xfer_info_t gupcr_gmem_puts;
+
+/** PUT "bounce buffer" type */
+typedef char gupcr_gmem_put_bb_t[GUPCR_BOUNCE_BUFFER_SIZE];
+/** PUT "bounce buffer" space */
+static gupcr_gmem_put_bb_t gupcr_gmem_put_bb;
+/** PUT "bounce buffer" memory descriptor handle */
+static ptl_handle_md_t gupcr_gmem_put_bb_md;
+/** PUT "bounce buffer" used counter */
+size_t gupcr_gmem_put_bb_used;
+
+/** Previous operation was a strict put */
+int gupcr_pending_strict_put;
+
+/** Heap base offset relative to start of UPC shared region */
+size_t gupcr_gmem_heap_base_offset;
+
+/** Size of UPC shared region reserved for the heap */
+size_t gupcr_gmem_heap_size;
+
+/** Remote puts flow control */
+static const size_t gupcr_gmem_high_mark_puts = GUPCR_MAX_OUTSTANDING_PUTS;
+static const size_t gupcr_gmem_low_mark_puts = GUPCR_MAX_OUTSTANDING_PUTS / 2;
+
+/**
+ * Allocate memory for this thread's shared space contribution.
+ *
+ * Calculate needed memory size and let the node allocate
+ * shared memory and map other thread's shared memory into
+ * the current thread memory space.
+ */
+static void
+gupcr_gmem_alloc_shared (void)
+{
+ size_t heap_size = GUPCR_ROUND (gupcr_get_shared_heap_size (), C64K);
+ size_t data_size = GUPCR_ROUND (GUPCR_SHARED_SECTION_END -
+ GUPCR_SHARED_SECTION_START, C64K);
+ gupcr_gmem_heap_base_offset = data_size;
+ gupcr_gmem_heap_size = heap_size;
+ gupcr_gmem_size = heap_size + data_size;
+
+ /* Allocate this thread's shared space. */
+ gupcr_gmem_base = gupcr_node_local_alloc (gupcr_gmem_size);
+}
+
+/**
+ * Complete all outstanding remote GET operations.
+ *
+ * This procedure waits for all outstanding GET operations
+ * to complete. If the wait on the Portals GET counting event returns
+ * a failure, a full event queue is checked for failure specifics
+ * and the program aborts.
+ */
+void
+gupcr_gmem_sync_gets (void)
+{
+ /* Sync all outstanding local accesses. */
+ GUPCR_MEM_BARRIER ();
+ /* Sync all outstanding remote get accesses. */
+ if (gupcr_gmem_gets.num_pending > 0)
+ {
+ ptl_size_t num_initiated =
+ gupcr_gmem_gets.num_completed + gupcr_gmem_gets.num_pending;
+ ptl_ct_event_t ct;
+ gupcr_debug (FC_MEM, "outstanding gets: %lu",
+ (long unsigned) gupcr_gmem_gets.num_pending);
+ gupcr_portals_call (PtlCTWait,
+ (gupcr_gmem_gets.ct_handle, num_initiated, &ct));
+ gupcr_gmem_gets.num_pending = 0;
+ gupcr_gmem_gets.num_completed = num_initiated;
+ if (ct.failure > 0)
+ {
+ gupcr_process_fail_events (gupcr_gmem_gets.eq_handle);
+ gupcr_abort ();
+ }
+ }
+}
+
+/**
+ * Complete outstanding remote PUT operations.
+ *
+ * This procedure waits for all outstanding PUT operations
+ * to complete. If the wait on the Portals PUT counting event returns
+ * a failure, the full event queue is checked for failure specifics
+ * and the program aborts.
+ */
+void
+gupcr_gmem_sync_puts (void)
+{
+ /* Sync all outstanding local accesses. */
+ GUPCR_MEM_BARRIER ();
+ /* Sync all outstanding remote put accesses. */
+ if (gupcr_gmem_puts.num_pending > 0)
+ {
+ ptl_size_t num_initiated =
+ gupcr_gmem_puts.num_completed + gupcr_gmem_puts.num_pending;
+ ptl_ct_event_t ct;
+ gupcr_debug (FC_MEM, "outstanding puts: %lu",
+ (long unsigned) gupcr_gmem_puts.num_pending);
+ gupcr_portals_call (PtlCTWait,
+ (gupcr_gmem_puts.ct_handle, num_initiated, &ct));
+ gupcr_gmem_puts.num_pending = 0;
+ gupcr_gmem_puts.num_completed = num_initiated;
+ gupcr_pending_strict_put = 0;
+ gupcr_gmem_put_bb_used = 0;
+ if (ct.failure > 0)
+ {
+ gupcr_process_fail_events (gupcr_gmem_puts.eq_handle);
+ gupcr_abort ();
+ }
+ }
+}
+
+/**
+ * Complete all outstanding remote operations.
+ *
+ * Check and wait for completion of all PUT/GET operations.
+ */
+void
+gupcr_gmem_sync (void)
+{
+ gupcr_gmem_sync_gets ();
+ gupcr_gmem_sync_puts ();
+}
+
+/**
+ * Read data from remote shared memory.
+ *
+ * A GET request is broken into multiple PtlGet() requests
+ * if the number of requested bytes is greater than the
+ * configured maximum message size.
+ *
+ * @param [in] dest Local memory to receive remote data
+ * @param [in] thread Remote thread to request data from
+ * @param [in] offset Remote address
+ * @param [in] n Number of bytes to transfer
+ */
+void
+gupcr_gmem_get (void *dest, int thread, size_t offset, size_t n)
+{
+ ptl_process_t rpid;
+ char *dest_addr = (char *) (dest - USER_PROG_MEM_START);
+ size_t rem_offset = offset;
+ size_t n_rem = n;
+
+ gupcr_debug (FC_MEM, "%d:0x%lx 0x%lx",
+ thread, (long unsigned) offset, (long unsigned) dest);
+ rpid.rank = thread;
+ while (n_rem > 0)
+ {
+ size_t n_xfer;
+ n_xfer = GUPCR_MIN (n_rem, (size_t) GUPCR_PORTALS_MAX_MSG_SIZE);
+ ++gupcr_gmem_gets.num_pending;
+ gupcr_portals_call (PtlGet, (gupcr_gmem_gets.md,
+ (ptl_size_t) dest_addr, n_xfer, rpid,
+ GUPCR_PTL_PTE_GMEM, PTL_NO_MATCH_BITS,
+ rem_offset, PTL_NULL_USER_PTR));
+ n_rem -= n_xfer;
+ dest_addr += n_xfer;
+ rem_offset += n_xfer;
+ }
+}
+
+/**
+ * Write data to remote shared memory.
+ *
+ * For data requests smaller than the maximum safe size, the data is first
+ * copied into a bounce buffer. In this way, the put operation
+ * can be non-blocking and there are no restrictions placed upon
+ * the caller's use of the source data buffer.
+ * Otherwise, a synchronous operation is performed
+ * and this function returns to the caller after the operation completes.
+ *
+ * @param [in] thread Destination thread
+ * @param [in] offset Destination offset
+ * @param [in] src Local source pointer to data
+ * @param [in] n Number of bytes to transfer
+ */
+void
+gupcr_gmem_put (int thread, size_t offset, const void *src, size_t n)
+{
+ int must_sync = (n > GUPCR_GMEM_MAX_SAFE_PUT_SIZE);
+ char *src_addr = (char *) src;
+ size_t n_rem = n;
+ ptl_process_t rpid;
+ gupcr_debug (FC_MEM, "0x%lx %d:0x%lx",
+ (long unsigned) src, thread, (long unsigned) offset);
+ rpid.rank = thread;
+ /* Large puts must be synchronous, to ensure that it is
+ safe to re-use the source buffer upon return. */
+ while (n_rem > 0)
+ {
+ size_t n_xfer;
+ ptl_handle_md_t md_handle;
+ ptl_size_t local_offset;
+ n_xfer = GUPCR_MIN (n_rem, (size_t) GUPCR_PORTALS_MAX_MSG_SIZE);
+ if (must_sync)
+ {
+ local_offset = src_addr - (char *) USER_PROG_MEM_START;
+ md_handle = gupcr_gmem_puts.md;
+ }
+ else if (n_rem <= GUPCR_PORTALS_MAX_VOLATILE_SIZE)
+ {
+ local_offset = src_addr - (char *) USER_PROG_MEM_START;
+ md_handle = gupcr_gmem_puts.md_volatile;
+ }
+ else
+ {
+ char *bounce_buf;
+ /* If this transfer will overflow the bounce buffer,
+ then first wait for all outstanding puts to complete. */
+ if ((gupcr_gmem_put_bb_used + n_xfer) > GUPCR_BOUNCE_BUFFER_SIZE)
+ gupcr_gmem_sync_puts ();
+ bounce_buf = &gupcr_gmem_put_bb[gupcr_gmem_put_bb_used];
+ memcpy (bounce_buf, src_addr, n_xfer);
+ local_offset = bounce_buf - gupcr_gmem_put_bb;
+ gupcr_gmem_put_bb_used += n_xfer;
+ md_handle = gupcr_gmem_put_bb_md;
+ }
+ ++gupcr_gmem_puts.num_pending;
+ gupcr_portals_call (PtlPut, (md_handle, local_offset, n_xfer,
+ PTL_ACK_REQ, rpid,
+ GUPCR_PTL_PTE_GMEM, PTL_NO_MATCH_BITS,
+ offset, PTL_NULL_USER_PTR,
+ PTL_NULL_HDR_DATA));
+ n_rem -= n_xfer;
+ src_addr += n_xfer;
+
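+      /* Simple flow control: once the number of outstanding puts
+         reaches the high-water mark, wait on the put counting event
+         until no more than the low-water mark remain outstanding.  */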
+ if (gupcr_gmem_puts.num_pending == gupcr_gmem_high_mark_puts)
+ {
+ ptl_ct_event_t ct;
+ size_t complete_cnt;
+ size_t wait_cnt = gupcr_gmem_puts.num_completed
+ + gupcr_gmem_puts.num_pending
+ - gupcr_gmem_low_mark_puts;
+ gupcr_portals_call (PtlCTWait,
+ (gupcr_gmem_puts.ct_handle, wait_cnt, &ct));
+ if (ct.failure > 0)
+ {
+ gupcr_process_fail_events (gupcr_gmem_puts.eq_handle);
+ gupcr_abort ();
+ }
+ complete_cnt = ct.success - gupcr_gmem_puts.num_completed;
+ gupcr_gmem_puts.num_pending -= complete_cnt;
+ gupcr_gmem_puts.num_completed = ct.success;
+ }
+ }
+ if (must_sync)
+ gupcr_gmem_sync_puts ();
+}
+
+/**
+ * Copy remote shared memory from the source thread
+ * to the destination thread.
+ *
+ * Bulk copy from one thread to another.
+ * The put bounce buffer is used as an intermediate buffer.
+ * Caller assumes responsibility for checking the validity
+ * of the remote thread ids and/or shared memory offsets.
+ *
+ * @param [in] dthread Destination thread
+ * @param [in] doffset Destination offset
+ * @param [in] sthread Source thread
+ * @param [in] soffset Source offset
+ * @param [in] n Number of bytes to transfer
+ */
+void
+gupcr_gmem_copy (int dthread, size_t doffset,
+ int sthread, size_t soffset, size_t n)
+{
+ size_t n_rem = n;
+ ptl_size_t dest_addr = doffset;
+ ptl_size_t src_addr = soffset;
+ ptl_process_t dpid;
+ gupcr_debug (FC_MEM, "%d:0x%lx %d:0x%lx %lu",
+ sthread, (long unsigned) soffset,
+ dthread, (long unsigned) doffset,
+ (long unsigned) n);
+ dpid.rank = dthread;
+ while (n_rem > 0)
+ {
+ size_t n_xfer;
+ char *bounce_buf;
+ ptl_size_t local_offset;
+ /* Use the entire put "bounce buffer" if the transfer
+ count is sufficiently large. */
+ n_xfer = GUPCR_MIN (n_rem, GUPCR_BOUNCE_BUFFER_SIZE);
+ if ((gupcr_gmem_put_bb_used + n_xfer) > GUPCR_BOUNCE_BUFFER_SIZE)
+ gupcr_gmem_sync_puts ();
+ bounce_buf = &gupcr_gmem_put_bb[gupcr_gmem_put_bb_used];
+ gupcr_gmem_put_bb_used += n_xfer;
+ /* Read the source data into the bounce buffer. */
+ gupcr_gmem_get (bounce_buf, sthread, src_addr, n_xfer);
+ gupcr_gmem_sync_gets ();
+ local_offset = bounce_buf - gupcr_gmem_put_bb;
+ ++gupcr_gmem_puts.num_pending;
+ gupcr_portals_call (PtlPut, (gupcr_gmem_put_bb_md, local_offset, n_xfer,
+ PTL_ACK_REQ, dpid,
+ GUPCR_PTL_PTE_GMEM, PTL_NO_MATCH_BITS,
+ dest_addr, PTL_NULL_USER_PTR,
+ PTL_NULL_HDR_DATA));
+ n_rem -= n_xfer;
+ src_addr += n_xfer;
+ dest_addr += n_xfer;
+ }
+}
+
+/**
+ * Write the same byte value into the bytes of the
+ * destination thread's memory at the specified offset.
+ *
+ * The put bounce buffer is used as an intermediate buffer.
+ * The last write of a chunk of data is non-blocking.
+ * Caller assumes responsibility for checking the validity
+ * of the remote thread ids and/or shared memory offsets.
+ *
+ * @param [in] thread Destination thread
+ * @param [in] offset Destination offset
+ * @param [in] c Set value
+ * @param [in] n Number of bytes to transfer
+ */
+void
+gupcr_gmem_set (int thread, size_t offset, int c, size_t n)
+{
+ size_t n_rem = n;
+ int already_filled = 0;
+ ptl_size_t dest_addr = offset;
+ ptl_process_t rpid;
+ gupcr_debug (FC_MEM, "0x%x %d:0x%lx %lu", c, thread,
+ (long unsigned) offset, (long unsigned) n);
+ rpid.rank = thread;
+ while (n_rem > 0)
+ {
+ size_t n_xfer;
+ char *bounce_buf;
+ ptl_size_t local_offset;
+ /* Use the entire put "bounce buffer" if the transfer
+ count is sufficiently large. */
+ n_xfer = GUPCR_MIN (n_rem, (size_t) GUPCR_BOUNCE_BUFFER_SIZE);
+ if ((gupcr_gmem_put_bb_used + n_xfer) > GUPCR_BOUNCE_BUFFER_SIZE)
+ gupcr_gmem_sync_puts ();
+ bounce_buf = &gupcr_gmem_put_bb[gupcr_gmem_put_bb_used];
+ gupcr_gmem_put_bb_used += n_xfer;
+ /* Fill the bounce buffer, if we haven't already. */
+ if (!already_filled)
+ {
+ memset (bounce_buf, c, n_xfer);
+ already_filled = (bounce_buf == gupcr_gmem_put_bb
+ && n_xfer == GUPCR_BOUNCE_BUFFER_SIZE);
+ }
+ local_offset = bounce_buf - gupcr_gmem_put_bb;
+ ++gupcr_gmem_puts.num_pending;
+ gupcr_portals_call (PtlPut, (gupcr_gmem_put_bb_md, local_offset, n_xfer,
+ PTL_ACK_REQ, rpid,
+ GUPCR_PTL_PTE_GMEM, PTL_NO_MATCH_BITS,
+ dest_addr, PTL_NULL_USER_PTR,
+ PTL_NULL_HDR_DATA));
+ n_rem -= n_xfer;
+ dest_addr += n_xfer;
+ }
+}
+
+/**
+ * Initialize gmem resources.
+ * @ingroup INIT
+ */
+void
+gupcr_gmem_init (void)
+{
+ ptl_md_t md, md_volatile;
+ ptl_le_t le;
+ ptl_pt_index_t pte;
+ gupcr_log (FC_MEM, "gmem init called");
+ /* Allocate memory for this thread's contribution to shared memory. */
+ gupcr_gmem_alloc_shared ();
+ gupcr_portals_call (PtlPTAlloc,
+ (gupcr_ptl_ni, 0,
+ PTL_EQ_NONE, GUPCR_PTL_PTE_GMEM, &pte));
+ if (pte != GUPCR_PTL_PTE_GMEM)
+ gupcr_fatal_error ("cannot allocate PTE GUPCR_PTL_PTE_GMEM");
+ gupcr_log (FC_MEM, "Gmem PTE allocated: %d", GUPCR_PTL_PTE_GMEM);
+ /* Setup Gmem LE. */
+ le.start = gupcr_gmem_base;
+ le.length = gupcr_gmem_size;
+ le.ct_handle = PTL_CT_NONE;
+ le.uid = PTL_UID_ANY;
+ le.options = PTL_LE_OP_PUT | PTL_LE_OP_GET;
+ gupcr_portals_call (PtlLEAppend,
+ (gupcr_ptl_ni,
+ GUPCR_PTL_PTE_GMEM, &le,
+ PTL_PRIORITY_LIST, NULL, &gupcr_gmem_le));
+ gupcr_debug (FC_MEM, "Gmem LE created at 0x%lx with size 0x%lx)",
+ (long unsigned) gupcr_gmem_base,
+ (long unsigned) gupcr_gmem_size);
+ /* Initialize GMEM get lists */
+ gupcr_gmem_gets.num_pending = 0;
+ gupcr_gmem_gets.num_completed = 0;
+ gupcr_gmem_gets.md_options =
+ PTL_MD_EVENT_CT_REPLY | PTL_MD_EVENT_SUCCESS_DISABLE;
+ /* Allocate at least THREADS number of EQ entries. */
+ gupcr_portals_call (PtlEQAlloc,
+ (gupcr_ptl_ni, THREADS, &gupcr_gmem_gets.eq_handle));
+ gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_gmem_gets.ct_handle));
+ /* Map user's address space for GET operations. */
+ md.length = (ptl_size_t) USER_PROG_MEM_SIZE;
+ md.start = (void *) USER_PROG_MEM_START;
+ md.options = gupcr_gmem_gets.md_options;
+ md.eq_handle = gupcr_gmem_gets.eq_handle;
+ md.ct_handle = gupcr_gmem_gets.ct_handle;
+ gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_gmem_gets.md));
+ /* Initialize GMEM put lists. */
+ gupcr_gmem_puts.num_pending = 0;
+ gupcr_gmem_puts.num_completed = 0;
+ gupcr_gmem_puts.md_options =
+ PTL_MD_EVENT_CT_ACK | PTL_MD_EVENT_SUCCESS_DISABLE;
+ /* Allocate at least THREADS number of EQ entries. */
+ gupcr_portals_call (PtlEQAlloc,
+ (gupcr_ptl_ni, THREADS, &gupcr_gmem_puts.eq_handle));
+ gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_gmem_puts.ct_handle));
+ /* Map user's address space for PUT operations. */
+ md.length = (ptl_size_t) USER_PROG_MEM_SIZE;
+ md.start = (void *) USER_PROG_MEM_START;
+ md.options = gupcr_gmem_puts.md_options;
+ md.eq_handle = gupcr_gmem_puts.eq_handle;
+ md.ct_handle = gupcr_gmem_puts.ct_handle;
+ gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_gmem_puts.md));
+ /* And map the same but with a volatile option. */
+ md_volatile = md;
+ md_volatile.options |= PTL_MD_VOLATILE;
+ gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md_volatile,
+ &gupcr_gmem_puts.md_volatile));
+ /* Initialize GMEM put bounce buffer. */
+ md.length = GUPCR_BOUNCE_BUFFER_SIZE;
+ md.start = gupcr_gmem_put_bb;
+ md.options = gupcr_gmem_puts.md_options;
+ md.eq_handle = gupcr_gmem_puts.eq_handle;
+ md.ct_handle = gupcr_gmem_puts.ct_handle;
+ gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_gmem_put_bb_md));
+}
+
+/**
+ * Release gmem resources.
+ * @ingroup INIT
+ */
+void
+gupcr_gmem_fini (void)
+{
+ gupcr_log (FC_MEM, "gmem fini called");
+ /* Release GET MD. */
+ gupcr_portals_call (PtlMDRelease, (gupcr_gmem_gets.md));
+ gupcr_portals_call (PtlCTFree, (gupcr_gmem_gets.ct_handle));
+ gupcr_portals_call (PtlEQFree, (gupcr_gmem_gets.eq_handle));
+ /* Release PUT MDs. */
+ gupcr_portals_call (PtlMDRelease, (gupcr_gmem_puts.md));
+ gupcr_portals_call (PtlMDRelease, (gupcr_gmem_put_bb_md));
+ gupcr_portals_call (PtlCTFree, (gupcr_gmem_puts.ct_handle));
+ gupcr_portals_call (PtlEQFree, (gupcr_gmem_puts.eq_handle));
+ /* Release LEs and PTEs. */
+ gupcr_portals_call (PtlLEUnlink, (gupcr_gmem_le));
+ gupcr_portals_call (PtlPTFree, (gupcr_ptl_ni, GUPCR_PTL_PTE_GMEM));
+}
+
+/** @} */
===================================================================
@@ -0,0 +1,132 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+ This file is part of the UPC runtime Library.
+ Written by Gary Funck <gary@intrepid.com>
+ and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#ifndef _GUPCR_GMEM_H_
+#define _GUPCR_GMEM_H_
+
+/**
+ * @file gupcr_gmem.h
+ * GUPC Portals4 shared memory interface.
+ */
+
+/**
+ * @addtogroup GMEM GUPCR Shared Memory Access
+ * @{
+ */
+
+/* Configuration-defined limits. */
+/** Maximum size of a message that uses the put bounce buffer. */
+#define GUPCR_GMEM_MAX_SAFE_PUT_SIZE (1*KILOBYTE)
+
+/** Max size of the user program.
+ *
+ * To simplify management of memory descriptors, the entire user
+ * program address space is mapped into one memory descriptor per
+ * transfer direction.
+ * Per the Linux kernel document Documentation/x86/x86_64/mm.txt,
+ * the maximum user address-space size is 0x8000_0000_0000.
+ */
+#define USER_PROG_MEM_SIZE 0x0000800000000000
+/** Beginning of the user program */
+#define USER_PROG_MEM_START NULL
+
+//begin lib_inline_gmem
+/** Check whether the shared memory of the specified thread can be
+   accessed as a node-local reference. */
+#define GUPCR_GMEM_IS_LOCAL(thr) (gupcr_node_map[thr] != NULL)
+/** Convert a pointer-to-shared address field into a local address. */
+#define GUPCR_GMEM_OFF_TO_LOCAL(thr,off) (gupcr_node_map[thr] + off)
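+/* Illustrative note: when GUPCR_GMEM_IS_LOCAL (thr) holds, a shared
+   access to (thr, off) can be performed with an ordinary load or store
+   through GUPCR_GMEM_OFF_TO_LOCAL (thr, off); otherwise it is performed
+   through the gupcr_gmem_get/gupcr_gmem_put interfaces.  */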
+
+/** GMEM shared memory base */
+extern void *gupcr_gmem_base;
+//end lib_inline_gmem
+
+/** GMEM shared memory size */
+extern ptl_size_t gupcr_gmem_size;
+
+/** GMEM get/put information tracking.
+ *
+ * Track the information required to access global
+ * memory in a given direction (get/put) using non-blocking
+ * 'get' and 'put' functions.
+ */
+typedef struct gupcr_gmem_xfer_info_struct
+{
+ /** Number of pending operations */
+ ptl_size_t num_pending;
+ /** Number of completed operations */
+ ptl_size_t num_completed;
+ /** Memory descriptor options */
+ unsigned int md_options;
+ /** Memory descriptor event handle */
+ ptl_handle_eq_t eq_handle;
+ /** Memory descriptor counting events handle */
+ ptl_handle_ct_t ct_handle;
+ /** Memory descriptor handle */
+ ptl_handle_md_t md;
+ /** Volatile memory descriptor handle */
+ ptl_handle_md_t md_volatile;
+} gupcr_gmem_xfer_info_t;
+/** GET/PUT information tracking pointer type */
+typedef gupcr_gmem_xfer_info_t *gupcr_gmem_xfer_info_p;
+
+/** GET transfer tracking */
+extern gupcr_gmem_xfer_info_t gupcr_gmem_gets;
+/** PUT transfer tracking */
+extern gupcr_gmem_xfer_info_t gupcr_gmem_puts;
+
+/** PUT "bounce buffer" bytes in use */
+extern size_t gupcr_gmem_put_bb_used;
+
+//begin lib_gmem
+extern void gupcr_gmem_sync (void);
+//end lib_gmem
+
+//begin lib_inline_gmem
+
+/** If TRUE, a strict PUT operation is pending */
+extern int gupcr_pending_strict_put;
+
+extern void gupcr_gmem_sync_gets (void);
+extern void gupcr_gmem_sync_puts (void);
+extern void gupcr_gmem_get (void *dest, int rthread, size_t roffset,
+ size_t n);
+extern void gupcr_gmem_put (int rthread, size_t roffset, const void *src,
+ size_t n);
+extern void gupcr_gmem_copy (int dthread, size_t doffset, int sthread,
+ size_t soffset, size_t n);
+extern void gupcr_gmem_set (int dthread, size_t doffset, int c, size_t n);
+
+//end lib_inline_gmem
+
+extern size_t gupcr_gmem_heap_base_offset;
+extern size_t gupcr_gmem_heap_size;
+
+extern void gupcr_gmem_init (void);
+extern void gupcr_gmem_fini (void);
+
+/** @} */
+#endif /* gupcr_gmem.h */
===================================================================
@@ -0,0 +1,72 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+ This file is part of the UPC runtime library.
+ Written by Gary Funck <gary@intrepid.com>
+ and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+/**
+ * @file gupcr_lib.h
+ * GUPC Runtime definitions of user-visible UPC routines.
+ */
+
+#ifndef _GUPCR_LIB_H_
+#define _GUPCR_LIB_H_
+
+/* Definition of user-visible UPC library routines,
+ in a form that they can be called from the
+ "C"-based runtime. */
+
+extern size_t upc_threadof (upc_shared_ptr_t);
+extern size_t upc_phaseof (upc_shared_ptr_t);
+extern upc_shared_ptr_t upc_resetphase (upc_shared_ptr_t);
+extern size_t upc_addrfield (upc_shared_ptr_t);
+extern size_t upc_affinitysize (size_t, size_t, size_t);
+
+extern void upc_global_exit (int);
+
+extern void upc_memcpy (upc_shared_ptr_t dest, upc_shared_ptr_t src,
+ size_t n);
+extern void upc_memget (void *dest, upc_shared_ptr_t src, size_t n);
+extern void upc_memput (upc_shared_ptr_t dest, const void *src, size_t n);
+extern void upc_memset (upc_shared_ptr_t dest, int c, size_t n);
+
+extern upc_shared_ptr_t upc_global_alloc (size_t, size_t);
+extern upc_shared_ptr_t upc_all_alloc (size_t, size_t);
+extern upc_shared_ptr_t upc_alloc (size_t);
+extern void upc_free (upc_shared_ptr_t);
+extern void upc_all_free (upc_shared_ptr_t);
+
+extern upc_shared_ptr_t upc_lock_alloc (void);
+extern void upc_lock_free (upc_shared_ptr_t);
+extern void upc_all_lock_free (upc_shared_ptr_t);
+extern upc_shared_ptr_t upc_all_lock_alloc (void);
+extern upc_shared_ptr_t upc_global_lock_alloc (void);
+extern void upc_lock (upc_shared_ptr_t);
+extern int upc_lock_attempt (upc_shared_ptr_t);
+extern void upc_unlock (upc_shared_ptr_t);
+
+typedef uint64_t upc_tick_t;
+extern upc_tick_t upc_ticks_now (void);
+extern uint64_t upc_ticks_to_ns (upc_tick_t ticks);
+
+#endif /* gupcr_lib.h */