From patchwork Tue Nov 17 16:28:06 2015
Subject: Re: [gomp4, ptx] worker & gang complex double reductions
From: Nathan Sidwell
To: GCC Patches
Date: Tue, 17 Nov 2015 11:28:06 -0500
Message-ID: <564B5596.4000404@acm.org>
In-Reply-To: <564A538F.4000506@acm.org>

On 11/16/15 17:07, Nathan Sidwell wrote:
> I've committed this patch to the gomp4 branch.  It adds support for
> worker and gang level complex double reductions.

I was unsatisfied with that approach, so I've separated the two
mechanisms into different functions with the attached patch.

The locking scheme returns to the early variant (but using the
cmp&swap builtin):

  while (cmp&swap (&lock, 0, 1))
    continue;
  T accum = *ptr;
  accum = accum OP myval;
  *ptr = accum;
  cmp&swap (&lock, 1, 0);
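For illustration only, here is that scheme written as host-side C,
using GCC's __sync_val_compare_and_swap builtin in place of the PTX
cmp&swap builtin.  The lock_var global and the lockfull_add wrapper
are made-up names for this sketch, not what the compiler emits:

  #include <stdio.h>

  /* lock_var stands in for the lock cell the compiler allocates in
     global memory.  */
  static unsigned lock_var;

  /* Lock-based update for a type too wide for cmp&swap.  */
  static _Complex double
  lockfull_add (_Complex double *ptr, _Complex double val)
  {
    /* Acquire: spin until we are the thread that swaps 0 -> 1.  */
    while (__sync_val_compare_and_swap (&lock_var, 0, 1))
      continue;

    _Complex double accum = *ptr;   /* T accum = *ptr;         */
    accum += val;                   /* accum = accum OP myval; */
    *ptr = accum;                   /* *ptr = accum;           */

    /* Release: swap 1 -> 0.  */
    __sync_val_compare_and_swap (&lock_var, 1, 0);
    return accum;
  }

  int
  main (void)
  {
    _Complex double z = 1.0;
    lockfull_add (&z, 2.0);
    printf ("%g\n", __real__ z);    /* prints 3 */
    return 0;
  }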
A new dispatcher function decides which approach to take, and that's
where we can add atomic optimization smarts and the like.

nathan

2015-11-17  Nathan Sidwell

	* config/nvptx/nvptx.c (nvptx_lockless_update): Remove complex
	double handling here ...
	(nvptx_lockfull_update): ... to this new function.
	(nvptx_reduction_update): New function.
	(nvptx_goacc_reduction_fini): Call it.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 230463)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -4354,11 +4354,10 @@ nvptx_global_lock_addr ()
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-differnt-to guess);
+   return write;
 
-   Unfortunately for types larger than 64 bits, there is no cmp&swap
-   instruction.  We use a lock variable in global memory to synthesize
-   the above sequence.  (A lock in global memory is necessary to force
-   execution engine descheduling and avoid resource starvation.)  */
+   This relies on a cmp&swap instruction, which is available for 32-
+   and 64-bit types.  Larger types must use a locking scheme.  */
 
 static tree
 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
@@ -4368,20 +4367,9 @@ nvptx_lockless_update (location_t loc, g
   tree_code code = NOP_EXPR;
   tree arg_type = unsigned_type_node;
   tree var_type = TREE_TYPE (var);
-  tree dest_type = var_type;
-  tree inner_type = NULL_TREE; /* Non-null if synthesizing cmp&swap.  */
 
-  if (TREE_CODE (var_type) == COMPLEX_TYPE)
-    {
-      if (TYPE_SIZE (TREE_TYPE (var_type))
-	  == TYPE_SIZE (long_long_unsigned_type_node))
-	/* Must do by parts.  */
-	var_type = TREE_TYPE (var_type);
-      else
-	code = VIEW_CONVERT_EXPR;
-    }
-
-  if (TREE_CODE (var_type) == REAL_TYPE)
+  if (TREE_CODE (var_type) == COMPLEX_TYPE
+      || TREE_CODE (var_type) == REAL_TYPE)
     code = VIEW_CONVERT_EXPR;
 
   if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
@@ -4390,31 +4378,21 @@ nvptx_lockless_update (location_t loc, g
       fn = NVPTX_BUILTIN_CMP_SWAPLL;
     }
 
-  if (var_type != dest_type)
-    {
-      inner_type = arg_type;
-      arg_type = dest_type;
-      /* We use the cmp&swap insn to do the global locking.  */
-      fn = NVPTX_BUILTIN_CMP_SWAP;
-    }
-
   tree swap_fn = nvptx_builtin_decl (fn, true);
-
   /* Build and insert the initialization sequence.  */
   gimple_seq init_seq = NULL;
   tree init_var = make_ssa_name (arg_type);
-  tree init_expr = omp_reduction_init_op (loc, op, dest_type);
-  if (arg_type != dest_type)
-    init_expr = fold_build1 (code, arg_type, init_expr);
+  tree init_expr = omp_reduction_init_op (loc, op, var_type);
+  init_expr = fold_build1 (code, arg_type, init_expr);
   gimplify_assign (init_var, init_expr, &init_seq);
   gimple *init_end = gimple_seq_last (init_seq);
   gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
-  
+
   /* Split the block just after the init stmts.  */
   basic_block pre_bb = gsi_bb (*gsi);
   edge pre_edge = split_block (pre_bb, init_end);
-  basic_block head_bb = pre_edge->dest;
+  basic_block loop_bb = pre_edge->dest;
   pre_bb = pre_edge->src;
   /* Reset the iterator.  */
   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
 
@@ -4422,179 +4400,166 @@ nvptx_lockless_update (location_t loc, g
   tree expect_var = make_ssa_name (arg_type);
   tree actual_var = make_ssa_name (arg_type);
   tree write_var = make_ssa_name (arg_type);
-  tree lock_state = NULL_TREE;
-  tree uns_unlocked = NULL_TREE, uns_locked = NULL_TREE;
-  
+
   /* Build and insert the reduction calculation.  */
   gimple_seq red_seq = NULL;
-  if (inner_type)
-    {
-      /* Unlock the lock using cmp&swap with an appropriate expected
-	 value.  This ends up with us unlocking only on subsequent
-	 iterations.  */
-      lock_state = make_ssa_name (unsigned_type_node);
-      uns_unlocked = build_int_cst (unsigned_type_node, 0);
-      uns_locked = build_int_cst (unsigned_type_node, 1);
-
-      tree unlock_expr = nvptx_global_lock_addr ();
-      unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
-					 lock_state, uns_unlocked);
-      gimplify_and_add (unlock_expr, &red_seq);
-    }
-
-  tree write_expr = expect_var;
-  if (arg_type != dest_type)
-    write_expr = fold_build1 (code, dest_type, expect_var);
-  write_expr = fold_build2 (op, dest_type, write_expr, var);
-  if (arg_type != dest_type)
-    write_expr = fold_build1 (code, arg_type, write_expr);
+  tree write_expr = fold_build1 (code, var_type, expect_var);
+  write_expr = fold_build2 (op, var_type, write_expr, var);
+  write_expr = fold_build1 (code, arg_type, write_expr);
   gimplify_assign (write_var, write_expr, &red_seq);
-  gimple *red_end = gimple_seq_last (red_seq);
   gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
 
-  basic_block latch_bb = head_bb;
-  basic_block lock_bb = NULL;
-
-  /* Build the cmp&swap sequence.  */
-  gcond *cond;
-  tree cond_var, cond_val, swap_expr;
+  /* Build & insert the cmp&swap sequence.  */
   gimple_seq latch_seq = NULL;
-  if (inner_type)
-    {
-      /* Here we have to insert another loop, spinning on acquiring
-	 the global lock.  Lock releasing is sone at the head of the
-	 main loop, or in the block following the loop.  */
-
-      /* Split the block just after the reduction stmts.  */
-      edge lock_edge = split_block (head_bb, red_end);
-      lock_bb = lock_edge->dest;
-      head_bb = lock_edge->src;
-      *gsi = gsi_for_stmt (gsi_stmt (*gsi));
-
-      /* Create & insert the lock sequence.  */
-      gimple_seq lock_seq = NULL;
-      tree locked = make_ssa_name (unsigned_type_node);
-      tree lock_expr = nvptx_global_lock_addr ();
-      lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
-				       uns_unlocked, uns_locked);
-      gimplify_assign (locked, lock_expr, &lock_seq);
-      cond = gimple_build_cond (EQ_EXPR, locked, uns_unlocked,
-				NULL_TREE, NULL_TREE);
-      gimple_seq_add_stmt (&lock_seq, cond);
-
-      gimple *lock_end = gimple_seq_last (lock_seq);
-      gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
-
-      /* Split the block just after the lock sequence.  */
-      edge locked_edge = split_block (lock_bb, lock_end);
-      latch_bb = locked_edge->dest;
-      lock_bb = locked_edge->src;
-      *gsi = gsi_for_stmt (gsi_stmt (*gsi));
-
-      /* Make lock_bb a loop.  */
-      locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
-      make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
-      set_immediate_dominator (CDI_DOMINATORS, lock_bb, head_bb);
-      set_immediate_dominator (CDI_DOMINATORS, latch_bb, lock_bb);
-
-      /* Read the location.  */
-      tree ref = build_simple_mem_ref (ptr);
-      TREE_THIS_VOLATILE (ref) = 1;
-      gimplify_assign (actual_var, ref, &latch_seq);
-
-      /* Determine equality by extracting the real & imaginary parts,
-	 punning to an integral type and then using xor & or to create
-	 a zero or non-zero value we can use in a comparison.  */
-      tree act_real = fold_build1 (REALPART_EXPR, var_type, actual_var);
-      tree act_imag = fold_build1 (IMAGPART_EXPR, var_type, actual_var);
-      tree exp_real = fold_build1 (REALPART_EXPR, var_type, expect_var);
-      tree exp_imag = fold_build1 (IMAGPART_EXPR, var_type, expect_var);
-
-      act_real = fold_build1 (code, inner_type, act_real);
-      act_imag = fold_build1 (code, inner_type, act_imag);
-      exp_real = fold_build1 (code, inner_type, exp_real);
-      exp_imag = fold_build1 (code, inner_type, exp_imag);
-
-      tree cmp_real = fold_build2 (BIT_XOR_EXPR, inner_type,
-				   act_real, exp_real);
-      tree cmp_imag = fold_build2 (BIT_XOR_EXPR, inner_type,
-				   act_imag, exp_imag);
-      swap_expr = fold_build2 (BIT_IOR_EXPR, inner_type, cmp_real, cmp_imag);
+  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
+					ptr, expect_var, write_var);
+  gimplify_assign (actual_var, swap_expr, &latch_seq);
 
-      cond_var = make_ssa_name (inner_type);
-      cond_val = build_int_cst (inner_type, 0);
-    }
-  else
-    {
-      swap_expr = build_call_expr_loc (loc, swap_fn, 3,
-				       ptr, expect_var, write_var);
-      cond_var = actual_var;
-      cond_val = expect_var;
-    }
-
-  gimplify_assign (cond_var, swap_expr, &latch_seq);
-  cond = gimple_build_cond (EQ_EXPR, cond_var, cond_val, NULL_TREE, NULL_TREE);
+  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
+				   NULL_TREE, NULL_TREE);
   gimple_seq_add_stmt (&latch_seq, cond);
-
   /* Insert the latch statements.  */
   gimple *latch_end = gimple_seq_last (latch_seq);
   gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
 
   /* Split the block just after the latch stmts.  */
-  edge post_edge = split_block (latch_bb, latch_end);
+  edge post_edge = split_block (loop_bb, latch_end);
   basic_block post_bb = post_edge->dest;
-  latch_bb = post_edge->src;
+  loop_bb = post_edge->src;
   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
-
   /* Create the loop.  */
   post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
-  edge loop_edge = make_edge (latch_bb, head_bb, EDGE_FALSE_VALUE);
-  set_immediate_dominator (CDI_DOMINATORS, head_bb, pre_bb);
-  set_immediate_dominator (CDI_DOMINATORS, post_bb, latch_bb);
+  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
+  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
+  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
 
-  gphi *phi = create_phi_node (expect_var, head_bb);
+  gphi *phi = create_phi_node (expect_var, loop_bb);
   add_phi_arg (phi, init_var, pre_edge, loc);
   add_phi_arg (phi, actual_var, loop_edge, loc);
 
-  loop *update_loop = alloc_loop ();
-  update_loop->header = head_bb;
-  update_loop->latch = latch_bb;
-  update_loop->nb_iterations_estimate = 1;
-  update_loop->any_estimate = true;
-  add_loop (update_loop, head_bb->loop_father);
+  loop *loop = alloc_loop ();
+  loop->header = loop_bb;
+  loop->latch = loop_bb;
+  add_loop (loop, loop_bb->loop_father);
 
-  if (inner_type)
-    {
-      phi = create_phi_node (lock_state, head_bb);
-      add_phi_arg (phi, uns_unlocked, pre_edge, loc);
-      add_phi_arg (phi, uns_locked, loop_edge, loc);
+  return fold_build1 (code, var_type, write_var);
+}
 
-      /* Insert store and unlock.  */
-      gimple_seq post_seq = NULL;
+/* Insert code to lockfully update *PTR with *PTR OP VAR just before
+   GSI.  This is necessary for types larger than 64 bits, where there
+   is no cmp&swap instruction to implement a lockless scheme.  We use
+   a lock variable in global memory.
+
+     while (cmp&swap (&lock_var, 0, 1))
+       continue;
+     T accum = *ptr;
+     accum = accum OP var;
+     *ptr = accum;
+     cmp&swap (&lock_var, 1, 0);
+     return accum;
+
+   A lock in global memory is necessary to force execution engine
+   descheduling and avoid resource starvation that can occur if the
+   lock is in .shared memory.  */
 
-      /* Write the location and release the lock.  */
-      tree ref = build_simple_mem_ref (ptr);
-      TREE_THIS_VOLATILE (ref) = 1;
-      gimplify_assign (ref, write_var, &post_seq);
+static tree
+nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
+		       tree ptr, tree var, tree_code op)
+{
+  tree var_type = TREE_TYPE (var);
+  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
+  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
+  tree uns_locked = build_int_cst (unsigned_type_node, 1);
+
+  /* Split the block just before the gsi.  Insert a gimple nop to make
+     this easier.  */
+  gimple *nop = gimple_build_nop ();
+  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
+  basic_block entry_bb = gsi_bb (*gsi);
+  edge entry_edge = split_block (entry_bb, nop);
+  basic_block lock_bb = entry_edge->dest;
+  /* Reset the iterator.  */
+  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
 
-      tree unlock_expr = nvptx_global_lock_addr ();
-      unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
-					 uns_locked, uns_unlocked);
-      gimplify_and_add (unlock_expr, &post_seq);
-
-      gsi_insert_seq_before (gsi, post_seq, GSI_SAME_STMT);
-
-      loop *lock_loop = alloc_loop ();
-      lock_loop->header = lock_loop->latch = lock_bb;
-      lock_loop->nb_iterations_estimate = 1;
-      lock_loop->any_estimate = true;
-      add_loop (lock_loop, update_loop);
-    }
+  /* Build and insert the locking sequence.  */
+  gimple_seq lock_seq = NULL;
+  tree lock_var = make_ssa_name (unsigned_type_node);
+  tree lock_expr = nvptx_global_lock_addr ();
+  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
+				   uns_unlocked, uns_locked);
+  gimplify_assign (lock_var, lock_expr, &lock_seq);
+  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
+				   NULL_TREE, NULL_TREE);
+  gimple_seq_add_stmt (&lock_seq, cond);
+  gimple *lock_end = gimple_seq_last (lock_seq);
+  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
+
+  /* Split the block just after the lock sequence.  */
+  edge locked_edge = split_block (lock_bb, lock_end);
+  basic_block update_bb = locked_edge->dest;
+  lock_bb = locked_edge->src;
+  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+  /* Create the lock loop ... */
+  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
+  make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
+  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
+  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
+
+  /* ... and the loop structure.  */
+  loop *lock_loop = alloc_loop ();
+  lock_loop->header = lock_bb;
+  lock_loop->latch = lock_bb;
+  lock_loop->nb_iterations_estimate = 1;
+  lock_loop->any_estimate = true;
+  add_loop (lock_loop, entry_bb->loop_father);
 
-  if (dest_type != arg_type)
-    write_var = fold_build1 (code, dest_type, write_var);
-  return write_var;
+  /* Build and insert the reduction calculation.  */
+  gimple_seq red_seq = NULL;
+  tree acc_in = make_ssa_name (var_type);
+  tree ref_in = build_simple_mem_ref (ptr);
+  TREE_THIS_VOLATILE (ref_in) = 1;
+  gimplify_assign (acc_in, ref_in, &red_seq);
+
+  tree acc_out = make_ssa_name (var_type);
+  tree update_expr = fold_build2 (op, var_type, ref_in, var);
+  gimplify_assign (acc_out, update_expr, &red_seq);
+
+  tree ref_out = build_simple_mem_ref (ptr);
+  TREE_THIS_VOLATILE (ref_out) = 1;
+  gimplify_assign (ref_out, acc_out, &red_seq);
+
+  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
+
+  /* Build & insert the unlock sequence.  */
+  gimple_seq unlock_seq = NULL;
+  tree unlock_expr = nvptx_global_lock_addr ();
+  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
+				     uns_locked, uns_unlocked);
+  gimplify_and_add (unlock_expr, &unlock_seq);
+  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
+
+  return acc_out;
+}
+
+/* Emit a sequence to update a reduction accumulator at *PTR with the
+   value held in VAR using operator OP.  Return the updated value.
+
+   TODO: optimize for atomic ops and independent complex ops.  */
+
+static tree
+nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
+			tree ptr, tree var, tree_code op)
+{
+  tree type = TREE_TYPE (var);
+  tree size = TYPE_SIZE (type);
+
+  if (size == TYPE_SIZE (unsigned_type_node)
+      || size == TYPE_SIZE (long_long_unsigned_type_node))
+    return nvptx_lockless_update (loc, gsi, ptr, var, op);
+  else
+    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
+}
 
 /* NVPTX implementation of GOACC_REDUCTION_SETUP.  */
 
@@ -4776,11 +4741,11 @@ nvptx_goacc_reduction_fini (gcall *call)
 
   if (accum)
     {
-      /* Locklessly update the accumulator.  */
+      /* UPDATE the accumulator.  */
       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
       seq = NULL;
-      r = nvptx_lockless_update (gimple_location (call), &gsi,
-				 accum, var, op);
+      r = nvptx_reduction_update (gimple_location (call), &gsi,
+				  accum, var, op);
     }
 }
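For comparison, the lockless path that nvptx_reduction_update selects
for 32- and 64-bit types is the cmp&swap retry loop from the comment
at the top of the patch.  A rough host-side C sketch, again using the
__sync builtin; lockless_add is a made-up name, and the type-pun here
is simplified relative to the VIEW_CONVERT_EXPR the compiler emits:

  #include <stdint.h>
  #include <string.h>
  #include <stdio.h>

  /* Lockless update of *ptr with *ptr OP val for a 64-bit type:
     re-read, recompute, and retry until no other thread raced us.  */
  static double
  lockless_add (double *ptr, double val)
  {
    uint64_t guess, write;
    double accum;

    do
      {
        memcpy (&guess, ptr, sizeof guess);   /* guess = *ptr (as bits)  */
        memcpy (&accum, &guess, sizeof accum);
        accum += val;                         /* write = guess OP myval  */
        memcpy (&write, &accum, sizeof write);
      }
    /* actual = cmp&swap (ptr, guess, write); retry while different.  */
    while (__sync_val_compare_and_swap ((uint64_t *) ptr, guess, write)
           != guess);

    return accum;
  }

  int
  main (void)
  {
    double d = 1.0;
    printf ("%g\n", lockless_add (&d, 2.5));  /* prints 3.5 */
    return 0;
  }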