From patchwork Wed Jun 14 11:47:10 2023
X-Patchwork-Submitter: Richard Biener
X-Patchwork-Id: 1794925
Date: Wed, 14 Jun 2023 11:47:10 +0000 (UTC)
To: gcc-patches@gcc.gnu.org
cc: richard.sandiford@arm.com
Subject: [PATCH 1/3] Inline vect_get_max_nscalars_per_iter
From: Richard Biener
Message-Id: <20230614114804.90D7F3857351@sourceware.org>

The function is only meaningful for
LOOP_VINFO_MASKS processing so inline it into the single use. Bootstrapped and tested on x86_64-unknown-linux-gnu, OK? * tree-vect-loop.cc (vect_get_max_nscalars_per_iter): Inline into ... (vect_verify_full_masking): ... this. --- gcc/tree-vect-loop.cc | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index ace9e759f5b..a9695e5b25d 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -1117,20 +1117,6 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type) return true; } -/* Calculate the maximum number of scalars per iteration for every - rgroup in LOOP_VINFO. */ - -static unsigned int -vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo) -{ - unsigned int res = 1; - unsigned int i; - rgroup_controls *rgm; - FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) - res = MAX (res, rgm->max_nscalars_per_iter); - return res; -} - /* Calculate the minimum precision necessary to represent: MAX_NITERS * FACTOR @@ -1210,8 +1196,6 @@ static bool vect_verify_full_masking (loop_vec_info loop_vinfo) { unsigned int min_ni_width; - unsigned int max_nscalars_per_iter - = vect_get_max_nscalars_per_iter (loop_vinfo); /* Use a normal loop if there are no statements that need masking. This only happens in rare degenerate cases: it means that the loop @@ -1219,6 +1203,12 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) return false; + /* Calculate the maximum number of scalars per iteration for every rgroup. */ + unsigned int max_nscalars_per_iter = 1; + for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo)) + max_nscalars_per_iter + = MAX (max_nscalars_per_iter, rgm.max_nscalars_per_iter); + /* Work out how many bits we need to represent the limit. 
*/ min_ni_width = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);

From patchwork Wed Jun 14 11:47:21 2023
X-Patchwork-Submitter: Richard Biener
X-Patchwork-Id: 1794926
Date: Wed, 14 Jun 2023 11:47:21 +0000 (UTC)
To: gcc-patches@gcc.gnu.org
cc: richard.sandiford@arm.com
Subject: [PATCH 2/3] Add loop_vinfo argument to vect_get_loop_mask
From: Richard Biener
Message-Id:
<20230614114820.378723858022@sourceware.org> This adds a loop_vinfo argument for future use, making the next patch smaller. * tree-vectorizer.h (vect_get_loop_mask): Add loop_vec_info argument. * tree-vect-loop.cc (vect_get_loop_mask): Likewise. (vectorize_fold_left_reduction): Adjust. (vect_transform_reduction): Likewise. (vectorizable_live_operation): Likewise. * tree-vect-stmts.cc (vectorizable_call): Likewise. (vectorizable_operation): Likewise. (vectorizable_store): Likewise. (vectorizable_load): Likewise. (vectorizable_condition): Likewise. --- gcc/tree-vect-loop.cc | 16 +++++++++------- gcc/tree-vect-stmts.cc | 36 +++++++++++++++++++----------------- gcc/tree-vectorizer.h | 3 ++- 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index a9695e5b25d..1897e720389 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -6637,7 +6637,7 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo, gimple *new_stmt; tree mask = NULL_TREE; if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) - mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i); + mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i); /* Handle MINUS by adding the negative. */ if (reduc_fn != IFN_LAST && code == MINUS_EXPR) @@ -7950,8 +7950,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, gcc_assert (commutative_binary_op_p (code, op.type)); std::swap (vop[0], vop[1]); } - tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, - vectype_in, i); + tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, + vec_num * ncopies, vectype_in, i); gcall *call = gimple_build_call_internal (cond_fn, 4, mask, vop[0], vop[1], vop[0]); new_temp = make_ssa_name (vec_dest, call); @@ -7967,8 +7967,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo, if (masked_loop_p && mask_by_cond_expr) { - tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, - vectype_in, i); + tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, + vec_num * ncopies, vectype_in, i); build_vect_cond_expr (code, vop, mask, gsi); } @@ -10075,7 +10075,8 @@ vectorizable_live_operation (vec_info *vinfo, the loop mask for the final iteration. */ gcc_assert (ncopies == 1 && !slp_node); tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); - tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), + tree mask = vect_get_loop_mask (loop_vinfo, gsi, + &LOOP_VINFO_MASKS (loop_vinfo), 1, vectype, 0); tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type, mask, vec_lhs_phi); @@ -10359,7 +10360,8 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, arrangement. */ tree -vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks, +vect_get_loop_mask (loop_vec_info, + gimple_stmt_iterator *gsi, vec_loop_masks *masks, unsigned int nvectors, tree vectype, unsigned int index) { rgroup_controls *rgm = &(*masks)[nvectors - 1]; diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index a7acc032d47..47baf35227f 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -3692,7 +3692,8 @@ vectorizable_call (vec_info *vinfo, unsigned int vec_num = vec_oprnds0.length (); /* Always true for SLP. 
*/ gcc_assert (ncopies == 1); - vargs[varg++] = vect_get_loop_mask (gsi, masks, vec_num, + vargs[varg++] = vect_get_loop_mask (loop_vinfo, + gsi, masks, vec_num, vectype_out, i); } size_t k; @@ -3733,7 +3734,8 @@ vectorizable_call (vec_info *vinfo, unsigned int vec_num = vec_oprnds0.length (); /* Always true for SLP. */ gcc_assert (ncopies == 1); - tree mask = vect_get_loop_mask (gsi, masks, vec_num, + tree mask = vect_get_loop_mask (loop_vinfo, + gsi, masks, vec_num, vectype_out, i); vargs[mask_opno] = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask, @@ -3758,7 +3760,7 @@ vectorizable_call (vec_info *vinfo, int varg = 0; if (masked_loop_p && reduc_idx >= 0) - vargs[varg++] = vect_get_loop_mask (gsi, masks, ncopies, + vargs[varg++] = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies, vectype_out, j); for (i = 0; i < nargs; i++) { @@ -3777,7 +3779,7 @@ vectorizable_call (vec_info *vinfo, if (mask_opno >= 0 && masked_loop_p) { - tree mask = vect_get_loop_mask (gsi, masks, ncopies, + tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies, vectype_out, j); vargs[mask_opno] = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask, @@ -6823,8 +6825,8 @@ vectorizable_operation (vec_info *vinfo, } else if (masked_loop_p && mask_out_inactive) { - tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, - vectype, i); + tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, + vec_num * ncopies, vectype, i); auto_vec vops (5); vops.quick_push (mask); vops.quick_push (vop0); @@ -6865,8 +6867,8 @@ vectorizable_operation (vec_info *vinfo, if (loop_vinfo->scalar_cond_masked_set.contains ({ op0, ncopies})) { - mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, - vectype, i); + mask = vect_get_loop_mask (loop_vinfo, gsi, masks, + vec_num * ncopies, vectype, i); vop0 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask, vop0, gsi); @@ -6875,8 +6877,8 @@ vectorizable_operation (vec_info *vinfo, if (loop_vinfo->scalar_cond_masked_set.contains ({ op1, ncopies })) { - mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, - vectype, i); + mask = vect_get_loop_mask (loop_vinfo, gsi, masks, + vec_num * ncopies, vectype, i); vop1 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask, vop1, gsi); @@ -8760,8 +8762,8 @@ vectorizable_store (vec_info *vinfo, tree final_mask = NULL; if (loop_masks) - final_mask = vect_get_loop_mask (gsi, loop_masks, ncopies, - vectype, j); + final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, + ncopies, vectype, j); if (vec_mask) final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask, vec_mask, gsi); @@ -8814,7 +8816,7 @@ vectorizable_store (vec_info *vinfo, tree final_mask = NULL_TREE; if (loop_masks) - final_mask = vect_get_loop_mask (gsi, loop_masks, + final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, vec_num * ncopies, vectype, vec_num * j + i); if (vec_mask) @@ -10133,8 +10135,8 @@ vectorizable_load (vec_info *vinfo, tree final_mask = NULL_TREE; if (loop_masks) - final_mask = vect_get_loop_mask (gsi, loop_masks, ncopies, - vectype, j); + final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, + ncopies, vectype, j); if (vec_mask) final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask, vec_mask, gsi); @@ -10184,7 +10186,7 @@ vectorizable_load (vec_info *vinfo, tree final_mask = NULL_TREE; if (loop_masks && memory_access_type != VMAT_INVARIANT) - final_mask = vect_get_loop_mask (gsi, loop_masks, + final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, vec_num * ncopies, vectype, vec_num * j 
+ i); if (vec_mask) @@ -11208,7 +11210,7 @@ vectorizable_condition (vec_info *vinfo, if (masks) { tree loop_mask - = vect_get_loop_mask (gsi, masks, vec_num * ncopies, + = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num * ncopies, vectype, i); tree tmp2 = make_ssa_name (vec_cmp_type); gassign *g diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index af25d20bd7e..767a0774d45 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2302,7 +2302,8 @@ extern tree vect_halve_mask_nunits (tree, machine_mode); extern tree vect_double_mask_nunits (tree, machine_mode); extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *, unsigned int, tree, tree); -extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *, +extern tree vect_get_loop_mask (loop_vec_info, gimple_stmt_iterator *, + vec_loop_masks *, unsigned int, tree, unsigned int); extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int, tree, unsigned int); From patchwork Wed Jun 14 11:54:06 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Richard Biener X-Patchwork-Id: 1794927 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@legolas.ozlabs.org Authentication-Results: legolas.ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=gcc.gnu.org (client-ip=2620:52:3:1:0:246e:9693:128c; helo=sourceware.org; envelope-from=gcc-patches-bounces+incoming=patchwork.ozlabs.org@gcc.gnu.org; receiver=) Authentication-Results: legolas.ozlabs.org; dkim=pass (1024-bit key; unprotected) header.d=gcc.gnu.org header.i=@gcc.gnu.org header.a=rsa-sha256 header.s=default header.b=DJP8M9Nh; dkim-atps=neutral Received: from sourceware.org (server2.sourceware.org [IPv6:2620:52:3:1:0:246e:9693:128c]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-384) server-digest SHA384) (No client certificate requested) by legolas.ozlabs.org (Postfix) with ESMTPS id 4Qh3kK5hXnz20WR for ; Wed, 14 Jun 2023 21:54:33 +1000 (AEST) Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id C9D2C385841E for ; Wed, 14 Jun 2023 11:54:31 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org C9D2C385841E DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1686743671; bh=mpJhPk5hSinVDkw8gyyypMrjtAC8Go19tDDfJbDHyPg=; h=Date:To:cc:Subject:List-Id:List-Unsubscribe:List-Archive: List-Post:List-Help:List-Subscribe:From:Reply-To:From; b=DJP8M9Nheo4WLpemueDvAYFdjqrmdepB0HivRGlat41wZsucIhrhdUf11yeGr/M8Y 6pRtDZvJ0XHT8f9JU+ZOVOC24w+Apt7SdqouMDOOjhcoDHK4QMlKFCTfbTwjdhoYoB GTSeaFwCTX4wUMcwQ4lFFhcJxhkfVgEEG7fiIw9g= X-Original-To: gcc-patches@gcc.gnu.org Delivered-To: gcc-patches@gcc.gnu.org Received: from smtp-out1.suse.de (smtp-out1.suse.de [195.135.220.28]) by sourceware.org (Postfix) with ESMTPS id 8AB4F3858D1E for ; Wed, 14 Jun 2023 11:54:08 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 8AB4F3858D1E Received: from relay2.suse.de (relay2.suse.de [149.44.160.134]) by smtp-out1.suse.de (Postfix) with ESMTP id B3EF321854; Wed, 14 Jun 2023 11:54:07 +0000 (UTC) Received: from wotan.suse.de (wotan.suse.de [10.160.0.1]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by relay2.suse.de (Postfix) with ESMTPS id 24E722C142; Wed, 14 Jun 2023 11:54:07 +0000 (UTC) Date: Wed, 14 Jun 2023 11:54:06 +0000 (UTC) To: 
To: gcc-patches@gcc.gnu.org
cc: richard.sandiford@arm.com, Jan Hubicka, hongtao.liu@intel.com, kirill.yukhin@gmail.com, ams@codesourcery.com
Subject: [PATCH 3/3] AVX512 fully masked vectorization
From: Richard Biener
Message-Id: <20230614115431.C9D2C385841E@sourceware.org>

This implements fully masked vectorization or a masked epilog for AVX512 style masks, which single themselves out by representing each lane with a single bit and by using integer modes for the mask (both much like GCN).  AVX512 is also special in that it doesn't have any instruction to compute the mask from a scalar IV like SVE has with while_ult.  Instead the masks are produced by vector compares and the loop control retains the scalar IV (mainly to avoid dependences on mask generation; a suitable mask test instruction is available).

Like RVV, code generation prefers a decrementing IV, though IVOPTs messes things up in some cases, removing that IV to eliminate it with an incrementing one used for address generation.

One of the motivating testcases is from PR108410, which in turn is extracted from x264, where large-size vectorization shows issues with small trip count loops.  Execution time there improves compared to classic AVX512 with AVX2 epilogues for the cases of less than 32 iterations.

size   scalar     128     256     512    512e    512f
    1    9.42   11.32    9.35   11.17   15.13   16.89
    2    5.72    6.53    6.66    6.66    7.62    8.56
    3    4.49    5.10    5.10    5.74    5.08    5.73
    4    4.10    4.33    4.29    5.21    3.79    4.25
    6    3.78    3.85    3.86    4.76    2.54    2.85
    8    3.64    1.89    3.76    4.50    1.92    2.16
   12    3.56    2.21    3.75    4.26    1.26    1.42
   16    3.36    0.83    1.06    4.16    0.95    1.07
   20    3.39    1.42    1.33    4.07    0.75    0.85
   24    3.23    0.66    1.72    4.22    0.62    0.70
   28    3.18    1.09    2.04    4.20    0.54    0.61
   32    3.16    0.47    0.41    0.41    0.47    0.53
   34    3.16    0.67    0.61    0.56    0.44    0.50
   38    3.19    0.95    0.95    0.82    0.40    0.45
   42    3.09    0.58    1.21    1.13    0.36    0.40

'size' specifies the number of actual iterations, 512e is for a masked epilog and 512f for the fully masked loop.  From 4 scalar iterations on, the AVX512 masked epilog code is clearly the winner; the fully masked variant is clearly worse and its size benefit is also tiny.

This patch does not enable using fully masked loops or masked epilogues by default.  More work on cost modeling and vectorization kind selection on x86_64 is necessary for this.

Implementation-wise this introduces LOOP_VINFO_PARTIAL_VECTORS_STYLE, which could be exploited further to unify some of the flags we have right now, but there didn't seem to be many easy things to merge, so I'm leaving this for followups.

Mask requirements as registered by vect_record_loop_mask are kept in their original form and recorded in a hash_set now instead of being processed to a vector of rgroup_controls.
Instead, that is now left to the final analysis phase, which first tries forming the rgroup_controls vector using while_ult and, if that fails, tries AVX512 style masking, which needs a different organization and instead fills a hash_map with the relevant info.  vect_get_loop_mask now has two implementations, one for each of the two mask styles we then have.

I have decided against interweaving vect_set_loop_condition_partial_vectors with conditions to do AVX512 style masking and instead opted to "duplicate" this to vect_set_loop_condition_partial_vectors_avx512.  Likewise for vect_verify_full_masking vs vect_verify_full_masking_avx512.

I was split between making 'vec_loop_masks' a class with methods, possibly merging the _len stuff into a single registry.  It seemed to be too many changes for the purpose of getting AVX512 working.  I'm going to wait and see what happens with RISC-V here, since they are going to get both masks and lengths registered, I think.

The vect_prepare_for_masked_peels hunk might run into issues with SVE; I didn't check yet, but using LOOP_VINFO_RGROUP_COMPARE_TYPE looked odd.

Bootstrapped and tested on x86_64-unknown-linux-gnu.  I've run the testsuite with --param vect-partial-vector-usage=2 with and without -fno-vect-cost-model and filed two bugs, one ICE (PR110221) and one latent wrong-code (PR110237).

There's followup work to be done to try enabling masked epilogues for x86-64 by default (when AVX512 is enabled, possibly only with -mprefer-vector-width=512).  Getting the cost modeling and the decision right is going to be challenging.

Any comments?  OK?

Btw, testing on GCN would be welcome - the _avx512 paths could work for it, so in case the while_ult path fails (not sure if it ever does) it could get _avx512 style masking.  Likewise testing on ARM, just to see I didn't break anything here.  I don't have SVE hardware so testing there is probably meaningless.

Thanks,
Richard.

* tree-vectorizer.h (enum vect_partial_vector_style): New. (_loop_vec_info::partial_vector_style): Likewise. (LOOP_VINFO_PARTIAL_VECTORS_STYLE): Likewise. (rgroup_controls::compare_type): Add. (vec_loop_masks): Change from a typedef of auto_vec<> to a structure.
* tree-vect-loop-manip.cc (vect_set_loop_condition_partial_vectors): Adjust. (vect_set_loop_condition_partial_vectors_avx512): New function implementing the AVX512 partial vector codegen. (vect_set_loop_condition): Dispatch to the correct vect_set_loop_condition_partial_vectors_* function based on LOOP_VINFO_PARTIAL_VECTORS_STYLE. (vect_prepare_for_masked_peels): Compute LOOP_VINFO_MASK_SKIP_NITERS in the original niter type.
* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize partial_vector_style. (_loop_vec_info::~_loop_vec_info): Release the hash-map recorded rgroup_controls. (can_produce_all_loop_masks_p): Adjust. (vect_verify_full_masking): Produce the rgroup_controls vector here. Set LOOP_VINFO_PARTIAL_VECTORS_STYLE on success. (vect_verify_full_masking_avx512): New function implementing verification of AVX512 style masking. (vect_verify_loop_lens): Set LOOP_VINFO_PARTIAL_VECTORS_STYLE. (vect_analyze_loop_2): Also try AVX512 style masking. Adjust condition. (vect_estimate_min_profitable_iters): Implement AVX512 style mask producing cost. (vect_record_loop_mask): Do not build the rgroup_controls vector here but record masks in a hash-set. (vect_get_loop_mask): Implement AVX512 style mask query, complementing the existing while_ult style.
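To make the mask generation scheme described above concrete, here is a small standalone sketch (not part of the patch; plain C++ rather than GIMPLE, and the values of VF, nscalars_per_iter and niters are made up for illustration) of how each rgroup mask would be derived from the decrementing IV: the remaining iteration count is saturated to VF, splat and compared against the constant series { 0, 0, 1, 1, 2, 2, ... }.

  // Illustrative simulation of the AVX512-style mask generation: a
  // decrementing IV tracks the remaining scalar iterations and a lane is
  // active when its series value is below the saturated remainder,
  // mirroring mask_5 = { 0, 0, 1, 1, ... } < remv_6 from the description.
  #include <cstdio>
  #include <vector>
  #include <algorithm>

  int main ()
  {
    const unsigned VF = 8;                /* scalar iterations per vector iteration */
    const unsigned nscalars_per_iter = 2; /* mask lanes per scalar iteration */
    const unsigned nlanes = VF * nscalars_per_iter;
    unsigned niters = 11;                 /* remaining scalar iterations */

    /* The constant series { 0, 0, 1, 1, 2, 2, ... }; each value repeats
       nscalars_per_iter times and is at most VF - 1.  */
    std::vector<unsigned> series (nlanes);
    for (unsigned i = 0; i < nlanes; ++i)
      series[i] = i / nscalars_per_iter;

    for (unsigned iv = niters; iv > 0; iv = iv > VF ? iv - VF : 0)
      {
        unsigned rem = std::min (iv, VF);  /* saturate like the MIN in the patch */
        printf ("iv=%2u mask=", iv);
        for (unsigned i = 0; i < nlanes; ++i)
          printf ("%d", series[i] < rem ? 1 : 0);
        printf ("\n");
      }
    return 0;
  }

The last vector iteration (iv <= VF) gets a partial mask while all earlier iterations get an all-ones mask, which is why the loop exit test can keep using the scalar IV.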
--- gcc/tree-vect-loop-manip.cc | 264 ++++++++++++++++++++++- gcc/tree-vect-loop.cc | 413 +++++++++++++++++++++++++++++++----- gcc/tree-vectorizer.h | 35 ++- 3 files changed, 651 insertions(+), 61 deletions(-) diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index 1c8100c1a1c..f0ecaec28f4 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -50,6 +50,9 @@ along with GCC; see the file COPYING3. If not see #include "insn-config.h" #include "rtl.h" #include "recog.h" +#include "langhooks.h" +#include "tree-vector-builder.h" +#include "optabs-tree.h" /************************************************************************* Simple Loop Peeling Utilities @@ -845,7 +848,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop, rgroup_controls *iv_rgc = nullptr; unsigned int i; auto_vec *controls = use_masks_p - ? &LOOP_VINFO_MASKS (loop_vinfo) + ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec : &LOOP_VINFO_LENS (loop_vinfo); FOR_EACH_VEC_ELT (*controls, i, rgc) if (!rgc->controls.is_empty ()) @@ -936,6 +939,246 @@ vect_set_loop_condition_partial_vectors (class loop *loop, return cond_stmt; } +/* Set up the iteration condition and rgroup controls for LOOP in AVX512 + style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the + vectorized loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is + the number of iterations of the original scalar loop that should be + handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as + for vect_set_loop_condition. + + Insert the branch-back condition before LOOP_COND_GSI and return the + final gcond. */ + +static gcond * +vect_set_loop_condition_partial_vectors_avx512 (class loop *loop, + loop_vec_info loop_vinfo, tree niters, + tree final_iv, + bool niters_maybe_zero, + gimple_stmt_iterator loop_cond_gsi) +{ + tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + tree orig_niters = niters; + gimple_seq preheader_seq = NULL; + + /* Create an IV that counts down from niters and whose step + is the number of iterations processed in the current iteration. + Produce the controls with compares like the following. + + # iv_2 = PHI + rem_4 = MIN ; + remv_6 = { rem_4, rem_4, rem_4, ... } + mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv6; + iv_3 = iv_2 - VF; + if (iv_2 > VF) + continue; + + Where the constant is built with elements at most VF - 1 and + repetitions according to max_nscalars_per_iter which is guarnateed + to be the same within a group. */ + + /* Convert NITERS to the determined IV type. */ + if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters)) + && niters_maybe_zero) + { + /* We know that there is always at least one iteration, so if the + count is zero then it must have wrapped. Cope with this by + subtracting 1 before the conversion and adding 1 to the result. */ + gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters))); + niters = gimple_build (&preheader_seq, PLUS_EXPR, TREE_TYPE (niters), + niters, build_minus_one_cst (TREE_TYPE (niters))); + niters = gimple_convert (&preheader_seq, iv_type, niters); + niters = gimple_build (&preheader_seq, PLUS_EXPR, iv_type, + niters, build_one_cst (iv_type)); + } + else + niters = gimple_convert (&preheader_seq, iv_type, niters); + + /* Bias the initial value of the IV in case we need to skip iterations + at the beginning. 
*/ + tree niters_adj = niters; + if (niters_skip) + { + tree skip = gimple_convert (&preheader_seq, iv_type, niters_skip); + niters_adj = gimple_build (&preheader_seq, PLUS_EXPR, + iv_type, niters, skip); + } + + /* The iteration step is the vectorization factor. */ + tree iv_step = build_int_cst (iv_type, vf); + + /* Create the decrement IV. */ + tree index_before_incr, index_after_incr; + gimple_stmt_iterator incr_gsi; + bool insert_after; + standard_iv_increment_position (loop, &incr_gsi, &insert_after); + create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop, + &incr_gsi, insert_after, &index_before_incr, + &index_after_incr); + + /* Iterate over all the rgroups and fill in their controls. */ + for (auto rgcm : LOOP_VINFO_MASKS (loop_vinfo).rgc_map) + { + rgroup_controls *rgc = rgcm.second; + if (rgc->controls.is_empty ()) + continue; + + tree ctrl_type = rgc->type; + poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type); + + tree vectype = rgc->compare_type; + + /* index_after_incr is the IV specifying the remaining iterations in + the next iteration. */ + tree rem = index_after_incr; + /* When the data type for the compare to produce the mask is + smaller than the IV type we need to saturate. Saturate to + the smallest possible value (IV_TYPE) so we only have to + saturate once (CSE will catch redundant ones we add). */ + if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type)) + rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING, + UNKNOWN_LOCATION, + MIN_EXPR, TREE_TYPE (rem), rem, iv_step); + rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING, + UNKNOWN_LOCATION, TREE_TYPE (vectype), rem); + + /* Build a data vector composed of the remaining iterations. */ + rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING, + UNKNOWN_LOCATION, vectype, rem); + + /* Provide a definition of each vector in the control group. */ + tree next_ctrl = NULL_TREE; + tree first_rem = NULL_TREE; + tree ctrl; + unsigned int i; + FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl) + { + /* Previous controls will cover BIAS items. This control covers the + next batch. */ + poly_uint64 bias = nitems_per_ctrl * i; + + /* Build the constant to compare the remaining iters against, + this is sth like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately + split into pieces. */ + unsigned n = TYPE_VECTOR_SUBPARTS (ctrl_type).to_constant (); + tree_vector_builder builder (vectype, n, 1); + for (unsigned i = 0; i < n; ++i) + { + unsigned HOST_WIDE_INT val + = (i + bias.to_constant ()) / rgc->max_nscalars_per_iter; + gcc_assert (val < vf.to_constant ()); + builder.quick_push (build_int_cst (TREE_TYPE (vectype), val)); + } + tree cmp_series = builder.build (); + + /* Create the initial control. First include all items that + are within the loop limit. */ + tree init_ctrl = NULL_TREE; + poly_uint64 const_limit; + /* See whether the first iteration of the vector loop is known + to have a full control. */ + if (poly_int_tree_p (niters, &const_limit) + && known_ge (const_limit, (i + 1) * nitems_per_ctrl)) + init_ctrl = build_minus_one_cst (ctrl_type); + else + { + /* The remaining work items initially are niters. Saturate, + splat and compare. 
*/ + if (!first_rem) + { + first_rem = niters; + if (TYPE_PRECISION (TREE_TYPE (vectype)) + < TYPE_PRECISION (iv_type)) + first_rem = gimple_build (&preheader_seq, + MIN_EXPR, TREE_TYPE (first_rem), + first_rem, iv_step); + first_rem = gimple_convert (&preheader_seq, TREE_TYPE (vectype), + first_rem); + first_rem = gimple_build_vector_from_val (&preheader_seq, + vectype, first_rem); + } + init_ctrl = gimple_build (&preheader_seq, LT_EXPR, ctrl_type, + cmp_series, first_rem); + } + + /* Now AND out the bits that are within the number of skipped + items. */ + poly_uint64 const_skip; + if (niters_skip + && !(poly_int_tree_p (niters_skip, &const_skip) + && known_le (const_skip, bias))) + { + /* For integer mode masks it's cheaper to shift out the bits + since that avoids loading a constant. */ + gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT); + init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR, + lang_hooks.types.type_for_mode + (TYPE_MODE (ctrl_type), 1), + init_ctrl); + /* ??? But when the shift amount isn't constant this requires + a round-trip to GRPs. We could apply the bias to either + side of the compare instead. */ + tree shift = gimple_build (&preheader_seq, MULT_EXPR, + TREE_TYPE (niters_skip), + niters_skip, + build_int_cst (TREE_TYPE (niters_skip), + rgc->max_nscalars_per_iter)); + init_ctrl = gimple_build (&preheader_seq, LSHIFT_EXPR, + TREE_TYPE (init_ctrl), + init_ctrl, shift); + init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR, + ctrl_type, init_ctrl); + } + + /* Get the control value for the next iteration of the loop. */ + next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING, + UNKNOWN_LOCATION, + LT_EXPR, ctrl_type, cmp_series, rem); + + vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl); + } + } + + /* Emit all accumulated statements. */ + add_preheader_seq (loop, preheader_seq); + + /* Adjust the exit test using the decrementing IV. */ + edge exit_edge = single_exit (loop); + tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR; + /* When we peel for alignment with niter_skip != 0 this can + cause niter + niter_skip to wrap and since we are comparing the + value before the decrement here we get a false early exit. + We can't compare the value after decrement either because that + decrement could wrap as well as we're not doing a saturating + decrement. To avoid this situation we force a larger + iv_type. */ + gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step, + NULL_TREE, NULL_TREE); + gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT); + + /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times. + Subtract one from this to get the latch count. */ + tree niters_minus_one + = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters, + build_minus_one_cst (TREE_TYPE (orig_niters))); + tree niters_adj2 = fold_convert (iv_type, niters_minus_one); + if (niters_skip) + niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_minus_one, + fold_convert (iv_type, niters_skip)); + loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type, + niters_adj2, iv_step); + + if (final_iv) + { + gassign *assign = gimple_build_assign (final_iv, orig_niters); + gsi_insert_on_edge_immediate (single_exit (loop), assign); + } + + return cond_stmt; +} + + /* Like vect_set_loop_condition, but handle the case in which the vector loop handles exactly VF scalars per iteration. 
*/ @@ -1114,10 +1357,18 @@ vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo, gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond); if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) - cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo, - niters, final_iv, - niters_maybe_zero, - loop_cond_gsi); + { + if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512) + cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, loop_vinfo, + niters, final_iv, + niters_maybe_zero, + loop_cond_gsi); + else + cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo, + niters, final_iv, + niters_maybe_zero, + loop_cond_gsi); + } else cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv, niters_maybe_zero, @@ -2030,7 +2281,8 @@ void vect_prepare_for_masked_peels (loop_vec_info loop_vinfo) { tree misalign_in_elems; - tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo); + /* ??? With AVX512 we want LOOP_VINFO_RGROUP_IV_TYPE in the end. */ + tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo)); diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 1897e720389..9be66b8fbc5 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -55,6 +55,7 @@ along with GCC; see the file COPYING3. If not see #include "vec-perm-indices.h" #include "tree-eh.h" #include "case-cfn-macros.h" +#include "langhooks.h" /* Loop Vectorization Pass. @@ -963,6 +964,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) mask_skip_niters (NULL_TREE), rgroup_compare_type (NULL_TREE), simd_if_cond (NULL_TREE), + partial_vector_style (vect_partial_vectors_none), unaligned_dr (NULL), peeling_for_alignment (0), ptr_mask (0), @@ -1058,7 +1060,12 @@ _loop_vec_info::~_loop_vec_info () { free (bbs); - release_vec_loop_controls (&masks); + for (auto m : masks.rgc_map) + { + m.second->controls.release (); + delete m.second; + } + release_vec_loop_controls (&masks.rgc_vec); release_vec_loop_controls (&lens); delete ivexpr_map; delete scan_map; @@ -1108,7 +1115,7 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type) { rgroup_controls *rgm; unsigned int i; - FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) + FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm) if (rgm->type != NULL_TREE && !direct_internal_fn_supported_p (IFN_WHILE_ULT, cmp_type, rgm->type, @@ -1203,9 +1210,33 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) return false; + /* Produce the rgroup controls. */ + for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set) + { + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); + tree vectype = mask.first; + unsigned nvectors = mask.second; + + if (masks->rgc_vec.length () < nvectors) + masks->rgc_vec.safe_grow_cleared (nvectors, true); + rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1]; + /* The number of scalars per iteration and the number of vectors are + both compile-time constants. */ + unsigned int nscalars_per_iter + = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), + LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); + + if (rgm->max_nscalars_per_iter < nscalars_per_iter) + { + rgm->max_nscalars_per_iter = nscalars_per_iter; + rgm->type = truth_type_for (vectype); + rgm->factor = 1; + } + } + /* Calculate the maximum number of scalars per iteration for every rgroup. 
*/ unsigned int max_nscalars_per_iter = 1; - for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo)) + for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec) max_nscalars_per_iter = MAX (max_nscalars_per_iter, rgm.max_nscalars_per_iter); @@ -1268,10 +1299,159 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) } if (!cmp_type) - return false; + { + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release (); + return false; + } LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type; LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type; + LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult; + return true; +} + +/* Each statement in LOOP_VINFO can be masked where necessary. Check + whether we can actually generate AVX512 style masks. Return true if so, + storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */ + +static bool +vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo) +{ + /* Produce differently organized rgc_vec and differently check + we can produce masks. */ + + /* Use a normal loop if there are no statements that need masking. + This only happens in rare degenerate cases: it means that the loop + has no loads, no stores, and no live-out values. */ + if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + return false; + + /* For the decrementing IV we need to represent all values in + [0, niter + niter_skip] where niter_skip is the elements we + skip in the first iteration for prologue peeling. */ + tree iv_type = NULL_TREE; + widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo); + unsigned int iv_precision = UINT_MAX; + if (iv_limit != -1) + iv_precision = wi::min_precision (iv_limit, UNSIGNED); + + /* First compute the type for the IV we use to track the remaining + scalar iterations. */ + opt_scalar_int_mode cmp_mode_iter; + FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) + { + unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); + if (cmp_bits >= iv_precision + && targetm.scalar_mode_supported_p (cmp_mode_iter.require ())) + { + iv_type = build_nonstandard_integer_type (cmp_bits, true); + if (iv_type) + break; + } + } + if (!iv_type) + return false; + + /* Produce the rgroup controls. */ + for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set) + { + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); + tree vectype = mask.first; + unsigned nvectors = mask.second; + + /* The number of scalars per iteration and the number of vectors are + both compile-time constants. */ + unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); + unsigned int nscalars_per_iter + = nvectors * nunits / LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant (); + + /* We key off a hash-map with nscalars_per_iter and the number of total + lanes in the mask vector and then remember the needed vector mask + with the largest number of lanes (thus the fewest nV). 
*/ + bool existed; + rgroup_controls *& rgc + = masks->rgc_map.get_or_insert (std::make_pair (nscalars_per_iter, + nvectors * nunits), + &existed); + if (!existed) + { + rgc = new rgroup_controls (); + rgc->type = truth_type_for (vectype); + rgc->compare_type = NULL_TREE; + rgc->max_nscalars_per_iter = nscalars_per_iter; + rgc->factor = 1; + rgc->bias_adjusted_ctrl = NULL_TREE; + } + else + { + gcc_assert (rgc->max_nscalars_per_iter == nscalars_per_iter); + if (known_lt (TYPE_VECTOR_SUBPARTS (rgc->type), + TYPE_VECTOR_SUBPARTS (vectype))) + rgc->type = truth_type_for (vectype); + } + } + + /* There is no fixed compare type we are going to use but we have to + be able to get at one for each mask group. */ + unsigned int min_ni_width + = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED); + + bool ok = true; + for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).rgc_map) + { + rgroup_controls *rgc = mask.second; + tree mask_type = rgc->type; + if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1) + { + ok = false; + break; + } + + /* If iv_type is usable as compare type use that - we can elide the + saturation in that case. */ + if (TYPE_PRECISION (iv_type) >= min_ni_width) + { + tree cmp_vectype + = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type)); + if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR)) + rgc->compare_type = cmp_vectype; + } + if (!rgc->compare_type) + FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) + { + unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); + if (cmp_bits >= min_ni_width + && targetm.scalar_mode_supported_p (cmp_mode_iter.require ())) + { + tree cmp_type = build_nonstandard_integer_type (cmp_bits, true); + if (!cmp_type) + continue; + + /* Check whether we can produce the mask with cmp_type. */ + tree cmp_vectype + = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type)); + if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR)) + { + rgc->compare_type = cmp_vectype; + break; + } + } + } + if (!rgc->compare_type) + { + ok = false; + break; + } + } + if (!ok) + { + LOOP_VINFO_MASKS (loop_vinfo).rgc_map.empty (); + return false; + } + + LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node; + LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type; + LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512; return true; } @@ -1371,6 +1551,7 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo) LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type; LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type; + LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len; return true; } @@ -2712,16 +2893,24 @@ start_over: /* If we still have the option of using partial vectors, check whether we can generate the necessary loop controls. 
*/ - if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) - && !vect_verify_full_masking (loop_vinfo) - && !vect_verify_loop_lens (loop_vinfo)) - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) + { + if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + { + if (!vect_verify_full_masking (loop_vinfo) + && !vect_verify_full_masking_avx512 (loop_vinfo)) + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + } + else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */ + if (!vect_verify_loop_lens (loop_vinfo)) + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + } /* If we're vectorizing a loop that uses length "controls" and can iterate more than once, we apply decrementing IV approach in loop control. */ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) - && !LOOP_VINFO_LENS (loop_vinfo).is_empty () + && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo), @@ -3022,7 +3211,7 @@ again: delete loop_vinfo->vector_costs; loop_vinfo->vector_costs = nullptr; /* Reset accumulated rgroup information. */ - release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo)); + release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec); release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo)); /* Reset assorted flags. */ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; @@ -4362,13 +4551,67 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, cond_branch_not_taken, vect_epilogue); /* Take care of special costs for rgroup controls of partial vectors. */ - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) + == vect_partial_vectors_avx512)) + { + /* Calculate how many masks we need to generate. */ + unsigned int num_masks = 0; + bool need_saturation = false; + for (auto rgcm : LOOP_VINFO_MASKS (loop_vinfo).rgc_map) + { + rgroup_controls *rgm = rgcm.second; + unsigned nvectors + = (rgcm.first.second + / TYPE_VECTOR_SUBPARTS (rgm->type).to_constant ()); + num_masks += nvectors; + if (TYPE_PRECISION (TREE_TYPE (rgm->compare_type)) + < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo))) + need_saturation = true; + } + + /* ??? The target isn't able to identify the costs below as + producing masks so it cannot penaltize cases where we'd run + out of mask registers for example. */ + + /* In the worst case, we need to generate each mask in the prologue + and in the loop body. We need one splat per group and one + compare per mask. + + Sometimes we can use unpacks instead of generating prologue + masks and sometimes the prologue mask will fold to a constant, + so the actual prologue cost might be smaller. However, it's + simpler and safer to use the worst-case cost; if this ends up + being the tie-breaker between vectorizing or not, then it's + probably better not to vectorize. */ + (void) add_stmt_cost (target_cost_data, + num_masks + + LOOP_VINFO_MASKS (loop_vinfo).rgc_map.elements (), + vector_stmt, NULL, NULL, NULL_TREE, 0, vect_prologue); + (void) add_stmt_cost (target_cost_data, + num_masks + + LOOP_VINFO_MASKS (loop_vinfo).rgc_map.elements (), + vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body); + + /* When we need saturation we need it both in the prologue and + the epilogue. 
*/ + if (need_saturation) + { + (void) add_stmt_cost (target_cost_data, 1, scalar_stmt, + NULL, NULL, NULL_TREE, 0, vect_prologue); + (void) add_stmt_cost (target_cost_data, 1, scalar_stmt, + NULL, NULL, NULL_TREE, 0, vect_body); + } + } + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) + == vect_partial_vectors_avx512)) { /* Calculate how many masks we need to generate. */ unsigned int num_masks = 0; rgroup_controls *rgm; unsigned int num_vectors_m1; - FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm) + FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, num_vectors_m1, rgm) if (rgm->type) num_masks += num_vectors_m1 + 1; gcc_assert (num_masks > 0); @@ -10329,14 +10572,6 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, unsigned int nvectors, tree vectype, tree scalar_mask) { gcc_assert (nvectors != 0); - if (masks->length () < nvectors) - masks->safe_grow_cleared (nvectors, true); - rgroup_controls *rgm = &(*masks)[nvectors - 1]; - /* The number of scalars per iteration and the number of vectors are - both compile-time constants. */ - unsigned int nscalars_per_iter - = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), - LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); if (scalar_mask) { @@ -10344,12 +10579,7 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, loop_vinfo->scalar_cond_masked_set.add (cond); } - if (rgm->max_nscalars_per_iter < nscalars_per_iter) - { - rgm->max_nscalars_per_iter = nscalars_per_iter; - rgm->type = truth_type_for (vectype); - rgm->factor = 1; - } + masks->mask_set.add (std::make_pair (vectype, nvectors)); } /* Given a complete set of masks MASKS, extract mask number INDEX @@ -10360,46 +10590,121 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, arrangement. */ tree -vect_get_loop_mask (loop_vec_info, +vect_get_loop_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi, vec_loop_masks *masks, unsigned int nvectors, tree vectype, unsigned int index) { - rgroup_controls *rgm = &(*masks)[nvectors - 1]; - tree mask_type = rgm->type; - - /* Populate the rgroup's mask array, if this is the first time we've - used it. */ - if (rgm->controls.is_empty ()) + if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) + == vect_partial_vectors_while_ult) { - rgm->controls.safe_grow_cleared (nvectors, true); - for (unsigned int i = 0; i < nvectors; ++i) + rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1]; + tree mask_type = rgm->type; + + /* Populate the rgroup's mask array, if this is the first time we've + used it. */ + if (rgm->controls.is_empty ()) { - tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask"); - /* Provide a dummy definition until the real one is available. */ - SSA_NAME_DEF_STMT (mask) = gimple_build_nop (); - rgm->controls[i] = mask; + rgm->controls.safe_grow_cleared (nvectors, true); + for (unsigned int i = 0; i < nvectors; ++i) + { + tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask"); + /* Provide a dummy definition until the real one is available. 
*/ + SSA_NAME_DEF_STMT (mask) = gimple_build_nop (); + rgm->controls[i] = mask; + } } - } - tree mask = rgm->controls[index]; - if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type), - TYPE_VECTOR_SUBPARTS (vectype))) + tree mask = rgm->controls[index]; + if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type), + TYPE_VECTOR_SUBPARTS (vectype))) + { + /* A loop mask for data type X can be reused for data type Y + if X has N times more elements than Y and if Y's elements + are N times bigger than X's. In this case each sequence + of N elements in the loop mask will be all-zero or all-one. + We can then view-convert the mask so that each sequence of + N elements is replaced by a single element. */ + gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), + TYPE_VECTOR_SUBPARTS (vectype))); + gimple_seq seq = NULL; + mask_type = truth_type_for (vectype); + /* We can only use re-use the mask by reinterpreting it if it + occupies the same space, that is the mask with less elements + uses multiple bits for each masked elements. */ + gcc_assert (known_eq (TYPE_PRECISION (TREE_TYPE (TREE_TYPE (mask))) + * TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask)), + TYPE_PRECISION (TREE_TYPE (mask_type)) + * TYPE_VECTOR_SUBPARTS (mask_type))); + mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); + if (seq) + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); + } + return mask; + } + else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) + == vect_partial_vectors_avx512) { - /* A loop mask for data type X can be reused for data type Y - if X has N times more elements than Y and if Y's elements - are N times bigger than X's. In this case each sequence - of N elements in the loop mask will be all-zero or all-one. - We can then view-convert the mask so that each sequence of - N elements is replaced by a single element. */ - gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), - TYPE_VECTOR_SUBPARTS (vectype))); + /* The number of scalars per iteration and the number of vectors are + both compile-time constants. */ + unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); + unsigned int nscalars_per_iter + = nvectors * nunits / LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant (); + + rgroup_controls *rgm + = *masks->rgc_map.get (std::make_pair (nscalars_per_iter, + nvectors * nunits)); + + /* The stored nV is dependent on the mask type produced. */ + nvectors = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), + TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant (); + + /* Populate the rgroup's mask array, if this is the first time we've + used it. */ + if (rgm->controls.is_empty ()) + { + rgm->controls.safe_grow_cleared (nvectors, true); + for (unsigned int i = 0; i < nvectors; ++i) + { + tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask"); + /* Provide a dummy definition until the real one is available. */ + SSA_NAME_DEF_STMT (mask) = gimple_build_nop (); + rgm->controls[i] = mask; + } + } + if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type), + TYPE_VECTOR_SUBPARTS (vectype))) + return rgm->controls[index]; + + /* Split the vector if needed. Since we are dealing with integer mode + masks with AVX512 we can operate on the integer representation + performing the whole vector shifting. 
*/ + unsigned HOST_WIDE_INT factor; + bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type), + TYPE_VECTOR_SUBPARTS (vectype), &factor); + gcc_assert (ok); + gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT); + tree mask_type = truth_type_for (vectype); + gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT); + unsigned vi = index / factor; + unsigned vpart = index % factor; + tree vec = rgm->controls[vi]; gimple_seq seq = NULL; - mask_type = truth_type_for (vectype); - mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); + vec = gimple_build (&seq, VIEW_CONVERT_EXPR, + lang_hooks.types.type_for_mode + (TYPE_MODE (rgm->type), 1), vec); + /* For integer mode masks simply shift the right bits into position. */ + if (vpart != 0) + vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec, + build_int_cst (integer_type_node, vpart * nunits)); + vec = gimple_convert (&seq, lang_hooks.types.type_for_mode + (TYPE_MODE (mask_type), 1), vec); + vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec); if (seq) gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); + return vec; } - return mask; + else + gcc_unreachable (); } /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 767a0774d45..42161778dc1 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -300,6 +300,13 @@ public: #define SLP_TREE_LANES(S) (S)->lanes #define SLP_TREE_CODE(S) (S)->code +enum vect_partial_vector_style { + vect_partial_vectors_none, + vect_partial_vectors_while_ult, + vect_partial_vectors_avx512, + vect_partial_vectors_len +}; + /* Key for map that records association between scalar conditions and corresponding loop mask, and is populated by vect_record_loop_mask. */ @@ -605,6 +612,10 @@ struct rgroup_controls { specified number of elements; the type of the elements doesn't matter. */ tree type; + /* When there is no uniformly used LOOP_VINFO_RGROUP_COMPARE_TYPE this + is the rgroup specific type used. */ + tree compare_type; + /* A vector of nV controls, in iteration order. */ vec controls; @@ -613,7 +624,24 @@ struct rgroup_controls { tree bias_adjusted_ctrl; }; -typedef auto_vec vec_loop_masks; +struct vec_loop_masks +{ + bool is_empty () const { return mask_set.is_empty (); } + + typedef pair_hash , + int_hash> mp_hash; + hash_set mask_set; + + /* Default storage for rgroup_controls. */ + auto_vec rgc_vec; + + /* The vect_partial_vectors_avx512 style uses a hash-map. */ + hash_map, rgroup_controls *, + simple_hashmap_traits, + int_hash>, + rgroup_controls *>> rgc_map; +}; typedef auto_vec vec_loop_lens; @@ -741,6 +769,10 @@ public: LOOP_VINFO_USING_PARTIAL_VECTORS_P is true. */ tree rgroup_iv_type; + /* The style used for implementing partial vectors when + LOOP_VINFO_USING_PARTIAL_VECTORS_P is true. */ + vect_partial_vector_style partial_vector_style; + /* Unknown DRs according to which loop was peeled. */ class dr_vec_info *unaligned_dr; @@ -914,6 +946,7 @@ public: #define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters #define LOOP_VINFO_RGROUP_COMPARE_TYPE(L) (L)->rgroup_compare_type #define LOOP_VINFO_RGROUP_IV_TYPE(L) (L)->rgroup_iv_type +#define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style #define LOOP_VINFO_PTR_MASK(L) (L)->ptr_mask #define LOOP_VINFO_N_STMTS(L) (L)->shared->n_stmts #define LOOP_VINFO_LOOP_NEST(L) (L)->shared->loop_nest
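As an aside on the new AVX512 path of vect_get_loop_mask above: because the masks have integer modes, a mask recorded for a whole rgroup can be handed out piecewise by shifting its integer representation.  A minimal standalone illustration (not part of the patch; the 16-lane/8-lane split and the 0x07ff value are made up):

  // Splitting one 16-lane integer-mode mask into two 8-lane sub-masks by
  // shifting, analogous to the RSHIFT_EXPR in the AVX512 path of
  // vect_get_loop_mask.
  #include <cstdio>
  #include <cstdint>

  int main ()
  {
    uint16_t group_mask = 0x07ff;  /* 11 active lanes out of 16 */
    const unsigned nunits = 8;     /* lanes per vector the caller asked for */
    for (unsigned vpart = 0; vpart < 16 / nunits; ++vpart)
      printf ("sub-mask %u: 0x%02x\n", vpart,
              (unsigned) (uint8_t) (group_mask >> (vpart * nunits)));
    return 0;
  }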