Patch Detail
get:
Show a patch.
patch:
Update a patch.
put:
Update a patch.
GET /api/1.1/patches/2233392/?format=api
{ "id": 2233392, "url": "http://patchwork.ozlabs.org/api/1.1/patches/2233392/?format=api", "web_url": "http://patchwork.ozlabs.org/project/gcc/patch/20260506070415.99154-1-wangyaduo@linux.alibaba.com/", "project": { "id": 17, "url": "http://patchwork.ozlabs.org/api/1.1/projects/17/?format=api", "name": "GNU Compiler Collection", "link_name": "gcc", "list_id": "gcc-patches.gcc.gnu.org", "list_email": "gcc-patches@gcc.gnu.org", "web_url": null, "scm_url": null, "webscm_url": null }, "msgid": "<20260506070415.99154-1-wangyaduo@linux.alibaba.com>", "date": "2026-05-06T07:04:15", "name": "RISC-V: Add per-type reduction costs to the vector cost model", "commit_ref": null, "pull_url": null, "state": "new", "archived": false, "hash": "251f5d482500d66690a07a1b21d3da3faf86c1b7", "submitter": { "id": 93342, "url": "http://patchwork.ozlabs.org/api/1.1/people/93342/?format=api", "name": "Wang Yaduo", "email": "wangyaduo@linux.alibaba.com" }, "delegate": null, "mbox": "http://patchwork.ozlabs.org/project/gcc/patch/20260506070415.99154-1-wangyaduo@linux.alibaba.com/mbox/", "series": [ { "id": 502964, "url": "http://patchwork.ozlabs.org/api/1.1/series/502964/?format=api", "web_url": "http://patchwork.ozlabs.org/project/gcc/list/?series=502964", "date": "2026-05-06T07:04:15", "name": "RISC-V: Add per-type reduction costs to the vector cost model", "version": 1, "mbox": "http://patchwork.ozlabs.org/series/502964/mbox/" } ], "comments": "http://patchwork.ozlabs.org/api/patches/2233392/comments/", "check": "pending", "checks": "http://patchwork.ozlabs.org/api/patches/2233392/checks/", "tags": {}, "headers": { "Return-Path": "<gcc-patches-bounces~incoming=patchwork.ozlabs.org@gcc.gnu.org>", "X-Original-To": [ "incoming@patchwork.ozlabs.org", "gcc-patches@gcc.gnu.org" ], "Delivered-To": [ "patchwork-incoming@legolas.ozlabs.org", "gcc-patches@gcc.gnu.org" ], "Authentication-Results": [ "legolas.ozlabs.org;\n\tdkim=pass (1024-bit key;\n unprotected) header.d=linux.alibaba.com header.i=@linux.alibaba.com\n header.a=rsa-sha256 header.s=default header.b=da5npsRI;\n\tdkim-atps=neutral", "legolas.ozlabs.org;\n spf=pass (sender SPF authorized) smtp.mailfrom=gcc.gnu.org\n (client-ip=2620:52:6:3111::32; helo=vm01.sourceware.org;\n envelope-from=gcc-patches-bounces~incoming=patchwork.ozlabs.org@gcc.gnu.org;\n receiver=patchwork.ozlabs.org)", "sourceware.org;\n\tdkim=pass (1024-bit key,\n unprotected) header.d=linux.alibaba.com header.i=@linux.alibaba.com\n header.a=rsa-sha256 header.s=default header.b=da5npsRI", "sourceware.org; dmarc=pass (p=none dis=none)\n header.from=linux.alibaba.com", "sourceware.org;\n spf=pass smtp.mailfrom=linux.alibaba.com", "sourceware.org; arc=none smtp.remote-ip=115.124.30.100" ], "Received": [ "from vm01.sourceware.org (vm01.sourceware.org\n [IPv6:2620:52:6:3111::32])\n\t(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)\n\t key-exchange x25519 server-signature ECDSA (secp384r1) server-digest SHA384)\n\t(No client certificate requested)\n\tby legolas.ozlabs.org (Postfix) with ESMTPS id 4g9XLg5ysmz1yJV\n\tfor <incoming@patchwork.ozlabs.org>; Wed, 06 May 2026 20:54:02 +1000 (AEST)", "from vm01.sourceware.org (localhost [IPv6:::1])\n\tby sourceware.org (Postfix) with ESMTP id 707A54B9DB5F\n\tfor <incoming@patchwork.ozlabs.org>; Wed, 6 May 2026 10:54:00 +0000 (GMT)", "from out30-100.freemail.mail.aliyun.com\n (out30-100.freemail.mail.aliyun.com [115.124.30.100])\n by sourceware.org (Postfix) with ESMTPS id 7C75C4BA2E38\n for <gcc-patches@gcc.gnu.org>; Wed, 6 May 2026 07:04:22 +0000 (GMT)", "from localhost.localdomain(mailfrom:wangyaduo@linux.alibaba.com\n fp:SMTPD_---0X2MLDff_1778051057 cluster:ay36) by smtp.aliyun-inc.com;\n Wed, 06 May 2026 15:04:18 +0800" ], "DKIM-Filter": [ "OpenDKIM Filter v2.11.0 sourceware.org 707A54B9DB5F", "OpenDKIM Filter v2.11.0 sourceware.org 7C75C4BA2E38" ], "DMARC-Filter": "OpenDMARC Filter v1.4.2 sourceware.org 7C75C4BA2E38", "ARC-Filter": "OpenARC Filter v1.0.0 sourceware.org 7C75C4BA2E38", "ARC-Seal": "i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1778051064; cv=none;\n b=R4fNR4E3+OQWZIFsi/JJRDlqnmbhCY3o1IWbc3PdF66eK/HFhOtCPdSpGrRmvOIFEZoYlZS6g8xKYufIjNaWxFxK4ZLbcTaajsEN9sPwkWcXST1rP9SjzDNiKBlHu++7XlvN7Vhn+c70rdR+W9hpFkEjaxvhJanF7sl5CNddctY=", "ARC-Message-Signature": "i=1; a=rsa-sha256; d=sourceware.org; s=key;\n t=1778051064; c=relaxed/simple;\n bh=plLZEEI04XfoRaDzcEufgMsvdL9HeXKjxTU+5r3V0v0=;\n h=DKIM-Signature:From:To:Subject:Date:Message-ID:MIME-Version;\n b=opOqI6MBAVv4txJy+ja7H46KKQWBPlUkk27CCOpeNcVgDea2v8oBuhsQfpsd6ziwg4QCLGFpMMx4b78ZDfOgkSHvXtHCXTZ7WJ7sUcjDqPBHvP+R4Zr+5xACeGnrVVI4kBvg0+0hma49Yj0jTVKAscvxZZU04EsHbmPITtqcKmE=", "ARC-Authentication-Results": "i=1; sourceware.org;\n dkim=pass (1024-bit key, unprotected)\n header.d=linux.alibaba.com header.i=@linux.alibaba.com header.a=rsa-sha256\n header.s=default header.b=da5npsRI", "DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed;\n d=linux.alibaba.com; s=default;\n t=1778051060; h=From:To:Subject:Date:Message-ID:MIME-Version;\n bh=jRqEgO9vLWpPjYX/N2fAY63U1BUs+snC2xwhIuso06c=;\n b=da5npsRIBe/rCu4uIIsORd028/DYnbTEMNDOCyv9zUJwimS/VQ+AklKdfN7t2mJYcOmvvJGnCPZ1MVWykzNn0/PIb+wmd4qFXbkoCGRZKcmNfttDF3hZOIzNQNAoIKQPQ118L9nNt3Yh9bI13RsruPFXlE0t/SY8JuH03cjO3ys=", "X-Alimail-AntiSpam": "AC=PASS; BC=-1|-1; BR=01201311R151e4; CH=green;\n DM=||false|;\n DS=||; FP=0|-1|-1|-1|0|-1|-1|-1; HT=maildocker-contentspam033037026112;\n MF=wangyaduo@linux.alibaba.com; NM=1; PH=DS; RN=9; SR=0;\n TI=SMTPD_---0X2MLDff_1778051057;", "From": "Wang Yaduo <wangyaduo@linux.alibaba.com>", "To": "gcc-patches@gcc.gnu.org", "Cc": "rdapp.gcc@gmail.com, kito.cheng@gmail.com, juzhe.zhong@rivai.ai,\n palmer@dabbelt.com, pan2.li@intel.com, jeffreyalaw@gmail.com,\n chenzhongyao.hit@gmail.com, Wang Yaduo <wangyaduo@linux.alibaba.com>", "Subject": "[PATCH] RISC-V: Add per-type reduction costs to the vector cost model", "Date": "Wed, 6 May 2026 15:04:15 +0800", "Message-ID": "<20260506070415.99154-1-wangyaduo@linux.alibaba.com>", "X-Mailer": "git-send-email 2.54.0", "MIME-Version": "1.0", "Content-Transfer-Encoding": "8bit", "X-BeenThere": "gcc-patches@gcc.gnu.org", "X-Mailman-Version": "2.1.30", "Precedence": "list", "List-Id": "Gcc-patches mailing list <gcc-patches.gcc.gnu.org>", "List-Unsubscribe": "<https://gcc.gnu.org/mailman/options/gcc-patches>,\n <mailto:gcc-patches-request@gcc.gnu.org?subject=unsubscribe>", "List-Archive": "<https://gcc.gnu.org/pipermail/gcc-patches/>", "List-Post": "<mailto:gcc-patches@gcc.gnu.org>", "List-Help": "<mailto:gcc-patches-request@gcc.gnu.org?subject=help>", "List-Subscribe": "<https://gcc.gnu.org/mailman/listinfo/gcc-patches>,\n <mailto:gcc-patches-request@gcc.gnu.org?subject=subscribe>", "Errors-To": "gcc-patches-bounces~incoming=patchwork.ozlabs.org@gcc.gnu.org" }, "content": "Add per-type reduction costs (i8/i16/i32/i64/f16/f32/f64) to the RISC-V\nvector cost model, distinguishing between ordered (fold-left) and\nunordered (tree) floating-point reductions. When a reduction is\ndetected, the per-type cost replaces the default vec_to_scalar_cost,\nsimilar to AArch64. This causes _Float16 n=4 ordered reductions to no\nlonger be vectorized in VLS mode due to the higher cost.\n\ngcc/ChangeLog:\n\n\t* config/riscv/riscv-protos.h (common_vector_cost): Add per-type\n\treduction cost fields: reduc_i8_cost, reduc_i16_cost,\n\treduc_i32_cost, reduc_i64_cost, reduc_f16_cost, reduc_f32_cost,\n\treduc_f64_cost for unordered reductions, and reduc_f16_ordered_cost,\n\treduc_f32_ordered_cost, reduc_f64_ordered_cost for ordered\n\t(fold-left) reductions.\n\t* config/riscv/riscv.cc (rvv_vla_vector_cost): Initialize reduction\n\tcost fields with default values.\n\t(rvv_vls_vector_cost): Likewise.\n\t* config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost): Add\n\treduction detection in the vec_to_scalar case. When a reduction is\n\tdetected, replace the default vec_to_scalar_cost with the\n\tappropriate per-type reduction cost based on element mode and\n\treduction kind (ordered vs unordered).\n\ngcc/testsuite/ChangeLog:\n\n\t* gcc.target/riscv/rvv/autovec/reduc/reduc_cost-1.c: New test for\n\tVLA unordered reduction costs.\n\t* gcc.target/riscv/rvv/autovec/reduc/reduc_cost-2.c: New test for\n\tVLA ordered reduction costs.\n\t* gcc.target/riscv/rvv/autovec/vls/reduc_cost-1.c: New test for\n\tVLS reduction costs.\n\t* gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Update expected\n\tvfredosum count from 9 to 8.\n\t* gcc.target/riscv/rvv/autovec/vls/wred-3.c: Update expected\n\tvfwredosum count from 17 to 16.\n\nSigned-off-by: Wang Yaduo <wangyaduo@linux.alibaba.com>\n---\n gcc/config/riscv/riscv-protos.h | 20 +++++-\n gcc/config/riscv/riscv-vector-costs.cc | 68 ++++++++++++++++++-\n gcc/config/riscv/riscv.cc | 20 ++++++\n .../riscv/rvv/autovec/reduc/reduc_cost-1.c | 34 ++++++++++\n .../riscv/rvv/autovec/reduc/reduc_cost-2.c | 34 ++++++++++\n .../riscv/rvv/autovec/vls/reduc-19.c | 4 +-\n .../riscv/rvv/autovec/vls/reduc_cost-1.c | 41 +++++++++++\n .../gcc.target/riscv/rvv/autovec/vls/wred-3.c | 4 +-\n 8 files changed, 219 insertions(+), 6 deletions(-)\n create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-1.c\n create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-2.c\n create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc_cost-1.c", "diff": "diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h\nindex dd029c704..5da5a6a21 100644\n--- a/gcc/config/riscv/riscv-protos.h\n+++ b/gcc/config/riscv/riscv-protos.h\n@@ -279,6 +279,24 @@ struct common_vector_cost\n \n /* Cost of an unaligned vector store. */\n const int unalign_store_cost;\n+\n+ /* Cost of vector reduction operations (unordered / tree reduction).\n+ Indexed by element type. */\n+ const int reduc_i8_cost;\n+ const int reduc_i16_cost;\n+ const int reduc_i32_cost;\n+ const int reduc_i64_cost;\n+ const int reduc_f16_cost;\n+ const int reduc_f32_cost;\n+ const int reduc_f64_cost;\n+\n+ /* Cost of ordered (fold-left / strict) floating-point reductions.\n+ These are significantly more expensive than unordered (tree) reductions\n+ because RVV ordered reduction instructions (e.g. vfredosum) process\n+ elements sequentially. */\n+ const int reduc_f16_ordered_cost;\n+ const int reduc_f32_ordered_cost;\n+ const int reduc_f64_ordered_cost;\n };\n \n /* scalable vectorization (VLA) specific cost. */\n@@ -289,7 +307,7 @@ struct scalable_vector_cost : common_vector_cost\n {}\n \n /* TODO: We will need more other kinds of vector cost for VLA.\n- E.g. fold_left reduction cost, lanes load/store cost, ..., etc. */\n+ E.g. lanes load/store cost, ..., etc. */\n };\n \n /* Additional costs for register copies. Cost is for one register. */\ndiff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc\nindex f582551eb..a837e4879 100644\n--- a/gcc/config/riscv/riscv-vector-costs.cc\n+++ b/gcc/config/riscv/riscv-vector-costs.cc\n@@ -1253,9 +1253,71 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,\n \t+= (FLOAT_TYPE_P (vectype) ? get_fr2vr_cost () : get_gr2vr_cost ());\n break;\n case vec_to_scalar:\n- stmt_cost\n-\t+= (FLOAT_TYPE_P (vectype) ? get_vr2fr_cost () : get_vr2gr_cost ());\n- break;\n+ {\n+\t/* Detect reduction operations and apply type-specific reduction\n+\t costs. The vec_to_scalar cost kind represents the reduction\n+\t operation itself (e.g. vredsum.vs, vfredosum.vs), so we replace\n+\t the default vec_to_scalar_cost with a more precise per-type cost.\n+\t For floating-point reductions, distinguish between ordered\n+\t (fold-left, e.g. vfredosum) and unordered (tree, e.g. vfredusum)\n+\t reductions since ordered reductions are significantly more\n+\t expensive due to sequential processing. */\n+\tif (stmt_info && vectype && vect_is_reduction (stmt_info))\n+\t {\n+\t const common_vector_cost *common_costs\n+\t = loop && riscv_vla_mode_p (loop->vector_mode)\n+\t\t? costs->vla : costs->vls;\n+\n+\t bool is_ordered = false;\n+\t if (FLOAT_TYPE_P (vectype) && loop && node)\n+\t {\n+\t\tint reduc_type = vect_reduc_type (m_vinfo, node);\n+\t\tis_ordered = (reduc_type == FOLD_LEFT_REDUCTION);\n+\t }\n+\n+\t int reduc_cost = 0;\n+\t switch (GET_MODE_INNER (TYPE_MODE (vectype)))\n+\t {\n+\t case E_QImode:\n+\t\treduc_cost = common_costs->reduc_i8_cost;\n+\t\tbreak;\n+\t case E_HImode:\n+\t\treduc_cost = common_costs->reduc_i16_cost;\n+\t\tbreak;\n+\t case E_SImode:\n+\t\treduc_cost = common_costs->reduc_i32_cost;\n+\t\tbreak;\n+\t case E_DImode:\n+\t\treduc_cost = common_costs->reduc_i64_cost;\n+\t\tbreak;\n+\t case E_HFmode:\n+\t case E_BFmode:\n+\t\treduc_cost = is_ordered\n+\t\t\t ? common_costs->reduc_f16_ordered_cost\n+\t\t\t : common_costs->reduc_f16_cost;\n+\t\tbreak;\n+\t case E_SFmode:\n+\t\treduc_cost = is_ordered\n+\t\t\t ? common_costs->reduc_f32_ordered_cost\n+\t\t\t : common_costs->reduc_f32_cost;\n+\t\tbreak;\n+\t case E_DFmode:\n+\t\treduc_cost = is_ordered\n+\t\t\t ? common_costs->reduc_f64_ordered_cost\n+\t\t\t : common_costs->reduc_f64_cost;\n+\t\tbreak;\n+\t default:\n+\t\tbreak;\n+\t }\n+\n+\t if (reduc_cost)\n+\t stmt_cost = reduc_cost;\n+\t }\n+\n+\tstmt_cost\n+\t += (FLOAT_TYPE_P (vectype) ? get_vr2fr_cost () : get_vr2gr_cost ());\n+\tbreak;\n+ }\n case vector_load:\n case vector_store:\n \t{\ndiff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc\nindex 97272b434..50fa9bd96 100644\n--- a/gcc/config/riscv/riscv.cc\n+++ b/gcc/config/riscv/riscv.cc\n@@ -415,6 +415,16 @@ static const common_vector_cost rvv_vls_vector_cost = {\n 1, /* align_store_cost */\n 2, /* unalign_load_cost */\n 2, /* unalign_store_cost */\n+ 2, /* reduc_i8_cost */\n+ 2, /* reduc_i16_cost */\n+ 2, /* reduc_i32_cost */\n+ 2, /* reduc_i64_cost */\n+ 2, /* reduc_f16_cost */\n+ 2, /* reduc_f32_cost */\n+ 2, /* reduc_f64_cost */\n+ 6, /* reduc_f16_ordered_cost */\n+ 4, /* reduc_f32_ordered_cost */\n+ 2, /* reduc_f64_ordered_cost */\n };\n \n /* RVV costs for VLA vector operations. */\n@@ -438,6 +448,16 @@ static const scalable_vector_cost rvv_vla_vector_cost = {\n 1, /* align_store_cost */\n 2, /* unalign_load_cost */\n 2, /* unalign_store_cost */\n+ 2, /* reduc_i8_cost */\n+ 2, /* reduc_i16_cost */\n+ 2, /* reduc_i32_cost */\n+ 2, /* reduc_i64_cost */\n+ 2, /* reduc_f16_cost */\n+ 2, /* reduc_f32_cost */\n+ 2, /* reduc_f64_cost */\n+ 6, /* reduc_f16_ordered_cost */\n+ 4, /* reduc_f32_ordered_cost */\n+ 2, /* reduc_f64_ordered_cost */\n },\n };\n \ndiff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-1.c\nnew file mode 100644\nindex 000000000..f567e0ce7\n--- /dev/null\n+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-1.c\n@@ -0,0 +1,34 @@\n+/* Verify that the vector cost model handles unordered (tree) reductions\n+ for all integer and floating-point element types (VLA). */\n+/* { dg-do compile } */\n+/* { dg-additional-options \"-march=rv32gcv_zvfh -mabi=ilp32d -mrvv-vector-bits=scalable -ffast-math -fdump-tree-vect-details\" } */\n+\n+#include <stdint-gcc.h>\n+\n+#define DEF_REDUC_PLUS(TYPE)\t\t\t\\\n+TYPE __attribute__ ((noinline, noclone))\t\\\n+reduc_plus_##TYPE (TYPE *restrict a, int n)\t\\\n+{\t\t\t\t\t\t\\\n+ TYPE r = 0;\t\t\t\t\t\\\n+ for (int i = 0; i < n; ++i)\t\t\t\\\n+ r += a[i];\t\t\t\t\t\\\n+ return r;\t\t\t\t\t\\\n+}\n+\n+DEF_REDUC_PLUS (int8_t)\n+DEF_REDUC_PLUS (int16_t)\n+DEF_REDUC_PLUS (int32_t)\n+DEF_REDUC_PLUS (int64_t)\n+DEF_REDUC_PLUS (_Float16)\n+DEF_REDUC_PLUS (float)\n+DEF_REDUC_PLUS (double)\n+\n+/* All loops should be vectorized with the cost model enabled. */\n+/* { dg-final { scan-tree-dump-times \"optimized: loop vectorized\" 7 \"vect\" } } */\n+/* { dg-final { scan-assembler-times {vredsum\\.vs\\s+v[0-9]+,\\s*v[0-9]+,\\s*v[0-9]+} 4 } } */\n+/* { dg-final { scan-assembler-times {vfredusum\\.vs\\s+v[0-9]+,\\s*v[0-9]+,\\s*v[0-9]+} 3 } } */\n+\n+/* Verify the reduction cost is reflected in the cost model dump.\n+ For unordered reductions: reduc_*_cost (2) + vr2gr/vr2fr (2) = 4,\n+ where reduc_*_cost replaces the default vec_to_scalar_cost. */\n+/* { dg-final { scan-tree-dump \"vec_to_scalar costs 4\" \"vect\" } } */\ndiff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-2.c\nnew file mode 100644\nindex 000000000..af9ffbcf5\n--- /dev/null\n+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-2.c\n@@ -0,0 +1,34 @@\n+/* Verify that the vector cost model handles ordered (fold-left / strict)\n+ floating-point reductions for all FP element types (VLA). */\n+/* { dg-do compile } */\n+/* { dg-additional-options \"-march=rv32gcv_zvfh -mabi=ilp32d -mrvv-vector-bits=scalable -fdump-tree-vect-details\" } */\n+\n+#include <stdint-gcc.h>\n+\n+#define DEF_REDUC_PLUS(TYPE)\t\t\t\t\\\n+ TYPE __attribute__ ((noinline, noclone))\t\t\\\n+ reduc_plus_##TYPE (TYPE *restrict a, int n)\t\t\\\n+ {\t\t\t\t\t\t\t\\\n+ TYPE r = 0;\t\t\t\t\t\t\\\n+ for (int i = 0; i < n; ++i)\t\t\t\t\\\n+ r += a[i];\t\t\t\t\t\\\n+ return r;\t\t\t\t\t\t\\\n+ }\n+\n+DEF_REDUC_PLUS (_Float16)\n+DEF_REDUC_PLUS (float)\n+DEF_REDUC_PLUS (double)\n+\n+/* Without -ffast-math, FP reductions use ordered (fold-left) mode.\n+ The cost model should still allow vectorization. */\n+/* { dg-final { scan-assembler {vfredosum\\.vs\\s+v[0-9]+,\\s*v[0-9]+,\\s*v[0-9]+} } } */\n+\n+/* Verify ordered reduction costs are reflected in the cost model dump.\n+ The reduc_f*_ordered_cost replaces the default vec_to_scalar_cost,\n+ plus vr2fr cost (2):\n+ f16: reduc_f16_ordered_cost (6) + vr2fr (2) = 8\n+ f32: reduc_f32_ordered_cost (4) + vr2fr (2) = 6\n+ f64: reduc_f64_ordered_cost (2) + vr2fr (2) = 4 */\n+/* { dg-final { scan-tree-dump \"vec_to_scalar costs 8\" \"vect\" } } */\n+/* { dg-final { scan-tree-dump \"vec_to_scalar costs 6\" \"vect\" } } */\n+/* { dg-final { scan-tree-dump \"vec_to_scalar costs 4\" \"vect\" } } */\ndiff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c\nindex 5a4df4824..3815bbadd 100644\n--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c\n+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c\n@@ -14,7 +14,9 @@ DEF_REDUC_PLUS (_Float16, 512)\n DEF_REDUC_PLUS (_Float16, 1024)\n DEF_REDUC_PLUS (_Float16, 2048)\n \n-/* { dg-final { scan-assembler-times {vfredosum\\.vs} 9 } } */\n+/* The _Float16 n=4 case is not vectorized because the ordered reduction\n+ cost (reduc_f16_ordered_cost) makes it unprofitable for small trip counts. */\n+/* { dg-final { scan-assembler-times {vfredosum\\.vs} 8 } } */\n /* { dg-final { scan-assembler-not {csrr} } } */\n /* { dg-final { scan-tree-dump-not \"1,1\" \"optimized\" } } */\n /* { dg-final { scan-tree-dump-not \"2,2\" \"optimized\" } } */\ndiff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc_cost-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc_cost-1.c\nnew file mode 100644\nindex 000000000..ed62ee230\n--- /dev/null\n+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc_cost-1.c\n@@ -0,0 +1,41 @@\n+/* Verify that the vector cost model handles reductions for all element\n+ types in VLS mode, including both unordered and ordered reductions. */\n+/* { dg-do compile } */\n+/* { dg-options \"-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-vect-details\" } */\n+\n+#include \"def.h\"\n+\n+/* Integer unordered reductions (VLS). */\n+DEF_REDUC_PLUS (int8_t, 4)\n+DEF_REDUC_PLUS (int8_t, 8)\n+DEF_REDUC_PLUS (int16_t, 4)\n+DEF_REDUC_PLUS (int16_t, 8)\n+DEF_REDUC_PLUS (int32_t, 4)\n+DEF_REDUC_PLUS (int32_t, 8)\n+DEF_REDUC_PLUS (int64_t, 4)\n+DEF_REDUC_PLUS (int64_t, 8)\n+\n+/* { dg-final { scan-assembler-times {vredsum\\.vs} 8 } } */\n+\n+/* Floating-point ordered (strict) reductions (VLS).\n+ Without -ffast-math, FP reductions default to ordered. */\n+DEF_REDUC_PLUS (_Float16, 4)\n+DEF_REDUC_PLUS (_Float16, 8)\n+DEF_REDUC_PLUS (float, 4)\n+DEF_REDUC_PLUS (float, 8)\n+DEF_REDUC_PLUS (double, 4)\n+DEF_REDUC_PLUS (double, 8)\n+\n+/* { dg-final { scan-assembler {vfredosum\\.vs} } } */\n+/* { dg-final { scan-assembler-not {csrr} } } */\n+\n+/* Verify reduction costs in the cost model dump.\n+ The reduc_*_cost replaces the default vec_to_scalar_cost,\n+ plus vr2gr/vr2fr cost (2):\n+ Integer unordered: reduc_i*_cost (2) + vr2gr (2) = 4\n+ FP ordered f16: reduc_f16_ordered_cost (6) + vr2fr (2) = 8\n+ FP ordered f32: reduc_f32_ordered_cost (4) + vr2fr (2) = 6\n+ FP ordered f64: reduc_f64_ordered_cost (2) + vr2fr (2) = 4 */\n+/* { dg-final { scan-tree-dump \"vec_to_scalar costs 8\" \"vect\" } } */\n+/* { dg-final { scan-tree-dump \"vec_to_scalar costs 6\" \"vect\" } } */\n+/* { dg-final { scan-tree-dump \"vec_to_scalar costs 4\" \"vect\" } } */\ndiff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c\nindex 6e9456b23..0f08d50a5 100644\n--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c\n+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c\n@@ -3,7 +3,9 @@\n \n #include \"wred-2.c\"\n \n-/* { dg-final { scan-assembler-times {vfwredosum\\.vs} 17 } } */\n+/* The _Float16->float n=4 case is not vectorized because the ordered\n+ reduction cost makes it unprofitable for small trip counts. */\n+/* { dg-final { scan-assembler-times {vfwredosum\\.vs} 16 } } */\n /* { dg-final { scan-assembler-not {csrr} } } */\n /* { dg-final { scan-tree-dump-not \"1,1\" \"optimized\" } } */\n /* { dg-final { scan-tree-dump-not \"2,2\" \"optimized\" } } */\n", "prefixes": [] }