Patch Detail
get:
Show a patch.
patch:
Update a patch.
put:
Update a patch.
GET /api/patches/2216380/?format=api
{ "id": 2216380, "url": "http://patchwork.ozlabs.org/api/patches/2216380/?format=api", "web_url": "http://patchwork.ozlabs.org/project/gcc/patch/20260326105908.3389883-3-konstantinos.eleftheriou@vrull.eu/", "project": { "id": 17, "url": "http://patchwork.ozlabs.org/api/projects/17/?format=api", "name": "GNU Compiler Collection", "link_name": "gcc", "list_id": "gcc-patches.gcc.gnu.org", "list_email": "gcc-patches@gcc.gnu.org", "web_url": null, "scm_url": null, "webscm_url": null, "list_archive_url": "", "list_archive_url_format": "", "commit_url_format": "" }, "msgid": "<20260326105908.3389883-3-konstantinos.eleftheriou@vrull.eu>", "list_archive_url": null, "date": "2026-03-26T10:58:57", "name": "[v3,2/2] forwprop: Match and fold long-multiply patterns [PR107090]", "commit_ref": null, "pull_url": null, "state": "new", "archived": false, "hash": "db6f5ae62b967b43f97ad47912f9456283a22012", "submitter": { "id": 89121, "url": "http://patchwork.ozlabs.org/api/people/89121/?format=api", "name": "Konstantinos Eleftheriou", "email": "konstantinos.eleftheriou@vrull.eu" }, "delegate": null, "mbox": "http://patchwork.ozlabs.org/project/gcc/patch/20260326105908.3389883-3-konstantinos.eleftheriou@vrull.eu/mbox/", "series": [ { "id": 497569, "url": "http://patchwork.ozlabs.org/api/series/497569/?format=api", "web_url": "http://patchwork.ozlabs.org/project/gcc/list/?series=497569", "date": "2026-03-26T10:58:55", "name": "Recognize and fold longhand wide-multiplication idioms [PR107090]", "version": 3, "mbox": "http://patchwork.ozlabs.org/series/497569/mbox/" } ], "comments": "http://patchwork.ozlabs.org/api/patches/2216380/comments/", "check": "pending", "checks": "http://patchwork.ozlabs.org/api/patches/2216380/checks/", "tags": {}, "related": [], "headers": { "Return-Path": "<gcc-patches-bounces~incoming=patchwork.ozlabs.org@gcc.gnu.org>", "X-Original-To": [ "incoming@patchwork.ozlabs.org", "gcc-patches@gcc.gnu.org" ], "Delivered-To": [ "patchwork-incoming@legolas.ozlabs.org", "gcc-patches@gcc.gnu.org" ], "Authentication-Results": [ "legolas.ozlabs.org;\n\tdkim=pass (2048-bit key;\n unprotected) header.d=vrull.eu header.i=@vrull.eu header.a=rsa-sha256\n header.s=google header.b=LVG7dpDu;\n\tdkim-atps=neutral", "legolas.ozlabs.org;\n spf=pass (sender SPF authorized) smtp.mailfrom=gcc.gnu.org\n (client-ip=2620:52:6:3111::32; helo=vm01.sourceware.org;\n envelope-from=gcc-patches-bounces~incoming=patchwork.ozlabs.org@gcc.gnu.org;\n receiver=patchwork.ozlabs.org)", "sourceware.org;\n\tdkim=pass (2048-bit key,\n unprotected) header.d=vrull.eu header.i=@vrull.eu header.a=rsa-sha256\n header.s=google header.b=LVG7dpDu", "sourceware.org;\n dmarc=none (p=none dis=none) header.from=vrull.eu", "sourceware.org; spf=pass smtp.mailfrom=vrull.eu", "server2.sourceware.org;\n arc=none smtp.remote-ip=209.85.128.47" ], "Received": [ "from vm01.sourceware.org (vm01.sourceware.org\n [IPv6:2620:52:6:3111::32])\n\t(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)\n\t key-exchange x25519 server-signature ECDSA (secp384r1) server-digest SHA384)\n\t(No client certificate requested)\n\tby legolas.ozlabs.org (Postfix) with ESMTPS id 4fhLm162FMz1yGD\n\tfor <incoming@patchwork.ozlabs.org>; Thu, 26 Mar 2026 22:15:13 +1100 (AEDT)", "from vm01.sourceware.org (localhost [127.0.0.1])\n\tby sourceware.org (Postfix) with ESMTP id 9414D4BA2E12\n\tfor <incoming@patchwork.ozlabs.org>; Thu, 26 Mar 2026 11:15:11 +0000 (GMT)", "from mail-wm1-f47.google.com (mail-wm1-f47.google.com\n [209.85.128.47])\n by sourceware.org (Postfix) 
with ESMTPS id 95CC54BA2E19\n for <gcc-patches@gcc.gnu.org>; Thu, 26 Mar 2026 11:11:44 +0000 (GMT)", "by mail-wm1-f47.google.com with SMTP id\n 5b1f17b1804b1-4852b81c73aso7471325e9.3\n for <gcc-patches@gcc.gnu.org>; Thu, 26 Mar 2026 04:11:44 -0700 (PDT)", "from altra1.sec.univie.ac.at (altra1.sec.univie.ac.at.\n [131.130.126.103]) by smtp.gmail.com with ESMTPSA id\n 5b1f17b1804b1-487209204d5sm19623915e9.5.2026.03.26.04.11.41\n (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);\n Thu, 26 Mar 2026 04:11:41 -0700 (PDT)" ], "DKIM-Filter": [ "OpenDKIM Filter v2.11.0 sourceware.org 9414D4BA2E12", "OpenDKIM Filter v2.11.0 sourceware.org 95CC54BA2E19" ], "DMARC-Filter": "OpenDMARC Filter v1.4.2 sourceware.org 95CC54BA2E19", "ARC-Filter": "OpenARC Filter v1.0.0 sourceware.org 95CC54BA2E19", "ARC-Seal": "i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1774523505; cv=none;\n b=d0W/FlFESleTf3X6uObtv1uTBEpDe67dpPQWg+QYg42MvvXcYnZom5bgK2/kAxCcPPmF8WHp+BOpYoea9sakfGdodKdNvcxTx26gExhNlobyVAk0Lc4FSxU6ILI2yqVF6aI03Src/eBlINYfSCE80fTL/QFInx0eiSC1UvnA9eY=", "ARC-Message-Signature": "i=1; a=rsa-sha256; d=sourceware.org; s=key;\n t=1774523505; c=relaxed/simple;\n bh=l6fYMcq7uhiSI+7b/sYWukDx5xgu7WApqGwN6yaL1go=;\n h=DKIM-Signature:From:To:Subject:Date:Message-ID:MIME-Version;\n b=Lqvt/BF1lRyXpVQOfN3f6TtOQn5hTucF9SXee/8Xp6xeON9DiO307t6/zL/CicKtHc3zZLTdWI3R3SLuUhWvG/R5sxHwNTL9ym58AmcqyU2W8CaPmtOdadZ9V60BbBaGmDp6x8BfFplPnD4JlfkdxLVO60nSv9Z+18ydBerPboA=", "ARC-Authentication-Results": "i=1; server2.sourceware.org", "DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed;\n d=vrull.eu; s=google; t=1774523503; x=1775128303; darn=gcc.gnu.org;\n h=content-transfer-encoding:mime-version:references:in-reply-to\n :message-id:date:subject:cc:to:from:from:to:cc:subject:date\n :message-id:reply-to;\n bh=POuVJoVfJIrYF4yjbCCBX0dpRkGKozNcnRSdS/U9+3I=;\n b=LVG7dpDujlpFQVREQ0FPBZ+eQc1CyeLji8Dp7a+tJxTSV7iogSgGePzzcCvc54tF3g\n KeO24I+JXZDWJSFPY65RlX3R+Q36XOfBINL+DzOWoQFmpQaO7vi6YvzlXb++19jf/07j\n YJ/FA3FIXm44tiPI4l5w6W2LGoTbcxoVo1RhTCdfl1SpvLs0FgM8g1y81gOTDovldkZj\n b+Y91Hry2xUQuEmXZOwW68PdzNlKWq7hKJO+QHLOrTMxrfcD5mW/kUERfDCdZKGJp9w1\n JgqGBI5lgy8UKqAIpJTA7hMB9KD65qM1Ej8oYCvRBNWa9rH4rOaLUCiz3DbH3/dTQBQa\n ZApA==", "X-Google-DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed;\n d=1e100.net; s=20251104; t=1774523503; x=1775128303;\n h=content-transfer-encoding:mime-version:references:in-reply-to\n :message-id:date:subject:cc:to:from:x-gm-gg:x-gm-message-state:from\n :to:cc:subject:date:message-id:reply-to;\n bh=POuVJoVfJIrYF4yjbCCBX0dpRkGKozNcnRSdS/U9+3I=;\n b=DJWQo1CrJcMtSh+l/2jz2OL62qUH5GUk7sNJJ/y05pYTUzpTLn7wmalkdYiHNUyK6T\n ttlqqkhODOUbEUGtL5+rl7wFfA+oL++vSbxvLleumfv73anLztxEfEN+pgHchZ1K74Pw\n KVz3pwcA2+t2JVJ/ojnVXOKG2dcGSNLPKzN1aMzf6MSvIEpFnK3zZVlL6NGmuhbt5irZ\n VOYCdAjnHxrL/ZF7wloZzaAqLw3vsUar5lfIQM2FI30TeEVWX9zBuewreydFH3Pw5elR\n 1inpA61y1AS5h54MsmdHXGbin/WtNSOGDvCLbF170Ftv/zMeccEOfDAhQ0V6sfgmUa9n\n HR+Q==", "X-Gm-Message-State": "AOJu0Yzaq2pSJtbgYg2PXGlku6juVt/mNaHmO7+PDLn25TsfN1E6Vpd7\n iWAyaaXsBTPahboTsxUrWLhR3t1O94qnbWXRFnuQe16ffA2JcfqORIDAWgYaIv6p4n6PZhamNWK\n jHAaObHM=", "X-Gm-Gg": "ATEYQzwKRGDqKcSZIfWdNoKiTnA9E4AmT485NXSNHs3yDa5r0lpnZrKvUyY34K0QGry\n 8cK4v8YHHKPgZvY9QwHp3TzwlPvMEw0OWTjbh75HaM0xHCjc8wzsMHhc458TtiQ/WOx2kOY34Jq\n /tqPF6HJXglh2idHFLNTRrnzGyOOcnDVvk1cTYMOHstMp/GDqKJc14ku6SBqB++EKwsES85lN4z\n ONQQTatV2JAw4CfY3cx694pUv+6LQTl5wp9HU9mF9xsZrcc5XgXlCAZ4XlvklMNHEuPeGjiJV/+\n fnbip4JCop7JjgQWBSgyYifR76Ky9cF5O7eSi4gITLCr4TJOchIlkvGuqESy4CmTpGIvIBorzbt\n 
kztFwnpRjvcSBbHPBmAMI+0hNa5RDTwe+ijL+hdOjuQm7sW92xhGZw2AzMML70YIt6Mh5BH9+hq\n 4xfZObY8W12i88FQaJjlCrZPX4ugL4J+4CVYzXHuD9Z80lL/u1GVoAoYoeizoB1XYJDwuRDD8zx\n 4DBy77C3w==", "X-Received": "by 2002:a05:600c:1f13:b0:47e:e952:86c9 with SMTP id\n 5b1f17b1804b1-48715f03a9dmr109053875e9.0.1774523501993;\n Thu, 26 Mar 2026 04:11:41 -0700 (PDT)", "From": "Konstantinos Eleftheriou <konstantinos.eleftheriou@vrull.eu>", "To": "gcc-patches@gcc.gnu.org", "Cc": "Andrew Pinski <andrew.pinski@oss.qualcomm.com>,\n Philipp Tomsich <philipp.tomsich@vrull.eu>,\n Konstantinos Eleftheriou <konstantinos.eleftheriou@vrull.eu>", "Subject": "[PATCH v3 2/2] forwprop: Match and fold long-multiply patterns\n [PR107090]", "Date": "Thu, 26 Mar 2026 03:58:57 -0700", "Message-ID": "<20260326105908.3389883-3-konstantinos.eleftheriou@vrull.eu>", "X-Mailer": "git-send-email 2.52.0", "In-Reply-To": "<20260326105908.3389883-1-konstantinos.eleftheriou@vrull.eu>", "References": "<20260326105908.3389883-1-konstantinos.eleftheriou@vrull.eu>", "MIME-Version": "1.0", "Content-Transfer-Encoding": "8bit", "X-BeenThere": "gcc-patches@gcc.gnu.org", "X-Mailman-Version": "2.1.30", "Precedence": "list", "List-Id": "Gcc-patches mailing list <gcc-patches.gcc.gnu.org>", "List-Unsubscribe": "<https://gcc.gnu.org/mailman/options/gcc-patches>,\n <mailto:gcc-patches-request@gcc.gnu.org?subject=unsubscribe>", "List-Archive": "<https://gcc.gnu.org/pipermail/gcc-patches/>", "List-Post": "<mailto:gcc-patches@gcc.gnu.org>", "List-Help": "<mailto:gcc-patches-request@gcc.gnu.org?subject=help>", "List-Subscribe": "<https://gcc.gnu.org/mailman/listinfo/gcc-patches>,\n <mailto:gcc-patches-request@gcc.gnu.org?subject=subscribe>", "Errors-To": "gcc-patches-bounces~incoming=patchwork.ozlabs.org@gcc.gnu.org" }, "content": "Recognize longhand wide-multiplication idioms and fold them into\nMULT_HIGHPART_EXPR (or a widening multiply followed by a right shift)\nfor the high part, and a plain MULT_EXPR for the low part.\n\nBased on LLVM's approach:\nhttps://github.com/llvm/llvm-project/pull/168396\n\nThe decomposed patterns differ in how they propagate the carry from\nthe cross-product addition:\n\n - carry: single overflow comparison on the cross-sum\n - carry-long: cross-carry with separate high/low accumulation\n - two-carry: both cross-carry and low-carry as separate comparisons\n - ladder: sequential accumulation without explicit carry comparison\n - ladder-long: ladder with separate high/low accumulation\n - low-plus: low part as a direct sum of partial products\n\nPattern recognition is implemented via match patterns in match.pd,\nwhile the actual transformations (replacing the longhand sequence with\na single multiply instruction) are performed in forwprop. 
The\ntarget's support for umul_highpart or widening multiply is verified\nbefore folding.\n\nFor example, this AArch64 sequence from SPEC2026's 750.sealcrypto_r:\n\n lsr x4, x0, 32\n lsr x6, x1, 32\n umull x3, w0, w1\n umull x1, w1, w4\n and x5, x3, 4294967295\n umaddl x0, w0, w6, x1\n cmp x1, x0\n and x1, x0, 4294967295\n lsr x0, x0, 32\n add x3, x1, x3, lsr 32\n cset x1, hi\n orr x5, x5, x3, lsl 32\n umaddl x4, w4, w6, x0\n extr x0, x1, x3, 32\n add x0, x0, x4\n stp x5, x0, [x2]\n\ncan be transformed to:\n\n umulh x3, x1, x0\n mul x0, x0, x1\n stp x0, x3, [x2]\n\nSealcrypto shows a 25% improvement on Neoverse-N1 and 59% on Zen4.\n\nBootstrapped/regtested on AArch64 and x86-64.\n\n\tPR tree-optimization/107090\n\ngcc/ChangeLog:\n\n\tPR tree-optimization/107090\n\t* match.pd: Add match patterns for long-multiply recognition.\n\t* tree-ssa-forwprop.cc: Include langhooks.h.\n\t(create_mul_high_seq): New function.\n\t(create_mul_low_seq): New function.\n\t(check_matched_ops): New function.\n\t(check_hilo_and_ops): New function.\n\t(get_def_rhs_code): New function.\n\t(check_ssa_name_addition): New function.\n\t(check_ssa_name_hipart_common_exprs): New function.\n\t(check_stmt_common_exprs): New function.\n\t(fold_mul_high_carry): New function.\n\t(fold_mul_high_carry_long): New function.\n\t(fold_mul_high_two_carries): New function.\n\t(fold_mul_high_ladder): New function.\n\t(fold_mul_high_ladder_long): New function.\n\t(fold_mul_low_carry_long): New function.\n\t(fold_mul_low_ladder): New function.\n\t(fold_mul_hi): New function.\n\t(fold_mul_low_plus): New function.\n\t(fold_mul_low): New function.\n\t(pass_forwprop::execute): Call fold_mul_hi and fold_mul_low\n\tfor PLUS_EXPR and BIT_IOR_EXPR statements.\n\ngcc/testsuite/ChangeLog:\n\n\t* gcc.dg/tree-ssa/long-mul-carry.c: New test.\n\t* gcc.dg/tree-ssa/long-mul-ladder.c: New test.\n\t* gcc.dg/tree-ssa/long-mul-low-plus.c: New test.\n\t* gcc.dg/tree-ssa/long-mul-two-carry.c: New test.\n\t* gcc.dg/tree-ssa/long-mul-partial.c: New test.\n\t* gcc.dg/tree-ssa/long-mul-boundary.c: New test.\n\t* gcc.dg/tree-ssa/long-mul-boundary-64.c: New test.\n\t* gcc.target/aarch64/long_mul.c: New test.\n\t* gcc.target/i386/long_mul.c: New test.\n\nCo-authored-by: Philipp Tomsich <philipp.tomsich@vrull.eu>\n\nSigned-off-by: Konstantinos Eleftheriou <konstantinos.eleftheriou@vrull.eu>\n---\n\nChanges in v3:\n- Moved carry-diamond flattening from forwprop to match.pd,\nreplacing ~460 lines of C++ with a 17-line match.pd pattern.\n- Two-carry test scans forwprop3 (the first forwprop after phiopt2,\nsince early phiopt restricts which tree codes are allowed).\n- Set location for new sequences.\n- Updated mul_carry_low pattern.\n- Added the `mul_low_plus` pattern.\n- Fixed formatting issues.\n\nChanges in v2:\n- Fixed the testcases by separating the high part's fold count for\n32-bit and 64-bit targets.\n\n gcc/match.pd | 230 +++++\n .../gcc.dg/tree-ssa/long-mul-boundary-64.c | 274 ++++++\n .../gcc.dg/tree-ssa/long-mul-boundary.c | 270 ++++++\n .../gcc.dg/tree-ssa/long-mul-carry.c | 311 +++++++\n .../gcc.dg/tree-ssa/long-mul-ladder.c | 329 +++++++\n .../gcc.dg/tree-ssa/long-mul-low-plus.c | 54 ++\n .../gcc.dg/tree-ssa/long-mul-partial.c | 119 +++\n .../gcc.dg/tree-ssa/long-mul-two-carry.c | 111 +++\n gcc/testsuite/gcc.target/aarch64/long_mul.c | 100 ++\n gcc/testsuite/gcc.target/i386/long_mul.c | 100 ++\n gcc/tree-ssa-forwprop.cc | 871 +++++++++++++++++-\n 11 files changed, 2764 insertions(+), 5 deletions(-)\n create mode 100644 
gcc/testsuite/gcc.dg/tree-ssa/long-mul-boundary-64.c\n create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/long-mul-boundary.c\n create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/long-mul-carry.c\n create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/long-mul-ladder.c\n create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/long-mul-low-plus.c\n create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/long-mul-partial.c\n create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/long-mul-two-carry.c\n create mode 100644 gcc/testsuite/gcc.target/aarch64/long_mul.c\n create mode 100644 gcc/testsuite/gcc.target/i386/long_mul.c", "diff": "diff --git a/gcc/match.pd b/gcc/match.pd\nindex 59c3a9f2fe69..00ac6a4977ba 100644\n--- a/gcc/match.pd\n+++ b/gcc/match.pd\n@@ -11978,6 +11978,236 @@ and,\n && compare_tree_int (@c16, 16) == 0\n && compare_tree_int (@c32, 32) == 0)))\n \n+/* Match low and high parts of longhand multiplication.\n+ Given a 2N-bit unsigned type, x = xh*2^N + xl and y = yh*2^N + yl,\n+ where xh, xl, yh, yl are N-bit halves extracted via shifts and masks. */\n+\n+/* High half: op >> N. */\n+(match (mul_hi @op @0)\n+ (rshift @op INTEGER_CST@0)\n+ (with {\n+ tree op_type = TREE_TYPE (@op); }\n+ (if (INTEGRAL_TYPE_P (op_type)\n+ && TYPE_UNSIGNED (op_type)\n+ && TYPE_PRECISION (op_type) % 2 == 0\n+ && tree_fits_uhwi_p (@0)\n+ && tree_to_uhwi (@0) == TYPE_PRECISION (op_type) / 2))))\n+/* Low half: op & mask. */\n+(match (mul_lo @op @0)\n+ (bit_and @op INTEGER_CST@0)\n+ (with {\n+ tree op_type = TREE_TYPE (@op); }\n+ (if (INTEGRAL_TYPE_P (op_type)\n+ && TYPE_UNSIGNED (op_type)\n+ && TYPE_PRECISION (op_type) % 2 == 0\n+ && tree_fits_uhwi_p (@0)\n+ && tree_to_uhwi (@0) == wi::mask (\n+\t\t\t\t TYPE_PRECISION (op_type) / 2,\n+\t\t\t\t false,\n+\t\t\t\t TYPE_PRECISION (op_type))))))\n+/* Cross product: high(op0) * low(op1). */\n+(match (mul_hilo @op0 @op1 @0 @1)\n+ (mult:c\n+ (mul_hi @op0 INTEGER_CST@0)\n+ (mul_lo @op1 INTEGER_CST@1)))\n+/* Low-low product: low(op0) * low(op1). */\n+(match (mul_lolo @op0 @op1 @0)\n+ (mult:c\n+ (mul_lo @op0 INTEGER_CST@0)\n+ (mul_lo @op1 INTEGER_CST@0)))\n+/* High-high product: high(op0) * high(op1). */\n+(match (mul_hihi @op0 @op1 @0)\n+ (mult:c\n+ (mul_hi @op0 INTEGER_CST@0)\n+ (mul_hi @op1 INTEGER_CST@0)))\n+/* Cross sum: xh*yl + xl*yh.\n+ Note: matches any PLUS; operands are validated as actual cross\n+ products by the forwprop consumer (check_hilo_and_ops). */\n+(match (mul_cross_sum @mul_hilo0 @mul_hilo1)\n+ (plus:c @mul_hilo0 @mul_hilo1))\n+/* Low sum: cross_sum + (xl*yl >> N). */\n+(match (mul_low_sum @op0 @op1 @mul_hilo0 @mul_hilo1 @0 @1)\n+ (plus:c\n+ (mul_cross_sum @mul_hilo0 @mul_hilo1)\n+ (mul_hi\n+ (mul_lolo @op0 @op1 INTEGER_CST@1)\n+ INTEGER_CST@0)))\n+/* Carry from low-sum overflow: (cast?) (hilo > low_sum) << N.\n+ No explicit type/width guard needed: mul_low_sum delegates to\n+ mul_cross_sum + mul_hi + mul_lolo, which provide deep structural\n+ constraints, and @0 ties the shift amount to the inner constants. */\n+(match (mul_carry_low_sum @op0 @op1 @mul_hilo0 @mul_hilo1 @mul_hilo2 @0 @1)\n+ (lshift\n+ (convert? (gt\n+ @mul_hilo0\n+ (mul_low_sum @op0 @op1 @mul_hilo1 @mul_hilo2 INTEGER_CST@0\n+ INTEGER_CST@1)))\n+ INTEGER_CST@0))\n+/* Carry from cross-sum overflow: (cast?) (hilo > cross_sum) << N.\n+ Explicit guard required because mul_cross_sum is just (plus:c @0 @1)\n+ with no inherent type or halfwidth constraint. */\n+(match (mul_carry_cross_sum @mul_hilo0 @mul_hilo1 @mul_hilo2 @0)\n+ (lshift\n+ (convert? 
(gt\n+ @mul_hilo0\n+ (mul_cross_sum @mul_hilo1 @mul_hilo2)))\n+ INTEGER_CST@0)\n+ (with {\n+ tree op_type = TREE_TYPE (@mul_hilo0); }\n+ (if (INTEGRAL_TYPE_P (op_type)\n+ && TYPE_UNSIGNED (op_type)\n+ && TYPE_PRECISION (op_type) % 2 == 0\n+ && tree_fits_uhwi_p (@0)\n+ && tree_to_uhwi (@0) == TYPE_PRECISION (op_type) / 2))))\n+/* Carry from addition overflow: (cast?) (a > a + b).\n+ :c on gt also matches the LT form: (cast?) (a + b < a). */\n+(match (mul_carry_low @0 @1)\n+ (convert?\n+ (gt:c @0 (plus:c @1 @0)))\n+ (with { tree op_type = TREE_TYPE (@0); }\n+ (if (INTEGRAL_TYPE_P (op_type) && TYPE_UNSIGNED (op_type)))))\n+/* High-part sum (short form): xh*yh + addend + addend.\n+ Multiple variants cover different operand orderings. */\n+(match (mul_hipart @op0 @op1 @0 @1 @2)\n+ (plus:c\n+ (plus:c @0 @1)\n+ (mul_hihi @op0 @op1 INTEGER_CST@2)))\n+(match (mul_hipart @op0 @op1 @0 @1 @2)\n+ (plus:c\n+ (plus:c\n+ (mul_hihi @op0 @op1 INTEGER_CST@2)\n+ @0)\n+ @1))\n+/* High-part sum (long form): xh*yh + addend + addend + addend.\n+ Multiple variants cover different operand orderings. */\n+(match (mul_hipart_long @op0 @op1 @0 @1 @2 @3)\n+ (plus:c\n+ (plus:c\n+ @0\n+ (plus:c\n+\t(mul_hihi @op0 @op1 INTEGER_CST@3)\n+\t@1))\n+ @2))\n+(match (mul_hipart_long @op0 @op1 @0 @1 @2 @3)\n+ (plus:c\n+ (mul_hihi @op0 @op1 INTEGER_CST@3)\n+ (plus:c\n+ @0\n+ (plus:c @1 @2))))\n+(match (mul_hipart_long @op0 @op1 @0 @1 @2 @3)\n+ (plus:c\n+ @0\n+ (plus:c\n+ (mul_hihi @op0 @op1 INTEGER_CST@3)\n+ (plus:c @1 @2))))\n+(match (mul_hipart_long @op0 @op1 @0 @1 @2 @3)\n+ (plus:c\n+ (plus:c\n+ (mul_hihi @op0 @op1 INTEGER_CST@3)\n+ @0)\n+ (plus:c @1 @2)))\n+/* Low accumulate: (xl*yl >> N) + (cross_sum & mask). */\n+(match (mul_low_accum @op0 @op1 @mul_hilo0 @mul_hilo1 @0 @1)\n+ (plus:c\n+ (mul_hi\n+ (mul_lolo @op0 @op1 INTEGER_CST@0)\n+ INTEGER_CST@1)\n+ (mul_lo (mul_cross_sum @mul_hilo0 @mul_hilo1) INTEGER_CST@0)))\n+/* Ladder sum form 1: (hilo0 & mask) + hilo1 + (xl*yl >> N).\n+ First variant: @mul_hilo1 is inside the inner plus:c alongside\n+ (mul_lo ...). */\n+(match (mul_ladder_sum1 @op0 @op1 @mul_hilo0 @mul_hilo1 @0 @1)\n+ (plus:c\n+ (plus:c\n+ (mul_lo\n+\t@mul_hilo0\n+\tINTEGER_CST@0)\n+ @mul_hilo1)\n+ (mul_hi (mul_lolo @op0 @op1 INTEGER_CST@0) INTEGER_CST@1)))\n+/* Second variant: @mul_hilo1 is the outermost addend and could match\n+ anything, so guard that its definition is a MULT_EXPR. */\n+(match (mul_ladder_sum1 @op0 @op1 @mul_hilo0 @mul_hilo1 @0 @1)\n+ (plus:c\n+ (plus:c\n+ (mul_lo @mul_hilo0 INTEGER_CST@0)\n+ (mul_hi (mul_lolo @op0 @op1 INTEGER_CST@0) INTEGER_CST@1))\n+ @mul_hilo1)\n+ (with {\n+ tree_code mul_hilo_code = TREE_CODE (@mul_hilo1);\n+ tree_code rhs_code = ERROR_MARK;\n+ if (mul_hilo_code == SSA_NAME)\n+ {\n+\tgimple *def = SSA_NAME_DEF_STMT (@mul_hilo1);\n+\tif (def && gimple_code (def) == GIMPLE_ASSIGN)\n+\t rhs_code = gimple_assign_rhs_code (def);\n+ } }\n+ (if (rhs_code == MULT_EXPR))))\n+/* Partial ladder sum: (xl*yl >> N) + hilo. */\n+(match (mul_ladder_part_sum @op0 @op1 @mul_hilo0 @0 @1)\n+ (plus:c\n+ (mul_hi (mul_lolo @op0 @op1 INTEGER_CST@0) INTEGER_CST@1)\n+ @mul_hilo0))\n+/* Ladder sum form 2: (ladder_part_sum & mask) + hilo. */\n+(match (mul_ladder_sum2 @op0 @op1 @mul_hilo0 @mul_hilo1 @0 @1)\n+ (plus:c\n+ (mul_lo\n+ (mul_ladder_part_sum @op0 @op1 @mul_hilo0 INTEGER_CST@0 INTEGER_CST@1)\n+ INTEGER_CST@0)\n+ @mul_hilo1))\n+/* Ladder sum form 3: (hilo0 & mask) + (hilo1 & mask) + (xl*yl >> N). 
*/\n+(match (mul_ladder_sum3 @op0 @op1 @mul_hilo0 @mul_hilo1 @0 @1)\n+ (plus:c\n+ (plus:c\n+ (mul_lo @mul_hilo0 INTEGER_CST@0)\n+ (mul_lo @mul_hilo1 INTEGER_CST@0))\n+ (mul_hi (mul_lolo @op0 @op1 INTEGER_CST@0) INTEGER_CST@1)))\n+(match (mul_ladder_sum3 @op0 @op1 @mul_hilo0 @mul_hilo1 @0 @1)\n+ (plus:c\n+ (plus:c\n+ (mul_lo @mul_hilo0 INTEGER_CST@0)\n+ (mul_hi (mul_lolo @op0 @op1 INTEGER_CST@0) INTEGER_CST@1))\n+ (mul_lo @mul_hilo1 INTEGER_CST@0)))\n+/* Low part: (ladder_sum << N) | (xl*yl & mask).\n+ Matches all three ladder-sum forms. */\n+(for mul_ladder_sum (mul_ladder_sum1 mul_ladder_sum2 mul_ladder_sum3)\n+ (match (mul_low_part @op0 @op1 @mul_hilo0 @mul_hilo1 @0 @1)\n+ (bit_ior:c\n+ (lshift\n+\t(mul_ladder_sum @op0 @op1 @mul_hilo0 @mul_hilo1 INTEGER_CST@0\n+\t INTEGER_CST@1)\n+\tINTEGER_CST@1)\n+ (mul_lo\n+\t(mul_lolo @op0 @op1 INTEGER_CST@0)\n+\tINTEGER_CST@0))))\n+/* lolo + (cross_sum << N): low part as plain PLUS_EXPR. */\n+(match (mul_low_plus @op0 @op1 @mul_hilo0 @mul_hilo1 @0 @1)\n+ (plus:c\n+ (mul_lolo @op0 @op1 INTEGER_CST@0)\n+ (lshift\n+ (mul_cross_sum @mul_hilo0 @mul_hilo1)\n+ INTEGER_CST@1))\n+ (with {\n+ tree op_type = TREE_TYPE (@op0); }\n+ (if (INTEGRAL_TYPE_P (op_type)\n+ && TYPE_UNSIGNED (op_type)\n+ && TYPE_PRECISION (op_type) % 2 == 0\n+ && tree_fits_uhwi_p (@1)\n+ && tree_to_uhwi (@1) == TYPE_PRECISION (op_type) / 2))))\n+/* Low part (long form): (sum << N) | (xl*yl & mask). */\n+(match (mul_low_part_long @op0 @op1 @mul_sum @0 @1)\n+ (bit_ior:c\n+ (lshift @mul_sum INTEGER_CST@1)\n+ (mul_lo\n+ (mul_lolo @op0 @op1 INTEGER_CST@0)\n+ INTEGER_CST@0))\n+ (with {\n+ tree op_type = TREE_TYPE (@op0); }\n+ (if (INTEGRAL_TYPE_P (op_type)\n+ && TYPE_UNSIGNED (op_type)\n+ && TYPE_PRECISION (op_type) % 2 == 0\n+ && tree_fits_uhwi_p (@1)\n+ && tree_to_uhwi (@1) == TYPE_PRECISION (op_type) / 2))))\n+\n /* Floatint point/integer comparison and integer->integer\n or floating point -> float point conversion. */\n (match (cond_expr_convert_p @0 @2 @3 @6)\ndiff --git a/gcc/testsuite/gcc.dg/tree-ssa/long-mul-boundary-64.c b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-boundary-64.c\nnew file mode 100644\nindex 000000000000..a0576ad48a8e\n--- /dev/null\n+++ b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-boundary-64.c\n@@ -0,0 +1,274 @@\n+/* { dg-do run } */\n+/* { dg-require-effective-target int128 } */\n+/* { dg-options \"-O3\" } */\n+\n+typedef __UINT64_TYPE__ uint64_t;\n+typedef unsigned __int128 uint128_t;\n+\n+/* Reference: high part of 64x64 -> 128 multiply. */\n+__attribute__((noipa))\n+uint64_t mulh_ref (uint64_t x, uint64_t y)\n+{\n+ return (uint64_t)(((uint128_t)x * y) >> 64);\n+}\n+\n+/* Carry pattern for high part. */\n+__attribute__((noipa))\n+uint64_t mulh_carry (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_hi = x >> 32;\n+ uint64_t x_lo = x & 0xFFFFFFFFUL;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo = y & 0xFFFFFFFFUL;\n+ uint64_t mulhilo = x_hi * y_lo;\n+ uint64_t mullohi = x_lo * y_hi;\n+ uint64_t cross_sum = mulhilo + mullohi;\n+ uint64_t mullolo = x_lo * y_lo;\n+ uint64_t shrlolo = mullolo >> 32;\n+ uint64_t add_cross_sum = cross_sum + shrlolo;\n+ int carry = add_cross_sum < mulhilo;\n+ uint64_t cond = ((uint64_t) carry << 32) + x_hi * y_hi;\n+ uint64_t add = cond + (add_cross_sum >> 32);\n+\n+ return add;\n+}\n+\n+/* Ladder pattern for high part. 
*/\n+__attribute__((noipa))\n+uint64_t mulh_ladder (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_lo = x & 0xFFFFFFFFUL;\n+ uint64_t y_lo = y & 0xFFFFFFFFUL;\n+ uint64_t x_hi = x >> 32;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t t0 = y_lo * x_lo;\n+ uint64_t t1 = y_lo * x_hi;\n+ uint64_t t2 = y_hi * x_lo;\n+ uint64_t t3 = y_hi * x_hi;\n+ uint64_t t0_hi = t0 >> 32;\n+ uint64_t u0 = t0_hi + t1;\n+ uint64_t u0_lo = u0 & 0xFFFFFFFFUL;\n+ uint64_t u0_hi = u0 >> 32;\n+ uint64_t u1 = u0_lo + t2;\n+ uint64_t u1_hi = u1 >> 32;\n+ uint64_t u2 = u0_hi + t3;\n+ uint64_t hw = u2 + u1_hi;\n+\n+ return hw;\n+}\n+\n+/* Ladder-long full multiply (both high and low parts). */\n+__attribute__((noipa))\n+void full_mul (uint64_t x, uint64_t y, uint64_t *p)\n+{\n+ uint64_t xl = x & 0xFFFFFFFFUL;\n+ uint64_t xh = x >> 32;\n+ uint64_t yl = y & 0xFFFFFFFFUL;\n+ uint64_t yh = y >> 32;\n+ uint64_t mulll = xl * yl;\n+ uint64_t mullh = xl * yh;\n+ uint64_t mulhl = xh * yl;\n+ uint64_t mulhh = xh * yh;\n+ uint64_t shr8 = mulll >> 32;\n+ uint64_t conv10 = mullh & 0xFFFFFFFFUL;\n+ uint64_t add = shr8 + conv10;\n+ uint64_t conv12 = mulhl & 0xFFFFFFFFUL;\n+ uint64_t add13 = add + conv12;\n+ uint64_t shr14 = add13 >> 32;\n+ uint64_t shr15 = mullh >> 32;\n+ uint64_t add16 = mulhh + shr15;\n+ uint64_t shr17 = mulhl >> 32;\n+ uint64_t add18 = add16 + shr17;\n+ uint64_t add19 = add18 + shr14;\n+ p[1] = add19;\n+ uint64_t add_13_shl = add13 << 32;\n+ uint64_t and17 = mulll & 0xFFFFFFFFUL;\n+ uint64_t or_val = add_13_shl | and17;\n+ p[0] = or_val;\n+}\n+\n+/* Two-carry pattern for high part. */\n+__attribute__((noipa))\n+uint64_t mulh_two_carry (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_hi = x >> 32;\n+ uint64_t x_lo = x & 0xFFFFFFFFUL;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo = y & 0xFFFFFFFFUL;\n+\n+ uint64_t lolo = x_lo * y_lo;\n+ uint64_t hilo = x_hi * y_lo;\n+ uint64_t lohi = x_lo * y_hi;\n+ uint64_t hihi = x_hi * y_hi;\n+\n+ uint64_t cross_sum = hilo + lohi;\n+ uint64_t cross_carry = (uint64_t)(cross_sum < hilo) << 32;\n+\n+ uint64_t cross_shifted = cross_sum << 32;\n+ uint64_t low_result = lolo + cross_shifted;\n+ uint64_t low_carry = (uint64_t)(low_result < cross_shifted);\n+\n+ uint64_t high = hihi + (cross_sum >> 32) + cross_carry + low_carry;\n+\n+ return high;\n+}\n+\n+/* Two-carry full multiply (both high and low parts). */\n+__attribute__((noipa))\n+void full_mul_two_carry (uint64_t x, uint64_t y, uint64_t *p)\n+{\n+ uint64_t x_hi = x >> 32;\n+ uint64_t x_lo = x & 0xFFFFFFFFUL;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo = y & 0xFFFFFFFFUL;\n+\n+ uint64_t lolo = x_lo * y_lo;\n+ uint64_t hilo = x_hi * y_lo;\n+ uint64_t lohi = x_lo * y_hi;\n+ uint64_t hihi = x_hi * y_hi;\n+\n+ uint64_t cross_sum = hilo + lohi;\n+ uint64_t cross_carry = (uint64_t)(cross_sum < hilo) << 32;\n+\n+ uint64_t cross_shifted = cross_sum << 32;\n+ uint64_t low_result = lolo + cross_shifted;\n+ uint64_t low_carry = (uint64_t)(low_result < cross_shifted);\n+\n+ uint64_t high = hihi + (cross_sum >> 32) + cross_carry + low_carry;\n+\n+ p[0] = low_result;\n+ p[1] = high;\n+}\n+\n+/* Carry-long pattern for high part. 
*/\n+__attribute__((noipa))\n+uint64_t mulh_carry_long (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_lo = x & 0xFFFFFFFFUL;\n+ uint64_t x_hi = x >> 32;\n+ uint64_t y_lo = y & 0xFFFFFFFFUL;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo_x_hi = y_lo * x_hi;\n+ uint64_t y_hi_x_hi = y_hi * x_hi;\n+ uint64_t y_hi_x_lo = y_hi * x_lo;\n+ uint64_t y_lo_x_lo = y_lo * x_lo;\n+ uint64_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = (cross_sum < y_lo_x_hi);\n+ uint64_t carry = (uint64_t) carry_out << 32;\n+ uint64_t y_lo_x_lo_hi = y_lo_x_lo >> 32;\n+ uint64_t cross_sum_lo = cross_sum & 0xFFFFFFFFUL;\n+ uint64_t cross_sum_hi = cross_sum >> 32;\n+ uint64_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ uint64_t interm = cross_sum_hi + y_hi_x_hi;\n+ uint64_t low_accum_hi = low_accum >> 32;\n+ uint64_t interm_plus_carry = interm + carry;\n+ return interm_plus_carry + low_accum_hi;\n+}\n+\n+/* Carry-long full multiply (both high and low parts). */\n+__attribute__((noipa))\n+void full_mul_carry_long (uint64_t x, uint64_t y, uint64_t *p)\n+{\n+ uint64_t x_lo = x & 0xFFFFFFFFUL;\n+ uint64_t x_hi = x >> 32;\n+ uint64_t y_lo = y & 0xFFFFFFFFUL;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo_x_hi = y_lo * x_hi;\n+ uint64_t y_hi_x_hi = y_hi * x_hi;\n+ uint64_t y_hi_x_lo = y_hi * x_lo;\n+ uint64_t y_lo_x_lo = y_lo * x_lo;\n+ uint64_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = (cross_sum < y_lo_x_hi);\n+ uint64_t carry = (uint64_t) carry_out << 32;\n+ uint64_t y_lo_x_lo_hi = y_lo_x_lo >> 32;\n+ uint64_t cross_sum_lo = cross_sum & 0xFFFFFFFFUL;\n+ uint64_t cross_sum_hi = cross_sum >> 32;\n+ uint64_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ uint64_t upper_mid = y_hi_x_hi + carry;\n+ uint64_t low_accum_hi = low_accum >> 32;\n+ uint64_t upper_mid_with_cross = upper_mid + cross_sum_hi;\n+ p[1] = upper_mid_with_cross + low_accum_hi;\n+ uint64_t low_accum_shifted = low_accum << 32;\n+ uint64_t y_lo_x_lo_lo = y_lo_x_lo & 0xFFFFFFFFUL;\n+ p[0] = low_accum_shifted | y_lo_x_lo_lo;\n+}\n+\n+/* Ladder-long pattern for high part. 
*/\n+__attribute__((noipa))\n+uint64_t mulh_ladder_long (uint64_t x, uint64_t y)\n+{\n+ uint64_t xl = x & 0xFFFFFFFFUL;\n+ uint64_t xh = x >> 32;\n+ uint64_t yl = y & 0xFFFFFFFFUL;\n+ uint64_t yh = y >> 32;\n+ uint64_t mulll = xl * yl;\n+ uint64_t mullh = xl * yh;\n+ uint64_t mulhl = xh * yl;\n+ uint64_t mulhh = xh * yh;\n+ uint64_t shr8 = mulll >> 32;\n+ uint64_t conv10 = mullh & 0xFFFFFFFFUL;\n+ uint64_t add = shr8 + conv10;\n+ uint64_t conv12 = mulhl & 0xFFFFFFFFUL;\n+ uint64_t add13 = add + conv12;\n+ uint64_t shr14 = add13 >> 32;\n+ uint64_t shr15 = mullh >> 32;\n+ uint64_t add16 = mulhh + shr15;\n+ uint64_t shr17 = mulhl >> 32;\n+ uint64_t add18 = add16 + shr17;\n+ return add18 + shr14;\n+}\n+\n+int main ()\n+{\n+ uint64_t vals[] = {\n+ 0, 1, 0xFFFFFFFFUL, 0x100000000ULL,\n+ 0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL\n+ };\n+ int n = sizeof (vals) / sizeof (vals[0]);\n+\n+ for (int i = 0; i < n; i++)\n+ for (int j = 0; j < n; j++)\n+ {\n+\tuint64_t x = vals[i], y = vals[j];\n+\tuint64_t expected_hi = mulh_ref (x, y);\n+\tuint64_t expected_lo = x * y;\n+\n+\tif (mulh_carry (x, y) != expected_hi)\n+\t __builtin_abort ();\n+\n+\tif (mulh_ladder (x, y) != expected_hi)\n+\t __builtin_abort ();\n+\n+\tif (mulh_two_carry (x, y) != expected_hi)\n+\t __builtin_abort ();\n+\n+\tuint64_t p[2];\n+\tfull_mul (x, y, p);\n+\tif (p[1] != expected_hi)\n+\t __builtin_abort ();\n+\tif (p[0] != expected_lo)\n+\t __builtin_abort ();\n+\n+\tuint64_t q[2];\n+\tfull_mul_two_carry (x, y, q);\n+\tif (q[1] != expected_hi)\n+\t __builtin_abort ();\n+\tif (q[0] != expected_lo)\n+\t __builtin_abort ();\n+\n+\tif (mulh_carry_long (x, y) != expected_hi)\n+\t __builtin_abort ();\n+\n+\tif (mulh_ladder_long (x, y) != expected_hi)\n+\t __builtin_abort ();\n+\n+\tuint64_t r[2];\n+\tfull_mul_carry_long (x, y, r);\n+\tif (r[1] != expected_hi)\n+\t __builtin_abort ();\n+\tif (r[0] != expected_lo)\n+\t __builtin_abort ();\n+ }\n+\n+ return 0;\n+}\ndiff --git a/gcc/testsuite/gcc.dg/tree-ssa/long-mul-boundary.c b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-boundary.c\nnew file mode 100644\nindex 000000000000..c38c9da8539c\n--- /dev/null\n+++ b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-boundary.c\n@@ -0,0 +1,270 @@\n+/* { dg-do run } */\n+/* { dg-options \"-O3\" } */\n+\n+typedef __UINT32_TYPE__ uint32_t;\n+typedef __UINT64_TYPE__ uint64_t;\n+\n+/* Reference: high part of 32x32 -> 64 multiply. */\n+__attribute__((noipa))\n+uint32_t mulh_ref (uint32_t x, uint32_t y)\n+{\n+ return (uint32_t)(((uint64_t)x * y) >> 32);\n+}\n+\n+/* Carry pattern for high part. */\n+__attribute__((noipa))\n+uint32_t mulh_carry (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_hi = x >> 16;\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t mulhilo = x_hi * y_lo;\n+ uint32_t mullohi = x_lo * y_hi;\n+ uint32_t cross_sum = mulhilo + mullohi;\n+ uint32_t mullolo = x_lo * y_lo;\n+ uint32_t shrlolo = mullolo >> 16;\n+ uint32_t add_cross_sum = cross_sum + shrlolo;\n+ int carry = add_cross_sum < mulhilo;\n+ uint32_t cond = ((uint32_t) carry << 16) + x_hi * y_hi;\n+ uint32_t add = cond + (add_cross_sum >> 16);\n+\n+ return add;\n+}\n+\n+/* Ladder pattern for high part. 
*/\n+__attribute__((noipa))\n+uint32_t mulh_ladder (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t t0 = y_lo * x_lo;\n+ uint32_t t1 = y_lo * x_hi;\n+ uint32_t t2 = y_hi * x_lo;\n+ uint32_t t3 = y_hi * x_hi;\n+ uint32_t t0_hi = t0 >> 16;\n+ uint32_t u0 = t0_hi + t1;\n+ uint32_t u0_lo = u0 & 0xFFFF;\n+ uint32_t u0_hi = u0 >> 16;\n+ uint32_t u1 = u0_lo + t2;\n+ uint32_t u1_hi = u1 >> 16;\n+ uint32_t u2 = u0_hi + t3;\n+ uint32_t hw = u2 + u1_hi;\n+\n+ return hw;\n+}\n+\n+/* Ladder-long pattern for full multiplication (both high and low parts). */\n+__attribute__((noipa))\n+void full_mul (uint32_t x, uint32_t y, uint32_t *p)\n+{\n+ uint32_t xl = x & 0xFFFF;\n+ uint32_t xh = x >> 16;\n+ uint32_t yl = y & 0xFFFF;\n+ uint32_t yh = y >> 16;\n+ uint32_t mulll = xl * yl;\n+ uint32_t mullh = xl * yh;\n+ uint32_t mulhl = xh * yl;\n+ uint32_t mulhh = xh * yh;\n+ uint32_t shr8 = mulll >> 16;\n+ uint32_t conv10 = mullh & 0xFFFF;\n+ uint32_t add = shr8 + conv10;\n+ uint32_t conv12 = mulhl & 0xFFFF;\n+ uint32_t add13 = add + conv12;\n+ uint32_t shr14 = add13 >> 16;\n+ uint32_t shr15 = mullh >> 16;\n+ uint32_t add16 = mulhh + shr15;\n+ uint32_t shr17 = mulhl >> 16;\n+ uint32_t add18 = add16 + shr17;\n+ uint32_t add19 = add18 + shr14;\n+ p[1] = add19;\n+ uint32_t add_13_shl = add13 << 16;\n+ uint32_t and17 = mulll & 0xFFFF;\n+ uint32_t or_val = add_13_shl | and17;\n+ p[0] = or_val;\n+}\n+\n+/* Two-carry pattern for high part (32-bit). */\n+__attribute__((noipa))\n+uint32_t mulh_two_carry (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_hi = x >> 16;\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+\n+ uint32_t lolo = x_lo * y_lo;\n+ uint32_t hilo = x_hi * y_lo;\n+ uint32_t lohi = x_lo * y_hi;\n+ uint32_t hihi = x_hi * y_hi;\n+\n+ uint32_t cross_sum = hilo + lohi;\n+ uint32_t cross_carry = (uint32_t)(cross_sum < hilo) << 16;\n+\n+ uint32_t cross_shifted = cross_sum << 16;\n+ uint32_t low_result = lolo + cross_shifted;\n+ uint32_t low_carry = (uint32_t)(low_result < cross_shifted);\n+\n+ uint32_t high = hihi + (cross_sum >> 16) + cross_carry + low_carry;\n+\n+ return high;\n+}\n+\n+/* Two-carry full multiply (32-bit, both high and low parts). */\n+__attribute__((noipa))\n+void full_mul_two_carry (uint32_t x, uint32_t y, uint32_t *p)\n+{\n+ uint32_t x_hi = x >> 16;\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+\n+ uint32_t lolo = x_lo * y_lo;\n+ uint32_t hilo = x_hi * y_lo;\n+ uint32_t lohi = x_lo * y_hi;\n+ uint32_t hihi = x_hi * y_hi;\n+\n+ uint32_t cross_sum = hilo + lohi;\n+ uint32_t cross_carry = (uint32_t)(cross_sum < hilo) << 16;\n+\n+ uint32_t cross_shifted = cross_sum << 16;\n+ uint32_t low_result = lolo + cross_shifted;\n+ uint32_t low_carry = (uint32_t)(low_result < cross_shifted);\n+\n+ uint32_t high = hihi + (cross_sum >> 16) + cross_carry + low_carry;\n+\n+ p[0] = low_result;\n+ p[1] = high;\n+}\n+\n+/* Carry-long pattern for high part. 
*/\n+__attribute__((noipa))\n+uint32_t mulh_carry_long (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo_x_hi = y_lo * x_hi;\n+ uint32_t y_hi_x_hi = y_hi * x_hi;\n+ uint32_t y_hi_x_lo = y_hi * x_lo;\n+ uint32_t y_lo_x_lo = y_lo * x_lo;\n+ uint32_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = (cross_sum < y_lo_x_hi);\n+ uint32_t carry = (uint32_t) carry_out << 16;\n+ uint32_t y_lo_x_lo_hi = y_lo_x_lo >> 16;\n+ uint32_t cross_sum_lo = cross_sum & 0xFFFF;\n+ uint32_t cross_sum_hi = cross_sum >> 16;\n+ uint32_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ uint32_t interm = cross_sum_hi + y_hi_x_hi;\n+ uint32_t low_accum_hi = low_accum >> 16;\n+ uint32_t interm_plus_carry = interm + carry;\n+ return interm_plus_carry + low_accum_hi;\n+}\n+\n+/* Carry-long full multiply (both high and low parts). */\n+__attribute__((noipa))\n+void full_mul_carry_long (uint32_t x, uint32_t y, uint32_t *p)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo_x_hi = y_lo * x_hi;\n+ uint32_t y_hi_x_hi = y_hi * x_hi;\n+ uint32_t y_hi_x_lo = y_hi * x_lo;\n+ uint32_t y_lo_x_lo = y_lo * x_lo;\n+ uint32_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = (cross_sum < y_lo_x_hi);\n+ uint32_t carry = (uint32_t) carry_out << 16;\n+ uint32_t y_lo_x_lo_hi = y_lo_x_lo >> 16;\n+ uint32_t cross_sum_lo = cross_sum & 0xFFFF;\n+ uint32_t cross_sum_hi = cross_sum >> 16;\n+ uint32_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ uint32_t upper_mid = y_hi_x_hi + carry;\n+ uint32_t low_accum_hi = low_accum >> 16;\n+ uint32_t upper_mid_with_cross = upper_mid + cross_sum_hi;\n+ p[1] = upper_mid_with_cross + low_accum_hi;\n+ uint32_t low_accum_shifted = low_accum << 16;\n+ uint32_t y_lo_x_lo_lo = y_lo_x_lo & 0xFFFF;\n+ p[0] = low_accum_shifted | y_lo_x_lo_lo;\n+}\n+\n+/* Ladder-long pattern for high part. 
*/\n+__attribute__((noipa))\n+uint32_t mulh_ladder_long (uint32_t x, uint32_t y)\n+{\n+ uint32_t xl = x & 0xFFFF;\n+ uint32_t xh = x >> 16;\n+ uint32_t yl = y & 0xFFFF;\n+ uint32_t yh = y >> 16;\n+ uint32_t mulll = xl * yl;\n+ uint32_t mullh = xl * yh;\n+ uint32_t mulhl = xh * yl;\n+ uint32_t mulhh = xh * yh;\n+ uint32_t shr8 = mulll >> 16;\n+ uint32_t conv10 = mullh & 0xFFFF;\n+ uint32_t add = shr8 + conv10;\n+ uint32_t conv12 = mulhl & 0xFFFF;\n+ uint32_t add13 = add + conv12;\n+ uint32_t shr14 = add13 >> 16;\n+ uint32_t shr15 = mullh >> 16;\n+ uint32_t add16 = mulhh + shr15;\n+ uint32_t shr17 = mulhl >> 16;\n+ uint32_t add18 = add16 + shr17;\n+ return add18 + shr14;\n+}\n+\n+int main ()\n+{\n+ uint32_t vals[] = { 0, 1, 0xFFFF, 0x10000, 0x7FFFFFFFU, 0xFFFFFFFFU };\n+ int n = sizeof (vals) / sizeof (vals[0]);\n+\n+ for (int i = 0; i < n; i++)\n+ for (int j = 0; j < n; j++)\n+ {\n+\tuint32_t x = vals[i], y = vals[j];\n+\tuint32_t expected_hi = mulh_ref (x, y);\n+\tuint32_t expected_lo = x * y;\n+\n+\tif (mulh_carry (x, y) != expected_hi)\n+\t __builtin_abort ();\n+\n+\tif (mulh_ladder (x, y) != expected_hi)\n+\t __builtin_abort ();\n+\n+\tif (mulh_two_carry (x, y) != expected_hi)\n+\t __builtin_abort ();\n+\n+\tuint32_t p[2];\n+\tfull_mul (x, y, p);\n+\tif (p[1] != expected_hi)\n+\t __builtin_abort ();\n+\tif (p[0] != expected_lo)\n+\t __builtin_abort ();\n+\n+\tuint32_t q[2];\n+\tfull_mul_two_carry (x, y, q);\n+\tif (q[1] != expected_hi)\n+\t __builtin_abort ();\n+\tif (q[0] != expected_lo)\n+\t __builtin_abort ();\n+\n+\tif (mulh_carry_long (x, y) != expected_hi)\n+\t __builtin_abort ();\n+\n+\tif (mulh_ladder_long (x, y) != expected_hi)\n+\t __builtin_abort ();\n+\n+\tuint32_t r[2];\n+\tfull_mul_carry_long (x, y, r);\n+\tif (r[1] != expected_hi)\n+\t __builtin_abort ();\n+\tif (r[0] != expected_lo)\n+\t __builtin_abort ();\n+ }\n+\n+ return 0;\n+}\ndiff --git a/gcc/testsuite/gcc.dg/tree-ssa/long-mul-carry.c b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-carry.c\nnew file mode 100644\nindex 000000000000..0cc7843b702c\n--- /dev/null\n+++ b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-carry.c\n@@ -0,0 +1,311 @@\n+/* { dg-do compile } */\n+/* { dg-options \"-O3 -fdump-tree-forwprop-details\" } */\n+\n+typedef __UINT32_TYPE__ uint32_t;\n+typedef __UINT64_TYPE__ uint64_t;\n+typedef struct { uint32_t v[2]; } v2i32;\n+\n+uint32_t mulh_carry (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_hi = x >> 16;\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t mulhilo = x_hi * y_lo;\n+ uint32_t mullohi = x_lo * y_hi;\n+ uint32_t cross_sum = mulhilo + mullohi;\n+ uint32_t mullolo = x_lo * y_lo;\n+ uint32_t shrlolo = mullolo >> 16;\n+ uint32_t add_cross_sum = cross_sum + shrlolo;\n+ int carry = add_cross_sum < mulhilo;\n+ uint32_t cond = ((uint32_t) carry << 16) + x_hi * y_hi;\n+ uint32_t add = cond + (add_cross_sum >> 16);\n+\n+ return add;\n+}\n+\n+void full_mul_carry (uint32_t x, uint32_t y, uint32_t* p)\n+{\n+ uint32_t x_hi = x >> 16;\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t mulhilo = x_hi * y_lo;\n+ uint32_t mullohi = x_lo * y_hi;\n+ uint32_t cross_sum = mulhilo + mullohi;\n+ uint32_t mullolo = x_lo * y_lo;\n+ uint32_t shrlolo = mullolo >> 16;\n+ uint32_t add_cross_sum = cross_sum + shrlolo;\n+ int carry = add_cross_sum < mulhilo;\n+ uint32_t cond = ((uint32_t) carry << 16) + x_hi * y_hi;\n+ uint32_t add = cond + (add_cross_sum >> 16);\n+ p[1] = add;\n+ uint32_t add_cross_sum_shr = add_cross_sum << 16;\n+ 
uint32_t mullololo = mullolo & 0xFFFF;\n+ uint32_t low = add_cross_sum_shr | mullololo;\n+ p[0] = low;\n+}\n+\n+uint32_t mulh_carry_comm (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_hi = x >> 16;\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t mulhilo = y_lo * x_hi;\n+ uint32_t mullohi = y_hi * x_lo;\n+ uint32_t cross_sum = mullohi + mulhilo;\n+ uint32_t mullolo = x_lo * y_lo;\n+ uint32_t shrlolo = mullolo >> 16;\n+ uint32_t add_cross_sum = shrlolo + cross_sum;\n+ int carry = add_cross_sum < mulhilo;\n+ uint32_t cond = ((uint32_t) carry << 16) + x_hi * y_hi;\n+ uint32_t add = cond + (add_cross_sum >> 16);\n+\n+ return add;\n+}\n+\n+uint32_t mulh_carry_lohi (uint32_t x, uint32_t y) {\n+ uint32_t x_hi = x >> 16;\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t mulhilo = x_hi * y_lo;\n+ uint32_t mullohi = x_lo * y_hi;\n+ uint32_t cross_sum = mulhilo + mullohi;\n+ uint32_t mullolo = x_lo * y_lo;\n+ uint32_t add_cross_sum = cross_sum + (mullolo >> 16);\n+ int carry_occurred = (add_cross_sum < mullohi);\n+ uint32_t cond = (uint32_t) carry_occurred << 16;\n+ uint32_t add = x_hi * y_hi + cond + (add_cross_sum >> 16);\n+\n+ return add;\n+}\n+\n+/* The 128-bit variant will fail during the high sequence generation\n+ (no target provides a 256-bit multiply) and is excluded from the\n+ expected fold counts below. */\n+#ifdef __SIZEOF_INT128__\n+__uint128_t mulh_carry_128 (__uint128_t x, __uint128_t y)\n+{\n+ __uint128_t x_hi = x >> 64;\n+ __uint128_t x_lo = x & (__uint128_t)0xFFFFFFFFFFFFFFFF;\n+ __uint128_t y_hi = y >> 64;\n+ __uint128_t y_lo = y & (__uint128_t)0xFFFFFFFFFFFFFFFF;\n+ __uint128_t mulhilo = x_hi * y_lo;\n+ __uint128_t mullohi = x_lo * y_hi;\n+ __uint128_t cross_sum = mulhilo + mullohi;\n+ __uint128_t mullolo = x_lo * y_lo;\n+ __uint128_t shrlolo = mullolo >> 64;\n+ __uint128_t add_cross_sum = cross_sum + shrlolo;\n+ int carry = add_cross_sum < mulhilo;\n+ __uint128_t cond = ((__uint128_t) carry << 64) + x_hi * y_hi;\n+ __uint128_t add = cond + (add_cross_sum >> 64);\n+\n+ return add;\n+}\n+#endif\n+\n+/* This will be optimized during the second forwprop run.\n+ Disable SLP so the expected fold count is target-independent. */\n+__attribute__((optimize(\"no-tree-slp-vectorize\")))\n+v2i32 mulh_carry_v2i32 (v2i32 x, v2i32 y)\n+{\n+ v2i32 result;\n+ for (int i = 0; i < 2; i++)\n+ {\n+ uint32_t x_hi = x.v[i] >> 16;\n+ uint32_t x_lo = x.v[i] & 0xFFFF;\n+ uint32_t y_hi = y.v[i] >> 16;\n+ uint32_t y_lo = y.v[i] & 0xFFFF;\n+ uint32_t mulhilo = x_hi * y_lo;\n+ uint32_t mullohi = x_lo * y_hi;\n+ uint32_t cross_sum = mulhilo + mullohi;\n+ uint32_t mullolo = x_lo * y_lo;\n+ uint32_t shrlolo = mullolo >> 16;\n+ uint32_t add_cross_sum = cross_sum + shrlolo;\n+ int carry = add_cross_sum < mulhilo;\n+ uint32_t cond = ((uint32_t) carry << 16) + x_hi * y_hi;\n+ uint32_t add = cond + (add_cross_sum >> 16);\n+ result.v[i] = add;\n+ }\n+\n+ return result;\n+}\n+\n+/* Carry-long variants: hi-part sum uses the long form\n+ (xh*yh + carry + cross_hi + low_accum_hi). 
*/\n+\n+uint64_t mulh_carry_long (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_lo = x & 0xFFFFFFFF;\n+ uint64_t x_hi = x >> 32;\n+ uint64_t y_lo = y & 0xFFFFFFFF;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo_x_hi = y_lo * x_hi;\n+ uint64_t y_hi_x_hi = y_hi * x_hi;\n+ uint64_t y_hi_x_lo = y_hi * x_lo;\n+ uint64_t y_lo_x_lo = y_lo * x_lo;\n+ uint64_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = cross_sum < y_lo_x_hi;\n+ uint64_t carry = (uint64_t) carry_out << 32;\n+ uint64_t y_lo_x_lo_hi = y_lo_x_lo >> 32;\n+ uint64_t cross_sum_lo = cross_sum & 0xFFFFFFFF;\n+ uint64_t cross_sum_hi = cross_sum >> 32;\n+ uint64_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ uint64_t interm = cross_sum_hi + y_hi_x_hi;\n+ uint64_t low_accum_hi = low_accum >> 32;\n+ uint64_t interm_plus_carry = interm + carry;\n+ uint64_t hw64 = interm_plus_carry + low_accum_hi;\n+\n+ return hw64;\n+}\n+\n+uint64_t mulh_carry_long_comm (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_lo = x & 0xFFFFFFFF;\n+ uint64_t y_lo = y & 0xFFFFFFFF;\n+ uint64_t x_hi = x >> 32;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo_x_hi = x_hi * y_lo;\n+ uint64_t y_hi_x_hi = y_hi * x_hi;\n+ uint64_t y_hi_x_lo = x_lo * y_hi;\n+ uint64_t y_lo_x_lo = x_lo * y_lo;\n+ uint64_t cross_sum = y_lo_x_hi + y_hi_x_lo;\n+ int carry_out = (cross_sum < y_lo_x_hi);\n+ uint64_t carry = (uint64_t) carry_out << 32;\n+ uint64_t y_lo_x_lo_hi = y_lo_x_lo >> 32;\n+ uint64_t cross_sum_lo = cross_sum & 0xFFFFFFFF;\n+ uint64_t cross_sum_hi = cross_sum >> 32;\n+ uint64_t low_accum = y_lo_x_lo_hi + cross_sum_lo;\n+ uint64_t inter = y_hi_x_hi + cross_sum_hi;\n+ uint64_t low_accum_hi = low_accum >> 32;\n+ uint64_t interm_plus_carry = carry + inter;\n+ uint64_t hw64 = low_accum_hi + interm_plus_carry;\n+\n+ return hw64;\n+}\n+\n+uint32_t mulh_carry_long_32 (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo_x_hi = y_lo * x_hi;\n+ uint32_t y_hi_x_hi = y_hi * x_hi;\n+ uint32_t y_hi_x_lo = y_hi * x_lo;\n+ uint32_t y_lo_x_lo = y_lo * x_lo;\n+ uint32_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = (cross_sum < y_lo_x_hi);\n+ uint32_t carry = (uint32_t) carry_out << 16;\n+ uint32_t y_lo_x_lo_hi = y_lo_x_lo >> 16;\n+ uint32_t cross_sum_lo = cross_sum & 0xFFFF;\n+ uint32_t cross_sum_hi = cross_sum >> 16;\n+ uint32_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ uint32_t interm = cross_sum_hi + y_hi_x_hi;\n+ uint32_t low_accum_hi = low_accum >> 16;\n+ uint32_t interm_plus_carry = interm + carry;\n+ uint32_t hw64 = interm_plus_carry + low_accum_hi;\n+\n+ return hw64;\n+}\n+\n+/* The 128-bit variant will fail during the high sequence generation\n+ (no target provides a 256-bit multiply) and is excluded from the\n+ expected fold counts below. 
*/\n+#ifdef __SIZEOF_INT128__\n+__uint128_t mulh_carry_long_128 (__uint128_t x, __uint128_t y)\n+{\n+ __uint128_t x_lo = x & (__uint128_t)0xFFFFFFFFFFFFFFFF;\n+ __uint128_t x_hi = x >> 64;\n+ __uint128_t y_lo = y & (__uint128_t)0xFFFFFFFFFFFFFFFF;\n+ __uint128_t y_hi = y >> 64;\n+ __uint128_t y_lo_x_hi = y_lo * x_hi;\n+ __uint128_t y_hi_x_hi = y_hi * x_hi;\n+ __uint128_t y_hi_x_lo = y_hi * x_lo;\n+ __uint128_t y_lo_x_lo = y_lo * x_lo;\n+ __uint128_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = cross_sum < y_lo_x_hi;\n+ __uint128_t carry = (__uint128_t) carry_out << 64;\n+ __uint128_t y_lo_x_lo_hi = y_lo_x_lo >> 64;\n+ __uint128_t cross_sum_lo = cross_sum & (__uint128_t)0xFFFFFFFFFFFFFFFF;\n+ __uint128_t cross_sum_hi = cross_sum >> 64;\n+ __uint128_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ __uint128_t interm = cross_sum_hi + y_hi_x_hi;\n+ __uint128_t low_accum_hi = low_accum >> 64;\n+ __uint128_t interm_plus_carry = interm + carry;\n+ __uint128_t hw64 = interm_plus_carry + low_accum_hi;\n+\n+ return hw64;\n+}\n+#endif\n+\n+void full_mul_carry_long (uint64_t x, uint64_t y, uint64_t* p) {\n+ uint64_t x_lo = x & 0xFFFFFFFF;\n+ uint64_t y_lo = y & 0xFFFFFFFF;\n+ uint64_t x_hi = x >> 32;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo_x_hi = y_lo * x_hi;\n+ uint64_t y_hi_x_hi = y_hi * x_hi;\n+ uint64_t y_hi_x_lo = y_hi * x_lo;\n+ uint64_t y_lo_x_lo = y_lo * x_lo;\n+ uint64_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = (cross_sum < y_lo_x_hi);\n+ uint64_t carry = (uint64_t) carry_out << 32;\n+ uint64_t y_lo_x_lo_hi = y_lo_x_lo >> 32;\n+ uint64_t cross_sum_lo = cross_sum & 0xFFFFFFFF;\n+ uint64_t cross_sum_hi = cross_sum >> 32;\n+ uint64_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ uint64_t upper_mid = y_hi_x_hi + carry;\n+ uint64_t low_accum_hi = low_accum >> 32;\n+ uint64_t upper_mid_with_cross = upper_mid + cross_sum_hi;\n+ uint64_t hw64 = upper_mid_with_cross + low_accum_hi;\n+ p[1] = hw64;\n+ uint64_t low_accum_shifted = low_accum << 32;\n+ uint64_t y_lo_x_lo_lo = y_lo_x_lo & 0xFFFFFFFF;\n+ uint64_t lw64 = low_accum_shifted | y_lo_x_lo_lo;\n+ p[0] = lw64;\n+}\n+\n+/* This will be optimized during the second forwprop run.\n+ Disable SLP so the expected fold count is target-independent. */\n+__attribute__((optimize(\"no-tree-slp-vectorize\")))\n+v2i32 mulh_carry_long_v2i32 (v2i32 x, v2i32 y)\n+{\n+ v2i32 result;\n+ for (int i = 0; i < 2; i++)\n+ {\n+ uint32_t x_lo = x.v[i] & 0xFFFF;\n+ uint32_t y_lo = y.v[i] & 0xFFFF;\n+ uint32_t x_hi = x.v[i] >> 16;\n+ uint32_t y_hi = y.v[i] >> 16;\n+\n+ uint32_t y_lo_x_hi = y_lo * x_hi;\n+ uint32_t y_hi_x_hi = y_hi * x_hi;\n+ uint32_t y_hi_x_lo = y_hi * x_lo;\n+ uint32_t y_lo_x_lo = y_lo * x_lo;\n+\n+ uint32_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = cross_sum < y_lo_x_hi;\n+ uint32_t carry = (uint32_t) carry_out << 16;\n+\n+ uint32_t y_lo_x_lo_hi = y_lo_x_lo >> 16;\n+ uint32_t cross_sum_lo = cross_sum & 0xFFFF;\n+ uint32_t cross_sum_hi = cross_sum >> 16;\n+\n+ uint32_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ uint32_t interm = cross_sum_hi + y_hi_x_hi;\n+ uint32_t low_accum_hi = low_accum >> 16;\n+ uint32_t interm_plus_carry = interm + carry;\n+\n+ result.v[i] = interm_plus_carry + low_accum_hi;\n+ }\n+\n+ return result;\n+}\n+\n+/* { dg-final { scan-tree-dump-times \"Long multiplication high part folded.\" 8 \"forwprop1\" { target lp64 } } } */\n+/* { dg-final { scan-tree-dump-times \"Long multiplication high part folded.\" 5 \"forwprop1\" { target { ! 
lp64 } } } } */\n+/* { dg-final { scan-tree-dump-times \"Long multiplication low part folded.\" 2 \"forwprop1\" } } */\n+/* { dg-final { scan-tree-dump-times \"Long multiplication high part folded.\" 2 \"forwprop2\" } } */\ndiff --git a/gcc/testsuite/gcc.dg/tree-ssa/long-mul-ladder.c b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-ladder.c\nnew file mode 100644\nindex 000000000000..5cb9c45c22c2\n--- /dev/null\n+++ b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-ladder.c\n@@ -0,0 +1,329 @@\n+/* { dg-do compile } */\n+/* { dg-options \"-O3 -fdump-tree-forwprop-details\" } */\n+\n+typedef __UINT32_TYPE__ uint32_t;\n+typedef __UINT64_TYPE__ uint64_t;\n+typedef struct { uint32_t v[2]; } v2i32;\n+typedef struct { uint64_t v[2]; } v2i64;\n+\n+uint64_t mulh_ladder (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_lo = x & 0xFFFFFFFF;\n+ uint64_t y_lo = y & 0xFFFFFFFF;\n+ uint64_t x_hi = x >> 32;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t t0 = y_lo * x_lo;\n+ uint64_t t1 = y_lo * x_hi;\n+ uint64_t t2 = y_hi * x_lo;\n+ uint64_t t3 = y_hi * x_hi;\n+ uint64_t t0_hi = t0 >> 32;\n+ uint64_t u0 = t0_hi + t1;\n+ uint64_t u0_lo = u0 & 0xFFFFFFFF;\n+ uint64_t u0_hi = u0 >> 32;\n+ uint64_t u1 = u0_lo + t2;\n+ uint64_t u1_hi = u1 >> 32;\n+ uint64_t u2 = u0_hi + t3;\n+ uint64_t hw64 = u2 + u1_hi;\n+\n+ return hw64;\n+}\n+\n+uint64_t mulh_ladder_comm (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_lo = x & 0xFFFFFFFF;\n+ uint64_t y_lo = y & 0xFFFFFFFF;\n+ uint64_t x_hi = x >> 32;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t t0 = x_lo * y_lo;\n+ uint64_t t1 = x_lo * y_hi;\n+ uint64_t t2 = x_hi * y_lo;\n+ uint64_t t3 = x_hi * y_hi;\n+ uint64_t t0_hi = t0 >> 32;\n+ uint64_t u0 = t1 + t0_hi;\n+ uint64_t u0_lo = u0 & 0xFFFFFFFF;\n+ uint64_t u0_hi = u0 >> 32;\n+ uint64_t u1 = t2 + u0_lo;\n+ uint64_t u1_hi = u1 >> 32;\n+ uint64_t u2 = u1_hi + u0_hi;\n+ uint64_t hw64 = t3 + u2;\n+\n+ return hw64;\n+}\n+\n+uint32_t mulh_ladder_32 (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t t0 = y_lo * x_lo;\n+ uint32_t t1 = y_lo * x_hi;\n+ uint32_t t2 = y_hi * x_lo;\n+ uint32_t t3 = y_hi * x_hi;\n+ uint32_t t0_hi = t0 >> 16;\n+ uint32_t u0 = t0_hi + t1;\n+ uint32_t u0_lo = u0 & 0xFFFF;\n+ uint32_t u0_hi = u0 >> 16;\n+ uint32_t u1 = u0_lo + t2;\n+ uint32_t u1_hi = u1 >> 16;\n+ uint32_t u2 = u0_hi + t3;\n+ uint32_t hw64 = u2 + u1_hi;\n+\n+ return hw64;\n+}\n+\n+/* The 128-bit variant will fail during the high sequence generation\n+ (no target provides a 256-bit multiply) and is excluded from the\n+ expected fold counts below. 
*/\n+#ifdef __SIZEOF_INT128__\n+__uint128_t umulh_variant_i128 (__uint128_t x, __uint128_t y)\n+{\n+ __uint128_t x_lo = x & (__uint128_t)0xFFFFFFFFFFFFFFFF;\n+ __uint128_t y_lo = y & (__uint128_t)0xFFFFFFFFFFFFFFFF;\n+ __uint128_t x_hi = x >> 64;\n+ __uint128_t y_hi = y >> 64;\n+ __uint128_t t0 = y_lo * x_lo;\n+ __uint128_t t1 = y_lo * x_hi;\n+ __uint128_t t2 = y_hi * x_lo;\n+ __uint128_t t3 = y_hi * x_hi;\n+ __uint128_t t0_hi = t0 >> 64;\n+ __uint128_t u0 = t0_hi + t1;\n+ __uint128_t u0_lo = u0 & (__uint128_t)0xFFFFFFFFFFFFFFFF;\n+ __uint128_t u0_hi = u0 >> 64;\n+ __uint128_t u1 = u0_lo + t2;\n+ __uint128_t u1_hi = u1 >> 64;\n+ __uint128_t u2 = u0_hi + t3;\n+ __uint128_t hw64 = u2 + u1_hi;\n+\n+ return hw64;\n+}\n+#endif\n+\n+v2i64 full_mul_ladder (uint64_t x, uint64_t y)\n+{\n+ uint64_t and_x = x & 0xFFFFFFFF;\n+ uint64_t and_y = y & 0xFFFFFFFF;\n+ uint64_t mul_i = and_y * and_x;\n+ uint64_t shr_x = x >> 32;\n+ uint64_t mul_i27 = and_y * shr_x;\n+ uint64_t shr_y = y >> 32;\n+ uint64_t mul_i28 = shr_y * and_x;\n+ uint64_t mul_i29 = shr_y * shr_x;\n+ uint64_t shr10 = mul_i >> 32;\n+ uint64_t and11 = mul_i27 & 0xFFFFFFFF;\n+ uint64_t add = and11 + mul_i28;\n+ uint64_t add12 = add + shr10;\n+ uint64_t shr13 = mul_i27 >> 32;\n+ uint64_t shr14 = add12 >> 32;\n+ uint64_t add15 = shr13 + mul_i29;\n+ uint64_t add16 = add15 + shr14;\n+ uint64_t shl = add12 << 32;\n+ uint64_t and17 = mul_i & 0xFFFFFFFF;\n+ uint64_t or_val = shl | and17;\n+ v2i64 result;\n+ result.v[0] = or_val;\n+ result.v[1] = add16;\n+ return result;\n+}\n+\n+/* This will be optimized during the second forwprop run.\n+ Disable SLP so the expected fold count is target-independent. */\n+__attribute__((optimize(\"no-tree-slp-vectorize\")))\n+v2i32 mulh_ladder_v2i32 (v2i32 x, v2i32 y)\n+{\n+ v2i32 result;\n+ for(int i=0; i<2; ++i)\n+ {\n+ uint32_t x_lo = x.v[i] & 0xFFFF;\n+ uint32_t y_lo = y.v[i] & 0xFFFF;\n+ uint32_t x_hi = x.v[i] >> 16;\n+ uint32_t y_hi = y.v[i] >> 16;\n+ uint32_t t0 = y_lo * x_lo;\n+ uint32_t t1 = y_lo * x_hi;\n+ uint32_t t2 = y_hi * x_lo;\n+ uint32_t t3 = y_hi * x_hi;\n+ uint32_t t0_hi = t0 >> 16;\n+ uint32_t u0 = t0_hi + t1;\n+ uint32_t u0_lo = u0 & 0xFFFF;\n+ uint32_t u0_hi = u0 >> 16;\n+ uint32_t u1 = u0_lo + t2;\n+ uint32_t u1_hi = u1 >> 16;\n+ uint32_t u2 = u0_hi + t3;\n+ result.v[i] = u2 + u1_hi;\n+ }\n+\n+ return result;\n+}\n+\n+/* Ladder-long variants: hi-part sum uses the long form\n+ (xh*yh + cross_hi_a + cross_hi_b + mid_hi). 
*/\n+\n+uint32_t mulh_ladder_long (uint32_t x, uint32_t y)\n+{\n+ uint32_t xl = x & 0xFFFF;\n+ uint32_t xh = x >> 16;\n+ uint32_t yl = y & 0xFFFF;\n+ uint32_t yh = y >> 16;\n+ uint32_t mulll = xl * yl;\n+ uint32_t mullh = xl * yh;\n+ uint32_t mulhl = xh * yl;\n+ uint32_t mulhh = xh * yh;\n+ uint32_t shr8 = mulll >> 16;\n+ uint32_t conv10 = mullh & 0xFFFF;\n+ uint32_t add = shr8 + conv10;\n+ uint32_t conv12 = mulhl & 0xFFFF;\n+ uint32_t add13 = add + conv12;\n+ uint32_t shr14 = add13 >> 16;\n+ uint32_t shr15 = mullh >> 16;\n+ uint32_t add16 = mulhh + shr15;\n+ uint32_t shr17 = mulhl >> 16;\n+ uint32_t add18 = add16 + shr17;\n+ uint32_t add19 = add18 + shr14;\n+\n+ return add19;\n+}\n+\n+void full_mul_ladder_long (uint32_t x, uint32_t y, uint32_t *p)\n+{\n+ uint32_t xl = x & 0xFFFF;\n+ uint32_t xh = x >> 16;\n+ uint32_t yl = y & 0xFFFF;\n+ uint32_t yh = y >> 16;\n+ uint32_t mulll = xl * yl;\n+ uint32_t mullh = xl * yh;\n+ uint32_t mulhl = xh * yl;\n+ uint32_t mulhh = xh * yh;\n+ uint32_t shr8 = mulll >> 16;\n+ uint32_t conv10 = mullh & 0xFFFF;\n+ uint32_t add = shr8 + conv10;\n+ uint32_t conv12 = mulhl & 0xFFFF;\n+ uint32_t add13 = add + conv12;\n+ uint32_t shr14 = add13 >> 16;\n+ uint32_t shr15 = mullh >> 16;\n+ uint32_t add16 = mulhh + shr15;\n+ uint32_t shr17 = mulhl >> 16;\n+ uint32_t add18 = add16 + shr17;\n+ uint32_t add19 = add18 + shr14;\n+ p[1] = add19;\n+ uint32_t add_13_shl = add13 << 16;\n+ uint32_t and17 = mulll & 0xFFFF;\n+ uint32_t or_val = add_13_shl | and17;\n+ p[0] = or_val;\n+}\n+\n+uint32_t mulh_ladder_long_comm (uint32_t x, uint32_t y)\n+{\n+ uint32_t xl = x & 0xFFFF;\n+ uint32_t xh = x >> 16;\n+ uint32_t yl = y & 0xFFFF;\n+ uint32_t yh = y >> 16;\n+ uint32_t mulll = yl * xl;\n+ uint32_t mullh = yh * xl;\n+ uint32_t mulhl = yl * xh;\n+ uint32_t mulhh = yh * xh;\n+ uint32_t shr8 = mulll >> 16;\n+ uint32_t conv10 = mullh & 0xFFFF;\n+ uint32_t add = conv10 + shr8;\n+ uint32_t conv12 = mulhl & 0xFFFF;\n+ uint32_t add13 = conv12 + add;\n+ uint32_t shr14 = add13 >> 16;\n+ uint32_t shr15 = mullh >> 16;\n+ uint32_t shr17 = mulhl >> 16;\n+ uint32_t add16 = shr14 + shr17;\n+ uint32_t add18 = add16 + shr15;\n+ uint32_t add19 = mulhh + add18;\n+\n+ return add19;\n+}\n+\n+/* The 128-bit variant will fail during the high sequence generation\n+ (no target provides a 256-bit multiply) and is excluded from the\n+ expected fold counts below. 
*/\n+#ifdef __SIZEOF_INT128__\n+__uint128_t mulh_ladder_long_128 (__uint128_t x, __uint128_t y)\n+{\n+ __uint128_t xl = x & (__uint128_t)0xFFFFFFFFFFFFFFFF;\n+ __uint128_t xh = x >> 64;\n+ __uint128_t yl = y & (__uint128_t)0xFFFFFFFFFFFFFFFF;\n+ __uint128_t yh = y >> 64;\n+ __uint128_t mulll = xl * yl;\n+ __uint128_t mullh = xl * yh;\n+ __uint128_t mulhl = xh * yl;\n+ __uint128_t mulhh = xh * yh;\n+ __uint128_t shr8 = mulll >> 64;\n+ __uint128_t conv10 = mullh & (__uint128_t)0xFFFFFFFFFFFFFFFF;\n+ __uint128_t add = shr8 + conv10;\n+ __uint128_t conv12 = mulhl & (__uint128_t)0xFFFFFFFFFFFFFFFF;\n+ __uint128_t add13 = add + conv12;\n+ __uint128_t shr14 = add13 >> 64;\n+ __uint128_t shr15 = mullh >> 64;\n+ __uint128_t add16 = mulhh + shr15;\n+ __uint128_t shr17 = mulhl >> 64;\n+ __uint128_t add18 = add16 + shr17;\n+ __uint128_t add19 = add18 + shr14;\n+\n+ return add19;\n+}\n+#endif\n+\n+uint32_t mulh_ladder_long_hllh (uint32_t x, uint32_t y)\n+{\n+ uint32_t xl = x & 0xFFFF;\n+ uint32_t xh = x >> 16;\n+ uint32_t yl = y & 0xFFFF;\n+ uint32_t yh = y >> 16;\n+ uint32_t mulll = xl * yl;\n+ uint32_t mullh = xl * yh;\n+ uint32_t mulhl = xh * yl;\n+ uint32_t mulhh = xh * yh;\n+ uint32_t shr8 = mulll >> 16;\n+ uint32_t conv10 = mulhl & 0xFFFF;\n+ uint32_t add = shr8 + conv10;\n+ uint32_t conv12 = mullh & 0xFFFF;\n+ uint32_t add13 = add + conv12;\n+ uint32_t shr14 = add13 >> 16;\n+ uint32_t shr15 = mulhl >> 16;\n+ uint32_t add16 = mulhh + shr15;\n+ uint32_t shr17 = mullh >> 16;\n+ uint32_t add18 = add16 + shr17;\n+ uint32_t add19 = add18 + shr14;\n+\n+ return add19;\n+}\n+\n+/* This will be optimized during the second forwprop run.\n+ Disable SLP so the expected fold count is target-independent. */\n+__attribute__((optimize(\"no-tree-slp-vectorize\")))\n+v2i32 mul_ladder_long_v2i32 (v2i32 x, v2i32 y)\n+{\n+ v2i32 result;\n+ for (int i = 0; i < 2; i++)\n+ {\n+ uint32_t xl = x.v[i] & 0xFFFF;\n+ uint32_t xh = x.v[i] >> 16;\n+ uint32_t yl = y.v[i] & 0xFFFF;\n+ uint32_t yh = y.v[i] >> 16;\n+ uint32_t mulll = xl * yl;\n+ uint32_t mullh = xl * yh;\n+ uint32_t mulhl = xh * yl;\n+ uint32_t mulhh = xh * yh;\n+ uint32_t shr8 = mulll >> 16;\n+ uint32_t conv10 = mullh & 0xFFFF;\n+ uint32_t add = shr8 + conv10;\n+ uint32_t conv12 = mulhl & 0xFFFF;\n+ uint32_t add13 = add + conv12;\n+ uint32_t shr14 = add13 >> 16;\n+ uint32_t shr15 = mullh >> 16;\n+ uint32_t add16 = mulhh + shr15;\n+ uint32_t shr17 = mulhl >> 16;\n+ uint32_t add18 = add16 + shr17;\n+ result.v[i] = add18 + shr14;\n+ }\n+\n+ return result;\n+}\n+\n+/* { dg-final { scan-tree-dump-times \"Long multiplication high part folded.\" 8 \"forwprop1\" { target lp64 } } } */\n+/* { dg-final { scan-tree-dump-times \"Long multiplication high part folded.\" 5 \"forwprop1\" { target { ! 
lp64 } } } } */\n+/* { dg-final { scan-tree-dump-times \"Long multiplication low part folded.\" 2 \"forwprop1\" } } */\n+/* { dg-final { scan-tree-dump-times \"Long multiplication high part folded.\" 2 \"forwprop2\" } } */\ndiff --git a/gcc/testsuite/gcc.dg/tree-ssa/long-mul-low-plus.c b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-low-plus.c\nnew file mode 100644\nindex 000000000000..cfb9f6073015\n--- /dev/null\n+++ b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-low-plus.c\n@@ -0,0 +1,54 @@\n+/* { dg-do compile } */\n+/* { dg-options \"-O3 -fdump-tree-forwprop1-details\" } */\n+\n+typedef __UINT32_TYPE__ uint32_t;\n+typedef __UINT64_TYPE__ uint64_t;\n+\n+/* Low part via PLUS form: lolo + (cross_sum << halfwidth).\n+ No GT/LT comparison on the result, so fold_mul_low_plus\n+ should fold without deferring. */\n+uint32_t mul_low_plus_32 (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_hi = x >> 16;\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t lolo = x_lo * y_lo;\n+ uint32_t hilo = x_hi * y_lo;\n+ uint32_t lohi = x_lo * y_hi;\n+ uint32_t cross_sum = hilo + lohi;\n+ uint32_t cross_shifted = cross_sum << 16;\n+ return lolo + cross_shifted;\n+}\n+\n+/* 64-bit variant. */\n+uint64_t mul_low_plus_64 (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_hi = x >> 32;\n+ uint64_t x_lo = x & 0xFFFFFFFFUL;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo = y & 0xFFFFFFFFUL;\n+ uint64_t lolo = x_lo * y_lo;\n+ uint64_t hilo = x_hi * y_lo;\n+ uint64_t lohi = x_lo * y_hi;\n+ uint64_t cross_sum = hilo + lohi;\n+ uint64_t cross_shifted = cross_sum << 32;\n+ return lolo + cross_shifted;\n+}\n+\n+/* Commuted operand order. */\n+uint32_t mul_low_plus_comm (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_hi = x >> 16;\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t lolo = y_lo * x_lo;\n+ uint32_t hilo = y_lo * x_hi;\n+ uint32_t lohi = y_hi * x_lo;\n+ uint32_t cross_sum = lohi + hilo;\n+ uint32_t cross_shifted = cross_sum << 16;\n+ return cross_shifted + lolo;\n+}\n+\n+/* { dg-final { scan-tree-dump-times \"Long multiplication low part folded.\" 3 \"forwprop1\" } } */\ndiff --git a/gcc/testsuite/gcc.dg/tree-ssa/long-mul-partial.c b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-partial.c\nnew file mode 100644\nindex 000000000000..0b187b4949b2\n--- /dev/null\n+++ b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-partial.c\n@@ -0,0 +1,119 @@\n+/* { dg-do compile } */\n+/* { dg-options \"-O3 -fdump-tree-optimized\" } */\n+\n+typedef __UINT32_TYPE__ uint32_t;\n+\n+/* Only one cross-product (xh*yl), missing xl*yh.\n+ Should NOT be folded. */\n+uint32_t partial_one_cross (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t t0 = y_lo * x_lo;\n+ uint32_t t1 = y_lo * x_hi;\n+ uint32_t t3 = y_hi * x_hi;\n+ uint32_t t0_hi = t0 >> 16;\n+ uint32_t u0 = t0_hi + t1;\n+ uint32_t u0_hi = u0 >> 16;\n+ return t3 + u0_hi;\n+}\n+\n+/* Only xl*yl and xh*yh, no cross-products at all.\n+ Should NOT be folded. */\n+uint32_t partial_no_cross (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t t0 = y_lo * x_lo;\n+ uint32_t t3 = y_hi * x_hi;\n+ return t3 + (t0 >> 16);\n+}\n+\n+/* Only cross-products, missing xl*yl and xh*yh.\n+ Should NOT be folded. 
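All four partial products are needed for a match. 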
*/\n+uint32_t partial_only_cross (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t t1 = y_lo * x_hi;\n+ uint32_t t2 = y_hi * x_lo;\n+ return (t1 + t2) >> 16;\n+}\n+\n+/* Full ladder structure but one cross-product uses z instead of y.\n+ check_hilo_and_ops should reject the mismatched operand.\n+ Should NOT be folded. */\n+uint32_t partial_mismatched_op (uint32_t x, uint32_t y, uint32_t z)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t z_lo = z & 0xFFFF;\n+ uint32_t t0 = y_lo * x_lo;\n+ uint32_t t1 = y_lo * x_hi;\n+ uint32_t t2 = z_lo * x_lo;\n+ uint32_t t3 = y_hi * x_hi;\n+ uint32_t t0_hi = t0 >> 16;\n+ uint32_t u0 = t0_hi + t1;\n+ uint32_t u0_lo = u0 & 0xFFFF;\n+ uint32_t u0_hi = u0 >> 16;\n+ uint32_t u1 = u0_lo + t2;\n+ uint32_t u1_hi = u1 >> 16;\n+ uint32_t u2 = u0_hi + t3;\n+ return u2 + u1_hi;\n+}\n+\n+/* Uses conditionals in the computation.\n+ Should NOT be folded. */\n+unsigned mulhu_conditional (unsigned u, unsigned v) {\n+ unsigned a, b, c, d, p, q, rlow, rhigh;\n+\n+ a = u >> 16;\n+ b = u & 0xFFFF;\n+ c = v >> 16;\n+ d = v & 0xFFFF;\n+\n+ p = a*c;\n+ q = b*d;\n+ rlow = (-a + b)*(c - d);\n+ rhigh = (int)((-a + b)^(c - d)) >> 31;\n+ if (rlow == 0) rhigh = 0;\n+\n+ q = q + (q >> 16);\n+ rlow = rlow + p;\n+ if (rlow < p) rhigh = rhigh + 1;\n+ rlow = rlow + q;\n+ if (rlow < q) rhigh = rhigh + 1;\n+\n+ return p + (rlow >> 16) + (rhigh << 16);\n+}\n+\n+/* Signed operands.\n+ Should NOT be folded. */\n+int mulhs_signed (int u, int v) {\n+ unsigned u0, v0, w0;\n+ int u1, v1, w1, w2, t;\n+\n+ u0 = u & 0xFFFF;\n+ u1 = u >> 16;\n+ v0 = v & 0xFFFF;\n+ v1 = v >> 16;\n+ w0 = u0*v0;\n+ t = u1*v0 + (w0 >> 16);\n+ w1 = t & 0xFFFF;\n+ w2 = t >> 16;\n+ w1 = u0*v1 + w1;\n+ return u1*v1 + w2 + (w1 >> 16);\n+}\n+\n+/* Verify no fold in any forwprop pass by checking the optimized IR\n+ for MULT_HIGHPART_EXPR (h*) and WIDEN_MULT_EXPR (w*). */\n+/* { dg-final { scan-tree-dump-not \" h\\\\* \" \"optimized\" } } */\n+/* { dg-final { scan-tree-dump-not \" w\\\\* \" \"optimized\" } } */\ndiff --git a/gcc/testsuite/gcc.dg/tree-ssa/long-mul-two-carry.c b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-two-carry.c\nnew file mode 100644\nindex 000000000000..b37d4a269f41\n--- /dev/null\n+++ b/gcc/testsuite/gcc.dg/tree-ssa/long-mul-two-carry.c\n@@ -0,0 +1,111 @@\n+/* { dg-do compile } */\n+/* { dg-options \"-O3 -fdump-tree-forwprop-details\" } */\n+\n+typedef __UINT32_TYPE__ uint32_t;\n+typedef __UINT64_TYPE__ uint64_t;\n+\n+/* High part using two separate carries (cross carry + low carry). */\n+uint64_t mulh_two_carry (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_hi = x >> 32;\n+ uint64_t x_lo = x & 0xFFFFFFFFUL;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo = y & 0xFFFFFFFFUL;\n+\n+ uint64_t lolo = x_lo * y_lo;\n+ uint64_t hilo = x_hi * y_lo;\n+ uint64_t lohi = x_lo * y_hi;\n+ uint64_t hihi = x_hi * y_hi;\n+\n+ uint64_t cross_sum = hilo + lohi;\n+ uint64_t cross_carry = (uint64_t)(cross_sum < hilo) << 32;\n+\n+ uint64_t cross_shifted = cross_sum << 32;\n+ uint64_t low_result = lolo + cross_shifted;\n+ uint64_t low_carry = (uint64_t)(low_result < cross_shifted);\n+\n+ uint64_t high = hihi + (cross_sum >> 32) + cross_carry + low_carry;\n+\n+ return high;\n+}\n+\n+/* Commuted operand order. 
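cross_sum and low_result swap their addends relative to\n+ mulh_two_carry, exercising the commuted forms of the match. 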
*/\n+uint64_t mulh_two_carry_comm (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_hi = x >> 32;\n+ uint64_t x_lo = x & 0xFFFFFFFFUL;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo = y & 0xFFFFFFFFUL;\n+\n+ uint64_t lolo = x_lo * y_lo;\n+ uint64_t lohi = x_lo * y_hi;\n+ uint64_t hilo = x_hi * y_lo;\n+ uint64_t hihi = x_hi * y_hi;\n+\n+ uint64_t cross_sum = lohi + hilo;\n+ uint64_t cross_carry = (uint64_t)(cross_sum < lohi) << 32;\n+\n+ uint64_t cross_shifted = cross_sum << 32;\n+ uint64_t low_result = cross_shifted + lolo;\n+ uint64_t low_carry = (uint64_t)(low_result < lolo);\n+\n+ uint64_t high = hihi + (cross_sum >> 32) + cross_carry + low_carry;\n+\n+ return high;\n+}\n+\n+/* 32-bit variant. */\n+uint32_t mulh_two_carry_32 (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_hi = x >> 16;\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+\n+ uint32_t lolo = x_lo * y_lo;\n+ uint32_t hilo = x_hi * y_lo;\n+ uint32_t lohi = x_lo * y_hi;\n+ uint32_t hihi = x_hi * y_hi;\n+\n+ uint32_t cross_sum = hilo + lohi;\n+ uint32_t cross_carry = (uint32_t)(cross_sum < hilo) << 16;\n+\n+ uint32_t cross_shifted = cross_sum << 16;\n+ uint32_t low_result = lolo + cross_shifted;\n+ uint32_t low_carry = (uint32_t)(low_result < cross_shifted);\n+\n+ uint32_t high = hihi + (cross_sum >> 16) + cross_carry + low_carry;\n+\n+ return high;\n+}\n+\n+/* Full multiply: both high and low parts. */\n+uint64_t full_mul_two_carry (uint64_t x, uint64_t y, uint64_t *lo)\n+{\n+ uint64_t x_hi = x >> 32;\n+ uint64_t x_lo = x & 0xFFFFFFFFUL;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo = y & 0xFFFFFFFFUL;\n+\n+ uint64_t lolo = x_lo * y_lo;\n+ uint64_t hilo = x_hi * y_lo;\n+ uint64_t lohi = x_lo * y_hi;\n+ uint64_t hihi = x_hi * y_hi;\n+\n+ uint64_t cross_sum = hilo + lohi;\n+ uint64_t cross_carry = (uint64_t)(cross_sum < hilo) << 32;\n+\n+ uint64_t cross_shifted = cross_sum << 32;\n+ uint64_t low_result = lolo + cross_shifted;\n+ uint64_t low_carry = (uint64_t)(low_result < cross_shifted);\n+\n+ uint64_t high = hihi + (cross_sum >> 32) + cross_carry + low_carry;\n+\n+ *lo = low_result;\n+ return high;\n+}\n+\n+/* { dg-final { scan-tree-dump-times \"Long multiplication high part folded.\" 4 \"forwprop3\" { target lp64 } } } */\n+/* { dg-final { scan-tree-dump-times \"Long multiplication high part folded.\" 1 \"forwprop3\" { target { ! lp64 } } } } */\n+/* { dg-final { scan-tree-dump-times \"Long multiplication low part folded.\" 4 \"forwprop3\" { target lp64 } } } */\n+/* { dg-final { scan-tree-dump-times \"Long multiplication low part folded.\" 1 \"forwprop3\" { target { ! lp64 } } } } */\ndiff --git a/gcc/testsuite/gcc.target/aarch64/long_mul.c b/gcc/testsuite/gcc.target/aarch64/long_mul.c\nnew file mode 100644\nindex 000000000000..f9c765380d8c\n--- /dev/null\n+++ b/gcc/testsuite/gcc.target/aarch64/long_mul.c\n@@ -0,0 +1,100 @@\n+/* { dg-do compile } */\n+/* { dg-options \"-O3\" } */\n+\n+typedef __UINT32_TYPE__ uint32_t;\n+typedef __UINT64_TYPE__ uint64_t;\n+\n+/* 64-bit ladder pattern for high part. 
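Expected to fold to a single MULT_HIGHPART_EXPR and emit umulh. 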
*/\n+uint64_t mulh_ladder (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_lo = x & 0xFFFFFFFF;\n+ uint64_t y_lo = y & 0xFFFFFFFF;\n+ uint64_t x_hi = x >> 32;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t t0 = y_lo * x_lo;\n+ uint64_t t1 = y_lo * x_hi;\n+ uint64_t t2 = y_hi * x_lo;\n+ uint64_t t3 = y_hi * x_hi;\n+ uint64_t t0_hi = t0 >> 32;\n+ uint64_t u0 = t0_hi + t1;\n+ uint64_t u0_lo = u0 & 0xFFFFFFFF;\n+ uint64_t u0_hi = u0 >> 32;\n+ uint64_t u1 = u0_lo + t2;\n+ uint64_t u1_hi = u1 >> 32;\n+ uint64_t u2 = u0_hi + t3;\n+ return u2 + u1_hi;\n+}\n+\n+/* 64-bit carry pattern for high part. */\n+uint64_t mulh_carry (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_lo = x & 0xFFFFFFFF;\n+ uint64_t x_hi = x >> 32;\n+ uint64_t y_lo = y & 0xFFFFFFFF;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo_x_hi = y_lo * x_hi;\n+ uint64_t y_hi_x_hi = y_hi * x_hi;\n+ uint64_t y_hi_x_lo = y_hi * x_lo;\n+ uint64_t y_lo_x_lo = y_lo * x_lo;\n+ uint64_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = cross_sum < y_lo_x_hi;\n+ uint64_t carry = (uint64_t) carry_out << 32;\n+ uint64_t y_lo_x_lo_hi = y_lo_x_lo >> 32;\n+ uint64_t cross_sum_lo = cross_sum & 0xFFFFFFFF;\n+ uint64_t cross_sum_hi = cross_sum >> 32;\n+ uint64_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ uint64_t interm = cross_sum_hi + y_hi_x_hi;\n+ uint64_t low_accum_hi = low_accum >> 32;\n+ uint64_t interm_plus_carry = interm + carry;\n+ return interm_plus_carry + low_accum_hi;\n+}\n+\n+/* 32-bit ladder pattern for high part. */\n+uint32_t mulh_ladder_32 (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t t0 = y_lo * x_lo;\n+ uint32_t t1 = y_lo * x_hi;\n+ uint32_t t2 = y_hi * x_lo;\n+ uint32_t t3 = y_hi * x_hi;\n+ uint32_t t0_hi = t0 >> 16;\n+ uint32_t u0 = t0_hi + t1;\n+ uint32_t u0_lo = u0 & 0xFFFF;\n+ uint32_t u0_hi = u0 >> 16;\n+ uint32_t u1 = u0_lo + t2;\n+ uint32_t u1_hi = u1 >> 16;\n+ uint32_t u2 = u0_hi + t3;\n+ return u2 + u1_hi;\n+}\n+\n+/* 32-bit carry pattern for high part. */\n+uint32_t mulh_carry_32 (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo_x_hi = y_lo * x_hi;\n+ uint32_t y_hi_x_hi = y_hi * x_hi;\n+ uint32_t y_hi_x_lo = y_hi * x_lo;\n+ uint32_t y_lo_x_lo = y_lo * x_lo;\n+ uint32_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = (cross_sum < y_lo_x_hi);\n+ uint32_t carry = (uint32_t) carry_out << 16;\n+ uint32_t y_lo_x_lo_hi = y_lo_x_lo >> 16;\n+ uint32_t cross_sum_lo = cross_sum & 0xFFFF;\n+ uint32_t cross_sum_hi = cross_sum >> 16;\n+ uint32_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ uint32_t interm = cross_sum_hi + y_hi_x_hi;\n+ uint32_t low_accum_hi = low_accum >> 16;\n+ uint32_t interm_plus_carry = interm + carry;\n+ return interm_plus_carry + low_accum_hi;\n+}\n+\n+/* 64-bit patterns should emit umulh. */\n+/* { dg-final { scan-assembler-times \"umulh\\t\" 2 } } */\n+/* 32-bit patterns should emit umull (32x32->64 widening multiply). */\n+/* { dg-final { scan-assembler-times \"umull\\t\" 2 } } */\ndiff --git a/gcc/testsuite/gcc.target/i386/long_mul.c b/gcc/testsuite/gcc.target/i386/long_mul.c\nnew file mode 100644\nindex 000000000000..d4b03ed1beda\n--- /dev/null\n+++ b/gcc/testsuite/gcc.target/i386/long_mul.c\n@@ -0,0 +1,100 @@\n+/* { dg-do compile { target { ! 
ia32 } } } */\n+/* { dg-options \"-O3\" } */\n+\n+typedef __UINT32_TYPE__ uint32_t;\n+typedef __UINT64_TYPE__ uint64_t;\n+\n+/* 64-bit ladder pattern for high part. */\n+uint64_t mulh_ladder (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_lo = x & 0xFFFFFFFF;\n+ uint64_t y_lo = y & 0xFFFFFFFF;\n+ uint64_t x_hi = x >> 32;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t t0 = y_lo * x_lo;\n+ uint64_t t1 = y_lo * x_hi;\n+ uint64_t t2 = y_hi * x_lo;\n+ uint64_t t3 = y_hi * x_hi;\n+ uint64_t t0_hi = t0 >> 32;\n+ uint64_t u0 = t0_hi + t1;\n+ uint64_t u0_lo = u0 & 0xFFFFFFFF;\n+ uint64_t u0_hi = u0 >> 32;\n+ uint64_t u1 = u0_lo + t2;\n+ uint64_t u1_hi = u1 >> 32;\n+ uint64_t u2 = u0_hi + t3;\n+ return u2 + u1_hi;\n+}\n+\n+/* 64-bit carry pattern for high part. */\n+uint64_t mulh_carry (uint64_t x, uint64_t y)\n+{\n+ uint64_t x_lo = x & 0xFFFFFFFF;\n+ uint64_t x_hi = x >> 32;\n+ uint64_t y_lo = y & 0xFFFFFFFF;\n+ uint64_t y_hi = y >> 32;\n+ uint64_t y_lo_x_hi = y_lo * x_hi;\n+ uint64_t y_hi_x_hi = y_hi * x_hi;\n+ uint64_t y_hi_x_lo = y_hi * x_lo;\n+ uint64_t y_lo_x_lo = y_lo * x_lo;\n+ uint64_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = cross_sum < y_lo_x_hi;\n+ uint64_t carry = (uint64_t) carry_out << 32;\n+ uint64_t y_lo_x_lo_hi = y_lo_x_lo >> 32;\n+ uint64_t cross_sum_lo = cross_sum & 0xFFFFFFFF;\n+ uint64_t cross_sum_hi = cross_sum >> 32;\n+ uint64_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ uint64_t interm = cross_sum_hi + y_hi_x_hi;\n+ uint64_t low_accum_hi = low_accum >> 32;\n+ uint64_t interm_plus_carry = interm + carry;\n+ return interm_plus_carry + low_accum_hi;\n+}\n+\n+/* 32-bit ladder pattern for high part. */\n+uint32_t mulh_ladder_32 (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t t0 = y_lo * x_lo;\n+ uint32_t t1 = y_lo * x_hi;\n+ uint32_t t2 = y_hi * x_lo;\n+ uint32_t t3 = y_hi * x_hi;\n+ uint32_t t0_hi = t0 >> 16;\n+ uint32_t u0 = t0_hi + t1;\n+ uint32_t u0_lo = u0 & 0xFFFF;\n+ uint32_t u0_hi = u0 >> 16;\n+ uint32_t u1 = u0_lo + t2;\n+ uint32_t u1_hi = u1 >> 16;\n+ uint32_t u2 = u0_hi + t3;\n+ return u2 + u1_hi;\n+}\n+\n+/* 32-bit carry pattern for high part. */\n+uint32_t mulh_carry_32 (uint32_t x, uint32_t y)\n+{\n+ uint32_t x_lo = x & 0xFFFF;\n+ uint32_t x_hi = x >> 16;\n+ uint32_t y_lo = y & 0xFFFF;\n+ uint32_t y_hi = y >> 16;\n+ uint32_t y_lo_x_hi = y_lo * x_hi;\n+ uint32_t y_hi_x_hi = y_hi * x_hi;\n+ uint32_t y_hi_x_lo = y_hi * x_lo;\n+ uint32_t y_lo_x_lo = y_lo * x_lo;\n+ uint32_t cross_sum = y_hi_x_lo + y_lo_x_hi;\n+ int carry_out = (cross_sum < y_lo_x_hi);\n+ uint32_t carry = (uint32_t) carry_out << 16;\n+ uint32_t y_lo_x_lo_hi = y_lo_x_lo >> 16;\n+ uint32_t cross_sum_lo = cross_sum & 0xFFFF;\n+ uint32_t cross_sum_hi = cross_sum >> 16;\n+ uint32_t low_accum = cross_sum_lo + y_lo_x_lo_hi;\n+ uint32_t interm = cross_sum_hi + y_hi_x_hi;\n+ uint32_t low_accum_hi = low_accum >> 16;\n+ uint32_t interm_plus_carry = interm + carry;\n+ return interm_plus_carry + low_accum_hi;\n+}\n+\n+/* 64-bit patterns should emit mulq (unsigned 64x64->128 multiply). */\n+/* { dg-final { scan-assembler-times \"\\tmulq\" 2 } } */\n+/* 32-bit patterns should emit imulq (64-bit multiply of zero-extended operands). 
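The high half is then extracted with a right shift by 32. 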
*/\n+/* { dg-final { scan-assembler-times \"\\timulq\" 2 } } */\ndiff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc\nindex b5544414ca6e..5aea445f0d72 100644\n--- a/gcc/tree-ssa-forwprop.cc\n+++ b/gcc/tree-ssa-forwprop.cc\n@@ -43,6 +43,7 @@ along with GCC; see the file COPYING3. If not see\n #include \"tree-ssa-propagate.h\"\n #include \"tree-ssa-dom.h\"\n #include \"tree-ssa-strlen.h\"\n+#include \"langhooks.h\"\n #include \"builtins.h\"\n #include \"tree-cfgcleanup.h\"\n #include \"cfganal.h\"\n@@ -3558,6 +3559,858 @@ simplify_count_zeroes (gimple_stmt_iterator *gsi)\n return true;\n }\n \n+/* Match.pd functions to match long multiplication. */\n+\n+extern bool gimple_mul_hi (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_hilo (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_lolo (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_low_sum (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_carry_cross_sum (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_carry_low_sum (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_hipart (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_hipart_long (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_low_accum (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_cross_sum (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_ladder_part_sum (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_ladder_sum1 (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_ladder_sum2 (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_ladder_sum3 (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_carry_low (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_low_part (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_low_part_long (tree, tree *, tree (*)(tree));\n+extern bool gimple_mul_low_plus (tree, tree *, tree (*)(tree));\n+\n+/* Create a sequence that multiplies OP1 and OP2 and extracts the high part.\n+ The sequence will replace STMT. Return true on success,\n+ false otherwise. */\n+\n+static bool\n+create_mul_high_seq (tree op1, tree op2, gimple *stmt)\n+{\n+ tree op_type = TREE_TYPE (op1);\n+ const unsigned int width = TYPE_PRECISION (op_type);\n+ scalar_int_mode op_mode = SCALAR_INT_TYPE_MODE (op_type);\n+\n+ /* Prefer MULT_HIGHPART_EXPR when the target supports it directly. */\n+ if (optab_handler (umul_highpart_optab, op_mode) != CODE_FOR_nothing)\n+ {\n+ gimple_seq new_seq = NULL;\n+ gimple *prod = gimple_build_assign (gimple_get_lhs (stmt),\n+\t\t\t\t\t MULT_HIGHPART_EXPR, op1, op2);\n+ gimple_seq_add_stmt (&new_seq, prod);\n+ annotate_all_with_location (new_seq, gimple_location (stmt));\n+\n+ gimple_stmt_iterator gsi = gsi_for_stmt (stmt);\n+ gsi_replace_with_seq (&gsi, new_seq, true);\n+ return true;\n+ }\n+\n+ /* Fall back to widening multiply + shift. */\n+ if (width > MAX_FIXED_MODE_SIZE)\n+ return false;\n+ opt_machine_mode new_mode = int_mode_for_size (width * 2, 0);\n+ if (!new_mode.exists ())\n+ return false;\n+\n+ machine_mode dw_mode = new_mode.require ();\n+\n+ /* Verify the target can efficiently perform the double-width multiply.\n+ Without hardware support the compiler would synthesize it via a\n+ library call, which is likely slower than the original longhand\n+ sequence. 
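So require either a widening multiply out of op_mode or a plain\n+ multiply at the double-width mode. 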
*/\n+ if (find_widening_optab_handler (umul_widen_optab, dw_mode, op_mode)\n+\t== CODE_FOR_nothing\n+ && optab_handler (smul_optab, dw_mode) == CODE_FOR_nothing)\n+ return false;\n+\n+ tree new_type = lang_hooks.types.type_for_mode (dw_mode, true);\n+ if (!new_type)\n+ return false;\n+\n+ gimple_seq new_seq = NULL;\n+ gimple *first_op_cast = gimple_build_assign (make_ssa_name (new_type),\n+\t\t\t\t\t NOP_EXPR, op1);\n+ gimple *sec_op_cast = gimple_build_assign (make_ssa_name (new_type),\n+\t\t\t\t\t NOP_EXPR, op2);\n+ gimple *prod = gimple_build_assign (make_ssa_name (new_type),\n+\t\t\t\t MULT_EXPR,\n+\t\t\t\t gimple_get_lhs (first_op_cast),\n+\t\t\t\t gimple_get_lhs (sec_op_cast));\n+ gimple *rshift = gimple_build_assign (make_ssa_name (new_type),\n+\t\t\t\t\tRSHIFT_EXPR, gimple_get_lhs (prod),\n+\t\t\t\t\tbuild_int_cst (integer_type_node,\n+\t\t\t\t\t\t width));\n+ gimple *rshift_cast = gimple_build_assign (gimple_get_lhs (stmt),\n+\t\t\t\t\t NOP_EXPR,\n+\t\t\t\t\t gimple_get_lhs (rshift));\n+\n+ gimple_seq_add_stmt (&new_seq, first_op_cast);\n+ gimple_seq_add_stmt (&new_seq, sec_op_cast);\n+ gimple_seq_add_stmt (&new_seq, prod);\n+\n+ gimple_seq_add_stmt (&new_seq, rshift);\n+ gimple_seq_add_stmt (&new_seq, rshift_cast);\n+ annotate_all_with_location (new_seq, gimple_location (stmt));\n+\n+ gimple_stmt_iterator gsi = gsi_for_stmt (stmt);\n+ gsi_replace_with_seq (&gsi, new_seq, true);\n+\n+ return true;\n+}\n+\n+/* Create a sequence that multiplies OP1 and OP2 and extracts the low part.\n+ The sequence will replace STMT. */\n+\n+static void\n+create_mul_low_seq (tree op1, tree op2, gimple *stmt)\n+{\n+ gimple_seq new_seq = NULL;\n+ gimple *prod = gimple_build_assign (gimple_get_lhs (stmt),\n+\t\t\t\t MULT_EXPR, op1, op2);\n+\n+ gimple_seq_add_stmt (&new_seq, prod);\n+ annotate_all_with_location (new_seq, gimple_location (stmt));\n+\n+ gimple_stmt_iterator gsi = gsi_for_stmt (stmt);\n+ gsi_replace_with_seq (&gsi, new_seq, true);\n+}\n+\n+/* Check that MATCHED_OPS contain OP1 and OP2 in any order. */\n+\n+static bool\n+check_matched_ops (tree op1, tree op2, tree *matched_ops)\n+{\n+ return (matched_ops[0] == op1 && matched_ops[1] == op2)\n+\t || (matched_ops[0] == op2 && matched_ops[1] == op1);\n+}\n+\n+/* Check whether HILO0 and HILO1 result from multiplying OP1's low part with\n+ OP2's high part, or vice versa. OPS is used as scratch space for\n+ gimple_mul_hilo and its contents are overwritten. */\n+\n+static bool\n+check_hilo_and_ops (tree op1, tree op2, tree hilo0,\n+\t\t tree hilo1, tree *ops)\n+{\n+ return hilo0 != hilo1\n+\t && gimple_mul_hilo (hilo0, ops, NULL)\n+\t && check_matched_ops (op1, op2, ops)\n+\t && gimple_mul_hilo (hilo1, ops, NULL)\n+\t && check_matched_ops (op1, op2, ops);\n+}\n+\n+/* Returns the code of the definition statement of SSA_NAME. */\n+\n+static tree_code\n+get_def_rhs_code (tree ssa_name)\n+{\n+ if (TREE_CODE (ssa_name) != SSA_NAME)\n+ return ERROR_MARK;\n+\n+ gimple *def = SSA_NAME_DEF_STMT (ssa_name);\n+ enum tree_code code = ERROR_MARK;\n+ if (def && gimple_code (def) == GIMPLE_ASSIGN)\n+ code = gimple_assign_rhs_code (def);\n+\n+ return code;\n+}\n+\n+/* Check if SSA_NAME corresponds to an addition. */\n+\n+static bool\n+check_ssa_name_addition (tree ssa_name)\n+{\n+ return get_def_rhs_code (ssa_name) == PLUS_EXPR;\n+}\n+\n+/* Checks if SSA_NAME corresponds to a code that is expected to be found\n+ in long multiplication's high part. 
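This is a cheap pre-filter that runs before the full match.pd\n+ patterns are tried. 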
*/\n+\n+static bool\n+check_ssa_name_hipart_common_exprs (tree ssa_name)\n+{\n+ enum tree_code code = get_def_rhs_code (ssa_name);\n+\n+ return code == MULT_EXPR\n+\t || code == RSHIFT_EXPR\n+\t || code == LSHIFT_EXPR\n+\t || code == PLUS_EXPR\n+\t || code == NOP_EXPR;\n+}\n+\n+/* Checks if STMT's RHS operands correspond to expressions expected in the\n+ high or low parts of a long multiplication. Returns false for non-integral\n+ or signed types and for types with odd precision. When HIGHPART is set\n+ (default), it checks against patterns associated with the high part\n+ (one operand is MULT/RSHIFT/LSHIFT/PLUS/NOP and the other is PLUS);\n+ otherwise, it checks against those of the low part (LSHIFT + BIT_AND\n+ for BIT_IOR_EXPR, or MULT + LSHIFT for PLUS_EXPR). */\n+\n+static bool\n+check_stmt_common_exprs (gimple *stmt, bool highpart = true)\n+{\n+ if (!is_gimple_assign (stmt))\n+ return false;\n+\n+ tree lhs_type = TREE_TYPE (gimple_assign_lhs (stmt));\n+\n+ /* Only handle unsigned integer types with even precision. */\n+ if (!INTEGRAL_TYPE_P (lhs_type)\n+ || !TYPE_UNSIGNED (lhs_type)\n+ || TYPE_PRECISION (lhs_type) % 2 != 0)\n+ return false;\n+\n+ /* Only binary expressions (PLUS_EXPR, BIT_IOR_EXPR) are handled. */\n+ if (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))\n+ != GIMPLE_BINARY_RHS)\n+ return false;\n+\n+ tree rhs1 = gimple_assign_rhs1 (stmt);\n+ tree rhs2 = gimple_assign_rhs2 (stmt);\n+\n+ if (highpart)\n+ return (check_ssa_name_hipart_common_exprs (rhs1)\n+\t && check_ssa_name_addition (rhs2))\n+\t || (check_ssa_name_hipart_common_exprs (rhs2)\n+\t && check_ssa_name_addition (rhs1));\n+\n+ enum tree_code def_code1 = get_def_rhs_code (rhs1);\n+ enum tree_code def_code2 = get_def_rhs_code (rhs2);\n+ return (def_code1 == LSHIFT_EXPR && def_code2 == BIT_AND_EXPR)\n+\t || (def_code2 == LSHIFT_EXPR && def_code1 == BIT_AND_EXPR)\n+\t || (def_code1 == MULT_EXPR && def_code2 == LSHIFT_EXPR)\n+\t || (def_code2 == MULT_EXPR && def_code1 == LSHIFT_EXPR);\n+}\n+\n+/* Match and fold the high part of the multiplication when the carry is\n+ expressed as a comparison. Return true on success. These sequences look\n+ like:\n+ ((xh*yl > lowsum) << N) + (lowsum >> N) + xh*yh\n+ lowsum = xh*yl + xl*yh + ((xl*yl) >> N). */\n+\n+static bool\n+fold_mul_high_carry (gimple *stmt)\n+{\n+ tree lhs = gimple_get_lhs (stmt);\n+ /* Scratch buffer, reused across successive pattern matches. */\n+ tree res_ops[7];\n+ tree low_sum_ops[6];\n+\n+ /* Try to match a generic version of the pattern, like\n+ xh*yh + carry + shr. */\n+ if (!gimple_mul_hipart (lhs, res_ops, NULL))\n+ return false;\n+\n+ tree op1 = res_ops[0];\n+ tree op2 = res_ops[1];\n+ tree carry = res_ops[2];\n+ tree shr = res_ops[3];\n+ /* Find the operand that contains the comparison part:\n+ (xh*yl > lowsum) << N. */\n+ if (!gimple_mul_carry_low_sum (carry, res_ops, NULL))\n+ {\n+ if (gimple_mul_carry_low_sum (shr, res_ops, NULL))\n+\tstd::swap (carry, shr);\n+ else\n+\treturn false;\n+ }\n+\n+ /* Check that mul_lolo operands in the carry comparison match. */\n+ if (!check_matched_ops (op1, op2, res_ops))\n+ return false;\n+\n+ tree mul_hilo0 = res_ops[2];\n+ /* Check that the operands matched in the pattern are indeed high-low\n+ part products. */\n+ if (!gimple_mul_hilo (mul_hilo0, res_ops, NULL)\n+ || !check_matched_ops (op1, op2, res_ops))\n+ return false;\n+\n+ /* Check for the (lowsum >> N) part. 
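Its hilo products must use the same operands as the rest of\n+ the pattern. 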
*/\n+ if (!gimple_mul_hi (shr, res_ops, NULL)\n+ || !gimple_mul_low_sum (res_ops[0], low_sum_ops, NULL)\n+ || !check_matched_ops (op1, op2, low_sum_ops)\n+ || !check_hilo_and_ops (op1, op2, low_sum_ops[2], low_sum_ops[3],\n+\t\t\t res_ops))\n+ return false;\n+\n+ return create_mul_high_seq (op1, op2, stmt);\n+}\n+\n+/* Like fold_mul_high_carry, but this tries to match and fold a longer\n+ sequence. This sequence looks like:\n+ ((xh*yl > cross_sum) << N) + (cross_sum >> N) + (xl*yl >> N)\n+ + (cross_sum & mask) + xh*yh\n+ cross_sum = xh*yl + xl*yh. */\n+\n+static bool\n+fold_mul_high_carry_long (gimple *stmt)\n+{\n+ tree lhs = gimple_get_lhs (stmt);\n+ /* Scratch buffer, reused across successive pattern matches. */\n+ tree res_ops[6];\n+ tree low_accum_ops[6];\n+\n+ /* Try to match a generic version of the pattern, like\n+ xh*yh + carry + shr_cross_sum + shr_low_accum. */\n+ if (!gimple_mul_hipart_long (lhs, res_ops, NULL))\n+ return false;\n+\n+ tree op1 = res_ops[0];\n+ tree op2 = res_ops[1];\n+\n+ tree carry = res_ops[2];\n+ tree shr_cross_sum = res_ops[3];\n+ tree shr_low_accum = res_ops[4];\n+ /* Find the operand that contains the comparison part:\n+ (xh*yl > cross_sum) << N. */\n+ if (!gimple_mul_carry_cross_sum (carry, res_ops, NULL))\n+ {\n+ if (gimple_mul_carry_cross_sum (shr_cross_sum, res_ops, NULL))\n+\tstd::swap (carry, shr_cross_sum);\n+ else if (gimple_mul_carry_cross_sum (shr_low_accum, res_ops, NULL))\n+\tstd::swap (carry, shr_low_accum);\n+ else\n+\treturn false;\n+ }\n+\n+ tree mul_hilo = res_ops[2];\n+ /* Check if hilo is matched. */\n+ if (!gimple_mul_hilo (mul_hilo, res_ops, NULL)\n+ || !check_matched_ops (op1, op2, res_ops))\n+ return false;\n+\n+ /* Find the operand that corresponds to (cross_sum >> N). */\n+ if (!gimple_mul_hi (shr_cross_sum, res_ops, NULL)\n+ || !gimple_mul_cross_sum (res_ops[0], res_ops, NULL))\n+ {\n+ std::swap (shr_cross_sum, shr_low_accum);\n+ if (!gimple_mul_hi (shr_cross_sum, res_ops, NULL)\n+\t || !gimple_mul_cross_sum (res_ops[0], res_ops, NULL))\n+\treturn false;\n+ }\n+\n+ /* Check for the (xl*yl >> N) + (cross_sum & mask) part. */\n+ if (!gimple_mul_hi (shr_low_accum, res_ops, NULL)\n+ || !gimple_mul_low_accum (res_ops[0], low_accum_ops, NULL))\n+ return false;\n+\n+ /* Check if mul_lolo operands and hilos are matched. */\n+ if (!check_matched_ops (op1, op2, low_accum_ops)\n+ || !check_hilo_and_ops (op1, op2, low_accum_ops[2], low_accum_ops[3],\n+\t\t\t res_ops))\n+ return false;\n+\n+ return create_mul_high_seq (op1, op2, stmt);\n+}\n+\n+/* Match and fold the high part of the multiplication when both cross-carry\n+ and low-carry are present as separate comparisons. Return true on success.\n+ These sequences look like:\n+ ((hilo > cross_sum) << N) + (cross_sum >> N) + (carry_low) + xh*yh\n+ cross_sum = xh*yl + xl*yh\n+ carry_low = (convert?) ((cross_sum << N) > (lolo + (cross_sum << N)))\n+\n+ After folding the high part, also fold the low-part PLUS\n+ (lolo + (cross_sum << N)) if found among the uses of\n+ (cross_sum << N). */\n+\n+static bool\n+fold_mul_high_two_carries (gimple *stmt)\n+{\n+ tree lhs = gimple_get_lhs (stmt);\n+ tree res_ops[6];\n+ tree cross_sum_ops[2];\n+\n+ /* Try to match xh*yh + inst1 + inst2 + inst3. */\n+ if (!gimple_mul_hipart_long (lhs, res_ops, NULL))\n+ return false;\n+\n+ tree op1 = res_ops[0];\n+ tree op2 = res_ops[1];\n+\n+ /* Three addends beyond xh*yh. */\n+ tree addend[3] = { res_ops[2], res_ops[3], res_ops[4] };\n+\n+ /* Identify the cross carry: (hilo > cross_sum) << halfwidth. 
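It can appear in any of the three addend positions. 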
*/\n+ int cross_carry_idx = -1;\n+ tree cross_carry_ops[4];\n+ for (int i = 0; i < 3; i++)\n+ if (gimple_mul_carry_cross_sum (addend[i], cross_carry_ops, NULL))\n+ {\n+\tcross_carry_idx = i;\n+\tbreak;\n+ }\n+ if (cross_carry_idx < 0)\n+ return false;\n+\n+ /* Validate the hilo in the cross carry. */\n+ tree cross_hilo = cross_carry_ops[2];\n+ if (!gimple_mul_hilo (cross_hilo, res_ops, NULL)\n+ || !check_matched_ops (op1, op2, res_ops))\n+ return false;\n+\n+ /* Identify the cross_sum >> halfwidth addend. */\n+ int shr_idx = -1;\n+ for (int i = 0; i < 3; i++)\n+ {\n+ if (i == cross_carry_idx)\n+\tcontinue;\n+ if (gimple_mul_hi (addend[i], res_ops, NULL)\n+\t && gimple_mul_cross_sum (res_ops[0], cross_sum_ops, NULL))\n+\t{\n+\t shr_idx = i;\n+\t break;\n+\t}\n+ }\n+ if (shr_idx < 0)\n+ return false;\n+\n+ /* Validate the hilos within cross_sum. */\n+ if (!check_hilo_and_ops (op1, op2, cross_sum_ops[0], cross_sum_ops[1],\n+\t\t\t res_ops))\n+ return false;\n+\n+ /* The remaining addend must be the low carry. */\n+ int low_carry_idx = 3 - cross_carry_idx - shr_idx;\n+ tree low_carry_ops[2];\n+ if (!gimple_mul_carry_low (addend[low_carry_idx], low_carry_ops, NULL))\n+ return false;\n+\n+ /* low_carry_ops captures can be in either order depending on\n+ whether the GT or LT form matched. Try both orientations. */\n+ tree cross_shifted = low_carry_ops[0];\n+ tree lolo_candidate = low_carry_ops[1];\n+\n+ /* Validate cross_shifted is LSHIFT_EXPR(cross_sum, halfwidth).\n+ If not, try the other orientation. */\n+ auto is_lshift_def = [] (tree t) -> bool {\n+ if (TREE_CODE (t) != SSA_NAME)\n+ return false;\n+ gimple *def = SSA_NAME_DEF_STMT (t);\n+ return def && is_gimple_assign (def)\n+\t && gimple_assign_rhs_code (def) == LSHIFT_EXPR;\n+ };\n+\n+ if (!is_lshift_def (cross_shifted))\n+ {\n+ if (is_lshift_def (lolo_candidate))\n+\tstd::swap (cross_shifted, lolo_candidate);\n+ else\n+\treturn false;\n+ }\n+\n+ gimple *cs_def = SSA_NAME_DEF_STMT (cross_shifted);\n+\n+ tree cs_inner = gimple_assign_rhs1 (cs_def);\n+ tree cs_shift = gimple_assign_rhs2 (cs_def);\n+\n+ tree lhs_type = TREE_TYPE (gimple_get_lhs (stmt));\n+ unsigned halfwidth = TYPE_PRECISION (lhs_type) / 2;\n+ if (!tree_fits_uhwi_p (cs_shift)\n+ || tree_to_uhwi (cs_shift) != halfwidth)\n+ return false;\n+\n+ /* cs_inner should be a cross_sum with valid hilos. */\n+ if (!gimple_mul_cross_sum (cs_inner, cross_sum_ops, NULL)\n+ || !check_hilo_and_ops (op1, op2, cross_sum_ops[0], cross_sum_ops[1],\n+\t\t\t res_ops))\n+ return false;\n+\n+ /* Validate lolo_candidate is mul_lolo(op1, op2). */\n+ if (!gimple_mul_lolo (lolo_candidate, res_ops, NULL)\n+ || !check_matched_ops (op1, op2, res_ops))\n+ return false;\n+\n+ if (!create_mul_high_seq (op1, op2, stmt))\n+ return false;\n+\n+ /* Also fold the low-part PLUS: lolo + (cross_sum << halfwidth).\n+ Walk uses of cross_shifted to find a PLUS_EXPR. Save the target\n+ and fold after the loop to avoid mutating the use chain of\n+ cross_shifted during iteration. */\n+ gimple *low_plus_stmt = NULL;\n+ gimple *use_stmt;\n+ imm_use_iterator iter;\n+ FOR_EACH_IMM_USE_STMT (use_stmt, iter, cross_shifted)\n+ {\n+ if (!is_gimple_assign (use_stmt)\n+\t || gimple_assign_rhs_code (use_stmt) != PLUS_EXPR)\n+\tcontinue;\n+\n+ tree plus_rhs1 = gimple_assign_rhs1 (use_stmt);\n+ tree plus_rhs2 = gimple_assign_rhs2 (use_stmt);\n+ tree other = (plus_rhs1 == cross_shifted) ? plus_rhs2\n+\t\t : (plus_rhs2 == cross_shifted) ? 
plus_rhs1 : NULL_TREE;\n+ if (!other)\n+\tcontinue;\n+\n+ if (gimple_mul_lolo (other, res_ops, NULL)\n+\t && check_matched_ops (op1, op2, res_ops))\n+\t{\n+\t low_plus_stmt = use_stmt;\n+\t break;\n+\t}\n+ }\n+\n+ if (low_plus_stmt)\n+ {\n+ create_mul_low_seq (op1, op2, low_plus_stmt);\n+ if (dump_file && (dump_flags & TDF_DETAILS))\n+\tfprintf (dump_file, \"Long multiplication low part folded.\\n\");\n+ }\n+\n+ return true;\n+}\n+\n+/* Match and fold the high part of the multiplication when the carry is not\n+ expressed as a comparison. Return true on success. The sequences\n+ look like:\n+ (((xl*yh & mask) + xh*yl + (xl*yl >> N)) >> N) + (xh*yl >> N)\n+ + xh*yh\n+ or\n+ (((ladder_part_sum & mask) + xl*yh) >> N) + (ladder_part_sum >> N)\n+ + xh*yh\n+ ladder_part_sum = (xl*yl >> N) + xh*yl. */\n+\n+static bool\n+fold_mul_high_ladder (gimple *stmt)\n+{\n+ tree lhs = gimple_get_lhs (stmt);\n+ /* Scratch buffer, reused across successive pattern matches. */\n+ tree res_ops[6];\n+ tree ladder_sum_ops[6];\n+ tree inst1_ops[2];\n+ tree inst2_ops[2];\n+\n+ /* Try to match a generic version of the pattern, like\n+ xh*yh + inst1 + inst2. */\n+ if (!gimple_mul_hipart (lhs, res_ops, NULL))\n+ return false;\n+\n+ tree op1 = res_ops[0];\n+ tree op2 = res_ops[1];\n+ tree inst1 = res_ops[2];\n+ tree inst2 = res_ops[3];\n+\n+ /* Check if both inst1 and inst2 are right shifts. */\n+ if (!gimple_mul_hi (inst1, inst1_ops, NULL)\n+ || !gimple_mul_hi (inst2, inst2_ops, NULL))\n+ return false;\n+\n+ tree inst3 = inst1_ops[0];\n+ tree inst4 = inst2_ops[0];\n+\n+ /* Check whether both inst3 and inst4 match the patterns, depending on the\n+ sequence form. */\n+ tree insts[2] = { inst3, inst4 };\n+ for (int i = 0; i < 2; ++i)\n+ {\n+ int j = 1 - i;\n+ /* Check for ((xl*yh & mask) + xh*yl + (xl*yl >> N)). */\n+ if (gimple_mul_ladder_sum1 (insts[i], res_ops, NULL))\n+\t{\n+\t /* Save captured hilos before gimple_mul_hilo overwrites\n+\t res_ops. */\n+\t tree hilo0 = res_ops[2];\n+\t tree hilo1 = res_ops[3];\n+\t if (check_matched_ops (op1, op2, res_ops)\n+\t && gimple_mul_hilo (insts[j], res_ops, NULL)\n+\t && check_matched_ops (op1, op2, res_ops)\n+\t && check_hilo_and_ops (op1, op2, hilo0, hilo1, res_ops))\n+\t return create_mul_high_seq (op1, op2, stmt);\n+\t}\n+ /* Check for (ladder_part_sum & mask) + xl*yh. */\n+ if (gimple_mul_ladder_sum2 (insts[i], ladder_sum_ops, NULL))\n+\t{\n+\t /* Check for mul_lolo operands, (xl*yl >> N) + xh*yl. */\n+\t if (check_matched_ops (op1, op2, ladder_sum_ops)\n+\t && gimple_mul_ladder_part_sum (insts[j], res_ops, NULL)\n+\t && gimple_mul_hilo (res_ops[2], res_ops, NULL)\n+\t && check_matched_ops (op1, op2, res_ops)\n+\t && check_hilo_and_ops (op1, op2, ladder_sum_ops[2],\n+\t\t\t\t ladder_sum_ops[3], ladder_sum_ops))\n+\t return create_mul_high_seq (op1, op2, stmt);\n+\t}\n+ }\n+\n+ return false;\n+}\n+\n+/* Like fold_mul_high_ladder, but this tries to match and fold a longer\n+ sequence. This sequence looks like:\n+ (ladder_sum_3 >> N) + (xh*yl >> N) + (xl*yh >> N) + xh*yh\n+ ladder_sum_3 = ((xl*yh & mask) + (xh*yl & mask) + (xl*yl >> N)). */\n+\n+static bool\n+fold_mul_high_ladder_long (gimple *stmt)\n+{\n+ tree lhs = gimple_get_lhs (stmt);\n+ /* Scratch buffer, reused across successive pattern matches. */\n+ tree res_ops[6];\n+\n+ /* Try to match a generic version of the pattern, like\n+ xh*yh + ladder_sum_3_shr + shr1 + shr2. 
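Each of the three non-hihi addends is tried as the shifted\n+ ladder_sum_3. 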
*/\n+ if (!gimple_mul_hipart_long (lhs, res_ops, NULL))\n+ return false;\n+\n+ tree op1 = res_ops[0];\n+ tree op2 = res_ops[1];\n+ tree ladder_sum_3_shr = res_ops[2];\n+ tree shr1 = res_ops[3];\n+ tree shr2 = res_ops[4];\n+\n+ /* Find which of the three addends is ladder_sum_3 >> N.\n+ Try each candidate; put the match in ladder_sum_3_shr and the\n+ remaining two in shr1/shr2. */\n+ {\n+ tree candidates[3] = { ladder_sum_3_shr, shr1, shr2 };\n+ bool found = false;\n+ for (int i = 0; i < 3; i++)\n+ if (gimple_mul_hi (candidates[i], res_ops, NULL)\n+\t && check_ssa_name_addition (res_ops[0]))\n+\t{\n+\t ladder_sum_3_shr = candidates[i];\n+\t shr1 = candidates[(i + 1) % 3];\n+\t shr2 = candidates[(i + 2) % 3];\n+\t found = true;\n+\t break;\n+\t}\n+ if (!found)\n+ return false;\n+ }\n+\n+ /* Try to match ladder_sum_3. */\n+ if (!gimple_mul_ladder_sum3 (res_ops[0], res_ops, NULL))\n+ return false;\n+\n+ /* Check that mul_lolo operands match. */\n+ if (!check_matched_ops (op1, op2, res_ops))\n+ return false;\n+\n+ tree mul_hilo0 = res_ops[2];\n+ tree mul_hilo1 = res_ops[3];\n+ /* Check for hilos in the matched sequence. */\n+ if (!check_hilo_and_ops (op1, op2, mul_hilo0, mul_hilo1, res_ops))\n+ return false;\n+\n+ /* Check for (xh*yl >> N) and (xl*yh >> N). */\n+ if (!gimple_mul_hi (shr1, res_ops, NULL)\n+ || !gimple_mul_hilo (res_ops[0], res_ops, NULL)\n+ || !check_matched_ops (op1, op2, res_ops)\n+ || !gimple_mul_hi (shr2, res_ops, NULL)\n+ || !gimple_mul_hilo (res_ops[0], res_ops, NULL)\n+ || !check_matched_ops (op1, op2, res_ops))\n+ return false;\n+\n+ return create_mul_high_seq (op1, op2, stmt);\n+}\n+\n+/* Match and fold the low part of the multiplication for sequences like:\n+ (mul_sum << N) | (xl*yl & mask),\n+ where mul_sum = (xl*yl >> N) + ((xh*yl + xl*yh) & mask)\n+ or\n+ mul_sum = (xh*yl + xl*yh) + (xl*yl >> N).\n+ Returns true on success. */\n+\n+static bool\n+fold_mul_low_carry_long (gimple *stmt)\n+{\n+ tree lhs = gimple_get_lhs (stmt);\n+ /* Scratch buffer, reused across successive pattern matches. */\n+ tree res_ops[5];\n+ tree low_accum_ops[6];\n+ tree mul_cross_sum_ops[2];\n+\n+ /* Try to match a generic version of the pattern, like\n+ (mul_sum << N) | (xl*yl & mask). */\n+ if (!gimple_mul_low_part_long (lhs, res_ops, NULL))\n+ return false;\n+\n+ tree op1 = res_ops[0];\n+ tree op2 = res_ops[1];\n+\n+ tree mul_sum = res_ops[2];\n+ /* Check for (xl*yl >> N) + ((xh*yl + xl*yh) & mask). */\n+ if (gimple_mul_low_accum (mul_sum, low_accum_ops, NULL)\n+ && check_matched_ops (op1, op2, low_accum_ops)\n+ && check_hilo_and_ops (op1, op2, low_accum_ops[2], low_accum_ops[3],\n+\t\t\t res_ops))\n+ {\n+ create_mul_low_seq (op1, op2, stmt);\n+ return true;\n+ }\n+\n+ if (!check_ssa_name_addition (mul_sum))\n+ return false;\n+\n+ gimple *def = SSA_NAME_DEF_STMT (mul_sum);\n+ tree cross_sum = gimple_assign_rhs1 (def);\n+ tree mul_hi = gimple_assign_rhs2 (def);\n+ /* Check for (xh*yl + xl*yh). */\n+ if (!gimple_mul_cross_sum (cross_sum, mul_cross_sum_ops, NULL))\n+ {\n+ std::swap (cross_sum, mul_hi);\n+ if (!gimple_mul_cross_sum (cross_sum, mul_cross_sum_ops, NULL))\n+\treturn false;\n+ }\n+\n+ /* Check for hilo products in the matched sequence. */\n+ if (!check_hilo_and_ops (op1, op2, mul_cross_sum_ops[0],\n+\t\t\t mul_cross_sum_ops[1], res_ops))\n+ return false;\n+\n+ /* Check for (xl*yl >> N). 
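The shifted value must be the low-low product of the same\n+ operands. 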
*/\n+ if (!gimple_mul_hi (mul_hi, res_ops, NULL)\n+ || !gimple_mul_lolo (res_ops[0], res_ops, NULL)\n+ || !check_matched_ops (op1, op2, res_ops))\n+ return false;\n+\n+ create_mul_low_seq (op1, op2, stmt);\n+\n+ return true;\n+}\n+\n+/* Match and fold the low part of the multiplication for sequences like:\n+ (mul_ladder_sum << N) | (xl*yl & mask), where\n+ mul_ladder_sum = ((xl*yh & mask) + xh*yl + (xl*yl >> N))\n+ or\n+ mul_ladder_sum = ((((xl*yl >> N) + xh*yl) & mask) + xl*yh)\n+ or\n+ mul_ladder_sum = ((xl*yh & mask) + (xh*yl & mask)\n+ + (xl*yl >> N)). */\n+\n+static bool\n+fold_mul_low_ladder (gimple *stmt)\n+{\n+ tree lhs = gimple_get_lhs (stmt);\n+ /* Scratch buffer, reused across successive pattern matches. */\n+ tree res_ops[6];\n+\n+ /* Try to match (mul_ladder_sum << N) | (xl*yl & mask). */\n+ if (!gimple_mul_low_part (lhs, res_ops, NULL))\n+ return false;\n+\n+ tree op1 = res_ops[0];\n+ tree op2 = res_ops[1];\n+\n+ tree mul_hilo0 = res_ops[2];\n+ tree mul_hilo1 = res_ops[3];\n+ /* Check for hilo products in the matched sequence. */\n+ if (!check_hilo_and_ops (op1, op2, mul_hilo0, mul_hilo1, res_ops))\n+ return false;\n+\n+ create_mul_low_seq (op1, op2, stmt);\n+\n+ return true;\n+}\n+\n+/* Call the folding functions for the high part of the multiplication.\n+ Return true if any of them succeeds. */\n+\n+static bool\n+fold_mul_hi (gimple *stmt)\n+{\n+ /* Pre-check for expected expressions before pattern matching. */\n+ if (!check_stmt_common_exprs (stmt))\n+ return false;\n+\n+ if (fold_mul_high_carry (stmt)\n+ || fold_mul_high_carry_long (stmt)\n+ || fold_mul_high_two_carries (stmt)\n+ || fold_mul_high_ladder (stmt)\n+ || fold_mul_high_ladder_long (stmt))\n+ {\n+ if (dump_file && (dump_flags & TDF_DETAILS))\n+\tfprintf (dump_file, \"Long multiplication high part folded.\\n\");\n+ return true;\n+ }\n+\n+ return false;\n+}\n+\n+/* Match and fold the low part of the multiplication when it is a plain\n+ PLUS_EXPR: low_result = lolo + (cross_sum << halfwidth).\n+ Uses the mul_low_plus match.pd pattern for structural recognition.\n+ We skip folding if the result is used in a GT_EXPR, because the\n+ high-part two-carry fold needs the carry pattern intact. */\n+\n+static bool\n+fold_mul_low_plus (gimple *stmt)\n+{\n+ tree lhs = gimple_assign_lhs (stmt);\n+ tree rhs1 = gimple_assign_rhs1 (stmt);\n+ tree rhs2 = gimple_assign_rhs2 (stmt);\n+\n+ /* mul_low_plus: 6 captures (op0, op1, hilo0, hilo1, mask_cst, shift_cst). */\n+ tree res_ops[6];\n+ if (!gimple_mul_low_plus (lhs, res_ops, NULL))\n+ return false;\n+\n+ tree op1 = res_ops[0];\n+ tree op2 = res_ops[1];\n+\n+ /* Validate that the hilos are consistent cross products. */\n+ if (!check_hilo_and_ops (op1, op2, res_ops[2], res_ops[3], res_ops))\n+ return false;\n+\n+ /* If the result feeds an unsigned overflow comparison, the high-part\n+ fold needs this pattern intact -- defer. The overflow idiom is\n+ addend > (addend + other), so we only defer when one comparison\n+ operand is the sum (lhs) and the other is one of its addends. 
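E.g. low_result < cross_shifted in the two-carry tests. 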
*/\n+ gimple *use_stmt;\n+ imm_use_iterator iter;\n+ FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)\n+ {\n+ tree cmp_op1 = NULL_TREE, cmp_op2 = NULL_TREE;\n+ enum tree_code use_code = ERROR_MARK;\n+ if (is_gimple_assign (use_stmt))\n+\t{\n+\t use_code = gimple_assign_rhs_code (use_stmt);\n+\t cmp_op1 = gimple_assign_rhs1 (use_stmt);\n+\t cmp_op2 = gimple_assign_rhs2 (use_stmt);\n+\t}\n+ else if (gcond *cond = dyn_cast<gcond *> (use_stmt))\n+\t{\n+\t use_code = gimple_cond_code (cond);\n+\t cmp_op1 = gimple_cond_lhs (cond);\n+\t cmp_op2 = gimple_cond_rhs (cond);\n+\t}\n+ if (use_code == GT_EXPR || use_code == LT_EXPR\n+\t || use_code == GE_EXPR || use_code == LE_EXPR)\n+\t{\n+\t tree other = (cmp_op1 == lhs) ? cmp_op2\n+\t\t : (cmp_op2 == lhs) ? cmp_op1 : NULL_TREE;\n+\t if (other && (other == rhs1 || other == rhs2))\n+\t return false;\n+\t}\n+ }\n+\n+ create_mul_low_seq (op1, op2, stmt);\n+\n+ return true;\n+}\n+\n+/* Call the folding functions for the low part of the multiplication.\n+ Handles both BIT_IOR_EXPR and PLUS_EXPR forms. Return true if any\n+ of them succeeds. */\n+\n+static bool\n+fold_mul_low (gimple *stmt)\n+{\n+ /* Pre-check for expected expressions before pattern matching. */\n+ if (!check_stmt_common_exprs (stmt, false))\n+ return false;\n+\n+ if (fold_mul_low_carry_long (stmt) || fold_mul_low_ladder (stmt)\n+ || fold_mul_low_plus (stmt))\n+ {\n+ if (dump_file && (dump_flags & TDF_DETAILS))\n+\tfprintf (dump_file, \"Long multiplication low part folded.\\n\");\n+ return true;\n+ }\n+\n+ return false;\n+}\n \n /* Determine whether applying the 2 permutations (mask1 then mask2)\n gives back one of the input. */\n@@ -5810,11 +6663,19 @@ pass_forwprop::execute (function *fun)\n \t\t }\n \t\t else if (TREE_CODE_CLASS (code) == tcc_comparison)\n \t\t changed |= forward_propagate_into_comparison (&gsi);\n-\t\t else if ((code == PLUS_EXPR\n-\t\t\t || code == BIT_IOR_EXPR\n-\t\t\t || code == BIT_XOR_EXPR)\n-\t\t\t && simplify_rotate (&gsi))\n-\t\t changed = true;\n+\t\t else if ((code == PLUS_EXPR || code == BIT_IOR_EXPR))\n+\t\t {\n+\t\t\tbool folded = false;\n+\t\t\tif (code == PLUS_EXPR)\n+\t\t\t folded = fold_mul_hi (stmt);\n+\t\t\tif (!folded)\n+\t\t\t folded = fold_mul_low (stmt);\n+\t\t\tif (!folded)\n+\t\t\t folded = simplify_rotate (&gsi);\n+\t\t\tchanged |= folded;\n+\t\t }\n+\t\t else if (code == BIT_XOR_EXPR)\n+\t\t changed |= simplify_rotate (&gsi);\n \t\t else if (code == VEC_PERM_EXPR)\n \t\t changed |= simplify_permutation (&gsi);\n \t\t else if (code == CONSTRUCTOR\n", "prefixes": [ "v3", "2/2" ] }