diff mbox series

AArch64: Fix expansion of Advanced SIMD div and mul using SVE [PR109636]

Message ID patch-18153-tamar@arm.com
State New
Headers show
Series AArch64: Fix expansion of Advanced SIMD div and mul using SVE [PR109636] | expand

Commit Message

Tamar Christina Jan. 24, 2024, 9:20 a.m. UTC
Hi All,

As suggested in the ticket this replaces the expansion by converting the
Advanced SIMD types to SVE types by simply printing out an SVE register for
these instructions.

This fixes the subreg issues since there are no subregs involved anymore.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR target/109636
	* config/aarch64/aarch64-simd.md (<su_optab>div<mode>3,
	mulv2di3): Remove.
	* config/aarch64/iterators.md (VQDIV): Remove.
	(SVE_FULL_SDI_SIMD, SVE_FULL_SDI_SIMD_DI, SVE_FULL_HSDI_SIMD_DI,
	SVE_I_SIMD_DI): New.
	(VPRED, sve_lane_con): Add V4SI and V2DI.
	* config/aarch64/aarch64-sve.md (<optab><mode>3,
	@aarch64_pred_<optab><mode>): Support Advanced SIMD types.
	(mul<mode>3): New, split from <optab><mode>3.
	(@aarch64_pred_<optab><mode>, *post_ra_<optab><mode>3): New.
	* config/aarch64/aarch64-sve2.md (@aarch64_mul_lane_<mode>,
	*aarch64_mul_unpredicated_<mode>): Change SVE_FULL_HSDI to
	SVE_FULL_HSDI_SIMD_DI.

gcc/testsuite/ChangeLog:

	PR target/109636
	* gcc.target/aarch64/sve/pr109636_1.c: New test.
	* gcc.target/aarch64/sve/pr109636_2.c: New test.
	* gcc.target/aarch64/sve2/pr109636_1.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6f48b4d5f21da9f96a376cd6b34110c2a39deb33..556d0cf359fedf2c28dfe1e0a75e1c12321be68a 100644




--
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6f48b4d5f21da9f96a376cd6b34110c2a39deb33..556d0cf359fedf2c28dfe1e0a75e1c12321be68a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -389,26 +389,6 @@ (define_insn "mul<mode>3<vczle><vczbe>"
   [(set_attr "type" "neon_mul_<Vetype><q>")]
 )
 
-;; Advanced SIMD does not support vector DImode MUL, but SVE does.
-;; Make use of the overlap between Z and V registers to implement the V2DI
-;; optab for TARGET_SVE.  The mulvnx2di3 expander can
-;; handle the TARGET_SVE2 case transparently.
-(define_expand "mulv2di3"
-  [(set (match_operand:V2DI 0 "register_operand")
-        (mult:V2DI (match_operand:V2DI 1 "register_operand")
-		   (match_operand:V2DI 2 "aarch64_sve_vsm_operand")))]
-  "TARGET_SVE"
-  {
-    machine_mode sve_mode = VNx2DImode;
-    rtx sve_op0 = simplify_gen_subreg (sve_mode, operands[0], V2DImode, 0);
-    rtx sve_op1 = simplify_gen_subreg (sve_mode, operands[1], V2DImode, 0);
-    rtx sve_op2 = simplify_gen_subreg (sve_mode, operands[2], V2DImode, 0);
-
-    emit_insn (gen_mulvnx2di3 (sve_op0, sve_op1, sve_op2));
-    DONE;
-  }
-)
-
 (define_insn "bswap<mode>2"
   [(set (match_operand:VDQHSD 0 "register_operand" "=w")
         (bswap:VDQHSD (match_operand:VDQHSD 1 "register_operand" "w")))]
@@ -2678,27 +2658,6 @@ (define_insn "*div<mode>3<vczle><vczbe>"
   [(set_attr "type" "neon_fp_div_<stype><q>")]
 )
 
-;; SVE has vector integer divisions, unlike Advanced SIMD.
-;; We can use it with Advanced SIMD modes to expose the V2DI and V4SI
-;; optabs to the midend.
-(define_expand "<su_optab>div<mode>3"
-  [(set (match_operand:VQDIV 0 "register_operand")
-	(ANY_DIV:VQDIV
-	  (match_operand:VQDIV 1 "register_operand")
-	  (match_operand:VQDIV 2 "register_operand")))]
-  "TARGET_SVE"
-  {
-    machine_mode sve_mode
-      = aarch64_full_sve_mode (GET_MODE_INNER (<MODE>mode)).require ();
-    rtx sve_op0 = simplify_gen_subreg (sve_mode, operands[0], <MODE>mode, 0);
-    rtx sve_op1 = simplify_gen_subreg (sve_mode, operands[1], <MODE>mode, 0);
-    rtx sve_op2 = simplify_gen_subreg (sve_mode, operands[2], <MODE>mode, 0);
-
-    emit_insn (gen_<su_optab>div<vnx>3 (sve_op0, sve_op1, sve_op2));
-    DONE;
-  }
-)
-
 (define_insn "neg<mode>2<vczle><vczbe>"
  [(set (match_operand:VHSDF 0 "register_operand" "=w")
        (neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index e1e3c1bd0b7d12eefe43dc95a10716c24e3a48de..eca8623e587af944927a9459e29d5f8af170d347 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3789,16 +3789,35 @@ (define_expand "<optab><mode>3"
   [(set (match_operand:SVE_I 0 "register_operand")
 	(unspec:SVE_I
 	  [(match_dup 3)
-	   (SVE_INT_BINARY_IMM:SVE_I
+	   (SVE_INT_BINARY_MULTI:SVE_I
 	     (match_operand:SVE_I 1 "register_operand")
 	     (match_operand:SVE_I 2 "aarch64_sve_<sve_imm_con>_operand"))]
 	  UNSPEC_PRED_X))]
   "TARGET_SVE"
+  {
+    operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+  }
+)
+
+;; Unpredicated integer binary operations that have an immediate form.
+;; Advanced SIMD does not support vector DImode MUL, but SVE does.
+;; Make use of the overlap between Z and V registers to implement the V2DI
+;; optab for TARGET_SVE.  The mulvnx2di3 expander can
+;; handle the TARGET_SVE2 case transparently.
+(define_expand "mul<mode>3"
+  [(set (match_operand:SVE_I_SIMD_DI 0 "register_operand")
+	(unspec:SVE_I_SIMD_DI
+	  [(match_dup 3)
+	   (mult:SVE_I_SIMD_DI
+	     (match_operand:SVE_I_SIMD_DI 1 "register_operand")
+	     (match_operand:SVE_I_SIMD_DI 2 "aarch64_sve_vsm_operand"))]
+	  UNSPEC_PRED_X))]
+  "TARGET_SVE"
   {
     /* SVE2 supports the MUL (vectors, unpredicated) form.  Emit the simple
        pattern for it here rather than splitting off the MULT expander
        separately.  */
-    if (TARGET_SVE2 && <CODE> == MULT)
+    if (TARGET_SVE2)
       {
 	emit_move_insn (operands[0], gen_rtx_MULT (<MODE>mode,
 						   operands[1], operands[2]));
@@ -3814,26 +3833,26 @@ (define_expand "<optab><mode>3"
 ;; and would make the instruction seem less uniform to the register
 ;; allocator.
 (define_insn_and_split "@aarch64_pred_<optab><mode>"
-  [(set (match_operand:SVE_I 0 "register_operand")
-	(unspec:SVE_I
+  [(set (match_operand:SVE_I_SIMD_DI 0 "register_operand")
+	(unspec:SVE_I_SIMD_DI
 	  [(match_operand:<VPRED> 1 "register_operand")
-	   (SVE_INT_BINARY_IMM:SVE_I
-	     (match_operand:SVE_I 2 "register_operand")
-	     (match_operand:SVE_I 3 "aarch64_sve_<sve_imm_con>_operand"))]
+	   (SVE_INT_BINARY_IMM:SVE_I_SIMD_DI
+	     (match_operand:SVE_I_SIMD_DI 2 "register_operand")
+	     (match_operand:SVE_I_SIMD_DI 3 "aarch64_sve_<sve_imm_con>_operand"))]
 	  UNSPEC_PRED_X))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2  , 3             ; attrs: movprfx ]
      [ w        , Upl , %0 , <sve_imm_con> ; *              ] #
-     [ w        , Upl , 0  , w             ; *              ] <sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+     [ w        , Upl , 0  , w             ; *              ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
      [ ?&w      , Upl , w  , <sve_imm_con> ; yes            ] #
-     [ ?&w      , Upl , w  , w             ; yes            ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+     [ ?&w      , Upl , w  , w             ; yes            ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
   }
   ; Split the unpredicated form after reload, so that we don't have
   ; the unnecessary PTRUE.
   "&& reload_completed
    && !register_operand (operands[3], <MODE>mode)"
   [(set (match_dup 0)
-	(SVE_INT_BINARY_IMM:SVE_I (match_dup 2) (match_dup 3)))]
+	(SVE_INT_BINARY_IMM:SVE_I_SIMD_DI (match_dup 2) (match_dup 3)))]
   ""
 )
 
@@ -3841,14 +3860,14 @@ (define_insn_and_split "@aarch64_pred_<optab><mode>"
 ;; These are generated by splitting a predicated instruction whose
 ;; predicate is unused.
 (define_insn "*post_ra_<optab><mode>3"
-  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
-	(SVE_INT_BINARY_IMM:SVE_I
-	  (match_operand:SVE_I 1 "register_operand" "0, w")
-	  (match_operand:SVE_I 2 "aarch64_sve_<sve_imm_con>_immediate")))]
+  [(set (match_operand:SVE_I_SIMD_DI 0 "register_operand" "=w, ?&w")
+	(SVE_INT_BINARY_IMM:SVE_I_SIMD_DI
+	  (match_operand:SVE_I_SIMD_DI 1 "register_operand" "0, w")
+	  (match_operand:SVE_I_SIMD_DI 2 "aarch64_sve_<sve_imm_con>_immediate")))]
   "TARGET_SVE && reload_completed"
   "@
-   <sve_int_op>\t%0.<Vetype>, %0.<Vetype>, #%<sve_imm_prefix>2
-   movprfx\t%0, %1\;<sve_int_op>\t%0.<Vetype>, %0.<Vetype>, #%<sve_imm_prefix>2"
+   <sve_int_op>\t%Z0.<Vetype>, %Z0.<Vetype>, #%<sve_imm_prefix>2
+   movprfx\t%Z0, %Z1\;<sve_int_op>\t%Z0.<Vetype>, %Z0.<Vetype>, #%<sve_imm_prefix>2"
   [(set_attr "movprfx" "*,yes")]
 )
 
@@ -4458,13 +4477,16 @@ (define_insn "*cond_<optab><mode>_z"
 ;; -------------------------------------------------------------------------
 
 ;; Unpredicated integer division.
+;; SVE has vector integer divisions, unlike Advanced SIMD.
+;; We can use it with Advanced SIMD modes to expose the V2DI and V4SI
+;; optabs to the midend.
 (define_expand "<optab><mode>3"
-  [(set (match_operand:SVE_FULL_SDI 0 "register_operand")
-	(unspec:SVE_FULL_SDI
+  [(set (match_operand:SVE_FULL_SDI_SIMD 0 "register_operand")
+	(unspec:SVE_FULL_SDI_SIMD
 	  [(match_dup 3)
-	   (SVE_INT_BINARY_SD:SVE_FULL_SDI
-	     (match_operand:SVE_FULL_SDI 1 "register_operand")
-	     (match_operand:SVE_FULL_SDI 2 "register_operand"))]
+	   (SVE_INT_BINARY_SD:SVE_FULL_SDI_SIMD
+	     (match_operand:SVE_FULL_SDI_SIMD 1 "register_operand")
+	     (match_operand:SVE_FULL_SDI_SIMD 2 "register_operand"))]
 	  UNSPEC_PRED_X))]
   "TARGET_SVE"
   {
@@ -4474,18 +4496,18 @@ (define_expand "<optab><mode>3"
 
 ;; Integer division predicated with a PTRUE.
 (define_insn "@aarch64_pred_<optab><mode>"
-  [(set (match_operand:SVE_FULL_SDI 0 "register_operand")
-	(unspec:SVE_FULL_SDI
+  [(set (match_operand:SVE_FULL_SDI_SIMD 0 "register_operand")
+	(unspec:SVE_FULL_SDI_SIMD
 	  [(match_operand:<VPRED> 1 "register_operand")
-	   (SVE_INT_BINARY_SD:SVE_FULL_SDI
-	     (match_operand:SVE_FULL_SDI 2 "register_operand")
-	     (match_operand:SVE_FULL_SDI 3 "register_operand"))]
+	   (SVE_INT_BINARY_SD:SVE_FULL_SDI_SIMD
+	     (match_operand:SVE_FULL_SDI_SIMD 2 "register_operand")
+	     (match_operand:SVE_FULL_SDI_SIMD 3 "register_operand"))]
 	  UNSPEC_PRED_X))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2 , 3 ; attrs: movprfx ]
-     [ w        , Upl , 0 , w ; *              ] <sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
-     [ w        , Upl , w , 0 ; *              ] <sve_int_op>r\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
-     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+     [ w        , Upl , 0 , w ; *              ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
+     [ w        , Upl , w , 0 ; *              ] <sve_int_op>r\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z2.<Vetype>
+     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
   }
 )
 
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 1d1eb8bfdffe7b502c1ab5dd5a8ecc94b1e0214e..934e57055d3419e5dcc89b473fd110a0d4978b4f 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -615,29 +615,29 @@ (define_insn "@aarch64_sve_<su>clamp_single<mode>"
 ;; -------------------------------------------------------------------------
 
 (define_insn "@aarch64_mul_lane_<mode>"
-  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
-	(mult:SVE_FULL_HSDI
-	  (unspec:SVE_FULL_HSDI
-	    [(match_operand:SVE_FULL_HSDI 2 "register_operand" "<sve_lane_con>")
+  [(set (match_operand:SVE_FULL_HSDI_SIMD_DI 0 "register_operand" "=w")
+	(mult:SVE_FULL_HSDI_SIMD_DI
+	  (unspec:SVE_FULL_HSDI_SIMD_DI
+	    [(match_operand:SVE_FULL_HSDI_SIMD_DI 2 "register_operand" "<sve_lane_con>")
 	     (match_operand:SI 3 "const_int_operand")]
 	    UNSPEC_SVE_LANE_SELECT)
-	  (match_operand:SVE_FULL_HSDI 1 "register_operand" "w")))]
+	  (match_operand:SVE_FULL_HSDI_SIMD_DI 1 "register_operand" "w")))]
   "TARGET_SVE2"
-  "mul\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>[%3]"
+  "mul\t%Z0.<Vetype>, %Z1.<Vetype>, %Z2.<Vetype>[%3]"
 )
 
 ;; The 2nd and 3rd alternatives are valid for just TARGET_SVE as well but
 ;; we include them here to allow matching simpler, unpredicated RTL.
 (define_insn "*aarch64_mul_unpredicated_<mode>"
-  [(set (match_operand:SVE_I 0 "register_operand")
-	(mult:SVE_I
-	  (match_operand:SVE_I 1 "register_operand")
-	  (match_operand:SVE_I 2 "aarch64_sve_vsm_operand")))]
+  [(set (match_operand:SVE_I_SIMD_DI 0 "register_operand")
+	(mult:SVE_I_SIMD_DI
+	  (match_operand:SVE_I_SIMD_DI 1 "register_operand")
+	  (match_operand:SVE_I_SIMD_DI 2 "aarch64_sve_vsm_operand")))]
   "TARGET_SVE2"
   {@ [ cons: =0 , 1 , 2   ; attrs: movprfx ]
-     [ w        , w , w   ; *              ] mul\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>
-     [ w        , 0 , vsm ; *              ] mul\t%0.<Vetype>, %0.<Vetype>, #%2
-     [ ?&w      , w , vsm ; yes            ] movprfx\t%0, %1\;mul\t%0.<Vetype>, %0.<Vetype>, #%2
+     [ w        , w , w   ; *              ] mul\t%Z0.<Vetype>, %Z1.<Vetype>, %Z2.<Vetype>
+     [ w        , 0 , vsm ; *              ] mul\t%Z0.<Vetype>, %Z0.<Vetype>, #%2
+     [ ?&w      , w , vsm ; yes            ] movprfx\t%Z0, %Z1\;mul\t%Z0.<Vetype>, %Z0.<Vetype>, #%2
   }
 )
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 942270e99d6d0c6632199c059256f3a902a1b138..6acf58a0e8f1533d7c4c8450c5755d22f0955852 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -108,9 +108,6 @@ (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF])
 ;; Copy of the above.
 (define_mode_iterator DREG2 [DREG])
 
-;; Advanced SIMD modes for integer divides.
-(define_mode_iterator VQDIV [V4SI V2DI])
-
 ;; All modes suitable to store/load pair (2 elements) using STP/LDP.
 (define_mode_iterator VP_2E [V2SI V2SF V2DI V2DF])
 
@@ -471,6 +468,10 @@ (define_mode_iterator SVE_FULL_HSD [VNx8HI VNx4SI VNx2DI
 ;; elements.
 (define_mode_iterator SVE_FULL_HSDI [VNx8HI VNx4SI VNx2DI])
 
+;; Fully-packed SVE integer vector modes that have 16-bit, 32-bit or 64-bit
+;; elements and Advanced SIMD Fully-packed 64-bit elements.
+(define_mode_iterator SVE_FULL_HSDI_SIMD_DI [VNx8HI VNx4SI VNx2DI V2DI])
+
 ;; Fully-packed SVE integer vector modes that have 16-bit or 32-bit
 ;; elements.
 (define_mode_iterator SVE_FULL_HSI [VNx8HI VNx4SI])
@@ -488,6 +489,14 @@ (define_mode_iterator SVE_FULL_SD [VNx4SI VNx2DI VNx4SF VNx2DF])
 ;; Fully-packed SVE integer vector modes that have 32-bit or 64-bit elements.
 (define_mode_iterator SVE_FULL_SDI [VNx4SI VNx2DI])
 
+;; Fully-packed SVE and Advanced SIMD integer vector modes that have 32-bit or
+;; 64-bit elements.
+(define_mode_iterator SVE_FULL_SDI_SIMD [VNx4SI VNx2DI V4SI V2DI])
+
+;; Fully-packed SVE integer vector modes that have 32-bit or 64-bit elements
+;; and Advanced SIMD 64-bit elements.
+(define_mode_iterator SVE_FULL_SDI_SIMD_DI [VNx4SI VNx2DI])
+
 ;; 2x and 4x tuples of the above, excluding 2x DI.
 (define_mode_iterator SVE_FULL_SIx2_SDIx4 [VNx8SI VNx16SI VNx8DI])
 
@@ -550,6 +559,13 @@ (define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
 			     VNx4SI VNx2SI
 			     VNx2DI])
 
+;; All SVE integer vector modes and Advanced SIMD 64-bit vector
+;; element modes
+(define_mode_iterator SVE_I_SIMD_DI [VNx16QI VNx8QI VNx4QI VNx2QI
+				     VNx8HI VNx4HI VNx2HI
+				     VNx4SI VNx2SI
+				     VNx2DI V2DI])
+
 ;; SVE integer vector modes whose elements are 16 bits or wider.
 (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
 				VNx4SI VNx2SI
@@ -2268,7 +2284,8 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI "VNx8BI")
 			 (VNx32HI "VNx8BI") (VNx32HF "VNx8BI")
 			 (VNx32BF "VNx8BI")
 			 (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
-			 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")])
+			 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
+			 (V4SI "VNx4BI") (V2DI "VNx2BI")])
 
 ;; ...and again in lower case.
 (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
@@ -2370,6 +2387,7 @@ (define_mode_attr narrower_mask [(VNx8HI "0x81") (VNx4HI "0x41")
 
 ;; The constraint to use for an SVE [SU]DOT, FMUL, FMLA or FMLS lane index.
 (define_mode_attr sve_lane_con [(VNx8HI "y") (VNx4SI "y") (VNx2DI "x")
+							  (V2DI "x")
 				(VNx8HF "y") (VNx4SF "y") (VNx2DF "x")])
 
 ;; The constraint to use for an SVE FCMLA lane index.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr109636_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr109636_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..5b37ddd2770bcbbec37b9563644da0ba061d3789
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr109636_1.c
@@ -0,0 +1,13 @@
+/* { dg-additional-options "-O -mtune=a64fx" } */
+
+typedef unsigned long long __attribute__((__vector_size__ (16))) V;
+typedef unsigned long long __attribute__((__vector_size__ (32))) W;
+
+extern void bar (V v);
+
+void foo (V v, W w)
+{
+  bar (__builtin_shuffle (v, __builtin_shufflevector ((V){}, w, 4, 5) / v));
+}
+
+/* { dg-final { scan-assembler {udiv\tz[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr109636_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr109636_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..6d39dc8e590a04a486a300de10c5480d9c33afba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr109636_2.c
@@ -0,0 +1,13 @@
+/* { dg-additional-options "-O -mcpu=a64fx" } */
+
+typedef unsigned long long __attribute__((__vector_size__ (16))) V;
+typedef unsigned long long __attribute__((__vector_size__ (32))) W;
+
+extern void bar (V v);
+
+void foom (V v, W w)
+{
+  bar (__builtin_shuffle (v, __builtin_shufflevector ((V){}, w, 4, 5) * v));
+}
+
+/* { dg-final { scan-assembler {mul\tz[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr109636_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/pr109636_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..2bea18ad703cb3e1a1ce896bcedc2530e831a192
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr109636_1.c
@@ -0,0 +1,13 @@
+/* { dg-additional-options "-O -mtune=a64fx" } */
+
+typedef unsigned long long __attribute__((__vector_size__ (16))) V;
+typedef unsigned long long __attribute__((__vector_size__ (32))) W;
+
+extern void bar (V v);
+
+void foom (V v, W w)
+{
+  bar (__builtin_shuffle (v, __builtin_shufflevector ((V){}, w, 4, 5) * v));
+}
+
+/* { dg-final { scan-assembler {mul\tz[0-9]+.d, z[0-9]+.d, z[0-9]+.d} } } */

Comments

Richard Sandiford Jan. 24, 2024, 9:40 a.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> As suggested in the ticket this replaces the expansion by converting the
> Advanced SIMD types to SVE types by simply printing out an SVE register for
> these instructions.
>
> This fixes the subreg issues since there are no subregs involved anymore.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	PR target/109636
> 	* config/aarch64/aarch64-simd.md (<su_optab>div<mode>3,
> 	mulv2di3): Remove.
> 	* config/aarch64/iterators.md (VQDIV): Remove.
> 	(SVE_FULL_SDI_SIMD, SVE_FULL_SDI_SIMD_DI, SVE_FULL_HSDI_SIMD_DI,
> 	SVE_I_SIMD_DI): New.
> 	(VPRED, sve_lane_con): Add V4SI and V2DI.
> 	* config/aarch64/aarch64-sve.md (<optab><mode>3,
> 	@aarch64_pred_<optab><mode>): Support Advanced SIMD types.
> 	(mul<mode>3): New, split from <optab><mode>3.
> 	(@aarch64_pred_<optab><mode>, *post_ra_<optab><mode>3): New.
> 	* config/aarch64/aarch64-sve2.md (@aarch64_mul_lane_<mode>,
> 	*aarch64_mul_unpredicated_<mode>): Change SVE_FULL_HSDI to
> 	SVE_FULL_HSDI_SIMD_DI.
>
> gcc/testsuite/ChangeLog:
>
> 	PR target/109636
> 	* gcc.target/aarch64/sve/pr109636_1.c: New test.
> 	* gcc.target/aarch64/sve/pr109636_2.c: New test.
> 	* gcc.target/aarch64/sve2/pr109636_1.c: New test.
>
> --- inline copy of patch -- 
> [...]
> @@ -550,6 +559,13 @@ (define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
>  			     VNx4SI VNx2SI
>  			     VNx2DI])
>  
> +;; All SVE integer vector modes and Advanced SIMD 64-bit vector
> +;; element modes
> +(define_mode_iterator SVE_I_SIMD_DI [VNx16QI VNx8QI VNx4QI VNx2QI
> +				     VNx8HI VNx4HI VNx2HI
> +				     VNx4SI VNx2SI
> +				     VNx2DI V2DI])
> +

IMO this would be more robust as:

(define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])

Your call on whether that's better or worse for the others.

OK with that changes, thanks.  I suppose at some point we should extend
the division patterns to V2SI, but that's clearly not stage 4 material.

Richard

>  ;; SVE integer vector modes whose elements are 16 bits or wider.
>  (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
>  				VNx4SI VNx2SI
> @@ -2268,7 +2284,8 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI "VNx8BI")
>  			 (VNx32HI "VNx8BI") (VNx32HF "VNx8BI")
>  			 (VNx32BF "VNx8BI")
>  			 (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
> -			 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")])
> +			 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
> +			 (V4SI "VNx4BI") (V2DI "VNx2BI")])
>  
>  ;; ...and again in lower case.
>  (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
> @@ -2370,6 +2387,7 @@ (define_mode_attr narrower_mask [(VNx8HI "0x81") (VNx4HI "0x41")
>  
>  ;; The constraint to use for an SVE [SU]DOT, FMUL, FMLA or FMLS lane index.
>  (define_mode_attr sve_lane_con [(VNx8HI "y") (VNx4SI "y") (VNx2DI "x")
> +							  (V2DI "x")
>  				(VNx8HF "y") (VNx4SF "y") (VNx2DF "x")])
>  
>  ;; The constraint to use for an SVE FCMLA lane index.
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr109636_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr109636_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..5b37ddd2770bcbbec37b9563644da0ba061d3789
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr109636_1.c
> @@ -0,0 +1,13 @@
> +/* { dg-additional-options "-O -mtune=a64fx" } */
> +
> +typedef unsigned long long __attribute__((__vector_size__ (16))) V;
> +typedef unsigned long long __attribute__((__vector_size__ (32))) W;
> +
> +extern void bar (V v);
> +
> +void foo (V v, W w)
> +{
> +  bar (__builtin_shuffle (v, __builtin_shufflevector ((V){}, w, 4, 5) / v));
> +}
> +
> +/* { dg-final { scan-assembler {udiv\tz[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr109636_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr109636_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..6d39dc8e590a04a486a300de10c5480d9c33afba
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr109636_2.c
> @@ -0,0 +1,13 @@
> +/* { dg-additional-options "-O -mcpu=a64fx" } */
> +
> +typedef unsigned long long __attribute__((__vector_size__ (16))) V;
> +typedef unsigned long long __attribute__((__vector_size__ (32))) W;
> +
> +extern void bar (V v);
> +
> +void foom (V v, W w)
> +{
> +  bar (__builtin_shuffle (v, __builtin_shufflevector ((V){}, w, 4, 5) * v));
> +}
> +
> +/* { dg-final { scan-assembler {mul\tz[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr109636_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/pr109636_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..2bea18ad703cb3e1a1ce896bcedc2530e831a192
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr109636_1.c
> @@ -0,0 +1,13 @@
> +/* { dg-additional-options "-O -mtune=a64fx" } */
> +
> +typedef unsigned long long __attribute__((__vector_size__ (16))) V;
> +typedef unsigned long long __attribute__((__vector_size__ (32))) W;
> +
> +extern void bar (V v);
> +
> +void foom (V v, W w)
> +{
> +  bar (__builtin_shuffle (v, __builtin_shufflevector ((V){}, w, 4, 5) * v));
> +}
> +
> +/* { dg-final { scan-assembler {mul\tz[0-9]+.d, z[0-9]+.d, z[0-9]+.d} } } */
diff mbox series

Patch

--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -389,26 +389,6 @@  (define_insn "mul<mode>3<vczle><vczbe>"
   [(set_attr "type" "neon_mul_<Vetype><q>")]
 )
 
-;; Advanced SIMD does not support vector DImode MUL, but SVE does.
-;; Make use of the overlap between Z and V registers to implement the V2DI
-;; optab for TARGET_SVE.  The mulvnx2di3 expander can
-;; handle the TARGET_SVE2 case transparently.
-(define_expand "mulv2di3"
-  [(set (match_operand:V2DI 0 "register_operand")
-        (mult:V2DI (match_operand:V2DI 1 "register_operand")
-		   (match_operand:V2DI 2 "aarch64_sve_vsm_operand")))]
-  "TARGET_SVE"
-  {
-    machine_mode sve_mode = VNx2DImode;
-    rtx sve_op0 = simplify_gen_subreg (sve_mode, operands[0], V2DImode, 0);
-    rtx sve_op1 = simplify_gen_subreg (sve_mode, operands[1], V2DImode, 0);
-    rtx sve_op2 = simplify_gen_subreg (sve_mode, operands[2], V2DImode, 0);
-
-    emit_insn (gen_mulvnx2di3 (sve_op0, sve_op1, sve_op2));
-    DONE;
-  }
-)
-
 (define_insn "bswap<mode>2"
   [(set (match_operand:VDQHSD 0 "register_operand" "=w")
         (bswap:VDQHSD (match_operand:VDQHSD 1 "register_operand" "w")))]
@@ -2678,27 +2658,6 @@  (define_insn "*div<mode>3<vczle><vczbe>"
   [(set_attr "type" "neon_fp_div_<stype><q>")]
 )
 
-;; SVE has vector integer divisions, unlike Advanced SIMD.
-;; We can use it with Advanced SIMD modes to expose the V2DI and V4SI
-;; optabs to the midend.
-(define_expand "<su_optab>div<mode>3"
-  [(set (match_operand:VQDIV 0 "register_operand")
-	(ANY_DIV:VQDIV
-	  (match_operand:VQDIV 1 "register_operand")
-	  (match_operand:VQDIV 2 "register_operand")))]
-  "TARGET_SVE"
-  {
-    machine_mode sve_mode
-      = aarch64_full_sve_mode (GET_MODE_INNER (<MODE>mode)).require ();
-    rtx sve_op0 = simplify_gen_subreg (sve_mode, operands[0], <MODE>mode, 0);
-    rtx sve_op1 = simplify_gen_subreg (sve_mode, operands[1], <MODE>mode, 0);
-    rtx sve_op2 = simplify_gen_subreg (sve_mode, operands[2], <MODE>mode, 0);
-
-    emit_insn (gen_<su_optab>div<vnx>3 (sve_op0, sve_op1, sve_op2));
-    DONE;
-  }
-)
-
 (define_insn "neg<mode>2<vczle><vczbe>"
  [(set (match_operand:VHSDF 0 "register_operand" "=w")
        (neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index e1e3c1bd0b7d12eefe43dc95a10716c24e3a48de..eca8623e587af944927a9459e29d5f8af170d347 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3789,16 +3789,35 @@  (define_expand "<optab><mode>3"
   [(set (match_operand:SVE_I 0 "register_operand")
 	(unspec:SVE_I
 	  [(match_dup 3)
-	   (SVE_INT_BINARY_IMM:SVE_I
+	   (SVE_INT_BINARY_MULTI:SVE_I
 	     (match_operand:SVE_I 1 "register_operand")
 	     (match_operand:SVE_I 2 "aarch64_sve_<sve_imm_con>_operand"))]
 	  UNSPEC_PRED_X))]
   "TARGET_SVE"
+  {
+    operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+  }
+)
+
+;; Unpredicated integer binary operations that have an immediate form.
+;; Advanced SIMD does not support vector DImode MUL, but SVE does.
+;; Make use of the overlap between Z and V registers to implement the V2DI
+;; optab for TARGET_SVE.  The mulvnx2di3 expander can
+;; handle the TARGET_SVE2 case transparently.
+(define_expand "mul<mode>3"
+  [(set (match_operand:SVE_I_SIMD_DI 0 "register_operand")
+	(unspec:SVE_I_SIMD_DI
+	  [(match_dup 3)
+	   (mult:SVE_I_SIMD_DI
+	     (match_operand:SVE_I_SIMD_DI 1 "register_operand")
+	     (match_operand:SVE_I_SIMD_DI 2 "aarch64_sve_vsm_operand"))]
+	  UNSPEC_PRED_X))]
+  "TARGET_SVE"
   {
     /* SVE2 supports the MUL (vectors, unpredicated) form.  Emit the simple
        pattern for it here rather than splitting off the MULT expander
        separately.  */
-    if (TARGET_SVE2 && <CODE> == MULT)
+    if (TARGET_SVE2)
       {
 	emit_move_insn (operands[0], gen_rtx_MULT (<MODE>mode,
 						   operands[1], operands[2]));
@@ -3814,26 +3833,26 @@  (define_expand "<optab><mode>3"
 ;; and would make the instruction seem less uniform to the register
 ;; allocator.
 (define_insn_and_split "@aarch64_pred_<optab><mode>"
-  [(set (match_operand:SVE_I 0 "register_operand")
-	(unspec:SVE_I
+  [(set (match_operand:SVE_I_SIMD_DI 0 "register_operand")
+	(unspec:SVE_I_SIMD_DI
 	  [(match_operand:<VPRED> 1 "register_operand")
-	   (SVE_INT_BINARY_IMM:SVE_I
-	     (match_operand:SVE_I 2 "register_operand")
-	     (match_operand:SVE_I 3 "aarch64_sve_<sve_imm_con>_operand"))]
+	   (SVE_INT_BINARY_IMM:SVE_I_SIMD_DI
+	     (match_operand:SVE_I_SIMD_DI 2 "register_operand")
+	     (match_operand:SVE_I_SIMD_DI 3 "aarch64_sve_<sve_imm_con>_operand"))]
 	  UNSPEC_PRED_X))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2  , 3             ; attrs: movprfx ]
      [ w        , Upl , %0 , <sve_imm_con> ; *              ] #
-     [ w        , Upl , 0  , w             ; *              ] <sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+     [ w        , Upl , 0  , w             ; *              ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
      [ ?&w      , Upl , w  , <sve_imm_con> ; yes            ] #
-     [ ?&w      , Upl , w  , w             ; yes            ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+     [ ?&w      , Upl , w  , w             ; yes            ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
   }
   ; Split the unpredicated form after reload, so that we don't have
   ; the unnecessary PTRUE.
   "&& reload_completed
    && !register_operand (operands[3], <MODE>mode)"
   [(set (match_dup 0)
-	(SVE_INT_BINARY_IMM:SVE_I (match_dup 2) (match_dup 3)))]
+	(SVE_INT_BINARY_IMM:SVE_I_SIMD_DI (match_dup 2) (match_dup 3)))]
   ""
 )
 
@@ -3841,14 +3860,14 @@  (define_insn_and_split "@aarch64_pred_<optab><mode>"
 ;; These are generated by splitting a predicated instruction whose
 ;; predicate is unused.
 (define_insn "*post_ra_<optab><mode>3"
-  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
-	(SVE_INT_BINARY_IMM:SVE_I
-	  (match_operand:SVE_I 1 "register_operand" "0, w")
-	  (match_operand:SVE_I 2 "aarch64_sve_<sve_imm_con>_immediate")))]
+  [(set (match_operand:SVE_I_SIMD_DI 0 "register_operand" "=w, ?&w")
+	(SVE_INT_BINARY_IMM:SVE_I_SIMD_DI
+	  (match_operand:SVE_I_SIMD_DI 1 "register_operand" "0, w")
+	  (match_operand:SVE_I_SIMD_DI 2 "aarch64_sve_<sve_imm_con>_immediate")))]
   "TARGET_SVE && reload_completed"
   "@
-   <sve_int_op>\t%0.<Vetype>, %0.<Vetype>, #%<sve_imm_prefix>2
-   movprfx\t%0, %1\;<sve_int_op>\t%0.<Vetype>, %0.<Vetype>, #%<sve_imm_prefix>2"
+   <sve_int_op>\t%Z0.<Vetype>, %Z0.<Vetype>, #%<sve_imm_prefix>2
+   movprfx\t%Z0, %Z1\;<sve_int_op>\t%Z0.<Vetype>, %Z0.<Vetype>, #%<sve_imm_prefix>2"
   [(set_attr "movprfx" "*,yes")]
 )
 
@@ -4458,13 +4477,16 @@  (define_insn "*cond_<optab><mode>_z"
 ;; -------------------------------------------------------------------------
 
 ;; Unpredicated integer division.
+;; SVE has vector integer divisions, unlike Advanced SIMD.
+;; We can use it with Advanced SIMD modes to expose the V2DI and V4SI
+;; optabs to the midend.
 (define_expand "<optab><mode>3"
-  [(set (match_operand:SVE_FULL_SDI 0 "register_operand")
-	(unspec:SVE_FULL_SDI
+  [(set (match_operand:SVE_FULL_SDI_SIMD 0 "register_operand")
+	(unspec:SVE_FULL_SDI_SIMD
 	  [(match_dup 3)
-	   (SVE_INT_BINARY_SD:SVE_FULL_SDI
-	     (match_operand:SVE_FULL_SDI 1 "register_operand")
-	     (match_operand:SVE_FULL_SDI 2 "register_operand"))]
+	   (SVE_INT_BINARY_SD:SVE_FULL_SDI_SIMD
+	     (match_operand:SVE_FULL_SDI_SIMD 1 "register_operand")
+	     (match_operand:SVE_FULL_SDI_SIMD 2 "register_operand"))]
 	  UNSPEC_PRED_X))]
   "TARGET_SVE"
   {
@@ -4474,18 +4496,18 @@  (define_expand "<optab><mode>3"
 
 ;; Integer division predicated with a PTRUE.
 (define_insn "@aarch64_pred_<optab><mode>"
-  [(set (match_operand:SVE_FULL_SDI 0 "register_operand")
-	(unspec:SVE_FULL_SDI
+  [(set (match_operand:SVE_FULL_SDI_SIMD 0 "register_operand")
+	(unspec:SVE_FULL_SDI_SIMD
 	  [(match_operand:<VPRED> 1 "register_operand")
-	   (SVE_INT_BINARY_SD:SVE_FULL_SDI
-	     (match_operand:SVE_FULL_SDI 2 "register_operand")
-	     (match_operand:SVE_FULL_SDI 3 "register_operand"))]
+	   (SVE_INT_BINARY_SD:SVE_FULL_SDI_SIMD
+	     (match_operand:SVE_FULL_SDI_SIMD 2 "register_operand")
+	     (match_operand:SVE_FULL_SDI_SIMD 3 "register_operand"))]
 	  UNSPEC_PRED_X))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2 , 3 ; attrs: movprfx ]
-     [ w        , Upl , 0 , w ; *              ] <sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
-     [ w        , Upl , w , 0 ; *              ] <sve_int_op>r\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
-     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+     [ w        , Upl , 0 , w ; *              ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
+     [ w        , Upl , w , 0 ; *              ] <sve_int_op>r\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z2.<Vetype>
+     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
   }
 )
 
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 1d1eb8bfdffe7b502c1ab5dd5a8ecc94b1e0214e..934e57055d3419e5dcc89b473fd110a0d4978b4f 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -615,29 +615,29 @@  (define_insn "@aarch64_sve_<su>clamp_single<mode>"
 ;; -------------------------------------------------------------------------
 
 (define_insn "@aarch64_mul_lane_<mode>"
-  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
-	(mult:SVE_FULL_HSDI
-	  (unspec:SVE_FULL_HSDI
-	    [(match_operand:SVE_FULL_HSDI 2 "register_operand" "<sve_lane_con>")
+  [(set (match_operand:SVE_FULL_HSDI_SIMD_DI 0 "register_operand" "=w")
+	(mult:SVE_FULL_HSDI_SIMD_DI
+	  (unspec:SVE_FULL_HSDI_SIMD_DI
+	    [(match_operand:SVE_FULL_HSDI_SIMD_DI 2 "register_operand" "<sve_lane_con>")
 	     (match_operand:SI 3 "const_int_operand")]
 	    UNSPEC_SVE_LANE_SELECT)
-	  (match_operand:SVE_FULL_HSDI 1 "register_operand" "w")))]
+	  (match_operand:SVE_FULL_HSDI_SIMD_DI 1 "register_operand" "w")))]
   "TARGET_SVE2"
-  "mul\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>[%3]"
+  "mul\t%Z0.<Vetype>, %Z1.<Vetype>, %Z2.<Vetype>[%3]"
 )
 
 ;; The 2nd and 3rd alternatives are valid for just TARGET_SVE as well but
 ;; we include them here to allow matching simpler, unpredicated RTL.
 (define_insn "*aarch64_mul_unpredicated_<mode>"
-  [(set (match_operand:SVE_I 0 "register_operand")
-	(mult:SVE_I
-	  (match_operand:SVE_I 1 "register_operand")
-	  (match_operand:SVE_I 2 "aarch64_sve_vsm_operand")))]
+  [(set (match_operand:SVE_I_SIMD_DI 0 "register_operand")
+	(mult:SVE_I_SIMD_DI
+	  (match_operand:SVE_I_SIMD_DI 1 "register_operand")
+	  (match_operand:SVE_I_SIMD_DI 2 "aarch64_sve_vsm_operand")))]
   "TARGET_SVE2"
   {@ [ cons: =0 , 1 , 2   ; attrs: movprfx ]
-     [ w        , w , w   ; *              ] mul\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>
-     [ w        , 0 , vsm ; *              ] mul\t%0.<Vetype>, %0.<Vetype>, #%2
-     [ ?&w      , w , vsm ; yes            ] movprfx\t%0, %1\;mul\t%0.<Vetype>, %0.<Vetype>, #%2
+     [ w        , w , w   ; *              ] mul\t%Z0.<Vetype>, %Z1.<Vetype>, %Z2.<Vetype>
+     [ w        , 0 , vsm ; *              ] mul\t%Z0.<Vetype>, %Z0.<Vetype>, #%2
+     [ ?&w      , w , vsm ; yes            ] movprfx\t%Z0, %Z1\;mul\t%Z0.<Vetype>, %Z0.<Vetype>, #%2
   }
 )
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 942270e99d6d0c6632199c059256f3a902a1b138..6acf58a0e8f1533d7c4c8450c5755d22f0955852 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -108,9 +108,6 @@  (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF])
 ;; Copy of the above.
 (define_mode_iterator DREG2 [DREG])
 
-;; Advanced SIMD modes for integer divides.
-(define_mode_iterator VQDIV [V4SI V2DI])
-
 ;; All modes suitable to store/load pair (2 elements) using STP/LDP.
 (define_mode_iterator VP_2E [V2SI V2SF V2DI V2DF])
 
@@ -471,6 +468,10 @@  (define_mode_iterator SVE_FULL_HSD [VNx8HI VNx4SI VNx2DI
 ;; elements.
 (define_mode_iterator SVE_FULL_HSDI [VNx8HI VNx4SI VNx2DI])
 
+;; Fully-packed SVE integer vector modes that have 16-bit, 32-bit or 64-bit
+;; elements and Advanced SIMD Fully-packed 64-bit elements.
+(define_mode_iterator SVE_FULL_HSDI_SIMD_DI [VNx8HI VNx4SI VNx2DI V2DI])
+
 ;; Fully-packed SVE integer vector modes that have 16-bit or 32-bit
 ;; elements.
 (define_mode_iterator SVE_FULL_HSI [VNx8HI VNx4SI])
@@ -488,6 +489,14 @@  (define_mode_iterator SVE_FULL_SD [VNx4SI VNx2DI VNx4SF VNx2DF])
 ;; Fully-packed SVE integer vector modes that have 32-bit or 64-bit elements.
 (define_mode_iterator SVE_FULL_SDI [VNx4SI VNx2DI])
 
+;; Fully-packed SVE and Advanced SIMD integer vector modes that have 32-bit or
+;; 64-bit elements.
+(define_mode_iterator SVE_FULL_SDI_SIMD [VNx4SI VNx2DI V4SI V2DI])
+
+;; Fully-packed SVE integer vector modes that have 32-bit or 64-bit elements
+;; and Advanced SIMD 64-bit elements.
+(define_mode_iterator SVE_FULL_SDI_SIMD_DI [VNx4SI VNx2DI])
+
 ;; 2x and 4x tuples of the above, excluding 2x DI.
 (define_mode_iterator SVE_FULL_SIx2_SDIx4 [VNx8SI VNx16SI VNx8DI])
 
@@ -550,6 +559,13 @@  (define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
 			     VNx4SI VNx2SI
 			     VNx2DI])
 
+;; All SVE integer vector modes and Advanced SIMD 64-bit vector
+;; element modes
+(define_mode_iterator SVE_I_SIMD_DI [VNx16QI VNx8QI VNx4QI VNx2QI
+				     VNx8HI VNx4HI VNx2HI
+				     VNx4SI VNx2SI
+				     VNx2DI V2DI])
+
 ;; SVE integer vector modes whose elements are 16 bits or wider.
 (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
 				VNx4SI VNx2SI
@@ -2268,7 +2284,8 @@  (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI "VNx8BI")
 			 (VNx32HI "VNx8BI") (VNx32HF "VNx8BI")
 			 (VNx32BF "VNx8BI")
 			 (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
-			 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")])
+			 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
+			 (V4SI "VNx4BI") (V2DI "VNx2BI")])
 
 ;; ...and again in lower case.
 (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
@@ -2370,6 +2387,7 @@  (define_mode_attr narrower_mask [(VNx8HI "0x81") (VNx4HI "0x41")
 
 ;; The constraint to use for an SVE [SU]DOT, FMUL, FMLA or FMLS lane index.
 (define_mode_attr sve_lane_con [(VNx8HI "y") (VNx4SI "y") (VNx2DI "x")
+							  (V2DI "x")
 				(VNx8HF "y") (VNx4SF "y") (VNx2DF "x")])
 
 ;; The constraint to use for an SVE FCMLA lane index.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr109636_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr109636_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..5b37ddd2770bcbbec37b9563644da0ba061d3789
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr109636_1.c
@@ -0,0 +1,13 @@ 
+/* { dg-additional-options "-O -mtune=a64fx" } */
+
+typedef unsigned long long __attribute__((__vector_size__ (16))) V;
+typedef unsigned long long __attribute__((__vector_size__ (32))) W;
+
+extern void bar (V v);
+
+void foo (V v, W w)
+{
+  bar (__builtin_shuffle (v, __builtin_shufflevector ((V){}, w, 4, 5) / v));
+}
+
+/* { dg-final { scan-assembler {udiv\tz[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr109636_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr109636_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..6d39dc8e590a04a486a300de10c5480d9c33afba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr109636_2.c
@@ -0,0 +1,13 @@ 
+/* { dg-additional-options "-O -mcpu=a64fx" } */
+
+typedef unsigned long long __attribute__((__vector_size__ (16))) V;
+typedef unsigned long long __attribute__((__vector_size__ (32))) W;
+
+extern void bar (V v);
+
+void foom (V v, W w)
+{
+  bar (__builtin_shuffle (v, __builtin_shufflevector ((V){}, w, 4, 5) * v));
+}
+
+/* { dg-final { scan-assembler {mul\tz[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr109636_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/pr109636_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..2bea18ad703cb3e1a1ce896bcedc2530e831a192
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr109636_1.c
@@ -0,0 +1,13 @@ 
+/* { dg-additional-options "-O -mtune=a64fx" } */
+
+typedef unsigned long long __attribute__((__vector_size__ (16))) V;
+typedef unsigned long long __attribute__((__vector_size__ (32))) W;
+
+extern void bar (V v);
+
+void foom (V v, W w)
+{
+  bar (__builtin_shuffle (v, __builtin_shufflevector ((V){}, w, 4, 5) * v));
+}
+
+/* { dg-final { scan-assembler {mul\tz[0-9]+.d, z[0-9]+.d, z[0-9]+.d} } } */