diff mbox series

LoongArch: Use bstrins instruction for (a & ~mask) and (a & mask) | (b & ~mask) [PR111252]

Message ID 20230906104628.51362-1-xry111@xry111.site
State New
Headers show
Series LoongArch: Use bstrins instruction for (a & ~mask) and (a & mask) | (b & ~mask) [PR111252] | expand

Commit Message

Xi Ruoyao Sept. 6, 2023, 10:46 a.m. UTC
If mask is a constant with value ((1 << N) - 1) << M we can perform this
optimization.

gcc/ChangeLog:

	PR target/111252
	* config/loongarch/loongarch-protos.h
	(loongarch_pre_reload_split): Declare new function.
	(loongarch_use_bstrins_for_ior_with_mask): Likewise.
	* config/loongarch/loongarch.cc
	(loongarch_pre_reload_split): Implement.
	(loongarch_use_bstrins_for_ior_with_mask): Likewise.
	* config/loongarch/predicates.md (ins_zero_bitmask_operand):
	New predicate.
	* config/loongarch/loongarch.md (bstrins_<mode>_for_mask):
	New define_insn_and_split.
	(bstrins_<mode>_for_ior_mask): Likewise.
	(define_peephole2): Further optimize code sequence produced by
	bstrins_<mode>_for_ior_mask if possible.

gcc/testsuite/ChangeLog:

	* g++.target/loongarch/bstrins-compile.C: New test.
	* g++.target/loongarch/bstrins-run.C: New test.
---
 gcc/config/loongarch/loongarch-protos.h       |  4 +-
 gcc/config/loongarch/loongarch.cc             | 36 ++++++++
 gcc/config/loongarch/loongarch.md             | 91 +++++++++++++++++++
 gcc/config/loongarch/predicates.md            |  8 ++
 .../g++.target/loongarch/bstrins-compile.C    | 22 +++++
 .../g++.target/loongarch/bstrins-run.C        | 65 +++++++++++++
 6 files changed, 225 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.target/loongarch/bstrins-compile.C
 create mode 100644 gcc/testsuite/g++.target/loongarch/bstrins-run.C

Comments

Xi Ruoyao Sept. 6, 2023, 10:58 a.m. UTC | #1
Forgot to mention: I've bootstrapped and regtested this patch on
loongarch64-linux-gnu (with PR110939 patch applied to unbreak the
bootstrapping).  Ok for trunk?

On Wed, 2023-09-06 at 18:46 +0800, Xi Ruoyao wrote:

> If mask is a constant with value ((1 << N) - 1) << M we can perform this
> optimization.
> 
> gcc/ChangeLog:
> 
>         PR target/111252
>         * config/loongarch/loongarch-protos.h
>         (loongarch_pre_reload_split): Declare new function.
>         (loongarch_use_bstrins_for_ior_with_mask): Likewise.
>         * config/loongarch/loongarch.cc
>         (loongarch_pre_reload_split): Implement.
>         (loongarch_use_bstrins_for_ior_with_mask): Likewise.
>         * config/loongarch/predicates.md (ins_zero_bitmask_operand):
>         New predicate.
>         * config/loongarch/loongarch.md (bstrins_<mode>_for_mask):
>         New define_insn_and_split.
>         (bstrins_<mode>_for_ior_mask): Likewise.
>         (define_peephole2): Further optimize code sequence produced by
>         bstrins_<mode>_for_ior_mask if possible.
> 
> gcc/testsuite/ChangeLog:
> 
>         * g++.target/loongarch/bstrins-compile.C: New test.
>         * g++.target/loongarch/bstrins-run.C: New test.

/* snip */
Lulu Cheng Sept. 7, 2023, 2:15 a.m. UTC | #2
在 2023/9/6 下午6:58, Xi Ruoyao 写道:
> Forgot to mention: I've bootstrapped and regtested this patch on
> loongarch64-linux-gnu (with PR110939 patch applied to unbreak the
> bootstrapping).  Ok for trunk?

LGTM!

Thanks!

>
> On Wed, 2023-09-06 at 18:46 +0800, Xi Ruoyao wrote:
>
>> If mask is a constant with value ((1 << N) - 1) << M we can perform this
>> optimization.
>>
>> gcc/ChangeLog:
>>
>>          PR target/111252
>>          * config/loongarch/loongarch-protos.h
>>          (loongarch_pre_reload_split): Declare new function.
>>          (loongarch_use_bstrins_for_ior_with_mask): Likewise.
>>          * config/loongarch/loongarch.cc
>>          (loongarch_pre_reload_split): Implement.
>>          (loongarch_use_bstrins_for_ior_with_mask): Likewise.
>>          * config/loongarch/predicates.md (ins_zero_bitmask_operand):
>>          New predicate.
>>          * config/loongarch/loongarch.md (bstrins_<mode>_for_mask):
>>          New define_insn_and_split.
>>          (bstrins_<mode>_for_ior_mask): Likewise.
>>          (define_peephole2): Further optimize code sequence produced by
>>          bstrins_<mode>_for_ior_mask if possible.
>>
>> gcc/testsuite/ChangeLog:
>>
>>          * g++.target/loongarch/bstrins-compile.C: New test.
>>          * g++.target/loongarch/bstrins-run.C: New test.
> /* snip */
>
Xi Ruoyao Sept. 7, 2023, 8:06 a.m. UTC | #3
On Thu, 2023-09-07 at 10:15 +0800, chenglulu wrote:
> 
> 在 2023/9/6 下午6:58, Xi Ruoyao 写道:
> > Forgot to mention: I've bootstrapped and regtested this patch on
> > loongarch64-linux-gnu (with PR110939 patch applied to unbreak the
> > bootstrapping).  Ok for trunk?
> 
> LGTM!
> 
> Thanks!

Pushed r14-3773.

> > 
> > On Wed, 2023-09-06 at 18:46 +0800, Xi Ruoyao wrote:
> > 
> > > If mask is a constant with value ((1 << N) - 1) << M we can
> > > perform this
> > > optimization.
> > > 
> > > gcc/ChangeLog:
> > > 
> > >          PR target/111252
> > >          * config/loongarch/loongarch-protos.h
> > >          (loongarch_pre_reload_split): Declare new function.
> > >          (loongarch_use_bstrins_for_ior_with_mask): Likewise.
> > >          * config/loongarch/loongarch.cc
> > >          (loongarch_pre_reload_split): Implement.
> > >          (loongarch_use_bstrins_for_ior_with_mask): Likewise.
> > >          * config/loongarch/predicates.md
> > > (ins_zero_bitmask_operand):
> > >          New predicate.
> > >          * config/loongarch/loongarch.md
> > > (bstrins_<mode>_for_mask):
> > >          New define_insn_and_split.
> > >          (bstrins_<mode>_for_ior_mask): Likewise.
> > >          (define_peephole2): Further optimize code sequence
> > > produced by
> > >          bstrins_<mode>_for_ior_mask if possible.
> > > 
> > > gcc/testsuite/ChangeLog:
> > > 
> > >          * g++.target/loongarch/bstrins-compile.C: New test.
> > >          * g++.target/loongarch/bstrins-run.C: New test.
> > /* snip */
> > 
>
diff mbox series

Patch

diff --git a/gcc/config/loongarch/loongarch-protos.h b/gcc/config/loongarch/loongarch-protos.h
index f4430d0d418..251011c5414 100644
--- a/gcc/config/loongarch/loongarch-protos.h
+++ b/gcc/config/loongarch/loongarch-protos.h
@@ -56,7 +56,7 @@  enum loongarch_symbol_type {
 };
 #define NUM_SYMBOL_TYPES (SYMBOL_TLSLDM + 1)
 
-/* Routines implemented in loongarch.c.  */
+/* Routines implemented in loongarch.cc.  */
 extern rtx loongarch_emit_move (rtx, rtx);
 extern HOST_WIDE_INT loongarch_initial_elimination_offset (int, int);
 extern void loongarch_expand_prologue (void);
@@ -163,6 +163,8 @@  extern const char *current_section_name (void);
 extern unsigned int current_section_flags (void);
 extern bool loongarch_use_ins_ext_p (rtx, HOST_WIDE_INT, HOST_WIDE_INT);
 extern bool loongarch_check_zero_div_p (void);
+extern bool loongarch_pre_reload_split (void);
+extern int loongarch_use_bstrins_for_ior_with_mask (machine_mode, rtx *);
 
 union loongarch_gen_fn_ptrs
 {
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index aeb37f0f2f7..6698414281e 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -5482,6 +5482,42 @@  loongarch_use_ins_ext_p (rtx op, HOST_WIDE_INT width, HOST_WIDE_INT bitpos)
   return true;
 }
 
+/* Predicate for pre-reload splitters with associated instructions,
+   which can match any time before the split1 pass (usually combine),
+   then are unconditionally split in that pass and should not be
+   matched again afterwards.  */
+
+bool loongarch_pre_reload_split (void)
+{
+  return (can_create_pseudo_p ()
+	  && !(cfun->curr_properties & PROP_rtl_split_insns));
+}
+
+/* Check if we can use bstrins.<d> for
+   op0 = (op1 & op2) | (op3 & op4)
+   where op0, op1, op3 are regs, and op2, op4 are integer constants.  */
+int
+loongarch_use_bstrins_for_ior_with_mask (machine_mode mode, rtx *op)
+{
+  unsigned HOST_WIDE_INT mask1 = UINTVAL (op[2]);
+  unsigned HOST_WIDE_INT mask2 = UINTVAL (op[4]);
+
+  if (mask1 != ~mask2 || !mask1 || !mask2)
+    return 0;
+
+  /* Try to avoid a right-shift.  */
+  if (low_bitmask_len (mode, mask1) != -1)
+    return -1;
+
+  if (low_bitmask_len (mode, mask2 >> (ffs_hwi (mask2) - 1)) != -1)
+    return 1;
+
+  if (low_bitmask_len (mode, mask1 >> (ffs_hwi (mask1) - 1)) != -1)
+    return -1;
+
+  return 0;
+}
+
 /* Print the text for PRINT_OPERAND punctation character CH to FILE.
    The punctuation characters are:
 
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 2308db16902..75f641b38ee 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -1322,6 +1322,97 @@  (define_insn "and<mode>3_extended"
   [(set_attr "move_type" "pick_ins")
    (set_attr "mode" "<MODE>")])
 
+(define_insn_and_split "*bstrins_<mode>_for_mask"
+  [(set (match_operand:GPR 0 "register_operand")
+	(and:GPR (match_operand:GPR 1 "register_operand")
+		 (match_operand:GPR 2 "ins_zero_bitmask_operand")))]
+  ""
+  "#"
+  ""
+  [(set (match_dup 0) (match_dup 1))
+   (set (zero_extract:GPR (match_dup 0) (match_dup 2) (match_dup 3))
+	(const_int 0))]
+  {
+    unsigned HOST_WIDE_INT mask = ~UINTVAL (operands[2]);
+    int lo = ffs_hwi (mask) - 1;
+    int len = low_bitmask_len (<MODE>mode, mask >> lo);
+
+    len = MIN (len, GET_MODE_BITSIZE (<MODE>mode) - lo);
+    operands[2] = GEN_INT (len);
+    operands[3] = GEN_INT (lo);
+  })
+
+(define_insn_and_split "*bstrins_<mode>_for_ior_mask"
+  [(set (match_operand:GPR 0 "register_operand")
+	(ior:GPR (and:GPR (match_operand:GPR 1 "register_operand")
+                          (match_operand:GPR 2 "const_int_operand"))
+		 (and:GPR (match_operand:GPR 3 "register_operand")
+			  (match_operand:GPR 4 "const_int_operand"))))]
+  "loongarch_pre_reload_split () && \
+   loongarch_use_bstrins_for_ior_with_mask (<MODE>mode, operands)"
+  "#"
+  ""
+  [(set (match_dup 0) (match_dup 1))
+   (set (zero_extract:GPR (match_dup 0) (match_dup 2) (match_dup 4))
+	(match_dup 3))]
+  {
+    if (loongarch_use_bstrins_for_ior_with_mask (<MODE>mode, operands) < 0)
+      {
+	std::swap (operands[1], operands[3]);
+	std::swap (operands[2], operands[4]);
+      }
+
+    unsigned HOST_WIDE_INT mask = ~UINTVAL (operands[2]);
+    int lo = ffs_hwi (mask) - 1;
+    int len = low_bitmask_len (<MODE>mode, mask >> lo);
+
+    len = MIN (len, GET_MODE_BITSIZE (<MODE>mode) - lo);
+    operands[2] = GEN_INT (len);
+    operands[4] = GEN_INT (lo);
+
+    if (lo)
+      {
+	rtx tmp = gen_reg_rtx (<MODE>mode);
+	emit_move_insn (tmp, gen_rtx_ASHIFTRT(<MODE>mode, operands[3],
+					      GEN_INT (lo)));
+	operands[3] = tmp;
+      }
+  })
+
+;; We always avoid the shift operation in bstrins_<mode>_for_ior_mask
+;; if possible, but the result may be sub-optimal when one of the masks
+;; is (1 << N) - 1 and one of the src registers is the dest register.
+;; For example:
+;;     move		t0, a0
+;;     move		a0, a1
+;;     bstrins.d	a0, t0, 42, 0
+;;     ret
+;; using a shift operation would be better:
+;;     srai.d		t0, a1, 43
+;;     bstrins.d	a0, t0, 63, 43
+;;     ret
+;; unfortunately we cannot figure it out in split1: before reload we cannot
+;; know if the dest register is one of the src registers.  Fix it up in
+;; peephole2.
+(define_peephole2
+  [(set (match_operand:GPR 0 "register_operand")
+	(match_operand:GPR 1 "register_operand"))
+   (set (match_dup 1) (match_operand:GPR 2 "register_operand"))
+   (set (zero_extract:GPR (match_dup 1)
+			  (match_operand:SI 3 "const_int_operand")
+			  (const_int 0))
+	(match_dup 0))]
+  "peep2_reg_dead_p (3, operands[0])"
+  [(const_int 0)]
+  {
+    int len = GET_MODE_BITSIZE (<MODE>mode) - INTVAL (operands[3]);
+
+    emit_insn (gen_ashr<mode>3 (operands[0], operands[2], operands[3]));
+    emit_insn (gen_insv<mode> (operands[1], GEN_INT (len), operands[3],
+			       operands[0]));
+    DONE;
+  })
+
 (define_insn "*iorhi3"
   [(set (match_operand:HI 0 "register_operand" "=r,r")
 	(ior:HI (match_operand:HI 1 "register_operand" "%r,r")
diff --git a/gcc/config/loongarch/predicates.md b/gcc/config/loongarch/predicates.md
index f430629825e..499518b82ba 100644
--- a/gcc/config/loongarch/predicates.md
+++ b/gcc/config/loongarch/predicates.md
@@ -408,6 +408,14 @@  (define_predicate "fcc_reload_operand"
 (define_predicate "muldiv_target_operand"
 		(match_operand 0 "register_operand"))
 
+(define_predicate "ins_zero_bitmask_operand"
+  (and (match_code "const_int")
+       (match_test "INTVAL (op) != -1")
+       (match_test "INTVAL (op) & 1")
+       (match_test "low_bitmask_len (mode, \
+				     ~UINTVAL (op) | (~UINTVAL(op) - 1)) \
+		    > 12")))
+
 (define_predicate "const_call_insn_operand"
   (match_code "const,symbol_ref,label_ref")
 {
diff --git a/gcc/testsuite/g++.target/loongarch/bstrins-compile.C b/gcc/testsuite/g++.target/loongarch/bstrins-compile.C
new file mode 100644
index 00000000000..3c0db1de4c6
--- /dev/null
+++ b/gcc/testsuite/g++.target/loongarch/bstrins-compile.C
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-std=c++14 -O2 -march=loongarch64 -mabi=lp64d" } */
+/* { dg-final { scan-assembler "bstrins\\.d.*7,4" } } */
+/* { dg-final { scan-assembler "bstrins\\.d.*15,4" } } */
+/* { dg-final { scan-assembler "bstrins\\.d.*31,4" } } */
+/* { dg-final { scan-assembler "bstrins\\.d.*47,4" } } */
+/* { dg-final { scan-assembler "bstrins\\.d.*3,0" } } */
+
+typedef unsigned long u64;
+
+template <u64 mask>
+u64
+test (u64 a, u64 b)
+{
+  return (a & mask) | (b & ~mask);
+}
+
+template u64 test<0x0000'0000'0000'00f0l> (u64, u64);
+template u64 test<0x0000'0000'0000'fff0l> (u64, u64);
+template u64 test<0x0000'0000'ffff'fff0l> (u64, u64);
+template u64 test<0x0000'ffff'ffff'fff0l> (u64, u64);
+template u64 test<0xffff'ffff'ffff'fff0l> (u64, u64);
diff --git a/gcc/testsuite/g++.target/loongarch/bstrins-run.C b/gcc/testsuite/g++.target/loongarch/bstrins-run.C
new file mode 100644
index 00000000000..68913d5e0fc
--- /dev/null
+++ b/gcc/testsuite/g++.target/loongarch/bstrins-run.C
@@ -0,0 +1,65 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+typedef unsigned long gr;
+
+template <int l, int r>
+struct mask {
+  enum { value = (1ul << r) - (1ul << l) };
+};
+
+template <int l>
+struct mask<l, sizeof (gr) * __CHAR_BIT__> {
+  enum { value = -(1ul << l) };
+};
+
+__attribute__ ((noipa)) void
+test (gr a, gr b, gr mask, gr out)
+{
+  if (((a & mask) | (b & ~mask)) != out)
+    __builtin_abort ();
+}
+
+__attribute__ ((noipa)) gr
+no_optimize (gr x)
+{
+  return x;
+}
+
+template <int l, int r>
+struct test1 {
+  static void
+  run (void)
+  {
+    gr m = mask<l, r>::value;
+    gr a = no_optimize (-1ul);
+    gr b = no_optimize (0);
+
+    test (a, b, m, (a & m) | (b & ~m));
+    test (a, b, ~m, (a & ~m) | (b & m));
+    test (a, 0, ~m, a & ~m);
+
+    test1<l, r + 1>::run ();
+  }
+};
+
+template <int l>
+struct test1<l, sizeof (gr) * __CHAR_BIT__ + 1> {
+  static void run (void) {}
+};
+
+template <int l>
+void
+test2 (void)
+{
+  test1<l, l + 1>::run ();
+  test2<l + 1> ();
+}
+
+template <> void test2<sizeof (gr) * __CHAR_BIT__> (void) {}
+
+int
+main ()
+{
+  test2<0> ();
+}