diff mbox series

[xstormy16] Recognize/support swpn (swap nibbles) instruction.

Message ID 004001d97ab7$0a1989f0$1e4c9dd0$@nextmovesoftware.com
State New
Headers show
Series [xstormy16] Recognize/support swpn (swap nibbles) instruction. | expand

Commit Message

Roger Sayle April 29, 2023, 4:24 p.m. UTC
This patch adds support for xstormy16's swap nibbles instruction (swpn).
For the test case:

short foo(short x) {
  return (x&0xff00) | ((x<<4)&0xf0) | ((x>>4)&0x0f);
}

GCC with -O2 currently generates the nine-instruction sequence:
foo:    mov r7,r2
        asr r2,#4
        and r2,#15
        mov.w r6,#-256
        and r6,r7
        or r2,r6
        shl r7,#4
        and r7,#255
        or r2,r7
        ret

with this patch, we now generate:
foo:    swpn r2
        ret

To achieve this using combine's four instruction "combinations" requires
a little wizardry.  Firstly, define_insn_and_split patterns are introduced to
treat logical shifts followed by bitwise-AND as macro instructions that
are split after reload.  This is sufficient to recognize a QImode
nibble swap, which can be implemented by swpn followed by either a
zero-extension or a sign-extension from QImode to HImode.  Then finally,
in the correct context, a QImode swap-nibbles pattern can be combined to
preserve the high-byte of a HImode word, matching the xstormy16's swpn
semantics.

The naming of the new code iterators is taken from i386.md.
The any_rotate code iterator is used in my next (split out) patch.

This patch has been tested by building a cross-compiler to xstormy16-elf
from x86_64-pc-linux-gnu and confirming the new test cases pass.
Ok for mainline?


2023-04-29  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/stormy16/stormy16.md (any_lshift): New code iterator.
        (any_or_plus): Likewise.
        (any_rotate): Likewise.
        (*<code>_and_internal): New define_insn_and_split to
        recognize a logical shift followed by an AND, and split it
        again after reload.
        (*swpn): New define_insn matching xstormy16's swpn.
        (*swpn_zext): New define_insn recognizing swpn followed by
        zero_extendqihi2, i.e. with the high byte set to zero.
        (*swpn_sext): Likewise, for swpn followed by cbw.
        (*swpn_sext_2): Likewise, for an alternate RTL form.
        (*swpn_zext_ior): A pre-reload splitter so that an swpn+zext+ior
        sequence is split in the correct place to recognize the *swpn_zext
        followed by any_or_plus (ior, xor or plus) instruction.

gcc/testsuite/ChangeLog
        * gcc.target/xstormy16/swpn-1.c: New QImode test case.
        * gcc.target/xstormy16/swpn-2.c: New zero_extend test case.
        * gcc.target/xstormy16/swpn-3.c: New sign_extend test case.
        * gcc.target/xstormy16/swpn-4.c: New HImode test case.


Thanks in advance,
Roger
--

Comments

Jeff Law April 29, 2023, 4:44 p.m. UTC | #1
On 4/29/23 10:24, Roger Sayle wrote:
> 
> This patch adds support for xstormy16's swap nibbles instruction (swpn).
> For the test case:
> 
> short foo(short x) {
>    return (x&0xff00) | ((x<<4)&0xf0) | ((x>>4)&0x0f);
> }
> 
> GCC with -O2 currently generates the nine instruction sequence:
> foo:    mov r7,r2
>          asr r2,#4
>          and r2,#15
>          mov.w r6,#-256
>          and r6,r7
>          or r2,r6
>          shl r7,#4
>          and r7,#255
>          or r2,r7
>          ret
> 
> with this patch, we now generate:
> foo:    swpn r2
>          ret
> 
> To achieve this using combine's four instruction "combinations" requires
> a little wizardry.  Firstly, define_insn_and_split are introduced to
> treat logical shifts followed by bitwise-AND as macro instructions that
> are split after reload.  This is sufficient to recognize a QImode
> nibble swap, which can be implemented by swpn followed by either a
> zero-extension or a sign-extension from QImode to HImode.  Then finally,
> in the correct context, a QImode swap-nibbles pattern can be combined to
> preserve the high-byte of a HImode word, matching the xstormy16's swpn
> semantics.
> 
> The naming of the new code iterators is taken from i386.md.
> The any_rotate code iterator is used in my next (split out) patch.
> 
> This patch has been tested by building a cross-compiler to xstormy16-elf
> from x86_64-pc-linux-gnu and confirming the new test cases pass.
> Ok for mainline?
> 
> 
> 2023-04-29  Roger Sayle  <roger@nextmovesoftware.com>
> 
> gcc/ChangeLog
>          * config/stormy16/stormy16.md (any_lshift): New code iterator.
>          (any_or_plus): Likewise.
>          (any_rotate): Likewise.
>          (*<any_lshift>_and_internal): New define_insn_and_split to
>          recognize a logical shift followed by an AND, and split it
>          again after reload.
>          (*swpn): New define_insn matching xstormy16's swpn.
>          (*swpn_zext): New define_insn recognizing swpn followed by
>          zero_extendqihi2, i.e. with the high byte set to zero.
>          (*swpn_sext): Likewise, for swpn followed by cbw.
>          (*swpn_sext_2): Likewise, for an alternate RTL form.
>          (*swpn_zext_ior): A pre-reload splitter so that an swpn+zext+ior
>          sequence is split in the correct place to recognize the *swpn_zext
>          followed by any_or_plus (ior, xor or plus) instruction.
> 
> gcc/testsuite/ChangeLog
>          * gcc.target/xstormy16/swpn-1.c: New QImode test case.
>          * gcc.target/xstormy16/swpn-2.c: New zero_extend test case.
>          * gcc.target/xstormy16/swpn-3.c: New sign_extend test case.
>          * gcc.target/xstormy16/swpn-4.c: New HImode test case.
Ah, bridge patterns.

OK for the trunk.

jeff
diff mbox series

Patch

diff --git a/gcc/config/stormy16/stormy16.md b/gcc/config/stormy16/stormy16.md
index b2e86ee..be1ee04 100644
--- a/gcc/config/stormy16/stormy16.md
+++ b/gcc/config/stormy16/stormy16.md
@@ -48,6 +48,10 @@ 
     (CARRY_REG 16)
   ]
 )
+
+(define_code_iterator any_lshift [ashift lshiftrt])	; left shift, logical right shift
+(define_code_iterator any_or_plus [plus ior xor])	; all agree when the operands share no bits
+(define_code_iterator any_rotate [rotate rotatert])	; no pattern in this patch uses it yet
 
 ;; ::::::::::::::::::::
 ;; ::
@@ -1301,3 +1323,86 @@ 
   [(parallel [(set (match_dup 2) (match_dup 1))
               (set (match_dup 1) (match_dup 2))])])
 
+;; Recognize shl+and and shr+and (HImode) as one macro insn, split after reload.
+(define_insn_and_split "*<code>_and_internal"
+  [(set (match_operand:HI 0 "register_operand" "=r")
+        (and:HI (any_lshift:HI (match_operand:HI 1 "register_operand" "0")
+			       (match_operand 2 "const_int_operand" "i"))
+		(match_operand 3 "const_int_operand" "i")))
+   (clobber (reg:BI CARRY_REG))]
+  "IN_RANGE (INTVAL (operands[2]), 0, 15)"	; shift count within a 16-bit word
+  "#"						; never output directly, always split
+  "reload_completed"
+  [(parallel [(set (match_dup 0) (any_lshift:HI (match_dup 1) (match_dup 2)))
+	      (clobber (reg:BI CARRY_REG))])	; the shift, with its carry clobber
+   (set (match_dup 0) (and:HI (match_dup 0) (match_dup 3)))])	; then the mask
+
+;; Swap nibbles (swpn): exchange the low byte's nibbles, keep the high byte.
+(define_insn "*swpn"
+  [(set (match_operand:HI 0 "register_operand" "=r")
+	(any_or_plus:HI
+	  (any_or_plus:HI
+	    (and:HI (ashift:HI (match_operand:HI 1 "register_operand" "0")
+			       (const_int 4))
+		    (const_int 240))		; low nibble moved to bits 4-7
+	    (and:HI (lshiftrt:HI (match_dup 1) (const_int 4))
+		    (const_int 15)))		; high nibble moved to bits 0-3
+	  (and:HI (match_dup 1) (const_int -256))))]	; bits 8-15 passed through
+  ""
+  "swpn %0")
+
+(define_insn "*swpn_zext"	; nibble swap of the low byte, high byte zero
+  [(set (match_operand:HI 0 "register_operand" "=r")
+	(any_or_plus:HI
+	  (and:HI (ashift:HI (match_operand:HI 1 "register_operand" "0")
+			     (const_int 4))
+		  (const_int 240))		; low nibble moved to bits 4-7
+	  (and:HI (lshiftrt:HI (match_dup 1) (const_int 4))
+		  (const_int 15))))]		; high nibble moved to bits 0-3
+  ""
+  "swpn %0 | and %0,#255"	; swpn, then mask off the high byte
+  [(set_attr "length" "6")])
+
+(define_insn "*swpn_sext"	; QImode rotate by 4, sign-extended to HImode
+  [(set (match_operand:HI 0 "register_operand" "=r")
+	(sign_extend:HI
+	  (rotate:QI (subreg:QI (match_operand:HI 1 "register_operand" "0") 0)
+		     (const_int 4))))]		; rotating a byte by 4 swaps its nibbles
+  ""
+  "swpn %0 | cbw %0"	; swpn, then cbw for the sign extension
+  [(set_attr "length" "4")])
+
+(define_insn "*swpn_sext_2"	; same output as *swpn_sext, alternate RTL form
+  [(set (match_operand:HI 0 "register_operand" "=r")
+	(sign_extend:HI
+	  (subreg:QI
+	    (any_or_plus:HI
+	      (ashift:HI (match_operand:HI 1 "register_operand" "0")
+			 (const_int 4))		; unmasked; only the low byte is kept
+	      (subreg:HI (lshiftrt:QI (subreg:QI (match_dup 1) 0)
+				      (const_int 4)) 0)) 0)))]
+  ""
+  "swpn %0 | cbw %0"	; swpn, then cbw for the sign extension
+  [(set_attr "length" "4")])
+
+;; Recognize swpn_zext followed by ior, xor or plus as a macro instruction.
+;; This is a bridge pattern for combine: while pseudos can still be created
+;; it is split back into *swpn_zext and the outer operation on operand 2.
+(define_insn_and_split "*swpn_zext_ior"
+  [(set (match_operand:HI 0 "register_operand")
+	(any_or_plus:HI
+	  (any_or_plus:HI
+	    (and:HI (ashift:HI (match_operand:HI 1 "register_operand")
+			       (const_int 4))
+		    (const_int 240))
+	    (and:HI (lshiftrt:HI (match_dup 1) (const_int 4))
+		    (const_int 15)))
+	  (match_operand:HI 2 "nonmemory_operand")))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+ [(set (match_dup 3) (ior:HI (and:HI (ashift:HI (match_dup 1) (const_int 4))
+				     (const_int 240))
+			     (and:HI (lshiftrt:HI (match_dup 1) (const_int 4))
+				     (const_int 15))))
+  (match_dup 4)]
+{
+  operands[3] = gen_reg_rtx (HImode);
+  /* The two nibble terms have no bits in common (masks 240 and 15), so
+     the inner operation can always be emitted as ior.  Operand 2 is
+     arbitrary, so the outer operation must remain the one that was
+     matched: rewriting an xor or a plus as ior would change the result.
+     NOTE(review): this relies on the port's addhi3, xorhi3 and iorhi3
+     named patterns, which also supply any clobber the insn needs;
+     confirm that they accept operand 2 here.  */
+  if (<CODE> == PLUS)
+    operands[4] = gen_addhi3 (operands[0], operands[3], operands[2]);
+  else if (<CODE> == XOR)
+    operands[4] = gen_xorhi3 (operands[0], operands[3], operands[2]);
+  else
+    operands[4] = gen_iorhi3 (operands[0], operands[3], operands[2]);
+})
+
diff --git a/gcc/testsuite/gcc.target/xstormy16/swpn-1.c b/gcc/testsuite/gcc.target/xstormy16/swpn-1.c
new file mode 100644
index 0000000..a2c9316
--- /dev/null
+++ b/gcc/testsuite/gcc.target/xstormy16/swpn-1.c
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+unsigned char ior_1(unsigned char x) { return (x>>4) | (x<<4); }
+unsigned char ior_2(unsigned char x) { return (x<<4) | (x>>4); }
+unsigned char xor_1(unsigned char x) { return (x>>4) ^ (x<<4); }
+unsigned char xor_2(unsigned char x) { return (x<<4) ^ (x>>4); }
+unsigned char sum_1(unsigned char x) { return (x>>4) + (x<<4); }
+unsigned char sum_2(unsigned char x) { return (x<<4) + (x>>4); }
+/* { dg-final { scan-assembler-times "swpn r2" 6 } } */
+/* x>>4 and x<<4 share no bits, so |, ^ and + all give the swapped byte.  */
diff --git a/gcc/testsuite/gcc.target/xstormy16/swpn-2.c b/gcc/testsuite/gcc.target/xstormy16/swpn-2.c
new file mode 100644
index 0000000..f26c296
--- /dev/null
+++ b/gcc/testsuite/gcc.target/xstormy16/swpn-2.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* Nibble swap of the low byte; the result's high byte is zero.  */
+unsigned short ior_1(unsigned short x) { return ((x&0xf0)>>4) | ((x&0x0f)<<4); }
+unsigned short xor_1(unsigned short x) { return ((x&0xf0)>>4) ^ ((x&0x0f)<<4); }
+unsigned short sum_1(unsigned short x) { return ((x&0xf0)>>4) + ((x&0x0f)<<4); }
+/* The same three operations with the two terms written in reverse order.  */
+unsigned short ior_2(unsigned short x) { return ((x&0x0f)<<4) | ((x&0xf0)>>4); }
+unsigned short xor_2(unsigned short x) { return ((x&0x0f)<<4) ^ ((x&0xf0)>>4); }
+unsigned short sum_2(unsigned short x) { return ((x&0x0f)<<4) + ((x&0xf0)>>4); }
+
+/* { dg-final { scan-assembler-times "swpn r2" 6 } } */
+/* { dg-final { scan-assembler-times "and r2,#255" 6 } } */
+
diff --git a/gcc/testsuite/gcc.target/xstormy16/swpn-3.c b/gcc/testsuite/gcc.target/xstormy16/swpn-3.c
new file mode 100644
index 0000000..6109c6a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/xstormy16/swpn-3.c
@@ -0,0 +1,28 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* Each function swaps the low byte's nibbles and sign-extends that byte.  */
+short ior_1(unsigned short x) {
+  return (signed char)(((x&0xf0)>>4) | ((x&0x0f)<<4));
+}
+
+short xor_1(unsigned short x) {
+  return (signed char)(((x&0xf0)>>4) ^ ((x&0x0f)<<4));
+}
+
+short sum_1(unsigned short x) {
+  return (signed char)(((x&0xf0)>>4) + ((x&0x0f)<<4));
+}
+/* The _2 variants write the same two terms in the opposite order.  */
+short ior_2(unsigned short x) {
+  return (signed char)(((x&0x0f)<<4) | ((x&0xf0)>>4));
+}
+
+short xor_2(unsigned short x) {
+  return (signed char)(((x&0x0f)<<4) ^ ((x&0xf0)>>4));
+}
+
+short sum_2(unsigned short x) {
+  return (signed char)(((x&0x0f)<<4) + ((x&0xf0)>>4));
+}
+
+/* { dg-final { scan-assembler-times "cbw" 6 } } */
diff --git a/gcc/testsuite/gcc.target/xstormy16/swpn-4.c b/gcc/testsuite/gcc.target/xstormy16/swpn-4.c
new file mode 100644
index 0000000..4a31dc6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/xstormy16/swpn-4.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* a = x&0xff00, b = (x<<4)&0xf0, c = (x>>4)&0x0f; the suffix is term order.  */
+short ior_abc(short x) { return (x&0xff00) | ((x<<4)&0xf0) | ((x>>4)&0x0f); }
+short ior_acb(short x) { return (x&0xff00) | ((x>>4)&0x0f) | ((x<<4)&0xf0); }
+short ior_bac(short x) { return ((x<<4)&0xf0) | (x&0xff00) | ((x>>4)&0x0f); }
+short ior_bca(short x) { return ((x<<4)&0xf0) | ((x>>4)&0x0f) | (x&0xff00); }
+short ior_cab(short x) { return ((x>>4)&0x0f) | (x&0xff00) | ((x<<4)&0xf0); }
+short ior_cba(short x) { return ((x>>4)&0x0f) | ((x<<4)&0xf0) | (x&0xff00); }
+/* Same six orderings joined with ^; the three masks share no bits.  */
+short xor_abc(short x) { return (x&0xff00) ^ ((x<<4)&0xf0) ^ ((x>>4)&0x0f); }
+short xor_acb(short x) { return (x&0xff00) ^ ((x>>4)&0x0f) ^ ((x<<4)&0xf0); }
+short xor_bac(short x) { return ((x<<4)&0xf0) ^ (x&0xff00) ^ ((x>>4)&0x0f); }
+short xor_bca(short x) { return ((x<<4)&0xf0) ^ ((x>>4)&0x0f) ^ (x&0xff00); }
+short xor_cab(short x) { return ((x>>4)&0x0f) ^ (x&0xff00) ^ ((x<<4)&0xf0); }
+short xor_cba(short x) { return ((x>>4)&0x0f) ^ ((x<<4)&0xf0) ^ (x&0xff00); }
+/* Same six orderings joined with +.  */
+short sum_abc(short x) { return (x&0xff00) + ((x<<4)&0xf0) + ((x>>4)&0x0f); }
+short sum_acb(short x) { return (x&0xff00) + ((x>>4)&0x0f) + ((x<<4)&0xf0); }
+short sum_bac(short x) { return ((x<<4)&0xf0) + (x&0xff00) + ((x>>4)&0x0f); }
+short sum_bca(short x) { return ((x<<4)&0xf0) + ((x>>4)&0x0f) + (x&0xff00); }
+short sum_cab(short x) { return ((x>>4)&0x0f) + (x&0xff00) + ((x<<4)&0xf0); }
+short sum_cba(short x) { return ((x>>4)&0x0f) + ((x<<4)&0xf0) + (x&0xff00); }
+
+/* { dg-final { scan-assembler-times "swpn r2" 18 } } */