Improve x86 and + rotate (PR target/82498)

Message ID 20171011205903.GP14653@tucnak
State New
Headers show
Series
  • Improve x86 and + rotate (PR target/82498)
Related show

Commit Message

Jakub Jelinek Oct. 11, 2017, 8:59 p.m.
Hi!

As can be seen on the testcase below, the *<rotate_insn><mode>3_mask
insn/splitter is able to optimize only the case when the and is
performed in SImode and then the result subreged into QImode,
while if the computation is already in QImode, we don't handle it.

Fixed by adding another pattern, bootstrapped/regtested on x86_64-linux and
i686-linux, ok for trunk?

2017-10-11  Jakub Jelinek  <jakub@redhat.com>

	PR target/82498
	* config/i386/i386.md (*<rotate_insn><mode>3_mask_1): New
	define_insn_and_split.

	* gcc.target/i386/pr82498.c: New test.


	Jakub

Comments

Uros Bizjak Oct. 12, 2017, 6:32 a.m. | #1
On Wed, Oct 11, 2017 at 10:59 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!
>
> As can be seen on the testcase below, the *<rotate_insn><mode>3_mask
> insn/splitter is able to optimize only the case when the and is
> performed in SImode and then the result subreged into QImode,
> while if the computation is already in QImode, we don't handle it.
>
> Fixed by adding another pattern, bootstrapped/regtested on x86_64-linux and
> i686-linux, ok for trunk?

We probably want to add this variant to *all* *_mask splitters (there
are a few of them in i386.md, please grep for "Avoid useless
masking"). Which finally begs a question - should we implement this
simplification in a generic, target-independent way? OTOH, we already
have SHIFT_COUNT_TRUNCATED and shift_truncation_mask hooks, but last
time I try the former, there were some problems in the testsuite on
x86. I guess there are several targets that would benefit from
removing useless masking of count operands.

Uros.

> 2017-10-11  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/82498
>         * config/i386/i386.md (*<rotate_insn><mode>3_mask_1): New
>         define_insn_and_split.
>
>         * gcc.target/i386/pr82498.c: New test.
>
> --- gcc/config/i386/i386.md.jj  2017-10-10 11:54:11.000000000 +0200
> +++ gcc/config/i386/i386.md     2017-10-11 19:24:27.673606778 +0200
> @@ -11187,6 +11187,26 @@ (define_insn_and_split "*<rotate_insn><m
>        (clobber (reg:CC FLAGS_REG))])]
>    "operands[2] = gen_lowpart (QImode, operands[2]);")
>
> +(define_insn_and_split "*<rotate_insn><mode>3_mask_1"
> +  [(set (match_operand:SWI48 0 "nonimmediate_operand")
> +       (any_rotate:SWI48
> +         (match_operand:SWI48 1 "nonimmediate_operand")
> +         (and:QI
> +           (match_operand:QI 2 "register_operand")
> +           (match_operand:QI 3 "const_int_operand"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
> +   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
> +      == GET_MODE_BITSIZE (<MODE>mode)-1
> +   && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (any_rotate:SWI48 (match_dup 1)
> +                            (match_dup 2)))
> +      (clobber (reg:CC FLAGS_REG))])])
> +
>  ;; Implement rotation using two double-precision
>  ;; shift instructions and a scratch register.
>
> --- gcc/testsuite/gcc.target/i386/pr82498.c.jj  2017-10-11 20:21:10.677088306 +0200
> +++ gcc/testsuite/gcc.target/i386/pr82498.c     2017-10-11 20:22:31.569101564 +0200
> @@ -0,0 +1,52 @@
> +/* PR target/82498 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic -masm=att" } */
> +/* { dg-final { scan-assembler-not {\mand[bwlq]\M} } } */
> +
> +unsigned
> +f1 (unsigned x, unsigned char y)
> +{
> +  if (y == 0)
> +    return x;
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
> +}
> +
> +unsigned
> +f2 (unsigned x, unsigned y)
> +{
> +  if (y == 0)
> +    return x;
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
> +}
> +
> +unsigned
> +f3 (unsigned x, unsigned short y)
> +{
> +  if (y == 0)
> +    return x;
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
> +}
> +
> +unsigned
> +f4 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
> +}
> +
> +unsigned
> +f5 (unsigned x, unsigned int y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
> +}
> +
> +unsigned
> +f6 (unsigned x, unsigned short y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
> +}
>
>         Jakub
Uros Bizjak Oct. 12, 2017, 6:39 a.m. | #2
On Thu, Oct 12, 2017 at 8:32 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Wed, Oct 11, 2017 at 10:59 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>> Hi!
>>
>> As can be seen on the testcase below, the *<rotate_insn><mode>3_mask
>> insn/splitter is able to optimize only the case when the and is
>> performed in SImode and then the result subreged into QImode,
>> while if the computation is already in QImode, we don't handle it.
>>
>> Fixed by adding another pattern, bootstrapped/regtested on x86_64-linux and
>> i686-linux, ok for trunk?
>
> We probably want to add this variant to *all* *_mask splitters (there
> are a few of them in i386.md, please grep for "Avoid useless
> masking"). Which finally begs a question - should we implement this
> simplification in a generic, target-independent way? OTOH, we already
> have SHIFT_COUNT_TRUNCATED and shift_truncation_mask hooks, but last
> time I try the former, there were some problems in the testsuite on
> x86. I guess there are several targets that would benefit from
> removing useless masking of count operands.

Oh, and there is a strange x86 exception in the comment for
SHIFT_COUNT_TRUNCATED. I'm not sure what "(real or pretended)
bit-field operation" means, but variable-count BT instruction with
non-memory operand (we never generate variable-count BTx with memory
operand) masks its count operand as well.

Uros.
Uros Bizjak Oct. 12, 2017, 6:52 a.m. | #3
On Thu, Oct 12, 2017 at 8:39 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Thu, Oct 12, 2017 at 8:32 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
>> On Wed, Oct 11, 2017 at 10:59 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>>> Hi!
>>>
>>> As can be seen on the testcase below, the *<rotate_insn><mode>3_mask
>>> insn/splitter is able to optimize only the case when the and is
>>> performed in SImode and then the result subreged into QImode,
>>> while if the computation is already in QImode, we don't handle it.
>>>
>>> Fixed by adding another pattern, bootstrapped/regtested on x86_64-linux and
>>> i686-linux, ok for trunk?
>>
>> We probably want to add this variant to *all* *_mask splitters (there
>> are a few of them in i386.md, please grep for "Avoid useless
>> masking"). Which finally begs a question - should we implement this
>> simplification in a generic, target-independent way? OTOH, we already
>> have SHIFT_COUNT_TRUNCATED and shift_truncation_mask hooks, but last
>> time I try the former, there were some problems in the testsuite on
>> x86. I guess there are several targets that would benefit from
>> removing useless masking of count operands.
>
> Oh, and there is a strange x86 exception in the comment for
> SHIFT_COUNT_TRUNCATED. I'm not sure what "(real or pretended)
> bit-field operation" means, but variable-count BT instruction with
> non-memory operand (we never generate variable-count BTx with memory
> operand) masks its count operand as well.

I forgot that SSE shifts don't truncate their count operand. This is
the reason that the removal of mask is implemented in the *.md file,
but it would really be nice if the same functionality can be achieved
in a more generic way, without pattern explosion.

Uros.
Jakub Jelinek Oct. 12, 2017, 8:32 a.m. | #4
On Thu, Oct 12, 2017 at 08:32:32AM +0200, Uros Bizjak wrote:
> On Wed, Oct 11, 2017 at 10:59 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> > As can be seen on the testcase below, the *<rotate_insn><mode>3_mask
> > insn/splitter is able to optimize only the case when the and is
> > performed in SImode and then the result subreged into QImode,
> > while if the computation is already in QImode, we don't handle it.
> >
> > Fixed by adding another pattern, bootstrapped/regtested on x86_64-linux and
> > i686-linux, ok for trunk?
> 
> We probably want to add this variant to *all* *_mask splitters (there
> are a few of them in i386.md, please grep for "Avoid useless

Well, not all of them, just the 5 ones that have (subreg:QI (and:SI ...)),
*jcc_bt<mode>_mask expects SImode argument, so doesn't have this kind of
problem.

> masking"). Which finally begs a question - should we implement this
> simplification in a generic, target-independent way? OTOH, we already

The target independent way is SHIFT_COUNT_TRUNCATED, but that isn't
applicable to targets which aren't consistent in their behavior across
the whole ISA.  x86 has some instructions that truncate the mask and others
that saturate and others (b* with memory operand) that use something
still different.
So we need to apply it only to the instructions which really truncate
the shift counts and the best infrastructure for that I'm aware is the
combiner.

In order to have just one set of the *_mask patterns we'd need to
change simplify_truncation to canonicalize
(subreg:QI (and:SI (something:SI) (const_int N)) low)
into
(and:QI (subreg:QI (something:SI) low) (const_int lowN))
and change all these patterns; but I'm afraid that is going to break almost
all WORD_REGISTER_OPERATIONS targets and various others, those actually
perform all the operations in word mode and that is why the first form
is actually preferred for them.  Except that if something is already
QImode there is no need for the subreg at all, which is why we I'm afraid
need the two sets of mask patterns.

So, if you aren't against it, I can extend the patch to handle the 4
other mask patterns; as for other modes, SImode is what is being handled
already, DImode is not a problem, because the FEs truncate the shift counts
to integer_type_node already, and for HImode I haven't seen problem
probably because most tunings avoid HImode math and so it isn't worth
optimizing.

> have SHIFT_COUNT_TRUNCATED and shift_truncation_mask hooks, but last
> time I try the former, there were some problems in the testsuite on
> x86. I guess there are several targets that would benefit from
> removing useless masking of count operands.
> 
> Uros.
> 
> > 2017-10-11  Jakub Jelinek  <jakub@redhat.com>
> >
> >         PR target/82498
> >         * config/i386/i386.md (*<rotate_insn><mode>3_mask_1): New
> >         define_insn_and_split.
> >
> >         * gcc.target/i386/pr82498.c: New test.

	Jakub
Uros Bizjak Oct. 12, 2017, 8:40 a.m. | #5
On Thu, Oct 12, 2017 at 10:32 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Thu, Oct 12, 2017 at 08:32:32AM +0200, Uros Bizjak wrote:
>> On Wed, Oct 11, 2017 at 10:59 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>> > As can be seen on the testcase below, the *<rotate_insn><mode>3_mask
>> > insn/splitter is able to optimize only the case when the and is
>> > performed in SImode and then the result subreged into QImode,
>> > while if the computation is already in QImode, we don't handle it.
>> >
>> > Fixed by adding another pattern, bootstrapped/regtested on x86_64-linux and
>> > i686-linux, ok for trunk?
>>
>> We probably want to add this variant to *all* *_mask splitters (there
>> are a few of them in i386.md, please grep for "Avoid useless
>
> Well, not all of them, just the 5 ones that have (subreg:QI (and:SI ...)),
> *jcc_bt<mode>_mask expects SImode argument, so doesn't have this kind of
> problem.
>
>> masking"). Which finally begs a question - should we implement this
>> simplification in a generic, target-independent way? OTOH, we already
>
> The target independent way is SHIFT_COUNT_TRUNCATED, but that isn't
> applicable to targets which aren't consistent in their behavior across
> the whole ISA.  x86 has some instructions that truncate the mask and others
> that saturate and others (b* with memory operand) that use something
> still different.
> So we need to apply it only to the instructions which really truncate
> the shift counts and the best infrastructure for that I'm aware is the
> combiner.
>
> In order to have just one set of the *_mask patterns we'd need to
> change simplify_truncation to canonicalize
> (subreg:QI (and:SI (something:SI) (const_int N)) low)
> into
> (and:QI (subreg:QI (something:SI) low) (const_int lowN))
> and change all these patterns; but I'm afraid that is going to break almost
> all WORD_REGISTER_OPERATIONS targets and various others, those actually
> perform all the operations in word mode and that is why the first form
> is actually preferred for them.  Except that if something is already
> QImode there is no need for the subreg at all, which is why we I'm afraid
> need the two sets of mask patterns.
>
> So, if you aren't against it, I can extend the patch to handle the 4
> other mask patterns; as for other modes, SImode is what is being handled
> already, DImode is not a problem, because the FEs truncate the shift counts
> to integer_type_node already, and for HImode I haven't seen problem
> probably because most tunings avoid HImode math and so it isn't worth
> optimizing.

OK, I think that we can live wtih 4 new patterns. Since these are all
written in the same way (as in the patch you posted), the ammended
patch is pre-approved for mainline.

Thanks,
Uros.
Jakub Jelinek Oct. 12, 2017, 7:11 p.m. | #6
On Thu, Oct 12, 2017 at 10:40:22AM +0200, Uros Bizjak wrote:
> > So, if you aren't against it, I can extend the patch to handle the 4
> > other mask patterns; as for other modes, SImode is what is being handled
> > already, DImode is not a problem, because the FEs truncate the shift counts
> > to integer_type_node already, and for HImode I haven't seen problem
> > probably because most tunings avoid HImode math and so it isn't worth
> > optimizing.
> 
> OK, I think that we can live wtih 4 new patterns. Since these are all
> written in the same way (as in the patch you posted), the ammended
> patch is pre-approved for mainline.

Thanks, here is what I've committed to trunk after another bootstrap/regtest
on x86_64-linux and i686-linux:

2017-10-12  Jakub Jelinek  <jakub@redhat.com>

	PR target/82498
	* config/i386/i386.md (*ashl<mode>3_mask_1,
	*<shift_insn><mode>3_mask_1, *<rotate_insn><mode>3_mask_1,
	*<btsc><mode>_mask_1, *btr<mode>_mask_1): New define_insn_and_split
	patterns.

	* gcc.target/i386/pr82498-1.c: New test.
	* gcc.target/i386/pr82498-2.c: New test.

--- gcc/config/i386/i386.md.jj	2017-10-11 22:37:55.933863355 +0200
+++ gcc/config/i386/i386.md	2017-10-12 11:30:38.191535974 +0200
@@ -10228,6 +10228,26 @@ (define_insn_and_split "*ashl<mode>3_mas
       (clobber (reg:CC FLAGS_REG))])]
   "operands[2] = gen_lowpart (QImode, operands[2]);")
 
+(define_insn_and_split "*ashl<mode>3_mask_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand")
+	(ashift:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand")
+	  (and:QI
+	    (match_operand:QI 2 "register_operand")
+	    (match_operand:QI 3 "const_int_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (ASHIFT, <MODE>mode, operands)
+   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
+      == GET_MODE_BITSIZE (<MODE>mode)-1
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(parallel
+     [(set (match_dup 0)
+	   (ashift:SWI48 (match_dup 1)
+			 (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])])
+
 (define_insn "*bmi2_ashl<mode>3_1"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
 	(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
@@ -10728,6 +10748,26 @@ (define_insn_and_split "*<shift_insn><mo
       (clobber (reg:CC FLAGS_REG))])]
   "operands[2] = gen_lowpart (QImode, operands[2]);")
 
+(define_insn_and_split "*<shift_insn><mode>3_mask_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand")
+	(any_shiftrt:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand")
+	  (and:QI
+	    (match_operand:QI 2 "register_operand")
+	    (match_operand:QI 3 "const_int_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
+   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
+      == GET_MODE_BITSIZE (<MODE>mode)-1
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(parallel
+     [(set (match_dup 0)
+	   (any_shiftrt:SWI48 (match_dup 1)
+			      (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])])
+
 (define_insn_and_split "*<shift_insn><mode>3_doubleword"
   [(set (match_operand:DWI 0 "register_operand" "=&r")
 	(any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0")
@@ -11187,6 +11227,26 @@ (define_insn_and_split "*<rotate_insn><m
       (clobber (reg:CC FLAGS_REG))])]
   "operands[2] = gen_lowpart (QImode, operands[2]);")
 
+(define_insn_and_split "*<rotate_insn><mode>3_mask_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand")
+	(any_rotate:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand")
+	  (and:QI
+	    (match_operand:QI 2 "register_operand")
+	    (match_operand:QI 3 "const_int_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
+   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
+      == GET_MODE_BITSIZE (<MODE>mode)-1
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(parallel
+     [(set (match_dup 0)
+	   (any_rotate:SWI48 (match_dup 1)
+			     (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])])
+
 ;; Implement rotation using two double-precision
 ;; shift instructions and a scratch register.
 
@@ -11494,6 +11554,30 @@ (define_insn_and_split "*<btsc><mode>_ma
       (clobber (reg:CC FLAGS_REG))])]
   "operands[1] = gen_lowpart (QImode, operands[1]);")
 
+(define_insn_and_split "*<btsc><mode>_mask_1"
+  [(set (match_operand:SWI48 0 "register_operand")
+	(any_or:SWI48
+	  (ashift:SWI48
+	    (const_int 1)
+	    (and:QI
+	      (match_operand:QI 1 "register_operand")
+	      (match_operand:QI 2 "const_int_operand")))
+	  (match_operand:SWI48 3 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT
+   && (INTVAL (operands[2]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
+      == GET_MODE_BITSIZE (<MODE>mode)-1
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(parallel
+     [(set (match_dup 0)
+	   (any_or:SWI48
+	     (ashift:SWI48 (const_int 1)
+			   (match_dup 1))
+	     (match_dup 3)))
+      (clobber (reg:CC FLAGS_REG))])])
+
 (define_insn "*btr<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
 	(and:SWI48
@@ -11535,6 +11619,30 @@ (define_insn_and_split "*btr<mode>_mask"
       (clobber (reg:CC FLAGS_REG))])]
   "operands[1] = gen_lowpart (QImode, operands[1]);")
 
+(define_insn_and_split "*btr<mode>_mask_1"
+  [(set (match_operand:SWI48 0 "register_operand")
+	(and:SWI48
+	  (rotate:SWI48
+	    (const_int -2)
+	    (and:QI
+	      (match_operand:QI 1 "register_operand")
+	      (match_operand:QI 2 "const_int_operand")))
+	  (match_operand:SWI48 3 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT
+   && (INTVAL (operands[2]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
+      == GET_MODE_BITSIZE (<MODE>mode)-1
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(parallel
+     [(set (match_dup 0)
+	   (and:SWI48
+	     (rotate:SWI48 (const_int -2)
+			   (match_dup 1))
+	     (match_dup 3)))
+      (clobber (reg:CC FLAGS_REG))])])
+
 ;; These instructions are never faster than the corresponding
 ;; and/ior/xor operations when using immediate operand, so with
 ;; 32-bit there's no point.  But in 64-bit, we can't hold the
--- gcc/testsuite/gcc.target/i386/pr82498-1.c.jj	2017-10-12 11:18:48.905128703 +0200
+++ gcc/testsuite/gcc.target/i386/pr82498-1.c	2017-10-12 11:18:48.905128703 +0200
@@ -0,0 +1,52 @@
+/* PR target/82498 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -masm=att" } */
+/* { dg-final { scan-assembler-not {\mand[bwlq]\M} } } */
+
+unsigned
+f1 (unsigned x, unsigned char y)
+{
+  if (y == 0)
+    return x;
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
+}
+
+unsigned
+f2 (unsigned x, unsigned y)
+{
+  if (y == 0)
+    return x;
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
+}
+
+unsigned
+f3 (unsigned x, unsigned short y)
+{
+  if (y == 0)
+    return x;
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
+}
+
+unsigned
+f4 (unsigned x, unsigned char y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
+}
+
+unsigned
+f5 (unsigned x, unsigned int y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
+}
+
+unsigned
+f6 (unsigned x, unsigned short y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
+}
--- gcc/testsuite/gcc.target/i386/pr82498-2.c.jj	2017-10-12 12:14:53.452321121 +0200
+++ gcc/testsuite/gcc.target/i386/pr82498-2.c	2017-10-12 12:08:53.000000000 +0200
@@ -0,0 +1,46 @@
+/* PR target/82498 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -masm=att" } */
+/* { dg-final { scan-assembler-not {\mand[bwlq]\M} } } */
+
+int
+f1 (int x, unsigned char y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return x >> y;
+}
+
+unsigned
+f2 (unsigned x, unsigned char y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return x >> y;
+}
+
+unsigned
+f3 (unsigned x, unsigned char y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return x << y;
+}
+
+unsigned
+f4 (unsigned x, unsigned char y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return x | (1U << y);
+}
+
+unsigned
+f5 (unsigned x, unsigned char y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return x ^ (1U << y);
+}
+
+unsigned
+f6 (unsigned x, unsigned char y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x + 2) & ~(1U << y);
+}


	Jakub
Uros Bizjak Oct. 13, 2017, 6:47 a.m. | #7
On Thu, Oct 12, 2017 at 9:11 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Thu, Oct 12, 2017 at 10:40:22AM +0200, Uros Bizjak wrote:
>> > So, if you aren't against it, I can extend the patch to handle the 4
>> > other mask patterns; as for other modes, SImode is what is being handled
>> > already, DImode is not a problem, because the FEs truncate the shift counts
>> > to integer_type_node already, and for HImode I haven't seen problem
>> > probably because most tunings avoid HImode math and so it isn't worth
>> > optimizing.
>>
>> OK, I think that we can live wtih 4 new patterns. Since these are all
>> written in the same way (as in the patch you posted), the ammended
>> patch is pre-approved for mainline.
>
> Thanks, here is what I've committed to trunk after another bootstrap/regtest
> on x86_64-linux and i686-linux:
>
> 2017-10-12  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/82498
>         * config/i386/i386.md (*ashl<mode>3_mask_1,
>         *<shift_insn><mode>3_mask_1, *<rotate_insn><mode>3_mask_1,
>         *<btsc><mode>_mask_1, *btr<mode>_mask_1): New define_insn_and_split
>         patterns.
>
>         * gcc.target/i386/pr82498-1.c: New test.
>         * gcc.target/i386/pr82498-2.c: New test.

OK for mainline.

Thanks,
Uros.

> --- gcc/config/i386/i386.md.jj  2017-10-11 22:37:55.933863355 +0200
> +++ gcc/config/i386/i386.md     2017-10-12 11:30:38.191535974 +0200
> @@ -10228,6 +10228,26 @@ (define_insn_and_split "*ashl<mode>3_mas
>        (clobber (reg:CC FLAGS_REG))])]
>    "operands[2] = gen_lowpart (QImode, operands[2]);")
>
> +(define_insn_and_split "*ashl<mode>3_mask_1"
> +  [(set (match_operand:SWI48 0 "nonimmediate_operand")
> +       (ashift:SWI48
> +         (match_operand:SWI48 1 "nonimmediate_operand")
> +         (and:QI
> +           (match_operand:QI 2 "register_operand")
> +           (match_operand:QI 3 "const_int_operand"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (ASHIFT, <MODE>mode, operands)
> +   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
> +      == GET_MODE_BITSIZE (<MODE>mode)-1
> +   && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (ashift:SWI48 (match_dup 1)
> +                        (match_dup 2)))
> +      (clobber (reg:CC FLAGS_REG))])])
> +
>  (define_insn "*bmi2_ashl<mode>3_1"
>    [(set (match_operand:SWI48 0 "register_operand" "=r")
>         (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
> @@ -10728,6 +10748,26 @@ (define_insn_and_split "*<shift_insn><mo
>        (clobber (reg:CC FLAGS_REG))])]
>    "operands[2] = gen_lowpart (QImode, operands[2]);")
>
> +(define_insn_and_split "*<shift_insn><mode>3_mask_1"
> +  [(set (match_operand:SWI48 0 "nonimmediate_operand")
> +       (any_shiftrt:SWI48
> +         (match_operand:SWI48 1 "nonimmediate_operand")
> +         (and:QI
> +           (match_operand:QI 2 "register_operand")
> +           (match_operand:QI 3 "const_int_operand"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
> +   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
> +      == GET_MODE_BITSIZE (<MODE>mode)-1
> +   && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (any_shiftrt:SWI48 (match_dup 1)
> +                             (match_dup 2)))
> +      (clobber (reg:CC FLAGS_REG))])])
> +
>  (define_insn_and_split "*<shift_insn><mode>3_doubleword"
>    [(set (match_operand:DWI 0 "register_operand" "=&r")
>         (any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0")
> @@ -11187,6 +11227,26 @@ (define_insn_and_split "*<rotate_insn><m
>        (clobber (reg:CC FLAGS_REG))])]
>    "operands[2] = gen_lowpart (QImode, operands[2]);")
>
> +(define_insn_and_split "*<rotate_insn><mode>3_mask_1"
> +  [(set (match_operand:SWI48 0 "nonimmediate_operand")
> +       (any_rotate:SWI48
> +         (match_operand:SWI48 1 "nonimmediate_operand")
> +         (and:QI
> +           (match_operand:QI 2 "register_operand")
> +           (match_operand:QI 3 "const_int_operand"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
> +   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
> +      == GET_MODE_BITSIZE (<MODE>mode)-1
> +   && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (any_rotate:SWI48 (match_dup 1)
> +                            (match_dup 2)))
> +      (clobber (reg:CC FLAGS_REG))])])
> +
>  ;; Implement rotation using two double-precision
>  ;; shift instructions and a scratch register.
>
> @@ -11494,6 +11554,30 @@ (define_insn_and_split "*<btsc><mode>_ma
>        (clobber (reg:CC FLAGS_REG))])]
>    "operands[1] = gen_lowpart (QImode, operands[1]);")
>
> +(define_insn_and_split "*<btsc><mode>_mask_1"
> +  [(set (match_operand:SWI48 0 "register_operand")
> +       (any_or:SWI48
> +         (ashift:SWI48
> +           (const_int 1)
> +           (and:QI
> +             (match_operand:QI 1 "register_operand")
> +             (match_operand:QI 2 "const_int_operand")))
> +         (match_operand:SWI48 3 "register_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_USE_BT
> +   && (INTVAL (operands[2]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
> +      == GET_MODE_BITSIZE (<MODE>mode)-1
> +   && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (any_or:SWI48
> +            (ashift:SWI48 (const_int 1)
> +                          (match_dup 1))
> +            (match_dup 3)))
> +      (clobber (reg:CC FLAGS_REG))])])
> +
>  (define_insn "*btr<mode>"
>    [(set (match_operand:SWI48 0 "register_operand" "=r")
>         (and:SWI48
> @@ -11535,6 +11619,30 @@ (define_insn_and_split "*btr<mode>_mask"
>        (clobber (reg:CC FLAGS_REG))])]
>    "operands[1] = gen_lowpart (QImode, operands[1]);")
>
> +(define_insn_and_split "*btr<mode>_mask_1"
> +  [(set (match_operand:SWI48 0 "register_operand")
> +       (and:SWI48
> +         (rotate:SWI48
> +           (const_int -2)
> +           (and:QI
> +             (match_operand:QI 1 "register_operand")
> +             (match_operand:QI 2 "const_int_operand")))
> +         (match_operand:SWI48 3 "register_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_USE_BT
> +   && (INTVAL (operands[2]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
> +      == GET_MODE_BITSIZE (<MODE>mode)-1
> +   && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (and:SWI48
> +            (rotate:SWI48 (const_int -2)
> +                          (match_dup 1))
> +            (match_dup 3)))
> +      (clobber (reg:CC FLAGS_REG))])])
> +
>  ;; These instructions are never faster than the corresponding
>  ;; and/ior/xor operations when using immediate operand, so with
>  ;; 32-bit there's no point.  But in 64-bit, we can't hold the
> --- gcc/testsuite/gcc.target/i386/pr82498-1.c.jj        2017-10-12 11:18:48.905128703 +0200
> +++ gcc/testsuite/gcc.target/i386/pr82498-1.c   2017-10-12 11:18:48.905128703 +0200
> @@ -0,0 +1,52 @@
> +/* PR target/82498 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic -masm=att" } */
> +/* { dg-final { scan-assembler-not {\mand[bwlq]\M} } } */
> +
> +unsigned
> +f1 (unsigned x, unsigned char y)
> +{
> +  if (y == 0)
> +    return x;
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
> +}
> +
> +unsigned
> +f2 (unsigned x, unsigned y)
> +{
> +  if (y == 0)
> +    return x;
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
> +}
> +
> +unsigned
> +f3 (unsigned x, unsigned short y)
> +{
> +  if (y == 0)
> +    return x;
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
> +}
> +
> +unsigned
> +f4 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
> +}
> +
> +unsigned
> +f5 (unsigned x, unsigned int y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
> +}
> +
> +unsigned
> +f6 (unsigned x, unsigned short y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
> +}
> --- gcc/testsuite/gcc.target/i386/pr82498-2.c.jj        2017-10-12 12:14:53.452321121 +0200
> +++ gcc/testsuite/gcc.target/i386/pr82498-2.c   2017-10-12 12:08:53.000000000 +0200
> @@ -0,0 +1,46 @@
> +/* PR target/82498 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic -masm=att" } */
> +/* { dg-final { scan-assembler-not {\mand[bwlq]\M} } } */
> +
> +int
> +f1 (int x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return x >> y;
> +}
> +
> +unsigned
> +f2 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return x >> y;
> +}
> +
> +unsigned
> +f3 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return x << y;
> +}
> +
> +unsigned
> +f4 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return x | (1U << y);
> +}
> +
> +unsigned
> +f5 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return x ^ (1U << y);
> +}
> +
> +unsigned
> +f6 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x + 2) & ~(1U << y);
> +}
>
>
>         Jakub

Patch

--- gcc/config/i386/i386.md.jj	2017-10-10 11:54:11.000000000 +0200
+++ gcc/config/i386/i386.md	2017-10-11 19:24:27.673606778 +0200
@@ -11187,6 +11187,26 @@  (define_insn_and_split "*<rotate_insn><m
       (clobber (reg:CC FLAGS_REG))])]
   "operands[2] = gen_lowpart (QImode, operands[2]);")
 
+(define_insn_and_split "*<rotate_insn><mode>3_mask_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand")
+	(any_rotate:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand")
+	  (and:QI
+	    (match_operand:QI 2 "register_operand")
+	    (match_operand:QI 3 "const_int_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
+   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
+      == GET_MODE_BITSIZE (<MODE>mode)-1
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(parallel
+     [(set (match_dup 0)
+	   (any_rotate:SWI48 (match_dup 1)
+			     (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])])
+
 ;; Implement rotation using two double-precision
 ;; shift instructions and a scratch register.
 
--- gcc/testsuite/gcc.target/i386/pr82498.c.jj	2017-10-11 20:21:10.677088306 +0200
+++ gcc/testsuite/gcc.target/i386/pr82498.c	2017-10-11 20:22:31.569101564 +0200
@@ -0,0 +1,52 @@ 
+/* PR target/82498 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -masm=att" } */
+/* { dg-final { scan-assembler-not {\mand[bwlq]\M} } } */
+
+unsigned
+f1 (unsigned x, unsigned char y)
+{
+  if (y == 0)
+    return x;
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
+}
+
+unsigned
+f2 (unsigned x, unsigned y)
+{
+  if (y == 0)
+    return x;
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
+}
+
+unsigned
+f3 (unsigned x, unsigned short y)
+{
+  if (y == 0)
+    return x;
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
+}
+
+unsigned
+f4 (unsigned x, unsigned char y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
+}
+
+unsigned
+f5 (unsigned x, unsigned int y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
+}
+
+unsigned
+f6 (unsigned x, unsigned short y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
+}