diff mbox series

[5/5] x86: yet more PR target/100711-like splitting

Message ID 0075f542-9dc0-33db-4cf9-cdd3ba502122@suse.com
State New
Headers show
Series x86: make better use of VPTERNLOG{D,Q} | expand

Commit Message

Jan Beulich June 21, 2023, 6:28 a.m. UTC
Following two-operand bitwise operations, add another splitter to also
deal with not followed by broadcast all on its own, which can be
expressed as simple embedded broadcast instead once a broadcast operand
is actually permitted in the respective insn. While there also permit
a broadcast operand in the corresponding expander.

gcc/

	* config/i386/sse.md: New splitters to simplify
	not;vec_duplicate as a singular vpternlog.
	(one_cmpl<mode>2): Allow broadcast for operand 1.
	(<mask_codefor>one_cmpl<mode>2<mask_name>): Likewise.

gcc/testsuite/

	* gcc.target/i386/pr100711-6.c: New test.
---
For the purpose here (and elsewhere) bcst_vector_operand() (really:
bcst_mem_operand()) isn't permissive enough: We'd want it to allow
128-bit and 256-bit types as well irrespective of AVX512VL being
enabled. This would likely require a new predicate
(bcst_intvec_operand()?) and a new constraint (BR? Bi?). (Yet for name
selection it will want considering that this is applicable to certain
non-calculational FP operations as well.)

Comments

Hongtao Liu June 25, 2023, 5:12 a.m. UTC | #1
On Wed, Jun 21, 2023 at 2:29 PM Jan Beulich via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Following two-operand bitwise operations, add another splitter to also
> deal with not followed by broadcast all on its own, which can be
> expressed as simple embedded broadcast instead once a broadcast operand
> is actually permitted in the respective insn. While there also permit
> a broadcast operand in the corresponding expander.
The patch LGTM.
>
> gcc/
>
>         * config/i386/sse.md: New splitters to simplify
>         not;vec_duplicate as a singular vpternlog.
>         (one_cmpl<mode>2): Allow broadcast for operand 1.
>         (<mask_codefor>one_cmpl<mode>2<mask_name>): Likewise.
>
> gcc/testsuite/
>
>         * gcc.target/i386/pr100711-6.c: New test.
> ---
> For the purpose here (and elsewhere) bcst_vector_operand() (really:
> bcst_mem_operand()) isn't permissive enough: We'd want it to allow
> 128-bit and 256-bit types as well irrespective of AVX512VL being
> enabled. This would likely require a new predicate
> (bcst_intvec_operand()?) and a new constraint (BR? Bi?). (Yet for name
> selection it will want considering that this is applicable to certain
> non-calculational FP operations as well.)
I think so.
>
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -17156,7 +17156,7 @@
>
>  (define_expand "one_cmpl<mode>2"
>    [(set (match_operand:VI 0 "register_operand")
> -       (xor:VI (match_operand:VI 1 "vector_operand")
> +       (xor:VI (match_operand:VI 1 "bcst_vector_operand")
>                 (match_dup 2)))]
>    "TARGET_SSE"
>  {
> @@ -17168,7 +17168,7 @@
>
>  (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
>    [(set (match_operand:VI 0 "register_operand" "=v,v")
> -       (xor:VI (match_operand:VI 1 "nonimmediate_operand" "v,m")
> +       (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
>                 (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
>    "TARGET_AVX512F
>     && (!<mask_applied>
> @@ -17191,6 +17191,19 @@
>                       (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
>                       (const_int 1)))])
>
> +(define_split
> +  [(set (match_operand:VI48_AVX512F 0 "register_operand")
> +       (vec_duplicate:VI48_AVX512F
> +         (not:<ssescalarmode>
> +           (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))))]
> +  "<MODE_SIZE> == 64 || TARGET_AVX512VL
> +   || (TARGET_AVX512F && !TARGET_PREFER_AVX256)"
> +  [(set (match_dup 0)
> +       (xor:VI48_AVX512F
> +         (vec_duplicate:VI48_AVX512F (match_dup 1))
> +         (match_dup 2)))]
> +  "operands[2] = CONSTM1_RTX (<MODE>mode);")
> +
>  (define_expand "<sse2_avx2>_andnot<mode>3"
>    [(set (match_operand:VI_AVX2 0 "register_operand")
>         (and:VI_AVX2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr100711-6.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
> +
> +typedef int v16si __attribute__ ((vector_size (64)));
> +typedef long long v8di __attribute__((vector_size (64)));
> +
> +v16si foo_v16si (const int *a)
> +{
> +    return (__extension__ (v16si) {~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a,
> +                                  ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a});
> +}
> +
> +v8di foo_v8di (const long long *a)
> +{
> +    return (__extension__ (v8di) {~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a});
> +}
> +
> +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0x55, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}" 2 } } */
>
Jan Beulich June 25, 2023, 6:25 a.m. UTC | #2
On 25.06.2023 07:12, Hongtao Liu wrote:
> On Wed, Jun 21, 2023 at 2:29 PM Jan Beulich via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> ---
>> For the purpose here (and elsewhere) bcst_vector_operand() (really:
>> bcst_mem_operand()) isn't permissive enough: We'd want it to allow
>> 128-bit and 256-bit types as well irrespective of AVX512VL being
>> enabled. This would likely require a new predicate
>> (bcst_intvec_operand()?) and a new constraint (BR? Bi?). (Yet for name
>> selection it will want considering that this is applicable to certain
>> non-calculational FP operations as well.)
> I think so.

Any preference towards predicate and constraint naming?

Plus I think there's a more general question behind this: A new
predicate / constraint pair is likely just one way of dealing
with the issue. Another would appear to be to remove the
restriction of 128- and 256-byte types when AVX512VL is not
enabled, but AVX512F is. While that would require touching a
lot of insn constraints, it looks as if lifting that restriction
would "merely" require much wider use of Yv where v is used
right now. But of course I may well be unaware of (some of) the
reasons why that restriction was put in place in the first place
(it can't really be the lack of suitable move insns, as those
can be synthesized by using e.g. vextract{32,64}x4).

Jan
Hongtao Liu June 25, 2023, 6:35 a.m. UTC | #3
On Sun, Jun 25, 2023 at 2:25 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 25.06.2023 07:12, Hongtao Liu wrote:
> > On Wed, Jun 21, 2023 at 2:29 PM Jan Beulich via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> ---
> >> For the purpose here (and elsewhere) bcst_vector_operand() (really:
> >> bcst_mem_operand()) isn't permissive enough: We'd want it to allow
> >> 128-bit and 256-bit types as well irrespective of AVX512VL being
> >> enabled. This would likely require a new predicate
> >> (bcst_intvec_operand()?) and a new constraint (BR? Bi?). (Yet for name
> >> selection it will want considering that this is applicable to certain
> >> non-calculational FP operations as well.)
> > I think so.
>
> Any preference towards predicate and constraint naming?
something like bcst_mem_operand_$suffiix, $suffix indicates the
pattern may use zmm instruction for 128/256-bit operand.
maybe just bcst_mem_operand_zmm?

>
> Plus I think there's a more general question behind this: A new
> predicate / constraint pair is likely just one way of dealing
> with the issue. Another would appear to be to remove the
> restriction of 128- and 256-byte types when AVX512VL is not
> enabled, but AVX512F is. While that would require touching a
> lot of insn constraints, it looks as if lifting that restriction
> would "merely" require much wider use of Yv where v is used
> right now. But of course I may well be unaware of (some of) the
> reasons why that restriction was put in place in the first place
> (it can't really be the lack of suitable move insns, as those
> can be synthesized by using e.g. vextract{32,64}x4).
Also be careful of SIMD Floating-Point Exception if we use the zmm
version for those arithmetic instructions, the upper bits need to be
explicitly cleared for 128/256-bit operand.
For pternlog or other logic instructions, it's ok since there's no
SIMD Floating-Point Exception for such instructions.

>
> Jan
Hongtao Liu June 25, 2023, 6:41 a.m. UTC | #4
On Sun, Jun 25, 2023 at 2:35 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Sun, Jun 25, 2023 at 2:25 PM Jan Beulich <jbeulich@suse.com> wrote:
> >
> > On 25.06.2023 07:12, Hongtao Liu wrote:
> > > On Wed, Jun 21, 2023 at 2:29 PM Jan Beulich via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > >>
> > >> ---
> > >> For the purpose here (and elsewhere) bcst_vector_operand() (really:
> > >> bcst_mem_operand()) isn't permissive enough: We'd want it to allow
> > >> 128-bit and 256-bit types as well irrespective of AVX512VL being
> > >> enabled. This would likely require a new predicate
> > >> (bcst_intvec_operand()?) and a new constraint (BR? Bi?). (Yet for name
> > >> selection it will want considering that this is applicable to certain
> > >> non-calculational FP operations as well.)
> > > I think so.
> >
> > Any preference towards predicate and constraint naming?
> something like bcst_mem_operand_$suffiix, $suffix indicates the
> pattern may use zmm instruction for 128/256-bit operand.
> maybe just bcst_mem_operand_zmm?
For constraint, maybe we can reuse Br, relax Br to match bcst_mem_operand_zmm.
For those original patterns with bcst_mem_operand, it should be ok
since it's already guarded by the predicate, the constraint must be
valid.
>
> >
> > Plus I think there's a more general question behind this: A new
> > predicate / constraint pair is likely just one way of dealing
> > with the issue. Another would appear to be to remove the
> > restriction of 128- and 256-byte types when AVX512VL is not
> > enabled, but AVX512F is. While that would require touching a
> > lot of insn constraints, it looks as if lifting that restriction
> > would "merely" require much wider use of Yv where v is used
> > right now. But of course I may well be unaware of (some of) the
> > reasons why that restriction was put in place in the first place
> > (it can't really be the lack of suitable move insns, as those
> > can be synthesized by using e.g. vextract{32,64}x4).
> Also be careful of SIMD Floating-Point Exception if we use the zmm
> version for those arithmetic instructions, the upper bits need to be
> explicitly cleared for 128/256-bit operand.
> For pternlog or other logic instructions, it's ok since there's no
> SIMD Floating-Point Exception for such instructions.
>
> >
> > Jan
>
>
>
> --
> BR,
> Hongtao
Jan Beulich Nov. 6, 2023, 11:10 a.m. UTC | #5
On 25.06.2023 08:41, Hongtao Liu wrote:
> On Sun, Jun 25, 2023 at 2:35 PM Hongtao Liu <crazylht@gmail.com> wrote:
>>
>> On Sun, Jun 25, 2023 at 2:25 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>
>>> On 25.06.2023 07:12, Hongtao Liu wrote:
>>>> On Wed, Jun 21, 2023 at 2:29 PM Jan Beulich via Gcc-patches
>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>>
>>>>> ---
>>>>> For the purpose here (and elsewhere) bcst_vector_operand() (really:
>>>>> bcst_mem_operand()) isn't permissive enough: We'd want it to allow
>>>>> 128-bit and 256-bit types as well irrespective of AVX512VL being
>>>>> enabled. This would likely require a new predicate
>>>>> (bcst_intvec_operand()?) and a new constraint (BR? Bi?). (Yet for name
>>>>> selection it will want considering that this is applicable to certain
>>>>> non-calculational FP operations as well.)
>>>> I think so.
>>>
>>> Any preference towards predicate and constraint naming?
>> something like bcst_mem_operand_$suffiix, $suffix indicates the
>> pattern may use zmm instruction for 128/256-bit operand.
>> maybe just bcst_mem_operand_zmm?
> For constraint, maybe we can reuse Br, relax Br to match bcst_mem_operand_zmm.
> For those original patterns with bcst_mem_operand, it should be ok
> since it's already guarded by the predicate, the constraint must be
> valid.

Hmm, I wanted to get back to this, but then I started wondering about this
reply of yours vs your request to not go farther with the use of "oversized"
insns (i.e. acting in 512-bit registers in lieu of AVX512VL being enabled,
when no FP exceptions can be raised on the otherwise unused elements). Since
iirc the latter came later, am I right in assuming we then also shouldn't go
the route outlined above?

Jan
Hongtao Liu Nov. 6, 2023, 1:48 p.m. UTC | #6
On Mon, Nov 6, 2023 at 7:10 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 25.06.2023 08:41, Hongtao Liu wrote:
> > On Sun, Jun 25, 2023 at 2:35 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >>
> >> On Sun, Jun 25, 2023 at 2:25 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>>
> >>> On 25.06.2023 07:12, Hongtao Liu wrote:
> >>>> On Wed, Jun 21, 2023 at 2:29 PM Jan Beulich via Gcc-patches
> >>>> <gcc-patches@gcc.gnu.org> wrote:
> >>>>>
> >>>>> ---
> >>>>> For the purpose here (and elsewhere) bcst_vector_operand() (really:
> >>>>> bcst_mem_operand()) isn't permissive enough: We'd want it to allow
> >>>>> 128-bit and 256-bit types as well irrespective of AVX512VL being
> >>>>> enabled. This would likely require a new predicate
> >>>>> (bcst_intvec_operand()?) and a new constraint (BR? Bi?). (Yet for name
> >>>>> selection it will want considering that this is applicable to certain
> >>>>> non-calculational FP operations as well.)
> >>>> I think so.
> >>>
> >>> Any preference towards predicate and constraint naming?
> >> something like bcst_mem_operand_$suffiix, $suffix indicates the
> >> pattern may use zmm instruction for 128/256-bit operand.
> >> maybe just bcst_mem_operand_zmm?
> > For constraint, maybe we can reuse Br, relax Br to match bcst_mem_operand_zmm.
> > For those original patterns with bcst_mem_operand, it should be ok
> > since it's already guarded by the predicate, the constraint must be
> > valid.
>
> Hmm, I wanted to get back to this, but then I started wondering about this
> reply of yours vs your request to not go farther with the use of "oversized"
> insns (i.e. acting in 512-bit registers in lieu of AVX512VL being enabled,
> when no FP exceptions can be raised on the otherwise unused elements). Since
> iirc the latter came later, am I right in assuming we then also shouldn't go
> the route outlined above?
No, we shouldn't.
This reply is just an answer on how to do it technically, but we don't
really want to do it (considering that all AVX512 processors after SKX
will all support AVX512VL)
>
> Jan
diff mbox series

Patch

--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17156,7 +17156,7 @@ 
 
 (define_expand "one_cmpl<mode>2"
   [(set (match_operand:VI 0 "register_operand")
-	(xor:VI (match_operand:VI 1 "vector_operand")
+	(xor:VI (match_operand:VI 1 "bcst_vector_operand")
 		(match_dup 2)))]
   "TARGET_SSE"
 {
@@ -17168,7 +17168,7 @@ 
 
 (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
   [(set (match_operand:VI 0 "register_operand" "=v,v")
-	(xor:VI (match_operand:VI 1 "nonimmediate_operand" "v,m")
+	(xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
 		(match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
   "TARGET_AVX512F
    && (!<mask_applied>
@@ -17191,6 +17191,19 @@ 
 		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
 		      (const_int 1)))])
 
+(define_split
+  [(set (match_operand:VI48_AVX512F 0 "register_operand")
+	(vec_duplicate:VI48_AVX512F
+	  (not:<ssescalarmode>
+	    (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))))]
+  "<MODE_SIZE> == 64 || TARGET_AVX512VL
+   || (TARGET_AVX512F && !TARGET_PREFER_AVX256)"
+  [(set (match_dup 0)
+	(xor:VI48_AVX512F
+	  (vec_duplicate:VI48_AVX512F (match_dup 1))
+	  (match_dup 2)))]
+  "operands[2] = CONSTM1_RTX (<MODE>mode);")
+
 (define_expand "<sse2_avx2>_andnot<mode>3"
   [(set (match_operand:VI_AVX2 0 "register_operand")
 	(and:VI_AVX2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100711-6.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
+
+typedef int v16si __attribute__ ((vector_size (64)));
+typedef long long v8di __attribute__((vector_size (64)));
+
+v16si foo_v16si (const int *a)
+{
+    return (__extension__ (v16si) {~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a,
+				   ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a});
+}
+
+v8di foo_v8di (const long long *a)
+{
+    return (__extension__ (v8di) {~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a});
+}
+
+/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0x55, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}" 2 } } */