diff mbox

Fix MMX/SSE/AVX* shifts by non-immediate scalar (PR target/80286)

Message ID 20170404150905.GI17461@tucnak
State New
Headers show

Commit Message

Jakub Jelinek April 4, 2017, 3:09 p.m. UTC
On Tue, Apr 04, 2017 at 02:33:24PM +0200, Uros Bizjak wrote:
> > I assume split those before reload.  Because we want to give reload a chance
> > to do the zero extension on GPRs if it is more beneficial, and it might
> > choose to store it into memory and load into XMM from memory and that is
> > hard to do after reload.
> 
> Yes, split before reload, and hope that alternative's decorations play
> well with RA.

Haven't done these splitters yet, just playing now with:
typedef long long __m256i __attribute__ ((__vector_size__ (32), __may_alias__));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A) { return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A); }
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A) { return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0); }
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B) { return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); }
__m256i m;
__m128i foo (__m128i minmax)
{
  int shift = _mm_cvtsi128_si32 (_mm256_castsi256_si128 (m));
  return _mm_srli_epi16 (minmax, shift);
}
to see what it emits (in that case we already have zero extension rather
than sign extension).
> > With ? in front of it or without?  I admit I've only tried so far:
> 
> I'd leave ?* in this case. In my experience, RA allocates alternative
> with ?* only when really needed.

So far I have following, which seems to work fine for the above testcase and
-O2 -m64 -mavx2, but doesn't work for -O2 -m32 -mavx2.
For 64-bit combiner matches the *vec_extractv4si_0_zext pattern and as that
doesn't have ? nor * in the constraint, it is used.
For 32-bit there is no such pattern and we end up with just zero_extendsidi2
pattern and apparently either the ? or * prevent IRA/LRA from using it.
If I remove both ?*, I get nice code even for 32-bit.


	Jakub

Comments

Uros Bizjak April 6, 2017, 7:33 a.m. UTC | #1
On Tue, Apr 4, 2017 at 5:09 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Tue, Apr 04, 2017 at 02:33:24PM +0200, Uros Bizjak wrote:
>> > I assume split those before reload.  Because we want to give reload a chance
>> > to do the zero extension on GPRs if it is more beneficial, and it might
>> > choose to store it into memory and load into XMM from memory and that is
>> > hard to do after reload.
>>
>> Yes, split before reload, and hope that alternative's decorations play
>> well with RA.
>
> Haven't done these splitters yet, just playing now with:
> typedef long long __m256i __attribute__ ((__vector_size__ (32), __may_alias__));
> typedef int __v4si __attribute__ ((__vector_size__ (16)));
> typedef short __v8hi __attribute__ ((__vector_size__ (16)));
> typedef int __v8si __attribute__ ((__vector_size__ (32)));
> typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_castsi256_si128 (__m256i __A) { return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A); }
> extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm_cvtsi128_si32 (__m128i __A) { return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0); }
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm_srli_epi16 (__m128i __A, int __B) { return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); }
> __m256i m;
> __m128i foo (__m128i minmax)
> {
>   int shift = _mm_cvtsi128_si32 (_mm256_castsi256_si128 (m));
>   return _mm_srli_epi16 (minmax, shift);
> }
> to see what it emits (in that case we already have zero extension rather
> than sign extension).
>> > With ? in front of it or without?  I admit I've only tried so far:
>>
>> I'd leave ?* in this case. In my experience, RA allocates alternative
>> with ?* only when really needed.
>
> So far I have following, which seems to work fine for the above testcase and
> -O2 -m64 -mavx2, but doesn't work for -O2 -m32 -mavx2.
> For 64-bit combiner matches the *vec_extractv4si_0_zext pattern and as that
> doesn't have ? nor * in the constraint, it is used.
> For 32-bit there is no such pattern and we end up with just zero_extendsidi2
> pattern and apparently either the ? or * prevent IRA/LRA from using it.
> If I remove both ?*, I get nice code even for 32-bit.

Newly introduced alternatives (x/x) and (v/v) are valid also for
32-bit targets, so we have to adjust insn constraint of
*vec_extractv4si_0_zext and enable alternatives accordingly. After the
adjustment, the pattern will be split to a zero-extend.

With -m32, I get:

(insn 10 8 13 2 (set (reg:SI 98)
        (vec_select:SI (reg:V4SI 95)
            (parallel [
                    (const_int 0 [0])
                ]))) "pr80286.c":9 3663 {*vec_extractv4si_0}
     (expr_list:REG_DEAD (reg:V4SI 95)
        (nil)))
(insn 13 10 14 2 (set (reg:DI 101 [ _7 ])
        (zero_extend:DI (reg:SI 98))) "pr80286.c":11 131 {*zero_extendsidi2}
     (expr_list:REG_DEAD (reg:SI 98)
        (nil)))

and for SSE4+, combine can merge these two patterns to
*vec_extractv4si_0_zext, with the anticipation that pmovzx will be
generated.

Uros.

> --- gcc/config/i386/sse.md.jj   2017-04-04 12:45:08.000000000 +0200
> +++ gcc/config/i386/sse.md      2017-04-04 16:54:58.667382522 +0200
> @@ -13517,16 +13517,17 @@ (define_insn "*vec_extract<ssevecmodelow
>    [(set_attr "isa" "*,sse4,*,*")])
>
>  (define_insn_and_split "*vec_extractv4si_0_zext"
> -  [(set (match_operand:DI 0 "register_operand" "=r")
> +  [(set (match_operand:DI 0 "register_operand" "=r,x,v")
>         (zero_extend:DI
>           (vec_select:SI
> -           (match_operand:V4SI 1 "register_operand" "v")
> +           (match_operand:V4SI 1 "register_operand" "v,x,v")
>             (parallel [(const_int 0)]))))]
>    "TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC"
>    "#"
>    "&& reload_completed"
>    [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
> -  "operands[1] = gen_lowpart (SImode, operands[1]);")
> +  "operands[1] = gen_lowpart (SImode, operands[1]);"
> +  [(set_attr "isa" "*,sse4,avx512f")])
>
>  (define_insn "*vec_extractv2di_0_sse"
>    [(set (match_operand:DI 0 "nonimmediate_operand"     "=v,m")
> --- gcc/config/i386/i386.md.jj  2017-04-03 13:43:50.000000000 +0200
> +++ gcc/config/i386/i386.md     2017-04-04 16:54:09.786014373 +0200
> @@ -3767,10 +3767,10 @@ (define_expand "zero_extendsidi2"
>
>  (define_insn "*zero_extendsidi2"
>    [(set (match_operand:DI 0 "nonimmediate_operand"
> -                       "=r,?r,?o,r   ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,*r")
> +                       "=r,?r,?o,r   ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,*r,?*x,?*v")
>         (zero_extend:DI
>          (match_operand:SI 1 "x86_64_zext_operand"
> -                       "0 ,rm,r ,rmWz,0,r   ,m   ,*Yj,*x,r   ,m  ,*k")))]
> +                       "0 ,rm,r ,rmWz,0,r   ,m   ,*Yj,*x,r   ,m  ,*k,x  ,v")))]
>    ""
>  {
>    switch (get_attr_type (insn))
> @@ -3791,6 +3791,14 @@ (define_insn "*zero_extendsidi2"
>        return "%vpextrd\t{$0, %1, %k0|%k0, %1, 0}";
>
>      case TYPE_SSEMOV:
> +      if (SSE_REG_P (operands[0]) && SSE_REG_P (operands[1]))
> +       {
> +         if (EXT_REX_SSE_REG_P (operands[0])
> +             || EXT_REX_SSE_REG_P (operands[1]))
> +           return "vpmovzxdq\t{%t1, %g0|%g0, %t1}";
> +         else
> +           return "%vpmovzxdq\t{%1, %0|%0, %1}";
> +       }
>        if (GENERAL_REG_P (operands[0]))
>         return "%vmovd\t{%1, %k0|%k0, %1}";
>
> @@ -3814,6 +3822,10 @@ (define_insn "*zero_extendsidi2"
>               (const_string "sse2")
>             (eq_attr "alternative" "11")
>               (const_string "x64_avx512bw")
> +           (eq_attr "alternative" "12")
> +             (const_string "sse4")
> +           (eq_attr "alternative" "13")
> +             (const_string "avx512f")
>            ]
>            (const_string "*")))
>     (set (attr "type")
> @@ -3821,7 +3833,7 @@ (define_insn "*zero_extendsidi2"
>               (const_string "multi")
>             (eq_attr "alternative" "5,6")
>               (const_string "mmxmov")
> -           (eq_attr "alternative" "7,9,10")
> +           (eq_attr "alternative" "7,9,10,12,13")
>               (const_string "ssemov")
>             (eq_attr "alternative" "8")
>               (const_string "sselog1")
> @@ -3830,7 +3842,7 @@ (define_insn "*zero_extendsidi2"
>            ]
>            (const_string "imovx")))
>     (set (attr "prefix_extra")
> -     (if_then_else (eq_attr "alternative" "8")
> +     (if_then_else (eq_attr "alternative" "8,12,13")
>         (const_string "1")
>         (const_string "*")))
>     (set (attr "length_immediate")
> @@ -3848,8 +3860,10 @@ (define_insn "*zero_extendsidi2"
>     (set (attr "mode")
>       (cond [(eq_attr "alternative" "5,6")
>               (const_string "DI")
> -           (eq_attr "alternative" "7,8,9")
> +           (eq_attr "alternative" "7,8,9,12")
>               (const_string "TI")
> +           (eq_attr "alternative" "13")
> +             (const_string "OI")
>            ]
>            (const_string "SI")))])
>
>
>         Jakub
diff mbox

Patch

--- gcc/config/i386/sse.md.jj	2017-04-04 12:45:08.000000000 +0200
+++ gcc/config/i386/sse.md	2017-04-04 16:54:58.667382522 +0200
@@ -13517,16 +13517,17 @@  (define_insn "*vec_extract<ssevecmodelow
   [(set_attr "isa" "*,sse4,*,*")])
 
 (define_insn_and_split "*vec_extractv4si_0_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r")
+  [(set (match_operand:DI 0 "register_operand" "=r,x,v")
 	(zero_extend:DI
 	  (vec_select:SI
-	    (match_operand:V4SI 1 "register_operand" "v")
+	    (match_operand:V4SI 1 "register_operand" "v,x,v")
 	    (parallel [(const_int 0)]))))]
   "TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC"
   "#"
   "&& reload_completed"
   [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
-  "operands[1] = gen_lowpart (SImode, operands[1]);")
+  "operands[1] = gen_lowpart (SImode, operands[1]);"
+  [(set_attr "isa" "*,sse4,avx512f")])
 
 (define_insn "*vec_extractv2di_0_sse"
   [(set (match_operand:DI 0 "nonimmediate_operand"     "=v,m")
--- gcc/config/i386/i386.md.jj	2017-04-03 13:43:50.000000000 +0200
+++ gcc/config/i386/i386.md	2017-04-04 16:54:09.786014373 +0200
@@ -3767,10 +3767,10 @@  (define_expand "zero_extendsidi2"
 
 (define_insn "*zero_extendsidi2"
   [(set (match_operand:DI 0 "nonimmediate_operand"
-			"=r,?r,?o,r   ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,*r")
+			"=r,?r,?o,r   ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,*r,?*x,?*v")
 	(zero_extend:DI
 	 (match_operand:SI 1 "x86_64_zext_operand"
-	        	"0 ,rm,r ,rmWz,0,r   ,m   ,*Yj,*x,r   ,m  ,*k")))]
+	        	"0 ,rm,r ,rmWz,0,r   ,m   ,*Yj,*x,r   ,m  ,*k,x  ,v")))]
   ""
 {
   switch (get_attr_type (insn))
@@ -3791,6 +3791,14 @@  (define_insn "*zero_extendsidi2"
       return "%vpextrd\t{$0, %1, %k0|%k0, %1, 0}";
 
     case TYPE_SSEMOV:
+      if (SSE_REG_P (operands[0]) && SSE_REG_P (operands[1]))
+	{
+	  if (EXT_REX_SSE_REG_P (operands[0])
+	      || EXT_REX_SSE_REG_P (operands[1]))
+	    return "vpmovzxdq\t{%t1, %g0|%g0, %t1}";
+	  else
+	    return "%vpmovzxdq\t{%1, %0|%0, %1}";
+	}
       if (GENERAL_REG_P (operands[0]))
 	return "%vmovd\t{%1, %k0|%k0, %1}";
 
@@ -3814,6 +3822,10 @@  (define_insn "*zero_extendsidi2"
 	      (const_string "sse2")
 	    (eq_attr "alternative" "11")
 	      (const_string "x64_avx512bw")
+	    (eq_attr "alternative" "12")
+	      (const_string "sse4")
+	    (eq_attr "alternative" "13")
+	      (const_string "avx512f")
 	   ]
 	   (const_string "*")))
    (set (attr "type")
@@ -3821,7 +3833,7 @@  (define_insn "*zero_extendsidi2"
 	      (const_string "multi")
 	    (eq_attr "alternative" "5,6")
 	      (const_string "mmxmov")
-	    (eq_attr "alternative" "7,9,10")
+	    (eq_attr "alternative" "7,9,10,12,13")
 	      (const_string "ssemov")
 	    (eq_attr "alternative" "8")
 	      (const_string "sselog1")
@@ -3830,7 +3842,7 @@  (define_insn "*zero_extendsidi2"
 	   ]
 	   (const_string "imovx")))
    (set (attr "prefix_extra")
-     (if_then_else (eq_attr "alternative" "8")
+     (if_then_else (eq_attr "alternative" "8,12,13")
        (const_string "1")
        (const_string "*")))
    (set (attr "length_immediate")
@@ -3848,8 +3860,10 @@  (define_insn "*zero_extendsidi2"
    (set (attr "mode")
      (cond [(eq_attr "alternative" "5,6")
 	      (const_string "DI")
-	    (eq_attr "alternative" "7,8,9")
+	    (eq_attr "alternative" "7,8,9,12")
 	      (const_string "TI")
+	    (eq_attr "alternative" "13")
+	      (const_string "OI")
 	   ]
 	   (const_string "SI")))])