Fix non-AVX512VL handling of lo extraction from AVX512F xmm16+ (PR target/85328, take 2)

Message ID 20180411185945.GB8577@tucnak
State New
Headers show
Series
  • Fix non-AVX512VL handling of lo extraction from AVX512F xmm16+ (PR target/85328, take 2)
Related show

Commit Message

Jakub Jelinek April 11, 2018, 6:59 p.m.
On Wed, Apr 11, 2018 at 03:27:28PM +0200, Jakub Jelinek wrote:
> In lots of patterns we assume that we never see xmm16+ hard registers
> with 128-bit and 256-bit vector modes when not -mavx512vl, because
> HARD_REGNO_MODE_OK refuses those.
> Unfortunately, as this testcase and patch shows, the vec_extract_lo*
> splitters work as a loophole around this, we happily create instructions
> like (set (reg:V32QI xmm5) (reg:V32QI xmm16)) and then hard register
> propagation can propagate the V32QI xmm16 into other insns like vpand.
> 
> The following patch fixes it by making sure we never create such registers,
> just emit (set (reg:V64QI xmm5) (reg:V64QI xmm16)) instead, which by copying
> all the 512 bits also copies the low bits, and as the destination is
> originally V32QI which is not HARD_REGNO_MODE_OK in xmm16+, this should be
> fine.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Actually, thinking about it more (not that I have managed to come up with a
testcase), if output is a MEM and input is xmm16+, then we really need to
give up in the splitters and instead emit the v*extract* instructions,
because simple vmovdqa and vmovap[sd] require AVX512VL for the EVEX
encodings.

So, here is an updated patch, bootstrapped/regtested on x86_64-linux and
i686-linux, is this one ok for trunk instead?

Tried e.g.
#include <x86intrin.h>

__m256d f1 (__m512d x) { register __m512d a __asm ("zmm16"); __asm ("" : "=v" (a) : "0" (x)); return _mm512_extractf64x4_pd (a, 0); }
void f2 (__m256d *p, __m512d x) { register __m512d a __asm ("zmm16"); __asm ("" : "=v" (a) : "0" (x)); *p = _mm512_extractf64x4_pd (a, 0); }
__m256d f3 (__m512d x, __m256d y) { register __m512d a __asm ("zmm16"); __asm ("" : "=v" (a) : "0" (x)); return y + _mm512_extractf64x4_pd (a, 0); }
__m128 f4 (__m512 x) { register __m512 a __asm ("zmm16"); __asm ("" : "=v" (a) : "0" (x)); return _mm512_extractf32x4_ps (a, 0); }
void f5 (__m128 *p, __m512 x) { register __m512 a __asm ("zmm16"); __asm ("" : "=v" (a) : "0" (x)); *p = _mm512_extractf32x4_ps (a, 0); }
__m128 f6 (__m512 x, __m128 y) { register __m512 a __asm ("zmm16"); __asm ("" : "=v" (a) : "0" (x)); return y + _mm512_extractf32x4_ps (a, 0); }
__m256i f7 (__m512i x) { register __m512i a __asm ("zmm16"); __asm ("" : "=v" (a) : "0" (x)); return _mm512_extracti64x4_epi64 (a, 0); }
void f8 (__m256i *p, __m512i x) { register __m512i a __asm ("zmm16"); __asm ("" : "=v" (a) : "0" (x)); *p = _mm512_extracti64x4_epi64 (a, 0); }
__m256i f9 (__m512i x, __m256i y) { register __m512i a __asm ("zmm16"); __asm ("" : "=v" (a) : "0" (x)); return y + _mm512_extracti64x4_epi64 (a, 0); }
__m128i f10 (__m512i x) { register __m512i a __asm ("zmm16"); __asm ("" : "=v" (a) : "0" (x)); return _mm512_extracti32x4_epi32 (a, 0); }
void f11 (__m128i *p, __m512i x) { register __m512i a __asm ("zmm16"); __asm ("" : "=v" (a) : "0" (x)); *p = _mm512_extracti32x4_epi32 (a, 0); }
__m128i f12 (__m512i x, __m128i y) { register __m512i a __asm ("zmm16"); __asm ("" : "=v" (a) : "0" (x)); return y + _mm512_extracti32x4_epi32 (a, 0); }
but couldn't reproduce though.

2018-04-11  Jakub Jelinek  <jakub@redhat.com>

	PR target/85328
	* config/i386/sse.md
	(<mask_codefor>avx512dq_vextract<shuffletype>64x2_1<mask_name> split,
	<mask_codefor>avx512f_vextract<shuffletype>32x4_1<mask_name> split,
	vec_extract_lo_<mode><mask_name> split, vec_extract_lo_v32hi,
	vec_extract_lo_v64qi): For non-AVX512VL if input is xmm16+ reg
	and output is a reg, avoid creating invalid lowpart subreg, but
	instead split into a 512-bit move.  Don't split if not AVX512VL,
	input is xmm16+ reg and output is a mem.
	(vec_extract_lo_<mode><mask_name>, vec_extract_lo_v32hi,
	vec_extract_lo_v64qi): Don't require split if not AVX512VL, input is
	xmm16+ reg and output is a mem.

	* gcc.target/i386/pr85328.c: New test.



	Jakub

Patch

--- gcc/config/i386/sse.md.jj	2018-04-11 13:36:29.368015262 +0200
+++ gcc/config/i386/sse.md	2018-04-11 17:15:56.175746606 +0200
@@ -7361,9 +7361,21 @@  (define_split
 	(vec_select:<ssequartermode>
 	  (match_operand:V8FI 1 "register_operand")
 	  (parallel [(const_int 0) (const_int 1)])))]
-  "TARGET_AVX512DQ && reload_completed"
+  "TARGET_AVX512DQ
+   && reload_completed
+   && (TARGET_AVX512VL
+       || REG_P (operands[0])
+       || !EXT_REX_SSE_REG_P (operands[1]))"
   [(set (match_dup 0) (match_dup 1))]
-  "operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);")
+{
+  if (!TARGET_AVX512VL
+      && REG_P (operands[0])
+      && EXT_REX_SSE_REG_P (operands[1]))
+    operands[0]
+      = lowpart_subreg (<MODE>mode, operands[0], <ssequartermode>mode);
+  else
+    operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);
+})
 
 (define_insn "<mask_codefor>avx512f_vextract<shuffletype>32x4_1<mask_name>"
   [(set (match_operand:<ssequartermode> 0 "<store_mask_predicate>" "=<store_mask_constraint>")
@@ -7394,9 +7406,21 @@  (define_split
 	  (match_operand:V16FI 1 "register_operand")
 	  (parallel [(const_int 0) (const_int 1)
 		     (const_int 2) (const_int 3)])))]
-  "TARGET_AVX512F && reload_completed"
+  "TARGET_AVX512F
+   && reload_completed
+   && (TARGET_AVX512VL
+       || REG_P (operands[0])
+       || !EXT_REX_SSE_REG_P (operands[1]))"
   [(set (match_dup 0) (match_dup 1))]
-  "operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);")
+{
+  if (!TARGET_AVX512VL
+      && REG_P (operands[0])
+      && EXT_REX_SSE_REG_P (operands[1]))
+    operands[0]
+      = lowpart_subreg (<MODE>mode, operands[0], <ssequartermode>mode);
+  else
+    operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);
+})
 
 (define_mode_attr extract_type_2
   [(V16SF "avx512dq") (V16SI "avx512dq") (V8DF "avx512f") (V8DI "avx512f")])
@@ -7639,7 +7663,10 @@  (define_insn "vec_extract_lo_<mode><mask
    && <mask_mode512bit_condition>
    && (<mask_applied> || !(MEM_P (operands[0]) && MEM_P (operands[1])))"
 {
-  if (<mask_applied>)
+  if (<mask_applied>
+      || (!TARGET_AVX512VL
+	  && !REG_P (operands[0])
+	  && EXT_REX_SSE_REG_P (operands[1])))
     return "vextract<shuffletype>32x8\t{$0x0, %1, %0<mask_operand2>|%0<mask_operand2>, %1, 0x0}";
   else
     return "#";
@@ -7654,9 +7681,20 @@  (define_split
 	    (const_int 4) (const_int 5)
 	    (const_int 6) (const_int 7)])))]
   "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))
-   && reload_completed"
+   && reload_completed
+   && (TARGET_AVX512VL
+       || REG_P (operands[0])
+       || !EXT_REX_SSE_REG_P (operands[1]))"
   [(set (match_dup 0) (match_dup 1))]
-  "operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);")
+{
+  if (!TARGET_AVX512VL
+      && REG_P (operands[0])
+      && EXT_REX_SSE_REG_P (operands[1]))
+    operands[0]
+      = lowpart_subreg (<MODE>mode, operands[0], <ssehalfvecmode>mode);
+  else
+    operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
+})
 
 (define_insn "vec_extract_lo_<mode><mask_name>"
   [(set (match_operand:<ssehalfvecmode> 0 "<store_mask_predicate>" "=v,m")
@@ -7828,10 +7866,27 @@  (define_insn_and_split "vec_extract_lo_v
 		     (const_int 12) (const_int 13)
 		     (const_int 14) (const_int 15)])))]
   "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
-  "#"
-  "&& reload_completed"
+{
+  if (TARGET_AVX512VL
+      || REG_P (operands[0])
+      || !EXT_REX_SSE_REG_P (operands[1]))
+    return "#";
+  else
+    return "vextracti64x4\t{$0x0, %1, %0|%0, %1, 0x0}";
+}
+  "&& reload_completed
+   && (TARGET_AVX512VL
+       || REG_P (operands[0])
+       || !EXT_REX_SSE_REG_P (operands[1]))"
   [(set (match_dup 0) (match_dup 1))]
-  "operands[1] = gen_lowpart (V16HImode, operands[1]);")
+{
+  if (!TARGET_AVX512VL
+      && REG_P (operands[0])
+      && EXT_REX_SSE_REG_P (operands[1]))
+    operands[0] = lowpart_subreg (V32HImode, operands[0], V16HImode);
+  else
+    operands[1] = gen_lowpart (V16HImode, operands[1]);
+})
 
 (define_insn "vec_extract_hi_v32hi"
   [(set (match_operand:V16HI 0 "nonimmediate_operand" "=v,m")
@@ -7913,10 +7968,27 @@  (define_insn_and_split "vec_extract_lo_v
 		     (const_int 28) (const_int 29)
 		     (const_int 30) (const_int 31)])))]
   "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
-  "#"
-  "&& reload_completed"
+{
+  if (TARGET_AVX512VL
+      || REG_P (operands[0])
+      || !EXT_REX_SSE_REG_P (operands[1]))
+    return "#";
+  else
+    return "vextracti64x4\t{$0x0, %1, %0|%0, %1, 0x0}";
+}
+  "&& reload_completed
+   && (TARGET_AVX512VL
+       || REG_P (operands[0])
+       || !EXT_REX_SSE_REG_P (operands[1]))"
   [(set (match_dup 0) (match_dup 1))]
-  "operands[1] = gen_lowpart (V32QImode, operands[1]);")
+{
+  if (!TARGET_AVX512VL
+      && REG_P (operands[0])
+      && EXT_REX_SSE_REG_P (operands[1]))
+    operands[0] = lowpart_subreg (V64QImode, operands[0], V32QImode);
+  else
+    operands[1] = gen_lowpart (V32QImode, operands[1]);
+})
 
 (define_insn "vec_extract_hi_v64qi"
   [(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m")
--- gcc/testsuite/gcc.target/i386/pr85328.c.jj	2018-04-11 16:41:49.769327148 +0200
+++ gcc/testsuite/gcc.target/i386/pr85328.c	2018-04-11 16:41:49.769327148 +0200
@@ -0,0 +1,18 @@ 
+/* PR target/85328 */
+/* { dg-do assemble { target avx512f } } */
+/* { dg-options "-O3 -fno-caller-saves -mavx512f" } */
+
+typedef char U __attribute__((vector_size (64)));
+typedef int V __attribute__((vector_size (64)));
+U a, b;
+
+extern void bar (void);
+
+V
+foo (V f)
+{
+  b <<= (U){(V){}[63]} & 7;
+  bar ();
+  a = (U)f & 7;
+  return (V)b;
+}