@@ -4231,12 +4231,12 @@
that might lead to ICE on 32bit target. The sequence unlikely combine
anyway. */
(define_split
- [(set (match_operand:DF 0 "register_operand")
+ [(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "nonimmediate_operand")))]
"TARGET_USE_VECTOR_FP_CONVERTS
&& optimize_insn_for_speed_p ()
- && reload_completed && SSE_REG_P (operands[0])
+ && reload_completed
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
[(set (match_dup 2)
@@ -4253,13 +4253,11 @@
{
/* If it is unsafe to overwrite upper half of source, we need
to move to destination and unpack there. */
- if (((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
- || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
- && true_regnum (operands[0]) != true_regnum (operands[1]))
+ if (REGNO (operands[0]) != REGNO (operands[1])
|| (EXT_REX_SSE_REG_P (operands[1])
&& !TARGET_AVX512VL))
{
- rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+ rtx tmp = lowpart_subreg (SFmode, operands[0], DFmode);
emit_move_insn (tmp, operands[1]);
}
else
@@ -4267,7 +4265,7 @@
/* FIXME: vec_interleave_lowv4sf for AVX512VL should allow
=v, v, then vbroadcastss will be only needed for AVX512F without
AVX512VL. */
- if (!EXT_REX_SSE_REGNO_P (true_regnum (operands[3])))
+ if (!EXT_REX_SSE_REGNO_P (REGNO (operands[3])))
emit_insn (gen_vec_interleave_lowv4sf (operands[3], operands[3],
operands[3]));
else
@@ -4283,15 +4281,14 @@
;; It's more profitable to split and then extend in the same register.
(define_peephole2
- [(set (match_operand:DF 0 "register_operand")
+ [(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "memory_operand")))]
"TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
- && optimize_insn_for_speed_p ()
- && SSE_REG_P (operands[0])"
+ && optimize_insn_for_speed_p ()"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (float_extend:DF (match_dup 2)))]
- "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));")
+ "operands[2] = lowpart_subreg (SFmode, operands[0], DFmode);")
(define_insn "*extendsfdf2"
[(set (match_operand:DF 0 "nonimm_ssenomem_operand" "=f,m,v")
@@ -4390,12 +4387,12 @@
that might lead to ICE on 32bit target. The sequence unlikely combine
anyway. */
(define_split
- [(set (match_operand:SF 0 "register_operand")
+ [(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "nonimmediate_operand")))]
"TARGET_USE_VECTOR_FP_CONVERTS
&& optimize_insn_for_speed_p ()
- && reload_completed && SSE_REG_P (operands[0])
+ && reload_completed
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
[(set (match_dup 2)
@@ -4413,9 +4410,7 @@
if (REG_P (operands[1]))
{
if (!TARGET_SSE3
- && true_regnum (operands[0]) != true_regnum (operands[1])
- && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
- || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+ && REGNO (operands[0]) != REGNO (operands[1]))
{
rtx tmp = lowpart_subreg (DFmode, operands[0], SFmode);
emit_move_insn (tmp, operands[1]);
@@ -4432,15 +4427,14 @@
;; It's more profitable to split and then extend in the same register.
(define_peephole2
- [(set (match_operand:SF 0 "register_operand")
+ [(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "memory_operand")))]
"TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
- && optimize_insn_for_speed_p ()
- && SSE_REG_P (operands[0])"
+ && optimize_insn_for_speed_p ()"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (float_truncate:SF (match_dup 2)))]
- "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));")
+ "operands[2] = lowpart_subreg (DFmode, operands[0], SFmode);")
(define_expand "truncdfsf2_with_temp"
[(parallel [(set (match_operand:SF 0)
@@ -4547,7 +4541,7 @@
"reload_completed"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (match_dup 2))]
- "operands[1] = gen_rtx_REG (SFmode, true_regnum (operands[1]));")
+ "operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));")
;; Conversion from XFmode to {SF,DF}mode
@@ -5153,11 +5147,11 @@
;; slots when !TARGET_INTER_UNIT_MOVES_TO_VEC disables the general_regs
;; alternative in sse2_loadld.
(define_split
- [(set (match_operand:MODEF 0 "register_operand")
+ [(set (match_operand:MODEF 0 "sse_reg_operand")
(float:MODEF (match_operand:SI 1 "nonimmediate_operand")))]
- "TARGET_SSE2 && TARGET_SSE_MATH
- && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
- && reload_completed && SSE_REG_P (operands[0])
+ "TARGET_USE_VECTOR_CONVERTS
+ && optimize_function_for_speed_p (cfun)
+ && reload_completed
&& (MEM_P (operands[1]) || TARGET_INTER_UNIT_MOVES_TO_VEC)
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
@@ -5176,41 +5170,43 @@
DONE;
})
-;; Avoid partial SSE register dependency stalls
+;; Avoid partial SSE register dependency stalls. This splitter should split
+;; late in the pass sequence (after register rename pass), so allocated
+;; registers won't change anymore.
+
(define_split
- [(set (match_operand:MODEF 0 "register_operand")
+ [(set (match_operand:MODEF 0 "sse_reg_operand")
(float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))]
- "TARGET_SSE2 && TARGET_SSE_MATH
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ "TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& optimize_function_for_speed_p (cfun)
- && reload_completed && SSE_REG_P (operands[0])
+ && epilogue_completed
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
- [(const_int 0)]
+ [(set (match_dup 0)
+ (vec_merge:<MODEF:ssevecmode>
+ (vec_duplicate:<MODEF:ssevecmode>
+ (float:MODEF
+ (match_dup 1)))
+ (match_dup 0)
+ (const_int 1)))]
{
const machine_mode vmode = <MODEF:ssevecmode>mode;
- const machine_mode mode = <MODEF:MODE>mode;
- rtx t, op0 = lowpart_subreg (vmode, operands[0], mode);
-
- emit_move_insn (op0, CONST0_RTX (vmode));
- t = gen_rtx_FLOAT (mode, operands[1]);
- t = gen_rtx_VEC_DUPLICATE (vmode, t);
- t = gen_rtx_VEC_MERGE (vmode, t, op0, const1_rtx);
- emit_insn (gen_rtx_SET (op0, t));
- DONE;
+ operands[0] = lowpart_subreg (vmode, operands[0], <MODEF:MODE>mode);
+ emit_move_insn (operands[0], CONST0_RTX (vmode));
})
-;; Break partial reg stall for cvtsd2ss.
+;; Break partial reg stall for cvtsd2ss. This splitter should split
+;; late in the pass sequence (after register rename pass),
+;; so allocated registers won't change anymore.
-(define_peephole2
- [(set (match_operand:SF 0 "register_operand")
+(define_split
+ [(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "nonimmediate_operand")))]
- "TARGET_SSE2 && TARGET_SSE_MATH
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ "TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& optimize_function_for_speed_p (cfun)
- && SSE_REG_P (operands[0])
+ && epilogue_completed
&& (!SSE_REG_P (operands[1])
|| REGNO (operands[0]) != REGNO (operands[1]))
&& (!EXT_REX_SSE_REG_P (operands[0])
@@ -5228,16 +5224,17 @@
emit_move_insn (operands[0], CONST0_RTX (V4SFmode));
})
-;; Break partial reg stall for cvtss2sd.
+;; Break partial reg stall for cvtss2sd. This splitter should split
+;; late in the pass sequence (after register rename pass),
+;; so allocated registers won't change anymore.
-(define_peephole2
- [(set (match_operand:DF 0 "register_operand")
+(define_split
+ [(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "nonimmediate_operand")))]
- "TARGET_SSE2 && TARGET_SSE_MATH
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ "TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& optimize_function_for_speed_p (cfun)
- && SSE_REG_P (operands[0])
+ && epilogue_completed
&& (!SSE_REG_P (operands[1])
|| REGNO (operands[0]) != REGNO (operands[1]))
&& (!EXT_REX_SSE_REG_P (operands[0])