diff mbox series

[v3,02/12] x86: Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE

Message ID 20210517131527.3053833-3-hjl.tools@gmail.com
State New
Headers show
Series [v3,01/12] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE | expand

Commit Message

H.J. Lu May 17, 2021, 1:15 p.m. UTC
1. Make ix86_expand_vector_init_duplicate global to duplicate QImode
value to TImode/OImode/XImode.
2. Make ix86_minimum_incoming_stack_boundary global and add an argument
to ignore stack_alignment_estimated.
3. Define SCRATCH_SSE_REG as a scratch register for ix86_gen_memset_value.
4. Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
target instructions to duplicate QImode value to TImode/OImode/XImode
value for memset.

gcc/

	PR middle-end/90773
	* config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
	Make it global.
	* config/i386/i386-protos.h (ix86_minimum_incoming_stack_boundary):
	New.
	(ix86_expand_vector_init_duplicate): Likewise.
	* config/i386/i386.c (ix86_minimum_incoming_stack_boundary): Add
	an argument to ignore stack_alignment_estimated.  It is passed
	as false by default.  Make it global.
	(ix86_gen_memset_value_from_prev): New function.
	(ix86_gen_memset_value): Likewise.
	(ix86_read_memset_value): Likewise.
	(TARGET_GEN_MEMSET_VALUE): New.
	(TARGET_READ_MEMSET_VALUE): Likewise.
	* config/i386/i386.h (SCRATCH_SSE_REG): New.

gcc/testsuite/

	PR middle-end/90773
	* gcc.target/i386/pr90773-15.c: New test.
	* gcc.target/i386/pr90773-16.c: Likewise.
	* gcc.target/i386/pr90773-17.c: Likewise.
	* gcc.target/i386/pr90773-18.c: Likewise.
	* gcc.target/i386/pr90773-19.c: Likewise.
---
 gcc/config/i386/i386-expand.c              |   2 +-
 gcc/config/i386/i386-protos.h              |   5 +
 gcc/config/i386/i386.c                     | 268 ++++++++++++++++++++-
 gcc/config/i386/i386.h                     |   4 +
 gcc/testsuite/gcc.target/i386/pr90773-15.c |  14 ++
 gcc/testsuite/gcc.target/i386/pr90773-16.c |  14 ++
 gcc/testsuite/gcc.target/i386/pr90773-17.c |  14 ++
 gcc/testsuite/gcc.target/i386/pr90773-18.c |  15 ++
 gcc/testsuite/gcc.target/i386/pr90773-19.c |  14 ++
 9 files changed, 345 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-19.c
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 0fa8d45a684..485825b3c15 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -13648,7 +13648,7 @@  static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
    with all elements equal to VAR.  Return true if successful.  */
 
-static bool
+bool
 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
 				   rtx target, rtx val)
 {
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 7782cf1163f..c4896c2da74 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -50,6 +50,9 @@  extern void ix86_reset_previous_fndecl (void);
 
 extern bool ix86_using_red_zone (void);
 
+extern unsigned int ix86_minimum_incoming_stack_boundary (bool,
+							  bool = false);
+
 extern unsigned int ix86_regmode_natural_size (machine_mode);
 #ifdef RTX_CODE
 extern int standard_80387_constant_p (rtx);
@@ -257,6 +260,8 @@  extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_abs (rtx, rtx);
+extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
+					       rtx);
 
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 6a1f5746089..8b9b2346478 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -415,7 +415,6 @@  static unsigned int split_stack_prologue_scratch_regno (void);
 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
 
 static bool ix86_can_inline_p (tree, tree);
-static unsigned int ix86_minimum_incoming_stack_boundary (bool);
 
 
 /* Whether -mtune= or -march= were specified */
@@ -7232,8 +7231,9 @@  find_drap_reg (void)
 
 /* Return minimum incoming stack alignment.  */
 
-static unsigned int
-ix86_minimum_incoming_stack_boundary (bool sibcall)
+unsigned int
+ix86_minimum_incoming_stack_boundary (bool sibcall,
+				      bool ignore_estimated)
 {
   unsigned int incoming_stack_boundary;
 
@@ -7248,7 +7248,8 @@  ix86_minimum_incoming_stack_boundary (bool sibcall)
      estimated stack alignment is 128bit.  */
   else if (!sibcall
 	   && ix86_force_align_arg_pointer
-	   && crtl->stack_alignment_estimated == 128)
+	   && (ignore_estimated
+	       || crtl->stack_alignment_estimated == 128))
     incoming_stack_boundary = MIN_STACK_BOUNDARY;
   else
     incoming_stack_boundary = ix86_default_incoming_stack_boundary;
@@ -23052,6 +23053,259 @@  ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
     }
 }
 
+/* Return the RTL for memset in MODE from PREVP, or nullptr if none.  */
+
+static rtx
+ix86_gen_memset_value_from_prev (by_pieces_prev *prevp,
+				 scalar_int_mode mode)
+{
+  rtx prev = prevp->data;
+
+  /* Use the previous data in the same mode.  */
+  if (prevp->mode == mode)
+    return prev;
+
+  machine_mode prev_mode = prevp->mode;
+  size_t size = GET_MODE_SIZE (prev_mode);
+
+  /* NB: Skip if the previous value is 1 byte or less.  CONST_WIDE_INT
+     is in VOIDmode whose size is 0.  */
+  if (size <= 1)
+    return nullptr;
+
+  rtx reg, reg_ti;
+  switch (size)
+    {
+    default:
+      gcc_unreachable ();
+
+    case 2:
+    case 4:
+      return simplify_gen_subreg (mode, prev, prev_mode, 0);
+
+    case 8:
+      /* In 64-bit mode, use SUBREG since word size is 8 bytes.  */
+      if (TARGET_64BIT)
+	return simplify_gen_subreg (mode, prev, prev_mode, 0);
+
+      switch (GET_MODE_SIZE (mode))
+	{
+	default:
+	  gcc_unreachable ();
+	case 2:
+	case 4:
+do_hi_si_mode:
+	  /* Copy the SImode part of the 8-byte (32-bit mode) or 16-byte
+	     previous value into an SImode register first.  */
+	  reg = gen_reg_rtx (SImode);
+	  emit_move_insn (reg,
+			  simplify_gen_subreg (SImode, prev,
+					       prev_mode, 0));
+	  return simplify_gen_subreg (mode, reg, SImode, 0);
+	}
+      break;
+
+    case 16:
+      switch (GET_MODE_SIZE (mode))
+	{
+	default:
+	  gcc_unreachable ();
+	case 2:
+	case 4:
+	  /* Extract the value from a 16-byte vector register into
+	     an integer register first.  */
+	  goto do_hi_si_mode;
+	case 8:
+	  return simplify_gen_subreg (mode, prev, prev_mode, 0);
+	case 16:
+	  return prev;
+	}
+      break;
+
+    case 32:
+      switch (GET_MODE_SIZE (mode))
+	{
+	default:
+	  gcc_unreachable ();
+	case 2:
+do_himode:
+	  /* Extract the value from a 32-byte or 64-byte vector register
+	     into a 16-byte vector register first.  */
+	  reg_ti = gen_reg_rtx (TImode);
+	  emit_move_insn (reg_ti,
+			  simplify_gen_subreg (TImode, prev,
+					       prev_mode, 0));
+	  /* Then extract the value from a 16-byte vector register
+	     into an integer register.  */
+	  reg = gen_reg_rtx (SImode);
+	  emit_move_insn (reg,
+			  simplify_gen_subreg (SImode, reg_ti,
+					       TImode, 0));
+	  return simplify_gen_subreg (mode, reg, SImode, 0);
+
+	case 4:
+	case 8:
+do_si_di_mode:
+	  /* Extract the value from a 32-byte or 64-byte vector register
+	     into a 16-byte vector register first.  */
+	  reg_ti = gen_reg_rtx (TImode);
+	  emit_move_insn (reg_ti,
+			  simplify_gen_subreg (TImode, prev,
+					       prev_mode, 0));
+	  /* Generate 4/8-byte SSE -> INT move instruction.  */
+	  reg = gen_reg_rtx (mode);
+	  emit_move_insn (reg,
+			  simplify_gen_subreg (mode, reg_ti,
+					       TImode, 0));
+	  return reg;
+	case 16:
+	  return simplify_gen_subreg (mode, prev, prev_mode, 0);
+	case 32:
+	  return prev;
+	}
+
+    case 64:
+      switch (GET_MODE_SIZE (mode))
+	{
+	default:
+	  gcc_unreachable ();
+	case 2:
+	  /* Extract the value from a 64-byte vector register into
+	     a 16-byte vector register first.  */
+	  goto do_himode;
+	case 4:
+	case 8:
+	  /* Extract the value from a 64-byte vector register into
+	     a 16-byte vector register first.  */
+	  goto do_si_di_mode;
+	case 16:
+	case 32:
+	  return simplify_gen_subreg (mode, prev, prev_mode, 0);
+	case 64:
+	  return prev;
+	}
+    }
+
+  return nullptr;
+}
+
+/* Implement the TARGET_GEN_MEMSET_VALUE hook.  PREVP may be null.  */
+
+static rtx
+ix86_gen_memset_value (rtx data, void *prevp, scalar_int_mode mode)
+{
+  /* Don't use the previous value if size is 1.  */
+  if (GET_MODE_SIZE (mode) == 1)
+    return data;
+
+  by_pieces_prev *prev = (by_pieces_prev *) prevp;
+  if (prev != nullptr && prev->data != nullptr)
+    {
+      rtx value = ix86_gen_memset_value_from_prev (prev, mode);
+      if (value)
+	return value;
+    }
+
+  /* Use default_gen_memset_value if vector store won't be used.  */
+  if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (DImode))
+    return default_gen_memset_value (data, prevp, mode);
+
+  rtx one, target;
+  scalar_mode one_mode;
+
+  unsigned int incoming_stack_boundary
+    = ix86_minimum_incoming_stack_boundary (false, true);
+
+  switch (GET_MODE_SIZE (mode))
+    {
+    default:
+      gcc_unreachable ();
+
+    case 64:
+      if (!TARGET_AVX512BW)
+	{
+	  rtx tmp;
+	  /* NB: If V32QImode is more aligned than the incoming stack,
+	     use the scratch SSE register so alignment isn't increased.  */
+	  if (GET_MODE_ALIGNMENT (V32QImode) > incoming_stack_boundary)
+	    tmp = gen_rtx_REG (V32QImode, SCRATCH_SSE_REG);
+	  else
+	    tmp = gen_reg_rtx (V32QImode);
+	  if (!ix86_expand_vector_init_duplicate (false, V32QImode,
+						  tmp, data))
+	    gcc_unreachable ();
+	  target = gen_rtx_VEC_CONCAT (V64QImode, tmp, tmp);
+	  if (REGNO (tmp) == SCRATCH_SSE_REG)
+	    {
+	      tmp = gen_rtx_REG (V64QImode, SCRATCH_SSE_REG);
+	      emit_move_insn (tmp, target);
+	      return gen_rtx_REG (mode, SCRATCH_SSE_REG);
+	    }
+	  else
+	    return convert_to_mode (mode, target, 1);
+	}
+      /* FALLTHRU */
+    case 16:
+    case 32:
+      one_mode = QImode;
+      one = data;
+      break;
+    }
+
+  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (one_mode);
+  machine_mode vector_mode;
+  if (!mode_for_vector (one_mode, nunits).exists (&vector_mode))
+    gcc_unreachable ();
+
+  /* NB: If the vector mode needs more alignment than the incoming stack,
+     use the scratch SSE register so stack alignment isn't increased.  */
+  if (GET_MODE_ALIGNMENT (vector_mode) > incoming_stack_boundary)
+    target = gen_rtx_REG (vector_mode, SCRATCH_SSE_REG);
+  else
+    target = gen_reg_rtx (vector_mode);
+  if (!ix86_expand_vector_init_duplicate (false, vector_mode, target,
+					  one))
+    gcc_unreachable ();
+
+  if (REGNO (target) == SCRATCH_SSE_REG)
+    return gen_rtx_REG (mode, SCRATCH_SSE_REG);
+  else
+    return convert_to_mode (mode, target, 1);
+}
+
+/* Implement the TARGET_READ_MEMSET_VALUE hook.  PREVP may be null.  */
+
+static rtx
+ix86_read_memset_value (const char *str, void *prevp,
+			scalar_int_mode mode)
+{
+  rtx value;
+
+  by_pieces_prev *prev = (by_pieces_prev *) prevp;
+  if (prev != nullptr && prev->data != nullptr)
+    {
+      /* Don't use the previous value if size is 1.  */
+      if (GET_MODE_SIZE (mode) == 1)
+	return default_read_memset_value (str, nullptr, mode);
+
+      value = ix86_gen_memset_value_from_prev (prev, mode);
+      if (value)
+	return value;
+
+      return default_read_memset_value (str, nullptr, mode);
+    }
+
+  /* Use default_read_memset_value if vector store can't be used.
+     NB: Need AVX2 for fast vector duplication and gen_reg_rtx.  */
+  if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (DImode)
+      || !TARGET_AVX2
+      || !reg_rtx_no)
+   return default_read_memset_value (str, nullptr, mode);
+
+  value = default_read_memset_value (str, nullptr, QImode);
+  return ix86_gen_memset_value (value, nullptr, mode);
+}
+
 /* Address space support.
 
    This is not "far pointers" in the 16-bit sense, but an easy way
@@ -23953,6 +24207,12 @@  static bool ix86_libc_has_fast_function (int fcode ATTRIBUTE_UNUSED)
 #undef TARGET_LIBC_HAS_FAST_FUNCTION
 #define TARGET_LIBC_HAS_FAST_FUNCTION ix86_libc_has_fast_function
 
+#undef TARGET_GEN_MEMSET_VALUE
+#define TARGET_GEN_MEMSET_VALUE ix86_gen_memset_value
+
+#undef TARGET_READ_MEMSET_VALUE
+#define TARGET_READ_MEMSET_VALUE ix86_read_memset_value
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 97d6f3863cb..45d86802c51 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1131,6 +1131,10 @@  extern const char *host_detect_local_cpu (int argc, const char **argv);
 #define FIRST_MASK_REG  MASK0_REG
 #define LAST_MASK_REG   MASK7_REG
 
+/* A scratch vector reg, used by ix86_gen_memset_value.  */
+#define SCRATCH_SSE_REG \
+  (TARGET_64BIT ? LAST_REX_SSE_REG : LAST_SSE_REG)
+
 /* Override this in other tm.h files to cope with various OS lossage
    requiring a frame pointer.  */
 #ifndef SUBTARGET_FRAME_POINTER_REQUIRED
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-15.c b/gcc/testsuite/gcc.target/i386/pr90773-15.c
new file mode 100644
index 00000000000..c0a96fed892
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-15.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (int c)
+{
+  __builtin_memset (dst, c, 17);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-16.c b/gcc/testsuite/gcc.target/i386/pr90773-16.c
new file mode 100644
index 00000000000..d2d1ec6141c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-16.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 17);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+\\\$-1, 16\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-17.c b/gcc/testsuite/gcc.target/i386/pr90773-17.c
new file mode 100644
index 00000000000..6c8da7d24ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-17.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 12, 19);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovd\[\\t \]+%xmm\[0-9\]+, 15\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-18.c b/gcc/testsuite/gcc.target/i386/pr90773-18.c
new file mode 100644
index 00000000000..b0687abbe01
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-18.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 12, 9);
+}
+
+/* { dg-final { scan-assembler-times "movabsq\[\\t \]+\\\$868082074056920076, %r" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, \\(%\[\^,\]+\\)" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, 4\\(%\[\^,\]+\\)" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+\\\$12, 8\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-19.c b/gcc/testsuite/gcc.target/i386/pr90773-19.c
new file mode 100644
index 00000000000..8aa5540bacc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-19.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 12, 9);
+}
+
+/* { dg-final { scan-assembler-times "movabsq\[\\t \]+\\\$868082074056920076, %r" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, \\(%\[\^,\]+\\)" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, 4\\(%\[\^,\]+\\)" 1 { target ia32 } } } */