@@ -177,7 +177,7 @@ extern void ix86_expand_trunc (rtx, rtx);
extern void ix86_expand_truncdf_32 (rtx, rtx);
#ifdef TREE_CODE
-extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree);
+extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
#endif /* TREE_CODE */
#endif /* RTX_CODE */
@@ -56,6 +56,305 @@ along with GCC; see the file COPYING3. If not see
#include "debug.h"
#include "dwarf2out.h"
#include "sched-int.h"
+#include "tree-pass.h"
+
+typedef struct block_info_def
+{
+ /* TRUE if the upper 128bits of any AVX registers are live at exit. */
+ bool upper_128bits_live;
+ /* TRUE if block has been processed. */
+ bool done;
+} *block_info;
+
+#define BLOCK_INFO(B) ((block_info) (B)->aux)
+
+/* Callee returns 256bit AVX register. */
+#define RTX_VZEROUPPER_CALLEE_RETURN_AVX256 const1_rtx
+/* Callee returns and passes 256bit AVX register. */
+#define RTX_VZEROUPPER_CALLEE_RETURN_PASS_AVX256 constm1_rtx
+/* Callee passes 256bit AVX register. */
+#define RTX_VZEROUPPER_CALLEE_PASS_AVX256 const0_rtx
+/* Callee doesn't return nor passe 256bit AVX register, or no
+ 256bit AVX register in function return. */
+#define RTX_VZEROUPPER_NO_AVX256 const2_rtx
+
+/* Check if a 256bit AVX register is referenced in stores. */
+
+static void
+check_avx256_stores (rtx dest, const_rtx set, void *data)
+{
+ if ((REG_P (dest)
+ && VALID_AVX256_REG_MODE (GET_MODE (dest)))
+ || (GET_CODE (set) == SET
+ && REG_P (SET_SRC (set))
+ && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
+ {
+ bool *upper_128bits_live = (bool *) data;
+ *upper_128bits_live = true;
+ }
+}
+
+/* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
+ in CURR_BLOCK. Delete it if upper 128bit AVX registers are unused.
+ If it isn't deleted, move it to just before a jump insn.
+
+ UPPER_128BITS_LIVE is TRUE if the upper 128bits of any AVX registers
+ are live at entry. */
+
+static void
+move_or_delete_vzeroupper_2 (basic_block curr_block,
+ bool upper_128bits_live)
+{
+ rtx curr_insn, next_insn, prev_insn, insn;
+
+ if (dump_file)
+ fprintf (dump_file, " BB [%i] entry: upper 128bits: %d\n",
+ curr_block->index, upper_128bits_live);
+
+ for (curr_insn = BB_HEAD (curr_block);
+ curr_insn && curr_insn != NEXT_INSN (BB_END (curr_block));
+ curr_insn = next_insn)
+ {
+ rtx avx256;
+
+ next_insn = NEXT_INSN (curr_insn);
+
+ if (!NONDEBUG_INSN_P (curr_insn))
+ continue;
+
+ /* Search for vzeroupper. */
+ insn = PATTERN (curr_insn);
+ if (GET_CODE (insn) != UNSPEC_VOLATILE
+ || XINT (insn, 1) != UNSPECV_VZEROUPPER_NOP)
+ {
+ /* Check vzeroall/zeroupper intrinsics. */
+ if (GET_CODE (insn) == PARALLEL
+ && GET_CODE (XVECEXP (insn, 0, 0)) == UNSPEC_VOLATILE
+ && (XINT (XVECEXP (insn, 0, 0), 1) == UNSPECV_VZEROUPPER
+ || XINT (XVECEXP (insn, 0, 0), 1) == UNSPECV_VZEROALL))
+ {
+ if (upper_128bits_live
+ || XINT (XVECEXP (insn, 0, 0), 1) == UNSPECV_VZEROALL)
+ upper_128bits_live = false;
+ else
+ {
+ /* Remove zeroupper intrinsic if upper 128bits are
+ known dead. */
+ if (dump_file)
+ {
+ fprintf (dump_file,
+ "Delete redundant vzeroupper intrinsic:\n");
+ print_rtl_single (dump_file, curr_insn);
+ }
+ delete_insn (curr_insn);
+ }
+ }
+ else if (!upper_128bits_live)
+ {
+ /* Check if upper 128bits of AVX registers are used. */
+ note_stores (insn, check_avx256_stores,
+ &upper_128bits_live);
+ }
+ continue;
+ }
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Found vzeroupper:\n");
+ print_rtl_single (dump_file, curr_insn);
+ }
+
+ avx256 = XVECEXP (insn, 0, 0);
+
+ if (!upper_128bits_live)
+ {
+ /* Since the upper 128bits are dead, callee must not pass
+ 256bit AVX register. We only need to check if callee
+ returns 256bit AVX register. */
+ upper_128bits_live
+ = avx256 == RTX_VZEROUPPER_CALLEE_RETURN_AVX256;
+
+ /* Remove unnecessary vzeroupper since upper 128bits are
+ dead. */
+ if (dump_file)
+ {
+ fprintf (dump_file, "Delete redundant vzeroupper:\n");
+ print_rtl_single (dump_file, curr_insn);
+ }
+ delete_insn (curr_insn);
+ continue;
+ }
+ else if (avx256 == RTX_VZEROUPPER_CALLEE_RETURN_PASS_AVX256
+ || avx256 == RTX_VZEROUPPER_CALLEE_PASS_AVX256)
+ {
+ /* Callee passes 256bit AVX register. Check if callee
+ returns 256bit AVX register. */
+ upper_128bits_live
+ = avx256 == RTX_VZEROUPPER_CALLEE_RETURN_PASS_AVX256;
+
+ /* Must remove vzeroupper since callee passes 256bit AVX
+ register. */
+ if (dump_file)
+ {
+ fprintf (dump_file, "Delete callee pass vzeroupper:\n");
+ print_rtl_single (dump_file, curr_insn);
+ }
+ delete_insn (curr_insn);
+ continue;
+ }
+
+ /* Keep vzeroupper. */
+ upper_128bits_live = false;
+
+ /* Find the jump after vzeroupper. */
+ prev_insn = curr_insn;
+ for (insn = NEXT_INSN (curr_insn);
+ insn && insn != NEXT_INSN (BB_END (curr_block));
+ insn = NEXT_INSN (insn))
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+ if (!NONJUMP_INSN_P (insn))
+ break;
+ prev_insn = insn;
+ }
+
+ if (!insn || insn == NEXT_INSN (BB_END (curr_block)))
+ {
+ /* Move vzeroupper before label if neeeded. */
+ if (LABEL_P (insn))
+ prev_insn = PREV_INSN (insn);
+ else
+ gcc_unreachable();
+ }
+
+ /* Move vzeroupper before jump if neeeded. */
+ if (curr_insn != prev_insn)
+ {
+ reorder_insns_nobb (curr_insn, curr_insn, prev_insn);
+ if (dump_file)
+ {
+ fprintf (dump_file, "Move vzeroupper after:\n");
+ print_rtl_single (dump_file, prev_insn);
+ fprintf (dump_file, "before:\n");
+ print_rtl_single (dump_file, insn);
+ }
+ }
+
+ next_insn = NEXT_INSN (insn);
+ }
+
+ BLOCK_INFO (curr_block)->upper_128bits_live = upper_128bits_live;
+
+ if (dump_file)
+ fprintf (dump_file, " BB [%i] exit: upper 128bits: %d\n",
+ curr_block->index, upper_128bits_live);
+}
+
+/* Helper function for move_or_delete_vzeroupper. Process vzeroupper
+ in BLOCK and its predecessor blocks recursively. */
+
+static void
+move_or_delete_vzeroupper_1 (basic_block block)
+{
+ edge e;
+ edge_iterator ei;
+ bool upper_128bits_live;
+
+ if (dump_file)
+ fprintf (dump_file, " Process BB [%i]: status: %d\n",
+ block->index, BLOCK_INFO (block)->done);
+
+ if (BLOCK_INFO (block)->done)
+ return;
+
+ BLOCK_INFO (block)->done = true;
+
+ upper_128bits_live = false;
+
+ /* Process all predecessor edges of this block. */
+ FOR_EACH_EDGE (e, ei, block->preds)
+ {
+ if (e->src == block)
+ continue;
+ move_or_delete_vzeroupper_1 (e->src);
+ if (BLOCK_INFO (e->src)->upper_128bits_live)
+ upper_128bits_live = true;
+ }
+
+ /* Process this block. */
+ move_or_delete_vzeroupper_2 (block, upper_128bits_live);
+}
+
+/* Go through the instruction stream looking for vzeroupper. Delete
+ it if upper 128bit AVX registers are unused. If it isn't deleted,
+ move it to just before a jump insn. */
+
+static void
+move_or_delete_vzeroupper (void)
+{
+ edge e;
+ edge_iterator ei;
+
+ /* Set up block info for each basic block. */
+ alloc_aux_for_blocks (sizeof (struct block_info_def));
+
+ /* Process successor blocks of all entry points. */
+ if (dump_file)
+ fprintf (dump_file, "Process all entry points\n");
+
+ FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
+ {
+ move_or_delete_vzeroupper_2 (e->dest,
+ cfun->machine->caller_pass_avx256_p);
+ BLOCK_INFO (e->dest)->done = true;
+ }
+
+ /* Process predecessor blocks of all exit points. */
+ if (dump_file)
+ fprintf (dump_file, "Process all exit points\n");
+
+ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
+ move_or_delete_vzeroupper_1 (e->src);
+
+ free_aux_for_blocks ();
+}
+
+static unsigned int
+rest_of_handle_vzeroupper (void)
+{
+ timevar_push (TV_VZEROUPPER);
+ move_or_delete_vzeroupper ();
+ timevar_pop (TV_VZEROUPPER);
+ return 0;
+}
+
+static bool
+gate_handle_vzeroupper (void)
+{
+ /* Run the vzeroupper pass if needed. */
+ return cfun->machine->use_vzeroupper_p;
+}
+
+static struct rtl_opt_pass pass_vzeroupper =
+{
+ {
+ RTL_PASS,
+ "vzeroupper", /* name */
+ gate_handle_vzeroupper, /* gate */
+ rest_of_handle_vzeroupper, /* execute */
+ NULL, /* sub */
+ NULL, /* next */
+ 0, /* static_pass_number */
+ TV_VZEROUPPER, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_dump_func /* todo_flags_finish */
+ }
+};
+
static rtx legitimize_dllimport_symbol (rtx, bool);
#ifndef CHECK_STACK_LIMIT
@@ -2633,6 +2932,7 @@ ix86_target_string (int isa, int flags, const char *arch, const char *tune,
{ "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
{ "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
{ "-m8bit-idiv", MASK_USE_8BIT_IDIV },
+ { "-mvzeroupper", MASK_VZEROUPPER },
};
const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
@@ -3712,6 +4012,73 @@ ix86_option_override_internal (bool main_args_p)
if (main_args_p)
target_option_default_node = target_option_current_node
= build_target_option_node ();
+
+ if (TARGET_AVX)
+ {
+ /* Enable vzeroupper pass by default for TARGET_AVX. */
+ if (!(target_flags_explicit & MASK_VZEROUPPER))
+ target_flags |= MASK_VZEROUPPER;
+ }
+ else
+ {
+ /* Disable vzeroupper pass if TARGET_AVX is disabled. */
+ target_flags &= ~MASK_VZEROUPPER;
+ }
+
+ /* Register the vzeroupper pass. */
+ if (TARGET_VZEROUPPER)
+ {
+ struct register_pass_info vzeroupper_pass_info;
+
+ vzeroupper_pass_info.pass = &pass_vzeroupper.pass;
+ vzeroupper_pass_info.reference_pass_name
+ = pass_final.pass.name;
+ vzeroupper_pass_info.ref_pass_instance_number = 1;
+ vzeroupper_pass_info.pos_op = PASS_POS_INSERT_BEFORE;
+ register_pass (&vzeroupper_pass_info);
+ }
+}
+
+/* Return TRUE if type TYPE and mode MODE use 256bit AVX modes. */
+
+static bool
+use_avx256_p (enum machine_mode mode, const_tree type)
+{
+ return (VALID_AVX256_REG_MODE (mode)
+ || (type
+ && TREE_CODE (type) == VECTOR_TYPE
+ && int_size_in_bytes (type) == 32));
+}
+
+/* Return TRUE if VAL is passed in register with 256bit AVX modes. */
+
+static bool
+function_pass_avx256_p (const_rtx val)
+{
+ if (!val)
+ return false;
+
+ if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
+ return true;
+
+ if (GET_CODE (val) == PARALLEL)
+ {
+ int i;
+ rtx r;
+
+ for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
+ {
+ r = XVECEXP (val, 0, i);
+ if (GET_CODE (r) == EXPR_LIST
+ && XEXP (r, 0)
+ && REG_P (XEXP (r, 0))
+ && (GET_MODE (XEXP (r, 0)) == OImode
+ || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
+ return true;
+ }
+ }
+
+ return false;
}
/* Implement the TARGET_OPTION_OVERRIDE hook. */
@@ -4626,7 +4993,14 @@ ix86_function_ok_for_sibcall (tree decl, tree exp)
return false;
}
else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
- ;
+ {
+ /* Disable sibcall if we need to generate vzeroupper after
+ callee returns. */
+ if (TARGET_VZEROUPPER
+ && cfun->machine->callee_return_avx256_p
+ && !cfun->machine->caller_return_avx256_p)
+ return false;
+ }
else if (!rtx_equal_p (a, b))
return false;
@@ -5243,15 +5617,54 @@ void
init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
tree fntype, /* tree ptr for function decl */
rtx libname, /* SYMBOL_REF of library name or 0 */
- tree fndecl)
+ tree fndecl,
+ int caller)
{
- struct cgraph_local_info *i = fndecl ? cgraph_local_info (fndecl) : NULL;
+ struct cgraph_local_info *i;
+ tree fnret_type;
+
memset (cum, 0, sizeof (*cum));
+ /* Initialize for the current callee. */
+ if (caller)
+ {
+ cfun->machine->callee_pass_avx256_p = false;
+ cfun->machine->callee_return_avx256_p = false;
+ }
+
if (fndecl)
- cum->call_abi = ix86_function_abi (fndecl);
+ {
+ i = cgraph_local_info (fndecl);
+ cum->call_abi = ix86_function_abi (fndecl);
+ fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
+ }
else
- cum->call_abi = ix86_function_type_abi (fntype);
+ {
+ i = NULL;
+ cum->call_abi = ix86_function_type_abi (fntype);
+ if (fntype)
+ fnret_type = TREE_TYPE (fntype);
+ else
+ fnret_type = NULL;
+ }
+
+ if (TARGET_VZEROUPPER && fnret_type)
+ {
+ rtx fnret_value = ix86_function_value (fnret_type, fntype,
+ false);
+ if (function_pass_avx256_p (fnret_value))
+ {
+ /* The return value of this function uses 256bit AVX modes. */
+ cfun->machine->use_avx256_p = true;
+ if (caller)
+ cfun->machine->callee_return_avx256_p = true;
+ else
+ cfun->machine->caller_return_avx256_p = true;
+ }
+ }
+
+ cum->caller = caller;
+
/* Set up the number of registers to use for passing arguments. */
if (cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
@@ -6488,6 +6901,7 @@ ix86_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode,
{
enum machine_mode mode = omode;
HOST_WIDE_INT bytes, words;
+ rtx arg;
if (mode == BLKmode)
bytes = int_size_in_bytes (type);
@@ -6501,11 +6915,23 @@ ix86_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode,
mode = type_natural_mode (type, cum);
if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
- return function_arg_ms_64 (cum, mode, omode, named, bytes);
+ arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
else if (TARGET_64BIT)
- return function_arg_64 (cum, mode, omode, type, named);
+ arg = function_arg_64 (cum, mode, omode, type, named);
else
- return function_arg_32 (cum, mode, omode, type, bytes, words);
+ arg = function_arg_32 (cum, mode, omode, type, bytes, words);
+
+ if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
+ {
+ /* This argument uses 256bit AVX modes. */
+ cfun->machine->use_avx256_p = true;
+ if (cum->caller)
+ cfun->machine->callee_pass_avx256_p = true;
+ else
+ cfun->machine->caller_pass_avx256_p = true;
+ }
+
+ return arg;
}
/* A C expression that indicates when an argument must be passed by
@@ -10326,6 +10752,15 @@ ix86_expand_epilogue (int style)
return;
}
+ /* Emit vzeroupper if needed. */
+ if (TARGET_VZEROUPPER
+ && cfun->machine->use_avx256_p
+ && !cfun->machine->caller_return_avx256_p)
+ {
+ cfun->machine->use_vzeroupper_p = 1;
+ emit_insn (gen_avx_vzeroupper_nop (RTX_VZEROUPPER_NO_AVX256));
+ }
+
if (crtl->args.pops_args && crtl->args.size)
{
rtx popc = GEN_INT (crtl->args.pops_args);
@@ -20883,6 +21318,25 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
+ 2, vec));
}
+ /* Emit vzeroupper if needed. */
+ if (TARGET_VZEROUPPER && cfun->machine->use_avx256_p)
+ {
+ rtx avx256;
+ cfun->machine->use_vzeroupper_p = 1;
+ if (cfun->machine->callee_pass_avx256_p)
+ {
+ if (cfun->machine->callee_return_avx256_p)
+ avx256 = RTX_VZEROUPPER_CALLEE_RETURN_PASS_AVX256;
+ else
+ avx256 = RTX_VZEROUPPER_CALLEE_PASS_AVX256;
+ }
+ else if (cfun->machine->callee_return_avx256_p)
+ avx256 = RTX_VZEROUPPER_CALLEE_RETURN_AVX256;
+ else
+ avx256 = RTX_VZEROUPPER_NO_AVX256;
+ emit_insn (gen_avx_vzeroupper_nop (avx256));
+ }
+
call = emit_call_insn (call);
if (use)
CALL_INSN_FUNCTION_USAGE (call) = use;
@@ -21626,6 +22080,9 @@ ix86_local_alignment (tree exp, enum machine_mode mode,
decl = NULL;
}
+ if (use_avx256_p (mode, type))
+ cfun->machine->use_avx256_p = true;
+
/* Don't do dynamic stack realignment for long long objects with
-mpreferred-stack-boundary=2. */
if (!TARGET_64BIT
@@ -21721,9 +22178,6 @@ ix86_minimum_alignment (tree exp, enum machine_mode mode,
{
tree type, decl;
- if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
- return align;
-
if (exp && DECL_P (exp))
{
type = TREE_TYPE (exp);
@@ -21735,6 +22189,12 @@ ix86_minimum_alignment (tree exp, enum machine_mode mode,
decl = NULL;
}
+ if (use_avx256_p (mode, type))
+ cfun->machine->use_avx256_p = true;
+
+ if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
+ return align;
+
/* Don't do dynamic stack realignment for long long objects with
-mpreferred-stack-boundary=2. */
if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
@@ -1507,6 +1507,7 @@ typedef struct ix86_args {
int mmx_nregs; /* # mmx registers available for passing */
int mmx_regno; /* next available mmx register number */
int maybe_vaarg; /* true for calls to possibly vardic fncts. */
+ int caller; /* true if it is caller. */
int float_in_sse; /* Set to 1 or 2 for 32bit targets if
SFmode/DFmode arguments should be passed
in SSE registers. Otherwise 0. */
@@ -1519,7 +1520,8 @@ typedef struct ix86_args {
For a library call, FNTYPE is 0. */
#define INIT_CUMULATIVE_ARGS(CUM, FNTYPE, LIBNAME, FNDECL, N_NAMED_ARGS) \
- init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (FNDECL))
+ init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (FNDECL), \
+ (N_NAMED_ARGS) != -1)
/* Output assembler code to FILE to increment profiler label # LABELNO
for profiling a function entry. */
@@ -2289,6 +2291,24 @@ struct GTY(()) machine_function {
stack below the return address. */
BOOL_BITFIELD static_chain_on_stack : 1;
+ /* Nonzero if the current function uses vzeroupper. */
+ BOOL_BITFIELD use_vzeroupper_p : 1;
+
+ /* Nonzero if the current function uses 256bit AVX regisers. */
+ BOOL_BITFIELD use_avx256_p : 1;
+
+ /* Nonzero if caller passes 256bit AVX modes. */
+ BOOL_BITFIELD caller_pass_avx256_p : 1;
+
+ /* Nonzero if caller returns 256bit AVX modes. */
+ BOOL_BITFIELD caller_return_avx256_p : 1;
+
+ /* Nonzero if the current callee passes 256bit AVX modes. */
+ BOOL_BITFIELD callee_pass_avx256_p : 1;
+
+ /* Nonzero if the current callee returns 256bit AVX modes. */
+ BOOL_BITFIELD callee_return_avx256_p : 1;
+
/* During prologue/epilogue generation, the current frame state.
Otherwise, the frame state at the end of the prologue. */
struct machine_frame_state fs;
@@ -249,6 +249,7 @@
UNSPECV_NOPS
UNSPECV_VZEROALL
UNSPECV_VZEROUPPER
+ UNSPECV_VZEROUPPER_NOP
UNSPECV_RDTSC
UNSPECV_RDTSCP
UNSPECV_RDPMC
@@ -256,6 +256,11 @@ mcld
Target Report Mask(CLD) Save
Generate cld instruction in the function prologue.
+mvzeroupper
+Target Report Mask(VZEROUPPER) Save
+Generate vzeroupper instruction before a transfer of control flow out of
+the function.
+
mfused-madd
Target Report Mask(FUSED_MADD) Save
Enable automatic generation of fused floating point multiply-add instructions
@@ -11429,6 +11429,19 @@
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
+;; Clear the upper 128bits of AVX registers, equivalent to a NOP.
+;; This should be used only when the upper 128bits are unused.
+(define_insn "avx_vzeroupper_nop"
+ [(unspec_volatile [(match_operand 0 "const_int_operand" "")]
+ UNSPECV_VZEROUPPER_NOP)]
+ "TARGET_AVX"
+ "vzeroupper"
+ [(set_attr "type" "sse")
+ (set_attr "modrm" "0")
+ (set_attr "memory" "none")
+ (set_attr "prefix" "vex")
+ (set_attr "mode" "OI")])
+
(define_insn_and_split "vec_dup<mode>"
[(set (match_operand:AVX256MODE24P 0 "register_operand" "=x,x")
(vec_duplicate:AVX256MODE24P
@@ -594,7 +594,7 @@ Objective-C and Objective-C++ Dialects}.
-mno-wide-multiply -mrtd -malign-double @gol
-mpreferred-stack-boundary=@var{num}
-mincoming-stack-boundary=@var{num} @gol
--mcld -mcx16 -msahf -mmovbe -mcrc32 -mrecip @gol
+-mcld -mcx16 -msahf -mmovbe -mcrc32 -mrecip -mvzeroupper @gol
-mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol
-maes -mpclmul -mfsgsbase -mrdrnd -mf16c -mfused-madd @gol
-msse4a -m3dnow -mpopcnt -mabm -mfma4 -mxop -mlwp @gol
@@ -12466,6 +12466,13 @@ GCC with the @option{--enable-cld} configure option. Generation of @code{cld}
instructions can be suppressed with the @option{-mno-cld} compiler option
in this case.
+@item -mvzeroupper
+@opindex mvzeroupper
+This option instructs GCC to emit a @code{vzeroupper} instruction
+before a transfer of control flow out of the function to minimize
+AVX to SSE transition penalty as well as remove unnecessary zeroupper
+intrinsics.
+
@item -mcx16
@opindex mcx16
This option will enable GCC to use CMPXCHG16B instruction in generated code.
@@ -1,6 +1,6 @@
/* { dg-do run } */
/* { dg-require-effective-target avx } */
-/* { dg-options "-O2 -mavx" } */
+/* { dg-options "-O2 -mavx -mtune=generic" } */
#include "avx-check.h"
new file mode 100644
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern float x, y;
+
+void
+foo ()
+{
+ x = y;
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "\\*avx_vzeroupper" 3 } } */
+/* { dg-final { scan-assembler-not "avx_vzeroupper_nop" } } */
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern float x, y;
+
+void
+foo ()
+{
+ x = y;
+ _mm256_zeroall ();
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */
+/* { dg-final { scan-assembler-times "\\*avx_vzeroupper" 3 } } */
+/* { dg-final { scan-assembler-not "avx_vzeroupper_nop" } } */
new file mode 100644
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ _mm256_zeroall ();
+ _mm256_zeroupper ();
+ x = y;
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "\\*avx_vzeroupper" 1 } } */
+/* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */
+/* { dg-final { scan-assembler-not "avx_vzeroupper_nop" } } */
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mno-vzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ x = y;
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper_nop" } } */
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mtune=generic -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ x = y;
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper_nop" 1 } } */
@@ -1,6 +1,6 @@
/* { dg-do run } */
/* { dg-require-effective-target avx } */
-/* { dg-options "-O2 -mavx" } */
+/* { dg-options "-O2 -mavx -mtune=generic" } */
#include "avx-check.h"
new file mode 100644
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O2 -mavx -mvzeroupper" } */
+
+#include "avx-check.h"
+
+int s[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+int d[8] = {11, 22, 33, 44, 55, 66, 77, 88};
+
+void
+__attribute__((noinline))
+foo ()
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE (d); i++)
+ d[i] = s[i] + 0x1000;
+}
+
+static void
+__attribute__((noinline))
+bar (__m256i src)
+{
+ foo ();
+ _mm256_storeu_si256 ((__m256i*) d, src);
+ if (__builtin_memcmp (d, s, sizeof (d)))
+ abort ();
+}
+
+static void
+avx_test (void)
+{
+ __m256i src = _mm256_loadu_si256 ((__m256i*) s);
+ bar (src);
+}
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+typedef float __m256 __attribute__ ((__vector_size__ (32), __may_alias__));
+
+extern void bar2 (__m256);
+extern __m256 y;
+
+void
+foo ()
+{
+ bar2 (y);
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper_nop" } } */
new file mode 100644
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern void bar2 (__m256);
+extern __m256 y;
+
+void
+foo ()
+{
+ bar2 (y);
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ x = y;
+ _mm256_zeroall ();
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper_nop" } } */
new file mode 100644
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ x = y;
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "\\*avx_vzeroupper" 1 } } */
+/* { dg-final { scan-assembler-not "avx_vzeroupper_nop" } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ x = y;
+ _mm256_zeroall ();
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_vzeroupper" } } */
+/* { dg-final { scan-assembler-not "avx_vzeroupper_nop" } } */
new file mode 100644
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ _mm256_zeroupper ();
+ x = y;
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "\\*avx_vzeroupper" 1 } } */
+/* { dg-final { scan-assembler-not "avx_vzeroupper_nop" } } */
@@ -235,6 +235,7 @@ DEFTIMEVAR (TV_TREE_IFCOMBINE , "tree if-combine")
DEFTIMEVAR (TV_TREE_UNINIT , "uninit var anaysis")
DEFTIMEVAR (TV_PLUGIN_INIT , "plugin initialization")
DEFTIMEVAR (TV_PLUGIN_RUN , "plugin execution")
+DEFTIMEVAR (TV_VZEROUPPER , "vzeroupper")
/* Everything else in rest_of_compilation not included above. */
DEFTIMEVAR (TV_REST_OF_COMPILATION , "rest of compilation")
Hi, This patch adds vzeroupper optimization for AVX, which is very important for 256bit AVX instructions. Otherwise AVX-SSE transition penalty may kill 256bit AVX vector performance. I am enclosing improvement of vzeroupper optimization on SPEC CPU 2K/2006 when 256bit AVX vectorizer is enabled. The data shows that tonto is improved by 59%, wrf by 25%, sphinx3 by 19%, GemsFDTD by 11% and gamess by 11%. At RTL expansion time, the vzeroupper optimization generates a vzeroupper_nop before function call and functin return if 256bit AVX instructions are used. The vzeroupper pass is run before final pass. It scans all reachable blocks: 1. Remove vzeroupper_nop when: a. The upper 128bits of AVX regiters are known dead. b. The upper 128bits of AVX regiters are live and used for parameter passing. We need to know if callee returns 256bit AVX registers to decide if the upper 128bits of AVX regiters are live after callee returns. 2. Move vzeroupper_nop right before function return call/return. It is needed since various passes may move 256bit vector instructions across vzeroupper_nop and I can't find a way to describe vzeroupper_nop other than as UNSPECV. I can't say it clobbers all 256bit AVX regiters since it isn't true. I can't describe it clears upper 128bits of all AVX regiters since register allocator will try allocate all AVX regiters for vzeroupper_nop. OK for trunk? Thanks. H.J. --- gcc/ 2010-10-22 H.J. Lu <hongjiu.lu@intel.com> * config/i386/i386-protos.h (init_cumulative_args): Add an int. * config/i386/i386.c: Include "tree-pass.h". (block_info): New. (BLOCK_INFO): Likewise. (RTX_VZEROUPPER_CALLEE_RETURN_AVX256): Likewise. (RTX_VZEROUPPER_CALLEE_RETURN_PASS_AVX256): Likewise. (RTX_VZEROUPPER_CALLEE_PASS_AVX256): Likewise. (RTX_VZEROUPPER_NO_AVX256): Likewise. (check_avx256_stores): Likewise. (move_or_delete_vzeroupper_2): Likewise. (move_or_delete_vzeroupper_1): Likewise. (move_or_delete_vzeroupper): Likewise. (rest_of_handle_vzeroupper): Likewise. (gate_handle_vzeroupper): Likewise. (pass_vzeroupper): Likewise. (use_avx256_p): Likewise. (function_pass_avx256_p): Likewise. (flag_opts): Add -mvzeroupper. (ix86_option_override_internal): Turn on MASK_VZEROUPPER by default for TARGET_AVX. Turn off MASK_VZEROUPPER if TARGET_AVX is disabled. Register pass_vzeroupper for TARGET_VZEROUPPER. (ix86_function_ok_for_sibcall): Disable sibcall if we need to generate vzeroupper. (init_cumulative_args): Add an int to indicate caller. Set use_avx256_p, callee_return_avx256_p and caller_use_avx256_p based on return type. (ix86_function_arg): Set use_avx256_p, callee_pass_avx256_p and caller_pass_avx256_p based on argument type. (ix86_expand_epilogue): Emit vzeroupper if 256bit AVX register is used, but not returned by caller. (ix86_expand_call): Emit vzeroupper if 256bit AVX register is used. (ix86_local_alignment): Set use_avx256_p if 256bit AVX register is used. (ix86_minimum_alignment): Likewise. * config/i386/i386.h (ix86_args): Add caller. (INIT_CUMULATIVE_ARGS): Updated. (machine_function): Add use_vzeroupper_p, use_avx256_p, caller_pass_avx256_p, caller_return_avx256_p, callee_pass_avx256_p and callee_return_avx256_p. * config/i386/i386.md (UNSPECV_VZEROUPPER_NOP): New. * config/i386/sse.md (avx_vzeroupper_nop): Likewise. * config/i386/i386.opt (-mvzeroupper): New. * doc/invoke.texi: Document -mvzeroupper. * timevar.def (TV_VZEROUPPER): New. gcc/testsuite/ 2010-10-22 H.J. Lu <hongjiu.lu@intel.com> * gcc.target/i386/avx-vzeroupper-1.c: Add -mtune=generic. * gcc.target/i386/avx-vzeroupper-2.c: Likewise. * gcc.target/i386/avx-vzeroupper-3.c: New. * gcc.target/i386/avx-vzeroupper-4.c: Likewise. * gcc.target/i386/avx-vzeroupper-5.c: Likewise. * gcc.target/i386/avx-vzeroupper-6.c: Likewise. * gcc.target/i386/avx-vzeroupper-7.c: Likewise. * gcc.target/i386/avx-vzeroupper-8.c: Likewise. * gcc.target/i386/avx-vzeroupper-9.c: Likewise. * gcc.target/i386/avx-vzeroupper-10.c: Likewise. * gcc.target/i386/avx-vzeroupper-11.c: Likewise. * gcc.target/i386/avx-vzeroupper-12.c: Likewise. * gcc.target/i386/avx-vzeroupper-13.c: Likewise. * gcc.target/i386/avx-vzeroupper-14.c: Likewise.