Patchwork PATCH: Add vzeroupper optimization for AVX

login
register
mail settings
Submitter H.J. Lu
Date Oct. 27, 2010, 10:01 a.m.
Message ID <AANLkTimEeGpXvvH-jwge2RnMH1TbA-r7rei00hxhGrSu@mail.gmail.com>
Download mbox | patch
Permalink /patch/69334/
State New
Headers show

Comments

H.J. Lu - Oct. 27, 2010, 10:01 a.m.
On Tue, Oct 26, 2010 at 11:34 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Wed, Oct 27, 2010 at 4:46 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
>
>>>> IMO, there is no need for a new insn pattern. UNSPEC_VOLATILEs clobber
>>>> all regs, pseudos and memory (see sched-deps.c around line 2528), so
>>>> it looks to me that vzeroupper_nop and existing vzeroupper are no
>>>> different. Just use the new pattern instead of avx_vzeroupper and
>>>> *avx_vzeroupper, and iff -mvzeroupper is passed to gcc, then
>>>> vzeroupper insn is moved around.
>>>
>>
>> Done.
>>
>>
>> Here is the updated patch.  OK for trunk?
>
> -ENOPATCH

Oops.  Here it is.
Uros Bizjak - Oct. 27, 2010, 11:54 a.m.
On Wed, Oct 27, 2010 at 12:01 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Oct 26, 2010 at 11:34 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>> On Wed, Oct 27, 2010 at 4:46 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>>>>> IMO, there is no need for a new insn pattern. UNSPEC_VOLATILEs clobber
>>>>> all regs, pseudos and memory (see sched-deps.c around line 2528), so
>>>>> it looks to me that vzeroupper_nop and existing vzeroupper are no
>>>>> different. Just use the new pattern instead of avx_vzeroupper and
>>>>> *avx_vzeroupper, and iff -mvzeroupper is passed to gcc, then
>>>>> vzeroupper insn is moved around.
>>>>
>>>
>>> Done.
>>>
>>>
>>> Here is the updated patch.  OK for trunk?
>>
>> -ENOPATCH
>
> Oops.  Here it is.

> +  /* Emit vzeroupper if needed.  */
> +  if (TARGET_VZEROUPPER && cfun->machine->use_avx256_p)
> +    {
> +      rtx avx256;
> +      cfun->machine->use_vzeroupper_p = 1;
> +      if (cfun->machine->callee_pass_avx256_p)
> +       {
> +         if (cfun->machine->callee_return_avx256_p)
> +           avx256 = GEN_INT (callee_return_pass_avx256);
> +         else
> +           avx256 = GEN_INT (callee_pass_avx256);
> +       }
> +      else if (cfun->machine->callee_return_avx256_p)
> +       avx256 = GEN_INT (callee_return_avx256);
> +      else
> +       avx256 = GEN_INT (call_no_avx256);
> +      emit_insn (gen_avx_vzeroupper (avx256));
> +    }

Please make avx256 an int and use GEN_INT just before gen_avx_vzeroupper.

Also, vzeroupper_operation predicate can be removed from predicates.md.

The patch is OK for mainline with these changes.

Thanks,
Uros.

Patch

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 9c10103..02c2a90 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -177,7 +177,7 @@  extern void ix86_expand_trunc (rtx, rtx);
 extern void ix86_expand_truncdf_32 (rtx, rtx);
 
 #ifdef TREE_CODE
-extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree);
+extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
 #endif	/* TREE_CODE  */
 
 #endif	/* RTX_CODE  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 6f3a898..ae74a4c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -56,6 +56,293 @@  along with GCC; see the file COPYING3.  If not see
 #include "debug.h"
 #include "dwarf2out.h"
 #include "sched-int.h"
+
+typedef struct block_info_def
+{
+  /* TRUE if the upper 128bits of any AVX registers are live at exit.  */
+  bool upper_128bits_set;
+  /* TRUE if block has been processed.  */
+  bool done;
+} *block_info;
+
+#define BLOCK_INFO(B)   ((block_info) (B)->aux)
+
+enum call_avx256_state
+{
+  /* Callee returns 256bit AVX register.  */
+  callee_return_avx256 = -1,
+  /* Callee returns and passes 256bit AVX register.  */
+  callee_return_pass_avx256,
+  /* Callee passes 256bit AVX register.  */
+  callee_pass_avx256,
+  /* Callee doesn't return nor passe 256bit AVX register, or no
+     256bit AVX register in function return.  */
+  call_no_avx256,
+  /* vzeroupper intrinsic.  */
+  vzeroupper_intrinsic
+};
+
+/* Check if a 256bit AVX register is referenced in stores.   */
+
+static void
+check_avx256_stores (rtx dest, const_rtx set, void *data)
+{
+  if ((REG_P (dest)
+       && VALID_AVX256_REG_MODE (GET_MODE (dest)))
+      || (GET_CODE (set) == SET
+	  && REG_P (SET_SRC (set))
+	  && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
+    {
+      bool *upper_128bits_set = (bool *) data;
+      *upper_128bits_set = true;
+    }
+}
+
+/* Helper function for move_or_delete_vzeroupper_1.  Look for vzeroupper
+   in basic block BB.  Delete it if upper 128bit AVX registers are
+   unused.  If it isn't deleted, move it to just before a jump insn.
+   
+   UPPER_128BITS_LIVE is TRUE if the upper 128bits of any AVX registers
+   are live at entry.  */
+
+static void
+move_or_delete_vzeroupper_2 (basic_block bb, bool upper_128bits_set)
+{
+  rtx curr_insn, next_insn, prev_insn, insn;
+
+  if (dump_file)
+    fprintf (dump_file, " BB [%i] entry: upper 128bits: %d\n",
+	     bb->index, upper_128bits_set);
+
+  for (curr_insn = BB_HEAD (bb);
+       curr_insn && curr_insn != NEXT_INSN (BB_END (bb));
+       curr_insn = next_insn)
+    {
+      int avx256;
+
+      next_insn = NEXT_INSN (curr_insn);
+
+      if (!NONDEBUG_INSN_P (curr_insn))
+	continue;
+
+      /* Search for vzeroupper.  */
+      insn = PATTERN (curr_insn);
+      if (GET_CODE (insn) == UNSPEC_VOLATILE
+	  && XINT (insn, 1) == UNSPECV_VZEROUPPER)
+	{
+	  /* Found vzeroupper.  */
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Found vzeroupper:\n");
+	      print_rtl_single (dump_file, curr_insn);
+	    }
+	}
+      else
+	{
+	  /* Check vzeroall intrinsic.  */
+	  if (GET_CODE (insn) == PARALLEL
+	      && GET_CODE (XVECEXP (insn, 0, 0)) == UNSPEC_VOLATILE
+	      && XINT (XVECEXP (insn, 0, 0), 1) == UNSPECV_VZEROALL)
+	    upper_128bits_set = false;
+	  else if (!upper_128bits_set)
+	    {
+	      /* Check if upper 128bits of AVX registers are used.  */
+	      note_stores (insn, check_avx256_stores,
+			   &upper_128bits_set);
+	    }
+	  continue;
+	}
+
+      avx256 = INTVAL (XVECEXP (insn, 0, 0));
+
+      if (!upper_128bits_set)
+	{
+	  /* Since the upper 128bits are cleared, callee must not pass
+	     256bit AVX register.  We only need to check if callee
+	     returns 256bit AVX register.  */
+	  upper_128bits_set = avx256 == callee_return_avx256;
+
+	  /* Remove unnecessary vzeroupper since upper 128bits are
+	     cleared.  */
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Delete redundant vzeroupper:\n");
+	      print_rtl_single (dump_file, curr_insn);
+	    }
+	  delete_insn (curr_insn);
+	  continue;
+	}
+      else if (avx256 == callee_return_pass_avx256
+	       || avx256 == callee_pass_avx256)
+	{
+	  /* Callee passes 256bit AVX register.  Check if callee
+	     returns 256bit AVX register.  */
+	  upper_128bits_set = avx256 == callee_return_pass_avx256;
+
+	  /* Must remove vzeroupper since callee passes 256bit AVX
+	     register.  */
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Delete callee pass vzeroupper:\n");
+	      print_rtl_single (dump_file, curr_insn);
+	    }
+	  delete_insn (curr_insn);
+	  continue;
+	}
+
+      /* Find the jump after vzeroupper.  */
+      prev_insn = curr_insn;
+      if (avx256 == vzeroupper_intrinsic)
+	{
+	  /* For vzeroupper intrinsic, check if there is another
+	     vzeroupper.  */
+	  insn = NEXT_INSN (curr_insn);
+	  while (insn)
+	    {
+	      if (NONJUMP_INSN_P (insn)
+		  && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
+		  && XINT (PATTERN (insn), 1) == UNSPECV_VZEROUPPER)
+		{
+		  if (dump_file)
+		    {
+		      fprintf (dump_file,
+			       "Delete redundant vzeroupper intrinsic:\n");
+		      print_rtl_single (dump_file, curr_insn);
+		    }
+		  delete_insn (curr_insn);
+		  insn = NULL;
+		  continue;
+		}
+
+	      if (JUMP_P (insn) || CALL_P (insn))
+		break;
+	      prev_insn = insn;
+	      insn = NEXT_INSN (insn);
+	      if (insn == NEXT_INSN (BB_END (bb)))
+		break;
+	    }
+
+	  /* Continue if redundant vzeroupper intrinsic is deleted.  */
+	  if (!insn)
+	    continue;
+	}
+      else
+	{
+	  /* Find the next jump/call.  */
+	  insn = NEXT_INSN (curr_insn);
+	  while (insn)
+	    {
+	      if (JUMP_P (insn) || CALL_P (insn))
+		break;
+	      prev_insn = insn;
+	      insn = NEXT_INSN (insn);
+	      if (insn == NEXT_INSN (BB_END (bb)))
+		break;
+	    }
+
+	  if (!insn)
+	    gcc_unreachable();
+	}
+
+      /* Keep vzeroupper.  */
+      upper_128bits_set = false;
+
+      /* Also allow label as the next instruction.  */
+      if (insn == NEXT_INSN (BB_END (bb)) && !LABEL_P (insn))
+	gcc_unreachable();
+
+      /* Move vzeroupper before jump/call if neeeded.  */
+      if (curr_insn != prev_insn)
+	{
+	  reorder_insns_nobb (curr_insn, curr_insn, prev_insn);
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Move vzeroupper after:\n");
+	      print_rtl_single (dump_file, prev_insn);
+	      fprintf (dump_file, "before:\n");
+	      print_rtl_single (dump_file, insn);
+	    }
+	}
+
+      next_insn = NEXT_INSN (insn);
+    }
+
+  BLOCK_INFO (bb)->upper_128bits_set = upper_128bits_set;
+
+  if (dump_file)
+    fprintf (dump_file, " BB [%i] exit: upper 128bits: %d\n",
+	     bb->index, upper_128bits_set);
+}
+
+/* Helper function for move_or_delete_vzeroupper.  Process vzeroupper
+   in BLOCK and its predecessor blocks recursively.  */
+
+static void
+move_or_delete_vzeroupper_1 (basic_block block)
+{
+  edge e;
+  edge_iterator ei;
+  bool upper_128bits_set;
+
+  if (dump_file)
+    fprintf (dump_file, " Process BB [%i]: status: %d\n",
+	     block->index, BLOCK_INFO (block)->done);
+
+  if (BLOCK_INFO (block)->done)
+    return;
+
+  BLOCK_INFO (block)->done = true;
+
+  upper_128bits_set = false;
+
+  /* Process all predecessor edges of this block.  */
+  FOR_EACH_EDGE (e, ei, block->preds)
+    {
+      if (e->src == block)
+	continue;
+      move_or_delete_vzeroupper_1 (e->src);
+      if (BLOCK_INFO (e->src)->upper_128bits_set)
+	upper_128bits_set = true;
+    }
+
+  /* Process this block.  */
+  move_or_delete_vzeroupper_2 (block, upper_128bits_set);
+}
+
+/* Go through the instruction stream looking for vzeroupper.  Delete
+   it if upper 128bit AVX registers are unused.  If it isn't deleted,
+   move it to just before a jump insn.  */
+
+static void
+move_or_delete_vzeroupper (void)
+{
+  edge e;
+  edge_iterator ei;
+
+  /* Set up block info for each basic block.  */
+  alloc_aux_for_blocks (sizeof (struct block_info_def));
+
+  /* Process successor blocks of all entry points.  */
+  if (dump_file)
+    fprintf (dump_file, "Process all entry points\n");
+
+  FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
+    {
+      move_or_delete_vzeroupper_2 (e->dest,
+				   cfun->machine->caller_pass_avx256_p);
+      BLOCK_INFO (e->dest)->done = true;
+    }
+
+  /* Process predecessor blocks of all exit points.  */
+  if (dump_file)
+    fprintf (dump_file, "Process all exit points\n");
+
+  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
+    move_or_delete_vzeroupper_1 (e->src);
+
+  free_aux_for_blocks ();
+}
+
 static rtx legitimize_dllimport_symbol (rtx, bool);
 
 #ifndef CHECK_STACK_LIMIT
@@ -2633,6 +2920,7 @@  ix86_target_string (int isa, int flags, const char *arch, const char *tune,
     { "-mtls-direct-seg-refs",		MASK_TLS_DIRECT_SEG_REFS },
     { "-mvect8-ret-in-mem",		MASK_VECT8_RETURNS },
     { "-m8bit-idiv",			MASK_USE_8BIT_IDIV },
+    { "-mvzeroupper",			MASK_VZEROUPPER },
   };
 
   const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
@@ -3712,6 +4000,60 @@  ix86_option_override_internal (bool main_args_p)
   if (main_args_p)
     target_option_default_node = target_option_current_node
       = build_target_option_node ();
+
+  if (TARGET_AVX)
+    {
+      /* Enable vzeroupper pass by default for TARGET_AVX.  */
+      if (!(target_flags_explicit & MASK_VZEROUPPER))
+	target_flags |= MASK_VZEROUPPER;
+    }
+  else 
+    {
+      /* Disable vzeroupper pass if TARGET_AVX is disabled.  */
+      target_flags &= ~MASK_VZEROUPPER;
+    }
+}
+
+/* Return TRUE if type TYPE and mode MODE use 256bit AVX modes.  */
+
+static bool
+use_avx256_p (enum machine_mode mode, const_tree type)
+{
+  return (VALID_AVX256_REG_MODE (mode)
+	  || (type
+	      && TREE_CODE (type) == VECTOR_TYPE
+	      && int_size_in_bytes (type) == 32));
+}
+
+/* Return TRUE if VAL is passed in register with 256bit AVX modes.  */
+
+static bool
+function_pass_avx256_p (const_rtx val)
+{
+  if (!val)
+    return false;
+
+  if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
+    return true;
+
+  if (GET_CODE (val) == PARALLEL)
+    {
+      int i;
+      rtx r;
+
+      for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
+	{
+	  r = XVECEXP (val, 0, i);
+	  if (GET_CODE (r) == EXPR_LIST
+	      && XEXP (r, 0)
+	      && REG_P (XEXP (r, 0))
+	      && (GET_MODE (XEXP (r, 0)) == OImode
+		  || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
+	    return true;
+	}
+    }
+
+  return false;
 }
 
 /* Implement the TARGET_OPTION_OVERRIDE hook.  */
@@ -4626,7 +4968,14 @@  ix86_function_ok_for_sibcall (tree decl, tree exp)
 	return false;
     }
   else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
-    ;
+    {
+      /* Disable sibcall if we need to generate vzeroupper after
+	 callee returns.  */
+      if (TARGET_VZEROUPPER
+	  && cfun->machine->callee_return_avx256_p
+	  && !cfun->machine->caller_return_avx256_p)
+	return false;
+    }
   else if (!rtx_equal_p (a, b))
     return false;
 
@@ -5243,15 +5592,54 @@  void
 init_cumulative_args (CUMULATIVE_ARGS *cum,  /* Argument info to initialize */
 		      tree fntype,	/* tree ptr for function decl */
 		      rtx libname,	/* SYMBOL_REF of library name or 0 */
-		      tree fndecl)
+		      tree fndecl,
+		      int caller)
 {
-  struct cgraph_local_info *i = fndecl ? cgraph_local_info (fndecl) : NULL;
+  struct cgraph_local_info *i;
+  tree fnret_type;
+
   memset (cum, 0, sizeof (*cum));
 
+  /* Initialize for the current callee.  */
+  if (caller)
+    {
+      cfun->machine->callee_pass_avx256_p = false;
+      cfun->machine->callee_return_avx256_p = false;
+    }
+
   if (fndecl)
-   cum->call_abi = ix86_function_abi (fndecl);
+    {
+      i = cgraph_local_info (fndecl);
+      cum->call_abi = ix86_function_abi (fndecl);
+      fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
+    }
   else
-   cum->call_abi = ix86_function_type_abi (fntype);
+    {
+      i = NULL;
+      cum->call_abi = ix86_function_type_abi (fntype);
+      if (fntype)
+	fnret_type = TREE_TYPE (fntype);
+      else
+	fnret_type = NULL;
+    }
+
+  if (TARGET_VZEROUPPER && fnret_type)
+    {
+      rtx fnret_value = ix86_function_value (fnret_type, fntype,
+					     false);
+      if (function_pass_avx256_p (fnret_value))
+	{
+	  /* The return value of this function uses 256bit AVX modes.  */
+	  cfun->machine->use_avx256_p = true;
+	  if (caller)
+	    cfun->machine->callee_return_avx256_p = true;
+	  else
+	    cfun->machine->caller_return_avx256_p = true;
+	}
+    }
+
+  cum->caller = caller;
+
   /* Set up the number of registers to use for passing arguments.  */
 
   if (cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
@@ -6488,6 +6876,7 @@  ix86_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode,
 {
   enum machine_mode mode = omode;
   HOST_WIDE_INT bytes, words;
+  rtx arg;
 
   if (mode == BLKmode)
     bytes = int_size_in_bytes (type);
@@ -6501,11 +6890,23 @@  ix86_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode,
     mode = type_natural_mode (type, cum);
 
   if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
-    return function_arg_ms_64 (cum, mode, omode, named, bytes);
+    arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
   else if (TARGET_64BIT)
-    return function_arg_64 (cum, mode, omode, type, named);
+    arg = function_arg_64 (cum, mode, omode, type, named);
   else
-    return function_arg_32 (cum, mode, omode, type, bytes, words);
+    arg = function_arg_32 (cum, mode, omode, type, bytes, words);
+
+  if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
+    {
+      /* This argument uses 256bit AVX modes.  */
+      cfun->machine->use_avx256_p = true;
+      if (cum->caller)
+	cfun->machine->callee_pass_avx256_p = true;
+      else
+	cfun->machine->caller_pass_avx256_p = true;
+    }
+
+  return arg;
 }
 
 /* A C expression that indicates when an argument must be passed by
@@ -10353,6 +10754,15 @@  ix86_expand_epilogue (int style)
       return;
     }
 
+  /* Emit vzeroupper if needed.  */
+  if (TARGET_VZEROUPPER
+      && cfun->machine->use_avx256_p
+      && !cfun->machine->caller_return_avx256_p)
+    {
+      cfun->machine->use_vzeroupper_p = 1;
+      emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256))); 
+    }
+
   if (crtl->args.pops_args && crtl->args.size)
     {
       rtx popc = GEN_INT (crtl->args.pops_args);
@@ -20910,6 +21320,25 @@  ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
 			       + 2, vec));
     }
 
+  /* Emit vzeroupper if needed.  */
+  if (TARGET_VZEROUPPER && cfun->machine->use_avx256_p)
+    {
+      rtx avx256;
+      cfun->machine->use_vzeroupper_p = 1;
+      if (cfun->machine->callee_pass_avx256_p)
+	{
+	  if (cfun->machine->callee_return_avx256_p)
+	    avx256 = GEN_INT (callee_return_pass_avx256);
+	  else
+	    avx256 = GEN_INT (callee_pass_avx256);
+	}
+      else if (cfun->machine->callee_return_avx256_p)
+	avx256 = GEN_INT (callee_return_avx256);
+      else
+	avx256 = GEN_INT (call_no_avx256);
+      emit_insn (gen_avx_vzeroupper (avx256)); 
+    }
+
   call = emit_call_insn (call);
   if (use)
     CALL_INSN_FUNCTION_USAGE (call) = use;
@@ -21653,6 +22082,9 @@  ix86_local_alignment (tree exp, enum machine_mode mode,
       decl = NULL;
     }
 
+  if (use_avx256_p (mode, type))
+    cfun->machine->use_avx256_p = true;
+
   /* Don't do dynamic stack realignment for long long objects with
      -mpreferred-stack-boundary=2.  */
   if (!TARGET_64BIT
@@ -21748,9 +22180,6 @@  ix86_minimum_alignment (tree exp, enum machine_mode mode,
 {
   tree type, decl;
 
-  if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
-    return align;
-
   if (exp && DECL_P (exp))
     {
       type = TREE_TYPE (exp);
@@ -21762,6 +22191,12 @@  ix86_minimum_alignment (tree exp, enum machine_mode mode,
       decl = NULL;
     }
 
+  if (use_avx256_p (mode, type))
+    cfun->machine->use_avx256_p = true;
+
+  if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
+    return align;
+
   /* Don't do dynamic stack realignment for long long objects with
      -mpreferred-stack-boundary=2.  */
   if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
@@ -25505,6 +25940,8 @@  ix86_expand_special_args_builtin (const struct builtin_description *d,
   switch ((enum ix86_builtin_func_type) d->flag)
     {
     case VOID_FTYPE_VOID:
+      if (icode == CODE_FOR_avx_vzeroupper)
+	target = GEN_INT (vzeroupper_intrinsic);
       emit_insn (GEN_FCN (icode) (target));
       return 0;
     case VOID_FTYPE_UINT64:
@@ -28542,6 +28979,10 @@  ix86_reorg (void)
 	ix86_avoid_jump_mispredicts ();
 #endif
     }
+
+  /* Run the vzeroupper optimization if needed.  */
+  if (cfun->machine->use_vzeroupper_p)
+    move_or_delete_vzeroupper ();
 }
 
 /* Return nonzero when QImode register that must be represented via REX prefix
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 25463a5..5474048 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1507,6 +1507,7 @@  typedef struct ix86_args {
   int mmx_nregs;		/* # mmx registers available for passing */
   int mmx_regno;		/* next available mmx register number */
   int maybe_vaarg;		/* true for calls to possibly vardic fncts.  */
+  int caller;			/* true if it is caller.  */
   int float_in_sse;		/* Set to 1 or 2 for 32bit targets if
 				   SFmode/DFmode arguments should be passed
 				   in SSE registers.  Otherwise 0.  */
@@ -1519,7 +1520,8 @@  typedef struct ix86_args {
    For a library call, FNTYPE is 0.  */
 
 #define INIT_CUMULATIVE_ARGS(CUM, FNTYPE, LIBNAME, FNDECL, N_NAMED_ARGS) \
-  init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (FNDECL))
+  init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (FNDECL), \
+			(N_NAMED_ARGS) != -1)
 
 /* Output assembler code to FILE to increment profiler label # LABELNO
    for profiling a function entry.  */
@@ -2289,6 +2291,24 @@  struct GTY(()) machine_function {
      stack below the return address.  */
   BOOL_BITFIELD static_chain_on_stack : 1;
 
+  /* Nonzero if the current function uses vzeroupper.  */
+  BOOL_BITFIELD use_vzeroupper_p : 1;
+
+  /* Nonzero if the current function uses 256bit AVX regisers.  */
+  BOOL_BITFIELD use_avx256_p : 1;
+
+  /* Nonzero if caller passes 256bit AVX modes.  */
+  BOOL_BITFIELD caller_pass_avx256_p : 1;
+
+  /* Nonzero if caller returns 256bit AVX modes.  */
+  BOOL_BITFIELD caller_return_avx256_p : 1;
+
+  /* Nonzero if the current callee passes 256bit AVX modes.  */
+  BOOL_BITFIELD callee_pass_avx256_p : 1;
+
+  /* Nonzero if the current callee returns 256bit AVX modes.  */
+  BOOL_BITFIELD callee_return_avx256_p : 1;
+
   /* During prologue/epilogue generation, the current frame state.
      Otherwise, the frame state at the end of the prologue.  */
   struct machine_frame_state fs;
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 9c1fe1f..28a921f 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -256,6 +256,11 @@  mcld
 Target Report Mask(CLD) Save
 Generate cld instruction in the function prologue.
 
+mvzeroupper
+Target Report Mask(VZEROUPPER) Save
+Generate vzeroupper instruction before a transfer of control flow out of
+the function.
+
 mfused-madd
 Target Report Mask(FUSED_MADD) Save
 Enable automatic generation of fused floating point multiply-add instructions
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ffddf18..078fac6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -11508,29 +11508,11 @@ 
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])
 
-;; vzeroupper clobbers the upper 128bits of AVX registers.
-(define_expand "avx_vzeroupper"
-  [(match_par_dup 0 [(const_int 0)])]
-  "TARGET_AVX"
-{
-  int nregs = TARGET_64BIT ? 16 : 8;
-  int regno;
-
-  operands[0] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + 1));
-
-  XVECEXP (operands[0], 0, 0)
-    = gen_rtx_UNSPEC_VOLATILE (VOIDmode, gen_rtvec (1, const0_rtx),
-			       UNSPECV_VZEROUPPER);
-
-  for (regno = 0; regno < nregs; regno++)
-    XVECEXP (operands[0], 0, regno + 1)
-      = gen_rtx_CLOBBER (VOIDmode,
-			 gen_rtx_REG (V8SImode, SSE_REGNO (regno)));
-})
-
-(define_insn "*avx_vzeroupper"
-  [(match_parallel 0 "vzeroupper_operation"
-    [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
+;; Clear the upper 128bits of AVX registers, equivalent to a NOP
+;; if the upper 128bits are unused.
+(define_insn "avx_vzeroupper"
+  [(unspec_volatile [(match_operand 0 "const_int_operand" "")]
+		    UNSPECV_VZEROUPPER)]
   "TARGET_AVX"
   "vzeroupper"
   [(set_attr "type" "sse")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 7ea042f..365b8c3 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -594,7 +594,7 @@  Objective-C and Objective-C++ Dialects}.
 -mno-wide-multiply  -mrtd  -malign-double @gol
 -mpreferred-stack-boundary=@var{num}
 -mincoming-stack-boundary=@var{num} @gol
--mcld -mcx16 -msahf -mmovbe -mcrc32 -mrecip @gol
+-mcld -mcx16 -msahf -mmovbe -mcrc32 -mrecip -mvzeroupper @gol
 -mmmx  -msse  -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol
 -maes -mpclmul -mfsgsbase -mrdrnd -mf16c -mfused-madd @gol
 -msse4a -m3dnow -mpopcnt -mabm -mfma4 -mxop -mlwp @gol
@@ -12466,6 +12466,13 @@  GCC with the @option{--enable-cld} configure option.  Generation of @code{cld}
 instructions can be suppressed with the @option{-mno-cld} compiler option
 in this case.
 
+@item -mvzeroupper
+@opindex mvzeroupper
+This option instructs GCC to emit a @code{vzeroupper} instruction
+before a transfer of control flow out of the function to minimize
+AVX to SSE transition penalty as well as remove unnecessary zeroupper 
+intrinsics.
+
 @item -mcx16
 @opindex mcx16
 This option will enable GCC to use CMPXCHG16B instruction in generated code.
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-1.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-1.c
index 2137c25..73ce795 100644
--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-1.c
@@ -1,6 +1,6 @@ 
 /* { dg-do run } */
 /* { dg-require-effective-target avx } */
-/* { dg-options "-O2 -mavx" } */
+/* { dg-options "-O2 -mavx -mtune=generic" } */
 
 #include "avx-check.h"
 
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-10.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-10.c
new file mode 100644
index 0000000..5007753
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-10.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern float x, y;
+
+void
+foo ()
+{
+  x = y;
+  _mm256_zeroupper ();
+  _mm256_zeroupper ();
+  _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-11.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-11.c
new file mode 100644
index 0000000..507f945
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-11.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern float x, y;
+
+void
+foo ()
+{
+  x = y;
+  _mm256_zeroall ();
+  _mm256_zeroupper ();
+  _mm256_zeroupper ();
+  _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
new file mode 100644
index 0000000..f74ea0c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+  _mm256_zeroall ();
+  _mm256_zeroupper ();
+  x = y;
+  _mm256_zeroupper ();
+  _mm256_zeroupper ();
+  _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
+/* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-13.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-13.c
new file mode 100644
index 0000000..cff5f88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-13.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mno-vzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+  x = y;
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c
new file mode 100644
index 0000000..e74bc24
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mtune=generic -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+  x = y;
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-2.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-2.c
index 9771e6c..66df90f 100644
--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-2.c
@@ -1,6 +1,6 @@ 
 /* { dg-do run } */
 /* { dg-require-effective-target avx } */
-/* { dg-options "-O2 -mavx" } */
+/* { dg-options "-O2 -mavx -mtune=generic" } */
 
 #include "avx-check.h"
 
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-3.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-3.c
new file mode 100644
index 0000000..8053d78
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-3.c
@@ -0,0 +1,34 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O2 -mavx -mvzeroupper" } */
+
+#include "avx-check.h"
+
+int s[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+int d[8] = {11, 22, 33, 44, 55, 66, 77, 88};
+
+void
+__attribute__((noinline))
+foo ()
+{
+  int i;
+  for (i = 0; i < ARRAY_SIZE (d); i++)
+    d[i] = s[i] + 0x1000;
+}
+
+static void
+__attribute__((noinline))
+bar (__m256i src)
+{
+  foo ();
+  _mm256_storeu_si256 ((__m256i*) d, src);
+  if (__builtin_memcmp (d, s, sizeof (d)))
+    abort ();
+}
+
+static void
+avx_test (void)
+{
+  __m256i src = _mm256_loadu_si256 ((__m256i*) s);
+  bar (src);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-4.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-4.c
new file mode 100644
index 0000000..c55c814
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-4.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+typedef float __m256 __attribute__ ((__vector_size__ (32), __may_alias__));
+
+extern void bar2 (__m256);
+extern __m256 y;
+
+void
+foo ()
+{
+  bar2 (y);
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-5.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-5.c
new file mode 100644
index 0000000..a14460c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-5.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern void bar2 (__m256);
+extern __m256 y;
+
+void
+foo ()
+{
+  bar2 (y);
+  _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-6.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-6.c
new file mode 100644
index 0000000..ada87bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-6.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+  x = y;
+  _mm256_zeroall ();
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
new file mode 100644
index 0000000..ab6d687
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+  x = y;
+  _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-8.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-8.c
new file mode 100644
index 0000000..0a821c2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-8.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+  x = y;
+  _mm256_zeroall ();
+  _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
new file mode 100644
index 0000000..5aa05b8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+  _mm256_zeroupper ();
+  x = y;
+  _mm256_zeroupper ();
+  _mm256_zeroupper ();
+  _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */