diff mbox

[RFC,PR,target/65105] Use vector instructions for scalar 64bit computations on 32bit target

Message ID 20150923101936.GB26618@msticlxl57.ims.intel.com
State New
Headers show

Commit Message

Ilya Enkovich Sept. 23, 2015, 10:19 a.m. UTC
On 14 Sep 17:50, Uros Bizjak wrote:
> 
> +(define_insn_and_split "*zext<mode>_doubleword"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> + (zero_extend:DI (match_operand:SWI24 1 "nonimmediate_operand" "rm")))]
> +  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
> +  "#"
> +  "&& reload_completed && GENERAL_REG_P (operands[0])"
> +  [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
> +   (set (match_dup 2) (const_int 0))]
> +  "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
> +
> +(define_insn_and_split "*zextqi_doubleword"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> + (zero_extend:DI (match_operand:QI 1 "nonimmediate_operand" "qm")))]
> +  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
> +  "#"
> +  "&& reload_completed && GENERAL_REG_P (operands[0])"
> +  [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
> +   (set (match_dup 2) (const_int 0))]
> +  "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
> +
> 
> Please put the above patterns together with other zero_extend
> patterns. You can also merge these two patterns using SWI124 mode
> iterator with <r> mode attribute as a register constraint. Also, no
> need to check for GENERAL_REG_P after reload, when "r" constraint is
> in effect:
> 
> (define_insn_and_split "*zext<mode>_doubleword"
>   [(set (match_operand:DI 0 "register_operand" "=r")
>  (zero_extend:DI (match_operand:SWI124 1 "nonimmediate_operand" "<r>m")))]
>   "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
>   "#"
>   "&& reload_completed"
>   [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
>    (set (match_dup 2) (const_int 0))]
>   "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")

The register constraint doesn't affect the split, and I need GENERAL_REG_P to filter out the case where other (non-general) registers are used.

I merged QI and HI cases of zext but made a separate pattern for SI case because it doesn't need zero_extend in resulting code.  Bootstrapped and regtested for x86_64-unknown-linux-gnu.

Thanks,
Ilya
--
gcc/

2015-09-23  Ilya Enkovich  <enkovich.gnu@gmail.com>

	* config/i386/i386.c: Include dbgcnt.h.
	(has_non_address_hard_reg): New.
	(convertible_comparison_p): New.
	(scalar_to_vector_candidate_p): New.
	(remove_non_convertible_regs): New.
	(scalar_chain): New.
	(scalar_chain::scalar_chain): New.
	(scalar_chain::~scalar_chain): New.
	(scalar_chain::add_to_queue): New.
	(scalar_chain::mark_dual_mode_def): New.
	(scalar_chain::analyze_register_chain): New.
	(scalar_chain::add_insn): New.
	(scalar_chain::build): New.
	(scalar_chain::compute_convert_gain): New.
	(scalar_chain::replace_with_subreg): New.
	(scalar_chain::replace_with_subreg_in_insn): New.
	(scalar_chain::emit_conversion_insns): New.
	(scalar_chain::make_vector_copies): New.
	(scalar_chain::convert_reg): New.
	(scalar_chain::convert_op): New.
	(scalar_chain::convert_insn): New.
	(scalar_chain::convert): New.
	(convert_scalars_to_vector): New.
	(pass_data_stv): New.
	(pass_stv): New.
	(make_pass_stv): New.
	(ix86_option_override): Create and register the stv pass.
	(flag_opts): Add -mstv.
	(ix86_option_override_internal): Likewise.
	* config/i386/i386.md (SWIM1248x): New.
	(*movdi_internal): Add xmm to mem alternative for TARGET_STV.
	(and<mode>3): Use SWIM1248x iterator instead of SWIM.
	(*anddi3_doubleword): New.
	(*zext<mode>_doubleword): New.
	(*zextsi_doubleword): New.
	(<code><mode>3): Use SWIM1248x iterator instead of SWIM.
	(*<code>di3_doubleword): New.
	* config/i386/i386.opt (mstv): New.
	* dbgcnt.def (stv_conversion): New.

gcc/testsuite/

2015-09-23  Ilya Enkovich  <enkovich.gnu@gmail.com>

	* gcc.target/i386/pr65105-1.c: New.
	* gcc.target/i386/pr65105-2.c: New.
	* gcc.target/i386/pr65105-3.c: New.
	* gcc.target/i386/pr65105-4.C: New.
	* gcc.dg/lower-subreg-1.c: Add -mno-stv options for ia32.

Comments

Uros Bizjak Sept. 23, 2015, 10:29 a.m. UTC | #1
On Wed, Sep 23, 2015 at 12:19 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
> On 14 Sep 17:50, Uros Bizjak wrote:
>>
>> +(define_insn_and_split "*zext<mode>_doubleword"
>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>> + (zero_extend:DI (match_operand:SWI24 1 "nonimmediate_operand" "rm")))]
>> +  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
>> +  "#"
>> +  "&& reload_completed && GENERAL_REG_P (operands[0])"
>> +  [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
>> +   (set (match_dup 2) (const_int 0))]
>> +  "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
>> +
>> +(define_insn_and_split "*zextqi_doubleword"
>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>> + (zero_extend:DI (match_operand:QI 1 "nonimmediate_operand" "qm")))]
>> +  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
>> +  "#"
>> +  "&& reload_completed && GENERAL_REG_P (operands[0])"
>> +  [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
>> +   (set (match_dup 2) (const_int 0))]
>> +  "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
>> +
>>
>> Please put the above patterns together with other zero_extend
>> patterns. You can also merge these two patterns using SWI124 mode
>> iterator with <r> mode attribute as a register constraint. Also, no
>> need to check for GENERAL_REG_P after reload, when "r" constraint is
>> in effect:
>>
>> (define_insn_and_split "*zext<mode>_doubleword"
>>   [(set (match_operand:DI 0 "register_operand" "=r")
>>  (zero_extend:DI (match_operand:SWI124 1 "nonimmediate_operand" "<r>m")))]
>>   "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
>>   "#"
>>   "&& reload_completed"
>>   [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
>>    (set (match_dup 2) (const_int 0))]
>>   "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
>
> Register constraint doesn't affect split and I need GENERAL_REG_P to filter other registers case.

OK.

> I merged QI and HI cases of zext but made a separate pattern for SI case because it doesn't need zero_extend in resulting code.  Bootstrapped and regtested for x86_64-unknown-linux-gnu.

This change is OK.

The patch LGTM, but please wait a couple of days if Jeff has some
comment on algorithmic aspect of the patch.

Thanks,
Uros.

>
> Thanks,
> Ilya
> --
> gcc/
>
> 2015-09-23  Ilya Enkovich  <enkovich.gnu@gmail.com>
>
>         * config/i386/i386.c: Include dbgcnt.h.
>         (has_non_address_hard_reg): New.
>         (convertible_comparison_p): New.
>         (scalar_to_vector_candidate_p): New.
>         (remove_non_convertible_regs): New.
>         (scalar_chain): New.
>         (scalar_chain::scalar_chain): New.
>         (scalar_chain::~scalar_chain): New.
>         (scalar_chain::add_to_queue): New.
>         (scalar_chain::mark_dual_mode_def): New.
>         (scalar_chain::analyze_register_chain): New.
>         (scalar_chain::add_insn): New.
>         (scalar_chain::build): New.
>         (scalar_chain::compute_convert_gain): New.
>         (scalar_chain::replace_with_subreg): New.
>         (scalar_chain::replace_with_subreg_in_insn): New.
>         (scalar_chain::emit_conversion_insns): New.
>         (scalar_chain::make_vector_copies): New.
>         (scalar_chain::convert_reg): New.
>         (scalar_chain::convert_op): New.
>         (scalar_chain::convert_insn): New.
>         (scalar_chain::convert): New.
>         (convert_scalars_to_vector): New.
>         (pass_data_stv): New.
>         (pass_stv): New.
>         (make_pass_stv): New.
>         (ix86_option_override): Created and register stv pass.
>         (flag_opts): Add -mstv.
>         (ix86_option_override_internal): Likewise.
>         * config/i386/i386.md (SWIM1248x): New.
>         (*movdi_internal): Add xmm to mem alternative for TARGET_STV.
>         (and<mode>3): Use SWIM1248x iterator instead of SWIM.
>         (*anddi3_doubleword): New.
>         (*zext<mode>_doubleword): New.
>         (*zextsi_doubleword): New.
>         (<code><mode>3): Use SWIM1248x iterator instead of SWIM.
>         (*<code>di3_doubleword): New.
>         * config/i386/i386.opt (mstv): New.
>         * dbgcnt.def (stv_conversion): New.
>
> gcc/testsuite/
>
> 2015-09-23  Ilya Enkovich  <enkovich.gnu@gmail.com>
>
>         * gcc.target/i386/pr65105-1.c: New.
>         * gcc.target/i386/pr65105-2.c: New.
>         * gcc.target/i386/pr65105-3.c: New.
>         * gcc.target/i386/pr65105-4.C: New.
>         * gcc.dg/lower-subreg-1.c: Add -mno-stv options for ia32.
>
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index d547cfd..2663f85 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -87,6 +87,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-iterator.h"
>  #include "tree-chkp.h"
>  #include "rtl-chkp.h"
> +#include "dbgcnt.h"
>
>  /* This file should be included last.  */
>  #include "target-def.h"
> @@ -2600,6 +2601,908 @@ rest_of_handle_insert_vzeroupper (void)
>    return 0;
>  }
>
> +/* Return 1 if INSN uses or defines a hard register.
> +   Hard register uses in a memory address are ignored.
> +   Clobbers and flags definitions are ignored.  */
> +
> +static bool
> +has_non_address_hard_reg (rtx_insn *insn)
> +{
> +  df_ref ref;
> +  FOR_EACH_INSN_DEF (ref, insn)
> +    if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
> +       && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
> +       && DF_REF_REGNO (ref) != FLAGS_REG)
> +      return true;
> +
> +  FOR_EACH_INSN_USE (ref, insn)
> +    if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
> +      return true;
> +
> +  return false;
> +}
> +
> +/* Check if comparison INSN may be transformed
> +   into vector comparison.  Currently we transform
> +   zero checks only which look like:
> +
> +   (set (reg:CCZ 17 flags)
> +        (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
> +                             (subreg:SI (reg:DI x) 0))
> +                    (const_int 0 [0])))  */
> +
> +static bool
> +convertible_comparison_p (rtx_insn *insn)
> +{
> +  if (!TARGET_SSE4_1)
> +    return false;
> +
> +  rtx def_set = single_set (insn);
> +
> +  gcc_assert (def_set);
> +
> +  rtx src = SET_SRC (def_set);
> +  rtx dst = SET_DEST (def_set);
> +
> +  gcc_assert (GET_CODE (src) == COMPARE);
> +
> +  if (GET_CODE (dst) != REG
> +      || REGNO (dst) != FLAGS_REG
> +      || GET_MODE (dst) != CCZmode)
> +    return false;
> +
> +  rtx op1 = XEXP (src, 0);
> +  rtx op2 = XEXP (src, 1);
> +
> +  if (op2 != CONST0_RTX (GET_MODE (op2)))
> +    return false;
> +
> +  if (GET_CODE (op1) != IOR)
> +    return false;
> +
> +  op2 = XEXP (op1, 1);
> +  op1 = XEXP (op1, 0);
> +
> +  if (!SUBREG_P (op1)
> +      || !SUBREG_P (op2)
> +      || GET_MODE (op1) != SImode
> +      || GET_MODE (op2) != SImode
> +      || ((SUBREG_BYTE (op1) != 0
> +          || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
> +         && (SUBREG_BYTE (op2) != 0
> +             || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
> +    return false;
> +
> +  op1 = SUBREG_REG (op1);
> +  op2 = SUBREG_REG (op2);
> +
> +  if (op1 != op2
> +      || !REG_P (op1)
> +      || GET_MODE (op1) != DImode)
> +    return false;
> +
> +  return true;
> +}
> +
> +/* Return 1 if INSN may be converted into vector
> +   instruction.  */
> +
> +static bool
> +scalar_to_vector_candidate_p (rtx_insn *insn)
> +{
> +  rtx def_set = single_set (insn);
> +
> +  if (!def_set)
> +    return false;
> +
> +  if (has_non_address_hard_reg (insn))
> +    return false;
> +
> +  rtx src = SET_SRC (def_set);
> +  rtx dst = SET_DEST (def_set);
> +
> +  if (GET_CODE (src) == COMPARE)
> +    return convertible_comparison_p (insn);
> +
> +  /* We are interested in DImode promotion only.  */
> +  if (GET_MODE (src) != DImode
> +      || GET_MODE (dst) != DImode)
> +    return false;
> +
> +  if (!REG_P (dst) && !MEM_P (dst))
> +    return false;
> +
> +  switch (GET_CODE (src))
> +    {
> +    case PLUS:
> +    case MINUS:
> +    case IOR:
> +    case XOR:
> +    case AND:
> +      break;
> +
> +    case REG:
> +      return true;
> +
> +    case MEM:
> +      return REG_P (dst);
> +
> +    default:
> +      return false;
> +    }
> +
> +  if (!REG_P (XEXP (src, 0)) && !MEM_P (XEXP (src, 0)))
> +      return false;
> +
> +  if (!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
> +      return false;
> +
> +  if (GET_MODE (XEXP (src, 0)) != DImode
> +      || GET_MODE (XEXP (src, 1)) != DImode)
> +    return false;
> +
> +  return true;
> +}
> +
> +/* For a given bitmap of insn UIDs scans all instruction and
> +   remove insn from CANDIDATES in case it has both convertible
> +   and not convertible definitions.
> +
> +   All insns in a bitmap are conversion candidates according to
> +   scalar_to_vector_candidate_p.  Currently it implies all insns
> +   are single_set.  */
> +
> +static void
> +remove_non_convertible_regs (bitmap candidates)
> +{
> +  bitmap_iterator bi;
> +  unsigned id;
> +  bitmap regs = BITMAP_ALLOC (NULL);
> +
> +  EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
> +    {
> +      rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
> +      rtx reg = SET_DEST (def_set);
> +
> +      if (!REG_P (reg)
> +         || bitmap_bit_p (regs, REGNO (reg))
> +         || HARD_REGISTER_P (reg))
> +       continue;
> +
> +      for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
> +          def;
> +          def = DF_REF_NEXT_REG (def))
> +       {
> +         if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
> +           {
> +             if (dump_file)
> +               fprintf (dump_file,
> +                        "r%d has non convertible definition in insn %d\n",
> +                        REGNO (reg), DF_REF_INSN_UID (def));
> +
> +             bitmap_set_bit (regs, REGNO (reg));
> +             break;
> +           }
> +       }
> +    }
> +
> +  EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
> +    {
> +      for (df_ref def = DF_REG_DEF_CHAIN (id);
> +          def;
> +          def = DF_REF_NEXT_REG (def))
> +       if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
> +         {
> +           if (dump_file)
> +             fprintf (dump_file, "Removing insn %d from candidates list\n",
> +                      DF_REF_INSN_UID (def));
> +
> +           bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
> +         }
> +    }
> +
> +  BITMAP_FREE (regs);
> +}
> +
> +class scalar_chain
> +{
> + public:
> +  scalar_chain ();
> +  ~scalar_chain ();
> +
> +  static unsigned max_id;
> +
> +  /* ID of a chain.  */
> +  unsigned int chain_id;
> +  /* A queue of instructions to be included into a chain.  */
> +  bitmap queue;
> +  /* Instructions included into a chain.  */
> +  bitmap insns;
> +  /* All registers defined by a chain.  */
> +  bitmap defs;
> +  /* Registers used in both vector and sclar modes.  */
> +  bitmap defs_conv;
> +
> +  void build (bitmap candidates, unsigned insn_uid);
> +  int compute_convert_gain ();
> +  int convert ();
> +
> + private:
> +  void add_insn (bitmap candidates, unsigned insn_uid);
> +  void add_to_queue (unsigned insn_uid);
> +  void mark_dual_mode_def (df_ref def);
> +  void analyze_register_chain (bitmap candidates, df_ref ref);
> +  rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
> +  void emit_conversion_insns (rtx insns, rtx_insn *pos);
> +  void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
> +  void convert_insn (rtx_insn *insn);
> +  void convert_op (rtx *op, rtx_insn *insn);
> +  void convert_reg (unsigned regno);
> +  void make_vector_copies (unsigned regno);
> +};
> +
> +unsigned scalar_chain::max_id = 0;
> +
> +/* Initialize new chain.  */
> +
> +scalar_chain::scalar_chain ()
> +{
> +  chain_id = ++max_id;
> +
> +   if (dump_file)
> +    fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
> +
> +  bitmap_obstack_initialize (NULL);
> +  insns = BITMAP_ALLOC (NULL);
> +  defs = BITMAP_ALLOC (NULL);
> +  defs_conv = BITMAP_ALLOC (NULL);
> +  queue = NULL;
> +}
> +
> +/* Free chain's data.  */
> +
> +scalar_chain::~scalar_chain ()
> +{
> +  BITMAP_FREE (insns);
> +  BITMAP_FREE (defs);
> +  BITMAP_FREE (defs_conv);
> +  bitmap_obstack_release (NULL);
> +}
> +
> +/* Add instruction into chains' queue.  */
> +
> +void
> +scalar_chain::add_to_queue (unsigned insn_uid)
> +{
> +  if (bitmap_bit_p (insns, insn_uid)
> +      || bitmap_bit_p (queue, insn_uid))
> +    return;
> +
> +  if (dump_file)
> +    fprintf (dump_file, "  Adding insn %d into chain's #%d queue\n",
> +            insn_uid, chain_id);
> +  bitmap_set_bit (queue, insn_uid);
> +}
> +
> +/* Mark register defined by DEF as requiring conversion.  */
> +
> +void
> +scalar_chain::mark_dual_mode_def (df_ref def)
> +{
> +  gcc_assert (DF_REF_REG_DEF_P (def));
> +
> +  if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
> +    return;
> +
> +  if (dump_file)
> +    fprintf (dump_file,
> +            "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
> +            DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
> +
> +  bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
> +}
> +
> +/* Check REF's chain to add new insns into a queue
> +   and find registers requiring conversion.  */
> +
> +void
> +scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
> +{
> +  df_link *chain;
> +
> +  gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
> +             || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
> +  add_to_queue (DF_REF_INSN_UID (ref));
> +
> +  for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
> +    {
> +      unsigned uid = DF_REF_INSN_UID (chain->ref);
> +      if (!DF_REF_REG_MEM_P (chain->ref))
> +       {
> +         if (bitmap_bit_p (insns, uid))
> +           continue;
> +
> +         if (bitmap_bit_p (candidates, uid))
> +           {
> +             add_to_queue (uid);
> +             continue;
> +           }
> +       }
> +
> +      if (DF_REF_REG_DEF_P (chain->ref))
> +       {
> +         if (dump_file)
> +           fprintf (dump_file, "  r%d def in insn %d isn't convertible\n",
> +                    DF_REF_REGNO (chain->ref), uid);
> +         mark_dual_mode_def (chain->ref);
> +       }
> +      else
> +       {
> +         if (dump_file)
> +           fprintf (dump_file, "  r%d use in insn %d isn't convertible\n",
> +                    DF_REF_REGNO (chain->ref), uid);
> +         mark_dual_mode_def (ref);
> +       }
> +    }
> +}
> +
> +/* Add instruction into a chain.  */
> +
> +void
> +scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
> +{
> +  if (bitmap_bit_p (insns, insn_uid))
> +    return;
> +
> +  if (dump_file)
> +    fprintf (dump_file, "  Adding insn %d to chain #%d\n", insn_uid, chain_id);
> +
> +  bitmap_set_bit (insns, insn_uid);
> +
> +  rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
> +  rtx def_set = single_set (insn);
> +  if (def_set && REG_P (SET_DEST (def_set))
> +      && !HARD_REGISTER_P (SET_DEST (def_set)))
> +    bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
> +
> +  df_ref ref;
> +  df_ref def;
> +  for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
> +    if (!HARD_REGISTER_P (DF_REF_REG (ref)))
> +      for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
> +          def;
> +          def = DF_REF_NEXT_REG (def))
> +       analyze_register_chain (candidates, def);
> +  for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
> +    if (!DF_REF_REG_MEM_P (ref))
> +      analyze_register_chain (candidates, ref);
> +}
> +
> +/* Build new chain starting from insn INSN_UID recursively
> +   adding all dependent uses and definitions.  */
> +
> +void
> +scalar_chain::build (bitmap candidates, unsigned insn_uid)
> +{
> +  queue = BITMAP_ALLOC (NULL);
> +  bitmap_set_bit (queue, insn_uid);
> +
> +  if (dump_file)
> +    fprintf (dump_file, "Building chain #%d...\n", chain_id);
> +
> +  while (!bitmap_empty_p (queue))
> +    {
> +      insn_uid = bitmap_first_set_bit (queue);
> +      bitmap_clear_bit (queue, insn_uid);
> +      bitmap_clear_bit (candidates, insn_uid);
> +      add_insn (candidates, insn_uid);
> +    }
> +
> +  if (dump_file)
> +    {
> +      fprintf (dump_file, "Collected chain #%d...\n", chain_id);
> +      fprintf (dump_file, "  insns: ");
> +      dump_bitmap (dump_file, insns);
> +      if (!bitmap_empty_p (defs_conv))
> +       {
> +         bitmap_iterator bi;
> +         unsigned id;
> +         const char *comma = "";
> +         fprintf (dump_file, "  defs to convert: ");
> +         EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
> +           {
> +             fprintf (dump_file, "%sr%d", comma, id);
> +             comma = ", ";
> +           }
> +         fprintf (dump_file, "\n");
> +       }
> +    }
> +
> +  BITMAP_FREE (queue);
> +}
> +
> +/* Compute a gain for chain conversion.  */
> +
> +int
> +scalar_chain::compute_convert_gain ()
> +{
> +  bitmap_iterator bi;
> +  unsigned insn_uid;
> +  int gain = 0;
> +  int cost = 0;
> +
> +  if (dump_file)
> +    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
> +
> +  EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
> +    {
> +      rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
> +      rtx def_set = single_set (insn);
> +      rtx src = SET_SRC (def_set);
> +      rtx dst = SET_DEST (def_set);
> +
> +      if (REG_P (src) && REG_P (dst))
> +       gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
> +      else if (REG_P (src) && MEM_P (dst))
> +       gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
> +      else if (MEM_P (src) && REG_P (dst))
> +       gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
> +      else if (GET_CODE (src) == PLUS
> +              || GET_CODE (src) == MINUS
> +              || GET_CODE (src) == IOR
> +              || GET_CODE (src) == XOR
> +              || GET_CODE (src) == AND)
> +       gain += ix86_cost->add;
> +      else if (GET_CODE (src) == COMPARE)
> +       {
> +         /* Assume comparison cost is the same.  */
> +       }
> +      else
> +       gcc_unreachable ();
> +    }
> +
> +  if (dump_file)
> +    fprintf (dump_file, "  Instruction convertion gain: %d\n", gain);
> +
> +  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
> +    cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
> +
> +  if (dump_file)
> +    fprintf (dump_file, "  Registers convertion cost: %d\n", cost);
> +
> +  gain -= cost;
> +
> +  if (dump_file)
> +    fprintf (dump_file, "  Total gain: %d\n", gain);
> +
> +  return gain;
> +}
> +
> +/* Replace REG in X with a V2DI subreg of NEW_REG.  */
> +
> +rtx
> +scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
> +{
> +  if (x == reg)
> +    return gen_rtx_SUBREG (V2DImode, new_reg, 0);
> +
> +  const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
> +  int i, j;
> +  for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
> +    {
> +      if (fmt[i] == 'e')
> +       XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
> +      else if (fmt[i] == 'E')
> +       for (j = XVECLEN (x, i) - 1; j >= 0; j--)
> +         XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
> +                                                  reg, new_reg);
> +    }
> +
> +  return x;
> +}
> +
> +/* Replace REG in INSN with a V2DI subreg of NEW_REG.  */
> +
> +void
> +scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx new_reg)
> +{
> +  replace_with_subreg (single_set (insn), reg, new_reg);
> +}
> +
> +/* Insert generated conversion instruction sequence INSNS
> +   after instruction AFTER.  New BB may be required in case
> +   instruction has EH region attached.  */
> +
> +void
> +scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
> +{
> +  if (!control_flow_insn_p (after))
> +    {
> +      emit_insn_after (insns, after);
> +      return;
> +    }
> +
> +  basic_block bb = BLOCK_FOR_INSN (after);
> +  edge e = find_fallthru_edge (bb->succs);
> +  gcc_assert (e);
> +
> +  basic_block new_bb = split_edge (e);
> +  emit_insn_after (insns, BB_HEAD (new_bb));
> +}
> +
> +/* Make vector copies for all register REGNO definitions
> +   and replace its uses in a chain.  */
> +
> +void
> +scalar_chain::make_vector_copies (unsigned regno)
> +{
> +  rtx reg = regno_reg_rtx[regno];
> +  rtx vreg = gen_reg_rtx (DImode);
> +  df_ref ref;
> +
> +  for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
> +    if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
> +      {
> +       rtx_insn *insn = DF_REF_INSN (ref);
> +
> +       start_sequence ();
> +       if (TARGET_SSE4_1)
> +         {
> +           emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
> +                                       CONST0_RTX (V4SImode),
> +                                       gen_rtx_SUBREG (SImode, reg, 0)));
> +           emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
> +                                         gen_rtx_SUBREG (V4SImode, vreg, 0),
> +                                         gen_rtx_SUBREG (SImode, reg, 4),
> +                                         GEN_INT (2)));
> +         }
> +       else if (TARGET_INTER_UNIT_MOVES_TO_VEC)
> +         {
> +           rtx tmp = gen_reg_rtx (DImode);
> +           emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
> +                                       CONST0_RTX (V4SImode),
> +                                       gen_rtx_SUBREG (SImode, reg, 0)));
> +           emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
> +                                       CONST0_RTX (V4SImode),
> +                                       gen_rtx_SUBREG (SImode, reg, 4)));
> +           emit_insn (gen_vec_interleave_lowv4si
> +                      (gen_rtx_SUBREG (V4SImode, vreg, 0),
> +                       gen_rtx_SUBREG (V4SImode, vreg, 0),
> +                       gen_rtx_SUBREG (V4SImode, tmp, 0)));
> +         }
> +       else
> +         {
> +           rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP);
> +           emit_move_insn (adjust_address (tmp, SImode, 0),
> +                           gen_rtx_SUBREG (SImode, reg, 0));
> +           emit_move_insn (adjust_address (tmp, SImode, 4),
> +                           gen_rtx_SUBREG (SImode, reg, 4));
> +           emit_move_insn (vreg, tmp);
> +         }
> +       emit_conversion_insns (get_insns (), insn);
> +       end_sequence ();
> +
> +       if (dump_file)
> +         fprintf (dump_file,
> +                  "  Copied r%d to a vector register r%d for insn %d\n",
> +                  regno, REGNO (vreg), DF_REF_INSN_UID (ref));
> +      }
> +
> +  for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
> +    if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
> +      {
> +       replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, vreg);
> +
> +       if (dump_file)
> +         fprintf (dump_file, "  Replaced r%d with r%d in insn %d\n",
> +                  regno, REGNO (vreg), DF_REF_INSN_UID (ref));
> +      }
> +}
> +
> +/* Convert all definitions of register REGNO
> +   and fix its uses.  Scalar copies may be created
> +   in case register is used in not convertible insn.  */
> +
> +void
> +scalar_chain::convert_reg (unsigned regno)
> +{
> +  bool scalar_copy = bitmap_bit_p (defs_conv, regno);
> +  rtx reg = regno_reg_rtx[regno];
> +  rtx scopy = NULL_RTX;
> +  df_ref ref;
> +  bitmap conv;
> +
> +  conv = BITMAP_ALLOC (NULL);
> +  bitmap_copy (conv, insns);
> +
> +  if (scalar_copy)
> +    scopy = gen_reg_rtx (DImode);
> +
> +  for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
> +    {
> +      rtx_insn *insn = DF_REF_INSN (ref);
> +      rtx def_set = single_set (insn);
> +      rtx src = SET_SRC (def_set);
> +      rtx reg = DF_REF_REG (ref);
> +
> +      if (!MEM_P (src))
> +       {
> +         replace_with_subreg_in_insn (insn, reg, reg);
> +         bitmap_clear_bit (conv, INSN_UID (insn));
> +       }
> +
> +      if (scalar_copy)
> +       {
> +         rtx vcopy = gen_reg_rtx (V2DImode);
> +
> +         start_sequence ();
> +         if (TARGET_INTER_UNIT_MOVES_FROM_VEC)
> +           {
> +             emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
> +             emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
> +                             gen_rtx_SUBREG (SImode, vcopy, 0));
> +             emit_move_insn (vcopy,
> +                             gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
> +             emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
> +                             gen_rtx_SUBREG (SImode, vcopy, 0));
> +           }
> +         else
> +           {
> +             rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP);
> +             emit_move_insn (tmp, reg);
> +             emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
> +                             adjust_address (tmp, SImode, 0));
> +             emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
> +                             adjust_address (tmp, SImode, 4));
> +           }
> +         emit_conversion_insns (get_insns (), insn);
> +         end_sequence ();
> +
> +         if (dump_file)
> +           fprintf (dump_file,
> +                    "  Copied r%d to a scalar register r%d for insn %d\n",
> +                    regno, REGNO (scopy), INSN_UID (insn));
> +       }
> +    }
> +
> +  for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
> +    if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
> +      {
> +       if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
> +         {
> +           rtx def_set = single_set (DF_REF_INSN (ref));
> +           if (!MEM_P (SET_DEST (def_set))
> +               || !REG_P (SET_SRC (def_set)))
> +             replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, reg);
> +           bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
> +         }
> +      }
> +    else
> +      {
> +       replace_rtx (DF_REF_INSN (ref), reg, scopy);
> +       df_insn_rescan (DF_REF_INSN (ref));
> +      }
> +
> +  BITMAP_FREE (conv);
> +}
> +
> +/* Convert operand OP in INSN.  All register uses
> +   are converted during registers conversion.
> +   Therefore we should just handle memory operands.  */
> +
> +void
> +scalar_chain::convert_op (rtx *op, rtx_insn *insn)
> +{
> +  *op = copy_rtx_if_shared (*op);
> +
> +  if (MEM_P (*op))
> +    {
> +      rtx tmp = gen_reg_rtx (DImode);
> +
> +      emit_insn_before (gen_move_insn (tmp, *op), insn);
> +      *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
> +
> +      if (dump_file)
> +       fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
> +                INSN_UID (insn), REGNO (tmp));
> +    }
> +  else
> +    {
> +      gcc_assert (SUBREG_P (*op));
> +      gcc_assert (GET_MODE (*op) == V2DImode);
> +    }
> +}
> +
> +/* Convert INSN to vector mode.  */
> +
> +void
> +scalar_chain::convert_insn (rtx_insn *insn)
> +{
> +  rtx def_set = single_set (insn);
> +  rtx src = SET_SRC (def_set);
> +  rtx dst = SET_DEST (def_set);
> +  rtx subreg;
> +
> +  if (MEM_P (dst) && !REG_P (src))
> +    {
> +      /* There are no scalar integer instructions and therefore
> +        temporary register usage is required.  */
> +      rtx tmp = gen_reg_rtx (DImode);
> +      emit_conversion_insns (gen_move_insn (dst, tmp), insn);
> +      dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
> +    }
> +
> +  switch (GET_CODE (src))
> +    {
> +    case PLUS:
> +    case MINUS:
> +    case IOR:
> +    case XOR:
> +    case AND:
> +      convert_op (&XEXP (src, 0), insn);
> +      convert_op (&XEXP (src, 1), insn);
> +      PUT_MODE (src, V2DImode);
> +      break;
> +
> +    case MEM:
> +      if (!REG_P (dst))
> +       convert_op (&src, insn);
> +      break;
> +
> +    case REG:
> +      break;
> +
> +    case SUBREG:
> +      gcc_assert (GET_MODE (src) == V2DImode);
> +      break;
> +
> +    case COMPARE:
> +      src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
> +
> +      gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
> +                 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
> +
> +      if (REG_P (src))
> +       subreg = gen_rtx_SUBREG (V2DImode, src, 0);
> +      else
> +       subreg = copy_rtx_if_shared (src);
> +      emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
> +                                                   copy_rtx_if_shared (subreg),
> +                                                   copy_rtx_if_shared (subreg)),
> +                       insn);
> +      dst = gen_rtx_REG (CCmode, FLAGS_REG);
> +      src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
> +                                              copy_rtx_if_shared (src)),
> +                           UNSPEC_PTEST);
> +      break;
> +
> +    default:
> +      gcc_unreachable ();
> +    }
> +
> +  SET_SRC (def_set) = src;
> +  SET_DEST (def_set) = dst;
> +
> +  /* Drop possible dead definitions.  */
> +  PATTERN (insn) = def_set;
> +
> +  INSN_CODE (insn) = -1;
> +  recog_memoized (insn);
> +  df_insn_rescan (insn);
> +}
> +
> +/* Convert whole chain creating required register
> +   conversions and copies.  */
> +
> +int
> +scalar_chain::convert ()
> +{
> +  bitmap_iterator bi;
> +  unsigned id;
> +  int converted_insns = 0;
> +
> +  if (!dbg_cnt (stv_conversion))
> +    return 0;
> +
> +  if (dump_file)
> +    fprintf (dump_file, "Converting chain #%d...\n", chain_id);
> +
> +  EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
> +    convert_reg (id);
> +
> +  EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
> +    make_vector_copies (id);
> +
> +  EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
> +    {
> +      convert_insn (DF_INSN_UID_GET (id)->insn);
> +      converted_insns++;
> +    }
> +
> +  return converted_insns;
> +}
> +
> +/* Main STV pass function.  Find and convert scalar
> +   instructions into vector mode when profitable.  */
> +
> +static unsigned int
> +convert_scalars_to_vector ()
> +{
> +  basic_block bb;
> +  bitmap candidates;
> +  int converted_insns = 0;
> +
> +  bitmap_obstack_initialize (NULL);
> +  candidates = BITMAP_ALLOC (NULL);
> +
> +  calculate_dominance_info (CDI_DOMINATORS);
> +  df_set_flags (DF_DEFER_INSN_RESCAN);
> +  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
> +  df_md_add_problem ();
> +  df_analyze ();
> +
> +  /* Find all instructions we want to convert into vector mode.  */
> +  if (dump_file)
> +    fprintf (dump_file, "Searching for mode conversion candidates...\n");
> +
> +  FOR_EACH_BB_FN (bb, cfun)
> +    {
> +      rtx_insn *insn;
> +      FOR_BB_INSNS (bb, insn)
> +       if (scalar_to_vector_candidate_p (insn))
> +         {
> +           if (dump_file)
> +             fprintf (dump_file, "  insn %d is marked as a candidate\n",
> +                      INSN_UID (insn));
> +
> +           bitmap_set_bit (candidates, INSN_UID (insn));
> +         }
> +    }
> +
> +  remove_non_convertible_regs (candidates);
> +
> +  if (bitmap_empty_p (candidates))
> +    if (dump_file)
> +      fprintf (dump_file, "There are no candidates for optimization.\n");
> +
> +  while (!bitmap_empty_p (candidates))
> +    {
> +      unsigned uid = bitmap_first_set_bit (candidates);
> +      scalar_chain chain;
> +
> +      /* Find instructions chain we want to convert to vector mode.
> +        Check all uses and definitions to estimate all required
> +        conversions.  */
> +      chain.build (candidates, uid);
> +
> +      if (chain.compute_convert_gain () > 0)
> +       converted_insns += chain.convert ();
> +      else
> +       if (dump_file)
> +         fprintf (dump_file, "Chain #%d conversion is not profitable\n",
> +                  chain.chain_id);
> +    }
> +
> +  if (dump_file)
> +    fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
> +
> +  BITMAP_FREE (candidates);
> +  bitmap_obstack_release (NULL);
> +  df_process_deferred_rescans ();
> +
> +  /* Conversion means we may have 128bit register spills/fills
> +     which require aligned stack.  */
> +  if (converted_insns)
> +    {
> +      if (crtl->stack_alignment_needed < 128)
> +       crtl->stack_alignment_needed = 128;
> +      if (crtl->stack_alignment_estimated < 128)
> +       crtl->stack_alignment_estimated = 128;
> +    }
> +
> +  return 0;
> +}
> +
>  namespace {
>
>  const pass_data pass_data_insert_vzeroupper =
> @@ -2637,6 +3540,39 @@ public:
>
>  }; // class pass_insert_vzeroupper
>
> +const pass_data pass_data_stv =
> +{
> +  RTL_PASS, /* type */
> +  "stv", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_NONE, /* tv_id */
> +  0, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_df_finish, /* todo_flags_finish */
> +};
> +
> +class pass_stv : public rtl_opt_pass
> +{
> +public:
> +  pass_stv (gcc::context *ctxt)
> +    : rtl_opt_pass (pass_data_stv, ctxt)
> +  {}
> +
> +  /* opt_pass methods: */
> +  virtual bool gate (function *)
> +    {
> +      return !TARGET_64BIT && TARGET_STV && TARGET_SSE2 && optimize > 1;
> +    }
> +
> +  virtual unsigned int execute (function *)
> +    {
> +      return convert_scalars_to_vector ();
> +    }
> +
> +}; // class pass_stv
> +
>  } // anon namespace
>
>  rtl_opt_pass *
> @@ -2645,6 +3581,12 @@ make_pass_insert_vzeroupper (gcc::context *ctxt)
>    return new pass_insert_vzeroupper (ctxt);
>  }
>
> +rtl_opt_pass *
> +make_pass_stv (gcc::context *ctxt)
> +{
> +  return new pass_stv (ctxt);
> +}
> +
>  /* Return true if a red-zone is in use.  */
>
>  static inline bool
> @@ -2754,6 +3696,7 @@ ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
>      { "-mvect8-ret-in-mem",            MASK_VECT8_RETURNS },
>      { "-m8bit-idiv",                   MASK_USE_8BIT_IDIV },
>      { "-mvzeroupper",                  MASK_VZEROUPPER },
> +    { "-mstv",                         MASK_STV},
>      { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
>      { "-mavx256-split-unaligned-store",        MASK_AVX256_SPLIT_UNALIGNED_STORE},
>      { "-mprefer-avx128",               MASK_PREFER_AVX128},
> @@ -4366,6 +5309,8 @@ ix86_option_override_internal (bool main_args_p,
>
>    if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
>      opts->x_target_flags |= MASK_VZEROUPPER;
> +  if (!(opts_set->x_target_flags & MASK_STV))
> +    opts->x_target_flags |= MASK_STV;
>    if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
>        && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
>      opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
> @@ -4479,12 +5424,18 @@ ix86_option_override (void)
>      = { pass_insert_vzeroupper, "reload",
>         1, PASS_POS_INSERT_AFTER
>        };
> +  opt_pass *pass_stv = make_pass_stv (g);
> +  struct register_pass_info stv_info
> +    = { pass_stv, "combine",
> +       1, PASS_POS_INSERT_AFTER
> +      };
>
>    ix86_option_override_internal (true, &global_options, &global_options_set);
>
>
>    /* This needs to be done at start up.  It's convenient to do it here.  */
>    register_pass (&insert_vzeroupper_info);
> +  register_pass (&stv_info);
>  }
>
>  /* Implement the TARGET_OFFLOAD_OPTIONS hook.  */
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 7808705..89b74c9 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -978,6 +978,11 @@
>                                (HI "TARGET_HIMODE_MATH")
>                                SI])
>
> +;; Math-dependent integer modes with DImode.
> +(define_mode_iterator SWIM1248x [(QI "TARGET_QIMODE_MATH")
> +                                (HI "TARGET_HIMODE_MATH")
> +                                SI (DI "(TARGET_STV && TARGET_SSE2) || TARGET_64BIT")])
> +
>  ;; Math-dependant single word integer modes without QImode.
>  (define_mode_iterator SWIM248 [(HI "TARGET_HIMODE_MATH")
>                                SI (DI "TARGET_64BIT")])
> @@ -2094,9 +2099,9 @@
>
>  (define_insn "*movdi_internal"
>    [(set (match_operand:DI 0 "nonimmediate_operand"
> -    "=r  ,o  ,r,r  ,r,m ,*y,*y,?*y,?m,?r ,?*Ym,*v,*v,*v,m ,?r ,?r,?*Yi,?*Ym,?*Yi,*k,*k ,*r ,*m")
> +    "=r  ,o  ,r,r  ,r,m ,*y,*y,?*y,?m,?r ,?*Ym,*v,*v,*v,m ,m,?r ,?r,?*Yi,?*Ym,?*Yi,*k,*k ,*r ,*m")
>         (match_operand:DI 1 "general_operand"
> -    "riFo,riF,Z,rem,i,re,C ,*y,m  ,*y,*Yn,r   ,C ,*v,m ,*v,*Yj,*v,r   ,*Yj ,*Yn ,*r ,*km,*k,*k"))]
> +    "riFo,riF,Z,rem,i,re,C ,*y,m  ,*y,*Yn,r   ,C ,*v,m ,*v,v,*Yj,*v,r   ,*Yj ,*Yn ,*r ,*km,*k,*k"))]
>    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
>  {
>    switch (get_attr_type (insn))
> @@ -2174,9 +2179,9 @@
>    [(set (attr "isa")
>       (cond [(eq_attr "alternative" "0,1")
>               (const_string "nox64")
> -           (eq_attr "alternative" "2,3,4,5,10,11,16,18,21,23")
> +           (eq_attr "alternative" "2,3,4,5,10,11,17,19,22,24")
>               (const_string "x64")
> -           (eq_attr "alternative" "17")
> +           (eq_attr "alternative" "18")
>               (const_string "x64_sse4")
>            ]
>            (const_string "*")))
> @@ -2187,13 +2192,13 @@
>               (const_string "mmx")
>             (eq_attr "alternative" "7,8,9,10,11")
>               (const_string "mmxmov")
> -           (eq_attr "alternative" "12,17")
> +           (eq_attr "alternative" "12,18")
>               (const_string "sselog1")
> -           (eq_attr "alternative" "13,14,15,16,18")
> +           (eq_attr "alternative" "13,14,15,16,17,19")
>               (const_string "ssemov")
> -           (eq_attr "alternative" "19,20")
> +           (eq_attr "alternative" "20,21")
>               (const_string "ssecvt")
> -           (eq_attr "alternative" "21,22,23,24")
> +           (eq_attr "alternative" "22,23,24,25")
>               (const_string "mskmov")
>             (and (match_operand 0 "register_operand")
>                  (match_operand 1 "pic_32bit_operand"))
> @@ -2208,16 +2213,16 @@
>     (set (attr "length_immediate")
>       (cond [(and (eq_attr "alternative" "4") (eq_attr "type" "imov"))
>               (const_string "8")
> -           (eq_attr "alternative" "17")
> +           (eq_attr "alternative" "18")
>               (const_string "1")
>            ]
>            (const_string "*")))
>     (set (attr "prefix_rex")
> -     (if_then_else (eq_attr "alternative" "10,11,16,17,18")
> +     (if_then_else (eq_attr "alternative" "10,11,17,18,19")
>         (const_string "1")
>         (const_string "*")))
>     (set (attr "prefix_extra")
> -     (if_then_else (eq_attr "alternative" "17")
> +     (if_then_else (eq_attr "alternative" "18")
>         (const_string "1")
>         (const_string "*")))
>     (set (attr "prefix")
> @@ -2245,13 +2250,26 @@
>                     ]
>                     (const_string "TI"))
>
> -           (and (eq_attr "alternative" "14,15")
> +           (and (eq_attr "alternative" "14,15,16")
>                  (not (match_test "TARGET_SSE2")))
>               (const_string "V2SF")
> -           (eq_attr "alternative" "17")
> +           (eq_attr "alternative" "18")
>               (const_string "TI")
>            ]
> -          (const_string "DI")))])
> +          (const_string "DI")))
> +   (set (attr "enabled")
> +     (cond [(eq_attr "alternative" "15")
> +              (if_then_else
> +               (match_test "TARGET_STV && TARGET_SSE2")
> +               (symbol_ref "false")
> +               (const_string "*"))
> +           (eq_attr "alternative" "16")
> +              (if_then_else
> +               (match_test "TARGET_STV && TARGET_SSE2")
> +               (symbol_ref "true")
> +               (symbol_ref "false"))
> +          ]
> +          (const_string "*")))])
>
>  (define_split
>    [(set (match_operand:DI 0 "nonimmediate_operand")
> @@ -3811,6 +3829,26 @@
>    "movz{bl|x}\t{%1, %k0|%k0, %1}"
>    [(set_attr "type" "imovx")
>     (set_attr "mode" "SI")])
> +
> +(define_insn_and_split "*zext<mode>_doubleword"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (zero_extend:DI (match_operand:SWI12 1 "nonimmediate_operand" "<r>m")))]
> +  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
> +  "#"
> +  "&& reload_completed && GENERAL_REG_P (operands[0])"
> +  [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
> +   (set (match_dup 2) (const_int 0))]
> +  "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
> +
> +(define_insn_and_split "*zextsi_doubleword"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "rm")))]
> +  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
> +  "#"
> +  "&& reload_completed && GENERAL_REG_P (operands[0])"
> +  [(set (match_dup 0) (match_dup 1))
> +   (set (match_dup 2) (const_int 0))]
> +  "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
>
>  ;; Sign extension instructions
>
> @@ -7860,9 +7898,9 @@
>  ;; it should be done with splitters.
>
>  (define_expand "and<mode>3"
> -  [(set (match_operand:SWIM 0 "nonimmediate_operand")
> -       (and:SWIM (match_operand:SWIM 1 "nonimmediate_operand")
> -                 (match_operand:SWIM 2 "<general_szext_operand>")))]
> +  [(set (match_operand:SWIM1248x 0 "nonimmediate_operand")
> +       (and:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")
> +                     (match_operand:SWIM1248x 2 "<general_szext_operand>")))]
>    ""
>  {
>    machine_mode mode = <MODE>mode;
> @@ -7940,6 +7978,23 @@
>         (const_string "*")))
>     (set_attr "mode" "SI,DI,DI,SI,DI")])
>
> +(define_insn_and_split "*anddi3_doubleword"
> +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r")
> +       (and:DI
> +        (match_operand:DI 1 "nonimmediate_operand" "%0,0,0")
> +        (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 && ix86_binary_operator_ok (AND, DImode, operands)"
> +  "#"
> +  "&& reload_completed"
> +  [(parallel [(set (match_dup 0)
> +                  (and:SI (match_dup 1) (match_dup 2)))
> +             (clobber (reg:CC FLAGS_REG))])
> +   (parallel [(set (match_dup 3)
> +                  (and:SI (match_dup 4) (match_dup 5)))
> +             (clobber (reg:CC FLAGS_REG))])]
> +  "split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]);")
> +
>  (define_insn "*andsi_1"
>    [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r,Ya,!k")
>         (and:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,qm,k")
> @@ -8427,9 +8482,9 @@
>  ;; If this is considered useful, it should be done with splitters.
>
>  (define_expand "<code><mode>3"
> -  [(set (match_operand:SWIM 0 "nonimmediate_operand")
> -       (any_or:SWIM (match_operand:SWIM 1 "nonimmediate_operand")
> -                    (match_operand:SWIM 2 "<general_operand>")))]
> +  [(set (match_operand:SWIM1248x 0 "nonimmediate_operand")
> +       (any_or:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")
> +                            (match_operand:SWIM1248x 2 "<general_operand>")))]
>    ""
>    "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
>
> @@ -8447,6 +8502,23 @@
>    [(set_attr "type" "alu,alu,msklog")
>     (set_attr "mode" "<MODE>")])
>
> +(define_insn_and_split "*<code>di3_doubleword"
> +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r")
> +       (any_or:DI
> +        (match_operand:DI 1 "nonimmediate_operand" "%0,0,0")
> +        (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 && ix86_binary_operator_ok (<CODE>, DImode, operands)"
> +  "#"
> +  "&& reload_completed"
> +  [(parallel [(set (match_dup 0)
> +                  (any_or:SI (match_dup 1) (match_dup 2)))
> +             (clobber (reg:CC FLAGS_REG))])
> +   (parallel [(set (match_dup 3)
> +                  (any_or:SI (match_dup 4) (match_dup 5)))
> +             (clobber (reg:CC FLAGS_REG))])]
> +  "split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]);")
> +
>  (define_insn "*<code>hi_1"
>    [(set (match_operand:HI 0 "nonimmediate_operand" "=r,rm,!k")
>         (any_or:HI
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index 042f3c1..dae5c5d 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -567,6 +567,11 @@ Target Report Mask(VZEROUPPER) Save
>  Generate vzeroupper instruction before a transfer of control flow out of
>  the function.
>
> +mstv
> +Target Report Mask(STV) Save
> +Enable Scalar to Vector optimization pass transforming 64-bit integer
> +computations into vector ones.
> +
>  mdispatch-scheduler
>  Target RejectNegative Var(flag_dispatch_scheduler)
>  Do dispatch scheduling if processor is bdver1 or bdver2 or bdver3 or bdver4 and Haifa scheduling
> diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
> index 95f6b06..583b16b 100644
> --- a/gcc/dbgcnt.def
> +++ b/gcc/dbgcnt.def
> @@ -186,6 +186,7 @@ DEBUG_COUNTER (sel_sched_region_cnt)
>  DEBUG_COUNTER (sms_sched_loop)
>  DEBUG_COUNTER (split_for_sched2)
>  DEBUG_COUNTER (store_motion)
> +DEBUG_COUNTER (stv_conversion)
>  DEBUG_COUNTER (tail_call)
>  DEBUG_COUNTER (treepre_insert)
>  DEBUG_COUNTER (tree_sra)
> diff --git a/gcc/testsuite/gcc.dg/lower-subreg-1.c b/gcc/testsuite/gcc.dg/lower-subreg-1.c
> index 6362d37..47057fe 100644
> --- a/gcc/testsuite/gcc.dg/lower-subreg-1.c
> +++ b/gcc/testsuite/gcc.dg/lower-subreg-1.c
> @@ -1,5 +1,6 @@
>  /* { dg-do compile { target { ! { mips64 || { aarch64*-*-* arm*-*-* ia64-*-* sparc*-*-* spu-*-* tilegx-*-* } } } } } */
>  /* { dg-options "-O -fdump-rtl-subreg1" } */
> +/* { dg-additional-options "-mno-stv" { target ia32 } } */
>  /* { dg-skip-if "" { { i?86-*-* x86_64-*-* } && x32 } { "*" } { "" } } */
>  /* { dg-require-effective-target ilp32 } */
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr65105-1.c b/gcc/testsuite/gcc.target/i386/pr65105-1.c
> new file mode 100644
> index 0000000..bac6c07
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr65105-1.c
> @@ -0,0 +1,50 @@
> +/* PR target/pr65105 */
> +/* { dg-do run { target { ia32 } } } */
> +/* { dg-options "-O2 -march=slm" } */
> +/* { dg-final { scan-assembler "por" } } */
> +/* { dg-final { scan-assembler "pand" } } */
> +
> +#include "stdlib.h"
> +
> +static int count = 0;
> +
> +void __attribute__((noinline))
> +counter (long long l)
> +{
> +  count++;
> +  if (!l || count > 5)
> +    exit (1);
> +}
> +
> +void __attribute__((noinline))
> +test (long long *arr)
> +{
> +  register unsigned long long tmp;
> +
> +  tmp = arr[0] | arr[1] & arr[2];
> +  while (tmp)
> +    {
> +      counter (tmp);
> +      tmp = *(arr++) & tmp;
> +    }
> +}
> +
> +void  __attribute__((noinline))
> +fill_data (long long *arr)
> +{
> +  arr[0] = 0x00ffffffL;
> +  arr[1] = 0xffffff00L;
> +  arr[2] = 0x00ffffffL;
> +  arr[3] = 0x0000ff00L;
> +  arr[4] = 0x00ff0000L;
> +  arr[5] = 0xff000000L;
> +}
> +
> +int
> +main (int argc, const char **argv)
> +{
> +  long long arr[6];
> +  fill_data (arr);
> +  test (arr);
> +  return count - 5;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr65105-2.c b/gcc/testsuite/gcc.target/i386/pr65105-2.c
> new file mode 100644
> index 0000000..9216894
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr65105-2.c
> @@ -0,0 +1,12 @@
> +/* PR target/pr65105 */
> +/* { dg-do compile { target { ia32 } } } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler "por" } } */
> +
> +long long i1, i2, res;
> +
> +void
> +test ()
> +{
> +  res = i1 | i2;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr65105-3.c b/gcc/testsuite/gcc.target/i386/pr65105-3.c
> new file mode 100644
> index 0000000..b83989f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr65105-3.c
> @@ -0,0 +1,16 @@
> +/* PR target/pr65105 */
> +/* { dg-do compile { target { ia32 } } } */
> +/* { dg-options "-O2 -march=slm -msse4.2" } */
> +/* { dg-final { scan-assembler "pand" } } */
> +/* { dg-final { scan-assembler "por" } } */
> +/* { dg-final { scan-assembler "ptest" } } */
> +
> +long long i1, i2, i3, res;
> +
> +void
> +test ()
> +{
> +  res = i1 | i2;
> +  if (res)
> +    res &= i3;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr65105-4.C b/gcc/testsuite/gcc.target/i386/pr65105-4.C
> new file mode 100644
> index 0000000..9acf368
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr65105-4.C
> @@ -0,0 +1,19 @@
> +/* PR target/pr65105 */
> +/* { dg-do run { target { ia32 } } } */
> +/* { dg-options "-O2 -march=slm" } */
> +
> +struct s {
> +  long long l1, l2, l3, l4, l5;
> +} *a;
> +long long b;
> +long long fn1()
> +{
> +  try
> +    {
> +      b = (a->l1 | a->l2 | a->l3 | a->l4 | a->l5);
> +      return a->l1;
> +    }
> +  catch (int)
> +    {
> +    }
> +}
H.J. Lu Sept. 29, 2015, 12:11 p.m. UTC | #2
On Wed, Sep 23, 2015 at 3:29 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Wed, Sep 23, 2015 at 12:19 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>> On 14 Sep 17:50, Uros Bizjak wrote:
>>>
>>> +(define_insn_and_split "*zext<mode>_doubleword"
>>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>>> + (zero_extend:DI (match_operand:SWI24 1 "nonimmediate_operand" "rm")))]
>>> +  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
>>> +  "#"
>>> +  "&& reload_completed && GENERAL_REG_P (operands[0])"
>>> +  [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
>>> +   (set (match_dup 2) (const_int 0))]
>>> +  "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
>>> +
>>> +(define_insn_and_split "*zextqi_doubleword"
>>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>>> + (zero_extend:DI (match_operand:QI 1 "nonimmediate_operand" "qm")))]
>>> +  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
>>> +  "#"
>>> +  "&& reload_completed && GENERAL_REG_P (operands[0])"
>>> +  [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
>>> +   (set (match_dup 2) (const_int 0))]
>>> +  "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
>>> +
>>>
>>> Please put the above patterns together with other zero_extend
>>> patterns. You can also merge these two patterns using SWI124 mode
>>> iterator with <r> mode attribute as a register constraint. Also, no
>>> need to check for GENERAL_REG_P after reload, when "r" constraint is
>>> in effect:
>>>
>>> (define_insn_and_split "*zext<mode>_doubleword"
>>>   [(set (match_operand:DI 0 "register_operand" "=r")
>>>  (zero_extend:DI (match_operand:SWI124 1 "nonimmediate_operand" "<r>m")))]
>>>   "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
>>>   "#"
>>>   "&& reload_completed"
>>>   [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
>>>    (set (match_dup 2) (const_int 0))]
>>>   "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
>>
>> Register constraint doesn't affect split and I need GENERAL_REG_P to filter other registers case.
>
> OK.
>
>> I merged QI and HI cases of zext but made a separate pattern for SI case because it doesn't need zero_extend in resulting code.  Bootstrapped and regtested for x86_64-unknown-linux-gnu.
>
> This change is OK.
>
> The patch LGTM, but please wait a couple of days if Jeff has some
> comment on algorithmic aspect of the patch.
>
> Thanks,
> Uros.
>
>>
>> Thanks,
>> Ilya
>> --
>> gcc/
>>
>> 2015-09-23  Ilya Enkovich  <enkovich.gnu@gmail.com>
>>
>>         * config/i386/i386.c: Include dbgcnt.h.
>>         (has_non_address_hard_reg): New.
>>         (convertible_comparison_p): New.
>>         (scalar_to_vector_candidate_p): New.
>>         (remove_non_convertible_regs): New.
>>         (scalar_chain): New.
>>         (scalar_chain::scalar_chain): New.
>>         (scalar_chain::~scalar_chain): New.
>>         (scalar_chain::add_to_queue): New.
>>         (scalar_chain::mark_dual_mode_def): New.
>>         (scalar_chain::analyze_register_chain): New.
>>         (scalar_chain::add_insn): New.
>>         (scalar_chain::build): New.
>>         (scalar_chain::compute_convert_gain): New.
>>         (scalar_chain::replace_with_subreg): New.
>>         (scalar_chain::replace_with_subreg_in_insn): New.
>>         (scalar_chain::emit_conversion_insns): New.
>>         (scalar_chain::make_vector_copies): New.
>>         (scalar_chain::convert_reg): New.
>>         (scalar_chain::convert_op): New.
>>         (scalar_chain::convert_insn): New.
>>         (scalar_chain::convert): New.
>>         (convert_scalars_to_vector): New.
>>         (pass_data_stv): New.
>>         (pass_stv): New.
>>         (make_pass_stv): New.
>>         (ix86_option_override): Create and register stv pass.
>>         (flag_opts): Add -mstv.
>>         (ix86_option_override_internal): Likewise.
>>         * config/i386/i386.md (SWIM1248x): New.
>>         (*movdi_internal): Add xmm to mem alternative for TARGET_STV.
>>         (and<mode>3): Use SWIM1248x iterator instead of SWIM.
>>         (*anddi3_doubleword): New.
>>         (*zext<mode>_doubleword): New.
>>         (*zextsi_doubleword): New.
>>         (<code><mode>3): Use SWIM1248x iterator instead of SWIM.
>>         (*<code>di3_doubleword): New.
>>         * config/i386/i386.opt (mstv): New.
>>         * dbgcnt.def (stv_conversion): New.
>>

This caused:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67761
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d547cfd..2663f85 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -87,6 +87,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "tree-iterator.h"
 #include "tree-chkp.h"
 #include "rtl-chkp.h"
+#include "dbgcnt.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -2600,6 +2601,908 @@  rest_of_handle_insert_vzeroupper (void)
   return 0;
 }
 
+/* Return 1 if INSN uses or defines a hard register.
+   Hard register uses in a memory address are ignored.
+   Clobbers and flags definitions are ignored.  */
+
+static bool
+has_non_address_hard_reg (rtx_insn *insn)
+{
+  df_ref ref;
+  FOR_EACH_INSN_DEF (ref, insn)
+    if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
+	&& !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
+	&& DF_REF_REGNO (ref) != FLAGS_REG)
+      return true;
+
+  FOR_EACH_INSN_USE (ref, insn)
+    if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
+      return true;
+
+  return false;
+}
+
+/* Check if comparison INSN may be transformed
+   into vector comparison.  Currently we transform
+   zero checks only which look like:
+
+   (set (reg:CCZ 17 flags)
+        (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
+                             (subreg:SI (reg:DI x) 0))
+		     (const_int 0 [0])))  */
+
+static bool
+convertible_comparison_p (rtx_insn *insn)
+{
+  if (!TARGET_SSE4_1)
+    return false;
+
+  rtx def_set = single_set (insn);
+
+  gcc_assert (def_set);
+
+  rtx src = SET_SRC (def_set);
+  rtx dst = SET_DEST (def_set);
+
+  gcc_assert (GET_CODE (src) == COMPARE);
+
+  if (GET_CODE (dst) != REG
+      || REGNO (dst) != FLAGS_REG
+      || GET_MODE (dst) != CCZmode)
+    return false;
+
+  rtx op1 = XEXP (src, 0);
+  rtx op2 = XEXP (src, 1);
+
+  if (op2 != CONST0_RTX (GET_MODE (op2)))
+    return false;
+
+  if (GET_CODE (op1) != IOR)
+    return false;
+
+  op2 = XEXP (op1, 1);
+  op1 = XEXP (op1, 0);
+
+  if (!SUBREG_P (op1)
+      || !SUBREG_P (op2)
+      || GET_MODE (op1) != SImode
+      || GET_MODE (op2) != SImode
+      || ((SUBREG_BYTE (op1) != 0
+	   || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
+	  && (SUBREG_BYTE (op2) != 0
+	      || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
+    return false;
+
+  op1 = SUBREG_REG (op1);
+  op2 = SUBREG_REG (op2);
+
+  if (op1 != op2
+      || !REG_P (op1)
+      || GET_MODE (op1) != DImode)
+    return false;
+
+  return true;
+}
+
+/* Return 1 if INSN may be converted into vector
+   instruction.  */
+
+static bool
+scalar_to_vector_candidate_p (rtx_insn *insn)
+{
+  rtx def_set = single_set (insn);
+
+  if (!def_set)
+    return false;
+
+  if (has_non_address_hard_reg (insn))
+    return false;
+
+  rtx src = SET_SRC (def_set);
+  rtx dst = SET_DEST (def_set);
+
+  if (GET_CODE (src) == COMPARE)
+    return convertible_comparison_p (insn);
+
+  /* We are interested in DImode promotion only.  */
+  if (GET_MODE (src) != DImode
+      || GET_MODE (dst) != DImode)
+    return false;
+
+  if (!REG_P (dst) && !MEM_P (dst))
+    return false;
+
+  switch (GET_CODE (src))
+    {
+    case PLUS:
+    case MINUS:
+    case IOR:
+    case XOR:
+    case AND:
+      break;
+
+    case REG:
+      return true;
+
+    case MEM:
+      return REG_P (dst);
+
+    default:
+      return false;
+    }
+
+  if (!REG_P (XEXP (src, 0)) && !MEM_P (XEXP (src, 0)))
+      return false;
+
+  if (!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
+      return false;
+
+  if (GET_MODE (XEXP (src, 0)) != DImode
+      || GET_MODE (XEXP (src, 1)) != DImode)
+    return false;
+
+  return true;
+}
+
+/* For a given bitmap of insn UIDs scans all instruction and
+   remove insn from CANDIDATES in case it has both convertible
+   and not convertible definitions.
+
+   All insns in a bitmap are conversion candidates according to
+   scalar_to_vector_candidate_p.  Currently it implies all insns
+   are single_set.  */
+
+static void
+remove_non_convertible_regs (bitmap candidates)
+{
+  bitmap_iterator bi;
+  unsigned id;
+  bitmap regs = BITMAP_ALLOC (NULL);
+
+  EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
+    {
+      rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
+      rtx reg = SET_DEST (def_set);
+
+      if (!REG_P (reg)
+	  || bitmap_bit_p (regs, REGNO (reg))
+	  || HARD_REGISTER_P (reg))
+	continue;
+
+      for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
+	   def;
+	   def = DF_REF_NEXT_REG (def))
+	{
+	  if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
+	    {
+	      if (dump_file)
+		fprintf (dump_file,
+			 "r%d has non convertible definition in insn %d\n",
+			 REGNO (reg), DF_REF_INSN_UID (def));
+
+	      bitmap_set_bit (regs, REGNO (reg));
+	      break;
+	    }
+	}
+    }
+
+  EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
+    {
+      for (df_ref def = DF_REG_DEF_CHAIN (id);
+	   def;
+	   def = DF_REF_NEXT_REG (def))
+	if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
+	  {
+	    if (dump_file)
+	      fprintf (dump_file, "Removing insn %d from candidates list\n",
+		       DF_REF_INSN_UID (def));
+
+	    bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
+	  }
+    }
+
+  BITMAP_FREE (regs);
+}
+
+class scalar_chain
+{
+ public:
+  scalar_chain ();
+  ~scalar_chain ();
+
+  static unsigned max_id;
+
+  /* ID of a chain.  */
+  unsigned int chain_id;
+  /* A queue of instructions to be included into a chain.  */
+  bitmap queue;
+  /* Instructions included into a chain.  */
+  bitmap insns;
+  /* All registers defined by a chain.  */
+  bitmap defs;
+  /* Registers used in both vector and scalar modes.  */
+  bitmap defs_conv;
+
+  void build (bitmap candidates, unsigned insn_uid);
+  int compute_convert_gain ();
+  int convert ();
+
+ private:
+  void add_insn (bitmap candidates, unsigned insn_uid);
+  void add_to_queue (unsigned insn_uid);
+  void mark_dual_mode_def (df_ref def);
+  void analyze_register_chain (bitmap candidates, df_ref ref);
+  rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
+  void emit_conversion_insns (rtx insns, rtx_insn *pos);
+  void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
+  void convert_insn (rtx_insn *insn);
+  void convert_op (rtx *op, rtx_insn *insn);
+  void convert_reg (unsigned regno);
+  void make_vector_copies (unsigned regno);
+};
+
+unsigned scalar_chain::max_id = 0;
+
+/* Initialize new chain.  Bitmaps live on the default bitmap obstack,
+   which is initialized here and released in the destructor.  QUEUE is
+   allocated lazily by build ().  */
+
+scalar_chain::scalar_chain ()
+{
+  chain_id = ++max_id;
+
+  if (dump_file)
+    fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
+
+  bitmap_obstack_initialize (NULL);
+  insns = BITMAP_ALLOC (NULL);
+  defs = BITMAP_ALLOC (NULL);
+  defs_conv = BITMAP_ALLOC (NULL);
+  queue = NULL;
+}
+
+/* Free chain's data.  QUEUE is not freed here: it is released at the
+   end of build ().  */
+
+scalar_chain::~scalar_chain ()
+{
+  BITMAP_FREE (insns);
+  BITMAP_FREE (defs);
+  BITMAP_FREE (defs_conv);
+  /* Release the obstack initialized in the constructor only after all
+     bitmaps allocated on it are freed.  */
+  bitmap_obstack_release (NULL);
+}
+
+/* Add instruction into chains' queue.  Only valid while build () is
+   running, which is when QUEUE is allocated.  */
+
+void
+scalar_chain::add_to_queue (unsigned insn_uid)
+{
+  /* Nothing to do if the insn is already in the chain or queued.  */
+  if (bitmap_bit_p (insns, insn_uid)
+      || bitmap_bit_p (queue, insn_uid))
+    return;
+
+  if (dump_file)
+    fprintf (dump_file, "  Adding insn %d into chain's #%d queue\n",
+	     insn_uid, chain_id);
+  bitmap_set_bit (queue, insn_uid);
+}
+
+/* Mark register defined by DEF as requiring conversion.  DEFS_CONV
+   collects regnos of pseudos that will be needed in both vector and
+   scalar modes.  */
+
+void
+scalar_chain::mark_dual_mode_def (df_ref def)
+{
+  gcc_assert (DF_REF_REG_DEF_P (def));
+
+  /* Record each regno only once.  */
+  if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
+    return;
+
+  if (dump_file)
+    fprintf (dump_file,
+	     "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
+	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
+
+  bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
+}
+
+/* Check REF's chain to add new insns into a queue
+   and find registers requiring conversion.  */
+
+void
+scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
+{
+  df_link *chain;
+
+  /* REF must belong to this chain or to a candidate insn.  */
+  gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
+	      || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
+  add_to_queue (DF_REF_INSN_UID (ref));
+
+  /* Walk all refs linked to REF through DU/UD chains.  */
+  for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
+    {
+      unsigned uid = DF_REF_INSN_UID (chain->ref);
+      if (!DF_REF_REG_MEM_P (chain->ref))
+	{
+	  /* Refs already in the chain need no action; candidates are
+	     pulled in via the queue.  */
+	  if (bitmap_bit_p (insns, uid))
+	    continue;
+
+	  if (bitmap_bit_p (candidates, uid))
+	    {
+	      add_to_queue (uid);
+	      continue;
+	    }
+	}
+
+      /* The linked ref sits in a non-convertible insn, so the register
+	 is needed in both modes: for a non-convertible def mark that
+	 def, for a non-convertible use mark REF's own def.  */
+      if (DF_REF_REG_DEF_P (chain->ref))
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "  r%d def in insn %d isn't convertible\n",
+		     DF_REF_REGNO (chain->ref), uid);
+	  mark_dual_mode_def (chain->ref);
+	}
+      else
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "  r%d use in insn %d isn't convertible\n",
+		     DF_REF_REGNO (chain->ref), uid);
+	  mark_dual_mode_def (ref);
+	}
+    }
+}
+
+/* Add instruction into a chain.  */
+
+void
+scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
+{
+  if (bitmap_bit_p (insns, insn_uid))
+    return;
+
+  if (dump_file)
+    fprintf (dump_file, "  Adding insn %d to chain #%d\n", insn_uid, chain_id);
+
+  bitmap_set_bit (insns, insn_uid);
+
+  /* Record pseudos defined by the chain; hard registers are never
+     converted.  */
+  rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
+  rtx def_set = single_set (insn);
+  if (def_set && REG_P (SET_DEST (def_set))
+      && !HARD_REGISTER_P (SET_DEST (def_set)))
+    bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
+
+  /* For each pseudo defined here walk ALL definitions of that regno,
+     so every def of the register joins the chain (whole web); then
+     walk this insn's own register uses.  */
+  df_ref ref;
+  df_ref def;
+  for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
+    if (!HARD_REGISTER_P (DF_REF_REG (ref)))
+      for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
+	   def;
+	   def = DF_REF_NEXT_REG (def))
+	analyze_register_chain (candidates, def);
+  for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
+    if (!DF_REF_REG_MEM_P (ref))
+      analyze_register_chain (candidates, ref);
+}
+
+/* Build new chain starting from insn INSN_UID recursively
+   adding all dependent uses and definitions.  */
+
+void
+scalar_chain::build (bitmap candidates, unsigned insn_uid)
+{
+  /* The worklist QUEUE exists only for the duration of build ().  */
+  queue = BITMAP_ALLOC (NULL);
+  bitmap_set_bit (queue, insn_uid);
+
+  if (dump_file)
+    fprintf (dump_file, "Building chain #%d...\n", chain_id);
+
+  /* Drain the worklist; add_insn may queue further insns.  Consumed
+     insns are also removed from CANDIDATES so each insn ends up in at
+     most one chain.  */
+  while (!bitmap_empty_p (queue))
+    {
+      insn_uid = bitmap_first_set_bit (queue);
+      bitmap_clear_bit (queue, insn_uid);
+      bitmap_clear_bit (candidates, insn_uid);
+      add_insn (candidates, insn_uid);
+    }
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Collected chain #%d...\n", chain_id);
+      fprintf (dump_file, "  insns: ");
+      dump_bitmap (dump_file, insns);
+      if (!bitmap_empty_p (defs_conv))
+	{
+	  bitmap_iterator bi;
+	  unsigned id;
+	  const char *comma = "";
+	  fprintf (dump_file, "  defs to convert: ");
+	  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
+	    {
+	      fprintf (dump_file, "%sr%d", comma, id);
+	      comma = ", ";
+	    }
+	  fprintf (dump_file, "\n");
+	}
+    }
+
+  BITMAP_FREE (queue);
+}
+
+/* Compute a gain for chain conversion.  A positive result means the
+   conversion is expected to be profitable.  */
+
+int
+scalar_chain::compute_convert_gain ()
+{
+  bitmap_iterator bi;
+  unsigned insn_uid;
+  int gain = 0;
+  int cost = 0;
+
+  if (dump_file)
+    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
+
+  EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
+    {
+      rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
+      rtx def_set = single_set (insn);
+      rtx src = SET_SRC (def_set);
+      rtx dst = SET_DEST (def_set);
+
+      /* A scalar DImode move/load/store takes two SImode insns on a
+	 32-bit target; a vector one takes a single SSE insn.  */
+      if (REG_P (src) && REG_P (dst))
+	gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
+      else if (REG_P (src) && MEM_P (dst))
+	gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
+      else if (MEM_P (src) && REG_P (dst))
+	gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
+      else if (GET_CODE (src) == PLUS
+	       || GET_CODE (src) == MINUS
+	       || GET_CODE (src) == IOR
+	       || GET_CODE (src) == XOR
+	       || GET_CODE (src) == AND)
+	/* Two scalar ops on the halves collapse into one vector op,
+	   saving roughly one add.  */
+	gain += ix86_cost->add;
+      else if (GET_CODE (src) == COMPARE)
+	{
+	  /* Assume comparison cost is the same.  */
+	}
+      else
+	gcc_unreachable ();
+    }
+
+  if (dump_file)
+    fprintf (dump_file, "  Instruction conversion gain: %d\n", gain);
+
+  /* Each dual-mode register pays a vector-to-integer move per
+     definition.  (Note: the iterator variable holds a regno here.)  */
+  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
+    cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
+
+  if (dump_file)
+    fprintf (dump_file, "  Registers conversion cost: %d\n", cost);
+
+  gain -= cost;
+
+  if (dump_file)
+    fprintf (dump_file, "  Total gain: %d\n", gain);
+
+  return gain;
+}
+
+/* Replace REG in X with a V2DI subreg of NEW_REG.  X is modified in
+   place (except when X itself is REG) and the possibly-new root is
+   returned.  */
+
+rtx
+scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
+{
+  if (x == reg)
+    return gen_rtx_SUBREG (V2DImode, new_reg, 0);
+
+  /* Recurse over all rtx operands ('e') and vectors of operands
+     ('E') of X.  */
+  const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
+  int i, j;
+  for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
+    {
+      if (fmt[i] == 'e')
+	XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
+      else if (fmt[i] == 'E')
+	for (j = XVECLEN (x, i) - 1; j >= 0; j--)
+	  XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
+						   reg, new_reg);
+    }
+
+  return x;
+}
+
+/* Replace REG in INSN with a V2DI subreg of NEW_REG.
+   NOTE(review): only the single_set pattern is rewritten; any
+   REG_EQUAL/REG_EQUIV notes still reference REG -- confirm this is
+   intended.  */
+
+void
+scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx new_reg)
+{
+  replace_with_subreg (single_set (insn), reg, new_reg);
+}
+
+/* Insert generated conversion instruction sequence INSNS
+   after instruction AFTER.  New BB may be required in case
+   instruction has EH region attached.  */
+
+void
+scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
+{
+  if (!control_flow_insn_p (after))
+    {
+      emit_insn_after (insns, after);
+      return;
+    }
+
+  /* AFTER ends its block (e.g. it may throw), so emit on the
+     fallthru path instead, in a fresh block split off the fallthru
+     edge.  */
+  basic_block bb = BLOCK_FOR_INSN (after);
+  edge e = find_fallthru_edge (bb->succs);
+  gcc_assert (e);
+
+  basic_block new_bb = split_edge (e);
+  emit_insn_after (insns, BB_HEAD (new_bb));
+}
+
+/* Make vector copies for all register REGNO definitions
+   and replace its uses in a chain.  */
+
+void
+scalar_chain::make_vector_copies (unsigned regno)
+{
+  rtx reg = regno_reg_rtx[regno];
+  rtx vreg = gen_reg_rtx (DImode);
+  df_ref ref;
+
+  /* After every definition of REGNO outside the chain, materialize
+     its value into the new pseudo VREG.  */
+  for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+    if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+      {
+	rtx_insn *insn = DF_REF_INSN (ref);
+
+	start_sequence ();
+	if (TARGET_SSE4_1)
+	  {
+	    /* movd of the low half, then pinsrd of the high half.  */
+	    emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
+					CONST0_RTX (V4SImode),
+					gen_rtx_SUBREG (SImode, reg, 0)));
+	    emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
+					  gen_rtx_SUBREG (V4SImode, vreg, 0),
+					  gen_rtx_SUBREG (SImode, reg, 4),
+					  GEN_INT (2)));
+	  }
+	else if (TARGET_INTER_UNIT_MOVES_TO_VEC)
+	  {
+	    /* Two movd's of the halves merged by an interleave.  */
+	    rtx tmp = gen_reg_rtx (DImode);
+	    emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
+					CONST0_RTX (V4SImode),
+					gen_rtx_SUBREG (SImode, reg, 0)));
+	    emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
+					CONST0_RTX (V4SImode),
+					gen_rtx_SUBREG (SImode, reg, 4)));
+	    emit_insn (gen_vec_interleave_lowv4si
+		       (gen_rtx_SUBREG (V4SImode, vreg, 0),
+			gen_rtx_SUBREG (V4SImode, vreg, 0),
+			gen_rtx_SUBREG (V4SImode, tmp, 0)));
+	  }
+	else
+	  {
+	    /* No direct GPR-to-SSE moves: bounce through a stack
+	       slot.  */
+	    rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP);
+	    emit_move_insn (adjust_address (tmp, SImode, 0),
+			    gen_rtx_SUBREG (SImode, reg, 0));
+	    emit_move_insn (adjust_address (tmp, SImode, 4),
+			    gen_rtx_SUBREG (SImode, reg, 4));
+	    emit_move_insn (vreg, tmp);
+	  }
+	emit_conversion_insns (get_insns (), insn);
+	end_sequence ();
+
+	if (dump_file)
+	  fprintf (dump_file,
+		   "  Copied r%d to a vector register r%d for insn %d\n",
+		   regno, REGNO (vreg), DF_REF_INSN_UID (ref));
+      }
+
+  /* Rewrite every use of REG inside the chain into a V2DI subreg use
+     of VREG.  */
+  for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+    if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+      {
+	replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, vreg);
+
+	if (dump_file)
+	  fprintf (dump_file, "  Replaced r%d with r%d in insn %d\n",
+		   regno, REGNO (vreg), DF_REF_INSN_UID (ref));
+      }
+}
+
+/* Convert all definitions of register REGNO
+   and fix its uses.  Scalar copies may be created
+   in case register is used in not convertible insn.  */
+
+void
+scalar_chain::convert_reg (unsigned regno)
+{
+  bool scalar_copy = bitmap_bit_p (defs_conv, regno);
+  rtx reg = regno_reg_rtx[regno];
+  rtx scopy = NULL_RTX;
+  df_ref ref;
+  bitmap conv;
+
+  /* CONV tracks chain insns whose uses of REG still need rewriting.  */
+  conv = BITMAP_ALLOC (NULL);
+  bitmap_copy (conv, insns);
+
+  if (scalar_copy)
+    scopy = gen_reg_rtx (DImode);
+
+  for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+    {
+      rtx_insn *insn = DF_REF_INSN (ref);
+      rtx def_set = single_set (insn);
+      rtx src = SET_SRC (def_set);
+      /* NOTE(review): this REG shadows the function-scope REG above;
+	 presumably both are always the same rtx here -- confirm.  */
+      rtx reg = DF_REF_REG (ref);
+
+      /* Loads keep their DImode destination; any other definition is
+	 rewritten to define the V2DI subreg directly.  */
+      if (!MEM_P (src))
+	{
+	  replace_with_subreg_in_insn (insn, reg, reg);
+	  bitmap_clear_bit (conv, INSN_UID (insn));
+	}
+
+      if (scalar_copy)
+	{
+	  /* REG is also needed in scalar mode: extract both 32-bit
+	     halves into SCOPY after each definition.  */
+	  rtx vcopy = gen_reg_rtx (V2DImode);
+
+	  start_sequence ();
+	  if (TARGET_INTER_UNIT_MOVES_FROM_VEC)
+	    {
+	      /* movd low half, shift right by 32, movd high half.  */
+	      emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
+	      emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
+			      gen_rtx_SUBREG (SImode, vcopy, 0));
+	      emit_move_insn (vcopy,
+			      gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
+	      emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
+			      gen_rtx_SUBREG (SImode, vcopy, 0));
+	    }
+	  else
+	    {
+	      /* No direct SSE-to-GPR moves: bounce through a stack
+		 slot.  */
+	      rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP);
+	      emit_move_insn (tmp, reg);
+	      emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
+			      adjust_address (tmp, SImode, 0));
+	      emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
+			      adjust_address (tmp, SImode, 4));
+	    }
+	  emit_conversion_insns (get_insns (), insn);
+	  end_sequence ();
+
+	  if (dump_file)
+	    fprintf (dump_file,
+		     "  Copied r%d to a scalar register r%d for insn %d\n",
+		     regno, REGNO (scopy), INSN_UID (insn));
+	}
+    }
+
+  /* Fix remaining uses: inside the chain they become V2DI subreg
+     uses; outside the chain they are redirected to the scalar
+     copy.  */
+  for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+    if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+      {
+	if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
+	  {
+	    /* Reg-to-mem stores keep the DImode source register.  */
+	    rtx def_set = single_set (DF_REF_INSN (ref));
+	    if (!MEM_P (SET_DEST (def_set))
+		|| !REG_P (SET_SRC (def_set)))
+	      replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, reg);
+	    bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
+	  }
+      }
+    else
+      {
+	replace_rtx (DF_REF_INSN (ref), reg, scopy);
+	df_insn_rescan (DF_REF_INSN (ref));
+      }
+
+  BITMAP_FREE (conv);
+}
+
+/* Convert operand OP in INSN.  All register uses
+   are converted during registers conversion.
+   Therefore we should just handle memory operands.  */
+
+void
+scalar_chain::convert_op (rtx *op, rtx_insn *insn)
+{
+  *op = copy_rtx_if_shared (*op);
+
+  if (MEM_P (*op))
+    {
+      /* Load the memory operand into a fresh DImode pseudo and use
+	 its V2DI subreg as the operand.  */
+      rtx tmp = gen_reg_rtx (DImode);
+
+      emit_insn_before (gen_move_insn (tmp, *op), insn);
+      *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
+
+      if (dump_file)
+	fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
+		 INSN_UID (insn), REGNO (tmp));
+    }
+  else
+    {
+      /* Register operands were already rewritten by convert_reg /
+	 make_vector_copies.  */
+      gcc_assert (SUBREG_P (*op));
+      gcc_assert (GET_MODE (*op) == V2DImode);
+    }
+}
+
+/* Convert INSN to vector mode.  */
+
+void
+scalar_chain::convert_insn (rtx_insn *insn)
+{
+  rtx def_set = single_set (insn);
+  rtx src = SET_SRC (def_set);
+  rtx dst = SET_DEST (def_set);
+  rtx subreg;
+
+  if (MEM_P (dst) && !REG_P (src))
+    {
+      /* There are no scalar integer instructions and therefore
+	 temporary register usage is required.  */
+      rtx tmp = gen_reg_rtx (DImode);
+      emit_conversion_insns (gen_move_insn (dst, tmp), insn);
+      dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
+    }
+
+  switch (GET_CODE (src))
+    {
+    case PLUS:
+    case MINUS:
+    case IOR:
+    case XOR:
+    case AND:
+      /* Binary ops: convert both operands, retag the op as V2DI.  */
+      convert_op (&XEXP (src, 0), insn);
+      convert_op (&XEXP (src, 1), insn);
+      PUT_MODE (src, V2DImode);
+      break;
+
+    case MEM:
+      /* Reg-from-mem loads were left in DImode by convert_reg.  */
+      if (!REG_P (dst))
+	convert_op (&src, insn);
+      break;
+
+    case REG:
+      break;
+
+    case SUBREG:
+      gcc_assert (GET_MODE (src) == V2DImode);
+      break;
+
+    case COMPARE:
+      /* Assumes the compare operand has the shape built by this pass,
+	 i.e. its first operand wraps a DImode reg or a V2DI subreg --
+	 TODO confirm scalar_to_vector_candidate_p guarantees this.  */
+      src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
+
+      gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
+		  || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
+
+      if (REG_P (src))
+	subreg = gen_rtx_SUBREG (V2DImode, src, 0);
+      else
+	subreg = copy_rtx_if_shared (src);
+      /* punpcklqdq the value with itself (presumably so stray upper
+	 bits cannot affect the test -- confirm), then replace the
+	 compare with a ptest of the value against itself.  */
+      emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
+						    copy_rtx_if_shared (subreg),
+						    copy_rtx_if_shared (subreg)),
+			insn);
+      dst = gen_rtx_REG (CCmode, FLAGS_REG);
+      src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
+					       copy_rtx_if_shared (src)),
+			    UNSPEC_PTEST);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  SET_SRC (def_set) = src;
+  SET_DEST (def_set) = dst;
+
+  /* Drop possible dead definitions.  */
+  PATTERN (insn) = def_set;
+
+  INSN_CODE (insn) = -1;
+  recog_memoized (insn);
+  df_insn_rescan (insn);
+}
+
+/* Convert whole chain creating required register
+   conversions and copies.  Returns the number of
+   converted insns.  */
+
+int
+scalar_chain::convert ()
+{
+  bitmap_iterator bi;
+  unsigned id;
+  int converted_insns = 0;
+
+  /* Allow bisecting miscompiles with -fdbg-cnt=stv_conversion.  */
+  if (!dbg_cnt (stv_conversion))
+    return 0;
+
+  if (dump_file)
+    fprintf (dump_file, "Converting chain #%d...\n", chain_id);
+
+  /* First rewrite registers defined inside the chain, then make
+     vector copies of registers defined outside it, and only then
+     rewrite the insns themselves.  */
+  EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
+    convert_reg (id);
+
+  EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
+    make_vector_copies (id);
+
+  EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
+    {
+      convert_insn (DF_INSN_UID_GET (id)->insn);
+      converted_insns++;
+    }
+
+  return converted_insns;
+}
+
+/* Main STV pass function.  Find and convert scalar
+   instructions into vector mode when profitable.  */
+
+static unsigned int
+convert_scalars_to_vector ()
+{
+  basic_block bb;
+  bitmap candidates;
+  int converted_insns = 0;
+
+  bitmap_obstack_initialize (NULL);
+  candidates = BITMAP_ALLOC (NULL);
+
+  calculate_dominance_info (CDI_DOMINATORS);
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+  df_md_add_problem ();
+  df_analyze ();
+
+  /* Find all instructions we want to convert into vector mode.  */
+  if (dump_file)
+    fprintf (dump_file, "Searching for mode conversion candidates...\n");
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *insn;
+      FOR_BB_INSNS (bb, insn)
+	if (scalar_to_vector_candidate_p (insn))
+	  {
+	    if (dump_file)
+	      fprintf (dump_file, "  insn %d is marked as a candidate\n",
+		       INSN_UID (insn));
+
+	    bitmap_set_bit (candidates, INSN_UID (insn));
+	  }
+    }
+
+  /* Drop candidates whose defined registers are also defined or used
+     in non-convertible insns.  */
+  remove_non_convertible_regs (candidates);
+
+  if (bitmap_empty_p (candidates) && dump_file)
+    fprintf (dump_file, "There are no candidates for optimization.\n");
+
+  while (!bitmap_empty_p (candidates))
+    {
+      unsigned uid = bitmap_first_set_bit (candidates);
+      scalar_chain chain;
+
+      /* Find instructions chain we want to convert to vector mode.
+	 Check all uses and definitions to estimate all required
+	 conversions.  */
+      chain.build (candidates, uid);
+
+      if (chain.compute_convert_gain () > 0)
+	converted_insns += chain.convert ();
+      else if (dump_file)
+	fprintf (dump_file, "Chain #%d conversion is not profitable\n",
+		 chain.chain_id);
+    }
+
+  if (dump_file)
+    fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
+
+  BITMAP_FREE (candidates);
+  bitmap_obstack_release (NULL);
+  df_process_deferred_rescans ();
+
+  /* Conversion means we may have 128bit register spills/fills
+     which require aligned stack.  */
+  if (converted_insns)
+    {
+      if (crtl->stack_alignment_needed < 128)
+	crtl->stack_alignment_needed = 128;
+      if (crtl->stack_alignment_estimated < 128)
+	crtl->stack_alignment_estimated = 128;
+    }
+
+  return 0;
+}
+
 namespace {
 
 const pass_data pass_data_insert_vzeroupper =
@@ -2637,6 +3540,39 @@  public:
 
 }; // class pass_insert_vzeroupper
 
+/* Pass descriptor for the STV (Scalar To Vector) RTL pass, inserted
+   after combine (see ix86_option_override).  */
+const pass_data pass_data_stv =
+{
+  RTL_PASS, /* type */
+  "stv", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+/* The STV pass object; its work is done in convert_scalars_to_vector.  */
+class pass_stv : public rtl_opt_pass
+{
+public:
+  pass_stv (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_stv, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  /* Run only for 32-bit SSE2 targets with -mstv when optimizing.  */
+  virtual bool gate (function *)
+    {
+      return !TARGET_64BIT && TARGET_STV && TARGET_SSE2 && optimize > 1;
+    }
+
+  virtual unsigned int execute (function *)
+    {
+      return convert_scalars_to_vector ();
+    }
+
+}; // class pass_stv
+
 } // anon namespace
 
 rtl_opt_pass *
@@ -2645,6 +3581,12 @@  make_pass_insert_vzeroupper (gcc::context *ctxt)
   return new pass_insert_vzeroupper (ctxt);
 }
 
+/* Allocate an STV pass instance for the pass manager.  */
+
+rtl_opt_pass *
+make_pass_stv (gcc::context *ctxt)
+{
+  return new pass_stv (ctxt);
+}
+
 /* Return true if a red-zone is in use.  */
 
 static inline bool
@@ -2754,6 +3696,7 @@  ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
     { "-mvect8-ret-in-mem",		MASK_VECT8_RETURNS },
     { "-m8bit-idiv",			MASK_USE_8BIT_IDIV },
     { "-mvzeroupper",			MASK_VZEROUPPER },
+    { "-mstv",				MASK_STV},
     { "-mavx256-split-unaligned-load",	MASK_AVX256_SPLIT_UNALIGNED_LOAD},
     { "-mavx256-split-unaligned-store",	MASK_AVX256_SPLIT_UNALIGNED_STORE},
     { "-mprefer-avx128",		MASK_PREFER_AVX128},
@@ -4366,6 +5309,8 @@  ix86_option_override_internal (bool main_args_p,
 
   if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
     opts->x_target_flags |= MASK_VZEROUPPER;
+  if (!(opts_set->x_target_flags & MASK_STV))
+    opts->x_target_flags |= MASK_STV;
   if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
       && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
     opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
@@ -4479,12 +5424,18 @@  ix86_option_override (void)
     = { pass_insert_vzeroupper, "reload",
 	1, PASS_POS_INSERT_AFTER
       };
+  opt_pass *pass_stv = make_pass_stv (g);
+  struct register_pass_info stv_info
+    = { pass_stv, "combine",
+	1, PASS_POS_INSERT_AFTER
+      };
 
   ix86_option_override_internal (true, &global_options, &global_options_set);
 
 
   /* This needs to be done at start up.  It's convenient to do it here.  */
   register_pass (&insert_vzeroupper_info);
+  register_pass (&stv_info);
 }
 
 /* Implement the TARGET_OFFLOAD_OPTIONS hook.  */
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 7808705..89b74c9 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -978,6 +978,11 @@ 
 			       (HI "TARGET_HIMODE_MATH")
 			       SI])
 
+;; Math-dependent integer modes with DImode.
+(define_mode_iterator SWIM1248x [(QI "TARGET_QIMODE_MATH")
+				 (HI "TARGET_HIMODE_MATH")
+				 SI (DI "(TARGET_STV && TARGET_SSE2) || TARGET_64BIT")])
+
 ;; Math-dependant single word integer modes without QImode.
 (define_mode_iterator SWIM248 [(HI "TARGET_HIMODE_MATH")
 		      	       SI (DI "TARGET_64BIT")])
@@ -2094,9 +2099,9 @@ 
 
 (define_insn "*movdi_internal"
   [(set (match_operand:DI 0 "nonimmediate_operand"
-    "=r  ,o  ,r,r  ,r,m ,*y,*y,?*y,?m,?r ,?*Ym,*v,*v,*v,m ,?r ,?r,?*Yi,?*Ym,?*Yi,*k,*k ,*r ,*m")
+    "=r  ,o  ,r,r  ,r,m ,*y,*y,?*y,?m,?r ,?*Ym,*v,*v,*v,m ,m,?r ,?r,?*Yi,?*Ym,?*Yi,*k,*k ,*r ,*m")
 	(match_operand:DI 1 "general_operand"
-    "riFo,riF,Z,rem,i,re,C ,*y,m  ,*y,*Yn,r   ,C ,*v,m ,*v,*Yj,*v,r   ,*Yj ,*Yn ,*r ,*km,*k,*k"))]
+    "riFo,riF,Z,rem,i,re,C ,*y,m  ,*y,*Yn,r   ,C ,*v,m ,*v,v,*Yj,*v,r   ,*Yj ,*Yn ,*r ,*km,*k,*k"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2174,9 +2179,9 @@ 
   [(set (attr "isa")
      (cond [(eq_attr "alternative" "0,1")
 	      (const_string "nox64")
-	    (eq_attr "alternative" "2,3,4,5,10,11,16,18,21,23")
+	    (eq_attr "alternative" "2,3,4,5,10,11,17,19,22,24")
 	      (const_string "x64")
-	    (eq_attr "alternative" "17")
+	    (eq_attr "alternative" "18")
 	      (const_string "x64_sse4")
 	   ]
 	   (const_string "*")))
@@ -2187,13 +2192,13 @@ 
 	      (const_string "mmx")
 	    (eq_attr "alternative" "7,8,9,10,11")
 	      (const_string "mmxmov")
-	    (eq_attr "alternative" "12,17")
+	    (eq_attr "alternative" "12,18")
 	      (const_string "sselog1")
-	    (eq_attr "alternative" "13,14,15,16,18")
+	    (eq_attr "alternative" "13,14,15,16,17,19")
 	      (const_string "ssemov")
-	    (eq_attr "alternative" "19,20")
+	    (eq_attr "alternative" "20,21")
 	      (const_string "ssecvt")
-	    (eq_attr "alternative" "21,22,23,24")
+	    (eq_attr "alternative" "22,23,24,25")
 	      (const_string "mskmov")
 	    (and (match_operand 0 "register_operand")
 		 (match_operand 1 "pic_32bit_operand"))
@@ -2208,16 +2213,16 @@ 
    (set (attr "length_immediate")
      (cond [(and (eq_attr "alternative" "4") (eq_attr "type" "imov"))
 	      (const_string "8")
-	    (eq_attr "alternative" "17")
+	    (eq_attr "alternative" "18")
 	      (const_string "1")
 	   ]
 	   (const_string "*")))
    (set (attr "prefix_rex")
-     (if_then_else (eq_attr "alternative" "10,11,16,17,18")
+     (if_then_else (eq_attr "alternative" "10,11,17,18,19")
        (const_string "1")
        (const_string "*")))
    (set (attr "prefix_extra")
-     (if_then_else (eq_attr "alternative" "17")
+     (if_then_else (eq_attr "alternative" "18")
        (const_string "1")
        (const_string "*")))
    (set (attr "prefix")
@@ -2245,13 +2250,26 @@ 
 		    ]
 		    (const_string "TI"))
 
-	    (and (eq_attr "alternative" "14,15")
+	    (and (eq_attr "alternative" "14,15,16")
 		 (not (match_test "TARGET_SSE2")))
 	      (const_string "V2SF")
-	    (eq_attr "alternative" "17")
+	    (eq_attr "alternative" "18")
 	      (const_string "TI")
 	   ]
-	   (const_string "DI")))])
+	   (const_string "DI")))
+   (set (attr "enabled")
+     (cond [(eq_attr "alternative" "15")
+              (if_then_else
+		(match_test "TARGET_STV && TARGET_SSE2")
+		(symbol_ref "false")
+		(const_string "*"))
+	    (eq_attr "alternative" "16")
+              (if_then_else
+		(match_test "TARGET_STV && TARGET_SSE2")
+		(symbol_ref "true")
+		(symbol_ref "false"))
+	   ]
+	   (const_string "*")))])
 
 (define_split
   [(set (match_operand:DI 0 "nonimmediate_operand")
@@ -3811,6 +3829,26 @@ 
   "movz{bl|x}\t{%1, %k0|%k0, %1}"
   [(set_attr "type" "imovx")
    (set_attr "mode" "SI")])
+
+;; Keep QI/HI-to-DI zero extensions whole until after reload so the
+;; STV pass can convert the DImode value to a vector register.  Split
+;; into a 32-bit zero extend plus clearing of the high word only when
+;; the destination ended up in a general register: the "r" constraint
+;; does not restrict pre-reload matching, hence the explicit
+;; GENERAL_REG_P test (an SSE destination is handled by vector
+;; patterns instead).
+(define_insn_and_split "*zext<mode>_doubleword"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI (match_operand:SWI12 1 "nonimmediate_operand" "<r>m")))]
+  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
+  "#"
+  "&& reload_completed && GENERAL_REG_P (operands[0])"
+  [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
+   (set (match_dup 2) (const_int 0))]
+  "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
+
+;; As above for an SImode source: the low word is a plain move and the
+;; high word is cleared.
+(define_insn_and_split "*zextsi_doubleword"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "rm")))]
+  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2"
+  "#"
+  "&& reload_completed && GENERAL_REG_P (operands[0])"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 2) (const_int 0))]
+  "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
 
 ;; Sign extension instructions
 
@@ -7860,9 +7898,9 @@ 
 ;; it should be done with splitters.
 
 (define_expand "and<mode>3"
-  [(set (match_operand:SWIM 0 "nonimmediate_operand")
-	(and:SWIM (match_operand:SWIM 1 "nonimmediate_operand")
-		  (match_operand:SWIM 2 "<general_szext_operand>")))]
+  [(set (match_operand:SWIM1248x 0 "nonimmediate_operand")
+	(and:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")
+		      (match_operand:SWIM1248x 2 "<general_szext_operand>")))]
   ""
 {
   machine_mode mode = <MODE>mode;
@@ -7940,6 +7978,23 @@ 
        (const_string "*")))
    (set_attr "mode" "SI,DI,DI,SI,DI")])
 
+;; DImode AND is kept as a single insn on !TARGET_64BIT so the STV
+;; pass may turn it into a V2DI pand; if it stays scalar it is split
+;; after reload into SImode ANDs of the low and high halves.
+(define_insn_and_split "*anddi3_doubleword"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r")
+	(and:DI
+	 (match_operand:DI 1 "nonimmediate_operand" "%0,0,0")
+	 (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 && ix86_binary_operator_ok (AND, DImode, operands)"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (and:SI (match_dup 1) (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 3)
+		   (and:SI (match_dup 4) (match_dup 5)))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]);")
+
 (define_insn "*andsi_1"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r,Ya,!k")
 	(and:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,qm,k")
@@ -8427,9 +8482,9 @@ 
 ;; If this is considered useful, it should be done with splitters.
 
 (define_expand "<code><mode>3"
-  [(set (match_operand:SWIM 0 "nonimmediate_operand")
-	(any_or:SWIM (match_operand:SWIM 1 "nonimmediate_operand")
-		     (match_operand:SWIM 2 "<general_operand>")))]
+  [(set (match_operand:SWIM1248x 0 "nonimmediate_operand")
+	(any_or:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")
+			     (match_operand:SWIM1248x 2 "<general_operand>")))]
   ""
   "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
 
@@ -8447,6 +8502,23 @@ 
   [(set_attr "type" "alu,alu,msklog")
    (set_attr "mode" "<MODE>")])
 
+;; Likewise for DImode IOR/XOR (por/pxor when vectorized by STV).
+(define_insn_and_split "*<code>di3_doubleword"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r")
+	(any_or:DI
+	 (match_operand:DI 1 "nonimmediate_operand" "%0,0,0")
+	 (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 && ix86_binary_operator_ok (<CODE>, DImode, operands)"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (any_or:SI (match_dup 1) (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 3)
+		   (any_or:SI (match_dup 4) (match_dup 5)))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]);")
+
 (define_insn "*<code>hi_1"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=r,rm,!k")
 	(any_or:HI
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 042f3c1..dae5c5d 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -567,6 +567,11 @@  Target Report Mask(VZEROUPPER) Save
 Generate vzeroupper instruction before a transfer of control flow out of
 the function.
 
+mstv
+Target Report Mask(STV) Save
+Enable Scalar to Vector optimization pass transforming 64-bit integer
+computations into vector ones.
+
 mdispatch-scheduler
 Target RejectNegative Var(flag_dispatch_scheduler)
 Do dispatch scheduling if processor is bdver1 or bdver2 or bdver3 or bdver4 and Haifa scheduling
diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
index 95f6b06..583b16b 100644
--- a/gcc/dbgcnt.def
+++ b/gcc/dbgcnt.def
@@ -186,6 +186,7 @@  DEBUG_COUNTER (sel_sched_region_cnt)
 DEBUG_COUNTER (sms_sched_loop)
 DEBUG_COUNTER (split_for_sched2)
 DEBUG_COUNTER (store_motion)
+DEBUG_COUNTER (stv_conversion)
 DEBUG_COUNTER (tail_call)
 DEBUG_COUNTER (treepre_insert)
 DEBUG_COUNTER (tree_sra)
diff --git a/gcc/testsuite/gcc.dg/lower-subreg-1.c b/gcc/testsuite/gcc.dg/lower-subreg-1.c
index 6362d37..47057fe 100644
--- a/gcc/testsuite/gcc.dg/lower-subreg-1.c
+++ b/gcc/testsuite/gcc.dg/lower-subreg-1.c
@@ -1,5 +1,6 @@ 
 /* { dg-do compile { target { ! { mips64 || { aarch64*-*-* arm*-*-* ia64-*-* sparc*-*-* spu-*-* tilegx-*-* } } } } } */
 /* { dg-options "-O -fdump-rtl-subreg1" } */
+/* { dg-additional-options "-mno-stv" { target ia32 } } */
 /* { dg-skip-if "" { { i?86-*-* x86_64-*-* } && x32 } { "*" } { "" } } */
 /* { dg-require-effective-target ilp32 } */
 
diff --git a/gcc/testsuite/gcc.target/i386/pr65105-1.c b/gcc/testsuite/gcc.target/i386/pr65105-1.c
new file mode 100644
index 0000000..bac6c07
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr65105-1.c
@@ -0,0 +1,50 @@ 
+/* PR target/pr65105 */
+/* { dg-do run { target { ia32 } } } */
+/* { dg-options "-O2 -march=slm" } */
+/* { dg-final { scan-assembler "por" } } */
+/* { dg-final { scan-assembler "pand" } } */
+
+#include "stdlib.h"
+
+static int count = 0;
+
+/* Called once per iteration of the loop in test (); terminates the
+   program via exit (1) if L is zero or more than 5 calls occur.  */
+void __attribute__((noinline))
+counter (long long l)
+{
+  count++;
+  if (!l || count > 5)
+    exit (1);
+}
+
+
+/* 64-bit AND/OR chain the STV pass should vectorize (hence the
+   pand/por scan-assembler directives above).  */
+void __attribute__((noinline))
+test (long long *arr)
+{
+  register unsigned long long tmp;
+
+  tmp = arr[0] | arr[1] & arr[2];
+  while (tmp)
+    {
+      counter (tmp);
+      tmp = *(arr++) & tmp;
+    }
+}
+
+/* Fill ARR with patterns chosen so the loop in test () terminates
+   after exactly five counter () calls.  */
+void  __attribute__((noinline))
+fill_data (long long *arr)
+{
+  arr[0] = 0x00ffffffL;
+  arr[1] = 0xffffff00L;
+  arr[2] = 0x00ffffffL;
+  arr[3] = 0x0000ff00L;
+  arr[4] = 0x00ff0000L;
+  arr[5] = 0xff000000L;
+}
+
+/* Returns 0 iff counter () ran exactly five times.  */
+int
+main (int argc, const char **argv)
+{
+  long long arr[6];
+  fill_data (arr);
+  test (arr);
+  return count - 5;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr65105-2.c b/gcc/testsuite/gcc.target/i386/pr65105-2.c
new file mode 100644
index 0000000..9216894
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr65105-2.c
@@ -0,0 +1,12 @@ 
+/* PR target/pr65105 */
+/* { dg-do compile { target { ia32 } } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler "por" } } */
+
+long long i1, i2, res;
+
+/* A 64-bit OR of memory operands the STV pass should turn into por
+   (see scan-assembler above).  */
+void
+test ()
+{
+  res = i1 | i2;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr65105-3.c b/gcc/testsuite/gcc.target/i386/pr65105-3.c
new file mode 100644
index 0000000..b83989f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr65105-3.c
@@ -0,0 +1,16 @@ 
+/* PR target/pr65105 */
+/* { dg-do compile { target { ia32 } } } */
+/* { dg-options "-O2 -march=slm -msse4.2" } */
+/* { dg-final { scan-assembler "pand" } } */
+/* { dg-final { scan-assembler "por" } } */
+/* { dg-final { scan-assembler "ptest" } } */
+
+long long i1, i2, i3, res;
+
+/* 64-bit OR feeding a zero test and an AND: STV should produce por,
+   ptest and pand (see scan-assembler directives above).  */
+void
+test ()
+{
+  res = i1 | i2;
+  if (res)
+    res &= i3;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr65105-4.C b/gcc/testsuite/gcc.target/i386/pr65105-4.C
new file mode 100644
index 0000000..9acf368
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr65105-4.C
@@ -0,0 +1,19 @@ 
+/* PR target/pr65105 */
+/* { dg-do run { target { ia32 } } } */
+/* { dg-options "-O2 -march=slm" } */
+
+struct s {
+  long long l1, l2, l3, l4, l5;
+} *a;
+long long b;
+long long fn1()
+{
+  try
+    {
+      b = (a->l1 | a->l2 | a->l3 | a->l4 | a->l5);
+      return a->l1;
+    }
+  catch (int)
+    {
+    }
+}