diff mbox

[AArch64] optimize float immediate moves (1 /4) - infrastructure.

Message ID VI1PR0801MB203183E948429F85BFB14E3CFFDF0@VI1PR0801MB2031.eurprd08.prod.outlook.com
State New
Headers show

Commit Message

Tamar Christina June 26, 2017, 10:49 a.m. UTC
Hi All,

I've updated patch accordingly.

This mostly involves removing the loop to create the ival
and removing the *2 code and instead defaulting to 64bit
and switching to 128 when needed.

Regression tested on aarch64-none-linux-gnu and no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-26  Tamar Christina  <tamar.christina@arm.com>

        * config/aarch64/aarch64.c
        (aarch64_simd_container_mode): Add prototype.
        (aarch64_expand_mov_immediate): Add HI support.
        (aarch64_reinterpret_float_as_int, aarch64_float_const_rtx_p: New.
        (aarch64_can_const_movi_rtx_p): New.
        (aarch64_preferred_reload_class):
        Remove restrictions of using FP registers for certain SIMD operations.
        (aarch64_rtx_costs): Added new cost for CONST_DOUBLE moves.
        (aarch64_valid_floating_const): Add integer move validation.
        (aarch64_simd_imm_scalar_p): Remove.
        (aarch64_output_scalar_simd_mov_immediate): Generalize function.
        (aarch64_legitimate_constant_p): Expand list of supported cases.
        * config/aarch64/aarch64-protos.h
        (aarch64_float_const_rtx_p, aarch64_can_const_movi_rtx_p): New.
        (aarch64_reinterpret_float_as_int): New.
        (aarch64_simd_imm_scalar_p): Remove.
        * config/aarch64/predicates.md (aarch64_reg_or_fp_float): New.
        * config/aarch64/constraints.md (Uvi): New.
        (Dd): Split into Ds and new Dd.
        * config/aarch64/aarch64.md (*movsi_aarch64):
        Add SIMD mov case.
        (*movdi_aarch64): Add SIMD mov case.

Comments

Tamar Christina July 3, 2017, 6:11 a.m. UTC | #1
Ping
Tamar Christina July 10, 2017, 7:35 a.m. UTC | #2
Ping
James Greenhalgh July 19, 2017, 3:47 p.m. UTC | #3
On Mon, Jun 26, 2017 at 11:49:42AM +0100, Tamar Christina wrote:
> Hi All,
> 
> I've updated patch accordingly.
> 
> This mostly involves removing the loop to create the ival
> and removing the *2 code and instead defaulting to 64bit
> and switching to 128 when needed.
> 
> Regression tested on aarch64-none-linux-gnu and no regressions.
> 
> OK for trunk?

Almost, with a couple of small changes and clarifications.

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 04417dcd609f6e8ff594a9c5853b3143696d3208..efb027f7fa9b9750b019c529bbcfc8b73dbaf804 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -4668,6 +4670,62 @@ aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
>    return true;
>  }
>  
> +/* Return the binary representation of floating point constant VALUE in INTVAL.
> +   If the value cannot be converted, return false without setting INTVAL.
> +   The conversion is done in the given MODE.  */
> +bool
> +aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
> +{
> +  machine_mode mode = GET_MODE (value);
> +  if (GET_CODE (value) != CONST_DOUBLE
> +      || !SCALAR_FLOAT_MODE_P (mode)
> +      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT)
> +    return false;
> +
> +  unsigned HOST_WIDE_INT ival = 0;
> +
> +  /* Only support up to DF mode.  */
> +  gcc_assert (GET_MODE_BITSIZE (mode) <= 64);

GET_MODE_BITSIZE (DFmode) would be more self-documenting.

> +
> +  long res[2];
> +  real_to_target (res,
> +		  CONST_DOUBLE_REAL_VALUE (value),
> +		  REAL_MODE_FORMAT (mode));
> +
> +  ival = zext_hwi (res[0], 32);
> +  if (GET_MODE_BITSIZE (mode) == 64)

Likewise here.

> +    ival |= (zext_hwi (res[1], 32) << 32);
> +
> +  *intval = ival;
> +  return true;
> +}
> +

> @@ -4680,6 +4738,46 @@ aarch64_float_const_zero_rtx_p (rtx x)
>    return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
>  }
>  
> +/* Return TRUE if rtx X is immediate constant that fits in a single
> +   MOVI immediate operation.  */
> +bool
> +aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
> +{
> +  if (!TARGET_SIMD)
> +     return false;
> +
> +  machine_mode vmode, imode;
> +  unsigned HOST_WIDE_INT ival;
> +
> +  /* Don't write float constants out to memory.  */

This comment seems (only a little) out of line with the code below - "Don't
write float constants out to memory if we can represent them as integers."

> +  if (GET_CODE (x) == CONST_DOUBLE
> +      && SCALAR_FLOAT_MODE_P (mode))
> +    {
> +      if (!aarch64_reinterpret_float_as_int (x, &ival))
> +	return false;
> +
> +      imode = int_mode_for_mode (mode);
> +    }
> +  else if (GET_CODE (x) == CONST_INT
> +	   && SCALAR_INT_MODE_P (mode))
> +    {
> +       imode = mode;
> +       ival = INTVAL (x);
> +    }
> +  else
> +    return false;
> +
> +   /* use a 64 bit mode for everything except for DI/DF mode, where we use
> +     a 128 bit vector mode.  */
> +  int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
> +
> +  vmode = aarch64_simd_container_mode (imode, width);
> +  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
> +
> +  return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);

I still wonder whether we could rewrite aarch64_simd_valid)_immediate to
avoid the need for a 128-bit vector here - 64-bits are good enough.

This doesn't need fixed for this patch, but it would make for a small
optimisation.

> +}
> +
> +
>  /* Return the fixed registers used for condition codes.  */
>  
>  static bool
> @@ -5857,12 +5955,6 @@ aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
>        return NO_REGS;
>      }
>  
> -  /* If it's an integer immediate that MOVI can't handle, then
> -     FP_REGS is not an option, so we return NO_REGS instead.  */
> -  if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
> -      && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
> -    return NO_REGS;
> -

I don't understand this relaxation could you explain what this achieves
and why it is safe in this patch?

>    /* Register eliminiation can result in a request for
>       SP+constant->FP_REGS.  We cannot support such operations which
>       use SP as source and an FP_REG as destination, so reject out
> @@ -6773,26 +6865,44 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
>        return true;
>  
>      case CONST_DOUBLE:
> +
> +	 /* First determine number of instructions to do the move
> +	    as an integer constant.  */
> +	if (!aarch64_float_const_representable_p (x)
> +	    && !aarch64_can_const_movi_rtx_p (x, mode)
> +	    && aarch64_float_const_rtx_p (x))
> +	  {
> +	    unsigned HOST_WIDE_INT ival;
> +	    bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
> +	    gcc_assert (succeed);
> +
> +	    machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
> +	    int ncost = aarch64_internal_mov_immediate
> +		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
> +	    *cost += COSTS_N_INSNS (ncost);
> +	    return true;
> +	  }
> +
>        if (speed)
>  	{
> -	  /* mov[df,sf]_aarch64.  */
> -	  if (aarch64_float_const_representable_p (x))
> -	    /* FMOV (scalar immediate).  */
> -	    *cost += extra_cost->fp[mode == DFmode].fpconst;
> -	  else if (!aarch64_float_const_zero_rtx_p (x))
> -	    {
> -	      /* This will be a load from memory.  */
> -	      if (mode == DFmode)
> +	/* mov[df,sf]_aarch64.  */
> +	if (aarch64_float_const_representable_p (x))
> +	  /* FMOV (scalar immediate).  */
> +	  *cost += extra_cost->fp[mode == DFmode].fpconst;
> +	else if (!aarch64_float_const_zero_rtx_p (x))
> +	  {
> +	    /* This will be a load from memory.  */
> +	    if (mode == DFmode)
>  		*cost += extra_cost->ldst.loadd;
> -	      else
> +	    else
>  		*cost += extra_cost->ldst.loadf;
> -	    }
> -	  else
> -	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
> -	       or MOV v0.s[0], wzr - neither of which are modeled by the
> -	       cost tables.  Just use the default cost.  */
> -	    {
> -	    }
> +	  }
> +	else
> +	  /* Otherwise this is +0.0.  We get this using MOVI d0, #0
> +	     or MOV v0.s[0], wzr - neither of which are modeled by the
> +	     cost tables.  Just use the default cost.  */
> +	  {
> +	  }


Why reindent this code? From what I can tell this makes the code less
conformant to GNU style.

>  	}
>  
>        return true;
> @@ -6974,7 +7084,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
>  	  if (speed)
>  	    *cost += extra_cost->fp[mode == DFmode].compare;
>  
> -          if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
> +	  if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))

Likewise here?

>              {
>  	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
>                /* FCMP supports constant 0.0 for no extra cost. */

> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index cd7ded986630c14ed6d42618b2a1f9baa0cbd192..6992c82fa790eac34669fcc5b030e395ad332201 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -53,6 +53,11 @@
>         (ior (match_operand 0 "register_operand")
>  	    (match_test "op == const0_rtx"))))
>  
> +(define_predicate "aarch64_reg_or_fp_float"
> +  (ior (match_operand 0 "register_operand")
> +	(and (match_code "const_double")
> +	     (match_test "aarch64_float_const_rtx_p (op)"))))
> +

Unused?

>  (define_predicate "aarch64_reg_or_fp_zero"
>    (ior (match_operand 0 "register_operand")
>  	(and (match_code "const_double")

Thanks,
James
Tamar Christina July 26, 2017, 4 p.m. UTC | #4
Hi James,

I have updated the patch and have responded to your question blow.

Ok for trunk?

Thanks,
Tamar

> >  static bool
> > @@ -5857,12 +5955,6 @@ aarch64_preferred_reload_class (rtx x,
> reg_class_t regclass)
> >        return NO_REGS;
> >      }
> >
> > -  /* If it's an integer immediate that MOVI can't handle, then
> > -     FP_REGS is not an option, so we return NO_REGS instead.  */
> > -  if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
> > -      && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
> > -    return NO_REGS;
> > -
> 
> I don't understand this relaxation could you explain what this achieves and
> why it is safe in this patch?

Because this should be left up to the pattern to decide what should happen and not reload.
Leaving the check here also means you'll do a reasonably expensive check twice for each constant
you can emit a move for.

Removing extra restriction on the constant classes leaves it up to aarch64_legitimate_constant_p to decide if
if the constant can be emitted as a move or should be forced to memory.
aarch64_legitimate_constant_p also calls aarch64_cannot_force_const_mem.

The documentation for TARGET_PREFERRED_RELOAD_CLASS also states: 

"One case where TARGET_PREFERRED_RELOAD_CLASS must not return rclass is if x is a legitimate constant which cannot be loaded into some register class. By returning NO_REGS you can force x into a memory location. For example, rs6000 can load immediate values into general-purpose registers, but does not have an instruction for loading an immediate value into a floating-point register, so TARGET_PREFERRED_RELOAD_CLASS returns NO_REGS when x is a floating-point constant. If the constant can't be loaded into any kind of register, code generation will be better if TARGET_LEGITIMATE_CONSTANT_P makes the constant illegitimate instead of using TARGET_PREFERRED_RELOAD_CLASS. "

So it seems that not only did the original constraint not add anything, we may also generate better code now.
James Greenhalgh July 27, 2017, 3:42 p.m. UTC | #5
On Wed, Jul 26, 2017 at 05:00:05PM +0100, Tamar Christina wrote:
> Hi James,
> 
> I have updated the patch and have responded to your question blow.
> 
> Ok for trunk?

Ok, with a few small changes.

> > >  static bool
> > > @@ -5857,12 +5955,6 @@ aarch64_preferred_reload_class (rtx x,
> > reg_class_t regclass)
> > >        return NO_REGS;
> > >      }
> > >
> > > -  /* If it's an integer immediate that MOVI can't handle, then
> > > -     FP_REGS is not an option, so we return NO_REGS instead.  */
> > > -  if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
> > > -      && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
> > > -    return NO_REGS;
> > > -
> > 
> > I don't understand this relaxation could you explain what this achieves and
> > why it is safe in this patch?
> 
> Because this should be left up to the pattern to decide what should happen
> and not reload.  Leaving the check here also means you'll do a reasonably
> expensive check twice for each constant
> you can emit a move for.
> 
> Removing extra restriction on the constant classes leaves it up to
> aarch64_legitimate_constant_p to decide if if the constant can be emitted as
> a move or should be forced to memory.
> aarch64_legitimate_constant_p also calls aarch64_cannot_force_const_mem.
> 
> The documentation for TARGET_PREFERRED_RELOAD_CLASS also states: 
> 
> "One case where TARGET_PREFERRED_RELOAD_CLASS must not return rclass is if x
> is a legitimate constant which cannot be loaded into some register class. By
> returning NO_REGS you can force x into a memory location. For example, rs6000
> can load immediate values into general-purpose registers, but does not have
> an instruction for loading an immediate value into a floating-point register,
> so TARGET_PREFERRED_RELOAD_CLASS returns NO_REGS when x is a floating-point
> constant. If the constant can't be loaded into any kind of register, code
> generation will be better if TARGET_LEGITIMATE_CONSTANT_P makes the constant
> illegitimate instead of using TARGET_PREFERRED_RELOAD_CLASS. "
> 
> So it seems that not only did the original constraint not add anything, we
> may also generate better code now.

Thanks for the explanation.

> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index e397ff4afa73cfbc7e192fd5686b1beff9bbbadf..fd20576d23cfdc48761f65e41762b2a5a71f3e61 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -326,6 +326,8 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
>  void aarch64_expand_call (rtx, rtx, bool);
>  bool aarch64_expand_movmem (rtx *);
>  bool aarch64_float_const_zero_rtx_p (rtx);
> +bool aarch64_float_const_rtx_p (rtx);
> +bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);

This list should be alphabetical, first by type, then by name. So I'd
expect this to fit in just above aarch64_const_vec_all_same_int_p .

>  bool aarch64_function_arg_regno_p (unsigned);
>  bool aarch64_fusion_enabled_p (enum aarch64_fusion_pairs);
>  bool aarch64_gen_movmemqi (rtx *);
> @@ -353,7 +355,6 @@ bool aarch64_regno_ok_for_base_p (int, bool);
>  bool aarch64_regno_ok_for_index_p (int, bool);
>  bool aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
>  					    bool high);
> -bool aarch64_simd_imm_scalar_p (rtx x, machine_mode mode);
>  bool aarch64_simd_imm_zero_p (rtx, machine_mode);
>  bool aarch64_simd_scalar_immediate_valid_for_move (rtx, machine_mode);
>  bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
> @@ -488,4 +489,6 @@ std::string aarch64_get_extension_string_for_isa_flags (unsigned long,
>  
>  rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
>  
> +bool aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *fail);

This isn't defined in common/config/aarch64-common.c so shouldn't be in
the section for functions which will be defined there. It should be in
the list above between aarch64_regno_ok_for_index_p and
aarch64_simd_check_vect_par_cnst_half.

> +
>  #endif /* GCC_AARCH64_PROTOS_H */
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index d753666ef67fc524260c41f36743df3649a0a98a..b1ddd77823e50e63439e497f695f3fad9bd9efc9 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -147,6 +147,8 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
>  							 const_tree type,
>  							 int misalignment,
>  							 bool is_packed);
> +static machine_mode
> +aarch64_simd_container_mode (machine_mode mode, unsigned width);
>  
>  /* Major revision number of the ARM Architecture implemented by the target.  */
>  unsigned aarch64_architecture_version;
> @@ -4723,6 +4725,62 @@ aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
>    return true;
>  }
>  
> +/* Return the binary representation of floating point constant VALUE in INTVAL.
> +   If the value cannot be converted, return false without setting INTVAL.
> +   The conversion is done in the given MODE.  */
> +bool
> +aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
> +{
> +  machine_mode mode = GET_MODE (value);
> +  if (GET_CODE (value) != CONST_DOUBLE
> +      || !SCALAR_FLOAT_MODE_P (mode)
> +      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT)
> +    return false;
> +
> +  unsigned HOST_WIDE_INT ival = 0;
> +
> +  /* Only support up to DF mode.  */
> +  gcc_assert (GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (DFmode));

I'm somewhere a bit hypothetical here, but on a machine with a 128-bit
HOST_WIDE_INT, you would hit this case and take the assert. Just make
this another case in your if statement above:

  machine_mode mode = GET_MODE (value);
  if (GET_CODE (value) != CONST_DOUBLE
      || !SCALAR_FLOAT_MODE_P (mode)
      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
    return false;

> +  long res[2];
> +  real_to_target (res,
> +		  CONST_DOUBLE_REAL_VALUE (value),
> +		  REAL_MODE_FORMAT (mode));
> +
> +  ival = zext_hwi (res[0], 32);
> +  if (GET_MODE_BITSIZE (mode) == GET_MODE_BITSIZE (DFmode))
> +    ival |= (zext_hwi (res[1], 32) << 32);
> +
> +  *intval = ival;
> +  return true;
> +}
> +

Thanks,
James
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index bfe44a75e12fe1213c1baafb56f8333a30466bc5..e0f34405dff4555ad482f8ca74e7517b8ca24860 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -326,6 +326,8 @@  bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 void aarch64_expand_call (rtx, rtx, bool);
 bool aarch64_expand_movmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
+bool aarch64_float_const_rtx_p (rtx);
+bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
 bool aarch64_function_arg_regno_p (unsigned);
 bool aarch64_fusion_enabled_p (enum aarch64_fusion_pairs);
 bool aarch64_gen_movmemqi (rtx *);
@@ -353,7 +355,6 @@  bool aarch64_regno_ok_for_base_p (int, bool);
 bool aarch64_regno_ok_for_index_p (int, bool);
 bool aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
 					    bool high);
-bool aarch64_simd_imm_scalar_p (rtx x, machine_mode mode);
 bool aarch64_simd_imm_zero_p (rtx, machine_mode);
 bool aarch64_simd_scalar_immediate_valid_for_move (rtx, machine_mode);
 bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
@@ -488,4 +489,6 @@  std::string aarch64_get_extension_string_for_isa_flags (unsigned long,
 
 rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
 
+bool aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *fail);
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 04417dcd609f6e8ff594a9c5853b3143696d3208..efb027f7fa9b9750b019c529bbcfc8b73dbaf804 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -147,6 +147,8 @@  static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 							 const_tree type,
 							 int misalignment,
 							 bool is_packed);
+static machine_mode
+aarch64_simd_container_mode (machine_mode mode, unsigned width);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -4668,6 +4670,62 @@  aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
   return true;
 }
 
+/* Return the binary representation of floating point constant VALUE in INTVAL.
+   If the value cannot be converted, return false without setting INTVAL.
+   The conversion is done in the given MODE.  */
+bool
+aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
+{
+  machine_mode mode = GET_MODE (value);
+  if (GET_CODE (value) != CONST_DOUBLE
+      || !SCALAR_FLOAT_MODE_P (mode)
+      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT)
+    return false;
+
+  unsigned HOST_WIDE_INT ival = 0;
+
+  /* Only support up to DF mode.  */
+  gcc_assert (GET_MODE_BITSIZE (mode) <= 64);
+
+  long res[2];
+  real_to_target (res,
+		  CONST_DOUBLE_REAL_VALUE (value),
+		  REAL_MODE_FORMAT (mode));
+
+  ival = zext_hwi (res[0], 32);
+  if (GET_MODE_BITSIZE (mode) == 64)
+    ival |= (zext_hwi (res[1], 32) << 32);
+
+  *intval = ival;
+  return true;
+}
+
+/* Return TRUE if rtx X is an immediate constant that can be moved using a
+   single MOV(+MOVK) followed by an FMOV.  */
+bool
+aarch64_float_const_rtx_p (rtx x)
+{
+  machine_mode mode = GET_MODE (x);
+  if (mode == VOIDmode)
+    return false;
+
+  /* Determine whether it's cheaper to write float constants as
+     mov/movk pairs over ldr/adrp pairs.  */
+  unsigned HOST_WIDE_INT ival;
+
+  if (GET_CODE (x) == CONST_DOUBLE
+      && SCALAR_FLOAT_MODE_P (mode)
+      && aarch64_reinterpret_float_as_int (x, &ival))
+    {
+      machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
+      int num_instr = aarch64_internal_mov_immediate
+			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
+      return num_instr < 3;
+    }
+
+  return false;
+}
+
 /* Return TRUE if rtx X is immediate constant 0.0 */
 bool
 aarch64_float_const_zero_rtx_p (rtx x)
@@ -4680,6 +4738,46 @@  aarch64_float_const_zero_rtx_p (rtx x)
   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
 }
 
+/* Return TRUE if rtx X is immediate constant that fits in a single
+   MOVI immediate operation.  */
+bool
+aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
+{
+  if (!TARGET_SIMD)
+     return false;
+
+  machine_mode vmode, imode;
+  unsigned HOST_WIDE_INT ival;
+
+  /* Don't write float constants out to memory.  */
+  if (GET_CODE (x) == CONST_DOUBLE
+      && SCALAR_FLOAT_MODE_P (mode))
+    {
+      if (!aarch64_reinterpret_float_as_int (x, &ival))
+	return false;
+
+      imode = int_mode_for_mode (mode);
+    }
+  else if (GET_CODE (x) == CONST_INT
+	   && SCALAR_INT_MODE_P (mode))
+    {
+       imode = mode;
+       ival = INTVAL (x);
+    }
+  else
+    return false;
+
+   /* use a 64 bit mode for everything except for DI/DF mode, where we use
+     a 128 bit vector mode.  */
+  int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
+
+  vmode = aarch64_simd_container_mode (imode, width);
+  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
+
+  return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
+}
+
+
 /* Return the fixed registers used for condition codes.  */
 
 static bool
@@ -5857,12 +5955,6 @@  aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
       return NO_REGS;
     }
 
-  /* If it's an integer immediate that MOVI can't handle, then
-     FP_REGS is not an option, so we return NO_REGS instead.  */
-  if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
-      && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
-    return NO_REGS;
-
   /* Register eliminiation can result in a request for
      SP+constant->FP_REGS.  We cannot support such operations which
      use SP as source and an FP_REG as destination, so reject out
@@ -6773,26 +6865,44 @@  aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
       return true;
 
     case CONST_DOUBLE:
+
+	 /* First determine number of instructions to do the move
+	    as an integer constant.  */
+	if (!aarch64_float_const_representable_p (x)
+	    && !aarch64_can_const_movi_rtx_p (x, mode)
+	    && aarch64_float_const_rtx_p (x))
+	  {
+	    unsigned HOST_WIDE_INT ival;
+	    bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
+	    gcc_assert (succeed);
+
+	    machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
+	    int ncost = aarch64_internal_mov_immediate
+		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
+	    *cost += COSTS_N_INSNS (ncost);
+	    return true;
+	  }
+
       if (speed)
 	{
-	  /* mov[df,sf]_aarch64.  */
-	  if (aarch64_float_const_representable_p (x))
-	    /* FMOV (scalar immediate).  */
-	    *cost += extra_cost->fp[mode == DFmode].fpconst;
-	  else if (!aarch64_float_const_zero_rtx_p (x))
-	    {
-	      /* This will be a load from memory.  */
-	      if (mode == DFmode)
+	/* mov[df,sf]_aarch64.  */
+	if (aarch64_float_const_representable_p (x))
+	  /* FMOV (scalar immediate).  */
+	  *cost += extra_cost->fp[mode == DFmode].fpconst;
+	else if (!aarch64_float_const_zero_rtx_p (x))
+	  {
+	    /* This will be a load from memory.  */
+	    if (mode == DFmode)
 		*cost += extra_cost->ldst.loadd;
-	      else
+	    else
 		*cost += extra_cost->ldst.loadf;
-	    }
-	  else
-	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
-	       or MOV v0.s[0], wzr - neither of which are modeled by the
-	       cost tables.  Just use the default cost.  */
-	    {
-	    }
+	  }
+	else
+	  /* Otherwise this is +0.0.  We get this using MOVI d0, #0
+	     or MOV v0.s[0], wzr - neither of which are modeled by the
+	     cost tables.  Just use the default cost.  */
+	  {
+	  }
 	}
 
       return true;
@@ -6974,7 +7084,7 @@  aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
 	  if (speed)
 	    *cost += extra_cost->fp[mode == DFmode].compare;
 
-          if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
+	  if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
             {
 	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
               /* FCMP supports constant 0.0 for no extra cost. */
@@ -10095,18 +10205,16 @@  aarch64_legitimate_pic_operand_p (rtx x)
 /* Return true if X holds either a quarter-precision or
      floating-point +0.0 constant.  */
 static bool
-aarch64_valid_floating_const (machine_mode mode, rtx x)
+aarch64_valid_floating_const (rtx x)
 {
   if (!CONST_DOUBLE_P (x))
     return false;
 
-  if (aarch64_float_const_zero_rtx_p (x))
+  /* This call determines which constants can be used in mov<mode>
+     as integer moves instead of constant loads.  */
+  if (aarch64_float_const_rtx_p (x))
     return true;
 
-  /* We only handle moving 0.0 to a TFmode register.  */
-  if (!(mode == SFmode || mode == DFmode))
-    return false;
-
   return aarch64_float_const_representable_p (x);
 }
 
@@ -10118,11 +10226,15 @@  aarch64_legitimate_constant_p (machine_mode mode, rtx x)
   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
     return false;
 
-  /* This could probably go away because
-     we now decompose CONST_INTs according to expand_mov_immediate.  */
+  /* For these cases we never want to use a literal load.
+     As such we have to prevent the compiler from forcing these
+     to memory.  */
   if ((GET_CODE (x) == CONST_VECTOR
        && aarch64_simd_valid_immediate (x, mode, false, NULL))
-      || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
+      || CONST_INT_P (x)
+      || aarch64_valid_floating_const (x)
+      || aarch64_can_const_movi_rtx_p (x, mode)
+      || aarch64_float_const_rtx_p (x))
 	return !targetm.cannot_force_const_mem (mode, x);
 
   if (GET_CODE (x) == HIGH
@@ -11400,23 +11512,6 @@  aarch64_mask_from_zextract_ops (rtx width, rtx pos)
 }
 
 bool
-aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
-{
-  HOST_WIDE_INT imm = INTVAL (x);
-  int i;
-
-  for (i = 0; i < 8; i++)
-    {
-      unsigned int byte = imm & 0xff;
-      if (byte != 0xff && byte != 0)
-       return false;
-      imm >>= 8;
-    }
-
-  return true;
-}
-
-bool
 aarch64_mov_operand_p (rtx x, machine_mode mode)
 {
   if (GET_CODE (x) == HIGH
@@ -12809,15 +12904,28 @@  aarch64_output_simd_mov_immediate (rtx const_vector,
 }
 
 char*
-aarch64_output_scalar_simd_mov_immediate (rtx immediate,
-					  machine_mode mode)
+aarch64_output_scalar_simd_mov_immediate (rtx immediate,  machine_mode mode)
 {
+
+  /* If a floating point number was passed and we desire to use it in an
+     integer mode do the conversion to integer.  */
+  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
+    {
+      unsigned HOST_WIDE_INT ival;
+      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
+	  gcc_unreachable ();
+      immediate = gen_int_mode (ival, mode);
+    }
+
   machine_mode vmode;
+  /* use a 64 bit mode for everything except for DI/DF mode, where we use
+     a 128 bit vector mode.  */
+  int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
 
   gcc_assert (!VECTOR_MODE_P (mode));
-  vmode = aarch64_simd_container_mode (mode, 64);
+  vmode = aarch64_simd_container_mode (mode, width);
   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
-  return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
+  return aarch64_output_simd_mov_immediate (v_op, vmode, width);
 }
 
 /* Split operands into moves from op[1] + op[2] into op[0].  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 1a721bfbe42270ec75268b6e2366290aa6ad2134..618f4fa229e5121544f30c7afb20a2cc30e73de1 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -920,8 +920,8 @@ 
 )
 
 (define_insn_and_split "*movsi_aarch64"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,k,r,r,r,r,*w,m,  m,r,r  ,*w,r,*w")
-	(match_operand:SI 1 "aarch64_mov_operand"  " r,r,k,M,n,m, m,rZ,*w,Usa,Ush,rZ,w,*w"))]
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,k,r,r,r,r,*w,m,  m,r,r  ,*w, r,*w,w")
+	(match_operand:SI 1 "aarch64_mov_operand"  " r,r,k,M,n,m, m,rZ,*w,Usa,Ush,rZ,w,*w,Ds"))]
   "(register_operand (operands[0], SImode)
     || aarch64_reg_or_zero (operands[1], SImode))"
   "@
@@ -938,17 +938,19 @@ 
    adrp\\t%x0, %A1
    fmov\\t%s0, %w1
    fmov\\t%w0, %s1
-   fmov\\t%s0, %s1"
-   "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
+   fmov\\t%s0, %s1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);"
+  "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
     && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
-   [(const_int 0)]
-   "{
-       aarch64_expand_mov_immediate (operands[0], operands[1]);
-       DONE;
-    }"
+  [(const_int 0)]
+  "{
+      aarch64_expand_mov_immediate (operands[0], operands[1]);
+      DONE;
+   }"
   [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,load1,load1,store1,store1,\
-                     adr,adr,f_mcr,f_mrc,fmov")
-   (set_attr "fp" "*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes")]
+		    adr,adr,f_mcr,f_mrc,fmov,neon_move")
+   (set_attr "fp" "*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes,*")
+   (set_attr "simd" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes")]
 )
 
 (define_insn_and_split "*movdi_aarch64"
@@ -971,7 +973,7 @@ 
    fmov\\t%d0, %x1
    fmov\\t%x0, %d1
    fmov\\t%d0, %d1
-   movi\\t%d0, %1"
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
    "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
     && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
    [(const_int 0)]
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 88e840f2898d2da3e51e753578ee59bce4f462fa..9ce3d4efaf31a301dfb7c1772a6b685fb2cbd2ee 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -176,6 +176,12 @@ 
   (and (match_code "const_double")
        (match_test "aarch64_float_const_representable_p (op)")))
 
+(define_constraint "Uvi"
+  "A floating point constant which can be used with a\
+   MOVI immediate operation."
+  (and (match_code "const_double")
+       (match_test "aarch64_can_const_movi_rtx_p (op, GET_MODE (op))")))
+
 (define_constraint "Dn"
   "@internal
  A constraint that matches vector of immediates."
@@ -220,9 +226,17 @@ 
 
 (define_constraint "Dd"
   "@internal
- A constraint that matches an immediate operand valid for AdvSIMD scalar."
+ A constraint that matches an integer immediate operand valid\
+ for AdvSIMD scalar operations in DImode."
+ (and (match_code "const_int")
+      (match_test "aarch64_can_const_movi_rtx_p (op, DImode)")))
+
+(define_constraint "Ds"
+  "@internal
+ A constraint that matches an integer immediate operand valid\
+ for AdvSIMD scalar operations in SImode."
  (and (match_code "const_int")
-      (match_test "aarch64_simd_imm_scalar_p (op, GET_MODE (op))")))
+      (match_test "aarch64_can_const_movi_rtx_p (op, SImode)")))
 
 (define_address_constraint "Dp"
   "@internal
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index cd7ded986630c14ed6d42618b2a1f9baa0cbd192..6992c82fa790eac34669fcc5b030e395ad332201 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -53,6 +53,11 @@ 
        (ior (match_operand 0 "register_operand")
 	    (match_test "op == const0_rtx"))))
 
+(define_predicate "aarch64_reg_or_fp_float"
+  (ior (match_operand 0 "register_operand")
+	(and (match_code "const_double")
+	     (match_test "aarch64_float_const_rtx_p (op)"))))
+
 (define_predicate "aarch64_reg_or_fp_zero"
   (ior (match_operand 0 "register_operand")
 	(and (match_code "const_double")