diff mbox

[AArch64] optimize float immediate moves (1 /4) - infrastructure.

Message ID VI1PR0801MB2031CA473713EDC999DF103CFFCD0@VI1PR0801MB2031.eurprd08.prod.outlook.com
State New
Headers show

Commit Message

Tamar Christina June 12, 2017, 7:29 a.m. UTC
Hi Richard,

Here's the updated patch, thanks for the feedback so far!

Regards,
Tamar

Comments

James Greenhalgh June 13, 2017, 4:39 p.m. UTC | #1
This patch is pretty huge, are there any opportunities to further split
it to aid review?

I have some comments in line.

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index a069427f576f6bd7336bbe4497249773bd33d138..2ab2d96e40e80a79b5648046ca2d6e202d3939a2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -147,6 +147,8 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
>  							 const_tree type,
>  							 int misalignment,
>  							 bool is_packed);
> +static machine_mode
> +aarch64_simd_container_mode (machine_mode mode, unsigned width);
>  
>  /* Major revision number of the ARM Architecture implemented by the target.  */
>  unsigned aarch64_architecture_version;
> @@ -4613,6 +4615,66 @@ aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
>    return true;
>  }
>  
> +/* Return the binary representation of floating point constant VALUE in INTVAL.
> +   If the value cannot be converted, return false without setting INTVAL.
> +   The conversion is done in the given MODE.  */
> +bool
> +aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
> +{
> +  machine_mode mode = GET_MODE (value);
> +  if (GET_CODE (value) != CONST_DOUBLE
> +      || !SCALAR_FLOAT_MODE_P (mode)
> +      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT)
> +    return false;
> +
> +  unsigned HOST_WIDE_INT ival = 0;
> +
> +  /* Only support up to DF mode.  */
> +  gcc_assert (GET_MODE_BITSIZE (mode) <= 64);
> +  int needed = GET_MODE_BITSIZE (mode) == 64 ? 2 : 1;
> +
> +  long res[2];
> +  real_to_target (res,
> +		  CONST_DOUBLE_REAL_VALUE (value),
> +		  REAL_MODE_FORMAT (mode));
> +
> +  ival = zext_hwi (res[needed - 1], 32);
> +  for (int i = needed - 2; i >= 0; i--)
> +    {
> +      ival <<= 32;
> +      ival |= zext_hwi (res[i], 32);
> +    }
> +
> +  *intval = ival;

???

Two cases here, needed is either 2 if GET_MODE_BITSIZE (mode) == 64, or it
is 1 otherwise. So i starts at either -1 or 0. So this for loop either runs
0 or 1 times. What am I missing? I'm sure this is all an indirect way of
writing:

  *intval = 0;
  if (GET_MODE_BITSIZE (mode) == 64)
    *intval = zext_hwi (res[1], 32) << 32
  *intval |= zext_hwi (res[0], 32)



> +  return true;
> +}
> +
> +/* Return TRUE if rtx X is an immediate constant that can be moved using a
> +   single MOV(+MOVK) followed by an FMOV.  */
> +bool
> +aarch64_float_const_rtx_p (rtx x)
> +{
> +  machine_mode mode = GET_MODE (x);
> +  if (mode == VOIDmode)
> +    return false;
> +
> +  /* Determine whether it's cheaper to write float constants as
> +     mov/movk pairs over ldr/adrp pairs.  */
> +  unsigned HOST_WIDE_INT ival;
> +
> +  if (GET_CODE (x) == CONST_DOUBLE
> +      && SCALAR_FLOAT_MODE_P (mode)
> +      && aarch64_reinterpret_float_as_int (x, &ival))
> +    {
> +      machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
> +      int num_instr = aarch64_internal_mov_immediate
> +			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
> +      return num_instr < 3;

Should this cost model be based on a magic number? Is it not the case that
the decision should be based on the relative speeds of a memory access
compared with mov/movk/fmov?

> +    }
> +
> +  return false;
> +}
> +
>  /* Return TRUE if rtx X is immediate constant 0.0 */
>  bool
>  aarch64_float_const_zero_rtx_p (rtx x)
> @@ -4625,6 +4687,46 @@ aarch64_float_const_zero_rtx_p (rtx x)
>    return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
>  }
>  
> +/* Return TRUE if rtx X is immediate constant that fits in a single
> +   MOVI immediate operation.  */
> +bool
> +aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
> +{
> +  if (!TARGET_SIMD)
> +     return false;
> +
> +  machine_mode vmode, imode;
> +  unsigned HOST_WIDE_INT ival;
> +
> +  /* Don't write float constants out to memory.  */
> +  if (GET_CODE (x) == CONST_DOUBLE
> +      && SCALAR_FLOAT_MODE_P (mode))
> +    {
> +      if (!aarch64_reinterpret_float_as_int (x, &ival))
> +	return false;
> +
> +      imode = int_mode_for_mode (mode);
> +    }
> +  else if (GET_CODE (x) == CONST_INT
> +	   && SCALAR_INT_MODE_P (mode))
> +    {
> +       imode = mode;
> +       ival = INTVAL (x);
> +    }
> +  else
> +    return false;
> +
> +  unsigned width = GET_MODE_BITSIZE (mode) * 2;

Why * 2? It isn't obvious to me from my understanding of movi why that would
be better than just clamping to 64-bit?

> +  if (width < GET_MODE_BITSIZE (DFmode))
> +     width = GET_MODE_BITSIZE (DFmode);
> +
> +  vmode = aarch64_simd_container_mode (imode, width);
> +  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
> +
> +  return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
> +}
> +
> +
>  /* Return the fixed registers used for condition codes.  */
>  
>  static bool
> @@ -5758,12 +5860,6 @@ aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
>        return NO_REGS;
>      }
>  
> -  /* If it's an integer immediate that MOVI can't handle, then
> -     FP_REGS is not an option, so we return NO_REGS instead.  */
> -  if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
> -      && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
> -    return NO_REGS;
> -
>    /* Register eliminiation can result in a request for
>       SP+constant->FP_REGS.  We cannot support such operations which
>       use SP as source and an FP_REG as destination, so reject out
> @@ -6674,26 +6770,44 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
>        return true;
>  
>      case CONST_DOUBLE:
> +
> +	 /* First determine number of instructions to do the move
> +	    as an integer constant.  */
> +	if (!aarch64_float_const_representable_p (x)
> +	    && !aarch64_can_const_movi_rtx_p (x, mode)
> +	    && aarch64_float_const_rtx_p (x))
> +	  {
> +	    unsigned HOST_WIDE_INT ival;
> +	    bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
> +	    gcc_assert (succeed);

Just:

  gcc_assert (aarch64_reinterpret_float_as_int (x, &ival));

There's not much extra information in the name "succeed", so no extra value
in the variable assignment.

> +
> +	    machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
> +	    int ncost = aarch64_internal_mov_immediate
> +		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
> +	    *cost += COSTS_N_INSNS (ncost);
> +	    return true;
> +	  }
> +
>        if (speed)
>  	{
> -	  /* mov[df,sf]_aarch64.  */
> -	  if (aarch64_float_const_representable_p (x))
> -	    /* FMOV (scalar immediate).  */
> -	    *cost += extra_cost->fp[mode == DFmode].fpconst;
> -	  else if (!aarch64_float_const_zero_rtx_p (x))
> -	    {
> -	      /* This will be a load from memory.  */
> -	      if (mode == DFmode)
> +	/* mov[df,sf]_aarch64.  */
> +	if (aarch64_float_const_representable_p (x))
> +	  /* FMOV (scalar immediate).  */
> +	  *cost += extra_cost->fp[mode == DFmode].fpconst;
> +	else if (!aarch64_float_const_zero_rtx_p (x))
> +	  {
> +	    /* This will be a load from memory.  */
> +	    if (mode == DFmode)
>  		*cost += extra_cost->ldst.loadd;
> -	      else
> +	    else
>  		*cost += extra_cost->ldst.loadf;
> -	    }
> -	  else
> -	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
> -	       or MOV v0.s[0], wzr - neither of which are modeled by the
> -	       cost tables.  Just use the default cost.  */
> -	    {
> -	    }
> +	  }
> +	else
> +	  /* Otherwise this is +0.0.  We get this using MOVI d0, #0
> +	     or MOV v0.s[0], wzr - neither of which are modeled by the
> +	     cost tables.  Just use the default cost.  */
> +	  {
> +	  }
>  	}
>  
>        return true;
> @@ -12599,15 +12698,28 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
>  }
>  
>  char*
> -aarch64_output_scalar_simd_mov_immediate (rtx immediate,
> -					  machine_mode mode)
> +aarch64_output_scalar_simd_mov_immediate (rtx immediate,  machine_mode mode)
>  {
> +
> +  /* If a floating point number was passed and we desire to use it in an
> +     integer mode do the conversion to integer.  */
> +  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
> +    {
> +      unsigned HOST_WIDE_INT ival;
> +      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
> +	  gcc_unreachable ();
> +      immediate = gen_int_mode (ival, mode);
> +    }
> +
>    machine_mode vmode;
> +  int width = GET_MODE_BITSIZE (mode) * 2;

Dubious * 2 again!

> +  if (width < 64)
> +     width = 64;
>  
>    gcc_assert (!VECTOR_MODE_P (mode));
> -  vmode = aarch64_simd_container_mode (mode, 64);
> +  vmode = aarch64_simd_container_mode (mode, width);
>    rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
> -  return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
> +  return aarch64_output_simd_mov_immediate (v_op, vmode, width);
>  }
>  
>  /* Split operands into moves from op[1] + op[2] into op[0].  */

Thanks,
James
Richard Sandiford June 13, 2017, 5:08 p.m. UTC | #2
James Greenhalgh <james.greenhalgh@arm.com> writes:
>> +
>> +	 /* First determine number of instructions to do the move
>> +	    as an integer constant.  */
>> +	if (!aarch64_float_const_representable_p (x)
>> +	    && !aarch64_can_const_movi_rtx_p (x, mode)
>> +	    && aarch64_float_const_rtx_p (x))
>> +	  {
>> +	    unsigned HOST_WIDE_INT ival;
>> +	    bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
>> +	    gcc_assert (succeed);
>
> Just:
>
>   gcc_assert (aarch64_reinterpret_float_as_int (x, &ival));
>
> There's not much extra information in the name "succeed", so no extra value
> in the variable assignment.

That's not the same thing with --enable-checking=no

>> +
>> +	    machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
>> +	    int ncost = aarch64_internal_mov_immediate
>> +		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
>> +	    *cost += COSTS_N_INSNS (ncost);
>> +	    return true;
>> +	  }
Tamar Christina June 15, 2017, 12:50 p.m. UTC | #3
> 
> This patch is pretty huge, are there any opportunities to further split it to aid
> review?

Unfortunately, because I'm also changing some constraints, it introduced a bit of a dependency cycle.
If I were to break it up further, the individual patches wouldn't work on their own anymore. If that is
acceptable, I can break it up more.

> > +  ival = zext_hwi (res[needed - 1], 32);  for (int i = needed - 2; i
> > + >= 0; i--)
> > +    {
> > +      ival <<= 32;
> > +      ival |= zext_hwi (res[i], 32);
> > +    }
> > +
> > +  *intval = ival;
> 
> ???
> 
> Two cases here, needed is either 2 if GET_MODE_BITSIZE (mode) == 64, or it
> is 1 otherwise. So i starts at either -1 or 0. So this for loop either runs
> 0 or 1 times. What am I missing? I'm sure this is all an indirect way of
> writing:
> 

Yes, the code was set up to be easily extended to support 128-bit floats as well,
which was deprioritized. I'll just remove the loop.

> > +
> > +  /* Determine whether it's cheaper to write float constants as
> > +     mov/movk pairs over ldr/adrp pairs.  */  unsigned HOST_WIDE_INT
> > + ival;
> > +
> > +  if (GET_CODE (x) == CONST_DOUBLE
> > +      && SCALAR_FLOAT_MODE_P (mode)
> > +      && aarch64_reinterpret_float_as_int (x, &ival))
> > +    {
> > +      machine_mode imode = mode == HFmode ? SImode :
> int_mode_for_mode (mode);
> > +      int num_instr = aarch64_internal_mov_immediate
> > +			(NULL_RTX, gen_int_mode (ival, imode), false,
> imode);
> > +      return num_instr < 3;
> 
> Should this cost model be based on a magic number? Is it not the case that
> the decision should be based on the relative speeds of a memory access
> compared with mov/movk/fmov?
> 

As far as I'm aware, the cost model is too simplistic to be able to express the
actual costs of mov/movk and movk/movk pairs. E.g. it doesn't take into account
the latency and throughput difference when the instructions occur in sequence/pairs.

This leads to it allowing a smaller subset through here than what would be beneficial.

> > +/* Return TRUE if rtx X is immediate constant that fits in a single
> > +   MOVI immediate operation.  */
> > +bool
> > +aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode) {
> > +  if (!TARGET_SIMD)
> > +     return false;
> > +
> > +  machine_mode vmode, imode;
> > +  unsigned HOST_WIDE_INT ival;
> > +
> > +  /* Don't write float constants out to memory.  */
> > +  if (GET_CODE (x) == CONST_DOUBLE
> > +      && SCALAR_FLOAT_MODE_P (mode))
> > +    {
> > +      if (!aarch64_reinterpret_float_as_int (x, &ival))
> > +	return false;
> > +
> > +      imode = int_mode_for_mode (mode);
> > +    }
> > +  else if (GET_CODE (x) == CONST_INT
> > +	   && SCALAR_INT_MODE_P (mode))
> > +    {
> > +       imode = mode;
> > +       ival = INTVAL (x);
> > +    }
> > +  else
> > +    return false;
> > +
> > +  unsigned width = GET_MODE_BITSIZE (mode) * 2;
> 
> Why * 2? It isn't obvious to me from my understanding of movi why that
> would be better than just clamping to 64-bit?

The idea is to get the smallest vector mode for the given mode.
For SF that's V2SF, and for DF it's V2DF, which is why the *2. If we clamped to 64 bits
there would be no 64-bit DF vector mode, as far as I'm aware.

The clamping is done for modes smaller than SF, e.g. HF, which mapped to the smallest
option, V4HF, thanks to the clamping. Forcing everything to 128-bit vectors would work,
but I don't see the advantage of that.

For this particular function it doesn't matter much, as no code is generated. So clamping to
128 bits would work, but when generating the code, I don't see why V4SF and V8HF would be
better than V2SF and V4HF.

Alternatively, instead of reusing aarch64_simd_container_mode, I could just create my own
mapping function which does exactly the mapping I expect. Would that be a better option?

> 
> > +  if (width < GET_MODE_BITSIZE (DFmode))
> > +     width = GET_MODE_BITSIZE (DFmode);
> > +
> > +  vmode = aarch64_simd_container_mode (imode, width);  rtx v_op =
> > + aarch64_simd_gen_const_vector_dup (vmode, ival);
> > +
> > +  return aarch64_simd_valid_immediate (v_op, vmode, false, NULL); }
> > +
> > +
> >  /* Return the fixed registers used for condition codes.  */
> >
> >  static bool
> > @@ -5758,12 +5860,6 @@ aarch64_preferred_reload_class (rtx x,
> reg_class_t regclass)
> >        return NO_REGS;
> >      }
> >
> > -  /* If it's an integer immediate that MOVI can't handle, then
> > -     FP_REGS is not an option, so we return NO_REGS instead.  */
> > -  if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
> > -      && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
> > -    return NO_REGS;
> > -
> >    /* Register eliminiation can result in a request for
> >       SP+constant->FP_REGS.  We cannot support such operations which
> >       use SP as source and an FP_REG as destination, so reject out @@
> > -6674,26 +6770,44 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int
> outer ATTRIBUTE_UNUSED,
> >        return true;
> >
> >      case CONST_DOUBLE:
> > +
> > +	 /* First determine number of instructions to do the move
> > +	    as an integer constant.  */
> > +	if (!aarch64_float_const_representable_p (x)
> > +	    && !aarch64_can_const_movi_rtx_p (x, mode)
> > +	    && aarch64_float_const_rtx_p (x))
> > +	  {
> > +	    unsigned HOST_WIDE_INT ival;
> > +	    bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
> > +	    gcc_assert (succeed);
> 
> Just:
> 
>   gcc_assert (aarch64_reinterpret_float_as_int (x, &ival));
> 
> There's not much extra information in the name "succeed", so no extra value
> in the variable assignment.

But if asserts are disabled, won't the code be optimized out? This is why I only assert
on the result (as an extra sanity check), since the call itself has a side-effect.

> >  char*
> > -aarch64_output_scalar_simd_mov_immediate (rtx immediate,
> > -					  machine_mode mode)
> > +aarch64_output_scalar_simd_mov_immediate (rtx immediate,
> > +machine_mode mode)
> >  {
> > +
> > +  /* If a floating point number was passed and we desire to use it in an
> > +     integer mode do the conversion to integer.  */
> > +  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) ==
> MODE_INT)
> > +    {
> > +      unsigned HOST_WIDE_INT ival;
> > +      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
> > +	  gcc_unreachable ();
> > +      immediate = gen_int_mode (ival, mode);
> > +    }
> > +
> >    machine_mode vmode;
> > +  int width = GET_MODE_BITSIZE (mode) * 2;
> 
> Dubious * 2 again!

Same reason as above; however, here code is actually generated.

> 
> > +  if (width < 64)
> > +     width = 64;
> >
> >    gcc_assert (!VECTOR_MODE_P (mode));
> > -  vmode = aarch64_simd_container_mode (mode, 64);
> > +  vmode = aarch64_simd_container_mode (mode, width);
> >    rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL
> > (immediate));
> > -  return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
> > +  return aarch64_output_simd_mov_immediate (v_op, vmode, width);
> >  }
> >
> >  /* Split operands into moves from op[1] + op[2] into op[0].  */
> 
> Thanks,
> James
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 9543f8c9f2974ad7f8612aa007f975dd6eeec2bc..5a05137e4e28be09a1516ad6bbfce2661052195e 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -313,6 +313,8 @@  bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 bool aarch64_expand_movmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
+bool aarch64_float_const_rtx_p (rtx);
+bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
 bool aarch64_function_arg_regno_p (unsigned);
 bool aarch64_fusion_enabled_p (enum aarch64_fusion_pairs);
 bool aarch64_gen_movmemqi (rtx *);
@@ -340,7 +342,6 @@  bool aarch64_regno_ok_for_base_p (int, bool);
 bool aarch64_regno_ok_for_index_p (int, bool);
 bool aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
 					    bool high);
-bool aarch64_simd_imm_scalar_p (rtx x, machine_mode mode);
 bool aarch64_simd_imm_zero_p (rtx, machine_mode);
 bool aarch64_simd_scalar_immediate_valid_for_move (rtx, machine_mode);
 bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
@@ -475,4 +476,6 @@  std::string aarch64_get_extension_string_for_isa_flags (unsigned long,
 
 rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
 
+bool aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *fail);
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a069427f576f6bd7336bbe4497249773bd33d138..2ab2d96e40e80a79b5648046ca2d6e202d3939a2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -147,6 +147,8 @@  static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 							 const_tree type,
 							 int misalignment,
 							 bool is_packed);
+static machine_mode
+aarch64_simd_container_mode (machine_mode mode, unsigned width);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -4613,6 +4615,66 @@  aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
   return true;
 }
 
+/* Return the binary representation of floating point constant VALUE in INTVAL.
+   If the value cannot be converted, return false without setting INTVAL.
+   The conversion is done in the given MODE.  */
+bool
+aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
+{
+  machine_mode mode = GET_MODE (value);
+  if (GET_CODE (value) != CONST_DOUBLE
+      || !SCALAR_FLOAT_MODE_P (mode)
+      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT)
+    return false;
+
+  unsigned HOST_WIDE_INT ival = 0;
+
+  /* Only support up to DF mode.  */
+  gcc_assert (GET_MODE_BITSIZE (mode) <= 64);
+  int needed = GET_MODE_BITSIZE (mode) == 64 ? 2 : 1;
+
+  long res[2];
+  real_to_target (res,
+		  CONST_DOUBLE_REAL_VALUE (value),
+		  REAL_MODE_FORMAT (mode));
+
+  ival = zext_hwi (res[needed - 1], 32);
+  for (int i = needed - 2; i >= 0; i--)
+    {
+      ival <<= 32;
+      ival |= zext_hwi (res[i], 32);
+    }
+
+  *intval = ival;
+  return true;
+}
+
+/* Return TRUE if rtx X is an immediate constant that can be moved using a
+   single MOV(+MOVK) followed by an FMOV.  */
+bool
+aarch64_float_const_rtx_p (rtx x)
+{
+  machine_mode mode = GET_MODE (x);
+  if (mode == VOIDmode)
+    return false;
+
+  /* Determine whether it's cheaper to write float constants as
+     mov/movk pairs over ldr/adrp pairs.  */
+  unsigned HOST_WIDE_INT ival;
+
+  if (GET_CODE (x) == CONST_DOUBLE
+      && SCALAR_FLOAT_MODE_P (mode)
+      && aarch64_reinterpret_float_as_int (x, &ival))
+    {
+      machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
+      int num_instr = aarch64_internal_mov_immediate
+			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
+      return num_instr < 3;
+    }
+
+  return false;
+}
+
 /* Return TRUE if rtx X is immediate constant 0.0 */
 bool
 aarch64_float_const_zero_rtx_p (rtx x)
@@ -4625,6 +4687,46 @@  aarch64_float_const_zero_rtx_p (rtx x)
   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
 }
 
+/* Return TRUE if rtx X is immediate constant that fits in a single
+   MOVI immediate operation.  */
+bool
+aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
+{
+  if (!TARGET_SIMD)
+     return false;
+
+  machine_mode vmode, imode;
+  unsigned HOST_WIDE_INT ival;
+
+  /* Don't write float constants out to memory.  */
+  if (GET_CODE (x) == CONST_DOUBLE
+      && SCALAR_FLOAT_MODE_P (mode))
+    {
+      if (!aarch64_reinterpret_float_as_int (x, &ival))
+	return false;
+
+      imode = int_mode_for_mode (mode);
+    }
+  else if (GET_CODE (x) == CONST_INT
+	   && SCALAR_INT_MODE_P (mode))
+    {
+       imode = mode;
+       ival = INTVAL (x);
+    }
+  else
+    return false;
+
+  unsigned width = GET_MODE_BITSIZE (mode) * 2;
+  if (width < GET_MODE_BITSIZE (DFmode))
+     width = GET_MODE_BITSIZE (DFmode);
+
+  vmode = aarch64_simd_container_mode (imode, width);
+  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
+
+  return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
+}
+
+
 /* Return the fixed registers used for condition codes.  */
 
 static bool
@@ -5758,12 +5860,6 @@  aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
       return NO_REGS;
     }
 
-  /* If it's an integer immediate that MOVI can't handle, then
-     FP_REGS is not an option, so we return NO_REGS instead.  */
-  if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
-      && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
-    return NO_REGS;
-
   /* Register eliminiation can result in a request for
      SP+constant->FP_REGS.  We cannot support such operations which
      use SP as source and an FP_REG as destination, so reject out
@@ -6674,26 +6770,44 @@  aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
       return true;
 
     case CONST_DOUBLE:
+
+	 /* First determine number of instructions to do the move
+	    as an integer constant.  */
+	if (!aarch64_float_const_representable_p (x)
+	    && !aarch64_can_const_movi_rtx_p (x, mode)
+	    && aarch64_float_const_rtx_p (x))
+	  {
+	    unsigned HOST_WIDE_INT ival;
+	    bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
+	    gcc_assert (succeed);
+
+	    machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
+	    int ncost = aarch64_internal_mov_immediate
+		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
+	    *cost += COSTS_N_INSNS (ncost);
+	    return true;
+	  }
+
       if (speed)
 	{
-	  /* mov[df,sf]_aarch64.  */
-	  if (aarch64_float_const_representable_p (x))
-	    /* FMOV (scalar immediate).  */
-	    *cost += extra_cost->fp[mode == DFmode].fpconst;
-	  else if (!aarch64_float_const_zero_rtx_p (x))
-	    {
-	      /* This will be a load from memory.  */
-	      if (mode == DFmode)
+	/* mov[df,sf]_aarch64.  */
+	if (aarch64_float_const_representable_p (x))
+	  /* FMOV (scalar immediate).  */
+	  *cost += extra_cost->fp[mode == DFmode].fpconst;
+	else if (!aarch64_float_const_zero_rtx_p (x))
+	  {
+	    /* This will be a load from memory.  */
+	    if (mode == DFmode)
 		*cost += extra_cost->ldst.loadd;
-	      else
+	    else
 		*cost += extra_cost->ldst.loadf;
-	    }
-	  else
-	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
-	       or MOV v0.s[0], wzr - neither of which are modeled by the
-	       cost tables.  Just use the default cost.  */
-	    {
-	    }
+	  }
+	else
+	  /* Otherwise this is +0.0.  We get this using MOVI d0, #0
+	     or MOV v0.s[0], wzr - neither of which are modeled by the
+	     cost tables.  Just use the default cost.  */
+	  {
+	  }
 	}
 
       return true;
@@ -6875,7 +6989,7 @@  aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
 	  if (speed)
 	    *cost += extra_cost->fp[mode == DFmode].compare;
 
-          if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
+	  if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
             {
 	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
               /* FCMP supports constant 0.0 for no extra cost. */
@@ -9961,18 +10075,16 @@  aarch64_legitimate_pic_operand_p (rtx x)
 /* Return true if X holds either a quarter-precision or
      floating-point +0.0 constant.  */
 static bool
-aarch64_valid_floating_const (machine_mode mode, rtx x)
+aarch64_valid_floating_const (rtx x)
 {
   if (!CONST_DOUBLE_P (x))
     return false;
 
-  if (aarch64_float_const_zero_rtx_p (x))
+  /* This call determines which constants can be used in mov<mode>
+     as integer moves instead of constant loads.  */
+  if (aarch64_float_const_rtx_p (x))
     return true;
 
-  /* We only handle moving 0.0 to a TFmode register.  */
-  if (!(mode == SFmode || mode == DFmode))
-    return false;
-
   return aarch64_float_const_representable_p (x);
 }
 
@@ -9984,11 +10096,15 @@  aarch64_legitimate_constant_p (machine_mode mode, rtx x)
   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
     return false;
 
-  /* This could probably go away because
-     we now decompose CONST_INTs according to expand_mov_immediate.  */
+  /* For these cases we never want to use a literal load.
+     As such we have to prevent the compiler from forcing these
+     to memory.  */
   if ((GET_CODE (x) == CONST_VECTOR
        && aarch64_simd_valid_immediate (x, mode, false, NULL))
-      || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
+      || CONST_INT_P (x)
+      || aarch64_valid_floating_const (x)
+      || aarch64_can_const_movi_rtx_p (x, mode)
+      || aarch64_float_const_rtx_p (x))
 	return !targetm.cannot_force_const_mem (mode, x);
 
   if (GET_CODE (x) == HIGH
@@ -11266,23 +11382,6 @@  aarch64_mask_from_zextract_ops (rtx width, rtx pos)
 }
 
 bool
-aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
-{
-  HOST_WIDE_INT imm = INTVAL (x);
-  int i;
-
-  for (i = 0; i < 8; i++)
-    {
-      unsigned int byte = imm & 0xff;
-      if (byte != 0xff && byte != 0)
-       return false;
-      imm >>= 8;
-    }
-
-  return true;
-}
-
-bool
 aarch64_mov_operand_p (rtx x, machine_mode mode)
 {
   if (GET_CODE (x) == HIGH
@@ -12599,15 +12698,28 @@  aarch64_output_simd_mov_immediate (rtx const_vector,
 }
 
 char*
-aarch64_output_scalar_simd_mov_immediate (rtx immediate,
-					  machine_mode mode)
+aarch64_output_scalar_simd_mov_immediate (rtx immediate,  machine_mode mode)
 {
+
+  /* If a floating point number was passed and we desire to use it in an
+     integer mode do the conversion to integer.  */
+  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
+    {
+      unsigned HOST_WIDE_INT ival;
+      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
+	  gcc_unreachable ();
+      immediate = gen_int_mode (ival, mode);
+    }
+
   machine_mode vmode;
+  int width = GET_MODE_BITSIZE (mode) * 2;
+  if (width < 64)
+     width = 64;
 
   gcc_assert (!VECTOR_MODE_P (mode));
-  vmode = aarch64_simd_container_mode (mode, 64);
+  vmode = aarch64_simd_container_mode (mode, width);
   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
-  return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
+  return aarch64_output_simd_mov_immediate (v_op, vmode, width);
 }
 
 /* Split operands into moves from op[1] + op[2] into op[0].  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 5adc5edb8dde9c30450b04932a37c41f84cc5ed1..dce24f5369616b7804af33840622f403ead9aeea 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1026,8 +1026,8 @@ 
 )
 
 (define_insn_and_split "*movsi_aarch64"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,k,r,r,r,r,*w,m,  m,r,r  ,*w, r,*w")
-	(match_operand:SI 1 "aarch64_mov_operand"  " r,r,k,M,n,m, m,rZ,*w,S,Ush,rZ,*w,*w"))]
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,k,r,r,r,r,*w,m,  m,r,r  ,*w, r,*w,w")
+	(match_operand:SI 1 "aarch64_mov_operand"  " r,r,k,M,n,m, m,rZ,*w,S,Ush,rZ,*w,*w,Ds"))]
   "(register_operand (operands[0], SImode)
     || aarch64_reg_or_zero (operands[1], SImode))"
   "@
@@ -1044,21 +1044,23 @@ 
    adrp\\t%x0, %A1
    fmov\\t%s0, %w1
    fmov\\t%w0, %s1
-   fmov\\t%s0, %s1"
-   "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
+   fmov\\t%s0, %s1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);"
+  "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
     && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
-   [(const_int 0)]
-   "{
-       aarch64_expand_mov_immediate (operands[0], operands[1]);
-       DONE;
-    }"
+  [(const_int 0)]
+  "{
+      aarch64_expand_mov_immediate (operands[0], operands[1]);
+      DONE;
+   }"
   [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,load1,load1,store1,store1,\
-                     adr,adr,f_mcr,f_mrc,fmov")
-   (set_attr "fp" "*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes")]
+		    adr,adr,f_mcr,f_mrc,fmov,neon_move")
+   (set_attr "fp" "*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes,*")
+   (set_attr "simd" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes")]
 )
 
 (define_insn_and_split "*movdi_aarch64"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,*w,m,  m,r,r,  *w, r,*w,w")
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,*w, m, m,r,r  ,*w, r,*w,w")
 	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,n,m, m,rZ,*w,S,Ush,rZ,*w,*w,Dd"))]
   "(register_operand (operands[0], DImode)
     || aarch64_reg_or_zero (operands[1], DImode))"
@@ -1077,7 +1079,7 @@ 
    fmov\\t%d0, %x1
    fmov\\t%x0, %d1
    fmov\\t%d0, %d1
-   movi\\t%d0, %1"
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
    "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
     && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
    [(const_int 0)]
@@ -1086,7 +1088,7 @@ 
        DONE;
     }"
   [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,load1,load1,store1,store1,\
-                     adr,adr,f_mcr,f_mrc,fmov,neon_move")
+		    adr,adr,f_mcr,f_mrc,fmov,neon_move")
    (set_attr "fp" "*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes,*")
    (set_attr "simd" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes")]
 )
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 5a252c07afa6bda32af5d53bbff44958ce84108a..577e8d29b2683d42e71d45bdf097e3c4940c0d8e 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -167,6 +167,12 @@ 
   (and (match_code "const_double")
        (match_test "aarch64_float_const_representable_p (op)")))
 
+(define_constraint "Uvi"
+  "A floating point constant which can be used with a\
+   MOVI immediate operation."
+  (and (match_code "const_double")
+       (match_test "aarch64_can_const_movi_rtx_p (op, GET_MODE (op))")))
+
 (define_constraint "Dn"
   "@internal
  A constraint that matches vector of immediates."
@@ -211,6 +217,14 @@ 
 
 (define_constraint "Dd"
   "@internal
- A constraint that matches an immediate operand valid for AdvSIMD scalar."
+ A constraint that matches an integer immediate operand valid\
+ for AdvSIMD scalar operations in DImode."
+ (and (match_code "const_int")
+      (match_test "aarch64_can_const_movi_rtx_p (op, DImode)")))
+
+(define_constraint "Ds"
+  "@internal
+ A constraint that matches an integer immediate operand valid\
+ for AdvSIMD scalar operations in SImode."
  (and (match_code "const_int")
-      (match_test "aarch64_simd_imm_scalar_p (op, GET_MODE (op))")))
+      (match_test "aarch64_can_const_movi_rtx_p (op, SImode)")))
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index e83d45b394518f106303a47df2069e498faa73cc..401a2d40574fcf34fcaffdc91234029fd9410173 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -53,6 +53,11 @@ 
        (ior (match_operand 0 "register_operand")
 	    (match_test "op == const0_rtx"))))
 
+(define_predicate "aarch64_reg_or_fp_float"
+  (ior (match_operand 0 "register_operand")
+	(and (match_code "const_double")
+	     (match_test "aarch64_float_const_rtx_p (op)"))))
+
 (define_predicate "aarch64_reg_or_fp_zero"
   (ior (match_operand 0 "register_operand")
 	(and (match_code "const_double")