Index: gcc/config/arm/neon.md
===================================================================
--- gcc/config/arm/neon.md	(revision 161038)
+++ gcc/config/arm/neon.md	(working copy)
@@ -22,11 +22,8 @@
 (define_constants
   [(UNSPEC_ASHIFT_SIGNED	65)
    (UNSPEC_ASHIFT_UNSIGNED	66)
-   (UNSPEC_VABA			67)
-   (UNSPEC_VABAL		68)
    (UNSPEC_VABD			69)
    (UNSPEC_VABDL		70)
-   (UNSPEC_VABS			71)
    (UNSPEC_VADD			72)
    (UNSPEC_VADDHN		73)
    (UNSPEC_VADDL		74)
@@ -86,7 +83,6 @@
    (UNSPEC_VMULL		128)
    (UNSPEC_VMUL_LANE		129)
    (UNSPEC_VMULL_LANE		130)
-   (UNSPEC_VMUL_N		131)
    (UNSPEC_VMVN			132)
    (UNSPEC_VORN			133)
    (UNSPEC_VORR			134)
@@ -823,11 +819,8 @@
 
 ;; Doubleword and quadword arithmetic.
 
-;; NOTE: vadd/vsub and some other instructions also support 64-bit integer
-;; element size, which we could potentially use for "long long" operations. We
-;; don't want to do this at present though, because moving values from the
-;; vector unit to the ARM core is currently slow and 64-bit addition (etc.) is
-;; easy to do with ARM instructions anyway.
+;; NOTE: some other instructions also support 64-bit integer
+;; element size, which we could potentially use for "long long" operations.
 
 (define_insn "*add<mode>3_neon"
   [(set (match_operand:VDQ 0 "s_register_operand" "=w")
@@ -843,6 +836,26 @@
                     (const_string "neon_int_1")))]
 )
 
+(define_insn "adddi3_neon"
+  [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r")
+        (plus:DI (match_operand:DI 1 "s_register_operand" "%w,0,0")
+                 (match_operand:DI 2 "s_register_operand" "w,r,0")))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_NEON"
+{
+  switch (which_alternative)
+    {
+    case 0: return "vadd.i64\t%P0, %P1, %P2";
+    case 1: return "#";
+    case 2: return "#";
+    default: gcc_unreachable ();
+    }
+}
+  [(set_attr "neon_type" "neon_int_1,*,*")
+   (set_attr "conds" "*,clob,clob")
+   (set_attr "length" "*,8,8")]
+)
+
 (define_insn "*sub<mode>3_neon"
   [(set (match_operand:VDQ 0 "s_register_operand" "=w")
         (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
@@ -857,6 +870,27 @@
                     (const_string "neon_int_2")))]
 )
 
+(define_insn "subdi3_neon"
+  [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r,?&r")
+        (minus:DI (match_operand:DI 1 "s_register_operand" "w,0,r,0")
+                  (match_operand:DI 2 "s_register_operand" "w,r,0,0")))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_NEON"
+{
+  switch (which_alternative)
+    {
+    case 0: return "vsub.i64\t%P0, %P1, %P2";
+    case 1: /* fall through */ 
+    case 2: /* fall through */
+    case 3: return  "subs\\t%Q0, %Q1, %Q2\;sbc\\t%R0, %R1, %R2";
+    default: gcc_unreachable ();
+    }
+}
+  [(set_attr "neon_type" "neon_int_2,*,*,*")
+   (set_attr "conds" "*,clob,clob,clob")
+   (set_attr "length" "*,8,8,8")]
+)
+
 (define_insn "*mul<mode>3_neon"
   [(set (match_operand:VDQ 0 "s_register_operand" "=w")
         (mult:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
@@ -878,7 +912,7 @@
                                     (const_string "neon_mul_qqq_8_16_32_ddd_32")))))]
 )
 
-(define_insn "*mul<mode>3add<mode>_neon"
+(define_insn "mul<mode>3add<mode>_neon"
   [(set (match_operand:VDQ 0 "s_register_operand" "=w")
         (plus:VDQ (mult:VDQ (match_operand:VDQ 2 "s_register_operand" "w")
                             (match_operand:VDQ 3 "s_register_operand" "w"))
@@ -900,7 +934,7 @@
                                     (const_string "neon_mla_qqq_32_qqd_32_scalar")))))]
 )
 
-(define_insn "*mul<mode>3neg<mode>add<mode>_neon"
+(define_insn "mul<mode>3neg<mode>add<mode>_neon"
   [(set (match_operand:VDQ 0 "s_register_operand" "=w")
         (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "0")
                    (mult:VDQ (match_operand:VDQ 2 "s_register_operand" "w")
@@ -1711,11 +1745,37 @@
 
 ; good for plain vadd, vaddq.
 
-(define_insn "neon_vadd<mode>"
+(define_expand "neon_vadd<mode>"
+  [(match_operand:VDQX 0 "s_register_operand" "=w")
+   (match_operand:VDQX 1 "s_register_operand" "w")
+   (match_operand:VDQX 2 "s_register_operand" "w")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_NEON"
+{
+  if (!<Is_float_mode> || flag_unsafe_math_optimizations)
+    emit_insn (gen_add<mode>3 (operands[0], operands[1], operands[2]));
+  else
+    emit_insn (gen_neon_vadd<mode>_unspec (operands[0], operands[1],
+					   operands[2]));
+  DONE;
+})
+
+; Note that NEON operations don't support the full IEEE 754 standard: in
+; particular, denormal values are flushed to zero.  This means that GCC cannot
+; use those instructions for autovectorization, etc. unless
+; -funsafe-math-optimizations is in effect (in which case flush-to-zero
+; behaviour is permissible).  Intrinsic operations (provided by the arm_neon.h
+; header) must work in either case: if -funsafe-math-optimizations is given,
+; intrinsics expand to "canonical" RTL where possible, otherwise intrinsics
+; expand to unspecs (which may potentially limit the extent to which they might
+; be optimized by generic code).
+
+; Used for intrinsics when flag_unsafe_math_optimizations is false.
+
+(define_insn "neon_vadd<mode>_unspec"
   [(set (match_operand:VDQX 0 "s_register_operand" "=w")
         (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")
-		      (match_operand:VDQX 2 "s_register_operand" "w")
-                      (match_operand:SI 3 "immediate_operand" "i")]
+		      (match_operand:VDQX 2 "s_register_operand" "w")]
                      UNSPEC_VADD))]
   "TARGET_NEON"
   "vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
@@ -1788,6 +1848,8 @@
   [(set_attr "neon_type" "neon_int_4")]
 )
 
+;; We cannot replace this unspec with mul<mode>3 because of the odd 
+;; polynomial multiplication case that can specified by operand 3.
 (define_insn "neon_vmul<mode>"
   [(set (match_operand:VDQW 0 "s_register_operand" "=w")
         (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w")
@@ -1811,13 +1873,31 @@
                                     (const_string "neon_mul_qqq_8_16_32_ddd_32")))))]
 )
 
-(define_insn "neon_vmla<mode>"
-  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
-        (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
-		      (match_operand:VDQW 2 "s_register_operand" "w")
-		      (match_operand:VDQW 3 "s_register_operand" "w")
-                     (match_operand:SI 4 "immediate_operand" "i")]
-                    UNSPEC_VMLA))]
+(define_expand "neon_vmla<mode>"
+  [(match_operand:VDQW 0 "s_register_operand" "=w")
+   (match_operand:VDQW 1 "s_register_operand" "0")
+   (match_operand:VDQW 2 "s_register_operand" "w")
+   (match_operand:VDQW 3 "s_register_operand" "w")
+   (match_operand:SI 4 "immediate_operand" "i")]
+  "TARGET_NEON"
+{
+  if (!<Is_float_mode> || flag_unsafe_math_optimizations)
+    emit_insn (gen_mul<mode>3add<mode>_neon (operands[0], operands[1],
+				             operands[2], operands[3]));
+  else
+    emit_insn (gen_neon_vmla<mode>_unspec (operands[0], operands[1],
+					   operands[2], operands[3]));
+  DONE;
+})
+
+; Used for intrinsics when flag_unsafe_math_optimizations is false.
+
+(define_insn "neon_vmla<mode>_unspec"
+  [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+	(unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "0")
+		     (match_operand:VDQ 2 "s_register_operand" "w")
+		     (match_operand:VDQ 3 "s_register_operand" "w")]
+		    UNSPEC_VMLA))]
   "TARGET_NEON"
   "vmla.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
   [(set (attr "neon_type")
@@ -1850,13 +1930,31 @@
                    (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
 )
 
-(define_insn "neon_vmls<mode>"
-  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
-        (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
-		      (match_operand:VDQW 2 "s_register_operand" "w")
-		      (match_operand:VDQW 3 "s_register_operand" "w")
-                     (match_operand:SI 4 "immediate_operand" "i")]
-                    UNSPEC_VMLS))]
+(define_expand "neon_vmls<mode>"
+  [(match_operand:VDQW 0 "s_register_operand" "=w")
+   (match_operand:VDQW 1 "s_register_operand" "0")
+   (match_operand:VDQW 2 "s_register_operand" "w")
+   (match_operand:VDQW 3 "s_register_operand" "w")
+   (match_operand:SI 4 "immediate_operand" "i")]
+  "TARGET_NEON"
+{
+  if (!<Is_float_mode> || flag_unsafe_math_optimizations)
+    emit_insn (gen_mul<mode>3neg<mode>add<mode>_neon (operands[0],
+		 operands[1], operands[2], operands[3]));
+  else
+    emit_insn (gen_neon_vmls<mode>_unspec (operands[0], operands[1],
+					   operands[2], operands[3]));
+  DONE;
+})
+
+; Used for intrinsics when flag_unsafe_math_optimizations is false.
+
+(define_insn "neon_vmls<mode>_unspec"
+  [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+	(unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "0")
+		     (match_operand:VDQ 2 "s_register_operand" "w")
+		     (match_operand:VDQ 3 "s_register_operand" "w")]
+		    UNSPEC_VMLS))]
   "TARGET_NEON"
   "vmls.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
   [(set (attr "neon_type")
@@ -1966,11 +2064,27 @@
                    (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))]
 )
 
-(define_insn "neon_vsub<mode>"
+(define_expand "neon_vsub<mode>"
+  [(match_operand:VDQX 0 "s_register_operand" "=w")
+   (match_operand:VDQX 1 "s_register_operand" "w")
+   (match_operand:VDQX 2 "s_register_operand" "w")
+   (match_operand:SI 3 "immediate_operand" "i")]
+  "TARGET_NEON"
+{
+  if (!<Is_float_mode> || flag_unsafe_math_optimizations)
+    emit_insn (gen_sub<mode>3 (operands[0], operands[1], operands[2]));
+  else
+    emit_insn (gen_neon_vsub<mode>_unspec (operands[0], operands[1],
+					   operands[2]));
+  DONE;
+})
+
+; Used for intrinsics when flag_unsafe_math_optimizations is false.
+
+(define_insn "neon_vsub<mode>_unspec"
   [(set (match_operand:VDQX 0 "s_register_operand" "=w")
         (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")
-		      (match_operand:VDQX 2 "s_register_operand" "w")
-                      (match_operand:SI 3 "immediate_operand" "i")]
+		      (match_operand:VDQX 2 "s_register_operand" "w")]
                      UNSPEC_VSUB))]
   "TARGET_NEON"
   "vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
@@ -2153,11 +2267,11 @@
 
 (define_insn "neon_vaba<mode>"
   [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
-        (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "0")
-		       (match_operand:VDQIW 2 "s_register_operand" "w")
-		       (match_operand:VDQIW 3 "s_register_operand" "w")
-                       (match_operand:SI 4 "immediate_operand" "i")]
-		      UNSPEC_VABA))]
+        (plus:VDQIW (match_operand:VDQIW 1 "s_register_operand" "0")
+                    (unspec:VDQIW [(match_operand:VDQIW 2 "s_register_operand" "w")
+		                   (match_operand:VDQIW 3 "s_register_operand" "w")
+                                   (match_operand:SI 4 "immediate_operand" "i")]
+		                  UNSPEC_VABD)))]
   "TARGET_NEON"
   "vaba.%T4%#<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
   [(set (attr "neon_type")
@@ -2167,11 +2281,11 @@
 
 (define_insn "neon_vabal<mode>"
   [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
-        (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
-		           (match_operand:VW 2 "s_register_operand" "w")
-		           (match_operand:VW 3 "s_register_operand" "w")
-                           (match_operand:SI 4 "immediate_operand" "i")]
-                          UNSPEC_VABAL))]
+        (plus:<V_widen> (match_operand:<V_widen> 1 "s_register_operand" "0")
+                        (unspec:<V_widen> [(match_operand:VW 2 "s_register_operand" "w")
+                                           (match_operand:VW 3 "s_register_operand" "w")
+                                           (match_operand:SI 4 "immediate_operand" "i")]
+                          UNSPEC_VABDL)))]
   "TARGET_NEON"
   "vabal.%T4%#<V_sz_elem>\t%q0, %P2, %P3"
   [(set_attr "neon_type" "neon_vaba")]
@@ -2302,22 +2416,15 @@
                     (const_string "neon_fp_vrecps_vrsqrts_qqq")))]
 )
 
-(define_insn "neon_vabs<mode>"
-  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
-	(unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w")
-		      (match_operand:SI 2 "immediate_operand" "i")]
-                     UNSPEC_VABS))]
+(define_expand "neon_vabs<mode>"
+  [(match_operand:VDQW 0 "s_register_operand" "")
+   (match_operand:VDQW 1 "s_register_operand" "")
+   (match_operand:SI 2 "immediate_operand" "")]
   "TARGET_NEON"
-  "vabs.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
-  [(set (attr "neon_type")
-     (if_then_else (ior (ne (symbol_ref "<Is_float_mode>") (const_int 0))
-                        (ne (symbol_ref "<Is_float_mode>") (const_int 0)))
-                   (if_then_else
-                      (ne (symbol_ref "<Is_d_reg>") (const_int 0))
-                      (const_string "neon_fp_vadd_ddd_vabs_dd")
-                      (const_string "neon_fp_vadd_qqq_vabs_qq"))
-                   (const_string "neon_vqneg_vqabs")))]
-)
+{
+  emit_insn (gen_abs<mode>2 (operands[0], operands[1]));
+  DONE;
+})
 
 (define_insn "neon_vqabs<mode>"
   [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
Index: gcc/config/arm/arm.md
===================================================================
--- gcc/config/arm/arm.md	(revision 161038)
+++ gcc/config/arm/arm.md	(working copy)
@@ -492,9 +492,10 @@
 	(plus:DI (match_operand:DI 1 "s_register_operand" "%0, 0")
 		 (match_operand:DI 2 "s_register_operand" "r,  0")))
    (clobber (reg:CC CC_REGNUM))]
-  "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK)"
+  "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK) && !TARGET_NEON"
   "#"
-  "TARGET_32BIT && reload_completed"
+  "TARGET_32BIT && reload_completed
+   && ! (TARGET_NEON && IS_VFP_REGNUM (REGNO (operands[0])))"
   [(parallel [(set (reg:CC_C CC_REGNUM)
 		   (compare:CC_C (plus:SI (match_dup 1) (match_dup 2))
 				 (match_dup 1)))
@@ -991,7 +992,7 @@
 	(minus:DI (match_operand:DI 1 "s_register_operand" "0,r,0")
 		  (match_operand:DI 2 "s_register_operand" "r,0,0")))
    (clobber (reg:CC CC_REGNUM))]
-  "TARGET_32BIT"
+  "TARGET_32BIT && !TARGET_NEON"
   "subs\\t%Q0, %Q1, %Q2\;sbc\\t%R0, %R1, %R2"
   [(set_attr "conds" "clob")
    (set_attr "length" "8")]
Index: gcc/config/arm/neon.ml
===================================================================
--- gcc/config/arm/neon.ml	(revision 161038)
+++ gcc/config/arm/neon.ml	(working copy)
@@ -709,7 +709,8 @@ let pf_su_8_64 = P8 :: P16 :: F32 :: su_
 let ops =
   [
     (* Addition.  *)
-    Vadd, [], All (3, Dreg), "vadd", sign_invar_2, F32 :: su_8_64;
+    Vadd, [], All (3, Dreg), "vadd", sign_invar_2, F32 :: su_8_32;
+    Vadd, [No_op], All (3, Dreg), "vadd", sign_invar_2, [S64; U64];
     Vadd, [], All (3, Qreg), "vaddQ", sign_invar_2, F32 :: su_8_64;
     Vadd, [], Long, "vaddl", elts_same_2, su_8_32;
     Vadd, [], Wide, "vaddw", elts_same_2, su_8_32;
@@ -758,7 +759,8 @@ let ops =
     Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32];
 
     (* Subtraction.  *)
-    Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_64;
+    Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_32;
+    Vsub, [No_op], All (3, Dreg), "vsub", sign_invar_2,  [S64; U64];
     Vsub, [], All (3, Qreg), "vsubQ", sign_invar_2, F32 :: su_8_64;
     Vsub, [], Long, "vsubl", elts_same_2, su_8_32;
     Vsub, [], Wide, "vsubw", elts_same_2, su_8_32;
Index: gcc/config/arm/arm_neon.h
===================================================================
--- gcc/config/arm/arm_neon.h	(revision 161038)
+++ gcc/config/arm/arm_neon.h	(working copy)
@@ -414,12 +414,6 @@ vadd_s32 (int32x2_t __a, int32x2_t __b)
   return (int32x2_t)__builtin_neon_vaddv2si (__a, __b, 1);
 }
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vadd_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vadddi (__a, __b, 1);
-}
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vadd_f32 (float32x2_t __a, float32x2_t __b)
 {
@@ -444,6 +438,12 @@ vadd_u32 (uint32x2_t __a, uint32x2_t __b
   return (uint32x2_t)__builtin_neon_vaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
 }
 
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vadd_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vadddi (__a, __b, 1);
+}
+
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vadd_u64 (uint64x1_t __a, uint64x1_t __b)
 {
@@ -1368,12 +1368,6 @@ vsub_s32 (int32x2_t __a, int32x2_t __b)
   return (int32x2_t)__builtin_neon_vsubv2si (__a, __b, 1);
 }
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vsub_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vsubdi (__a, __b, 1);
-}
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vsub_f32 (float32x2_t __a, float32x2_t __b)
 {
@@ -1398,6 +1392,12 @@ vsub_u32 (uint32x2_t __a, uint32x2_t __b
   return (uint32x2_t)__builtin_neon_vsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
 }
 
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vsub_s64 (int64x1_t __a, int64x1_t __b)
+{
+  return (int64x1_t)__builtin_neon_vsubdi (__a, __b, 1);
+}
+
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vsub_u64 (uint64x1_t __a, uint64x1_t __b)
 {
@@ -5808,12 +5808,6 @@ vget_low_s32 (int32x4_t __a)
   return (int32x2_t)__builtin_neon_vget_lowv4si (__a);
 }
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vget_low_s64 (int64x2_t __a)
-{
-  return (int64x1_t)__builtin_neon_vget_lowv2di (__a);
-}
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vget_low_f32 (float32x4_t __a)
 {
@@ -5838,12 +5832,6 @@ vget_low_u32 (uint32x4_t __a)
   return (uint32x2_t)__builtin_neon_vget_lowv4si ((int32x4_t) __a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vget_low_u64 (uint64x2_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vget_lowv2di ((int64x2_t) __a);
-}
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vget_low_p8 (poly8x16_t __a)
 {
@@ -5856,6 +5844,18 @@ vget_low_p16 (poly16x8_t __a)
   return (poly16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a);
 }
 
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vget_low_s64 (int64x2_t __a)
+{
+  return (int64x1_t)__builtin_neon_vget_lowv2di (__a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vget_low_u64 (uint64x2_t __a)
+{
+  return (uint64x1_t)__builtin_neon_vget_lowv2di ((int64x2_t) __a);
+}
+
 __extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
 vcvt_s32_f32 (float32x2_t __a)
 {
Index: gcc/doc/arm-neon-intrinsics.texi
===================================================================
--- gcc/doc/arm-neon-intrinsics.texi	(revision 161038)
+++ gcc/doc/arm-neon-intrinsics.texi	(working copy)
@@ -43,20 +43,18 @@
 
 
 @itemize @bullet
-@item uint64x1_t vadd_u64 (uint64x1_t, uint64x1_t)
-@*@emph{Form of expected instruction(s):} @code{vadd.i64 @var{d0}, @var{d0}, @var{d0}}
+@item float32x2_t vadd_f32 (float32x2_t, float32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vadd.f32 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item int64x1_t vadd_s64 (int64x1_t, int64x1_t)
-@*@emph{Form of expected instruction(s):} @code{vadd.i64 @var{d0}, @var{d0}, @var{d0}}
+@item uint64x1_t vadd_u64 (uint64x1_t, uint64x1_t)
 @end itemize
 
 
 @itemize @bullet
-@item float32x2_t vadd_f32 (float32x2_t, float32x2_t)
-@*@emph{Form of expected instruction(s):} @code{vadd.f32 @var{d0}, @var{d0}, @var{d0}}
+@item int64x1_t vadd_s64 (int64x1_t, int64x1_t)
 @end itemize
 
 
@@ -1013,20 +1011,18 @@
 
 
 @itemize @bullet
-@item uint64x1_t vsub_u64 (uint64x1_t, uint64x1_t)
-@*@emph{Form of expected instruction(s):} @code{vsub.i64 @var{d0}, @var{d0}, @var{d0}}
+@item float32x2_t vsub_f32 (float32x2_t, float32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vsub.f32 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item int64x1_t vsub_s64 (int64x1_t, int64x1_t)
-@*@emph{Form of expected instruction(s):} @code{vsub.i64 @var{d0}, @var{d0}, @var{d0}}
+@item uint64x1_t vsub_u64 (uint64x1_t, uint64x1_t)
 @end itemize
 
 
 @itemize @bullet
-@item float32x2_t vsub_f32 (float32x2_t, float32x2_t)
-@*@emph{Form of expected instruction(s):} @code{vsub.f32 @var{d0}, @var{d0}, @var{d0}}
+@item int64x1_t vsub_s64 (int64x1_t, int64x1_t)
 @end itemize
 
 
@@ -5572,32 +5568,30 @@
 
 
 @itemize @bullet
-@item uint64x1_t vget_low_u64 (uint64x2_t)
+@item float32x2_t vget_low_f32 (float32x4_t)
 @*@emph{Form of expected instruction(s):} @code{vmov @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item int64x1_t vget_low_s64 (int64x2_t)
+@item poly16x4_t vget_low_p16 (poly16x8_t)
 @*@emph{Form of expected instruction(s):} @code{vmov @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item float32x2_t vget_low_f32 (float32x4_t)
+@item poly8x8_t vget_low_p8 (poly8x16_t)
 @*@emph{Form of expected instruction(s):} @code{vmov @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item poly16x4_t vget_low_p16 (poly16x8_t)
-@*@emph{Form of expected instruction(s):} @code{vmov @var{d0}, @var{d0}}
+@item uint64x1_t vget_low_u64 (uint64x2_t)
 @end itemize
 
 
 @itemize @bullet
-@item poly8x8_t vget_low_p8 (poly8x16_t)
-@*@emph{Form of expected instruction(s):} @code{vmov @var{d0}, @var{d0}}
+@item int64x1_t vget_low_s64 (int64x2_t)
 @end itemize
 
 
Index: gcc/testsuite/gcc.target/arm/neon/vadds64.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vadds64.c	(revision 161038)
+++ gcc/testsuite/gcc.target/arm/neon/vadds64.c	(working copy)
@@ -17,5 +17,4 @@ void test_vadds64 (void)
   out_int64x1_t = vadd_s64 (arg0_int64x1_t, arg1_int64x1_t);
 }
 
-/* { dg-final { scan-assembler "vadd\.i64\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vaddu64.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vaddu64.c	(revision 161038)
+++ gcc/testsuite/gcc.target/arm/neon/vaddu64.c	(working copy)
@@ -17,5 +17,4 @@ void test_vaddu64 (void)
   out_uint64x1_t = vadd_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
 }
 
-/* { dg-final { scan-assembler "vadd\.i64\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vsubs64.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vsubs64.c	(revision 161038)
+++ gcc/testsuite/gcc.target/arm/neon/vsubs64.c	(working copy)
@@ -17,5 +17,4 @@ void test_vsubs64 (void)
   out_int64x1_t = vsub_s64 (arg0_int64x1_t, arg1_int64x1_t);
 }
 
-/* { dg-final { scan-assembler "vsub\.i64\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vsubu64.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vsubu64.c	(revision 161038)
+++ gcc/testsuite/gcc.target/arm/neon/vsubu64.c	(working copy)
@@ -17,5 +17,4 @@ void test_vsubu64 (void)
   out_uint64x1_t = vsub_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
 }
 
-/* { dg-final { scan-assembler "vsub\.i64\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon-vmla-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vmla-1.c	(revision 161038)
+++ gcc/testsuite/gcc.target/arm/neon-vmla-1.c	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-require-effective-target arm_neon_hw } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
 /* { dg-add-options arm_neon } */
 /* { dg-final { scan-assembler "vmla\\.f32" } } */
 
Index: gcc/testsuite/gcc.target/arm/neon-vmls-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vmls-1.c	(revision 161038)
+++ gcc/testsuite/gcc.target/arm/neon-vmls-1.c	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-require-effective-target arm_neon_hw } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
 /* { dg-add-options arm_neon } */
 /* { dg-final { scan-assembler "vmls\\.f32" } } */

Index: gcc/testsuite/gcc.target/arm/neon-vsubs64.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vsubs64.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/neon-vsubs64.c	(revision 0)
@@ -0,0 +1,21 @@
+/* Test the `vsub_s64' ARM Neon intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O0" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+#include <stdlib.h>
+
+int main (void)
+{
+  int64x1_t out_int64x1_t = 0;
+  int64x1_t arg0_int64x1_t = (int64x1_t)0xdeadbeefdeadbeefLL;
+  int64x1_t arg1_int64x1_t = (int64x1_t)0x0000beefdead0000LL;
+
+  out_int64x1_t = vsub_s64 (arg0_int64x1_t, arg1_int64x1_t);
+  if (out_int64x1_t != (int64x1_t)0xdead00000000beefLL)
+    abort();
+  return 0;
+}
Index: gcc/testsuite/gcc.target/arm/neon-vsubu64.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vsubu64.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/neon-vsubu64.c	(revision 0)
@@ -0,0 +1,21 @@
+/* Test the `vsub_u64' ARM Neon intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O0" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+#include <stdlib.h>
+
+int main (void)
+{
+  uint64x1_t out_uint64x1_t = 0;
+  uint64x1_t arg0_uint64x1_t = (uint64x1_t)0xdeadbeefdeadbeefLL;
+  uint64x1_t arg1_uint64x1_t = (uint64x1_t)0x0000beefdead0000LL;
+
+  out_uint64x1_t = vsub_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
+  if (out_uint64x1_t != (uint64x1_t)0xdead00000000beefLL)
+    abort();
+  return 0;
+}
Index: gcc/testsuite/gcc.target/arm/neon-vadds64.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vadds64.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/neon-vadds64.c	(revision 0)
@@ -0,0 +1,21 @@
+/* Test the `vadd_s64' ARM Neon intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O0" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+#include <stdlib.h>
+
+int main (void)
+{
+  int64x1_t out_int64x1_t = 0;
+  int64x1_t arg0_int64x1_t = (int64x1_t)0xdeadbeef00000000LL;
+  int64x1_t arg1_int64x1_t = (int64x1_t)0x00000000deadbeefLL;
+
+  out_int64x1_t = vadd_s64 (arg0_int64x1_t, arg1_int64x1_t);
+  if (out_int64x1_t != (int64x1_t)0xdeadbeefdeadbeefLL)
+    abort();
+  return 0;
+}
Index: gcc/testsuite/gcc.target/arm/neon-vaddu64.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vaddu64.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/neon-vaddu64.c	(revision 0)
@@ -0,0 +1,21 @@
+/* Test the `vadd_u64' ARM Neon intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O0" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+#include <stdlib.h>
+
+int main (void)
+{
+  uint64x1_t out_uint64x1_t = 0;
+  uint64x1_t arg0_uint64x1_t = (uint64x1_t)0xdeadbeef00000000LL;
+  uint64x1_t arg1_uint64x1_t = (uint64x1_t)0x00000000deadbeefLL;
+
+  out_uint64x1_t = vadd_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
+  if (out_uint64x1_t != (uint64x1_t)0xdeadbeefdeadbeefLL)
+    abort();
+  return 0;
+}
 
