diff mbox

[ARM] Optimize vdup-constant builtins

Message ID 20120717152653.6d988114@octopus
State New
Headers show

Commit Message

Julian Brown July 17, 2012, 2:26 p.m. UTC
Hi,

This patch (originally by Jie Zhang) optimizes vdup builtins which use
a constant argument into the immediate variants of the vdup
instructions, rather than generating separate immediate-loads and
register vdups, as is presently done. It also adds support for loading
floating-point constant 0.0 to such instructions, by using integer
zero-load instructions (the bit pattern is the same).

Tested with no regressions, and the new tests pass. One test needed
tweaking, since with the patch the (constant) calculations performed
now get entirely optimised away.

OK to apply?

Thanks,

Julian

ChangeLog

    Jie Zhang  <jzhang918@gmail.com>
    Julian Brown  <julian@codesourcery.com>

    gcc/
    * config/arm/arm.c (arm_rtx_costs_1): Adjust cost for
    CONST_VECTOR.
    (arm_size_rtx_costs): Likewise.
    (neon_valid_immediate): Add a case for double 0.0.

    gcc/testsuite/
    * gcc.target/arm/neon-vdup-1.c: New test case.
    * gcc.target/arm/neon-vdup-2.c: New test case.
    * gcc.target/arm/neon-vdup-3.c: New test case.
    * gcc.target/arm/neon-vdup-4.c: New test case.
    * gcc.target/arm/neon-vdup-5.c: New test case.
    * gcc.target/arm/neon-vdup-6.c: New test case.
    * gcc.target/arm/neon-vdup-7.c: New test case.
    * gcc.target/arm/neon-vdup-8.c: New test case.
    * gcc.target/arm/neon-vdup-9.c: New test case.
    * gcc.target/arm/neon-vdup-10.c: New test case.
    * gcc.target/arm/neon-vdup-11.c: New test case.
    * gcc.target/arm/neon-vdup-12.c: New test case.
    * gcc.target/arm/neon-vdup-13.c: New test case.
    * gcc.target/arm/neon-vdup-14.c: New test case.
    * gcc.target/arm/neon-vdup-15.c: New test case.
    * gcc.target/arm/neon-vdup-16.c: New test case.
    * gcc.target/arm/neon-vdup-17.c: New test case.
    * gcc.target/arm/neon-vdup-18.c: New test case.
    * gcc.target/arm/neon-vdup-19.c: New test case.
    * gcc.target/arm/neon-combine-sub-abs-into-vabd.c: Make intrinsic
    arguments non-constant.

Comments

Ramana Radhakrishnan July 17, 2012, 7:08 p.m. UTC | #1
Nice.

On 17 July 2012 15:26, Julian Brown <julian@codesourcery.com> wrote:
> Hi,
>
> This patch (originally by Jie Zhang) optimizes vdup builtins which use
> a constant argument into the immediate variants of the vdup
> instructions, rather than generating separate immediate-loads and
> register vdups, as is presently done. It also adds support for loading
> floating-point constant 0.0 to such instructions, by using integer
> zero-load instructions (the bit pattern is the same).
>
> Tested with no regressions, and the new tests pass. One test needed
> tweaking, since with the patch the (constant) calculations performed
> now get entirely optimised away.

Yes, I know that one. I've been looking at fixing these with my other
patch from earlier this month.

Coincidentally the costs was something that I'd been looking at
tweaking, so thanks.

>
> OK to apply?

Ok.


regards,
Ramana


>
> Thanks,
>
> Julian
>
> ChangeLog
>
>     Jie Zhang  <jzhang918@gmail.com>
>     Julian Brown  <julian@codesourcery.com>
>
>     gcc/
>     * config/arm/arm.c (arm_rtx_costs_1): Adjust cost for
>     CONST_VECTOR.
>     (arm_size_rtx_costs): Likewise.
>     (neon_valid_immediate): Add a case for double 0.0.
>
>     gcc/testsuite/
>     * gcc.target/arm/neon-vdup-1.c: New test case.
>     * gcc.target/arm/neon-vdup-2.c: New test case.
>     * gcc.target/arm/neon-vdup-3.c: New test case.
>     * gcc.target/arm/neon-vdup-4.c: New test case.
>     * gcc.target/arm/neon-vdup-5.c: New test case.
>     * gcc.target/arm/neon-vdup-6.c: New test case.
>     * gcc.target/arm/neon-vdup-7.c: New test case.
>     * gcc.target/arm/neon-vdup-8.c: New test case.
>     * gcc.target/arm/neon-vdup-9.c: New test case.
>     * gcc.target/arm/neon-vdup-10.c: New test case.
>     * gcc.target/arm/neon-vdup-11.c: New test case.
>     * gcc.target/arm/neon-vdup-12.c: New test case.
>     * gcc.target/arm/neon-vdup-13.c: New test case.
>     * gcc.target/arm/neon-vdup-14.c: New test case.
>     * gcc.target/arm/neon-vdup-15.c: New test case.
>     * gcc.target/arm/neon-vdup-16.c: New test case.
>     * gcc.target/arm/neon-vdup-17.c: New test case.
>     * gcc.target/arm/neon-vdup-18.c: New test case.
>     * gcc.target/arm/neon-vdup-19.c: New test case.
>     * gcc.target/arm/neon-combine-sub-abs-into-vabd.c: Make intrinsic
>     arguments non-constant.
diff mbox

Patch

commit 1e3a8ea7fbaed4cd6638b08fb18b951a9c02f889
Author: Julian Brown <jbrown@build6-lucid-cs.sje.mentorg.com>
Date:   Fri Jul 13 07:13:09 2012 -0700

    Optimize NEON vdup builtins.

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 9748dda..7eedb18 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -7608,6 +7608,17 @@  arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
 	}
       return true;
 
+    case CONST_VECTOR:
+      if (TARGET_NEON
+	  && TARGET_HARD_FLOAT
+	  && outer == SET
+	  && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
+	  && neon_immediate_valid_for_move (x, mode, NULL, NULL))
+	*total = COSTS_N_INSNS (1);
+      else
+	*total = COSTS_N_INSNS (4);
+      return true;
+
     default:
       *total = COSTS_N_INSNS (4);
       return false;
@@ -7948,6 +7959,17 @@  arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
       *total = COSTS_N_INSNS (4);
       return true;
 
+    case CONST_VECTOR:
+      if (TARGET_NEON
+	  && TARGET_HARD_FLOAT
+	  && outer_code == SET
+	  && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
+	  && neon_immediate_valid_for_move (x, mode, NULL, NULL))
+	*total = COSTS_N_INSNS (1);
+      else
+	*total = COSTS_N_INSNS (4);
+      return true;
+
     case HIGH:
     case LO_SUM:
       /* We prefer constant pool entries to MOVW/MOVT pairs, so bump the
@@ -8768,11 +8790,14 @@  vfp3_const_double_rtx (rtx x)
    vmov  i64    17    aaaaaaaa bbbbbbbb cccccccc dddddddd
                       eeeeeeee ffffffff gggggggg hhhhhhhh
    vmov  f32    18    aBbbbbbc defgh000 00000000 00000000
+   vmov  f32    19    00000000 00000000 00000000 00000000
 
    For case 18, B = !b. Representable values are exactly those accepted by
    vfp3_const_double_index, but are output as floating-point numbers rather
    than indices.
 
+   For case 19, we will change it to vmov.i32 when assembling.
+
    Variants 0-5 (inclusive) may also be used as immediates for the second
    operand of VORR/VBIC instructions.
 
@@ -8829,7 +8854,7 @@  neon_valid_immediate (rtx op, enum machine_mode mode, int inverse,
       rtx el0 = CONST_VECTOR_ELT (op, 0);
       REAL_VALUE_TYPE r0;
 
-      if (!vfp3_const_double_rtx (el0))
+      if (!vfp3_const_double_rtx (el0) && el0 != CONST0_RTX (GET_MODE (el0)))
         return -1;
 
       REAL_VALUE_FROM_CONST_DOUBLE (r0, el0);
@@ -8851,7 +8876,10 @@  neon_valid_immediate (rtx op, enum machine_mode mode, int inverse,
       if (elementwidth)
         *elementwidth = 0;
 
-      return 18;
+      if (el0 == CONST0_RTX (GET_MODE (el0)))
+	return 19;
+      else
+	return 18;
     }
 
   /* Splat vector constant out into a byte vector.  */
diff --git a/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c b/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c
index ad6ba75..fe3d78b 100644
--- a/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c
+++ b/gcc/testsuite/gcc.target/arm/neon-combine-sub-abs-into-vabd.c
@@ -4,10 +4,8 @@ 
 /* { dg-add-options arm_neon } */
 
 #include <arm_neon.h>
-float32x2_t f_sub_abs_to_vabd_32()
+float32x2_t f_sub_abs_to_vabd_32(float32x2_t val1, float32x2_t val2)
 {
-  float32x2_t val1 = vdup_n_f32 (10);
-  float32x2_t val2 = vdup_n_f32 (30);
   float32x2_t sres = vsub_f32(val1, val2);
   float32x2_t res = vabs_f32 (sres);
 
@@ -16,10 +14,8 @@  float32x2_t f_sub_abs_to_vabd_32()
 /* { dg-final { scan-assembler "vabd\.f32" } }*/
 
 #include <arm_neon.h>
-int8x8_t sub_abs_to_vabd_8()
+int8x8_t sub_abs_to_vabd_8(int8x8_t val1, int8x8_t val2)
 {
-  int8x8_t val1 = vdup_n_s8 (10);
-  int8x8_t val2 = vdup_n_s8 (30);
   int8x8_t sres = vsub_s8(val1, val2);
   int8x8_t res = vabs_s8 (sres);
 
@@ -27,10 +23,8 @@  int8x8_t sub_abs_to_vabd_8()
 }
 /* { dg-final { scan-assembler "vabd\.s8" } }*/
 
-int16x4_t sub_abs_to_vabd_16()
+int16x4_t sub_abs_to_vabd_16(int16x4_t val1, int16x4_t val2)
 {
-  int16x4_t val1 = vdup_n_s16 (10);
-  int16x4_t val2 = vdup_n_s16 (30);
   int16x4_t sres = vsub_s16(val1, val2);
   int16x4_t res = vabs_s16 (sres);
 
@@ -38,10 +32,8 @@  int16x4_t sub_abs_to_vabd_16()
 }
 /* { dg-final { scan-assembler "vabd\.s16" } }*/
 
-int32x2_t sub_abs_to_vabd_32()
+int32x2_t sub_abs_to_vabd_32(int32x2_t val1, int32x2_t val2)
 {
-  int32x2_t val1 = vdup_n_s32 (10);
-  int32x2_t val2 = vdup_n_s32 (30);
   int32x2_t sres = vsub_s32(val1, val2);
   int32x2_t res = vabs_s32 (sres);
 
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-1.c b/gcc/testsuite/gcc.target/arm/neon-vdup-1.c
new file mode 100644
index 0000000..41799a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-1.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_f32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+float32x4_t out_float32x4_t;
+void test_vdupq_nf32 (void)
+{
+  out_float32x4_t = vdupq_n_f32 (0.0);
+}
+
+/* { dg-final { scan-assembler "vmov\.f32\[ 	\]+\[qQ\]\[0-9\]+, #0\.0\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-10.c b/gcc/testsuite/gcc.target/arm/neon-vdup-10.c
new file mode 100644
index 0000000..a06b064
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-10.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+  out_uint32x4_t = vdupq_n_u32 (~0x12000000);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #3992977407\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-11.c b/gcc/testsuite/gcc.target/arm/neon-vdup-11.c
new file mode 100644
index 0000000..07d0889
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-11.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u16' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint16x8_t out_uint16x8_t;
+void test_vdupq_nu16 (void)
+{
+  out_uint16x8_t = vdupq_n_u16 (0x12);
+}
+
+/* { dg-final { scan-assembler "vmov\.i16\[ 	\]+\[qQ\]\[0-9\]+, #18\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-12.c b/gcc/testsuite/gcc.target/arm/neon-vdup-12.c
new file mode 100644
index 0000000..27b4186
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-12.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u16' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint16x8_t out_uint16x8_t;
+void test_vdupq_nu16 (void)
+{
+  out_uint16x8_t = vdupq_n_u16 (0x1200);
+}
+
+/* { dg-final { scan-assembler "vmov\.i16\[ 	\]+\[qQ\]\[0-9\]+, #4608\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-13.c b/gcc/testsuite/gcc.target/arm/neon-vdup-13.c
new file mode 100644
index 0000000..4d38bc0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-13.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u16' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint16x8_t out_uint16x8_t;
+void test_vdupq_nu16 (void)
+{
+  out_uint16x8_t = vdupq_n_u16 (~0x12);
+}
+
+/* { dg-final { scan-assembler "vmov\.i16\[ 	\]+\[qQ\]\[0-9\]+, #65517\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-14.c b/gcc/testsuite/gcc.target/arm/neon-vdup-14.c
new file mode 100644
index 0000000..a16659f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-14.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u16' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint16x8_t out_uint16x8_t;
+void test_vdupq_nu16 (void)
+{
+  out_uint16x8_t = vdupq_n_u16 (~0x1200);
+}
+
+/* { dg-final { scan-assembler "vmov\.i16\[ 	\]+\[qQ\]\[0-9\]+, #60927\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-15.c b/gcc/testsuite/gcc.target/arm/neon-vdup-15.c
new file mode 100644
index 0000000..84a6fe0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-15.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u8' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint8x16_t out_uint8x16_t;
+void test_vdupq_nu8 (void)
+{
+  out_uint8x16_t = vdupq_n_u8 (0x12);
+}
+
+/* { dg-final { scan-assembler "vmov\.i8\[ 	\]+\[qQ\]\[0-9\]+, #18\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-16.c b/gcc/testsuite/gcc.target/arm/neon-vdup-16.c
new file mode 100644
index 0000000..70bec03
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-16.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+  out_uint32x4_t = vdupq_n_u32 (0x12ff);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #4863\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-17.c b/gcc/testsuite/gcc.target/arm/neon-vdup-17.c
new file mode 100644
index 0000000..e0283f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-17.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+  out_uint32x4_t = vdupq_n_u32 (0x12ffff);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #1245183\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-18.c b/gcc/testsuite/gcc.target/arm/neon-vdup-18.c
new file mode 100644
index 0000000..7dcf85d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-18.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+  out_uint32x4_t = vdupq_n_u32 (~0x12ff);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #4294962432\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-19.c b/gcc/testsuite/gcc.target/arm/neon-vdup-19.c
new file mode 100644
index 0000000..0980437
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-19.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+  out_uint32x4_t = vdupq_n_u32 (~0x12ffff);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #4293722112\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-2.c b/gcc/testsuite/gcc.target/arm/neon-vdup-2.c
new file mode 100644
index 0000000..f9e6a72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-2.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_f32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+float32x4_t out_float32x4_t;
+void test_vdupq_nf32 (void)
+{
+  out_float32x4_t = vdupq_n_f32 (0.125);
+}
+
+/* { dg-final { scan-assembler "vmov\.f32\[ 	\]+\[qQ\]\[0-9\]+, #1\.25e-1\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-3.c b/gcc/testsuite/gcc.target/arm/neon-vdup-3.c
new file mode 100644
index 0000000..d407316
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-3.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+  out_uint32x4_t = vdupq_n_u32 (0x12);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #18\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-4.c b/gcc/testsuite/gcc.target/arm/neon-vdup-4.c
new file mode 100644
index 0000000..bc1be07
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-4.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+  out_uint32x4_t = vdupq_n_u32 (0x1200);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #4608\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-5.c b/gcc/testsuite/gcc.target/arm/neon-vdup-5.c
new file mode 100644
index 0000000..9b04f16
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-5.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+  out_uint32x4_t = vdupq_n_u32 (0x120000);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #1179648\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-6.c b/gcc/testsuite/gcc.target/arm/neon-vdup-6.c
new file mode 100644
index 0000000..0889b80
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-6.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+  out_uint32x4_t = vdupq_n_u32 (0x12000000);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #301989888\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-7.c b/gcc/testsuite/gcc.target/arm/neon-vdup-7.c
new file mode 100644
index 0000000..f7b1dc8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-7.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+  out_uint32x4_t = vdupq_n_u32 (~0x12);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #4294967277\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-8.c b/gcc/testsuite/gcc.target/arm/neon-vdup-8.c
new file mode 100644
index 0000000..9d494c3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-8.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+  out_uint32x4_t = vdupq_n_u32 (~0x1200);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #4294962687\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vdup-9.c b/gcc/testsuite/gcc.target/arm/neon-vdup-9.c
new file mode 100644
index 0000000..799e95e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vdup-9.c
@@ -0,0 +1,17 @@ 
+/* Test the optimization of `vdupq_n_u32' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint32x4_t out_uint32x4_t;
+void test_vdupq_nu32 (void)
+{
+  out_uint32x4_t = vdupq_n_u32 (~0x120000);
+}
+
+/* { dg-final { scan-assembler "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #4293787647\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */