diff mbox

[AArch64] Add STP pattern to store a vec_concat of two 64-bit registers

Message ID 59366A8C.9090302@foss.arm.com
State New
Headers show

Commit Message

Kyrill Tkachov June 6, 2017, 8:40 a.m. UTC
Hi all,

On top of the previous vec_merge simplifications [1] we can add this pattern to perform
a store of a vec_concat of two 64-bit values in distinct registers as an STP.
This avoids constructing such a vector explicitly in a register and storing it as
a Q register.
This way for the code in the testcase we can generate:

construct_lane_1:
         ldp     d1, d0, [x0]
         fmov    d3, 1.0e+0
         fmov    d2, 2.0e+0
         fadd    d4, d1, d3
         fadd    d5, d0, d2
         stp     d4, d5, [x1, 32]
         ret

construct_lane_2:
         ldp     x2, x0, [x0]
         add     x3, x2, 1
         add     x4, x0, 2
         stp     x3, x4, [x1, 32]
         ret

instead of the current:
construct_lane_1:
         ldp     d0, d1, [x0]
         fmov    d3, 1.0e+0
         fmov    d2, 2.0e+0
         fadd    d0, d0, d3
         fadd    d1, d1, d2
         dup     v0.2d, v0.d[0]
         ins     v0.d[1], v1.d[0]
         str     q0, [x1, 32]
         ret

construct_lane_2:
         ldp     x2, x3, [x0]
         add     x0, x2, 1
         add     x2, x3, 2
         dup     v0.2d, x0
         ins     v0.d[1], x2
         str     q0, [x1, 32]
         ret

Bootstrapped and tested on aarch64-none-linux-gnu.
Ok for GCC 8?

Thanks,
Kyrill

[1] https://gcc.gnu.org/ml/gcc-patches/2017-06/msg00272.html
     https://gcc.gnu.org/ml/gcc-patches/2017-06/msg00273.html
     https://gcc.gnu.org/ml/gcc-patches/2017-06/msg00274.html

2017-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * config/aarch64/aarch64-simd.md (store_pair_lanes<mode>):
     New pattern.
     * config/aarch64/constraints.md (Uml): New constraint.
     * config/aarch64/predicates.md (aarch64_mem_pair_lanes_operand): New
     predicate.

2017-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * gcc.target/aarch64/store_v2vec_lanes.c: New test.

Comments

James Greenhalgh June 6, 2017, 1:17 p.m. UTC | #1
On Tue, Jun 06, 2017 at 09:40:44AM +0100, Kyrill Tkachov wrote:
> Hi all,
> 
> On top of the previous vec_merge simplifications [1] we can add this pattern to perform
> a store of a vec_concat of two 64-bit values in distinct registers as an STP.
> This avoids constructing such a vector explicitly in a register and storing it as
> a Q register.
> This way for the code in the testcase we can generate:
> 
> construct_lane_1:
>         ldp     d1, d0, [x0]
>         fmov    d3, 1.0e+0
>         fmov    d2, 2.0e+0
>         fadd    d4, d1, d3
>         fadd    d5, d0, d2
>         stp     d4, d5, [x1, 32]
>         ret
> 
> construct_lane_2:
>         ldp     x2, x0, [x0]
>         add     x3, x2, 1
>         add     x4, x0, 2
>         stp     x3, x4, [x1, 32]
>         ret
> 
> instead of the current:
> construct_lane_1:
>         ldp     d0, d1, [x0]
>         fmov    d3, 1.0e+0
>         fmov    d2, 2.0e+0
>         fadd    d0, d0, d3
>         fadd    d1, d1, d2
>         dup     v0.2d, v0.d[0]
>         ins     v0.d[1], v1.d[0]
>         str     q0, [x1, 32]
>         ret
> 
> construct_lane_2:
>         ldp     x2, x3, [x0]
>         add     x0, x2, 1
>         add     x2, x3, 2
>         dup     v0.2d, x0
>         ins     v0.d[1], x2
>         str     q0, [x1, 32]
>         ret
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> Ok for GCC 8?

OK.

Thanks,
James

> 2017-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
>     * config/aarch64/aarch64-simd.md (store_pair_lanes<mode>):
>     New pattern.
>     * config/aarch64/constraints.md (Uml): New constraint.
>     * config/aarch64/predicates.md (aarch64_mem_pair_lanes_operand): New
>     predicate.
> 
> 2017-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
>     * gcc.target/aarch64/store_v2vec_lanes.c: New test.
Kyrill Tkachov Nov. 8, 2017, 6:34 p.m. UTC | #2
On 06/06/17 14:17, James Greenhalgh wrote:
> On Tue, Jun 06, 2017 at 09:40:44AM +0100, Kyrill Tkachov wrote:
>> Hi all,
>>
>> On top of the previous vec_merge simplifications [1] we can add this pattern to perform
>> a store of a vec_concat of two 64-bit values in distinct registers as an STP.
>> This avoids constructing such a vector explicitly in a register and storing it as
>> a Q register.
>> This way for the code in the testcase we can generate:
>>
>> construct_lane_1:
>>          ldp     d1, d0, [x0]
>>          fmov    d3, 1.0e+0
>>          fmov    d2, 2.0e+0
>>          fadd    d4, d1, d3
>>          fadd    d5, d0, d2
>>          stp     d4, d5, [x1, 32]
>>          ret
>>
>> construct_lane_2:
>>          ldp     x2, x0, [x0]
>>          add     x3, x2, 1
>>          add     x4, x0, 2
>>          stp     x3, x4, [x1, 32]
>>          ret
>>
>> instead of the current:
>> construct_lane_1:
>>          ldp     d0, d1, [x0]
>>          fmov    d3, 1.0e+0
>>          fmov    d2, 2.0e+0
>>          fadd    d0, d0, d3
>>          fadd    d1, d1, d2
>>          dup     v0.2d, v0.d[0]
>>          ins     v0.d[1], v1.d[0]
>>          str     q0, [x1, 32]
>>          ret
>>
>> construct_lane_2:
>>          ldp     x2, x3, [x0]
>>          add     x0, x2, 1
>>          add     x2, x3, 2
>>          dup     v0.2d, x0
>>          ins     v0.d[1], x2
>>          str     q0, [x1, 32]
>>          ret
>>
>> Bootstrapped and tested on aarch64-none-linux-gnu.
>> Ok for GCC 8?
> OK.

Thanks, I've committed this and the other patches in this series after 
rebasing and rebootstrapping and testing on aarch64-none-linux-gnu.
The only conflict from updating the patch was that I had to use the 
store_16 attribute rather than
the old store2 for the new define_insn. This is what I've committed with 
r254551.

Sorry for the delay in committing.

Kyrill

> Thanks,
> James
>
>> 2017-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>
>>      * config/aarch64/aarch64-simd.md (store_pair_lanes<mode>):
>>      New pattern.
>>      * config/aarch64/constraints.md (Uml): New constraint.
>>      * config/aarch64/predicates.md (aarch64_mem_pair_lanes_operand): New
>>      predicate.
>>
>> 2017-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>
>>      * gcc.target/aarch64/store_v2vec_lanes.c: New test.
>
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b78affe9b06ffc973888822a4fcf1ec8e80ecdf6..0847e7aff230782874565e565929c10053807839 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2817,6 +2817,18 @@ (define_insn "load_pair_lanes<mode>"
   [(set_attr "type" "neon_load1_1reg_q")]
 )
 
+(define_insn "store_pair_lanes<mode>"
+  [(set (match_operand:<VDBL> 0 "aarch64_mem_pair_lanes_operand" "=Uml, Uml")
+	(vec_concat:<VDBL>
+	   (match_operand:VDC 1 "register_operand" "w, r")
+	   (match_operand:VDC 2 "register_operand" "w, r")))]
+  "TARGET_SIMD"
+  "@
+   stp\\t%d1, %d2, %0
+   stp\\t%x1, %x2, %0"
+  [(set_attr "type" "neon_stp, store_16")]
+)
+
 ;; In this insn, operand 1 should be low, and operand 2 the high part of the
 ;; dest vector.
 
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index ab607b9f7488e903a14fe93e88d4c4e1fad762b3..6bdb93f07a7c8d19675971ccfc71e681fd2dae66 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -154,6 +154,15 @@ (define_memory_constraint "Ump"
        (match_test "aarch64_legitimate_address_p (GET_MODE (op), XEXP (op, 0),
 						  PARALLEL, 1)")))
 
+;; Used for storing two 64-bit values in an AdvSIMD register using an STP
+;; as a 128-bit vec_concat.
+(define_memory_constraint "Uml"
+  "@internal
+  A memory address suitable for a load/store pair operation."
+  (and (match_code "mem")
+       (match_test "aarch64_legitimate_address_p (DFmode, XEXP (op, 0),
+						   PARALLEL, 1)")))
+
 (define_memory_constraint "Utv"
   "@internal
    An address valid for loading/storing opaque structure
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 16e864765cde3bf8f54a11e5fc8db5a53606db30..c175f9d9a5f969817782bede858b1834ac642e28 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -173,6 +173,13 @@ (define_predicate "aarch64_mem_pair_operand"
        (match_test "aarch64_legitimate_address_p (mode, XEXP (op, 0), PARALLEL,
 					       0)")))
 
+;; Used for storing two 64-bit values in an AdvSIMD register using an STP
+;; as a 128-bit vec_concat.
+(define_predicate "aarch64_mem_pair_lanes_operand"
+  (and (match_code "mem")
+       (match_test "aarch64_legitimate_address_p (DFmode, XEXP (op, 0),
+						   PARALLEL, 1)")))
+
 (define_predicate "aarch64_prefetch_operand"
   (match_test "aarch64_address_valid_for_prefetch_p (op, false)"))
 
diff --git a/gcc/testsuite/gcc.target/aarch64/store_v2vec_lanes.c b/gcc/testsuite/gcc.target/aarch64/store_v2vec_lanes.c
new file mode 100644
index 0000000000000000000000000000000000000000..6810db3c54dce81777caf062177facedb464d1d6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/store_v2vec_lanes.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef long long v2di __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+
+void
+construct_lane_1 (double *y, v2df *z)
+{
+  double y0 = y[0] + 1;
+  double y1 = y[1] + 2;
+  v2df x = {y0, y1};
+  z[2] = x;
+}
+
+void
+construct_lane_2 (long long *y, v2di *z)
+{
+  long long y0 = y[0] + 1;
+  long long y1 = y[1] + 2;
+  v2di x = {y0, y1};
+  z[2] = x;
+}
+
+/* We can use the load_pair_lanes<mode> pattern to vec_concat two DI/DF
+   values from consecutive memory into a 2-element vector by using
+   a Q-reg LDR.  */
+
+/* { dg-final { scan-assembler-times "stp\td\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "stp\tx\[0-9\]+, x\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-not "ins\t" } } */
Christophe Lyon Nov. 15, 2017, 3:31 p.m. UTC | #3
Hi Kyrill,


On 8 November 2017 at 19:34, Kyrill  Tkachov
<kyrylo.tkachov@foss.arm.com> wrote:
>
> On 06/06/17 14:17, James Greenhalgh wrote:
>>
>> On Tue, Jun 06, 2017 at 09:40:44AM +0100, Kyrill Tkachov wrote:
>>>
>>> Hi all,
>>>
>>> On top of the previous vec_merge simplifications [1] we can add this
>>> pattern to perform
>>> a store of a vec_concat of two 64-bit values in distinct registers as an
>>> STP.
>>> This avoids constructing such a vector explicitly in a register and
>>> storing it as
>>> a Q register.
>>> This way for the code in the testcase we can generate:
>>>
>>> construct_lane_1:
>>>          ldp     d1, d0, [x0]
>>>          fmov    d3, 1.0e+0
>>>          fmov    d2, 2.0e+0
>>>          fadd    d4, d1, d3
>>>          fadd    d5, d0, d2
>>>          stp     d4, d5, [x1, 32]
>>>          ret
>>>
>>> construct_lane_2:
>>>          ldp     x2, x0, [x0]
>>>          add     x3, x2, 1
>>>          add     x4, x0, 2
>>>          stp     x3, x4, [x1, 32]
>>>          ret
>>>
>>> instead of the current:
>>> construct_lane_1:
>>>          ldp     d0, d1, [x0]
>>>          fmov    d3, 1.0e+0
>>>          fmov    d2, 2.0e+0
>>>          fadd    d0, d0, d3
>>>          fadd    d1, d1, d2
>>>          dup     v0.2d, v0.d[0]
>>>          ins     v0.d[1], v1.d[0]
>>>          str     q0, [x1, 32]
>>>          ret
>>>
>>> construct_lane_2:
>>>          ldp     x2, x3, [x0]
>>>          add     x0, x2, 1
>>>          add     x2, x3, 2
>>>          dup     v0.2d, x0
>>>          ins     v0.d[1], x2
>>>          str     q0, [x1, 32]
>>>          ret
>>>
>>> Bootstrapped and tested on aarch64-none-linux-gnu.
>>> Ok for GCC 8?
>>
>> OK.
>
>
> Thanks, I've committed this and the other patches in this series after
> rebasing and rebootstrapping and testing on aarch64-none-linux-gnu.
> The only conflict from updating the patch was that I had to use the store_16
> attribute rather than
> the old store2 for the new define_insn. This is what I've committed with
> r254551.
>
> Sorry for the delay in committing.
>

I've noticed that the new tests fail when testing with -mabi=ilp32:
FAIL:    gcc.target/aarch64/store_v2vec_lanes.c scan-assembler-not ins\t
FAIL:    gcc.target/aarch64/store_v2vec_lanes.c scan-assembler-times
stp\td[0-9]+, d[0-9]+ 1 (found 0 times)
FAIL:    gcc.target/aarch64/store_v2vec_lanes.c scan-assembler-times
stp\tx[0-9]+, x[0-9]+ 1 (found 0 times)

Sorry if this has been reported before.

Christophe

> Kyrill
>
>
>> Thanks,
>> James
>>
>>> 2017-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>>
>>>      * config/aarch64/aarch64-simd.md (store_pair_lanes<mode>):
>>>      New pattern.
>>>      * config/aarch64/constraints.md (Uml): New constraint.
>>>      * config/aarch64/predicates.md (aarch64_mem_pair_lanes_operand): New
>>>      predicate.
>>>
>>> 2017-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>>
>>>      * gcc.target/aarch64/store_v2vec_lanes.c: New test.
>>
>>
>
Kyrill Tkachov Nov. 15, 2017, 3:58 p.m. UTC | #4
Hi Christophe,

On 15/11/17 15:31, Christophe Lyon wrote:
> Hi Kyrill,
>
>
> On 8 November 2017 at 19:34, Kyrill  Tkachov
> <kyrylo.tkachov@foss.arm.com> wrote:
>> On 06/06/17 14:17, James Greenhalgh wrote:
>>> On Tue, Jun 06, 2017 at 09:40:44AM +0100, Kyrill Tkachov wrote:
>>>> Hi all,
>>>>
>>>> On top of the previous vec_merge simplifications [1] we can add this
>>>> pattern to perform
>>>> a store of a vec_concat of two 64-bit values in distinct registers as an
>>>> STP.
>>>> This avoids constructing such a vector explicitly in a register and
>>>> storing it as
>>>> a Q register.
>>>> This way for the code in the testcase we can generate:
>>>>
>>>> construct_lane_1:
>>>>           ldp     d1, d0, [x0]
>>>>           fmov    d3, 1.0e+0
>>>>           fmov    d2, 2.0e+0
>>>>           fadd    d4, d1, d3
>>>>           fadd    d5, d0, d2
>>>>           stp     d4, d5, [x1, 32]
>>>>           ret
>>>>
>>>> construct_lane_2:
>>>>           ldp     x2, x0, [x0]
>>>>           add     x3, x2, 1
>>>>           add     x4, x0, 2
>>>>           stp     x3, x4, [x1, 32]
>>>>           ret
>>>>
>>>> instead of the current:
>>>> construct_lane_1:
>>>>           ldp     d0, d1, [x0]
>>>>           fmov    d3, 1.0e+0
>>>>           fmov    d2, 2.0e+0
>>>>           fadd    d0, d0, d3
>>>>           fadd    d1, d1, d2
>>>>           dup     v0.2d, v0.d[0]
>>>>           ins     v0.d[1], v1.d[0]
>>>>           str     q0, [x1, 32]
>>>>           ret
>>>>
>>>> construct_lane_2:
>>>>           ldp     x2, x3, [x0]
>>>>           add     x0, x2, 1
>>>>           add     x2, x3, 2
>>>>           dup     v0.2d, x0
>>>>           ins     v0.d[1], x2
>>>>           str     q0, [x1, 32]
>>>>           ret
>>>>
>>>> Bootstrapped and tested on aarch64-none-linux-gnu.
>>>> Ok for GCC 8?
>>> OK.
>>
>> Thanks, I've committed this and the other patches in this series after
>> rebasing and rebootstrapping and testing on aarch64-none-linux-gnu.
>> The only conflict from updating the patch was that I had to use the store_16
>> attribute rather than
>> the old store2 for the new define_insn. This is what I've committed with
>> r254551.
>>
>> Sorry for the delay in committing.
>>
> I've noticed that the new tests fail when testing with -mabi=ilp32:
> FAIL:    gcc.target/aarch64/store_v2vec_lanes.c scan-assembler-not ins\t
> FAIL:    gcc.target/aarch64/store_v2vec_lanes.c scan-assembler-times
> stp\td[0-9]+, d[0-9]+ 1 (found 0 times)
> FAIL:    gcc.target/aarch64/store_v2vec_lanes.c scan-assembler-times
> stp\tx[0-9]+, x[0-9]+ 1 (found 0 times)
>
> Sorry if this has been reported before.

Thank you for reporting this, I was not aware of it.
My patch does indeed fail to generate the optimised sequence for 
-mabi=ilp32.
During combine it fails to match:
Failed to match this instruction:
(set (mem:V2DF (plus:DI (reg/v/f:DI 79 [ z ])
             (const_int 32 [0x20])) [1 MEM[(v2df *)z_8(D) + 32B]+0 S16 
A128])
     (vec_concat:V2DF (reg:DF 81 [ y0 ])
         (reg:DF 84 [ y1 ])))


but without the -mabi=ilp32 it does successfully match the equivalent

(set (mem:V2DF (plus:DI (reg:DI 1 x1 [ z ])
             (const_int 32 [0x20])) [1 MEM[(v2df *)z_8(D) + 32B]+0 S16 
A128])
     (vec_concat:V2DF (reg:DF 81 [ y0 ])
         (reg:DF 84 [ y1 ])))

The only difference is the index register being the hard reg x1.
There's probably some subtlety in aarch64_classify_address that I'll 
need to dig into.
In any case, can you please open a bug report for this so we can track it?
To be clear, the failure is just suboptimal codegen for the -mabi=ilp32 
case, not a wrong-code or ICE
(though it should still be fixed, of course).

Thanks again,
Kyrill

> Christophe
>
>> Kyrill
>>
>>
>>> Thanks,
>>> James
>>>
>>>> 2017-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>>>
>>>>       * config/aarch64/aarch64-simd.md (store_pair_lanes<mode>):
>>>>       New pattern.
>>>>       * config/aarch64/constraints.md (Uml): New constraint.
>>>>       * config/aarch64/predicates.md (aarch64_mem_pair_lanes_operand): New
>>>>       predicate.
>>>>
>>>> 2017-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>>>
>>>>       * gcc.target/aarch64/store_v2vec_lanes.c: New test.
>>>
Christophe Lyon Nov. 15, 2017, 4:28 p.m. UTC | #5
On 15 November 2017 at 16:58, Kyrill  Tkachov
<kyrylo.tkachov@foss.arm.com> wrote:
> Hi Christophe,
>
>
> On 15/11/17 15:31, Christophe Lyon wrote:
>>
>> Hi Kyrill,
>>
>>
>> On 8 November 2017 at 19:34, Kyrill  Tkachov
>> <kyrylo.tkachov@foss.arm.com> wrote:
>>>
>>> On 06/06/17 14:17, James Greenhalgh wrote:
>>>>
>>>> On Tue, Jun 06, 2017 at 09:40:44AM +0100, Kyrill Tkachov wrote:
>>>>>
>>>>> Hi all,
>>>>>
>>>>> On top of the previous vec_merge simplifications [1] we can add this
>>>>> pattern to perform
>>>>> a store of a vec_concat of two 64-bit values in distinct registers as
>>>>> an
>>>>> STP.
>>>>> This avoids constructing such a vector explicitly in a register and
>>>>> storing it as
>>>>> a Q register.
>>>>> This way for the code in the testcase we can generate:
>>>>>
>>>>> construct_lane_1:
>>>>>           ldp     d1, d0, [x0]
>>>>>           fmov    d3, 1.0e+0
>>>>>           fmov    d2, 2.0e+0
>>>>>           fadd    d4, d1, d3
>>>>>           fadd    d5, d0, d2
>>>>>           stp     d4, d5, [x1, 32]
>>>>>           ret
>>>>>
>>>>> construct_lane_2:
>>>>>           ldp     x2, x0, [x0]
>>>>>           add     x3, x2, 1
>>>>>           add     x4, x0, 2
>>>>>           stp     x3, x4, [x1, 32]
>>>>>           ret
>>>>>
>>>>> instead of the current:
>>>>> construct_lane_1:
>>>>>           ldp     d0, d1, [x0]
>>>>>           fmov    d3, 1.0e+0
>>>>>           fmov    d2, 2.0e+0
>>>>>           fadd    d0, d0, d3
>>>>>           fadd    d1, d1, d2
>>>>>           dup     v0.2d, v0.d[0]
>>>>>           ins     v0.d[1], v1.d[0]
>>>>>           str     q0, [x1, 32]
>>>>>           ret
>>>>>
>>>>> construct_lane_2:
>>>>>           ldp     x2, x3, [x0]
>>>>>           add     x0, x2, 1
>>>>>           add     x2, x3, 2
>>>>>           dup     v0.2d, x0
>>>>>           ins     v0.d[1], x2
>>>>>           str     q0, [x1, 32]
>>>>>           ret
>>>>>
>>>>> Bootstrapped and tested on aarch64-none-linux-gnu.
>>>>> Ok for GCC 8?
>>>>
>>>> OK.
>>>
>>>
>>> Thanks, I've committed this and the other patches in this series after
>>> rebasing and rebootstrapping and testing on aarch64-none-linux-gnu.
>>> The only conflict from updating the patch was that I had to use the
>>> store_16
>>> attribute rather than
>>> the old store2 for the new define_insn. This is what I've committed with
>>> r254551.
>>>
>>> Sorry for the delay in committing.
>>>
>> I've noticed that the new tests fail when testing with -mabi=ilp32:
>> FAIL:    gcc.target/aarch64/store_v2vec_lanes.c scan-assembler-not ins\t
>> FAIL:    gcc.target/aarch64/store_v2vec_lanes.c scan-assembler-times
>> stp\td[0-9]+, d[0-9]+ 1 (found 0 times)
>> FAIL:    gcc.target/aarch64/store_v2vec_lanes.c scan-assembler-times
>> stp\tx[0-9]+, x[0-9]+ 1 (found 0 times)
>>
>> Sorry if this has been reported before.
>
>
> Thank you for reporting this, I was not aware of it.
> My patch does indeed fail to generate the optimised sequence for
> -mabi=ilp32.
> During combine it fails to match:
> Failed to match this instruction:
> (set (mem:V2DF (plus:DI (reg/v/f:DI 79 [ z ])
>             (const_int 32 [0x20])) [1 MEM[(v2df *)z_8(D) + 32B]+0 S16 A128])
>     (vec_concat:V2DF (reg:DF 81 [ y0 ])
>         (reg:DF 84 [ y1 ])))
>
>
> but without the -mabi=ilp32 it does successfully match the equivalent
>
> (set (mem:V2DF (plus:DI (reg:DI 1 x1 [ z ])
>             (const_int 32 [0x20])) [1 MEM[(v2df *)z_8(D) + 32B]+0 S16 A128])
>     (vec_concat:V2DF (reg:DF 81 [ y0 ])
>         (reg:DF 84 [ y1 ])))
>
> The only difference is the index register being the hard reg x1.
> There's probably some subtlety in aarch64_classify_address that I'll need to
> dig into.
> In any case, can you please open a bug report for this so we can track it?

Sure, that's: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83009


> To be clear, the failure is just suboptimal codegen for the -mabi=ilp32
> case, not a wrong-code or ICE
> (though it should still be fixed, of course).
>
> Thanks again,
> Kyrill
>
>
>> Christophe
>>
>>> Kyrill
>>>
>>>
>>>> Thanks,
>>>> James
>>>>
>>>>> 2017-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>>>>
>>>>>       * config/aarch64/aarch64-simd.md (store_pair_lanes<mode>):
>>>>>       New pattern.
>>>>>       * config/aarch64/constraints.md (Uml): New constraint.
>>>>>       * config/aarch64/predicates.md (aarch64_mem_pair_lanes_operand):
>>>>> New
>>>>>       predicate.
>>>>>
>>>>> 2017-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>>>>
>>>>>       * gcc.target/aarch64/store_v2vec_lanes.c: New test.
>>>>
>>>>
>
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b78affe9b06ffc973888822a4fcf1ec8e80ecdf6..0847e7aff230782874565e565929c10053807839 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2817,6 +2817,18 @@  (define_insn "load_pair_lanes<mode>"
   [(set_attr "type" "neon_load1_1reg_q")]
 )
 
+(define_insn "store_pair_lanes<mode>"
+  [(set (match_operand:<VDBL> 0 "aarch64_mem_pair_lanes_operand" "=Uml, Uml")
+	(vec_concat:<VDBL>
+	   (match_operand:VDC 1 "register_operand" "w, r")
+	   (match_operand:VDC 2 "register_operand" "w, r")))]
+  "TARGET_SIMD"
+  "@
+   stp\\t%d1, %d2, %0
+   stp\\t%x1, %x2, %0"
+  [(set_attr "type" "neon_stp, store2")]
+)
+
 ;; In this insn, operand 1 should be low, and operand 2 the high part of the
 ;; dest vector.
 
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index ab607b9f7488e903a14fe93e88d4c4e1fad762b3..6bdb93f07a7c8d19675971ccfc71e681fd2dae66 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -154,6 +154,15 @@  (define_memory_constraint "Ump"
        (match_test "aarch64_legitimate_address_p (GET_MODE (op), XEXP (op, 0),
 						  PARALLEL, 1)")))
 
+;; Used for storing two 64-bit values in an AdvSIMD register using an STP
+;; as a 128-bit vec_concat.
+(define_memory_constraint "Uml"
+  "@internal
+  A memory address suitable for a load/store pair operation."
+  (and (match_code "mem")
+       (match_test "aarch64_legitimate_address_p (DFmode, XEXP (op, 0),
+						   PARALLEL, 1)")))
+
 (define_memory_constraint "Utv"
   "@internal
    An address valid for loading/storing opaque structure
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 16e864765cde3bf8f54a11e5fc8db5a53606db30..c175f9d9a5f969817782bede858b1834ac642e28 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -173,6 +173,13 @@  (define_predicate "aarch64_mem_pair_operand"
        (match_test "aarch64_legitimate_address_p (mode, XEXP (op, 0), PARALLEL,
 					       0)")))
 
+;; Used for storing two 64-bit values in an AdvSIMD register using an STP
+;; as a 128-bit vec_concat.
+(define_predicate "aarch64_mem_pair_lanes_operand"
+  (and (match_code "mem")
+       (match_test "aarch64_legitimate_address_p (DFmode, XEXP (op, 0),
+						   PARALLEL, 1)")))
+
 (define_predicate "aarch64_prefetch_operand"
   (match_test "aarch64_address_valid_for_prefetch_p (op, false)"))
 
diff --git a/gcc/testsuite/gcc.target/aarch64/store_v2vec_lanes.c b/gcc/testsuite/gcc.target/aarch64/store_v2vec_lanes.c
new file mode 100644
index 0000000000000000000000000000000000000000..6810db3c54dce81777caf062177facedb464d1d6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/store_v2vec_lanes.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef long long v2di __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+
+void
+construct_lane_1 (double *y, v2df *z)
+{
+  double y0 = y[0] + 1;
+  double y1 = y[1] + 2;
+  v2df x = {y0, y1};
+  z[2] = x;
+}
+
+void
+construct_lane_2 (long long *y, v2di *z)
+{
+  long long y0 = y[0] + 1;
+  long long y1 = y[1] + 2;
+  v2di x = {y0, y1};
+  z[2] = x;
+}
+
+/* We can use the load_pair_lanes<mode> pattern to vec_concat two DI/DF
+   values from consecutive memory into a 2-element vector by using
+   a Q-reg LDR.  */
+
+/* { dg-final { scan-assembler-times "stp\td\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "stp\tx\[0-9\]+, x\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-not "ins\t" } } */