[v4] aarch64: thunderx2 memcpy optimizations for ext-based code path
diff mbox series

Message ID 5C994D96.6000303@bell-sw.com
State New
Headers show
Series
  • [v4] aarch64: thunderx2 memcpy optimizations for ext-based code path
Related show

Commit Message

Anton Youdkevitch March 25, 2019, 9:52 p.m. UTC
Wilco,

I appreciate you comments very much. Here is the patch
considering the points you made.

1. Always taken conditional branch at the beginning is
removed.

2. Epilogue code is placed after the end of the loop to
reduce the number of branches.

3. The redundant "mov" instructions inside the loop are
gone due to the changed order of the registers in the ext
instructions inside the loop.

4. Invariant code in the loop epilogue is no more
repeated for each ext chunk.

make check shows no regression

Comments

Wilco Dijkstra March 26, 2019, 8:40 p.m. UTC | #1
Hi Anton,

> I appreciate you comments very much. Here is the patch
> considering the points you made.
>
> 1. Always taken conditional branch at the beginning is
> removed.
>
> 2. Epilogue code is placed after the end of the loop to
> reduce the number of branches.
>
> 3. The redundant "mov" instructions inside the loop are
> gone due to the changed order of the registers in the ext
> instructions inside the loop.
>
> 4. Invariant code in the loop epilogue is no more
> repeated for each ext chunk.

That looks much better indeed! The alignment can still be improved
though:

   819d0:       6e037840        ext     v0.16b, v2.16b, v3.16b, #15
   819d4:       6e047861        ext     v1.16b, v3.16b, v4.16b, #15
   819d8:       6e057887        ext     v7.16b, v4.16b, v5.16b, #15
   819dc:       ac810460        stp     q0, q1, [x3], #32
   819e0:       f9814021        prfm    pldl1strm, [x1, #640]
   819e4:       acc10c22        ldp     q2, q3, [x1], #32
   819e8:       6e0678b0        ext     v16.16b, v5.16b, v6.16b, #15
   819ec:       ac814067        stp     q7, q16, [x3], #32
   819f0:       6e0278c0        ext     v0.16b, v6.16b, v2.16b, #15
   819f4:       6e037841        ext     v1.16b, v2.16b, v3.16b, #15
   819f8:       acc11825        ldp     q5, q6, [x1], #32
   819fc:       6e057867        ext     v7.16b, v3.16b, v5.16b, #15
   81a00:       f1010042        subs    x2, x2, #0x40
   81a04:       54fffeca        b.ge    819dc <__GI___memcpy_thunderx2+0x27c>

So rather than aligning the first instruction as currently done:

#define EXT_CHUNK(shft) \
.p2align 4 ;\

Align the loop instead. If you also add 2 nops after the bx instruction then
everything should work out perfectly.

Cheers,
Wilco
Anton Youdkevitch March 26, 2019, 10:22 p.m. UTC | #2
Wilco,

On 3/26/2019 23:40, Wilco Dijkstra wrote:
> Hi Anton,
>
>> I appreciate you comments very much. Here is the patch
>> considering the points you made.
>>
>> 1. Always taken conditional branch at the beginning is
>> removed.
>>
>> 2. Epilogue code is placed after the end of the loop to
>> reduce the number of branches.
>>
>> 3. The redundant "mov" instructions inside the loop are
>> gone due to the changed order of the registers in the ext
>> instructions inside the loop.
>>
>> 4. Invariant code in the loop epilogue is no more
>> repeated for each ext chunk.
>
> That looks much better indeed! The alignment can still be improved
> though:
>
>     819d0:       6e037840        ext     v0.16b, v2.16b, v3.16b, #15
>     819d4:       6e047861        ext     v1.16b, v3.16b, v4.16b, #15
>     819d8:       6e057887        ext     v7.16b, v4.16b, v5.16b, #15
>     819dc:       ac810460        stp     q0, q1, [x3], #32
>     819e0:       f9814021        prfm    pldl1strm, [x1, #640]
>     819e4:       acc10c22        ldp     q2, q3, [x1], #32
>     819e8:       6e0678b0        ext     v16.16b, v5.16b, v6.16b, #15
>     819ec:       ac814067        stp     q7, q16, [x3], #32
>     819f0:       6e0278c0        ext     v0.16b, v6.16b, v2.16b, #15
>     819f4:       6e037841        ext     v1.16b, v2.16b, v3.16b, #15
>     819f8:       acc11825        ldp     q5, q6, [x1], #32
>     819fc:       6e057867        ext     v7.16b, v3.16b, v5.16b, #15
>     81a00:       f1010042        subs    x2, x2, #0x40
>     81a04:       54fffeca        b.ge    819dc <__GI___memcpy_thunderx2+0x27c>
>
> So rather than aligning the first instruction as currently done:
>
> #define EXT_CHUNK(shft) \
> .p2align 4 ;\
>
> Align the loop instead. If you also add 2 nops after the bx instruction then
> everything should work out perfectly.
OK, right, aligning loop body makes more sense than aligning prologue.
Thanks!

But why adding nops at the end (if by "bx" you meant branch) as we do
not care about prologue alignment? If this is about how the next chunk
is aligned, of course.
Anton Youdkevitch March 27, 2019, 2:22 p.m. UTC | #3
Wilco,

Here is the updated patch.

The first three instructions of each chunk are forced
to be unaligned to make the loop header aligned on 16
bytes.


488:   d61f00c0        br      x6
48c:   d503201f        nop
490:   d503201f        nop

494:   6e037840        ext     v0.16b, v2.16b, v3.16b, #15
498:   6e047861        ext     v1.16b, v3.16b, v4.16b, #15
49c:   6e057887        ext     v7.16b, v4.16b, v5.16b, #15

4a0:   ac810460        stp     q0, q1, [x3], #32
4a4:   f9814021        prfm    pldl1strm, [x1, #640]
4a8:   acc10c22        ldp     q2, q3, [x1], #32
4ac:   6e0678b0        ext     v16.16b, v5.16b, v6.16b, #15
4b0:   ac814067        stp     q7, q16, [x3], #32
4b4:   6e0278c0        ext     v0.16b, v6.16b, v2.16b, #15
4b8:   6e037841        ext     v1.16b, v2.16b, v3.16b, #15
4bc:   acc11825        ldp     q5, q6, [x1], #32
4c0:   6e057867        ext     v7.16b, v3.16b, v5.16b, #15
4c4:   f1010042        subs    x2, x2, #0x40
4c8:   54fffeca        b.ge    4a0 <__memcpy_thunderx2+0x280>
4cc:   6e0678b0        ext     v16.16b, v5.16b, v6.16b, #15
4d0:   140000e1        b       854 <__memcpy_thunderx2+0x634>

4d4:   6e037040        ext     v0.16b, v2.16b, v3.16b, #14
4d8:   6e047061        ext     v1.16b, v3.16b, v4.16b, #14
4dc:   6e057087        ext     v7.16b, v4.16b, v5.16b, #14

4e0:   ac810460        stp     q0, q1, [x3], #32
4e4:   f9814021        prfm    pldl1strm, [x1, #640]
4e8:   acc10c22        ldp     q2, q3, [x1], #32
4ec:   6e0670b0        ext     v16.16b, v5.16b, v6.16b, #14
4f0:   ac814067        stp     q7, q16, [x3], #32
4f4:   6e0270c0        ext     v0.16b, v6.16b, v2.16b, #14
4f8:   6e037041        ext     v1.16b, v2.16b, v3.16b, #14
4fc:   acc11825        ldp     q5, q6, [x1], #32
500:   6e057067        ext     v7.16b, v3.16b, v5.16b, #14
504:   f1010042        subs    x2, x2, #0x40
508:   54fffeca        b.ge    4e0 <__memcpy_thunderx2+0x2c0> 
50c:   6e0670b0        ext     v16.16b, v5.16b, v6.16b, #14
510:   140000d1        b       854 <__memcpy_thunderx2+0x634>

Looks OK?


On Tue, Mar 26, 2019 at 08:40:34PM +0000, Wilco Dijkstra wrote:
> Hi Anton,
> 
> > I appreciate you comments very much. Here is the patch
> > considering the points you made.
> >
> > 1. Always taken conditional branch at the beginning is
> > removed.
> >
> > 2. Epilogue code is placed after the end of the loop to
> > reduce the number of branches.
> >
> > 3. The redundant "mov" instructions inside the loop are
> > gone due to the changed order of the registers in the ext
> > instructions inside the loop.
> >
> > 4. Invariant code in the loop epilogue is no more
> > repeated for each ext chunk.
> 
> That looks much better indeed! The alignment can still be improved
> though:
> 
>    819d0:       6e037840        ext     v0.16b, v2.16b, v3.16b, #15
>    819d4:       6e047861        ext     v1.16b, v3.16b, v4.16b, #15
>    819d8:       6e057887        ext     v7.16b, v4.16b, v5.16b, #15
>    819dc:       ac810460        stp     q0, q1, [x3], #32
>    819e0:       f9814021        prfm    pldl1strm, [x1, #640]
>    819e4:       acc10c22        ldp     q2, q3, [x1], #32
>    819e8:       6e0678b0        ext     v16.16b, v5.16b, v6.16b, #15
>    819ec:       ac814067        stp     q7, q16, [x3], #32
>    819f0:       6e0278c0        ext     v0.16b, v6.16b, v2.16b, #15
>    819f4:       6e037841        ext     v1.16b, v2.16b, v3.16b, #15
>    819f8:       acc11825        ldp     q5, q6, [x1], #32
>    819fc:       6e057867        ext     v7.16b, v3.16b, v5.16b, #15
>    81a00:       f1010042        subs    x2, x2, #0x40
>    81a04:       54fffeca        b.ge    819dc <__GI___memcpy_thunderx2+0x27c>
> 
> So rather than aligning the first instruction as currently done:
> 
> #define EXT_CHUNK(shft) \
> .p2align 4 ;\
> 
> Align the loop instead. If you also add 2 nops after the bx instruction then
> everything should work out perfectly.
> 
> Cheers,
> Wilco
>
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index b2215c1..45e9a29 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -382,7 +382,8 @@ L(bytes_0_to_3):
 	strb    A_lw, [dstin]
 	strb    B_lw, [dstin, tmp1]
 	strb    A_hw, [dstend, -1]
-L(end): ret
+L(end):
+	ret
 
 	.p2align 4
 
@@ -544,43 +545,35 @@ L(dst_unaligned):
 	str     C_q, [dst], #16
 	ldp     F_q, G_q, [src], #32
 	bic	dst, dst, 15
+	subs    count, count, 32
 	adrp	tmp2, L(ext_table)
 	add	tmp2, tmp2, :lo12:L(ext_table)
 	add	tmp2, tmp2, tmp1, LSL #2
 	ldr	tmp3w, [tmp2]
 	add	tmp2, tmp2, tmp3w, SXTW
 	br	tmp2
-
-#define EXT_CHUNK(shft) \
 .p2align 4 ;\
+	nop
+#define EXT_CHUNK(shft) \
 L(ext_size_ ## shft):;\
 	ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
 	ext     B_v.16b, D_v.16b, E_v.16b, 16-shft;\
-	subs    count, count, 32;\
-	b.ge    2f;\
-1:;\
-	stp     A_q, B_q, [dst], #32;\
 	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
-	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	stp     H_q, I_q, [dst], #16;\
-	add     dst, dst, tmp1;\
-	str     G_q, [dst], #16;\
-	b       L(copy_long_check32);\
-2:;\
+1:;\
 	stp     A_q, B_q, [dst], #32;\
 	prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
-	ldp     D_q, J_q, [src], #32;\
-	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+	ldp     C_q, D_q, [src], #32;\
 	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	mov     C_v.16b, G_v.16b;\
 	stp     H_q, I_q, [dst], #32;\
+	ext     A_v.16b, G_v.16b, C_v.16b, 16-shft;\
+	ext     B_v.16b, C_v.16b, D_v.16b, 16-shft;\
 	ldp     F_q, G_q, [src], #32;\
-	ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
-	ext     B_v.16b, D_v.16b, J_v.16b, 16-shft;\
-	mov     E_v.16b, J_v.16b;\
+	ext     H_v.16b, D_v.16b, F_v.16b, 16-shft;\
 	subs    count, count, 64;\
-	b.ge    2b;\
-	b	1b;\
+	b.ge    1b;\
+2:;\
+	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+	b	L(ext_tail);
 
 EXT_CHUNK(1)
 EXT_CHUNK(2)
@@ -598,6 +591,14 @@ EXT_CHUNK(13)
 EXT_CHUNK(14)
 EXT_CHUNK(15)
 
+L(ext_tail):
+	stp     A_q, B_q, [dst], #32
+	stp     H_q, I_q, [dst], #16
+	add     dst, dst, tmp1
+	str     G_q, [dst], #16
+	b       L(copy_long_check32)
+
+
 END (MEMCPY)
 	.section	.rodata
 	.p2align	4

Patch
diff mbox series

diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index b2215c1..c8c5e8b 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -382,7 +382,8 @@  L(bytes_0_to_3):
 	strb    A_lw, [dstin]
 	strb    B_lw, [dstin, tmp1]
 	strb    A_hw, [dstend, -1]
-L(end): ret
+L(end):
+	ret
 
 	.p2align 4
 
@@ -544,6 +545,7 @@  L(dst_unaligned):
 	str     C_q, [dst], #16
 	ldp     F_q, G_q, [src], #32
 	bic	dst, dst, 15
+	subs    count, count, 32
 	adrp	tmp2, L(ext_table)
 	add	tmp2, tmp2, :lo12:L(ext_table)
 	add	tmp2, tmp2, tmp1, LSL #2
@@ -556,31 +558,22 @@  L(dst_unaligned):
 L(ext_size_ ## shft):;\
 	ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
 	ext     B_v.16b, D_v.16b, E_v.16b, 16-shft;\
-	subs    count, count, 32;\
-	b.ge    2f;\
-1:;\
-	stp     A_q, B_q, [dst], #32;\
 	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
-	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	stp     H_q, I_q, [dst], #16;\
-	add     dst, dst, tmp1;\
-	str     G_q, [dst], #16;\
-	b       L(copy_long_check32);\
-2:;\
+1:;\
 	stp     A_q, B_q, [dst], #32;\
 	prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
-	ldp     D_q, J_q, [src], #32;\
-	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+	ldp     C_q, D_q, [src], #32;\
 	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	mov     C_v.16b, G_v.16b;\
 	stp     H_q, I_q, [dst], #32;\
+	ext     A_v.16b, G_v.16b, C_v.16b, 16-shft;\
+	ext     B_v.16b, C_v.16b, D_v.16b, 16-shft;\
 	ldp     F_q, G_q, [src], #32;\
-	ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
-	ext     B_v.16b, D_v.16b, J_v.16b, 16-shft;\
-	mov     E_v.16b, J_v.16b;\
+	ext     H_v.16b, D_v.16b, F_v.16b, 16-shft;\
 	subs    count, count, 64;\
-	b.ge    2b;\
-	b	1b;\
+	b.ge    1b;\
+2:;\
+	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+	b	L(ext_tail);
 
 EXT_CHUNK(1)
 EXT_CHUNK(2)
@@ -598,6 +591,14 @@  EXT_CHUNK(13)
 EXT_CHUNK(14)
 EXT_CHUNK(15)
 
+L(ext_tail):
+	stp     A_q, B_q, [dst], #32
+	stp     H_q, I_q, [dst], #16
+	add     dst, dst, tmp1
+	str     G_q, [dst], #16
+	b       L(copy_long_check32)
+
+
 END (MEMCPY)
 	.section	.rodata
 	.p2align	4