diff mbox series

[RFC,07/13] ARCv2: memset: rewrite using double load/stores

Message ID 20220222141506.4003433-8-geomatsi@gmail.com
State New
Headers show
Series ARC: handle the lack of ZOL support | expand

Commit Message

Sergey Matyukevich Feb. 22, 2022, 2:15 p.m. UTC
From: Vineet Gupta <vgupta@kernel.org>

Signed-off-by: Vineet Gupta <vgupta@kernel.org>
---
 arch/arc/lib/memset-archs.S | 112 ++++++++++++++----------------------
 1 file changed, 43 insertions(+), 69 deletions(-)
diff mbox series

Patch

diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S
index 330e22f7cf3c..a9a0ccef761d 100644
--- a/arch/arc/lib/memset-archs.S
+++ b/arch/arc/lib/memset-archs.S
@@ -5,6 +5,7 @@ 
 
 #include <linux/linkage.h>
 #include <asm/cache.h>
+#include <asm/assembler.h>
 
 /*
  * The memset implementation below is optimized to use prefetchw and prealloc
@@ -55,7 +56,7 @@  ENTRY_CFI(memset)
 1:
 #endif
 
-;;; Destination is aligned
+	; promote memset pattern from char to int (double actually for STD)
 	and	r1, r1, 0xFF
 	asl	r4, r1, 8
 	or	r4, r4, r1
@@ -63,75 +64,48 @@  ENTRY_CFI(memset)
 	or	r5, r5, r4
 	mov	r4, r5
 
-	sub3	lp_count, r2, 8
-	cmp     r2, 64
-	bmsk.hi	r2, r2, 5
-	mov.ls	lp_count, 0
-	add3.hi	r2, r2, 8
-
-;;; Convert len to Dwords, unfold x8
-	lsr.f	lp_count, lp_count, 6
-
-	lpnz	@.Lset64bytes
-	;; LOOP START
-	PREALLOC_INSTR	r3, 64	; alloc next line w/o fetching
-
-#ifdef CONFIG_ARC_HAS_LL64
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-#else
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-#endif
-.Lset64bytes:
-
-	lsr.f	lp_count, r2, 5 ;Last remaining  max 124 bytes
-	lpnz	.Lset32bytes
-	;; LOOP START
-#ifdef CONFIG_ARC_HAS_LL64
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-#else
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-#endif
-.Lset32bytes:
-
-	and.f	lp_count, r2, 0x1F ;Last remaining 31 bytes
-.Lsmallchunk:
-	lpnz	.Lcopy3bytes
-	;; LOOP START
+	; Loop #a:
+	; - Updates 1 cache line worth data (64 bytes) per iteration
+	; - PREALLOC the next line.
+	;
+	; = Only entered if there are at least 2 lines' worth of work (i.e.
+	;   >= 128 bytes); otherwise the PREALLOC of the next line can "bleed"
+	;   past the end of the buffer and corrupt data owned by another core.
+	; = The last 64 bytes (even for the minimum 128 bytes of work) are NOT
+	;   done here, to avoid that PREALLOC issue.
+
+	sub     r6, r2, 64
+	cmp	r2, 64
+	bmsk.hi	r2, r2, 5	; keep len % 64 (at most 63 trailing bytes)
+	mov.ls	r6, 0
+	add.hi	r2, r2, 64	; line skipped in loop below
+
+	lsr.f	lp_count, r6, 6
+	lpnz	2f
+	PREALLOCR r3, 64
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+2:
+	; Loop #b: remaining full 32-byte chunks (up to 3, i.e. at most 96 bytes)
+	lsr.f	lp_count, r2, 5
+	lpnz	.Lbyteloop
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+
+.Lbyteloop:
+	; Loop #c: up to 31 straggler bytes, stored one byte at a time
+	and.f	lp_count, r2, 0x1F
+	lpnz	4f
 	stb.ab	r1, [r3, 1]
-.Lcopy3bytes:
-
+4:
 	j	[blink]
 
 END_CFI(memset)