Message ID | 20200619134944.29699-1-stli@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | S390: Optimize __memcpy_z196. | expand |
Just as information, if nobody opposes, I'll commit this patch tomorrow. On 6/19/20 3:49 PM, Stefan Liebler wrote: > This patch introduces an extra loop without pfd instructions > as it turned out that the pfd instructions are useful > for copies >=64KB but are counterproductive for smaller copies. > --- > sysdeps/s390/memcpy-z900.S | 21 +++++++++++++++------ > 1 file changed, 15 insertions(+), 6 deletions(-) > > diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S > index f2e9aaeb2d..dc2f491ec3 100644 > --- a/sysdeps/s390/memcpy-z900.S > +++ b/sysdeps/s390/memcpy-z900.S > @@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196) > je .L_Z196_4 > .L_Z196_start2: > aghi %r4,-1 > - srlg %r5,%r4,8 > - ltgr %r5,%r5 > + risbg %r5,%r4,8,128+63,56 # r0 = r5 / 256 > jne .L_Z196_5 > .L_Z196_3: > exrl %r4,.L_Z196_14 > .L_Z196_4: > br %r14 > .L_Z196_5: > - cgfi %r5,262144 # Switch to mvcle for copies >64MB > - jh __memcpy_mvcle > + cgfi %r5,255 # Switch to loop with pfd for copies >=64kB > + jh .L_Z196_6 > .L_Z196_2: > - pfd 1,768(%r3) > - pfd 2,768(%r1) > mvc 0(256,%r1),0(%r3) > aghi %r5,-1 > la %r1,256(%r1) > la %r3,256(%r3) > jne .L_Z196_2 > j .L_Z196_3 > +.L_Z196_6: > + cgfi %r5,262144 # Switch to mvcle for copies >64MB > + jh __memcpy_mvcle > +.L_Z196_7: > + pfd 1,1024(%r3) > + pfd 2,1024(%r1) > + mvc 0(256,%r1),0(%r3) > + aghi %r5,-1 > + la %r1,256(%r1) > + la %r3,256(%r3) > + jne .L_Z196_7 > + j .L_Z196_3 > .L_Z196_14: > mvc 0(1,%r1),0(%r3) > END(MEMCPY_Z196) >
committed On 6/25/20 10:18 AM, Stefan Liebler via Libc-alpha wrote: > Just as information, if nobody opposes, I'll commit this patch tomorrow. > > On 6/19/20 3:49 PM, Stefan Liebler wrote: >> This patch introduces an extra loop without pfd instructions >> as it turned out that the pfd instructions are useful >> for copies >=64KB but are counterproductive for smaller copies. >> --- >> sysdeps/s390/memcpy-z900.S | 21 +++++++++++++++------ >> 1 file changed, 15 insertions(+), 6 deletions(-) >> >> diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S >> index f2e9aaeb2d..dc2f491ec3 100644 >> --- a/sysdeps/s390/memcpy-z900.S >> +++ b/sysdeps/s390/memcpy-z900.S >> @@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196) >> je .L_Z196_4 >> .L_Z196_start2: >> aghi %r4,-1 >> - srlg %r5,%r4,8 >> - ltgr %r5,%r5 >> + risbg %r5,%r4,8,128+63,56 # r0 = r5 / 256 >> jne .L_Z196_5 >> .L_Z196_3: >> exrl %r4,.L_Z196_14 >> .L_Z196_4: >> br %r14 >> .L_Z196_5: >> - cgfi %r5,262144 # Switch to mvcle for copies >64MB >> - jh __memcpy_mvcle >> + cgfi %r5,255 # Switch to loop with pfd for copies >=64kB >> + jh .L_Z196_6 >> .L_Z196_2: >> - pfd 1,768(%r3) >> - pfd 2,768(%r1) >> mvc 0(256,%r1),0(%r3) >> aghi %r5,-1 >> la %r1,256(%r1) >> la %r3,256(%r3) >> jne .L_Z196_2 >> j .L_Z196_3 >> +.L_Z196_6: >> + cgfi %r5,262144 # Switch to mvcle for copies >64MB >> + jh __memcpy_mvcle >> +.L_Z196_7: >> + pfd 1,1024(%r3) >> + pfd 2,1024(%r1) >> + mvc 0(256,%r1),0(%r3) >> + aghi %r5,-1 >> + la %r1,256(%r1) >> + la %r3,256(%r3) >> + jne .L_Z196_7 >> + j .L_Z196_3 >> .L_Z196_14: >> mvc 0(1,%r1),0(%r3) >> END(MEMCPY_Z196) >> >
diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S index f2e9aaeb2d..dc2f491ec3 100644 --- a/sysdeps/s390/memcpy-z900.S +++ b/sysdeps/s390/memcpy-z900.S @@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196) je .L_Z196_4 .L_Z196_start2: aghi %r4,-1 - srlg %r5,%r4,8 - ltgr %r5,%r5 + risbg %r5,%r4,8,128+63,56 # r0 = r5 / 256 jne .L_Z196_5 .L_Z196_3: exrl %r4,.L_Z196_14 .L_Z196_4: br %r14 .L_Z196_5: - cgfi %r5,262144 # Switch to mvcle for copies >64MB - jh __memcpy_mvcle + cgfi %r5,255 # Switch to loop with pfd for copies >=64kB + jh .L_Z196_6 .L_Z196_2: - pfd 1,768(%r3) - pfd 2,768(%r1) mvc 0(256,%r1),0(%r3) aghi %r5,-1 la %r1,256(%r1) la %r3,256(%r3) jne .L_Z196_2 j .L_Z196_3 +.L_Z196_6: + cgfi %r5,262144 # Switch to mvcle for copies >64MB + jh __memcpy_mvcle +.L_Z196_7: + pfd 1,1024(%r3) + pfd 2,1024(%r1) + mvc 0(256,%r1),0(%r3) + aghi %r5,-1 + la %r1,256(%r1) + la %r3,256(%r3) + jne .L_Z196_7 + j .L_Z196_3 .L_Z196_14: mvc 0(1,%r1),0(%r3) END(MEMCPY_Z196)