Patchwork [SH] PR 6526

login
register
mail settings
Submitter Oleg Endo
Date May 24, 2013, 10:34 a.m.
Message ID <1369391663.21245.17.camel@yam-132-YW-E178-FTW>
Download mbox | patch
Permalink /patch/246127/
State New
Headers show

Comments

Oleg Endo - May 24, 2013, 10:34 a.m.
Hello,

I'd like to fix this ancient PR.
The attached patch picks up the suggested changes mentioned in comment
#3 to avoid changing the FPSCR.FR bit in the sdivsi3_i4 and udivsi3_i4
library functions.  As mentioned in the PR, this makes integer division
a bit slower when using -mdiv=call-fp, but it allows better utilization
of the SH4 matrix multiplication feature.
I've also added SH4A versions of the library functions which utilize the
fpchg instruction.


Tested on rev 199102 with
make -k check RUNTESTFLAGS="--target_board=sh-sim
\{-m4/-mb/-mdiv=call-fp,-m4-single/-mb/-mdiv=call-fp,-m4a/-mb/-mdiv=call-fp,
-m4a-single/-mb/-mdiv=call-fp,-m4/-ml/-mdiv=call-fp,
-m4-single/-ml/-mdiv=call-fp,-m4a/-ml/-mdiv=call-fp,
-m4a-single/-ml/-mdiv=call-fp}"

and no new failures.

OK for trunk?

Cheers,
Oleg


libgcc/ChangeLog:

	PR target/6526
	* config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change 
	bits other than FPSCR.PR and FPSCR.SZ.  Add SH4A implementation.


testsuite/ChangeLog:

	PR target/6526
	* gcc.target/sh/pr6526.c: New.
Kaz Kojima - May 25, 2013, 11:16 p.m.
Oleg Endo <oleg.endo@t-online.de> wrote:
> I'd like to fix this ancient PR.
> The attached patch picks up the suggested changes mentioned in comment
> #3 to avoid changing the FPSCR.FR bit in the sdivsi3_i4 and udivsi3_i4
> library functions.  As mentioned in the PR, this makes integer division
> a bit slower when using -mdiv=call-fp, but it allows better utilization
> of the SH4 matrix multiplication feature.
> I've also added SH4A versions of the library functions which utilize the
> fpchg instruction.
> 
> 
> Tested on rev 199102 with
> make -k check RUNTESTFLAGS="--target_board=sh-sim
> \{-m4/-mb/-mdiv=call-fp,-m4-single/-mb/-mdiv=call-fp,-m4a/-mb/-mdiv=call-fp,
> -m4a-single/-mb/-mdiv=call-fp,-m4/-ml/-mdiv=call-fp,
> -m4-single/-ml/-mdiv=call-fp,-m4a/-ml/-mdiv=call-fp,
> -m4a-single/-ml/-mdiv=call-fp}"
> 
> and no new failures.
> 
> OK for trunk?

OK.

Regards,
	kaz

Patch

Index: libgcc/config/sh/lib1funcs.S
===================================================================
--- libgcc/config/sh/lib1funcs.S	(revision 199109)
+++ libgcc/config/sh/lib1funcs.S	(working copy)
@@ -1003,11 +1003,17 @@ 
 	ENDFUNC(GLOBAL(mulsi3))
 #endif
 #endif /* ! __SH5__ */
+
+/*------------------------------------------------------------------------------
+  32 bit signed integer division that uses FPU double precision division.  */
+
 #ifdef L_sdivsi3_i4
 	.title "SH DIVIDE"
-!! 4 byte integer Divide code for the Renesas SH
+
 #if defined (__SH4__) || defined (__SH2A__)
-!! args in r4 and r5, result in fpul, clobber dr0, dr2
+/* This variant is used when FPSCR.PR = 1 (double precision) is the default
+   setting.
+   Args in r4 and r5, result in fpul, clobber dr0, dr2.  */
 
 	.global	GLOBAL(sdivsi3_i4)
 	HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
@@ -1021,8 +1027,13 @@ 
 	ftrc dr0,fpul
 
 	ENDFUNC(GLOBAL(sdivsi3_i4))
+
 #elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
-!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2
+/* This variant is used when FPSCR.PR = 0 (sigle precision) is the default
+   setting.
+   Args in r4 and r5, result in fpul, clobber r2, dr0, dr2.
+   For this to work, we must temporarily switch the FPU do double precision,
+   but we better do not touch FPSCR.FR.  See PR 6526.  */
 
 #if ! __SH5__ || __SH5__ == 32
 #if __SH5__
@@ -1031,24 +1042,43 @@ 
 	.global	GLOBAL(sdivsi3_i4)
 	HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
 GLOBAL(sdivsi3_i4):
-	sts.l fpscr,@-r15
-	mov #8,r2
-	swap.w r2,r2
-	lds r2,fpscr
-	lds r4,fpul
-	float fpul,dr0
-	lds r5,fpul
-	float fpul,dr2
-	fdiv dr2,dr0
-	ftrc dr0,fpul
+
+#ifndef __SH4A__
+	mov.l	r3,@-r15
+	sts	fpscr,r2
+	mov	#8,r3
+	swap.w	r3,r3		// r3 = 1 << 19 (FPSCR.PR bit)
+	or	r2,r3
+	lds	r3,fpscr	// Set FPSCR.PR = 1.
+	lds	r4,fpul
+	float	fpul,dr0
+	lds	r5,fpul
+	float	fpul,dr2
+	fdiv	dr2,dr0
+	ftrc	dr0,fpul
+	lds	r2,fpscr
 	rts
-	lds.l @r15+,fpscr
+	mov.l	@r15+,r3
+#else
+/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit.  */
+	fpchg
+	lds	r4,fpul
+	float	fpul,dr0
+	lds	r5,fpul
+	float	fpul,dr2
+	fdiv	dr2,dr0
+	ftrc	dr0,fpul
+	rts
+	fpchg	
 
+#endif /* __SH4A__  */
+
 	ENDFUNC(GLOBAL(sdivsi3_i4))
 #endif /* ! __SH5__ || __SH5__ == 32 */
 #endif /* ! __SH4__ || __SH2A__  */
-#endif
+#endif /* L_sdivsi3_i4  */
 
+//------------------------------------------------------------------------------
 #ifdef L_sdivsi3
 /* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
    sh2e/sh3e code.  */
@@ -1367,54 +1397,60 @@ 
 	mov	#0,r0
 
 	ENDFUNC(GLOBAL(sdivsi3))
-#endif /* ! __SHMEDIA__ */
-#endif
+#endif /* ! __SHMEDIA__  */
+#endif /* L_sdivsi3  */
+
+/*------------------------------------------------------------------------------
+  32 bit unsigned integer division that uses FPU double precision division.  */
+
 #ifdef L_udivsi3_i4
+	.title "SH DIVIDE"
 
-	.title "SH DIVIDE"
-!! 4 byte integer Divide code for the Renesas SH
 #if defined (__SH4__) || defined (__SH2A__)
-!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4,
-!! and t bit
+/* This variant is used when FPSCR.PR = 1 (double precision) is the default
+   setting.
+   Args in r4 and r5, result in fpul,
+   clobber r0, r1, r4, r5, dr0, dr2, dr4, and t bit  */
 
 	.global	GLOBAL(udivsi3_i4)
 	HIDDEN_FUNC(GLOBAL(udivsi3_i4))
 GLOBAL(udivsi3_i4):
-	mov #1,r1
-	cmp/hi r1,r5
-	bf trivial
-	rotr r1
-	xor r1,r4
-	lds r4,fpul
-	mova L1,r0
+	mov	#1,r1
+	cmp/hi	r1,r5
+	bf/s	trivial
+	rotr	r1
+	xor	r1,r4
+	lds	r4,fpul
+	mova	L1,r0
 #ifdef FMOVD_WORKS
-	fmov.d @r0+,dr4
+	fmov.d	@r0+,dr4
 #else
-	fmov.s @r0+,DR40
-	fmov.s @r0,DR41
+	fmov.s	@r0+,DR40
+	fmov.s	@r0,DR41
 #endif
-	float fpul,dr0
-	xor r1,r5
-	lds r5,fpul
-	float fpul,dr2
-	fadd dr4,dr0
-	fadd dr4,dr2
-	fdiv dr2,dr0
+	float	fpul,dr0
+	xor	r1,r5
+	lds	r5,fpul
+	float	fpul,dr2
+	fadd	dr4,dr0
+	fadd	dr4,dr2
+	fdiv	dr2,dr0
 	rts
-	ftrc dr0,fpul
+	ftrc	dr0,fpul
 
 trivial:
 	rts
-	lds r4,fpul
+	lds	r4,fpul
 
 	.align 2
 #ifdef FMOVD_WORKS
-	.align 3	! make double below 8 byte aligned.
+	.align 3	// Make the double below 8 byte aligned.
 #endif
 L1:
 	.double 2147483648
 
 	ENDFUNC(GLOBAL(udivsi3_i4))
+
 #elif defined (__SH5__) && ! defined (__SH4_NOFPU__) && ! defined (__SH2A_NOFPU__)
 #if ! __SH5__ || __SH5__ == 32
 !! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33
@@ -1436,57 +1472,106 @@ 
 
 	ENDFUNC(GLOBAL(udivsi3_i4))
 #endif /* ! __SH5__ || __SH5__ == 32 */
+
 #elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
-!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4
+/* This variant is used when FPSCR.PR = 0 (sigle precision) is the default
+   setting.
+   Args in r4 and r5, result in fpul,
+   clobber r0, r1, r4, r5, dr0, dr2, dr4.
+   For this to work, we must temporarily switch the FPU do double precision,
+   but we better do not touch FPSCR.FR.  See PR 6526.  */
 
 	.global	GLOBAL(udivsi3_i4)
 	HIDDEN_FUNC(GLOBAL(udivsi3_i4))
 GLOBAL(udivsi3_i4):
-	mov #1,r1
-	cmp/hi r1,r5
-	bf trivial
-	sts.l fpscr,@-r15
-	mova L1,r0
-	lds.l @r0+,fpscr
-	rotr r1
-	xor r1,r4
-	lds r4,fpul
+
+#ifndef __SH4A__
+	mov	#1,r1
+	cmp/hi	r1,r5
+	bf/s	trivial
+	rotr	r1		// r1 = 1 << 31
+	sts.l	fpscr,@-r15
+	xor	r1,r4
+	mov.l	@(0,r15),r0
+	xor	r1,r5
+	mov.l	L2,r1
+	lds	r4,fpul
+	or	r0,r1
+	mova	L1,r0
+	lds	r1,fpscr
 #ifdef FMOVD_WORKS
-	fmov.d @r0+,dr4
+	fmov.d	@r0+,dr4
 #else
-	fmov.s @r0+,DR40
-	fmov.s @r0,DR41
+	fmov.s	@r0+,DR40
+	fmov.s	@r0,DR41
 #endif
-	float fpul,dr0
-	xor r1,r5
-	lds r5,fpul
-	float fpul,dr2
-	fadd dr4,dr0
-	fadd dr4,dr2
-	fdiv dr2,dr0
-	ftrc dr0,fpul
+	float	fpul,dr0
+	lds	r5,fpul
+	float	fpul,dr2
+	fadd	dr4,dr0
+	fadd	dr4,dr2
+	fdiv	dr2,dr0
+	ftrc	dr0,fpul
 	rts
-	lds.l @r15+,fpscr
+	lds.l	@r15+,fpscr
 
 #ifdef FMOVD_WORKS
-	.align 3	! make double below 8 byte aligned.
+	.align 3	// Make the double below 8 byte aligned.
 #endif
 trivial:
 	rts
-	lds r4,fpul
+	lds	r4,fpul
 
 	.align 2
-L1:
-#ifndef FMOVD_WORKS
-	.long 0x80000
+L2:
+#ifdef FMOVD_WORKS
+	.long 0x180000	// FPSCR.PR = 1, FPSCR.SZ = 1
 #else
-	.long 0x180000
+	.long 0x80000	// FPSCR.PR = 1
 #endif
+L1:
 	.double 2147483648
 
+#else
+/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit.
+   Although on SH4A fmovd usually works, it would require either additional
+   two fschg instructions or an FPSCR push + pop.  It's not worth the effort
+   for loading only one double constant.  */
+	mov	#1,r1
+	cmp/hi	r1,r5
+	bf/s	trivial
+	rotr	r1		// r1 = 1 << 31
+	fpchg
+	mova	L1,r0
+	xor	r1,r4
+	fmov.s	@r0+,DR40
+	lds	r4,fpul
+	fmov.s	@r0,DR41
+	xor	r1,r5
+	float	fpul,dr0
+	lds	r5,fpul
+	float	fpul,dr2
+	fadd	dr4,dr0
+	fadd	dr4,dr2
+	fdiv	dr2,dr0
+	ftrc	dr0,fpul
+	rts
+	fpchg
+
+trivial:
+	rts
+	lds	r4,fpul
+
+	.align 2
+L1:
+	.double 2147483648
+
+#endif /* __SH4A__  */
+
+
 	ENDFUNC(GLOBAL(udivsi3_i4))
 #endif /* ! __SH4__ */
-#endif
+#endif /* L_udivsi3_i4  */
 
 #ifdef L_udivsi3
 /* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
Index: gcc/testsuite/gcc.target/sh/pr6526.c
===================================================================
--- gcc/testsuite/gcc.target/sh/pr6526.c	(revision 0)
+++ gcc/testsuite/gcc.target/sh/pr6526.c	(revision 0)
@@ -0,0 +1,64 @@ 
+/* Check that the XF registers are not clobbered by an integer division
+   that is done using double precision FPU division.  */
+/* { dg-do run { target "sh*-*-*" } }  */
+/* { dg-options "-O1 -mdiv=call-fp" }  */
+/* { dg-skip-if "" { "sh*-*-*" } { "*" } { "-m4*-single" "-m4*-single-only" } }  */
+
+#include <assert.h>
+#include <stdlib.h>
+
+extern void __set_fpscr (int);
+
+void
+write_xf0 (float* f)
+{
+  __asm__ __volatile__ ("frchg; fmov.s @%0,fr0; frchg" : : "r" (f) : "memory");
+}
+ 
+void
+read_xf0 (float* f)
+{
+  __asm__ __volatile__ ("frchg; fmov.s fr0,@%0; frchg" : : "r" (f) : "memory");
+}
+
+int __attribute__ ((noinline))
+test_00 (int a, int b)
+{
+  return a / b;
+}
+
+unsigned int __attribute__ ((noinline))
+test_01 (unsigned a, unsigned b)
+{
+  return a / b;
+}
+
+int __attribute__ ((noinline))
+test_02 (int x)
+{
+  return x & 0;
+}
+
+int
+main (void)
+{
+  float test_value;
+  int r = 0;
+
+  /* Set FPSCR.FR to 1.  */
+  __set_fpscr (0x200000);
+
+  test_value = 123;
+  write_xf0 (&test_value);
+  r += test_00 (40, 4);
+  read_xf0 (&test_value);
+  assert (test_value == 123);
+
+  test_value = 321;
+  write_xf0 (&test_value);
+  r += test_01 (50, 5);
+  read_xf0 (&test_value);
+  assert (test_value == 321);
+
+  return test_02 (r);
+}