diff mbox series

RISC-V: Implement __umulsidi3, umul_ppmm and __muluw3

Message ID 20171118000923.19152-1-palmer@dabbelt.com
State New
Headers show
Series RISC-V: Implement __umulsidi3, umul_ppmm and __muluw3 | expand

Commit Message

Palmer Dabbelt Nov. 18, 2017, 12:09 a.m. UTC
From: Kito Cheng <kito.cheng@gmail.com>

2017-11-17  Kito Cheng  <kito.cheng@gmail.com>

        * longlong.h [__riscv] (__umulsidi3): Define.
        [__riscv] (umul_ppmm) Likewise.
        [__riscv] (__muluw3) Likewise.
---
 include/longlong.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

Comments

Jim Wilson Nov. 18, 2017, 1:33 a.m. UTC | #1
On 11/17/2017 04:09 PM, Palmer Dabbelt wrote:
> From: Kito Cheng <kito.cheng@gmail.com>
> 
> 2017-11-17  Kito Cheng  <kito.cheng@gmail.com>
> 
>          * longlong.h [__riscv] (__umulsidi3): Define.
>          [__riscv] (umul_ppmm) Likewise.
>          [__riscv] (__muluw3) Likewise.

Apparently the point of this is that by defining __mulsi3/__muldi3 as an 
extended asm, we get better register allocation, and hence better 
performance.  It would be nice if this info was provided when a patch 
was submitted, so that we don't have to figure it out ourselves.

I see one minor issue though

> +    (w0) = __muluw3 (__ll_lowpart (__x1), __ll_B)			\
> +	   + __ll_lowpart (__x0);					\

The multiply here is just a shift and should be written as a shift by 
(W_TYPE_SIZE / 2).

You copied this code from the default definition which uses *, and 
assumes that the compiler will optimize the multiply into a shift. 
However, because __muluw3 expands into a call to __mulsi3/__muldi3 for a 
risc-v part with no multiply, the optimization will not take place here, 
and you end up with one extra unnecessary multiply operation.

Otherwise this looks OK.

Jim
Kito Cheng Nov. 20, 2017, 3:34 a.m. UTC | #2
>> 2017-11-17  Kito Cheng  <kito.cheng@gmail.com>
>>
>>          * longlong.h [__riscv] (__umulsidi3): Define.
>>          [__riscv] (umul_ppmm) Likewise.
>>          [__riscv] (__muluw3) Likewise.
>
>
> Apparently the point of this is that by defining __mulsi3/__muldi3 as an extended asm, we get better register allocation, and hence better performance.  It would be nice if this info was provided when a patch was submitted, so that we don't have to figure it out ourselves.

I prefer to write more comments in the code instead of the ChangeLog, so I added
more comments to __muluw3.

btw, a long time ago, Vladimir told me[1] that it should contain what is
done but not why it is done.

[1] https://gcc.gnu.org/ml/gcc-patches/2014-06/msg00604.html

> I see one minor issue though
>
>> +    (w0) = __muluw3 (__ll_lowpart (__x1), __ll_B)                      \
>> +          + __ll_lowpart (__x0);                                       \
>
>
> The multiply here is just a shift and should be written as a shift by (W_TYPE_SIZE / 2).
>
> You copied this code from the default definition which uses *, and assumes that the compiler will optimize the multiply into a shift. However, because __muluw3 expands into a call to __mulsi3/__muldi3 for a risc-v part with no multiply, the optimization will not take place here, and you end up with one extra unnecessary multiply operation.

You are right, I guess I just substituted all the multiplications before;
thanks for pointing this out :)


Updated patch:



2017-11-24  Kito Cheng  <kito.cheng@gmail.com>

        * longlong.h [__riscv] (__umulsidi3): Define.
        [__riscv] (umul_ppmm) Likewise.
        [__riscv] (__muluw3) Likewise.

---
 include/longlong.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/include/longlong.h b/include/longlong.h
index 9d3ab21..b76fa41 100644
--- a/include/longlong.h
+++ b/include/longlong.h
@@ -1086,6 +1086,56 @@ extern UDItype __umulsidi3 (USItype, USItype);
   } while (0)
 #endif

+#if defined(__riscv)
+#ifdef __riscv_mul
+#define __umulsidi3(u,v) ((UDWtype)(UWtype)(u) * (UWtype)(v))
+#define __muluw3(a, b) ((UWtype)(a) * (UWtype)(b))
+#else
+#if __riscv_xlen == 32
+  #define MULUW3 "call __mulsi3"
+#elif __riscv_xlen == 64
+  #define MULUW3 "call __muldi3"
+#else
+#error unsupport xlen
+#endif /* __riscv_xlen */
+/* We rely on the fact that MULUW3 doesn't clobber the t-registers.
+   It can get better register allocation result.  */
+#define __muluw3(a, b) \
+  ({ \
+    register UWtype __op0 asm ("a0") = a; \
+    register UWtype __op1 asm ("a1") = b; \
+    asm volatile (MULUW3 \
+                  : "+r" (__op0), "+r" (__op1) \
+                  : \
+                  : "ra", "a2", "a3"); \
+    __op0; \
+  })
+#endif /* __riscv_mul */
+#define umul_ppmm(w1, w0, u, v) \
+  do { \
+    UWtype __x0, __x1, __x2, __x3; \
+    UHWtype __ul, __vl, __uh, __vh; \
+ \
+    __ul = __ll_lowpart (u); \
+    __uh = __ll_highpart (u); \
+    __vl = __ll_lowpart (v); \
+    __vh = __ll_highpart (v); \
+ \
+    __x0 = __muluw3 (__ul, __vl); \
+    __x1 = __muluw3 (__ul, __vh); \
+    __x2 = __muluw3 (__uh, __vl); \
+    __x3 = __muluw3 (__uh, __vh); \
+ \
+    __x1 += __ll_highpart (__x0);/* this can't give carry */ \
+    __x1 += __x2; /* but this indeed can */ \
+    if (__x1 < __x2) /* did we get it? */ \
+      __x3 += __ll_B; /* yes, add it in the proper pos.  */ \
+ \
+    (w1) = __x3 + __ll_highpart (__x1); \
+    (w0) = __ll_lowpart (__x1) * __ll_B + __ll_lowpart (__x0); \
+  } while (0)
+#endif /* __riscv */
+
 #if defined(__sh__) && W_TYPE_SIZE == 32
 #ifndef __sh1__
 #define umul_ppmm(w1, w0, u, v) \
Jim Wilson Nov. 20, 2017, 4:53 a.m. UTC | #3
On Sun, Nov 19, 2017 at 7:34 PM, Kito Cheng <kito.cheng@gmail.com> wrote:
>
> I prefer write more comment in the code instead of ChangeLog, I add
> more comment in __muluw3.
> btw, long times ago, Vladimir was told me[1] it should contain what is
> done but not why it is done?


Yes, the ChangeLog entry only states what was changed.  But the email
message or the comments should mention why, so we know why we are reviewing
the patch.  Something like "gives a 5% performance increase on our
application" would be a good reason to accept a patch.  Otherwise the patch
is a little confusing, since it isn't clear why you are making the change.
But I can see that there will be fewer register saves/restores in a
function that needs umul_ppmm, and that will lead to better register
allocation results, which is likely to result in better performance for
some application.


> You are right, I guess I just substitute all multiplication before,
> thanks you point out this :)
> Updated patch:
>
> 2017-11-24  Kito Cheng  <kito.cheng@gmail.com>
>
>         * longlong.h [__riscv] (__umulsidi3): Define.
>         [__riscv] (umul_ppmm) Likewise.
>         [__riscv] (__muluw3) Likewise.
>

Yes, this looks good to me.  The placement of the backslashes to continue
macro lines is different in the patch Palmer submitted, but that isn't
important, just made it a little harder to compare the two patches.

Jim
Jakub Jelinek Nov. 20, 2017, 7:31 a.m. UTC | #4
On Sun, Nov 19, 2017 at 08:53:00PM -0800, Jim Wilson wrote:
> > 2017-11-24  Kito Cheng  <kito.cheng@gmail.com>
> >
> >         * longlong.h [__riscv] (__umulsidi3): Define.
> >         [__riscv] (umul_ppmm) Likewise.
> >         [__riscv] (__muluw3) Likewise.

BTW, two colons missing after )

	Jakub
Palmer Dabbelt Nov. 20, 2017, 7:08 p.m. UTC | #5
On Sun, 19 Nov 2017 23:31:56 PST (-0800), jakub@redhat.com wrote:
> On Sun, Nov 19, 2017 at 08:53:00PM -0800, Jim Wilson wrote:
>> > 2017-11-24  Kito Cheng  <kito.cheng@gmail.com>
>> >
>> >         * longlong.h [__riscv] (__umulsidi3): Define.
>> >         [__riscv] (umul_ppmm) Likewise.
>> >         [__riscv] (__muluw3) Likewise.
>
> BTW, two colons missing after )

Thanks.  Committed!
diff mbox series

Patch

diff --git a/include/longlong.h b/include/longlong.h
index c24568acea07..e61947d6c307 100644
--- a/include/longlong.h
+++ b/include/longlong.h
@@ -1050,6 +1050,56 @@  extern UDItype __umulsidi3 (USItype, USItype);
   } while (0)
 #endif
 
+#if defined(__riscv)
+#ifdef __riscv_mul
+#define __umulsidi3(u,v) ((UDWtype)(UWtype)(u) * (UWtype)(v))
+#define __muluw3(a, b) ((UWtype)(a) * (UWtype)(b))
+#else
+#if __riscv_xlen == 32
+  #define MULUW3 "call __mulsi3"
+#elif __riscv_xlen == 64
+  #define MULUW3 "call __muldi3"
+#else
+#error unsupport xlen
+#endif /* __riscv_xlen */
+/* We rely on the fact that MULUW3 doesn't clobber the t-registers.  */
+#define __muluw3(a, b)							\
+  ({									\
+    register UWtype __op0 asm ("a0") = a;				\
+    register UWtype __op1 asm ("a1") = b;				\
+    asm volatile (MULUW3						\
+                  : "+r" (__op0), "+r" (__op1)				\
+                  :							\
+                  : "ra", "a2", "a3");					\
+    __op0;								\
+  })
+#endif /* __riscv_mul */
+#define umul_ppmm(w1, w0, u, v)						\
+  do {									\
+    UWtype __x0, __x1, __x2, __x3;					\
+    UHWtype __ul, __vl, __uh, __vh;					\
+									\
+    __ul = __ll_lowpart (u);						\
+    __uh = __ll_highpart (u);						\
+    __vl = __ll_lowpart (v);						\
+    __vh = __ll_highpart (v);						\
+									\
+    __x0 = __muluw3 (__ul, __vl);					\
+    __x1 = __muluw3 (__ul, __vh);					\
+    __x2 = __muluw3 (__uh, __vl);					\
+    __x3 = __muluw3 (__uh, __vh);					\
+									\
+    __x1 += __ll_highpart (__x0);/* this can't give carry */		\
+    __x1 += __x2;		/* but this indeed can */		\
+    if (__x1 < __x2)		/* did we get it? */			\
+      __x3 += __ll_B;		/* yes, add it in the proper pos.  */	\
+									\
+    (w1) = __x3 + __ll_highpart (__x1);					\
+    (w0) = __muluw3 (__ll_lowpart (__x1), __ll_B)			\
+	   + __ll_lowpart (__x0);					\
+  } while (0)
+#endif /* __riscv */
+
 #if defined(__sh__) && W_TYPE_SIZE == 32
 #ifndef __sh1__
 #define umul_ppmm(w1, w0, u, v) \