diff mbox

[09/10] mips: Add reduction support for Loongson.

Message ID 1324486822-18225-10-git-send-email-rth@redhat.com
State New
Headers show

Commit Message

Richard Henderson Dec. 21, 2011, 5 p.m. UTC
Both plus and min/max.
---
 gcc/config/mips/loongson.md   |  230 +++++++++++++++++++++++++++++++++++++----
 gcc/config/mips/mips-protos.h |    3 +
 gcc/config/mips/mips.c        |   77 ++++++++++++++
 3 files changed, 290 insertions(+), 20 deletions(-)

Comments

Richard Sandiford Dec. 22, 2011, 8:42 p.m. UTC | #1
Richard Henderson <rth@redhat.com> writes:
> +      /* Use PUL/PLU to produce { L, H } op { H, L }.
> +         By reversing the pair order, rather a pure interleave high,
> +	 we don't produce erroneous exceptional conditions.  */

"rather than".  We don't produce erroneous exceptional conditions
that would result from H op H?  It probably should be obvious,
but it took me a few seconds to work out...

> +    case V4HImode:
> +      /* Perform the first reduction with interleave,
> +	 and subsequent reductions with shifts.  */
> +      emit_insn (gen_loongson_punpckhwd_hi (tmp, in, in));
> +      emit_insn (gen (in, in, tmp));
> +      emit_insn (gen_vec_shr_v4hi (tmp, in, force_reg (SImode, GEN_INT (16))));
> +      break;
> +
> +    case V8QImode:
> +      emit_insn (gen_loongson_punpckhwd_qi (tmp, in, in));
> +      emit_insn (gen (in, in, tmp));
> +      emit_insn (gen_vec_shr_v8qi (tmp, in, force_reg (SImode, GEN_INT (16))));
> +      emit_insn (gen (in, in, tmp));
> +      emit_insn (gen_vec_shr_v8qi (tmp, in, force_reg (SImode, GEN_INT (8))));
> +      break;

Seems safer to have a second temporary rather than assign to the input here.

Richard
Richard Henderson Dec. 22, 2011, 8:47 p.m. UTC | #2
On 12/22/2011 12:42 PM, Richard Sandiford wrote:
> Richard Henderson <rth@redhat.com> writes:
>> +      /* Use PUL/PLU to produce { L, H } op { H, L }.
>> +         By reversing the pair order, rather a pure interleave high,
>> +	 we don't produce erroneous exceptional conditions.  */
> 
> "rather than".  We don't produce erroneous exceptional conditions
> that would result from H op H?  It probably should be obvious,
> but it took me a few seconds to work out...

Will fix.

>> +    case V4HImode:
>> +      /* Perform the first reduction with interleave,
>> +	 and subsequent reductions with shifts.  */
>> +      emit_insn (gen_loongson_punpckhwd_hi (tmp, in, in));
>> +      emit_insn (gen (in, in, tmp));
>> +      emit_insn (gen_vec_shr_v4hi (tmp, in, force_reg (SImode, GEN_INT (16))));
>> +      break;
>> +
>> +    case V8QImode:
>> +      emit_insn (gen_loongson_punpckhwd_qi (tmp, in, in));
>> +      emit_insn (gen (in, in, tmp));
>> +      emit_insn (gen_vec_shr_v8qi (tmp, in, force_reg (SImode, GEN_INT (16))));
>> +      emit_insn (gen (in, in, tmp));
>> +      emit_insn (gen_vec_shr_v8qi (tmp, in, force_reg (SImode, GEN_INT (8))));
>> +      break;
> 
> Seems safer to have a second temporary rather than assign to the input here.

Ouch.  That's a real bug.


r~
diff mbox

Patch

diff --git a/gcc/config/mips/loongson.md b/gcc/config/mips/loongson.md
index e9fa616..4f9cc73 100644
--- a/gcc/config/mips/loongson.md
+++ b/gcc/config/mips/loongson.md
@@ -39,6 +39,8 @@ 
   UNSPEC_LOONGSON_PUNPCKL
   UNSPEC_LOONGSON_PADDD
   UNSPEC_LOONGSON_PSUBD
+  UNSPEC_LOONGSON_DSLL
+  UNSPEC_LOONGSON_DSRL
 ])
 
 ;; Mode iterators and attributes.
@@ -58,6 +60,9 @@ 
 ;; 64-bit vectors of words and halfwords.
 (define_mode_iterator VWH [V2SI V4HI])
 
+;; 64-bit vectors of words and bytes.
+(define_mode_iterator VWB [V2SI V8QI])
+
 ;; 64-bit vectors of words, halfwords and bytes.
 (define_mode_iterator VWHB [V2SI V4HI V8QI])
 
@@ -404,39 +409,61 @@ 
 })
 
 ;; Maximum of signed halfwords.
-(define_insn "smax<mode>3"
-  [(set (match_operand:VH 0 "register_operand" "=f")
-        (smax:VH (match_operand:VH 1 "register_operand" "f")
-		 (match_operand:VH 2 "register_operand" "f")))]
+(define_insn "smaxv4hi3"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+        (smax:V4HI (match_operand:V4HI 1 "register_operand" "f")
+		   (match_operand:V4HI 2 "register_operand" "f")))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pmaxs<V_suffix>\t%0,%1,%2"
+  "pmaxsh\t%0,%1,%2"
   [(set_attr "type" "fadd")])
 
+(define_expand "smax<mode>3"
+  [(match_operand:VWB 0 "register_operand" "")
+   (match_operand:VWB 1 "register_operand" "")
+   (match_operand:VWB 2 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  mips_expand_vec_minmax (operands[0], operands[1], operands[2],
+			  gen_loongson_pcmpgt<V_suffix>, false);
+  DONE;
+})
+
 ;; Maximum of unsigned bytes.
-(define_insn "umax<mode>3"
-  [(set (match_operand:VB 0 "register_operand" "=f")
-        (umax:VB (match_operand:VB 1 "register_operand" "f")
-		 (match_operand:VB 2 "register_operand" "f")))]
+(define_insn "umaxv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand" "=f")
+        (umax:V8QI (match_operand:V8QI 1 "register_operand" "f")
+		   (match_operand:V8QI 2 "register_operand" "f")))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pmaxu<V_suffix>\t%0,%1,%2"
+  "pmaxub\t%0,%1,%2"
   [(set_attr "type" "fadd")])
 
 ;; Minimum of signed halfwords.
-(define_insn "smin<mode>3"
-  [(set (match_operand:VH 0 "register_operand" "=f")
-        (smin:VH (match_operand:VH 1 "register_operand" "f")
-		 (match_operand:VH 2 "register_operand" "f")))]
+(define_insn "sminv4hi3"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+        (smin:V4HI (match_operand:V4HI 1 "register_operand" "f")
+		   (match_operand:V4HI 2 "register_operand" "f")))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pmins<V_suffix>\t%0,%1,%2"
+  "pminsh\t%0,%1,%2"
   [(set_attr "type" "fadd")])
 
+(define_expand "smin<mode>3"
+  [(match_operand:VWB 0 "register_operand" "")
+   (match_operand:VWB 1 "register_operand" "")
+   (match_operand:VWB 2 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  mips_expand_vec_minmax (operands[0], operands[1], operands[2],
+			  gen_loongson_pcmpgt<V_suffix>, true);
+  DONE;
+})
+
 ;; Minimum of unsigned bytes.
-(define_insn "umin<mode>3"
-  [(set (match_operand:VB 0 "register_operand" "=f")
-        (umin:VB (match_operand:VB 1 "register_operand" "f")
-		 (match_operand:VB 2 "register_operand" "f")))]
+(define_insn "uminv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand" "=f")
+        (umin:V8QI (match_operand:V8QI 1 "register_operand" "f")
+		   (match_operand:V8QI 2 "register_operand" "f")))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pminu<V_suffix>\t%0,%1,%2"
+  "pminub\t%0,%1,%2"
   [(set_attr "type" "fadd")])
 
 ;; Move byte mask.
@@ -506,6 +533,14 @@ 
   "biadd\t%0,%1"
   [(set_attr "type" "fabs")])
 
+(define_insn "reduc_uplus_v8qi"
+  [(set (match_operand:V8QI 0 "register_operand" "=f")
+	(unspec:V8QI [(match_operand:V8QI 1 "register_operand" "f")]
+		     UNSPEC_LOONGSON_BIADD))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "biadd\t%0,%1"
+  [(set_attr "type" "fabs")])
+
 ;; Sum of absolute differences.
 (define_insn "loongson_psadbh"
   [(set (match_operand:<V_stretch_half> 0 "register_operand" "=f")
@@ -620,6 +655,20 @@ 
   "punpckhhw\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
+(define_insn "loongson_punpckhhw_qi"
+  [(set (match_operand:V8QI 0 "register_operand" "=f")
+	(vec_select:V8QI
+	  (vec_concat:V16QI
+	    (match_operand:V8QI 1 "register_operand" "f")
+	    (match_operand:V8QI 2 "register_operand" "f"))
+	  (parallel [(const_int 4)  (const_int 5)
+		     (const_int 12) (const_int 13)
+		     (const_int 6)  (const_int 7)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpckhhw\t%0,%1,%2"
+  [(set_attr "type" "fdiv")])
+
 (define_insn "loongson_punpckhwd"
   [(set (match_operand:V2SI 0 "register_operand" "=f")
 	(vec_select:V2SI
@@ -631,6 +680,32 @@ 
   "punpckhwd\t%0,%1,%2"
   [(set_attr "type" "fcvt")])
 
+(define_insn "loongson_punpckhwd_qi"
+  [(set (match_operand:V8QI 0 "register_operand" "=f")
+	(vec_select:V8QI
+	  (vec_concat:V16QI
+	    (match_operand:V8QI 1 "register_operand" "f")
+	    (match_operand:V8QI 2 "register_operand" "f"))
+	  (parallel [(const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpckhwd\t%0,%1,%2"
+  [(set_attr "type" "fcvt")])
+
+(define_insn "loongson_punpckhwd_hi"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+	(vec_select:V4HI
+	  (vec_concat:V8HI
+	    (match_operand:V4HI 1 "register_operand" "f")
+	    (match_operand:V4HI 2 "register_operand" "f"))
+	  (parallel [(const_int 2) (const_int 3)
+		     (const_int 6) (const_int 7)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpckhwd\t%0,%1,%2"
+  [(set_attr "type" "fcvt")])
+
 ;; Unpack low data.
 (define_insn "loongson_punpcklbh"
   [(set (match_operand:V8QI 0 "register_operand" "=f")
@@ -658,6 +733,20 @@ 
   "punpcklhw\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
+(define_insn "*loongson_punpcklhw_qi"
+  [(set (match_operand:V8QI 0 "register_operand" "=f")
+	(vec_select:V8QI
+	  (vec_concat:V16QI
+	    (match_operand:V8QI 1 "register_operand" "f")
+	    (match_operand:V8QI 2 "register_operand" "f"))
+	  (parallel [(const_int 0)  (const_int 1)
+		     (const_int 8)  (const_int 9)
+		     (const_int 2)  (const_int 3)
+		     (const_int 10) (const_int 11)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpcklhw\t%0,%1,%2"
+  [(set_attr "type" "fdiv")])
+
 (define_insn "loongson_punpcklwd"
   [(set (match_operand:V2SI 0 "register_operand" "=f")
 	(vec_select:V2SI
@@ -669,6 +758,32 @@ 
   "punpcklwd\t%0,%1,%2"
   [(set_attr "type" "fcvt")])
 
+(define_insn "*loongson_punpcklwd_qi"
+  [(set (match_operand:V8QI 0 "register_operand" "=f")
+	(vec_select:V8QI
+	  (vec_concat:V16QI
+	    (match_operand:V8QI 1 "register_operand" "f")
+	    (match_operand:V8QI 2 "register_operand" "f"))
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpcklwd\t%0,%1,%2"
+  [(set_attr "type" "fcvt")])
+
+(define_insn "*loongson_punpcklwd_hi"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+	(vec_select:V4HI
+	  (vec_concat:V8HI
+	    (match_operand:V4HI 1 "register_operand" "f")
+	    (match_operand:V4HI 2 "register_operand" "f"))
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 4) (const_int 5)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpcklwd\t%0,%1,%2"
+  [(set_attr "type" "fcvt")])
+
 (define_expand "vec_perm_const<mode>"
   [(match_operand:VWHB 0 "register_operand" "")
    (match_operand:VWHB 1 "register_operand" "")
@@ -718,6 +833,81 @@ 
   DONE;
 })
 
+;; Whole vector shifts, used for reduction epilogues.
+(define_insn "vec_shl_<mode>"
+  [(set (match_operand:VWHBDI 0 "register_operand" "=f")
+        (unspec:VWHBDI [(match_operand:VWHBDI 1 "register_operand" "f")
+                        (match_operand:SI 2 "register_operand" "f")]
+                       UNSPEC_LOONGSON_DSLL))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "dsll\t%0,%1,%2"
+  [(set_attr "type" "fcvt")])
+
+(define_insn "vec_shr_<mode>"
+  [(set (match_operand:VWHBDI 0 "register_operand" "=f")
+        (unspec:VWHBDI [(match_operand:VWHBDI 1 "register_operand" "f")
+                        (match_operand:SI 2 "register_operand" "f")]
+                       UNSPEC_LOONGSON_DSRL))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "dsrl\t%0,%1,%2"
+  [(set_attr "type" "fcvt")])
+
+(define_expand "reduc_uplus_<mode>"
+  [(match_operand:VWH 0 "register_operand" "")
+   (match_operand:VWH 1 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  mips_expand_vec_reduc (operands[0], operands[1], gen_add<mode>3);
+  DONE;
+})
+
+; ??? Given that we're not describing a widening reduction, we should
+; not have separate optabs for signed and unsigned.
+(define_expand "reduc_splus_<mode>"
+  [(match_operand:VWHB 0 "register_operand" "")
+   (match_operand:VWHB 1 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  emit_insn (gen_reduc_uplus_<mode>(operands[0], operands[1]));
+  DONE;
+})
+
+(define_expand "reduc_smax_<mode>"
+  [(match_operand:VWHB 0 "register_operand" "")
+   (match_operand:VWHB 1 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  mips_expand_vec_reduc (operands[0], operands[1], gen_smax<mode>3);
+  DONE;
+})
+
+(define_expand "reduc_smin_<mode>"
+  [(match_operand:VWHB 0 "register_operand" "")
+   (match_operand:VWHB 1 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  mips_expand_vec_reduc (operands[0], operands[1], gen_smin<mode>3);
+  DONE;
+})
+
+(define_expand "reduc_umax_<mode>"
+  [(match_operand:VB 0 "register_operand" "")
+   (match_operand:VB 1 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  mips_expand_vec_reduc (operands[0], operands[1], gen_umax<mode>3);
+  DONE;
+})
+
+(define_expand "reduc_umin_<mode>"
+  [(match_operand:VB 0 "register_operand" "")
+   (match_operand:VB 1 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  mips_expand_vec_reduc (operands[0], operands[1], gen_umin<mode>3);
+  DONE;
+})
+
 ;; Integer division and modulus.  For integer multiplication, see mips.md.
 
 (define_insn "<u>div<mode>3"
diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h
index 82c8c33..1791ce7 100644
--- a/gcc/config/mips/mips-protos.h
+++ b/gcc/config/mips/mips-protos.h
@@ -330,6 +330,9 @@  extern void mips_expand_atomic_qihi (union mips_gen_fn_ptrs,
 extern void mips_expand_vector_init (rtx, rtx);
 extern bool mips_expand_vec_perm_const (rtx op[4]);
 extern void mips_expand_vec_unpack (rtx op[2], bool, bool);
+extern void mips_expand_vec_reduc (rtx, rtx, rtx (*)(rtx, rtx, rtx));
+extern void mips_expand_vec_minmax (rtx, rtx, rtx,
+				    rtx (*) (rtx, rtx, rtx), bool);
 
 extern bool mips_eh_uses (unsigned int);
 extern bool mips_epilogue_uses (unsigned int);
diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 45b8454..a8f3b26 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -16834,6 +16834,83 @@  mips_expand_vector_init (rtx target, rtx vals)
 
   mips_expand_vi_general (vmode, imode, nelt, nvar, target, vals);
 }
+
+/* Expand a reduction of vector IN into TARGET.  GEN generates the binary
+   operation as (dest, src1, src2).  IN is never written: the multi-step
+   cases accumulate partial results in a fresh pseudo (FOLD) instead.  */
+
+void
+mips_expand_vec_reduc (rtx target, rtx in, rtx (*gen)(rtx, rtx, rtx))
+{
+  enum machine_mode vmode = GET_MODE (in);
+  unsigned char perm2[2] = { 1, 2 };
+  rtx tmp = gen_reg_rtx (vmode);
+  rtx fold = in;
+  bool ok;
+
+  switch (vmode)
+    {
+    case V2SFmode:
+      /* Use PUL/PLU to produce { L, H } op { H, L }; unlike a pure interleave
+         high, this avoids erroneous exceptional conditions from H op H.  */
+      ok = expand_vselect_vconcat (tmp, in, in, perm2, 2);
+      gcc_assert (ok);
+      break;
+
+    case V2SImode:
+      /* Use interleave to produce { H, L } op { H, H }.  */
+      emit_insn (gen_loongson_punpckhwd (tmp, in, in));
+      break;
+
+    case V4HImode:
+      /* First reduction via interleave, subsequent ones via shifts.  */
+      fold = gen_reg_rtx (vmode);
+      emit_insn (gen_loongson_punpckhwd_hi (tmp, in, in));
+      emit_insn (gen (fold, in, tmp));
+      emit_insn (gen_vec_shr_v4hi (tmp, fold, force_reg (SImode, GEN_INT (16))));
+      break;
+
+    case V8QImode:
+      fold = gen_reg_rtx (vmode);
+      emit_insn (gen_loongson_punpckhwd_qi (tmp, in, in));
+      emit_insn (gen (fold, in, tmp));
+      emit_insn (gen_vec_shr_v8qi (tmp, fold, force_reg (SImode, GEN_INT (16))));
+      emit_insn (gen (fold, fold, tmp));
+      emit_insn (gen_vec_shr_v8qi (tmp, fold, force_reg (SImode, GEN_INT (8))));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+  emit_insn (gen (target, fold, tmp));
+}
+
+/* Expand a vector minimum (MIN_P) or maximum (!MIN_P) of OP0 and OP1 into
+   TARGET.  CMP must generate the OP0 > OP1 comparison used as a mask.  */
+
+void
+mips_expand_vec_minmax (rtx target, rtx op0, rtx op1,
+			rtx (*cmp) (rtx, rtx, rtx), bool min_p)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  rtx mask = gen_reg_rtx (vmode);
+  rtx when_gt = gen_reg_rtx (vmode);
+  rtx when_le = gen_reg_rtx (vmode);
+  rtx gt_src = min_p ? op1 : op0;
+  rtx le_src = min_p ? op0 : op1;
+  rtx expr;
+
+  emit_insn (cmp (mask, op0, op1));
+
+  /* TARGET = (MASK & GT_SRC) | (~MASK & LE_SRC), built in three steps.  */
+  expr = gen_rtx_AND (vmode, mask, gt_src);
+  emit_insn (gen_rtx_SET (VOIDmode, when_gt, expr));
+  expr = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, mask), le_src);
+  emit_insn (gen_rtx_SET (VOIDmode, when_le, expr));
+
+  expr = gen_rtx_IOR (vmode, when_gt, when_le);
+  emit_insn (gen_rtx_SET (VOIDmode, target, expr));
+}
 
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP