Patchwork [ARM] Implement widening vector moves and mults.

login
register
mail settings
Submitter tejas belagod
Date Aug. 19, 2010, 2:14 p.m.
Message ID <1282227298.30429.50.camel@e102484-lin.cambridge.arm.com>
Download mbox | patch
Permalink /patch/62160/
State New
Headers show

Comments

tejas belagod - Aug. 19, 2010, 2:14 p.m.
Hi,

Take 2 with the patch!

This patch implements support for vector widening signed and unsigned
moves and multiplications viz. VMOVL.<sign><size> and VMULL.<sign><size>
NEON instructions. This helps vectorize loops whose bodies have widening
moves or multiplications when compiled for NEON. This patch is
implemented to have support for vectorizing with and without
-mvectorize-with-neon-quad. 

Regression tested on trunk. OK for trunk?

--
Tejas Belagod
ARM.

gcc/testsuite

2010-08-19 Tejas Belagod <tejas.belagod@arm.com>

	* lib/target-supports.exp (check_effective_target_vect_unpack):
	Set vect_unpack supported flag to true for neon.

gcc/

2010-08-19 Tejas Belagod <tejas.belagod@arm.com>

	* config/arm/iterators.md (VU, SE, V_widen_l): New. 
	(V_unpack, US): New.
	* config/arm/neon.md (vec_unpack<US>_hi_<mode>): Expansion for
	vmovl.
	(vec_unpack<US>_lo_<mode>): Likewise.
	(neon_vec_unpack<US>_hi_<mode>): Instruction pattern for vmovl.
	(neon_vec_unpack<US>_lo_<mode>): Likewise.
	(vec_widen_<US>mult_lo_<mode>): Expansion for vmull.
	(vec_widen_<US>mult_hi_<mode>): Likewise.
	(neon_vec_<US>mult_lo_<mode>): Instruction pattern for vmull.
	(neon_vec_<US>mult_hi_<mode>): Likewise.
	(neon_unpack<US>_<mode>): Widening move intermediate step for
	vectorizing without -mvectorize-with-neon-quad.
	(neon_vec_<US>mult_<mode>): Widening multiply intermediate step
	for vectorizing without -mvectorize-with-neon-quad.
	* config/arm/predicates.md (vect_par_constant_high): Check for
	high-half lanes of a vector.
	(vect_par_constant_low): Check for low-half lanes of a vector.
Richard Earnshaw - Aug. 24, 2010, 9:14 a.m.
On Thu, 2010-08-19 at 15:14 +0100, Tejas Belagod wrote:
> Hi,
> 
> Take 2 with the patch!
> 
> This patch implements support for vector widening signed and unsigned
> moves and multiplications viz. VMOVL.<sign><size> and VMULL.<sign><size>
> NEON instructions. This helps vectorize loops whose bodies have widening
> moves or multiplications when compiled for NEON. This patch is
> implemented to have support for vectorizing with and without
> -mvectorize-with-neon-quad. 
> 
> Regression tested on trunk. OK for trunk?
> 
> --
> Tejas Belagod
> ARM.
> 
> gcc/testsuite
> 
> 2010-08-19 Tejas Belagod <tejas.belagod@arm.com>
> 
> 	* lib/target-supports.exp (check_effective_target_vect_unpack):
> 	Set vect_unpack supported flag to true for neon.
> 
> gcc/
> 
> 2010-08-19 Tejas Belagod <tejas.belagod@arm.com>
> 
> 	* config/arm/iterators.md (VU, SE, V_widen_l): New. 
> 	(V_unpack, US): New.
> 	* config/arm/neon.md (vec_unpack<US>_hi_<mode>): Expansion for
> 	vmovl.
> 	(vec_unpack<US>_lo_<mode>): Likewise.
> 	(neon_vec_unpack<US>_hi_<mode>): Instruction pattern for vmovl.
> 	(neon_vec_unpack<US>_lo_<mode>): Likewise.
> 	(vec_widen_<US>mult_lo_<mode>): Expansion for vmull.
> 	(vec_widen_<US>mult_hi_<mode>): Likewise.
> 	(neon_vec_<US>mult_lo_<mode>"): Instruction pattern for vmull.
> 	(neon_vec_<US>mult_hi_<mode>"): Likewise.
> 	(neon_unpack<US>_<mode>): Widening move intermediate step for
> 	vectorizing without -mvectorize-with-neon-quad.
> 	(neon_vec_<US>mult_<mode>): Widening multiply intermediate step
> 	for vectorizing without -mvectorize-with-neon-quad.
> 	* config/arm/predicates.md (vect_par_constant_high): Check for
> 	high-half lanes of a vector.
> 	(vect_par_constant_low): Check for low-half lanes of a vector.

+;; Assembler mnemonics for signedness of widening operations
Full stop at end of sentence.


+;; Predicates for parallel expanders based on mode.
+(define_special_predicate "vect_par_constant_high" 
+  (match_code "parallel")
+{
[...]
+
+  for (i = 0; i < count; i++)
+   {
+     rtx elt = XVECEXP (op, 0, i);
+     int val = INTVAL (elt);

It's unlikely that this will ever fault, as the uses of this predicate
are fairly limited, but good coding practice says that you shouldn't
assume that.  So you need to confirm that elt is a const_int before
extracting its value (and if it's not the predicate fails to match).
Similarly for vect_par_constant_low.

Otherwise, this is OK.

R.
Ramana Radhakrishnan - Aug. 25, 2010, 7:25 a.m.
Hi Tejas,

> 
> gcc/testsuite
> 
> 2010-08-19 Tejas Belagod <tejas.belagod@arm.com>

Please remember 2 spaces between date and name and 2 spaces between name
and email address in Changelog entries.  I've changed this and committed
now.

cheers
Ramana

Patch

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index ee04aab..3717f6b 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -136,7 +136,9 @@ 
 ;; Modes with 32-bit elements only.
 (define_mode_iterator V32 [V2SI V2SF V4SI V4SF])
 
-
+;; 128-bit integer vector modes with 8-bit, 16-bit and 32-bit elements.
+(define_mode_iterator VU [V16QI V8HI V4SI])
+ 
 ;;----------------------------------------------------------------------------
 ;; Code iterators
 ;;----------------------------------------------------------------------------
@@ -156,6 +158,8 @@ 
 ;; without unsigned variants (for use with *SFmode pattern).
 (define_code_iterator vqhs_ops [plus smin smax])
 
+;; The widening (extension) codes: sign_extend and zero_extend.
+(define_code_iterator SE [sign_extend zero_extend])
 
 ;;----------------------------------------------------------------------------
 ;; Mode attributes
@@ -360,6 +364,11 @@ 
                                  (V2SF "2") (V4SF "4")
                                  (DI "1")   (V2DI "2")])
 
+;; Same as V_widen, but lower-case (for splicing into gen_* function names).
+(define_mode_attr V_widen_l [(V8QI "v8hi") (V4HI "v4si") ( V2SI "v2di")])
+
+;; Unpack result mode: half the number of elements, each of double width.
+(define_mode_attr V_unpack   [(V16QI "V8HI") (V8HI "V4SI") (V4SI "V2DI")])
 
 ;;----------------------------------------------------------------------------
 ;; Code attributes
@@ -375,3 +384,6 @@ 
 
 (define_code_attr cnb [(ltu "CC_C") (geu "CC")])
 (define_code_attr optab [(ltu "ltu") (geu "geu")])
+
+;; Assembler mnemonics for signedness of widening operations.
+(define_code_attr US [(sign_extend "s") (zero_extend "u")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index bdc279a..3b75a18 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -4977,3 +4977,205 @@ 
   emit_insn (gen_orn<mode>3_neon (operands[0], operands[1], operands[2]));
   DONE;
 })
+
+(define_insn "neon_vec_unpack<US>_lo_<mode>"	; sign/zero-extend the low-half lanes of a VU vector
+  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+        (SE:<V_unpack> (vec_select:<V_HALF>
+			  (match_operand:VU 1 "register_operand" "w")
+			  (match_operand:VU 2 "vect_par_constant_low" ""))))]	; operand 2: PARALLEL listing the low-half lane numbers
+  "TARGET_NEON"
+  "vmovl.<US><V_sz_elem> %q0, %e1"	; NOTE(review): %e1 presumably prints the low D half of operand 1 -- confirm
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_insn "neon_vec_unpack<US>_hi_<mode>"	; sign/zero-extend the high-half lanes of a VU vector
+  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+        (SE:<V_unpack> (vec_select:<V_HALF>
+			  (match_operand:VU 1 "register_operand" "w")
+			  (match_operand:VU 2 "vect_par_constant_high" ""))))]	; operand 2: PARALLEL listing the high-half lane numbers
+  "TARGET_NEON"
+  "vmovl.<US><V_sz_elem> %q0, %f1"	; NOTE(review): %f1 presumably prints the high D half of operand 1 -- confirm
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_expand "vec_unpack<US>_hi_<mode>"
+  [(match_operand:<V_unpack> 0 "register_operand" "")
+   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
+  "TARGET_NEON"
+{
+  /* Build the PARALLEL selecting the high-half lanes, then emit the insn.  */
+  const int half = <V_mode_nunits> / 2;
+  rtvec lanes = rtvec_alloc (half);
+  rtx sel;
+  int lane;
+
+  for (lane = 0; lane < half; lane++)
+    RTVEC_ELT (lanes, lane) = GEN_INT (half + lane);
+  sel = gen_rtx_PARALLEL (<MODE>mode, lanes);
+  emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0], operands[1], sel));
+  DONE;
+}
+)
+
+(define_expand "vec_unpack<US>_lo_<mode>"
+  [(match_operand:<V_unpack> 0 "register_operand" "")
+   (SE:<V_unpack> (match_operand:VU 1 "register_operand" ""))]
+  "TARGET_NEON"
+{
+  /* Build the PARALLEL selecting the low-half lanes, then emit the insn.  */
+  const int half = <V_mode_nunits> / 2;
+  rtvec lanes = rtvec_alloc (half);
+  rtx sel;
+  int lane;
+  for (lane = 0; lane < half; lane++)
+    RTVEC_ELT (lanes, lane) = GEN_INT (lane);
+  sel = gen_rtx_PARALLEL (<MODE>mode, lanes);
+  emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0], operands[1], sel));
+  DONE;
+}
+)
+
+(define_insn "neon_vec_<US>mult_lo_<mode>"	; widening multiply of the low-half lanes of two VU vectors
+ [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+       (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
+			   (match_operand:VU 1 "register_operand" "w") 
+                           (match_operand:VU 2 "vect_par_constant_low" "")))	; operand 2: PARALLEL listing the low-half lane numbers
+ 		        (SE:<V_unpack> (vec_select:<V_HALF>
+                           (match_operand:VU 3 "register_operand" "w") 
+                           (match_dup 2)))))]	; both inputs use the same lane selection
+  "TARGET_NEON"
+  "vmull.<US><V_sz_elem> %q0, %e1, %e3"	; NOTE(review): %e presumably prints the low D half of a Q register -- confirm
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_expand "vec_widen_<US>mult_lo_<mode>"
+  [(match_operand:<V_unpack> 0 "register_operand" "")
+   (SE:<V_unpack> (match_operand:VU 1 "register_operand" ""))
+   (SE:<V_unpack> (match_operand:VU 2 "register_operand" ""))]
+  "TARGET_NEON"
+{
+  /* Select the low-half lanes of both inputs and multiply them widening.  */
+  const int half = <V_mode_nunits> / 2;
+  rtvec lanes = rtvec_alloc (half);
+  rtx sel;
+  int lane;
+
+  for (lane = 0; lane < half; lane++)
+    RTVEC_ELT (lanes, lane) = GEN_INT (lane);
+  sel = gen_rtx_PARALLEL (<MODE>mode, lanes);
+  emit_insn (gen_neon_vec_<US>mult_lo_<mode> (operands[0], operands[1], sel,
+					      operands[2]));
+  DONE;
+}
+)
+
+(define_insn "neon_vec_<US>mult_hi_<mode>"	; widening multiply of the high-half lanes of two VU vectors
+ [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+      (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
+			    (match_operand:VU 1 "register_operand" "w") 
+			    (match_operand:VU 2 "vect_par_constant_high" "")))	; operand 2: PARALLEL listing the high-half lane numbers
+		       (SE:<V_unpack> (vec_select:<V_HALF>
+			    (match_operand:VU 3 "register_operand" "w") 
+			    (match_dup 2)))))]	; both inputs use the same lane selection
+  "TARGET_NEON"
+  "vmull.<US><V_sz_elem> %q0, %f1, %f3"	; NOTE(review): %f presumably prints the high D half of a Q register -- confirm
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_expand "vec_widen_<US>mult_hi_<mode>"
+  [(match_operand:<V_unpack> 0 "register_operand" "")
+   (SE:<V_unpack> (match_operand:VU 1 "register_operand" ""))
+   (SE:<V_unpack> (match_operand:VU 2 "register_operand" ""))]
+  "TARGET_NEON"
+{
+  /* Select the high-half lanes of both inputs and multiply them widening.  */
+  const int half = <V_mode_nunits> / 2;
+  rtvec lanes = rtvec_alloc (half);
+  rtx sel;
+  int lane;
+
+  for (lane = 0; lane < half; lane++)
+    RTVEC_ELT (lanes, lane) = GEN_INT (half + lane);
+  sel = gen_rtx_PARALLEL (<MODE>mode, lanes);
+
+  emit_insn (gen_neon_vec_<US>mult_hi_<mode> (operands[0], operands[1], sel,
+					      operands[2]));
+  DONE;
+}
+)
+
+;; Vectorize for non-neon-quad case: widen a whole VDI vector into <V_widen>.
+(define_insn "neon_unpack<US>_<mode>"
+  [(set (match_operand:<V_widen> 0 "register_operand" "=w")
+        (SE:<V_widen> (match_operand:VDI 1 "register_operand" "w")))]	; "w": an empty constraint would let reload pick any register class
+  "TARGET_NEON"
+  "vmovl.<US><V_sz_elem> %q0, %P1"	; %P prints operand 1 as a D register (dN), which vmovl requires
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_expand "vec_unpack<US>_lo_<mode>"
+  [(match_operand:<V_double_width> 0 "register_operand" "")
+   (SE:<V_double_width> (match_operand:VDI 1 "register_operand"))]
+  "TARGET_NEON"
+{
+  /* Widen the whole input into a temporary, then keep its low half.  */
+  rtx widened = gen_reg_rtx (<V_widen>mode);
+  emit_insn (gen_neon_unpack<US>_<mode> (widened, operands[1]));
+  emit_insn (gen_neon_vget_low<V_widen_l> (operands[0], widened));
+  DONE;
+}
+)
+
+(define_expand "vec_unpack<US>_hi_<mode>"
+  [(match_operand:<V_double_width> 0 "register_operand" "")
+   (SE:<V_double_width> (match_operand:VDI 1 "register_operand"))]
+  "TARGET_NEON"
+{
+  /* Widen the whole input into a temporary, then keep its high half.  */
+  rtx widened = gen_reg_rtx (<V_widen>mode);
+  emit_insn (gen_neon_unpack<US>_<mode> (widened, operands[1]));
+  emit_insn (gen_neon_vget_high<V_widen_l> (operands[0], widened));
+  DONE;
+}
+)
+
+(define_insn "neon_vec_<US>mult_<mode>"	; widening multiply of two whole VDI vectors
+  [(set (match_operand:<V_widen> 0 "register_operand" "=w")
+        (mult:<V_widen> (SE:<V_widen>
+			   (match_operand:VDI 1 "register_operand" "w"))
+			 (SE:<V_widen>
+			   (match_operand:VDI 2 "register_operand" "w"))))]
+  "TARGET_NEON"
+  "vmull.<US><V_sz_elem> %q0, %P1, %P2"	; %P prints the inputs as D registers (dN), which vmull requires
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_expand "vec_widen_<US>mult_hi_<mode>"
+  [(match_operand:<V_double_width> 0 "register_operand" "")
+   (SE:<V_double_width> (match_operand:VDI 1 "register_operand" ""))
+   (SE:<V_double_width> (match_operand:VDI 2 "register_operand" ""))]
+  "TARGET_NEON"
+{
+  /* Multiply widening into a temporary, then keep the high half of it.  */
+  rtx product = gen_reg_rtx (<V_widen>mode);
+
+  emit_insn (gen_neon_vec_<US>mult_<mode> (product, operands[1], operands[2]));
+  emit_insn (gen_neon_vget_high<V_widen_l> (operands[0], product));
+  DONE;
+}
+)
+
+(define_expand "vec_widen_<US>mult_lo_<mode>"
+  [(match_operand:<V_double_width> 0 "register_operand" "")
+   (SE:<V_double_width> (match_operand:VDI 1 "register_operand" ""))
+   (SE:<V_double_width> (match_operand:VDI 2 "register_operand" ""))]
+  "TARGET_NEON"
+{
+  /* Multiply widening into a temporary, then keep the low half of it.  */
+  rtx product = gen_reg_rtx (<V_widen>mode);
+
+  emit_insn (gen_neon_vec_<US>mult_<mode> (product, operands[1], operands[2]));
+  emit_insn (gen_neon_vget_low<V_widen_l> (operands[0], product));
+  DONE;
+}
+)
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index da3b6dc..4e12670 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -619,3 +619,52 @@ 
 		(and (match_test "TARGET_32BIT")
 		     (match_operand 0 "arm_di_operand"))))
 
+;; Predicates for parallel expanders based on mode.
+;; Match a PARALLEL of CONST_INTs naming, in order, the high-half lanes of MODE.
+(define_special_predicate "vect_par_constant_high" 
+  (match_code "parallel")
+{
+  HOST_WIDE_INT count = XVECLEN (op, 0);
+  int i;
+  int base = GET_MODE_NUNITS (mode);
+
+  if (count < 1 || count != base / 2)
+    return false;
+
+  if (!VECTOR_MODE_P (mode))
+    return false;
+
+  for (i = 0; i < count; i++)
+    {
+      rtx elt = XVECEXP (op, 0, i);
+      /* Reject non-constant selectors before INTVAL reads them.  */
+      if (GET_CODE (elt) != CONST_INT || INTVAL (elt) != (base / 2) + i)
+	return false;
+    }
+  return true;
+})
+
+;; Match a PARALLEL of CONST_INTs naming, in order, the low-half lanes of MODE.
+(define_special_predicate "vect_par_constant_low"
+  (match_code "parallel")
+{
+  HOST_WIDE_INT count = XVECLEN (op, 0);
+  int i;
+  int base = GET_MODE_NUNITS (mode);
+
+  if (count < 1 || count != base / 2)
+    return false;
+
+  if (!VECTOR_MODE_P (mode))
+    return false;
+
+  for (i = 0; i < count; i++)
+    {
+      rtx elt = XVECEXP (op, 0, i);
+      /* Reject non-constant selectors before INTVAL reads them.  */
+      if (GET_CODE (elt) != CONST_INT || INTVAL (elt) != i)
+	return false;
+    }
+  return true;
+})
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 1682d58..4b95323 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2640,7 +2640,8 @@  proc check_effective_target_vect_unpack { } {
         if { ([istarget powerpc*-*-*] && ![istarget powerpc-*paired*])
              || [istarget i?86-*-*]
              || [istarget x86_64-*-*] 
-             || [istarget spu-*-*] } {
+             || [istarget spu-*-*]
+             || ([istarget arm*-*-*] && [check_effective_target_arm_neon]) } {
             set et_vect_unpack_saved 1
         }
     }