Patchwork [ARM] Fix pr55073

login
register
mail settings
Submitter Richard Earnshaw
Date Nov. 29, 2012, 5:53 p.m.
Message ID <50B7A103.8000701@arm.com>
Download mbox | patch
Permalink /patch/202800/
State New
Headers show

Comments

Richard Earnshaw - Nov. 29, 2012, 5:53 p.m.
PR 55073 is a case where scheduling appears to mess up the order of
instructions to the extent that they no-longer give the correct results.
   However, looking at the patterns, I think they are ill-defined.  IIRC
tied operands should tie a source to a destination, rather than a
destination to a source.  Failing to get this right can essentially
result in an input value being clobbered and the compiler failing to
detect this.

Because all the patterns that appear to have this violation are named
patterns that are called during expand (mainly of intrinsics), it's not
trivial to simply rename the operands, or we get bogus rtl.  Instead,
I've taken the approach of splitting expand from match.

gcc:

	* arm/neon.md (neon_vtrn<mode>_internal): Split into expand
	and insn patterns.  Re-order insn arguments to tie inputs to
	outputs.
	(neon_vzip<mode>_internal): Likewise.
	(neon_vuzp<mode>_internal): Likewise.

testsuite:

	* gcc.target/arm/pr55073.C: New test.

Patch

--- config/arm/neon.md	(revision 193005)
+++ config/arm/neon.md	(local)
@@ -4225,16 +4225,29 @@  (define_insn "neon_vtbx4v8qi"
   [(set_attr "neon_type" "neon_bp_3cycle")]
 )
 
-(define_insn "neon_vtrn<mode>_internal"
+(define_expand "neon_vtrn<mode>_internal"
+  [(parallel
+    [(set (match_operand:VDQW 0 "s_register_operand" "")
+	  (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "")
+			(match_operand:VDQW 2 "s_register_operand" "")]
+	   UNSPEC_VTRN1))
+     (set (match_operand:VDQW 3 "s_register_operand" "")
+          (unspec:VDQW [(match_dup 1) (match_dup 2)] UNSPEC_VTRN2))])]
+  "TARGET_NEON"
+  ""
+)
+
+;; Note: Different operand numbering to handle tied registers correctly.
+(define_insn "*neon_vtrn<mode>_insn"
   [(set (match_operand:VDQW 0 "s_register_operand" "=w")
         (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
-                      (match_operand:VDQW 2 "s_register_operand" "w")]
+                      (match_operand:VDQW 3 "s_register_operand" "2")]
                      UNSPEC_VTRN1))
-   (set (match_operand:VDQW 3 "s_register_operand" "=2")
-         (unspec:VDQW [(match_dup 1) (match_dup 2)]
+   (set (match_operand:VDQW 2 "s_register_operand" "=w")
+         (unspec:VDQW [(match_dup 1) (match_dup 3)]
                      UNSPEC_VTRN2))]
   "TARGET_NEON"
-  "vtrn.<V_sz_elem>\t%<V_reg>0, %<V_reg>3"
+  "vtrn.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
   [(set (attr "neon_type")
       (if_then_else (match_test "<Is_d_reg>")
                     (const_string "neon_bp_simple")
@@ -4252,16 +4265,29 @@  (define_expand "neon_vtrn<mode>"
   DONE;
 })
 
-(define_insn "neon_vzip<mode>_internal"
+(define_expand "neon_vzip<mode>_internal"
+  [(parallel
+    [(set (match_operand:VDQW 0 "s_register_operand" "")
+	  (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "")
+	  	        (match_operand:VDQW 2 "s_register_operand" "")]
+		       UNSPEC_VZIP1))
+    (set (match_operand:VDQW 3 "s_register_operand" "")
+	 (unspec:VDQW [(match_dup 1) (match_dup 2)] UNSPEC_VZIP2))])]
+  "TARGET_NEON"
+  ""
+)
+
+;; Note: Different operand numbering to handle tied registers correctly.
+(define_insn "*neon_vzip<mode>_insn"
   [(set (match_operand:VDQW 0 "s_register_operand" "=w")
         (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
-                      (match_operand:VDQW 2 "s_register_operand" "w")]
+                      (match_operand:VDQW 3 "s_register_operand" "2")]
                      UNSPEC_VZIP1))
-   (set (match_operand:VDQW 3 "s_register_operand" "=2")
-        (unspec:VDQW [(match_dup 1) (match_dup 2)]
+   (set (match_operand:VDQW 2 "s_register_operand" "=w")
+        (unspec:VDQW [(match_dup 1) (match_dup 3)]
                      UNSPEC_VZIP2))]
   "TARGET_NEON"
-  "vzip.<V_sz_elem>\t%<V_reg>0, %<V_reg>3"
+  "vzip.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
   [(set (attr "neon_type")
       (if_then_else (match_test "<Is_d_reg>")
                     (const_string "neon_bp_simple")
@@ -4279,16 +4305,29 @@  (define_expand "neon_vzip<mode>"
   DONE;
 })
 
-(define_insn "neon_vuzp<mode>_internal"
+(define_expand "neon_vuzp<mode>_internal"
+  [(parallel
+    [(set (match_operand:VDQW 0 "s_register_operand" "")
+	  (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "")
+			(match_operand:VDQW 2 "s_register_operand" "")]
+	   UNSPEC_VUZP1))
+     (set (match_operand:VDQW 3 "s_register_operand" "")
+	  (unspec:VDQW [(match_dup 1) (match_dup 2)] UNSPEC_VUZP2))])]
+  "TARGET_NEON"
+  ""
+)
+
+;; Note: Different operand numbering to handle tied registers correctly.
+(define_insn "*neon_vuzp<mode>_insn"
   [(set (match_operand:VDQW 0 "s_register_operand" "=w")
         (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
-                      (match_operand:VDQW 2 "s_register_operand" "w")]
+                      (match_operand:VDQW 3 "s_register_operand" "2")]
                      UNSPEC_VUZP1))
-   (set (match_operand:VDQW 3 "s_register_operand" "=2")
-        (unspec:VDQW [(match_dup 1) (match_dup 2)]
+   (set (match_operand:VDQW 2 "s_register_operand" "=w")
+        (unspec:VDQW [(match_dup 1) (match_dup 3)]
                      UNSPEC_VUZP2))]
   "TARGET_NEON"
-  "vuzp.<V_sz_elem>\t%<V_reg>0, %<V_reg>3"
+  "vuzp.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
   [(set (attr "neon_type")
       (if_then_else (match_test "<Is_d_reg>")
                     (const_string "neon_bp_simple")
--- testsuite/gcc.target/arm/pr55073.C	(revision 193005)
+++ testsuite/gcc.target/arm/pr55073.C	(local)
@@ -0,0 +1,74 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+#include <stdlib.h>
+
+struct __attribute__((aligned(16))) _v16u8_ {
+	uint8x16_t val;
+	_v16u8_() { }
+
+	_v16u8_( const  uint8x16_t &src) { val = src; }
+	_v16u8_( const   int16x8_t &src) { val = vreinterpretq_u8_s16(src); }
+	_v16u8_( const  uint32x4_t &src) { val = vreinterpretq_u8_u32(src); }
+
+	operator  uint8x16_t () const { return val; }
+	operator   int8x16_t () const { return vreinterpretq_s8_u8 (val); }
+	operator   int16x8_t () const { return vreinterpretq_s16_u8(val); }
+	operator  uint32x4_t () const { return vreinterpretq_u32_u8(val); }
+	operator   int32x4_t () const { return vreinterpretq_s32_u8(val); }
+};
+typedef struct _v16u8_ v16u8;
+typedef const v16u8 cv16u8;
+
+typedef v16u8 v16i8;
+typedef v16u8 v8i16;
+typedef v16u8 v4u32;
+
+inline v16u8 __attribute__((always_inline)) mergelo( const v16u8 & s, const v16u8 & t )
+{
+	uint8x8x2_t r = vzip_u8( vget_low_u8(s), vget_low_u8(t) );
+	return vcombine_u8( r.val[0], r.val[1] );
+}
+
+inline v8i16 __attribute__((always_inline)) unpacklo(const v16i8 & s)
+{
+	return vmovl_s8( vget_low_s8( s ) );
+}
+
+const uint32_t __attribute__((aligned(16))) _InA [4] = { 0xFF020001, 0xFF020001, 0xFF000101, 0xFF000101 } ;
+const uint32_t __attribute__((aligned(16))) _InB [4] = { 0xFF050002, 0xFF050002, 0xFF000303, 0xFF000203 } ;
+
+__attribute__((noinline)) v16i8 test_func(void)
+{
+	v16u8 A = vld1q_u8( (uint8_t*) _InA );
+	v16u8 B = vld1q_u8( (uint8_t*) _InB );
+	v8i16 r   = vdupq_n_s16(2);
+
+	v16u8 _0 = mergelo( A, B );
+	v16u8 _1 = mergelo( B, A );
+
+	v16u8 _2 = mergelo( _0, _1 );
+	v16u8 _3 = mergelo( _1, _0 );
+
+	v8i16 _4 = vsubq_s16( unpacklo( _2 ), r );
+	v8i16 _5 = vsubq_s16( unpacklo( _3 ), r );
+
+	v8i16 ret = vaddq_s16( _4, _5 );
+
+	return ( ret );
+}
+
+int main (int argc, char **argv)
+{
+	v16u8 val = test_func();
+
+	if (vgetq_lane_u32( val, 0 ) != 0xffffffff
+	    || vgetq_lane_u32( val, 1 ) != 0xffffffff
+	    || vgetq_lane_u32( val, 2 ) != 0xfffcfffc
+	    || vgetq_lane_u32( val, 3 ) != 0xfffcfffc)
+	  abort ();
+	exit (0);
+}