From patchwork Mon Jun  6 13:40:55 2016
X-Patchwork-Submitter: Jiong Wang
X-Patchwork-Id: 630908
Delivered-To: mailing list gcc-patches@gcc.gnu.org
From: Jiong Wang
Subject: [v2][AArch64, 6/6] Reimplement vpadd intrinsics & extend rtl patterns to all modes
To: James Greenhalgh
Cc: GCC Patches
Message-ID: <758393e5-3257-7aab-8704-592aaafc1917@foss.arm.com>
Date: Mon, 6 Jun 2016 14:40:55 +0100
In-Reply-To: <1017fc5b-389d-ab41-24bd-491fff8e1a81@foss.arm.com>

These intrinsics were implemented through inline assembly using the "faddp"
instruction.  There was a pattern "aarch64_addpv4sf" which supports the V4SF
mode only, while we can extend this pattern to support all modes in VDQF and
then reimplement these intrinsics through builtins.

gcc/
2016-06-06  Jiong Wang

        * config/aarch64/aarch64-simd-builtins.def (faddp): New builtins
        for modes in VDQF.
        * config/aarch64/aarch64-simd.md (aarch64_faddp<mode>): New.
        (aarch64_addpv4sf): Delete.
        (reduc_plus_scal_v4sf): Use "gen_aarch64_faddpv4sf" instead of
        "gen_aarch64_addpv4sf".
        * config/aarch64/arm_neon.h (vpadd_f32): Remove inline assembly.
        Use builtin.
        (vpadds_f32): Likewise.
        (vpaddq_f32): Likewise.
        (vpaddq_f64): Likewise.

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index deab3450ab74fcd6dfcf8267fa9cedfc1423ca4e..1348e7c198763b24d092f774a0ff25e4d0fd1787 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -468,3 +468,6 @@
 
   /* Implemented by fabd<mode>3.  */
   BUILTIN_VALLF (BINOP, fabd, 3)
+
+  /* Implemented by aarch64_faddp<mode>.  */
+  BUILTIN_VDQF (BINOP, faddp, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ad8b9c1d0c155d022be2e7e7c426120b551f3f2b..f8d3e766a53736a4b87ba016caccd085eb793bda 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1992,6 +1992,16 @@
 }
 )
 
+(define_insn "aarch64_faddp<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+	(unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")
+		      (match_operand:VDQF 2 "register_operand" "w")]
+		      UNSPEC_FADDV))]
+  "TARGET_SIMD"
+  "faddp\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
+)
+
 (define_insn "aarch64_reduc_plus_internal<mode>"
   [(set (match_operand:VDQV 0 "register_operand" "=w")
 	(unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
@@ -2019,15 +2029,6 @@
   [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
 )
 
-(define_insn "aarch64_addpv4sf"
-  [(set (match_operand:V4SF 0 "register_operand" "=w")
-	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "w")]
-		     UNSPEC_FADDV))]
-  "TARGET_SIMD"
-  "faddp\\t%0.4s, %1.4s, %1.4s"
-  [(set_attr "type" "neon_fp_reduc_add_s_q")]
-)
-
 (define_expand "reduc_plus_scal_v4sf"
   [(set (match_operand:SF 0 "register_operand")
 	(unspec:V4SF [(match_operand:V4SF 1 "register_operand")]
@@ -2036,8 +2037,8 @@
 {
   rtx elt = GEN_INT (ENDIAN_LANE_N (V4SFmode, 0));
   rtx scratch = gen_reg_rtx (V4SFmode);
-  emit_insn (gen_aarch64_addpv4sf (scratch, operands[1]));
-  emit_insn (gen_aarch64_addpv4sf (scratch, scratch));
+  emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
+  emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
   emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
   DONE;
 })
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 9e966e47789646ed968a081c1fc4cb76b45537af..13a4ab80cf7b0470d8ec8b07e0ed1988f8f4e66d 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8225,17 +8225,6 @@ vpadalq_u32 (uint64x2_t a, uint32x4_t b)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpadd_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("faddp %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vpaddl_s8 (int8x8_t a)
 {
@@ -8368,28 +8357,6 @@ vpaddlq_u32 (uint32x4_t a)
   return result;
 }
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpaddq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("faddp %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpaddq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("faddp %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vpaddq_s8 (int8x16_t a, int8x16_t b)
 {
@@ -8478,17 +8445,6 @@ vpaddq_u64 (uint64x2_t a, uint64x2_t b)
   return result;
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vpadds_f32 (float32x2_t a)
-{
-  float32_t result;
-  __asm__ ("faddp %s0,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vqdmulh_n_s16 (int16x4_t a, int16_t b)
 {
@@ -18625,6 +18581,24 @@ vnegq_s64 (int64x2_t __a)
 
 /* vpadd  */
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vpadd_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_faddpv2sf (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vpaddq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_faddpv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vpaddq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_faddpv2df (__a, __b);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vpadd_s8 (int8x8_t __a, int8x8_t __b)
 {
@@ -18664,6 +18638,12 @@ vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
				  (int32x2_t) __b);
 }
 
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vpadds_f32 (float32x2_t __a)
+{
+  return __builtin_aarch64_reduc_plus_scal_v2sf (__a);
+}
+
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vpaddd_f64 (float64x2_t __a)
 {