From patchwork Thu Jul 3 09:53:25 2014
X-Patchwork-Submitter: Evgeny Stupachenko
X-Patchwork-Id: 366814
In-Reply-To: <537A23F7.2060601@redhat.com>
References: <535E909A.7040205@redhat.com> <535EC233.7000500@redhat.com> <537A23F7.2060601@redhat.com>
Date: Thu, 3 Jul 2014 13:53:25 +0400
Subject: Re: [PATCH 1/2, x86] Add palignr support for AVX2.
From: Evgeny Stupachenko
To: Richard Henderson
Cc: GCC Patches, Richard Biener, Uros Bizjak

The "expand_vec_perm_palignr" scheme is similar for the SSSE3 and AVX2
cases, but AVX2 requires more instructions to complete it.  The patch
below adds an AVX2 path that takes six instructions, leaving the SSSE3
path at two.  Is it ok?
   min = nelt, max = 0;
@@ -43168,9 +43183,34 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
   dcopy = *d;
   shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
-  target = gen_reg_rtx (TImode);
-  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
-				  gen_lowpart (TImode, d->op0), shift));
+  shift1 = GEN_INT ((min - nelt / 2)
+		    * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+
+  if (GET_MODE_SIZE (d->vmode) != 32)
+    {
+      target = gen_reg_rtx (TImode);
+      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
+				      gen_lowpart (TImode, d->op0), shift));
+    }
+  else
+    {
+      target = gen_reg_rtx (V2TImode);
+      tmp = gen_reg_rtx (V4DImode);
+      emit_insn (gen_avx2_permv2ti (tmp,
+				    gen_lowpart (V4DImode, d->op0),
+				    gen_lowpart (V4DImode, d->op1),
+				    GEN_INT (33)));
+      if (min < nelt / 2)
+	emit_insn (gen_avx2_palignrv2ti (target,
+					 gen_lowpart (V2TImode, tmp),
+					 gen_lowpart (V2TImode, d->op0),
+					 shift));
+      else
+	emit_insn (gen_avx2_palignrv2ti (target,
+					 gen_lowpart (V2TImode, d->op1),
+					 gen_lowpart (V2TImode, tmp),
+					 shift1));
+    }

   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
   dcopy.one_operand_p = true;
@@ -43192,9 +43232,22 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
       return true;
     }

-  ok = expand_vec_perm_1 (&dcopy);
-  gcc_assert (ok);
-
+  /* For SSSE3 we need 1 instruction for palignr plus 1 for the one
+     operand permutation.  */
+  if (insn_num == 2)
+    {
+      ok = expand_vec_perm_1 (&dcopy);
+      gcc_assert (ok);
+    }
+  /* For AVX2 we need 2 instructions for the shift: vpalignr and
+     vperm plus 4 instructions for the one operand permutation.  */
+  else if (insn_num == 6)
+    {
+      ok = expand_vec_perm_vpshufb2_vpermq (&dcopy);
+      gcc_assert (ok);
+    }
+  else
+    ok = false;
   return ok;
 }
@@ -44627,7 +44680,7 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
       if (expand_vec_perm_pshuflw_pshufhw (d))
	return true;
-      if (expand_vec_perm_palignr (d))
+      if (expand_vec_perm_palignr (d, 2))
	return true;
       if (expand_vec_perm_interleave2 (d))
@@ -44680,6 +44733,10 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_even_odd (d))
     return true;
+  /* Try sequences of six instructions.  */
+  if (expand_vec_perm_palignr (d, 6))
+    return true;
+
   /* Even longer sequences.  */
   if (expand_vec_perm_vpshufb4_vpermq2 (d))
     return true;

On Mon, May 19, 2014 at 7:32 PM, Richard Henderson wrote:
> On 05/05/2014 09:49 AM, Evgeny Stupachenko wrote:
>> @@ -42946,6 +42948,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
>>    if (expand_vec_perm_pshufb (d))
>>      return true;
>>
>> +  /* Try the AVX2 vpshufb.  */
>> +  if (expand_vec_perm_vpshufb2_vpermq (d))
>> +    return true;
>
> Why is this here?  It doesn't expand to 1 insn, which is
> what expand_vec_perm_1 is intended to check.
>
> It's already called from ix86_expand_vec_perm_const_1.
> If things need to be shuffled around in that function,
> then that's the right place to do so.
>
> It's also clearly unrelated to "palignr", so has no
> business at all within this patch.
>
>> -  min = nelt, max = 0;
>> +  min = 2 * nelt, max = 0;
>
> This change to min is wrong for this patch.  It probably
> belonged in your 2/2 patch.
>
>> +      shift1 = GEN_INT ((min - nelt / 2) *
>> +                        GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
>
> Coding convention sez
>
>   shift1 = GEN_INT ((min - nelt / 2)
>                     * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
>
>
> r~

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 2cffcef..70fc832 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -43130,23 +43130,38 @@ expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
   return true;
 }

+static bool
+expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
-   the permutation using the SSSE3 palignr instruction.  This succeeds
+   the permutation using the SSSE3/AVX2 palignr instruction.  This succeeds
    when all of the elements in PERM fit within one vector and we merely
    need to shift them down so that a single vector permutation has a
    chance to succeed.  */

 static bool
-expand_vec_perm_palignr (struct expand_vec_perm_d *d)
+expand_vec_perm_palignr (struct expand_vec_perm_d *d, int insn_num)
 {
   unsigned i, nelt = d->nelt;
   unsigned min, max;
   bool in_order, ok;
-  rtx shift, target;
+  rtx shift, shift1, target, tmp;
   struct expand_vec_perm_d dcopy;

-  /* Even with AVX, palignr only operates on 128-bit vectors.  */
-  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+  /* SSSE3 is required to apply PALIGNR on 16-byte operands.  */
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      if (!TARGET_SSSE3)
+	return false;
+    }
+  /* AVX2 is required to apply PALIGNR on 32-byte operands.  */
+  else if (GET_MODE_SIZE (d->vmode) == 32)
+    {
+      if (!TARGET_AVX2)
+	return false;
+    }
+  /* Other sizes are not supported.  */
+  else
    return false;