Patchwork [AArch64] Add zip{1, 2}, uzp{1, 2}, trn{1, 2} support for vector permute.

login
register
mail settings
Submitter James Greenhalgh
Date Dec. 4, 2012, 10:36 a.m.
Message ID <1354617390-32181-1-git-send-email-james.greenhalgh@arm.com>
Download mbox | patch
Permalink /patch/203595/
State New
Headers show

Comments

James Greenhalgh - Dec. 4, 2012, 10:36 a.m.
Hi,

This patch improves our code generation for some cases of
constant vector permutation. In particular, we are able to
generate better code for patterns which match the output
of the zip, uzp and trn instructions.

This patch adds support for these cases.

This patch has been tested with no regressions on
aarch64-none-elf.

OK to commit?

Thanks,
James Greenhalgh

---
gcc/
2012-12-04  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-simd-builtins.def: Add new builtins.
	* config/aarch64/aarch64-simd.md (simd_type): Add uzp.
	(aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>): New.
	* config/aarch64/aarch64.c (aarch64_evpc_trn): New.
	(aarch64_evpc_uzp): Likewise.
	(aarch64_evpc_zip): Likewise.
	(aarch64_expand_vec_perm_const_1): Check for trn, zip, uzp patterns.
	* config/aarch64/iterators.md (unspec): Add necessary unspecs.
	(PERMUTE): New.
	(perm_insn): Likewise.
	(perm_hilo): Likewise.
Marcus Shawcroft - Dec. 4, 2012, 10:45 p.m.
OK
/Marcus

On 4 December 2012 10:36, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>
> Hi,
>
> This patch improves our code generation for some cases of
> constant vector permutation. In particular, we are able to
> generate better code for patterns which match the output
> of the zip, uzp and trn instructions.
>
> This patch adds support for these cases.
>
> This patch has been tested with no regressions on
> aarch64-none-elf.
>
> OK to commit?
>
> Thanks,
> James Greenhalgh
>
> ---
> gcc/
> 2012-12-04  James Greenhalgh  <james.greenhalgh@arm.com>
>
>         * config/aarch64/aarch64-simd-builtins.def: Add new builtins.
>         * config/aarch64/aarch64-simd.md (simd_type): Add uzp.
>         (aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>): New.
>         * config/aarch64/aarch64.c (aarch64_evpc_trn): New.
>         (aarch64_evpc_uzp): Likewise.
>         (aarch64_evpc_zip): Likewise.
>         (aarch64_expand_vec_perm_const_1): Check for trn, zip, uzp patterns.
>         * config/aarch64/iterators.md (unspec): Add necessary unspecs.
>         (PERMUTE): New.
>         (perm_insn): Likewise.
>         (perm_hilo): Likewise.

Patch

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 2e3c4e1..8730c56 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -206,3 +206,12 @@ 
   BUILTIN_VDQ_BHSI (BINOP, smin)
   BUILTIN_VDQ_BHSI (BINOP, umax)
   BUILTIN_VDQ_BHSI (BINOP, umin)
+
+  /* Implemented by
+     aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>.  */
+  BUILTIN_VALL (BINOP, zip1)
+  BUILTIN_VALL (BINOP, zip2)
+  BUILTIN_VALL (BINOP, uzp1)
+  BUILTIN_VALL (BINOP, uzp2)
+  BUILTIN_VALL (BINOP, trn1)
+  BUILTIN_VALL (BINOP, trn2)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 2b0c8d6..df88ef4 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -128,7 +128,8 @@ 
 ; simd_store4s          store single structure from one lane for four registers (ST4 [index]).
 ; simd_tbl              table lookup.
 ; simd_trn              transpose.
-; simd_zip              zip/unzip.
+; simd_uzp              unzip.
+; simd_zip              zip.
 
 (define_attr "simd_type"
    "simd_abd,\
@@ -230,6 +231,7 @@ 
    simd_store4s,\
    simd_tbl,\
    simd_trn,\
+   simd_uzp,\
    simd_zip,\
    none"
   (const_string "none"))
@@ -3366,6 +3368,17 @@ 
   DONE;
 })
 
+(define_insn "aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>"
+  [(set (match_operand:VALL 0 "register_operand" "=w")
+	(unspec:VALL [(match_operand:VALL 1 "register_operand" "w")
+		      (match_operand:VALL 2 "register_operand" "w")]
+		       PERMUTE))]
+  "TARGET_SIMD"
+  "<PERMUTE:perm_insn><PERMUTE:perm_hilo>\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "simd_type" "simd_<PERMUTE:perm_insn>")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
 (define_insn "aarch64_st2<mode>_dreg"
   [(set (match_operand:TI 0 "aarch64_simd_struct_operand" "=Utv")
 	(unspec:TI [(match_operand:OI 1 "register_operand" "w")
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index cebc8cb..0eac0b7 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6815,6 +6815,261 @@  aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
 }
 
+/* Recognize permutations a single TRN1/TRN2 can perform and emit that insn.  */
+static bool
+aarch64_evpc_trn (struct expand_vec_perm_d *d)
+{
+  unsigned int i, odd, mask, nelt = d->nelt;
+  rtx out, in0, in1, x;
+  rtx (*gen) (rtx, rtx, rtx);
+  enum machine_mode vmode = d->vmode;
+
+  if (GET_MODE_UNIT_SIZE (vmode) > 8)
+    return false;
+
+  /* perm[0] must be 0 (even form) or 1 (odd form).  These are little-endian
+     tests; we correct for big-endian after the selector has matched.  */
+  if (d->perm[0] == 0)
+    odd = 0;
+  else if (d->perm[0] == 1)
+    odd = 1;
+  else
+    return false;
+  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+  for (i = 0; i < nelt; i += 2)
+    {
+      if (d->perm[i] != i + odd)
+	return false;
+      if (d->perm[i + 1] != ((i + nelt + odd) & mask))
+	return false;
+    }
+
+  /* Matched.  If only testing, report success without emitting anything.  */
+  if (d->testing_p)
+    return true;
+
+  in0 = d->op0;
+  in1 = d->op1;
+  if (BYTES_BIG_ENDIAN)
+    {
+      x = in0, in0 = in1, in1 = x;
+      odd = !odd;
+    }
+  out = d->target;
+
+  if (odd)
+    {
+      switch (vmode)
+	{
+	case V16QImode: gen = gen_aarch64_trn2v16qi; break;
+	case V8QImode: gen = gen_aarch64_trn2v8qi; break;
+	case V8HImode: gen = gen_aarch64_trn2v8hi; break;
+	case V4HImode: gen = gen_aarch64_trn2v4hi; break;
+	case V4SImode: gen = gen_aarch64_trn2v4si; break;
+	case V2SImode: gen = gen_aarch64_trn2v2si; break;
+	case V2DImode: gen = gen_aarch64_trn2v2di; break;
+	case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
+	case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
+	case V2DFmode: gen = gen_aarch64_trn2v2df; break;
+	default:
+	  return false;
+	}
+    }
+  else
+    {
+      switch (vmode)
+	{
+	case V16QImode: gen = gen_aarch64_trn1v16qi; break;
+	case V8QImode: gen = gen_aarch64_trn1v8qi; break;
+	case V8HImode: gen = gen_aarch64_trn1v8hi; break;
+	case V4HImode: gen = gen_aarch64_trn1v4hi; break;
+	case V4SImode: gen = gen_aarch64_trn1v4si; break;
+	case V2SImode: gen = gen_aarch64_trn1v2si; break;
+	case V2DImode: gen = gen_aarch64_trn1v2di; break;
+	case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
+	case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
+	case V2DFmode: gen = gen_aarch64_trn1v2df; break;
+	default:
+	  return false;
+	}
+    }
+
+  emit_insn (gen (out, in0, in1));
+  return true;
+}
+
+/* Recognize permutations a single UZP1/UZP2 can perform and emit that insn.  */
+static bool
+aarch64_evpc_uzp (struct expand_vec_perm_d *d)
+{
+  unsigned int i, odd, mask, nelt = d->nelt;
+  rtx out, in0, in1, x;
+  rtx (*gen) (rtx, rtx, rtx);
+  enum machine_mode vmode = d->vmode;
+
+  if (GET_MODE_UNIT_SIZE (vmode) > 8)
+    return false;
+
+  /* perm[0] must be 0 (even elements) or 1 (odd elements).  These are
+     little-endian tests; we correct for big-endian after matching.  */
+  if (d->perm[0] == 0)
+    odd = 0;
+  else if (d->perm[0] == 1)
+    odd = 1;
+  else
+    return false;
+  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+  for (i = 0; i < nelt; i++)
+    {
+      unsigned elt = (i * 2 + odd) & mask;
+      if (d->perm[i] != elt)
+	return false;
+    }
+
+  /* Matched.  If only testing, report success without emitting anything.  */
+  if (d->testing_p)
+    return true;
+
+  in0 = d->op0;
+  in1 = d->op1;
+  if (BYTES_BIG_ENDIAN)
+    {
+      x = in0, in0 = in1, in1 = x;
+      odd = !odd;
+    }
+  out = d->target;
+
+  if (odd)
+    {
+      switch (vmode)
+	{
+	case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
+	case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
+	case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
+	case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
+	case V4SImode: gen = gen_aarch64_uzp2v4si; break;
+	case V2SImode: gen = gen_aarch64_uzp2v2si; break;
+	case V2DImode: gen = gen_aarch64_uzp2v2di; break;
+	case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
+	case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
+	case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
+	default:
+	  return false;
+	}
+    }
+  else
+    {
+      switch (vmode)
+	{
+	case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
+	case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
+	case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
+	case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
+	case V4SImode: gen = gen_aarch64_uzp1v4si; break;
+	case V2SImode: gen = gen_aarch64_uzp1v2si; break;
+	case V2DImode: gen = gen_aarch64_uzp1v2di; break;
+	case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
+	case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
+	case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
+	default:
+	  return false;
+	}
+    }
+
+  emit_insn (gen (out, in0, in1));
+  return true;
+}
+
+/* Recognize permutations a single ZIP1/ZIP2 can perform and emit that insn.  */
+static bool
+aarch64_evpc_zip (struct expand_vec_perm_d *d)
+{
+  unsigned int i, high, mask, nelt = d->nelt;
+  rtx out, in0, in1, x;
+  rtx (*gen) (rtx, rtx, rtx);
+  enum machine_mode vmode = d->vmode;
+
+  if (GET_MODE_UNIT_SIZE (vmode) > 8)
+    return false;
+
+  /* perm[0] must be 0 (start at the low half) or nelt / 2 (the high half).
+     These are little-endian tests; we correct for big-endian later.  */
+  high = nelt / 2;
+  if (d->perm[0] == high)
+    /* Keep high == nelt / 2.  */
+    ;
+  else if (d->perm[0] == 0)
+    high = 0;
+  else
+    return false;
+  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+  for (i = 0; i < nelt / 2; i++)
+    {
+      unsigned elt = (i + high) & mask;
+      if (d->perm[i * 2] != elt)
+	return false;
+      elt = (elt + nelt) & mask;
+      if (d->perm[i * 2 + 1] != elt)
+	return false;
+    }
+
+  /* Matched.  If only testing, report success without emitting anything.  */
+  if (d->testing_p)
+    return true;
+
+  in0 = d->op0;
+  in1 = d->op1;
+  if (BYTES_BIG_ENDIAN)
+    {
+      x = in0, in0 = in1, in1 = x;
+      high = !high;
+    }
+  out = d->target;
+
+  if (high)
+    {
+      switch (vmode)
+	{
+	case V16QImode: gen = gen_aarch64_zip2v16qi; break;
+	case V8QImode: gen = gen_aarch64_zip2v8qi; break;
+	case V8HImode: gen = gen_aarch64_zip2v8hi; break;
+	case V4HImode: gen = gen_aarch64_zip2v4hi; break;
+	case V4SImode: gen = gen_aarch64_zip2v4si; break;
+	case V2SImode: gen = gen_aarch64_zip2v2si; break;
+	case V2DImode: gen = gen_aarch64_zip2v2di; break;
+	case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
+	case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
+	case V2DFmode: gen = gen_aarch64_zip2v2df; break;
+	default:
+	  return false;
+	}
+    }
+  else
+    {
+      switch (vmode)
+	{
+	case V16QImode: gen = gen_aarch64_zip1v16qi; break;
+	case V8QImode: gen = gen_aarch64_zip1v8qi; break;
+	case V8HImode: gen = gen_aarch64_zip1v8hi; break;
+	case V4HImode: gen = gen_aarch64_zip1v4hi; break;
+	case V4SImode: gen = gen_aarch64_zip1v4si; break;
+	case V2SImode: gen = gen_aarch64_zip1v2si; break;
+	case V2DImode: gen = gen_aarch64_zip1v2di; break;
+	case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
+	case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
+	case V2DFmode: gen = gen_aarch64_zip1v2df; break;
+	default:
+	  return false;
+	}
+    }
+
+  emit_insn (gen (out, in0, in1));
+  return true;
+}
+
 static bool
 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
 {
@@ -6865,7 +7120,15 @@  aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
     }
 
   if (TARGET_SIMD)
-    return aarch64_evpc_tbl (d);
+    {
+      if (aarch64_evpc_zip (d))
+	return true;
+      else if (aarch64_evpc_uzp (d))
+	return true;
+      else if (aarch64_evpc_trn (d))
+	return true;
+      return aarch64_evpc_tbl (d);
+    }
   return false;
 }
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 9ea5e0c..d710ea0 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -230,6 +230,12 @@ 
     UNSPEC_BSL		; Used in aarch64-simd.md.
     UNSPEC_TBL		; Used in vector permute patterns.
     UNSPEC_CONCAT	; Used in vector permute patterns.
+    UNSPEC_ZIP1		; Used in vector permute patterns.
+    UNSPEC_ZIP2		; Used in vector permute patterns.
+    UNSPEC_UZP1		; Used in vector permute patterns.
+    UNSPEC_UZP2		; Used in vector permute patterns.
+    UNSPEC_TRN1		; Used in vector permute patterns.
+    UNSPEC_TRN2		; Used in vector permute patterns.
 ])
 
 ;; -------------------------------------------------------------------
@@ -649,6 +655,9 @@ 
 
 (define_int_iterator VCMP_U [UNSPEC_CMHS UNSPEC_CMHI UNSPEC_CMTST])
 
+(define_int_iterator PERMUTE [UNSPEC_ZIP1 UNSPEC_ZIP2
+			      UNSPEC_TRN1 UNSPEC_TRN2
+			      UNSPEC_UZP1 UNSPEC_UZP2])
 
 ;; -------------------------------------------------------------------
 ;; Int Iterators Attributes.
@@ -732,3 +741,10 @@ 
 (define_int_attr offsetlr [(UNSPEC_SSLI	"1") (UNSPEC_USLI "1")
 			   (UNSPEC_SSRI	"0") (UNSPEC_USRI "0")])
 
+(define_int_attr perm_insn [(UNSPEC_ZIP1 "zip") (UNSPEC_ZIP2 "zip")
+			    (UNSPEC_TRN1 "trn") (UNSPEC_TRN2 "trn")
+			    (UNSPEC_UZP1 "uzp") (UNSPEC_UZP2 "uzp")])
+
+(define_int_attr perm_hilo [(UNSPEC_ZIP1 "1") (UNSPEC_ZIP2 "2")
+			    (UNSPEC_TRN1 "1") (UNSPEC_TRN2 "2")
+			    (UNSPEC_UZP1 "1") (UNSPEC_UZP2 "2")])