
i?86 unaligned/aligned load improvement for AVX512F

Message ID 20140103085924.GW892@tucnak.redhat.com
State New

Commit Message

Jakub Jelinek Jan. 3, 2014, 8:59 a.m. UTC
Hi!

This is an attempt to port my recent
http://gcc.gnu.org/viewcvs?rev=204219&root=gcc&view=rev
http://gcc.gnu.org/viewcvs?rev=205663&root=gcc&view=rev
http://gcc.gnu.org/viewcvs?rev=206090&root=gcc&view=rev
changes also to AVX512F.  The motivation is to get:

#include <immintrin.h>

__m512i
foo (void *x, void *y)
{
  __m512i a = _mm512_loadu_si512 (x);
  __m512i b = _mm512_loadu_si512 (y);
  return _mm512_add_epi32 (a, b);
}

use one of the unaligned memories directly as operand to the vpaddd
instruction.  The first hunk is needed so that we don't regress on say:

#include <immintrin.h>

__m512i z;

__m512i
foo (void *x, void *y, int k)
{
  __m512i a = _mm512_mask_loadu_epi32 (z, k, x);
  __m512i b = _mm512_mask_loadu_epi32 (z, k, y);
  return _mm512_add_epi32 (a, b);
}

__m512i
bar (void *x, void *y, int k)
{
  __m512i a = _mm512_maskz_loadu_epi32 (k, x);
  __m512i b = _mm512_maskz_loadu_epi32 (k, y);
  return _mm512_add_epi32 (a, b);
}
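
For reference, the code we'd like to see for the first testcase is roughly the following (a hand-written sketch assuming the x86_64 SysV calling convention; which of the two loads gets folded, and whether the remaining one is emitted as vmovdqu32 or vmovdqu64, may of course differ):

foo:
        vmovdqu64       (%rdi), %zmm0
        vpaddd          (%rsi), %zmm0, %zmm0
        ret

whereas in the masked and zero-masked testcases above the loads must stay vmovdqu32 with the {%k1} (resp. {%k1}{z}) qualifier and, in particular, must not degrade into aligned vmovdqa32 accesses on misaligned memory, which is what the first hunk guards against.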

Does it matter which of vmovdqu32 vs. vmovdqu64 is used if no
masking/zeroing is performed (i.e. vmovdqu32 (%rax), %zmm0 vs.
vmovdqu64 (%rax), %zmm0) for performance reasons (i.e. isn't there some
reinterpretation penalty)?

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2014-01-03  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/sse.md (avx512f_load<mode>_mask): Emit vmovup{s,d}
	or vmovdqu* for misaligned_operand.
	(<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>,
	<sse2_avx_avx512f>_loaddqu<mode><mask_name>): Handle <mask_applied>.
	* config/i386/i386.c (ix86_expand_special_args_builtin): Set
	aligned_mem for AVX512F masked aligned load and store builtins and for
	non-temporal moves.

	* gcc.target/i386/avx512f-vmovdqu32-1.c: Allow vmovdqu64 instead of
	vmovdqu32.
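
	To illustrate which builtins the i386.c change covers, the masked
	aligned loads/stores are the _mm512_mask_load_epi32-style ones, e.g.
	(an illustrative snippet only, not part of the patch or the testsuite,
	and the function name is made up):

#include <immintrin.h>

__m512i z;

__m512i
baz (void *x, int k)
{
  /* Aligned masked load: x is required to be 64-byte aligned, so the
     builtin expander may keep treating the memory as aligned and emit
     vmovdqa32 with the {%k1} qualifier.  */
  return _mm512_mask_load_epi32 (z, k, x);
}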


	Jakub

Comments

Kirill Yukhin Jan. 3, 2014, 10:36 p.m. UTC | #1
Hello,
On 03 Jan 09:59, Jakub Jelinek wrote:
> Does it matter which of vmovdqu32 vs. vmovdqu64 is used if no
> masking/zeroing is performed (i.e. vmovdqu32 (%rax), %zmm0 vs.
> vmovdqu64 (%rax), %zmm0) for performance reasons (i.e. isn't there some
> reinterpretation penalty)?
No, there should be no penalty (at least from today's point of view).
So, I like your patch!

--
Thanks, K
Uros Bizjak Jan. 4, 2014, 8:46 a.m. UTC | #2
On Fri, Jan 3, 2014 at 9:59 AM, Jakub Jelinek <jakub@redhat.com> wrote:

> This is an attempt to port my recent
> http://gcc.gnu.org/viewcvs?rev=204219&root=gcc&view=rev
> http://gcc.gnu.org/viewcvs?rev=205663&root=gcc&view=rev
> http://gcc.gnu.org/viewcvs?rev=206090&root=gcc&view=rev
> changes also to AVX512F.  The motivation is to get:
>
> #include <immintrin.h>
>
> __m512i
> foo (void *x, void *y)
> {
>   __m512i a = _mm512_loadu_si512 (x);
>   __m512i b = _mm512_loadu_si512 (y);
>   return _mm512_add_epi32 (a, b);
> }
>
> use one of the unaligned memories directly as operand to the vpaddd
> instruction.  The first hunk is needed so that we don't regress on say:
>
> #include <immintrin.h>
>
> __m512i z;
>
> __m512i
> foo (void *x, void *y, int k)
> {
>   __m512i a = _mm512_mask_loadu_epi32 (z, k, x);
>   __m512i b = _mm512_mask_loadu_epi32 (z, k, y);
>   return _mm512_add_epi32 (a, b);
> }
>
> __m512i
> bar (void *x, void *y, int k)
> {
>   __m512i a = _mm512_maskz_loadu_epi32 (k, x);
>   __m512i b = _mm512_maskz_loadu_epi32 (k, y);
>   return _mm512_add_epi32 (a, b);
> }
>
> Does it matter which of vmovdqu32 vs. vmovdqu64 is used if no
> masking/zeroing is performed (i.e. vmovdqu32 (%rax), %zmm0 vs.
> vmovdqu64 (%rax), %zmm0) for performance reasons (i.e. isn't there some
> reinterpretation penalty)?
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2014-01-03  Jakub Jelinek  <jakub@redhat.com>
>
>         * config/i386/sse.md (avx512f_load<mode>_mask): Emit vmovup{s,d}
>         or vmovdqu* for misaligned_operand.
>         (<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>,
>         <sse2_avx_avx512f>_loaddqu<mode><mask_name>): Handle <mask_applied>.
>         * config/i386/i386.c (ix86_expand_special_args_builtin): Set
>         aligned_mem for AVX512F masked aligned load and store builtins and for
>         non-temporal moves.
>
>         * gcc.target/i386/avx512f-vmovdqu32-1.c: Allow vmovdqu64 instead of
>         vmovdqu32.

Taking into account Kirill's comment, the patch is OK, although I find
it a bit strange in [1] that

void f2 (int *__restrict e, int *__restrict f)
{ int i; for (i = 0; i < 1024; i++) e[i] = f[i]; }

results in

        vmovdqu64       (%rsi,%rax), %zmm0
        vmovdqu32       %zmm0, (%rdi,%rax)

Shouldn't these two move insns be the same?

[1] http://gcc.gnu.org/ml/gcc/2014-01/msg00015.html

Thanks,
Uros.

Patch

--- gcc/config/i386/sse.md.jj	2014-01-02 20:11:49.000000000 +0100
+++ gcc/config/i386/sse.md	2014-01-02 21:59:06.706161064 +0100
@@ -786,8 +786,12 @@  (define_insn "avx512f_load<mode>_mask"
     {
     case MODE_V8DF:
     case MODE_V16SF:
+      if (misaligned_operand (operands[1], <MODE>mode))
+	return "vmovu<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}";
       return "vmova<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}";
     default:
+      if (misaligned_operand (operands[1], <MODE>mode))
+	return "vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}";
       return "vmovdqa<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}";
     }
 }
@@ -936,11 +940,14 @@  (define_expand "<sse>_loadu<ssemodesuffi
      false, still emit UNSPEC_LOADU insn to honor user's request for
      misaligned load.  */
   if (TARGET_AVX
-      && misaligned_operand (operands[1], <MODE>mode)
-      /* FIXME: Revisit after AVX512F merge is completed.  */
-      && !<mask_applied>)
+      && misaligned_operand (operands[1], <MODE>mode))
     {
-      emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1]));
+      rtx src = operands[1];
+      if (<mask_applied>)
+	src = gen_rtx_VEC_MERGE (<MODE>mode, operands[1],
+				 operands[2 * <mask_applied>],
+				 operands[3 * <mask_applied>]);
+      emit_insn (gen_rtx_SET (VOIDmode, operands[0], src));
       DONE;
     }
 })
@@ -1046,11 +1053,14 @@  (define_expand "<sse2_avx_avx512f>_loadd
      false, still emit UNSPEC_LOADU insn to honor user's request for
      misaligned load.  */
   if (TARGET_AVX
-      && misaligned_operand (operands[1], <MODE>mode)
-      /* FIXME: Revisit after AVX512F merge is completed.  */
-      && !<mask_applied>)
+      && misaligned_operand (operands[1], <MODE>mode))
     {
-      emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1]));
+      rtx src = operands[1];
+      if (<mask_applied>)
+	src = gen_rtx_VEC_MERGE (<MODE>mode, operands[1],
+				 operands[2 * <mask_applied>],
+				 operands[3 * <mask_applied>]);
+      emit_insn (gen_rtx_SET (VOIDmode, operands[0], src));
       DONE;
     }
 })
--- gcc/config/i386/i386.c.jj	2014-01-02 14:44:07.000000000 +0100
+++ gcc/config/i386/i386.c	2014-01-02 21:48:23.204400654 +0100
@@ -34407,6 +34408,9 @@  ix86_expand_special_args_builtin (const
 	case CODE_FOR_sse2_movntidi:
 	case CODE_FOR_sse_movntq:
 	case CODE_FOR_sse2_movntisi:
+	case CODE_FOR_avx512f_movntv16sf:
+	case CODE_FOR_avx512f_movntv8df:
+	case CODE_FOR_avx512f_movntv8di:
 	  aligned_mem = true;
 	  break;
 	default:
@@ -34431,6 +34435,24 @@  ix86_expand_special_args_builtin (const
       klass = load;
       memory = 0;
       break;
+    case VOID_FTYPE_PV8DF_V8DF_QI:
+    case VOID_FTYPE_PV16SF_V16SF_HI:
+    case VOID_FTYPE_PV8DI_V8DI_QI:
+    case VOID_FTYPE_PV16SI_V16SI_HI:
+      switch (icode)
+	{
+	/* These builtins and instructions require the memory
+	   to be properly aligned.  */
+	case CODE_FOR_avx512f_storev16sf_mask:
+	case CODE_FOR_avx512f_storev16si_mask:
+	case CODE_FOR_avx512f_storev8df_mask:
+	case CODE_FOR_avx512f_storev8di_mask:
+	  aligned_mem = true;
+	  break;
+	default:
+	  break;
+	}
+      /* FALLTHRU */
     case VOID_FTYPE_PV8SF_V8SI_V8SF:
     case VOID_FTYPE_PV4DF_V4DI_V4DF:
     case VOID_FTYPE_PV4SF_V4SI_V4SF:
@@ -34439,10 +34461,6 @@  ix86_expand_special_args_builtin (const
     case VOID_FTYPE_PV4DI_V4DI_V4DI:
     case VOID_FTYPE_PV4SI_V4SI_V4SI:
     case VOID_FTYPE_PV2DI_V2DI_V2DI:
-    case VOID_FTYPE_PV8DF_V8DF_QI:
-    case VOID_FTYPE_PV16SF_V16SF_HI:
-    case VOID_FTYPE_PV8DI_V8DI_QI:
-    case VOID_FTYPE_PV16SI_V16SI_HI:
     case VOID_FTYPE_PDOUBLE_V2DF_QI:
     case VOID_FTYPE_PFLOAT_V4SF_QI:
       nargs = 2;
@@ -34459,6 +34477,19 @@  ix86_expand_special_args_builtin (const
       nargs = 3;
       klass = load;
       memory = 0;
+      switch (icode)
+	{
+	/* These builtins and instructions require the memory
+	   to be properly aligned.  */
+	case CODE_FOR_avx512f_loadv16sf_mask:
+	case CODE_FOR_avx512f_loadv16si_mask:
+	case CODE_FOR_avx512f_loadv8df_mask:
+	case CODE_FOR_avx512f_loadv8di_mask:
+	  aligned_mem = true;
+	  break;
+	default:
+	  break;
+	}
       break;
     case VOID_FTYPE_UINT_UINT_UINT:
     case VOID_FTYPE_UINT64_UINT_UINT:
--- gcc/testsuite/gcc.target/i386/avx512f-vmovdqu32-1.c.jj	2013-12-31 12:51:09.000000000 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-vmovdqu32-1.c	2014-01-03 01:46:14.714705390 +0100
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vmovdqu32\[ \\t\]+\[^\n\]*\\)\[^\n\]*%zmm\[0-9\]\[^\{\]" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[36\]\[24\]\[ \\t\]+\[^\n\]*\\)\[^\n\]*%zmm\[0-9\]\[^\{\]" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu32\[ \\t\]+\[^\n\]*\\)\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu32\[ \\t\]+\[^\n\]*\\)\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu32\[ \\t\]+\[^\n\]*%zmm\[0-9\]\[^\n\]*\\)\[^\{\]" 1 } } */