Fix up floatunsv{4,8}siv{4,8}sf2

Hi!

As mentioned in the last mail, the testcase in that patch
shows a bug in the unsigned int -> float vectorization
on i?86/x86_64.  E.g. 0x8000008fU converted to float
by scalar code is 0x1.000002p+31, but by vector code
is 0x1p+31, i.e. one ulp smaller.

We currently generate:
        vxorps  %xmm2, %xmm2, %xmm2
        vmovaps .LC0(%rip), %xmm3	! { 1333788672, 1333788672, 1333788672, 1333788672 }
...
.L2:
        vcvtdq2ps       ui(%rax), %xmm0
        vcmpltps        %xmm2, %xmm0, %xmm1
        vandps  %xmm3, %xmm1, %xmm1
        vaddps  %xmm1, %xmm0, %xmm0
        vmovaps %xmm0, f(%rax)

For the scalar code we just zero extend it to 64-bit
register and convert as signed 64-bit integer to float,
but we don't have a vector insn for that.

Attached are 3 different fix alternatives:
X313a is based on scalar code we generate for ulonglong
to double code, i.e. if 0x80000000 or bigger divide by
two with oring the shifted off bit into the lowest bit,
then signed conversion to float and finally doubling that
value.
        vpxor   %xmm4, %xmm4, %xmm4
        vmovdqa .LC0(%rip), %xmm5	! { 1, 1, 1, 1 }
...
.L2:
        vmovdqa ui(%rax), %xmm0
        vpand   %xmm5, %xmm0, %xmm2
        vpsrld  $1, %xmm0, %xmm3
        vpcmpgtd        %xmm0, %xmm4, %xmm1
        vpor    %xmm2, %xmm3, %xmm2
        vpblendvb       %xmm1, %xmm2, %xmm0, %xmm0
        vcvtdq2ps       %xmm0, %xmm0
        vandps  %xmm1, %xmm0, %xmm1
        vaddps  %xmm1, %xmm0, %xmm0
        vmovaps %xmm0, f(%rax)

X313b is based on Richard's sequence,
        vmovdqa .LC0(%rip), %xmm2	! { 65535, 65535, 65535, 65535 }
...
.L2:
        vmovdqa ui(%rax), %xmm1
        vpandn  %xmm1, %xmm2, %xmm0
        vpand   %xmm2, %xmm1, %xmm1
        vpsrld  $1, %xmm0, %xmm0
        vcvtdq2ps       %xmm0, %xmm0
        vaddps  %xmm0, %xmm0, %xmm0
        vcvtdq2ps       %xmm1, %xmm1
        vaddps  %xmm0, %xmm1, %xmm0
        vmovaps %xmm0, f(%rax)

and finally X313c is based on scalar code we generate for
-m32 -O2 -msse2 -mfpmath=sse:
        vmovdqa .LC0(%rip), %xmm3	! { 65535, 65535, 65535, 65535 }
        vmovaps .LC1(%rip), %xmm2	! { 65536.0f, 65536.0f, 65536.0f, 65536.0f }
...
.L2:
        vmovdqa ui(%rax), %xmm0
        vpand   %xmm3, %xmm0, %xmm1
        vpsrld  $16, %xmm0, %xmm0
        vcvtdq2ps       %xmm0, %xmm0
        vmulps  %xmm2, %xmm0, %xmm0
        vcvtdq2ps       %xmm1, %xmm1
        vaddps  %xmm0, %xmm1, %xmm0
        vmovaps %xmm0, f(%rax)

I've done some benchmarking of these, X313b has the advantage
that it needs just one constant while all other need two, but
the last variant seems to win, it is one insn shorter and surprisingly
the multiplication isn't any slower than the addition.
All this is using the attached testcase which is just a cut down
version of the vect-cvt-1.c testcase from my last patch, just with
the ui2f routine executed many times.

Sandybridge, -O3 -mavx
vanilla -fno-tree-vectorize	0m4.136s
vanilla				0m1.052s
patch X313a			0m1.689s
patch X313b			0m1.398s
patch X313c			0m1.115s

Sandybridge, -O3 -msse2
vanilla -fno-tree-vectorize	0m4.099s
vanilla				0m1.054s
patch X313a			0m2.527s
patch X313b			0m1.394s
patch X313c			0m1.267s

Barcelona, -O3 -msse2
vanilla -fno-tree-vectorize	0m9.176s
vanilla 			0m2.706s
patch X313a			0m6.515s
patch X313b			0m3.825s
patch X313c			0m3.310s

So, what do you prefer and do you want the expander to be moved into
i386.c or kept like this?  Do we perhaps want for -ffast-math
keep the slightly faster, but imprecise version (I'd prefer not to)?

	Jakub
2011-11-03  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/sse.md (floatuns<sseintvecmodelower><mode>2): Rewritten.
	* config/i386/i386.c (ix86_expand_sse_movcc): No longer static.

	* gcc.dg/torture/vec-cvt-1.c: Enable commented out inttoflttestui
	test.
2011-11-03  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/sse.md (floatuns<sseintvecmodelower><mode>2): Rewritten.

	* gcc.dg/torture/vec-cvt-1.c: Enable commented out inttoflttestui
	test.

--- gcc/config/i386/sse.md.jj	2011-11-03 16:11:05.000000000 +0100
+++ gcc/config/i386/sse.md	2011-11-03 17:12:28.000000000 +0100
@@ -2242,30 +2242,35 @@ (define_insn "float<sseintvecmodelower><
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_expand "floatuns<sseintvecmodelower><mode>2"
-  [(set (match_dup 5)
-	(float:VF1
-	  (match_operand:<sseintvecmode> 1 "nonimmediate_operand" "")))
-   (set (match_dup 6)
-	(lt:VF1 (match_dup 5) (match_dup 3)))
-   (set (match_dup 7)
-	(and:VF1 (match_dup 6) (match_dup 4)))
-   (set (match_operand:VF1 0 "register_operand" "")
-	(plus:VF1 (match_dup 5) (match_dup 7)))]
-  "TARGET_SSE2"
-{
-  REAL_VALUE_TYPE TWO32r;
-  rtx x;
-  int i;
-
-  real_ldexp (&TWO32r, &dconst1, 32);
-  x = const_double_from_real_value (TWO32r, SFmode);
-
-  operands[3] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
-  operands[4] = force_reg (<MODE>mode,
-			   ix86_build_const_vector (<MODE>mode, 1, x));
-
-  for (i = 5; i < 8; i++)
-    operands[i] = gen_reg_rtx (<MODE>mode);
+  [(match_operand:VF1 0 "register_operand" "")
+   (match_operand:<sseintvecmode> 1 "register_operand" "")]
+  "TARGET_SSE2 && (<MODE>mode == V4SFmode || TARGET_AVX2)"
+{
+  rtx tmp[8];
+  tmp[0] = force_reg (<sseintvecmode>mode,
+		      ix86_build_const_vector (<sseintvecmode>mode, 1,
+					       GEN_INT (0xffff)));
+  tmp[1] = gen_reg_rtx (<sseintvecmode>mode);
+  if (<sseintvecmode>mode == V4SImode)
+    emit_insn (gen_sse2_andnotv4si3 (tmp[1], tmp[0], operands[1]));
+  else
+    emit_insn (gen_avx2_andnotv8si3 (tmp[1], tmp[0], operands[1]));
+  tmp[2] = expand_simple_binop (<sseintvecmode>mode, AND,
+				operands[1], tmp[0], NULL_RTX,
+				1, OPTAB_DIRECT);
+  tmp[3] = expand_simple_binop (<sseintvecmode>mode, LSHIFTRT,
+				tmp[1], const1_rtx, NULL_RTX,
+				1, OPTAB_DIRECT);
+  tmp[4] = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_float<sseintvecmodelower><mode>2 (tmp[4], tmp[2]));
+  tmp[5] = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_float<sseintvecmodelower><mode>2 (tmp[5], tmp[3]));
+  tmp[6] = expand_simple_binop (<MODE>mode, PLUS, tmp[5], tmp[5],
+				NULL_RTX, 1, OPTAB_DIRECT);
+  tmp[7] = expand_simple_binop (<MODE>mode, PLUS, tmp[4], tmp[6],
+				operands[0], 1, OPTAB_DIRECT);
+  gcc_assert (tmp[7] == operands[0]);
+  DONE;
 })
 
 (define_insn "avx_cvtps2dq256"
--- gcc/testsuite/gcc.dg/torture/vec-cvt-1.c.jj	2011-11-03 14:07:23.000000000 +0100
+++ gcc/testsuite/gcc.dg/torture/vec-cvt-1.c	2011-11-03 17:43:11.000000000 +0100
@@ -205,7 +205,7 @@ main ()
   inttoflttestsl ();
   inttoflttestuc ();
   inttoflttestus ();
-//  inttoflttestui ();
+  inttoflttestui ();
   inttoflttestul ();
   return 0;
 }
2011-11-03  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/sse.md (floatuns<sseintvecmodelower><mode>2): Rewritten.

	* gcc.dg/torture/vec-cvt-1.c: Enable commented out inttoflttestui
	test.

--- gcc/config/i386/sse.md.jj	2011-11-03 16:11:05.000000000 +0100
+++ gcc/config/i386/sse.md	2011-11-03 17:25:22.000000000 +0100
@@ -2242,30 +2242,35 @@ (define_insn "float<sseintvecmodelower><
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_expand "floatuns<sseintvecmodelower><mode>2"
-  [(set (match_dup 5)
-	(float:VF1
-	  (match_operand:<sseintvecmode> 1 "nonimmediate_operand" "")))
-   (set (match_dup 6)
-	(lt:VF1 (match_dup 5) (match_dup 3)))
-   (set (match_dup 7)
-	(and:VF1 (match_dup 6) (match_dup 4)))
-   (set (match_operand:VF1 0 "register_operand" "")
-	(plus:VF1 (match_dup 5) (match_dup 7)))]
-  "TARGET_SSE2"
-{
-  REAL_VALUE_TYPE TWO32r;
-  rtx x;
-  int i;
-
-  real_ldexp (&TWO32r, &dconst1, 32);
-  x = const_double_from_real_value (TWO32r, SFmode);
-
-  operands[3] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
-  operands[4] = force_reg (<MODE>mode,
-			   ix86_build_const_vector (<MODE>mode, 1, x));
-
-  for (i = 5; i < 8; i++)
-    operands[i] = gen_reg_rtx (<MODE>mode);
+  [(match_operand:VF1 0 "register_operand" "")
+   (match_operand:<sseintvecmode> 1 "register_operand" "")]
+  "TARGET_SSE2 && (<MODE>mode == V4SFmode || TARGET_AVX2)"
+{
+  rtx tmp[8];
+  REAL_VALUE_TYPE TWO16r;
+  tmp[0] = force_reg (<sseintvecmode>mode,
+		      ix86_build_const_vector (<sseintvecmode>mode, 1,
+					       GEN_INT (0xffff)));
+  tmp[1] = expand_simple_binop (<sseintvecmode>mode, AND,
+				operands[1], tmp[0], NULL_RTX,
+				1, OPTAB_DIRECT);
+  tmp[2] = expand_simple_binop (<sseintvecmode>mode, LSHIFTRT,
+				operands[1], GEN_INT (16), NULL_RTX,
+				1, OPTAB_DIRECT);
+  tmp[3] = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_float<sseintvecmodelower><mode>2 (tmp[3], tmp[1]));
+  tmp[4] = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_float<sseintvecmodelower><mode>2 (tmp[4], tmp[2]));
+  real_ldexp (&TWO16r, &dconst1, 16);
+  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
+  tmp[5] = force_reg (<MODE>mode,
+		      ix86_build_const_vector (<MODE>mode, 1, tmp[5]));
+  tmp[6] = expand_simple_binop (<MODE>mode, MULT, tmp[4], tmp[5],
+				NULL_RTX, 1, OPTAB_DIRECT);
+  tmp[7] = expand_simple_binop (<MODE>mode, PLUS, tmp[3], tmp[6],
+				operands[0], 1, OPTAB_DIRECT);
+  gcc_assert (tmp[7] == operands[0]);
+  DONE;
 })
 
 (define_insn "avx_cvtps2dq256"
--- gcc/testsuite/gcc.dg/torture/vec-cvt-1.c.jj	2011-11-03 14:07:23.000000000 +0100
+++ gcc/testsuite/gcc.dg/torture/vec-cvt-1.c	2011-11-03 17:43:11.000000000 +0100
@@ -205,7 +205,7 @@ main ()
   inttoflttestsl ();
   inttoflttestuc ();
   inttoflttestus ();
-//  inttoflttestui ();
+  inttoflttestui ();
   inttoflttestul ();
   return 0;
 }
#define N 1024
unsigned int ui[N];
float f[N];

#define FN1(from, to) \
__attribute__((noinline, noclone)) void		\
from##2##to (void)				\
{						\
  int i;					\
  for (i = 0; i < N; i++)			\
    to[i] = from[i];				\
}
#define FN(intt, fltt) FN1 (intt, fltt) FN1 (fltt, intt)

FN (ui, f)

#define FLTTEST(min, max, intt) \
__attribute__((noinline, noclone)) void					\
inttoflttest##intt (void)						\
{									\
  int i;								\
  volatile float vf;							\
  volatile double vd;							\
  for (i = 0; i < N; i++)						\
    {									\
      asm ("");								\
      if (i < N / 4)							\
	intt[i] = min + i;						\
      else if (i < 3 * N / 4)						\
	intt[i] = (max + min) / 2 - N * 8 + 16 * i;			\
      else								\
	intt[i] = max - N + 1 + i;					\
    }									\
  for (i = 0; i < 5000000; i++)						\
  intt##2f ();								\
}

FLTTEST (0, 2U * __INT_MAX__ + 1, ui)

int
main ()
{
  inttoflttestui ();
  return 0;
}


Fix up floatunsv{4,8}siv{4,8}sf2

Commit Message

Comments

Patch