diff mbox

[i386] : Fix PR78037, Incorrect code generated at optimization level -O2 for tzcnt and binary and

Message ID CAFULd4anWLJd1t0U1mabStNNadDbQmRn+Jq9MoDGcgY9FskRrw@mail.gmail.com
State New
Headers show

Commit Message

Uros Bizjak Oct. 20, 2016, 6:09 p.m. UTC
Hello!

The intrinsic functions __tzcnt_{u16,u32,u64} and _lzcnt_{u16,u32,u64}
have well defined output for zero input, so they should not be
expanded via __builtin_ctz and __builtin_clz builtins.

Attached patch implements special builtins and UNSPECs to protect
these intrinsic functions from optimizations involving undefinedness
of generic __builtin_ctz and __builtin_clz with zero input.

The patched compiler will fail:

FAIL: gcc.target/i386/bmi-4.c (test for excess errors)
FAIL: gcc.target/i386/bmi-5.c (test for excess errors)
FAIL: gcc.target/i386/bmi-6.c (test for excess errors)

where we test that:

/* Test that a constant operand 0 to tzcnt gets folded.  */
extern void link_error(void);
int main()
{
  if (__tzcnt_u32(0) != 32)
    link_error();
  return 0;
}

As pointed out by Richi in the PR:

Note that regardless of this macro the ``definedness'' of @code{clz}
and @code{ctz} at zero do @emph{not} extend to the builtin functions
visible to the user.  Thus one may be free to adjust the value at will
to match the target expansion of these operations without fear of
breaking the API@.

these tests, where e.g. __tzcnt_u32 called  __builtin_ctz, passed only by luck.

The correct and robust solution would be to constant-fold these new
builtins in ix86_fold_builtin function. I'll open a PR for that.

2016-10-20 Uros Bizjak  <ubizjak@gmail.com>

    PR target/78037
    * config/i386/bmiintrin.h (__tzcnt_u16): Call __builtin_ia32_tzcnt_u16.
    (__tzcnt_u32, _tzcnt_u32): Call __builtin_ia32_tzcnt_u32.
    (__tzcnt_u64, _tzcnt_u64): Call __builtin_ia32_tzcnt_u64.
    * config/i386/lzcntintrin.h (__lzcnt_u16): Call
    __builtin_ia32_lzcnt_u16.
    (__lzcnt_u32, _lzcnt_u32): Call __builtin_ia32_lzcnt_u32.
    (__lzcnt_u64, _lzcnt_u64): Call __builtin_ia32_lzcnt_u64.
    * config/i386/i386.md (UNSPEC_LZCNT, UNSPEC_TZCNT): New unspecs.
    (ctz<mode>2, *ctz<mode>2): Use SWI48 mode iterator.
    (bmi_tzcnt_<mode>): New expander.
    (*bmi_tzcnt_<mode>_falsedep_1): New define_insn_and_split pattern.
    (*bmi_tzcnt_<mode>_falsedep, *bmi_tzcnt_<mode>): New insn patterns.
    (clz<mode>2_lzcnt, *clz<mode>2_lzcnt): Use SWI48 mode iterator.
    (lzcnt_<mode>): New expander.
    (*lzcnt_<mode>_falsedep_1): New define_insn_and_split pattern.
    (*lzcnt_<mode>_falsedep, *lzcnt_<mode>): New insn patterns.
    * config/i386/i386-builtin-types.def (UINT_FTYPE_UINT): New.
    (UINT64_FTYPE_UINT64): New.
    * config/i386/i386-builtin.def (__builtin_clzs): Remove description.
    (__builtin_ia32_lzcnt_u16): New description.
    (__builtin_ia32_lzcnt_u32): Ditto.
    (__builtin_ia32_lzcnt_u64): Ditto.
    (__builtin_ctzs): Remove description.
    (__builtin_ia32_tzcnt_u16): New description.
    (__builtin_ia32_tzcnt_u32): Ditto.
    (__builtin_ia32_tzcnt_u64): Ditto.
    * config/i386/i386.c (ix86_expand_args_builtin): Handle
    UINT_FTYPE_UINT and UINT64_FTYPE_UINT64.

testsuite/ChangeLog:

2016-10-20  Uros Bizjak  <ubizjak@gmail.com>

    PR target/78037
    * gcc.target/i386/pr78037.c: New test.

Patch was bootstrapped and regression tested on x86_64-linux-gnu and
was committed to mainline SVN.

Uros.
diff mbox

Patch

Index: config/i386/bmiintrin.h
===================================================================
--- config/i386/bmiintrin.h	(revision 241344)
+++ config/i386/bmiintrin.h	(working copy)
@@ -37,7 +37,7 @@ 
 extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 __tzcnt_u16 (unsigned short __X)
 {
-  return __builtin_ctzs (__X);
+  return __builtin_ia32_tzcnt_u16 (__X);
 }
 
 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -97,13 +97,13 @@  _blsr_u32 (unsigned int __X)
 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 __tzcnt_u32 (unsigned int __X)
 {
-  return __builtin_ctz (__X);
+  return __builtin_ia32_tzcnt_u32 (__X);
 }
 
 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _tzcnt_u32 (unsigned int __X)
 {
-  return __builtin_ctz (__X);
+  return __builtin_ia32_tzcnt_u32 (__X);
 }
 
 
@@ -165,13 +165,13 @@  _blsr_u64 (unsigned long long __X)
 extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 __tzcnt_u64 (unsigned long long __X)
 {
-  return __builtin_ctzll (__X);
+  return __builtin_ia32_tzcnt_u64 (__X);
 }
 
 extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _tzcnt_u64 (unsigned long long __X)
 {
-  return __builtin_ctzll (__X);
+  return __builtin_ia32_tzcnt_u64 (__X);
 }
 
 #endif /* __x86_64__  */
Index: config/i386/i386-builtin-types.def
===================================================================
--- config/i386/i386-builtin-types.def	(revision 241344)
+++ config/i386/i386-builtin-types.def	(working copy)
@@ -205,9 +205,11 @@  DEF_FUNCTION_TYPE (INT, PCCHAR)
 DEF_FUNCTION_TYPE (INT64, INT64)
 DEF_FUNCTION_TYPE (INT64, V2DF)
 DEF_FUNCTION_TYPE (INT64, V4SF)
+DEF_FUNCTION_TYPE (UINT, UINT)
+DEF_FUNCTION_TYPE (UINT16, UINT16)
 DEF_FUNCTION_TYPE (UINT64, INT)
-DEF_FUNCTION_TYPE (UINT16, UINT16)
 DEF_FUNCTION_TYPE (UINT64, PUNSIGNED)
+DEF_FUNCTION_TYPE (UINT64, UINT64)
 DEF_FUNCTION_TYPE (V16QI, PCCHAR)
 DEF_FUNCTION_TYPE (V16QI, V16QI)
 DEF_FUNCTION_TYPE (V2DF, PCDOUBLE)
Index: config/i386/i386-builtin.def
===================================================================
--- config/i386/i386-builtin.def	(revision 241344)
+++ config/i386/i386-builtin.def	(working copy)
@@ -1186,13 +1186,19 @@  BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI)
 
-BDESC (OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt,   "__builtin_clzs",   IX86_BUILTIN_CLZS,    UNKNOWN,     (int) UINT16_FTYPE_UINT16)
+/* LZCNT */
+BDESC (OPTION_MASK_ISA_LZCNT, CODE_FOR_lzcnt_hi, "__builtin_ia32_lzcnt_u16", IX86_BUILTIN_LZCNT16, UNKNOWN, (int) UINT16_FTYPE_UINT16)
+BDESC (OPTION_MASK_ISA_LZCNT, CODE_FOR_lzcnt_si, "__builtin_ia32_lzcnt_u32", IX86_BUILTIN_LZCNT32, UNKNOWN, (int) UINT_FTYPE_UINT)
+BDESC (OPTION_MASK_ISA_LZCNT | OPTION_MASK_ISA_64BIT, CODE_FOR_lzcnt_di, "__builtin_ia32_lzcnt_u64", IX86_BUILTIN_LZCNT64, UNKNOWN, (int) UINT64_FTYPE_UINT64)
 
 /* BMI */
 BDESC (OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT)
 BDESC (OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_64BIT, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64)
-BDESC (OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2,       "__builtin_ctzs",           IX86_BUILTIN_CTZS,    UNKNOWN, (int) UINT16_FTYPE_UINT16)
 
+BDESC (OPTION_MASK_ISA_BMI, CODE_FOR_bmi_tzcnt_hi, "__builtin_ia32_tzcnt_u16", IX86_BUILTIN_TZCNT16, UNKNOWN, (int) UINT16_FTYPE_UINT16)
+BDESC (OPTION_MASK_ISA_BMI, CODE_FOR_bmi_tzcnt_si, "__builtin_ia32_tzcnt_u32", IX86_BUILTIN_TZCNT32, UNKNOWN, (int) UINT_FTYPE_UINT)
+BDESC (OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_64BIT, CODE_FOR_bmi_tzcnt_di, "__builtin_ia32_tzcnt_u64", IX86_BUILTIN_TZCNT64, UNKNOWN, (int) UINT64_FTYPE_UINT64)
+
 /* TBM */
 BDESC (OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT)
 BDESC (OPTION_MASK_ISA_TBM | OPTION_MASK_ISA_64BIT, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64)
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 241344)
+++ config/i386/i386.c	(working copy)
@@ -34317,8 +34317,10 @@  ix86_expand_args_builtin (const struct builtin_des
     case FLOAT128_FTYPE_FLOAT128:
     case FLOAT_FTYPE_FLOAT:
     case INT_FTYPE_INT:
+    case UINT_FTYPE_UINT:
+    case UINT16_FTYPE_UINT16:
     case UINT64_FTYPE_INT:
-    case UINT16_FTYPE_UINT16:
+    case UINT64_FTYPE_UINT64:
     case INT64_FTYPE_INT64:
     case INT64_FTYPE_V4SF:
     case INT64_FTYPE_V2DF:
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 241344)
+++ config/i386/i386.md	(working copy)
@@ -175,7 +175,11 @@ 
   ;; For CRC32 support
   UNSPEC_CRC32
 
+  ;; For LZCNT suppoprt
+  UNSPEC_LZCNT
+
   ;; For BMI support
+  UNSPEC_TZCNT
   UNSPEC_BEXTR
 
   ;; For BMI2 support
@@ -12850,9 +12854,9 @@ 
 
 (define_expand "ctz<mode>2"
   [(parallel
-    [(set (match_operand:SWI248 0 "register_operand")
-	  (ctz:SWI248
-	    (match_operand:SWI248 1 "nonimmediate_operand")))
+    [(set (match_operand:SWI48 0 "register_operand")
+	  (ctz:SWI48
+	    (match_operand:SWI48 1 "nonimmediate_operand")))
      (clobber (reg:CC FLAGS_REG))])])
 
 ; False dependency happens when destination is only updated by tzcnt,
@@ -12900,8 +12904,8 @@ 
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*ctz<mode>2"
-  [(set (match_operand:SWI248 0 "register_operand" "=r")
-	(ctz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "rm")))
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(ctz:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
    (clobber (reg:CC FLAGS_REG))]
   ""
 {
@@ -12926,15 +12930,78 @@ 
        (const_string "0")))
    (set_attr "mode" "<MODE>")])
 
+;; Version of tzcnt that is expanded from intrinsics.  This version provides
+;; operand size as output when source operand is zero. 
+
+(define_expand "bmi_tzcnt_<mode>"
+  [(parallel
+    [(set (match_operand:SWI248 0 "register_operand")
+	  (unspec:SWI248
+	    [(match_operand:SWI248 1 "nonimmediate_operand")]
+	    UNSPEC_TZCNT))
+     (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_BMI")
+
+; False dependency happens when destination is only updated by tzcnt,
+; lzcnt or popcnt.  There is no false dependency when destination is
+; also used in source.
+(define_insn_and_split "*bmi_tzcnt_<mode>_falsedep_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(unspec:SWI48
+	  [(match_operand:SWI48 1 "nonimmediate_operand" "rm")]
+	  UNSPEC_TZCNT))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI
+   && TARGET_AVOID_FALSE_DEP_FOR_BMI && optimize_function_for_speed_p (cfun)"
+  "#"
+  "&& reload_completed"
+  [(parallel
+    [(set (match_dup 0)
+	  (unspec:SWI48 [(match_dup 1)] UNSPEC_TZCNT))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)
+     (clobber (reg:CC FLAGS_REG))])]
+{
+  if (!reg_mentioned_p (operands[0], operands[1]))
+    ix86_expand_clear (operands[0]);
+})
+
+(define_insn "*bmi_tzcnt_<mode>_falsedep"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(unspec:SWI48
+	  [(match_operand:SWI48 1 "nonimmediate_operand" "rm")]
+	  UNSPEC_TZCNT))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+	   UNSPEC_INSN_FALSE_DEP)
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI"
+  "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*bmi_tzcnt_<mode>"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+	(unspec:SWI248
+	  [(match_operand:SWI248 1 "nonimmediate_operand" "rm")]
+	  UNSPEC_TZCNT))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI"
+  "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "<MODE>")])
+
 (define_expand "clz<mode>2"
   [(parallel
-     [(set (match_operand:SWI248 0 "register_operand")
-	   (minus:SWI248
+     [(set (match_operand:SWI48 0 "register_operand")
+	   (minus:SWI48
 	     (match_dup 2)
-	     (clz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand"))))
+	     (clz:SWI48 (match_operand:SWI48 1 "nonimmediate_operand"))))
       (clobber (reg:CC FLAGS_REG))])
    (parallel
-     [(set (match_dup 0) (xor:SWI248 (match_dup 0) (match_dup 2)))
+     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
   ""
 {
@@ -12948,9 +13015,9 @@ 
 
 (define_expand "clz<mode>2_lzcnt"
   [(parallel
-    [(set (match_operand:SWI248 0 "register_operand")
-	  (clz:SWI248
-	    (match_operand:SWI248 1 "nonimmediate_operand")))
+    [(set (match_operand:SWI48 0 "register_operand")
+	  (clz:SWI48
+	    (match_operand:SWI48 1 "nonimmediate_operand")))
      (clobber (reg:CC FLAGS_REG))])]
   "TARGET_LZCNT")
 
@@ -12987,8 +13054,8 @@ 
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*clz<mode>2_lzcnt"
-  [(set (match_operand:SWI248 0 "register_operand" "=r")
-	(clz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "rm")))
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(clz:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_LZCNT"
   "lzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
@@ -12996,6 +13063,69 @@ 
    (set_attr "type" "bitmanip")
    (set_attr "mode" "<MODE>")])
 
+;; Version of lzcnt that is expanded from intrinsics.  This version provides
+;; operand size as output when source operand is zero. 
+
+(define_expand "lzcnt_<mode>"
+  [(parallel
+    [(set (match_operand:SWI248 0 "register_operand")
+	  (unspec:SWI248
+	    [(match_operand:SWI248 1 "nonimmediate_operand")]
+	    UNSPEC_LZCNT))
+     (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_LZCNT")
+
+; False dependency happens when destination is only updated by tzcnt,
+; lzcnt or popcnt.  There is no false dependency when destination is
+; also used in source.
+(define_insn_and_split "*lzcnt_<mode>_falsedep_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(unspec:SWI48
+	  [(match_operand:SWI48 1 "nonimmediate_operand" "rm")]
+	  UNSPEC_LZCNT))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_LZCNT
+   && TARGET_AVOID_FALSE_DEP_FOR_BMI && optimize_function_for_speed_p (cfun)"
+  "#"
+  "&& reload_completed"
+  [(parallel
+    [(set (match_dup 0)
+	  (unspec:SWI48 [(match_dup 1)] UNSPEC_LZCNT))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)
+     (clobber (reg:CC FLAGS_REG))])]
+{
+  if (!reg_mentioned_p (operands[0], operands[1]))
+    ix86_expand_clear (operands[0]);
+})
+
+(define_insn "*lzcnt_<mode>_falsedep"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(unspec:SWI48
+	  [(match_operand:SWI48 1 "nonimmediate_operand" "rm")]
+	  UNSPEC_LZCNT))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+	   UNSPEC_INSN_FALSE_DEP)
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_LZCNT"
+  "lzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*lzcnt_<mode>"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+	(unspec:SWI248
+	  [(match_operand:SWI248 1 "nonimmediate_operand" "rm")]
+	  UNSPEC_LZCNT))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_LZCNT"
+  "lzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "<MODE>")])
+
 ;; BMI instructions.
 (define_insn "*bmi_andn_<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r,r")
Index: config/i386/lzcntintrin.h
===================================================================
--- config/i386/lzcntintrin.h	(revision 241344)
+++ config/i386/lzcntintrin.h	(working copy)
@@ -38,19 +38,19 @@ 
 extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 __lzcnt16 (unsigned short __X)
 {
-  return __builtin_clzs (__X);
+  return __builtin_ia32_lzcnt_u16 (__X);
 }
 
 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 __lzcnt32 (unsigned int __X)
 {
-  return __builtin_clz (__X);
+  return __builtin_ia32_lzcnt_u32 (__X);
 }
 
 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _lzcnt_u32 (unsigned int __X)
 {
-  return __builtin_clz (__X);
+  return __builtin_ia32_lzcnt_u32 (__X);
 }
 
 #ifdef __x86_64__
@@ -57,13 +57,13 @@  _lzcnt_u32 (unsigned int __X)
 extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 __lzcnt64 (unsigned long long __X)
 {
-  return __builtin_clzll (__X);
+  return __builtin_ia32_lzcnt_u64 (__X);
 }
 
 extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _lzcnt_u64 (unsigned long long __X)
 {
-  return __builtin_clzll (__X);
+  return __builtin_ia32_lzcnt_u64 (__X);
 }
 #endif
 
Index: testsuite/gcc.target/i386/pr78037.c
===================================================================
--- testsuite/gcc.target/i386/pr78037.c	(nonexistent)
+++ testsuite/gcc.target/i386/pr78037.c	(working copy)
@@ -0,0 +1,21 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target bmi } */
+/* { dg-options "-O2 -mbmi" } */
+
+#include <x86intrin.h>
+
+#include "bmi-check.h"
+
+int
+__attribute__((noinline, noclone))
+foo (int x)
+{
+  return __tzcnt_u32 (x) & 0x1f;
+}
+
+static void
+bmi_test ()
+{
+  if (foo (0) != 0)
+    abort ();
+}