
i386: Add GENERIC and GIMPLE folders of __builtin_ia32_{min,max}* [PR116738]

Message ID ZvLxzxHCztA2rDOs@tucnak
State New
Series i386: Add GENERIC and GIMPLE folders of __builtin_ia32_{min,max}* [PR116738]

Commit Message

Jakub Jelinek Sept. 24, 2024, 5:07 p.m. UTC
Hi!

The following patch adds GENERIC and GIMPLE folders for various
x86 min/max builtins.
As discussed, these builtins effectively have x < y ? x : y
(or x > y ? x : y) behavior.
The GENERIC folding is done only when all the (relevant) arguments are
constants (such as VECTOR_CST for vectors); it is done there because
the GIMPLE folding can't easily handle masking, rounding and the
ss/sd cases (in a way that would be pattern recognized back to the
corresponding instructions).  The GIMPLE folding is also done only
for TARGET_SSE4_1 or later when optimizing, otherwise the resulting
comparison and VEC_COND_EXPR are apparently not matched back.
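
As a concrete illustration, a minimal sketch of the intended effect
(not part of the patch; the function name is just for illustration,
assuming -O2 -msse4.1):

  #include <x86intrin.h>

  __m128
  fold_min (__m128 x, __m128 y)
  {
    /* The builtin behind _mm_min_ps is folded in GIMPLE to a
       comparison plus VEC_COND_EXPR, i.e. x < y ? x : y elementwise,
       which should be pattern recognized back to a single minps.  */
    return _mm_min_ps (x, y);
  }

With constant operands the GENERIC folder should evaluate the result
at compile time, e.g. _mm_min_ps (_mm_set1_ps (1.f), _mm_set1_ps (2.f))
folding to the vector constant { 1.0f, 1.0f, 1.0f, 1.0f }.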

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-09-24  Jakub Jelinek  <jakub@redhat.com>

	PR target/116738
	* config/i386/i386.cc (ix86_fold_builtin): Handle
	IX86_BUILTIN_M{IN,AX}{S,P}{S,H,D}*.
	(ix86_gimple_fold_builtin): Handle IX86_BUILTIN_M{IN,AX}P{S,H,D}*.

	* gcc.target/i386/avx512f-pr116738-1.c: New test.
	* gcc.target/i386/avx512f-pr116738-2.c: New test.


	Jakub

Comments

Hongtao Liu Sept. 25, 2024, 2:17 a.m. UTC | #1
On Wed, Sep 25, 2024 at 1:07 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> The following patch adds GENERIC and GIMPLE folders for various
> x86 min/max builtins.
> As discussed, these builtins effectively have x < y ? x : y
> (or x > y ? x : y) behavior.
> The GENERIC folding is done only when all the (relevant) arguments are
> constants (such as VECTOR_CST for vectors); it is done there because
> the GIMPLE folding can't easily handle masking, rounding and the
> ss/sd cases (in a way that would be pattern recognized back to the
> corresponding instructions).  The GIMPLE folding is also done only
> for TARGET_SSE4_1 or later when optimizing, otherwise the resulting
> comparison and VEC_COND_EXPR are apparently not matched back.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2024-09-24  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/116738
>         * config/i386/i386.cc (ix86_fold_builtin): Handle
>         IX86_BUILTIN_M{IN,AX}{S,P}{S,H,D}*.
>         (ix86_gimple_fold_builtin): Handle IX86_BUILTIN_M{IN,AX}P{S,H,D}*.
>
>         * gcc.target/i386/avx512f-pr116738-1.c: New test.
>         * gcc.target/i386/avx512f-pr116738-2.c: New test.
>
> --- gcc/config/i386/i386.cc.jj  2024-09-12 10:56:57.344683959 +0200
> +++ gcc/config/i386/i386.cc     2024-09-23 15:15:40.154783766 +0200
> @@ -18507,6 +18507,8 @@ ix86_fold_builtin (tree fndecl, int n_ar
>         = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl);
>        enum rtx_code rcode;
>        bool is_vshift;
> +      enum tree_code tcode;
> +      bool is_scalar;
>        unsigned HOST_WIDE_INT mask;
>
>        switch (fn_code)
> @@ -18956,6 +18958,133 @@ ix86_fold_builtin (tree fndecl, int n_ar
>             }
>           break;
>
> +       case IX86_BUILTIN_MINSS:
> +       case IX86_BUILTIN_MINSH_MASK:
> +         tcode = LT_EXPR;
> +         is_scalar = true;
> +         goto do_minmax;
> +
> +       case IX86_BUILTIN_MAXSS:
> +       case IX86_BUILTIN_MAXSH_MASK:
> +         tcode = GT_EXPR;
> +         is_scalar = true;
> +         goto do_minmax;
> +
> +       case IX86_BUILTIN_MINPS:
> +       case IX86_BUILTIN_MINPD:
> +       case IX86_BUILTIN_MINPS256:
> +       case IX86_BUILTIN_MINPD256:
> +       case IX86_BUILTIN_MINPS512:
> +       case IX86_BUILTIN_MINPD512:
> +       case IX86_BUILTIN_MINPS128_MASK:
> +       case IX86_BUILTIN_MINPD128_MASK:
> +       case IX86_BUILTIN_MINPS256_MASK:
> +       case IX86_BUILTIN_MINPD256_MASK:
> +       case IX86_BUILTIN_MINPH128_MASK:
> +       case IX86_BUILTIN_MINPH256_MASK:
> +       case IX86_BUILTIN_MINPH512_MASK:
> +         tcode = LT_EXPR;
> +         is_scalar = false;
> +         goto do_minmax;
> +
> +       case IX86_BUILTIN_MAXPS:
> +       case IX86_BUILTIN_MAXPD:
> +       case IX86_BUILTIN_MAXPS256:
> +       case IX86_BUILTIN_MAXPD256:
> +       case IX86_BUILTIN_MAXPS512:
> +       case IX86_BUILTIN_MAXPD512:
> +       case IX86_BUILTIN_MAXPS128_MASK:
> +       case IX86_BUILTIN_MAXPD128_MASK:
> +       case IX86_BUILTIN_MAXPS256_MASK:
> +       case IX86_BUILTIN_MAXPD256_MASK:
> +       case IX86_BUILTIN_MAXPH128_MASK:
> +       case IX86_BUILTIN_MAXPH256_MASK:
> +       case IX86_BUILTIN_MAXPH512_MASK:
> +         tcode = GT_EXPR;
> +         is_scalar = false;
> +       do_minmax:
> +         gcc_assert (n_args >= 2);
> +         if (TREE_CODE (args[0]) != VECTOR_CST
> +             || TREE_CODE (args[1]) != VECTOR_CST)
> +           break;
> +         mask = HOST_WIDE_INT_M1U;
> +         if (n_args > 2)
> +           {
> +             gcc_assert (n_args >= 4);
> +             /* This is masked minmax.  */
> +             if (TREE_CODE (args[3]) != INTEGER_CST
> +                 || TREE_SIDE_EFFECTS (args[2]))
> +               break;
> +             mask = TREE_INT_CST_LOW (args[3]);
> +             unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
> +             mask |= HOST_WIDE_INT_M1U << elems;
> +             if (mask != HOST_WIDE_INT_M1U
> +                 && TREE_CODE (args[2]) != VECTOR_CST)
> +               break;
> +             if (n_args >= 5)
> +               {
> +                 if (!tree_fits_uhwi_p (args[4]))
> +                   break;
> +                 if (tree_to_uhwi (args[4]) != 4
> +                     && tree_to_uhwi (args[4]) != 8)
> +                   break;
> +               }
> +             if (mask == (HOST_WIDE_INT_M1U << elems))
> +               return args[2];
> +           }
> +         /* Punt on NaNs, unless exceptions are disabled.  */
> +         if (HONOR_NANS (args[0])
> +             && (n_args < 5 || tree_to_uhwi (args[4]) != 8))
> +           for (int i = 0; i < 2; ++i)
> +             {
> +               unsigned count = vector_cst_encoded_nelts (args[i]), j;
> +               for (j = 0; j < count; ++j)
> +                 if (!tree_expr_nan_p (VECTOR_CST_ENCODED_ELT (args[i], j)))
Is this a typo?  I assume you want to check whether the element is a
NaN, i.e. tree_expr_nan_p, not !tree_expr_nan_p?
> +                   break;
> +               if (j < count)
> +                 break;
Also, this break only breaks out of the outer loop (for (int i = 0;
i < 2; i++)), but according to the comment it should break out of the
enclosing switch?
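
Something like the following untested sketch would match the comment
(the has_nan flag is just for illustration, the other names are as in
the patch):

  bool has_nan = false;
  for (int i = 0; i < 2 && !has_nan; ++i)
    {
      unsigned count = vector_cst_encoded_nelts (args[i]);
      for (unsigned j = 0; j < count; ++j)
	if (tree_expr_nan_p (VECTOR_CST_ENCODED_ELT (args[i], j)))
	  {
	    /* Punt: operand i contains a NaN element.  */
	    has_nan = true;
	    break;
	  }
    }
  if (has_nan)
    break;  /* Here break leaves the enclosing switch.  */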
> +             }
> +         {
> +           tree res = const_binop (tcode,
> +                                   truth_type_for (TREE_TYPE (args[0])),
> +                                   args[0], args[1]);
> +           if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
> +             break;
> +           res = fold_ternary (VEC_COND_EXPR, TREE_TYPE (args[0]), res,
> +                               args[0], args[1]);
> +           if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
> +             break;
> +           if (mask != HOST_WIDE_INT_M1U)
> +             {
> +               unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
> +               vec_perm_builder sel (nelts, nelts, 1);
> +               for (unsigned int i = 0; i < nelts; i++)
> +                 if (mask & (HOST_WIDE_INT_1U << i))
> +                   sel.quick_push (i);
> +                 else
> +                   sel.quick_push (nelts + i);
> +               vec_perm_indices indices (sel, 2, nelts);
> +               res = fold_vec_perm (TREE_TYPE (args[0]), res, args[2],
> +                                    indices);
> +               if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
> +                 break;
> +             }
> +           if (is_scalar)
> +             {
> +               unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
> +               vec_perm_builder sel (nelts, nelts, 1);
> +               sel.quick_push (0);
> +               for (unsigned int i = 1; i < nelts; i++)
> +                 sel.quick_push (nelts + i);
> +               vec_perm_indices indices (sel, 2, nelts);
> +               res = fold_vec_perm (TREE_TYPE (args[0]), res, args[0],
> +                                    indices);
> +               if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
> +                 break;
> +             }
> +           return res;
> +         }
> +
>         default:
>           break;
>         }
> @@ -19500,6 +19629,74 @@ ix86_gimple_fold_builtin (gimple_stmt_it
>         gsi_replace (gsi, g, false);
>        }
>        return true;
> +
> +    case IX86_BUILTIN_MINPS:
> +    case IX86_BUILTIN_MINPD:
> +    case IX86_BUILTIN_MINPS256:
> +    case IX86_BUILTIN_MINPD256:
> +    case IX86_BUILTIN_MINPS512:
> +    case IX86_BUILTIN_MINPD512:
> +    case IX86_BUILTIN_MINPS128_MASK:
> +    case IX86_BUILTIN_MINPD128_MASK:
> +    case IX86_BUILTIN_MINPS256_MASK:
> +    case IX86_BUILTIN_MINPD256_MASK:
> +    case IX86_BUILTIN_MINPH128_MASK:
> +    case IX86_BUILTIN_MINPH256_MASK:
> +    case IX86_BUILTIN_MINPH512_MASK:
> +      tcode = LT_EXPR;
> +      goto do_minmax;
> +
> +    case IX86_BUILTIN_MAXPS:
> +    case IX86_BUILTIN_MAXPD:
> +    case IX86_BUILTIN_MAXPS256:
> +    case IX86_BUILTIN_MAXPD256:
> +    case IX86_BUILTIN_MAXPS512:
> +    case IX86_BUILTIN_MAXPD512:
> +    case IX86_BUILTIN_MAXPS128_MASK:
> +    case IX86_BUILTIN_MAXPD128_MASK:
> +    case IX86_BUILTIN_MAXPS256_MASK:
> +    case IX86_BUILTIN_MAXPD256_MASK:
> +    case IX86_BUILTIN_MAXPH128_MASK:
> +    case IX86_BUILTIN_MAXPH256_MASK:
> +    case IX86_BUILTIN_MAXPH512_MASK:
> +      tcode = GT_EXPR;
> +    do_minmax:
> +      gcc_assert (n_args >= 2);
> +      /* Without SSE4.1 we often aren't able to pattern match it back to the
> +        desired instruction.  */
> +      if (!gimple_call_lhs (stmt) || !optimize || !TARGET_SSE4_1)
> +       break;
> +      arg0 = gimple_call_arg (stmt, 0);
> +      arg1 = gimple_call_arg (stmt, 1);
> +      elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +      /* For masked minmax, only optimize if the mask is all ones.  */
> +      if (n_args > 2
> +         && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, 3)))
> +       break;
> +      if (n_args >= 5)
> +       {
> +         tree arg4 = gimple_call_arg (stmt, 4);
> +         if (!tree_fits_uhwi_p (arg4))
> +           break;
> +         if (tree_to_uhwi (arg4) == 4)
> +           /* Ok.  */;
> +         else if (tree_to_uhwi (arg4) != 8)
> +           /* Invalid round argument.  */
> +           break;
> +         else if (HONOR_NANS (arg0))
> +           /* Lowering to comparison would raise exceptions which
> +              shouldn't be raised.  */
> +           break;
> +       }
> +      {
> +       tree type = truth_type_for (TREE_TYPE (arg0));
> +       tree cmpres = gimple_build (&stmts, tcode, type, arg0, arg1);
> +       gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> +       g = gimple_build_assign (gimple_call_lhs (stmt),
> +                                VEC_COND_EXPR, cmpres, arg0, arg1);
> +       gsi_replace (gsi, g, false);
> +      }
> +      return true;
>
>      default:
>        break;
> --- gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c.jj       2024-09-23 15:46:39.402217045 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c  2024-09-23 15:48:35.340623166 +0200
> @@ -0,0 +1,56 @@
> +/* PR target/116738 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512f -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-not "__builtin_ia32_min" "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "__builtin_ia32_max" "optimized" } } */
> +
> +#include <x86intrin.h>
> +
> +void
> +test_pr116738 (void)
> +{
> +  __m512 a = _mm512_setr_ps (1.f, 2.f, 0.f, -0.f, -0.f, 0.f, 5.f, 6.f, 7.f,
> +                            8.f, 9.f, 10.f, 11.f, -__builtin_inff (),
> +                            __builtin_inff (), -42.f);
> +  __m512 b = _mm512_setr_ps (-0.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, 8.f,
> +                            7.f, 10.f, -9.f, 12.f, 0.f, -0.f, 42.f);
> +  __m512 w = _mm512_setr_ps (4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 0.f, 1.f,
> +                            2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f);
> +  __m512 c = _mm512_mask_min_ps (w, -1, a, b);
> +  __m512 d = _mm512_mask_min_ps (w, 18658, a, b);
> +  __m512 e = _mm512_mask_min_ps (w, 54649, a, b);
> +  __m512 f = _mm512_mask_max_ps (w, -1, a, b);
> +  __m512 g = _mm512_mask_max_ps (w, 18658, a, b);
> +  __m512 h = _mm512_mask_max_ps (w, 54649, a, b);
> +  __m128 i = _mm_setr_ps (1.f, 2.f, 0.f, -0.f);
> +  __m128 j = _mm_setr_ps (-0.f, 3.f, -0.f, 0.f);
> +  __m128 k = _mm_min_ss (i, j);
> +  __m128 l = _mm_max_ss (j, i);
> +  __m512 ce = _mm512_setr_ps (-0.f, 2.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, 7.f,
> +                             7.f, 9.f, -9.f, 11.f, -__builtin_inff (),
> +                             -0.f, -42.f);
> +  __m512 de = _mm512_setr_ps (4.f, 2.f, 6.f, 7.f, 8.f, 0.f, 5.f, 5.f, 1.f,
> +                             2.f, 3.f, -9.f, 5.f, 6.f, -0.f, 8.f);
> +  __m512 ee = _mm512_setr_ps (-0.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 7.f,
> +                             2.f, 9.f, 4.f, 11.f, 6.f, -0.f, -42.f);
> +  __m512 fe = _mm512_setr_ps (1.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 6.f, 8.f,
> +                             8.f, 10.f, 10.f, 12.f, 0.f, __builtin_inff (),
> +                             42.f);
> +  __m512 ge = _mm512_setr_ps (4.f, 3.f, 6.f, 7.f, 8.f, 0.f, 5.f, 6.f, 1.f,
> +                             2.f, 3.f, 10.f, 5.f, 6.f, __builtin_inff (),
> +                             8.f);
> +  __m512 he = _mm512_setr_ps (1.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 8.f,
> +                             2.f, 10.f, 4.f, 12.f, 6.f, __builtin_inff (),
> +                             42.f);
> +  __m128 ke = _mm_setr_ps (-0.f, 2.f, 0.f, -0.f);
> +  __m128 le = _mm_setr_ps (1.f, 3.f, -0.f, 0.f);
> +  if (__builtin_memcmp (&c, &ce, sizeof (c))
> +      || __builtin_memcmp (&d, &de, sizeof (d))
> +      || __builtin_memcmp (&e, &ee, sizeof (e))
> +      || __builtin_memcmp (&f, &fe, sizeof (f))
> +      || __builtin_memcmp (&g, &ge, sizeof (g))
> +      || __builtin_memcmp (&h, &he, sizeof (h))
> +      || __builtin_memcmp (&k, &ke, sizeof (k))
> +      || __builtin_memcmp (&l, &le, sizeof (l)))
> +    __builtin_abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c.jj       2024-09-23 15:47:34.876454405 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c  2024-09-23 15:48:43.471511382 +0200
> @@ -0,0 +1,15 @@
> +/* PR target/116738 */
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#define AVX512F
> +#include "avx512f-helper.h"
> +
> +#include "avx512f-pr116738-1.c"
> +
> +void
> +TEST (void)
> +{
> +  test_pr116738 ();
> +}
>
>         Jakub
>

Patch

--- gcc/config/i386/i386.cc.jj	2024-09-12 10:56:57.344683959 +0200
+++ gcc/config/i386/i386.cc	2024-09-23 15:15:40.154783766 +0200
@@ -18507,6 +18507,8 @@  ix86_fold_builtin (tree fndecl, int n_ar
 	= (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl);
       enum rtx_code rcode;
       bool is_vshift;
+      enum tree_code tcode;
+      bool is_scalar;
       unsigned HOST_WIDE_INT mask;
 
       switch (fn_code)
@@ -18956,6 +18958,133 @@  ix86_fold_builtin (tree fndecl, int n_ar
 	    }
 	  break;
 
+	case IX86_BUILTIN_MINSS:
+	case IX86_BUILTIN_MINSH_MASK:
+	  tcode = LT_EXPR;
+	  is_scalar = true;
+	  goto do_minmax;
+
+	case IX86_BUILTIN_MAXSS:
+	case IX86_BUILTIN_MAXSH_MASK:
+	  tcode = GT_EXPR;
+	  is_scalar = true;
+	  goto do_minmax;
+
+	case IX86_BUILTIN_MINPS:
+	case IX86_BUILTIN_MINPD:
+	case IX86_BUILTIN_MINPS256:
+	case IX86_BUILTIN_MINPD256:
+	case IX86_BUILTIN_MINPS512:
+	case IX86_BUILTIN_MINPD512:
+	case IX86_BUILTIN_MINPS128_MASK:
+	case IX86_BUILTIN_MINPD128_MASK:
+	case IX86_BUILTIN_MINPS256_MASK:
+	case IX86_BUILTIN_MINPD256_MASK:
+	case IX86_BUILTIN_MINPH128_MASK:
+	case IX86_BUILTIN_MINPH256_MASK:
+	case IX86_BUILTIN_MINPH512_MASK:
+	  tcode = LT_EXPR;
+	  is_scalar = false;
+	  goto do_minmax;
+
+	case IX86_BUILTIN_MAXPS:
+	case IX86_BUILTIN_MAXPD:
+	case IX86_BUILTIN_MAXPS256:
+	case IX86_BUILTIN_MAXPD256:
+	case IX86_BUILTIN_MAXPS512:
+	case IX86_BUILTIN_MAXPD512:
+	case IX86_BUILTIN_MAXPS128_MASK:
+	case IX86_BUILTIN_MAXPD128_MASK:
+	case IX86_BUILTIN_MAXPS256_MASK:
+	case IX86_BUILTIN_MAXPD256_MASK:
+	case IX86_BUILTIN_MAXPH128_MASK:
+	case IX86_BUILTIN_MAXPH256_MASK:
+	case IX86_BUILTIN_MAXPH512_MASK:
+	  tcode = GT_EXPR;
+	  is_scalar = false;
+	do_minmax:
+	  gcc_assert (n_args >= 2);
+	  if (TREE_CODE (args[0]) != VECTOR_CST
+	      || TREE_CODE (args[1]) != VECTOR_CST)
+	    break;
+	  mask = HOST_WIDE_INT_M1U;
+	  if (n_args > 2)
+	    {
+	      gcc_assert (n_args >= 4);
+	      /* This is masked minmax.  */
+	      if (TREE_CODE (args[3]) != INTEGER_CST
+		  || TREE_SIDE_EFFECTS (args[2]))
+		break;
+	      mask = TREE_INT_CST_LOW (args[3]);
+	      unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+	      mask |= HOST_WIDE_INT_M1U << elems;
+	      if (mask != HOST_WIDE_INT_M1U
+		  && TREE_CODE (args[2]) != VECTOR_CST)
+		break;
+	      if (n_args >= 5)
+		{
+		  if (!tree_fits_uhwi_p (args[4]))
+		    break;
+		  if (tree_to_uhwi (args[4]) != 4
+		      && tree_to_uhwi (args[4]) != 8)
+		    break;
+		}
+	      if (mask == (HOST_WIDE_INT_M1U << elems))
+		return args[2];
+	    }
+	  /* Punt on NaNs, unless exceptions are disabled.  */
+	  if (HONOR_NANS (args[0])
+	      && (n_args < 5 || tree_to_uhwi (args[4]) != 8))
+	    for (int i = 0; i < 2; ++i)
+	      {
+		unsigned count = vector_cst_encoded_nelts (args[i]), j;
+		for (j = 0; j < count; ++j)
+		  if (!tree_expr_nan_p (VECTOR_CST_ENCODED_ELT (args[i], j)))
+		    break;
+		if (j < count)
+		  break;
+	      }
+	  {
+	    tree res = const_binop (tcode,
+				    truth_type_for (TREE_TYPE (args[0])),
+				    args[0], args[1]);
+	    if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+	      break;
+	    res = fold_ternary (VEC_COND_EXPR, TREE_TYPE (args[0]), res,
+				args[0], args[1]);
+	    if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+	      break;
+	    if (mask != HOST_WIDE_INT_M1U)
+	      {
+		unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+		vec_perm_builder sel (nelts, nelts, 1);
+		for (unsigned int i = 0; i < nelts; i++)
+		  if (mask & (HOST_WIDE_INT_1U << i))
+		    sel.quick_push (i);
+		  else
+		    sel.quick_push (nelts + i);
+		vec_perm_indices indices (sel, 2, nelts);
+		res = fold_vec_perm (TREE_TYPE (args[0]), res, args[2],
+				     indices);
+		if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+		  break;
+	      }
+	    if (is_scalar)
+	      {
+		unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+		vec_perm_builder sel (nelts, nelts, 1);
+		sel.quick_push (0);
+		for (unsigned int i = 1; i < nelts; i++)
+		  sel.quick_push (nelts + i);
+		vec_perm_indices indices (sel, 2, nelts);
+		res = fold_vec_perm (TREE_TYPE (args[0]), res, args[0],
+				     indices);
+		if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST)
+		  break;
+	      }
+	    return res;
+	  }
+
 	default:
 	  break;
 	}
@@ -19500,6 +19629,74 @@  ix86_gimple_fold_builtin (gimple_stmt_it
 	gsi_replace (gsi, g, false);
       }
       return true;
+
+    case IX86_BUILTIN_MINPS:
+    case IX86_BUILTIN_MINPD:
+    case IX86_BUILTIN_MINPS256:
+    case IX86_BUILTIN_MINPD256:
+    case IX86_BUILTIN_MINPS512:
+    case IX86_BUILTIN_MINPD512:
+    case IX86_BUILTIN_MINPS128_MASK:
+    case IX86_BUILTIN_MINPD128_MASK:
+    case IX86_BUILTIN_MINPS256_MASK:
+    case IX86_BUILTIN_MINPD256_MASK:
+    case IX86_BUILTIN_MINPH128_MASK:
+    case IX86_BUILTIN_MINPH256_MASK:
+    case IX86_BUILTIN_MINPH512_MASK:
+      tcode = LT_EXPR;
+      goto do_minmax;
+
+    case IX86_BUILTIN_MAXPS:
+    case IX86_BUILTIN_MAXPD:
+    case IX86_BUILTIN_MAXPS256:
+    case IX86_BUILTIN_MAXPD256:
+    case IX86_BUILTIN_MAXPS512:
+    case IX86_BUILTIN_MAXPD512:
+    case IX86_BUILTIN_MAXPS128_MASK:
+    case IX86_BUILTIN_MAXPD128_MASK:
+    case IX86_BUILTIN_MAXPS256_MASK:
+    case IX86_BUILTIN_MAXPD256_MASK:
+    case IX86_BUILTIN_MAXPH128_MASK:
+    case IX86_BUILTIN_MAXPH256_MASK:
+    case IX86_BUILTIN_MAXPH512_MASK:
+      tcode = GT_EXPR;
+    do_minmax:
+      gcc_assert (n_args >= 2);
+      /* Without SSE4.1 we often aren't able to pattern match it back to the
+	 desired instruction.  */
+      if (!gimple_call_lhs (stmt) || !optimize || !TARGET_SSE4_1)
+	break;
+      arg0 = gimple_call_arg (stmt, 0);
+      arg1 = gimple_call_arg (stmt, 1);
+      elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+      /* For masked minmax, only optimize if the mask is all ones.  */
+      if (n_args > 2
+	  && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, 3)))
+	break;
+      if (n_args >= 5)
+	{
+	  tree arg4 = gimple_call_arg (stmt, 4);
+	  if (!tree_fits_uhwi_p (arg4))
+	    break;
+	  if (tree_to_uhwi (arg4) == 4)
+	    /* Ok.  */;
+	  else if (tree_to_uhwi (arg4) != 8)
+	    /* Invalid round argument.  */
+	    break;
+	  else if (HONOR_NANS (arg0))
+	    /* Lowering to comparison would raise exceptions which
+	       shouldn't be raised.  */
+	    break;
+	}
+      {
+	tree type = truth_type_for (TREE_TYPE (arg0));
+	tree cmpres = gimple_build (&stmts, tcode, type, arg0, arg1);
+	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+	g = gimple_build_assign (gimple_call_lhs (stmt),
+				 VEC_COND_EXPR, cmpres, arg0, arg1);
+	gsi_replace (gsi, g, false);
+      }
+      return true;
 
     default:
       break;
--- gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c.jj	2024-09-23 15:46:39.402217045 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-pr116738-1.c	2024-09-23 15:48:35.340623166 +0200
@@ -0,0 +1,56 @@ 
+/* PR target/116738 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not "__builtin_ia32_min" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "__builtin_ia32_max" "optimized" } } */
+
+#include <x86intrin.h>
+
+void
+test_pr116738 (void)
+{
+  __m512 a = _mm512_setr_ps (1.f, 2.f, 0.f, -0.f, -0.f, 0.f, 5.f, 6.f, 7.f,
+			     8.f, 9.f, 10.f, 11.f, -__builtin_inff (),
+			     __builtin_inff (), -42.f);
+  __m512 b = _mm512_setr_ps (-0.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, 8.f,
+			     7.f, 10.f, -9.f, 12.f, 0.f, -0.f, 42.f);
+  __m512 w = _mm512_setr_ps (4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 0.f, 1.f,
+			     2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f);
+  __m512 c = _mm512_mask_min_ps (w, -1, a, b);
+  __m512 d = _mm512_mask_min_ps (w, 18658, a, b);
+  __m512 e = _mm512_mask_min_ps (w, 54649, a, b);
+  __m512 f = _mm512_mask_max_ps (w, -1, a, b);
+  __m512 g = _mm512_mask_max_ps (w, 18658, a, b);
+  __m512 h = _mm512_mask_max_ps (w, 54649, a, b);
+  __m128 i = _mm_setr_ps (1.f, 2.f, 0.f, -0.f);
+  __m128 j = _mm_setr_ps (-0.f, 3.f, -0.f, 0.f);
+  __m128 k = _mm_min_ss (i, j);
+  __m128 l = _mm_max_ss (j, i);
+  __m512 ce = _mm512_setr_ps (-0.f, 2.f, -0.f, 0.f, -0.f, 0.f, 5.f, 5.f, 7.f,
+			      7.f, 9.f, -9.f, 11.f, -__builtin_inff (),
+			      -0.f, -42.f);
+  __m512 de = _mm512_setr_ps (4.f, 2.f, 6.f, 7.f, 8.f, 0.f, 5.f, 5.f, 1.f,
+			      2.f, 3.f, -9.f, 5.f, 6.f, -0.f, 8.f); 
+  __m512 ee = _mm512_setr_ps (-0.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 7.f,
+			      2.f, 9.f, 4.f, 11.f, 6.f, -0.f, -42.f);
+  __m512 fe = _mm512_setr_ps (1.f, 3.f, -0.f, 0.f, -0.f, 0.f, 5.f, 6.f, 8.f,
+			      8.f, 10.f, 10.f, 12.f, 0.f, __builtin_inff (),
+			      42.f);
+  __m512 ge = _mm512_setr_ps (4.f, 3.f, 6.f, 7.f, 8.f, 0.f, 5.f, 6.f, 1.f,
+			      2.f, 3.f, 10.f, 5.f, 6.f, __builtin_inff (),
+			      8.f);
+  __m512 he = _mm512_setr_ps (1.f, 5.f, 6.f, 0.f, -0.f, 0.f, 5.f, 0.f, 8.f,
+			      2.f, 10.f, 4.f, 12.f, 6.f, __builtin_inff (),
+			      42.f);
+  __m128 ke = _mm_setr_ps (-0.f, 2.f, 0.f, -0.f);
+  __m128 le = _mm_setr_ps (1.f, 3.f, -0.f, 0.f);
+  if (__builtin_memcmp (&c, &ce, sizeof (c))
+      || __builtin_memcmp (&d, &de, sizeof (d))
+      || __builtin_memcmp (&e, &ee, sizeof (e))
+      || __builtin_memcmp (&f, &fe, sizeof (f))
+      || __builtin_memcmp (&g, &ge, sizeof (g))
+      || __builtin_memcmp (&h, &he, sizeof (h))
+      || __builtin_memcmp (&k, &ke, sizeof (k))
+      || __builtin_memcmp (&l, &le, sizeof (l)))
+    __builtin_abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c.jj	2024-09-23 15:47:34.876454405 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-pr116738-2.c	2024-09-23 15:48:43.471511382 +0200
@@ -0,0 +1,15 @@ 
+/* PR target/116738 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#define AVX512F
+#include "avx512f-helper.h"
+
+#include "avx512f-pr116738-1.c"
+
+void
+TEST (void)
+{
+  test_pr116738 ();
+}