diff mbox series

tree-vect-patterns: Improve __builtin_{clz,ctz,ffs}ll vectorization [PR109011]

Message ID ZD+rh/xdkZfD7Zwe@tucnak
State New
Headers show
Series tree-vect-patterns: Improve __builtin_{clz,ctz,ffs}ll vectorization [PR109011] | expand

Commit Message

Jakub Jelinek April 19, 2023, 8:51 a.m. UTC
Hi!

For __builtin_popcountll tree-vect-patterns.cc has
vect_recog_popcount_pattern, which improves the vectorized code.
Without that the vectorization is always multi-type vectorization
in the loop (at least int and long long types) where we emit two
.POPCOUNT calls with long long arguments and int return value and then
widen to long long, so effectively after vectorization do the
V?DImode -> V?DImode popcount twice, then pack the result into V?SImode
and immediately unpack.

The following patch extends that handling to __builtin_{clz,ctz,ffs}ll
builtins as well (as long as there is an optab for them; more to come
laster).

Bootstrapped/regtested on x86_64-linux and i686-linux, plus tested on
the testcase in crosses to powerpc64le-linux and s390x-linux.  Ok
for trunk?

x86 can do __builtin_popcountll with -mavx512vpopcntdq, __builtin_clzll
with -mavx512cd, ppc can do __builtin_popcountll and __builtin_clzll
with -mpower8-vector and __builtin_ctzll with -mpower9-vector, s390
can do __builtin_{popcount,clz,ctz}ll with -march=z13 -mzarch (i.e. VX).

2023-04-19  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/109011
	* tree-vect-patterns.cc (vect_recog_popcount_pattern): Rename to ...
	(vect_recog_popcount_clz_ctz_ffs_pattern): ... this.  Handle also
	CLZ, CTZ and FFS.  Remove vargs variable, use
	gimple_build_call_internal rather than gimple_build_call_internal_vec.
	(vect_vect_recog_func_ptrs): Adjust popcount entry.

	* gcc.dg/vect/pr109011-1.c: New test.


	Jakub

Comments

Richard Biener April 19, 2023, 8:52 a.m. UTC | #1
On Wed, 19 Apr 2023, Jakub Jelinek wrote:

> Hi!
> 
> For __builtin_popcountll tree-vect-patterns.cc has
> vect_recog_popcount_pattern, which improves the vectorized code.
> Without that the vectorization is always multi-type vectorization
> in the loop (at least int and long long types) where we emit two
> .POPCOUNT calls with long long arguments and int return value and then
> widen to long long, so effectively after vectorization do the
> V?DImode -> V?DImode popcount twice, then pack the result into V?SImode
> and immediately unpack.
> 
> The following patch extends that handling to __builtin_{clz,ctz,ffs}ll
> builtins as well (as long as there is an optab for them; more to come
> laster).
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, plus tested on
> the testcase in crosses to powerpc64le-linux and s390x-linux.  Ok
> for trunk?

OK.

Richard.

> x86 can do __builtin_popcountll with -mavx512vpopcntdq, __builtin_clzll
> with -mavx512cd, ppc can do __builtin_popcountll and __builtin_clzll
> with -mpower8-vector and __builtin_ctzll with -mpower9-vector, s390
> can do __builtin_{popcount,clz,ctz}ll with -march=z13 -mzarch (i.e. VX).
> 
> 2023-04-19  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR tree-optimization/109011
> 	* tree-vect-patterns.cc (vect_recog_popcount_pattern): Rename to ...
> 	(vect_recog_popcount_clz_ctz_ffs_pattern): ... this.  Handle also
> 	CLZ, CTZ and FFS.  Remove vargs variable, use
> 	gimple_build_call_internal rather than gimple_build_call_internal_vec.
> 	(vect_vect_recog_func_ptrs): Adjust popcount entry.
> 
> 	* gcc.dg/vect/pr109011-1.c: New test.
> 
> --- gcc/tree-vect-patterns.cc.jj	2023-03-01 09:51:27.995362601 +0100
> +++ gcc/tree-vect-patterns.cc	2023-04-18 17:16:42.733935262 +0200
> @@ -1501,7 +1501,7 @@ vect_recog_widen_minus_pattern (vec_info
>  				      "vect_recog_widen_minus_pattern");
>  }
>  
> -/* Function vect_recog_popcount_pattern
> +/* Function vect_recog_popcount_clz_ctz_ffs_pattern
>  
>     Try to find the following pattern:
>  
> @@ -1530,16 +1530,20 @@ vect_recog_widen_minus_pattern (vec_info
>     * Return value: A new stmt that will be used to replace the sequence of
>     stmts that constitute the pattern. In this case it will be:
>     B = .POPCOUNT (A);
> +
> +   Similarly for clz, ctz and ffs.
>  */
>  
>  static gimple *
> -vect_recog_popcount_pattern (vec_info *vinfo,
> -			     stmt_vec_info stmt_vinfo, tree *type_out)
> +vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
> +					 stmt_vec_info stmt_vinfo,
> +					 tree *type_out)
>  {
>    gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt);
> -  gimple *popcount_stmt, *pattern_stmt;
> +  gimple *call_stmt, *pattern_stmt;
>    tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var;
> -  auto_vec<tree> vargs;
> +  internal_fn ifn = IFN_LAST;
> +  int addend = 0;
>  
>    /* Find B = (TYPE1) temp_out. */
>    if (!last_stmt)
> @@ -1557,51 +1561,137 @@ vect_recog_popcount_pattern (vec_info *v
>    if (TREE_CODE (rhs_oprnd) != SSA_NAME
>        || !has_single_use (rhs_oprnd))
>      return NULL;
> -  popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
> +  call_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
>  
>    /* Find temp_out = __builtin_popcount{,l,ll} (temp_in);  */
> -  if (!is_gimple_call (popcount_stmt))
> +  if (!is_gimple_call (call_stmt))
>      return NULL;
> -  switch (gimple_call_combined_fn (popcount_stmt))
> +  switch (gimple_call_combined_fn (call_stmt))
>      {
> +      int val;
>      CASE_CFN_POPCOUNT:
> +      ifn = IFN_POPCOUNT;
> +      break;
> +    CASE_CFN_CLZ:
> +      ifn = IFN_CLZ;
> +      /* Punt if call result is unsigned and defined value at zero
> +	 is negative, as the negative value doesn't extend correctly.  */
> +      if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd))
> +	  && gimple_call_internal_p (call_stmt)
> +	  && CLZ_DEFINED_VALUE_AT_ZERO
> +	       (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2
> +	  && val < 0)
> +	return NULL;
> +      break;
> +    CASE_CFN_CTZ:
> +      ifn = IFN_CTZ;
> +      /* Punt if call result is unsigned and defined value at zero
> +	 is negative, as the negative value doesn't extend correctly.  */
> +      if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd))
> +	  && gimple_call_internal_p (call_stmt)
> +	  && CTZ_DEFINED_VALUE_AT_ZERO
> +	       (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2
> +	  && val < 0)
> +	return NULL;
> +      break;
> +    CASE_CFN_FFS:
> +      ifn = IFN_FFS;
>        break;
>      default:
>        return NULL;
>      }
>  
> -  if (gimple_call_num_args (popcount_stmt) != 1)
> +  if (gimple_call_num_args (call_stmt) != 1)
>      return NULL;
>  
> -  rhs_oprnd = gimple_call_arg (popcount_stmt, 0);
> +  rhs_oprnd = gimple_call_arg (call_stmt, 0);
>    vect_unpromoted_value unprom_diff;
> -  rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd,
> -						    &unprom_diff);
> +  rhs_origin
> +    = vect_look_through_possible_promotion (vinfo, rhs_oprnd, &unprom_diff);
>  
>    if (!rhs_origin)
>      return NULL;
>  
> -  /* Input and output of .POPCOUNT should be same-precision integer.
> -     Also A should be unsigned or same precision as temp_in,
> -     otherwise there would be sign_extend from A to temp_in.  */
> -  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)
> -      || (!TYPE_UNSIGNED (unprom_diff.type)
> -	  && (TYPE_PRECISION (unprom_diff.type)
> -	      != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))))
> +  /* Input and output of .POPCOUNT should be same-precision integer.  */
> +  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type))
>      return NULL;
> -  vargs.safe_push (unprom_diff.op);
>  
> -  vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt);
> +  /* Also A should be unsigned or same precision as temp_in, otherwise
> +     different builtins/internal functions have different behaviors.  */
> +  if (TYPE_PRECISION (unprom_diff.type)
> +      != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))
> +    switch (ifn)
> +      {
> +      case IFN_POPCOUNT:
> +	/* For popcount require zero extension, which doesn't add any
> +	   further bits to the count.  */
> +	if (!TYPE_UNSIGNED (unprom_diff.type))
> +	  return NULL;
> +	break;
> +      case IFN_CLZ:
> +	/* clzll (x) == clz (x) + 32 for unsigned x != 0, so ok
> +	   if it is undefined at zero or if it matches also for the
> +	   defined value there.  */
> +	if (!TYPE_UNSIGNED (unprom_diff.type))
> +	  return NULL;
> +	if (!type_has_mode_precision_p (lhs_type)
> +	    || !type_has_mode_precision_p (TREE_TYPE (rhs_oprnd)))
> +	  return NULL;
> +	addend = (TYPE_PRECISION (TREE_TYPE (rhs_oprnd))
> +		  - TYPE_PRECISION (lhs_type));
> +	if (gimple_call_internal_p (call_stmt))
> +	  {
> +	    int val1, val2;
> +	    int d1
> +	      = CLZ_DEFINED_VALUE_AT_ZERO
> +		  (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1);
> +	    int d2
> +	      = CLZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type),
> +					   val2);
> +	    if (d1 != 2)
> +	      break;
> +	    if (d2 != 2 || val1 != val2 + addend)
> +	      return NULL;
> +	  }
> +	break;
> +      case IFN_CTZ:
> +	/* ctzll (x) == ctz (x) for unsigned or signed x != 0, so ok
> +	   if it is undefined at zero or if it matches also for the
> +	   defined value there.  */
> +	if (gimple_call_internal_p (call_stmt))
> +	  {
> +	    int val1, val2;
> +	    int d1
> +	      = CTZ_DEFINED_VALUE_AT_ZERO
> +		  (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1);
> +	    int d2
> +	      = CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type),
> +					   val2);
> +	    if (d1 != 2)
> +	      break;
> +	    if (d2 != 2 || val1 != val2)
> +	      return NULL;
> +	  }
> +	break;
> +      case IFN_FFS:
> +	/* ffsll (x) == ffs (x) for unsigned or signed x.  */
> +	break;
> +      default:
> +	gcc_unreachable ();
> +      }
> +
> +  vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
> +			 call_stmt);
>    vec_type = get_vectype_for_scalar_type (vinfo, lhs_type);
> -  /* Do it only if the backend has popcount<vector_mode>2 pattern.  */
> +  /* Do it only if the backend has popcount<vector_mode>2 etc. pattern.  */
>    if (!vec_type
> -      || !direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
> +      || !direct_internal_fn_supported_p (ifn, vec_type,
>  					  OPTIMIZE_FOR_SPEED))
>      return NULL;
>  
>    /* Create B = .POPCOUNT (A).  */
>    new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
> -  pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs);
> +  pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
>    gimple_call_set_lhs (pattern_stmt, new_var);
>    gimple_set_location (pattern_stmt, gimple_location (last_stmt));
>    *type_out = vec_type;
> @@ -1609,6 +1699,14 @@ vect_recog_popcount_pattern (vec_info *v
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
>  		     "created pattern stmt: %G", pattern_stmt);
> +
> +  if (addend)
> +    {
> +      append_pattern_def_seq (vinfo, stmt_vinfo, pattern_stmt, vec_type);
> +      tree ret_var = vect_recog_temp_ssa_var (lhs_type, NULL);
> +      pattern_stmt = gimple_build_assign (ret_var, PLUS_EXPR, new_var,
> +					  build_int_cst (lhs_type, addend));
> +    }
>    return pattern_stmt;
>  }
>  
> @@ -6051,7 +6149,7 @@ static vect_recog_func vect_vect_recog_f
>    { vect_recog_sad_pattern, "sad" },
>    { vect_recog_widen_sum_pattern, "widen_sum" },
>    { vect_recog_pow_pattern, "pow" },
> -  { vect_recog_popcount_pattern, "popcount" },
> +  { vect_recog_popcount_clz_ctz_ffs_pattern, "popcount_clz_ctz_ffs" },
>    { vect_recog_widen_shift_pattern, "widen_shift" },
>    { vect_recog_rotate_pattern, "rotate" },
>    { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },
> --- gcc/testsuite/gcc.dg/vect/pr109011-1.c.jj	2023-04-18 14:40:47.117397908 +0200
> +++ gcc/testsuite/gcc.dg/vect/pr109011-1.c	2023-04-18 14:40:05.124004362 +0200
> @@ -0,0 +1,48 @@
> +/* PR tree-optimization/109011 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-unroll-loops --param=vect-epilogues-nomask=0 -fdump-tree-optimized" } */
> +/* { dg-additional-options "-mavx512cd" { target { { i?86-*-* x86_64-*-* } && avx512cd } } } */
> +/* { dg-additional-options "-mavx512vpopcntdq" { target { { i?86-*-* x86_64-*-* } && avx512vpopcntdq } } } */
> +/* { dg-additional-options "-mpower8-vector" { target powerpc_p8vector_ok } } */
> +/* { dg-additional-options "-mpower9-vector" { target powerpc_p9vector_ok } } */
> +/* { dg-additional-options "-march=z13 -mzarch" { target s390_vx } } */
> +
> +void
> +foo (long long *p, long long *q)
> +{
> +#pragma omp simd
> +  for (int i = 0; i < 2048; ++i)
> +    p[i] = __builtin_popcountll (q[i]);
> +}
> +
> +/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { target { { i?86-*-* x86_64-*-* } && avx512vpopcntdq } } } } */
> +/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { target { powerpc_p8vector_ok || s390_vx } } } } */
> +
> +void
> +bar (long long *p, long long *q)
> +{
> +#pragma omp simd
> +  for (int i = 0; i < 2048; ++i)
> +    p[i] = __builtin_clzll (q[i]);
> +}
> +
> +/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target { { i?86-*-* x86_64-*-* } && avx512cd } } } } */
> +/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target { powerpc_p8vector_ok || s390_vx } } } } */
> +
> +void
> +baz (long long *p, long long *q)
> +{
> +#pragma omp simd
> +  for (int i = 0; i < 2048; ++i)
> +    p[i] = __builtin_ctzll (q[i]);
> +}
> +
> +/* { dg-final { scan-tree-dump-times " = \.CTZ \\\(" 1 "optimized" { target { powerpc_p9vector_ok || s390_vx } } } } */
> +
> +void
> +qux (long long *p, long long *q)
> +{
> +#pragma omp simd
> +  for (int i = 0; i < 2048; ++i)
> +    p[i] = __builtin_ffsll (q[i]);
> +}
> 
> 	Jakub
> 
>
diff mbox series

Patch

--- gcc/tree-vect-patterns.cc.jj	2023-03-01 09:51:27.995362601 +0100
+++ gcc/tree-vect-patterns.cc	2023-04-18 17:16:42.733935262 +0200
@@ -1501,7 +1501,7 @@  vect_recog_widen_minus_pattern (vec_info
 				      "vect_recog_widen_minus_pattern");
 }
 
-/* Function vect_recog_popcount_pattern
+/* Function vect_recog_popcount_clz_ctz_ffs_pattern
 
    Try to find the following pattern:
 
@@ -1530,16 +1530,20 @@  vect_recog_widen_minus_pattern (vec_info
    * Return value: A new stmt that will be used to replace the sequence of
    stmts that constitute the pattern. In this case it will be:
    B = .POPCOUNT (A);
+
+   Similarly for clz, ctz and ffs.
 */
 
 static gimple *
-vect_recog_popcount_pattern (vec_info *vinfo,
-			     stmt_vec_info stmt_vinfo, tree *type_out)
+vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
+					 stmt_vec_info stmt_vinfo,
+					 tree *type_out)
 {
   gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt);
-  gimple *popcount_stmt, *pattern_stmt;
+  gimple *call_stmt, *pattern_stmt;
   tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var;
-  auto_vec<tree> vargs;
+  internal_fn ifn = IFN_LAST;
+  int addend = 0;
 
   /* Find B = (TYPE1) temp_out. */
   if (!last_stmt)
@@ -1557,51 +1561,137 @@  vect_recog_popcount_pattern (vec_info *v
   if (TREE_CODE (rhs_oprnd) != SSA_NAME
       || !has_single_use (rhs_oprnd))
     return NULL;
-  popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
+  call_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
 
   /* Find temp_out = __builtin_popcount{,l,ll} (temp_in);  */
-  if (!is_gimple_call (popcount_stmt))
+  if (!is_gimple_call (call_stmt))
     return NULL;
-  switch (gimple_call_combined_fn (popcount_stmt))
+  switch (gimple_call_combined_fn (call_stmt))
     {
+      int val;
     CASE_CFN_POPCOUNT:
+      ifn = IFN_POPCOUNT;
+      break;
+    CASE_CFN_CLZ:
+      ifn = IFN_CLZ;
+      /* Punt if call result is unsigned and defined value at zero
+	 is negative, as the negative value doesn't extend correctly.  */
+      if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd))
+	  && gimple_call_internal_p (call_stmt)
+	  && CLZ_DEFINED_VALUE_AT_ZERO
+	       (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2
+	  && val < 0)
+	return NULL;
+      break;
+    CASE_CFN_CTZ:
+      ifn = IFN_CTZ;
+      /* Punt if call result is unsigned and defined value at zero
+	 is negative, as the negative value doesn't extend correctly.  */
+      if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd))
+	  && gimple_call_internal_p (call_stmt)
+	  && CTZ_DEFINED_VALUE_AT_ZERO
+	       (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2
+	  && val < 0)
+	return NULL;
+      break;
+    CASE_CFN_FFS:
+      ifn = IFN_FFS;
       break;
     default:
       return NULL;
     }
 
-  if (gimple_call_num_args (popcount_stmt) != 1)
+  if (gimple_call_num_args (call_stmt) != 1)
     return NULL;
 
-  rhs_oprnd = gimple_call_arg (popcount_stmt, 0);
+  rhs_oprnd = gimple_call_arg (call_stmt, 0);
   vect_unpromoted_value unprom_diff;
-  rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd,
-						    &unprom_diff);
+  rhs_origin
+    = vect_look_through_possible_promotion (vinfo, rhs_oprnd, &unprom_diff);
 
   if (!rhs_origin)
     return NULL;
 
-  /* Input and output of .POPCOUNT should be same-precision integer.
-     Also A should be unsigned or same precision as temp_in,
-     otherwise there would be sign_extend from A to temp_in.  */
-  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)
-      || (!TYPE_UNSIGNED (unprom_diff.type)
-	  && (TYPE_PRECISION (unprom_diff.type)
-	      != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))))
+  /* Input and output of .POPCOUNT should be same-precision integer.  */
+  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type))
     return NULL;
-  vargs.safe_push (unprom_diff.op);
 
-  vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt);
+  /* Also A should be unsigned or same precision as temp_in, otherwise
+     different builtins/internal functions have different behaviors.  */
+  if (TYPE_PRECISION (unprom_diff.type)
+      != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))
+    switch (ifn)
+      {
+      case IFN_POPCOUNT:
+	/* For popcount require zero extension, which doesn't add any
+	   further bits to the count.  */
+	if (!TYPE_UNSIGNED (unprom_diff.type))
+	  return NULL;
+	break;
+      case IFN_CLZ:
+	/* clzll (x) == clz (x) + 32 for unsigned x != 0, so ok
+	   if it is undefined at zero or if it matches also for the
+	   defined value there.  */
+	if (!TYPE_UNSIGNED (unprom_diff.type))
+	  return NULL;
+	if (!type_has_mode_precision_p (lhs_type)
+	    || !type_has_mode_precision_p (TREE_TYPE (rhs_oprnd)))
+	  return NULL;
+	addend = (TYPE_PRECISION (TREE_TYPE (rhs_oprnd))
+		  - TYPE_PRECISION (lhs_type));
+	if (gimple_call_internal_p (call_stmt))
+	  {
+	    int val1, val2;
+	    int d1
+	      = CLZ_DEFINED_VALUE_AT_ZERO
+		  (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1);
+	    int d2
+	      = CLZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type),
+					   val2);
+	    if (d1 != 2)
+	      break;
+	    if (d2 != 2 || val1 != val2 + addend)
+	      return NULL;
+	  }
+	break;
+      case IFN_CTZ:
+	/* ctzll (x) == ctz (x) for unsigned or signed x != 0, so ok
+	   if it is undefined at zero or if it matches also for the
+	   defined value there.  */
+	if (gimple_call_internal_p (call_stmt))
+	  {
+	    int val1, val2;
+	    int d1
+	      = CTZ_DEFINED_VALUE_AT_ZERO
+		  (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1);
+	    int d2
+	      = CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type),
+					   val2);
+	    if (d1 != 2)
+	      break;
+	    if (d2 != 2 || val1 != val2)
+	      return NULL;
+	  }
+	break;
+      case IFN_FFS:
+	/* ffsll (x) == ffs (x) for unsigned or signed x.  */
+	break;
+      default:
+	gcc_unreachable ();
+      }
+
+  vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
+			 call_stmt);
   vec_type = get_vectype_for_scalar_type (vinfo, lhs_type);
-  /* Do it only if the backend has popcount<vector_mode>2 pattern.  */
+  /* Do it only if the backend has popcount<vector_mode>2 etc. pattern.  */
   if (!vec_type
-      || !direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
+      || !direct_internal_fn_supported_p (ifn, vec_type,
 					  OPTIMIZE_FOR_SPEED))
     return NULL;
 
   /* Create B = .POPCOUNT (A).  */
   new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
-  pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs);
+  pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
   gimple_call_set_lhs (pattern_stmt, new_var);
   gimple_set_location (pattern_stmt, gimple_location (last_stmt));
   *type_out = vec_type;
@@ -1609,6 +1699,14 @@  vect_recog_popcount_pattern (vec_info *v
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "created pattern stmt: %G", pattern_stmt);
+
+  if (addend)
+    {
+      append_pattern_def_seq (vinfo, stmt_vinfo, pattern_stmt, vec_type);
+      tree ret_var = vect_recog_temp_ssa_var (lhs_type, NULL);
+      pattern_stmt = gimple_build_assign (ret_var, PLUS_EXPR, new_var,
+					  build_int_cst (lhs_type, addend));
+    }
   return pattern_stmt;
 }
 
@@ -6051,7 +6149,7 @@  static vect_recog_func vect_vect_recog_f
   { vect_recog_sad_pattern, "sad" },
   { vect_recog_widen_sum_pattern, "widen_sum" },
   { vect_recog_pow_pattern, "pow" },
-  { vect_recog_popcount_pattern, "popcount" },
+  { vect_recog_popcount_clz_ctz_ffs_pattern, "popcount_clz_ctz_ffs" },
   { vect_recog_widen_shift_pattern, "widen_shift" },
   { vect_recog_rotate_pattern, "rotate" },
   { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },
--- gcc/testsuite/gcc.dg/vect/pr109011-1.c.jj	2023-04-18 14:40:47.117397908 +0200
+++ gcc/testsuite/gcc.dg/vect/pr109011-1.c	2023-04-18 14:40:05.124004362 +0200
@@ -0,0 +1,48 @@ 
+/* PR tree-optimization/109011 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-unroll-loops --param=vect-epilogues-nomask=0 -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx512cd" { target { { i?86-*-* x86_64-*-* } && avx512cd } } } */
+/* { dg-additional-options "-mavx512vpopcntdq" { target { { i?86-*-* x86_64-*-* } && avx512vpopcntdq } } } */
+/* { dg-additional-options "-mpower8-vector" { target powerpc_p8vector_ok } } */
+/* { dg-additional-options "-mpower9-vector" { target powerpc_p9vector_ok } } */
+/* { dg-additional-options "-march=z13 -mzarch" { target s390_vx } } */
+
+void
+foo (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcountll (q[i]);
+}
+
+/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { target { { i?86-*-* x86_64-*-* } && avx512vpopcntdq } } } } */
+/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { target { powerpc_p8vector_ok || s390_vx } } } } */
+
+void
+bar (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clzll (q[i]);
+}
+
+/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target { { i?86-*-* x86_64-*-* } && avx512cd } } } } */
+/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target { powerpc_p8vector_ok || s390_vx } } } } */
+
+void
+baz (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ctzll (q[i]);
+}
+
+/* { dg-final { scan-tree-dump-times " = \.CTZ \\\(" 1 "optimized" { target { powerpc_p9vector_ok || s390_vx } } } } */
+
+void
+qux (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffsll (q[i]);
+}