diff mbox series

[v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

Message ID 20240224111229.1536064-1-pan2.li@intel.com
State New
Headers show
Series [v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS | expand

Commit Message

Li, Pan2 Feb. 24, 2024, 11:12 a.m. UTC
From: Pan Li <pan2.li@intel.com>

Hi Richard & Tamar,

Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
understanding is correct for DEF_INTERNAL_INT_EXT_FN.

I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.

Note this patch is a draft for validation, no test are invovled here.

gcc/ChangeLog:

	* builtins.def (BUILT_IN_US_PLUS): Add builtin def.
	(BUILT_IN_US_PLUSIMAX): Ditto.
	(BUILT_IN_US_PLUSL): Ditto.
	(BUILT_IN_US_PLUSLL): Ditto.
	(BUILT_IN_US_PLUSG): Ditto.
	* config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
	func decl for expanding us_plus.
	* config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
	impl for expanding us_plus.
	* config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
	us_plus<mode>3.
	* internal-fn.cc (expand_US_PLUS): Add new func impl to expand
	US_PLUS.
	* internal-fn.def (US_PLUS): Add new INT_EXT_FN.
	* internal-fn.h (expand_US_PLUS): Add new func decl.
	* match.pd: Add new simplify pattern for us_plus.
	* optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.

Signed-off-by: Pan Li <pan2.li@intel.com>
---
 gcc/builtins.def                |  7 +++++
 gcc/config/riscv/riscv-protos.h |  1 +
 gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
 gcc/config/riscv/riscv.md       | 11 ++++++++
 gcc/internal-fn.cc              | 26 +++++++++++++++++++
 gcc/internal-fn.def             |  3 +++
 gcc/internal-fn.h               |  1 +
 gcc/match.pd                    | 17 ++++++++++++
 gcc/optabs.def                  |  2 ++
 9 files changed, 114 insertions(+)

Comments

Tamar Christina Feb. 25, 2024, 9:01 a.m. UTC | #1
Hi Pan,

> From: Pan Li <pan2.li@intel.com>
> 
> Hi Richard & Tamar,
> 
> Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> 
> I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> 

I think a couple of things are being confused here.  So lets break it down:

The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
we only want one internal function for both signed and unsigned SAT_ADD.
with this definition we don't need SAT_UADD and SAT_SADD but instead
we will only have SAT_ADD, which will expand to us_plus or ss_plus.

Now the downside of this is that this is a direct internal optab.  This means
that for the representation to be used the target *must* have the optab
implemented.   This is a bit annoying because it doesn't allow us to generically
assume that all targets use SAT_ADD for saturating add and thus only have to
write optimization for this representation.

This is why Richi said we may need to use a new tree_code because we can
override tree code expansions.  However the same can be done with the _EXT_FN
internal functions.

So what I meant was that we want to have a combination of the two. i.e. a
DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.

If Richi agrees, the below is what I meant. It creates the infrastructure for this
and for now only allows a default fallback for unsigned saturating add and makes
it easier for us to add the rest later

Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are the
RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
need to make new us_plus and ss_plus ones.

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index a07f25f3aee..aaf9f8991b3 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn, tree_pair types,
 	return direct_##TYPE##_optab_supported_p (which_optab, types,	\
 						  opt_type);		\
       }
+#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \
+					 UNSIGNED_OPTAB, TYPE)		\
+    case IFN_##CODE:							\
+      {									\
+	optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)		\
+			     ? UNSIGNED_OPTAB ## _optab			\
+			     : SIGNED_OPTAB ## _optab);			\
+	return direct_##TYPE##_optab_supported_p (which_optab, types,	\
+						  opt_type)		\
+	       || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
+      }
 #include "internal-fn.def"
 
     case IFN_LAST:
@@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
     optab which_optab = direct_internal_fn_optab (fn, types);		\
     expand_##TYPE##_optab_fn (fn, stmt, which_optab);			\
   }
+#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \
+					 UNSIGNED_OPTAB, TYPE)
 #include "internal-fn.def"
 
 /* Routines to expand each internal function, indexed by function number.
@@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
       emit_move_insn (plhs, cmp);
     }
 }
+
+void
+expand_SAT_ADD (internal_fn fn, gcall *stmt)
+{
+  /* Check if the target supports the expansion through an IFN.  */
+  tree_pair types = direct_internal_fn_types (fn, stmt);
+  optab which_optab = direct_internal_fn_optab (fn, types);
+  if (direct_binary_optab_supported_p (which_optab, types,
+				       insn_optimization_type ()))
+    {
+      expand_binary_optab_fn (fn, stmt, which_optab);
+      return;
+    }
+
+  /* Target does not support the optab, but we can de-compose it.  */
+  /*
+  ... decompose to a canonical representation ...
+  if (TYPE_UNSIGNED (types.SELECTOR))
+    {
+      ...
+      decompose back to (X + Y) | - ((X + Y) < X)
+    }
+  else
+    {
+      ...
+    }
+  */
+}
+
+bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /* optype */)
+{
+  /* For now, don't support decomposing vector ops.  */
+  if (VECTOR_TYPE_P (type))
+    return false;
+
+  /* Signed saturating arithmetic is harder to do since we'll so for now
+     lets ignore.  */
+  if (!TYPE_UNSIGNED (type))
+    return false;
+
+  return TREE_CODE (type) == INTEGER_TYPE;
+}
\ No newline at end of file
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index c14d30365c1..5a2491228d5 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
    unsigned inputs respectively, both without the trailing "_optab".
    SELECTOR says which type in the tree_pair determines the signedness.
 
+   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like DEF_INTERNAL_SIGNED_OPTAB_FN, except
+   that it has expand_##NAME defined in internal-fn.cc to override the
+   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
+
    DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
    the function implements the computational part of a built-in math
    function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
@@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
   DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
 #endif
 
+#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
+#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR, SIGNED_OPTAB, \
+				     UNSIGNED_OPTAB, TYPE) \
+  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, SIGNED_OPTAB, \
+				UNSIGNED_OPTAB, TYPE)
+#endif
+
 #ifndef DEF_INTERNAL_FLT_FN
 #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
   DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
@@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
 			      smulhs, umulhs, binary)
 DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
 			      smulhrs, umulhrs, binary)
+DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first,
+				  ssadd, usadd, binary)
 
 DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
 DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
@@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE | ECF_LEAF, ". R . ")
 #undef DEF_INTERNAL_FLT_FN
 #undef DEF_INTERNAL_FLT_FLOATN_FN
 #undef DEF_INTERNAL_SIGNED_OPTAB_FN
+#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
 #undef DEF_INTERNAL_OPTAB_FN
 #undef DEF_INTERNAL_FN
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index bccee1c3e09..dbdb1e6bad2 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall *);
 extern void expand_FLOATTOBITINT (internal_fn, gcall *);
 extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
 extern void expand_POPCOUNT (internal_fn, gcall *);
+extern void expand_SAT_ADD (internal_fn, gcall *);
+extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
 
 extern bool vectorized_internal_fn_supported_p (internal_fn, tree);

> Note this patch is a draft for validation, no test are invovled here.
> 
> gcc/ChangeLog:
> 
> 	* builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> 	(BUILT_IN_US_PLUSIMAX): Ditto.
> 	(BUILT_IN_US_PLUSL): Ditto.
> 	(BUILT_IN_US_PLUSLL): Ditto.
> 	(BUILT_IN_US_PLUSG): Ditto.
> 	* config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> 	func decl for expanding us_plus.
> 	* config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> 	impl for expanding us_plus.
> 	* config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> 	us_plus<mode>3.
> 	* internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> 	US_PLUS.
> 	* internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> 	* internal-fn.h (expand_US_PLUS): Add new func decl.
> 	* match.pd: Add new simplify pattern for us_plus.
> 	* optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> 
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/builtins.def                |  7 +++++
>  gcc/config/riscv/riscv-protos.h |  1 +
>  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
>  gcc/config/riscv/riscv.md       | 11 ++++++++
>  gcc/internal-fn.cc              | 26 +++++++++++++++++++
>  gcc/internal-fn.def             |  3 +++
>  gcc/internal-fn.h               |  1 +
>  gcc/match.pd                    | 17 ++++++++++++
>  gcc/optabs.def                  |  2 ++
>  9 files changed, 114 insertions(+)
> 
> diff --git a/gcc/builtins.def b/gcc/builtins.def
> index f6f3e104f6a..0777b912cfa 100644
> --- a/gcc/builtins.def
> +++ b/gcc/builtins.def
> @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTIMAX,
> "popcountimax", BT_FN_INT_UINTMAX
>  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG,
> ATTR_CONST_NOTHROW_LEAF_LIST)
>  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
>  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg", BT_FN_INT_VAR,
> ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> +
> +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> ATTR_CONST_NOTHROW_LEAF_LIST)
> +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl", BT_FN_INT_ULONG,
> ATTR_CONST_NOTHROW_LEAF_LIST)
> +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> +
>  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
>  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
>  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 80efdf2b7e5..ba6086f1f25 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const tree,
> const char *);
>  extern bool
>  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
>  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> 
>  #ifdef RTX_CODE
>  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr =
> 0);
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 4100abc9dd1..23f08974f07 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> (machine_mode)
>    return true;
>  }
> 
> +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> +void
> +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> +{
> +  machine_mode mode = GET_MODE (dest);
> +  rtx pmode_sum = gen_reg_rtx (Pmode);
> +  rtx pmode_lt = gen_reg_rtx (Pmode);
> +  rtx pmode_x = gen_lowpart (Pmode, x);
> +  rtx pmode_y = gen_lowpart (Pmode, y);
> +  rtx pmode_dest = gen_reg_rtx (Pmode);
> +
> +  /* Step-1: sum = x + y  */
> +  if (mode == SImode && mode != Pmode)
> +    { /* Take addw to avoid the sum truncate.  */
> +      rtx simode_sum = gen_reg_rtx (SImode);
> +      riscv_emit_binary (PLUS, simode_sum, x, y);
> +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> +    }
> +  else
> +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> +
> +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.  */
> +  if (mode == HImode || mode == QImode)
> +    {
> +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> +
> +      gcc_assert (shift_bits > 0);
> +
> +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT (shift_bits));
> +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> (shift_bits));
> +    }
> +
> +  /* Step-2: lt = sum < x  */
> +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> +
> +  /* Step-3: lt = -lt  */
> +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> +
> +  /* Step-4: pmode_dest = sum | lt  */
> +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> +
> +  /* Step-5: dest = pmode_dest */
> +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_ASM_ALIGNED_HI_OP
>  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 3f7a023d941..eaa9867023c 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
>    [(set_attr "type" "load")
>     (set (attr "length") (const_int 8))])
> 
> +(define_expand "us_plus<mode>3"
> +  [(match_operand:ANYI   0 "register_operand")
> +   (match_operand:ANYI   1 "register_operand")
> +   (match_operand:ANYI   2 "register_operand")]
> +  ""
> +  {
> +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> +    DONE;
> +  }
> +)
> +
>  (include "bitmanip.md")
>  (include "crypto.md")
>  (include "sync.md")
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index a07f25f3aee..a7341a57ffa 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
>        emit_move_insn (plhs, cmp);
>      }
>  }
> +
> +void
> +expand_US_PLUS (internal_fn fn, gcall *stmt)
> +{
> +  tree lhs = gimple_call_lhs (stmt);
> +  tree rhs_0 = gimple_call_arg (stmt, 0);
> +  tree rhs_1 = gimple_call_arg (stmt, 1);
> +
> +  do_pending_stack_adjust ();
> +
> +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> +  rtx op_0 = expand_normal (rhs_0);
> +  rtx op_1 = expand_normal (rhs_1);
> +
> +  class expand_operand ops[3];
> +
> +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE (rhs_0)));
> +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE (rhs_1)));
> +
> +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> (rhs_0)));
> +  expand_insn (code, 3, ops);
> +
> +  if (!rtx_equal_p (target, ops[0].value))
> +    emit_move_insn (target, ops[0].value);
> +}

This can be simplified by calling expand_binary_optab_fn instead. See my template

> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index c14d30365c1..b1d7b5a0307 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> ECF_NOTHROW, ffs, unary)
>  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity, unary)
>  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> popcount, unary)
> 
> +/* Binary integer ops.  */
> +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW, us_plus,
> binary)
> +
>  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> ECF_NOTHROW, NULL)
>  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> ECF_NOTHROW, NULL)
>  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW, NULL)
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index bccee1c3e09..46e404b4a49 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn, gcall *);
>  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
>  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
>  extern void expand_POPCOUNT (internal_fn, gcall *);
> +extern void expand_US_PLUS (internal_fn, gcall *);
> 
>  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index c5b6540f939..f45fd58ad23 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -10265,3 +10265,20 @@ and,
>        }
>        (if (full_perm_p)
>  	(vec_perm (op@3 @0 @1) @3 @2))))))
> +
> +#if GIMPLE
> +
> +/* Unsigned saturation add, aka:
> +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> +(simplify
> + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> +   (if (optimize
> +	&& INTEGRAL_TYPE_P (type)
> +	&& TYPE_UNSIGNED (TREE_TYPE (@0))
> +	&& types_match (type, TREE_TYPE (@0))
> +	&& types_match (type, TREE_TYPE (@1))
> +	&& direct_internal_fn_supported_p (IFN_US_PLUS, type,
> OPTIMIZE_FOR_BOTH))
> +   (IFN_US_PLUS @0 @1)))
> +
> +#endif

With the version above you can drop the #if GIMPLE and the 
> +	&& direct_internal_fn_supported_p (IFN_US_PLUS, type,

Check.

Thanks,
Tamar

> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index ad14f9328b9..5855c4e0834 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb", '2',
> gen_int_libfunc)
>  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> gen_int_libfunc)
>  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> 
> +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> gen_int_libfunc)
> +
>  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
>  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2', gen_int_fp_fixed_libfunc)
>  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> --
> 2.34.1
Li, Pan2 Feb. 26, 2024, 6:27 a.m. UTC | #2
Got it, we need to combine that together up to point.
Thanks Tamar for the explanation. Help a lot and will have a try in v3.

Pan

-----Original Message-----
From: Tamar Christina <Tamar.Christina@arm.com> 
Sent: Sunday, February 25, 2024 5:02 PM
To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org
Cc: juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.guenther@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

Hi Pan,

> From: Pan Li <pan2.li@intel.com>
> 
> Hi Richard & Tamar,
> 
> Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> 
> I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> 

I think a couple of things are being confused here.  So lets break it down:

The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
we only want one internal function for both signed and unsigned SAT_ADD.
with this definition we don't need SAT_UADD and SAT_SADD but instead
we will only have SAT_ADD, which will expand to us_plus or ss_plus.

Now the downside of this is that this is a direct internal optab.  This means
that for the representation to be used the target *must* have the optab
implemented.   This is a bit annoying because it doesn't allow us to generically
assume that all targets use SAT_ADD for saturating add and thus only have to
write optimization for this representation.

This is why Richi said we may need to use a new tree_code because we can
override tree code expansions.  However the same can be done with the _EXT_FN
internal functions.

So what I meant was that we want to have a combination of the two. i.e. a
DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.

If Richi agrees, the below is what I meant. It creates the infrastructure for this
and for now only allows a default fallback for unsigned saturating add and makes
it easier for us to add the rest later

Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are the
RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
need to make new us_plus and ss_plus ones.

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index a07f25f3aee..aaf9f8991b3 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn, tree_pair types,
 	return direct_##TYPE##_optab_supported_p (which_optab, types,	\
 						  opt_type);		\
       }
+#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \
+					 UNSIGNED_OPTAB, TYPE)		\
+    case IFN_##CODE:							\
+      {									\
+	optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)		\
+			     ? UNSIGNED_OPTAB ## _optab			\
+			     : SIGNED_OPTAB ## _optab);			\
+	return direct_##TYPE##_optab_supported_p (which_optab, types,	\
+						  opt_type)		\
+	       || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
+      }
 #include "internal-fn.def"
 
     case IFN_LAST:
@@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
     optab which_optab = direct_internal_fn_optab (fn, types);		\
     expand_##TYPE##_optab_fn (fn, stmt, which_optab);			\
   }
+#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \
+					 UNSIGNED_OPTAB, TYPE)
 #include "internal-fn.def"
 
 /* Routines to expand each internal function, indexed by function number.
@@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
       emit_move_insn (plhs, cmp);
     }
 }
+
+void
+expand_SAT_ADD (internal_fn fn, gcall *stmt)
+{
+  /* Check if the target supports the expansion through an IFN.  */
+  tree_pair types = direct_internal_fn_types (fn, stmt);
+  optab which_optab = direct_internal_fn_optab (fn, types);
+  if (direct_binary_optab_supported_p (which_optab, types,
+				       insn_optimization_type ()))
+    {
+      expand_binary_optab_fn (fn, stmt, which_optab);
+      return;
+    }
+
+  /* Target does not support the optab, but we can de-compose it.  */
+  /*
+  ... decompose to a canonical representation ...
+  if (TYPE_UNSIGNED (types.SELECTOR))
+    {
+      ...
+      decompose back to (X + Y) | - ((X + Y) < X)
+    }
+  else
+    {
+      ...
+    }
+  */
+}
+
+bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /* optype */)
+{
+  /* For now, don't support decomposing vector ops.  */
+  if (VECTOR_TYPE_P (type))
+    return false;
+
+  /* Signed saturating arithmetic is harder to do since we'll so for now
+     lets ignore.  */
+  if (!TYPE_UNSIGNED (type))
+    return false;
+
+  return TREE_CODE (type) == INTEGER_TYPE;
+}
\ No newline at end of file
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index c14d30365c1..5a2491228d5 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
    unsigned inputs respectively, both without the trailing "_optab".
    SELECTOR says which type in the tree_pair determines the signedness.
 
+   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like DEF_INTERNAL_SIGNED_OPTAB_FN, except
+   that it has expand_##NAME defined in internal-fn.cc to override the
+   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
+
    DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
    the function implements the computational part of a built-in math
    function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
@@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
   DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
 #endif
 
+#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
+#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR, SIGNED_OPTAB, \
+				     UNSIGNED_OPTAB, TYPE) \
+  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, SIGNED_OPTAB, \
+				UNSIGNED_OPTAB, TYPE)
+#endif
+
 #ifndef DEF_INTERNAL_FLT_FN
 #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
   DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
@@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
 			      smulhs, umulhs, binary)
 DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
 			      smulhrs, umulhrs, binary)
+DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first,
+				  ssadd, usadd, binary)
 
 DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
 DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
@@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE | ECF_LEAF, ". R . ")
 #undef DEF_INTERNAL_FLT_FN
 #undef DEF_INTERNAL_FLT_FLOATN_FN
 #undef DEF_INTERNAL_SIGNED_OPTAB_FN
+#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
 #undef DEF_INTERNAL_OPTAB_FN
 #undef DEF_INTERNAL_FN
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index bccee1c3e09..dbdb1e6bad2 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall *);
 extern void expand_FLOATTOBITINT (internal_fn, gcall *);
 extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
 extern void expand_POPCOUNT (internal_fn, gcall *);
+extern void expand_SAT_ADD (internal_fn, gcall *);
+extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
 
 extern bool vectorized_internal_fn_supported_p (internal_fn, tree);

> Note this patch is a draft for validation, no test are invovled here.
> 
> gcc/ChangeLog:
> 
> 	* builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> 	(BUILT_IN_US_PLUSIMAX): Ditto.
> 	(BUILT_IN_US_PLUSL): Ditto.
> 	(BUILT_IN_US_PLUSLL): Ditto.
> 	(BUILT_IN_US_PLUSG): Ditto.
> 	* config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> 	func decl for expanding us_plus.
> 	* config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> 	impl for expanding us_plus.
> 	* config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> 	us_plus<mode>3.
> 	* internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> 	US_PLUS.
> 	* internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> 	* internal-fn.h (expand_US_PLUS): Add new func decl.
> 	* match.pd: Add new simplify pattern for us_plus.
> 	* optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> 
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/builtins.def                |  7 +++++
>  gcc/config/riscv/riscv-protos.h |  1 +
>  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
>  gcc/config/riscv/riscv.md       | 11 ++++++++
>  gcc/internal-fn.cc              | 26 +++++++++++++++++++
>  gcc/internal-fn.def             |  3 +++
>  gcc/internal-fn.h               |  1 +
>  gcc/match.pd                    | 17 ++++++++++++
>  gcc/optabs.def                  |  2 ++
>  9 files changed, 114 insertions(+)
> 
> diff --git a/gcc/builtins.def b/gcc/builtins.def
> index f6f3e104f6a..0777b912cfa 100644
> --- a/gcc/builtins.def
> +++ b/gcc/builtins.def
> @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTIMAX,
> "popcountimax", BT_FN_INT_UINTMAX
>  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG,
> ATTR_CONST_NOTHROW_LEAF_LIST)
>  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
>  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg", BT_FN_INT_VAR,
> ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> +
> +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> ATTR_CONST_NOTHROW_LEAF_LIST)
> +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl", BT_FN_INT_ULONG,
> ATTR_CONST_NOTHROW_LEAF_LIST)
> +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> +
>  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
>  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
>  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 80efdf2b7e5..ba6086f1f25 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const tree,
> const char *);
>  extern bool
>  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
>  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> 
>  #ifdef RTX_CODE
>  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr =
> 0);
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 4100abc9dd1..23f08974f07 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> (machine_mode)
>    return true;
>  }
> 
> +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> +void
> +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> +{
> +  machine_mode mode = GET_MODE (dest);
> +  rtx pmode_sum = gen_reg_rtx (Pmode);
> +  rtx pmode_lt = gen_reg_rtx (Pmode);
> +  rtx pmode_x = gen_lowpart (Pmode, x);
> +  rtx pmode_y = gen_lowpart (Pmode, y);
> +  rtx pmode_dest = gen_reg_rtx (Pmode);
> +
> +  /* Step-1: sum = x + y  */
> +  if (mode == SImode && mode != Pmode)
> +    { /* Take addw to avoid the sum truncate.  */
> +      rtx simode_sum = gen_reg_rtx (SImode);
> +      riscv_emit_binary (PLUS, simode_sum, x, y);
> +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> +    }
> +  else
> +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> +
> +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.  */
> +  if (mode == HImode || mode == QImode)
> +    {
> +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> +
> +      gcc_assert (shift_bits > 0);
> +
> +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT (shift_bits));
> +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> (shift_bits));
> +    }
> +
> +  /* Step-2: lt = sum < x  */
> +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> +
> +  /* Step-3: lt = -lt  */
> +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> +
> +  /* Step-4: pmode_dest = sum | lt  */
> +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> +
> +  /* Step-5: dest = pmode_dest */
> +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_ASM_ALIGNED_HI_OP
>  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 3f7a023d941..eaa9867023c 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
>    [(set_attr "type" "load")
>     (set (attr "length") (const_int 8))])
> 
> +(define_expand "us_plus<mode>3"
> +  [(match_operand:ANYI   0 "register_operand")
> +   (match_operand:ANYI   1 "register_operand")
> +   (match_operand:ANYI   2 "register_operand")]
> +  ""
> +  {
> +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> +    DONE;
> +  }
> +)
> +
>  (include "bitmanip.md")
>  (include "crypto.md")
>  (include "sync.md")
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index a07f25f3aee..a7341a57ffa 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
>        emit_move_insn (plhs, cmp);
>      }
>  }
> +
> +void
> +expand_US_PLUS (internal_fn fn, gcall *stmt)
> +{
> +  tree lhs = gimple_call_lhs (stmt);
> +  tree rhs_0 = gimple_call_arg (stmt, 0);
> +  tree rhs_1 = gimple_call_arg (stmt, 1);
> +
> +  do_pending_stack_adjust ();
> +
> +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> +  rtx op_0 = expand_normal (rhs_0);
> +  rtx op_1 = expand_normal (rhs_1);
> +
> +  class expand_operand ops[3];
> +
> +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE (rhs_0)));
> +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE (rhs_1)));
> +
> +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> (rhs_0)));
> +  expand_insn (code, 3, ops);
> +
> +  if (!rtx_equal_p (target, ops[0].value))
> +    emit_move_insn (target, ops[0].value);
> +}

This can be simplified by calling expand_binary_optab_fn instead. See my template

> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index c14d30365c1..b1d7b5a0307 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> ECF_NOTHROW, ffs, unary)
>  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity, unary)
>  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> popcount, unary)
> 
> +/* Binary integer ops.  */
> +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW, us_plus,
> binary)
> +
>  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> ECF_NOTHROW, NULL)
>  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> ECF_NOTHROW, NULL)
>  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW, NULL)
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index bccee1c3e09..46e404b4a49 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn, gcall *);
>  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
>  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
>  extern void expand_POPCOUNT (internal_fn, gcall *);
> +extern void expand_US_PLUS (internal_fn, gcall *);
> 
>  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index c5b6540f939..f45fd58ad23 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -10265,3 +10265,20 @@ and,
>        }
>        (if (full_perm_p)
>  	(vec_perm (op@3 @0 @1) @3 @2))))))
> +
> +#if GIMPLE
> +
> +/* Unsigned saturation add, aka:
> +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> +(simplify
> + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> +   (if (optimize
> +	&& INTEGRAL_TYPE_P (type)
> +	&& TYPE_UNSIGNED (TREE_TYPE (@0))
> +	&& types_match (type, TREE_TYPE (@0))
> +	&& types_match (type, TREE_TYPE (@1))
> +	&& direct_internal_fn_supported_p (IFN_US_PLUS, type,
> OPTIMIZE_FOR_BOTH))
> +   (IFN_US_PLUS @0 @1)))
> +
> +#endif

With the version above you can drop the #if GIMPLE and the 
> +	&& direct_internal_fn_supported_p (IFN_US_PLUS, type,

Check.

Thanks,
Tamar

> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index ad14f9328b9..5855c4e0834 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb", '2',
> gen_int_libfunc)
>  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> gen_int_libfunc)
>  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> 
> +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> gen_int_libfunc)
> +
>  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
>  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2', gen_int_fp_fixed_libfunc)
>  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> --
> 2.34.1
Richard Biener Feb. 27, 2024, 9:43 a.m. UTC | #3
On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
<Tamar.Christina@arm.com> wrote:
>
> Hi Pan,
>
> > From: Pan Li <pan2.li@intel.com>
> >
> > Hi Richard & Tamar,
> >
> > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> >
> > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> >
>
> I think a couple of things are being confused here.  So lets break it down:
>
> The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> we only want one internal function for both signed and unsigned SAT_ADD.
> with this definition we don't need SAT_UADD and SAT_SADD but instead
> we will only have SAT_ADD, which will expand to us_plus or ss_plus.
>
> Now the downside of this is that this is a direct internal optab.  This means
> that for the representation to be used the target *must* have the optab
> implemented.   This is a bit annoying because it doesn't allow us to generically
> assume that all targets use SAT_ADD for saturating add and thus only have to
> write optimization for this representation.
>
> This is why Richi said we may need to use a new tree_code because we can
> override tree code expansions.  However the same can be done with the _EXT_FN
> internal functions.
>
> So what I meant was that we want to have a combination of the two. i.e. a
> DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.

Whether we want/need _EXT or only direct depends mainly on how we want to
leverage support.  If it's only during vectorization and possibly instruction
selection a direct optab is IMO the way to go.  Generic optimization only
marginally improves when you explode the number of basic operations you
expose - in fact it gets quite unwieldly to support all of them in
simplifications
and/or canonicalization and you possibly need to translate them back to what
the target CPU supports.

We already do have too many (IMO) "special" operations exposed "early"
in the GIMPLE pipeline.

But what I'd like to see is that we do more instruction selection on GIMPLE
but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
passes doing what I'd call instruction selection).  But that means not adding
match.pd patterns for that or at least have a separate isel-match.pd
machinery for that.

So as a start I would go for a direct optab and see to recognize it during
ISEL?

> If Richi agrees, the below is what I meant. It creates the infrastructure for this
> and for now only allows a default fallback for unsigned saturating add and makes
> it easier for us to add the rest later
>
> Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are the
> RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> need to make new us_plus and ss_plus ones.
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index a07f25f3aee..aaf9f8991b3 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn, tree_pair types,
>         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
>                                                   opt_type);            \
>        }
> +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \
> +                                        UNSIGNED_OPTAB, TYPE)          \
> +    case IFN_##CODE:                                                   \
> +      {                                                                        \
> +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> +                            ? UNSIGNED_OPTAB ## _optab                 \
> +                            : SIGNED_OPTAB ## _optab);                 \
> +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> +                                                 opt_type)             \
> +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> +      }
>  #include "internal-fn.def"
>
>      case IFN_LAST:
> @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
>      optab which_optab = direct_internal_fn_optab (fn, types);          \
>      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
>    }
> +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \
> +                                        UNSIGNED_OPTAB, TYPE)
>  #include "internal-fn.def"
>
>  /* Routines to expand each internal function, indexed by function number.
> @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
>        emit_move_insn (plhs, cmp);
>      }
>  }
> +
> +void
> +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> +{
> +  /* Check if the target supports the expansion through an IFN.  */
> +  tree_pair types = direct_internal_fn_types (fn, stmt);
> +  optab which_optab = direct_internal_fn_optab (fn, types);
> +  if (direct_binary_optab_supported_p (which_optab, types,
> +                                      insn_optimization_type ()))
> +    {
> +      expand_binary_optab_fn (fn, stmt, which_optab);
> +      return;
> +    }
> +
> +  /* Target does not support the optab, but we can de-compose it.  */
> +  /*
> +  ... decompose to a canonical representation ...
> +  if (TYPE_UNSIGNED (types.SELECTOR))
> +    {
> +      ...
> +      decompose back to (X + Y) | - ((X + Y) < X)
> +    }
> +  else
> +    {
> +      ...
> +    }
> +  */
> +}
> +
> +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /* optype */)
> +{
> +  /* For now, don't support decomposing vector ops.  */
> +  if (VECTOR_TYPE_P (type))
> +    return false;
> +
> +  /* Signed saturating arithmetic is harder to do since we'll so for now
> +     lets ignore.  */
> +  if (!TYPE_UNSIGNED (type))
> +    return false;
> +
> +  return TREE_CODE (type) == INTEGER_TYPE;
> +}
> \ No newline at end of file
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index c14d30365c1..5a2491228d5 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
>     unsigned inputs respectively, both without the trailing "_optab".
>     SELECTOR says which type in the tree_pair determines the signedness.
>
> +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like DEF_INTERNAL_SIGNED_OPTAB_FN, except
> +   that it has expand_##NAME defined in internal-fn.cc to override the
> +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> +
>     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
>     the function implements the computational part of a built-in math
>     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
>    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
>  #endif
>
> +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR, SIGNED_OPTAB, \
> +                                    UNSIGNED_OPTAB, TYPE) \
> +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, SIGNED_OPTAB, \
> +                               UNSIGNED_OPTAB, TYPE)
> +#endif
> +
>  #ifndef DEF_INTERNAL_FLT_FN
>  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
>    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
>                               smulhs, umulhs, binary)
>  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
>                               smulhrs, umulhrs, binary)
> +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first,
> +                                 ssadd, usadd, binary)
>
>  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
>  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE | ECF_LEAF, ". R . ")
>  #undef DEF_INTERNAL_FLT_FN
>  #undef DEF_INTERNAL_FLT_FLOATN_FN
>  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
>  #undef DEF_INTERNAL_OPTAB_FN
>  #undef DEF_INTERNAL_FN
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index bccee1c3e09..dbdb1e6bad2 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall *);
>  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
>  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
>  extern void expand_POPCOUNT (internal_fn, gcall *);
> +extern void expand_SAT_ADD (internal_fn, gcall *);
> +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
>
>  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
>
> > Note this patch is a draft for validation, no test are invovled here.
> >
> > gcc/ChangeLog:
> >
> >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> >       (BUILT_IN_US_PLUSIMAX): Ditto.
> >       (BUILT_IN_US_PLUSL): Ditto.
> >       (BUILT_IN_US_PLUSLL): Ditto.
> >       (BUILT_IN_US_PLUSG): Ditto.
> >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> >       func decl for expanding us_plus.
> >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> >       impl for expanding us_plus.
> >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> >       us_plus<mode>3.
> >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> >       US_PLUS.
> >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> >       * match.pd: Add new simplify pattern for us_plus.
> >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> >
> > Signed-off-by: Pan Li <pan2.li@intel.com>
> > ---
> >  gcc/builtins.def                |  7 +++++
> >  gcc/config/riscv/riscv-protos.h |  1 +
> >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> >  gcc/config/riscv/riscv.md       | 11 ++++++++
> >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> >  gcc/internal-fn.def             |  3 +++
> >  gcc/internal-fn.h               |  1 +
> >  gcc/match.pd                    | 17 ++++++++++++
> >  gcc/optabs.def                  |  2 ++
> >  9 files changed, 114 insertions(+)
> >
> > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > index f6f3e104f6a..0777b912cfa 100644
> > --- a/gcc/builtins.def
> > +++ b/gcc/builtins.def
> > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTIMAX,
> > "popcountimax", BT_FN_INT_UINTMAX
> >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG,
> > ATTR_CONST_NOTHROW_LEAF_LIST)
> >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg", BT_FN_INT_VAR,
> > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > +
> > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > ATTR_CONST_NOTHROW_LEAF_LIST)
> > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl", BT_FN_INT_ULONG,
> > ATTR_CONST_NOTHROW_LEAF_LIST)
> > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > +
> >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > index 80efdf2b7e5..ba6086f1f25 100644
> > --- a/gcc/config/riscv/riscv-protos.h
> > +++ b/gcc/config/riscv/riscv-protos.h
> > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const tree,
> > const char *);
> >  extern bool
> >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> >
> >  #ifdef RTX_CODE
> >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr =
> > 0);
> > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > index 4100abc9dd1..23f08974f07 100644
> > --- a/gcc/config/riscv/riscv.cc
> > +++ b/gcc/config/riscv/riscv.cc
> > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > (machine_mode)
> >    return true;
> >  }
> >
> > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > +void
> > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > +{
> > +  machine_mode mode = GET_MODE (dest);
> > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > +
> > +  /* Step-1: sum = x + y  */
> > +  if (mode == SImode && mode != Pmode)
> > +    { /* Take addw to avoid the sum truncate.  */
> > +      rtx simode_sum = gen_reg_rtx (SImode);
> > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > +    }
> > +  else
> > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > +
> > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.  */
> > +  if (mode == HImode || mode == QImode)
> > +    {
> > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > +
> > +      gcc_assert (shift_bits > 0);
> > +
> > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT (shift_bits));
> > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > (shift_bits));
> > +    }
> > +
> > +  /* Step-2: lt = sum < x  */
> > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > +
> > +  /* Step-3: lt = -lt  */
> > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > +
> > +  /* Step-4: pmode_dest = sum | lt  */
> > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > +
> > +  /* Step-5: dest = pmode_dest */
> > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > +}
> > +
> >  /* Initialize the GCC target structure.  */
> >  #undef TARGET_ASM_ALIGNED_HI_OP
> >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > index 3f7a023d941..eaa9867023c 100644
> > --- a/gcc/config/riscv/riscv.md
> > +++ b/gcc/config/riscv/riscv.md
> > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> >    [(set_attr "type" "load")
> >     (set (attr "length") (const_int 8))])
> >
> > +(define_expand "us_plus<mode>3"
> > +  [(match_operand:ANYI   0 "register_operand")
> > +   (match_operand:ANYI   1 "register_operand")
> > +   (match_operand:ANYI   2 "register_operand")]
> > +  ""
> > +  {
> > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > +    DONE;
> > +  }
> > +)
> > +
> >  (include "bitmanip.md")
> >  (include "crypto.md")
> >  (include "sync.md")
> > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > index a07f25f3aee..a7341a57ffa 100644
> > --- a/gcc/internal-fn.cc
> > +++ b/gcc/internal-fn.cc
> > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> >        emit_move_insn (plhs, cmp);
> >      }
> >  }
> > +
> > +void
> > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > +{
> > +  tree lhs = gimple_call_lhs (stmt);
> > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > +
> > +  do_pending_stack_adjust ();
> > +
> > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > +  rtx op_0 = expand_normal (rhs_0);
> > +  rtx op_1 = expand_normal (rhs_1);
> > +
> > +  class expand_operand ops[3];
> > +
> > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE (rhs_0)));
> > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE (rhs_1)));
> > +
> > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > (rhs_0)));
> > +  expand_insn (code, 3, ops);
> > +
> > +  if (!rtx_equal_p (target, ops[0].value))
> > +    emit_move_insn (target, ops[0].value);
> > +}
>
> This can be simplified by calling expand_binary_optab_fn instead. See my template
>
> > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > index c14d30365c1..b1d7b5a0307 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > ECF_NOTHROW, ffs, unary)
> >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity, unary)
> >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > popcount, unary)
> >
> > +/* Binary integer ops.  */
> > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW, us_plus,
> > binary)
> > +
> >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > ECF_NOTHROW, NULL)
> >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > ECF_NOTHROW, NULL)
> >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW, NULL)
> > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > index bccee1c3e09..46e404b4a49 100644
> > --- a/gcc/internal-fn.h
> > +++ b/gcc/internal-fn.h
> > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn, gcall *);
> >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > +extern void expand_US_PLUS (internal_fn, gcall *);
> >
> >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index c5b6540f939..f45fd58ad23 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -10265,3 +10265,20 @@ and,
> >        }
> >        (if (full_perm_p)
> >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > +
> > +#if GIMPLE
> > +
> > +/* Unsigned saturation add, aka:
> > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > +(simplify
> > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > +   (if (optimize
> > +     && INTEGRAL_TYPE_P (type)
> > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > +     && types_match (type, TREE_TYPE (@0))
> > +     && types_match (type, TREE_TYPE (@1))
> > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > OPTIMIZE_FOR_BOTH))
> > +   (IFN_US_PLUS @0 @1)))
> > +
> > +#endif
>
> With the version above you can drop the #if GIMPLE and the
> > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
>
> Check.
>
> Thanks,
> Tamar
>
> > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > index ad14f9328b9..5855c4e0834 100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb", '2',
> > gen_int_libfunc)
> >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > gen_int_libfunc)
> >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> >
> > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > gen_int_libfunc)
> > +
> >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2', gen_int_fp_fixed_libfunc)
> >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > --
> > 2.34.1
>
Tamar Christina Feb. 27, 2024, 9:56 a.m. UTC | #4
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Tuesday, February 27, 2024 9:44 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: pan2.li@intel.com; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> yanzhang.wang@intel.com; kito.cheng@gmail.com;
> richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> US_PLUS
> 
> On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> <Tamar.Christina@arm.com> wrote:
> >
> > Hi Pan,
> >
> > > From: Pan Li <pan2.li@intel.com>
> > >
> > > Hi Richard & Tamar,
> > >
> > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > >
> > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > >
> >
> > I think a couple of things are being confused here.  So lets break it down:
> >
> > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > we only want one internal function for both signed and unsigned SAT_ADD.
> > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> >
> > Now the downside of this is that this is a direct internal optab.  This means
> > that for the representation to be used the target *must* have the optab
> > implemented.   This is a bit annoying because it doesn't allow us to generically
> > assume that all targets use SAT_ADD for saturating add and thus only have to
> > write optimization for this representation.
> >
> > This is why Richi said we may need to use a new tree_code because we can
> > override tree code expansions.  However the same can be done with the _EXT_FN
> > internal functions.
> >
> > So what I meant was that we want to have a combination of the two. i.e. a
> > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> 
> Whether we want/need _EXT or only direct depends mainly on how we want to
> leverage support.  If it's only during vectorization and possibly instruction
> selection a direct optab is IMO the way to go.  Generic optimization only
> marginally improves when you explode the number of basic operations you
> expose - in fact it gets quite unwieldly to support all of them in
> simplifications
> and/or canonicalization and you possibly need to translate them back to what
> the target CPU supports.
> 
> We already do have too many (IMO) "special" operations exposed "early"
> in the GIMPLE pipeline.
> 
> But what I'd like to see is that we do more instruction selection on GIMPLE
> but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> passes doing what I'd call instruction selection).  But that means not adding
> match.pd patterns for that or at least have a separate isel-match.pd
> machinery for that.
> 
> So as a start I would go for a direct optab and see to recognize it during
> ISEL?
> 

The problem with ISEL and the reason I suggested an indirect IFN is that there
Are benefit to be had from recognizing it early.  Saturating arithmetic can be optimized
Differently from non-saturating ones.

But additionally a common way of specifying them decomposes to branches
and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).

These two representation can be detected in PHI-opts and it's beneficial to all
targets to canonicalize them to the branchless code.

Those two cases also *completely* stop vectorization because of either the
control flow or the fact the vectorizer can't handle complex types.

So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
We'd still need to hack the vectorizer cost models for targets with saturating
vector instructions.

I of course defer to you, but it seems quite suboptimal to do it this way and
doesn't get us first class saturation support.

Additionally there have been discussions whether both clang and gcc should
provide __builtin_saturate_* methods, which the non-direct IFN would help
support.

Tamar.

> > If Richi agrees, the below is what I meant. It creates the infrastructure for this
> > and for now only allows a default fallback for unsigned saturating add and makes
> > it easier for us to add the rest later
> >
> > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
> the
> > RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> > need to make new us_plus and ss_plus ones.
> >
> > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > index a07f25f3aee..aaf9f8991b3 100644
> > --- a/gcc/internal-fn.cc
> > +++ b/gcc/internal-fn.cc
> > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> tree_pair types,
> >         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> >                                                   opt_type);            \
> >        }
> > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> SIGNED_OPTAB, \
> > +                                        UNSIGNED_OPTAB, TYPE)          \
> > +    case IFN_##CODE:                                                   \
> > +      {                                                                        \
> > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> > +                            ? UNSIGNED_OPTAB ## _optab                 \
> > +                            : SIGNED_OPTAB ## _optab);                 \
> > +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > +                                                 opt_type)             \
> > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> > +      }
> >  #include "internal-fn.def"
> >
> >      case IFN_LAST:
> > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> >      optab which_optab = direct_internal_fn_optab (fn, types);          \
> >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
> >    }
> > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> SIGNED_OPTAB, \
> > +                                        UNSIGNED_OPTAB, TYPE)
> >  #include "internal-fn.def"
> >
> >  /* Routines to expand each internal function, indexed by function number.
> > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> >        emit_move_insn (plhs, cmp);
> >      }
> >  }
> > +
> > +void
> > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > +{
> > +  /* Check if the target supports the expansion through an IFN.  */
> > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > +  if (direct_binary_optab_supported_p (which_optab, types,
> > +                                      insn_optimization_type ()))
> > +    {
> > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > +      return;
> > +    }
> > +
> > +  /* Target does not support the optab, but we can de-compose it.  */
> > +  /*
> > +  ... decompose to a canonical representation ...
> > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > +    {
> > +      ...
> > +      decompose back to (X + Y) | - ((X + Y) < X)
> > +    }
> > +  else
> > +    {
> > +      ...
> > +    }
> > +  */
> > +}
> > +
> > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> optype */)
> > +{
> > +  /* For now, don't support decomposing vector ops.  */
> > +  if (VECTOR_TYPE_P (type))
> > +    return false;
> > +
> > +  /* Signed saturating arithmetic is harder to do since we'll so for now
> > +     lets ignore.  */
> > +  if (!TYPE_UNSIGNED (type))
> > +    return false;
> > +
> > +  return TREE_CODE (type) == INTEGER_TYPE;
> > +}
> > \ No newline at end of file
> > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > index c14d30365c1..5a2491228d5 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> >     unsigned inputs respectively, both without the trailing "_optab".
> >     SELECTOR says which type in the tree_pair determines the signedness.
> >
> > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > +   that it has expand_##NAME defined in internal-fn.cc to override the
> > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > +
> >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
> >     the function implements the computational part of a built-in math
> >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
> >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> >  #endif
> >
> > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> SIGNED_OPTAB, \
> > +                                    UNSIGNED_OPTAB, TYPE) \
> > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> SIGNED_OPTAB, \
> > +                               UNSIGNED_OPTAB, TYPE)
> > +#endif
> > +
> >  #ifndef DEF_INTERNAL_FLT_FN
> >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> ECF_CONST | ECF_NOTHROW, first,
> >                               smulhs, umulhs, binary)
> >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> first,
> >                               smulhrs, umulhrs, binary)
> > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> ECF_NOTHROW, first,
> > +                                 ssadd, usadd, binary)
> >
> >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> ECF_LEAF, ". R . ")
> >  #undef DEF_INTERNAL_FLT_FN
> >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> >  #undef DEF_INTERNAL_OPTAB_FN
> >  #undef DEF_INTERNAL_FN
> > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > index bccee1c3e09..dbdb1e6bad2 100644
> > --- a/gcc/internal-fn.h
> > +++ b/gcc/internal-fn.h
> > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> *);
> >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
> >
> >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> >
> > > Note this patch is a draft for validation, no test are invovled here.
> > >
> > > gcc/ChangeLog:
> > >
> > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > >       (BUILT_IN_US_PLUSL): Ditto.
> > >       (BUILT_IN_US_PLUSLL): Ditto.
> > >       (BUILT_IN_US_PLUSG): Ditto.
> > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > >       func decl for expanding us_plus.
> > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > >       impl for expanding us_plus.
> > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > >       us_plus<mode>3.
> > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > >       US_PLUS.
> > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > >       * match.pd: Add new simplify pattern for us_plus.
> > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > >
> > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > ---
> > >  gcc/builtins.def                |  7 +++++
> > >  gcc/config/riscv/riscv-protos.h |  1 +
> > >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > >  gcc/internal-fn.def             |  3 +++
> > >  gcc/internal-fn.h               |  1 +
> > >  gcc/match.pd                    | 17 ++++++++++++
> > >  gcc/optabs.def                  |  2 ++
> > >  9 files changed, 114 insertions(+)
> > >
> > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > index f6f3e104f6a..0777b912cfa 100644
> > > --- a/gcc/builtins.def
> > > +++ b/gcc/builtins.def
> > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> (BUILT_IN_POPCOUNTIMAX,
> > > "popcountimax", BT_FN_INT_UINTMAX
> > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> BT_FN_INT_ULONG,
> > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> BT_FN_INT_VAR,
> > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > +
> > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl", BT_FN_INT_ULONG,
> > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > +
> > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > index 80efdf2b7e5..ba6086f1f25 100644
> > > --- a/gcc/config/riscv/riscv-protos.h
> > > +++ b/gcc/config/riscv/riscv-protos.h
> > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const
> tree,
> > > const char *);
> > >  extern bool
> > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > >
> > >  #ifdef RTX_CODE
> > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr
> =
> > > 0);
> > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > index 4100abc9dd1..23f08974f07 100644
> > > --- a/gcc/config/riscv/riscv.cc
> > > +++ b/gcc/config/riscv/riscv.cc
> > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > (machine_mode)
> > >    return true;
> > >  }
> > >
> > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > +void
> > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > +{
> > > +  machine_mode mode = GET_MODE (dest);
> > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > +
> > > +  /* Step-1: sum = x + y  */
> > > +  if (mode == SImode && mode != Pmode)
> > > +    { /* Take addw to avoid the sum truncate.  */
> > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > +    }
> > > +  else
> > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > +
> > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.  */
> > > +  if (mode == HImode || mode == QImode)
> > > +    {
> > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > +
> > > +      gcc_assert (shift_bits > 0);
> > > +
> > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> (shift_bits));
> > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > (shift_bits));
> > > +    }
> > > +
> > > +  /* Step-2: lt = sum < x  */
> > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > +
> > > +  /* Step-3: lt = -lt  */
> > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > +
> > > +  /* Step-4: pmode_dest = sum | lt  */
> > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > +
> > > +  /* Step-5: dest = pmode_dest */
> > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > +}
> > > +
> > >  /* Initialize the GCC target structure.  */
> > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > index 3f7a023d941..eaa9867023c 100644
> > > --- a/gcc/config/riscv/riscv.md
> > > +++ b/gcc/config/riscv/riscv.md
> > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > >    [(set_attr "type" "load")
> > >     (set (attr "length") (const_int 8))])
> > >
> > > +(define_expand "us_plus<mode>3"
> > > +  [(match_operand:ANYI   0 "register_operand")
> > > +   (match_operand:ANYI   1 "register_operand")
> > > +   (match_operand:ANYI   2 "register_operand")]
> > > +  ""
> > > +  {
> > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > +    DONE;
> > > +  }
> > > +)
> > > +
> > >  (include "bitmanip.md")
> > >  (include "crypto.md")
> > >  (include "sync.md")
> > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > index a07f25f3aee..a7341a57ffa 100644
> > > --- a/gcc/internal-fn.cc
> > > +++ b/gcc/internal-fn.cc
> > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > >        emit_move_insn (plhs, cmp);
> > >      }
> > >  }
> > > +
> > > +void
> > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > +{
> > > +  tree lhs = gimple_call_lhs (stmt);
> > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > +
> > > +  do_pending_stack_adjust ();
> > > +
> > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > +  rtx op_0 = expand_normal (rhs_0);
> > > +  rtx op_1 = expand_normal (rhs_1);
> > > +
> > > +  class expand_operand ops[3];
> > > +
> > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE (rhs_0)));
> > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE (rhs_1)));
> > > +
> > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > > (rhs_0)));
> > > +  expand_insn (code, 3, ops);
> > > +
> > > +  if (!rtx_equal_p (target, ops[0].value))
> > > +    emit_move_insn (target, ops[0].value);
> > > +}
> >
> > This can be simplified by calling expand_binary_optab_fn instead. See my
> template
> >
> > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > index c14d30365c1..b1d7b5a0307 100644
> > > --- a/gcc/internal-fn.def
> > > +++ b/gcc/internal-fn.def
> > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > ECF_NOTHROW, ffs, unary)
> > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity, unary)
> > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > popcount, unary)
> > >
> > > +/* Binary integer ops.  */
> > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> us_plus,
> > > binary)
> > > +
> > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > ECF_NOTHROW, NULL)
> > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > ECF_NOTHROW, NULL)
> > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW, NULL)
> > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > index bccee1c3e09..46e404b4a49 100644
> > > --- a/gcc/internal-fn.h
> > > +++ b/gcc/internal-fn.h
> > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> *);
> > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > >
> > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index c5b6540f939..f45fd58ad23 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -10265,3 +10265,20 @@ and,
> > >        }
> > >        (if (full_perm_p)
> > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > +
> > > +#if GIMPLE
> > > +
> > > +/* Unsigned saturation add, aka:
> > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > +(simplify
> > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > +   (if (optimize
> > > +     && INTEGRAL_TYPE_P (type)
> > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > +     && types_match (type, TREE_TYPE (@0))
> > > +     && types_match (type, TREE_TYPE (@1))
> > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > OPTIMIZE_FOR_BOTH))
> > > +   (IFN_US_PLUS @0 @1)))
> > > +
> > > +#endif
> >
> > With the version above you can drop the #if GIMPLE and the
> > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> >
> > Check.
> >
> > Thanks,
> > Tamar
> >
> > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > index ad14f9328b9..5855c4e0834 100644
> > > --- a/gcc/optabs.def
> > > +++ b/gcc/optabs.def
> > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb", '2',
> > > gen_int_libfunc)
> > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > gen_int_libfunc)
> > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> > >
> > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > gen_int_libfunc)
> > > +
> > >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> gen_int_fp_fixed_libfunc)
> > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > --
> > > 2.34.1
> >
Li, Pan2 Feb. 27, 2024, 12:41 p.m. UTC | #5
Thanks Tamar.

> Those two cases also *completely* stop vectorization because of either the
> control flow or the fact the vectorizer can't handle complex types.

Yes, we eventually would like to vectorize the SAT ALU but we start with scalar part first.
I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works well with some additions as below.
Feel free to correct me if any misunderstandings.

1. usadd$Q$a3 are restricted to fixed point and we need to change it to usadd$a3(as well as gen_int_libfunc) for int.
2. We need to implement a default implementation of SAT_ADD if direct_binary_optab_supported_p is false.
    It looks like the default implementation is difficult to make every backend happy. That is why you suggest just normal
    DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.

Thanks Richard.

> But what I'd like to see is that we do more instruction selection on GIMPLE
> but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> passes doing what I'd call instruction selection).  But that means not adding
> match.pd patterns for that or at least have a separate isel-match.pd
> machinery for that.

> So as a start I would go for a direct optab and see to recognize it during
> ISEL?

Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good to know isel and I am happy to
try that once we have conclusion.

Pan

-----Original Message-----
From: Tamar Christina <Tamar.Christina@arm.com> 
Sent: Tuesday, February 27, 2024 5:57 PM
To: Richard Biener <richard.guenther@gmail.com>
Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Tuesday, February 27, 2024 9:44 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: pan2.li@intel.com; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> yanzhang.wang@intel.com; kito.cheng@gmail.com;
> richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> US_PLUS
> 
> On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> <Tamar.Christina@arm.com> wrote:
> >
> > Hi Pan,
> >
> > > From: Pan Li <pan2.li@intel.com>
> > >
> > > Hi Richard & Tamar,
> > >
> > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > >
> > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > >
> >
> > I think a couple of things are being confused here.  So lets break it down:
> >
> > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > we only want one internal function for both signed and unsigned SAT_ADD.
> > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> >
> > Now the downside of this is that this is a direct internal optab.  This means
> > that for the representation to be used the target *must* have the optab
> > implemented.   This is a bit annoying because it doesn't allow us to generically
> > assume that all targets use SAT_ADD for saturating add and thus only have to
> > write optimization for this representation.
> >
> > This is why Richi said we may need to use a new tree_code because we can
> > override tree code expansions.  However the same can be done with the _EXT_FN
> > internal functions.
> >
> > So what I meant was that we want to have a combination of the two. i.e. a
> > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> 
> Whether we want/need _EXT or only direct depends mainly on how we want to
> leverage support.  If it's only during vectorization and possibly instruction
> selection a direct optab is IMO the way to go.  Generic optimization only
> marginally improves when you explode the number of basic operations you
> expose - in fact it gets quite unwieldly to support all of them in
> simplifications
> and/or canonicalization and you possibly need to translate them back to what
> the target CPU supports.
> 
> We already do have too many (IMO) "special" operations exposed "early"
> in the GIMPLE pipeline.
> 
> But what I'd like to see is that we do more instruction selection on GIMPLE
> but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> passes doing what I'd call instruction selection).  But that means not adding
> match.pd patterns for that or at least have a separate isel-match.pd
> machinery for that.
> 
> So as a start I would go for a direct optab and see to recognize it during
> ISEL?
> 

The problem with ISEL and the reason I suggested an indirect IFN is that there
Are benefit to be had from recognizing it early.  Saturating arithmetic can be optimized
Differently from non-saturating ones.

But additionally a common way of specifying them decomposes to branches
and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).

These two representation can be detected in PHI-opts and it's beneficial to all
targets to canonicalize them to the branchless code.

Those two cases also *completely* stop vectorization because of either the
control flow or the fact the vectorizer can't handle complex types.

So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
We'd still need to hack the vectorizer cost models for targets with saturating
vector instructions.

I of course defer to you, but it seems quite suboptimal to do it this way and
doesn't get us first class saturation support.

Additionally there have been discussions whether both clang and gcc should
provide __builtin_saturate_* methods, which the non-direct IFN would help
support.

Tamar.

> > If Richi agrees, the below is what I meant. It creates the infrastructure for this
> > and for now only allows a default fallback for unsigned saturating add and makes
> > it easier for us to add the rest later
> >
> > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
> the
> > RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> > need to make new us_plus and ss_plus ones.
> >
> > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > index a07f25f3aee..aaf9f8991b3 100644
> > --- a/gcc/internal-fn.cc
> > +++ b/gcc/internal-fn.cc
> > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> tree_pair types,
> >         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> >                                                   opt_type);            \
> >        }
> > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> SIGNED_OPTAB, \
> > +                                        UNSIGNED_OPTAB, TYPE)          \
> > +    case IFN_##CODE:                                                   \
> > +      {                                                                        \
> > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> > +                            ? UNSIGNED_OPTAB ## _optab                 \
> > +                            : SIGNED_OPTAB ## _optab);                 \
> > +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > +                                                 opt_type)             \
> > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> > +      }
> >  #include "internal-fn.def"
> >
> >      case IFN_LAST:
> > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> >      optab which_optab = direct_internal_fn_optab (fn, types);          \
> >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
> >    }
> > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> SIGNED_OPTAB, \
> > +                                        UNSIGNED_OPTAB, TYPE)
> >  #include "internal-fn.def"
> >
> >  /* Routines to expand each internal function, indexed by function number.
> > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> >        emit_move_insn (plhs, cmp);
> >      }
> >  }
> > +
> > +void
> > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > +{
> > +  /* Check if the target supports the expansion through an IFN.  */
> > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > +  if (direct_binary_optab_supported_p (which_optab, types,
> > +                                      insn_optimization_type ()))
> > +    {
> > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > +      return;
> > +    }
> > +
> > +  /* Target does not support the optab, but we can de-compose it.  */
> > +  /*
> > +  ... decompose to a canonical representation ...
> > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > +    {
> > +      ...
> > +      decompose back to (X + Y) | - ((X + Y) < X)
> > +    }
> > +  else
> > +    {
> > +      ...
> > +    }
> > +  */
> > +}
> > +
> > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> optype */)
> > +{
> > +  /* For now, don't support decomposing vector ops.  */
> > +  if (VECTOR_TYPE_P (type))
> > +    return false;
> > +
> > +  /* Signed saturating arithmetic is harder to do since we'll so for now
> > +     lets ignore.  */
> > +  if (!TYPE_UNSIGNED (type))
> > +    return false;
> > +
> > +  return TREE_CODE (type) == INTEGER_TYPE;
> > +}
> > \ No newline at end of file
> > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > index c14d30365c1..5a2491228d5 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> >     unsigned inputs respectively, both without the trailing "_optab".
> >     SELECTOR says which type in the tree_pair determines the signedness.
> >
> > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > +   that it has expand_##NAME defined in internal-fn.cc to override the
> > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > +
> >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
> >     the function implements the computational part of a built-in math
> >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
> >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> >  #endif
> >
> > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> SIGNED_OPTAB, \
> > +                                    UNSIGNED_OPTAB, TYPE) \
> > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> SIGNED_OPTAB, \
> > +                               UNSIGNED_OPTAB, TYPE)
> > +#endif
> > +
> >  #ifndef DEF_INTERNAL_FLT_FN
> >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> ECF_CONST | ECF_NOTHROW, first,
> >                               smulhs, umulhs, binary)
> >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> first,
> >                               smulhrs, umulhrs, binary)
> > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> ECF_NOTHROW, first,
> > +                                 ssadd, usadd, binary)
> >
> >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> ECF_LEAF, ". R . ")
> >  #undef DEF_INTERNAL_FLT_FN
> >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> >  #undef DEF_INTERNAL_OPTAB_FN
> >  #undef DEF_INTERNAL_FN
> > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > index bccee1c3e09..dbdb1e6bad2 100644
> > --- a/gcc/internal-fn.h
> > +++ b/gcc/internal-fn.h
> > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> *);
> >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
> >
> >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> >
> > > Note this patch is a draft for validation, no test are invovled here.
> > >
> > > gcc/ChangeLog:
> > >
> > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > >       (BUILT_IN_US_PLUSL): Ditto.
> > >       (BUILT_IN_US_PLUSLL): Ditto.
> > >       (BUILT_IN_US_PLUSG): Ditto.
> > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > >       func decl for expanding us_plus.
> > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > >       impl for expanding us_plus.
> > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > >       us_plus<mode>3.
> > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > >       US_PLUS.
> > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > >       * match.pd: Add new simplify pattern for us_plus.
> > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > >
> > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > ---
> > >  gcc/builtins.def                |  7 +++++
> > >  gcc/config/riscv/riscv-protos.h |  1 +
> > >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > >  gcc/internal-fn.def             |  3 +++
> > >  gcc/internal-fn.h               |  1 +
> > >  gcc/match.pd                    | 17 ++++++++++++
> > >  gcc/optabs.def                  |  2 ++
> > >  9 files changed, 114 insertions(+)
> > >
> > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > index f6f3e104f6a..0777b912cfa 100644
> > > --- a/gcc/builtins.def
> > > +++ b/gcc/builtins.def
> > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> (BUILT_IN_POPCOUNTIMAX,
> > > "popcountimax", BT_FN_INT_UINTMAX
> > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> BT_FN_INT_ULONG,
> > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> BT_FN_INT_VAR,
> > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > +
> > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl", BT_FN_INT_ULONG,
> > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > +
> > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > index 80efdf2b7e5..ba6086f1f25 100644
> > > --- a/gcc/config/riscv/riscv-protos.h
> > > +++ b/gcc/config/riscv/riscv-protos.h
> > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const
> tree,
> > > const char *);
> > >  extern bool
> > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > >
> > >  #ifdef RTX_CODE
> > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr
> =
> > > 0);
> > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > index 4100abc9dd1..23f08974f07 100644
> > > --- a/gcc/config/riscv/riscv.cc
> > > +++ b/gcc/config/riscv/riscv.cc
> > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > (machine_mode)
> > >    return true;
> > >  }
> > >
> > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > +void
> > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > +{
> > > +  machine_mode mode = GET_MODE (dest);
> > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > +
> > > +  /* Step-1: sum = x + y  */
> > > +  if (mode == SImode && mode != Pmode)
> > > +    { /* Take addw to avoid the sum truncate.  */
> > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > +    }
> > > +  else
> > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > +
> > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.  */
> > > +  if (mode == HImode || mode == QImode)
> > > +    {
> > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > +
> > > +      gcc_assert (shift_bits > 0);
> > > +
> > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> (shift_bits));
> > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > (shift_bits));
> > > +    }
> > > +
> > > +  /* Step-2: lt = sum < x  */
> > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > +
> > > +  /* Step-3: lt = -lt  */
> > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > +
> > > +  /* Step-4: pmode_dest = sum | lt  */
> > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > +
> > > +  /* Step-5: dest = pmode_dest */
> > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > +}
> > > +
> > >  /* Initialize the GCC target structure.  */
> > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > index 3f7a023d941..eaa9867023c 100644
> > > --- a/gcc/config/riscv/riscv.md
> > > +++ b/gcc/config/riscv/riscv.md
> > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > >    [(set_attr "type" "load")
> > >     (set (attr "length") (const_int 8))])
> > >
> > > +(define_expand "us_plus<mode>3"
> > > +  [(match_operand:ANYI   0 "register_operand")
> > > +   (match_operand:ANYI   1 "register_operand")
> > > +   (match_operand:ANYI   2 "register_operand")]
> > > +  ""
> > > +  {
> > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > +    DONE;
> > > +  }
> > > +)
> > > +
> > >  (include "bitmanip.md")
> > >  (include "crypto.md")
> > >  (include "sync.md")
> > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > index a07f25f3aee..a7341a57ffa 100644
> > > --- a/gcc/internal-fn.cc
> > > +++ b/gcc/internal-fn.cc
> > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > >        emit_move_insn (plhs, cmp);
> > >      }
> > >  }
> > > +
> > > +void
> > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > +{
> > > +  tree lhs = gimple_call_lhs (stmt);
> > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > +
> > > +  do_pending_stack_adjust ();
> > > +
> > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > +  rtx op_0 = expand_normal (rhs_0);
> > > +  rtx op_1 = expand_normal (rhs_1);
> > > +
> > > +  class expand_operand ops[3];
> > > +
> > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE (rhs_0)));
> > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE (rhs_1)));
> > > +
> > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > > (rhs_0)));
> > > +  expand_insn (code, 3, ops);
> > > +
> > > +  if (!rtx_equal_p (target, ops[0].value))
> > > +    emit_move_insn (target, ops[0].value);
> > > +}
> >
> > This can be simplified by calling expand_binary_optab_fn instead. See my
> template
> >
> > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > index c14d30365c1..b1d7b5a0307 100644
> > > --- a/gcc/internal-fn.def
> > > +++ b/gcc/internal-fn.def
> > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > ECF_NOTHROW, ffs, unary)
> > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity, unary)
> > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > popcount, unary)
> > >
> > > +/* Binary integer ops.  */
> > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> us_plus,
> > > binary)
> > > +
> > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > ECF_NOTHROW, NULL)
> > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > ECF_NOTHROW, NULL)
> > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW, NULL)
> > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > index bccee1c3e09..46e404b4a49 100644
> > > --- a/gcc/internal-fn.h
> > > +++ b/gcc/internal-fn.h
> > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> *);
> > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > >
> > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index c5b6540f939..f45fd58ad23 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -10265,3 +10265,20 @@ and,
> > >        }
> > >        (if (full_perm_p)
> > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > +
> > > +#if GIMPLE
> > > +
> > > +/* Unsigned saturation add, aka:
> > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > +(simplify
> > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > +   (if (optimize
> > > +     && INTEGRAL_TYPE_P (type)
> > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > +     && types_match (type, TREE_TYPE (@0))
> > > +     && types_match (type, TREE_TYPE (@1))
> > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > OPTIMIZE_FOR_BOTH))
> > > +   (IFN_US_PLUS @0 @1)))
> > > +
> > > +#endif
> >
> > With the version above you can drop the #if GIMPLE and the
> > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> >
> > Check.
> >
> > Thanks,
> > Tamar
> >
> > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > index ad14f9328b9..5855c4e0834 100644
> > > --- a/gcc/optabs.def
> > > +++ b/gcc/optabs.def
> > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb", '2',
> > > gen_int_libfunc)
> > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > gen_int_libfunc)
> > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> > >
> > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > gen_int_libfunc)
> > > +
> > >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> gen_int_fp_fixed_libfunc)
> > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > --
> > > 2.34.1
> >
Tamar Christina Feb. 27, 2024, 12:57 p.m. UTC | #6
> Thanks Tamar.
> 
> > Those two cases also *completely* stop vectorization because of either the
> > control flow or the fact the vectorizer can't handle complex types.
> 
> Yes, we eventually would like to vectorize the SAT ALU but we start with scalar part
> first.
> I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> well with some additions as below.
> Feel free to correct me if any misunderstandings.
> 
> 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> usadd$a3(as well as gen_int_libfunc) for int.
> 2. We need to implement a default implementation of SAT_ADD if
> direct_binary_optab_supported_p is false.
>     It looks like the default implementation is difficult to make every backend happy.
> That is why you suggest just normal
>     DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> 
> Thanks Richard.
> 
> > But what I'd like to see is that we do more instruction selection on GIMPLE
> > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > passes doing what I'd call instruction selection).  But that means not adding
> > match.pd patterns for that or at least have a separate isel-match.pd
> > machinery for that.
> 
> > So as a start I would go for a direct optab and see to recognize it during
> > ISEL?
> 
> Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> to know isel and I am happy to
> try that once we have conclusion.
> 

So after a lively discussion on IRC, the conclusion is that before we proceed Richi would
like to see some examples of various operations.  The problem is that unsigned saturating
addition is the simplest example and it may lead to an implementation strategy that doesn't
scale.

So I'd suggest writing some example of both signed and unsigned saturating add and multiply

Because signed addition, will likely require a branch and signed multiplication would require a
larger type.

This would allow us to better understand what kind of gimple would have to to deal with in
ISEL and VECT if we decide not to lower early.

Thanks,
Tamar

> Pan
> 
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Tuesday, February 27, 2024 5:57 PM
> To: Richard Biener <richard.guenther@gmail.com>
> Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com;
> richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> US_PLUS
> 
> > -----Original Message-----
> > From: Richard Biener <richard.guenther@gmail.com>
> > Sent: Tuesday, February 27, 2024 9:44 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: pan2.li@intel.com; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > yanzhang.wang@intel.com; kito.cheng@gmail.com;
> > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > US_PLUS
> >
> > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> > <Tamar.Christina@arm.com> wrote:
> > >
> > > Hi Pan,
> > >
> > > > From: Pan Li <pan2.li@intel.com>
> > > >
> > > > Hi Richard & Tamar,
> > > >
> > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > >
> > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > > >
> > >
> > > I think a couple of things are being confused here.  So lets break it down:
> > >
> > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > we only want one internal function for both signed and unsigned SAT_ADD.
> > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > >
> > > Now the downside of this is that this is a direct internal optab.  This means
> > > that for the representation to be used the target *must* have the optab
> > > implemented.   This is a bit annoying because it doesn't allow us to generically
> > > assume that all targets use SAT_ADD for saturating add and thus only have to
> > > write optimization for this representation.
> > >
> > > This is why Richi said we may need to use a new tree_code because we can
> > > override tree code expansions.  However the same can be done with the
> _EXT_FN
> > > internal functions.
> > >
> > > So what I meant was that we want to have a combination of the two. i.e. a
> > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> >
> > Whether we want/need _EXT or only direct depends mainly on how we want to
> > leverage support.  If it's only during vectorization and possibly instruction
> > selection a direct optab is IMO the way to go.  Generic optimization only
> > marginally improves when you explode the number of basic operations you
> > expose - in fact it gets quite unwieldly to support all of them in
> > simplifications
> > and/or canonicalization and you possibly need to translate them back to what
> > the target CPU supports.
> >
> > We already do have too many (IMO) "special" operations exposed "early"
> > in the GIMPLE pipeline.
> >
> > But what I'd like to see is that we do more instruction selection on GIMPLE
> > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > passes doing what I'd call instruction selection).  But that means not adding
> > match.pd patterns for that or at least have a separate isel-match.pd
> > machinery for that.
> >
> > So as a start I would go for a direct optab and see to recognize it during
> > ISEL?
> >
> 
> The problem with ISEL and the reason I suggested an indirect IFN is that there
> Are benefit to be had from recognizing it early.  Saturating arithmetic can be
> optimized
> Differently from non-saturating ones.
> 
> But additionally a common way of specifying them decomposes to branches
> and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).
> 
> These two representation can be detected in PHI-opts and it's beneficial to all
> targets to canonicalize them to the branchless code.
> 
> Those two cases also *completely* stop vectorization because of either the
> control flow or the fact the vectorizer can't handle complex types.
> 
> So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
> We'd still need to hack the vectorizer cost models for targets with saturating
> vector instructions.
> 
> I of course defer to you, but it seems quite suboptimal to do it this way and
> doesn't get us first class saturation support.
> 
> Additionally there have been discussions whether both clang and gcc should
> provide __builtin_saturate_* methods, which the non-direct IFN would help
> support.
> 
> Tamar.
> 
> > > If Richi agrees, the below is what I meant. It creates the infrastructure for this
> > > and for now only allows a default fallback for unsigned saturating add and
> makes
> > > it easier for us to add the rest later
> > >
> > > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
> > the
> > > RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> > > need to make new us_plus and ss_plus ones.
> > >
> > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > index a07f25f3aee..aaf9f8991b3 100644
> > > --- a/gcc/internal-fn.cc
> > > +++ b/gcc/internal-fn.cc
> > > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> > tree_pair types,
> > >         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > >                                                   opt_type);            \
> > >        }
> > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > SIGNED_OPTAB, \
> > > +                                        UNSIGNED_OPTAB, TYPE)          \
> > > +    case IFN_##CODE:                                                   \
> > > +      {                                                                        \
> > > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> > > +                            ? UNSIGNED_OPTAB ## _optab                 \
> > > +                            : SIGNED_OPTAB ## _optab);                 \
> > > +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > +                                                 opt_type)             \
> > > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> > > +      }
> > >  #include "internal-fn.def"
> > >
> > >      case IFN_LAST:
> > > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> > >      optab which_optab = direct_internal_fn_optab (fn, types);          \
> > >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
> > >    }
> > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > SIGNED_OPTAB, \
> > > +                                        UNSIGNED_OPTAB, TYPE)
> > >  #include "internal-fn.def"
> > >
> > >  /* Routines to expand each internal function, indexed by function number.
> > > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > >        emit_move_insn (plhs, cmp);
> > >      }
> > >  }
> > > +
> > > +void
> > > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > > +{
> > > +  /* Check if the target supports the expansion through an IFN.  */
> > > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > > +  if (direct_binary_optab_supported_p (which_optab, types,
> > > +                                      insn_optimization_type ()))
> > > +    {
> > > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > > +      return;
> > > +    }
> > > +
> > > +  /* Target does not support the optab, but we can de-compose it.  */
> > > +  /*
> > > +  ... decompose to a canonical representation ...
> > > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > > +    {
> > > +      ...
> > > +      decompose back to (X + Y) | - ((X + Y) < X)
> > > +    }
> > > +  else
> > > +    {
> > > +      ...
> > > +    }
> > > +  */
> > > +}
> > > +
> > > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> > optype */)
> > > +{
> > > +  /* For now, don't support decomposing vector ops.  */
> > > +  if (VECTOR_TYPE_P (type))
> > > +    return false;
> > > +
> > > +  /* Signed saturating arithmetic is harder to do since we'll so for now
> > > +     lets ignore.  */
> > > +  if (!TYPE_UNSIGNED (type))
> > > +    return false;
> > > +
> > > +  return TREE_CODE (type) == INTEGER_TYPE;
> > > +}
> > > \ No newline at end of file
> > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > index c14d30365c1..5a2491228d5 100644
> > > --- a/gcc/internal-fn.def
> > > +++ b/gcc/internal-fn.def
> > > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> > >     unsigned inputs respectively, both without the trailing "_optab".
> > >     SELECTOR says which type in the tree_pair determines the signedness.
> > >
> > > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> > DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > > +   that it has expand_##NAME defined in internal-fn.cc to override the
> > > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > > +
> > >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
> > >     the function implements the computational part of a built-in math
> > >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
> > >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> > >  #endif
> > >
> > > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> > SIGNED_OPTAB, \
> > > +                                    UNSIGNED_OPTAB, TYPE) \
> > > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> > SIGNED_OPTAB, \
> > > +                               UNSIGNED_OPTAB, TYPE)
> > > +#endif
> > > +
> > >  #ifndef DEF_INTERNAL_FLT_FN
> > >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> > >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> > ECF_CONST | ECF_NOTHROW, first,
> > >                               smulhs, umulhs, binary)
> > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> > first,
> > >                               smulhrs, umulhrs, binary)
> > > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> > ECF_NOTHROW, first,
> > > +                                 ssadd, usadd, binary)
> > >
> > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> > ECF_LEAF, ". R . ")
> > >  #undef DEF_INTERNAL_FLT_FN
> > >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> > >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > >  #undef DEF_INTERNAL_OPTAB_FN
> > >  #undef DEF_INTERNAL_FN
> > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > index bccee1c3e09..dbdb1e6bad2 100644
> > > --- a/gcc/internal-fn.h
> > > +++ b/gcc/internal-fn.h
> > > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> > *);
> > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
> > >
> > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > >
> > > > Note this patch is a draft for validation, no test are invovled here.
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > > >       (BUILT_IN_US_PLUSL): Ditto.
> > > >       (BUILT_IN_US_PLUSLL): Ditto.
> > > >       (BUILT_IN_US_PLUSG): Ditto.
> > > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > > >       func decl for expanding us_plus.
> > > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > > >       impl for expanding us_plus.
> > > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > > >       us_plus<mode>3.
> > > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > > >       US_PLUS.
> > > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > > >       * match.pd: Add new simplify pattern for us_plus.
> > > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > > >
> > > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > > ---
> > > >  gcc/builtins.def                |  7 +++++
> > > >  gcc/config/riscv/riscv-protos.h |  1 +
> > > >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> > > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > > >  gcc/internal-fn.def             |  3 +++
> > > >  gcc/internal-fn.h               |  1 +
> > > >  gcc/match.pd                    | 17 ++++++++++++
> > > >  gcc/optabs.def                  |  2 ++
> > > >  9 files changed, 114 insertions(+)
> > > >
> > > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > > index f6f3e104f6a..0777b912cfa 100644
> > > > --- a/gcc/builtins.def
> > > > +++ b/gcc/builtins.def
> > > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> > (BUILT_IN_POPCOUNTIMAX,
> > > > "popcountimax", BT_FN_INT_UINTMAX
> > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> > BT_FN_INT_ULONG,
> > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> > BT_FN_INT_VAR,
> > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > +
> > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl",
> BT_FN_INT_ULONG,
> > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > +
> > > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > > index 80efdf2b7e5..ba6086f1f25 100644
> > > > --- a/gcc/config/riscv/riscv-protos.h
> > > > +++ b/gcc/config/riscv/riscv-protos.h
> > > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const
> > tree,
> > > > const char *);
> > > >  extern bool
> > > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > > >
> > > >  #ifdef RTX_CODE
> > > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool
> *invert_ptr
> > =
> > > > 0);
> > > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > > index 4100abc9dd1..23f08974f07 100644
> > > > --- a/gcc/config/riscv/riscv.cc
> > > > +++ b/gcc/config/riscv/riscv.cc
> > > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > > (machine_mode)
> > > >    return true;
> > > >  }
> > > >
> > > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > > +void
> > > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > > +{
> > > > +  machine_mode mode = GET_MODE (dest);
> > > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > > +
> > > > +  /* Step-1: sum = x + y  */
> > > > +  if (mode == SImode && mode != Pmode)
> > > > +    { /* Take addw to avoid the sum truncate.  */
> > > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > > +    }
> > > > +  else
> > > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > > +
> > > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.
> */
> > > > +  if (mode == HImode || mode == QImode)
> > > > +    {
> > > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > > +
> > > > +      gcc_assert (shift_bits > 0);
> > > > +
> > > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> > (shift_bits));
> > > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > > (shift_bits));
> > > > +    }
> > > > +
> > > > +  /* Step-2: lt = sum < x  */
> > > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > > +
> > > > +  /* Step-3: lt = -lt  */
> > > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > > +
> > > > +  /* Step-4: pmode_dest = sum | lt  */
> > > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > > +
> > > > +  /* Step-5: dest = pmode_dest */
> > > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > > +}
> > > > +
> > > >  /* Initialize the GCC target structure.  */
> > > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > > index 3f7a023d941..eaa9867023c 100644
> > > > --- a/gcc/config/riscv/riscv.md
> > > > +++ b/gcc/config/riscv/riscv.md
> > > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > > >    [(set_attr "type" "load")
> > > >     (set (attr "length") (const_int 8))])
> > > >
> > > > +(define_expand "us_plus<mode>3"
> > > > +  [(match_operand:ANYI   0 "register_operand")
> > > > +   (match_operand:ANYI   1 "register_operand")
> > > > +   (match_operand:ANYI   2 "register_operand")]
> > > > +  ""
> > > > +  {
> > > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > > +    DONE;
> > > > +  }
> > > > +)
> > > > +
> > > >  (include "bitmanip.md")
> > > >  (include "crypto.md")
> > > >  (include "sync.md")
> > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > index a07f25f3aee..a7341a57ffa 100644
> > > > --- a/gcc/internal-fn.cc
> > > > +++ b/gcc/internal-fn.cc
> > > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > >        emit_move_insn (plhs, cmp);
> > > >      }
> > > >  }
> > > > +
> > > > +void
> > > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > > +{
> > > > +  tree lhs = gimple_call_lhs (stmt);
> > > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > > +
> > > > +  do_pending_stack_adjust ();
> > > > +
> > > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > > +  rtx op_0 = expand_normal (rhs_0);
> > > > +  rtx op_1 = expand_normal (rhs_1);
> > > > +
> > > > +  class expand_operand ops[3];
> > > > +
> > > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE
> (rhs_0)));
> > > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE
> (rhs_1)));
> > > > +
> > > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > > > (rhs_0)));
> > > > +  expand_insn (code, 3, ops);
> > > > +
> > > > +  if (!rtx_equal_p (target, ops[0].value))
> > > > +    emit_move_insn (target, ops[0].value);
> > > > +}
> > >
> > > This can be simplified by calling expand_binary_optab_fn instead. See my
> > template
> > >
> > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > index c14d30365c1..b1d7b5a0307 100644
> > > > --- a/gcc/internal-fn.def
> > > > +++ b/gcc/internal-fn.def
> > > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > > ECF_NOTHROW, ffs, unary)
> > > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity,
> unary)
> > > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > > popcount, unary)
> > > >
> > > > +/* Binary integer ops.  */
> > > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> > us_plus,
> > > > binary)
> > > > +
> > > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > > ECF_NOTHROW, NULL)
> > > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > > ECF_NOTHROW, NULL)
> > > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW,
> NULL)
> > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > index bccee1c3e09..46e404b4a49 100644
> > > > --- a/gcc/internal-fn.h
> > > > +++ b/gcc/internal-fn.h
> > > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn,
> gcall
> > *);
> > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > > >
> > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > >
> > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > index c5b6540f939..f45fd58ad23 100644
> > > > --- a/gcc/match.pd
> > > > +++ b/gcc/match.pd
> > > > @@ -10265,3 +10265,20 @@ and,
> > > >        }
> > > >        (if (full_perm_p)
> > > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > +
> > > > +#if GIMPLE
> > > > +
> > > > +/* Unsigned saturation add, aka:
> > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > > +(simplify
> > > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > > +   (if (optimize
> > > > +     && INTEGRAL_TYPE_P (type)
> > > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > > +     && types_match (type, TREE_TYPE (@0))
> > > > +     && types_match (type, TREE_TYPE (@1))
> > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > OPTIMIZE_FOR_BOTH))
> > > > +   (IFN_US_PLUS @0 @1)))
> > > > +
> > > > +#endif
> > >
> > > With the version above you can drop the #if GIMPLE and the
> > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > >
> > > Check.
> > >
> > > Thanks,
> > > Tamar
> > >
> > > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > > index ad14f9328b9..5855c4e0834 100644
> > > > --- a/gcc/optabs.def
> > > > +++ b/gcc/optabs.def
> > > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb",
> '2',
> > > > gen_int_libfunc)
> > > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > > gen_int_libfunc)
> > > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> > > >
> > > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > > gen_int_libfunc)
> > > > +
> > > >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> > > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> > gen_int_fp_fixed_libfunc)
> > > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > > --
> > > > 2.34.1
> > >
Richard Biener Feb. 27, 2024, 1:42 p.m. UTC | #7
On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> > Thanks Tamar.
> >
> > > Those two cases also *completely* stop vectorization because of either the
> > > control flow or the fact the vectorizer can't handle complex types.
> >
> > Yes, we eventually would like to vectorize the SAT ALU but we start with scalar part
> > first.
> > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> > well with some additions as below.
> > Feel free to correct me if any misunderstandings.
> >
> > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> > usadd$a3(as well as gen_int_libfunc) for int.
> > 2. We need to implement a default implementation of SAT_ADD if
> > direct_binary_optab_supported_p is false.
> >     It looks like the default implementation is difficult to make every backend happy.
> > That is why you suggest just normal
> >     DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> >
> > Thanks Richard.
> >
> > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > passes doing what I'd call instruction selection).  But that means not adding
> > > match.pd patterns for that or at least have a separate isel-match.pd
> > > machinery for that.
> >
> > > So as a start I would go for a direct optab and see to recognize it during
> > > ISEL?
> >
> > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> > to know isel and I am happy to
> > try that once we have conclusion.
> >
>
> So after a lively discussion on IRC, the conclusion is that before we proceed Richi would
> like to see some examples of various operations.  The problem is that unsigned saturating
> addition is the simplest example and it may lead to an implementation strategy that doesn't
> scale.
>
> So I'd suggest writing some example of both signed and unsigned saturating add and multiply
>
> Because signed addition, will likely require a branch and signed multiplication would require a
> larger type.
>
> This would allow us to better understand what kind of gimple would have to to deal with in
> ISEL and VECT if we decide not to lower early.

More specifically before making something like .SAT_ADD a core part of
GIMPLE I'd like
to point out that we have saturating PLUS_EXPR but just for
fixed-point types.  I realize
Joseph thinks that keying this on the type was wrong and it should
have used integer
types and special saturating operations.  Still having both,
type-keyed saturating PLUS_EXPR
and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess.

It might be that the way to go is to turn all existing saturating type
*_EXPR into
.SAT_* internal function calls, in the end mapping to the optabs and
eventual RTX codes.

That could work for both integer types and fixed-point types.

I'll also note that "saturating" is just another variant of overflow
behavior of which we have
trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and
also (kind-of) sanitized.
We do lack direct IL representation of -ftrapv and -fwrapv, the
semantics on a PLUS_EXPR
depend on per-function flags.  Eventually a common representation
could be found here.
For saturating I was thinking of .ADD_OVERFLOW (a, b,
saturation-value), a "trap" could
be a "trapping" saturation value, "undefined" could be a
"not-a-thing".  But I didn't think much
about this.

That said, I would like to see the bigger picture to be kept in mind
before altering the GIMPLE IL.

Adding an internal function for an already present optab is a
no-brainer.  Adding a vectorizer
and/or if-conversion pattern to make use of this during vectorization
is existing practice.
Adding pattern recognition to ISEL or widening-mul passes for
instructions the CPU can do
is existing practice and OK.

Thanks,
Richard.


> Thanks,
> Tamar
>
> > Pan
> >
> > -----Original Message-----
> > From: Tamar Christina <Tamar.Christina@arm.com>
> > Sent: Tuesday, February 27, 2024 5:57 PM
> > To: Richard Biener <richard.guenther@gmail.com>
> > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com;
> > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > US_PLUS
> >
> > > -----Original Message-----
> > > From: Richard Biener <richard.guenther@gmail.com>
> > > Sent: Tuesday, February 27, 2024 9:44 AM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: pan2.li@intel.com; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > yanzhang.wang@intel.com; kito.cheng@gmail.com;
> > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > US_PLUS
> > >
> > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> > > <Tamar.Christina@arm.com> wrote:
> > > >
> > > > Hi Pan,
> > > >
> > > > > From: Pan Li <pan2.li@intel.com>
> > > > >
> > > > > Hi Richard & Tamar,
> > > > >
> > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > > >
> > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > > > >
> > > >
> > > > I think a couple of things are being confused here.  So lets break it down:
> > > >
> > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > > we only want one internal function for both signed and unsigned SAT_ADD.
> > > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > > >
> > > > Now the downside of this is that this is a direct internal optab.  This means
> > > > that for the representation to be used the target *must* have the optab
> > > > implemented.   This is a bit annoying because it doesn't allow us to generically
> > > > assume that all targets use SAT_ADD for saturating add and thus only have to
> > > > write optimization for this representation.
> > > >
> > > > This is why Richi said we may need to use a new tree_code because we can
> > > > override tree code expansions.  However the same can be done with the
> > _EXT_FN
> > > > internal functions.
> > > >
> > > > So what I meant was that we want to have a combination of the two. i.e. a
> > > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> > >
> > > Whether we want/need _EXT or only direct depends mainly on how we want to
> > > leverage support.  If it's only during vectorization and possibly instruction
> > > selection a direct optab is IMO the way to go.  Generic optimization only
> > > marginally improves when you explode the number of basic operations you
> > > expose - in fact it gets quite unwieldly to support all of them in
> > > simplifications
> > > and/or canonicalization and you possibly need to translate them back to what
> > > the target CPU supports.
> > >
> > > We already do have too many (IMO) "special" operations exposed "early"
> > > in the GIMPLE pipeline.
> > >
> > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > passes doing what I'd call instruction selection).  But that means not adding
> > > match.pd patterns for that or at least have a separate isel-match.pd
> > > machinery for that.
> > >
> > > So as a start I would go for a direct optab and see to recognize it during
> > > ISEL?
> > >
> >
> > The problem with ISEL and the reason I suggested an indirect IFN is that there
> > Are benefit to be had from recognizing it early.  Saturating arithmetic can be
> > optimized
> > Differently from non-saturating ones.
> >
> > But additionally a common way of specifying them decomposes to branches
> > and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).
> >
> > These two representation can be detected in PHI-opts and it's beneficial to all
> > targets to canonicalize them to the branchless code.
> >
> > Those two cases also *completely* stop vectorization because of either the
> > control flow or the fact the vectorizer can't handle complex types.
> >
> > So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
> > We'd still need to hack the vectorizer cost models for targets with saturating
> > vector instructions.
> >
> > I of course defer to you, but it seems quite suboptimal to do it this way and
> > doesn't get us first class saturation support.
> >
> > Additionally there have been discussions whether both clang and gcc should
> > provide __builtin_saturate_* methods, which the non-direct IFN would help
> > support.
> >
> > Tamar.
> >
> > > > If Richi agrees, the below is what I meant. It creates the infrastructure for this
> > > > and for now only allows a default fallback for unsigned saturating add and
> > makes
> > > > it easier for us to add the rest later
> > > >
> > > > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
> > > the
> > > > RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> > > > need to make new us_plus and ss_plus ones.
> > > >
> > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > index a07f25f3aee..aaf9f8991b3 100644
> > > > --- a/gcc/internal-fn.cc
> > > > +++ b/gcc/internal-fn.cc
> > > > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> > > tree_pair types,
> > > >         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > >                                                   opt_type);            \
> > > >        }
> > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > SIGNED_OPTAB, \
> > > > +                                        UNSIGNED_OPTAB, TYPE)          \
> > > > +    case IFN_##CODE:                                                   \
> > > > +      {                                                                        \
> > > > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> > > > +                            ? UNSIGNED_OPTAB ## _optab                 \
> > > > +                            : SIGNED_OPTAB ## _optab);                 \
> > > > +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > +                                                 opt_type)             \
> > > > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> > > > +      }
> > > >  #include "internal-fn.def"
> > > >
> > > >      case IFN_LAST:
> > > > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> > > >      optab which_optab = direct_internal_fn_optab (fn, types);          \
> > > >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
> > > >    }
> > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > SIGNED_OPTAB, \
> > > > +                                        UNSIGNED_OPTAB, TYPE)
> > > >  #include "internal-fn.def"
> > > >
> > > >  /* Routines to expand each internal function, indexed by function number.
> > > > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > >        emit_move_insn (plhs, cmp);
> > > >      }
> > > >  }
> > > > +
> > > > +void
> > > > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > > > +{
> > > > +  /* Check if the target supports the expansion through an IFN.  */
> > > > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > > > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > > > +  if (direct_binary_optab_supported_p (which_optab, types,
> > > > +                                      insn_optimization_type ()))
> > > > +    {
> > > > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > > > +      return;
> > > > +    }
> > > > +
> > > > +  /* Target does not support the optab, but we can de-compose it.  */
> > > > +  /*
> > > > +  ... decompose to a canonical representation ...
> > > > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > > > +    {
> > > > +      ...
> > > > +      decompose back to (X + Y) | - ((X + Y) < X)
> > > > +    }
> > > > +  else
> > > > +    {
> > > > +      ...
> > > > +    }
> > > > +  */
> > > > +}
> > > > +
> > > > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> > > optype */)
> > > > +{
> > > > +  /* For now, don't support decomposing vector ops.  */
> > > > +  if (VECTOR_TYPE_P (type))
> > > > +    return false;
> > > > +
> > > > +  /* Signed saturating arithmetic is harder to do since we'll so for now
> > > > +     lets ignore.  */
> > > > +  if (!TYPE_UNSIGNED (type))
> > > > +    return false;
> > > > +
> > > > +  return TREE_CODE (type) == INTEGER_TYPE;
> > > > +}
> > > > \ No newline at end of file
> > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > index c14d30365c1..5a2491228d5 100644
> > > > --- a/gcc/internal-fn.def
> > > > +++ b/gcc/internal-fn.def
> > > > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> > > >     unsigned inputs respectively, both without the trailing "_optab".
> > > >     SELECTOR says which type in the tree_pair determines the signedness.
> > > >
> > > > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> > > DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > > > +   that it has expand_##NAME defined in internal-fn.cc to override the
> > > > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > > > +
> > > >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
> > > >     the function implements the computational part of a built-in math
> > > >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > > > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
> > > >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> > > >  #endif
> > > >
> > > > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> > > SIGNED_OPTAB, \
> > > > +                                    UNSIGNED_OPTAB, TYPE) \
> > > > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> > > SIGNED_OPTAB, \
> > > > +                               UNSIGNED_OPTAB, TYPE)
> > > > +#endif
> > > > +
> > > >  #ifndef DEF_INTERNAL_FLT_FN
> > > >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> > > >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > > > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> > > ECF_CONST | ECF_NOTHROW, first,
> > > >                               smulhs, umulhs, binary)
> > > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> > > first,
> > > >                               smulhrs, umulhrs, binary)
> > > > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> > > ECF_NOTHROW, first,
> > > > +                                 ssadd, usadd, binary)
> > > >
> > > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > > > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> > > ECF_LEAF, ". R . ")
> > > >  #undef DEF_INTERNAL_FLT_FN
> > > >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> > > >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > > > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > >  #undef DEF_INTERNAL_OPTAB_FN
> > > >  #undef DEF_INTERNAL_FN
> > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > index bccee1c3e09..dbdb1e6bad2 100644
> > > > --- a/gcc/internal-fn.h
> > > > +++ b/gcc/internal-fn.h
> > > > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> > > *);
> > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > > > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
> > > >
> > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > >
> > > > > Note this patch is a draft for validation, no test are invovled here.
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > > > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > > > >       (BUILT_IN_US_PLUSL): Ditto.
> > > > >       (BUILT_IN_US_PLUSLL): Ditto.
> > > > >       (BUILT_IN_US_PLUSG): Ditto.
> > > > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > > > >       func decl for expanding us_plus.
> > > > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > > > >       impl for expanding us_plus.
> > > > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > > > >       us_plus<mode>3.
> > > > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > > > >       US_PLUS.
> > > > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > > > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > > > >       * match.pd: Add new simplify pattern for us_plus.
> > > > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > > > >
> > > > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > > > ---
> > > > >  gcc/builtins.def                |  7 +++++
> > > > >  gcc/config/riscv/riscv-protos.h |  1 +
> > > > >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> > > > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > > > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > > > >  gcc/internal-fn.def             |  3 +++
> > > > >  gcc/internal-fn.h               |  1 +
> > > > >  gcc/match.pd                    | 17 ++++++++++++
> > > > >  gcc/optabs.def                  |  2 ++
> > > > >  9 files changed, 114 insertions(+)
> > > > >
> > > > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > > > index f6f3e104f6a..0777b912cfa 100644
> > > > > --- a/gcc/builtins.def
> > > > > +++ b/gcc/builtins.def
> > > > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> > > (BUILT_IN_POPCOUNTIMAX,
> > > > > "popcountimax", BT_FN_INT_UINTMAX
> > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> > > BT_FN_INT_ULONG,
> > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> > > BT_FN_INT_VAR,
> > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > +
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl",
> > BT_FN_INT_ULONG,
> > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > +
> > > > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > > > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > > > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > > > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > > > index 80efdf2b7e5..ba6086f1f25 100644
> > > > > --- a/gcc/config/riscv/riscv-protos.h
> > > > > +++ b/gcc/config/riscv/riscv-protos.h
> > > > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const
> > > tree,
> > > > > const char *);
> > > > >  extern bool
> > > > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > > > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > > > >
> > > > >  #ifdef RTX_CODE
> > > > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool
> > *invert_ptr
> > > =
> > > > > 0);
> > > > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > > > index 4100abc9dd1..23f08974f07 100644
> > > > > --- a/gcc/config/riscv/riscv.cc
> > > > > +++ b/gcc/config/riscv/riscv.cc
> > > > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > > > (machine_mode)
> > > > >    return true;
> > > > >  }
> > > > >
> > > > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > > > +void
> > > > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > > > +{
> > > > > +  machine_mode mode = GET_MODE (dest);
> > > > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > > > +
> > > > > +  /* Step-1: sum = x + y  */
> > > > > +  if (mode == SImode && mode != Pmode)
> > > > > +    { /* Take addw to avoid the sum truncate.  */
> > > > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > > > +    }
> > > > > +  else
> > > > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > > > +
> > > > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.
> > */
> > > > > +  if (mode == HImode || mode == QImode)
> > > > > +    {
> > > > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > > > +
> > > > > +      gcc_assert (shift_bits > 0);
> > > > > +
> > > > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> > > (shift_bits));
> > > > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > > > (shift_bits));
> > > > > +    }
> > > > > +
> > > > > +  /* Step-2: lt = sum < x  */
> > > > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > > > +
> > > > > +  /* Step-3: lt = -lt  */
> > > > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > > > +
> > > > > +  /* Step-4: pmode_dest = sum | lt  */
> > > > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > > > +
> > > > > +  /* Step-5: dest = pmode_dest */
> > > > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > > > +}
> > > > > +
> > > > >  /* Initialize the GCC target structure.  */
> > > > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > > > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > > > index 3f7a023d941..eaa9867023c 100644
> > > > > --- a/gcc/config/riscv/riscv.md
> > > > > +++ b/gcc/config/riscv/riscv.md
> > > > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > > > >    [(set_attr "type" "load")
> > > > >     (set (attr "length") (const_int 8))])
> > > > >
> > > > > +(define_expand "us_plus<mode>3"
> > > > > +  [(match_operand:ANYI   0 "register_operand")
> > > > > +   (match_operand:ANYI   1 "register_operand")
> > > > > +   (match_operand:ANYI   2 "register_operand")]
> > > > > +  ""
> > > > > +  {
> > > > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > > > +    DONE;
> > > > > +  }
> > > > > +)
> > > > > +
> > > > >  (include "bitmanip.md")
> > > > >  (include "crypto.md")
> > > > >  (include "sync.md")
> > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > index a07f25f3aee..a7341a57ffa 100644
> > > > > --- a/gcc/internal-fn.cc
> > > > > +++ b/gcc/internal-fn.cc
> > > > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > >        emit_move_insn (plhs, cmp);
> > > > >      }
> > > > >  }
> > > > > +
> > > > > +void
> > > > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > > > +{
> > > > > +  tree lhs = gimple_call_lhs (stmt);
> > > > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > > > +
> > > > > +  do_pending_stack_adjust ();
> > > > > +
> > > > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > > > +  rtx op_0 = expand_normal (rhs_0);
> > > > > +  rtx op_1 = expand_normal (rhs_1);
> > > > > +
> > > > > +  class expand_operand ops[3];
> > > > > +
> > > > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > > > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE
> > (rhs_0)));
> > > > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE
> > (rhs_1)));
> > > > > +
> > > > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > > > > (rhs_0)));
> > > > > +  expand_insn (code, 3, ops);
> > > > > +
> > > > > +  if (!rtx_equal_p (target, ops[0].value))
> > > > > +    emit_move_insn (target, ops[0].value);
> > > > > +}
> > > >
> > > > This can be simplified by calling expand_binary_optab_fn instead. See my
> > > template
> > > >
> > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > index c14d30365c1..b1d7b5a0307 100644
> > > > > --- a/gcc/internal-fn.def
> > > > > +++ b/gcc/internal-fn.def
> > > > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > > > ECF_NOTHROW, ffs, unary)
> > > > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity,
> > unary)
> > > > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > > > popcount, unary)
> > > > >
> > > > > +/* Binary integer ops.  */
> > > > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> > > us_plus,
> > > > > binary)
> > > > > +
> > > > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > > > ECF_NOTHROW, NULL)
> > > > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > > > ECF_NOTHROW, NULL)
> > > > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW,
> > NULL)
> > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > index bccee1c3e09..46e404b4a49 100644
> > > > > --- a/gcc/internal-fn.h
> > > > > +++ b/gcc/internal-fn.h
> > > > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn,
> > gcall
> > > *);
> > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > > > >
> > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > >
> > > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > > index c5b6540f939..f45fd58ad23 100644
> > > > > --- a/gcc/match.pd
> > > > > +++ b/gcc/match.pd
> > > > > @@ -10265,3 +10265,20 @@ and,
> > > > >        }
> > > > >        (if (full_perm_p)
> > > > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > > +
> > > > > +#if GIMPLE
> > > > > +
> > > > > +/* Unsigned saturation add, aka:
> > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > > > +(simplify
> > > > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > > > +   (if (optimize
> > > > > +     && INTEGRAL_TYPE_P (type)
> > > > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > > > +     && types_match (type, TREE_TYPE (@0))
> > > > > +     && types_match (type, TREE_TYPE (@1))
> > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > OPTIMIZE_FOR_BOTH))
> > > > > +   (IFN_US_PLUS @0 @1)))
> > > > > +
> > > > > +#endif
> > > >
> > > > With the version above you can drop the #if GIMPLE and the
> > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > >
> > > > Check.
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > > > index ad14f9328b9..5855c4e0834 100644
> > > > > --- a/gcc/optabs.def
> > > > > +++ b/gcc/optabs.def
> > > > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb",
> > '2',
> > > > > gen_int_libfunc)
> > > > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > > > gen_int_libfunc)
> > > > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> > > > >
> > > > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > > > gen_int_libfunc)
> > > > > +
> > > > >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> > > > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> > > gen_int_fp_fixed_libfunc)
> > > > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > > > --
> > > > > 2.34.1
> > > >
Li, Pan2 Feb. 27, 2024, 2:35 p.m. UTC | #8
Thanks Richard and Tammer for moving this forward.

> That said, I would like to see the bigger picture to be kept in mind
> before altering the GIMPLE IL.

> Adding an internal function for an already present optab is a
> no-brainer.  Adding a vectorizer
> and/or if-conversion pattern to make use of this during vectorization
> is existing practice.
> Adding pattern recognition to ISEL or widening-mul passes for
> instructions the CPU can do
> is existing practice and OK.

Thanks for explaining, got the point here.

> So I'd suggest writing some example of both signed and unsigned saturating add and multiply

> Because signed addition, will likely require a branch and signed multiplication would require a
> larger type.

Ack, will prepare one prototype validation patch for add, sub and mul (both unsigned and signed) soon.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Tuesday, February 27, 2024 9:42 PM
To: Tamar Christina <Tamar.Christina@arm.com>
Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> > Thanks Tamar.
> >
> > > Those two cases also *completely* stop vectorization because of either the
> > > control flow or the fact the vectorizer can't handle complex types.
> >
> > Yes, we eventually would like to vectorize the SAT ALU but we start with scalar part
> > first.
> > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> > well with some additions as below.
> > Feel free to correct me if any misunderstandings.
> >
> > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> > usadd$a3(as well as gen_int_libfunc) for int.
> > 2. We need to implement a default implementation of SAT_ADD if
> > direct_binary_optab_supported_p is false.
> >     It looks like the default implementation is difficult to make every backend happy.
> > That is why you suggest just normal
> >     DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> >
> > Thanks Richard.
> >
> > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > passes doing what I'd call instruction selection).  But that means not adding
> > > match.pd patterns for that or at least have a separate isel-match.pd
> > > machinery for that.
> >
> > > So as a start I would go for a direct optab and see to recognize it during
> > > ISEL?
> >
> > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> > to know isel and I am happy to
> > try that once we have conclusion.
> >
>
> So after a lively discussion on IRC, the conclusion is that before we proceed Richi would
> like to see some examples of various operations.  The problem is that unsigned saturating
> addition is the simplest example and it may lead to an implementation strategy that doesn't
> scale.
>
> So I'd suggest writing some example of both signed and unsigned saturating add and multiply
>
> Because signed addition, will likely require a branch and signed multiplication would require a
> larger type.
>
> This would allow us to better understand what kind of gimple would have to to deal with in
> ISEL and VECT if we decide not to lower early.

More specifically before making something like .SAT_ADD a core part of
GIMPLE I'd like
to point out that we have saturating PLUS_EXPR but just for
fixed-point types.  I realize
Joseph thinks that keying this on the type was wrong and it should
have used integer
types and special saturating operations.  Still having both,
type-keyed saturating PLUS_EXPR
and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess.

It might be that the way to go is to turn all existing saturating type
*_EXPR into
.SAT_* internal function calls, in the end mapping to the optabs and
eventual RTX codes.

That could work for both integer types and fixed-point types.

I'll also note that "saturating" is just another variant of overflow
behavior of which we have
trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and
also (kind-of) sanitized.
We do lack direct IL representation of -ftrapv and -fwrapv, the
semantics on a PLUS_EXPR
depend on per-function flags.  Eventually a common representation
could be found here.
For saturating I was thinking of .ADD_OVERFLOW (a, b,
saturation-value), a "trap" could
be a "trapping" saturation value, "undefined" could be a
"not-a-thing".  But I didn't think much
about this.

That said, I would like to see the bigger picture to be kept in mind
before altering the GIMPLE IL.

Adding an internal function for an already present optab is a
no-brainer.  Adding a vectorizer
and/or if-conversion pattern to make use of this during vectorization
is existing practice.
Adding pattern recognition to ISEL or widening-mul passes for
instructions the CPU can do
is existing practice and OK.

Thanks,
Richard.


> Thanks,
> Tamar
>
> > Pan
> >
> > -----Original Message-----
> > From: Tamar Christina <Tamar.Christina@arm.com>
> > Sent: Tuesday, February 27, 2024 5:57 PM
> > To: Richard Biener <richard.guenther@gmail.com>
> > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com;
> > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > US_PLUS
> >
> > > -----Original Message-----
> > > From: Richard Biener <richard.guenther@gmail.com>
> > > Sent: Tuesday, February 27, 2024 9:44 AM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: pan2.li@intel.com; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > yanzhang.wang@intel.com; kito.cheng@gmail.com;
> > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > US_PLUS
> > >
> > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> > > <Tamar.Christina@arm.com> wrote:
> > > >
> > > > Hi Pan,
> > > >
> > > > > From: Pan Li <pan2.li@intel.com>
> > > > >
> > > > > Hi Richard & Tamar,
> > > > >
> > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > > >
> > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > > > >
> > > >
> > > > I think a couple of things are being confused here.  So lets break it down:
> > > >
> > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > > we only want one internal function for both signed and unsigned SAT_ADD.
> > > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > > >
> > > > Now the downside of this is that this is a direct internal optab.  This means
> > > > that for the representation to be used the target *must* have the optab
> > > > implemented.   This is a bit annoying because it doesn't allow us to generically
> > > > assume that all targets use SAT_ADD for saturating add and thus only have to
> > > > write optimization for this representation.
> > > >
> > > > This is why Richi said we may need to use a new tree_code because we can
> > > > override tree code expansions.  However the same can be done with the
> > _EXT_FN
> > > > internal functions.
> > > >
> > > > So what I meant was that we want to have a combination of the two. i.e. a
> > > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> > >
> > > Whether we want/need _EXT or only direct depends mainly on how we want to
> > > leverage support.  If it's only during vectorization and possibly instruction
> > > selection a direct optab is IMO the way to go.  Generic optimization only
> > > marginally improves when you explode the number of basic operations you
> > > expose - in fact it gets quite unwieldly to support all of them in
> > > simplifications
> > > and/or canonicalization and you possibly need to translate them back to what
> > > the target CPU supports.
> > >
> > > We already do have too many (IMO) "special" operations exposed "early"
> > > in the GIMPLE pipeline.
> > >
> > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > passes doing what I'd call instruction selection).  But that means not adding
> > > match.pd patterns for that or at least have a separate isel-match.pd
> > > machinery for that.
> > >
> > > So as a start I would go for a direct optab and see to recognize it during
> > > ISEL?
> > >
> >
> > The problem with ISEL and the reason I suggested an indirect IFN is that there
> > Are benefit to be had from recognizing it early.  Saturating arithmetic can be
> > optimized
> > Differently from non-saturating ones.
> >
> > But additionally a common way of specifying them decomposes to branches
> > and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).
> >
> > These two representation can be detected in PHI-opts and it's beneficial to all
> > targets to canonicalize them to the branchless code.
> >
> > Those two cases also *completely* stop vectorization because of either the
> > control flow or the fact the vectorizer can't handle complex types.
> >
> > So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
> > We'd still need to hack the vectorizer cost models for targets with saturating
> > vector instructions.
> >
> > I of course defer to you, but it seems quite suboptimal to do it this way and
> > doesn't get us first class saturation support.
> >
> > Additionally there have been discussions whether both clang and gcc should
> > provide __builtin_saturate_* methods, which the non-direct IFN would help
> > support.
> >
> > Tamar.
> >
> > > > If Richi agrees, the below is what I meant. It creates the infrastructure for this
> > > > and for now only allows a default fallback for unsigned saturating add and
> > makes
> > > > it easier for us to add the rest later
> > > >
> > > > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
> > > the
> > > > RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> > > > need to make new us_plus and ss_plus ones.
> > > >
> > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > index a07f25f3aee..aaf9f8991b3 100644
> > > > --- a/gcc/internal-fn.cc
> > > > +++ b/gcc/internal-fn.cc
> > > > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> > > tree_pair types,
> > > >         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > >                                                   opt_type);            \
> > > >        }
> > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > SIGNED_OPTAB, \
> > > > +                                        UNSIGNED_OPTAB, TYPE)          \
> > > > +    case IFN_##CODE:                                                   \
> > > > +      {                                                                        \
> > > > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> > > > +                            ? UNSIGNED_OPTAB ## _optab                 \
> > > > +                            : SIGNED_OPTAB ## _optab);                 \
> > > > +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > +                                                 opt_type)             \
> > > > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> > > > +      }
> > > >  #include "internal-fn.def"
> > > >
> > > >      case IFN_LAST:
> > > > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> > > >      optab which_optab = direct_internal_fn_optab (fn, types);          \
> > > >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
> > > >    }
> > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > SIGNED_OPTAB, \
> > > > +                                        UNSIGNED_OPTAB, TYPE)
> > > >  #include "internal-fn.def"
> > > >
> > > >  /* Routines to expand each internal function, indexed by function number.
> > > > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > >        emit_move_insn (plhs, cmp);
> > > >      }
> > > >  }
> > > > +
> > > > +void
> > > > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > > > +{
> > > > +  /* Check if the target supports the expansion through an IFN.  */
> > > > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > > > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > > > +  if (direct_binary_optab_supported_p (which_optab, types,
> > > > +                                      insn_optimization_type ()))
> > > > +    {
> > > > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > > > +      return;
> > > > +    }
> > > > +
> > > > +  /* Target does not support the optab, but we can de-compose it.  */
> > > > +  /*
> > > > +  ... decompose to a canonical representation ...
> > > > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > > > +    {
> > > > +      ...
> > > > +      decompose back to (X + Y) | - ((X + Y) < X)
> > > > +    }
> > > > +  else
> > > > +    {
> > > > +      ...
> > > > +    }
> > > > +  */
> > > > +}
> > > > +
> > > > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> > > optype */)
> > > > +{
> > > > +  /* For now, don't support decomposing vector ops.  */
> > > > +  if (VECTOR_TYPE_P (type))
> > > > +    return false;
> > > > +
> > > > +  /* Signed saturating arithmetic is harder to do since we'll so for now
> > > > +     lets ignore.  */
> > > > +  if (!TYPE_UNSIGNED (type))
> > > > +    return false;
> > > > +
> > > > +  return TREE_CODE (type) == INTEGER_TYPE;
> > > > +}
> > > > \ No newline at end of file
> > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > index c14d30365c1..5a2491228d5 100644
> > > > --- a/gcc/internal-fn.def
> > > > +++ b/gcc/internal-fn.def
> > > > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> > > >     unsigned inputs respectively, both without the trailing "_optab".
> > > >     SELECTOR says which type in the tree_pair determines the signedness.
> > > >
> > > > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> > > DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > > > +   that it has expand_##NAME defined in internal-fn.cc to override the
> > > > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > > > +
> > > >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
> > > >     the function implements the computational part of a built-in math
> > > >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > > > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
> > > >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> > > >  #endif
> > > >
> > > > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> > > SIGNED_OPTAB, \
> > > > +                                    UNSIGNED_OPTAB, TYPE) \
> > > > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> > > SIGNED_OPTAB, \
> > > > +                               UNSIGNED_OPTAB, TYPE)
> > > > +#endif
> > > > +
> > > >  #ifndef DEF_INTERNAL_FLT_FN
> > > >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> > > >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > > > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> > > ECF_CONST | ECF_NOTHROW, first,
> > > >                               smulhs, umulhs, binary)
> > > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> > > first,
> > > >                               smulhrs, umulhrs, binary)
> > > > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> > > ECF_NOTHROW, first,
> > > > +                                 ssadd, usadd, binary)
> > > >
> > > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > > > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> > > ECF_LEAF, ". R . ")
> > > >  #undef DEF_INTERNAL_FLT_FN
> > > >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> > > >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > > > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > >  #undef DEF_INTERNAL_OPTAB_FN
> > > >  #undef DEF_INTERNAL_FN
> > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > index bccee1c3e09..dbdb1e6bad2 100644
> > > > --- a/gcc/internal-fn.h
> > > > +++ b/gcc/internal-fn.h
> > > > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> > > *);
> > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > > > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
> > > >
> > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > >
> > > > > Note this patch is a draft for validation, no test are invovled here.
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > > > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > > > >       (BUILT_IN_US_PLUSL): Ditto.
> > > > >       (BUILT_IN_US_PLUSLL): Ditto.
> > > > >       (BUILT_IN_US_PLUSG): Ditto.
> > > > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > > > >       func decl for expanding us_plus.
> > > > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > > > >       impl for expanding us_plus.
> > > > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > > > >       us_plus<mode>3.
> > > > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > > > >       US_PLUS.
> > > > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > > > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > > > >       * match.pd: Add new simplify pattern for us_plus.
> > > > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > > > >
> > > > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > > > ---
> > > > >  gcc/builtins.def                |  7 +++++
> > > > >  gcc/config/riscv/riscv-protos.h |  1 +
> > > > >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> > > > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > > > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > > > >  gcc/internal-fn.def             |  3 +++
> > > > >  gcc/internal-fn.h               |  1 +
> > > > >  gcc/match.pd                    | 17 ++++++++++++
> > > > >  gcc/optabs.def                  |  2 ++
> > > > >  9 files changed, 114 insertions(+)
> > > > >
> > > > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > > > index f6f3e104f6a..0777b912cfa 100644
> > > > > --- a/gcc/builtins.def
> > > > > +++ b/gcc/builtins.def
> > > > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> > > (BUILT_IN_POPCOUNTIMAX,
> > > > > "popcountimax", BT_FN_INT_UINTMAX
> > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> > > BT_FN_INT_ULONG,
> > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> > > BT_FN_INT_VAR,
> > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > +
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl",
> > BT_FN_INT_ULONG,
> > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > +
> > > > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > > > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > > > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > > > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > > > index 80efdf2b7e5..ba6086f1f25 100644
> > > > > --- a/gcc/config/riscv/riscv-protos.h
> > > > > +++ b/gcc/config/riscv/riscv-protos.h
> > > > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const
> > > tree,
> > > > > const char *);
> > > > >  extern bool
> > > > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > > > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > > > >
> > > > >  #ifdef RTX_CODE
> > > > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool
> > *invert_ptr
> > > =
> > > > > 0);
> > > > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > > > index 4100abc9dd1..23f08974f07 100644
> > > > > --- a/gcc/config/riscv/riscv.cc
> > > > > +++ b/gcc/config/riscv/riscv.cc
> > > > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > > > (machine_mode)
> > > > >    return true;
> > > > >  }
> > > > >
> > > > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > > > +void
> > > > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > > > +{
> > > > > +  machine_mode mode = GET_MODE (dest);
> > > > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > > > +
> > > > > +  /* Step-1: sum = x + y  */
> > > > > +  if (mode == SImode && mode != Pmode)
> > > > > +    { /* Take addw to avoid the sum truncate.  */
> > > > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > > > +    }
> > > > > +  else
> > > > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > > > +
> > > > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.
> > */
> > > > > +  if (mode == HImode || mode == QImode)
> > > > > +    {
> > > > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > > > +
> > > > > +      gcc_assert (shift_bits > 0);
> > > > > +
> > > > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> > > (shift_bits));
> > > > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > > > (shift_bits));
> > > > > +    }
> > > > > +
> > > > > +  /* Step-2: lt = sum < x  */
> > > > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > > > +
> > > > > +  /* Step-3: lt = -lt  */
> > > > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > > > +
> > > > > +  /* Step-4: pmode_dest = sum | lt  */
> > > > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > > > +
> > > > > +  /* Step-5: dest = pmode_dest */
> > > > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > > > +}
> > > > > +
> > > > >  /* Initialize the GCC target structure.  */
> > > > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > > > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > > > index 3f7a023d941..eaa9867023c 100644
> > > > > --- a/gcc/config/riscv/riscv.md
> > > > > +++ b/gcc/config/riscv/riscv.md
> > > > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > > > >    [(set_attr "type" "load")
> > > > >     (set (attr "length") (const_int 8))])
> > > > >
> > > > > +(define_expand "us_plus<mode>3"
> > > > > +  [(match_operand:ANYI   0 "register_operand")
> > > > > +   (match_operand:ANYI   1 "register_operand")
> > > > > +   (match_operand:ANYI   2 "register_operand")]
> > > > > +  ""
> > > > > +  {
> > > > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > > > +    DONE;
> > > > > +  }
> > > > > +)
> > > > > +
> > > > >  (include "bitmanip.md")
> > > > >  (include "crypto.md")
> > > > >  (include "sync.md")
> > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > index a07f25f3aee..a7341a57ffa 100644
> > > > > --- a/gcc/internal-fn.cc
> > > > > +++ b/gcc/internal-fn.cc
> > > > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > >        emit_move_insn (plhs, cmp);
> > > > >      }
> > > > >  }
> > > > > +
> > > > > +void
> > > > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > > > +{
> > > > > +  tree lhs = gimple_call_lhs (stmt);
> > > > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > > > +
> > > > > +  do_pending_stack_adjust ();
> > > > > +
> > > > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > > > +  rtx op_0 = expand_normal (rhs_0);
> > > > > +  rtx op_1 = expand_normal (rhs_1);
> > > > > +
> > > > > +  class expand_operand ops[3];
> > > > > +
> > > > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > > > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE
> > (rhs_0)));
> > > > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE
> > (rhs_1)));
> > > > > +
> > > > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > > > > (rhs_0)));
> > > > > +  expand_insn (code, 3, ops);
> > > > > +
> > > > > +  if (!rtx_equal_p (target, ops[0].value))
> > > > > +    emit_move_insn (target, ops[0].value);
> > > > > +}
> > > >
> > > > This can be simplified by calling expand_binary_optab_fn instead. See my
> > > template
> > > >
> > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > index c14d30365c1..b1d7b5a0307 100644
> > > > > --- a/gcc/internal-fn.def
> > > > > +++ b/gcc/internal-fn.def
> > > > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > > > ECF_NOTHROW, ffs, unary)
> > > > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity,
> > unary)
> > > > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > > > popcount, unary)
> > > > >
> > > > > +/* Binary integer ops.  */
> > > > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> > > us_plus,
> > > > > binary)
> > > > > +
> > > > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > > > ECF_NOTHROW, NULL)
> > > > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > > > ECF_NOTHROW, NULL)
> > > > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW,
> > NULL)
> > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > index bccee1c3e09..46e404b4a49 100644
> > > > > --- a/gcc/internal-fn.h
> > > > > +++ b/gcc/internal-fn.h
> > > > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn,
> > gcall
> > > *);
> > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > > > >
> > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > >
> > > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > > index c5b6540f939..f45fd58ad23 100644
> > > > > --- a/gcc/match.pd
> > > > > +++ b/gcc/match.pd
> > > > > @@ -10265,3 +10265,20 @@ and,
> > > > >        }
> > > > >        (if (full_perm_p)
> > > > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > > +
> > > > > +#if GIMPLE
> > > > > +
> > > > > +/* Unsigned saturation add, aka:
> > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > > > +(simplify
> > > > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > > > +   (if (optimize
> > > > > +     && INTEGRAL_TYPE_P (type)
> > > > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > > > +     && types_match (type, TREE_TYPE (@0))
> > > > > +     && types_match (type, TREE_TYPE (@1))
> > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > OPTIMIZE_FOR_BOTH))
> > > > > +   (IFN_US_PLUS @0 @1)))
> > > > > +
> > > > > +#endif
> > > >
> > > > With the version above you can drop the #if GIMPLE and the
> > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > >
> > > > Check.
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > > > index ad14f9328b9..5855c4e0834 100644
> > > > > --- a/gcc/optabs.def
> > > > > +++ b/gcc/optabs.def
> > > > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb",
> > '2',
> > > > > gen_int_libfunc)
> > > > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > > > gen_int_libfunc)
> > > > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> > > > >
> > > > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > > > gen_int_libfunc)
> > > > > +
> > > > >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> > > > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> > > gen_int_fp_fixed_libfunc)
> > > > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > > > --
> > > > > 2.34.1
> > > >
Li, Pan2 March 2, 2024, 7:46 a.m. UTC | #9
Hi Richard and Tamar,

I have a try with DEF_INTERNAL_SIGNED_OPTAB_FN for SAT_ADD/SUB/MUL but meet some problem when match.pd.

For unsigned SAT_ADD = (x + y) | - ((x + y) < x), the match.pd can be (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0)))).
For unsigned SAT_SUB = x >= y ? x - y : 0, and then match.pd can be (cond (ge @0 @1) (minus @0 @1) integer_zerop).

For signed SAT_ADD/SAT_SUB as below, seems not easy to make the simplify pattern works well as expected up to a point.
sint64_t sat_add (sint64_t x, sint64_t y)
{
  sint64_t a = x ^ y;
  sint64_t add = x + y;
  sint64_t b = sum ^ x;

  return (a < 0 || (a >= 0 && b >= 0)) ? add : (MAX_INT64 + (x < 0));
}

sint64_t sad_sub (sint64_t x, sint64_t y)
{
  sint64_t a = x ^ y;
  sint64_t sub = x - y;
  sint64_t b = sub ^ x;

  return (a >= 0 || (a < 0 && b >= 0) ? sub : (MAX_INT64 + (x < 0));
}

For SAT_MUL as below, looks we may need widen type. I am not sure if we can leverage MUL_OVERFLOW or not in match.pd.

uint32_t sat_mul (uint32_t x, uint32_t y)
{
  uint64_t mul = (uint64_t)x * (uint64_t)y;
  return mul > UINT32_MAX ? UINT32_MAX : (uint32_t)mul;
}

sint32_t sat_mul (sint32_t x, sint32_t y)
{
  sint64_t mul = (sint64_t)x * (sint64_t))y;

  return mul <= MAX_INT32 && mul >= MIN_INT32 ? mul : MAX_INT32 + (x ^ y) > 0;
}

Below diff only contains unsigned SAT_ADD and SAT_SUB for prototype validation.
I will continue to try the rest part in match.pd and keep you posted.

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 80efdf2b7e5..d9ad6fe2b58 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -132,6 +132,9 @@ extern void riscv_asm_output_external (FILE *, const tree, const char *);
 extern bool
 riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
 extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
+extern void riscv_expand_usadd (rtx, rtx, rtx);
+extern void riscv_expand_ussub (rtx, rtx, rtx);
 
 #ifdef RTX_CODE
 extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5e984ee2a55..795462526df 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -10655,6 +10655,28 @@ riscv_vector_mode_supported_any_target_p (machine_mode)
   return true;
 }
 
+/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
+void
+riscv_expand_usadd (rtx dest, rtx x, rtx y)
+{
+  fprintf (stdout, "Hit riscv_expand_usadd.\n");
+  // ToDo
+}
+
+void
+riscv_expand_ussub (rtx dest, rtx x, rtx y)
+{
+  fprintf (stdout, "Hit riscv_expand_ussub.\n");
+  // ToDo
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 1fec13092e2..e2dbadb3ead 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -3839,6 +3839,39 @@ (define_insn "*large_load_address"
   [(set_attr "type" "load")
    (set (attr "length") (const_int 8))])
 
+(define_expand "usadd<mode>3"
+  [(match_operand:ANYI 0 "register_operand")
+   (match_operand:ANYI 1 "register_operand")
+   (match_operand:ANYI 2 "register_operand")]
+  ""
+  {
+    riscv_expand_usadd (operands[0], operands[1], operands[2]);
+    DONE;
+  }
+)
+
+(define_expand "ussub<mode>3"
+  [(match_operand:ANYI 0 "register_operand")
+   (match_operand:ANYI 1 "register_operand")
+   (match_operand:ANYI 2 "register_operand")]
+  ""
+  {
+    riscv_expand_ussub (operands[0], operands[1], operands[2]);
+    DONE;
+  }
+)
+
 (include "bitmanip.md")
 (include "crypto.md")
 (include "sync.md")
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 848bb9dbff3..0fff19c875f 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -275,6 +275,13 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
 DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
 			      smulhrs, umulhrs, binary)
 
+DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first,
+			      ssadd, usadd, binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_SUB, ECF_CONST | ECF_NOTHROW, first,
+			      sssub, ussub, binary)
+
 DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
 DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
 DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
diff --git a/gcc/match.pd b/gcc/match.pd
index f3fffd8dec2..6592dea643a 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -10276,3 +10276,32 @@ and,
       }
       (if (full_perm_p)
 	(vec_perm (op@3 @0 @1) @3 @2))))))
+
+#if GIMPLE
+
+/* Unsigned saturation add, aka:
+   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
+   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
+(simplify
+ (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
+   (if (optimize
+	&& INTEGRAL_TYPE_P (type)
+	&& types_match (type, TREE_TYPE (@0))
+	&& types_match (type, TREE_TYPE (@1))
+	&& TYPE_UNSIGNED (TREE_TYPE (@0))
+	&& direct_internal_fn_supported_p (IFN_SAT_ADD, type, OPTIMIZE_FOR_BOTH))
+   (IFN_SAT_ADD @0 @1)))
+
+/* Unsigned saturation sub , aka
+   SAT_SUBU = x >= y ? x - y : 0.  */
+(simplify
+  (cond (ge @0 @1) (minus @0 @1) integer_zerop)
+    (if (optimize
+	&& INTEGRAL_TYPE_P (type)
+	&& TYPE_UNSIGNED (TREE_TYPE (@0))
+	&& types_match (type, TREE_TYPE (@0))
+	&& types_match (type, TREE_TYPE (@1))
+	&& direct_internal_fn_supported_p (IFN_SAT_SUB, type, OPTIMIZE_FOR_BOTH))
+    (IFN_SAT_SUB @0 @1)))
+
+#endif
diff --git a/gcc/optabs.def b/gcc/optabs.def
index ad14f9328b9..bebe38c888b 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -111,15 +111,15 @@ OPTAB_NX(add_optab, "add$F$a3")
 OPTAB_NX(add_optab, "add$Q$a3")
 OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc)
 OPTAB_VX(addv_optab, "add$F$a3")
-OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc)
-OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
+OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_int_libfunc)
+OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_int_libfunc)
 OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc)
 OPTAB_NX(sub_optab, "sub$F$a3")
 OPTAB_NX(sub_optab, "sub$Q$a3")
 OPTAB_VL(subv_optab, "subv$I$a3", MINUS, "sub", '3', gen_intv_fp_libfunc)
 OPTAB_VX(subv_optab, "sub$F$a3")
-OPTAB_NL(sssub_optab, "sssub$Q$a3", SS_MINUS, "sssub", '3', gen_signed_fixed_libfunc)
-OPTAB_NL(ussub_optab, "ussub$Q$a3", US_MINUS, "ussub", '3', gen_unsigned_fixed_libfunc)
+OPTAB_NL(sssub_optab, "sssub$a3", SS_MINUS, "sssub", '3', gen_int_libfunc)
+OPTAB_NL(ussub_optab, "ussub$a3", US_MINUS, "ussub", '3', gen_int_libfunc)
 OPTAB_NL(smul_optab, "mul$Q$a3", MULT, "mul", '3', gen_int_fp_fixed_libfunc)
 OPTAB_NX(smul_optab, "mul$P$a3")
 OPTAB_NX(smul_optab, "mul$F$a3")

Pan

-----Original Message-----
From: Li, Pan2 <pan2.li@intel.com> 
Sent: Tuesday, February 27, 2024 10:36 PM
To: Richard Biener <richard.guenther@gmail.com>; Tamar Christina <Tamar.Christina@arm.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

Thanks Richard and Tammer for moving this forward.

> That said, I would like to see the bigger picture to be kept in mind
> before altering the GIMPLE IL.

> Adding an internal function for an already present optab is a
> no-brainer.  Adding a vectorizer
> and/or if-conversion pattern to make use of this during vectorization
> is existing practice.
> Adding pattern recognition to ISEL or widening-mul passes for
> instructions the CPU can do
> is existing practice and OK.

Thanks for explaining, got the point here.

> So I'd suggest writing some example of both signed and unsigned saturating add and multiply

> Because signed addition, will likely require a branch and signed multiplication would require a
> larger type.

Ack, will prepare one prototype validation patch for add, sub and mul (both unsigned and signed) soon.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Tuesday, February 27, 2024 9:42 PM
To: Tamar Christina <Tamar.Christina@arm.com>
Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> > Thanks Tamar.
> >
> > > Those two cases also *completely* stop vectorization because of either the
> > > control flow or the fact the vectorizer can't handle complex types.
> >
> > Yes, we eventually would like to vectorize the SAT ALU but we start with scalar part
> > first.
> > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> > well with some additions as below.
> > Feel free to correct me if any misunderstandings.
> >
> > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> > usadd$a3(as well as gen_int_libfunc) for int.
> > 2. We need to implement a default implementation of SAT_ADD if
> > direct_binary_optab_supported_p is false.
> >     It looks like the default implementation is difficult to make every backend happy.
> > That is why you suggest just normal
> >     DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> >
> > Thanks Richard.
> >
> > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > passes doing what I'd call instruction selection).  But that means not adding
> > > match.pd patterns for that or at least have a separate isel-match.pd
> > > machinery for that.
> >
> > > So as a start I would go for a direct optab and see to recognize it during
> > > ISEL?
> >
> > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> > to know isel and I am happy to
> > try that once we have conclusion.
> >
>
> So after a lively discussion on IRC, the conclusion is that before we proceed Richi would
> like to see some examples of various operations.  The problem is that unsigned saturating
> addition is the simplest example and it may lead to an implementation strategy that doesn't
> scale.
>
> So I'd suggest writing some example of both signed and unsigned saturating add and multiply
>
> Because signed addition, will likely require a branch and signed multiplication would require a
> larger type.
>
> This would allow us to better understand what kind of gimple would have to to deal with in
> ISEL and VECT if we decide not to lower early.

More specifically before making something like .SAT_ADD a core part of
GIMPLE I'd like
to point out that we have saturating PLUS_EXPR but just for
fixed-point types.  I realize
Joseph thinks that keying this on the type was wrong and it should
have used integer
types and special saturating operations.  Still having both,
type-keyed saturating PLUS_EXPR
and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess.

It might be that the way to go is to turn all existing saturating type
*_EXPR into
.SAT_* internal function calls, in the end mapping to the optabs and
eventual RTX codes.

That could work for both integer types and fixed-point types.

I'll also note that "saturating" is just another variant of overflow
behavior of which we have
trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and
also (kind-of) sanitized.
We do lack direct IL representation of -ftrapv and -fwrapv, the
semantics on a PLUS_EXPR
depend on per-function flags.  Eventually a common representation
could be found here.
For saturating I was thinking of .ADD_OVERFLOW (a, b,
saturation-value), a "trap" could
be a "trapping" saturation value, "undefined" could be a
"not-a-thing".  But I didn't think much
about this.

That said, I would like to see the bigger picture to be kept in mind
before altering the GIMPLE IL.

Adding an internal function for an already present optab is a
no-brainer.  Adding a vectorizer
and/or if-conversion pattern to make use of this during vectorization
is existing practice.
Adding pattern recognition to ISEL or widening-mul passes for
instructions the CPU can do
is existing practice and OK.

Thanks,
Richard.


> Thanks,
> Tamar
>
> > Pan
> >
> > -----Original Message-----
> > From: Tamar Christina <Tamar.Christina@arm.com>
> > Sent: Tuesday, February 27, 2024 5:57 PM
> > To: Richard Biener <richard.guenther@gmail.com>
> > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com;
> > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > US_PLUS
> >
> > > -----Original Message-----
> > > From: Richard Biener <richard.guenther@gmail.com>
> > > Sent: Tuesday, February 27, 2024 9:44 AM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: pan2.li@intel.com; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > yanzhang.wang@intel.com; kito.cheng@gmail.com;
> > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > US_PLUS
> > >
> > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> > > <Tamar.Christina@arm.com> wrote:
> > > >
> > > > Hi Pan,
> > > >
> > > > > From: Pan Li <pan2.li@intel.com>
> > > > >
> > > > > Hi Richard & Tamar,
> > > > >
> > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > > >
> > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > > > >
> > > >
> > > > I think a couple of things are being confused here.  So lets break it down:
> > > >
> > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > > we only want one internal function for both signed and unsigned SAT_ADD.
> > > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > > >
> > > > Now the downside of this is that this is a direct internal optab.  This means
> > > > that for the representation to be used the target *must* have the optab
> > > > implemented.   This is a bit annoying because it doesn't allow us to generically
> > > > assume that all targets use SAT_ADD for saturating add and thus only have to
> > > > write optimization for this representation.
> > > >
> > > > This is why Richi said we may need to use a new tree_code because we can
> > > > override tree code expansions.  However the same can be done with the
> > _EXT_FN
> > > > internal functions.
> > > >
> > > > So what I meant was that we want to have a combination of the two. i.e. a
> > > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> > >
> > > Whether we want/need _EXT or only direct depends mainly on how we want to
> > > leverage support.  If it's only during vectorization and possibly instruction
> > > selection a direct optab is IMO the way to go.  Generic optimization only
> > > marginally improves when you explode the number of basic operations you
> > > expose - in fact it gets quite unwieldly to support all of them in
> > > simplifications
> > > and/or canonicalization and you possibly need to translate them back to what
> > > the target CPU supports.
> > >
> > > We already do have too many (IMO) "special" operations exposed "early"
> > > in the GIMPLE pipeline.
> > >
> > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > passes doing what I'd call instruction selection).  But that means not adding
> > > match.pd patterns for that or at least have a separate isel-match.pd
> > > machinery for that.
> > >
> > > So as a start I would go for a direct optab and see to recognize it during
> > > ISEL?
> > >
> >
> > The problem with ISEL and the reason I suggested an indirect IFN is that there
> > Are benefit to be had from recognizing it early.  Saturating arithmetic can be
> > optimized
> > Differently from non-saturating ones.
> >
> > But additionally a common way of specifying them decomposes to branches
> > and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).
> >
> > These two representation can be detected in PHI-opts and it's beneficial to all
> > targets to canonicalize them to the branchless code.
> >
> > Those two cases also *completely* stop vectorization because of either the
> > control flow or the fact the vectorizer can't handle complex types.
> >
> > So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
> > We'd still need to hack the vectorizer cost models for targets with saturating
> > vector instructions.
> >
> > I of course defer to you, but it seems quite suboptimal to do it this way and
> > doesn't get us first class saturation support.
> >
> > Additionally there have been discussions whether both clang and gcc should
> > provide __builtin_saturate_* methods, which the non-direct IFN would help
> > support.
> >
> > Tamar.
> >
> > > > If Richi agrees, the below is what I meant. It creates the infrastructure for this
> > > > and for now only allows a default fallback for unsigned saturating add and
> > makes
> > > > it easier for us to add the rest later
> > > >
> > > > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
> > > the
> > > > RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> > > > need to make new us_plus and ss_plus ones.
> > > >
> > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > index a07f25f3aee..aaf9f8991b3 100644
> > > > --- a/gcc/internal-fn.cc
> > > > +++ b/gcc/internal-fn.cc
> > > > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> > > tree_pair types,
> > > >         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > >                                                   opt_type);            \
> > > >        }
> > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > SIGNED_OPTAB, \
> > > > +                                        UNSIGNED_OPTAB, TYPE)          \
> > > > +    case IFN_##CODE:                                                   \
> > > > +      {                                                                        \
> > > > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> > > > +                            ? UNSIGNED_OPTAB ## _optab                 \
> > > > +                            : SIGNED_OPTAB ## _optab);                 \
> > > > +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > +                                                 opt_type)             \
> > > > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> > > > +      }
> > > >  #include "internal-fn.def"
> > > >
> > > >      case IFN_LAST:
> > > > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> > > >      optab which_optab = direct_internal_fn_optab (fn, types);          \
> > > >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
> > > >    }
> > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > SIGNED_OPTAB, \
> > > > +                                        UNSIGNED_OPTAB, TYPE)
> > > >  #include "internal-fn.def"
> > > >
> > > >  /* Routines to expand each internal function, indexed by function number.
> > > > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > >        emit_move_insn (plhs, cmp);
> > > >      }
> > > >  }
> > > > +
> > > > +void
> > > > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > > > +{
> > > > +  /* Check if the target supports the expansion through an IFN.  */
> > > > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > > > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > > > +  if (direct_binary_optab_supported_p (which_optab, types,
> > > > +                                      insn_optimization_type ()))
> > > > +    {
> > > > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > > > +      return;
> > > > +    }
> > > > +
> > > > +  /* Target does not support the optab, but we can de-compose it.  */
> > > > +  /*
> > > > +  ... decompose to a canonical representation ...
> > > > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > > > +    {
> > > > +      ...
> > > > +      decompose back to (X + Y) | - ((X + Y) < X)
> > > > +    }
> > > > +  else
> > > > +    {
> > > > +      ...
> > > > +    }
> > > > +  */
> > > > +}
> > > > +
> > > > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> > > optype */)
> > > > +{
> > > > +  /* For now, don't support decomposing vector ops.  */
> > > > +  if (VECTOR_TYPE_P (type))
> > > > +    return false;
> > > > +
> > > > +  /* Signed saturating arithmetic is harder to do since we'll so for now
> > > > +     lets ignore.  */
> > > > +  if (!TYPE_UNSIGNED (type))
> > > > +    return false;
> > > > +
> > > > +  return TREE_CODE (type) == INTEGER_TYPE;
> > > > +}
> > > > \ No newline at end of file
> > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > index c14d30365c1..5a2491228d5 100644
> > > > --- a/gcc/internal-fn.def
> > > > +++ b/gcc/internal-fn.def
> > > > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> > > >     unsigned inputs respectively, both without the trailing "_optab".
> > > >     SELECTOR says which type in the tree_pair determines the signedness.
> > > >
> > > > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> > > DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > > > +   that it has expand_##NAME defined in internal-fn.cc to override the
> > > > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > > > +
> > > >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
> > > >     the function implements the computational part of a built-in math
> > > >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > > > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
> > > >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> > > >  #endif
> > > >
> > > > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> > > SIGNED_OPTAB, \
> > > > +                                    UNSIGNED_OPTAB, TYPE) \
> > > > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> > > SIGNED_OPTAB, \
> > > > +                               UNSIGNED_OPTAB, TYPE)
> > > > +#endif
> > > > +
> > > >  #ifndef DEF_INTERNAL_FLT_FN
> > > >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> > > >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > > > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> > > ECF_CONST | ECF_NOTHROW, first,
> > > >                               smulhs, umulhs, binary)
> > > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> > > first,
> > > >                               smulhrs, umulhrs, binary)
> > > > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> > > ECF_NOTHROW, first,
> > > > +                                 ssadd, usadd, binary)
> > > >
> > > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > > > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> > > ECF_LEAF, ". R . ")
> > > >  #undef DEF_INTERNAL_FLT_FN
> > > >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> > > >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > > > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > >  #undef DEF_INTERNAL_OPTAB_FN
> > > >  #undef DEF_INTERNAL_FN
> > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > index bccee1c3e09..dbdb1e6bad2 100644
> > > > --- a/gcc/internal-fn.h
> > > > +++ b/gcc/internal-fn.h
> > > > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> > > *);
> > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > > > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
> > > >
> > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > >
> > > > > Note this patch is a draft for validation, no test are invovled here.
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > > > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > > > >       (BUILT_IN_US_PLUSL): Ditto.
> > > > >       (BUILT_IN_US_PLUSLL): Ditto.
> > > > >       (BUILT_IN_US_PLUSG): Ditto.
> > > > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > > > >       func decl for expanding us_plus.
> > > > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > > > >       impl for expanding us_plus.
> > > > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > > > >       us_plus<mode>3.
> > > > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > > > >       US_PLUS.
> > > > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > > > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > > > >       * match.pd: Add new simplify pattern for us_plus.
> > > > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > > > >
> > > > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > > > ---
> > > > >  gcc/builtins.def                |  7 +++++
> > > > >  gcc/config/riscv/riscv-protos.h |  1 +
> > > > >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> > > > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > > > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > > > >  gcc/internal-fn.def             |  3 +++
> > > > >  gcc/internal-fn.h               |  1 +
> > > > >  gcc/match.pd                    | 17 ++++++++++++
> > > > >  gcc/optabs.def                  |  2 ++
> > > > >  9 files changed, 114 insertions(+)
> > > > >
> > > > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > > > index f6f3e104f6a..0777b912cfa 100644
> > > > > --- a/gcc/builtins.def
> > > > > +++ b/gcc/builtins.def
> > > > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> > > (BUILT_IN_POPCOUNTIMAX,
> > > > > "popcountimax", BT_FN_INT_UINTMAX
> > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> > > BT_FN_INT_ULONG,
> > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> > > BT_FN_INT_VAR,
> > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > +
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl",
> > BT_FN_INT_ULONG,
> > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > +
> > > > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > > > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > > > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > > > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > > > index 80efdf2b7e5..ba6086f1f25 100644
> > > > > --- a/gcc/config/riscv/riscv-protos.h
> > > > > +++ b/gcc/config/riscv/riscv-protos.h
> > > > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const
> > > tree,
> > > > > const char *);
> > > > >  extern bool
> > > > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > > > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > > > >
> > > > >  #ifdef RTX_CODE
> > > > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool
> > *invert_ptr
> > > =
> > > > > 0);
> > > > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > > > index 4100abc9dd1..23f08974f07 100644
> > > > > --- a/gcc/config/riscv/riscv.cc
> > > > > +++ b/gcc/config/riscv/riscv.cc
> > > > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > > > (machine_mode)
> > > > >    return true;
> > > > >  }
> > > > >
> > > > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > > > +void
> > > > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > > > +{
> > > > > +  machine_mode mode = GET_MODE (dest);
> > > > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > > > +
> > > > > +  /* Step-1: sum = x + y  */
> > > > > +  if (mode == SImode && mode != Pmode)
> > > > > +    { /* Take addw to avoid the sum truncate.  */
> > > > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > > > +    }
> > > > > +  else
> > > > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > > > +
> > > > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.
> > */
> > > > > +  if (mode == HImode || mode == QImode)
> > > > > +    {
> > > > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > > > +
> > > > > +      gcc_assert (shift_bits > 0);
> > > > > +
> > > > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> > > (shift_bits));
> > > > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > > > (shift_bits));
> > > > > +    }
> > > > > +
> > > > > +  /* Step-2: lt = sum < x  */
> > > > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > > > +
> > > > > +  /* Step-3: lt = -lt  */
> > > > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > > > +
> > > > > +  /* Step-4: pmode_dest = sum | lt  */
> > > > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > > > +
> > > > > +  /* Step-5: dest = pmode_dest */
> > > > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > > > +}
> > > > > +
> > > > >  /* Initialize the GCC target structure.  */
> > > > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > > > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > > > index 3f7a023d941..eaa9867023c 100644
> > > > > --- a/gcc/config/riscv/riscv.md
> > > > > +++ b/gcc/config/riscv/riscv.md
> > > > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > > > >    [(set_attr "type" "load")
> > > > >     (set (attr "length") (const_int 8))])
> > > > >
> > > > > +(define_expand "us_plus<mode>3"
> > > > > +  [(match_operand:ANYI   0 "register_operand")
> > > > > +   (match_operand:ANYI   1 "register_operand")
> > > > > +   (match_operand:ANYI   2 "register_operand")]
> > > > > +  ""
> > > > > +  {
> > > > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > > > +    DONE;
> > > > > +  }
> > > > > +)
> > > > > +
> > > > >  (include "bitmanip.md")
> > > > >  (include "crypto.md")
> > > > >  (include "sync.md")
> > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > index a07f25f3aee..a7341a57ffa 100644
> > > > > --- a/gcc/internal-fn.cc
> > > > > +++ b/gcc/internal-fn.cc
> > > > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > >        emit_move_insn (plhs, cmp);
> > > > >      }
> > > > >  }
> > > > > +
> > > > > +void
> > > > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > > > +{
> > > > > +  tree lhs = gimple_call_lhs (stmt);
> > > > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > > > +
> > > > > +  do_pending_stack_adjust ();
> > > > > +
> > > > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > > > +  rtx op_0 = expand_normal (rhs_0);
> > > > > +  rtx op_1 = expand_normal (rhs_1);
> > > > > +
> > > > > +  class expand_operand ops[3];
> > > > > +
> > > > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > > > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE
> > (rhs_0)));
> > > > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE
> > (rhs_1)));
> > > > > +
> > > > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > > > > (rhs_0)));
> > > > > +  expand_insn (code, 3, ops);
> > > > > +
> > > > > +  if (!rtx_equal_p (target, ops[0].value))
> > > > > +    emit_move_insn (target, ops[0].value);
> > > > > +}
> > > >
> > > > This can be simplified by calling expand_binary_optab_fn instead. See my
> > > template
> > > >
> > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > index c14d30365c1..b1d7b5a0307 100644
> > > > > --- a/gcc/internal-fn.def
> > > > > +++ b/gcc/internal-fn.def
> > > > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > > > ECF_NOTHROW, ffs, unary)
> > > > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity,
> > unary)
> > > > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > > > popcount, unary)
> > > > >
> > > > > +/* Binary integer ops.  */
> > > > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> > > us_plus,
> > > > > binary)
> > > > > +
> > > > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > > > ECF_NOTHROW, NULL)
> > > > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > > > ECF_NOTHROW, NULL)
> > > > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW,
> > NULL)
> > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > index bccee1c3e09..46e404b4a49 100644
> > > > > --- a/gcc/internal-fn.h
> > > > > +++ b/gcc/internal-fn.h
> > > > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn,
> > gcall
> > > *);
> > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > > > >
> > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > >
> > > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > > index c5b6540f939..f45fd58ad23 100644
> > > > > --- a/gcc/match.pd
> > > > > +++ b/gcc/match.pd
> > > > > @@ -10265,3 +10265,20 @@ and,
> > > > >        }
> > > > >        (if (full_perm_p)
> > > > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > > +
> > > > > +#if GIMPLE
> > > > > +
> > > > > +/* Unsigned saturation add, aka:
> > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > > > +(simplify
> > > > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > > > +   (if (optimize
> > > > > +     && INTEGRAL_TYPE_P (type)
> > > > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > > > +     && types_match (type, TREE_TYPE (@0))
> > > > > +     && types_match (type, TREE_TYPE (@1))
> > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > OPTIMIZE_FOR_BOTH))
> > > > > +   (IFN_US_PLUS @0 @1)))
> > > > > +
> > > > > +#endif
> > > >
> > > > With the version above you can drop the #if GIMPLE and the
> > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > >
> > > > Check.
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > > > index ad14f9328b9..5855c4e0834 100644
> > > > > --- a/gcc/optabs.def
> > > > > +++ b/gcc/optabs.def
> > > > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb",
> > '2',
> > > > > gen_int_libfunc)
> > > > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > > > gen_int_libfunc)
> > > > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> > > > >
> > > > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > > > gen_int_libfunc)
> > > > > +
> > > > >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> > > > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> > > gen_int_fp_fixed_libfunc)
> > > > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > > > --
> > > > > 2.34.1
> > > >
Richard Biener March 4, 2024, 10:31 a.m. UTC | #10
On Sat, Mar 2, 2024 at 8:46 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Hi Richard and Tamar,
>
> I have a try with DEF_INTERNAL_SIGNED_OPTAB_FN for SAT_ADD/SUB/MUL but meet some problem when match.pd.
>
> For unsigned SAT_ADD = (x + y) | - ((x + y) < x), the match.pd can be (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0)))).
> For unsigned SAT_SUB = x >= y ? x - y : 0, and then match.pd can be (cond (ge @0 @1) (minus @0 @1) integer_zerop).
>
> For signed SAT_ADD/SAT_SUB as below, seems not easy to make the simplify pattern works well as expected up to a point.
> sint64_t sat_add (sint64_t x, sint64_t y)
> {
>   sint64_t a = x ^ y;
>   sint64_t add = x + y;
>   sint64_t b = sum ^ x;
>
>   return (a < 0 || (a >= 0 && b >= 0)) ? add : (MAX_INT64 + (x < 0));
> }
>
> sint64_t sad_sub (sint64_t x, sint64_t y)
> {
>   sint64_t a = x ^ y;
>   sint64_t sub = x - y;
>   sint64_t b = sub ^ x;
>
>   return (a >= 0 || (a < 0 && b >= 0) ? sub : (MAX_INT64 + (x < 0));
> }
>
> For SAT_MUL as below, looks we may need widen type. I am not sure if we can leverage MUL_OVERFLOW or not in match.pd.
>
> uint32_t sat_mul (uint32_t x, uint32_t y)
> {
>   uint64_t mul = (uint64_t)x * (uint64_t)y;
>   return mul > UINT32_MAX ? UINT32_MAX : (uint32_t)mul;
> }
>
> sint32_t sat_mul (sint32_t x, sint32_t y)
> {
>   sint64_t mul = (sint64_t)x * (sint64_t))y;
>
>   return mul <= MAX_INT32 && mul >= MIN_INT32 ? mul : MAX_INT32 + (x ^ y) > 0;
> }
>
> Below diff only contains unsigned SAT_ADD and SAT_SUB for prototype validation.
> I will continue to try the rest part in match.pd and keep you posted.
>
> -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 80efdf2b7e5..d9ad6fe2b58 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -132,6 +132,9 @@ extern void riscv_asm_output_external (FILE *, const tree, const char *);
>  extern bool
>  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
>  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> +extern void riscv_expand_usadd (rtx, rtx, rtx);
> +extern void riscv_expand_ussub (rtx, rtx, rtx);
>
>  #ifdef RTX_CODE
>  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0);
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 5e984ee2a55..795462526df 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -10655,6 +10655,28 @@ riscv_vector_mode_supported_any_target_p (machine_mode)
>    return true;
>  }
>
> +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> +void
> +riscv_expand_usadd (rtx dest, rtx x, rtx y)
> +{
> +  fprintf (stdout, "Hit riscv_expand_usadd.\n");
> +  // ToDo
> +}
> +
> +void
> +riscv_expand_ussub (rtx dest, rtx x, rtx y)
> +{
> +  fprintf (stdout, "Hit riscv_expand_ussub.\n");
> +  // ToDo
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_ASM_ALIGNED_HI_OP
>  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 1fec13092e2..e2dbadb3ead 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -3839,6 +3839,39 @@ (define_insn "*large_load_address"
>    [(set_attr "type" "load")
>     (set (attr "length") (const_int 8))])
>
> +(define_expand "usadd<mode>3"
> +  [(match_operand:ANYI 0 "register_operand")
> +   (match_operand:ANYI 1 "register_operand")
> +   (match_operand:ANYI 2 "register_operand")]
> +  ""
> +  {
> +    riscv_expand_usadd (operands[0], operands[1], operands[2]);
> +    DONE;
> +  }
> +)
> +
> +(define_expand "ussub<mode>3"
> +  [(match_operand:ANYI 0 "register_operand")
> +   (match_operand:ANYI 1 "register_operand")
> +   (match_operand:ANYI 2 "register_operand")]
> +  ""
> +  {
> +    riscv_expand_ussub (operands[0], operands[1], operands[2]);
> +    DONE;
> +  }
> +)
> +
>  (include "bitmanip.md")
>  (include "crypto.md")
>  (include "sync.md")
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 848bb9dbff3..0fff19c875f 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -275,6 +275,13 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
>  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
>                               smulhrs, umulhrs, binary)
>
> +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first,
> +                             ssadd, usadd, binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_SUB, ECF_CONST | ECF_NOTHROW, first,
> +                             sssub, ussub, binary)
> +
>  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
>  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
>  DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
> diff --git a/gcc/match.pd b/gcc/match.pd
> index f3fffd8dec2..6592dea643a 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -10276,3 +10276,32 @@ and,
>        }
>        (if (full_perm_p)
>         (vec_perm (op@3 @0 @1) @3 @2))))))
> +
> +#if GIMPLE
> +
> +/* Unsigned saturation add, aka:
> +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> +(simplify
> + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> +   (if (optimize
> +       && INTEGRAL_TYPE_P (type)
> +       && types_match (type, TREE_TYPE (@0))
> +       && types_match (type, TREE_TYPE (@1))
> +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> +       && direct_internal_fn_supported_p (IFN_SAT_ADD, type, OPTIMIZE_FOR_BOTH))
> +   (IFN_SAT_ADD @0 @1)))
> +
> +/* Unsigned saturation sub , aka
> +   SAT_SUBU = x >= y ? x - y : 0.  */
> +(simplify
> +  (cond (ge @0 @1) (minus @0 @1) integer_zerop)
> +    (if (optimize
> +       && INTEGRAL_TYPE_P (type)
> +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> +       && types_match (type, TREE_TYPE (@0))
> +       && types_match (type, TREE_TYPE (@1))
> +       && direct_internal_fn_supported_p (IFN_SAT_SUB, type, OPTIMIZE_FOR_BOTH))
> +    (IFN_SAT_SUB @0 @1)))
> +
> +#endif
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index ad14f9328b9..bebe38c888b 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -111,15 +111,15 @@ OPTAB_NX(add_optab, "add$F$a3")
>  OPTAB_NX(add_optab, "add$Q$a3")
>  OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc)
>  OPTAB_VX(addv_optab, "add$F$a3")
> -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc)
> -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
> +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_int_libfunc)
> +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_int_libfunc)
>  OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc)
>  OPTAB_NX(sub_optab, "sub$F$a3")
>  OPTAB_NX(sub_optab, "sub$Q$a3")
>  OPTAB_VL(subv_optab, "subv$I$a3", MINUS, "sub", '3', gen_intv_fp_libfunc)
>  OPTAB_VX(subv_optab, "sub$F$a3")
> -OPTAB_NL(sssub_optab, "sssub$Q$a3", SS_MINUS, "sssub", '3', gen_signed_fixed_libfunc)
> -OPTAB_NL(ussub_optab, "ussub$Q$a3", US_MINUS, "ussub", '3', gen_unsigned_fixed_libfunc)
> +OPTAB_NL(sssub_optab, "sssub$a3", SS_MINUS, "sssub", '3', gen_int_libfunc)
> +OPTAB_NL(ussub_optab, "ussub$a3", US_MINUS, "ussub", '3', gen_int_libfunc)

I do wonder what the existing usadd patterns with integer vector modes
in various targets do?
Those define_insn will at least not end up in the optab set I guess,
so they must end up
being either unused or used by explicit gen_* (via intrinsic
functions?) or by combine?

I think simply changing gen_*_fixed_libfunc to gen_int_libfunc won't
work.  Since there's
no libgcc support I'd leave it as gen_*_fixed_libfunc thus no library
fallback for integers?

>  OPTAB_NL(smul_optab, "mul$Q$a3", MULT, "mul", '3', gen_int_fp_fixed_libfunc)
>  OPTAB_NX(smul_optab, "mul$P$a3")
>  OPTAB_NX(smul_optab, "mul$F$a3")
>
> Pan
>
> -----Original Message-----
> From: Li, Pan2 <pan2.li@intel.com>
> Sent: Tuesday, February 27, 2024 10:36 PM
> To: Richard Biener <richard.guenther@gmail.com>; Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
>
> Thanks Richard and Tammer for moving this forward.
>
> > That said, I would like to see the bigger picture to be kept in mind
> > before altering the GIMPLE IL.
>
> > Adding an internal function for an already present optab is a
> > no-brainer.  Adding a vectorizer
> > and/or if-conversion pattern to make use of this during vectorization
> > is existing practice.
> > Adding pattern recognition to ISEL or widening-mul passes for
> > instructions the CPU can do
> > is existing practice and OK.
>
> Thanks for explaining, got the point here.
>
> > So I'd suggest writing some example of both signed and unsigned saturating add and multiply
>
> > Because signed addition, will likely require a branch and signed multiplication would require a
> > larger type.
>
> Ack, will prepare one prototype validation patch for add, sub and mul (both unsigned and signed) soon.
>
> Pan
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Tuesday, February 27, 2024 9:42 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
>
> On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
> >
> > > Thanks Tamar.
> > >
> > > > Those two cases also *completely* stop vectorization because of either the
> > > > control flow or the fact the vectorizer can't handle complex types.
> > >
> > > Yes, we eventually would like to vectorize the SAT ALU but we start with scalar part
> > > first.
> > > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> > > well with some additions as below.
> > > Feel free to correct me if any misunderstandings.
> > >
> > > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> > > usadd$a3(as well as gen_int_libfunc) for int.
> > > 2. We need to implement a default implementation of SAT_ADD if
> > > direct_binary_optab_supported_p is false.
> > >     It looks like the default implementation is difficult to make every backend happy.
> > > That is why you suggest just normal
> > >     DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> > >
> > > Thanks Richard.
> > >
> > > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > passes doing what I'd call instruction selection).  But that means not adding
> > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > machinery for that.
> > >
> > > > So as a start I would go for a direct optab and see to recognize it during
> > > > ISEL?
> > >
> > > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> > > to know isel and I am happy to
> > > try that once we have conclusion.
> > >
> >
> > So after a lively discussion on IRC, the conclusion is that before we proceed Richi would
> > like to see some examples of various operations.  The problem is that unsigned saturating
> > addition is the simplest example and it may lead to an implementation strategy that doesn't
> > scale.
> >
> > So I'd suggest writing some example of both signed and unsigned saturating add and multiply
> >
> > Because signed addition, will likely require a branch and signed multiplication would require a
> > larger type.
> >
> > This would allow us to better understand what kind of gimple would have to to deal with in
> > ISEL and VECT if we decide not to lower early.
>
> More specifically before making something like .SAT_ADD a core part of
> GIMPLE I'd like
> to point out that we have saturating PLUS_EXPR but just for
> fixed-point types.  I realize
> Joseph thinks that keying this on the type was wrong and it should
> have used integer
> types and special saturating operations.  Still having both,
> type-keyed saturating PLUS_EXPR
> and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess.
>
> It might be that the way to go is to turn all existing saturating type
> *_EXPR into
> .SAT_* internal function calls, in the end mapping to the optabs and
> eventual RTX codes.
>
> That could work for both integer types and fixed-point types.
>
> I'll also note that "saturating" is just another variant of overflow
> behavior of which we have
> trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and
> also (kind-of) sanitized.
> We do lack direct IL representation of -ftrapv and -fwrapv, the
> semantics on a PLUS_EXPR
> depend on per-function flags.  Eventually a common representation
> could be found here.
> For saturating I was thinking of .ADD_OVERFLOW (a, b,
> saturation-value), a "trap" could
> be a "trapping" saturation value, "undefined" could be a
> "not-a-thing".  But I didn't think much
> about this.
>
> That said, I would like to see the bigger picture to be kept in mind
> before altering the GIMPLE IL.
>
> Adding an internal function for an already present optab is a
> no-brainer.  Adding a vectorizer
> and/or if-conversion pattern to make use of this during vectorization
> is existing practice.
> Adding pattern recognition to ISEL or widening-mul passes for
> instructions the CPU can do
> is existing practice and OK.
>
> Thanks,
> Richard.
>
>
> > Thanks,
> > Tamar
> >
> > > Pan
> > >
> > > -----Original Message-----
> > > From: Tamar Christina <Tamar.Christina@arm.com>
> > > Sent: Tuesday, February 27, 2024 5:57 PM
> > > To: Richard Biener <richard.guenther@gmail.com>
> > > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com;
> > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > US_PLUS
> > >
> > > > -----Original Message-----
> > > > From: Richard Biener <richard.guenther@gmail.com>
> > > > Sent: Tuesday, February 27, 2024 9:44 AM
> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > Cc: pan2.li@intel.com; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > > yanzhang.wang@intel.com; kito.cheng@gmail.com;
> > > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > > US_PLUS
> > > >
> > > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> > > > <Tamar.Christina@arm.com> wrote:
> > > > >
> > > > > Hi Pan,
> > > > >
> > > > > > From: Pan Li <pan2.li@intel.com>
> > > > > >
> > > > > > Hi Richard & Tamar,
> > > > > >
> > > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > > > >
> > > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > > > > >
> > > > >
> > > > > I think a couple of things are being confused here.  So lets break it down:
> > > > >
> > > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > > > we only want one internal function for both signed and unsigned SAT_ADD.
> > > > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > > > >
> > > > > Now the downside of this is that this is a direct internal optab.  This means
> > > > > that for the representation to be used the target *must* have the optab
> > > > > implemented.   This is a bit annoying because it doesn't allow us to generically
> > > > > assume that all targets use SAT_ADD for saturating add and thus only have to
> > > > > write optimization for this representation.
> > > > >
> > > > > This is why Richi said we may need to use a new tree_code because we can
> > > > > override tree code expansions.  However the same can be done with the
> > > _EXT_FN
> > > > > internal functions.
> > > > >
> > > > > So what I meant was that we want to have a combination of the two. i.e. a
> > > > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> > > >
> > > > Whether we want/need _EXT or only direct depends mainly on how we want to
> > > > leverage support.  If it's only during vectorization and possibly instruction
> > > > selection a direct optab is IMO the way to go.  Generic optimization only
> > > > marginally improves when you explode the number of basic operations you
> > > > expose - in fact it gets quite unwieldly to support all of them in
> > > > simplifications
> > > > and/or canonicalization and you possibly need to translate them back to what
> > > > the target CPU supports.
> > > >
> > > > We already do have too many (IMO) "special" operations exposed "early"
> > > > in the GIMPLE pipeline.
> > > >
> > > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > passes doing what I'd call instruction selection).  But that means not adding
> > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > machinery for that.
> > > >
> > > > So as a start I would go for a direct optab and see to recognize it during
> > > > ISEL?
> > > >
> > >
> > > The problem with ISEL and the reason I suggested an indirect IFN is that there
> > > Are benefit to be had from recognizing it early.  Saturating arithmetic can be
> > > optimized
> > > Differently from non-saturating ones.
> > >
> > > But additionally a common way of specifying them decomposes to branches
> > > and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).
> > >
> > > These two representation can be detected in PHI-opts and it's beneficial to all
> > > targets to canonicalize them to the branchless code.
> > >
> > > Those two cases also *completely* stop vectorization because of either the
> > > control flow or the fact the vectorizer can't handle complex types.
> > >
> > > So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
> > > We'd still need to hack the vectorizer cost models for targets with saturating
> > > vector instructions.
> > >
> > > I of course defer to you, but it seems quite suboptimal to do it this way and
> > > doesn't get us first class saturation support.
> > >
> > > Additionally there have been discussions whether both clang and gcc should
> > > provide __builtin_saturate_* methods, which the non-direct IFN would help
> > > support.
> > >
> > > Tamar.
> > >
> > > > > If Richi agrees, the below is what I meant. It creates the infrastructure for this
> > > > > and for now only allows a default fallback for unsigned saturating add and
> > > makes
> > > > > it easier for us to add the rest later
> > > > >
> > > > > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
> > > > the
> > > > > RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> > > > > need to make new us_plus and ss_plus ones.
> > > > >
> > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > index a07f25f3aee..aaf9f8991b3 100644
> > > > > --- a/gcc/internal-fn.cc
> > > > > +++ b/gcc/internal-fn.cc
> > > > > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> > > > tree_pair types,
> > > > >         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > >                                                   opt_type);            \
> > > > >        }
> > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > SIGNED_OPTAB, \
> > > > > +                                        UNSIGNED_OPTAB, TYPE)          \
> > > > > +    case IFN_##CODE:                                                   \
> > > > > +      {                                                                        \
> > > > > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> > > > > +                            ? UNSIGNED_OPTAB ## _optab                 \
> > > > > +                            : SIGNED_OPTAB ## _optab);                 \
> > > > > +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > > +                                                 opt_type)             \
> > > > > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> > > > > +      }
> > > > >  #include "internal-fn.def"
> > > > >
> > > > >      case IFN_LAST:
> > > > > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> > > > >      optab which_optab = direct_internal_fn_optab (fn, types);          \
> > > > >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
> > > > >    }
> > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > SIGNED_OPTAB, \
> > > > > +                                        UNSIGNED_OPTAB, TYPE)
> > > > >  #include "internal-fn.def"
> > > > >
> > > > >  /* Routines to expand each internal function, indexed by function number.
> > > > > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > >        emit_move_insn (plhs, cmp);
> > > > >      }
> > > > >  }
> > > > > +
> > > > > +void
> > > > > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > > > > +{
> > > > > +  /* Check if the target supports the expansion through an IFN.  */
> > > > > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > > > > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > > > > +  if (direct_binary_optab_supported_p (which_optab, types,
> > > > > +                                      insn_optimization_type ()))
> > > > > +    {
> > > > > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > > > > +      return;
> > > > > +    }
> > > > > +
> > > > > +  /* Target does not support the optab, but we can de-compose it.  */
> > > > > +  /*
> > > > > +  ... decompose to a canonical representation ...
> > > > > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > > > > +    {
> > > > > +      ...
> > > > > +      decompose back to (X + Y) | - ((X + Y) < X)
> > > > > +    }
> > > > > +  else
> > > > > +    {
> > > > > +      ...
> > > > > +    }
> > > > > +  */
> > > > > +}
> > > > > +
> > > > > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> > > > optype */)
> > > > > +{
> > > > > +  /* For now, don't support decomposing vector ops.  */
> > > > > +  if (VECTOR_TYPE_P (type))
> > > > > +    return false;
> > > > > +
> > > > > +  /* Signed saturating arithmetic is harder to do since we'll so for now
> > > > > +     lets ignore.  */
> > > > > +  if (!TYPE_UNSIGNED (type))
> > > > > +    return false;
> > > > > +
> > > > > +  return TREE_CODE (type) == INTEGER_TYPE;
> > > > > +}
> > > > > \ No newline at end of file
> > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > index c14d30365c1..5a2491228d5 100644
> > > > > --- a/gcc/internal-fn.def
> > > > > +++ b/gcc/internal-fn.def
> > > > > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> > > > >     unsigned inputs respectively, both without the trailing "_optab".
> > > > >     SELECTOR says which type in the tree_pair determines the signedness.
> > > > >
> > > > > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> > > > DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > > > > +   that it has expand_##NAME defined in internal-fn.cc to override the
> > > > > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > > > > +
> > > > >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
> > > > >     the function implements the computational part of a built-in math
> > > > >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > > > > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
> > > > >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> > > > >  #endif
> > > > >
> > > > > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> > > > SIGNED_OPTAB, \
> > > > > +                                    UNSIGNED_OPTAB, TYPE) \
> > > > > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> > > > SIGNED_OPTAB, \
> > > > > +                               UNSIGNED_OPTAB, TYPE)
> > > > > +#endif
> > > > > +
> > > > >  #ifndef DEF_INTERNAL_FLT_FN
> > > > >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> > > > >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > > > > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> > > > ECF_CONST | ECF_NOTHROW, first,
> > > > >                               smulhs, umulhs, binary)
> > > > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> > > > first,
> > > > >                               smulhrs, umulhrs, binary)
> > > > > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> > > > ECF_NOTHROW, first,
> > > > > +                                 ssadd, usadd, binary)
> > > > >
> > > > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > > > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > > > > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> > > > ECF_LEAF, ". R . ")
> > > > >  #undef DEF_INTERNAL_FLT_FN
> > > > >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> > > > >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > > > > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > >  #undef DEF_INTERNAL_OPTAB_FN
> > > > >  #undef DEF_INTERNAL_FN
> > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > index bccee1c3e09..dbdb1e6bad2 100644
> > > > > --- a/gcc/internal-fn.h
> > > > > +++ b/gcc/internal-fn.h
> > > > > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> > > > *);
> > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > > > > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
> > > > >
> > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > >
> > > > > > Note this patch is a draft for validation, no test are invovled here.
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > > > > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > > > > >       (BUILT_IN_US_PLUSL): Ditto.
> > > > > >       (BUILT_IN_US_PLUSLL): Ditto.
> > > > > >       (BUILT_IN_US_PLUSG): Ditto.
> > > > > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > > > > >       func decl for expanding us_plus.
> > > > > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > > > > >       impl for expanding us_plus.
> > > > > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > > > > >       us_plus<mode>3.
> > > > > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > > > > >       US_PLUS.
> > > > > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > > > > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > > > > >       * match.pd: Add new simplify pattern for us_plus.
> > > > > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > > > > >
> > > > > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > > > > ---
> > > > > >  gcc/builtins.def                |  7 +++++
> > > > > >  gcc/config/riscv/riscv-protos.h |  1 +
> > > > > >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> > > > > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > > > > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > > > > >  gcc/internal-fn.def             |  3 +++
> > > > > >  gcc/internal-fn.h               |  1 +
> > > > > >  gcc/match.pd                    | 17 ++++++++++++
> > > > > >  gcc/optabs.def                  |  2 ++
> > > > > >  9 files changed, 114 insertions(+)
> > > > > >
> > > > > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > > > > index f6f3e104f6a..0777b912cfa 100644
> > > > > > --- a/gcc/builtins.def
> > > > > > +++ b/gcc/builtins.def
> > > > > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> > > > (BUILT_IN_POPCOUNTIMAX,
> > > > > > "popcountimax", BT_FN_INT_UINTMAX
> > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> > > > BT_FN_INT_ULONG,
> > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> > > > BT_FN_INT_VAR,
> > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > +
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > > > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl",
> > > BT_FN_INT_ULONG,
> > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > +
> > > > > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > > > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > > > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > > > > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > > > > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > > > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > > > > index 80efdf2b7e5..ba6086f1f25 100644
> > > > > > --- a/gcc/config/riscv/riscv-protos.h
> > > > > > +++ b/gcc/config/riscv/riscv-protos.h
> > > > > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const
> > > > tree,
> > > > > > const char *);
> > > > > >  extern bool
> > > > > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > > > > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > > > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > > > > >
> > > > > >  #ifdef RTX_CODE
> > > > > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool
> > > *invert_ptr
> > > > =
> > > > > > 0);
> > > > > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > > > > index 4100abc9dd1..23f08974f07 100644
> > > > > > --- a/gcc/config/riscv/riscv.cc
> > > > > > +++ b/gcc/config/riscv/riscv.cc
> > > > > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > > > > (machine_mode)
> > > > > >    return true;
> > > > > >  }
> > > > > >
> > > > > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > > > > +void
> > > > > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > > > > +{
> > > > > > +  machine_mode mode = GET_MODE (dest);
> > > > > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > > > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > > > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > > > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > > > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > > > > +
> > > > > > +  /* Step-1: sum = x + y  */
> > > > > > +  if (mode == SImode && mode != Pmode)
> > > > > > +    { /* Take addw to avoid the sum truncate.  */
> > > > > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > > > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > > > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > > > > +    }
> > > > > > +  else
> > > > > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > > > > +
> > > > > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.
> > > */
> > > > > > +  if (mode == HImode || mode == QImode)
> > > > > > +    {
> > > > > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > > > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > > > > +
> > > > > > +      gcc_assert (shift_bits > 0);
> > > > > > +
> > > > > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> > > > (shift_bits));
> > > > > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > > > > (shift_bits));
> > > > > > +    }
> > > > > > +
> > > > > > +  /* Step-2: lt = sum < x  */
> > > > > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > > > > +
> > > > > > +  /* Step-3: lt = -lt  */
> > > > > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > > > > +
> > > > > > +  /* Step-4: pmode_dest = sum | lt  */
> > > > > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > > > > +
> > > > > > +  /* Step-5: dest = pmode_dest */
> > > > > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > > > > +}
> > > > > > +
> > > > > >  /* Initialize the GCC target structure.  */
> > > > > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > > > > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > > > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > > > > index 3f7a023d941..eaa9867023c 100644
> > > > > > --- a/gcc/config/riscv/riscv.md
> > > > > > +++ b/gcc/config/riscv/riscv.md
> > > > > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > > > > >    [(set_attr "type" "load")
> > > > > >     (set (attr "length") (const_int 8))])
> > > > > >
> > > > > > +(define_expand "us_plus<mode>3"
> > > > > > +  [(match_operand:ANYI   0 "register_operand")
> > > > > > +   (match_operand:ANYI   1 "register_operand")
> > > > > > +   (match_operand:ANYI   2 "register_operand")]
> > > > > > +  ""
> > > > > > +  {
> > > > > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > > > > +    DONE;
> > > > > > +  }
> > > > > > +)
> > > > > > +
> > > > > >  (include "bitmanip.md")
> > > > > >  (include "crypto.md")
> > > > > >  (include "sync.md")
> > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > > index a07f25f3aee..a7341a57ffa 100644
> > > > > > --- a/gcc/internal-fn.cc
> > > > > > +++ b/gcc/internal-fn.cc
> > > > > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > > >        emit_move_insn (plhs, cmp);
> > > > > >      }
> > > > > >  }
> > > > > > +
> > > > > > +void
> > > > > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > > > > +{
> > > > > > +  tree lhs = gimple_call_lhs (stmt);
> > > > > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > > > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > > > > +
> > > > > > +  do_pending_stack_adjust ();
> > > > > > +
> > > > > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > > > > +  rtx op_0 = expand_normal (rhs_0);
> > > > > > +  rtx op_1 = expand_normal (rhs_1);
> > > > > > +
> > > > > > +  class expand_operand ops[3];
> > > > > > +
> > > > > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > > > > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE
> > > (rhs_0)));
> > > > > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE
> > > (rhs_1)));
> > > > > > +
> > > > > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > > > > > (rhs_0)));
> > > > > > +  expand_insn (code, 3, ops);
> > > > > > +
> > > > > > +  if (!rtx_equal_p (target, ops[0].value))
> > > > > > +    emit_move_insn (target, ops[0].value);
> > > > > > +}
> > > > >
> > > > > This can be simplified by calling expand_binary_optab_fn instead. See my
> > > > template
> > > > >
> > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > > index c14d30365c1..b1d7b5a0307 100644
> > > > > > --- a/gcc/internal-fn.def
> > > > > > +++ b/gcc/internal-fn.def
> > > > > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > > > > ECF_NOTHROW, ffs, unary)
> > > > > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity,
> > > unary)
> > > > > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > > > > popcount, unary)
> > > > > >
> > > > > > +/* Binary integer ops.  */
> > > > > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> > > > us_plus,
> > > > > > binary)
> > > > > > +
> > > > > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > > > > ECF_NOTHROW, NULL)
> > > > > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > > > > ECF_NOTHROW, NULL)
> > > > > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW,
> > > NULL)
> > > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > > index bccee1c3e09..46e404b4a49 100644
> > > > > > --- a/gcc/internal-fn.h
> > > > > > +++ b/gcc/internal-fn.h
> > > > > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn,
> > > gcall
> > > > *);
> > > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > > > > >
> > > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > > >
> > > > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > > > index c5b6540f939..f45fd58ad23 100644
> > > > > > --- a/gcc/match.pd
> > > > > > +++ b/gcc/match.pd
> > > > > > @@ -10265,3 +10265,20 @@ and,
> > > > > >        }
> > > > > >        (if (full_perm_p)
> > > > > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > > > +
> > > > > > +#if GIMPLE
> > > > > > +
> > > > > > +/* Unsigned saturation add, aka:
> > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > > > > +(simplify
> > > > > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > > > > +   (if (optimize
> > > > > > +     && INTEGRAL_TYPE_P (type)
> > > > > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > > > > +     && types_match (type, TREE_TYPE (@0))
> > > > > > +     && types_match (type, TREE_TYPE (@1))
> > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > > OPTIMIZE_FOR_BOTH))
> > > > > > +   (IFN_US_PLUS @0 @1)))
> > > > > > +
> > > > > > +#endif
> > > > >
> > > > > With the version above you can drop the #if GIMPLE and the
> > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > >
> > > > > Check.
> > > > >
> > > > > Thanks,
> > > > > Tamar
> > > > >
> > > > > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > > > > index ad14f9328b9..5855c4e0834 100644
> > > > > > --- a/gcc/optabs.def
> > > > > > +++ b/gcc/optabs.def
> > > > > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb",
> > > '2',
> > > > > > gen_int_libfunc)
> > > > > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > > > > gen_int_libfunc)
> > > > > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> > > > > >
> > > > > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > > > > gen_int_libfunc)
> > > > > > +
> > > > > >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> > > > > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> > > > gen_int_fp_fixed_libfunc)
> > > > > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > > > > --
> > > > > > 2.34.1
> > > > >
Li, Pan2 March 5, 2024, 7:09 a.m. UTC | #11
Thanks Richard for comments.

> I do wonder what the existing usadd patterns with integer vector modes
> in various targets do?
> Those define_insn will at least not end up in the optab set I guess,
> so they must end up
> being either unused or used by explicit gen_* (via intrinsic
> functions?) or by combine?

For usadd with vector modes, I think the backend like RISC-V try to leverage instructions
like Vector Single-Width Saturating Add(aka vsaddu.vv/x/i).

> I think simply changing gen_*_fixed_libfunc to gen_int_libfunc won't
> work.  Since there's
> no libgcc support I'd leave it as gen_*_fixed_libfunc thus no library
> fallback for integers?

Change to gen_int_libfunc follows other int optabs. I am not sure if it will hit the standard name usaddm3 for vector mode.
But the happy path for scalar modes works up to a point, please help to correct me if any misunderstanding.

#0  riscv_expand_usadd (dest=0x7ffff6a8c7c8, x=0x7ffff6a8c798, y=0x7ffff6a8c7b0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/config/riscv/riscv.cc:10662
#1  0x00000000029f142a in gen_usaddsi3 (operand0=0x7ffff6a8c7c8, operand1=0x7ffff6a8c798, operand2=0x7ffff6a8c7b0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/config/riscv/riscv.md:3848
#2  0x0000000001751e60 in insn_gen_fn::operator()<rtx_def*, rtx_def*, rtx_def*> (this=0x4910e70 <insn_data+1248976>) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/recog.h:441
#3  0x000000000180f553 in maybe_gen_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8232
#4  0x000000000180fa42 in maybe_expand_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8275
#5  0x000000000180fade in expand_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8306
#6  0x00000000015cebdc in expand_fn_using_insn (stmt=0x7ffff6a36480, icode=CODE_FOR_usaddsi3, noutputs=1, ninputs=2) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:254
#7  0x00000000015de146 in expand_direct_optab_fn (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480, optab=usadd_optab, nargs=2) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:3818
#8  0x00000000015e3610 in expand_SAT_ADD (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.def:278
#9  0x00000000015e65b6 in expand_internal_call (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:4914
#10 0x00000000015e65e5 in expand_internal_call (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:4922
#11 0x0000000001248c8f in expand_call_stmt (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:2771
#12 0x000000000124d392 in expand_gimple_stmt_1 (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:3932
#13 0x000000000124d9aa in expand_gimple_stmt (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:4077
#14 0x000000000124dac4 in expand_gimple_tailcall (bb=0x7ffff6dddae0, stmt=0x7ffff6a36480, can_fallthru=0x7fffffffd800) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:4123
#15 0x000000000125636b in expand_gimple_basic_block (bb=0x7ffff6dddae0, disable_tail_calls=false) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:6107
#16 0x0000000001258a1a in (anonymous namespace)::pass_expand::execute (this=0x556d180, fun=0x7ffff6a7f2e0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:6872
#17 0x0000000001873565 in execute_one_pass (pass=0x556d180) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2646
#18 0x0000000001873948 in execute_pass_list_1 (pass=0x556d180) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2755
#19 0x00000000018739d6 in execute_pass_list (fn=0x7ffff6a7f2e0, pass=0x5568870) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2766
#20 0x00000000012bc975 in cgraph_node::expand (this=0x7ffff6c2c880) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:1845
#21 0x00000000012bd18f in expand_all_functions () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2028
#22 0x00000000012bdcc5 in symbol_table::compile (this=0x7ffff6c06000) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2402
#23 0x00000000012be16c in symbol_table::finalize_compilation_unit (this=0x7ffff6c06000) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2587
#24 0x0000000001a048a3 in compile_file () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:476
#25 0x0000000001a079e9 in do_compile () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:2154
#26 0x0000000001a07dee in toplev::main (this=0x7fffffffdcf2, argc=19, argv=0x7fffffffde28) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:2310
#27 0x0000000003ebcc5d in main (argc=19, argv=0x7fffffffde28) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/main.cc:39

BTW, does match.pd support nested cond like below? I am debugging into gimple_simplify_COND_EXPR for why not hit the pattern...
+(simplify
+  (cond
+    (lt @0 integer_zerop)
+    (plus:c @0 @1)
+    (cond (lt @1 integer_zerop) @1 @0))
+  (IFN_SAT_ADD @0 @1))

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Monday, March 4, 2024 6:31 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; jeffreyalaw@gmail.com
Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

On Sat, Mar 2, 2024 at 8:46 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Hi Richard and Tamar,
>
> I have a try with DEF_INTERNAL_SIGNED_OPTAB_FN for SAT_ADD/SUB/MUL but meet some problem when match.pd.
>
> For unsigned SAT_ADD = (x + y) | - ((x + y) < x), the match.pd can be (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0)))).
> For unsigned SAT_SUB = x >= y ? x - y : 0, and then match.pd can be (cond (ge @0 @1) (minus @0 @1) integer_zerop).
>
> For signed SAT_ADD/SAT_SUB as below, seems not easy to make the simplify pattern works well as expected up to a point.
> sint64_t sat_add (sint64_t x, sint64_t y)
> {
>   sint64_t a = x ^ y;
>   sint64_t add = x + y;
>   sint64_t b = sum ^ x;
>
>   return (a < 0 || (a >= 0 && b >= 0)) ? add : (MAX_INT64 + (x < 0));
> }
>
> sint64_t sad_sub (sint64_t x, sint64_t y)
> {
>   sint64_t a = x ^ y;
>   sint64_t sub = x - y;
>   sint64_t b = sub ^ x;
>
>   return (a >= 0 || (a < 0 && b >= 0) ? sub : (MAX_INT64 + (x < 0));
> }
>
> For SAT_MUL as below, looks we may need widen type. I am not sure if we can leverage MUL_OVERFLOW or not in match.pd.
>
> uint32_t sat_mul (uint32_t x, uint32_t y)
> {
>   uint64_t mul = (uint64_t)x * (uint64_t)y;
>   return mul > UINT32_MAX ? UINT32_MAX : (uint32_t)mul;
> }
>
> sint32_t sat_mul (sint32_t x, sint32_t y)
> {
>   sint64_t mul = (sint64_t)x * (sint64_t))y;
>
>   return mul <= MAX_INT32 && mul >= MIN_INT32 ? mul : MAX_INT32 + (x ^ y) > 0;
> }
>
> Below diff only contains unsigned SAT_ADD and SAT_SUB for prototype validation.
> I will continue to try the rest part in match.pd and keep you posted.
>
> -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 80efdf2b7e5..d9ad6fe2b58 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -132,6 +132,9 @@ extern void riscv_asm_output_external (FILE *, const tree, const char *);
>  extern bool
>  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
>  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> +extern void riscv_expand_usadd (rtx, rtx, rtx);
> +extern void riscv_expand_ussub (rtx, rtx, rtx);
>
>  #ifdef RTX_CODE
>  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0);
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 5e984ee2a55..795462526df 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -10655,6 +10655,28 @@ riscv_vector_mode_supported_any_target_p (machine_mode)
>    return true;
>  }
>
> +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> +void
> +riscv_expand_usadd (rtx dest, rtx x, rtx y)
> +{
> +  fprintf (stdout, "Hit riscv_expand_usadd.\n");
> +  // ToDo
> +}
> +
> +void
> +riscv_expand_ussub (rtx dest, rtx x, rtx y)
> +{
> +  fprintf (stdout, "Hit riscv_expand_ussub.\n");
> +  // ToDo
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_ASM_ALIGNED_HI_OP
>  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 1fec13092e2..e2dbadb3ead 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -3839,6 +3839,39 @@ (define_insn "*large_load_address"
>    [(set_attr "type" "load")
>     (set (attr "length") (const_int 8))])
>
> +(define_expand "usadd<mode>3"
> +  [(match_operand:ANYI 0 "register_operand")
> +   (match_operand:ANYI 1 "register_operand")
> +   (match_operand:ANYI 2 "register_operand")]
> +  ""
> +  {
> +    riscv_expand_usadd (operands[0], operands[1], operands[2]);
> +    DONE;
> +  }
> +)
> +
> +(define_expand "ussub<mode>3"
> +  [(match_operand:ANYI 0 "register_operand")
> +   (match_operand:ANYI 1 "register_operand")
> +   (match_operand:ANYI 2 "register_operand")]
> +  ""
> +  {
> +    riscv_expand_ussub (operands[0], operands[1], operands[2]);
> +    DONE;
> +  }
> +)
> +
>  (include "bitmanip.md")
>  (include "crypto.md")
>  (include "sync.md")
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 848bb9dbff3..0fff19c875f 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -275,6 +275,13 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
>  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
>                               smulhrs, umulhrs, binary)
>
> +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first,
> +                             ssadd, usadd, binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_SUB, ECF_CONST | ECF_NOTHROW, first,
> +                             sssub, ussub, binary)
> +
>  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
>  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
>  DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
> diff --git a/gcc/match.pd b/gcc/match.pd
> index f3fffd8dec2..6592dea643a 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -10276,3 +10276,32 @@ and,
>        }
>        (if (full_perm_p)
>         (vec_perm (op@3 @0 @1) @3 @2))))))
> +
> +#if GIMPLE
> +
> +/* Unsigned saturation add, aka:
> +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> +(simplify
> + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> +   (if (optimize
> +       && INTEGRAL_TYPE_P (type)
> +       && types_match (type, TREE_TYPE (@0))
> +       && types_match (type, TREE_TYPE (@1))
> +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> +       && direct_internal_fn_supported_p (IFN_SAT_ADD, type, OPTIMIZE_FOR_BOTH))
> +   (IFN_SAT_ADD @0 @1)))
> +
> +/* Unsigned saturation sub , aka
> +   SAT_SUBU = x >= y ? x - y : 0.  */
> +(simplify
> +  (cond (ge @0 @1) (minus @0 @1) integer_zerop)
> +    (if (optimize
> +       && INTEGRAL_TYPE_P (type)
> +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> +       && types_match (type, TREE_TYPE (@0))
> +       && types_match (type, TREE_TYPE (@1))
> +       && direct_internal_fn_supported_p (IFN_SAT_SUB, type, OPTIMIZE_FOR_BOTH))
> +    (IFN_SAT_SUB @0 @1)))
> +
> +#endif
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index ad14f9328b9..bebe38c888b 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -111,15 +111,15 @@ OPTAB_NX(add_optab, "add$F$a3")
>  OPTAB_NX(add_optab, "add$Q$a3")
>  OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc)
>  OPTAB_VX(addv_optab, "add$F$a3")
> -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc)
> -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
> +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_int_libfunc)
> +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_int_libfunc)
>  OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc)
>  OPTAB_NX(sub_optab, "sub$F$a3")
>  OPTAB_NX(sub_optab, "sub$Q$a3")
>  OPTAB_VL(subv_optab, "subv$I$a3", MINUS, "sub", '3', gen_intv_fp_libfunc)
>  OPTAB_VX(subv_optab, "sub$F$a3")
> -OPTAB_NL(sssub_optab, "sssub$Q$a3", SS_MINUS, "sssub", '3', gen_signed_fixed_libfunc)
> -OPTAB_NL(ussub_optab, "ussub$Q$a3", US_MINUS, "ussub", '3', gen_unsigned_fixed_libfunc)
> +OPTAB_NL(sssub_optab, "sssub$a3", SS_MINUS, "sssub", '3', gen_int_libfunc)
> +OPTAB_NL(ussub_optab, "ussub$a3", US_MINUS, "ussub", '3', gen_int_libfunc)

I do wonder what the existing usadd patterns with integer vector modes
in various targets do?
Those define_insn will at least not end up in the optab set I guess,
so they must end up
being either unused or used by explicit gen_* (via intrinsic
functions?) or by combine?

I think simply changing gen_*_fixed_libfunc to gen_int_libfunc won't
work.  Since there's
no libgcc support I'd leave it as gen_*_fixed_libfunc thus no library
fallback for integers?

>  OPTAB_NL(smul_optab, "mul$Q$a3", MULT, "mul", '3', gen_int_fp_fixed_libfunc)
>  OPTAB_NX(smul_optab, "mul$P$a3")
>  OPTAB_NX(smul_optab, "mul$F$a3")
>
> Pan
>
> -----Original Message-----
> From: Li, Pan2 <pan2.li@intel.com>
> Sent: Tuesday, February 27, 2024 10:36 PM
> To: Richard Biener <richard.guenther@gmail.com>; Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
>
> Thanks Richard and Tammer for moving this forward.
>
> > That said, I would like to see the bigger picture to be kept in mind
> > before altering the GIMPLE IL.
>
> > Adding an internal function for an already present optab is a
> > no-brainer.  Adding a vectorizer
> > and/or if-conversion pattern to make use of this during vectorization
> > is existing practice.
> > Adding pattern recognition to ISEL or widening-mul passes for
> > instructions the CPU can do
> > is existing practice and OK.
>
> Thanks for explaining, got the point here.
>
> > So I'd suggest writing some example of both signed and unsigned saturating add and multiply
>
> > Because signed addition, will likely require a branch and signed multiplication would require a
> > larger type.
>
> Ack, will prepare one prototype validation patch for add, sub and mul (both unsigned and signed) soon.
>
> Pan
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Tuesday, February 27, 2024 9:42 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
>
> On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
> >
> > > Thanks Tamar.
> > >
> > > > Those two cases also *completely* stop vectorization because of either the
> > > > control flow or the fact the vectorizer can't handle complex types.
> > >
> > > Yes, we eventually would like to vectorize the SAT ALU but we start with scalar part
> > > first.
> > > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> > > well with some additions as below.
> > > Feel free to correct me if any misunderstandings.
> > >
> > > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> > > usadd$a3(as well as gen_int_libfunc) for int.
> > > 2. We need to implement a default implementation of SAT_ADD if
> > > direct_binary_optab_supported_p is false.
> > >     It looks like the default implementation is difficult to make every backend happy.
> > > That is why you suggest just normal
> > >     DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> > >
> > > Thanks Richard.
> > >
> > > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > passes doing what I'd call instruction selection).  But that means not adding
> > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > machinery for that.
> > >
> > > > So as a start I would go for a direct optab and see to recognize it during
> > > > ISEL?
> > >
> > > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> > > to know isel and I am happy to
> > > try that once we have conclusion.
> > >
> >
> > So after a lively discussion on IRC, the conclusion is that before we proceed Richi would
> > like to see some examples of various operations.  The problem is that unsigned saturating
> > addition is the simplest example and it may lead to an implementation strategy that doesn't
> > scale.
> >
> > So I'd suggest writing some example of both signed and unsigned saturating add and multiply
> >
> > Because signed addition, will likely require a branch and signed multiplication would require a
> > larger type.
> >
> > This would allow us to better understand what kind of gimple would have to to deal with in
> > ISEL and VECT if we decide not to lower early.
>
> More specifically before making something like .SAT_ADD a core part of
> GIMPLE I'd like
> to point out that we have saturating PLUS_EXPR but just for
> fixed-point types.  I realize
> Joseph thinks that keying this on the type was wrong and it should
> have used integer
> types and special saturating operations.  Still having both,
> type-keyed saturating PLUS_EXPR
> and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess.
>
> It might be that the way to go is to turn all existing saturating type
> *_EXPR into
> .SAT_* internal function calls, in the end mapping to the optabs and
> eventual RTX codes.
>
> That could work for both integer types and fixed-point types.
>
> I'll also note that "saturating" is just another variant of overflow
> behavior of which we have
> trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and
> also (kind-of) sanitized.
> We do lack direct IL representation of -ftrapv and -fwrapv, the
> semantics on a PLUS_EXPR
> depend on per-function flags.  Eventually a common representation
> could be found here.
> For saturating I was thinking of .ADD_OVERFLOW (a, b,
> saturation-value), a "trap" could
> be a "trapping" saturation value, "undefined" could be a
> "not-a-thing".  But I didn't think much
> about this.
>
> That said, I would like to see the bigger picture to be kept in mind
> before altering the GIMPLE IL.
>
> Adding an internal function for an already present optab is a
> no-brainer.  Adding a vectorizer
> and/or if-conversion pattern to make use of this during vectorization
> is existing practice.
> Adding pattern recognition to ISEL or widening-mul passes for
> instructions the CPU can do
> is existing practice and OK.
>
> Thanks,
> Richard.
>
>
> > Thanks,
> > Tamar
> >
> > > Pan
> > >
> > > -----Original Message-----
> > > From: Tamar Christina <Tamar.Christina@arm.com>
> > > Sent: Tuesday, February 27, 2024 5:57 PM
> > > To: Richard Biener <richard.guenther@gmail.com>
> > > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com;
> > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > US_PLUS
> > >
> > > > -----Original Message-----
> > > > From: Richard Biener <richard.guenther@gmail.com>
> > > > Sent: Tuesday, February 27, 2024 9:44 AM
> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > Cc: pan2.li@intel.com; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > > yanzhang.wang@intel.com; kito.cheng@gmail.com;
> > > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > > US_PLUS
> > > >
> > > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> > > > <Tamar.Christina@arm.com> wrote:
> > > > >
> > > > > Hi Pan,
> > > > >
> > > > > > From: Pan Li <pan2.li@intel.com>
> > > > > >
> > > > > > Hi Richard & Tamar,
> > > > > >
> > > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > > > >
> > > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > > > > >
> > > > >
> > > > > I think a couple of things are being confused here.  So lets break it down:
> > > > >
> > > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > > > we only want one internal function for both signed and unsigned SAT_ADD.
> > > > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > > > >
> > > > > Now the downside of this is that this is a direct internal optab.  This means
> > > > > that for the representation to be used the target *must* have the optab
> > > > > implemented.   This is a bit annoying because it doesn't allow us to generically
> > > > > assume that all targets use SAT_ADD for saturating add and thus only have to
> > > > > write optimization for this representation.
> > > > >
> > > > > This is why Richi said we may need to use a new tree_code because we can
> > > > > override tree code expansions.  However the same can be done with the
> > > _EXT_FN
> > > > > internal functions.
> > > > >
> > > > > So what I meant was that we want to have a combination of the two. i.e. a
> > > > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> > > >
> > > > Whether we want/need _EXT or only direct depends mainly on how we want to
> > > > leverage support.  If it's only during vectorization and possibly instruction
> > > > selection a direct optab is IMO the way to go.  Generic optimization only
> > > > marginally improves when you explode the number of basic operations you
> > > > expose - in fact it gets quite unwieldly to support all of them in
> > > > simplifications
> > > > and/or canonicalization and you possibly need to translate them back to what
> > > > the target CPU supports.
> > > >
> > > > We already do have too many (IMO) "special" operations exposed "early"
> > > > in the GIMPLE pipeline.
> > > >
> > > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > passes doing what I'd call instruction selection).  But that means not adding
> > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > machinery for that.
> > > >
> > > > So as a start I would go for a direct optab and see to recognize it during
> > > > ISEL?
> > > >
> > >
> > > The problem with ISEL and the reason I suggested an indirect IFN is that there
> > > Are benefit to be had from recognizing it early.  Saturating arithmetic can be
> > > optimized
> > > Differently from non-saturating ones.
> > >
> > > But additionally a common way of specifying them decomposes to branches
> > > and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).
> > >
> > > These two representation can be detected in PHI-opts and it's beneficial to all
> > > targets to canonicalize them to the branchless code.
> > >
> > > Those two cases also *completely* stop vectorization because of either the
> > > control flow or the fact the vectorizer can't handle complex types.
> > >
> > > So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
> > > We'd still need to hack the vectorizer cost models for targets with saturating
> > > vector instructions.
> > >
> > > I of course defer to you, but it seems quite suboptimal to do it this way and
> > > doesn't get us first class saturation support.
> > >
> > > Additionally there have been discussions whether both clang and gcc should
> > > provide __builtin_saturate_* methods, which the non-direct IFN would help
> > > support.
> > >
> > > Tamar.
> > >
> > > > > If Richi agrees, the below is what I meant. It creates the infrastructure for this
> > > > > and for now only allows a default fallback for unsigned saturating add and
> > > makes
> > > > > it easier for us to add the rest later
> > > > >
> > > > > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
> > > > the
> > > > > RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> > > > > need to make new us_plus and ss_plus ones.
> > > > >
> > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > index a07f25f3aee..aaf9f8991b3 100644
> > > > > --- a/gcc/internal-fn.cc
> > > > > +++ b/gcc/internal-fn.cc
> > > > > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> > > > tree_pair types,
> > > > >         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > >                                                   opt_type);            \
> > > > >        }
> > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > SIGNED_OPTAB, \
> > > > > +                                        UNSIGNED_OPTAB, TYPE)          \
> > > > > +    case IFN_##CODE:                                                   \
> > > > > +      {                                                                        \
> > > > > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> > > > > +                            ? UNSIGNED_OPTAB ## _optab                 \
> > > > > +                            : SIGNED_OPTAB ## _optab);                 \
> > > > > +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > > +                                                 opt_type)             \
> > > > > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> > > > > +      }
> > > > >  #include "internal-fn.def"
> > > > >
> > > > >      case IFN_LAST:
> > > > > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> > > > >      optab which_optab = direct_internal_fn_optab (fn, types);          \
> > > > >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
> > > > >    }
> > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > SIGNED_OPTAB, \
> > > > > +                                        UNSIGNED_OPTAB, TYPE)
> > > > >  #include "internal-fn.def"
> > > > >
> > > > >  /* Routines to expand each internal function, indexed by function number.
> > > > > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > >        emit_move_insn (plhs, cmp);
> > > > >      }
> > > > >  }
> > > > > +
> > > > > +void
> > > > > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > > > > +{
> > > > > +  /* Check if the target supports the expansion through an IFN.  */
> > > > > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > > > > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > > > > +  if (direct_binary_optab_supported_p (which_optab, types,
> > > > > +                                      insn_optimization_type ()))
> > > > > +    {
> > > > > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > > > > +      return;
> > > > > +    }
> > > > > +
> > > > > +  /* Target does not support the optab, but we can de-compose it.  */
> > > > > +  /*
> > > > > +  ... decompose to a canonical representation ...
> > > > > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > > > > +    {
> > > > > +      ...
> > > > > +      decompose back to (X + Y) | - ((X + Y) < X)
> > > > > +    }
> > > > > +  else
> > > > > +    {
> > > > > +      ...
> > > > > +    }
> > > > > +  */
> > > > > +}
> > > > > +
> > > > > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> > > > optype */)
> > > > > +{
> > > > > +  /* For now, don't support decomposing vector ops.  */
> > > > > +  if (VECTOR_TYPE_P (type))
> > > > > +    return false;
> > > > > +
> > > > > +  /* Signed saturating arithmetic is harder to do since we'll so for now
> > > > > +     lets ignore.  */
> > > > > +  if (!TYPE_UNSIGNED (type))
> > > > > +    return false;
> > > > > +
> > > > > +  return TREE_CODE (type) == INTEGER_TYPE;
> > > > > +}
> > > > > \ No newline at end of file
> > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > index c14d30365c1..5a2491228d5 100644
> > > > > --- a/gcc/internal-fn.def
> > > > > +++ b/gcc/internal-fn.def
> > > > > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> > > > >     unsigned inputs respectively, both without the trailing "_optab".
> > > > >     SELECTOR says which type in the tree_pair determines the signedness.
> > > > >
> > > > > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> > > > DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > > > > +   that it has expand_##NAME defined in internal-fn.cc to override the
> > > > > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > > > > +
> > > > >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
> > > > >     the function implements the computational part of a built-in math
> > > > >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > > > > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
> > > > >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> > > > >  #endif
> > > > >
> > > > > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> > > > SIGNED_OPTAB, \
> > > > > +                                    UNSIGNED_OPTAB, TYPE) \
> > > > > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> > > > SIGNED_OPTAB, \
> > > > > +                               UNSIGNED_OPTAB, TYPE)
> > > > > +#endif
> > > > > +
> > > > >  #ifndef DEF_INTERNAL_FLT_FN
> > > > >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> > > > >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > > > > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> > > > ECF_CONST | ECF_NOTHROW, first,
> > > > >                               smulhs, umulhs, binary)
> > > > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> > > > first,
> > > > >                               smulhrs, umulhrs, binary)
> > > > > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> > > > ECF_NOTHROW, first,
> > > > > +                                 ssadd, usadd, binary)
> > > > >
> > > > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > > > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > > > > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> > > > ECF_LEAF, ". R . ")
> > > > >  #undef DEF_INTERNAL_FLT_FN
> > > > >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> > > > >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > > > > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > >  #undef DEF_INTERNAL_OPTAB_FN
> > > > >  #undef DEF_INTERNAL_FN
> > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > index bccee1c3e09..dbdb1e6bad2 100644
> > > > > --- a/gcc/internal-fn.h
> > > > > +++ b/gcc/internal-fn.h
> > > > > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> > > > *);
> > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > > > > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
> > > > >
> > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > >
> > > > > > Note this patch is a draft for validation, no test are invovled here.
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > > > > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > > > > >       (BUILT_IN_US_PLUSL): Ditto.
> > > > > >       (BUILT_IN_US_PLUSLL): Ditto.
> > > > > >       (BUILT_IN_US_PLUSG): Ditto.
> > > > > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > > > > >       func decl for expanding us_plus.
> > > > > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > > > > >       impl for expanding us_plus.
> > > > > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > > > > >       us_plus<mode>3.
> > > > > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > > > > >       US_PLUS.
> > > > > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > > > > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > > > > >       * match.pd: Add new simplify pattern for us_plus.
> > > > > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > > > > >
> > > > > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > > > > ---
> > > > > >  gcc/builtins.def                |  7 +++++
> > > > > >  gcc/config/riscv/riscv-protos.h |  1 +
> > > > > >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> > > > > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > > > > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > > > > >  gcc/internal-fn.def             |  3 +++
> > > > > >  gcc/internal-fn.h               |  1 +
> > > > > >  gcc/match.pd                    | 17 ++++++++++++
> > > > > >  gcc/optabs.def                  |  2 ++
> > > > > >  9 files changed, 114 insertions(+)
> > > > > >
> > > > > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > > > > index f6f3e104f6a..0777b912cfa 100644
> > > > > > --- a/gcc/builtins.def
> > > > > > +++ b/gcc/builtins.def
> > > > > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> > > > (BUILT_IN_POPCOUNTIMAX,
> > > > > > "popcountimax", BT_FN_INT_UINTMAX
> > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> > > > BT_FN_INT_ULONG,
> > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> > > > BT_FN_INT_VAR,
> > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > +
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > > > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl",
> > > BT_FN_INT_ULONG,
> > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > +
> > > > > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > > > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > > > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > > > > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > > > > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > > > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > > > > index 80efdf2b7e5..ba6086f1f25 100644
> > > > > > --- a/gcc/config/riscv/riscv-protos.h
> > > > > > +++ b/gcc/config/riscv/riscv-protos.h
> > > > > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const
> > > > tree,
> > > > > > const char *);
> > > > > >  extern bool
> > > > > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > > > > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > > > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > > > > >
> > > > > >  #ifdef RTX_CODE
> > > > > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool
> > > *invert_ptr
> > > > =
> > > > > > 0);
> > > > > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > > > > index 4100abc9dd1..23f08974f07 100644
> > > > > > --- a/gcc/config/riscv/riscv.cc
> > > > > > +++ b/gcc/config/riscv/riscv.cc
> > > > > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > > > > (machine_mode)
> > > > > >    return true;
> > > > > >  }
> > > > > >
> > > > > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > > > > +void
> > > > > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > > > > +{
> > > > > > +  machine_mode mode = GET_MODE (dest);
> > > > > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > > > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > > > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > > > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > > > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > > > > +
> > > > > > +  /* Step-1: sum = x + y  */
> > > > > > +  if (mode == SImode && mode != Pmode)
> > > > > > +    { /* Take addw to avoid the sum truncate.  */
> > > > > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > > > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > > > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > > > > +    }
> > > > > > +  else
> > > > > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > > > > +
> > > > > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.
> > > */
> > > > > > +  if (mode == HImode || mode == QImode)
> > > > > > +    {
> > > > > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > > > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > > > > +
> > > > > > +      gcc_assert (shift_bits > 0);
> > > > > > +
> > > > > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> > > > (shift_bits));
> > > > > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > > > > (shift_bits));
> > > > > > +    }
> > > > > > +
> > > > > > +  /* Step-2: lt = sum < x  */
> > > > > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > > > > +
> > > > > > +  /* Step-3: lt = -lt  */
> > > > > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > > > > +
> > > > > > +  /* Step-4: pmode_dest = sum | lt  */
> > > > > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > > > > +
> > > > > > +  /* Step-5: dest = pmode_dest */
> > > > > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > > > > +}
> > > > > > +
> > > > > >  /* Initialize the GCC target structure.  */
> > > > > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > > > > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > > > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > > > > index 3f7a023d941..eaa9867023c 100644
> > > > > > --- a/gcc/config/riscv/riscv.md
> > > > > > +++ b/gcc/config/riscv/riscv.md
> > > > > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > > > > >    [(set_attr "type" "load")
> > > > > >     (set (attr "length") (const_int 8))])
> > > > > >
> > > > > > +(define_expand "us_plus<mode>3"
> > > > > > +  [(match_operand:ANYI   0 "register_operand")
> > > > > > +   (match_operand:ANYI   1 "register_operand")
> > > > > > +   (match_operand:ANYI   2 "register_operand")]
> > > > > > +  ""
> > > > > > +  {
> > > > > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > > > > +    DONE;
> > > > > > +  }
> > > > > > +)
> > > > > > +
> > > > > >  (include "bitmanip.md")
> > > > > >  (include "crypto.md")
> > > > > >  (include "sync.md")
> > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > > index a07f25f3aee..a7341a57ffa 100644
> > > > > > --- a/gcc/internal-fn.cc
> > > > > > +++ b/gcc/internal-fn.cc
> > > > > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > > >        emit_move_insn (plhs, cmp);
> > > > > >      }
> > > > > >  }
> > > > > > +
> > > > > > +void
> > > > > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > > > > +{
> > > > > > +  tree lhs = gimple_call_lhs (stmt);
> > > > > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > > > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > > > > +
> > > > > > +  do_pending_stack_adjust ();
> > > > > > +
> > > > > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > > > > +  rtx op_0 = expand_normal (rhs_0);
> > > > > > +  rtx op_1 = expand_normal (rhs_1);
> > > > > > +
> > > > > > +  class expand_operand ops[3];
> > > > > > +
> > > > > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > > > > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE
> > > (rhs_0)));
> > > > > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE
> > > (rhs_1)));
> > > > > > +
> > > > > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > > > > > (rhs_0)));
> > > > > > +  expand_insn (code, 3, ops);
> > > > > > +
> > > > > > +  if (!rtx_equal_p (target, ops[0].value))
> > > > > > +    emit_move_insn (target, ops[0].value);
> > > > > > +}
> > > > >
> > > > > This can be simplified by calling expand_binary_optab_fn instead. See my
> > > > template
> > > > >
> > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > > index c14d30365c1..b1d7b5a0307 100644
> > > > > > --- a/gcc/internal-fn.def
> > > > > > +++ b/gcc/internal-fn.def
> > > > > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > > > > ECF_NOTHROW, ffs, unary)
> > > > > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity,
> > > unary)
> > > > > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > > > > popcount, unary)
> > > > > >
> > > > > > +/* Binary integer ops.  */
> > > > > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> > > > us_plus,
> > > > > > binary)
> > > > > > +
> > > > > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > > > > ECF_NOTHROW, NULL)
> > > > > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > > > > ECF_NOTHROW, NULL)
> > > > > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW,
> > > NULL)
> > > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > > index bccee1c3e09..46e404b4a49 100644
> > > > > > --- a/gcc/internal-fn.h
> > > > > > +++ b/gcc/internal-fn.h
> > > > > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn,
> > > gcall
> > > > *);
> > > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > > > > >
> > > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > > >
> > > > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > > > index c5b6540f939..f45fd58ad23 100644
> > > > > > --- a/gcc/match.pd
> > > > > > +++ b/gcc/match.pd
> > > > > > @@ -10265,3 +10265,20 @@ and,
> > > > > >        }
> > > > > >        (if (full_perm_p)
> > > > > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > > > +
> > > > > > +#if GIMPLE
> > > > > > +
> > > > > > +/* Unsigned saturation add, aka:
> > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > > > > +(simplify
> > > > > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > > > > +   (if (optimize
> > > > > > +     && INTEGRAL_TYPE_P (type)
> > > > > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > > > > +     && types_match (type, TREE_TYPE (@0))
> > > > > > +     && types_match (type, TREE_TYPE (@1))
> > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > > OPTIMIZE_FOR_BOTH))
> > > > > > +   (IFN_US_PLUS @0 @1)))
> > > > > > +
> > > > > > +#endif
> > > > >
> > > > > With the version above you can drop the #if GIMPLE and the
> > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > >
> > > > > Check.
> > > > >
> > > > > Thanks,
> > > > > Tamar
> > > > >
> > > > > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > > > > index ad14f9328b9..5855c4e0834 100644
> > > > > > --- a/gcc/optabs.def
> > > > > > +++ b/gcc/optabs.def
> > > > > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb",
> > > '2',
> > > > > > gen_int_libfunc)
> > > > > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > > > > gen_int_libfunc)
> > > > > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> > > > > >
> > > > > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > > > > gen_int_libfunc)
> > > > > > +
> > > > > >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> > > > > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> > > > gen_int_fp_fixed_libfunc)
> > > > > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > > > > --
> > > > > > 2.34.1
> > > > >
Richard Biener March 5, 2024, 8:41 a.m. UTC | #12
On Tue, Mar 5, 2024 at 8:09 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Thanks Richard for comments.
>
> > I do wonder what the existing usadd patterns with integer vector modes
> > in various targets do?
> > Those define_insn will at least not end up in the optab set I guess,
> > so they must end up
> > being either unused or used by explicit gen_* (via intrinsic
> > functions?) or by combine?
>
> For usadd with vector modes, I think the backend like RISC-V try to leverage instructions
> like Vector Single-Width Saturating Add(aka vsaddu.vv/x/i).
>
> > I think simply changing gen_*_fixed_libfunc to gen_int_libfunc won't
> > work.  Since there's
> > no libgcc support I'd leave it as gen_*_fixed_libfunc thus no library
> > fallback for integers?
>
> Change to gen_int_libfunc follows other int optabs. I am not sure if it will hit the standard name usaddm3 for vector mode.
> But the happy path for scalar modes works up to a point, please help to correct me if any misunderstanding.

gen_int_libfunc will no longer make it emit libcalls for fixed point
modes, so this can't be correct
and there's no libgcc implementation for integer mode saturating ops,
so it's pointless to emit calls
to them.

> #0  riscv_expand_usadd (dest=0x7ffff6a8c7c8, x=0x7ffff6a8c798, y=0x7ffff6a8c7b0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/config/riscv/riscv.cc:10662
> #1  0x00000000029f142a in gen_usaddsi3 (operand0=0x7ffff6a8c7c8, operand1=0x7ffff6a8c798, operand2=0x7ffff6a8c7b0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/config/riscv/riscv.md:3848
> #2  0x0000000001751e60 in insn_gen_fn::operator()<rtx_def*, rtx_def*, rtx_def*> (this=0x4910e70 <insn_data+1248976>) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/recog.h:441
> #3  0x000000000180f553 in maybe_gen_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8232
> #4  0x000000000180fa42 in maybe_expand_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8275
> #5  0x000000000180fade in expand_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8306
> #6  0x00000000015cebdc in expand_fn_using_insn (stmt=0x7ffff6a36480, icode=CODE_FOR_usaddsi3, noutputs=1, ninputs=2) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:254
> #7  0x00000000015de146 in expand_direct_optab_fn (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480, optab=usadd_optab, nargs=2) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:3818
> #8  0x00000000015e3610 in expand_SAT_ADD (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.def:278
> #9  0x00000000015e65b6 in expand_internal_call (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:4914
> #10 0x00000000015e65e5 in expand_internal_call (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:4922
> #11 0x0000000001248c8f in expand_call_stmt (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:2771
> #12 0x000000000124d392 in expand_gimple_stmt_1 (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:3932
> #13 0x000000000124d9aa in expand_gimple_stmt (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:4077
> #14 0x000000000124dac4 in expand_gimple_tailcall (bb=0x7ffff6dddae0, stmt=0x7ffff6a36480, can_fallthru=0x7fffffffd800) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:4123
> #15 0x000000000125636b in expand_gimple_basic_block (bb=0x7ffff6dddae0, disable_tail_calls=false) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:6107
> #16 0x0000000001258a1a in (anonymous namespace)::pass_expand::execute (this=0x556d180, fun=0x7ffff6a7f2e0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:6872
> #17 0x0000000001873565 in execute_one_pass (pass=0x556d180) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2646
> #18 0x0000000001873948 in execute_pass_list_1 (pass=0x556d180) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2755
> #19 0x00000000018739d6 in execute_pass_list (fn=0x7ffff6a7f2e0, pass=0x5568870) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2766
> #20 0x00000000012bc975 in cgraph_node::expand (this=0x7ffff6c2c880) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:1845
> #21 0x00000000012bd18f in expand_all_functions () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2028
> #22 0x00000000012bdcc5 in symbol_table::compile (this=0x7ffff6c06000) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2402
> #23 0x00000000012be16c in symbol_table::finalize_compilation_unit (this=0x7ffff6c06000) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2587
> #24 0x0000000001a048a3 in compile_file () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:476
> #25 0x0000000001a079e9 in do_compile () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:2154
> #26 0x0000000001a07dee in toplev::main (this=0x7fffffffdcf2, argc=19, argv=0x7fffffffde28) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:2310
> #27 0x0000000003ebcc5d in main (argc=19, argv=0x7fffffffde28) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/main.cc:39
>
> BTW, does match.pd support nested cond like below? I am debugging into gimple_simplify_COND_EXPR for why not hit the pattern...
> +(simplify
> +  (cond
> +    (lt @0 integer_zerop)
> +    (plus:c @0 @1)
> +    (cond (lt @1 integer_zerop) @1 @0))
> +  (IFN_SAT_ADD @0 @1))
>
> Pan
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Monday, March 4, 2024 6:31 PM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; jeffreyalaw@gmail.com
> Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
>
> On Sat, Mar 2, 2024 at 8:46 AM Li, Pan2 <pan2.li@intel.com> wrote:
> >
> > Hi Richard and Tamar,
> >
> > I have a try with DEF_INTERNAL_SIGNED_OPTAB_FN for SAT_ADD/SUB/MUL but meet some problem when match.pd.
> >
> > For unsigned SAT_ADD = (x + y) | - ((x + y) < x), the match.pd can be (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0)))).
> > For unsigned SAT_SUB = x >= y ? x - y : 0, and then match.pd can be (cond (ge @0 @1) (minus @0 @1) integer_zerop).
> >
> > For signed SAT_ADD/SAT_SUB as below, seems not easy to make the simplify pattern works well as expected up to a point.
> > sint64_t sat_add (sint64_t x, sint64_t y)
> > {
> >   sint64_t a = x ^ y;
> >   sint64_t add = x + y;
> >   sint64_t b = sum ^ x;
> >
> >   return (a < 0 || (a >= 0 && b >= 0)) ? add : (MAX_INT64 + (x < 0));
> > }
> >
> > sint64_t sad_sub (sint64_t x, sint64_t y)
> > {
> >   sint64_t a = x ^ y;
> >   sint64_t sub = x - y;
> >   sint64_t b = sub ^ x;
> >
> >   return (a >= 0 || (a < 0 && b >= 0) ? sub : (MAX_INT64 + (x < 0));
> > }
> >
> > For SAT_MUL as below, looks we may need widen type. I am not sure if we can leverage MUL_OVERFLOW or not in match.pd.
> >
> > uint32_t sat_mul (uint32_t x, uint32_t y)
> > {
> >   uint64_t mul = (uint64_t)x * (uint64_t)y;
> >   return mul > UINT32_MAX ? UINT32_MAX : (uint32_t)mul;
> > }
> >
> > sint32_t sat_mul (sint32_t x, sint32_t y)
> > {
> >   sint64_t mul = (sint64_t)x * (sint64_t))y;
> >
> >   return mul <= MAX_INT32 && mul >= MIN_INT32 ? mul : MAX_INT32 + (x ^ y) > 0;
> > }
> >
> > Below diff only contains unsigned SAT_ADD and SAT_SUB for prototype validation.
> > I will continue to try the rest part in match.pd and keep you posted.
> >
> > -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
> >
> > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > index 80efdf2b7e5..d9ad6fe2b58 100644
> > --- a/gcc/config/riscv/riscv-protos.h
> > +++ b/gcc/config/riscv/riscv-protos.h
> > @@ -132,6 +132,9 @@ extern void riscv_asm_output_external (FILE *, const tree, const char *);
> >  extern bool
> >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > +extern void riscv_expand_usadd (rtx, rtx, rtx);
> > +extern void riscv_expand_ussub (rtx, rtx, rtx);
> >
> >  #ifdef RTX_CODE
> >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0);
> > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > index 5e984ee2a55..795462526df 100644
> > --- a/gcc/config/riscv/riscv.cc
> > +++ b/gcc/config/riscv/riscv.cc
> > @@ -10655,6 +10655,28 @@ riscv_vector_mode_supported_any_target_p (machine_mode)
> >    return true;
> >  }
> >
> > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > +void
> > +riscv_expand_usadd (rtx dest, rtx x, rtx y)
> > +{
> > +  fprintf (stdout, "Hit riscv_expand_usadd.\n");
> > +  // ToDo
> > +}
> > +
> > +void
> > +riscv_expand_ussub (rtx dest, rtx x, rtx y)
> > +{
> > +  fprintf (stdout, "Hit riscv_expand_ussub.\n");
> > +  // ToDo
> > +}
> > +
> >  /* Initialize the GCC target structure.  */
> >  #undef TARGET_ASM_ALIGNED_HI_OP
> >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > index 1fec13092e2..e2dbadb3ead 100644
> > --- a/gcc/config/riscv/riscv.md
> > +++ b/gcc/config/riscv/riscv.md
> > @@ -3839,6 +3839,39 @@ (define_insn "*large_load_address"
> >    [(set_attr "type" "load")
> >     (set (attr "length") (const_int 8))])
> >
> > +(define_expand "usadd<mode>3"
> > +  [(match_operand:ANYI 0 "register_operand")
> > +   (match_operand:ANYI 1 "register_operand")
> > +   (match_operand:ANYI 2 "register_operand")]
> > +  ""
> > +  {
> > +    riscv_expand_usadd (operands[0], operands[1], operands[2]);
> > +    DONE;
> > +  }
> > +)
> > +
> > +(define_expand "ussub<mode>3"
> > +  [(match_operand:ANYI 0 "register_operand")
> > +   (match_operand:ANYI 1 "register_operand")
> > +   (match_operand:ANYI 2 "register_operand")]
> > +  ""
> > +  {
> > +    riscv_expand_ussub (operands[0], operands[1], operands[2]);
> > +    DONE;
> > +  }
> > +)
> > +
> >  (include "bitmanip.md")
> >  (include "crypto.md")
> >  (include "sync.md")
> > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > index 848bb9dbff3..0fff19c875f 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -275,6 +275,13 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
> >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
> >                               smulhrs, umulhrs, binary)
> >
> > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first,
> > +                             ssadd, usadd, binary)
> > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_SUB, ECF_CONST | ECF_NOTHROW, first,
> > +                             sssub, ussub, binary)
> > +
> >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> >  DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index f3fffd8dec2..6592dea643a 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -10276,3 +10276,32 @@ and,
> >        }
> >        (if (full_perm_p)
> >         (vec_perm (op@3 @0 @1) @3 @2))))))
> > +
> > +#if GIMPLE
> > +
> > +/* Unsigned saturation add, aka:
> > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > +(simplify
> > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > +   (if (optimize
> > +       && INTEGRAL_TYPE_P (type)
> > +       && types_match (type, TREE_TYPE (@0))
> > +       && types_match (type, TREE_TYPE (@1))
> > +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> > +       && direct_internal_fn_supported_p (IFN_SAT_ADD, type, OPTIMIZE_FOR_BOTH))
> > +   (IFN_SAT_ADD @0 @1)))
> > +
> > +/* Unsigned saturation sub , aka
> > +   SAT_SUBU = x >= y ? x - y : 0.  */
> > +(simplify
> > +  (cond (ge @0 @1) (minus @0 @1) integer_zerop)
> > +    (if (optimize
> > +       && INTEGRAL_TYPE_P (type)
> > +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> > +       && types_match (type, TREE_TYPE (@0))
> > +       && types_match (type, TREE_TYPE (@1))
> > +       && direct_internal_fn_supported_p (IFN_SAT_SUB, type, OPTIMIZE_FOR_BOTH))
> > +    (IFN_SAT_SUB @0 @1)))
> > +
> > +#endif
> > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > index ad14f9328b9..bebe38c888b 100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -111,15 +111,15 @@ OPTAB_NX(add_optab, "add$F$a3")
> >  OPTAB_NX(add_optab, "add$Q$a3")
> >  OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc)
> >  OPTAB_VX(addv_optab, "add$F$a3")
> > -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc)
> > -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
> > +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_int_libfunc)
> > +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_int_libfunc)
> >  OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc)
> >  OPTAB_NX(sub_optab, "sub$F$a3")
> >  OPTAB_NX(sub_optab, "sub$Q$a3")
> >  OPTAB_VL(subv_optab, "subv$I$a3", MINUS, "sub", '3', gen_intv_fp_libfunc)
> >  OPTAB_VX(subv_optab, "sub$F$a3")
> > -OPTAB_NL(sssub_optab, "sssub$Q$a3", SS_MINUS, "sssub", '3', gen_signed_fixed_libfunc)
> > -OPTAB_NL(ussub_optab, "ussub$Q$a3", US_MINUS, "ussub", '3', gen_unsigned_fixed_libfunc)
> > +OPTAB_NL(sssub_optab, "sssub$a3", SS_MINUS, "sssub", '3', gen_int_libfunc)
> > +OPTAB_NL(ussub_optab, "ussub$a3", US_MINUS, "ussub", '3', gen_int_libfunc)
>
> I do wonder what the existing usadd patterns with integer vector modes
> in various targets do?
> Those define_insn will at least not end up in the optab set I guess,
> so they must end up
> being either unused or used by explicit gen_* (via intrinsic
> functions?) or by combine?
>
> I think simply changing gen_*_fixed_libfunc to gen_int_libfunc won't
> work.  Since there's
> no libgcc support I'd leave it as gen_*_fixed_libfunc thus no library
> fallback for integers?
>
> >  OPTAB_NL(smul_optab, "mul$Q$a3", MULT, "mul", '3', gen_int_fp_fixed_libfunc)
> >  OPTAB_NX(smul_optab, "mul$P$a3")
> >  OPTAB_NX(smul_optab, "mul$F$a3")
> >
> > Pan
> >
> > -----Original Message-----
> > From: Li, Pan2 <pan2.li@intel.com>
> > Sent: Tuesday, February 27, 2024 10:36 PM
> > To: Richard Biener <richard.guenther@gmail.com>; Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
> >
> > Thanks Richard and Tammer for moving this forward.
> >
> > > That said, I would like to see the bigger picture to be kept in mind
> > > before altering the GIMPLE IL.
> >
> > > Adding an internal function for an already present optab is a
> > > no-brainer.  Adding a vectorizer
> > > and/or if-conversion pattern to make use of this during vectorization
> > > is existing practice.
> > > Adding pattern recognition to ISEL or widening-mul passes for
> > > instructions the CPU can do
> > > is existing practice and OK.
> >
> > Thanks for explaining, got the point here.
> >
> > > So I'd suggest writing some example of both signed and unsigned saturating add and multiply
> >
> > > Because signed addition, will likely require a branch and signed multiplication would require a
> > > larger type.
> >
> > Ack, will prepare one prototype validation patch for add, sub and mul (both unsigned and signed) soon.
> >
> > Pan
> >
> > -----Original Message-----
> > From: Richard Biener <richard.guenther@gmail.com>
> > Sent: Tuesday, February 27, 2024 9:42 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
> >
> > On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
> > >
> > > > Thanks Tamar.
> > > >
> > > > > Those two cases also *completely* stop vectorization because of either the
> > > > > control flow or the fact the vectorizer can't handle complex types.
> > > >
> > > > Yes, we eventually would like to vectorize the SAT ALU but we start with scalar part
> > > > first.
> > > > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> > > > well with some additions as below.
> > > > Feel free to correct me if any misunderstandings.
> > > >
> > > > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> > > > usadd$a3(as well as gen_int_libfunc) for int.
> > > > 2. We need to implement a default implementation of SAT_ADD if
> > > > direct_binary_optab_supported_p is false.
> > > >     It looks like the default implementation is difficult to make every backend happy.
> > > > That is why you suggest just normal
> > > >     DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> > > >
> > > > Thanks Richard.
> > > >
> > > > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > > passes doing what I'd call instruction selection).  But that means not adding
> > > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > > machinery for that.
> > > >
> > > > > So as a start I would go for a direct optab and see to recognize it during
> > > > > ISEL?
> > > >
> > > > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> > > > to know isel and I am happy to
> > > > try that once we have conclusion.
> > > >
> > >
> > > So after a lively discussion on IRC, the conclusion is that before we proceed Richi would
> > > like to see some examples of various operations.  The problem is that unsigned saturating
> > > addition is the simplest example and it may lead to an implementation strategy that doesn't
> > > scale.
> > >
> > > So I'd suggest writing some example of both signed and unsigned saturating add and multiply
> > >
> > > Because signed addition, will likely require a branch and signed multiplication would require a
> > > larger type.
> > >
> > > This would allow us to better understand what kind of gimple would have to to deal with in
> > > ISEL and VECT if we decide not to lower early.
> >
> > More specifically before making something like .SAT_ADD a core part of
> > GIMPLE I'd like
> > to point out that we have saturating PLUS_EXPR but just for
> > fixed-point types.  I realize
> > Joseph thinks that keying this on the type was wrong and it should
> > have used integer
> > types and special saturating operations.  Still having both,
> > type-keyed saturating PLUS_EXPR
> > and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess.
> >
> > It might be that the way to go is to turn all existing saturating type
> > *_EXPR into
> > .SAT_* internal function calls, in the end mapping to the optabs and
> > eventual RTX codes.
> >
> > That could work for both integer types and fixed-point types.
> >
> > I'll also note that "saturating" is just another variant of overflow
> > behavior of which we have
> > trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and
> > also (kind-of) sanitized.
> > We do lack direct IL representation of -ftrapv and -fwrapv, the
> > semantics on a PLUS_EXPR
> > depend on per-function flags.  Eventually a common representation
> > could be found here.
> > For saturating I was thinking of .ADD_OVERFLOW (a, b,
> > saturation-value), a "trap" could
> > be a "trapping" saturation value, "undefined" could be a
> > "not-a-thing".  But I didn't think much
> > about this.
> >
> > That said, I would like to see the bigger picture to be kept in mind
> > before altering the GIMPLE IL.
> >
> > Adding an internal function for an already present optab is a
> > no-brainer.  Adding a vectorizer
> > and/or if-conversion pattern to make use of this during vectorization
> > is existing practice.
> > Adding pattern recognition to ISEL or widening-mul passes for
> > instructions the CPU can do
> > is existing practice and OK.
> >
> > Thanks,
> > Richard.
> >
> >
> > > Thanks,
> > > Tamar
> > >
> > > > Pan
> > > >
> > > > -----Original Message-----
> > > > From: Tamar Christina <Tamar.Christina@arm.com>
> > > > Sent: Tuesday, February 27, 2024 5:57 PM
> > > > To: Richard Biener <richard.guenther@gmail.com>
> > > > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > > Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com;
> > > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > > US_PLUS
> > > >
> > > > > -----Original Message-----
> > > > > From: Richard Biener <richard.guenther@gmail.com>
> > > > > Sent: Tuesday, February 27, 2024 9:44 AM
> > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > Cc: pan2.li@intel.com; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > > > yanzhang.wang@intel.com; kito.cheng@gmail.com;
> > > > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > > > US_PLUS
> > > > >
> > > > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> > > > > <Tamar.Christina@arm.com> wrote:
> > > > > >
> > > > > > Hi Pan,
> > > > > >
> > > > > > > From: Pan Li <pan2.li@intel.com>
> > > > > > >
> > > > > > > Hi Richard & Tamar,
> > > > > > >
> > > > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > > > > >
> > > > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > > > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > > > > > >
> > > > > >
> > > > > > I think a couple of things are being confused here.  So lets break it down:
> > > > > >
> > > > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > > > > we only want one internal function for both signed and unsigned SAT_ADD.
> > > > > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > > > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > > > > >
> > > > > > Now the downside of this is that this is a direct internal optab.  This means
> > > > > > that for the representation to be used the target *must* have the optab
> > > > > > implemented.   This is a bit annoying because it doesn't allow us to generically
> > > > > > assume that all targets use SAT_ADD for saturating add and thus only have to
> > > > > > write optimization for this representation.
> > > > > >
> > > > > > This is why Richi said we may need to use a new tree_code because we can
> > > > > > override tree code expansions.  However the same can be done with the
> > > > _EXT_FN
> > > > > > internal functions.
> > > > > >
> > > > > > So what I meant was that we want to have a combination of the two. i.e. a
> > > > > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> > > > >
> > > > > Whether we want/need _EXT or only direct depends mainly on how we want to
> > > > > leverage support.  If it's only during vectorization and possibly instruction
> > > > > selection a direct optab is IMO the way to go.  Generic optimization only
> > > > > marginally improves when you explode the number of basic operations you
> > > > > expose - in fact it gets quite unwieldly to support all of them in
> > > > > simplifications
> > > > > and/or canonicalization and you possibly need to translate them back to what
> > > > > the target CPU supports.
> > > > >
> > > > > We already do have too many (IMO) "special" operations exposed "early"
> > > > > in the GIMPLE pipeline.
> > > > >
> > > > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > > passes doing what I'd call instruction selection).  But that means not adding
> > > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > > machinery for that.
> > > > >
> > > > > So as a start I would go for a direct optab and see to recognize it during
> > > > > ISEL?
> > > > >
> > > >
> > > > The problem with ISEL and the reason I suggested an indirect IFN is that there
> > > > Are benefit to be had from recognizing it early.  Saturating arithmetic can be
> > > > optimized
> > > > Differently from non-saturating ones.
> > > >
> > > > But additionally a common way of specifying them decomposes to branches
> > > > and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).
> > > >
> > > > These two representation can be detected in PHI-opts and it's beneficial to all
> > > > targets to canonicalize them to the branchless code.
> > > >
> > > > Those two cases also *completely* stop vectorization because of either the
> > > > control flow or the fact the vectorizer can't handle complex types.
> > > >
> > > > So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
> > > > We'd still need to hack the vectorizer cost models for targets with saturating
> > > > vector instructions.
> > > >
> > > > I of course defer to you, but it seems quite suboptimal to do it this way and
> > > > doesn't get us first class saturation support.
> > > >
> > > > Additionally there have been discussions whether both clang and gcc should
> > > > provide __builtin_saturate_* methods, which the non-direct IFN would help
> > > > support.
> > > >
> > > > Tamar.
> > > >
> > > > > > If Richi agrees, the below is what I meant. It creates the infrastructure for this
> > > > > > and for now only allows a default fallback for unsigned saturating add and
> > > > makes
> > > > > > it easier for us to add the rest later
> > > > > >
> > > > > > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
> > > > > the
> > > > > > RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> > > > > > need to make new us_plus and ss_plus ones.
> > > > > >
> > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > > index a07f25f3aee..aaf9f8991b3 100644
> > > > > > --- a/gcc/internal-fn.cc
> > > > > > +++ b/gcc/internal-fn.cc
> > > > > > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> > > > > tree_pair types,
> > > > > >         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > > >                                                   opt_type);            \
> > > > > >        }
> > > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > > SIGNED_OPTAB, \
> > > > > > +                                        UNSIGNED_OPTAB, TYPE)          \
> > > > > > +    case IFN_##CODE:                                                   \
> > > > > > +      {                                                                        \
> > > > > > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> > > > > > +                            ? UNSIGNED_OPTAB ## _optab                 \
> > > > > > +                            : SIGNED_OPTAB ## _optab);                 \
> > > > > > +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > > > +                                                 opt_type)             \
> > > > > > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> > > > > > +      }
> > > > > >  #include "internal-fn.def"
> > > > > >
> > > > > >      case IFN_LAST:
> > > > > > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> > > > > >      optab which_optab = direct_internal_fn_optab (fn, types);          \
> > > > > >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
> > > > > >    }
> > > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > > SIGNED_OPTAB, \
> > > > > > +                                        UNSIGNED_OPTAB, TYPE)
> > > > > >  #include "internal-fn.def"
> > > > > >
> > > > > >  /* Routines to expand each internal function, indexed by function number.
> > > > > > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > > >        emit_move_insn (plhs, cmp);
> > > > > >      }
> > > > > >  }
> > > > > > +
> > > > > > +void
> > > > > > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > > > > > +{
> > > > > > +  /* Check if the target supports the expansion through an IFN.  */
> > > > > > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > > > > > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > > > > > +  if (direct_binary_optab_supported_p (which_optab, types,
> > > > > > +                                      insn_optimization_type ()))
> > > > > > +    {
> > > > > > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > > > > > +      return;
> > > > > > +    }
> > > > > > +
> > > > > > +  /* Target does not support the optab, but we can de-compose it.  */
> > > > > > +  /*
> > > > > > +  ... decompose to a canonical representation ...
> > > > > > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > > > > > +    {
> > > > > > +      ...
> > > > > > +      decompose back to (X + Y) | - ((X + Y) < X)
> > > > > > +    }
> > > > > > +  else
> > > > > > +    {
> > > > > > +      ...
> > > > > > +    }
> > > > > > +  */
> > > > > > +}
> > > > > > +
> > > > > > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> > > > > optype */)
> > > > > > +{
> > > > > > +  /* For now, don't support decomposing vector ops.  */
> > > > > > +  if (VECTOR_TYPE_P (type))
> > > > > > +    return false;
> > > > > > +
> > > > > > +  /* Signed saturating arithmetic is harder to do since we'll so for now
> > > > > > +     lets ignore.  */
> > > > > > +  if (!TYPE_UNSIGNED (type))
> > > > > > +    return false;
> > > > > > +
> > > > > > +  return TREE_CODE (type) == INTEGER_TYPE;
> > > > > > +}
> > > > > > \ No newline at end of file
> > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > > index c14d30365c1..5a2491228d5 100644
> > > > > > --- a/gcc/internal-fn.def
> > > > > > +++ b/gcc/internal-fn.def
> > > > > > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> > > > > >     unsigned inputs respectively, both without the trailing "_optab".
> > > > > >     SELECTOR says which type in the tree_pair determines the signedness.
> > > > > >
> > > > > > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> > > > > DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > > > > > +   that it has expand_##NAME defined in internal-fn.cc to override the
> > > > > > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > > > > > +
> > > > > >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
> > > > > >     the function implements the computational part of a built-in math
> > > > > >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > > > > > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
> > > > > >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> > > > > >  #endif
> > > > > >
> > > > > > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> > > > > SIGNED_OPTAB, \
> > > > > > +                                    UNSIGNED_OPTAB, TYPE) \
> > > > > > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> > > > > SIGNED_OPTAB, \
> > > > > > +                               UNSIGNED_OPTAB, TYPE)
> > > > > > +#endif
> > > > > > +
> > > > > >  #ifndef DEF_INTERNAL_FLT_FN
> > > > > >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> > > > > >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > > > > > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> > > > > ECF_CONST | ECF_NOTHROW, first,
> > > > > >                               smulhs, umulhs, binary)
> > > > > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> > > > > first,
> > > > > >                               smulhrs, umulhrs, binary)
> > > > > > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> > > > > ECF_NOTHROW, first,
> > > > > > +                                 ssadd, usadd, binary)
> > > > > >
> > > > > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > > > > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > > > > > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> > > > > ECF_LEAF, ". R . ")
> > > > > >  #undef DEF_INTERNAL_FLT_FN
> > > > > >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> > > > > >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > > > > > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > > >  #undef DEF_INTERNAL_OPTAB_FN
> > > > > >  #undef DEF_INTERNAL_FN
> > > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > > index bccee1c3e09..dbdb1e6bad2 100644
> > > > > > --- a/gcc/internal-fn.h
> > > > > > +++ b/gcc/internal-fn.h
> > > > > > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> > > > > *);
> > > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > > > > > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
> > > > > >
> > > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > > >
> > > > > > > Note this patch is a draft for validation, no test are invovled here.
> > > > > > >
> > > > > > > gcc/ChangeLog:
> > > > > > >
> > > > > > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > > > > > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > > > > > >       (BUILT_IN_US_PLUSL): Ditto.
> > > > > > >       (BUILT_IN_US_PLUSLL): Ditto.
> > > > > > >       (BUILT_IN_US_PLUSG): Ditto.
> > > > > > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > > > > > >       func decl for expanding us_plus.
> > > > > > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > > > > > >       impl for expanding us_plus.
> > > > > > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > > > > > >       us_plus<mode>3.
> > > > > > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > > > > > >       US_PLUS.
> > > > > > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > > > > > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > > > > > >       * match.pd: Add new simplify pattern for us_plus.
> > > > > > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > > > > > >
> > > > > > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > > > > > ---
> > > > > > >  gcc/builtins.def                |  7 +++++
> > > > > > >  gcc/config/riscv/riscv-protos.h |  1 +
> > > > > > >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> > > > > > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > > > > > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > > > > > >  gcc/internal-fn.def             |  3 +++
> > > > > > >  gcc/internal-fn.h               |  1 +
> > > > > > >  gcc/match.pd                    | 17 ++++++++++++
> > > > > > >  gcc/optabs.def                  |  2 ++
> > > > > > >  9 files changed, 114 insertions(+)
> > > > > > >
> > > > > > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > > > > > index f6f3e104f6a..0777b912cfa 100644
> > > > > > > --- a/gcc/builtins.def
> > > > > > > +++ b/gcc/builtins.def
> > > > > > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> > > > > (BUILT_IN_POPCOUNTIMAX,
> > > > > > > "popcountimax", BT_FN_INT_UINTMAX
> > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> > > > > BT_FN_INT_ULONG,
> > > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> > > > > BT_FN_INT_VAR,
> > > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > > +
> > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > > > > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl",
> > > > BT_FN_INT_ULONG,
> > > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > > +
> > > > > > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > > > > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > > > > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > > > > > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > > > > > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > > > > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > > > > > index 80efdf2b7e5..ba6086f1f25 100644
> > > > > > > --- a/gcc/config/riscv/riscv-protos.h
> > > > > > > +++ b/gcc/config/riscv/riscv-protos.h
> > > > > > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const
> > > > > tree,
> > > > > > > const char *);
> > > > > > >  extern bool
> > > > > > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > > > > > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > > > > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > > > > > >
> > > > > > >  #ifdef RTX_CODE
> > > > > > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool
> > > > *invert_ptr
> > > > > =
> > > > > > > 0);
> > > > > > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > > > > > index 4100abc9dd1..23f08974f07 100644
> > > > > > > --- a/gcc/config/riscv/riscv.cc
> > > > > > > +++ b/gcc/config/riscv/riscv.cc
> > > > > > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > > > > > (machine_mode)
> > > > > > >    return true;
> > > > > > >  }
> > > > > > >
> > > > > > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > > > > > +void
> > > > > > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > > > > > +{
> > > > > > > +  machine_mode mode = GET_MODE (dest);
> > > > > > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > > > > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > > > > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > > > > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > > > > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > > > > > +
> > > > > > > +  /* Step-1: sum = x + y  */
> > > > > > > +  if (mode == SImode && mode != Pmode)
> > > > > > > +    { /* Take addw to avoid the sum truncate.  */
> > > > > > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > > > > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > > > > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > > > > > +    }
> > > > > > > +  else
> > > > > > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > > > > > +
> > > > > > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.
> > > > */
> > > > > > > +  if (mode == HImode || mode == QImode)
> > > > > > > +    {
> > > > > > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > > > > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > > > > > +
> > > > > > > +      gcc_assert (shift_bits > 0);
> > > > > > > +
> > > > > > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> > > > > (shift_bits));
> > > > > > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > > > > > (shift_bits));
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  /* Step-2: lt = sum < x  */
> > > > > > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > > > > > +
> > > > > > > +  /* Step-3: lt = -lt  */
> > > > > > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > > > > > +
> > > > > > > +  /* Step-4: pmode_dest = sum | lt  */
> > > > > > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > > > > > +
> > > > > > > +  /* Step-5: dest = pmode_dest */
> > > > > > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* Initialize the GCC target structure.  */
> > > > > > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > > > > > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > > > > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > > > > > index 3f7a023d941..eaa9867023c 100644
> > > > > > > --- a/gcc/config/riscv/riscv.md
> > > > > > > +++ b/gcc/config/riscv/riscv.md
> > > > > > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > > > > > >    [(set_attr "type" "load")
> > > > > > >     (set (attr "length") (const_int 8))])
> > > > > > >
> > > > > > > +(define_expand "us_plus<mode>3"
> > > > > > > +  [(match_operand:ANYI   0 "register_operand")
> > > > > > > +   (match_operand:ANYI   1 "register_operand")
> > > > > > > +   (match_operand:ANYI   2 "register_operand")]
> > > > > > > +  ""
> > > > > > > +  {
> > > > > > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > > > > > +    DONE;
> > > > > > > +  }
> > > > > > > +)
> > > > > > > +
> > > > > > >  (include "bitmanip.md")
> > > > > > >  (include "crypto.md")
> > > > > > >  (include "sync.md")
> > > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > > > index a07f25f3aee..a7341a57ffa 100644
> > > > > > > --- a/gcc/internal-fn.cc
> > > > > > > +++ b/gcc/internal-fn.cc
> > > > > > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > > > >        emit_move_insn (plhs, cmp);
> > > > > > >      }
> > > > > > >  }
> > > > > > > +
> > > > > > > +void
> > > > > > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > > > > > +{
> > > > > > > +  tree lhs = gimple_call_lhs (stmt);
> > > > > > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > > > > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > > > > > +
> > > > > > > +  do_pending_stack_adjust ();
> > > > > > > +
> > > > > > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > > > > > +  rtx op_0 = expand_normal (rhs_0);
> > > > > > > +  rtx op_1 = expand_normal (rhs_1);
> > > > > > > +
> > > > > > > +  class expand_operand ops[3];
> > > > > > > +
> > > > > > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > > > > > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE
> > > > (rhs_0)));
> > > > > > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE
> > > > (rhs_1)));
> > > > > > > +
> > > > > > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > > > > > > (rhs_0)));
> > > > > > > +  expand_insn (code, 3, ops);
> > > > > > > +
> > > > > > > +  if (!rtx_equal_p (target, ops[0].value))
> > > > > > > +    emit_move_insn (target, ops[0].value);
> > > > > > > +}
> > > > > >
> > > > > > This can be simplified by calling expand_binary_optab_fn instead. See my
> > > > > template
> > > > > >
> > > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > > > index c14d30365c1..b1d7b5a0307 100644
> > > > > > > --- a/gcc/internal-fn.def
> > > > > > > +++ b/gcc/internal-fn.def
> > > > > > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > > > > > ECF_NOTHROW, ffs, unary)
> > > > > > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity,
> > > > unary)
> > > > > > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > > > > > popcount, unary)
> > > > > > >
> > > > > > > +/* Binary integer ops.  */
> > > > > > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> > > > > us_plus,
> > > > > > > binary)
> > > > > > > +
> > > > > > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > > > > > ECF_NOTHROW, NULL)
> > > > > > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > > > > > ECF_NOTHROW, NULL)
> > > > > > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW,
> > > > NULL)
> > > > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > > > index bccee1c3e09..46e404b4a49 100644
> > > > > > > --- a/gcc/internal-fn.h
> > > > > > > +++ b/gcc/internal-fn.h
> > > > > > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn,
> > > > gcall
> > > > > *);
> > > > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > > > > > >
> > > > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > > > >
> > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > > > > index c5b6540f939..f45fd58ad23 100644
> > > > > > > --- a/gcc/match.pd
> > > > > > > +++ b/gcc/match.pd
> > > > > > > @@ -10265,3 +10265,20 @@ and,
> > > > > > >        }
> > > > > > >        (if (full_perm_p)
> > > > > > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > > > > +
> > > > > > > +#if GIMPLE
> > > > > > > +
> > > > > > > +/* Unsigned saturation add, aka:
> > > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > > > > > +(simplify
> > > > > > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > > > > > +   (if (optimize
> > > > > > > +     && INTEGRAL_TYPE_P (type)
> > > > > > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > > > > > +     && types_match (type, TREE_TYPE (@0))
> > > > > > > +     && types_match (type, TREE_TYPE (@1))
> > > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > > > OPTIMIZE_FOR_BOTH))
> > > > > > > +   (IFN_US_PLUS @0 @1)))
> > > > > > > +
> > > > > > > +#endif
> > > > > >
> > > > > > With the version above you can drop the #if GIMPLE and the
> > > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > >
> > > > > > Check.
> > > > > >
> > > > > > Thanks,
> > > > > > Tamar
> > > > > >
> > > > > > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > > > > > index ad14f9328b9..5855c4e0834 100644
> > > > > > > --- a/gcc/optabs.def
> > > > > > > +++ b/gcc/optabs.def
> > > > > > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb",
> > > > '2',
> > > > > > > gen_int_libfunc)
> > > > > > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > > > > > gen_int_libfunc)
> > > > > > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> > > > > > >
> > > > > > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > > > > > gen_int_libfunc)
> > > > > > > +
> > > > > > >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> > > > > > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> > > > > gen_int_fp_fixed_libfunc)
> > > > > > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > > > > > --
> > > > > > > 2.34.1
> > > > > >
Li, Pan2 March 7, 2024, 1:54 a.m. UTC | #13
Thanks Richard for comments.

> gen_int_libfunc will no longer make it emit libcalls for fixed point
> modes, so this can't be correct
> and there's no libgcc implementation for integer mode saturating ops,
> so it's pointless to emit calls
> to them.

Got the pointer here, the OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
Is designed for the fixed point, cannot cover integer mode right now.

Given we have saturating integer alu like below, could you help to coach me the most reasonable way to represent
It in scalar as well as vectorize part? Sorry not familiar with this part and still dig into how it works...

uint32_t sat_uadd (uint32_t a, uint32_t b)
{
  uint32_t add = a + b;
  return add | -(add < a);
}

sint32_t sat_sadd (sint32_t a, sint32_t b)
{
  sint32_t add = a + b;
  sint32_t x = a ^ b;
  sint32_t y = add ^ x;
  return x < 0 ? add : (y >= 0 ? add : INT32_MAX + (x < 0));
}

uint32_t sat_usub (uint32_t a, uint32_t b)
{
  return a >= b ? a - b : 0;
}

sint32_t sat_ssub (sint32_t a, sint32_t b)
{
  sint32_t sub = a - b;
  sint32_t x = a ^ b;
  sint32_t y = sub ^ x;
  return x >= 0 ? sub : (y >= 0 ? sub : INT32_MAX + (x < 0));
}

uint32_t sat_umul (uint32_t a, uint32_t b)
{
  uint64_t mul = a * b;

  return mul <= (uint64_t)UINT32_MAX ? (uint32_t)mul : UINT32_MAX;
}

sint32_t sat_smul (sint32_t a, sint32_t b)
{
  sint64_t mul = a * b;

  return mul >= (sint64_t)INT32_MIN && mul <= (sint64_t)INT32_MAX ? (sint32_t)mul : INT32_MAX + ((x ^ y) < 0);
}

uint32_t sat_udiv (uint32_t a, uint32_t b)
{
  return a / b; // never overflow
}

sint32_t sat_sdiv (sint32_t a, sint32_t b)
{
  return a == INT32_MIN && b == -1 ? INT32_MAX : a / b;
}

sint32_t sat_abs (sint32_t a)
{
  return a >= 0 ? a : (a == INT32_MIN ? INT32_MAX : -a);
}

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Tuesday, March 5, 2024 4:41 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; jeffreyalaw@gmail.com
Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

On Tue, Mar 5, 2024 at 8:09 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Thanks Richard for comments.
>
> > I do wonder what the existing usadd patterns with integer vector modes
> > in various targets do?
> > Those define_insn will at least not end up in the optab set I guess,
> > so they must end up
> > being either unused or used by explicit gen_* (via intrinsic
> > functions?) or by combine?
>
> For usadd with vector modes, I think the backend like RISC-V try to leverage instructions
> like Vector Single-Width Saturating Add(aka vsaddu.vv/x/i).
>
> > I think simply changing gen_*_fixed_libfunc to gen_int_libfunc won't
> > work.  Since there's
> > no libgcc support I'd leave it as gen_*_fixed_libfunc thus no library
> > fallback for integers?
>
> Change to gen_int_libfunc follows other int optabs. I am not sure if it will hit the standard name usaddm3 for vector mode.
> But the happy path for scalar modes works up to a point, please help to correct me if any misunderstanding.

gen_int_libfunc will no longer make it emit libcalls for fixed point
modes, so this can't be correct
and there's no libgcc implementation for integer mode saturating ops,
so it's pointless to emit calls
to them.

> #0  riscv_expand_usadd (dest=0x7ffff6a8c7c8, x=0x7ffff6a8c798, y=0x7ffff6a8c7b0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/config/riscv/riscv.cc:10662
> #1  0x00000000029f142a in gen_usaddsi3 (operand0=0x7ffff6a8c7c8, operand1=0x7ffff6a8c798, operand2=0x7ffff6a8c7b0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/config/riscv/riscv.md:3848
> #2  0x0000000001751e60 in insn_gen_fn::operator()<rtx_def*, rtx_def*, rtx_def*> (this=0x4910e70 <insn_data+1248976>) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/recog.h:441
> #3  0x000000000180f553 in maybe_gen_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8232
> #4  0x000000000180fa42 in maybe_expand_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8275
> #5  0x000000000180fade in expand_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8306
> #6  0x00000000015cebdc in expand_fn_using_insn (stmt=0x7ffff6a36480, icode=CODE_FOR_usaddsi3, noutputs=1, ninputs=2) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:254
> #7  0x00000000015de146 in expand_direct_optab_fn (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480, optab=usadd_optab, nargs=2) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:3818
> #8  0x00000000015e3610 in expand_SAT_ADD (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.def:278
> #9  0x00000000015e65b6 in expand_internal_call (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:4914
> #10 0x00000000015e65e5 in expand_internal_call (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:4922
> #11 0x0000000001248c8f in expand_call_stmt (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:2771
> #12 0x000000000124d392 in expand_gimple_stmt_1 (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:3932
> #13 0x000000000124d9aa in expand_gimple_stmt (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:4077
> #14 0x000000000124dac4 in expand_gimple_tailcall (bb=0x7ffff6dddae0, stmt=0x7ffff6a36480, can_fallthru=0x7fffffffd800) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:4123
> #15 0x000000000125636b in expand_gimple_basic_block (bb=0x7ffff6dddae0, disable_tail_calls=false) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:6107
> #16 0x0000000001258a1a in (anonymous namespace)::pass_expand::execute (this=0x556d180, fun=0x7ffff6a7f2e0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:6872
> #17 0x0000000001873565 in execute_one_pass (pass=0x556d180) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2646
> #18 0x0000000001873948 in execute_pass_list_1 (pass=0x556d180) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2755
> #19 0x00000000018739d6 in execute_pass_list (fn=0x7ffff6a7f2e0, pass=0x5568870) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2766
> #20 0x00000000012bc975 in cgraph_node::expand (this=0x7ffff6c2c880) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:1845
> #21 0x00000000012bd18f in expand_all_functions () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2028
> #22 0x00000000012bdcc5 in symbol_table::compile (this=0x7ffff6c06000) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2402
> #23 0x00000000012be16c in symbol_table::finalize_compilation_unit (this=0x7ffff6c06000) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2587
> #24 0x0000000001a048a3 in compile_file () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:476
> #25 0x0000000001a079e9 in do_compile () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:2154
> #26 0x0000000001a07dee in toplev::main (this=0x7fffffffdcf2, argc=19, argv=0x7fffffffde28) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:2310
> #27 0x0000000003ebcc5d in main (argc=19, argv=0x7fffffffde28) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/main.cc:39
>
> BTW, does match.pd support nested cond like below? I am debugging into gimple_simplify_COND_EXPR for why not hit the pattern...
> +(simplify
> +  (cond
> +    (lt @0 integer_zerop)
> +    (plus:c @0 @1)
> +    (cond (lt @1 integer_zerop) @1 @0))
> +  (IFN_SAT_ADD @0 @1))
>
> Pan
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Monday, March 4, 2024 6:31 PM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; jeffreyalaw@gmail.com
> Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
>
> On Sat, Mar 2, 2024 at 8:46 AM Li, Pan2 <pan2.li@intel.com> wrote:
> >
> > Hi Richard and Tamar,
> >
> > I have a try with DEF_INTERNAL_SIGNED_OPTAB_FN for SAT_ADD/SUB/MUL but meet some problem when match.pd.
> >
> > For unsigned SAT_ADD = (x + y) | - ((x + y) < x), the match.pd can be (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0)))).
> > For unsigned SAT_SUB = x >= y ? x - y : 0, and then match.pd can be (cond (ge @0 @1) (minus @0 @1) integer_zerop).
> >
> > For signed SAT_ADD/SAT_SUB as below, seems not easy to make the simplify pattern works well as expected up to a point.
> > sint64_t sat_add (sint64_t x, sint64_t y)
> > {
> >   sint64_t a = x ^ y;
> >   sint64_t add = x + y;
> >   sint64_t b = sum ^ x;
> >
> >   return (a < 0 || (a >= 0 && b >= 0)) ? add : (MAX_INT64 + (x < 0));
> > }
> >
> > sint64_t sad_sub (sint64_t x, sint64_t y)
> > {
> >   sint64_t a = x ^ y;
> >   sint64_t sub = x - y;
> >   sint64_t b = sub ^ x;
> >
> >   return (a >= 0 || (a < 0 && b >= 0) ? sub : (MAX_INT64 + (x < 0));
> > }
> >
> > For SAT_MUL as below, looks we may need widen type. I am not sure if we can leverage MUL_OVERFLOW or not in match.pd.
> >
> > uint32_t sat_mul (uint32_t x, uint32_t y)
> > {
> >   uint64_t mul = (uint64_t)x * (uint64_t)y;
> >   return mul > UINT32_MAX ? UINT32_MAX : (uint32_t)mul;
> > }
> >
> > sint32_t sat_mul (sint32_t x, sint32_t y)
> > {
> >   sint64_t mul = (sint64_t)x * (sint64_t))y;
> >
> >   return mul <= MAX_INT32 && mul >= MIN_INT32 ? mul : MAX_INT32 + (x ^ y) > 0;
> > }
> >
> > Below diff only contains unsigned SAT_ADD and SAT_SUB for prototype validation.
> > I will continue to try the rest part in match.pd and keep you posted.
> >
> > -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
> >
> > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > index 80efdf2b7e5..d9ad6fe2b58 100644
> > --- a/gcc/config/riscv/riscv-protos.h
> > +++ b/gcc/config/riscv/riscv-protos.h
> > @@ -132,6 +132,9 @@ extern void riscv_asm_output_external (FILE *, const tree, const char *);
> >  extern bool
> >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > +extern void riscv_expand_usadd (rtx, rtx, rtx);
> > +extern void riscv_expand_ussub (rtx, rtx, rtx);
> >
> >  #ifdef RTX_CODE
> >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0);
> > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > index 5e984ee2a55..795462526df 100644
> > --- a/gcc/config/riscv/riscv.cc
> > +++ b/gcc/config/riscv/riscv.cc
> > @@ -10655,6 +10655,28 @@ riscv_vector_mode_supported_any_target_p (machine_mode)
> >    return true;
> >  }
> >
> > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > +void
> > +riscv_expand_usadd (rtx dest, rtx x, rtx y)
> > +{
> > +  fprintf (stdout, "Hit riscv_expand_usadd.\n");
> > +  // ToDo
> > +}
> > +
> > +void
> > +riscv_expand_ussub (rtx dest, rtx x, rtx y)
> > +{
> > +  fprintf (stdout, "Hit riscv_expand_ussub.\n");
> > +  // ToDo
> > +}
> > +
> >  /* Initialize the GCC target structure.  */
> >  #undef TARGET_ASM_ALIGNED_HI_OP
> >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > index 1fec13092e2..e2dbadb3ead 100644
> > --- a/gcc/config/riscv/riscv.md
> > +++ b/gcc/config/riscv/riscv.md
> > @@ -3839,6 +3839,39 @@ (define_insn "*large_load_address"
> >    [(set_attr "type" "load")
> >     (set (attr "length") (const_int 8))])
> >
> > +(define_expand "usadd<mode>3"
> > +  [(match_operand:ANYI 0 "register_operand")
> > +   (match_operand:ANYI 1 "register_operand")
> > +   (match_operand:ANYI 2 "register_operand")]
> > +  ""
> > +  {
> > +    riscv_expand_usadd (operands[0], operands[1], operands[2]);
> > +    DONE;
> > +  }
> > +)
> > +
> > +(define_expand "ussub<mode>3"
> > +  [(match_operand:ANYI 0 "register_operand")
> > +   (match_operand:ANYI 1 "register_operand")
> > +   (match_operand:ANYI 2 "register_operand")]
> > +  ""
> > +  {
> > +    riscv_expand_ussub (operands[0], operands[1], operands[2]);
> > +    DONE;
> > +  }
> > +)
> > +
> >  (include "bitmanip.md")
> >  (include "crypto.md")
> >  (include "sync.md")
> > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > index 848bb9dbff3..0fff19c875f 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -275,6 +275,13 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
> >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
> >                               smulhrs, umulhrs, binary)
> >
> > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first,
> > +                             ssadd, usadd, binary)
> > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_SUB, ECF_CONST | ECF_NOTHROW, first,
> > +                             sssub, ussub, binary)
> > +
> >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> >  DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index f3fffd8dec2..6592dea643a 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -10276,3 +10276,32 @@ and,
> >        }
> >        (if (full_perm_p)
> >         (vec_perm (op@3 @0 @1) @3 @2))))))
> > +
> > +#if GIMPLE
> > +
> > +/* Unsigned saturation add, aka:
> > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > +(simplify
> > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > +   (if (optimize
> > +       && INTEGRAL_TYPE_P (type)
> > +       && types_match (type, TREE_TYPE (@0))
> > +       && types_match (type, TREE_TYPE (@1))
> > +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> > +       && direct_internal_fn_supported_p (IFN_SAT_ADD, type, OPTIMIZE_FOR_BOTH))
> > +   (IFN_SAT_ADD @0 @1)))
> > +
> > +/* Unsigned saturation sub , aka
> > +   SAT_SUBU = x >= y ? x - y : 0.  */
> > +(simplify
> > +  (cond (ge @0 @1) (minus @0 @1) integer_zerop)
> > +    (if (optimize
> > +       && INTEGRAL_TYPE_P (type)
> > +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> > +       && types_match (type, TREE_TYPE (@0))
> > +       && types_match (type, TREE_TYPE (@1))
> > +       && direct_internal_fn_supported_p (IFN_SAT_SUB, type, OPTIMIZE_FOR_BOTH))
> > +    (IFN_SAT_SUB @0 @1)))
> > +
> > +#endif
> > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > index ad14f9328b9..bebe38c888b 100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -111,15 +111,15 @@ OPTAB_NX(add_optab, "add$F$a3")
> >  OPTAB_NX(add_optab, "add$Q$a3")
> >  OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc)
> >  OPTAB_VX(addv_optab, "add$F$a3")
> > -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc)
> > -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
> > +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_int_libfunc)
> > +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_int_libfunc)
> >  OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc)
> >  OPTAB_NX(sub_optab, "sub$F$a3")
> >  OPTAB_NX(sub_optab, "sub$Q$a3")
> >  OPTAB_VL(subv_optab, "subv$I$a3", MINUS, "sub", '3', gen_intv_fp_libfunc)
> >  OPTAB_VX(subv_optab, "sub$F$a3")
> > -OPTAB_NL(sssub_optab, "sssub$Q$a3", SS_MINUS, "sssub", '3', gen_signed_fixed_libfunc)
> > -OPTAB_NL(ussub_optab, "ussub$Q$a3", US_MINUS, "ussub", '3', gen_unsigned_fixed_libfunc)
> > +OPTAB_NL(sssub_optab, "sssub$a3", SS_MINUS, "sssub", '3', gen_int_libfunc)
> > +OPTAB_NL(ussub_optab, "ussub$a3", US_MINUS, "ussub", '3', gen_int_libfunc)
>
> I do wonder what the existing usadd patterns with integer vector modes
> in various targets do?
> Those define_insn will at least not end up in the optab set I guess,
> so they must end up
> being either unused or used by explicit gen_* (via intrinsic
> functions?) or by combine?
>
> I think simply changing gen_*_fixed_libfunc to gen_int_libfunc won't
> work.  Since there's
> no libgcc support I'd leave it as gen_*_fixed_libfunc thus no library
> fallback for integers?
>
> >  OPTAB_NL(smul_optab, "mul$Q$a3", MULT, "mul", '3', gen_int_fp_fixed_libfunc)
> >  OPTAB_NX(smul_optab, "mul$P$a3")
> >  OPTAB_NX(smul_optab, "mul$F$a3")
> >
> > Pan
> >
> > -----Original Message-----
> > From: Li, Pan2 <pan2.li@intel.com>
> > Sent: Tuesday, February 27, 2024 10:36 PM
> > To: Richard Biener <richard.guenther@gmail.com>; Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
> >
> > Thanks Richard and Tammer for moving this forward.
> >
> > > That said, I would like to see the bigger picture to be kept in mind
> > > before altering the GIMPLE IL.
> >
> > > Adding an internal function for an already present optab is a
> > > no-brainer.  Adding a vectorizer
> > > and/or if-conversion pattern to make use of this during vectorization
> > > is existing practice.
> > > Adding pattern recognition to ISEL or widening-mul passes for
> > > instructions the CPU can do
> > > is existing practice and OK.
> >
> > Thanks for explaining, got the point here.
> >
> > > So I'd suggest writing some example of both signed and unsigned saturating add and multiply
> >
> > > Because signed addition, will likely require a branch and signed multiplication would require a
> > > larger type.
> >
> > Ack, will prepare one prototype validation patch for add, sub and mul (both unsigned and signed) soon.
> >
> > Pan
> >
> > -----Original Message-----
> > From: Richard Biener <richard.guenther@gmail.com>
> > Sent: Tuesday, February 27, 2024 9:42 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
> >
> > On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
> > >
> > > > Thanks Tamar.
> > > >
> > > > > Those two cases also *completely* stop vectorization because of either the
> > > > > control flow or the fact the vectorizer can't handle complex types.
> > > >
> > > > Yes, we eventually would like to vectorize the SAT ALU but we start with scalar part
> > > > first.
> > > > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> > > > well with some additions as below.
> > > > Feel free to correct me if any misunderstandings.
> > > >
> > > > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> > > > usadd$a3(as well as gen_int_libfunc) for int.
> > > > 2. We need to implement a default implementation of SAT_ADD if
> > > > direct_binary_optab_supported_p is false.
> > > >     It looks like the default implementation is difficult to make every backend happy.
> > > > That is why you suggest just normal
> > > >     DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> > > >
> > > > Thanks Richard.
> > > >
> > > > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > > passes doing what I'd call instruction selection).  But that means not adding
> > > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > > machinery for that.
> > > >
> > > > > So as a start I would go for a direct optab and see to recognize it during
> > > > > ISEL?
> > > >
> > > > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> > > > to know isel and I am happy to
> > > > try that once we have conclusion.
> > > >
> > >
> > > So after a lively discussion on IRC, the conclusion is that before we proceed Richi would
> > > like to see some examples of various operations.  The problem is that unsigned saturating
> > > addition is the simplest example and it may lead to an implementation strategy that doesn't
> > > scale.
> > >
> > > So I'd suggest writing some example of both signed and unsigned saturating add and multiply
> > >
> > > Because signed addition, will likely require a branch and signed multiplication would require a
> > > larger type.
> > >
> > > This would allow us to better understand what kind of gimple would have to to deal with in
> > > ISEL and VECT if we decide not to lower early.
> >
> > More specifically before making something like .SAT_ADD a core part of
> > GIMPLE I'd like
> > to point out that we have saturating PLUS_EXPR but just for
> > fixed-point types.  I realize
> > Joseph thinks that keying this on the type was wrong and it should
> > have used integer
> > types and special saturating operations.  Still having both,
> > type-keyed saturating PLUS_EXPR
> > and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess.
> >
> > It might be that the way to go is to turn all existing saturating type
> > *_EXPR into
> > .SAT_* internal function calls, in the end mapping to the optabs and
> > eventual RTX codes.
> >
> > That could work for both integer types and fixed-point types.
> >
> > I'll also note that "saturating" is just another variant of overflow
> > behavior of which we have
> > trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and
> > also (kind-of) sanitized.
> > We do lack direct IL representation of -ftrapv and -fwrapv, the
> > semantics on a PLUS_EXPR
> > depend on per-function flags.  Eventually a common representation
> > could be found here.
> > For saturating I was thinking of .ADD_OVERFLOW (a, b,
> > saturation-value), a "trap" could
> > be a "trapping" saturation value, "undefined" could be a
> > "not-a-thing".  But I didn't think much
> > about this.
> >
> > That said, I would like to see the bigger picture to be kept in mind
> > before altering the GIMPLE IL.
> >
> > Adding an internal function for an already present optab is a
> > no-brainer.  Adding a vectorizer
> > and/or if-conversion pattern to make use of this during vectorization
> > is existing practice.
> > Adding pattern recognition to ISEL or widening-mul passes for
> > instructions the CPU can do
> > is existing practice and OK.
> >
> > Thanks,
> > Richard.
> >
> >
> > > Thanks,
> > > Tamar
> > >
> > > > Pan
> > > >
> > > > -----Original Message-----
> > > > From: Tamar Christina <Tamar.Christina@arm.com>
> > > > Sent: Tuesday, February 27, 2024 5:57 PM
> > > > To: Richard Biener <richard.guenther@gmail.com>
> > > > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > > Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com;
> > > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > > US_PLUS
> > > >
> > > > > -----Original Message-----
> > > > > From: Richard Biener <richard.guenther@gmail.com>
> > > > > Sent: Tuesday, February 27, 2024 9:44 AM
> > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > Cc: pan2.li@intel.com; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > > > yanzhang.wang@intel.com; kito.cheng@gmail.com;
> > > > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > > > US_PLUS
> > > > >
> > > > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> > > > > <Tamar.Christina@arm.com> wrote:
> > > > > >
> > > > > > Hi Pan,
> > > > > >
> > > > > > > From: Pan Li <pan2.li@intel.com>
> > > > > > >
> > > > > > > Hi Richard & Tamar,
> > > > > > >
> > > > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > > > > >
> > > > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > > > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > > > > > >
> > > > > >
> > > > > > I think a couple of things are being confused here.  So lets break it down:
> > > > > >
> > > > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > > > > we only want one internal function for both signed and unsigned SAT_ADD.
> > > > > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > > > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > > > > >
> > > > > > Now the downside of this is that this is a direct internal optab.  This means
> > > > > > that for the representation to be used the target *must* have the optab
> > > > > > implemented.   This is a bit annoying because it doesn't allow us to generically
> > > > > > assume that all targets use SAT_ADD for saturating add and thus only have to
> > > > > > write optimization for this representation.
> > > > > >
> > > > > > This is why Richi said we may need to use a new tree_code because we can
> > > > > > override tree code expansions.  However the same can be done with the
> > > > _EXT_FN
> > > > > > internal functions.
> > > > > >
> > > > > > So what I meant was that we want to have a combination of the two. i.e. a
> > > > > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> > > > >
> > > > > Whether we want/need _EXT or only direct depends mainly on how we want to
> > > > > leverage support.  If it's only during vectorization and possibly instruction
> > > > > selection a direct optab is IMO the way to go.  Generic optimization only
> > > > > marginally improves when you explode the number of basic operations you
> > > > > expose - in fact it gets quite unwieldly to support all of them in
> > > > > simplifications
> > > > > and/or canonicalization and you possibly need to translate them back to what
> > > > > the target CPU supports.
> > > > >
> > > > > We already do have too many (IMO) "special" operations exposed "early"
> > > > > in the GIMPLE pipeline.
> > > > >
> > > > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > > passes doing what I'd call instruction selection).  But that means not adding
> > > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > > machinery for that.
> > > > >
> > > > > So as a start I would go for a direct optab and see to recognize it during
> > > > > ISEL?
> > > > >
> > > >
> > > > The problem with ISEL and the reason I suggested an indirect IFN is that there
> > > > Are benefit to be had from recognizing it early.  Saturating arithmetic can be
> > > > optimized
> > > > Differently from non-saturating ones.
> > > >
> > > > But additionally a common way of specifying them decomposes to branches
> > > > and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).
> > > >
> > > > These two representation can be detected in PHI-opts and it's beneficial to all
> > > > targets to canonicalize them to the branchless code.
> > > >
> > > > Those two cases also *completely* stop vectorization because of either the
> > > > control flow or the fact the vectorizer can't handle complex types.
> > > >
> > > > So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
> > > > We'd still need to hack the vectorizer cost models for targets with saturating
> > > > vector instructions.
> > > >
> > > > I of course defer to you, but it seems quite suboptimal to do it this way and
> > > > doesn't get us first class saturation support.
> > > >
> > > > Additionally there have been discussions whether both clang and gcc should
> > > > provide __builtin_saturate_* methods, which the non-direct IFN would help
> > > > support.
> > > >
> > > > Tamar.
> > > >
> > > > > > If Richi agrees, the below is what I meant. It creates the infrastructure for this
> > > > > > and for now only allows a default fallback for unsigned saturating add and
> > > > makes
> > > > > > it easier for us to add the rest later
> > > > > >
> > > > > > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
> > > > > the
> > > > > > RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> > > > > > need to make new us_plus and ss_plus ones.
> > > > > >
> > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > > index a07f25f3aee..aaf9f8991b3 100644
> > > > > > --- a/gcc/internal-fn.cc
> > > > > > +++ b/gcc/internal-fn.cc
> > > > > > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> > > > > tree_pair types,
> > > > > >         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > > >                                                   opt_type);            \
> > > > > >        }
> > > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > > SIGNED_OPTAB, \
> > > > > > +                                        UNSIGNED_OPTAB, TYPE)          \
> > > > > > +    case IFN_##CODE:                                                   \
> > > > > > +      {                                                                        \
> > > > > > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> > > > > > +                            ? UNSIGNED_OPTAB ## _optab                 \
> > > > > > +                            : SIGNED_OPTAB ## _optab);                 \
> > > > > > +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > > > +                                                 opt_type)             \
> > > > > > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> > > > > > +      }
> > > > > >  #include "internal-fn.def"
> > > > > >
> > > > > >      case IFN_LAST:
> > > > > > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> > > > > >      optab which_optab = direct_internal_fn_optab (fn, types);          \
> > > > > >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
> > > > > >    }
> > > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > > SIGNED_OPTAB, \
> > > > > > +                                        UNSIGNED_OPTAB, TYPE)
> > > > > >  #include "internal-fn.def"
> > > > > >
> > > > > >  /* Routines to expand each internal function, indexed by function number.
> > > > > > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > > >        emit_move_insn (plhs, cmp);
> > > > > >      }
> > > > > >  }
> > > > > > +
> > > > > > +void
> > > > > > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > > > > > +{
> > > > > > +  /* Check if the target supports the expansion through an IFN.  */
> > > > > > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > > > > > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > > > > > +  if (direct_binary_optab_supported_p (which_optab, types,
> > > > > > +                                      insn_optimization_type ()))
> > > > > > +    {
> > > > > > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > > > > > +      return;
> > > > > > +    }
> > > > > > +
> > > > > > +  /* Target does not support the optab, but we can de-compose it.  */
> > > > > > +  /*
> > > > > > +  ... decompose to a canonical representation ...
> > > > > > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > > > > > +    {
> > > > > > +      ...
> > > > > > +      decompose back to (X + Y) | - ((X + Y) < X)
> > > > > > +    }
> > > > > > +  else
> > > > > > +    {
> > > > > > +      ...
> > > > > > +    }
> > > > > > +  */
> > > > > > +}
> > > > > > +
> > > > > > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> > > > > optype */)
> > > > > > +{
> > > > > > +  /* For now, don't support decomposing vector ops.  */
> > > > > > +  if (VECTOR_TYPE_P (type))
> > > > > > +    return false;
> > > > > > +
> > > > > > +  /* Signed saturating arithmetic is harder to do since we'll so for now
> > > > > > +     lets ignore.  */
> > > > > > +  if (!TYPE_UNSIGNED (type))
> > > > > > +    return false;
> > > > > > +
> > > > > > +  return TREE_CODE (type) == INTEGER_TYPE;
> > > > > > +}
> > > > > > \ No newline at end of file
> > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > > index c14d30365c1..5a2491228d5 100644
> > > > > > --- a/gcc/internal-fn.def
> > > > > > +++ b/gcc/internal-fn.def
> > > > > > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> > > > > >     unsigned inputs respectively, both without the trailing "_optab".
> > > > > >     SELECTOR says which type in the tree_pair determines the signedness.
> > > > > >
> > > > > > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> > > > > DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > > > > > +   that it has expand_##NAME defined in internal-fn.cc to override the
> > > > > > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > > > > > +
> > > > > >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
> > > > > >     the function implements the computational part of a built-in math
> > > > > >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > > > > > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
> > > > > >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> > > > > >  #endif
> > > > > >
> > > > > > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> > > > > SIGNED_OPTAB, \
> > > > > > +                                    UNSIGNED_OPTAB, TYPE) \
> > > > > > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> > > > > SIGNED_OPTAB, \
> > > > > > +                               UNSIGNED_OPTAB, TYPE)
> > > > > > +#endif
> > > > > > +
> > > > > >  #ifndef DEF_INTERNAL_FLT_FN
> > > > > >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> > > > > >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > > > > > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> > > > > ECF_CONST | ECF_NOTHROW, first,
> > > > > >                               smulhs, umulhs, binary)
> > > > > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> > > > > first,
> > > > > >                               smulhrs, umulhrs, binary)
> > > > > > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> > > > > ECF_NOTHROW, first,
> > > > > > +                                 ssadd, usadd, binary)
> > > > > >
> > > > > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > > > > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > > > > > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> > > > > ECF_LEAF, ". R . ")
> > > > > >  #undef DEF_INTERNAL_FLT_FN
> > > > > >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> > > > > >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > > > > > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > > >  #undef DEF_INTERNAL_OPTAB_FN
> > > > > >  #undef DEF_INTERNAL_FN
> > > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > > index bccee1c3e09..dbdb1e6bad2 100644
> > > > > > --- a/gcc/internal-fn.h
> > > > > > +++ b/gcc/internal-fn.h
> > > > > > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> > > > > *);
> > > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > > > > > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
> > > > > >
> > > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > > >
> > > > > > > Note this patch is a draft for validation, no test are invovled here.
> > > > > > >
> > > > > > > gcc/ChangeLog:
> > > > > > >
> > > > > > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > > > > > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > > > > > >       (BUILT_IN_US_PLUSL): Ditto.
> > > > > > >       (BUILT_IN_US_PLUSLL): Ditto.
> > > > > > >       (BUILT_IN_US_PLUSG): Ditto.
> > > > > > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > > > > > >       func decl for expanding us_plus.
> > > > > > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > > > > > >       impl for expanding us_plus.
> > > > > > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > > > > > >       us_plus<mode>3.
> > > > > > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > > > > > >       US_PLUS.
> > > > > > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > > > > > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > > > > > >       * match.pd: Add new simplify pattern for us_plus.
> > > > > > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > > > > > >
> > > > > > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > > > > > ---
> > > > > > >  gcc/builtins.def                |  7 +++++
> > > > > > >  gcc/config/riscv/riscv-protos.h |  1 +
> > > > > > >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> > > > > > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > > > > > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > > > > > >  gcc/internal-fn.def             |  3 +++
> > > > > > >  gcc/internal-fn.h               |  1 +
> > > > > > >  gcc/match.pd                    | 17 ++++++++++++
> > > > > > >  gcc/optabs.def                  |  2 ++
> > > > > > >  9 files changed, 114 insertions(+)
> > > > > > >
> > > > > > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > > > > > index f6f3e104f6a..0777b912cfa 100644
> > > > > > > --- a/gcc/builtins.def
> > > > > > > +++ b/gcc/builtins.def
> > > > > > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> > > > > (BUILT_IN_POPCOUNTIMAX,
> > > > > > > "popcountimax", BT_FN_INT_UINTMAX
> > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> > > > > BT_FN_INT_ULONG,
> > > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> > > > > BT_FN_INT_VAR,
> > > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > > +
> > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > > > > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl",
> > > > BT_FN_INT_ULONG,
> > > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > > +
> > > > > > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > > > > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > > > > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > > > > > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > > > > > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > > > > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > > > > > index 80efdf2b7e5..ba6086f1f25 100644
> > > > > > > --- a/gcc/config/riscv/riscv-protos.h
> > > > > > > +++ b/gcc/config/riscv/riscv-protos.h
> > > > > > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const
> > > > > tree,
> > > > > > > const char *);
> > > > > > >  extern bool
> > > > > > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > > > > > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > > > > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > > > > > >
> > > > > > >  #ifdef RTX_CODE
> > > > > > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool
> > > > *invert_ptr
> > > > > =
> > > > > > > 0);
> > > > > > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > > > > > index 4100abc9dd1..23f08974f07 100644
> > > > > > > --- a/gcc/config/riscv/riscv.cc
> > > > > > > +++ b/gcc/config/riscv/riscv.cc
> > > > > > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > > > > > (machine_mode)
> > > > > > >    return true;
> > > > > > >  }
> > > > > > >
> > > > > > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > > > > > +void
> > > > > > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > > > > > +{
> > > > > > > +  machine_mode mode = GET_MODE (dest);
> > > > > > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > > > > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > > > > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > > > > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > > > > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > > > > > +
> > > > > > > +  /* Step-1: sum = x + y  */
> > > > > > > +  if (mode == SImode && mode != Pmode)
> > > > > > > +    { /* Take addw to avoid the sum truncate.  */
> > > > > > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > > > > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > > > > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > > > > > +    }
> > > > > > > +  else
> > > > > > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > > > > > +
> > > > > > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.
> > > > */
> > > > > > > +  if (mode == HImode || mode == QImode)
> > > > > > > +    {
> > > > > > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > > > > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > > > > > +
> > > > > > > +      gcc_assert (shift_bits > 0);
> > > > > > > +
> > > > > > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> > > > > (shift_bits));
> > > > > > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > > > > > (shift_bits));
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  /* Step-2: lt = sum < x  */
> > > > > > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > > > > > +
> > > > > > > +  /* Step-3: lt = -lt  */
> > > > > > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > > > > > +
> > > > > > > +  /* Step-4: pmode_dest = sum | lt  */
> > > > > > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > > > > > +
> > > > > > > +  /* Step-5: dest = pmode_dest */
> > > > > > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* Initialize the GCC target structure.  */
> > > > > > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > > > > > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > > > > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > > > > > index 3f7a023d941..eaa9867023c 100644
> > > > > > > --- a/gcc/config/riscv/riscv.md
> > > > > > > +++ b/gcc/config/riscv/riscv.md
> > > > > > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > > > > > >    [(set_attr "type" "load")
> > > > > > >     (set (attr "length") (const_int 8))])
> > > > > > >
> > > > > > > +(define_expand "us_plus<mode>3"
> > > > > > > +  [(match_operand:ANYI   0 "register_operand")
> > > > > > > +   (match_operand:ANYI   1 "register_operand")
> > > > > > > +   (match_operand:ANYI   2 "register_operand")]
> > > > > > > +  ""
> > > > > > > +  {
> > > > > > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > > > > > +    DONE;
> > > > > > > +  }
> > > > > > > +)
> > > > > > > +
> > > > > > >  (include "bitmanip.md")
> > > > > > >  (include "crypto.md")
> > > > > > >  (include "sync.md")
> > > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > > > index a07f25f3aee..a7341a57ffa 100644
> > > > > > > --- a/gcc/internal-fn.cc
> > > > > > > +++ b/gcc/internal-fn.cc
> > > > > > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > > > >        emit_move_insn (plhs, cmp);
> > > > > > >      }
> > > > > > >  }
> > > > > > > +
> > > > > > > +void
> > > > > > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > > > > > +{
> > > > > > > +  tree lhs = gimple_call_lhs (stmt);
> > > > > > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > > > > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > > > > > +
> > > > > > > +  do_pending_stack_adjust ();
> > > > > > > +
> > > > > > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > > > > > +  rtx op_0 = expand_normal (rhs_0);
> > > > > > > +  rtx op_1 = expand_normal (rhs_1);
> > > > > > > +
> > > > > > > +  class expand_operand ops[3];
> > > > > > > +
> > > > > > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > > > > > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE
> > > > (rhs_0)));
> > > > > > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE
> > > > (rhs_1)));
> > > > > > > +
> > > > > > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > > > > > > (rhs_0)));
> > > > > > > +  expand_insn (code, 3, ops);
> > > > > > > +
> > > > > > > +  if (!rtx_equal_p (target, ops[0].value))
> > > > > > > +    emit_move_insn (target, ops[0].value);
> > > > > > > +}
> > > > > >
> > > > > > This can be simplified by calling expand_binary_optab_fn instead. See my
> > > > > template
> > > > > >
> > > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > > > index c14d30365c1..b1d7b5a0307 100644
> > > > > > > --- a/gcc/internal-fn.def
> > > > > > > +++ b/gcc/internal-fn.def
> > > > > > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > > > > > ECF_NOTHROW, ffs, unary)
> > > > > > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity,
> > > > unary)
> > > > > > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > > > > > popcount, unary)
> > > > > > >
> > > > > > > +/* Binary integer ops.  */
> > > > > > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> > > > > us_plus,
> > > > > > > binary)
> > > > > > > +
> > > > > > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > > > > > ECF_NOTHROW, NULL)
> > > > > > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > > > > > ECF_NOTHROW, NULL)
> > > > > > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW,
> > > > NULL)
> > > > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > > > index bccee1c3e09..46e404b4a49 100644
> > > > > > > --- a/gcc/internal-fn.h
> > > > > > > +++ b/gcc/internal-fn.h
> > > > > > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn,
> > > > gcall
> > > > > *);
> > > > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > > > > > >
> > > > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > > > >
> > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > > > > index c5b6540f939..f45fd58ad23 100644
> > > > > > > --- a/gcc/match.pd
> > > > > > > +++ b/gcc/match.pd
> > > > > > > @@ -10265,3 +10265,20 @@ and,
> > > > > > >        }
> > > > > > >        (if (full_perm_p)
> > > > > > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > > > > +
> > > > > > > +#if GIMPLE
> > > > > > > +
> > > > > > > +/* Unsigned saturation add, aka:
> > > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > > > > > +(simplify
> > > > > > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > > > > > +   (if (optimize
> > > > > > > +     && INTEGRAL_TYPE_P (type)
> > > > > > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > > > > > +     && types_match (type, TREE_TYPE (@0))
> > > > > > > +     && types_match (type, TREE_TYPE (@1))
> > > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > > > OPTIMIZE_FOR_BOTH))
> > > > > > > +   (IFN_US_PLUS @0 @1)))
> > > > > > > +
> > > > > > > +#endif
> > > > > >
> > > > > > With the version above you can drop the #if GIMPLE and the
> > > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > >
> > > > > > Check.
> > > > > >
> > > > > > Thanks,
> > > > > > Tamar
> > > > > >
> > > > > > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > > > > > index ad14f9328b9..5855c4e0834 100644
> > > > > > > --- a/gcc/optabs.def
> > > > > > > +++ b/gcc/optabs.def
> > > > > > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb",
> > > > '2',
> > > > > > > gen_int_libfunc)
> > > > > > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > > > > > gen_int_libfunc)
> > > > > > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> > > > > > >
> > > > > > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > > > > > gen_int_libfunc)
> > > > > > > +
> > > > > > >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> > > > > > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> > > > > gen_int_fp_fixed_libfunc)
> > > > > > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > > > > > --
> > > > > > > 2.34.1
> > > > > >
Richard Biener March 7, 2024, 8:41 a.m. UTC | #14
On Thu, Mar 7, 2024 at 2:54 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Thanks Richard for comments.
>
> > gen_int_libfunc will no longer make it emit libcalls for fixed point
> > modes, so this can't be correct
> > and there's no libgcc implementation for integer mode saturating ops,
> > so it's pointless to emit calls
> > to them.
>
> Got the pointer here, the OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
> Is designed for the fixed point, cannot cover integer mode right now.

I think

OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3',
gen_unsigned_fixed_libfunc)

would work fine (just dropping the $Q).

> Given we have saturating integer alu like below, could you help to coach me the most reasonable way to represent
> It in scalar as well as vectorize part? Sorry not familiar with this part and still dig into how it works...

As in your v2, .SAT_ADD for both sat_uadd and sat_sadd, similar for
the other cases.

As I said, use vectorizer patterns and possibly do instruction
selection at ISEL/widen_mult time.

Richard.

> uint32_t sat_uadd (uint32_t a, uint32_t b)
> {
>   uint32_t add = a + b;
>   return add | -(add < a);
> }
>
> sint32_t sat_sadd (sint32_t a, sint32_t b)
> {
>   sint32_t add = a + b;
>   sint32_t x = a ^ b;
>   sint32_t y = add ^ x;
>   return x < 0 ? add : (y >= 0 ? add : INT32_MAX + (x < 0));
> }
>
> uint32_t sat_usub (uint32_t a, uint32_t b)
> {
>   return a >= b ? a - b : 0;
> }
>
> sint32_t sat_ssub (sint32_t a, sint32_t b)
> {
>   sint32_t sub = a - b;
>   sint32_t x = a ^ b;
>   sint32_t y = sub ^ x;
>   return x >= 0 ? sub : (y >= 0 ? sub : INT32_MAX + (x < 0));
> }
>
> uint32_t sat_umul (uint32_t a, uint32_t b)
> {
>   uint64_t mul = a * b;
>
>   return mul <= (uint64_t)UINT32_MAX ? (uint32_t)mul : UINT32_MAX;
> }
>
> sint32_t sat_smul (sint32_t a, sint32_t b)
> {
>   sint64_t mul = a * b;
>
>   return mul >= (sint64_t)INT32_MIN && mul <= (sint64_t)INT32_MAX ? (sint32_t)mul : INT32_MAX + ((x ^ y) < 0);
> }
>
> uint32_t sat_udiv (uint32_t a, uint32_t b)
> {
>   return a / b; // never overflow
> }
>
> sint32_t sat_sdiv (sint32_t a, sint32_t b)
> {
>   return a == INT32_MIN && b == -1 ? INT32_MAX : a / b;
> }
>
> sint32_t sat_abs (sint32_t a)
> {
>   return a >= 0 ? a : (a == INT32_MIN ? INT32_MAX : -a);
> }
>
> Pan
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Tuesday, March 5, 2024 4:41 PM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; jeffreyalaw@gmail.com
> Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
>
> On Tue, Mar 5, 2024 at 8:09 AM Li, Pan2 <pan2.li@intel.com> wrote:
> >
> > Thanks Richard for comments.
> >
> > > I do wonder what the existing usadd patterns with integer vector modes
> > > in various targets do?
> > > Those define_insn will at least not end up in the optab set I guess,
> > > so they must end up
> > > being either unused or used by explicit gen_* (via intrinsic
> > > functions?) or by combine?
> >
> > For usadd with vector modes, I think the backend like RISC-V try to leverage instructions
> > like Vector Single-Width Saturating Add(aka vsaddu.vv/x/i).
> >
> > > I think simply changing gen_*_fixed_libfunc to gen_int_libfunc won't
> > > work.  Since there's
> > > no libgcc support I'd leave it as gen_*_fixed_libfunc thus no library
> > > fallback for integers?
> >
> > Change to gen_int_libfunc follows other int optabs. I am not sure if it will hit the standard name usaddm3 for vector mode.
> > But the happy path for scalar modes works up to a point, please help to correct me if any misunderstanding.
>
> gen_int_libfunc will no longer make it emit libcalls for fixed point
> modes, so this can't be correct
> and there's no libgcc implementation for integer mode saturating ops,
> so it's pointless to emit calls
> to them.
>
> > #0  riscv_expand_usadd (dest=0x7ffff6a8c7c8, x=0x7ffff6a8c798, y=0x7ffff6a8c7b0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/config/riscv/riscv.cc:10662
> > #1  0x00000000029f142a in gen_usaddsi3 (operand0=0x7ffff6a8c7c8, operand1=0x7ffff6a8c798, operand2=0x7ffff6a8c7b0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/config/riscv/riscv.md:3848
> > #2  0x0000000001751e60 in insn_gen_fn::operator()<rtx_def*, rtx_def*, rtx_def*> (this=0x4910e70 <insn_data+1248976>) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/recog.h:441
> > #3  0x000000000180f553 in maybe_gen_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8232
> > #4  0x000000000180fa42 in maybe_expand_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8275
> > #5  0x000000000180fade in expand_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8306
> > #6  0x00000000015cebdc in expand_fn_using_insn (stmt=0x7ffff6a36480, icode=CODE_FOR_usaddsi3, noutputs=1, ninputs=2) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:254
> > #7  0x00000000015de146 in expand_direct_optab_fn (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480, optab=usadd_optab, nargs=2) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:3818
> > #8  0x00000000015e3610 in expand_SAT_ADD (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.def:278
> > #9  0x00000000015e65b6 in expand_internal_call (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:4914
> > #10 0x00000000015e65e5 in expand_internal_call (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:4922
> > #11 0x0000000001248c8f in expand_call_stmt (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:2771
> > #12 0x000000000124d392 in expand_gimple_stmt_1 (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:3932
> > #13 0x000000000124d9aa in expand_gimple_stmt (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:4077
> > #14 0x000000000124dac4 in expand_gimple_tailcall (bb=0x7ffff6dddae0, stmt=0x7ffff6a36480, can_fallthru=0x7fffffffd800) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:4123
> > #15 0x000000000125636b in expand_gimple_basic_block (bb=0x7ffff6dddae0, disable_tail_calls=false) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:6107
> > #16 0x0000000001258a1a in (anonymous namespace)::pass_expand::execute (this=0x556d180, fun=0x7ffff6a7f2e0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:6872
> > #17 0x0000000001873565 in execute_one_pass (pass=0x556d180) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2646
> > #18 0x0000000001873948 in execute_pass_list_1 (pass=0x556d180) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2755
> > #19 0x00000000018739d6 in execute_pass_list (fn=0x7ffff6a7f2e0, pass=0x5568870) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2766
> > #20 0x00000000012bc975 in cgraph_node::expand (this=0x7ffff6c2c880) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:1845
> > #21 0x00000000012bd18f in expand_all_functions () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2028
> > #22 0x00000000012bdcc5 in symbol_table::compile (this=0x7ffff6c06000) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2402
> > #23 0x00000000012be16c in symbol_table::finalize_compilation_unit (this=0x7ffff6c06000) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2587
> > #24 0x0000000001a048a3 in compile_file () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:476
> > #25 0x0000000001a079e9 in do_compile () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:2154
> > #26 0x0000000001a07dee in toplev::main (this=0x7fffffffdcf2, argc=19, argv=0x7fffffffde28) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:2310
> > #27 0x0000000003ebcc5d in main (argc=19, argv=0x7fffffffde28) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/main.cc:39
> >
> > BTW, does match.pd support nested cond like below? I am debugging into gimple_simplify_COND_EXPR for why not hit the pattern...
> > +(simplify
> > +  (cond
> > +    (lt @0 integer_zerop)
> > +    (plus:c @0 @1)
> > +    (cond (lt @1 integer_zerop) @1 @0))
> > +  (IFN_SAT_ADD @0 @1))
> >
> > Pan
> >
> > -----Original Message-----
> > From: Richard Biener <richard.guenther@gmail.com>
> > Sent: Monday, March 4, 2024 6:31 PM
> > To: Li, Pan2 <pan2.li@intel.com>
> > Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; jeffreyalaw@gmail.com
> > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
> >
> > On Sat, Mar 2, 2024 at 8:46 AM Li, Pan2 <pan2.li@intel.com> wrote:
> > >
> > > Hi Richard and Tamar,
> > >
> > > I have a try with DEF_INTERNAL_SIGNED_OPTAB_FN for SAT_ADD/SUB/MUL but meet some problem when match.pd.
> > >
> > > For unsigned SAT_ADD = (x + y) | - ((x + y) < x), the match.pd can be (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0)))).
> > > For unsigned SAT_SUB = x >= y ? x - y : 0, and then match.pd can be (cond (ge @0 @1) (minus @0 @1) integer_zerop).
> > >
> > > For signed SAT_ADD/SAT_SUB as below, seems not easy to make the simplify pattern works well as expected up to a point.
> > > sint64_t sat_add (sint64_t x, sint64_t y)
> > > {
> > >   sint64_t a = x ^ y;
> > >   sint64_t add = x + y;
> > >   sint64_t b = sum ^ x;
> > >
> > >   return (a < 0 || (a >= 0 && b >= 0)) ? add : (MAX_INT64 + (x < 0));
> > > }
> > >
> > > sint64_t sad_sub (sint64_t x, sint64_t y)
> > > {
> > >   sint64_t a = x ^ y;
> > >   sint64_t sub = x - y;
> > >   sint64_t b = sub ^ x;
> > >
> > >   return (a >= 0 || (a < 0 && b >= 0) ? sub : (MAX_INT64 + (x < 0));
> > > }
> > >
> > > For SAT_MUL as below, looks we may need widen type. I am not sure if we can leverage MUL_OVERFLOW or not in match.pd.
> > >
> > > uint32_t sat_mul (uint32_t x, uint32_t y)
> > > {
> > >   uint64_t mul = (uint64_t)x * (uint64_t)y;
> > >   return mul > UINT32_MAX ? UINT32_MAX : (uint32_t)mul;
> > > }
> > >
> > > sint32_t sat_mul (sint32_t x, sint32_t y)
> > > {
> > >   sint64_t mul = (sint64_t)x * (sint64_t))y;
> > >
> > >   return mul <= MAX_INT32 && mul >= MIN_INT32 ? mul : MAX_INT32 + (x ^ y) > 0;
> > > }
> > >
> > > Below diff only contains unsigned SAT_ADD and SAT_SUB for prototype validation.
> > > I will continue to try the rest part in match.pd and keep you posted.
> > >
> > > -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
> > >
> > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > index 80efdf2b7e5..d9ad6fe2b58 100644
> > > --- a/gcc/config/riscv/riscv-protos.h
> > > +++ b/gcc/config/riscv/riscv-protos.h
> > > @@ -132,6 +132,9 @@ extern void riscv_asm_output_external (FILE *, const tree, const char *);
> > >  extern bool
> > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > +extern void riscv_expand_usadd (rtx, rtx, rtx);
> > > +extern void riscv_expand_ussub (rtx, rtx, rtx);
> > >
> > >  #ifdef RTX_CODE
> > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0);
> > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > index 5e984ee2a55..795462526df 100644
> > > --- a/gcc/config/riscv/riscv.cc
> > > +++ b/gcc/config/riscv/riscv.cc
> > > @@ -10655,6 +10655,28 @@ riscv_vector_mode_supported_any_target_p (machine_mode)
> > >    return true;
> > >  }
> > >
> > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > +void
> > > +riscv_expand_usadd (rtx dest, rtx x, rtx y)
> > > +{
> > > +  fprintf (stdout, "Hit riscv_expand_usadd.\n");
> > > +  // ToDo
> > > +}
> > > +
> > > +void
> > > +riscv_expand_ussub (rtx dest, rtx x, rtx y)
> > > +{
> > > +  fprintf (stdout, "Hit riscv_expand_ussub.\n");
> > > +  // ToDo
> > > +}
> > > +
> > >  /* Initialize the GCC target structure.  */
> > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > index 1fec13092e2..e2dbadb3ead 100644
> > > --- a/gcc/config/riscv/riscv.md
> > > +++ b/gcc/config/riscv/riscv.md
> > > @@ -3839,6 +3839,39 @@ (define_insn "*large_load_address"
> > >    [(set_attr "type" "load")
> > >     (set (attr "length") (const_int 8))])
> > >
> > > +(define_expand "usadd<mode>3"
> > > +  [(match_operand:ANYI 0 "register_operand")
> > > +   (match_operand:ANYI 1 "register_operand")
> > > +   (match_operand:ANYI 2 "register_operand")]
> > > +  ""
> > > +  {
> > > +    riscv_expand_usadd (operands[0], operands[1], operands[2]);
> > > +    DONE;
> > > +  }
> > > +)
> > > +
> > > +(define_expand "ussub<mode>3"
> > > +  [(match_operand:ANYI 0 "register_operand")
> > > +   (match_operand:ANYI 1 "register_operand")
> > > +   (match_operand:ANYI 2 "register_operand")]
> > > +  ""
> > > +  {
> > > +    riscv_expand_ussub (operands[0], operands[1], operands[2]);
> > > +    DONE;
> > > +  }
> > > +)
> > > +
> > >  (include "bitmanip.md")
> > >  (include "crypto.md")
> > >  (include "sync.md")
> > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > index 848bb9dbff3..0fff19c875f 100644
> > > --- a/gcc/internal-fn.def
> > > +++ b/gcc/internal-fn.def
> > > @@ -275,6 +275,13 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
> > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
> > >                               smulhrs, umulhrs, binary)
> > >
> > > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first,
> > > +                             ssadd, usadd, binary)
> > > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_SUB, ECF_CONST | ECF_NOTHROW, first,
> > > +                             sssub, ussub, binary)
> > > +
> > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > >  DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index f3fffd8dec2..6592dea643a 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -10276,3 +10276,32 @@ and,
> > >        }
> > >        (if (full_perm_p)
> > >         (vec_perm (op@3 @0 @1) @3 @2))))))
> > > +
> > > +#if GIMPLE
> > > +
> > > +/* Unsigned saturation add, aka:
> > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > +(simplify
> > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > +   (if (optimize
> > > +       && INTEGRAL_TYPE_P (type)
> > > +       && types_match (type, TREE_TYPE (@0))
> > > +       && types_match (type, TREE_TYPE (@1))
> > > +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > +       && direct_internal_fn_supported_p (IFN_SAT_ADD, type, OPTIMIZE_FOR_BOTH))
> > > +   (IFN_SAT_ADD @0 @1)))
> > > +
> > > +/* Unsigned saturation sub , aka
> > > +   SAT_SUBU = x >= y ? x - y : 0.  */
> > > +(simplify
> > > +  (cond (ge @0 @1) (minus @0 @1) integer_zerop)
> > > +    (if (optimize
> > > +       && INTEGRAL_TYPE_P (type)
> > > +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > +       && types_match (type, TREE_TYPE (@0))
> > > +       && types_match (type, TREE_TYPE (@1))
> > > +       && direct_internal_fn_supported_p (IFN_SAT_SUB, type, OPTIMIZE_FOR_BOTH))
> > > +    (IFN_SAT_SUB @0 @1)))
> > > +
> > > +#endif
> > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > index ad14f9328b9..bebe38c888b 100644
> > > --- a/gcc/optabs.def
> > > +++ b/gcc/optabs.def
> > > @@ -111,15 +111,15 @@ OPTAB_NX(add_optab, "add$F$a3")
> > >  OPTAB_NX(add_optab, "add$Q$a3")
> > >  OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc)
> > >  OPTAB_VX(addv_optab, "add$F$a3")
> > > -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc)
> > > -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
> > > +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_int_libfunc)
> > > +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_int_libfunc)
> > >  OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc)
> > >  OPTAB_NX(sub_optab, "sub$F$a3")
> > >  OPTAB_NX(sub_optab, "sub$Q$a3")
> > >  OPTAB_VL(subv_optab, "subv$I$a3", MINUS, "sub", '3', gen_intv_fp_libfunc)
> > >  OPTAB_VX(subv_optab, "sub$F$a3")
> > > -OPTAB_NL(sssub_optab, "sssub$Q$a3", SS_MINUS, "sssub", '3', gen_signed_fixed_libfunc)
> > > -OPTAB_NL(ussub_optab, "ussub$Q$a3", US_MINUS, "ussub", '3', gen_unsigned_fixed_libfunc)
> > > +OPTAB_NL(sssub_optab, "sssub$a3", SS_MINUS, "sssub", '3', gen_int_libfunc)
> > > +OPTAB_NL(ussub_optab, "ussub$a3", US_MINUS, "ussub", '3', gen_int_libfunc)
> >
> > I do wonder what the existing usadd patterns with integer vector modes
> > in various targets do?
> > Those define_insn will at least not end up in the optab set I guess,
> > so they must end up
> > being either unused or used by explicit gen_* (via intrinsic
> > functions?) or by combine?
> >
> > I think simply changing gen_*_fixed_libfunc to gen_int_libfunc won't
> > work.  Since there's
> > no libgcc support I'd leave it as gen_*_fixed_libfunc thus no library
> > fallback for integers?
> >
> > >  OPTAB_NL(smul_optab, "mul$Q$a3", MULT, "mul", '3', gen_int_fp_fixed_libfunc)
> > >  OPTAB_NX(smul_optab, "mul$P$a3")
> > >  OPTAB_NX(smul_optab, "mul$F$a3")
> > >
> > > Pan
> > >
> > > -----Original Message-----
> > > From: Li, Pan2 <pan2.li@intel.com>
> > > Sent: Tuesday, February 27, 2024 10:36 PM
> > > To: Richard Biener <richard.guenther@gmail.com>; Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
> > >
> > > Thanks Richard and Tammer for moving this forward.
> > >
> > > > That said, I would like to see the bigger picture to be kept in mind
> > > > before altering the GIMPLE IL.
> > >
> > > > Adding an internal function for an already present optab is a
> > > > no-brainer.  Adding a vectorizer
> > > > and/or if-conversion pattern to make use of this during vectorization
> > > > is existing practice.
> > > > Adding pattern recognition to ISEL or widening-mul passes for
> > > > instructions the CPU can do
> > > > is existing practice and OK.
> > >
> > > Thanks for explaining, got the point here.
> > >
> > > > So I'd suggest writing some example of both signed and unsigned saturating add and multiply
> > >
> > > > Because signed addition, will likely require a branch and signed multiplication would require a
> > > > larger type.
> > >
> > > Ack, will prepare one prototype validation patch for add, sub and mul (both unsigned and signed) soon.
> > >
> > > Pan
> > >
> > > -----Original Message-----
> > > From: Richard Biener <richard.guenther@gmail.com>
> > > Sent: Tuesday, February 27, 2024 9:42 PM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
> > >
> > > On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
> > > >
> > > > > Thanks Tamar.
> > > > >
> > > > > > Those two cases also *completely* stop vectorization because of either the
> > > > > > control flow or the fact the vectorizer can't handle complex types.
> > > > >
> > > > > Yes, we eventually would like to vectorize the SAT ALU but we start with scalar part
> > > > > first.
> > > > > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> > > > > well with some additions as below.
> > > > > Feel free to correct me if any misunderstandings.
> > > > >
> > > > > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> > > > > usadd$a3(as well as gen_int_libfunc) for int.
> > > > > 2. We need to implement a default implementation of SAT_ADD if
> > > > > direct_binary_optab_supported_p is false.
> > > > >     It looks like the default implementation is difficult to make every backend happy.
> > > > > That is why you suggest just normal
> > > > >     DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> > > > >
> > > > > Thanks Richard.
> > > > >
> > > > > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > > > passes doing what I'd call instruction selection).  But that means not adding
> > > > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > > > machinery for that.
> > > > >
> > > > > > So as a start I would go for a direct optab and see to recognize it during
> > > > > > ISEL?
> > > > >
> > > > > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> > > > > to know isel and I am happy to
> > > > > try that once we have conclusion.
> > > > >
> > > >
> > > > So after a lively discussion on IRC, the conclusion is that before we proceed Richi would
> > > > like to see some examples of various operations.  The problem is that unsigned saturating
> > > > addition is the simplest example and it may lead to an implementation strategy that doesn't
> > > > scale.
> > > >
> > > > So I'd suggest writing some example of both signed and unsigned saturating add and multiply
> > > >
> > > > Because signed addition, will likely require a branch and signed multiplication would require a
> > > > larger type.
> > > >
> > > > This would allow us to better understand what kind of gimple would have to to deal with in
> > > > ISEL and VECT if we decide not to lower early.
> > >
> > > More specifically before making something like .SAT_ADD a core part of
> > > GIMPLE I'd like
> > > to point out that we have saturating PLUS_EXPR but just for
> > > fixed-point types.  I realize
> > > Joseph thinks that keying this on the type was wrong and it should
> > > have used integer
> > > types and special saturating operations.  Still having both,
> > > type-keyed saturating PLUS_EXPR
> > > and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess.
> > >
> > > It might be that the way to go is to turn all existing saturating type
> > > *_EXPR into
> > > .SAT_* internal function calls, in the end mapping to the optabs and
> > > eventual RTX codes.
> > >
> > > That could work for both integer types and fixed-point types.
> > >
> > > I'll also note that "saturating" is just another variant of overflow
> > > behavior of which we have
> > > trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and
> > > also (kind-of) sanitized.
> > > We do lack direct IL representation of -ftrapv and -fwrapv, the
> > > semantics on a PLUS_EXPR
> > > depend on per-function flags.  Eventually a common representation
> > > could be found here.
> > > For saturating I was thinking of .ADD_OVERFLOW (a, b,
> > > saturation-value), a "trap" could
> > > be a "trapping" saturation value, "undefined" could be a
> > > "not-a-thing".  But I didn't think much
> > > about this.
> > >
> > > That said, I would like to see the bigger picture to be kept in mind
> > > before altering the GIMPLE IL.
> > >
> > > Adding an internal function for an already present optab is a
> > > no-brainer.  Adding a vectorizer
> > > and/or if-conversion pattern to make use of this during vectorization
> > > is existing practice.
> > > Adding pattern recognition to ISEL or widening-mul passes for
> > > instructions the CPU can do
> > > is existing practice and OK.
> > >
> > > Thanks,
> > > Richard.
> > >
> > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > > Pan
> > > > >
> > > > > -----Original Message-----
> > > > > From: Tamar Christina <Tamar.Christina@arm.com>
> > > > > Sent: Tuesday, February 27, 2024 5:57 PM
> > > > > To: Richard Biener <richard.guenther@gmail.com>
> > > > > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > > > Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com;
> > > > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > > > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > > > US_PLUS
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Richard Biener <richard.guenther@gmail.com>
> > > > > > Sent: Tuesday, February 27, 2024 9:44 AM
> > > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > > Cc: pan2.li@intel.com; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > > > > yanzhang.wang@intel.com; kito.cheng@gmail.com;
> > > > > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > > > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > > > > US_PLUS
> > > > > >
> > > > > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> > > > > > <Tamar.Christina@arm.com> wrote:
> > > > > > >
> > > > > > > Hi Pan,
> > > > > > >
> > > > > > > > From: Pan Li <pan2.li@intel.com>
> > > > > > > >
> > > > > > > > Hi Richard & Tamar,
> > > > > > > >
> > > > > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > > > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > > > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > > > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > > > > > >
> > > > > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > > > > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > > > > > > >
> > > > > > >
> > > > > > > I think a couple of things are being confused here.  So lets break it down:
> > > > > > >
> > > > > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > > > > > we only want one internal function for both signed and unsigned SAT_ADD.
> > > > > > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > > > > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > > > > > >
> > > > > > > Now the downside of this is that this is a direct internal optab.  This means
> > > > > > > that for the representation to be used the target *must* have the optab
> > > > > > > implemented.   This is a bit annoying because it doesn't allow us to generically
> > > > > > > assume that all targets use SAT_ADD for saturating add and thus only have to
> > > > > > > write optimization for this representation.
> > > > > > >
> > > > > > > This is why Richi said we may need to use a new tree_code because we can
> > > > > > > override tree code expansions.  However the same can be done with the
> > > > > _EXT_FN
> > > > > > > internal functions.
> > > > > > >
> > > > > > > So what I meant was that we want to have a combination of the two. i.e. a
> > > > > > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> > > > > >
> > > > > > Whether we want/need _EXT or only direct depends mainly on how we want to
> > > > > > leverage support.  If it's only during vectorization and possibly instruction
> > > > > > selection a direct optab is IMO the way to go.  Generic optimization only
> > > > > > marginally improves when you explode the number of basic operations you
> > > > > > expose - in fact it gets quite unwieldly to support all of them in
> > > > > > simplifications
> > > > > > and/or canonicalization and you possibly need to translate them back to what
> > > > > > the target CPU supports.
> > > > > >
> > > > > > We already do have too many (IMO) "special" operations exposed "early"
> > > > > > in the GIMPLE pipeline.
> > > > > >
> > > > > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > > > passes doing what I'd call instruction selection).  But that means not adding
> > > > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > > > machinery for that.
> > > > > >
> > > > > > So as a start I would go for a direct optab and see to recognize it during
> > > > > > ISEL?
> > > > > >
> > > > >
> > > > > The problem with ISEL and the reason I suggested an indirect IFN is that there
> > > > > Are benefit to be had from recognizing it early.  Saturating arithmetic can be
> > > > > optimized
> > > > > Differently from non-saturating ones.
> > > > >
> > > > > But additionally a common way of specifying them decomposes to branches
> > > > > and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).
> > > > >
> > > > > These two representation can be detected in PHI-opts and it's beneficial to all
> > > > > targets to canonicalize them to the branchless code.
> > > > >
> > > > > Those two cases also *completely* stop vectorization because of either the
> > > > > control flow or the fact the vectorizer can't handle complex types.
> > > > >
> > > > > So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
> > > > > We'd still need to hack the vectorizer cost models for targets with saturating
> > > > > vector instructions.
> > > > >
> > > > > I of course defer to you, but it seems quite suboptimal to do it this way and
> > > > > doesn't get us first class saturation support.
> > > > >
> > > > > Additionally there have been discussions whether both clang and gcc should
> > > > > provide __builtin_saturate_* methods, which the non-direct IFN would help
> > > > > support.
> > > > >
> > > > > Tamar.
> > > > >
> > > > > > > If Richi agrees, the below is what I meant. It creates the infrastructure for this
> > > > > > > and for now only allows a default fallback for unsigned saturating add and
> > > > > makes
> > > > > > > it easier for us to add the rest later
> > > > > > >
> > > > > > > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
> > > > > > the
> > > > > > > RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> > > > > > > need to make new us_plus and ss_plus ones.
> > > > > > >
> > > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > > > index a07f25f3aee..aaf9f8991b3 100644
> > > > > > > --- a/gcc/internal-fn.cc
> > > > > > > +++ b/gcc/internal-fn.cc
> > > > > > > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> > > > > > tree_pair types,
> > > > > > >         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > > > >                                                   opt_type);            \
> > > > > > >        }
> > > > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > > > SIGNED_OPTAB, \
> > > > > > > +                                        UNSIGNED_OPTAB, TYPE)          \
> > > > > > > +    case IFN_##CODE:                                                   \
> > > > > > > +      {                                                                        \
> > > > > > > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> > > > > > > +                            ? UNSIGNED_OPTAB ## _optab                 \
> > > > > > > +                            : SIGNED_OPTAB ## _optab);                 \
> > > > > > > +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > > > > +                                                 opt_type)             \
> > > > > > > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> > > > > > > +      }
> > > > > > >  #include "internal-fn.def"
> > > > > > >
> > > > > > >      case IFN_LAST:
> > > > > > > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> > > > > > >      optab which_optab = direct_internal_fn_optab (fn, types);          \
> > > > > > >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
> > > > > > >    }
> > > > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > > > SIGNED_OPTAB, \
> > > > > > > +                                        UNSIGNED_OPTAB, TYPE)
> > > > > > >  #include "internal-fn.def"
> > > > > > >
> > > > > > >  /* Routines to expand each internal function, indexed by function number.
> > > > > > > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > > > >        emit_move_insn (plhs, cmp);
> > > > > > >      }
> > > > > > >  }
> > > > > > > +
> > > > > > > +void
> > > > > > > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > > > > > > +{
> > > > > > > +  /* Check if the target supports the expansion through an IFN.  */
> > > > > > > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > > > > > > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > > > > > > +  if (direct_binary_optab_supported_p (which_optab, types,
> > > > > > > +                                      insn_optimization_type ()))
> > > > > > > +    {
> > > > > > > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > > > > > > +      return;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  /* Target does not support the optab, but we can de-compose it.  */
> > > > > > > +  /*
> > > > > > > +  ... decompose to a canonical representation ...
> > > > > > > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > > > > > > +    {
> > > > > > > +      ...
> > > > > > > +      decompose back to (X + Y) | - ((X + Y) < X)
> > > > > > > +    }
> > > > > > > +  else
> > > > > > > +    {
> > > > > > > +      ...
> > > > > > > +    }
> > > > > > > +  */
> > > > > > > +}
> > > > > > > +
> > > > > > > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> > > > > > optype */)
> > > > > > > +{
> > > > > > > +  /* For now, don't support decomposing vector ops.  */
> > > > > > > +  if (VECTOR_TYPE_P (type))
> > > > > > > +    return false;
> > > > > > > +
> > > > > > > +  /* Signed saturating arithmetic is harder to do since we'll so for now
> > > > > > > +     lets ignore.  */
> > > > > > > +  if (!TYPE_UNSIGNED (type))
> > > > > > > +    return false;
> > > > > > > +
> > > > > > > +  return TREE_CODE (type) == INTEGER_TYPE;
> > > > > > > +}
> > > > > > > \ No newline at end of file
> > > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > > > index c14d30365c1..5a2491228d5 100644
> > > > > > > --- a/gcc/internal-fn.def
> > > > > > > +++ b/gcc/internal-fn.def
> > > > > > > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> > > > > > >     unsigned inputs respectively, both without the trailing "_optab".
> > > > > > >     SELECTOR says which type in the tree_pair determines the signedness.
> > > > > > >
> > > > > > > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> > > > > > DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > > > > > > +   that it has expand_##NAME defined in internal-fn.cc to override the
> > > > > > > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > > > > > > +
> > > > > > >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
> > > > > > >     the function implements the computational part of a built-in math
> > > > > > >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > > > > > > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
> > > > > > >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> > > > > > >  #endif
> > > > > > >
> > > > > > > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> > > > > > SIGNED_OPTAB, \
> > > > > > > +                                    UNSIGNED_OPTAB, TYPE) \
> > > > > > > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> > > > > > SIGNED_OPTAB, \
> > > > > > > +                               UNSIGNED_OPTAB, TYPE)
> > > > > > > +#endif
> > > > > > > +
> > > > > > >  #ifndef DEF_INTERNAL_FLT_FN
> > > > > > >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> > > > > > >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > > > > > > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> > > > > > ECF_CONST | ECF_NOTHROW, first,
> > > > > > >                               smulhs, umulhs, binary)
> > > > > > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> > > > > > first,
> > > > > > >                               smulhrs, umulhrs, binary)
> > > > > > > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> > > > > > ECF_NOTHROW, first,
> > > > > > > +                                 ssadd, usadd, binary)
> > > > > > >
> > > > > > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > > > > > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > > > > > > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> > > > > > ECF_LEAF, ". R . ")
> > > > > > >  #undef DEF_INTERNAL_FLT_FN
> > > > > > >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> > > > > > >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > > > > > > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > > > >  #undef DEF_INTERNAL_OPTAB_FN
> > > > > > >  #undef DEF_INTERNAL_FN
> > > > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > > > index bccee1c3e09..dbdb1e6bad2 100644
> > > > > > > --- a/gcc/internal-fn.h
> > > > > > > +++ b/gcc/internal-fn.h
> > > > > > > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> > > > > > *);
> > > > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > > > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > > > > > > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
> > > > > > >
> > > > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > > > >
> > > > > > > > Note this patch is a draft for validation, no test are invovled here.
> > > > > > > >
> > > > > > > > gcc/ChangeLog:
> > > > > > > >
> > > > > > > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > > > > > > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > > > > > > >       (BUILT_IN_US_PLUSL): Ditto.
> > > > > > > >       (BUILT_IN_US_PLUSLL): Ditto.
> > > > > > > >       (BUILT_IN_US_PLUSG): Ditto.
> > > > > > > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > > > > > > >       func decl for expanding us_plus.
> > > > > > > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > > > > > > >       impl for expanding us_plus.
> > > > > > > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > > > > > > >       us_plus<mode>3.
> > > > > > > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > > > > > > >       US_PLUS.
> > > > > > > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > > > > > > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > > > > > > >       * match.pd: Add new simplify pattern for us_plus.
> > > > > > > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > > > > > > >
> > > > > > > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > > > > > > ---
> > > > > > > >  gcc/builtins.def                |  7 +++++
> > > > > > > >  gcc/config/riscv/riscv-protos.h |  1 +
> > > > > > > >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> > > > > > > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > > > > > > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > > > > > > >  gcc/internal-fn.def             |  3 +++
> > > > > > > >  gcc/internal-fn.h               |  1 +
> > > > > > > >  gcc/match.pd                    | 17 ++++++++++++
> > > > > > > >  gcc/optabs.def                  |  2 ++
> > > > > > > >  9 files changed, 114 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > > > > > > index f6f3e104f6a..0777b912cfa 100644
> > > > > > > > --- a/gcc/builtins.def
> > > > > > > > +++ b/gcc/builtins.def
> > > > > > > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> > > > > > (BUILT_IN_POPCOUNTIMAX,
> > > > > > > > "popcountimax", BT_FN_INT_UINTMAX
> > > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> > > > > > BT_FN_INT_ULONG,
> > > > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> > > > > > BT_FN_INT_VAR,
> > > > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > > > +
> > > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > > > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > > > > > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl",
> > > > > BT_FN_INT_ULONG,
> > > > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > > > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > > > +
> > > > > > > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > > > > > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > > > > > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > > > > > > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > > > > > > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > > > > > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > > > > > > index 80efdf2b7e5..ba6086f1f25 100644
> > > > > > > > --- a/gcc/config/riscv/riscv-protos.h
> > > > > > > > +++ b/gcc/config/riscv/riscv-protos.h
> > > > > > > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const
> > > > > > tree,
> > > > > > > > const char *);
> > > > > > > >  extern bool
> > > > > > > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > > > > > > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > > > > > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > > > > > > >
> > > > > > > >  #ifdef RTX_CODE
> > > > > > > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool
> > > > > *invert_ptr
> > > > > > =
> > > > > > > > 0);
> > > > > > > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > > > > > > index 4100abc9dd1..23f08974f07 100644
> > > > > > > > --- a/gcc/config/riscv/riscv.cc
> > > > > > > > +++ b/gcc/config/riscv/riscv.cc
> > > > > > > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > > > > > > (machine_mode)
> > > > > > > >    return true;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > > > > > > +void
> > > > > > > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > > > > > > +{
> > > > > > > > +  machine_mode mode = GET_MODE (dest);
> > > > > > > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > > > > > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > > > > > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > > > > > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > > > > > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > > > > > > +
> > > > > > > > +  /* Step-1: sum = x + y  */
> > > > > > > > +  if (mode == SImode && mode != Pmode)
> > > > > > > > +    { /* Take addw to avoid the sum truncate.  */
> > > > > > > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > > > > > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > > > > > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > > > > > > +    }
> > > > > > > > +  else
> > > > > > > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > > > > > > +
> > > > > > > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.
> > > > > */
> > > > > > > > +  if (mode == HImode || mode == QImode)
> > > > > > > > +    {
> > > > > > > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > > > > > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > > > > > > +
> > > > > > > > +      gcc_assert (shift_bits > 0);
> > > > > > > > +
> > > > > > > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> > > > > > (shift_bits));
> > > > > > > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > > > > > > (shift_bits));
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +  /* Step-2: lt = sum < x  */
> > > > > > > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > > > > > > +
> > > > > > > > +  /* Step-3: lt = -lt  */
> > > > > > > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > > > > > > +
> > > > > > > > +  /* Step-4: pmode_dest = sum | lt  */
> > > > > > > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > > > > > > +
> > > > > > > > +  /* Step-5: dest = pmode_dest */
> > > > > > > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /* Initialize the GCC target structure.  */
> > > > > > > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > > > > > > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > > > > > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > > > > > > index 3f7a023d941..eaa9867023c 100644
> > > > > > > > --- a/gcc/config/riscv/riscv.md
> > > > > > > > +++ b/gcc/config/riscv/riscv.md
> > > > > > > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > > > > > > >    [(set_attr "type" "load")
> > > > > > > >     (set (attr "length") (const_int 8))])
> > > > > > > >
> > > > > > > > +(define_expand "us_plus<mode>3"
> > > > > > > > +  [(match_operand:ANYI   0 "register_operand")
> > > > > > > > +   (match_operand:ANYI   1 "register_operand")
> > > > > > > > +   (match_operand:ANYI   2 "register_operand")]
> > > > > > > > +  ""
> > > > > > > > +  {
> > > > > > > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > > > > > > +    DONE;
> > > > > > > > +  }
> > > > > > > > +)
> > > > > > > > +
> > > > > > > >  (include "bitmanip.md")
> > > > > > > >  (include "crypto.md")
> > > > > > > >  (include "sync.md")
> > > > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > > > > index a07f25f3aee..a7341a57ffa 100644
> > > > > > > > --- a/gcc/internal-fn.cc
> > > > > > > > +++ b/gcc/internal-fn.cc
> > > > > > > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > > > > >        emit_move_insn (plhs, cmp);
> > > > > > > >      }
> > > > > > > >  }
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > > > > > > +{
> > > > > > > > +  tree lhs = gimple_call_lhs (stmt);
> > > > > > > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > > > > > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > > > > > > +
> > > > > > > > +  do_pending_stack_adjust ();
> > > > > > > > +
> > > > > > > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > > > > > > +  rtx op_0 = expand_normal (rhs_0);
> > > > > > > > +  rtx op_1 = expand_normal (rhs_1);
> > > > > > > > +
> > > > > > > > +  class expand_operand ops[3];
> > > > > > > > +
> > > > > > > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > > > > > > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE
> > > > > (rhs_0)));
> > > > > > > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE
> > > > > (rhs_1)));
> > > > > > > > +
> > > > > > > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > > > > > > > (rhs_0)));
> > > > > > > > +  expand_insn (code, 3, ops);
> > > > > > > > +
> > > > > > > > +  if (!rtx_equal_p (target, ops[0].value))
> > > > > > > > +    emit_move_insn (target, ops[0].value);
> > > > > > > > +}
> > > > > > >
> > > > > > > This can be simplified by calling expand_binary_optab_fn instead. See my
> > > > > > template
> > > > > > >
> > > > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > > > > index c14d30365c1..b1d7b5a0307 100644
> > > > > > > > --- a/gcc/internal-fn.def
> > > > > > > > +++ b/gcc/internal-fn.def
> > > > > > > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > > > > > > ECF_NOTHROW, ffs, unary)
> > > > > > > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity,
> > > > > unary)
> > > > > > > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > > > > > > popcount, unary)
> > > > > > > >
> > > > > > > > +/* Binary integer ops.  */
> > > > > > > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> > > > > > us_plus,
> > > > > > > > binary)
> > > > > > > > +
> > > > > > > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > > > > > > ECF_NOTHROW, NULL)
> > > > > > > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > > > > > > ECF_NOTHROW, NULL)
> > > > > > > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW,
> > > > > NULL)
> > > > > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > > > > index bccee1c3e09..46e404b4a49 100644
> > > > > > > > --- a/gcc/internal-fn.h
> > > > > > > > +++ b/gcc/internal-fn.h
> > > > > > > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn,
> > > > > gcall
> > > > > > *);
> > > > > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > > > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > > > > > > >
> > > > > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > > > > >
> > > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > > > > > index c5b6540f939..f45fd58ad23 100644
> > > > > > > > --- a/gcc/match.pd
> > > > > > > > +++ b/gcc/match.pd
> > > > > > > > @@ -10265,3 +10265,20 @@ and,
> > > > > > > >        }
> > > > > > > >        (if (full_perm_p)
> > > > > > > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > > > > > +
> > > > > > > > +#if GIMPLE
> > > > > > > > +
> > > > > > > > +/* Unsigned saturation add, aka:
> > > > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > > > > > > +(simplify
> > > > > > > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > > > > > > +   (if (optimize
> > > > > > > > +     && INTEGRAL_TYPE_P (type)
> > > > > > > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > > > > > > +     && types_match (type, TREE_TYPE (@0))
> > > > > > > > +     && types_match (type, TREE_TYPE (@1))
> > > > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > > > > OPTIMIZE_FOR_BOTH))
> > > > > > > > +   (IFN_US_PLUS @0 @1)))
> > > > > > > > +
> > > > > > > > +#endif
> > > > > > >
> > > > > > > With the version above you can drop the #if GIMPLE and the
> > > > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > > >
> > > > > > > Check.
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Tamar
> > > > > > >
> > > > > > > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > > > > > > index ad14f9328b9..5855c4e0834 100644
> > > > > > > > --- a/gcc/optabs.def
> > > > > > > > +++ b/gcc/optabs.def
> > > > > > > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb",
> > > > > '2',
> > > > > > > > gen_int_libfunc)
> > > > > > > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > > > > > > gen_int_libfunc)
> > > > > > > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> > > > > > > >
> > > > > > > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > > > > > > gen_int_libfunc)
> > > > > > > > +
> > > > > > > >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> > > > > > > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> > > > > > gen_int_fp_fixed_libfunc)
> > > > > > > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > > > > > > --
> > > > > > > > 2.34.1
> > > > > > >
Li, Pan2 March 8, 2024, 1:01 a.m. UTC | #15
Thanks a lot for coaching, really save my day. I will have a try for usadd/ssadd includes both the scalar and vector (ISEL/widen_mult) modes in v3.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Thursday, March 7, 2024 4:41 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; jeffreyalaw@gmail.com
Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

On Thu, Mar 7, 2024 at 2:54 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Thanks Richard for comments.
>
> > gen_int_libfunc will no longer make it emit libcalls for fixed point
> > modes, so this can't be correct
> > and there's no libgcc implementation for integer mode saturating ops,
> > so it's pointless to emit calls
> > to them.
>
> Got the pointer here, the OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
> Is designed for the fixed point, cannot cover integer mode right now.

I think

OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3',
gen_unsigned_fixed_libfunc)

would work fine (just dropping the $Q).

> Given we have saturating integer alu like below, could you help to coach me the most reasonable way to represent
> It in scalar as well as vectorize part? Sorry not familiar with this part and still dig into how it works...

As in your v2, .SAT_ADD for both sat_uadd and sat_sadd, similar for
the other cases.

As I said, use vectorizer patterns and possibly do instruction
selection at ISEL/widen_mult time.

Richard.

> uint32_t sat_uadd (uint32_t a, uint32_t b)
> {
>   uint32_t add = a + b;
>   return add | -(add < a);
> }
>
> sint32_t sat_sadd (sint32_t a, sint32_t b)
> {
>   sint32_t add = a + b;
>   sint32_t x = a ^ b;
>   sint32_t y = add ^ x;
>   return x < 0 ? add : (y >= 0 ? add : INT32_MAX + (x < 0));
> }
>
> uint32_t sat_usub (uint32_t a, uint32_t b)
> {
>   return a >= b ? a - b : 0;
> }
>
> sint32_t sat_ssub (sint32_t a, sint32_t b)
> {
>   sint32_t sub = a - b;
>   sint32_t x = a ^ b;
>   sint32_t y = sub ^ x;
>   return x >= 0 ? sub : (y >= 0 ? sub : INT32_MAX + (x < 0));
> }
>
> uint32_t sat_umul (uint32_t a, uint32_t b)
> {
>   uint64_t mul = a * b;
>
>   return mul <= (uint64_t)UINT32_MAX ? (uint32_t)mul : UINT32_MAX;
> }
>
> sint32_t sat_smul (sint32_t a, sint32_t b)
> {
>   sint64_t mul = a * b;
>
>   return mul >= (sint64_t)INT32_MIN && mul <= (sint64_t)INT32_MAX ? (sint32_t)mul : INT32_MAX + ((x ^ y) < 0);
> }
>
> uint32_t sat_udiv (uint32_t a, uint32_t b)
> {
>   return a / b; // never overflow
> }
>
> sint32_t sat_sdiv (sint32_t a, sint32_t b)
> {
>   return a == INT32_MIN && b == -1 ? INT32_MAX : a / b;
> }
>
> sint32_t sat_abs (sint32_t a)
> {
>   return a >= 0 ? a : (a == INT32_MIN ? INT32_MAX : -a);
> }
>
> Pan
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Tuesday, March 5, 2024 4:41 PM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; jeffreyalaw@gmail.com
> Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
>
> On Tue, Mar 5, 2024 at 8:09 AM Li, Pan2 <pan2.li@intel.com> wrote:
> >
> > Thanks Richard for comments.
> >
> > > I do wonder what the existing usadd patterns with integer vector modes
> > > in various targets do?
> > > Those define_insn will at least not end up in the optab set I guess,
> > > so they must end up
> > > being either unused or used by explicit gen_* (via intrinsic
> > > functions?) or by combine?
> >
> > For usadd with vector modes, I think the backend like RISC-V try to leverage instructions
> > like Vector Single-Width Saturating Add(aka vsaddu.vv/x/i).
> >
> > > I think simply changing gen_*_fixed_libfunc to gen_int_libfunc won't
> > > work.  Since there's
> > > no libgcc support I'd leave it as gen_*_fixed_libfunc thus no library
> > > fallback for integers?
> >
> > Change to gen_int_libfunc follows other int optabs. I am not sure if it will hit the standard name usaddm3 for vector mode.
> > But the happy path for scalar modes works up to a point, please help to correct me if any misunderstanding.
>
> gen_int_libfunc will no longer make it emit libcalls for fixed point
> modes, so this can't be correct
> and there's no libgcc implementation for integer mode saturating ops,
> so it's pointless to emit calls
> to them.
>
> > #0  riscv_expand_usadd (dest=0x7ffff6a8c7c8, x=0x7ffff6a8c798, y=0x7ffff6a8c7b0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/config/riscv/riscv.cc:10662
> > #1  0x00000000029f142a in gen_usaddsi3 (operand0=0x7ffff6a8c7c8, operand1=0x7ffff6a8c798, operand2=0x7ffff6a8c7b0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/config/riscv/riscv.md:3848
> > #2  0x0000000001751e60 in insn_gen_fn::operator()<rtx_def*, rtx_def*, rtx_def*> (this=0x4910e70 <insn_data+1248976>) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/recog.h:441
> > #3  0x000000000180f553 in maybe_gen_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8232
> > #4  0x000000000180fa42 in maybe_expand_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8275
> > #5  0x000000000180fade in expand_insn (icode=CODE_FOR_usaddsi3, nops=3, ops=0x7fffffffd2c0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs.cc:8306
> > #6  0x00000000015cebdc in expand_fn_using_insn (stmt=0x7ffff6a36480, icode=CODE_FOR_usaddsi3, noutputs=1, ninputs=2) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:254
> > #7  0x00000000015de146 in expand_direct_optab_fn (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480, optab=usadd_optab, nargs=2) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:3818
> > #8  0x00000000015e3610 in expand_SAT_ADD (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.def:278
> > #9  0x00000000015e65b6 in expand_internal_call (fn=IFN_SAT_ADD, stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:4914
> > #10 0x00000000015e65e5 in expand_internal_call (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/internal-fn.cc:4922
> > #11 0x0000000001248c8f in expand_call_stmt (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:2771
> > #12 0x000000000124d392 in expand_gimple_stmt_1 (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:3932
> > #13 0x000000000124d9aa in expand_gimple_stmt (stmt=0x7ffff6a36480) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:4077
> > #14 0x000000000124dac4 in expand_gimple_tailcall (bb=0x7ffff6dddae0, stmt=0x7ffff6a36480, can_fallthru=0x7fffffffd800) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:4123
> > #15 0x000000000125636b in expand_gimple_basic_block (bb=0x7ffff6dddae0, disable_tail_calls=false) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:6107
> > #16 0x0000000001258a1a in (anonymous namespace)::pass_expand::execute (this=0x556d180, fun=0x7ffff6a7f2e0) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cfgexpand.cc:6872
> > #17 0x0000000001873565 in execute_one_pass (pass=0x556d180) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2646
> > #18 0x0000000001873948 in execute_pass_list_1 (pass=0x556d180) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2755
> > #19 0x00000000018739d6 in execute_pass_list (fn=0x7ffff6a7f2e0, pass=0x5568870) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/passes.cc:2766
> > #20 0x00000000012bc975 in cgraph_node::expand (this=0x7ffff6c2c880) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:1845
> > #21 0x00000000012bd18f in expand_all_functions () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2028
> > #22 0x00000000012bdcc5 in symbol_table::compile (this=0x7ffff6c06000) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2402
> > #23 0x00000000012be16c in symbol_table::finalize_compilation_unit (this=0x7ffff6c06000) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/cgraphunit.cc:2587
> > #24 0x0000000001a048a3 in compile_file () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:476
> > #25 0x0000000001a079e9 in do_compile () at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:2154
> > #26 0x0000000001a07dee in toplev::main (this=0x7fffffffdcf2, argc=19, argv=0x7fffffffde28) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/toplev.cc:2310
> > #27 0x0000000003ebcc5d in main (argc=19, argv=0x7fffffffde28) at /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/main.cc:39
> >
> > BTW, does match.pd support nested cond like below? I am debugging into gimple_simplify_COND_EXPR for why not hit the pattern...
> > +(simplify
> > +  (cond
> > +    (lt @0 integer_zerop)
> > +    (plus:c @0 @1)
> > +    (cond (lt @1 integer_zerop) @1 @0))
> > +  (IFN_SAT_ADD @0 @1))
> >
> > Pan
> >
> > -----Original Message-----
> > From: Richard Biener <richard.guenther@gmail.com>
> > Sent: Monday, March 4, 2024 6:31 PM
> > To: Li, Pan2 <pan2.li@intel.com>
> > Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; jeffreyalaw@gmail.com
> > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
> >
> > On Sat, Mar 2, 2024 at 8:46 AM Li, Pan2 <pan2.li@intel.com> wrote:
> > >
> > > Hi Richard and Tamar,
> > >
> > > I have a try with DEF_INTERNAL_SIGNED_OPTAB_FN for SAT_ADD/SUB/MUL but meet some problem when match.pd.
> > >
> > > For unsigned SAT_ADD = (x + y) | - ((x + y) < x), the match.pd can be (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0)))).
> > > For unsigned SAT_SUB = x >= y ? x - y : 0, and then match.pd can be (cond (ge @0 @1) (minus @0 @1) integer_zerop).
> > >
> > > For signed SAT_ADD/SAT_SUB as below, seems not easy to make the simplify pattern works well as expected up to a point.
> > > sint64_t sat_add (sint64_t x, sint64_t y)
> > > {
> > >   sint64_t a = x ^ y;
> > >   sint64_t add = x + y;
> > >   sint64_t b = sum ^ x;
> > >
> > >   return (a < 0 || (a >= 0 && b >= 0)) ? add : (MAX_INT64 + (x < 0));
> > > }
> > >
> > > sint64_t sad_sub (sint64_t x, sint64_t y)
> > > {
> > >   sint64_t a = x ^ y;
> > >   sint64_t sub = x - y;
> > >   sint64_t b = sub ^ x;
> > >
> > >   return (a >= 0 || (a < 0 && b >= 0) ? sub : (MAX_INT64 + (x < 0));
> > > }
> > >
> > > For SAT_MUL as below, looks we may need widen type. I am not sure if we can leverage MUL_OVERFLOW or not in match.pd.
> > >
> > > uint32_t sat_mul (uint32_t x, uint32_t y)
> > > {
> > >   uint64_t mul = (uint64_t)x * (uint64_t)y;
> > >   return mul > UINT32_MAX ? UINT32_MAX : (uint32_t)mul;
> > > }
> > >
> > > sint32_t sat_mul (sint32_t x, sint32_t y)
> > > {
> > >   sint64_t mul = (sint64_t)x * (sint64_t))y;
> > >
> > >   return mul <= MAX_INT32 && mul >= MIN_INT32 ? mul : MAX_INT32 + (x ^ y) > 0;
> > > }
> > >
> > > Below diff only contains unsigned SAT_ADD and SAT_SUB for prototype validation.
> > > I will continue to try the rest part in match.pd and keep you posted.
> > >
> > > -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
> > >
> > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > index 80efdf2b7e5..d9ad6fe2b58 100644
> > > --- a/gcc/config/riscv/riscv-protos.h
> > > +++ b/gcc/config/riscv/riscv-protos.h
> > > @@ -132,6 +132,9 @@ extern void riscv_asm_output_external (FILE *, const tree, const char *);
> > >  extern bool
> > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > +extern void riscv_expand_usadd (rtx, rtx, rtx);
> > > +extern void riscv_expand_ussub (rtx, rtx, rtx);
> > >
> > >  #ifdef RTX_CODE
> > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0);
> > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > index 5e984ee2a55..795462526df 100644
> > > --- a/gcc/config/riscv/riscv.cc
> > > +++ b/gcc/config/riscv/riscv.cc
> > > @@ -10655,6 +10655,28 @@ riscv_vector_mode_supported_any_target_p (machine_mode)
> > >    return true;
> > >  }
> > >
> > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > +void
> > > +riscv_expand_usadd (rtx dest, rtx x, rtx y)
> > > +{
> > > +  fprintf (stdout, "Hit riscv_expand_usadd.\n");
> > > +  // ToDo
> > > +}
> > > +
> > > +void
> > > +riscv_expand_ussub (rtx dest, rtx x, rtx y)
> > > +{
> > > +  fprintf (stdout, "Hit riscv_expand_ussub.\n");
> > > +  // ToDo
> > > +}
> > > +
> > >  /* Initialize the GCC target structure.  */
> > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > index 1fec13092e2..e2dbadb3ead 100644
> > > --- a/gcc/config/riscv/riscv.md
> > > +++ b/gcc/config/riscv/riscv.md
> > > @@ -3839,6 +3839,39 @@ (define_insn "*large_load_address"
> > >    [(set_attr "type" "load")
> > >     (set (attr "length") (const_int 8))])
> > >
> > > +(define_expand "usadd<mode>3"
> > > +  [(match_operand:ANYI 0 "register_operand")
> > > +   (match_operand:ANYI 1 "register_operand")
> > > +   (match_operand:ANYI 2 "register_operand")]
> > > +  ""
> > > +  {
> > > +    riscv_expand_usadd (operands[0], operands[1], operands[2]);
> > > +    DONE;
> > > +  }
> > > +)
> > > +
> > > +(define_expand "ussub<mode>3"
> > > +  [(match_operand:ANYI 0 "register_operand")
> > > +   (match_operand:ANYI 1 "register_operand")
> > > +   (match_operand:ANYI 2 "register_operand")]
> > > +  ""
> > > +  {
> > > +    riscv_expand_ussub (operands[0], operands[1], operands[2]);
> > > +    DONE;
> > > +  }
> > > +)
> > > +
> > >  (include "bitmanip.md")
> > >  (include "crypto.md")
> > >  (include "sync.md")
> > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > index 848bb9dbff3..0fff19c875f 100644
> > > --- a/gcc/internal-fn.def
> > > +++ b/gcc/internal-fn.def
> > > @@ -275,6 +275,13 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
> > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
> > >                               smulhrs, umulhrs, binary)
> > >
> > > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first,
> > > +                             ssadd, usadd, binary)
> > > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_SUB, ECF_CONST | ECF_NOTHROW, first,
> > > +                             sssub, ussub, binary)
> > > +
> > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > >  DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index f3fffd8dec2..6592dea643a 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -10276,3 +10276,32 @@ and,
> > >        }
> > >        (if (full_perm_p)
> > >         (vec_perm (op@3 @0 @1) @3 @2))))))
> > > +
> > > +#if GIMPLE
> > > +
> > > +/* Unsigned saturation add, aka:
> > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > +(simplify
> > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > +   (if (optimize
> > > +       && INTEGRAL_TYPE_P (type)
> > > +       && types_match (type, TREE_TYPE (@0))
> > > +       && types_match (type, TREE_TYPE (@1))
> > > +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > +       && direct_internal_fn_supported_p (IFN_SAT_ADD, type, OPTIMIZE_FOR_BOTH))
> > > +   (IFN_SAT_ADD @0 @1)))
> > > +
> > > +/* Unsigned saturation sub , aka
> > > +   SAT_SUBU = x >= y ? x - y : 0.  */
> > > +(simplify
> > > +  (cond (ge @0 @1) (minus @0 @1) integer_zerop)
> > > +    (if (optimize
> > > +       && INTEGRAL_TYPE_P (type)
> > > +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > +       && types_match (type, TREE_TYPE (@0))
> > > +       && types_match (type, TREE_TYPE (@1))
> > > +       && direct_internal_fn_supported_p (IFN_SAT_SUB, type, OPTIMIZE_FOR_BOTH))
> > > +    (IFN_SAT_SUB @0 @1)))
> > > +
> > > +#endif
> > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > index ad14f9328b9..bebe38c888b 100644
> > > --- a/gcc/optabs.def
> > > +++ b/gcc/optabs.def
> > > @@ -111,15 +111,15 @@ OPTAB_NX(add_optab, "add$F$a3")
> > >  OPTAB_NX(add_optab, "add$Q$a3")
> > >  OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc)
> > >  OPTAB_VX(addv_optab, "add$F$a3")
> > > -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc)
> > > -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
> > > +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_int_libfunc)
> > > +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_int_libfunc)
> > >  OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc)
> > >  OPTAB_NX(sub_optab, "sub$F$a3")
> > >  OPTAB_NX(sub_optab, "sub$Q$a3")
> > >  OPTAB_VL(subv_optab, "subv$I$a3", MINUS, "sub", '3', gen_intv_fp_libfunc)
> > >  OPTAB_VX(subv_optab, "sub$F$a3")
> > > -OPTAB_NL(sssub_optab, "sssub$Q$a3", SS_MINUS, "sssub", '3', gen_signed_fixed_libfunc)
> > > -OPTAB_NL(ussub_optab, "ussub$Q$a3", US_MINUS, "ussub", '3', gen_unsigned_fixed_libfunc)
> > > +OPTAB_NL(sssub_optab, "sssub$a3", SS_MINUS, "sssub", '3', gen_int_libfunc)
> > > +OPTAB_NL(ussub_optab, "ussub$a3", US_MINUS, "ussub", '3', gen_int_libfunc)
> >
> > I do wonder what the existing usadd patterns with integer vector modes
> > in various targets do?
> > Those define_insn will at least not end up in the optab set I guess,
> > so they must end up
> > being either unused or used by explicit gen_* (via intrinsic
> > functions?) or by combine?
> >
> > I think simply changing gen_*_fixed_libfunc to gen_int_libfunc won't
> > work.  Since there's
> > no libgcc support I'd leave it as gen_*_fixed_libfunc thus no library
> > fallback for integers?
> >
> > >  OPTAB_NL(smul_optab, "mul$Q$a3", MULT, "mul", '3', gen_int_fp_fixed_libfunc)
> > >  OPTAB_NX(smul_optab, "mul$P$a3")
> > >  OPTAB_NX(smul_optab, "mul$F$a3")
> > >
> > > Pan
> > >
> > > -----Original Message-----
> > > From: Li, Pan2 <pan2.li@intel.com>
> > > Sent: Tuesday, February 27, 2024 10:36 PM
> > > To: Richard Biener <richard.guenther@gmail.com>; Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
> > >
> > > Thanks Richard and Tammer for moving this forward.
> > >
> > > > That said, I would like to see the bigger picture to be kept in mind
> > > > before altering the GIMPLE IL.
> > >
> > > > Adding an internal function for an already present optab is a
> > > > no-brainer.  Adding a vectorizer
> > > > and/or if-conversion pattern to make use of this during vectorization
> > > > is existing practice.
> > > > Adding pattern recognition to ISEL or widening-mul passes for
> > > > instructions the CPU can do
> > > > is existing practice and OK.
> > >
> > > Thanks for explaining, got the point here.
> > >
> > > > So I'd suggest writing some example of both signed and unsigned saturating add and multiply
> > >
> > > > Because signed addition, will likely require a branch and signed multiplication would require a
> > > > larger type.
> > >
> > > Ack, will prepare one prototype validation patch for add, sub and mul (both unsigned and signed) soon.
> > >
> > > Pan
> > >
> > > -----Original Message-----
> > > From: Richard Biener <richard.guenther@gmail.com>
> > > Sent: Tuesday, February 27, 2024 9:42 PM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com; richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
> > >
> > > On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
> > > >
> > > > > Thanks Tamar.
> > > > >
> > > > > > Those two cases also *completely* stop vectorization because of either the
> > > > > > control flow or the fact the vectorizer can't handle complex types.
> > > > >
> > > > > Yes, we eventually would like to vectorize the SAT ALU but we start with scalar part
> > > > > first.
> > > > > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> > > > > well with some additions as below.
> > > > > Feel free to correct me if any misunderstandings.
> > > > >
> > > > > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> > > > > usadd$a3(as well as gen_int_libfunc) for int.
> > > > > 2. We need to implement a default implementation of SAT_ADD if
> > > > > direct_binary_optab_supported_p is false.
> > > > >     It looks like the default implementation is difficult to make every backend happy.
> > > > > That is why you suggest just normal
> > > > >     DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> > > > >
> > > > > Thanks Richard.
> > > > >
> > > > > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > > > passes doing what I'd call instruction selection).  But that means not adding
> > > > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > > > machinery for that.
> > > > >
> > > > > > So as a start I would go for a direct optab and see to recognize it during
> > > > > > ISEL?
> > > > >
> > > > > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> > > > > to know isel and I am happy to
> > > > > try that once we have conclusion.
> > > > >
> > > >
> > > > So after a lively discussion on IRC, the conclusion is that before we proceed Richi would
> > > > like to see some examples of various operations.  The problem is that unsigned saturating
> > > > addition is the simplest example and it may lead to an implementation strategy that doesn't
> > > > scale.
> > > >
> > > > So I'd suggest writing some example of both signed and unsigned saturating add and multiply
> > > >
> > > > Because signed addition, will likely require a branch and signed multiplication would require a
> > > > larger type.
> > > >
> > > > This would allow us to better understand what kind of gimple would have to to deal with in
> > > > ISEL and VECT if we decide not to lower early.
> > >
> > > More specifically before making something like .SAT_ADD a core part of
> > > GIMPLE I'd like
> > > to point out that we have saturating PLUS_EXPR but just for
> > > fixed-point types.  I realize
> > > Joseph thinks that keying this on the type was wrong and it should
> > > have used integer
> > > types and special saturating operations.  Still having both,
> > > type-keyed saturating PLUS_EXPR
> > > and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess.
> > >
> > > It might be that the way to go is to turn all existing saturating type
> > > *_EXPR into
> > > .SAT_* internal function calls, in the end mapping to the optabs and
> > > eventual RTX codes.
> > >
> > > That could work for both integer types and fixed-point types.
> > >
> > > I'll also note that "saturating" is just another variant of overflow
> > > behavior of which we have
> > > trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and
> > > also (kind-of) sanitized.
> > > We do lack direct IL representation of -ftrapv and -fwrapv, the
> > > semantics on a PLUS_EXPR
> > > depend on per-function flags.  Eventually a common representation
> > > could be found here.
> > > For saturating I was thinking of .ADD_OVERFLOW (a, b,
> > > saturation-value), a "trap" could
> > > be a "trapping" saturation value, "undefined" could be a
> > > "not-a-thing".  But I didn't think much
> > > about this.
> > >
> > > That said, I would like to see the bigger picture to be kept in mind
> > > before altering the GIMPLE IL.
> > >
> > > Adding an internal function for an already present optab is a
> > > no-brainer.  Adding a vectorizer
> > > and/or if-conversion pattern to make use of this during vectorization
> > > is existing practice.
> > > Adding pattern recognition to ISEL or widening-mul passes for
> > > instructions the CPU can do
> > > is existing practice and OK.
> > >
> > > Thanks,
> > > Richard.
> > >
> > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > > Pan
> > > > >
> > > > > -----Original Message-----
> > > > > From: Tamar Christina <Tamar.Christina@arm.com>
> > > > > Sent: Tuesday, February 27, 2024 5:57 PM
> > > > > To: Richard Biener <richard.guenther@gmail.com>
> > > > > Cc: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > > > Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng@gmail.com;
> > > > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > > > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > > > US_PLUS
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Richard Biener <richard.guenther@gmail.com>
> > > > > > Sent: Tuesday, February 27, 2024 9:44 AM
> > > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > > Cc: pan2.li@intel.com; gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai;
> > > > > > yanzhang.wang@intel.com; kito.cheng@gmail.com;
> > > > > > richard.sandiford@arm.com2; jeffreyalaw@gmail.com
> > > > > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > > > > > US_PLUS
> > > > > >
> > > > > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> > > > > > <Tamar.Christina@arm.com> wrote:
> > > > > > >
> > > > > > > Hi Pan,
> > > > > > >
> > > > > > > > From: Pan Li <pan2.li@intel.com>
> > > > > > > >
> > > > > > > > Hi Richard & Tamar,
> > > > > > > >
> > > > > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > > > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > > > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > > > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > > > > > >
> > > > > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > > > > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > > > > > > >
> > > > > > >
> > > > > > > I think a couple of things are being confused here.  So lets break it down:
> > > > > > >
> > > > > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > > > > > we only want one internal function for both signed and unsigned SAT_ADD.
> > > > > > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > > > > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > > > > > >
> > > > > > > Now the downside of this is that this is a direct internal optab.  This means
> > > > > > > that for the representation to be used the target *must* have the optab
> > > > > > > implemented.   This is a bit annoying because it doesn't allow us to generically
> > > > > > > assume that all targets use SAT_ADD for saturating add and thus only have to
> > > > > > > write optimization for this representation.
> > > > > > >
> > > > > > > This is why Richi said we may need to use a new tree_code because we can
> > > > > > > override tree code expansions.  However the same can be done with the
> > > > > _EXT_FN
> > > > > > > internal functions.
> > > > > > >
> > > > > > > So what I meant was that we want to have a combination of the two. i.e. a
> > > > > > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> > > > > >
> > > > > > Whether we want/need _EXT or only direct depends mainly on how we want to
> > > > > > leverage support.  If it's only during vectorization and possibly instruction
> > > > > > selection a direct optab is IMO the way to go.  Generic optimization only
> > > > > > marginally improves when you explode the number of basic operations you
> > > > > > expose - in fact it gets quite unwieldly to support all of them in
> > > > > > simplifications
> > > > > > and/or canonicalization and you possibly need to translate them back to what
> > > > > > the target CPU supports.
> > > > > >
> > > > > > We already do have too many (IMO) "special" operations exposed "early"
> > > > > > in the GIMPLE pipeline.
> > > > > >
> > > > > > But what I'd like to see is that we do more instruction selection on GIMPLE
> > > > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > > > passes doing what I'd call instruction selection).  But that means not adding
> > > > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > > > machinery for that.
> > > > > >
> > > > > > So as a start I would go for a direct optab and see to recognize it during
> > > > > > ISEL?
> > > > > >
> > > > >
> > > > > The problem with ISEL and the reason I suggested an indirect IFN is that there
> > > > > Are benefit to be had from recognizing it early.  Saturating arithmetic can be
> > > > > optimized
> > > > > Differently from non-saturating ones.
> > > > >
> > > > > But additionally a common way of specifying them decomposes to branches
> > > > > and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).
> > > > >
> > > > > These two representation can be detected in PHI-opts and it's beneficial to all
> > > > > targets to canonicalize them to the branchless code.
> > > > >
> > > > > Those two cases also *completely* stop vectorization because of either the
> > > > > control flow or the fact the vectorizer can't handle complex types.
> > > > >
> > > > > So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
> > > > > We'd still need to hack the vectorizer cost models for targets with saturating
> > > > > vector instructions.
> > > > >
> > > > > I of course defer to you, but it seems quite suboptimal to do it this way and
> > > > > doesn't get us first class saturation support.
> > > > >
> > > > > Additionally there have been discussions whether both clang and gcc should
> > > > > provide __builtin_saturate_* methods, which the non-direct IFN would help
> > > > > support.
> > > > >
> > > > > Tamar.
> > > > >
> > > > > > > If Richi agrees, the below is what I meant. It creates the infrastructure for this
> > > > > > > and for now only allows a default fallback for unsigned saturating add and
> > > > > makes
> > > > > > > it easier for us to add the rest later
> > > > > > >
> > > > > > > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
> > > > > > the
> > > > > > > RTL expression, but the optab for saturation are ssadd and usadd.  So you don't
> > > > > > > need to make new us_plus and ss_plus ones.
> > > > > > >
> > > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > > > index a07f25f3aee..aaf9f8991b3 100644
> > > > > > > --- a/gcc/internal-fn.cc
> > > > > > > +++ b/gcc/internal-fn.cc
> > > > > > > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> > > > > > tree_pair types,
> > > > > > >         return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > > > >                                                   opt_type);            \
> > > > > > >        }
> > > > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > > > SIGNED_OPTAB, \
> > > > > > > +                                        UNSIGNED_OPTAB, TYPE)          \
> > > > > > > +    case IFN_##CODE:                                                   \
> > > > > > > +      {                                                                        \
> > > > > > > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)             \
> > > > > > > +                            ? UNSIGNED_OPTAB ## _optab                 \
> > > > > > > +                            : SIGNED_OPTAB ## _optab);                 \
> > > > > > > +       return direct_##TYPE##_optab_supported_p (which_optab, types,   \
> > > > > > > +                                                 opt_type)             \
> > > > > > > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
> > > > > > > +      }
> > > > > > >  #include "internal-fn.def"
> > > > > > >
> > > > > > >      case IFN_LAST:
> > > > > > > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> > > > > > >      optab which_optab = direct_internal_fn_optab (fn, types);          \
> > > > > > >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                  \
> > > > > > >    }
> > > > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > > > SIGNED_OPTAB, \
> > > > > > > +                                        UNSIGNED_OPTAB, TYPE)
> > > > > > >  #include "internal-fn.def"
> > > > > > >
> > > > > > >  /* Routines to expand each internal function, indexed by function number.
> > > > > > > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > > > >        emit_move_insn (plhs, cmp);
> > > > > > >      }
> > > > > > >  }
> > > > > > > +
> > > > > > > +void
> > > > > > > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > > > > > > +{
> > > > > > > +  /* Check if the target supports the expansion through an IFN.  */
> > > > > > > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > > > > > > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > > > > > > +  if (direct_binary_optab_supported_p (which_optab, types,
> > > > > > > +                                      insn_optimization_type ()))
> > > > > > > +    {
> > > > > > > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > > > > > > +      return;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  /* Target does not support the optab, but we can de-compose it.  */
> > > > > > > +  /*
> > > > > > > +  ... decompose to a canonical representation ...
> > > > > > > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > > > > > > +    {
> > > > > > > +      ...
> > > > > > > +      decompose back to (X + Y) | - ((X + Y) < X)
> > > > > > > +    }
> > > > > > > +  else
> > > > > > > +    {
> > > > > > > +      ...
> > > > > > > +    }
> > > > > > > +  */
> > > > > > > +}
> > > > > > > +
> > > > > > > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> > > > > > optype */)
> > > > > > > +{
> > > > > > > +  /* For now, don't support decomposing vector ops.  */
> > > > > > > +  if (VECTOR_TYPE_P (type))
> > > > > > > +    return false;
> > > > > > > +
> > > > > > > +  /* Signed saturating arithmetic is harder to do since we'll so for now
> > > > > > > +     lets ignore.  */
> > > > > > > +  if (!TYPE_UNSIGNED (type))
> > > > > > > +    return false;
> > > > > > > +
> > > > > > > +  return TREE_CODE (type) == INTEGER_TYPE;
> > > > > > > +}
> > > > > > > \ No newline at end of file
> > > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > > > index c14d30365c1..5a2491228d5 100644
> > > > > > > --- a/gcc/internal-fn.def
> > > > > > > +++ b/gcc/internal-fn.def
> > > > > > > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> > > > > > >     unsigned inputs respectively, both without the trailing "_optab".
> > > > > > >     SELECTOR says which type in the tree_pair determines the signedness.
> > > > > > >
> > > > > > > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> > > > > > DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > > > > > > +   that it has expand_##NAME defined in internal-fn.cc to override the
> > > > > > > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > > > > > > +
> > > > > > >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in addition,
> > > > > > >     the function implements the computational part of a built-in math
> > > > > > >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > > > > > > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not see
> > > > > > >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> > > > > > >  #endif
> > > > > > >
> > > > > > > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> > > > > > SIGNED_OPTAB, \
> > > > > > > +                                    UNSIGNED_OPTAB, TYPE) \
> > > > > > > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> > > > > > SIGNED_OPTAB, \
> > > > > > > +                               UNSIGNED_OPTAB, TYPE)
> > > > > > > +#endif
> > > > > > > +
> > > > > > >  #ifndef DEF_INTERNAL_FLT_FN
> > > > > > >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> > > > > > >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > > > > > > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> > > > > > ECF_CONST | ECF_NOTHROW, first,
> > > > > > >                               smulhs, umulhs, binary)
> > > > > > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> > > > > > first,
> > > > > > >                               smulhrs, umulhrs, binary)
> > > > > > > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> > > > > > ECF_NOTHROW, first,
> > > > > > > +                                 ssadd, usadd, binary)
> > > > > > >
> > > > > > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > > > > > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > > > > > > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> > > > > > ECF_LEAF, ". R . ")
> > > > > > >  #undef DEF_INTERNAL_FLT_FN
> > > > > > >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> > > > > > >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > > > > > > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > > > >  #undef DEF_INTERNAL_OPTAB_FN
> > > > > > >  #undef DEF_INTERNAL_FN
> > > > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > > > index bccee1c3e09..dbdb1e6bad2 100644
> > > > > > > --- a/gcc/internal-fn.h
> > > > > > > +++ b/gcc/internal-fn.h
> > > > > > > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, gcall
> > > > > > *);
> > > > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > > > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > > > > > > +extern bool internal_SAT_ADD_fn_supported_p (tree, optimization_type);
> > > > > > >
> > > > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > > > >
> > > > > > > > Note this patch is a draft for validation, no test are invovled here.
> > > > > > > >
> > > > > > > > gcc/ChangeLog:
> > > > > > > >
> > > > > > > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > > > > > > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > > > > > > >       (BUILT_IN_US_PLUSL): Ditto.
> > > > > > > >       (BUILT_IN_US_PLUSLL): Ditto.
> > > > > > > >       (BUILT_IN_US_PLUSG): Ditto.
> > > > > > > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > > > > > > >       func decl for expanding us_plus.
> > > > > > > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > > > > > > >       impl for expanding us_plus.
> > > > > > > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > > > > > > >       us_plus<mode>3.
> > > > > > > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > > > > > > >       US_PLUS.
> > > > > > > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > > > > > > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > > > > > > >       * match.pd: Add new simplify pattern for us_plus.
> > > > > > > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > > > > > > >
> > > > > > > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > > > > > > ---
> > > > > > > >  gcc/builtins.def                |  7 +++++
> > > > > > > >  gcc/config/riscv/riscv-protos.h |  1 +
> > > > > > > >  gcc/config/riscv/riscv.cc       | 46 +++++++++++++++++++++++++++++++++
> > > > > > > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > > > > > > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > > > > > > >  gcc/internal-fn.def             |  3 +++
> > > > > > > >  gcc/internal-fn.h               |  1 +
> > > > > > > >  gcc/match.pd                    | 17 ++++++++++++
> > > > > > > >  gcc/optabs.def                  |  2 ++
> > > > > > > >  9 files changed, 114 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > > > > > > index f6f3e104f6a..0777b912cfa 100644
> > > > > > > > --- a/gcc/builtins.def
> > > > > > > > +++ b/gcc/builtins.def
> > > > > > > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> > > > > > (BUILT_IN_POPCOUNTIMAX,
> > > > > > > > "popcountimax", BT_FN_INT_UINTMAX
> > > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> > > > > > BT_FN_INT_ULONG,
> > > > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> > > > > > BT_FN_INT_VAR,
> > > > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > > > +
> > > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT,
> > > > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > > > > > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl",
> > > > > BT_FN_INT_ULONG,
> > > > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR,
> > > > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > > > +
> > > > > > > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > > > > > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > > > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > > > > > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > > > > > > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE,
> > > > > > > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > > > > > > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > > > > > > > index 80efdf2b7e5..ba6086f1f25 100644
> > > > > > > > --- a/gcc/config/riscv/riscv-protos.h
> > > > > > > > +++ b/gcc/config/riscv/riscv-protos.h
> > > > > > > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, const
> > > > > > tree,
> > > > > > > > const char *);
> > > > > > > >  extern bool
> > > > > > > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > > > > > > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> > > > > > > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > > > > > > >
> > > > > > > >  #ifdef RTX_CODE
> > > > > > > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool
> > > > > *invert_ptr
> > > > > > =
> > > > > > > > 0);
> > > > > > > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > > > > > > index 4100abc9dd1..23f08974f07 100644
> > > > > > > > --- a/gcc/config/riscv/riscv.cc
> > > > > > > > +++ b/gcc/config/riscv/riscv.cc
> > > > > > > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > > > > > > (machine_mode)
> > > > > > > >    return true;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> > > > > > > > +void
> > > > > > > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > > > > > > +{
> > > > > > > > +  machine_mode mode = GET_MODE (dest);
> > > > > > > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > > > > > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > > > > > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > > > > > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > > > > > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > > > > > > +
> > > > > > > > +  /* Step-1: sum = x + y  */
> > > > > > > > +  if (mode == SImode && mode != Pmode)
> > > > > > > > +    { /* Take addw to avoid the sum truncate.  */
> > > > > > > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > > > > > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > > > > > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > > > > > > +    }
> > > > > > > > +  else
> > > > > > > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > > > > > > +
> > > > > > > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.
> > > > > */
> > > > > > > > +  if (mode == HImode || mode == QImode)
> > > > > > > > +    {
> > > > > > > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > > > > > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > > > > > > +
> > > > > > > > +      gcc_assert (shift_bits > 0);
> > > > > > > > +
> > > > > > > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> > > > > > (shift_bits));
> > > > > > > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > > > > > > (shift_bits));
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +  /* Step-2: lt = sum < x  */
> > > > > > > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > > > > > > +
> > > > > > > > +  /* Step-3: lt = -lt  */
> > > > > > > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > > > > > > +
> > > > > > > > +  /* Step-4: pmode_dest = sum | lt  */
> > > > > > > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > > > > > > +
> > > > > > > > +  /* Step-5: dest = pmode_dest */
> > > > > > > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /* Initialize the GCC target structure.  */
> > > > > > > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > > > > > > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > > > > > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > > > > > > index 3f7a023d941..eaa9867023c 100644
> > > > > > > > --- a/gcc/config/riscv/riscv.md
> > > > > > > > +++ b/gcc/config/riscv/riscv.md
> > > > > > > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > > > > > > >    [(set_attr "type" "load")
> > > > > > > >     (set (attr "length") (const_int 8))])
> > > > > > > >
> > > > > > > > +(define_expand "us_plus<mode>3"
> > > > > > > > +  [(match_operand:ANYI   0 "register_operand")
> > > > > > > > +   (match_operand:ANYI   1 "register_operand")
> > > > > > > > +   (match_operand:ANYI   2 "register_operand")]
> > > > > > > > +  ""
> > > > > > > > +  {
> > > > > > > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > > > > > > +    DONE;
> > > > > > > > +  }
> > > > > > > > +)
> > > > > > > > +
> > > > > > > >  (include "bitmanip.md")
> > > > > > > >  (include "crypto.md")
> > > > > > > >  (include "sync.md")
> > > > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > > > > index a07f25f3aee..a7341a57ffa 100644
> > > > > > > > --- a/gcc/internal-fn.cc
> > > > > > > > +++ b/gcc/internal-fn.cc
> > > > > > > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > > > > >        emit_move_insn (plhs, cmp);
> > > > > > > >      }
> > > > > > > >  }
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > > > > > > +{
> > > > > > > > +  tree lhs = gimple_call_lhs (stmt);
> > > > > > > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > > > > > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > > > > > > +
> > > > > > > > +  do_pending_stack_adjust ();
> > > > > > > > +
> > > > > > > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > > > > > > +  rtx op_0 = expand_normal (rhs_0);
> > > > > > > > +  rtx op_1 = expand_normal (rhs_1);
> > > > > > > > +
> > > > > > > > +  class expand_operand ops[3];
> > > > > > > > +
> > > > > > > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
> > > > > > > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE
> > > > > (rhs_0)));
> > > > > > > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE
> > > > > (rhs_1)));
> > > > > > > > +
> > > > > > > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE
> > > > > > > > (rhs_0)));
> > > > > > > > +  expand_insn (code, 3, ops);
> > > > > > > > +
> > > > > > > > +  if (!rtx_equal_p (target, ops[0].value))
> > > > > > > > +    emit_move_insn (target, ops[0].value);
> > > > > > > > +}
> > > > > > >
> > > > > > > This can be simplified by calling expand_binary_optab_fn instead. See my
> > > > > > template
> > > > > > >
> > > > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > > > > index c14d30365c1..b1d7b5a0307 100644
> > > > > > > > --- a/gcc/internal-fn.def
> > > > > > > > +++ b/gcc/internal-fn.def
> > > > > > > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > > > > > > ECF_NOTHROW, ffs, unary)
> > > > > > > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity,
> > > > > unary)
> > > > > > > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > > > > > > popcount, unary)
> > > > > > > >
> > > > > > > > +/* Binary integer ops.  */
> > > > > > > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> > > > > > us_plus,
> > > > > > > > binary)
> > > > > > > > +
> > > > > > > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > > > > > > ECF_NOTHROW, NULL)
> > > > > > > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > > > > > > ECF_NOTHROW, NULL)
> > > > > > > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW,
> > > > > NULL)
> > > > > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > > > > index bccee1c3e09..46e404b4a49 100644
> > > > > > > > --- a/gcc/internal-fn.h
> > > > > > > > +++ b/gcc/internal-fn.h
> > > > > > > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn,
> > > > > gcall
> > > > > > *);
> > > > > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > > > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > > > > > > >
> > > > > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > > > > >
> > > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > > > > > index c5b6540f939..f45fd58ad23 100644
> > > > > > > > --- a/gcc/match.pd
> > > > > > > > +++ b/gcc/match.pd
> > > > > > > > @@ -10265,3 +10265,20 @@ and,
> > > > > > > >        }
> > > > > > > >        (if (full_perm_p)
> > > > > > > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > > > > > +
> > > > > > > > +#if GIMPLE
> > > > > > > > +
> > > > > > > > +/* Unsigned saturation add, aka:
> > > > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > > > > > > +(simplify
> > > > > > > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > > > > > > +   (if (optimize
> > > > > > > > +     && INTEGRAL_TYPE_P (type)
> > > > > > > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > > > > > > +     && types_match (type, TREE_TYPE (@0))
> > > > > > > > +     && types_match (type, TREE_TYPE (@1))
> > > > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > > > > OPTIMIZE_FOR_BOTH))
> > > > > > > > +   (IFN_US_PLUS @0 @1)))
> > > > > > > > +
> > > > > > > > +#endif
> > > > > > >
> > > > > > > With the version above you can drop the #if GIMPLE and the
> > > > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > > >
> > > > > > > Check.
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Tamar
> > > > > > >
> > > > > > > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > > > > > > index ad14f9328b9..5855c4e0834 100644
> > > > > > > > --- a/gcc/optabs.def
> > > > > > > > +++ b/gcc/optabs.def
> > > > > > > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb",
> > > > > '2',
> > > > > > > > gen_int_libfunc)
> > > > > > > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > > > > > > gen_int_libfunc)
> > > > > > > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
> > > > > > > >
> > > > > > > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > > > > > > gen_int_libfunc)
> > > > > > > > +
> > > > > > > >  /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
> > > > > > > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> > > > > > gen_int_fp_fixed_libfunc)
> > > > > > > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > > > > > > --
> > > > > > > > 2.34.1
> > > > > > >
diff mbox series

Patch

diff --git a/gcc/builtins.def b/gcc/builtins.def
index f6f3e104f6a..0777b912cfa 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -1055,6 +1055,13 @@  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTIMAX, "popcountimax", BT_FN_INT_UINTMAX
 DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG, ATTR_CONST_NOTHROW_LEAF_LIST)
 DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
 DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg", BT_FN_INT_VAR, ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
+
+DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", BT_FN_INT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax", BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl", BT_FN_INT_ULONG, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", BT_FN_INT_VAR, ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
+
 DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
 DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
 DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 80efdf2b7e5..ba6086f1f25 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -132,6 +132,7 @@  extern void riscv_asm_output_external (FILE *, const tree, const char *);
 extern bool
 riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
 extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
+extern void riscv_expand_us_plus (rtx, rtx, rtx);
 
 #ifdef RTX_CODE
 extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 4100abc9dd1..23f08974f07 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -10657,6 +10657,52 @@  riscv_vector_mode_supported_any_target_p (machine_mode)
   return true;
 }
 
+/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
+void
+riscv_expand_us_plus (rtx dest, rtx x, rtx y)
+{
+  machine_mode mode = GET_MODE (dest);
+  rtx pmode_sum = gen_reg_rtx (Pmode);
+  rtx pmode_lt = gen_reg_rtx (Pmode);
+  rtx pmode_x = gen_lowpart (Pmode, x);
+  rtx pmode_y = gen_lowpart (Pmode, y);
+  rtx pmode_dest = gen_reg_rtx (Pmode);
+
+  /* Step-1: sum = x + y  */
+  if (mode == SImode && mode != Pmode)
+    { /* Take addw to avoid the sum truncate.  */
+      rtx simode_sum = gen_reg_rtx (SImode);
+      riscv_emit_binary (PLUS, simode_sum, x, y);
+      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
+    }
+  else
+    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
+
+  /* Step-1.1: truncate sum for HI and QI as we have no insn for add QI/HI.  */
+  if (mode == HImode || mode == QImode)
+    {
+      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
+      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
+
+      gcc_assert (shift_bits > 0);
+
+      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT (shift_bits));
+      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT (shift_bits));
+    }
+
+  /* Step-2: lt = sum < x  */
+  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
+
+  /* Step-3: lt = -lt  */
+  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
+
+  /* Step-4: pmode_dest = sum | lt  */
+  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
+
+  /* Step-5: dest = pmode_dest */
+  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 3f7a023d941..eaa9867023c 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -3841,6 +3841,17 @@  (define_insn "*large_load_address"
   [(set_attr "type" "load")
    (set (attr "length") (const_int 8))])
 
+(define_expand "us_plus<mode>3"
+  [(match_operand:ANYI   0 "register_operand")
+   (match_operand:ANYI   1 "register_operand")
+   (match_operand:ANYI   2 "register_operand")]
+  ""
+  {
+    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
+    DONE;
+  }
+)
+
 (include "bitmanip.md")
 (include "crypto.md")
 (include "sync.md")
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index a07f25f3aee..a7341a57ffa 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -5177,3 +5177,29 @@  expand_POPCOUNT (internal_fn fn, gcall *stmt)
       emit_move_insn (plhs, cmp);
     }
 }
+
+void
+expand_US_PLUS (internal_fn fn, gcall *stmt)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  tree rhs_0 = gimple_call_arg (stmt, 0);
+  tree rhs_1 = gimple_call_arg (stmt, 1);
+
+  do_pending_stack_adjust ();
+
+  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  rtx op_0 = expand_normal (rhs_0);
+  rtx op_1 = expand_normal (rhs_1);
+
+  class expand_operand ops[3];
+
+  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
+  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE (rhs_0)));
+  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE (rhs_1)));
+
+  insn_code code = optab_handler (us_plus_optab, TYPE_MODE (TREE_TYPE (rhs_0)));
+  expand_insn (code, 3, ops);
+
+  if (!rtx_equal_p (target, ops[0].value))
+    emit_move_insn (target, ops[0].value);
+}
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index c14d30365c1..b1d7b5a0307 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -447,6 +447,9 @@  DEF_INTERNAL_INT_FN (FFS, ECF_CONST | ECF_NOTHROW, ffs, unary)
 DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity, unary)
 DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW, popcount, unary)
 
+/* Binary integer ops.  */
+DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW, us_plus, binary)
+
 DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW, NULL)
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index bccee1c3e09..46e404b4a49 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -263,6 +263,7 @@  extern void expand_DIVMODBITINT (internal_fn, gcall *);
 extern void expand_FLOATTOBITINT (internal_fn, gcall *);
 extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
 extern void expand_POPCOUNT (internal_fn, gcall *);
+extern void expand_US_PLUS (internal_fn, gcall *);
 
 extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
 
diff --git a/gcc/match.pd b/gcc/match.pd
index c5b6540f939..f45fd58ad23 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -10265,3 +10265,20 @@  and,
       }
       (if (full_perm_p)
 	(vec_perm (op@3 @0 @1) @3 @2))))))
+
+#if GIMPLE
+
+/* Unsigned saturation add, aka:
+   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
+   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
+(simplify
+ (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
+   (if (optimize
+	&& INTEGRAL_TYPE_P (type)
+	&& TYPE_UNSIGNED (TREE_TYPE (@0))
+	&& types_match (type, TREE_TYPE (@0))
+	&& types_match (type, TREE_TYPE (@1))
+	&& direct_internal_fn_supported_p (IFN_US_PLUS, type, OPTIMIZE_FOR_BOTH))
+   (IFN_US_PLUS @0 @1)))
+
+#endif
diff --git a/gcc/optabs.def b/gcc/optabs.def
index ad14f9328b9..5855c4e0834 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -179,6 +179,8 @@  OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, "clrsb", '2', gen_int_libfunc)
 OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2', gen_int_libfunc)
 OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', gen_int_libfunc)
 
+OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3', gen_int_libfunc)
+
 /* Comparison libcalls for integers MUST come in pairs, signed/unsigned.  */
 OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2', gen_int_fp_fixed_libfunc)
 OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)