diff mbox series

[x86] Add support for stc, clc and cmc instructions in i386.md

Message ID 00d701d9966d$16552220$42ff6660$@nextmovesoftware.com
State New
Headers show
Series [x86] Add support for stc, clc and cmc instructions in i386.md | expand

Commit Message

Roger Sayle June 3, 2023, 10:45 p.m. UTC
This patch is the latest revision of my patch to add support for the
STC (set carry flag), CLC (clear carry flag) and CMC (complement
carry flag) instructions to the i386 backend, incorporating Uros'
previous feedback.  The significant changes are (i) the inclusion
of CMC, (ii) the use of UNSPEC for pattern, (iii) Use of a new
X86_TUNE_SLOW_STC tuning flag to use alternate implementations on
pentium4 (which has a notoriously slow STC) when not optimizing
for size.

An example of the use of the stc instruction is:
unsigned int foo (unsigned int a, unsigned int b, unsigned int *c) {
  return __builtin_ia32_addcarryx_u32 (1, a, b, c);
}

which previously generated:
        movl    $1, %eax
        addb    $-1, %al
        adcl    %esi, %edi
        setc    %al
        movl    %edi, (%rdx)
        movzbl  %al, %eax
        ret

with this patch now generates:
        stc
        adcl    %esi, %edi
        setc    %al
        movl    %edi, (%rdx)
        movzbl  %al, %eax
        ret

An example of the use of the cmc instruction (where the carry from
a first adc is inverted/complemented as input to a second adc) is:
unsigned int bar (unsigned int a, unsigned int b,
                  unsigned int c, unsigned int d)
{
  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
  return __builtin_ia32_addcarryx_u32 (c1 ^ 1, c, d, &o2);
}

which previously generated:
        movl    $1, %eax
        addb    $-1, %al
        adcl    %esi, %edi
        setnc   %al
        movl    %edi, o1(%rip)
        addb    $-1, %al
        adcl    %ecx, %edx
        setc    %al
        movl    %edx, o2(%rip)
        movzbl  %al, %eax
        ret

and now generates:
        stc
        adcl    %esi, %edi
        cmc
        movl    %edi, o1(%rip)
        adcl    %ecx, %edx
        setc    %al
        movl    %edx, o2(%rip)
        movzbl  %al, %eax
        ret


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2022-06-03  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
        Use new x86_stc or negqi_ccc_1 instructions to set the carry flag.
        * config/i386/i386.h (TARGET_SLOW_STC): New define.
        * config/i386/i386.md (UNSPEC_CLC): New UNSPEC for clc.
        (UNSPEC_STC): New UNSPEC for stc.
        (UNSPEC_CMC): New UNSPEC for cmc.
        (*x86_clc): New define_insn.
        (*x86_clc_xor): New define_insn for pentium4 without -Os.
        (x86_stc): New define_insn.
        (define_split): Convert x86_stc into alternate implementation
        on pentium4.
        (x86_cmc): New define_insn.
        (*x86_cmc_1): New define_insn_and_split to recognize cmc pattern.
        (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
        recognize (and eliminate) the carry flag being copied to itself.
        (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
        (neg<mode>_ccc_1): Renamed from *neg<mode>_ccc_1 for gen function.
        * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.

gcc/testsuite/ChangeLog
        * gcc.target/i386/cmc-1.c: New test case.
        * gcc.target/i386/stc-1.c: Likewise.


Thanks,
Roger
--

Comments

Uros Bizjak June 4, 2023, 5:52 p.m. UTC | #1
On Sun, Jun 4, 2023 at 12:45 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch is the latest revision of my patch to add support for the
> STC (set carry flag), CLC (clear carry flag) and CMC (complement
> carry flag) instructions to the i386 backend, incorporating Uros'
> previous feedback.  The significant changes are (i) the inclusion
> of CMC, (ii) the use of UNSPEC for pattern, (iii) Use of a new
> X86_TUNE_SLOW_STC tuning flag to use alternate implementations on
> pentium4 (which has a notoriously slow STC) when not optimizing
> for size.
>
> An example of the use of the stc instruction is:
> unsigned int foo (unsigned int a, unsigned int b, unsigned int *c) {
>   return __builtin_ia32_addcarryx_u32 (1, a, b, c);
> }
>
> which previously generated:
>         movl    $1, %eax
>         addb    $-1, %al
>         adcl    %esi, %edi
>         setc    %al
>         movl    %edi, (%rdx)
>         movzbl  %al, %eax
>         ret
>
> with this patch now generates:
>         stc
>         adcl    %esi, %edi
>         setc    %al
>         movl    %edi, (%rdx)
>         movzbl  %al, %eax
>         ret
>
> An example of the use of the cmc instruction (where the carry from
> a first adc is inverted/complemented as input to a second adc) is:
> unsigned int bar (unsigned int a, unsigned int b,
>                   unsigned int c, unsigned int d)
> {
>   unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
>   return __builtin_ia32_addcarryx_u32 (c1 ^ 1, c, d, &o2);
> }
>
> which previously generated:
>         movl    $1, %eax
>         addb    $-1, %al
>         adcl    %esi, %edi
>         setnc   %al
>         movl    %edi, o1(%rip)
>         addb    $-1, %al
>         adcl    %ecx, %edx
>         setc    %al
>         movl    %edx, o2(%rip)
>         movzbl  %al, %eax
>         ret
>
> and now generates:
>         stc
>         adcl    %esi, %edi
>         cmc
>         movl    %edi, o1(%rip)
>         adcl    %ecx, %edx
>         setc    %al
>         movl    %edx, o2(%rip)
>         movzbl  %al, %eax
>         ret
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
>
>
> 2022-06-03  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
>         Use new x86_stc or negqi_ccc_1 instructions to set the carry flag.
>         * config/i386/i386.h (TARGET_SLOW_STC): New define.
>         * config/i386/i386.md (UNSPEC_CLC): New UNSPEC for clc.
>         (UNSPEC_STC): New UNSPEC for stc.
>         (UNSPEC_CMC): New UNSPEC for cmc.
>         (*x86_clc): New define_insn.
>         (*x86_clc_xor): New define_insn for pentium4 without -Os.
>         (x86_stc): New define_insn.
>         (define_split): Convert x86_stc into alternate implementation
>         on pentium4.
>         (x86_cmc): New define_insn.
>         (*x86_cmc_1): New define_insn_and_split to recognize cmc pattern.
>         (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
>         recognize (and eliminate) the carry flag being copied to itself.
>         (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
>         (neg<mode>_ccc_1): Renamed from *neg<mode>_ccc_1 for gen function.
>         * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/cmc-1.c: New test case.
>         * gcc.target/i386/stc-1.c: Likewise.

+;; Clear carry flag.
+(define_insn "*x86_clc"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))]
+  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
+  "clc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+(define_insn "*x86_clc_xor"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))
+   (clobber (match_scratch:SI 0 "=r"))]
+  "TARGET_SLOW_STC && !optimize_function_for_size_p (cfun)"
+  "xor{l}\t%0, %0"
+    [(set_attr "type" "alu1")
+     (set_attr "mode" "SI")
+     (set_attr "length_immediate" "0")])

I think the above would be better implemented as a peephole2 pattern
that triggers when a register is available. We should not waste a
register on a register starved x86_32 just to set a carry flag. This
is implemented with:

  [(match_scratch:SI 2 "r")

at the beginning of the peephole2 pattern that generates x86_clc_xor.
The pattern should be constrained with "TARGET_SLOW_STC &&
!optimize_function_for_size_p()" and x86_clc_xor should be available
only after reload (like e.g. "*mov<mode>_xor").

+;; On Pentium 4, set the carry flag using mov $1,%al;neg %al.
+(define_split
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  "TARGET_SLOW_STC
+   && !optimize_insn_for_size_p ()
+   && can_create_pseudo_p ()"
+  [(set (match_dup 0) (const_int 1))
+   (parallel
+     [(set (reg:CCC FLAGS_REG)
+       (unspec:CCC [(match_dup 0) (const_int 0)] UNSPEC_CC_NE))
+      (set (match_dup 0) (neg:QI (match_dup 0)))])]
+  "operands[0] = gen_reg_rtx (QImode);")

Same here, this should be a peephole2 to trigger only when a register
is available, again for "TARGET_SLOW_STC &&
!optimize_function_for_size_p()".

Is the new sequence "mov $1,%al; neg %al" any better than existing
"mov $1,%al; addb $-1,%al" or is there another reason for the changed
sequence?

+(define_insn_and_split "*x86_cmc_1"
+  [(set (reg:CCC FLAGS_REG)
+        (unspec:CCC [(geu:QI (reg:CCC FLAGS_REG) (const_int 0))
+             (const_int 0)] UNSPEC_CC_NE))]
+  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(reg:CCC FLAGS_REG)] UNSPEC_CMC))])

The above should be the RTL model of x86_cmc. No need to split to an unspec.

Uros.
Roger Sayle June 6, 2023, 3:14 p.m. UTC | #2
Hi Uros,
This revision implements your suggestions/refinements. (i) Avoid the
UNSPEC_CMC by using the canonical RTL idiom for *x86_cmc, (ii) Use
peephole2s to convert x86_stc and *x86_cmc into alternate forms on
TARGET_SLOW_STC CPUs (pentium4), when a suitable QImode register is
available, (iii) Prefer the addqi_cconly_overflow idiom (addb $-1,%al)
over negqi_ccc_1 (neg %al) for setting the carry from a QImode value,
(iv) Use andl %eax,%eax to clear carry flag without requiring (clobbering)
an additional register, as an alternate output template for *x86_clc.
These changes required two minor edits to i386.cc:  ix86_cc_mode had
to be tweaked to suggest CCCmode for the new *x86_cmc pattern, and
*x86_cmc needed to be handled/parameterized in ix86_rtx_costs so that
combine would appreciate that this complex RTL expression was actually
a fast, single byte instruction [i.e. preferable].

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?

2022-06-06  Roger Sayle  <roger@nextmovesoftware.com>
            Uros Bizjak  <ubizjak@gmail.com>

gcc/ChangeLog
        * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
        Use new x86_stc instruction when the carry flag must be set.
        * config/i386/i386.cc (ix86_cc_mode): Use CCCmode for *x86_cmc.
        (ix86_rtx_costs): Provide accurate rtx_costs for *x86_cmc.
        * config/i386/i386.h (TARGET_SLOW_STC): New define.
        * config/i386/i386.md (UNSPEC_CLC): New UNSPEC for clc.
        (UNSPEC_STC): New UNSPEC for stc.
        (*x86_clc): New define_insn (with implementation for pentium4).
        (x86_stc): New define_insn.
        (define_peephole2): Convert x86_stc into alternate implementation
        on pentium4 without -Os when a QImode register is available.
        (*x86_cmc): New define_insn.
        (define_peephole2): Convert *x86_cmc into alternate implementation
        on pentium4 without -Os when a QImode register is available.
        (*setccc): New define_insn_and_split for a no-op CCCmode move.
        (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
        recognize (and eliminate) the carry flag being copied to itself.
        (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
        * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.

gcc/testsuite/ChangeLog
        * gcc.target/i386/cmc-1.c: New test case.
        * gcc.target/i386/stc-1.c: Likewise.


Thanks, Roger.
--

-----Original Message-----
From: Uros Bizjak <ubizjak@gmail.com> 
Sent: 04 June 2023 18:53
To: Roger Sayle <roger@nextmovesoftware.com>
Cc: gcc-patches@gcc.gnu.org
Subject: Re: [x86 PATCH] Add support for stc, clc and cmc instructions in i386.md

On Sun, Jun 4, 2023 at 12:45 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch is the latest revision of my patch to add support for the 
> STC (set carry flag), CLC (clear carry flag) and CMC (complement carry 
> flag) instructions to the i386 backend, incorporating Uros'
> previous feedback.  The significant changes are (i) the inclusion of 
> CMC, (ii) the use of UNSPEC for pattern, (iii) Use of a new 
> X86_TUNE_SLOW_STC tuning flag to use alternate implementations on
> pentium4 (which has a notoriously slow STC) when not optimizing for 
> size.
>
> An example of the use of the stc instruction is:
> unsigned int foo (unsigned int a, unsigned int b, unsigned int *c) {
>   return __builtin_ia32_addcarryx_u32 (1, a, b, c); }
>
> which previously generated:
>         movl    $1, %eax
>         addb    $-1, %al
>         adcl    %esi, %edi
>         setc    %al
>         movl    %edi, (%rdx)
>         movzbl  %al, %eax
>         ret
>
> with this patch now generates:
>         stc
>         adcl    %esi, %edi
>         setc    %al
>         movl    %edi, (%rdx)
>         movzbl  %al, %eax
>         ret
>
> An example of the use of the cmc instruction (where the carry from a 
> first adc is inverted/complemented as input to a second adc) is:
> unsigned int bar (unsigned int a, unsigned int b,
>                   unsigned int c, unsigned int d) {
>   unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
>   return __builtin_ia32_addcarryx_u32 (c1 ^ 1, c, d, &o2); }
>
> which previously generated:
>         movl    $1, %eax
>         addb    $-1, %al
>         adcl    %esi, %edi
>         setnc   %al
>         movl    %edi, o1(%rip)
>         addb    $-1, %al
>         adcl    %ecx, %edx
>         setc    %al
>         movl    %edx, o2(%rip)
>         movzbl  %al, %eax
>         ret
>
> and now generates:
>         stc
>         adcl    %esi, %edi
>         cmc
>         movl    %edi, o1(%rip)
>         adcl    %ecx, %edx
>         setc    %al
>         movl    %edx, o2(%rip)
>         movzbl  %al, %eax
>         ret
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap 
> and make -k check, both with and without --target_board=unix{-m32} 
> with no new failures.  Ok for mainline?
>
>
> 2022-06-03  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
>         Use new x86_stc or negqi_ccc_1 instructions to set the carry flag.
>         * config/i386/i386.h (TARGET_SLOW_STC): New define.
>         * config/i386/i386.md (UNSPEC_CLC): New UNSPEC for clc.
>         (UNSPEC_STC): New UNSPEC for stc.
>         (UNSPEC_CMC): New UNSPEC for cmc.
>         (*x86_clc): New define_insn.
>         (*x86_clc_xor): New define_insn for pentium4 without -Os.
>         (x86_stc): New define_insn.
>         (define_split): Convert x86_stc into alternate implementation
>         on pentium4.
>         (x86_cmc): New define_insn.
>         (*x86_cmc_1): New define_insn_and_split to recognize cmc pattern.
>         (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
>         recognize (and eliminate) the carry flag being copied to itself.
>         (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
>         (neg<mode>_ccc_1): Renamed from *neg<mode>_ccc_1 for gen function.
>         * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/cmc-1.c: New test case.
>         * gcc.target/i386/stc-1.c: Likewise.

+;; Clear carry flag.
+(define_insn "*x86_clc"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))]
+  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
+  "clc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+(define_insn "*x86_clc_xor"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))
+   (clobber (match_scratch:SI 0 "=r"))]
+  "TARGET_SLOW_STC && !optimize_function_for_size_p (cfun)"
+  "xor{l}\t%0, %0"
+    [(set_attr "type" "alu1")
+     (set_attr "mode" "SI")
+     (set_attr "length_immediate" "0")])

I think the above would be better implemented as a peephole2 pattern that triggers when a register is available. We should not waste a register on a register starved x86_32 just to set a carry flag. This is implemented with:

  [(match_scratch:SI 2 "r")

at the beginning of the peephole2 pattern that generates x86_clc_xor.
The pattern should be constrained with "TARGET_SLOW_STC && !optimize_function_for_size_p()" and x86_clc_xor should be available only after reload (like e.g. "*mov<mode>_xor").

+;; On Pentium 4, set the carry flag using mov $1,%al;neg %al.
+(define_split
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  "TARGET_SLOW_STC
+   && !optimize_insn_for_size_p ()
+   && can_create_pseudo_p ()"
+  [(set (match_dup 0) (const_int 1))
+   (parallel
+     [(set (reg:CCC FLAGS_REG)
+       (unspec:CCC [(match_dup 0) (const_int 0)] UNSPEC_CC_NE))
+      (set (match_dup 0) (neg:QI (match_dup 0)))])]
+  "operands[0] = gen_reg_rtx (QImode);")

Same here, this should be a peephole2 to trigger only when a register is available, again for "TARGET_SLOW_STC && !optimize_function_for_size_p()".

Is the new sequence "mov $1,%al; neg %al" any better than existing "mov $1,%al; addb $-1,%al" or is there another reason for the changed sequence?

+(define_insn_and_split "*x86_cmc_1"
+  [(set (reg:CCC FLAGS_REG)
+        (unspec:CCC [(geu:QI (reg:CCC FLAGS_REG) (const_int 0))
+             (const_int 0)] UNSPEC_CC_NE))]
+  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(reg:CCC FLAGS_REG)] 
+UNSPEC_CMC))])

The above should be the RTL model of x86_cmc. No need to split to an unspec.

Uros.
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index ac67441..697eb47 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -13948,8 +13948,6 @@ rdseed_step:
       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
 
       op1 = expand_normal (arg0);
-      if (!integer_zerop (arg0))
-	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
 
       op2 = expand_normal (arg1);
       if (!register_operand (op2, mode0))
@@ -13967,7 +13965,7 @@ rdseed_step:
 	}
 
       op0 = gen_reg_rtx (mode0);
-      if (integer_zerop (arg0))
+      if (op1 == const0_rtx)
 	{
 	  /* If arg0 is 0, optimize right away into add or sub
 	     instruction that sets CCCmode flags.  */
@@ -13977,7 +13975,14 @@ rdseed_step:
       else
 	{
 	  /* Generate CF from input operand.  */
-	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+	  if (!CONST_INT_P (op1))
+	    {
+	      op1 = convert_to_mode (QImode, op1, 1);
+	      op1 = copy_to_mode_reg (QImode, op1);
+	      emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+	    }
+	  else
+	    emit_insn (gen_x86_stc ());
 
 	  /* Generate instruction that consumes CF.  */
 	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ff56e..c4591d6 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -15954,6 +15954,17 @@ ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
 	       && REGNO (XEXP (op1, 0)) == FLAGS_REG
 	       && XEXP (op1, 1) == const0_rtx)
 	return CCCmode;
+      /* Similarly for *x86_cmc pattern.
+	 Match LTU of op0 (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+	 and op1 (geu:QI (reg:CCC FLAGS_REG) (const_int 0)).
+	 It is sufficient to test that the operand modes are CCCmode.  */
+      else if (code == LTU
+	       && GET_CODE (op0) == NEG
+	       && GET_CODE (XEXP (op0, 0)) == LTU
+	       && GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCCmode
+	       && GET_CODE (op1) == GEU
+	       && GET_MODE (XEXP (op1, 0)) == CCCmode)
+	return CCCmode;
       else
 	return CCmode;
     case GTU:			/* CF=0 & ZF=0 */
@@ -21305,6 +21316,31 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
 	  *total = 0;
 	  return true;
 	}
+      /* Match x
+	 (compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+		      (geu:QI (reg:CCC FLAGS_REG) (const_int 0)))  */
+      if (mode == CCCmode
+	  && GET_CODE (op0) == NEG
+	  && GET_CODE (XEXP (op0, 0)) == LTU
+	  && REG_P (XEXP (XEXP (op0, 0), 0))
+	  && GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCCmode
+	  && REGNO (XEXP (XEXP (op0, 0), 0)) == FLAGS_REG
+	  && XEXP (XEXP (op0, 0), 1) == const0_rtx
+	  && GET_CODE (op1) == GEU
+	  && REG_P (XEXP (op1, 0))
+	  && GET_MODE (XEXP (op1, 0)) == CCCmode
+	  && REGNO (XEXP (op1, 0)) == FLAGS_REG
+	  && XEXP (op1, 1) == const0_rtx)
+	{
+	  /* This is *x86_cmc.  */
+	  if (!speed)
+	    *total = COSTS_N_BYTES (1);
+	  else if (TARGET_SLOW_STC)
+	    *total = COSTS_N_INSNS (2);
+	  else 
+	    *total = COSTS_N_INSNS (1);
+	  return true;
+	}
 
       if (SCALAR_INT_MODE_P (GET_MODE (op0))
 	  && GET_MODE_SIZE (GET_MODE (op0)) > UNITS_PER_WORD)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index c7439f8..5ac9c78 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -448,6 +448,7 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 	ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
 #define TARGET_DEST_FALSE_DEP_FOR_GLC \
 	ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
+#define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e6ebc46..bc48fc4 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -114,6 +114,8 @@
   UNSPEC_INSN_FALSE_DEP
   UNSPEC_SBB
   UNSPEC_CC_NE
+  UNSPEC_CLC
+  UNSPEC_STC
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
@@ -1999,6 +2001,71 @@
   [(set_attr "type" "ssecomi")
    (set_attr "prefix" "evex")
    (set_attr "mode" "HF")])
+
+;; Clear carry flag.
+(define_insn "*x86_clc"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))]
+  ""
+{
+  if (TARGET_SLOW_STC && !optimize_function_for_size_p (cfun))
+    return "and{l}\t{%%rax, %%rax|rax, rax}";
+  return "clc";
+}
+  [(set (attr "length")
+	(if_then_else
+	  (and (match_test "TARGET_SLOW_STC")
+	       (not (match_test "optimize_function_for_size_p (cfun)")))
+	  (const_int 2)
+	  (const_int 1)))
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+;; Set carry flag.
+(define_insn "x86_stc"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  ""
+  "stc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+;; On Pentium 4, set the carry flag using mov $1,%al;addb $-1,%al.
+(define_peephole2
+  [(match_scratch:QI 0 "r")
+   (set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  "TARGET_SLOW_STC && !optimize_insn_for_size_p ()"
+  [(set (match_dup 0) (const_int 1))
+   (parallel
+     [(set (reg:CCC FLAGS_REG)
+	   (compare:CCC (plus:QI (match_dup 0) (const_int -1))
+			(match_dup 0)))
+      (set (match_dup 0) (plus:QI (match_dup 0) (const_int -1)))])])
+
+;; Complement carry flag.
+(define_insn "*x86_cmc"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+		     (geu:QI (reg:CCC FLAGS_REG) (const_int 0))))]
+  ""
+  "cmc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "use_carry" "1")
+   (set_attr "modrm" "0")])
+
+;; On Pentium 4, cmc is replaced with setnc %al;addb $-1,%al.
+(define_peephole2
+  [(match_scratch:QI 0 "r")
+   (set (reg:CCC FLAGS_REG)
+	(compare:CCC (neg:QI (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+		     (geu:QI (reg:CCC FLAGS_REG) (const_int 0))))]
+  "TARGET_SLOW_STC && !optimize_insn_for_size_p ()"
+  [(set (match_dup 0) (ne:QI (reg:CCC FLAGS_REG) (const_int 0)))
+   (parallel
+     [(set (reg:CCC FLAGS_REG)
+	   (compare:CCC (plus:QI (match_dup 0) (const_int -1))
+			(match_dup 0)))
+      (set (match_dup 0) (plus:QI (match_dup 0) (const_int -1)))])])
 
 ;; Push/pop instructions.
 
@@ -8107,6 +8174,34 @@
   "#"
   "&& 1"
   [(const_int 0)])
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setccc"
+  [(set (reg:CCC FLAGS_REG)
+	(reg:CCC FLAGS_REG))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setcc_qi_negqi_ccc_1_<mode>"
+  [(set (reg:CCC FLAGS_REG)
+	(ltu:CCC (reg:CC_CCC FLAGS_REG) (const_int 0)))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setcc_qi_negqi_ccc_2_<mode>"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))
+		     (const_int 0)] UNSPEC_CC_NE))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
 
 ;; Overflow setting add instructions
 
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index e1c72cd..c3229d2 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -698,3 +698,7 @@ DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)
 /* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
    before a transfer of control flow out of the function.  */
 DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)
+
+/* X86_TUNE_SLOW_STC: This disables use of stc, clc and cmc carry flag
+  modifications on architectures where theses operations are slow.  */
+DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)
diff --git a/gcc/testsuite/gcc.target/i386/cmc-1.c b/gcc/testsuite/gcc.target/i386/cmc-1.c
new file mode 100644
index 0000000..58e922a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cmc-1.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned int o1;
+unsigned int o2;
+
+unsigned int foo_xor (unsigned int a, unsigned int b,
+                      unsigned int c, unsigned int d)
+{
+  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
+  return __builtin_ia32_addcarryx_u32 (c1 ^ 1, c, d, &o2);
+}
+
+unsigned int foo_sub (unsigned int a, unsigned int b,
+                      unsigned int c, unsigned int d)
+{
+  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
+  return __builtin_ia32_addcarryx_u32 (1 - c1, c, d, &o2);
+}
+
+unsigned int foo_eqz (unsigned int a, unsigned int b,
+                      unsigned int c, unsigned int d)
+{
+  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
+  return __builtin_ia32_addcarryx_u32 (c1 == 0, c, d, &o2);
+}
+
+/* { dg-final { scan-assembler "cmc" } } */
diff --git a/gcc/testsuite/gcc.target/i386/stc-1.c b/gcc/testsuite/gcc.target/i386/stc-1.c
new file mode 100644
index 0000000..857c939
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/stc-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef unsigned int u32;
+
+unsigned int foo (unsigned int a, unsigned int b, unsigned int *c)
+{
+  return __builtin_ia32_addcarryx_u32 (1, a, b, c);
+}
+
+unsigned int bar (unsigned int b, unsigned int *c)
+{
+  return __builtin_ia32_addcarryx_u32 (1, 2, b, c);
+}
+
+unsigned int baz (unsigned int a, unsigned int *c)
+{
+  return __builtin_ia32_addcarryx_u32 (1, a, 3, c);
+}
+
+/* { dg-final { scan-assembler "stc" } } */
Uros Bizjak June 6, 2023, 5:33 p.m. UTC | #3
On Tue, Jun 6, 2023 at 5:14 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> Hi Uros,
> This revision implements your suggestions/refinements. (i) Avoid the
> UNSPEC_CMC by using the canonical RTL idiom for *x86_cmc, (ii) Use
> peephole2s to convert x86_stc and *x86_cmc into alternate forms on
> TARGET_SLOW_STC CPUs (pentium4), when a suitable QImode register is
> available, (iii) Prefer the addqi_cconly_overflow idiom (addb $-1,%al)
> over negqi_ccc_1 (neg %al) for setting the carry from a QImode value,
> (iv) Use andl %eax,%eax to clear carry flag without requiring (clobbering)
> an additional register, as an alternate output template for *x86_clc.

Uh, I don't think (iv) is OK. "xor reg,reg" will break the dependency
chain, while "and reg,reg" won't. So, you are hurting out-of-order
execution by depending on an instruction that calculates previous
result in reg. You can use peephole2 trick to allocate an unused reg
here, but then using AND is no better than using XOR, and the latter
is guaranteed to break dependency chains.

Uros.

> These changes required two minor edits to i386.cc:  ix86_cc_mode had
> to be tweaked to suggest CCCmode for the new *x86_cmc pattern, and
> *x86_cmc needed to be handled/parameterized in ix86_rtx_costs so that
> combine would appreciate that this complex RTL expression was actually
> a fast, single byte instruction [i.e. preferable].
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
>
> 2022-06-06  Roger Sayle  <roger@nextmovesoftware.com>
>             Uros Bizjak  <ubizjak@gmail.com>
>
> gcc/ChangeLog
>         * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
>         Use new x86_stc instruction when the carry flag must be set.
>         * config/i386/i386.cc (ix86_cc_mode): Use CCCmode for *x86_cmc.
>         (ix86_rtx_costs): Provide accurate rtx_costs for *x86_cmc.
>         * config/i386/i386.h (TARGET_SLOW_STC): New define.
>         * config/i386/i386.md (UNSPEC_CLC): New UNSPEC for clc.
>         (UNSPEC_STC): New UNSPEC for stc.
>         (*x86_clc): New define_insn (with implementation for pentium4).
>         (x86_stc): New define_insn.
>         (define_peephole2): Convert x86_stc into alternate implementation
>         on pentium4 without -Os when a QImode register is available.
>         (*x86_cmc): New define_insn.
>         (define_peephole2): Convert *x86_cmc into alternate implementation
>         on pentium4 without -Os when a QImode register is available.
>         (*setccc): New define_insn_and_split for a no-op CCCmode move.
>         (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
>         recognize (and eliminate) the carry flag being copied to itself.
>         (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
>         * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/cmc-1.c: New test case.
>         * gcc.target/i386/stc-1.c: Likewise.
>
>
> Thanks, Roger.
> --
>
> -----Original Message-----
> From: Uros Bizjak <ubizjak@gmail.com>
> Sent: 04 June 2023 18:53
> To: Roger Sayle <roger@nextmovesoftware.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [x86 PATCH] Add support for stc, clc and cmc instructions in i386.md
>
> On Sun, Jun 4, 2023 at 12:45 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
> >
> >
> > This patch is the latest revision of my patch to add support for the
> > STC (set carry flag), CLC (clear carry flag) and CMC (complement carry
> > flag) instructions to the i386 backend, incorporating Uros'
> > previous feedback.  The significant changes are (i) the inclusion of
> > CMC, (ii) the use of UNSPEC for pattern, (iii) Use of a new
> > X86_TUNE_SLOW_STC tuning flag to use alternate implementations on
> > pentium4 (which has a notoriously slow STC) when not optimizing for
> > size.
> >
> > An example of the use of the stc instruction is:
> > unsigned int foo (unsigned int a, unsigned int b, unsigned int *c) {
> >   return __builtin_ia32_addcarryx_u32 (1, a, b, c); }
> >
> > which previously generated:
> >         movl    $1, %eax
> >         addb    $-1, %al
> >         adcl    %esi, %edi
> >         setc    %al
> >         movl    %edi, (%rdx)
> >         movzbl  %al, %eax
> >         ret
> >
> > with this patch now generates:
> >         stc
> >         adcl    %esi, %edi
> >         setc    %al
> >         movl    %edi, (%rdx)
> >         movzbl  %al, %eax
> >         ret
> >
> > An example of the use of the cmc instruction (where the carry from a
> > first adc is inverted/complemented as input to a second adc) is:
> > unsigned int bar (unsigned int a, unsigned int b,
> >                   unsigned int c, unsigned int d) {
> >   unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
> >   return __builtin_ia32_addcarryx_u32 (c1 ^ 1, c, d, &o2); }
> >
> > which previously generated:
> >         movl    $1, %eax
> >         addb    $-1, %al
> >         adcl    %esi, %edi
> >         setnc   %al
> >         movl    %edi, o1(%rip)
> >         addb    $-1, %al
> >         adcl    %ecx, %edx
> >         setc    %al
> >         movl    %edx, o2(%rip)
> >         movzbl  %al, %eax
> >         ret
> >
> > and now generates:
> >         stc
> >         adcl    %esi, %edi
> >         cmc
> >         movl    %edi, o1(%rip)
> >         adcl    %ecx, %edx
> >         setc    %al
> >         movl    %edx, o2(%rip)
> >         movzbl  %al, %eax
> >         ret
> >
> >
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > and make -k check, both with and without --target_board=unix{-m32}
> > with no new failures.  Ok for mainline?
> >
> >
> > 2022-06-03  Roger Sayle  <roger@nextmovesoftware.com>
> >
> > gcc/ChangeLog
> >         * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
> >         Use new x86_stc or negqi_ccc_1 instructions to set the carry flag.
> >         * config/i386/i386.h (TARGET_SLOW_STC): New define.
> >         * config/i386/i386.md (UNSPEC_CLC): New UNSPEC for clc.
> >         (UNSPEC_STC): New UNSPEC for stc.
> >         (UNSPEC_CMC): New UNSPEC for cmc.
> >         (*x86_clc): New define_insn.
> >         (*x86_clc_xor): New define_insn for pentium4 without -Os.
> >         (x86_stc): New define_insn.
> >         (define_split): Convert x86_stc into alternate implementation
> >         on pentium4.
> >         (x86_cmc): New define_insn.
> >         (*x86_cmc_1): New define_insn_and_split to recognize cmc pattern.
> >         (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
> >         recognize (and eliminate) the carry flag being copied to itself.
> >         (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
> >         (neg<mode>_ccc_1): Renamed from *neg<mode>_ccc_1 for gen function.
> >         * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.
> >
> > gcc/testsuite/ChangeLog
> >         * gcc.target/i386/cmc-1.c: New test case.
> >         * gcc.target/i386/stc-1.c: Likewise.
>
> +;; Clear carry flag.
> +(define_insn "*x86_clc"
> +  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))]
> +  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
> +  "clc"
> +  [(set_attr "length" "1")
> +   (set_attr "length_immediate" "0")
> +   (set_attr "modrm" "0")])
> +
> +(define_insn "*x86_clc_xor"
> +  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))
> +   (clobber (match_scratch:SI 0 "=r"))]
> +  "TARGET_SLOW_STC && !optimize_function_for_size_p (cfun)"
> +  "xor{l}\t%0, %0"
> +    [(set_attr "type" "alu1")
> +     (set_attr "mode" "SI")
> +     (set_attr "length_immediate" "0")])
>
> I think the above would be better implemented as a peephole2 pattern that triggers when a register is available. We should not waste a register on a register starved x86_32 just to set a carry flag. This is implemented with:
>
>   [(match_scratch:SI 2 "r")
>
> at the beginning of the peephole2 pattern that generates x86_clc_xor.
> The pattern should be constrained with "TARGET_SLOW_STC && !optimize_function_for_size_p()" and x86_clc_xor should be available only after reload (like e.g. "*mov<mode>_xor").
>
> +;; On Pentium 4, set the carry flag using mov $1,%al;neg %al.
> +(define_split
> +  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
> +  "TARGET_SLOW_STC
> +   && !optimize_insn_for_size_p ()
> +   && can_create_pseudo_p ()"
> +  [(set (match_dup 0) (const_int 1))
> +   (parallel
> +     [(set (reg:CCC FLAGS_REG)
> +       (unspec:CCC [(match_dup 0) (const_int 0)] UNSPEC_CC_NE))
> +      (set (match_dup 0) (neg:QI (match_dup 0)))])]
> +  "operands[0] = gen_reg_rtx (QImode);")
>
> Same here, this should be a peephole2 to trigger only when a register is available, again for "TARGET_SLOW_STC && !optimize_function_for_size_p()".
>
> Is the new sequence "mov $1,%al; neg %al" any better than existing "mov $1,%al; addb $-1,%al" or is there another reason for the changed sequence?
>
> +(define_insn_and_split "*x86_cmc_1"
> +  [(set (reg:CCC FLAGS_REG)
> +        (unspec:CCC [(geu:QI (reg:CCC FLAGS_REG) (const_int 0))
> +             (const_int 0)] UNSPEC_CC_NE))]
> +  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
> +  "#"
> +  "&& 1"
> +  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(reg:CCC FLAGS_REG)]
> +UNSPEC_CMC))])
>
> The above should be the RTL model of x86_cmc. No need to split to an unspec.
>
> Uros.
Roger Sayle June 6, 2023, 9 p.m. UTC | #4
Hi Uros,
Might you willing to approve the patch without the *x86_clc pieces?
These can be submitted later, when they are actually used.  For now,
we're arguing about the performance of a pattern that's not yet
generated on an obsolete microarchitecture that's no longer in use,
and this is holding up real improvements on current processors.
cmc, for example, should allow for better cmov if-conversion.

Thanks in advance.
Roger
--

-----Original Message-----
From: Uros Bizjak <ubizjak@gmail.com> 
Sent: 06 June 2023 18:34
To: Roger Sayle <roger@nextmovesoftware.com>
Cc: gcc-patches@gcc.gnu.org
Subject: Re: [x86 PATCH] Add support for stc, clc and cmc instructions in i386.md

On Tue, Jun 6, 2023 at 5:14 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> Hi Uros,
> This revision implements your suggestions/refinements. (i) Avoid the 
> UNSPEC_CMC by using the canonical RTL idiom for *x86_cmc, (ii) Use 
> peephole2s to convert x86_stc and *x86_cmc into alternate forms on 
> TARGET_SLOW_STC CPUs (pentium4), when a suitable QImode register is 
> available, (iii) Prefer the addqi_cconly_overflow idiom (addb $-1,%al) 
> over negqi_ccc_1 (neg %al) for setting the carry from a QImode value,
> (iv) Use andl %eax,%eax to clear carry flag without requiring 
> (clobbering) an additional register, as an alternate output template for *x86_clc.

Uh, I don't think (iv) is OK. "xor reg,reg" will break the dependency chain, while "and reg,reg" won't. So, you are hurting out-of-order execution by depending on an instruction that calculates previous result in reg. You can use peephole2 trick to allocate an unused reg here, but then using AND is no better than using XOR, and the latter is guaranteed to break dependency chains.

Uros.

> These changes required two minor edits to i386.cc:  ix86_cc_mode had 
> to be tweaked to suggest CCCmode for the new *x86_cmc pattern, and 
> *x86_cmc needed to be handled/parameterized in ix86_rtx_costs so that 
> combine would appreciate that this complex RTL expression was actually 
> a fast, single byte instruction [i.e. preferable].
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap 
> and make -k check, both with and without --target_board=unix{-m32} 
> with no new failures.  Ok for mainline?
>
> 2022-06-06  Roger Sayle  <roger@nextmovesoftware.com>
>             Uros Bizjak  <ubizjak@gmail.com>
>
> gcc/ChangeLog
>         * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
>         Use new x86_stc instruction when the carry flag must be set.
>         * config/i386/i386.cc (ix86_cc_mode): Use CCCmode for *x86_cmc.
>         (ix86_rtx_costs): Provide accurate rtx_costs for *x86_cmc.
>         * config/i386/i386.h (TARGET_SLOW_STC): New define.
>         * config/i386/i386.md (UNSPEC_CLC): New UNSPEC for clc.
>         (UNSPEC_STC): New UNSPEC for stc.
>         (*x86_clc): New define_insn (with implementation for pentium4).
>         (x86_stc): New define_insn.
>         (define_peephole2): Convert x86_stc into alternate implementation
>         on pentium4 without -Os when a QImode register is available.
>         (*x86_cmc): New define_insn.
>         (define_peephole2): Convert *x86_cmc into alternate implementation
>         on pentium4 without -Os when a QImode register is available.
>         (*setccc): New define_insn_and_split for a no-op CCCmode move.
>         (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
>         recognize (and eliminate) the carry flag being copied to itself.
>         (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
>         * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/cmc-1.c: New test case.
>         * gcc.target/i386/stc-1.c: Likewise.
>
>
> Thanks, Roger.
> --
>
> -----Original Message-----
> From: Uros Bizjak <ubizjak@gmail.com>
> Sent: 04 June 2023 18:53
> To: Roger Sayle <roger@nextmovesoftware.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [x86 PATCH] Add support for stc, clc and cmc instructions 
> in i386.md
>
> On Sun, Jun 4, 2023 at 12:45 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
> >
> >
> > This patch is the latest revision of my patch to add support for the 
> > STC (set carry flag), CLC (clear carry flag) and CMC (complement 
> > carry
> > flag) instructions to the i386 backend, incorporating Uros'
> > previous feedback.  The significant changes are (i) the inclusion of 
> > CMC, (ii) the use of UNSPEC for pattern, (iii) Use of a new 
> > X86_TUNE_SLOW_STC tuning flag to use alternate implementations on
> > pentium4 (which has a notoriously slow STC) when not optimizing for 
> > size.
> >
> > An example of the use of the stc instruction is:
> > unsigned int foo (unsigned int a, unsigned int b, unsigned int *c) {
> >   return __builtin_ia32_addcarryx_u32 (1, a, b, c); }
> >
> > which previously generated:
> >         movl    $1, %eax
> >         addb    $-1, %al
> >         adcl    %esi, %edi
> >         setc    %al
> >         movl    %edi, (%rdx)
> >         movzbl  %al, %eax
> >         ret
> >
> > with this patch now generates:
> >         stc
> >         adcl    %esi, %edi
> >         setc    %al
> >         movl    %edi, (%rdx)
> >         movzbl  %al, %eax
> >         ret
> >
> > An example of the use of the cmc instruction (where the carry from a 
> > first adc is inverted/complemented as input to a second adc) is:
> > unsigned int bar (unsigned int a, unsigned int b,
> >                   unsigned int c, unsigned int d) {
> >   unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
> >   return __builtin_ia32_addcarryx_u32 (c1 ^ 1, c, d, &o2); }
> >
> > which previously generated:
> >         movl    $1, %eax
> >         addb    $-1, %al
> >         adcl    %esi, %edi
> >         setnc   %al
> >         movl    %edi, o1(%rip)
> >         addb    $-1, %al
> >         adcl    %ecx, %edx
> >         setc    %al
> >         movl    %edx, o2(%rip)
> >         movzbl  %al, %eax
> >         ret
> >
> > and now generates:
> >         stc
> >         adcl    %esi, %edi
> >         cmc
> >         movl    %edi, o1(%rip)
> >         adcl    %ecx, %edx
> >         setc    %al
> >         movl    %edx, o2(%rip)
> >         movzbl  %al, %eax
> >         ret
> >
> >
> > This patch has been tested on x86_64-pc-linux-gnu with make 
> > bootstrap and make -k check, both with and without 
> > --target_board=unix{-m32} with no new failures.  Ok for mainline?
> >
> >
> > 2022-06-03  Roger Sayle  <roger@nextmovesoftware.com>
> >
> > gcc/ChangeLog
> >         * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
> >         Use new x86_stc or negqi_ccc_1 instructions to set the carry flag.
> >         * config/i386/i386.h (TARGET_SLOW_STC): New define.
> >         * config/i386/i386.md (UNSPEC_CLC): New UNSPEC for clc.
> >         (UNSPEC_STC): New UNSPEC for stc.
> >         (UNSPEC_CMC): New UNSPEC for cmc.
> >         (*x86_clc): New define_insn.
> >         (*x86_clc_xor): New define_insn for pentium4 without -Os.
> >         (x86_stc): New define_insn.
> >         (define_split): Convert x86_stc into alternate implementation
> >         on pentium4.
> >         (x86_cmc): New define_insn.
> >         (*x86_cmc_1): New define_insn_and_split to recognize cmc pattern.
> >         (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
> >         recognize (and eliminate) the carry flag being copied to itself.
> >         (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
> >         (neg<mode>_ccc_1): Renamed from *neg<mode>_ccc_1 for gen function.
> >         * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.
> >
> > gcc/testsuite/ChangeLog
> >         * gcc.target/i386/cmc-1.c: New test case.
> >         * gcc.target/i386/stc-1.c: Likewise.
>
> +;; Clear carry flag.
> +(define_insn "*x86_clc"
> +  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))]
> +  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
> +  "clc"
> +  [(set_attr "length" "1")
> +   (set_attr "length_immediate" "0")
> +   (set_attr "modrm" "0")])
> +
> +(define_insn "*x86_clc_xor"
> +  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))
> +   (clobber (match_scratch:SI 0 "=r"))]
> +  "TARGET_SLOW_STC && !optimize_function_for_size_p (cfun)"
> +  "xor{l}\t%0, %0"
> +    [(set_attr "type" "alu1")
> +     (set_attr "mode" "SI")
> +     (set_attr "length_immediate" "0")])
>
> I think the above would be better implemented as a peephole2 pattern that triggers when a register is available. We should not waste a register on a register starved x86_32 just to set a carry flag. This is implemented with:
>
>   [(match_scratch:SI 2 "r")
>
> at the beginning of the peephole2 pattern that generates x86_clc_xor.
> The pattern should be constrained with "TARGET_SLOW_STC && !optimize_function_for_size_p()" and x86_clc_xor should be available only after reload (like e.g. "*mov<mode>_xor").
>
> +;; On Pentium 4, set the carry flag using mov $1,%al;neg %al.
> +(define_split
> +  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
> +  "TARGET_SLOW_STC
> +   && !optimize_insn_for_size_p ()
> +   && can_create_pseudo_p ()"
> +  [(set (match_dup 0) (const_int 1))
> +   (parallel
> +     [(set (reg:CCC FLAGS_REG)
> +       (unspec:CCC [(match_dup 0) (const_int 0)] UNSPEC_CC_NE))
> +      (set (match_dup 0) (neg:QI (match_dup 0)))])]
> +  "operands[0] = gen_reg_rtx (QImode);")
>
> Same here, this should be a peephole2 to trigger only when a register is available, again for "TARGET_SLOW_STC && !optimize_function_for_size_p()".
>
> Is the new sequence "mov $1,%al; neg %al" any better than existing "mov $1,%al; addb $-1,%al" or is there another reason for the changed sequence?
>
> +(define_insn_and_split "*x86_cmc_1"
> +  [(set (reg:CCC FLAGS_REG)
> +        (unspec:CCC [(geu:QI (reg:CCC FLAGS_REG) (const_int 0))
> +             (const_int 0)] UNSPEC_CC_NE))]
> +  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
> +  "#"
> +  "&& 1"
> +  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(reg:CCC FLAGS_REG)]
> +UNSPEC_CMC))])
>
> The above should be the RTL model of x86_cmc. No need to split to an unspec.
>
> Uros.
Uros Bizjak June 7, 2023, 6:16 a.m. UTC | #5
On Tue, Jun 6, 2023 at 11:00 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> Hi Uros,
> Might you willing to approve the patch without the *x86_clc pieces?
> These can be submitted later, when they are actually used.  For now,
> we're arguing about the performance of a pattern that's not yet
> generated on an obsolete microarchitecture that's no longer in use,
> and this is holding up real improvements on current processors.
> cmc, for example, should allow for better cmov if-conversion.

Then the patch is OK, the above can be implemented in an incremental patch.

Thanks,
Uros.

>
> Thanks in advance.
> Roger
> --
>
> -----Original Message-----
> From: Uros Bizjak <ubizjak@gmail.com>
> Sent: 06 June 2023 18:34
> To: Roger Sayle <roger@nextmovesoftware.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [x86 PATCH] Add support for stc, clc and cmc instructions in i386.md
>
> On Tue, Jun 6, 2023 at 5:14 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
> >
> >
> > Hi Uros,
> > This revision implements your suggestions/refinements. (i) Avoid the
> > UNSPEC_CMC by using the canonical RTL idiom for *x86_cmc, (ii) Use
> > peephole2s to convert x86_stc and *x86_cmc into alternate forms on
> > TARGET_SLOW_STC CPUs (pentium4), when a suitable QImode register is
> > available, (iii) Prefer the addqi_cconly_overflow idiom (addb $-1,%al)
> > over negqi_ccc_1 (neg %al) for setting the carry from a QImode value,
> > (iv) Use andl %eax,%eax to clear carry flag without requiring
> > (clobbering) an additional register, as an alternate output template for *x86_clc.
>
> Uh, I don't think (iv) is OK. "xor reg,reg" will break the dependency chain, while "and reg,reg" won't. So, you are hurting out-of-order execution by depending on an instruction that calculates previous result in reg. You can use peephole2 trick to allocate an unused reg here, but then using AND is no better than using XOR, and the latter is guaranteed to break dependency chains.
>
> Uros.
>
> > These changes required two minor edits to i386.cc:  ix86_cc_mode had
> > to be tweaked to suggest CCCmode for the new *x86_cmc pattern, and
> > *x86_cmc needed to be handled/parameterized in ix86_rtx_costs so that
> > combine would appreciate that this complex RTL expression was actually
> > a fast, single byte instruction [i.e. preferable].
> >
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > and make -k check, both with and without --target_board=unix{-m32}
> > with no new failures.  Ok for mainline?
> >
> > 2022-06-06  Roger Sayle  <roger@nextmovesoftware.com>
> >             Uros Bizjak  <ubizjak@gmail.com>
> >
> > gcc/ChangeLog
> >         * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
> >         Use new x86_stc instruction when the carry flag must be set.
> >         * config/i386/i386.cc (ix86_cc_mode): Use CCCmode for *x86_cmc.
> >         (ix86_rtx_costs): Provide accurate rtx_costs for *x86_cmc.
> >         * config/i386/i386.h (TARGET_SLOW_STC): New define.
> >         * config/i386/i386.md (UNSPEC_CLC): New UNSPEC for clc.
> >         (UNSPEC_STC): New UNSPEC for stc.
> >         (*x86_clc): New define_insn (with implementation for pentium4).
> >         (x86_stc): New define_insn.
> >         (define_peephole2): Convert x86_stc into alternate implementation
> >         on pentium4 without -Os when a QImode register is available.
> >         (*x86_cmc): New define_insn.
> >         (define_peephole2): Convert *x86_cmc into alternate implementation
> >         on pentium4 without -Os when a QImode register is available.
> >         (*setccc): New define_insn_and_split for a no-op CCCmode move.
> >         (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
> >         recognize (and eliminate) the carry flag being copied to itself.
> >         (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
> >         * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.
> >
> > gcc/testsuite/ChangeLog
> >         * gcc.target/i386/cmc-1.c: New test case.
> >         * gcc.target/i386/stc-1.c: Likewise.
> >
> >
> > Thanks, Roger.
> > --
> >
> > -----Original Message-----
> > From: Uros Bizjak <ubizjak@gmail.com>
> > Sent: 04 June 2023 18:53
> > To: Roger Sayle <roger@nextmovesoftware.com>
> > Cc: gcc-patches@gcc.gnu.org
> > Subject: Re: [x86 PATCH] Add support for stc, clc and cmc instructions
> > in i386.md
> >
> > On Sun, Jun 4, 2023 at 12:45 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
> > >
> > >
> > > This patch is the latest revision of my patch to add support for the
> > > STC (set carry flag), CLC (clear carry flag) and CMC (complement
> > > carry
> > > flag) instructions to the i386 backend, incorporating Uros'
> > > previous feedback.  The significant changes are (i) the inclusion of
> > > CMC, (ii) the use of UNSPEC for pattern, (iii) Use of a new
> > > X86_TUNE_SLOW_STC tuning flag to use alternate implementations on
> > > pentium4 (which has a notoriously slow STC) when not optimizing for
> > > size.
> > >
> > > An example of the use of the stc instruction is:
> > > unsigned int foo (unsigned int a, unsigned int b, unsigned int *c) {
> > >   return __builtin_ia32_addcarryx_u32 (1, a, b, c); }
> > >
> > > which previously generated:
> > >         movl    $1, %eax
> > >         addb    $-1, %al
> > >         adcl    %esi, %edi
> > >         setc    %al
> > >         movl    %edi, (%rdx)
> > >         movzbl  %al, %eax
> > >         ret
> > >
> > > with this patch now generates:
> > >         stc
> > >         adcl    %esi, %edi
> > >         setc    %al
> > >         movl    %edi, (%rdx)
> > >         movzbl  %al, %eax
> > >         ret
> > >
> > > An example of the use of the cmc instruction (where the carry from a
> > > first adc is inverted/complemented as input to a second adc) is:
> > > unsigned int bar (unsigned int a, unsigned int b,
> > >                   unsigned int c, unsigned int d) {
> > >   unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
> > >   return __builtin_ia32_addcarryx_u32 (c1 ^ 1, c, d, &o2); }
> > >
> > > which previously generated:
> > >         movl    $1, %eax
> > >         addb    $-1, %al
> > >         adcl    %esi, %edi
> > >         setnc   %al
> > >         movl    %edi, o1(%rip)
> > >         addb    $-1, %al
> > >         adcl    %ecx, %edx
> > >         setc    %al
> > >         movl    %edx, o2(%rip)
> > >         movzbl  %al, %eax
> > >         ret
> > >
> > > and now generates:
> > >         stc
> > >         adcl    %esi, %edi
> > >         cmc
> > >         movl    %edi, o1(%rip)
> > >         adcl    %ecx, %edx
> > >         setc    %al
> > >         movl    %edx, o2(%rip)
> > >         movzbl  %al, %eax
> > >         ret
> > >
> > >
> > > This patch has been tested on x86_64-pc-linux-gnu with make
> > > bootstrap and make -k check, both with and without
> > > --target_board=unix{-m32} with no new failures.  Ok for mainline?
> > >
> > >
> > > 2022-06-03  Roger Sayle  <roger@nextmovesoftware.com>
> > >
> > > gcc/ChangeLog
> > >         * config/i386/i386-expand.cc (ix86_expand_builtin) <handlecarry>:
> > >         Use new x86_stc or negqi_ccc_1 instructions to set the carry flag.
> > >         * config/i386/i386.h (TARGET_SLOW_STC): New define.
> > >         * config/i386/i386.md (UNSPEC_CLC): New UNSPEC for clc.
> > >         (UNSPEC_STC): New UNSPEC for stc.
> > >         (UNSPEC_CMC): New UNSPEC for cmc.
> > >         (*x86_clc): New define_insn.
> > >         (*x86_clc_xor): New define_insn for pentium4 without -Os.
> > >         (x86_stc): New define_insn.
> > >         (define_split): Convert x86_stc into alternate implementation
> > >         on pentium4.
> > >         (x86_cmc): New define_insn.
> > >         (*x86_cmc_1): New define_insn_and_split to recognize cmc pattern.
> > >         (*setcc_qi_negqi_ccc_1_<mode>): New define_insn_and_split to
> > >         recognize (and eliminate) the carry flag being copied to itself.
> > >         (*setcc_qi_negqi_ccc_2_<mode>): Likewise.
> > >         (neg<mode>_ccc_1): Renamed from *neg<mode>_ccc_1 for gen function.
> > >         * config/i386/x86-tune.def (X86_TUNE_SLOW_STC): New tuning flag.
> > >
> > > gcc/testsuite/ChangeLog
> > >         * gcc.target/i386/cmc-1.c: New test case.
> > >         * gcc.target/i386/stc-1.c: Likewise.
> >
> > +;; Clear carry flag.
> > +(define_insn "*x86_clc"
> > +  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))]
> > +  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
> > +  "clc"
> > +  [(set_attr "length" "1")
> > +   (set_attr "length_immediate" "0")
> > +   (set_attr "modrm" "0")])
> > +
> > +(define_insn "*x86_clc_xor"
> > +  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))
> > +   (clobber (match_scratch:SI 0 "=r"))]
> > +  "TARGET_SLOW_STC && !optimize_function_for_size_p (cfun)"
> > +  "xor{l}\t%0, %0"
> > +    [(set_attr "type" "alu1")
> > +     (set_attr "mode" "SI")
> > +     (set_attr "length_immediate" "0")])
> >
> > I think the above would be better implemented as a peephole2 pattern that triggers when a register is available. We should not waste a register on a register starved x86_32 just to set a carry flag. This is implemented with:
> >
> >   [(match_scratch:SI 2 "r")
> >
> > at the beginning of the peephole2 pattern that generates x86_clc_xor.
> > The pattern should be constrained with "TARGET_SLOW_STC && !optimize_function_for_size_p()" and x86_clc_xor should be available only after reload (like e.g. "*mov<mode>_xor").
> >
> > +;; On Pentium 4, set the carry flag using mov $1,%al;neg %al.
> > +(define_split
> > +  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
> > +  "TARGET_SLOW_STC
> > +   && !optimize_insn_for_size_p ()
> > +   && can_create_pseudo_p ()"
> > +  [(set (match_dup 0) (const_int 1))
> > +   (parallel
> > +     [(set (reg:CCC FLAGS_REG)
> > +       (unspec:CCC [(match_dup 0) (const_int 0)] UNSPEC_CC_NE))
> > +      (set (match_dup 0) (neg:QI (match_dup 0)))])]
> > +  "operands[0] = gen_reg_rtx (QImode);")
> >
> > Same here, this should be a peephole2 to trigger only when a register is available, again for "TARGET_SLOW_STC && !optimize_function_for_size_p()".
> >
> > Is the new sequence "mov $1,%al; neg %al" any better than existing "mov $1,%al; addb $-1,%al" or is there another reason for the changed sequence?
> >
> > +(define_insn_and_split "*x86_cmc_1"
> > +  [(set (reg:CCC FLAGS_REG)
> > +        (unspec:CCC [(geu:QI (reg:CCC FLAGS_REG) (const_int 0))
> > +             (const_int 0)] UNSPEC_CC_NE))]
> > +  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
> > +  "#"
> > +  "&& 1"
> > +  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(reg:CCC FLAGS_REG)]
> > +UNSPEC_CMC))])
> >
> > The above should be the RTL model of x86_cmc. No need to split to an unspec.
> >
> > Uros.
>
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5d21810..9e02fdd 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -13948,8 +13948,6 @@  rdseed_step:
       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
 
       op1 = expand_normal (arg0);
-      if (!integer_zerop (arg0))
-	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
 
       op2 = expand_normal (arg1);
       if (!register_operand (op2, mode0))
@@ -13967,7 +13965,7 @@  rdseed_step:
 	}
 
       op0 = gen_reg_rtx (mode0);
-      if (integer_zerop (arg0))
+      if (op1 == const0_rtx)
 	{
 	  /* If arg0 is 0, optimize right away into add or sub
 	     instruction that sets CCCmode flags.  */
@@ -13977,7 +13975,14 @@  rdseed_step:
       else
 	{
 	  /* Generate CF from input operand.  */
-	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+	  if (!CONST_INT_P (op1))
+	    {
+	      op1 = convert_to_mode (QImode, op1, 1);
+	      op1 = copy_to_mode_reg (QImode, op1);
+	      emit_insn (gen_negqi_ccc_1 (op1, op1));
+	    }
+	  else
+	    emit_insn (gen_x86_stc ());
 
 	  /* Generate instruction that consumes CF.  */
 	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index c7439f8..5ac9c78 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -448,6 +448,7 @@  extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 	ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
 #define TARGET_DEST_FALSE_DEP_FOR_GLC \
 	ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
+#define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e6ebc46..482548c 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -114,6 +114,9 @@ 
   UNSPEC_INSN_FALSE_DEP
   UNSPEC_SBB
   UNSPEC_CC_NE
+  UNSPEC_CLC
+  UNSPEC_STC
+  UNSPEC_CMC
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
@@ -1999,6 +2002,64 @@ 
   [(set_attr "type" "ssecomi")
    (set_attr "prefix" "evex")
    (set_attr "mode" "HF")])
+
+;; Clear carry flag.
+(define_insn "*x86_clc"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))]
+  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
+  "clc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+(define_insn "*x86_clc_xor"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_CLC))
+   (clobber (match_scratch:SI 0 "=r"))]
+  "TARGET_SLOW_STC && !optimize_function_for_size_p (cfun)"
+  "xor{l}\t%0, %0"
+    [(set_attr "type" "alu1")
+     (set_attr "mode" "SI")
+     (set_attr "length_immediate" "0")])
+
+;; Set carry flag.
+(define_insn "x86_stc"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  ""
+  "stc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+;; On Pentium 4, set the carry flag using mov $1,%al;neg %al.
+(define_split
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(const_int 0)] UNSPEC_STC))]
+  "TARGET_SLOW_STC
+   && !optimize_insn_for_size_p ()
+   && can_create_pseudo_p ()"
+  [(set (match_dup 0) (const_int 1))
+   (parallel
+     [(set (reg:CCC FLAGS_REG)
+	   (unspec:CCC [(match_dup 0) (const_int 0)] UNSPEC_CC_NE))
+      (set (match_dup 0) (neg:QI (match_dup 0)))])]
+  "operands[0] = gen_reg_rtx (QImode);")
+
+;; Complement carry flag.
+(define_insn "*x86_cmc"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(reg:CCC FLAGS_REG)] UNSPEC_CMC))]
+  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
+  "cmc"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+(define_insn_and_split "*x86_cmc_1"
+  [(set (reg:CCC FLAGS_REG)
+        (unspec:CCC [(geu:QI (reg:CCC FLAGS_REG) (const_int 0))
+		     (const_int 0)] UNSPEC_CC_NE))]
+  "!TARGET_SLOW_STC || optimize_function_for_size_p (cfun)"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG) (unspec:CCC [(reg:CCC FLAGS_REG)] UNSPEC_CMC))])
 
 ;; Push/pop instructions.
 
@@ -8107,6 +8168,25 @@ 
   "#"
   "&& 1"
   [(const_int 0)])
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setcc_qi_negqi_ccc_1_<mode>"
+  [(set (reg:CCC FLAGS_REG)
+	(ltu:CCC (reg:CC_CCC FLAGS_REG) (const_int 0)))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
+
+;; Set the carry flag from the carry flag.
+(define_insn_and_split "*setcc_qi_negqi_ccc_2_<mode>"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))
+		     (const_int 0)] UNSPEC_CC_NE))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
 
 ;; Overflow setting add instructions
 
@@ -11942,7 +12022,7 @@ 
   [(set_attr "type" "negnot")
    (set_attr "mode" "SI")])
 
-(define_insn "*neg<mode>_ccc_1"
+(define_insn "neg<mode>_ccc_1"
   [(set (reg:CCC FLAGS_REG)
 	(unspec:CCC
 	  [(match_operand:SWI 1 "nonimmediate_operand" "0")
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index e1c72cd..c3229d2 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -698,3 +698,7 @@  DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)
 /* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
    before a transfer of control flow out of the function.  */
 DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)
+
+/* X86_TUNE_SLOW_STC: This disables use of stc, clc and cmc carry flag
+  modifications on architectures where theses operations are slow.  */
+DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)
diff --git a/gcc/testsuite/gcc.target/i386/cmc-1.c b/gcc/testsuite/gcc.target/i386/cmc-1.c
new file mode 100644
index 0000000..58e922a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cmc-1.c
@@ -0,0 +1,28 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned int o1;
+unsigned int o2;
+
+unsigned int foo_xor (unsigned int a, unsigned int b,
+                      unsigned int c, unsigned int d)
+{
+  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
+  return __builtin_ia32_addcarryx_u32 (c1 ^ 1, c, d, &o2);
+}
+
+unsigned int foo_sub (unsigned int a, unsigned int b,
+                      unsigned int c, unsigned int d)
+{
+  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
+  return __builtin_ia32_addcarryx_u32 (1 - c1, c, d, &o2);
+}
+
+unsigned int foo_eqz (unsigned int a, unsigned int b,
+                      unsigned int c, unsigned int d)
+{
+  unsigned int c1 = __builtin_ia32_addcarryx_u32 (1, a, b, &o1);
+  return __builtin_ia32_addcarryx_u32 (c1 == 0, c, d, &o2);
+}
+
+/* { dg-final { scan-assembler "cmc" } } */
diff --git a/gcc/testsuite/gcc.target/i386/stc-1.c b/gcc/testsuite/gcc.target/i386/stc-1.c
new file mode 100644
index 0000000..857c939
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/stc-1.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned int foo (unsigned int a, unsigned int b, unsigned int *c)
+{
+  return __builtin_ia32_addcarryx_u32 (1, a, b, c);
+}
+
+unsigned int bar (unsigned int b, unsigned int *c)
+{
+  return __builtin_ia32_addcarryx_u32 (1, 2, b, c);
+}
+
+unsigned int baz (unsigned int a, unsigned int *c)
+{
+  return __builtin_ia32_addcarryx_u32 (1, a, 3, c);
+}
+
+/* { dg-final { scan-assembler "stc" } } */