diff mbox

[ARM] : Improve handling of DImode comparisons

Message ID 4C2016BC.3030502@codesourcery.com
State New
Headers show

Commit Message

Sandra Loosemore June 22, 2010, 1:49 a.m. UTC
When I was benchmarking the preliminary versions of my just-posted patches to 
add support for NEON DImode bitwise and arithmetic instructions

http://gcc.gnu.org/ml/gcc-patches/2010-06/msg02100.html
http://gcc.gnu.org/ml/gcc-patches/2010-06/msg02101.html

I found that we were getting a performance hit that was traceable to really bad 
code being emitted for DImode comparisons, that caused unnecessary spills, etc. 
    Dan and Julian came up with a patch for this that I've now updated and 
retested against mainline trunk.  (We were using a 4.4-based tree for the 
initial work.)  This patch is actually independent of the two patches listed 
above and not specific to NEON, as it understands the optimal ways to express 
comparisons for all the different ARM instruction set variants.  I'll have to 
defer to Dan if there are technical questions about that part of it.  :-)  My 
contribution is pretty much limited to adapting the patch to deal with the 
bgt/ble/etc guys having gone away since 4.4 -- now the relevant bits happen in 
cbranchdi4 and cstoredi4 instead.

I just retested this version of the patch on arm-none-eabi with a simulator 
target, testing a bunch of compiler option variants including NEON, Thumb-2, 
etc.  (I also tested an earlier version of the patch on arm-none-linux-gnueabi a 
month or so back.)  OK to check in?

-Sandra

2010-06-21  Daniel Jacobowitz  <dan@codesourcery.com>
	    Julian Brown  <julian@codesourcery.com>
	    Sandra Loosemore <sandra@codesourcery.com>

	gcc/
	* config/arm/arm.c (arm_canonicalize_comparison): Canonicalize DImode
	comparisons.  Adjust to take both operands.
	(arm_select_cc_mode): Handle DImode comparisons.
	(arm_gen_compare_reg): Generate a scratch register for DImode
	comparisons which require one.  Use xor for Thumb equality checks.
	(arm_const_double_by_immediates): New.
	(arm_print_operand): Allow 'Q' and 'R' for constants.
	(get_arm_condition_code): Handle new CC_CZmode and CC_NCVmode.
	* config/arm/arm.h (CANONICALIZE_COMPARISON): Always use
	arm_canonicalize_comparison.
	* config/arm/arm-modes.def: Add CC_CZmode and CC_NCVmode.
	* config/arm/arm-protos.h (arm_canonicalize_comparison): Update
	prototype.
	(arm_const_double_by_immediates): Declare.
	* config/arm/constraints.md (Di): New constraint.
	* config/arm/predicates.md (arm_immediate_di_operand)
	(arm_di_operand, cmpdi_operand): New.
	* config/arm/arm.md (cbranchdi4): Handle non-Cirrus also.
	(*arm_cmpdi_insn, *arm_cmpdi_unsigned)
	(*arm_cmpdi_zero, *thumb_cmpdi_zero): New insns.
	(cstoredi4): Handle non-Cirrus also.

	gcc/testsuite/
	* gcc.c-torture/execute/20100416-1.c: New test case.

Comments

Richard Earnshaw June 30, 2010, 5:24 p.m. UTC | #1
On Mon, 2010-06-21 at 21:49 -0400, Sandra Loosemore wrote:
> 2010-06-21  Daniel Jacobowitz  <dan@codesourcery.com>
> 	    Julian Brown  <julian@codesourcery.com>
> 	    Sandra Loosemore <sandra@codesourcery.com>
> 
> 	gcc/
> 	* config/arm/arm.c (arm_canonicalize_comparison): Canonicalize DImode
> 	comparisons.  Adjust to take both operands.
> 	(arm_select_cc_mode): Handle DImode comparisons.
> 	(arm_gen_compare_reg): Generate a scratch register for DImode
> 	comparisons which require one.  Use xor for Thumb equality checks.
> 	(arm_const_double_by_immediates): New.
> 	(arm_print_operand): Allow 'Q' and 'R' for constants.
> 	(get_arm_condition_code): Handle new CC_CZmode and CC_NCVmode.
> 	* config/arm/arm.h (CANONICALIZE_COMPARISON): Always use
> 	arm_canonicalize_comparison.
> 	* config/arm/arm-modes.def: Add CC_CZmode and CC_NCVmode.
> 	* config/arm/arm-protos.h (arm_canonicalize_comparison): Update
> 	prototype.
> 	(arm_const_double_by_immediates): Declare.
> 	* config/arm/constraints.md (Di): New constraint.
> 	* config/arm/predicates.md (arm_immediate_di_operand)
> 	(arm_di_operand, cmpdi_operand): New.
> 	* config/arm/arm.md (cbranchdi4): Handle non-Cirrus also.
> 	(*arm_cmpdi_insn, *arm_cmpdi_unsigned)
> 	(*arm_cmpdi_zero, *thumb_cmpdi_zero): New insns.
> 	(cstoredi4): Handle non-Cirrus also.
> 
> 	gcc/testsuite/
> 	* gcc.c-torture/execute/20100416-1.c: New test case.
> 


OK

R.
diff mbox

Patch

Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 159319)
+++ gcc/config/arm/arm.c	(working copy)
@@ -3207,13 +3207,82 @@  arm_gen_constant (enum rtx_code code, en
    immediate value easier to load.  */
 
 enum rtx_code
-arm_canonicalize_comparison (enum rtx_code code, enum machine_mode mode,
-			     rtx * op1)
+arm_canonicalize_comparison (enum rtx_code code, rtx *op0, rtx *op1)
 {
-  unsigned HOST_WIDE_INT i = INTVAL (*op1);
-  unsigned HOST_WIDE_INT maxval;
+  enum machine_mode mode;
+  unsigned HOST_WIDE_INT i, maxval;
+
+  mode = GET_MODE (*op0);
+  if (mode == VOIDmode)
+    mode = GET_MODE (*op1);
+
   maxval = (((unsigned HOST_WIDE_INT) 1) << (GET_MODE_BITSIZE(mode) - 1)) - 1;
 
+  /* For DImode, we have GE/LT/GEU/LTU comparisons.  In ARM mode
+     we can also use cmp/cmpeq for GTU/LEU.  GT/LE must be either
+     reversed or (for constant OP1) adjusted to GE/LT.  Similarly
+     for GTU/LEU in Thumb mode.  */
+  if (mode == DImode)
+    {
+      rtx tem;
+
+      /* To keep things simple, always use the Cirrus cfcmp64 if it is
+	 available.  */
+      if (TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK)
+	return code;
+
+      if (code == GT || code == LE
+	  || (!TARGET_ARM && (code == GTU || code == LEU)))
+	{
+	  /* Missing comparison.  First try to use an available
+	     comparison.  */
+	  if (GET_CODE (*op1) == CONST_INT)
+	    {
+	      i = INTVAL (*op1);
+	      switch (code)
+		{
+		case GT:
+		case LE:
+		  if (i != maxval
+		      && arm_const_double_by_immediates (GEN_INT (i + 1)))
+		    {
+		      *op1 = GEN_INT (i + 1);
+		      return code == GT ? GE : LT;
+		    }
+		  break;
+		case GTU:
+		case LEU:
+		  if (i != ~((unsigned HOST_WIDE_INT) 0)
+		      && arm_const_double_by_immediates (GEN_INT (i + 1)))
+		    {
+		      *op1 = GEN_INT (i + 1);
+		      return code == GTU ? GEU : LTU;
+		    }
+		  break;
+		default:
+		  gcc_unreachable ();
+		}
+	    }
+
+	  /* If that did not work, reverse the condition.  */
+	  tem = *op0;
+	  *op0 = *op1;
+	  *op1 = tem;
+	  return swap_condition (code);
+	}
+
+      return code;
+    }
+
+  /* Comparisons smaller than DImode.  Only adjust comparisons against
+     an out-of-range constant.  */
+  if (GET_CODE (*op1) != CONST_INT
+      || const_ok_for_arm (INTVAL (*op1))
+      || const_ok_for_arm (- INTVAL (*op1)))
+    return code;
+
+  i = INTVAL (*op1);
+
   switch (code)
     {
     case EQ:
@@ -10058,6 +10127,55 @@  arm_select_cc_mode (enum rtx_code op, rt
       && (rtx_equal_p (XEXP (x, 0), y) || rtx_equal_p (XEXP (x, 1), y)))
     return CC_Cmode;
 
+  if (GET_MODE (x) == DImode || GET_MODE (y) == DImode)
+    {
+      /* To keep things simple, always use the Cirrus cfcmp64 if it is
+	 available.  */
+      if (TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK)
+	return CCmode;
+
+      switch (op)
+	{
+	case EQ:
+	case NE:
+	  /* A DImode comparison against zero can be implemented by
+	     or'ing the two halves together.  */
+	  if (y == const0_rtx)
+	    return CC_Zmode;
+
+	  /* We can do an equality test in three Thumb instructions.  */
+	  if (!TARGET_ARM)
+	    return CC_Zmode;
+
+	  /* FALLTHROUGH */
+
+	case LTU:
+	case LEU:
+	case GTU:
+	case GEU:
+	  /* DImode unsigned comparisons can be implemented by cmp +
+	     cmpeq without a scratch register.  Not worth doing in
+	     Thumb-2.  */
+	  if (TARGET_ARM)
+	    return CC_CZmode;
+
+	  /* FALLTHROUGH */
+
+	case LT:
+	case LE:
+	case GT:
+	case GE:
+	  /* DImode signed and unsigned comparisons can be implemented
+	     by cmp + sbcs with a scratch register, but that does not
+	     set the Z flag - we must reverse GT/LE/GTU/LEU.  */
+	  gcc_assert (op != EQ && op != NE);
+	  return CC_NCVmode;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
+
   return CCmode;
 }
 
@@ -10067,10 +10185,39 @@  arm_select_cc_mode (enum rtx_code op, rt
 rtx
 arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y)
 {
-  enum machine_mode mode = SELECT_CC_MODE (code, x, y);
-  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
+  enum machine_mode mode;
+  rtx cc_reg;
+  int dimode_comparison = GET_MODE (x) == DImode || GET_MODE (y) == DImode;
 
-  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
+  /* We might have X as a constant, Y as a register because of the predicates
+     used for cmpdi.  If so, force X to a register here.  */
+  if (dimode_comparison && !REG_P (x))
+    x = force_reg (DImode, x);
+
+  mode = SELECT_CC_MODE (code, x, y);
+  cc_reg = gen_rtx_REG (mode, CC_REGNUM);
+
+  if (dimode_comparison
+      && !(TARGET_HARD_FLOAT && TARGET_MAVERICK)
+      && mode != CC_CZmode)
+    {
+      rtx clobber, set;
+
+      /* To compare two non-zero values for equality, XOR them and
+	 then compare against zero.  Not used for ARM mode; there
+	 CC_CZmode is cheaper.  */
+      if (mode == CC_Zmode && y != const0_rtx)
+	{
+	  x = expand_binop (DImode, xor_optab, x, y, NULL_RTX, 0, OPTAB_WIDEN);
+	  y = const0_rtx;
+	}
+      /* A scratch register is required.  */
+      clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (SImode));
+      set = gen_rtx_SET (VOIDmode, cc_reg, gen_rtx_COMPARE (mode, x, y));
+      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clobber)));
+    }
+  else
+    emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
 
   return cc_reg;
 }
@@ -11399,6 +11546,34 @@  arm_const_double_by_parts (rtx val)
   return false;
 }
 
+/* Return true if it is possible to inline both the high and low parts
+   of a 64-bit constant into 32-bit data processing instructions.  */
+bool
+arm_const_double_by_immediates (rtx val)
+{
+  enum machine_mode mode = GET_MODE (val);
+  rtx part;
+
+  if (mode == VOIDmode)
+    mode = DImode;
+
+  part = gen_highpart_mode (SImode, mode, val);
+
+  gcc_assert (GET_CODE (part) == CONST_INT);
+
+  if (!const_ok_for_arm (INTVAL (part)))
+    return false;
+
+  part = gen_lowpart (SImode, val);
+
+  gcc_assert (GET_CODE (part) == CONST_INT);
+
+  if (!const_ok_for_arm (INTVAL (part)))
+    return false;
+
+  return true;
+}
+
 /* Scan INSN and note any of its operands that need fixing.
    If DO_PUSHES is false we do not actually push any of the fixups
    needed.  The function returns TRUE if any fixups were needed/pushed.
@@ -15243,8 +15418,18 @@  arm_print_operand (FILE *stream, rtx x, 
 	 the value being loaded is big-wordian or little-wordian.  The
 	 order of the two register loads can matter however, if the address
 	 of the memory location is actually held in one of the registers
-	 being overwritten by the load.  */
+	 being overwritten by the load.
+
+	 The 'Q' and 'R' constraints are also available for 64-bit
+	 constants.  */
     case 'Q':
+      if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE)
+	{
+	  rtx part = gen_lowpart (SImode, x);
+	  fprintf (stream, "#" HOST_WIDE_INT_PRINT_DEC, INTVAL (part));
+	  return;
+	}
+
       if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM)
 	{
 	  output_operand_lossage ("invalid operand for code '%c'", code);
@@ -15255,6 +15440,18 @@  arm_print_operand (FILE *stream, rtx x, 
       return;
 
     case 'R':
+      if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE)
+	{
+	  enum machine_mode mode = GET_MODE (x);
+	  rtx part;
+
+	  if (mode == VOIDmode)
+	    mode = DImode;
+	  part = gen_highpart_mode (SImode, mode, x);
+	  fprintf (stream, "#" HOST_WIDE_INT_PRINT_DEC, INTVAL (part));
+	  return;
+	}
+
       if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM)
 	{
 	  output_operand_lossage ("invalid operand for code '%c'", code);
@@ -15947,6 +16144,28 @@  get_arm_condition_code (rtx comparison)
       default: gcc_unreachable ();
       }
 
+    case CC_CZmode:
+      switch (comp_code)
+	{
+	case NE: return ARM_NE;
+	case EQ: return ARM_EQ;
+	case GEU: return ARM_CS;
+	case GTU: return ARM_HI;
+	case LEU: return ARM_LS;
+	case LTU: return ARM_CC;
+	default: gcc_unreachable ();
+	}
+
+    case CC_NCVmode:
+      switch (comp_code)
+	{
+	case GE: return ARM_GE;
+	case LT: return ARM_LT;
+	case GEU: return ARM_CS;
+	case LTU: return ARM_CC;
+	default: gcc_unreachable ();
+	}
+
     case CCmode:
       switch (comp_code)
 	{
Index: gcc/config/arm/arm.h
===================================================================
--- gcc/config/arm/arm.h	(revision 159319)
+++ gcc/config/arm/arm.h	(working copy)
@@ -2257,19 +2257,7 @@  extern int making_const_table;
    : reverse_condition (code))
 
 #define CANONICALIZE_COMPARISON(CODE, OP0, OP1)				\
-  do									\
-    {									\
-      if (GET_CODE (OP1) == CONST_INT					\
-          && ! (const_ok_for_arm (INTVAL (OP1))				\
-	        || (const_ok_for_arm (- INTVAL (OP1)))))		\
-        {								\
-          rtx const_op = OP1;						\
-          CODE = arm_canonicalize_comparison ((CODE), GET_MODE (OP0),	\
-					      &const_op);		\
-          OP1 = const_op;						\
-        }								\
-    }									\
-  while (0)
+  (CODE) = arm_canonicalize_comparison (CODE, &(OP0), &(OP1))
 
 /* The arm5 clz instruction returns 32.  */
 #define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE)  ((VALUE) = 32, 1)
Index: gcc/config/arm/arm-modes.def
===================================================================
--- gcc/config/arm/arm-modes.def	(revision 159319)
+++ gcc/config/arm/arm-modes.def	(working copy)
@@ -35,10 +35,16 @@  ADJUST_FLOAT_FORMAT (HF, ((arm_fp16_form
    CC_NOOVmode should be used with SImode integer equalities.
    CC_Zmode should be used if only the Z flag is set correctly
    CC_Nmode should be used if only the N (sign) flag is set correctly
+   CC_CZmode should be used if only the C and Z flags are correct
+   (used for DImode unsigned comparisons).
+   CC_NCVmode should be used if only the N, C, and V flags are correct
+   (used for DImode signed comparisons).
    CCmode should be used otherwise.  */
 
 CC_MODE (CC_NOOV);
 CC_MODE (CC_Z);
+CC_MODE (CC_CZ);
+CC_MODE (CC_NCV);
 CC_MODE (CC_SWP);
 CC_MODE (CCFP);
 CC_MODE (CCFPE);
Index: gcc/config/arm/arm-protos.h
===================================================================
--- gcc/config/arm/arm-protos.h	(revision 159319)
+++ gcc/config/arm/arm-protos.h	(working copy)
@@ -50,8 +50,7 @@  extern int arm_hard_regno_mode_ok (unsig
 extern int const_ok_for_arm (HOST_WIDE_INT);
 extern int arm_split_constant (RTX_CODE, enum machine_mode, rtx,
 			       HOST_WIDE_INT, rtx, rtx, int);
-extern RTX_CODE arm_canonicalize_comparison (RTX_CODE, enum machine_mode,
-					     rtx *);
+extern RTX_CODE arm_canonicalize_comparison (RTX_CODE, rtx *, rtx *);
 extern int legitimate_pic_operand_p (rtx);
 extern rtx legitimize_pic_address (rtx, enum machine_mode, rtx);
 extern rtx legitimize_tls_address (rtx, rtx);
@@ -117,6 +116,7 @@  extern void arm_reload_in_hi (rtx *);
 extern void arm_reload_out_hi (rtx *);
 extern int arm_const_double_inline_cost (rtx);
 extern bool arm_const_double_by_parts (rtx);
+extern bool arm_const_double_by_immediates (rtx);
 extern const char *fp_immediate_constant (rtx);
 extern void arm_emit_call_insn (rtx, rtx);
 extern const char *output_call (rtx *);
Index: gcc/config/arm/constraints.md
===================================================================
--- gcc/config/arm/constraints.md	(revision 159319)
+++ gcc/config/arm/constraints.md	(working copy)
@@ -29,7 +29,7 @@ 
 ;; in Thumb-1 state: I, J, K, L, M, N, O
 
 ;; The following multi-letter normal constraints have been used:
-;; in ARM/Thumb-2 state: Da, Db, Dc, Dn, Dl, DL, Dv, Dy
+;; in ARM/Thumb-2 state: Da, Db, Dc, Dn, Dl, DL, Dv, Dy, Di
 ;; in Thumb-1 state: Pa, Pb
 ;; in Thumb-2 state: Ps, Pt
 
@@ -191,6 +191,13 @@ 
       (match_test "TARGET_32BIT && arm_const_double_inline_cost (op) == 4
 		   && !(optimize_size || arm_ld_sched)")))
 
+(define_constraint "Di"
+ "@internal
+  In ARM/Thumb-2 state a const_int or const_double where both the high
+  and low SImode words can be generated as immediates in 32-bit instructions."
+ (and (match_code "const_double,const_int")
+      (match_test "TARGET_32BIT && arm_const_double_by_immediates (op)")))
+
 (define_constraint "Dn"
  "@internal
   In ARM/Thumb-2 state a const_vector which can be loaded with a Neon vmov
Index: gcc/config/arm/predicates.md
===================================================================
--- gcc/config/arm/predicates.md	(revision 159319)
+++ gcc/config/arm/predicates.md	(working copy)
@@ -86,6 +86,12 @@ 
   (and (match_code "const_int")
        (match_test "const_ok_for_arm (INTVAL (op))")))
 
+;; A constant value which fits into two instructions, each taking
+;; an arithmetic constant operand for one of the words.
+(define_predicate "arm_immediate_di_operand"
+  (and (match_code "const_int,const_double")
+       (match_test "arm_const_double_by_immediates (op)")))
+
 (define_predicate "arm_neg_immediate_operand"
   (and (match_code "const_int")
        (match_test "const_ok_for_arm (-INTVAL (op))")))
@@ -119,6 +125,10 @@ 
   (ior (match_operand 0 "arm_rhs_operand")
        (match_operand 0 "arm_not_immediate_operand")))
 
+(define_predicate "arm_di_operand"
+  (ior (match_operand 0 "s_register_operand")
+       (match_operand 0 "arm_immediate_di_operand")))
+
 ;; True if the operand is a memory reference which contains an
 ;; offsettable address.
 (define_predicate "offsettable_memory_operand"
@@ -527,4 +537,12 @@ 
 (define_predicate "neon_lane_number"
   (and (match_code "const_int")
        (match_test "INTVAL (op) >= 0 && INTVAL (op) <= 7")))
+;; Predicates for named expanders that overlap multiple ISAs.
+
+(define_predicate "cmpdi_operand"
+  (if_then_else (match_test "TARGET_HARD_FLOAT && TARGET_MAVERICK")
+		(and (match_test "TARGET_ARM")
+		     (match_operand 0 "cirrus_fp_register"))
+		(and (match_test "TARGET_32BIT")
+		     (match_operand 0 "arm_di_operand"))))
 
Index: gcc/config/arm/arm.md
===================================================================
--- gcc/config/arm/arm.md	(revision 159319)
+++ gcc/config/arm/arm.md	(working copy)
@@ -6703,17 +6703,45 @@ 
 				   operands[3])); DONE;"
 )
 
-;; this uses the Cirrus DI compare instruction
 (define_expand "cbranchdi4"
   [(set (pc) (if_then_else
 	      (match_operator 0 "arm_comparison_operator"
-	       [(match_operand:DI 1 "cirrus_fp_register" "")
-	        (match_operand:DI 2 "cirrus_fp_register" "")])
+	       [(match_operand:DI 1 "cmpdi_operand" "")
+	        (match_operand:DI 2 "cmpdi_operand" "")])
 	      (label_ref (match_operand 3 "" ""))
 	      (pc)))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
-  "emit_jump_insn (gen_cbranch_cc (operands[0], operands[1], operands[2],
-				   operands[3])); DONE;"
+  "TARGET_32BIT"
+  "{
+     rtx swap = NULL_RTX;
+     enum rtx_code code = GET_CODE (operands[0]);
+
+     /* We should not have two constants.  */
+     gcc_assert (GET_MODE (operands[1]) == DImode
+		 || GET_MODE (operands[2]) == DImode);
+
+    /* Flip unimplemented DImode comparisons to a form that
+       arm_gen_compare_reg can handle.  */
+     switch (code)
+     {
+     case GT:
+       swap = gen_rtx_LT (VOIDmode, operands[2], operands[1]); break;
+     case LE:
+       swap = gen_rtx_GE (VOIDmode, operands[2], operands[1]); break;
+     case GTU:
+       swap = gen_rtx_LTU (VOIDmode, operands[2], operands[1]); break;
+     case LEU:
+       swap = gen_rtx_GEU (VOIDmode, operands[2], operands[1]); break;
+     default:
+       break;
+     }
+     if (swap)
+       emit_jump_insn (gen_cbranch_cc (swap, operands[2], operands[1],
+                                       operands[3]));
+     else
+       emit_jump_insn (gen_cbranch_cc (operands[0], operands[1], operands[2],
+				       operands[3]));
+     DONE;
+   }"
 )
 
 (define_insn "cbranchsi4_insn"
@@ -7877,6 +7905,52 @@ 
 				    (const_string "alu_shift_reg")))]
 )
 
+;; DImode comparisons.  The generic code generates branches that
+;; if-conversion can not reduce to a conditional compare, so we do
+;; that directly.
+
+(define_insn "*arm_cmpdi_insn"
+  [(set (reg:CC_NCV CC_REGNUM)
+	(compare:CC_NCV (match_operand:DI 0 "s_register_operand" "r")
+			(match_operand:DI 1 "arm_di_operand"	   "rDi")))
+   (clobber (match_scratch:SI 2 "=r"))]
+  "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK)"
+  "cmp\\t%Q0, %Q1\;sbcs\\t%2, %R0, %R1"
+  [(set_attr "conds" "set")
+   (set_attr "length" "8")]
+)
+
+(define_insn "*arm_cmpdi_unsigned"
+  [(set (reg:CC_CZ CC_REGNUM)
+	(compare:CC_CZ (match_operand:DI 0 "s_register_operand" "r")
+		       (match_operand:DI 1 "arm_di_operand"	"rDi")))]
+  "TARGET_ARM"
+  "cmp%?\\t%R0, %R1\;cmpeq\\t%Q0, %Q1"
+  [(set_attr "conds" "set")
+   (set_attr "length" "8")]
+)
+
+(define_insn "*arm_cmpdi_zero"
+  [(set (reg:CC_Z CC_REGNUM)
+	(compare:CC_Z (match_operand:DI 0 "s_register_operand" "r")
+		      (const_int 0)))
+   (clobber (match_scratch:SI 1 "=r"))]
+  "TARGET_32BIT"
+  "orr%.\\t%1, %Q0, %R0"
+  [(set_attr "conds" "set")]
+)
+
+(define_insn "*thumb_cmpdi_zero"
+  [(set (reg:CC_Z CC_REGNUM)
+	(compare:CC_Z (match_operand:DI 0 "s_register_operand" "l")
+		      (const_int 0)))
+   (clobber (match_scratch:SI 1 "=l"))]
+  "TARGET_THUMB1"
+  "orr\\t%1, %Q0, %R0"
+  [(set_attr "conds" "set")
+   (set_attr "length" "2")]
+)
+
 ;; Cirrus SF compare instruction
 (define_insn "*cirrus_cmpsf"
   [(set (reg:CCFP CC_REGNUM)
@@ -8180,17 +8254,44 @@ 
 			     operands[2], operands[3])); DONE;"
 )
 
-;; this uses the Cirrus DI compare instruction
 (define_expand "cstoredi4"
   [(set (match_operand:SI 0 "s_register_operand" "")
 	(match_operator:SI 1 "arm_comparison_operator"
-	 [(match_operand:DI 2 "cirrus_fp_register" "")
-	  (match_operand:DI 3 "cirrus_fp_register" "")]))]
-  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
-  "emit_insn (gen_cstore_cc (operands[0], operands[1],
-			     operands[2], operands[3])); DONE;"
-)
+	 [(match_operand:DI 2 "cmpdi_operand" "")
+	  (match_operand:DI 3 "cmpdi_operand" "")]))]
+  "TARGET_32BIT"
+  "{
+     rtx swap = NULL_RTX;
+     enum rtx_code code = GET_CODE (operands[1]);
 
+     /* We should not have two constants.  */
+     gcc_assert (GET_MODE (operands[2]) == DImode
+		 || GET_MODE (operands[3]) == DImode);
+
+    /* Flip unimplemented DImode comparisons to a form that
+       arm_gen_compare_reg can handle.  */
+     switch (code)
+     {
+     case GT:
+       swap = gen_rtx_LT (VOIDmode, operands[3], operands[2]); break;
+     case LE:
+       swap = gen_rtx_GE (VOIDmode, operands[3], operands[2]); break;
+     case GTU:
+       swap = gen_rtx_LTU (VOIDmode, operands[3], operands[2]); break;
+     case LEU:
+       swap = gen_rtx_GEU (VOIDmode, operands[3], operands[2]); break;
+     default:
+       break;
+     }
+     if (swap)
+       emit_insn (gen_cstore_cc (operands[0], swap, operands[3],
+		      	         operands[2]));
+     else
+       emit_insn (gen_cstore_cc (operands[0], operands[1], operands[2],
+		      	         operands[3]));
+     DONE;
+   }"
+)
 
 (define_expand "cstoresi_eq0_thumb1"
   [(parallel
Index: gcc/testsuite/gcc.c-torture/execute/20100416-1.c
===================================================================
--- gcc/testsuite/gcc.c-torture/execute/20100416-1.c	(revision 0)
+++ gcc/testsuite/gcc.c-torture/execute/20100416-1.c	(revision 0)
@@ -0,0 +1,40 @@ 
+void abort(void);
+
+int
+movegt(int x, int y, long long a)
+{
+  int i;
+  int ret = 0;
+  for (i = 0; i < y; i++)
+    {
+      if (a >= (long long) 0xf000000000000000LL)
+	ret = x;
+      else
+	ret = y;
+    }
+  return ret;
+}
+
+struct test
+{
+  long long val;
+  int ret;
+} tests[] = {
+  { 0xf000000000000000LL, -1 },
+  { 0xefffffffffffffffLL, 1 },
+  { 0xf000000000000001LL, -1 },
+  { 0x0000000000000000LL, -1 },
+  { 0x8000000000000000LL, 1 },
+};
+
+int
+main()
+{
+  int i;
+  for (i = 0; i < sizeof (tests) / sizeof (tests[0]); i++)
+    {
+      if (movegt (-1, 1, tests[i].val) != tests[i].ret)
+	abort ();
+    }
+  return 0;
+}