Patchwork [MIPS,committed] Use round-robin allocation of floating-point condition codes

login
register
mail settings
Submitter Richard Sandiford
Date Aug. 26, 2012, 7:31 p.m.
Message ID <87ehmt30xf.fsf@talisman.home>
Download mbox | patch
Permalink /patch/180083/
State New
Headers show

Comments

Richard Sandiford - Aug. 26, 2012, 7:31 p.m.
This is the last patch (for today) brought about by the upcoming
gcc.target/mips patch.  It's also the one I'm least happy about.

For MIPS IV and above, we've traditionally created CCmode pseudo registers
for floating-point conditions but defined AVOID_CCMODE_COPIES to try to stop
the optimizers from getting too carried away.  Even then, we might have
to spill registers, either because we've run out or because they're live
across a call.  Moving a value out of a condition-code register and back
requires a tortuous sequence, and any time we use it, we know we're
generating bad code.

Running the paired-single tests at higher optimization levels hit the
same problem for CCV2 and CCV4.  We could take the same approach there,
but the sequences would be even worse.  They would almost certainly
outweigh whatever advantage we get from the vectorisation.

In principle, it would be nice for something like:

   if (float_a == float_b)
     foo;
   bar;
   if (float_a == float_b)
     frob;

to reuse a condition-code register.  But:

  (a) we never want to do that if there's a call inbetween
  (b) AVOID_CCMODE_COPIES makes it unlikely anyway
  (c) if it was really performance critical, we'd hope to thread
      the jumps instead.

The main advantage of using more than one condition-code register is
to allow better scheduling of blocks that contain more than one comparison.
We can get most of that advantage, without the downsides of pseudos,
by using a round-robin allocation of registers at expand time.

This produces better code for some of the mips testsuite tests
that already compile correctly.  It does feel very amateurish though...

Tested on mipsisa64-elf, mips64-elf and mips64-linux-gnu.  Applied.
Will be tested once gcc.target/mips uses gcc-dg.

Richard


gcc/
	* config/mips/mips.h (AVOID_CCMODE_COPIES): Update rationale for
	definition.
	* config/mips/mips.c (machine_function): Add next_fcc.
	(mips_output_move): Remove handling of fcc moves.
	(mips_allocate_fcc): New function.
	(mips_emit_compare, mips_expand_vcondv2sf): Use it.
	(mips_hard_regno_mode_ok_p): Restrict CCmode to ST registers.
	Remove special case for CCmode reloads.
	(mips_expand_builtin_compare_1): Use mips_allocate_fcc and treat
	the result a fixed operand.
	* config/mips/mips.md (move_type): Remove lui_movf.
	(type, length): Remove references to it.
	(movcc, reload_incc, reload_outcc): Delete.

Patch

Index: gcc/config/mips/mips.h
===================================================================
--- gcc/config/mips/mips.h	2012-08-26 20:15:38.585706187 +0100
+++ gcc/config/mips/mips.h	2012-08-26 20:29:05.983682714 +0100
@@ -2395,12 +2395,8 @@  #define Pmode (TARGET_64BIT && TARGET_LO
 #define FUNCTION_MODE SImode
 
 
-
-/* Define if copies to/from condition code registers should be avoided.
-
-   This is needed for the MIPS because reload_outcc is not complete;
-   it needs to handle cases where the source is a general or another
-   condition code register.  */
+/* We allocate $fcc registers by hand and can't cope with moves of
+   CCmode registers to and from pseudos (or memory).  */
 #define AVOID_CCMODE_COPIES
 
 /* A C expression for the cost of a branch instruction.  A value of
Index: gcc/config/mips/mips.c
===================================================================
--- gcc/config/mips/mips.c	2012-08-26 20:22:48.059693700 +0100
+++ gcc/config/mips/mips.c	2012-08-26 20:29:09.350682619 +0100
@@ -321,6 +321,10 @@  struct GTY(())  mips_frame_info {
 };
 
 struct GTY(())  machine_function {
+  /* The next floating-point condition-code register to allocate
+     for ISA_HAS_8CC targets, relative to ST_REG_FIRST.  */
+  unsigned int next_fcc;
+
   /* The register returned by mips16_gp_pseudo_reg; see there for details.  */
   rtx mips16_gp_pseudo_rtx;
 
@@ -4284,9 +4288,6 @@  mips_output_move (rtx dest, rtx src)
 	      retval[4] = COPNUM_AS_CHAR_FROM_REGNUM (REGNO (src));
 	      return dbl_p ? retval : retval + 1;
 	    }
-
-	  if (ST_REG_P (REGNO (src)) && ISA_HAS_8CC)
-	    return "lui\t%0,0x3f80\n\tmovf\t%0,%.,%1";
 	}
 
       if (src_code == MEM)
@@ -4527,6 +4528,63 @@  mips_reversed_fp_cond (enum rtx_code *co
     }
 }
 
+/* Allocate a floating-point condition-code register of mode MODE.
+
+   These condition code registers are used for certain kinds
+   of compound operation, such as compare and branches, vconds,
+   and built-in functions.  At expand time, their use is entirely
+   controlled by MIPS-specific code and is entirely internal
+   to these compound operations.
+
+   We could (and did in the past) expose condition-code values
+   as pseudo registers and leave the register allocator to pick
+   appropriate registers.  The problem is that it is not practically
+   possible for the rtl optimizers to guarantee that no spills will
+   be needed, even when AVOID_CCMODE_COPIES is defined.  We would
+   therefore need spill and reload sequences to handle the worst case.
+
+   Although such sequences do exist, they are very expensive and are
+   not something we'd want to use.  This is especially true of CCV2 and
+   CCV4, where all the shuffling would greatly outweigh whatever benefit
+   the vectorization itself provides.
+
+   The main benefit of having more than one condition-code register
+   is to allow the pipelining of operations, especially those involving
+   comparisons and conditional moves.  We don't really expect the
+   registers to be live for long periods, and certainly never want
+   them to be live across calls.
+
+   Also, there should be no penalty attached to using all the available
+   registers.  They are simply bits in the same underlying FPU control
+   register.
+
+   We therefore expose the hardware registers from the outset and use
+   a simple round-robin allocation scheme.  */
+
+static rtx
+mips_allocate_fcc (enum machine_mode mode)
+{
+  unsigned int regno, count;
+
+  gcc_assert (TARGET_HARD_FLOAT && ISA_HAS_8CC);
+
+  if (mode == CCmode)
+    count = 1;
+  else if (mode == CCV2mode)
+    count = 2;
+  else if (mode == CCV4mode)
+    count = 4;
+  else
+    gcc_unreachable ();
+
+  cfun->machine->next_fcc += -cfun->machine->next_fcc & (count - 1);
+  if (cfun->machine->next_fcc > ST_REG_LAST - ST_REG_FIRST)
+    cfun->machine->next_fcc = 0;
+  regno = ST_REG_FIRST + cfun->machine->next_fcc;
+  cfun->machine->next_fcc += count;
+  return gen_rtx_REG (mode, regno);
+}
+
 /* Convert a comparison into something that can be used in a branch or
    conditional move.  On entry, *OP0 and *OP1 are the values being
    compared and *CODE is the code used to compare them.
@@ -4590,7 +4648,7 @@  mips_emit_compare (enum rtx_code *code,
       cmp_code = *code;
       *code = mips_reversed_fp_cond (&cmp_code) ? EQ : NE;
       *op0 = (ISA_HAS_8CC
-	      ? gen_reg_rtx (CCmode)
+	      ? mips_allocate_fcc (CCmode)
 	      : gen_rtx_REG (CCmode, FPSW_REGNUM));
       *op1 = const0_rtx;
       mips_emit_binary (cmp_code, *op0, cmp_op0, cmp_op1);
@@ -4657,7 +4715,7 @@  mips_expand_vcondv2sf (rtx dest, rtx tru
   bool reversed_p;
 
   reversed_p = mips_reversed_fp_cond (&cond);
-  cmp_result = gen_reg_rtx (CCV2mode);
+  cmp_result = mips_allocate_fcc (CCV2mode);
   emit_insn (gen_scc_ps (cmp_result,
 			 gen_rtx_fmt_ee (cond, VOIDmode, cmp_op0, cmp_op1)));
   if (reversed_p)
@@ -11098,14 +11156,7 @@  mips_hard_regno_mode_ok_p (unsigned int
 	    && (regno - ST_REG_FIRST) % 4 == 0);
 
   if (mode == CCmode)
-    {
-      if (!ISA_HAS_8CC)
-	return regno == FPSW_REGNUM;
-
-      return (ST_REG_P (regno)
-	      || GP_REG_P (regno)
-	      || FP_REG_P (regno));
-    }
+    return ISA_HAS_8CC ? ST_REG_P (regno) : regno == FPSW_REGNUM;
 
   size = GET_MODE_SIZE (mode);
   mclass = GET_MODE_CLASS (mode);
@@ -11117,10 +11168,6 @@  mips_hard_regno_mode_ok_p (unsigned int
       && (((regno - FP_REG_FIRST) % MAX_FPRS_PER_FMT) == 0
 	  || (MIN_FPRS_PER_FMT == 1 && size <= UNITS_PER_FPREG)))
     {
-      /* Allow TFmode for CCmode reloads.  */
-      if (mode == TFmode && ISA_HAS_8CC)
-	return true;
-
       /* Allow 64-bit vector modes for Loongson-2E/2F.  */
       if (TARGET_LOONGSON_VECTORS
 	  && (mode == V2SImode
@@ -13738,15 +13785,16 @@  mips_expand_builtin_compare_1 (enum insn
 			       tree exp, int nargs)
 {
   struct expand_operand ops[MAX_RECOG_OPERANDS];
+  rtx output;
   int opno, argno;
 
   /* The instruction should have a target operand, an operand for each
      argument, and an operand for COND.  */
   gcc_assert (nargs + 2 == insn_data[(int) icode].n_generator_args);
 
+  output = mips_allocate_fcc (insn_data[(int) icode].operand[0].mode);
   opno = 0;
-  create_output_operand (&ops[opno++], NULL_RTX,
-			 insn_data[(int) icode].operand[0].mode);
+  create_fixed_operand (&ops[opno++], output);
   for (argno = 0; argno < nargs; argno++)
     mips_prepare_builtin_arg (&ops[opno++], exp, argno);
   create_integer_operand (&ops[opno++], (int) cond);
Index: gcc/config/mips/mips.md
===================================================================
--- gcc/config/mips/mips.md	2012-08-26 20:14:26.327708290 +0100
+++ gcc/config/mips/mips.md	2012-08-26 20:29:05.988682713 +0100
@@ -198,7 +198,6 @@  (define_attr "jal_macro" "no,yes"
 ;; loadpool	move a constant into a MIPS16 register by loading it
 ;;		from the pool
 ;; shift_shift	a shift left followed by a shift right
-;; lui_movf	an LUI followed by a MOVF (for d<-z CC moves)
 ;;
 ;; This attribute is used to determine the instruction's length and
 ;; scheduling type.  For doubleword moves, the attribute always describes
@@ -207,7 +206,7 @@  (define_attr "jal_macro" "no,yes"
 (define_attr "move_type"
   "unknown,load,fpload,store,fpstore,mtc,mfc,mtlo,mflo,move,fmove,
    const,constN,signext,ext_ins,logical,arith,sll0,andi,loadpool,
-   shift_shift,lui_movf"
+   shift_shift"
   (const_string "unknown"))
 
 (define_attr "alu_type" "unknown,add,sub,not,nor,and,or,xor"
@@ -390,8 +389,6 @@  (define_attr "type"
 	 (eq_attr "move_type" "move") (const_string "move")
 	 (eq_attr "move_type" "const") (const_string "const")
 	 (eq_attr "sync_mem" "!none") (const_string "syncloop")]
-	;; We classify "lui_movf" as "unknown" rather than "multi"
-	;; because we don't split it.  FIXME: we should split instead.
 	(const_string "unknown")))
 
 ;; Mode for conversion types (fcvt)
@@ -578,10 +575,6 @@  (define_attr "length" ""
 	  (eq_attr "move_type" "loadpool")
 	  (const_int 8)
 
-	  ;; LUI_MOVFs are decomposed into two separate instructions.
-	  (eq_attr "move_type" "lui_movf")
-	  (const_int 8)
-
 	  ;; SHIFT_SHIFTs are decomposed into two separate instructions.
 	  ;; They are extended instructions on MIPS16 targets.
 	  (eq_attr "move_type" "shift_shift")
@@ -4436,53 +4429,6 @@  (define_split
   operands[2] = GEN_INT (val - 0xff);
 })
 
-;; This insn handles moving CCmode values.  It's really just a
-;; slightly simplified copy of movsi_internal2, with additional cases
-;; to move a condition register to a general register and to move
-;; between the general registers and the floating point registers.
-
-(define_insn "movcc"
-  [(set (match_operand:CC 0 "nonimmediate_operand" "=d,*d,*d,*m,*d,*f,*f,*f,*m")
-	(match_operand:CC 1 "general_operand" "z,*d,*m,*d,*f,*d,*f,*m,*f"))]
-  "ISA_HAS_8CC && TARGET_HARD_FLOAT"
-  { return mips_output_move (operands[0], operands[1]); }
-  [(set_attr "move_type" "lui_movf,move,load,store,mfc,mtc,fmove,fpload,fpstore")
-   (set_attr "mode" "SI")])
-
-;; Reload condition code registers.  reload_incc and reload_outcc
-;; both handle moves from arbitrary operands into condition code
-;; registers.  reload_incc handles the more common case in which
-;; a source operand is constrained to be in a condition-code
-;; register, but has not been allocated to one.
-;;
-;; Sometimes, such as in movcc, we have a CCmode destination whose
-;; constraints do not include 'z'.  reload_outcc handles the case
-;; when such an operand is allocated to a condition-code register.
-;;
-;; Note that reloads from a condition code register to some
-;; other location can be done using ordinary moves.  Moving
-;; into a GPR takes a single movcc, moving elsewhere takes
-;; two.  We can leave these cases to the generic reload code.
-(define_expand "reload_incc"
-  [(set (match_operand:CC 0 "fcc_reload_operand" "=z")
-	(match_operand:CC 1 "general_operand" ""))
-   (clobber (match_operand:TF 2 "register_operand" "=&f"))]
-  "ISA_HAS_8CC && TARGET_HARD_FLOAT"
-{
-  mips_expand_fcc_reload (operands[0], operands[1], operands[2]);
-  DONE;
-})
-
-(define_expand "reload_outcc"
-  [(set (match_operand:CC 0 "fcc_reload_operand" "=z")
-	(match_operand:CC 1 "register_operand" ""))
-   (clobber (match_operand:TF 2 "register_operand" "=&f"))]
-  "ISA_HAS_8CC && TARGET_HARD_FLOAT"
-{
-  mips_expand_fcc_reload (operands[0], operands[1], operands[2]);
-  DONE;
-})
-
 ;; MIPS4 supports loading and storing a floating point register from
 ;; the sum of two general registers.  We use two versions for each of
 ;; these four instructions: one where the two general registers are