[committed] Improve code generation for pdp11 target

Message ID 3B5A5000-9ABB-420D-94B2-B258FFA739EA@comcast.net
State New
Headers show
Series
  • [committed] Improve code generation for pdp11 target
Related show

Commit Message

Paul Koning July 9, 2018, 2:02 p.m.
This patch improves the generated code for the pdp11 target.

Committed.

	paul

ChangeLog:

2018-07-09  Paul Koning  <ni1d@arrl.net>

	* config/pdp11/pdp11.c (pdp11_addr_cost): New function.
	(pdp11_insn_cost): New function.
	(pdp11_md_asm_adjust): New function.
	(TARGET_INVALID_WITHIN_DOLOOP): Define.
	(pdp11_rtx_costs): Update to match machine better.
	(output_addr_const_pdp11): Correct format mismatch warnings.
	* config/pdp11/pdp11.h (SLOW_BYTE_ACCESS): Correct definition.
	* config/pdp11/pdp11.md: General change to add base_cost and/or
	length attributes for use by new pdp11_insn_cost function.
	(MIN_BRANCH): Correct definition.
	(MIN_SOB): Ditto.
	(doloop_end): Use standard pattern name for looping pattern.
	(doloop_end_nocc): New.
	(movsf): Add another constraint alternative.
	(zero_extendqihi2): Add constraint alternatives for not in place
	extend.
	(zero_extendhisi2): Remove.
	(shift patterns): Add CC handling variants.
	(bswaphi2): New.
	(bswapsi2): New.
	(rothi3): New.
	(define_peephole2): New peephole to recognize mov that sets CC for
	subsequent test.

Patch

Index: config/pdp11/pdp11.c
===================================================================
--- config/pdp11/pdp11.c	(revision 262518)
+++ config/pdp11/pdp11.c	(working copy)
@@ -33,6 +33,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "memmodel.h"
 #include "tm_p.h"
 #include "insn-config.h"
+#include "insn-attr.h"
 #include "regs.h"
 #include "emit-rtl.h"
 #include "recog.h"
@@ -150,6 +151,11 @@  decode_pdp11_d (const struct real_format *fmt ATTR
 static const char *singlemove_string (rtx *);
 static bool pdp11_assemble_integer (rtx, unsigned int, int);
 static bool pdp11_rtx_costs (rtx, machine_mode, int, int, int *, bool);
+static int pdp11_addr_cost (rtx, machine_mode, addr_space_t, bool);
+static int pdp11_insn_cost (rtx_insn *insn, bool speed);
+static rtx_insn *pdp11_md_asm_adjust (vec<rtx> &, vec<rtx> &,
+				      vec<const char *> &,
+				      vec<rtx> &, HARD_REG_SET &);
 static bool pdp11_return_in_memory (const_tree, const_tree);
 static rtx pdp11_function_value (const_tree, const_tree, bool);
 static rtx pdp11_libcall_value (machine_mode, const_rtx);
@@ -174,6 +180,8 @@  static bool pdp11_scalar_mode_supported_p (scalar_
 #undef TARGET_ASM_INTEGER
 #define TARGET_ASM_INTEGER pdp11_assemble_integer
 
+/* These two apply to Unix and GNU assembler; for DEC, they are
+   overridden during option processing.  */
 #undef TARGET_ASM_OPEN_PAREN
 #define TARGET_ASM_OPEN_PAREN "["
 #undef TARGET_ASM_CLOSE_PAREN
@@ -182,6 +190,15 @@  static bool pdp11_scalar_mode_supported_p (scalar_
 #undef TARGET_RTX_COSTS
 #define TARGET_RTX_COSTS pdp11_rtx_costs
 
+#undef  TARGET_ADDRESS_COST
+#define TARGET_ADDRESS_COST pdp11_addr_cost
+
+#undef  TARGET_INSN_COST
+#define TARGET_INSN_COST pdp11_insn_cost
+
+#undef  TARGET_MD_ASM_ADJUST
+#define TARGET_MD_ASM_ADJUST pdp11_md_asm_adjust
+
 #undef TARGET_FUNCTION_ARG
 #define TARGET_FUNCTION_ARG pdp11_function_arg
 #undef TARGET_FUNCTION_ARG_ADVANCE
@@ -271,6 +288,9 @@  static bool pdp11_scalar_mode_supported_p (scalar_
 
 #undef  TARGET_CAN_CHANGE_MODE_CLASS
 #define TARGET_CAN_CHANGE_MODE_CLASS pdp11_can_change_mode_class
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP hook_constcharptr_const_rtx_insn_null
 
 /* A helper function to determine if REGNO should be saved in the
    current function's stack frame.  */
@@ -968,12 +988,8 @@  pdp11_assemble_integer (rtx x, unsigned int size,
 }
 
 
-/* Register to register moves are cheap if both are general registers.
-   The same is true for FPU, but there we return cost of 3 rather than
-   2 to make reload look at the constraints.  The raeson is that
-   load/store double require extra care since load touches condition
-   codes and store doesn't, which is (partly anyway) described by
-   constraints.  */
+/* Register to register moves are cheap if both are general
+   registers.  */
 static int 
 pdp11_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
 			  reg_class_t c1, reg_class_t c2)
@@ -983,151 +999,270 @@  pdp11_register_move_cost (machine_mode mode ATTRIB
     return 2;
   else if ((c1 >= LOAD_FPU_REGS && c1 <= FPU_REGS && c2 == LOAD_FPU_REGS) ||
 	   (c2 >= LOAD_FPU_REGS && c2 <= FPU_REGS && c1 == LOAD_FPU_REGS))
-    return 3;
+    return 2;
   else
     return 22;
 }
 
-
+/* This tries to approximate what pdp11_insn_cost would do, but
+   without visibility into the actual instruction being generated it's
+   inevitably a rough approximation.  */
 static bool
-pdp11_rtx_costs (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED,
-		 int opno ATTRIBUTE_UNUSED, int *total,
-		 bool speed ATTRIBUTE_UNUSED)
+pdp11_rtx_costs (rtx x, machine_mode mode, int outer_code,
+		 int opno ATTRIBUTE_UNUSED, int *total, bool speed)
 {
-  int code = GET_CODE (x);
-
+  const int code = GET_CODE (x);
+  const int asize = (mode == QImode) ? 2 : GET_MODE_SIZE (mode);
+  rtx src, dest;
+  
   switch (code)
     {
     case CONST_INT:
-      if (INTVAL (x) == 0 || INTVAL (x) == -1 || INTVAL (x) == 1)
+      /* Treat -1, 0, 1 as things that are optimized as clr or dec
+	 etc. though that doesn't apply to every case.  */
+      if (INTVAL (x) >= -1 && INTVAL (x) <= 1)
 	{
 	  *total = 0;
 	  return true;
 	}
-      /* FALLTHRU */
-
+      /* FALL THROUGH.  */
+    case REG:
+    case MEM:
     case CONST:
     case LABEL_REF:
     case SYMBOL_REF:
-      /* Twice as expensive as REG.  */
-      *total = 2;
-      return true;
-
     case CONST_DOUBLE:
-      /* Twice (or 4 times) as expensive as 16 bit.  */
-      *total = 4;
+      *total = pdp11_addr_cost (x, mode, ADDR_SPACE_GENERIC, speed);
       return true;
+    }
 
-    case MULT:
-      /* ??? There is something wrong in MULT because MULT is not 
-         as cheap as total = 2 even if we can shift!  */
-      /* If optimizing for size make mult etc cheap, but not 1, so when 
-         in doubt the faster insn is chosen.  */
-      if (optimize_size)
-        *total = COSTS_N_INSNS (2);
-      else
-        *total = COSTS_N_INSNS (11);
-      return false;
+  /* Pick up source and dest.  We don't necessarily use the standard
+     recursion in rtx_costs to figure the cost, because that would
+     count the destination operand twice for three-operand insns.
+     Also, this way we can catch special cases like move of zero, or
+     add one.  */
+  if (GET_RTX_LENGTH (code) > 1)
+    src = XEXP (x, 1);
+  dest = XEXP (x, 0);
+      
+  /* If optimizing for size, claim everything costs 2 per word, plus
+     whatever the operands require.  */
+  if (!speed)
+    *total = asize;
+  else
+    {
+      if (FLOAT_MODE_P (mode))
+	{
+	  switch (code)
+	    {
+	    case MULT:
+	    case DIV:
+	    case MOD:
+	      *total = 20;
+	      break;
 
-    case DIV:
-      if (optimize_size)
-        *total = COSTS_N_INSNS (2);
-      else
-        *total = COSTS_N_INSNS (25);
-      return false;
+	    case COMPARE:
+	      *total = 4;
+	      break;
 
-    case MOD:
-      if (optimize_size)
-        *total = COSTS_N_INSNS (2);
+	    case PLUS:
+	    case MINUS:
+	      *total = 6;
+	      break;
+
+	    default:
+	      *total = 2;
+	      break;
+	    }
+	}
       else
-        *total = COSTS_N_INSNS (26);
-      return false;
+	{
+	  /* Integer operations are scaled for SI and DI modes, though the
+	     scaling is not exactly accurate.  */
+	  switch (code)
+	    {
+	    case MULT:
+	      *total = 5 * asize * asize;
+	      break;
 
-    case ABS:
-      /* Equivalent to length, so same for optimize_size.  */
-      *total = COSTS_N_INSNS (3);
-      return false;
+	    case DIV:
+	      *total = 10 * asize * asize;
+	      break;
+	  
+	    case MOD:
+	      /* Costs nothing extra: it is accounted for under DIV, since
+		 we use a divmod pattern.  Store the zero through TOTAL.  */
+	      *total = 0;
+	      break;
 
-    case ZERO_EXTEND:
-      /* Only used for qi->hi.  */
-      *total = COSTS_N_INSNS (1);
-      return false;
+	    case ASHIFT:
+	    case ASHIFTRT:
+	    case LSHIFTRT:
+	      /* This is a bit problematic because the cost depends on the
+		 shift amount.  Make it <asize> for now, which is for the
+		 case of a one bit shift.  */
+	      *total = asize;
+	      break;
+	  
+	    default:
+	      *total = asize;
+	      break;
+	    }
+	}
+    }
+  
+  /* Now see if we're looking at a SET.  If yes, then look at the
+     source to see if this is a move or an arithmetic operation, and
+     continue accordingly to handle the operands.  */
+  if (code == SET)
+    {
+      switch (GET_CODE (src))
+	{
+	case REG:
+	case MEM:
+	case CONST_INT:
+	case CONST:
+	case LABEL_REF:
+	case SYMBOL_REF:
+	case CONST_DOUBLE:
+	  /* It's a move.  */
+	  *total += pdp11_addr_cost (dest, mode, ADDR_SPACE_GENERIC, speed);
+	  if (src != const0_rtx)
+	    *total += pdp11_addr_cost (src, mode, ADDR_SPACE_GENERIC, speed);
+	  return true;
+	default:
+	  /* Not a move.  Get the cost of the source operand and add
+	     that in, but not the destination operand since we're
+	     dealing with read/modify/write operands.  */
+	  *total += rtx_cost (src, mode, (enum rtx_code) outer_code, 1, speed);
+	  return true;
+	}
+    }
+  else if (code == PLUS || code == MINUS)
+    {
+      if (GET_CODE (src) == CONST_INT &&
+	  (INTVAL (src) == 1 || INTVAL (src) == -1))
+	{
+	  *total += rtx_cost (dest, mode, (enum rtx_code) outer_code, 0, speed);
+	  return true;
+	}
+    }
+  return false;
+}
 
-    case SIGN_EXTEND:
-      if (mode == HImode)
-      	*total = COSTS_N_INSNS (1);
-      else if (mode == SImode)
-	*total = COSTS_N_INSNS (6);
-      else
-	*total = COSTS_N_INSNS (2);
-      return false;
+/* Return cost of accessing the supplied operand.  Registers are free.
+   Anything else that is not a simple memory operand starts with a
+   cost of two.  When optimizing for speed, add to that the size of
+   the data reference (two for QImode) for every operand that is not
+   a constant.  */
+static int
+pdp11_addr_cost (rtx addr, machine_mode mode, addr_space_t as ATTRIBUTE_UNUSED,
+		 bool speed)
+{
+  int cost = 0;
+  
+  if (GET_CODE (addr) != REG)
+    {
+      if (!simple_memory_operand (addr, mode))
+	cost = 2;
 
-    case ASHIFT:
-    case ASHIFTRT:
-      if (optimize_size)
-        *total = COSTS_N_INSNS (1);
-      else if (mode ==  QImode)
-        {
-          if (GET_CODE (XEXP (x, 1)) != CONST_INT)
-   	    *total = COSTS_N_INSNS (8); /* worst case */
-          else
-	    *total = COSTS_N_INSNS (INTVAL (XEXP (x, 1)));
-        }
-      else if (mode == HImode)
-        {
-          if (GET_CODE (XEXP (x, 1)) == CONST_INT)
-            {
-	      if (abs (INTVAL (XEXP (x, 1))) == 1)
-                *total = COSTS_N_INSNS (1);
-              else
-	        *total = COSTS_N_INSNS (2.5 + 0.5 * INTVAL (XEXP (x, 1)));
-            }
-          else
-            *total = COSTS_N_INSNS (10); /* worst case */
-        }
-      else if (mode == SImode)
-        {
-          if (GET_CODE (XEXP (x, 1)) == CONST_INT)
-	    *total = COSTS_N_INSNS (2.5 + 0.5 * INTVAL (XEXP (x, 1)));
-          else /* worst case */
-            *total = COSTS_N_INSNS (18);
-        }
-      return false;
+      /* If optimizing for speed, account for the memory reference if
+	 any.  */
+      if (speed && !CONSTANT_P (addr))
+	cost += (mode == QImode) ? 2 : GET_MODE_SIZE (mode);
+    }
+  return cost;
+}
 
-    case LSHIFTRT:
-      if (optimize_size)
-        *total = COSTS_N_INSNS (2);
-      else if (mode ==  QImode)
-        {
-          if (GET_CODE (XEXP (x, 1)) != CONST_INT)
-   	    *total = COSTS_N_INSNS (12); /* worst case */
-          else
-	    *total = COSTS_N_INSNS (1 + INTVAL (XEXP (x, 1)));
-        }
-      else if (mode == HImode)
-        {
-          if (GET_CODE (XEXP (x, 1)) == CONST_INT)
-            {
-	      if (abs (INTVAL (XEXP (x, 1))) == 1)
-                *total = COSTS_N_INSNS (2);
-              else
-	        *total = COSTS_N_INSNS (3.5 + 0.5 * INTVAL (XEXP (x, 1)));
-            }
-          else
-            *total = COSTS_N_INSNS (12); /* worst case */
-        }
-      else if (mode == SImode)
-        {
-          if (GET_CODE (XEXP (x, 1)) == CONST_INT)
-	    *total = COSTS_N_INSNS (3.5 + 0.5 * INTVAL (XEXP (x, 1)));
-          else /* worst case */
-            *total = COSTS_N_INSNS (20);
-        }
-      return false;
 
+static int
+pdp11_insn_cost (rtx_insn *insn, bool speed)
+{
+  int base_cost, i;
+  rtx pat, set, dest, src, src2;
+  machine_mode mode;
+  const char *fmt;
+  enum rtx_code op;
+  
+  if (recog_memoized (insn) < 0)
+    return 0;
+
+  /* If optimizing for size, we want the insn size.  */
+  if (!speed)
+    return get_attr_length (insn);
+  else
+    {
+      /* Optimizing for speed.  Get the base cost of the insn, then
+	 adjust for the cost of accessing operands.  Zero means use
+	 the length as the cost even when optimizing for speed.  */
+      base_cost = get_attr_base_cost (insn);
+      if (base_cost <= 0)
+	base_cost = get_attr_length (insn);
+    }
+  /* Look for the operands.  Often we have a PARALLEL that's either
+     the actual operation plus a clobber, or the implicit compare plus
+     the actual operation.  Find the actual operation.  */
+  pat = PATTERN (insn);
+  
+  if (GET_CODE (pat) == PARALLEL)
+    {
+      set = XVECEXP (pat, 0, 0);
+      if (GET_CODE (set) != SET || GET_CODE (XEXP (set, 1)) == COMPARE)
+	set = XVECEXP (pat, 0, 1);
+      if (GET_CODE (set) != SET || GET_CODE (XEXP (set, 1)) == COMPARE)
+	return 0;
+    }
+  else
+    {
+      set = pat;
+      if (GET_CODE (set) != SET)
+	return 0;
+    }
+  
+  /* Pick up the SET source and destination RTL.  */
+  dest = XEXP (set, 0);
+  src = XEXP (set, 1);
+  mode = GET_MODE (dest);
+
+  /* See if we have a move, or some arithmetic operation.  If a move,
+     account for source and destination operand costs.  Otherwise,
+     account for the destination and for the second operand of the
+     operation -- the first is also destination and we don't want to
+     double-count it.  */
+  base_cost += pdp11_addr_cost (dest, mode, ADDR_SPACE_GENERIC, speed);
+  op = GET_CODE (src);
+  switch (op)
+    {
+    case REG:
+    case MEM:
+    case CONST_INT:
+    case CONST:
+    case LABEL_REF:
+    case SYMBOL_REF:
+    case CONST_DOUBLE:
+      /* It's a move.  */
+      if (src != const0_rtx)
+	base_cost += pdp11_addr_cost (src, mode, ADDR_SPACE_GENERIC, speed);
+      return base_cost;
     default:
-      return false;
+      break;
     }
+  /* There are some other cases where source and dest are distinct.  */
+  if (FLOAT_MODE_P (mode) &&
+      (op == FLOAT_TRUNCATE || op == FLOAT_EXTEND || op == FIX || op == FLOAT))
+    {
+      src2 = XEXP (src, 0);
+      base_cost += pdp11_addr_cost (src2, mode, ADDR_SPACE_GENERIC, speed);
+    }
+  /* Otherwise, pick up the second operand of the arithmetic
+     operation, if it has two operands.  */
+  else if (op != SUBREG && op != UNSPEC && GET_RTX_LENGTH (op) > 1)
+    {
+      src2 = XEXP (src, 1);
+      base_cost += pdp11_addr_cost (src2, mode, ADDR_SPACE_GENERIC, speed);
+    }
+  
+  return base_cost;
 }
 
 const char *
@@ -1192,7 +1327,7 @@  output_jump (rtx *operands, int ccnz, int length)
    zero, given the compare operation code in op and the compare
    operands in x in and y.  */
 machine_mode
-pdp11_cc_mode (enum rtx_code op, rtx x, rtx y)
+pdp11_cc_mode (enum rtx_code op ATTRIBUTE_UNUSED, rtx x, rtx y ATTRIBUTE_UNUSED)
 {
   if (FLOAT_MODE_P (GET_MODE (x)))
     {
@@ -1863,11 +1998,10 @@  output_addr_const_pdp11 (FILE *file, rtx x)
       if (GET_MODE (x) == VOIDmode)
 	{
 	  /* We can use %o if the number is one word and positive.  */
-	  gcc_assert (!CONST_DOUBLE_HIGH (x));
 	  if (TARGET_DEC_ASM)
-	    fprintf (file, "%ho", CONST_DOUBLE_LOW (x) & 0xffff);
+	    fprintf (file, "%o", (int) CONST_DOUBLE_LOW (x) & 0xffff);
 	  else
-	    fprintf (file, "%#ho", CONST_DOUBLE_LOW (x) & 0xffff);
+	    fprintf (file, "%#o", (int) CONST_DOUBLE_LOW (x) & 0xffff);
 	}
       else
 	/* We can't handle floating point constants;
@@ -2138,6 +2272,26 @@  pdp11_shift_length (rtx *operands, machine_mode m,
   return shift_size;
 }
 
+/* Add to CLOBBERS the hard registers that are automatically clobbered
+   by an asm.  We do this for CC_REGNUM and FCC_REGNUM (on FPU targets)
+   to maintain source compatibility with the original cc0-based
+   compiler.  */
+
+static rtx_insn *
+pdp11_md_asm_adjust (vec<rtx> &/*outputs*/, vec<rtx> &/*inputs*/,
+		     vec<const char *> &/*constraints*/,
+		     vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
+{
+  clobbers.safe_push (gen_rtx_REG (CCmode, CC_REGNUM));
+  SET_HARD_REG_BIT (clobbered_regs, CC_REGNUM);
+  if (TARGET_FPU)
+    {
+      clobbers.safe_push (gen_rtx_REG (CCmode, FCC_REGNUM));
+      SET_HARD_REG_BIT (clobbered_regs, FCC_REGNUM);
+    }
+  return NULL;
+}
+
 /* Worker function for TARGET_TRAMPOLINE_INIT.
 
    trampoline - how should i do it in separate i+d ? 
Index: config/pdp11/pdp11.h
===================================================================
--- config/pdp11/pdp11.h	(revision 262518)
+++ config/pdp11/pdp11.h	(working copy)
@@ -441,9 +441,8 @@  extern int may_call_alloca;
 
 #define MOVE_MAX 2
 
-/* Nonzero if access to memory by byte is slow and undesirable. -
-*/
-#define SLOW_BYTE_ACCESS 0
+/* Nonzero if access to memory by byte is no faster than by word.  */
+#define SLOW_BYTE_ACCESS 1
 
 /* Do not break .stabs pseudos into continuations.  */
 #define DBX_CONTIN_LENGTH 0
@@ -453,8 +452,7 @@  extern int may_call_alloca;
 
 #define SELECT_CC_MODE(OP,X,Y) pdp11_cc_mode (OP, X, Y)
 
-/* Enable compare elimination pass.
-   FIXME: how can this be enabled for two registers?  */
+/* Enable compare elimination pass.  */
 #undef TARGET_FLAGS_REGNUM
 #define TARGET_FLAGS_REGNUM CC_REGNUM
 
Index: config/pdp11/pdp11.md
===================================================================
--- config/pdp11/pdp11.md	(revision 262518)
+++ config/pdp11/pdp11.md	(working copy)
@@ -50,10 +50,15 @@ 
    ;; End of hard registers
    (FIRST_PSEUDO_REGISTER 18)
    
-   ;; Branch offset limits, as byte offsets from instruction address
+   ;; Branch offset limits, as byte offsets from (pc).  That is NOT
+   ;; the same thing as "instruction address" -- it is for backward
+   ;; branches, but for forward branches it refers to the address
+   ;; following the instruction.  So the max forward distance
+   ;; matches what the processor handbook says, while the max
+   ;; backward branch is 2 less than the book.
    (MIN_BRANCH            -254)
-   (MAX_BRANCH            256)
-   (MIN_SOB               -126)
+   (MAX_BRANCH            254)
+   (MIN_SOB               -124)
    (MAX_SOB               0)])
 
 ;; DF is 64 bit
@@ -142,6 +147,9 @@ 
 ;; length default is 2 bytes each
 (define_attr "length" "" (const_int 2))
 
+;; instruction base cost (not counting operands)
+(define_attr "base_cost" "" (const_int 2))
+
 ;; a user's asm statement
 (define_asm_attributes
   [(set_attr "type" "unknown")
@@ -178,7 +186,7 @@ 
 (define_insn "*rts"
   [(return)]
   ""
-  "rts pc")
+  "rts\tpc")
 
 (define_insn "blockage"
   [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)]
@@ -217,6 +225,7 @@ 
     return \"{cmpd|cmpf}\t%0,%1\";
 }"
   [(set_attr "length" "2,2,4,4")
+   (set_attr "base_cost" "4")
    (set_attr "type" "fp")]) 
 
 ;; Copy floating point processor condition code register to main CPU
@@ -240,14 +249,15 @@ 
    cmp<PDPint:isfx>\t%0,%1"
   [(set_attr "length" "2,2,4,4,4,6")])
 
-;; sob instruction - FIXME: this doesn't do anything, need to use doloop_end.
-
-(define_insn ""
+;; sob instruction
+;;
+;; Do a define_insn_and_split because some alternatives clobber CC.
+;; Some don't, but it isn't all that interesting to cover that case.
+(define_insn_and_split "doloop_end"
   [(set (pc)
 	(if_then_else
-	 (ne (plus:HI (match_operand:HI 0 "register_operand" "+r")
-		      (const_int -1))
-	     (const_int 0))
+	 (ne (match_operand:HI 0 "nonimmediate_operand" "+r,!m")
+	     (const_int 1))
 	 (label_ref (match_operand 1 "" ""))
 	 (pc)))
    (set (match_dup 0)
@@ -254,30 +264,62 @@ 
 	(plus:HI (match_dup 0)
 		 (const_int -1)))]
   "TARGET_40_PLUS"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (pc)
+		   (if_then_else
+		    (ne (match_dup 0) (const_int 1))
+		    (label_ref (match_dup 1))
+		    (pc)))
+	      (set (match_dup 0)
+		   (plus:HI (match_dup 0)
+			 (const_int -1)))
+	      (clobber (reg:CC CC_REGNUM))])]
+  "")
+
+;; Note that there is a memory alternative here.  This is as documented
+;; in gccint, which says that doloop_end, since it has both a jump and
+;; an output operand, "must handle its own reloads".  That translates
+;; to: must accept memory operands as valid though they may be disparaged.
+(define_insn "doloop_end_nocc"
+  [(set (pc)
+	(if_then_else
+	 (ne (match_operand:HI 0 "nonimmediate_operand" "+r,!m")
+	     (const_int 1))
+	 (label_ref (match_operand 1 "" ""))
+	 (pc)))
+   (set (match_dup 0)
+	(plus:HI (match_dup 0)
+	      (const_int -1)))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_40_PLUS && reload_completed"
   "*
 {
+ rtx lb[1];
+
  if (get_attr_length (insn) == 2)
     return \"sob\t%0,%l1\";
 
  /* emulate sob */
- operands[2] = gen_label_rtx ();
+ lb[0] = gen_label_rtx ();
  output_asm_insn (\"dec\t%0\", operands);
- output_asm_insn (\"beq\t%l2\", operands);
+ output_asm_insn (\"beq\t%l0\", lb);
  output_asm_insn (\"jmp\t%l1\", operands);
  
- output_asm_label (operands[2]);
+ output_asm_label (lb[0]);
  fputs (\":\\n\", asm_out_file);
 
  return \"\";
 }"
-  [(set (attr "length") (if_then_else (ior (lt (minus (match_dup 0)
-						       (pc))
-						(const_int MIN_SOB))
-					   (gt (minus (match_dup 0)
-						       (pc))
-						(const_int MAX_SOB)))
-				      (const_int 8)
-				      (const_int 2)))])
+  [(set (attr "length")
+        (if_then_else (eq (symbol_ref ("which_alternative")) (const_int 1))
+                          (const_int 10)
+                          (if_then_else (ior (lt (minus (match_dup 1) (pc))
+					         (const_int MIN_SOB))
+					     (gt (minus (match_dup 1) (pc))
+					         (const_int MAX_SOB)))
+				        (const_int 8)
+				        (const_int 2))))])
 
 ;; These control RTL generation for conditional jump insns
 ;; and match them for register allocation.
@@ -377,11 +419,16 @@ 
 
 ;; Move instructions
 
+;; "length" is defined even though this pattern won't appear at
+;; assembly language output time.  But the length is used by
+;; pdp11_insn_cost, before the post-reload splitter adds the
+;; CC clobber to the insn.
 (define_insn "movdi"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=&r,g")
 	(match_operand:DI 1 "general_operand" "rN,g"))]
   ""
-  "")
+  ""
+  [(set_attr "length" "16,32")])
 
 
 (define_insn "*movdi_nocc"
@@ -396,7 +443,8 @@ 
   [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,g,g")
 	(match_operand:SI 1 "general_operand" "rN,IJ,IJ,g"))]
   ""
-  "")
+  ""
+  [(set_attr "length" "4,6,8,16")])
 
 (define_insn "*movsi_nocc"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,g,g")
@@ -410,7 +458,8 @@ 
   [(set (match_operand:PDPint 0 "nonimmediate_operand" "=rR,rR,Q,Q")
 	(match_operand:PDPint 1 "general_operand" "rRN,Qi,rRN,Qi"))]
   ""
-  "")
+  ""
+  [(set_attr "length" "2,4,4,6")])
 
 ;; This splits all the integer moves: DI and SI modes as well as
 ;; the simple machine operations.
@@ -497,8 +546,8 @@ 
 ;; but then extend and truncate register-to-register are NOP and
 ;; generate no code.
 (define_insn_and_split "movsf"
-  [(set (match_operand:SF 0 "float_nonimm_operand" "=a,fR,a,Q")
-        (match_operand:SF 1 "float_operand" "fRG,a,FQ,a"))]
+  [(set (match_operand:SF 0 "float_nonimm_operand" "=a,fR,a,Q,a")
+        (match_operand:SF 1 "float_operand" "fRG,a,FQ,a,G"))]
   "TARGET_FPU"
   "#"
   "&& reload_completed"
@@ -505,7 +554,8 @@ 
   [(parallel [(set (match_dup 0)
 		   (match_dup 1))
 	      (clobber (reg:CC FCC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,2,4,4,2")])
   
 (define_insn "*movsf<fcc_ccnz>"
   [(set (match_operand:SF 0 "float_nonimm_operand" "=a,fR,a,Q,a")
@@ -598,18 +648,37 @@ 
   [(set_attr "length" "2,4")])
 
 
-;;- zero extension instructions
+;;- zero extension instruction
 
 (define_insn_and_split "zero_extendqihi2"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=rR,Q")
-	(zero_extend:HI (match_operand:QI 1 "general_operand" "0,0")))]
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rD,Q,&r,&r")
+	(zero_extend:HI (match_operand:QI 1 "general_operand" "0,0,rR,Q")))]
   ""
   "#"
   "reload_completed"
   [(parallel [(set (match_dup 0) (zero_extend:HI (match_dup 1)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  "{
+    rtx r;
 
+    if (!REG_P (operands[0]))
+      {
+        /* adjust_address returns the new MEM; start from operands[0].  */
+        r = adjust_address (operands[0], QImode, 1);
+        emit_move_insn (r, const0_rtx);
+        DONE;
+      }
+    else if (!rtx_equal_p (operands[0], operands[1]))
+      {
+        /* Alternatives 2 and 3 */
+        emit_move_insn (operands[0], const0_rtx);
+        r = gen_rtx_REG (QImode, REGNO (operands[0]));
+        emit_insn (gen_iorqi3_nocc (r, r, operands[1]));
+        DONE;
+      }
+  }"
+  [(set_attr "length" "4,4,4,6")])
+
 (define_insn "*zero_extendqihi2<cc_cc>"
   [(parallel [(set (match_operand:HI 0 "nonimmediate_operand" "=rR,Q")
 		   (zero_extend:HI (match_operand:QI 1 "general_operand" "0,0")))
@@ -618,19 +687,6 @@ 
   "bic\t%#0177400,%0"
   [(set_attr "length" "4,6")])
 			 
-(define_expand "zero_extendhisi2"
-  [(set (subreg:HI 
-          (match_dup 0)
-          2)
-        (match_operand:HI 1 "register_operand" "r"))
-   (set (subreg:HI 
-          (match_operand:SI 0 "register_operand" "=r")
-          0)
-        (const_int 0))]
-  ""
-  "/* operands[1] = make_safe_from (operands[1], operands[0]); */")
-
-
 ;;- sign extension instructions
 
 ;; We sometimes end up doing a register to register extend,
@@ -663,7 +719,8 @@ 
    (clobber (reg:CC FCC_REGNUM))]
   "TARGET_FPU && reload_completed"
   "{ldcfd|movof}\t%1,%0"
-  [(set_attr "length" "2,4")])
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "6")])
 
 ;; movb sign extends if destination is a register
 (define_insn_and_split "extendqihi2"
@@ -674,7 +731,8 @@ 
   "reload_completed"
   [(parallel [(set (match_dup 0) (sign_extend:HI (match_dup 1)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4")])
 
 ;; MOVB clears V
 (define_insn "*extendqihi2<cc_cc>"
@@ -693,7 +751,8 @@ 
   "&& reload_completed"
   [(parallel [(set (match_dup 0) (sign_extend:SI (match_dup 1)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "10,6,6")])
 
 (define_insn "*extendhisi2_nocc"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=o,<,r")
@@ -760,7 +819,8 @@ 
   "&& reload_completed"
   [(parallel [(set (match_dup 0) (float:DF (match_dup 1)))
 	      (clobber (reg:CC FCC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "10,6,8")])
 
 (define_insn "*floatsidf2<fcc_cc>"
   [(set (match_operand:DF 0 "register_operand" "=a,a,a")
@@ -784,7 +844,8 @@ 
      else 
        return \"setl\;{ldcld|movif}\t%1,%0\;seti\";
   "
-  [(set_attr "length" "10,6,8")])
+  [(set_attr "length" "10,6,8")
+   (set_attr "base_cost" "12")])
 
 (define_insn_and_split "floathidf2"
   [(set (match_operand:DF 0 "register_operand" "=a,a")
@@ -794,7 +855,8 @@ 
   "&& reload_completed"
   [(parallel [(set (match_dup 0) (float:DF (match_dup 1)))
 	      (clobber (reg:CC FCC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4")])
 
 (define_insn "*floathidf2<fcc_cc>"
   [(set (match_operand:DF 0 "register_operand" "=a,a")
@@ -802,8 +864,9 @@ 
    (clobber (reg:CC FCC_REGNUM))]
   "TARGET_FPU && reload_completed"
   "{ldcid|movif}\t%1,%0"
-  [(set_attr "length" "2,4")])
-	
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "12")])
+
 ;; cut float to int
 
 ;; Note: the hardware treats register destination as
@@ -819,7 +882,8 @@ 
   [(parallel [(set (match_dup 0) (fix:SI (fix:DF (match_dup 1))))
 	      (clobber (reg:CC CC_REGNUM))
 	      (clobber (reg:CC FCC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "10,6,8")])
 
 ;; Note: this clobbers both sets of condition codes!
 (define_insn "*fix_truncdfsi2_nocc"
@@ -841,7 +905,8 @@ 
      else 
        return \"setl\;{stcdl|movfi}\t%1,%0\;seti\";
   "
-  [(set_attr "length" "10,6,8")])
+  [(set_attr "length" "10,6,8")
+   (set_attr "base_cost" "12")])
 
 (define_insn_and_split "fix_truncdfhi2"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rR,Q")
@@ -852,7 +917,8 @@ 
   [(parallel [(set (match_dup 0) (fix:HI (fix:DF (match_dup 1))))
 	      (clobber (reg:CC CC_REGNUM))
 	      (clobber (reg:CC FCC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4")])
 
 ;; Note: this clobbers both sets of condition codes!
 (define_insn "*fix_truncdfhi2_nocc"
@@ -862,7 +928,8 @@ 
    (clobber (reg:CC FCC_REGNUM))]
   "TARGET_FPU && reload_completed"
   "{stcdi|movfi}\t%1,%0"
-  [(set_attr "length" "2,4")])
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "12")])
 
 
 ;;- arithmetic instructions
@@ -878,7 +945,8 @@ 
   [(parallel [(set (match_dup 0)
 		   (plus:DF (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC FCC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4")])
 
 ;; Float add sets V if overflow from add
 (define_insn "*adddf3<fcc_ccnz>"
@@ -888,7 +956,8 @@ 
    (clobber (reg:CC FCC_REGNUM))]
   "TARGET_FPU && reload_completed"
   "{addd|addf}\t%2,%0"
-  [(set_attr "length" "2,4")])
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "6")])
 
 (define_insn_and_split "adddi3"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=&r,r,o,o")
@@ -899,7 +968,8 @@ 
   "reload_completed"
   [(parallel [(set (match_dup 0) (plus:DI (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "20,28,40,48")])
 
 (define_insn "*adddi3_nocc"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=&r,r,o,o")
@@ -939,7 +1009,8 @@ 
 
   return \"\";
 }"
-  [(set_attr "length" "20,28,40,48")])
+  [(set_attr "length" "20,28,40,48")
+   (set_attr "base_cost" "0")])
 
 ;; Note that the register operand is not marked earlyclobber.
 ;; The reason is that SI values go in register pairs, so they
@@ -959,7 +1030,8 @@ 
   "reload_completed"
   [(parallel [(set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "6,10,12,16")])
 
 (define_insn "*addsi3_nocc"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=&r,r,o,o")
@@ -986,7 +1058,8 @@ 
 
   return \"\";
 }"
-  [(set_attr "length" "6,10,12,16")])
+  [(set_attr "length" "6,10,12,16")
+   (set_attr "base_cost" "0")])
 
 (define_insn_and_split "addhi3"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rR,rR,Q,Q")
@@ -998,7 +1071,8 @@ 
   [(parallel [(set (match_dup 0)
 		   (plus:HI (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4,4,6")])
 
 ;; Add sets V if overflow from the add
 (define_insn "*addhi3<cc_ccnz>"
@@ -1037,7 +1111,8 @@ 
   [(parallel [(set (match_dup 0)
 		   (minus:DF (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC FCC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4")])
 
 (define_insn "*subdf3<fcc_ccnz>"
   [(set (match_operand:DF 0 "register_operand" "=a,a")
@@ -1046,7 +1121,8 @@ 
    (clobber (reg:CC FCC_REGNUM))]
   "TARGET_FPU && reload_completed"
   "{subd|subf}\t%2,%0"
-  [(set_attr "length" "2,4")])
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "6")])
 
 (define_insn_and_split "subdi3"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=&r,r,o,o")
@@ -1057,7 +1133,8 @@ 
   "reload_completed"
   [(parallel [(set (match_dup 0) (minus:DI (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "20,28,40,48")])
 
 (define_insn "*subdi3_nocc"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=&r,r,o,o")
@@ -1097,7 +1174,8 @@ 
 
   return \"\";
 }"
-  [(set_attr "length" "20,28,40,48")])
+  [(set_attr "length" "20,28,40,48")
+   (set_attr "base_cost" "0")])
 
 (define_insn_and_split "subsi3"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=&r,r,o,o")
@@ -1108,7 +1186,8 @@ 
   "reload_completed"
   [(parallel [(set (match_dup 0) (minus:SI (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "6,10,12,16")])
 
 (define_insn "*subsi3_nocc"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=&r,r,o,o")
@@ -1135,7 +1214,8 @@ 
 
   return \"\";
 }"
-  [(set_attr "length" "6,10,12,16")])
+  [(set_attr "length" "6,10,12,16")
+   (set_attr "base_cost" "0")])
 
 (define_insn_and_split "subhi3"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rR,rR,Q,Q")
@@ -1147,7 +1227,8 @@ 
   [(parallel [(set (match_dup 0)
 		   (minus:HI (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4,4,6")])
 
 ;; Note: the manual says that (minus m (const_int n)) is converted
 ;; to (plus m (const_int -n)) but that does not appear to be
@@ -1200,7 +1281,8 @@ 
     operands[1] = GEN_INT (~INTVAL (op1));
   else
     operands[1] = expand_unop (<MODE>mode, one_cmpl_optab, op1, 0, 1);
-}")
+}"
+  [(set_attr "length" "2,4,4,6")])
 
 (define_insn_and_split "*bic<mode>"
   [(set (match_operand:PDPint 0 "nonimmediate_operand" "=rR,rR,Q,Q")
@@ -1236,7 +1318,8 @@ 
   [(parallel [(set (match_dup 0)
 		   (ior:PDPint (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4,4,6")])
 
 (define_insn "ior<mode>3<cc_cc>"
   [(set (match_operand:PDPint 0 "nonimmediate_operand" "=rR,rR,Q,Q")
@@ -1258,7 +1341,8 @@ 
   [(parallel [(set (match_dup 0)
 		   (xor:HI (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4")])
 
 (define_insn "*xorhi3<cc_cc>"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rR,Q")
@@ -1280,7 +1364,8 @@ 
   [(parallel [(set (match_dup 0)
 		   (not:PDPint (match_dup 1)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4")])
 
 (define_insn "*one_cmpl<mode>2<cc_cc>"
   [(set (match_operand:PDPint 0 "nonimmediate_operand" "=rR,Q")
@@ -1297,48 +1382,129 @@ 
 ;; used to reduce the amount of very similar code.
 ;;
 ;; First the insns used for small constant shifts.
-(define_insn "<code><mode>_sc"
+(define_insn_and_split "<code><mode>_sc"
   [(set (match_operand:QHSint 0 "nonimmediate_operand" "=rD,Q")
 	(SHF:QHSint (match_operand:QHSint 1 "general_operand" "0,0")
 	            (match_operand:HI 2 "expand_shift_operand" "O,O")))]
   ""
-  "* return pdp11_assemble_shift (operands, <QHSint:mname>, <CODE>);"
+  "#"
+  "reload_completed"
+  [(parallel [(set (match_dup 0) (SHF:QHSint (match_dup 1) (match_dup 2)))
+	      (clobber (reg:CC CC_REGNUM))])]
+  ""
   [(set (attr "length")
 	(symbol_ref "pdp11_shift_length (operands, <QHSint:mname>, 
-                                         <CODE>, which_alternative == 0)"))])
+                                         <CODE>, which_alternative == 0)"))
+   (set_attr "base_cost" "0")])
 
+(define_insn "<code><mode>_sc<cc_ccnz>"
+  [(set (match_operand:PDPint 0 "nonimmediate_operand" "=rD,Q")
+	(SHF:PDPint (match_operand:PDPint 1 "general_operand" "0,0")
+	     (match_operand:HI 2 "expand_shift_operand" "O,O")))
+   (clobber (reg:CC CC_REGNUM))]
+  "reload_completed"
+  "* return pdp11_assemble_shift (operands, <PDPint:mname>, <CODE>);"
+  [(set (attr "length")
+	(symbol_ref "pdp11_shift_length (operands, <PDPint:mname>, 
+                                         <CODE>, which_alternative == 0)"))
+   (set_attr "base_cost" "0")])
+
+;; The SImode variant exists only in the CC-clobbering (_nocc) form.
+(define_insn "<code>si_sc_nocc"
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=rD,Q")
+	(SHF:SI (match_operand:SI 1 "general_operand" "0,0")
+	     (match_operand:HI 2 "expand_shift_operand" "O,O")))
+   (clobber (reg:CC CC_REGNUM))]
+  "reload_completed"
+  "* return pdp11_assemble_shift (operands, SImode, <CODE>);"
+  [(set (attr "length")
+	(symbol_ref "pdp11_shift_length (operands, SImode, 
+                                         <CODE>, which_alternative == 0)"))
+   (set_attr "base_cost" "0")])
+
 ;; Next, shifts that are done as a loop on base (11/10 class) machines.
 ;; This applies to shift counts too large to unroll, or variable shift
 ;; counts.  The check for count <= 0 is done before we get here.
-(define_insn "<code><mode>_base"
+(define_insn_and_split "<code><mode>_base"
   [(set (match_operand:QHSint 0 "nonimmediate_operand" "=rD,Q")
 	(SHF:QHSint (match_operand:QHSint 1 "general_operand" "0,0")
 	     (match_operand:HI 2 "register_operand" "r,r")))
    (clobber (match_dup 2))]
   ""
+  "#"
+  "reload_completed"
+  [(parallel [(set (match_dup 0) (SHF:QHSint (match_dup 1) (match_dup 2)))
+	      (clobber (match_dup 2))
+	      (clobber (reg:CC CC_REGNUM))])]
+  ""
+  [(set (attr "length")
+	(symbol_ref "pdp11_shift_length (operands, <QHSint:mname>, 
+                                         <CODE>, which_alternative == 0)"))
+   (set_attr "base_cost" "0")])
+
+(define_insn "<code><mode>_base_nocc"
+  [(set (match_operand:QHSint 0 "nonimmediate_operand" "=rD,Q")
+	(SHF:QHSint (match_operand:QHSint 1 "general_operand" "0,0")
+	     (match_operand:HI 2 "register_operand" "r,r")))
+   (clobber (match_dup 2))
+   (clobber (reg:CC CC_REGNUM))]
+  "reload_completed"
   "* return pdp11_assemble_shift (operands, <QHSint:mname>, <CODE>);"
   [(set (attr "length")
 	(symbol_ref "pdp11_shift_length (operands, <QHSint:mname>, 
-                                         <CODE>, which_alternative == 0)"))])
+                                         <CODE>, which_alternative == 0)"))
+   (set_attr "base_cost" "0")])
 
 ;; Next the insns that use the extended instructions ash and ashc.
 ;; Note that these are just left shifts, and HI/SI only.  (Right shifts
 ;; are done by shifting by a negative amount.)
-(define_insn "aslhi_op"
+(define_insn_and_split "aslhi_op"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r")
 	(ashift:HI (match_operand:HI 1 "general_operand" "0,0")
-	               (match_operand:HI 2 "general_operand" "rR,Q")))]
+	               (match_operand:HI 2 "general_operand" "rR,Qi")))]
   "TARGET_40_PLUS"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (ashift:HI (match_dup 1) (match_dup 2)))
+	      (clobber (reg:CC CC_REGNUM))])]
+  ""
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "8")])
+
+(define_insn "aslhi_op<cc_ccnz>"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r")
+	(ashift:HI (match_operand:HI 1 "general_operand" "0,0")
+		(match_operand:HI 2 "general_operand" "rR,Qi")))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_40_PLUS && reload_completed"
   "ash\t%2,%0"
-  [(set_attr "length" "2,4")])
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "8")])
 
-(define_insn "aslsi_op"
+(define_insn_and_split "aslsi_op"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r")
 	(ashift:SI (match_operand:SI 1 "general_operand" "0,0")
-	           (match_operand:HI 2 "general_operand" "rR,Q")))]
+	           (match_operand:HI 2 "general_operand" "rR,Qi")))]
   "TARGET_40_PLUS"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (ashift:SI (match_dup 1) (match_dup 2)))
+	      (clobber (reg:CC CC_REGNUM))])]
+  ""
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "8")])
+
+(define_insn "aslsi_op_<cc_ccnz>"
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r")
+	(ashift:SI (match_operand:SI 1 "general_operand" "0,0")
+		(match_operand:HI 2 "general_operand" "rR,Qi")))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_40_PLUS && reload_completed"
   "ashc\t%2,%0"
-  [(set_attr "length" "2,4")])
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "8")])
 
 ;; Now the expanders that produce the insns defined above. 
 (define_expand "ashl<mode>3"
@@ -1437,7 +1603,8 @@ 
   "&& reload_completed"
   [(parallel [(set (match_dup 0) (abs:DF (match_dup 1)))
 	      (clobber (reg:CC FCC_REGNUM))])]
-   "")
+   ""
+  [(set_attr "length" "2,4")])
 
 (define_insn "absdf2<fcc_cc>"
   [(set (match_operand:DF 0 "nonimmediate_operand" "=fR,Q")
@@ -1457,7 +1624,8 @@ 
   "&& reload_completed"
   [(parallel [(set (match_dup 0) (neg:DF (match_dup 1)))
 	      (clobber (reg:CC FCC_REGNUM))])]
-   "")
+   ""
+  [(set_attr "length" "2,4")])
 
 (define_insn "negdf2<fcc_cc>"
   [(set (match_operand:DF 0 "nonimmediate_operand" "=fR,Q")
@@ -1475,7 +1643,8 @@ 
   "reload_completed"
   [(parallel [(set (match_dup 0) (neg:DI (match_dup 1)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "18,34")])
   
 ;; TODO: this can be neg/adc/neg/adc... I believe.  Check.  Saves one word.
 (define_insn "negdi2_nocc"
@@ -1483,24 +1652,25 @@ 
 	(neg:DI (match_operand:DI 1 "general_operand" "0,0")))
    (clobber (reg:CC CC_REGNUM))]
   "reload_completed"
-{
-  rtx exops[4][2];
+  {
+    rtx exops[4][2];
+    
+    pdp11_expand_operands (operands, exops, 1, NULL, either);
   
-  pdp11_expand_operands (operands, exops, 1, NULL, either);
+    output_asm_insn (\"com\t%0\", exops[3]);
+    output_asm_insn (\"com\t%0\", exops[2]);
+    output_asm_insn (\"com\t%0\", exops[1]);
+    output_asm_insn (\"com\t%0\", exops[0]);
+    output_asm_insn (\"add\t%#1,%0\", exops[3]);
+    output_asm_insn (\"adc\t%0\", exops[2]);
+    output_asm_insn (\"adc\t%0\", exops[1]);
+    output_asm_insn (\"adc\t%0\", exops[0]);
+  
+    return \"\";
+  }
+  [(set_attr "length" "18,34")
+   (set_attr "base_cost" "0")])
 
-  output_asm_insn (\"com\t%0\", exops[3]);
-  output_asm_insn (\"com\t%0\", exops[2]);
-  output_asm_insn (\"com\t%0\", exops[1]);
-  output_asm_insn (\"com\t%0\", exops[0]);
-  output_asm_insn (\"add\t%#1,%0\", exops[3]);
-  output_asm_insn (\"adc\t%0\", exops[2]);
-  output_asm_insn (\"adc\t%0\", exops[1]);
-  output_asm_insn (\"adc\t%0\", exops[0]);
-
-  return \"\";
-}
-[(set_attr "length" "18,34")])
-
 (define_insn_and_split "negsi2"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=r,o")
 	(neg:SI (match_operand:SI 1 "general_operand" "0,0")))]
@@ -1509,7 +1679,8 @@ 
   "reload_completed"
   [(parallel [(set (match_dup 0) (neg:SI (match_dup 1)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "10,18")])
   
 ;; TODO: this can be neg/adc/neg/adc... I believe.  Check.  Saves one word.
 (define_insn "negsi2_nocc"
@@ -1517,20 +1688,21 @@ 
 	(neg:SI (match_operand:SI 1 "general_operand" "0,0")))
    (clobber (reg:CC CC_REGNUM))]
   "reload_completed"
-{
-  rtx exops[2][2];
+  {
+    rtx exops[2][2];
+    
+    pdp11_expand_operands (operands, exops, 1, NULL, either);
   
-  pdp11_expand_operands (operands, exops, 1, NULL, either);
+    output_asm_insn (\"com\t%0\", exops[1]);
+    output_asm_insn (\"com\t%0\", exops[0]);
+    output_asm_insn (\"add\t%#1,%0\", exops[1]);
+    output_asm_insn (\"adc\t%0\", exops[0]);
+  
+    return \"\";
+  }
+  [(set_attr "length" "10,18")
+   (set_attr "base_cost" "0")])
 
-  output_asm_insn (\"com\t%0\", exops[1]);
-  output_asm_insn (\"com\t%0\", exops[0]);
-  output_asm_insn (\"add\t%#1,%0\", exops[1]);
-  output_asm_insn (\"adc\t%0\", exops[0]);
-
-  return \"\";
-}
-[(set_attr "length" "10,18")])
-
 (define_insn_and_split "neg<mode>2"
   [(set (match_operand:PDPint 0 "nonimmediate_operand" "=rR,Q")
 	(neg:PDPint (match_operand:PDPint 1 "general_operand" "0,0")))]
@@ -1539,7 +1711,8 @@ 
   "reload_completed"
   [(parallel [(set (match_dup 0) (neg:PDPint (match_dup 1)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4")])
   
 (define_insn "neg<mode>2<cc_ccnz>"
   [(set (match_operand:PDPint 0 "nonimmediate_operand" "=rR,Q")
@@ -1586,7 +1759,7 @@ 
 (define_insn "indirect_jump"
   [(set (pc) (match_operand:HI 0 "general_operand" "r"))]
   ""
-  "jmp @%0"
+  "jmp\t@%0"
   [(set_attr "length" "2")])
 
 ;;- jump to subroutine
@@ -1596,7 +1769,7 @@ 
 	 (match_operand:HI 1 "general_operand" "g,g"))]
   ;;- Don't use operand 1 for most machines.
   ""
-  "jsr pc,%0"
+  "jsr\tpc,%0"
   [(set_attr "length" "2,4")])
 
 ;;- jump to subroutine
@@ -1606,7 +1779,7 @@ 
 	      (match_operand:HI 2 "general_operand" "g,g")))]
   ;;- Don't use operand 2 for most machines.
   ""
-  "jsr pc,%1"
+  "jsr\tpc,%1"
   [(set_attr "length" "2,4")])
 
 (define_expand "untyped_call"
@@ -1653,7 +1826,8 @@ 
   "&& reload_completed"
   [(parallel [(set (match_dup 0) (mult:DF (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC FCC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4")])
 
 (define_insn "muldf3<fcc_ccnz>"
   [(set (match_operand:DF 0 "register_operand" "=a,a")
@@ -1662,7 +1836,8 @@ 
    (clobber (reg:CC FCC_REGNUM))]
   "TARGET_FPU && reload_completed"
   "{muld|mulf}\t%2,%0"
-  [(set_attr "length" "2,4")])
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "20")])
 
 ;; 16 bit result multiply.  This uses odd numbered registers.
 
@@ -1675,7 +1850,8 @@ 
   "&& reload_completed"
   [(parallel [(set (match_dup 0) (mult:HI (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4")])
 
 (define_insn "mulhi3<cc_cc>"
   [(set (match_operand:HI 0 "register_operand" "=d,d")
@@ -1684,7 +1860,8 @@ 
    (clobber (reg:CC CC_REGNUM))]
   "TARGET_40_PLUS && reload_completed"
   "mul\t%2,%0"
-  [(set_attr "length" "2,4")])
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "20")])
 
 ;; 32 bit result from 16 bit operands
 (define_insn_and_split "mulhisi3"
@@ -1698,7 +1875,8 @@ 
 		   (mult:SI (sign_extend:SI (match_dup 1))
 			 (sign_extend:SI (match_dup 2))))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4")])
 
 (define_insn "mulhisi3<cc_cc>"
   [(set (match_operand:SI 0 "register_operand" "=r,r")
@@ -1707,7 +1885,8 @@ 
    (clobber (reg:CC CC_REGNUM))]
   "TARGET_40_PLUS && reload_completed"
   "mul\t%2,%0"
-  [(set_attr "length" "2,4")])
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "20")])
 
 ;;- divide
 (define_insn_and_split "divdf3"
@@ -1719,7 +1898,8 @@ 
   "&& reload_completed"
   [(parallel [(set (match_dup 0) (div:DF (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC FCC_REGNUM))])]
-   "")
+   ""
+  [(set_attr "length" "2,4")])
   
 (define_insn "divdf3<fcc_ccnz>"
   [(set (match_operand:DF 0 "register_operand" "=a,a")
@@ -1728,7 +1908,8 @@ 
    (clobber (reg:CC FCC_REGNUM))]
   "TARGET_FPU && reload_completed"
   "{divd|divf}\t%2,%0"
-  [(set_attr "length" "2,4")])
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "20")])
 
 (define_expand "divmodhi4"
   [(parallel
@@ -1758,7 +1939,8 @@ 
 	      (set (subreg:HI (match_dup 1) 2)
 		   (mod:HI (match_dup 1) (match_dup 2)))
 	      (clobber (reg:CC CC_REGNUM))])]
-  "")
+  ""
+  [(set_attr "length" "2,4")])
 
 ;; Note that there is no corresponding CC setter pattern.
 ;; The reason is that it won't be generated, because
@@ -1774,4 +1956,114 @@ 
    (clobber (reg:CC CC_REGNUM))]
   "TARGET_40_PLUS"
    "div\t%2,%0"
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "40")])
+
+;; Byte swap
+(define_insn_and_split "bswaphi2"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rR,Q")
+	(bswap:HI (match_operand:HI 1 "general_operand" "0,0")))]
+  ""
+  "#"
+  "reload_completed"
+  [(parallel [(set (match_dup 0) (bswap:HI (match_dup 1)))
+	      (clobber (reg:CC CC_REGNUM))])]
+  ""
   [(set_attr "length" "2,4")])
+
+(define_insn "bswaphi2<cc_ccnz>"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rR,Q")
+	(bswap:HI (match_operand:HI 1 "general_operand" "0,0")))
+   (clobber (reg:CC CC_REGNUM))]
+  ""
+  "swab\t%0"
+  [(set_attr "length" "2,4")])
+
+(define_insn_and_split "bswapsi2"
+  [(set (match_operand:SI 0 "register_operand" "=&r")
+	(bswap:SI (match_operand:SI 1 "general_operand" "g")))]
+  ""
+  "#"
+  "reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (bswap:SI (match_dup 1)))
+	      (clobber (reg:CC CC_REGNUM))])]
+  ""
+  [(set_attr "length" "10")])
+
+(define_insn "bswapsi2_nocc"
+  [(set (match_operand:SI 0 "register_operand" "=&r,&r,&r")
+	(bswap:SI (match_operand:SI 1 "general_operand" "r,D,Q")))
+   (clobber (reg:CC CC_REGNUM))]
+  ""
+  {
+    rtx exops[2][2];
+    rtx t;
+    /* exops[i][0] is word i of operand 0 (dest), exops[i][1] word i of operand 1 (src).  */
+    pdp11_expand_operands (operands, exops, 2, NULL, either);
+    /* Exchange the destination words so each source word lands in the other half.  */
+    t = exops[0][0];
+    exops[0][0] = exops[1][0];
+    exops[1][0] = t;
+    /* PDP-11 mov is "mov src,dst": copy source (%1) into destination (%0), then swab each word.  */
+    output_asm_insn ("mov\t%1,%0", exops[0]);
+    output_asm_insn ("mov\t%1,%0", exops[1]);
+    output_asm_insn ("swab\t%0", exops[0]);
+    output_asm_insn ("swab\t%0", exops[1]);
+    return "";
+  }
+  [(set_attr "length" "8,10,12")])
+
+(define_expand "rotrhi3"
+  [(match_operand:HI 0 "register_operand" "")
+   (match_operand:HI 1 "register_operand" "")
+   (match_operand:HI 2 "general_operand" "")]
+  "TARGET_40_PLUS"
+  "
+{  /* Rotate right is expanded as a rotate left by the negated count.  */
+  operands[2] = negate_rtx (HImode, operands[2]);  /* count -> -count */
+  emit_insn (gen_rotlhi3 (operands[0], operands[1], operands[2]));  /* reuse the rotlhi3 pattern */
+  DONE;
+}")
+
+(define_insn_and_split "rotlhi3"
+  [(set (match_operand:HI 0 "register_operand" "=d,d")
+	(rotate:HI (match_operand:HI 1 "register_operand" "0,0")
+	           (match_operand:HI 2 "general_operand" "rR,Qi")))]
+  "TARGET_40_PLUS"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (rotate:HI (match_dup 1) (match_dup 2)))
+	      (clobber (reg:CC CC_REGNUM))])]
+  ""
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "8")])
+
+(define_insn "rotlhi3<cc_ccnz>"
+  [(set (match_operand:HI 0 "register_operand" "=d,d")
+	(rotate:HI (match_operand:HI 1 "register_operand" "0,0")
+		   (match_operand:HI 2 "general_operand" "rR,Qi")))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_40_PLUS && reload_completed"
+  "ashc\t%2,%0"
+  [(set_attr "length" "2,4")
+   (set_attr "base_cost" "8")])
+
+
+  
+;; Some peephole optimizations
+
+;; A move followed by a conditional branch on the moved value is normally
+;; handled by compare elimination, but an earlier pass sometimes rewrites
+;; the compare operand to be the move's input, and then the compare is
+;; not eliminated.  Eliminate it here instead.
+(define_peephole2
+  [(parallel [(set (match_operand:PDPint 0 "nonimmediate_operand" "")
+		   (match_operand:PDPint 1 "general_operand" ""))
+	      (clobber (reg:CC CC_REGNUM))])
+   (set (reg:CC CC_REGNUM) (compare:CC (match_dup 1) (const_int 0)))]
+  ""
+  [(parallel [(set (reg:CC CC_REGNUM) (compare:CC (match_dup 1) (const_int 0)))
+	      (set (match_dup 0) (match_dup 1))])]
+  "")