
[nios2,5/7] Support R2 CDX load/store multiple instructions

Message ID 55A59841.4020202@codesourcery.com
State New

Commit Message

Sandra Loosemore July 14, 2015, 11:16 p.m. UTC
This installment of the Nios II R2 patch series adds support for the
new CDX load/store multiple instructions (ldwm, stwm, pop.n, push.n).

The implementation approach we used here is similar to that in the ARM
backend, with all the insn patterns and peephole optimizers generated
by a Standard ML program.  These instructions have quite complicated
restrictions on register numbering and ordering, which are handled by
pop_operation_p and ldstwm_operation_p in nios2.c.
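
For reference, here is a minimal standalone sketch (in C, not code from
the patch) of the register-set restriction those predicates enforce: a
single ldwm/stwm may only name registers drawn entirely from r2-r13, or
entirely from r14-r23 plus fp (r28) and ra (r31).  The masks mirror
CDX_LDSTWM_VALID_REGS_0/1 in nios2.c; the helper name is hypothetical,
and the real nios2_ldstwm_regset_p applies the same rule one register
at a time.

  #include <stdbool.h>

  /* Hypothetical illustration only: true if every register in REGMASK
     (bit N = register N) can be encoded in one CDX ldwm/stwm.  */
  static bool
  cdx_regmask_encodable_p (unsigned int regmask)
  {
    const unsigned int set0 = 0x00003ffc;  /* r2..r13 */
    const unsigned int set1 = 0x90ffc000;  /* r14..r23, fp (r28), ra (r31) */
    return regmask != 0
           && ((regmask & ~set0) == 0 || (regmask & ~set1) == 0);
  }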

While testing this functionality we ran into the regrename bug
addressed by this patch:
https://gcc.gnu.org/ml/gcc-patches/2015-06/msg01214.html
I have adapted a couple of the test cases that were failing with
assembler errors into regression tests for that bug and included them
in the committed patch.

Our experiments with expanding "load_multiple" and "store_multiple"
patterns into these instructions were not promising.  It might be
worth revisiting that later, but presently the only things that
generate them are the peephole optimizers and the prologue/epilogue
changes coming along in part 6 of the patch series.
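
As a rough illustration (hypothetical source, not from the patch or its
testsuite), the peephole optimizers look for runs of adjacent
word-sized loads or stores off a common base register, for example:

  /* With -O2 -march=r2 -mcdx, the four adjacent 32-bit loads below are
     the kind of sequence the new peephole2 patterns hand to
     gen_ldstwm_peep, which merges them into a single ldwm when the
     register/offset ordering constraints are met, or may instead
     rewrite some of them as 2-byte ldw.n forms using a scratch base
     register.  */
  struct quad { int a, b, c, d; };

  int
  sum4 (const struct quad *p)
  {
    return p->a + p->b + p->c + p->d;
  }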

Committed as r225798.

-Sandra

Patch

Index: gcc/config/nios2/predicates.md
===================================================================
--- gcc/config/nios2/predicates.md	(revision 225796)
+++ gcc/config/nios2/predicates.md	(working copy)
@@ -94,6 +94,30 @@ 
                                          false));
 })
 
+(define_special_predicate "pop_operation"
+  (match_code "parallel")
+{
+  return pop_operation_p (op);
+})
+
+(define_special_predicate "ldwm_operation"
+  (match_code "parallel")
+{
+  return ldstwm_operation_p (op, /*load_p=*/true);
+})
+
+(define_special_predicate "stwm_operation"
+  (match_code "parallel")
+{
+  return ldstwm_operation_p (op, /*load_p=*/false);
+})
+
+(define_predicate "nios2_hard_register_operand"
+  (match_code "reg")
+{
+  return GP_REG_P (REGNO (op));
+})
+
 (define_predicate "stack_memory_operand"
   (match_code "mem")
 {
Index: gcc/config/nios2/nios2-protos.h
===================================================================
--- gcc/config/nios2/nios2-protos.h	(revision 225796)
+++ gcc/config/nios2/nios2-protos.h	(working copy)
@@ -52,6 +52,10 @@  extern bool nios2_unspec_reloc_p (rtx);
 extern int nios2_label_align (rtx);
 extern bool nios2_cdx_narrow_form_p (rtx_insn *);
 
+extern bool pop_operation_p (rtx);
+extern bool ldstwm_operation_p (rtx, bool);
+extern bool gen_ldstwm_peep (bool, int, rtx, rtx *);
+
 extern void nios2_adjust_reg_alloc_order (void);
 
 #ifdef TREE_CODE
Index: gcc/config/nios2/nios2.c
===================================================================
--- gcc/config/nios2/nios2.c	(revision 225796)
+++ gcc/config/nios2/nios2.c	(working copy)
@@ -71,6 +71,8 @@  static void nios2_load_pic_register (voi
 static void nios2_register_custom_code (unsigned int, enum nios2_ccs_code, int);
 static const char *nios2_unspec_reloc_name (int);
 static void nios2_register_builtin_fndecl (unsigned, tree);
+static rtx nios2_ldst_parallel (bool, bool, bool, rtx, int,
+				unsigned HOST_WIDE_INT, bool);
 
 /* Threshold for data being put into the small data/bss area, instead
    of the normal data area (references to the small data/bss area take
@@ -456,6 +458,25 @@  restore_reg (int regno, unsigned offset)
   RTX_FRAME_RELATED_P (insn) = 1;
 }
 
+/* This routine tests for the base register update SET in load/store
+   multiple RTL insns, used in pop_operation_p and ldstwm_operation_p.  */
+static bool
+base_reg_adjustment_p (rtx set, rtx *base_reg, rtx *offset)
+{
+  if (GET_CODE (set) == SET
+      && REG_P (SET_DEST (set))
+      && GET_CODE (SET_SRC (set)) == PLUS
+      && REG_P (XEXP (SET_SRC (set), 0))
+      && rtx_equal_p (SET_DEST (set), XEXP (SET_SRC (set), 0))
+      && CONST_INT_P (XEXP (SET_SRC (set), 1)))
+    {
+      *base_reg = XEXP (SET_SRC (set), 0);
+      *offset = XEXP (SET_SRC (set), 1);
+      return true;
+    }
+  return false;
+}
+
 /* Temp regno used inside prologue/epilogue.  */
 #define TEMP_REG_NUM 8
 
@@ -4030,6 +4051,432 @@  nios2_cdx_narrow_form_p (rtx_insn *insn)
   return false;
 }
 
+/* Main function to implement the pop_operation predicate that
+   checks pop.n insn pattern integrity.  The CDX pop.n patterns mostly
+   hardcode the restored registers, so the main checking is for the
+   SP offsets.  */
+bool
+pop_operation_p (rtx op)
+{
+  int i;
+  HOST_WIDE_INT last_offset = -1, len = XVECLEN (op, 0);
+  rtx base_reg, offset;
+
+  if (len < 3 /* At least has a return, SP-update, and RA restore.  */
+      || GET_CODE (XVECEXP (op, 0, 0)) != RETURN
+      || !base_reg_adjustment_p (XVECEXP (op, 0, 1), &base_reg, &offset)
+      || !rtx_equal_p (base_reg, stack_pointer_rtx)
+      || !CONST_INT_P (offset)
+      || (INTVAL (offset) & 3) != 0)
+    return false;
+
+  for (i = len - 1; i > 1; i--)
+    {
+      rtx set = XVECEXP (op, 0, i);
+      rtx curr_base_reg, curr_offset;
+
+      if (GET_CODE (set) != SET || !MEM_P (SET_SRC (set))
+	  || !split_mem_address (XEXP (SET_SRC (set), 0),
+				 &curr_base_reg, &curr_offset)
+	  || !rtx_equal_p (base_reg, curr_base_reg)
+	  || !CONST_INT_P (curr_offset))
+	return false;
+      if (i == len - 1)
+	{
+	  last_offset = INTVAL (curr_offset);
+	  if ((last_offset & 3) != 0 || last_offset > 60)
+	    return false;
+	}
+      else
+	{
+	  last_offset += 4;
+	  if (INTVAL (curr_offset) != last_offset)
+	    return false;
+	}
+    }
+  if (last_offset < 0 || last_offset + 4 != INTVAL (offset))
+    return false;
+
+  return true;
+}
+
+
+/* Masks of registers that are valid for CDX ldwm/stwm instructions.
+   The instruction can encode subsets drawn from either R2-R13 or
+   R14-R23 + FP + RA.  */
+#define CDX_LDSTWM_VALID_REGS_0 0x00003ffc
+#define CDX_LDSTWM_VALID_REGS_1 0x90ffc000
+
+static bool
+nios2_ldstwm_regset_p (unsigned int regno, unsigned int *regset)
+{
+  if (*regset == 0)
+    {
+      if (CDX_LDSTWM_VALID_REGS_0 & (1 << regno))
+	*regset = CDX_LDSTWM_VALID_REGS_0;
+      else if (CDX_LDSTWM_VALID_REGS_1 & (1 << regno))
+	*regset = CDX_LDSTWM_VALID_REGS_1;
+      else
+	return false;
+      return true;
+    }
+  else
+    return (*regset & (1 << regno)) != 0;
+}
+
+/* Main function to implement ldwm_operation/stwm_operation
+   predicates that check ldwm/stwm insn pattern integrity.  */
+bool
+ldstwm_operation_p (rtx op, bool load_p)
+{
+  int start, i, end = XVECLEN (op, 0) - 1, last_regno = -1;
+  unsigned int regset = 0;
+  rtx base_reg, offset;  
+  rtx first_elt = XVECEXP (op, 0, 0);
+  bool inc_p = true;
+  bool wb_p = base_reg_adjustment_p (first_elt, &base_reg, &offset);
+  if (GET_CODE (XVECEXP (op, 0, end)) == RETURN)
+    end--;
+  start = wb_p ? 1 : 0;
+  for (i = start; i <= end; i++)
+    {
+      int regno;
+      rtx reg, mem, elt = XVECEXP (op, 0, i);
+      /* Return early if not a SET at all.  */
+      if (GET_CODE (elt) != SET)
+	return false;
+      reg = load_p ? SET_DEST (elt) : SET_SRC (elt);
+      mem = load_p ? SET_SRC (elt) : SET_DEST (elt);
+      if (!REG_P (reg) || !MEM_P (mem))
+	return false;
+      regno = REGNO (reg);
+      if (!nios2_ldstwm_regset_p (regno, &regset))
+	return false;
+      /* If no writeback to determine direction, use offset of first MEM.  */
+      if (wb_p)
+	inc_p = INTVAL (offset) > 0;
+      else if (i == start)
+	{
+	  rtx first_base, first_offset;
+	  if (!split_mem_address (XEXP (mem, 0),
+				  &first_base, &first_offset))
+	    return false;
+	  base_reg = first_base;
+	  inc_p = INTVAL (first_offset) >= 0;
+	}
+      /* Ensure that the base register is not loaded into.  */
+      if (load_p && regno == (int) REGNO (base_reg))
+	return false;
+      /* Check for register order inc/dec integrity.  */
+      if (last_regno >= 0)
+	{
+	  if (inc_p && last_regno >= regno)
+	    return false;
+	  if (!inc_p && last_regno <= regno)
+	    return false;
+	}
+      last_regno = regno;
+    }
+  return true;
+}
+
+/* Helper for nios2_ldst_parallel, for generating a parallel vector
+   SET element.  */
+static rtx
+gen_ldst (bool load_p, int regno, rtx base_mem, int offset)
+{
+  rtx reg = gen_rtx_REG (SImode, regno);
+  rtx mem = adjust_address_nv (base_mem, SImode, offset);
+  return gen_rtx_SET (load_p ? reg : mem,
+		      load_p ? mem : reg);
+}
+
+/* A general routine for creating the body RTL pattern of
+   ldwm/stwm/push.n/pop.n insns.
+   LOAD_P: true/false for load/store direction.
+   REG_INC_P: whether registers are incrementing/decrementing in the
+   *RTL vector* (not necessarily the order defined in the ISA specification).
+   OFFSET_INC_P: Same as REG_INC_P, but for the memory offset order.
+   BASE_MEM: starting MEM.
+   BASE_UPDATE: amount to update base register; zero means no writeback.
+   REGMASK: register mask to load/store.
+   RET_P: true if we should tag a (return) element at the end.
+
+   Note that this routine does not do any checking. It's the job of the
+   caller to do the right thing, and the insn patterns to do the
+   safe-guarding.  */
+static rtx
+nios2_ldst_parallel (bool load_p, bool reg_inc_p, bool offset_inc_p,
+		     rtx base_mem, int base_update,
+		     unsigned HOST_WIDE_INT regmask, bool ret_p)
+{
+  rtvec p;
+  int regno, b = 0, i = 0, n = 0, len = popcount_hwi (regmask);
+  if (ret_p) len++, i++, b++;
+  if (base_update != 0) len++, i++;
+  p = rtvec_alloc (len);
+  for (regno = (reg_inc_p ? 0 : 31);
+       regno != (reg_inc_p ? 32 : -1);
+       regno += (reg_inc_p ? 1 : -1))
+    if ((regmask & (1 << regno)) != 0)
+      {
+	int offset = (offset_inc_p ? 4 : -4) * n++;
+	RTVEC_ELT (p, i++) = gen_ldst (load_p, regno, base_mem, offset);
+      }
+  if (ret_p)
+    RTVEC_ELT (p, 0) = ret_rtx;
+  if (base_update != 0)
+    {
+      rtx reg, offset;
+      if (!split_mem_address (XEXP (base_mem, 0), &reg, &offset))
+	gcc_unreachable ();
+      RTVEC_ELT (p, b) =
+	gen_rtx_SET (reg, plus_constant (Pmode, reg, base_update));
+    }
+  return gen_rtx_PARALLEL (VOIDmode, p);
+}
+
+/* CDX ldwm/stwm peephole optimization pattern related routines.  */
+
+/* Data structure and sorting function for ldwm/stwm peephole optimizers.  */
+struct ldstwm_operand
+{
+  int offset;	/* Offset from base register.  */
+  rtx reg;	/* Register to store at this offset.  */
+  rtx mem;	/* Original mem.  */
+  bool bad;	/* True if this load/store can't be combined.  */
+  bool rewrite; /* True if we should rewrite using scratch.  */
+};
+
+static int
+compare_ldstwm_operands (const void *arg1, const void *arg2)
+{
+  const struct ldstwm_operand *op1 = (const struct ldstwm_operand *) arg1;
+  const struct ldstwm_operand *op2 = (const struct ldstwm_operand *) arg2;
+  if (op1->bad)
+    return op2->bad ? 0 : 1;
+  else if (op2->bad)
+    return -1;
+  else
+    return op1->offset - op2->offset;
+}
+
+/* Helper function: return true if a load/store using REGNO with address
+   BASEREG and offset OFFSET meets the constraints for a 2-byte CDX ldw.n,
+   stw.n, ldwsp.n, or stwsp.n instruction.  */
+static bool
+can_use_cdx_ldstw (int regno, int basereg, int offset)
+{
+  if (CDX_REG_P (regno) && CDX_REG_P (basereg)
+      && (offset & 0x3) == 0 && 0 <= offset && offset < 0x40)
+    return true;
+  else if (basereg == SP_REGNO
+	   && offset >= 0 && offset < 0x80 && (offset & 0x3) == 0)
+    return true;
+  return false;
+}
+
+/* This function is called from peephole2 optimizers to try to merge
+   a series of individual loads and stores into a ldwm or stwm.  It
+   can also rewrite the addresses of individual loads and stores that
+   share a common base register to use a scratch register and smaller
+   offsets, if that allows them to use CDX ldw.n or stw.n instructions
+   instead of 4-byte loads or stores.
+   N is the number of insns we are trying to merge.  SCRATCH is non-null
+   if there is a scratch register available.  The OPERANDS array contains
+   alternating REG (even) and MEM (odd) operands.  */
+bool
+gen_ldstwm_peep (bool load_p, int n, rtx scratch, rtx *operands)
+{
+  /* CDX ldwm/stwm instructions allow a maximum of 12 registers to be
+     specified.  */
+#define MAX_LDSTWM_OPS 12
+  struct ldstwm_operand sort[MAX_LDSTWM_OPS];
+  int basereg = -1;
+  int baseoffset;
+  int i, m, lastoffset, lastreg;
+  unsigned int regmask = 0, usemask = 0, regset;
+  bool needscratch;
+  int newbasereg;
+  int nbytes;
+
+  if (!TARGET_HAS_CDX)
+    return false;
+  if (n < 2 || n > MAX_LDSTWM_OPS)
+    return false;
+
+  /* Check all the operands for validity and initialize the sort array.
+     The places where we return false here are all situations that aren't
+     expected to ever happen -- invalid patterns, invalid registers, etc.  */
+  for (i = 0; i < n; i++)
+    {
+      rtx base, offset;
+      rtx reg = operands[i];
+      rtx mem = operands[i + n];
+      int r, o, regno;
+      bool bad = false;
+
+      if (!REG_P (reg) || !MEM_P (mem))
+	return false;
+
+      regno = REGNO (reg);
+      if (regno > 31)
+	return false;
+      if (load_p && (regmask & (1 << regno)) != 0)
+	return false;
+      regmask |= 1 << regno;
+
+      if (!split_mem_address (XEXP (mem, 0), &base, &offset))
+	return false;
+      r = REGNO (base);
+      o = INTVAL (offset);
+
+      if (basereg == -1)
+	basereg = r;
+      else if (r != basereg)
+	bad = true;
+      usemask |= 1 << r;
+
+      sort[i].bad = bad;
+      sort[i].rewrite = false;
+      sort[i].offset = o;
+      sort[i].reg = reg;
+      sort[i].mem = mem;
+    }
+
+  /* If we are doing a series of register loads, we can't safely reorder
+     them if any of the regs used in addr expressions are also being set.  */
+  if (load_p && (regmask & usemask))
+    return false;
+
+  /* Sort the array by increasing mem offset order, then check that
+     offsets are valid and register order matches mem order.  At the
+     end of this loop, m is the number of loads/stores we will try to
+     combine; the rest are leftovers.  */
+  qsort (sort, n, sizeof (struct ldstwm_operand), compare_ldstwm_operands);
+
+  baseoffset = sort[0].offset;
+  needscratch = baseoffset != 0;
+  if (needscratch && !scratch)
+    return false;
+
+  lastreg = regmask = regset = 0;
+  lastoffset = baseoffset;
+  for (m = 0; m < n && !sort[m].bad; m++)
+    {
+      int thisreg = REGNO (sort[m].reg);
+      if (sort[m].offset != lastoffset
+	  || (m > 0 && lastreg >= thisreg)
+	  || !nios2_ldstwm_regset_p (thisreg, &regset))
+	break;
+      lastoffset += 4;
+      lastreg = thisreg;
+      regmask |= (1 << thisreg);
+    }
+
+  /* For loads, make sure we are not overwriting the scratch reg.
+     The peephole2 pattern isn't supposed to match unless the register is
+     unused all the way through, so this isn't supposed to happen anyway.  */
+  if (load_p
+      && needscratch
+      && ((1 << REGNO (scratch)) & regmask) != 0)
+    return false;
+  newbasereg = needscratch ? (int) REGNO (scratch) : basereg;
+
+  /* We may be able to combine only the first m of the n total loads/stores
+     into a single instruction.  If m < 2, there's no point in emitting
+     a ldwm/stwm at all, but we might be able to do further optimizations
+     if we have a scratch.  We will count the instruction lengths of the
+     old and new patterns and store the savings in nbytes.  */
+  if (m < 2)
+    {
+      if (!needscratch)
+	return false;
+      m = 0;
+      nbytes = 0;
+    }
+  else
+    nbytes = -4;  /* Size of ldwm/stwm.  */
+  if (needscratch)
+    {
+      int bo = baseoffset > 0 ? baseoffset : -baseoffset;
+      if (CDX_REG_P (newbasereg)
+	  && CDX_REG_P (basereg)
+	  && bo <= 128 && bo > 0 && (bo & (bo - 1)) == 0)
+	nbytes -= 2;  /* Size of addi.n/subi.n.  */
+      else
+	nbytes -= 4;  /* Size of non-CDX addi.  */
+    }
+
+  /* Count the size of the input load/store instructions being replaced.  */
+  for (i = 0; i < m; i++)
+    if (can_use_cdx_ldstw (REGNO (sort[i].reg), basereg, sort[i].offset))
+      nbytes += 2;
+    else
+      nbytes += 4;
+
+  /* We may also be able to save a bit if we can rewrite non-CDX
+     load/stores that can't be combined into the ldwm/stwm into CDX
+     load/stores using the scratch reg.  For example, this might happen
+     if baseoffset is large, by bringing the offsets in the load/store
+     instructions into the range that fits in the CDX instruction.  */
+  if (needscratch && CDX_REG_P (newbasereg))
+    for (i = m; i < n && !sort[i].bad; i++)
+      if (!can_use_cdx_ldstw (REGNO (sort[i].reg), basereg, sort[i].offset)
+	  && can_use_cdx_ldstw (REGNO (sort[i].reg), newbasereg,
+				sort[i].offset - baseoffset))
+	{
+	  sort[i].rewrite = true;
+	  nbytes += 2;
+	}
+
+  /* Are we good to go?  */
+  if (nbytes <= 0)
+    return false;
+
+  /* Emit the scratch load.  */
+  if (needscratch)
+    emit_insn (gen_rtx_SET (scratch, XEXP (sort[0].mem, 0)));
+
+  /* Emit the ldwm/stwm insn.  */
+  if (m > 0)
+    {
+      rtvec p = rtvec_alloc (m);
+      for (i = 0; i < m; i++)
+	{
+	  int offset = sort[i].offset;
+	  rtx mem, reg = sort[i].reg;
+	  rtx base_reg = gen_rtx_REG (Pmode, newbasereg);
+	  if (needscratch)
+	    offset -= baseoffset;
+	  mem = gen_rtx_MEM (SImode, plus_constant (Pmode, base_reg, offset));
+	  if (load_p)
+	    RTVEC_ELT (p, i) = gen_rtx_SET (reg, mem);
+	  else
+	    RTVEC_ELT (p, i) = gen_rtx_SET (mem, reg);
+	}
+      emit_insn (gen_rtx_PARALLEL (VOIDmode, p));
+    }
+
+  /* Emit any leftover load/stores as individual instructions, doing
+     the previously-noted rewrites to use the scratch reg.  */
+  for (i = m; i < n; i++)
+    {
+      rtx reg = sort[i].reg;
+      rtx mem = sort[i].mem;
+      if (sort[i].rewrite)
+	{
+	  int offset = sort[i].offset - baseoffset;
+	  mem = gen_rtx_MEM (SImode, plus_constant (Pmode, scratch, offset));
+	}
+      if (load_p)
+	emit_move_insn (reg, mem);
+      else
+	emit_move_insn (mem, reg);
+    }
+  return true;
+}
+
 /* Implement TARGET_MACHINE_DEPENDENT_REORG:
    We use this hook when emitting CDX code to enforce the 4-byte
    alignment requirement for labels that are used as the targets of
Index: gcc/config/nios2/nios2-ldstwm.sml
===================================================================
--- gcc/config/nios2/nios2-ldstwm.sml	(revision 0)
+++ gcc/config/nios2/nios2-ldstwm.sml	(revision 0)
@@ -0,0 +1,277 @@ 
+(* Auto-generate Nios II R2 CDX ldwm/stwm/push.n/pop.n patterns
+   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Contributed by Mentor Graphics.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it under
+   the terms of the GNU General Public License as published by the Free
+   Software Foundation; either version 3, or (at your option) any later
+   version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or
+   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+   for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.
+
+   This is a Standard ML program.  There are multiple Standard ML
+   implementations widely available.  We recommend the MLton optimizing
+   SML compiler, due to its ease of creating a standalone executable.
+
+     http://www.mlton.org/
+
+   Or from your favourite OS's friendly packaging system. Tested with
+   MLton Release 20130715, though other versions will probably work too.
+
+   Run with:
+     mlton -output a.out /path/to/gcc/config/nios2/nios2-ldstwm.sml
+     ./a.out >/path/to/gcc/config/nios2/ldstwm.md
+*)
+
+datatype ld_st = ld | st;    
+datatype push_pop = push | pop;
+datatype inc_dec = inc | dec;
+
+fun for ls f = map f ls;
+fun conds cond str = if cond then str else "";
+fun ints n = if n>=0 then (Int.toString n) else ("-" ^ (Int.toString (~n)));
+
+fun pushpop_pattern pptype n fp =
+    let 
+	val sp_reg = "(reg:SI SP_REGNO)";
+	val ra_reg = "(reg:SI RA_REGNO)";
+	val fp_reg = "(reg:SI FP_REGNO)";
+
+	fun sets lhs rhs = "(set " ^ lhs ^
+			   (if pptype=push then " "
+			    else " ") ^ rhs ^ ")";
+	val sp_adj =
+	    "(set " ^ sp_reg ^ "\n          " ^
+	    "(plus:SI " ^ sp_reg ^
+	    " (match_operand 1 \"const_int_operand\" \"\")))";
+
+	fun reg i regi = "(reg:SI " ^ (ints regi) ^ ")";
+	fun mem i opndi =
+	    if pptype=push then
+		"(mem:SI (plus:SI (reg:SI SP_REGNO) (const_int " ^ (ints (~4*i)) ^ ")))"
+	    else
+		"(match_operand:SI " ^
+		(ints opndi) ^ " \"stack_memory_operand\" \"\")";
+
+	val start = 1 + (if fp then 2 else 1);
+	val lim = n + (if fp then 2 else 1);
+	fun set_elt i regi opndi =
+	    if pptype=push then (sets (mem i opndi) (reg i regi))
+	    else (sets (reg i regi) (mem i opndi));
+	fun get_elt_list (i, regi, opndi) =
+	    if i > lim then []
+	    else (set_elt i regi opndi) :: get_elt_list (i+1, regi-1, opndi+1);
+
+	val set_elements = get_elt_list (start, 16+n-1, start+1);
+
+	val ra_set = if pptype=push then sets (mem 1 2) ra_reg
+		     else sets ra_reg (mem 1 2);
+	val fp_set = (conds fp (if pptype=push then sets (mem 2 3) fp_reg
+				else sets fp_reg (mem 2 3)));
+	val ret = (conds (pptype=pop) "(return)");
+	val element_list =
+	    List.filter (fn x => x<>"")
+			([ret, sp_adj, ra_set, fp_set] @ set_elements);
+
+	fun reg_index i = 16 + n - i;
+	fun pop_opnds 0 spl = (conds fp ("fp" ^ spl)) ^ "ra"
+	  | pop_opnds n spl = "r" ^ (ints (reg_index n)) ^ spl ^ (pop_opnds (n-1) spl);
+	fun push_opnds 0 spl = "ra" ^ (conds fp (spl ^ "fp"))
+	  | push_opnds n spl = (push_opnds (n-1) spl) ^ spl ^ "r" ^ (ints (reg_index n));
+
+	val spadj_opnd = if pptype=push then 2 else (start+n);
+	val spadj = ints spadj_opnd;
+	val regsave_num = n + (if fp then 2 else 1);
+
+	val ppname = if pptype=push then "push" else "pop";
+	val name = if pptype=push then "push" ^ "_" ^ (push_opnds n "_")
+		   else "pop" ^ "_" ^ (pop_opnds n "_");
+    in
+	"(define_insn \"*cdx_" ^ name ^ "\"\n" ^
+	"  [(match_parallel 0 \"" ^
+	(conds (pptype=pop) "pop_operation") ^ "\"\n" ^
+	"    [" ^ (String.concatWith ("\n     ") element_list) ^ "])]\n" ^
+	"   \"TARGET_HAS_CDX && XVECLEN (operands[0], 0) == " ^
+	(ints (length element_list)) ^
+	(conds (pptype=push)
+	       ("\n    && (-INTVAL (operands[1]) & 3) == 0\n" ^
+		"    && (-INTVAL (operands[1]) - " ^
+		(ints (4*regsave_num)) ^ ") <= 60")) ^
+	"\"\n" ^
+	(if pptype=pop then
+	     "{\n" ^
+	     "  rtx x = XEXP (operands[" ^ spadj ^ "], 0);\n" ^
+	     "  operands[" ^ spadj ^ "] = REG_P (x) ? const0_rtx : XEXP (x, 1);\n" ^
+	     "  return \"pop.n\\\\t{" ^ (pop_opnds n ", ") ^ "}, %" ^ spadj ^ "\";\n" ^
+	     "}\n"
+	 else
+	     "{\n" ^
+	     "  operands[" ^ spadj ^ "] = " ^
+	     "GEN_INT (-INTVAL (operands[1]) - " ^ (ints (4*regsave_num)) ^ ");\n" ^
+	     "  return \"push.n\\\\t{" ^ (push_opnds n ", ") ^ "}, %" ^ spadj ^ "\";\n" ^
+	     "}\n") ^
+	"  [(set_attr \"type\" \"" ^ ppname ^ "\")])\n\n"
+    end;
+
+fun ldstwm_pattern ldst n id wb pc =
+    let
+	val ldstwm = (if ldst=ld then "ldwm" else "stwm");
+	val name = "*cdx_" ^ ldstwm ^ (Int.toString n) ^
+		   (if id=inc then "_inc" else "_dec") ^
+		   (conds wb "_wb") ^ (conds pc "_ret");
+	val base_reg_referenced_p = ref false;
+	val base_regno = ints (n+1);
+	fun plus_addr base offset =
+	    "(plus:SI " ^ base ^ " (const_int " ^ (ints offset) ^ "))";
+	fun base_reg () =
+	    if !base_reg_referenced_p then
+		"(match_dup " ^ base_regno ^ ")"
+	    else (base_reg_referenced_p := true;
+		  "(match_operand:SI " ^ base_regno ^
+		  " \"register_operand\" \"" ^ (conds wb "+&") ^ "r\")");
+	fun reg i = "(match_operand:SI " ^ (ints i) ^
+		    " \"nios2_hard_register_operand\" \"" ^
+		    (conds (ldst=ld) "") ^ "\")";
+
+	fun addr 1 = if id=inc then base_reg ()
+		     else plus_addr (base_reg ()) (~4)
+	  | addr i = let val offset = if id=inc then (i-1)*4 else (~i*4)
+		     in plus_addr (base_reg ()) offset end;
+
+	fun mem i = "(mem:SI " ^ (addr i) ^ ")";
+	fun lhs i = if ldst=ld then reg i else mem i;
+	fun rhs i = if ldst=st then reg i else mem i;
+	fun sets lhs rhs = "(set " ^ lhs ^ "\n          " ^ rhs ^ ")";
+	fun set_elements i =
+	    if i > n then []
+	    else (sets (lhs i) (rhs i)) :: (set_elements (i+1));
+
+	fun opnds 1 = "%1"
+	  | opnds n = opnds(n-1) ^ ", %" ^ (Int.toString n);
+
+	val asm_template = ldstwm ^ "\\\\t{" ^ (opnds n) ^ "}" ^
+			   (if id=inc
+			    then ", (%" ^ base_regno ^ ")++"
+			    else ", --(%" ^ base_regno ^ ")") ^
+			   (conds wb ", writeback") ^
+			   (conds pc ", ret");
+	val wbtmp =
+	    if wb then
+		(sets (base_reg ())
+		      (plus_addr (base_reg ())
+				 ((if id=inc then n else ~n)*4)))
+	    else "";
+	val pctmp = conds pc "(return)";
+	val set_list = List.filter (fn x => x<>"")
+				   ([pctmp, wbtmp] @ (set_elements 1));
+    in
+	if ldst=st andalso pc then ""
+	else
+	    "(define_insn \"" ^ name ^ "\"\n" ^
+	    "  [(match_parallel 0 \"" ^ ldstwm ^  "_operation\"\n" ^
+	    "    [" ^ (String.concatWith ("\n     ") set_list) ^ "])]\n" ^
+	    "   \"TARGET_HAS_CDX && XVECLEN (operands[0], 0) == " ^
+	    (ints (length set_list)) ^ "\"\n" ^
+	    "   \"" ^ asm_template ^ "\"\n" ^
+	    "  [(set_attr \"type\" \"" ^ ldstwm ^ "\")])\n\n"
+    end;
+
+fun peephole_pattern ldst n scratch_p =
+    let
+	fun sets lhs rhs = "(set " ^ lhs ^ "\n        " ^ rhs ^ ")";
+	fun single_set i indent =
+	    let val reg = "(match_operand:SI " ^ (ints i) ^
+			  " \"register_operand\" \"\")";
+		val mem = "(match_operand:SI " ^ (ints (i+n)) ^
+			  " \"memory_operand\" \"\")";
+	    in
+		if ldst=ld then sets reg mem
+		else sets mem reg
+	    end;
+
+	fun single_sets i =
+	    if i=n then []
+	    else (single_set i "   ") :: (single_sets (i+1));
+
+	val scratch = ints (2*n);
+	val peephole_elements =
+	    let val tmp = single_sets 0 in
+		if scratch_p
+		then (["(match_scratch:SI " ^ scratch ^ " \"r\")"] @
+		      tmp @
+		      ["(match_dup " ^ scratch ^ ")"])
+		else tmp
+	    end;
+    in
+	"(define_peephole2\n" ^
+	"  [" ^ (String.concatWith ("\n   ") peephole_elements) ^ "]\n" ^
+	"  \"TARGET_HAS_CDX\"\n" ^
+	"  [(const_int 0)]\n" ^
+	"{\n" ^
+	"  if (gen_ldstwm_peep (" ^
+	(if ldst=st then "false" else "true") ^ ", " ^ (ints n) ^ ", " ^ 
+	(if scratch_p then ("operands[" ^ scratch ^ "]") else "NULL_RTX") ^
+	", operands))\n" ^
+	"    DONE;\n" ^
+	"  else\n" ^
+	"    FAIL;\n" ^
+	"})\n\n"
+    end;
+
+
+print
+("/* Nios II R2 CDX ldwm/stwm/push.h/pop.n instruction patterns.\n" ^
+ "   This file was automatically generated using nios2-ldstwm.sml.\n" ^
+ "   Please do not edit manually.\n" ^
+ "\n" ^
+ "   Copyright (C) 2014-2015 Free Software Foundation, Inc.\n" ^
+ "   Contributed by Mentor Graphics.\n" ^
+ "\n" ^
+ "   This file is part of GCC.\n" ^
+ "\n" ^
+ "   GCC is free software; you can redistribute it and/or modify it\n" ^
+ "   under the terms of the GNU General Public License as published\n" ^
+ "   by the Free Software Foundation; either version 3, or (at your\n" ^
+ "   option) any later version.\n" ^
+ "\n" ^
+ "   GCC is distributed in the hope that it will be useful, but WITHOUT\n" ^
+ "   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY\n" ^
+ "   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public\n" ^
+ "   License for more details.\n" ^
+ "\n" ^
+ "   You should have received a copy of the GNU General Public License and\n" ^
+ "   a copy of the GCC Runtime Library Exception along with this program;\n" ^
+ "   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see\n" ^
+ "   <http://www.gnu.org/licenses/>.  */\n\n");
+
+fun seq a b = if a=b then [b]
+	      else a :: (seq (if a<b then a+1 else a-1) b);
+
+(* push/pop patterns *)
+for (seq 0 8) (fn n =>
+  for [push, pop] (fn p =>
+    for [true, false] (fn fp =>
+       print (pushpop_pattern p n fp))));
+
+(* ldwm/stwm patterns *)
+for [ld, st] (fn l =>
+  for (seq 1 12) (fn n =>
+    for [inc, dec] (fn id =>
+      for [true, false] (fn wb =>
+        for [true, false] (fn pc =>
+          print (ldstwm_pattern l n id wb pc))))));
+
+(* peephole patterns *)
+for [ld, st] (fn l =>
+  for (seq 12 2) (fn n =>
+    print (peephole_pattern l n true)));
+
Index: gcc/config/nios2/nios2.md
===================================================================
--- gcc/config/nios2/nios2.md	(revision 225796)
+++ gcc/config/nios2/nios2.md	(working copy)
@@ -1169,3 +1169,6 @@ 
   emit_move_insn (operands[0], gen_rtx_REG (Pmode, TP_REGNO));
   DONE;
 })
+;; Include the ldwm/stwm/push.n/pop.n patterns and peepholes.
+(include "ldstwm.md")
+
Index: gcc/testsuite/gcc.target/nios2/cdx-ldstwm-1.c
===================================================================
--- gcc/testsuite/gcc.target/nios2/cdx-ldstwm-1.c	(revision 0)
+++ gcc/testsuite/gcc.target/nios2/cdx-ldstwm-1.c	(revision 0)
@@ -0,0 +1,24 @@ 
+/* { dg-do assemble } */
+/* { dg-options "-O3 -fomit-frame-pointer -funroll-all-loops -finline-functions -march=r2 -mcdx -w" } */
+
+/* Based on gcc.c-torture/compile/920501-23.c.
+   This test used to result in assembler errors with R2 CDX because of
+   a bug in regrename; it wasn't re-validating insns after renaming, so
+   ldwm/stwm instructions with incorrect registers were being emitted.  */
+
+typedef unsigned char qi;
+typedef unsigned short hi;
+typedef unsigned long si;
+typedef unsigned long long di;
+subi(a){return 100-a;}
+add(a,b){return a+b;}
+mul(a){return 85*a;}
+memshift(p)unsigned*p;{unsigned x;for(;;){x=*p++>>16;if(x)return x;}}
+ldw(xp)si*xp;{return xp[4];}
+ldws_m(xp)si*xp;{si x;do{x=xp[3];xp+=3;}while(x);}
+postinc_si(p)si*p;{si x;for(;;){x=*p++;if(x)return x;}}
+preinc_si(p)si*p;{si x;for(;;){x=*++p;if(x)return x;}}
+postinc_di(p)di*p;{di x;for(;;){x=*p++;if(x)return x;}}
+preinc_di(p)di*p;{di x;for(;;){x=*++p;if(x)return x;}}
+inc_overlap(p,a)di*p;{do{p=*(di**)p;p=(di*)((int)p+4);}while(*p);}
+di move_di(p,p2)di*p,*p2;{di x=p;p2=((di*)x)[1];return p2[1];}
Index: gcc/testsuite/gcc.target/nios2/cdx-ldstwm-2.c
===================================================================
--- gcc/testsuite/gcc.target/nios2/cdx-ldstwm-2.c	(revision 0)
+++ gcc/testsuite/gcc.target/nios2/cdx-ldstwm-2.c	(revision 0)
@@ -0,0 +1,66 @@ 
+/* { dg-do assemble } */
+/* { dg-options "-O3 -fomit-frame-pointer -funroll-loops -march=r2 -mcdx -w" } */
+
+/* Based on gcc.c-torture/execute/20021120-1.c.
+   This test used to result in assembler errors with R2 CDX because of
+   a bug in regrename; it wasn't re-validating insns after renaming, so
+   ldwm/stwm instructions with incorrect registers were being emitted.  */
+
+/* Macros to emit "L Nxx R" for each octal number xx between 000 and 037.  */
+#define OP1(L, N, R, I, J) L N##I##J R
+#define OP2(L, N, R, I) \
+    OP1(L, N, R, 0, I), OP1(L, N, R, 1, I), \
+    OP1(L, N, R, 2, I), OP1(L, N, R, 3, I)
+#define OP(L, N, R) \
+    OP2(L, N, R, 0), OP2(L, N, R, 1), OP2(L, N, R, 2), OP2(L, N, R, 3), \
+    OP2(L, N, R, 4), OP2(L, N, R, 5), OP2(L, N, R, 6), OP2(L, N, R, 7)
+
+/* Declare 32 unique variables with prefix N.  */
+#define DECLARE(N) OP (, N,)
+
+/* Copy 32 variables with prefix N from the array at ADDR.
+   Leave ADDR pointing to the end of the array.  */
+#define COPYIN(N, ADDR) OP (, N, = *(ADDR++))
+
+/* Likewise, but copy the other way.  */
+#define COPYOUT(N, ADDR) OP (*(ADDR++) =, N,)
+
+/* Add the contents of the array at ADDR to 32 variables with prefix N.
+   Leave ADDR pointing to the end of the array.  */
+#define ADD(N, ADDR) OP (, N, += *(ADDR++))
+
+volatile double gd[32];
+volatile float gf[32];
+
+void foo (int n)
+{
+  double DECLARE(d);
+  float DECLARE(f);
+  volatile double *pd;
+  volatile float *pf;
+  int i;
+
+  pd = gd; COPYIN (d, pd);
+  for (i = 0; i < n; i++)
+    {
+      pf = gf; COPYIN (f, pf);
+      pd = gd; ADD (d, pd);
+      pd = gd; ADD (d, pd);
+      pd = gd; ADD (d, pd);
+      pf = gf; COPYOUT (f, pf);
+    }
+  pd = gd; COPYOUT (d, pd);
+}
+
+int main ()
+{
+  int i;
+
+  for (i = 0; i < 32; i++)
+    gd[i] = i, gf[i] = i;
+  foo (1);
+  for (i = 0; i < 32; i++)
+    if (gd[i] != i * 4 || gf[i] != i)
+      abort ();
+  exit (0);
+}