Patchwork [v3] s390: Convert from sync to atomic optabs

login
register
mail settings
Submitter Richard Henderson
Date Aug. 6, 2012, 11 p.m.
Message ID <1344294001-30600-1-git-send-email-rth@redhat.com>
Download mbox | patch
Permalink /patch/175471/
State New
Headers show

Comments

Richard Henderson - Aug. 6, 2012, 11 p.m.
[ This ought to be exactly the patch you bootstrapped.  It does
  not include the SEQ follow-up. ]

Split out s390_two_part_insv from s390_expand_cs_hqi to try
harder to use bit insertion instructions in the CAS loop.

Reorg s390_expand_insv to aid that.  Try RISBG last, after other
mechanisms have failed; don't require operands in registers for
it but force them there instead.  Try a limited form of ICM.

--

 gcc/config/s390/s390-protos.h |    3 +-
 gcc/config/s390/s390.c        |  302 ++++++++++++++++++++------------
 gcc/config/s390/s390.md       |  389 +++++++++++++++++++++++++++++------------
 3 files changed, 471 insertions(+), 223 deletions(-)

Patch

diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index 4f1eb42..79673d6 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -85,7 +85,8 @@  extern void s390_expand_setmem (rtx, rtx, rtx);
 extern bool s390_expand_cmpmem (rtx, rtx, rtx, rtx);
 extern bool s390_expand_addcc (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
 extern bool s390_expand_insv (rtx, rtx, rtx, rtx);
-extern void s390_expand_cs_hqi (enum machine_mode, rtx, rtx, rtx, rtx);
+extern void s390_expand_cs_hqi (enum machine_mode, rtx, rtx, rtx,
+				rtx, rtx, bool);
 extern void s390_expand_atomic (enum machine_mode, enum rtx_code,
 				rtx, rtx, rtx, bool);
 extern rtx s390_return_addr_rtx (int, rtx);
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 3a87291..20a2db6 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -896,10 +896,12 @@  s390_emit_compare (enum rtx_code code, rtx op0, rtx op1)
    conditional branch testing the result.  */
 
 static rtx
-s390_emit_compare_and_swap (enum rtx_code code, rtx old, rtx mem, rtx cmp, rtx new_rtx)
+s390_emit_compare_and_swap (enum rtx_code code, rtx old, rtx mem,
+			    rtx cmp, rtx new_rtx)
 {
-  emit_insn (gen_sync_compare_and_swapsi (old, mem, cmp, new_rtx));
-  return s390_emit_compare (code, gen_rtx_REG (CCZ1mode, CC_REGNUM), const0_rtx);
+  emit_insn (gen_atomic_compare_and_swapsi_internal (old, mem, cmp, new_rtx));
+  return s390_emit_compare (code, gen_rtx_REG (CCZ1mode, CC_REGNUM),
+			    const0_rtx);
 }
 
 /* Emit a jump instruction to TARGET.  If COND is NULL_RTX, emit an
@@ -4548,106 +4550,146 @@  s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
 {
   int bitsize = INTVAL (op1);
   int bitpos = INTVAL (op2);
+  enum machine_mode mode = GET_MODE (dest);
+  enum machine_mode smode;
+  int smode_bsize, mode_bsize;
+  rtx op, clobber;
 
-  /* On z10 we can use the risbg instruction to implement insv.  */
-  if (TARGET_Z10
-      && ((GET_MODE (dest) == DImode && GET_MODE (src) == DImode)
-	  || (GET_MODE (dest) == SImode && GET_MODE (src) == SImode)))
+  /* Generate INSERT IMMEDIATE (IILL et al).  */
+  /* (set (ze (reg)) (const_int)).  */
+  if (TARGET_ZARCH
+      && register_operand (dest, word_mode)
+      && (bitpos % 16) == 0
+      && (bitsize % 16) == 0
+      && const_int_operand (src, VOIDmode))
     {
-      rtx op;
-      rtx clobber;
+      HOST_WIDE_INT val = INTVAL (src);
+      int regpos = bitpos + bitsize;
 
-      op = gen_rtx_SET (GET_MODE(src),
-			gen_rtx_ZERO_EXTRACT (GET_MODE (dest), dest, op1, op2),
-			src);
-      clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, CC_REGNUM));
-      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clobber)));
+      while (regpos > bitpos)
+	{
+	  enum machine_mode putmode;
+	  int putsize;
 
+	  if (TARGET_EXTIMM && (regpos % 32 == 0) && (regpos >= bitpos + 32))
+	    putmode = SImode;
+	  else
+	    putmode = HImode;
+
+	  putsize = GET_MODE_BITSIZE (putmode);
+	  regpos -= putsize;
+	  emit_move_insn (gen_rtx_ZERO_EXTRACT (word_mode, dest,
+						GEN_INT (putsize),
+						GEN_INT (regpos)),
+			  gen_int_mode (val, putmode));
+	  val >>= putsize;
+	}
+      gcc_assert (regpos == bitpos);
       return true;
     }
 
-  /* We need byte alignment.  */
-  if (bitsize % BITS_PER_UNIT)
-    return false;
+  smode = smallest_mode_for_size (bitsize, MODE_INT);
+  smode_bsize = GET_MODE_BITSIZE (smode);
+  mode_bsize = GET_MODE_BITSIZE (mode);
 
+  /* Generate STORE CHARACTERS UNDER MASK (STCM et al).  */
   if (bitpos == 0
-      && memory_operand (dest, VOIDmode)
+      && (bitsize % BITS_PER_UNIT) == 0
+      && MEM_P (dest)
       && (register_operand (src, word_mode)
 	  || const_int_operand (src, VOIDmode)))
     {
       /* Emit standard pattern if possible.  */
-      enum machine_mode mode = smallest_mode_for_size (bitsize, MODE_INT);
-      if (GET_MODE_BITSIZE (mode) == bitsize)
-	emit_move_insn (adjust_address (dest, mode, 0), gen_lowpart (mode, src));
+      if (smode_bsize == bitsize)
+	{
+	  emit_move_insn (adjust_address (dest, smode, 0),
+			  gen_lowpart (smode, src));
+	  return true;
+	}
 
       /* (set (ze (mem)) (const_int)).  */
       else if (const_int_operand (src, VOIDmode))
 	{
 	  int size = bitsize / BITS_PER_UNIT;
-	  rtx src_mem = adjust_address (force_const_mem (word_mode, src), BLKmode,
-					GET_MODE_SIZE (word_mode) - size);
+	  rtx src_mem = adjust_address (force_const_mem (word_mode, src),
+					BLKmode,
+					UNITS_PER_WORD - size);
 
 	  dest = adjust_address (dest, BLKmode, 0);
 	  set_mem_size (dest, size);
 	  s390_expand_movmem (dest, src_mem, GEN_INT (size));
+	  return true;
 	}
 
       /* (set (ze (mem)) (reg)).  */
       else if (register_operand (src, word_mode))
 	{
-	  if (bitsize <= GET_MODE_BITSIZE (SImode))
+	  if (bitsize <= 32)
 	    emit_move_insn (gen_rtx_ZERO_EXTRACT (word_mode, dest, op1,
 						  const0_rtx), src);
 	  else
 	    {
 	      /* Emit st,stcmh sequence.  */
-	      int stcmh_width = bitsize - GET_MODE_BITSIZE (SImode);
+	      int stcmh_width = bitsize - 32;
 	      int size = stcmh_width / BITS_PER_UNIT;
 
 	      emit_move_insn (adjust_address (dest, SImode, size),
 			      gen_lowpart (SImode, src));
 	      set_mem_size (dest, size);
-	      emit_move_insn (gen_rtx_ZERO_EXTRACT (word_mode, dest, GEN_INT
-						    (stcmh_width), const0_rtx),
-			      gen_rtx_LSHIFTRT (word_mode, src, GEN_INT
-						(GET_MODE_BITSIZE (SImode))));
+	      emit_move_insn (gen_rtx_ZERO_EXTRACT (word_mode, dest,
+						    GEN_INT (stcmh_width),
+						    const0_rtx),
+			      gen_rtx_LSHIFTRT (word_mode, src, GEN_INT (32)));
 	    }
+	  return true;
 	}
-      else
-	return false;
+    }
 
-      return true;
+  /* Generate INSERT CHARACTERS UNDER MASK (IC, ICM et al).  */
+  if ((bitpos % BITS_PER_UNIT) == 0
+      && (bitsize % BITS_PER_UNIT) == 0
+      && (bitpos & 32) == ((bitpos + bitsize - 1) & 32)
+      && MEM_P (src)
+      && (mode == DImode || mode == SImode)
+      && register_operand (dest, mode))
+    {
+      /* Emit a strict_low_part pattern if possible.  */
+      if (smode_bsize == bitsize && bitpos == mode_bsize - smode_bsize)
+	{
+	  op = gen_rtx_STRICT_LOW_PART (VOIDmode, gen_lowpart (smode, dest));
+	  op = gen_rtx_SET (VOIDmode, op, gen_lowpart (smode, src));
+	  clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, CC_REGNUM));
+	  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clobber)));
+	  return true;
+	}
+
+      /* ??? There are more powerful versions of ICM that are not
+	 completely represented in the md file.  */
     }
 
-  /* (set (ze (reg)) (const_int)).  */
-  if (TARGET_ZARCH
-      && register_operand (dest, word_mode)
-      && (bitpos % 16) == 0
-      && (bitsize % 16) == 0
-      && const_int_operand (src, VOIDmode))
+  /* For z10, generate ROTATE THEN INSERT SELECTED BITS (RISBG et al).  */
+  if (TARGET_Z10 && (mode == DImode || mode == SImode))
     {
-      HOST_WIDE_INT val = INTVAL (src);
-      int regpos = bitpos + bitsize;
+      enum machine_mode mode_s = GET_MODE (src);
 
-      while (regpos > bitpos)
+      if (mode_s == VOIDmode)
 	{
-	  enum machine_mode putmode;
-	  int putsize;
+	  /* Assume const_int etc already in the proper mode.  */
+	  src = force_reg (mode, src);
+	}
+      else if (mode_s != mode)
+	{
+	  gcc_assert (GET_MODE_BITSIZE (mode_s) >= bitsize);
+	  src = force_reg (mode_s, src);
+	  src = gen_lowpart (mode, src);
+	}
 
-	  if (TARGET_EXTIMM && (regpos % 32 == 0) && (regpos >= bitpos + 32))
-	    putmode = SImode;
-	  else
-	    putmode = HImode;
+      op = gen_rtx_SET (mode,
+			gen_rtx_ZERO_EXTRACT (mode, dest, op1, op2),
+			src);
+      clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, CC_REGNUM));
+      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clobber)));
 
-	  putsize = GET_MODE_BITSIZE (putmode);
-	  regpos -= putsize;
-	  emit_move_insn (gen_rtx_ZERO_EXTRACT (word_mode, dest,
-						GEN_INT (putsize),
-						GEN_INT (regpos)),
-			  gen_int_mode (val, putmode));
-	  val >>= putsize;
-	}
-      gcc_assert (regpos == bitpos);
       return true;
     }
 
@@ -4717,92 +4759,134 @@  init_alignment_context (struct alignment_context *ac, rtx mem,
       /* As we already have some offset, evaluate the remaining distance.  */
       ac->shift = expand_simple_binop (SImode, MINUS, ac->shift, byteoffset,
 				      NULL_RTX, 1, OPTAB_DIRECT);
-
     }
+
   /* Shift is the byte count, but we need the bitcount.  */
-  ac->shift = expand_simple_binop (SImode, MULT, ac->shift, GEN_INT (BITS_PER_UNIT),
-				  NULL_RTX, 1, OPTAB_DIRECT);
+  ac->shift = expand_simple_binop (SImode, ASHIFT, ac->shift, GEN_INT (3),
+				   NULL_RTX, 1, OPTAB_DIRECT);
+
   /* Calculate masks.  */
   ac->modemask = expand_simple_binop (SImode, ASHIFT,
-				     GEN_INT (GET_MODE_MASK (mode)), ac->shift,
-				     NULL_RTX, 1, OPTAB_DIRECT);
-  ac->modemaski = expand_simple_unop (SImode, NOT, ac->modemask, NULL_RTX, 1);
+				      GEN_INT (GET_MODE_MASK (mode)),
+				      ac->shift, NULL_RTX, 1, OPTAB_DIRECT);
+  ac->modemaski = expand_simple_unop (SImode, NOT, ac->modemask,
+				      NULL_RTX, 1);
+}
+
+/* A subroutine of s390_expand_cs_hqi.  Insert INS into VAL.  If possible,
+   use a single insv insn into SEQ2.  Otherwise, put prep insns in SEQ1 and
+   perform the merge in SEQ2.  */
+
+static rtx
+s390_two_part_insv (struct alignment_context *ac, rtx *seq1, rtx *seq2,
+		    enum machine_mode mode, rtx val, rtx ins)
+{
+  rtx tmp;
+
+  if (ac->aligned)
+    {
+      start_sequence ();
+      tmp = copy_to_mode_reg (SImode, val);
+      if (s390_expand_insv (tmp, GEN_INT (GET_MODE_BITSIZE (mode)),
+			    const0_rtx, ins))
+	{
+	  *seq1 = NULL;
+	  *seq2 = get_insns ();
+	  end_sequence ();
+	  return tmp;
+	}
+      end_sequence ();
+    }
+
+  /* Failed to use insv.  Generate a two part shift and mask.  */
+  start_sequence ();
+  tmp = s390_expand_mask_and_shift (ins, mode, ac->shift);
+  *seq1 = get_insns ();
+  end_sequence ();
+
+  start_sequence ();
+  tmp = expand_simple_binop (SImode, IOR, tmp, val, NULL_RTX, 1, OPTAB_DIRECT);
+  *seq2 = get_insns ();
+  end_sequence ();
+
+  return tmp;
 }
 
 /* Expand an atomic compare and swap operation for HImode and QImode.  MEM is
-   the memory location, CMP the old value to compare MEM with and NEW_RTX the value
-   to set if CMP == MEM.
-   CMP is never in memory for compare_and_swap_cc because
-   expand_bool_compare_and_swap puts it into a register for later compare.  */
+   the memory location, CMP the old value to compare MEM with and NEW_RTX the
+   value to set if CMP == MEM.  */
 
 void
-s390_expand_cs_hqi (enum machine_mode mode, rtx target, rtx mem, rtx cmp, rtx new_rtx)
+s390_expand_cs_hqi (enum machine_mode mode, rtx btarget, rtx vtarget, rtx mem,
+		    rtx cmp, rtx new_rtx, bool is_weak)
 {
   struct alignment_context ac;
-  rtx cmpv, newv, val, resv, cc;
+  rtx cmpv, newv, val, resv, cc, seq0, seq1, seq2, seq3;
   rtx res = gen_reg_rtx (SImode);
-  rtx csloop = gen_label_rtx ();
-  rtx csend = gen_label_rtx ();
+  rtx csloop = NULL, csend = NULL;
 
-  gcc_assert (register_operand (target, VOIDmode));
+  gcc_assert (register_operand (vtarget, VOIDmode));
   gcc_assert (MEM_P (mem));
 
   init_alignment_context (&ac, mem, mode);
 
-  /* Shift the values to the correct bit positions.  */
-  if (!(ac.aligned && MEM_P (cmp)))
-    cmp = s390_expand_mask_and_shift (cmp, mode, ac.shift);
-  if (!(ac.aligned && MEM_P (new_rtx)))
-    new_rtx = s390_expand_mask_and_shift (new_rtx, mode, ac.shift);
-
   /* Load full word.  Subsequent loads are performed by CS.  */
   val = expand_simple_binop (SImode, AND, ac.memsi, ac.modemaski,
 			     NULL_RTX, 1, OPTAB_DIRECT);
 
+  /* Prepare insertions of cmp and new_rtx into the loaded value.  When
+     possible, we try to use insv to make this happen efficiently.  If
+     that fails we'll generate code both inside and outside the loop.  */
+  cmpv = s390_two_part_insv (&ac, &seq0, &seq2, mode, val, cmp);
+  newv = s390_two_part_insv (&ac, &seq1, &seq3, mode, val, new_rtx);
+
+  if (seq0)
+    emit_insn (seq0);
+  if (seq1)
+    emit_insn (seq1);
+
   /* Start CS loop.  */
-  emit_label (csloop);
+  if (!is_weak)
+    {
+      /* Begin assuming success.  */
+      emit_move_insn (btarget, const1_rtx);
+
+      csloop = gen_label_rtx ();
+      csend = gen_label_rtx ();
+      emit_label (csloop);
+    }
+
   /* val = "<mem>00..0<mem>"
    * cmp = "00..0<cmp>00..0"
    * new = "00..0<new>00..0"
    */
 
-  /* Patch cmp and new with val at correct position.  */
-  if (ac.aligned && MEM_P (cmp))
-    {
-      cmpv = force_reg (SImode, val);
-      store_bit_field (cmpv, GET_MODE_BITSIZE (mode), 0,
-		       0, 0, SImode, cmp);
-    }
+  emit_insn (seq2);
+  emit_insn (seq3);
+
+  cc = s390_emit_compare_and_swap (EQ, res, ac.memsi, cmpv, newv);
+  if (is_weak)
+    emit_insn (gen_cstorecc4 (btarget, cc, XEXP (cc, 0), XEXP (cc, 1)));
   else
-    cmpv = force_reg (SImode, expand_simple_binop (SImode, IOR, cmp, val,
-						   NULL_RTX, 1, OPTAB_DIRECT));
-  if (ac.aligned && MEM_P (new_rtx))
     {
-      newv = force_reg (SImode, val);
-      store_bit_field (newv, GET_MODE_BITSIZE (mode), 0,
-		       0, 0, SImode, new_rtx);
-    }
-  else
-    newv = force_reg (SImode, expand_simple_binop (SImode, IOR, new_rtx, val,
-						   NULL_RTX, 1, OPTAB_DIRECT));
+      /* Jump to end if we're done (likely?).  */
+      s390_emit_jump (csend, cc);
 
-  /* Jump to end if we're done (likely?).  */
-  s390_emit_jump (csend, s390_emit_compare_and_swap (EQ, res, ac.memsi,
-						     cmpv, newv));
+      /* Check for changes outside mode, and loop internal if so.  */
+      resv = expand_simple_binop (SImode, AND, res, ac.modemaski,
+			          NULL_RTX, 1, OPTAB_DIRECT);
+      cc = s390_emit_compare (NE, resv, val);
+      emit_move_insn (val, resv);
+      s390_emit_jump (csloop, cc);
 
-  /* Check for changes outside mode.  */
-  resv = expand_simple_binop (SImode, AND, res, ac.modemaski,
-			      NULL_RTX, 1, OPTAB_DIRECT);
-  cc = s390_emit_compare (NE, resv, val);
-  emit_move_insn (val, resv);
-  /* Loop internal if so.  */
-  s390_emit_jump (csloop, cc);
-
-  emit_label (csend);
+      /* Failed.  */
+      emit_move_insn (btarget, const0_rtx);
+      emit_label (csend);
+    }
 
   /* Return the correct part of the bitfield.  */
-  convert_move (target, expand_simple_binop (SImode, LSHIFTRT, res, ac.shift,
-					     NULL_RTX, 1, OPTAB_DIRECT), 1);
+  convert_move (vtarget, expand_simple_binop (SImode, LSHIFTRT, res, ac.shift,
+					      NULL_RTX, 1, OPTAB_DIRECT), 1);
 }
 
 /* Expand an atomic operation CODE of mode MODE.  MEM is the memory location
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 096f266..0e43e51 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -84,6 +84,7 @@ 
 
    ; Atomic Support
    UNSPEC_MB
+   UNSPEC_MOVA
 
    ; TLS relocation specifiers
    UNSPEC_TLSGD
@@ -349,21 +350,19 @@ 
 (define_mode_iterator DD_DF [DF DD])
 (define_mode_iterator TD_TF [TF TD])
 
-;; This mode iterator allows 31-bit and 64-bit TDSI patterns to be generated
-;; from the same template.
-(define_mode_iterator TDSI [(TI "TARGET_64BIT") DI SI])
-
 ;; These mode iterators allow 31-bit and 64-bit GPR patterns to be generated
 ;; from the same template.
 (define_mode_iterator GPR [(DI "TARGET_ZARCH") SI])
+(define_mode_iterator DGPR [(TI "TARGET_ZARCH") DI SI])
 (define_mode_iterator DSI [DI SI])
+(define_mode_iterator TDI [TI DI])
 
 ;; These mode iterators allow :P to be used for patterns that operate on
 ;; pointer-sized quantities.  Exactly one of the two alternatives will match.
 (define_mode_iterator P [(DI "TARGET_64BIT") (SI "!TARGET_64BIT")])
 
-;; These macros refer to the actual word_mode of the configuration. This is equal
-;; to Pmode except on 31-bit machines in zarch mode.
+;; These macros refer to the actual word_mode of the configuration.
+;; This is equal to Pmode except on 31-bit machines in zarch mode.
 (define_mode_iterator DW [(TI "TARGET_ZARCH") (DI "!TARGET_ZARCH")])
 (define_mode_iterator W  [(DI "TARGET_ZARCH") (SI "!TARGET_ZARCH")])
 
@@ -379,6 +378,7 @@ 
 ;; same template.
 (define_mode_iterator INT [(DI "TARGET_ZARCH") SI HI QI])
 (define_mode_iterator INTALL [TI DI SI HI QI])
+(define_mode_iterator DINT [(TI "TARGET_ZARCH") DI SI HI QI])
 
 ;; This iterator allows some 'ashift' and 'lshiftrt' pattern to be defined from
 ;; the same template.
@@ -487,6 +487,9 @@ 
 ;; and "cds" in DImode.
 (define_mode_attr tg [(TI "g") (DI "")])
 
+;; In TDI templates, a string like "c<d>sg".
+(define_mode_attr td [(TI "d") (DI "")])
+
 ;; In GPR templates, a string like "c<gf>dbr" will expand to "cgdbr" in DImode
 ;; and "cfdbr" in SImode.
 (define_mode_attr gf [(DI "g") (SI "f")])
@@ -8739,164 +8742,324 @@ 
 ;;
 
 ;
-; memory barrier pattern.
+; memory barrier patterns.
 ;
 
-(define_expand "memory_barrier"
-  [(set (match_dup 0)
-	(unspec:BLK [(match_dup 0)] UNSPEC_MB))]
+(define_expand "mem_signal_fence"
+  [(match_operand:SI 0 "const_int_operand")]		;; model
   ""
 {
-  operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
-  MEM_VOLATILE_P (operands[0]) = 1;
+  /* The s390 memory model is strong enough not to require any
+     barrier in order to synchronize a thread with itself.  */
+  DONE;
+})
+
+(define_expand "mem_thread_fence"
+  [(match_operand:SI 0 "const_int_operand")]		;; model
+  ""
+{
+  /* Unless this is a SEQ_CST fence, the s390 memory model is strong
+     enough not to require barriers of any kind.  */
+  if (INTVAL (operands[0]) == MEMMODEL_SEQ_CST)
+    {
+      rtx mem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+      MEM_VOLATILE_P (mem) = 1;
+      emit_insn (gen_mem_thread_fence_1 (mem));
+    }
+  DONE;
 })
 
-(define_insn "*memory_barrier"
+; Although bcr is superscalar on Z10, this variant will never
+; become part of an execution group.
+(define_insn "mem_thread_fence_1"
   [(set (match_operand:BLK 0 "" "")
 	(unspec:BLK [(match_dup 0)] UNSPEC_MB))]
   ""
   "bcr\t15,0"
   [(set_attr "op_type" "RR")])
 
-; Although bcr is superscalar on Z10, this variant will never become part of
-; an execution group.
+;
+; atomic load/store operations
+;
+
+; Atomic loads need not examine the memory model at all.
+(define_expand "atomic_load<mode>"
+  [(match_operand:DINT 0 "register_operand")	;; output
+   (match_operand:DINT 1 "memory_operand")	;; memory
+   (match_operand:SI 2 "const_int_operand")]	;; model
+  ""
+{
+  if (<MODE>mode == TImode)
+    emit_insn (gen_atomic_loadti_1 (operands[0], operands[1]));
+  else if (<MODE>mode == DImode && !TARGET_ZARCH)
+    emit_insn (gen_atomic_loaddi_1 (operands[0], operands[1]));
+  else
+    emit_move_insn (operands[0], operands[1]);
+  DONE;
+})
+
+; Different from movdi_31 in that we want no splitters.
+(define_insn "atomic_loaddi_1"
+  [(set (match_operand:DI 0 "register_operand" "=d,d,!*f,!*f")
+	(unspec:DI [(match_operand:DI 1 "memory_operand" "Q,S,R,T")]
+		   UNSPEC_MOVA))]
+  "!TARGET_ZARCH"
+  "@
+   lm\t%0,%M0,%S1
+   lmy\t%0,%M0,%S1
+   ld\t%0,%1
+   ldy\t%0,%1"
+  [(set_attr "op_type" "RS,RSY,RS,RSY")
+   (set_attr "type" "lm,lm,floaddf,floaddf")])
+
+(define_insn "atomic_loadti_1"
+  [(set (match_operand:TI 0 "register_operand" "=r")
+	(unspec:TI [(match_operand:TI 1 "memory_operand" "RT")]
+		   UNSPEC_MOVA))]
+  "TARGET_ZARCH"
+  "lpq\t%0,%1"
+  [(set_attr "op_type" "RXY")
+   (set_attr "type" "other")])
+
+; Atomic stores must(?) enforce sequential consistency.
+(define_expand "atomic_store<mode>"
+  [(match_operand:DINT 0 "memory_operand")	;; memory
+   (match_operand:DINT 1 "register_operand")	;; input
+   (match_operand:SI 2 "const_int_operand")]	;; model
+  ""
+{
+  enum memmodel model = (enum memmodel) INTVAL (operands[2]);
+
+  if (<MODE>mode == TImode)
+    emit_insn (gen_atomic_storeti_1 (operands[0], operands[1]));
+  else if (<MODE>mode == DImode && !TARGET_ZARCH)
+    emit_insn (gen_atomic_storedi_1 (operands[0], operands[1]));
+  else
+    emit_move_insn (operands[0], operands[1]);
+  if (model == MEMMODEL_SEQ_CST)
+    emit_insn (gen_mem_thread_fence (operands[2]));
+  DONE;
+})
+
+; Different from movdi_31 in that we want no splitters.
+(define_insn "atomic_storedi_1"
+  [(set (match_operand:DI 0 "memory_operand" "=Q,S,R,T")
+	(unspec:DI [(match_operand:DI 1 "register_operand" "d,d,!*f,!*f")]
+		   UNSPEC_MOVA))]
+  "!TARGET_ZARCH"
+  "@
+   stm\t%1,%N1,%S0
+   stmy\t%1,%N1,%S0
+   std\t%1,%0
+   stdy\t%1,%0
+  [(set_attr "op_type" "RS,RSY,RS,RSY")
+   (set_attr "type" "stm,stm,fstoredf,fstoredf")])
+
+(define_insn "atomic_storeti_1"
+  [(set (match_operand:TI 0 "memory_operand" "=RT")
+	(unspec:TI [(match_operand:TI 1 "register_operand" "r")]
+		   UNSPEC_MOVA))]
+  "TARGET_ZARCH"
+  "stpq\t%1,%0"
+  [(set_attr "op_type" "RXY")
+   (set_attr "type" "other")])
 
 ;
 ; compare and swap patterns.
 ;
 
-(define_expand "sync_compare_and_swap<mode>"
-  [(parallel
-    [(set (match_operand:TDSI 0 "register_operand" "")
-	  (match_operand:TDSI 1 "memory_operand" ""))
-     (set (match_dup 1)
-	  (unspec_volatile:TDSI
-	    [(match_dup 1)
-	     (match_operand:TDSI 2 "register_operand" "")
-	     (match_operand:TDSI 3 "register_operand" "")]
-	    UNSPECV_CAS))
-     (set (reg:CCZ1 CC_REGNUM)
-	  (compare:CCZ1 (match_dup 1) (match_dup 2)))])]
-  "")
+(define_expand "atomic_compare_and_swap<mode>"
+  [(match_operand:SI 0 "register_operand")	;; bool success output
+   (match_operand:DGPR 1 "register_operand")	;; oldval output
+   (match_operand:DGPR 2 "memory_operand")	;; memory
+   (match_operand:DGPR 3 "register_operand")	;; expected input
+   (match_operand:DGPR 4 "register_operand")	;; newval input
+   (match_operand:SI 5 "const_int_operand")	;; is_weak
+   (match_operand:SI 6 "const_int_operand")	;; success model
+   (match_operand:SI 7 "const_int_operand")]	;; failure model
+  ""
+{
+  rtx cc, cmp;
+  emit_insn (gen_atomic_compare_and_swap<mode>_internal
+	     (operands[1], operands[2], operands[3], operands[4]));
+  cc = gen_rtx_REG (CCZ1mode, CC_REGNUM);
+  cmp = gen_rtx_EQ (SImode, cc, const0_rtx);
+  emit_insn (gen_cstorecc4 (operands[0], cmp, cc, const0_rtx));
+  DONE;
+})
 
-(define_expand "sync_compare_and_swap<mode>"
-  [(parallel
-    [(set (match_operand:HQI 0 "register_operand" "")
-	  (match_operand:HQI 1 "memory_operand" ""))
-     (set (match_dup 1)
-	  (unspec_volatile:HQI
-	    [(match_dup 1)
-	     (match_operand:HQI 2 "general_operand" "")
-	     (match_operand:HQI 3 "general_operand" "")]
-	    UNSPECV_CAS))
-     (clobber (reg:CC CC_REGNUM))])]
+(define_expand "atomic_compare_and_swap<mode>"
+  [(match_operand:SI 0 "register_operand")	;; bool success output
+   (match_operand:HQI 1 "register_operand")	;; oldval output
+   (match_operand:HQI 2 "memory_operand")	;; memory
+   (match_operand:HQI 3 "general_operand")	;; expected input
+   (match_operand:HQI 4 "general_operand")	;; newval input
+   (match_operand:SI 5 "const_int_operand")	;; is_weak
+   (match_operand:SI 6 "const_int_operand")	;; success model
+   (match_operand:SI 7 "const_int_operand")]	;; failure model
   ""
-  "s390_expand_cs_hqi (<MODE>mode, operands[0], operands[1],
-		       operands[2], operands[3]); DONE;")
+{
+  s390_expand_cs_hqi (<MODE>mode, operands[0], operands[1], operands[2],
+		      operands[3], operands[4], INTVAL (operands[5]));
+  DONE;
+})
 
-; cds, cdsg
-(define_insn "*sync_compare_and_swap<mode>"
-  [(set (match_operand:DW 0 "register_operand" "=r")
-	(match_operand:DW 1 "memory_operand" "+Q"))
+(define_expand "atomic_compare_and_swap<mode>_internal"
+  [(parallel
+     [(set (match_operand:DGPR 0 "register_operand")
+	   (match_operand:DGPR 1 "memory_operand"))
+      (set (match_dup 1)
+	   (unspec_volatile:DGPR
+	     [(match_dup 1)
+	      (match_operand:DGPR 2 "register_operand")
+	      (match_operand:DGPR 3 "register_operand")]
+	     UNSPECV_CAS))
+      (set (reg:CCZ1 CC_REGNUM)
+	   (compare:CCZ1 (match_dup 1) (match_dup 2)))])]
+  "")
+
+; cdsg, csg
+(define_insn "*atomic_compare_and_swap<mode>_1"
+  [(set (match_operand:TDI 0 "register_operand" "=r")
+	(match_operand:TDI 1 "memory_operand" "+QS"))
    (set (match_dup 1)
-	(unspec_volatile:DW
+	(unspec_volatile:TDI
 	  [(match_dup 1)
-	   (match_operand:DW 2 "register_operand" "0")
-	   (match_operand:DW 3 "register_operand" "r")]
+	   (match_operand:TDI 2 "register_operand" "0")
+	   (match_operand:TDI 3 "register_operand" "r")]
 	  UNSPECV_CAS))
    (set (reg:CCZ1 CC_REGNUM)
 	(compare:CCZ1 (match_dup 1) (match_dup 2)))]
-  ""
-  "cds<tg>\t%0,%3,%S1"
-  [(set_attr "op_type" "RS<TE>")
+  "TARGET_ZARCH"
+  "c<td>sg\t%0,%3,%S1"
+  [(set_attr "op_type" "RSY")
    (set_attr "type"   "sem")])
 
-; cs, csg
-(define_insn "*sync_compare_and_swap<mode>"
-  [(set (match_operand:GPR 0 "register_operand" "=r")
-	(match_operand:GPR 1 "memory_operand" "+Q"))
+; cds, cdsy
+(define_insn "*atomic_compare_and_swapdi_2"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(match_operand:DI 1 "memory_operand" "+Q,S"))
    (set (match_dup 1)
-	(unspec_volatile:GPR
+	(unspec_volatile:DI
+	  [(match_dup 1)
+	   (match_operand:DI 2 "register_operand" "0,0")
+	   (match_operand:DI 3 "register_operand" "r,r")]
+	  UNSPECV_CAS))
+   (set (reg:CCZ1 CC_REGNUM)
+	(compare:CCZ1 (match_dup 1) (match_dup 2)))]
+  "!TARGET_ZARCH"
+  "@
+   cds\t%0,%3,%S1
+   cdsy\t%0,%3,%S1"
+  [(set_attr "op_type" "RS,RSY")
+   (set_attr "type" "sem")])
+
+; cs, csy
+(define_insn "*atomic_compare_and_swapsi_3"
+  [(set (match_operand:SI 0 "register_operand" "=r,r")
+	(match_operand:SI 1 "memory_operand" "+Q,S"))
+   (set (match_dup 1)
+	(unspec_volatile:SI
 	  [(match_dup 1)
-	   (match_operand:GPR 2 "register_operand" "0")
-	   (match_operand:GPR 3 "register_operand" "r")]
+	   (match_operand:SI 2 "register_operand" "0,0")
+	   (match_operand:SI 3 "register_operand" "r,r")]
 	  UNSPECV_CAS))
    (set (reg:CCZ1 CC_REGNUM)
 	(compare:CCZ1 (match_dup 1) (match_dup 2)))]
   ""
-  "cs<g>\t%0,%3,%S1"
-  [(set_attr "op_type" "RS<E>")
+  "@
+   cs\t%0,%3,%S1
+   csy\t%0,%3,%S1"
+  [(set_attr "op_type" "RS,RSY")
    (set_attr "type"   "sem")])
 
-
 ;
 ; Other atomic instruction patterns.
 ;
 
-(define_expand "sync_lock_test_and_set<mode>"
-  [(match_operand:HQI 0 "register_operand")
-   (match_operand:HQI 1 "memory_operand")
-   (match_operand:HQI 2 "general_operand")]
-  ""
-  "s390_expand_atomic (<MODE>mode, SET, operands[0], operands[1],
-		       operands[2], false); DONE;")
-
 ; z196 load and add, xor, or and and instructions
 
-; lan, lang, lao, laog, lax, laxg, laa, laag
-(define_insn "sync_<atomic><mode>"
-  [(parallel
-    [(set (match_operand:GPR 0 "memory_operand" "+QS")
-	  (unspec_volatile:GPR
-	   [(ATOMIC_Z196:GPR (match_dup 0)
-			     (match_operand:GPR 1 "general_operand" "d"))]
-	   UNSPECV_ATOMIC_OP))
-     (clobber (match_scratch:GPR 2 "=d"))
-     (clobber (reg:CC CC_REGNUM))])]
+(define_expand "atomic_fetch_<atomic><mode>"
+  [(match_operand:GPR 0 "register_operand")		;; val out
+   (ATOMIC_Z196:GPR
+     (match_operand:GPR 1 "memory_operand")		;; memory
+     (match_operand:GPR 2 "register_operand"))		;; val in
+   (match_operand:SI 3 "const_int_operand")]		;; model
   "TARGET_Z196"
-  "la<noxa><g>\t%2,%1,%0")
+{
+  emit_insn (gen_atomic_fetch_<atomic><mode>_iaf
+	     (operands[0], operands[1], operands[2]));
+  DONE;
+})
 
 ; lan, lang, lao, laog, lax, laxg, laa, laag
-(define_insn "sync_old_<atomic><mode>"
-  [(parallel
-    [(set (match_operand:GPR 0 "register_operand" "=d")
-	  (match_operand:GPR 1 "memory_operand"   "+QS"))
-     (set (match_dup 1)
-	  (unspec_volatile:GPR
-	   [(ATOMIC_Z196:GPR (match_dup 1)
-			     (match_operand:GPR 2 "general_operand" "d"))]
-	   UNSPECV_ATOMIC_OP))
-     (clobber (reg:CC CC_REGNUM))])]
+(define_insn "atomic_fetch_<atomic><mode>_iaf"
+  [(set (match_operand:GPR 0 "register_operand" "=d")
+	(match_operand:GPR 1 "memory_operand" "+QS"))
+   (set (match_dup 1)
+	(unspec_volatile:GPR
+	 [(ATOMIC_Z196:GPR (match_dup 1)
+			   (match_operand:GPR 2 "general_operand" "d"))]
+	 UNSPECV_ATOMIC_OP))
+   (clobber (reg:CC CC_REGNUM))]
   "TARGET_Z196"
-  "la<noxa><g>\t%0,%2,%1")
+  "la<noxa><g>\t%0,%2,%1"
+  [(set_attr "op_type" "RSY")
+   (set_attr "type" "sem")])
 
+;; For SImode and larger, the optabs.c code will do just fine in
+;; expanding a compare-and-swap loop.  For QI/HImode, we can do
+;; better by expanding our own loop.
 
-(define_expand "sync_<atomic><mode>"
-  [(set (match_operand:HQI 0 "memory_operand")
-	(ATOMIC:HQI (match_dup 0)
-		    (match_operand:HQI 1 "general_operand")))]
+(define_expand "atomic_<atomic><mode>"
+  [(ATOMIC:HQI
+     (match_operand:HQI 0 "memory_operand")		;; memory
+     (match_operand:HQI 1 "general_operand"))		;; val in
+   (match_operand:SI 2 "const_int_operand")]		;; model
   ""
-  "s390_expand_atomic (<MODE>mode, <CODE>, NULL_RTX, operands[0],
-		       operands[1], false); DONE;")
+{
+  s390_expand_atomic (<MODE>mode, <CODE>, NULL_RTX, operands[0],
+		       operands[1], false);
+  DONE;
+})
 
-(define_expand "sync_old_<atomic><mode>"
-  [(set (match_operand:HQI 0 "register_operand")
-	(match_operand:HQI 1 "memory_operand"))
-   (set (match_dup 1)
-	(ATOMIC:HQI (match_dup 1)
-		    (match_operand:HQI 2 "general_operand")))]
+(define_expand "atomic_fetch_<atomic><mode>"
+  [(match_operand:HQI 0 "register_operand")		;; val out
+   (ATOMIC:HQI
+     (match_operand:HQI 1 "memory_operand")		;; memory
+     (match_operand:HQI 2 "general_operand"))		;; val in
+   (match_operand:SI 3 "const_int_operand")]		;; model
   ""
-  "s390_expand_atomic (<MODE>mode, <CODE>, operands[0], operands[1],
-		       operands[2], false); DONE;")
-
-(define_expand "sync_new_<atomic><mode>"
-  [(set (match_operand:HQI 0 "register_operand")
-	(ATOMIC:HQI (match_operand:HQI 1 "memory_operand")
-		    (match_operand:HQI 2 "general_operand")))
-   (set (match_dup 1) (ATOMIC:HQI (match_dup 1) (match_dup 2)))]
+{
+  s390_expand_atomic (<MODE>mode, <CODE>, operands[0], operands[1],
+		      operands[2], false);
+  DONE;
+})
+
+(define_expand "atomic_<atomic>_fetch<mode>"
+  [(match_operand:HQI 0 "register_operand")		;; val out
+   (ATOMIC:HQI
+     (match_operand:HQI 1 "memory_operand")		;; memory
+     (match_operand:HQI 2 "general_operand"))		;; val in
+   (match_operand:SI 3 "const_int_operand")]		;; model
+  ""
+{
+  s390_expand_atomic (<MODE>mode, <CODE>, operands[0], operands[1],
+		      operands[2], true);
+  DONE;
+})
+
+(define_expand "atomic_exchange<mode>"
+  [(match_operand:HQI 0 "register_operand")		;; val out
+   (match_operand:HQI 1 "memory_operand")		;; memory
+   (match_operand:HQI 2 "general_operand")		;; val in
+   (match_operand:SI 3 "const_int_operand")]		;; model
   ""
-  "s390_expand_atomic (<MODE>mode, <CODE>, operands[0], operands[1],
-		       operands[2], true); DONE;")
+{
+  s390_expand_atomic (<MODE>mode, SET, operands[0], operands[1],
+		      operands[2], false);
+  DONE;
+})
 
 ;;
 ;;- Miscellaneous instructions.