diff mbox

[V3,0/2] aarch64: Place target independent and dependent changed code in one file.

Message ID 6a013cf5-a5f2-44df-bdb3-a4985fbae06a@linux.ibm.com
State New
Headers show

Commit Message

Ajit Agarwal Feb. 23, 2024, 11:11 a.m. UTC
Hello Richard/Alex/Segher:

This patch adds the changed code for target independent and
dependent code for load store fusion.

Common infrastructure of load store pair fusion is
divided into target independent and target dependent
changed code.

Target independent code is the Generic code with
pure virtual function to interface betwwen target
independent and dependent code.

Target dependent code is the implementation of pure
virtual function for aarch64 target and the call
to target independent code.

Bootstrapped for aarch64-linux-gnu.

Thanks & Regards
Ajit

aarch64: Place target independent and dependent changed code in one file.

Common infrastructure of load store pair fusion is
divided into target independent and target dependent
changed code.

Target independent code is the Generic code with
pure virtual function to interface betwwen target
independent and dependent code.

Target dependent code is the implementation of pure
virtual function for aarch64 target and the call
to target independent code.

2024-02-23  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-ldp-fusion.cc: Place target
	independent and dependent changed code.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 437 ++++++++++++++++-------
 1 file changed, 305 insertions(+), 132 deletions(-)

Comments

Ajit Agarwal Feb. 27, 2024, 7:03 a.m. UTC | #1
Hello Richard/Alex:

This patch has better diff with changed and unchanged code.
Unchanged code and some of the changed code  will be extracted 
into target independent headers and sources wherein target
deoendent code changed and unchanged code would be in target
dependent file like aarch64-ldp-fusion

Please review.

Thanks & Regards
Ajit

On 23/02/24 4:41 pm, Ajit Agarwal wrote:
> Hello Richard/Alex/Segher:
> 
> This patch adds the changed code for target independent and
> dependent code for load store fusion.
> 
> Common infrastructure of load store pair fusion is
> divided into target independent and target dependent
> changed code.
> 
> Target independent code is the Generic code with
> pure virtual function to interface betwwen target
> independent and dependent code.
> 
> Target dependent code is the implementation of pure
> virtual function for aarch64 target and the call
> to target independent code.
> 
> Bootstrapped for aarch64-linux-gnu.
> 
> Thanks & Regards
> Ajit
> 
> aarch64: Place target independent and dependent changed code in one file.
> 
> Common infrastructure of load store pair fusion is
> divided into target independent and target dependent
> changed code.
> 
> Target independent code is the Generic code with
> pure virtual function to interface betwwen target
> independent and dependent code.
> 
> Target dependent code is the implementation of pure
> virtual function for aarch64 target and the call
> to target independent code.
> 
> 2024-02-23  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-ldp-fusion.cc: Place target
> 	independent and dependent changed code.
> ---
>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 437 ++++++++++++++++-------
>  1 file changed, 305 insertions(+), 132 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> index 22ed95eb743..2ef22ff1e96 100644
> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> @@ -40,10 +40,10 @@
>  
>  using namespace rtl_ssa;
>  
> -static constexpr HOST_WIDE_INT LDP_IMM_BITS = 7;
> -static constexpr HOST_WIDE_INT LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1));
> -static constexpr HOST_WIDE_INT LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1;
> -static constexpr HOST_WIDE_INT LDP_MIN_IMM = -LDP_MAX_IMM - 1;
> +static constexpr HOST_WIDE_INT PAIR_MEM_IMM_BITS = 7;
> +static constexpr HOST_WIDE_INT PAIR_MEM_IMM_SIGN_BIT = (1 << (PAIR_MEM_IMM_BITS - 1));
> +static constexpr HOST_WIDE_INT PAIR_MEM_MAX_IMM = PAIR_MEM_IMM_SIGN_BIT - 1;
> +static constexpr HOST_WIDE_INT PAIR_MEM_MIN_IMM = -PAIR_MEM_MAX_IMM - 1;
>  
>  // We pack these fields (load_p, fpsimd_p, and size) into an integer
>  // (LFS) which we use as part of the key into the main hash tables.
> @@ -138,8 +138,18 @@ struct alt_base
>    poly_int64 offset;
>  };
>  
> +// Virtual base class for load/store walkers used in alias analysis.
> +struct alias_walker
> +{
> +  virtual bool conflict_p (int &budget) const = 0;
> +  virtual insn_info *insn () const = 0;
> +  virtual bool valid () const  = 0;
> +  virtual void advance () = 0;
> +};
> +
> +
>  // State used by the pass for a given basic block.
> -struct ldp_bb_info
> +struct pair_fusion
>  {
>    using def_hash = nofree_ptr_hash<def_info>;
>    using expr_key_t = pair_hash<tree_operand_hash, int_hash<int, -1, -2>>;
> @@ -161,13 +171,13 @@ struct ldp_bb_info
>    static const size_t obstack_alignment = sizeof (void *);
>    bb_info *m_bb;
>  
> -  ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
> +  pair_fusion (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
>    {
>      obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
>  				obstack_alignment, obstack_chunk_alloc,
>  				obstack_chunk_free);
>    }
> -  ~ldp_bb_info ()
> +  ~pair_fusion ()
>    {
>      obstack_free (&m_obstack, nullptr);
>  
> @@ -177,10 +187,50 @@ struct ldp_bb_info
>  	bitmap_obstack_release (&m_bitmap_obstack);
>        }
>    }
> +  void track_access (insn_info *, bool load, rtx mem);
> +  void transform ();
> +  void cleanup_tombstones ();
> +  virtual void set_multiword_subreg (insn_info *i1, insn_info *i2,
> +				     bool load_p) = 0;
> +  virtual rtx gen_load_store_pair (rtx *pats,  rtx writeback,
> +				   bool load_p) = 0;
> +  void merge_pairs (insn_list_t &, insn_list_t &,
> +		    bool load_p, unsigned access_size);
> +  virtual void transform_for_base (int load_size, access_group &group) = 0;
> +
> +  bool try_fuse_pair (bool load_p, unsigned access_size,
> +			     insn_info *i1, insn_info *i2);
> +
> +  bool fuse_pair (bool load_p, unsigned access_size,
> +		  int writeback,
> +		  insn_info *i1, insn_info *i2,
> +		  base_cand &base,
> +		  const insn_range_info &move_range);
> +
> +  void do_alias_analysis (insn_info *alias_hazards[4],
> +			  alias_walker *walkers[4],
> +			  bool load_p);
> +
> +  void track_tombstone (int uid);
> +
> +  bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
> +
> +  virtual bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode,
> +			       bool load_p) = 0;
> +
> +  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
> +  virtual bool pair_trailing_writeback_p () = 0;
> +  virtual bool pair_check_register_operand (bool load_p, rtx reg_op,
> +					    machine_mode mem_mode) = 0;
> +  virtual int pair_mem_alias_check_limit () = 0;
> +  virtual bool pair_is_writeback () = 0 ;
> +  virtual bool pair_mem_ok_policy (rtx first_mem, bool load_p,
> +				   machine_mode mode) = 0;
> +  virtual bool fuseable_store_p (insn_info *i1, insn_info *i2) = 0;
> +  virtual bool fuseable_load_p (insn_info *info) = 0;
>  
> -  inline void track_access (insn_info *, bool load, rtx mem);
> -  inline void transform ();
> -  inline void cleanup_tombstones ();
> +  template<typename Map>
> +    void traverse_base_map (Map &map);
>  
>  private:
>    obstack m_obstack;
> @@ -191,30 +241,157 @@ private:
>    bool m_emitted_tombstone;
>  
>    inline splay_tree_node<access_record *> *node_alloc (access_record *);
> +};
>  
> -  template<typename Map>
> -  inline void traverse_base_map (Map &map);
> -  inline void transform_for_base (int load_size, access_group &group);
> -
> -  inline void merge_pairs (insn_list_t &, insn_list_t &,
> -			   bool load_p, unsigned access_size);
> +struct  aarch64_pair_fusion : public pair_fusion
> +{
> +public:
> +  aarch64_pair_fusion (bb_info *bb) : pair_fusion (bb) {};
> +  bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode, bool load_p)
> +  {
> +    const bool fpsimd_op_p
> +      = reload_completed
> +      ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
> +      : (GET_MODE_CLASS (mem_mode) != MODE_INT
> +	 && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
> +    return fpsimd_op_p;
> +  }
>  
> -  inline bool try_fuse_pair (bool load_p, unsigned access_size,
> -			     insn_info *i1, insn_info *i2);
> +  bool pair_mem_ok_policy (rtx first_mem, bool load_p, machine_mode mode)
> +  {
> +    return aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
> +						     load_p,
> +						     mode);
> +  }
> +  bool pair_operand_mode_ok_p (machine_mode mode);
>  
> -  inline bool fuse_pair (bool load_p, unsigned access_size,
> -			 int writeback,
> -			 insn_info *i1, insn_info *i2,
> -			 base_cand &base,
> -			 const insn_range_info &move_range);
> +  void transform_for_base (int encoded_lfs,
> +			   access_group &group);
> +  rtx gen_load_store_pair (rtx *pats,
> +			   rtx writeback,
> +			   bool load_p)
> +  {
> +    rtx pair_pat;
>  
> -  inline void track_tombstone (int uid);
> +    if (writeback)
> +      {
> +	auto patvec = gen_rtvec (3, writeback, pats[0], pats[1]);
> +	pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
> +      }
> +    else if (load_p)
> +      pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
> +					XEXP (pats[1], 0),
> +					XEXP (pats[0], 1));
> +    else
> +      pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
> +					 XEXP (pats[0], 1),
> +					 XEXP (pats[1], 1));
> +     return pair_pat;
> +  }
>  
> -  inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
> +  void set_multiword_subreg (insn_info *i1, insn_info *i2, bool load_p)
> +  {
> +    if (i1 || i2 || load_p)
> +      return;
> +    return;
> +  }
> +  bool pair_trailing_writeback_p  ()
> +  {
> +    return aarch64_ldp_writeback > 1;
> +  }
> +  bool pair_check_register_operand (bool load_p, rtx reg_op, machine_mode mem_mode)
> +  {
> +    return  (load_p
> +	     ? aarch64_ldp_reg_operand (reg_op, mem_mode)
> +	     : aarch64_stp_reg_operand (reg_op, mem_mode));
> +  }
> +  int pair_mem_alias_check_limit ()
> +  {
> +    return aarch64_ldp_alias_check_limit;
> +  }
> +  bool fuseable_store_p (insn_info *i1, insn_info *i2) { return i1 || i2;}
> +  bool fuseable_load_p (insn_info *insn) { return insn;}
> +  bool pair_is_writeback ()
> +  {
> +    return !aarch64_ldp_writeback;
> +  }
>  };
>  
> +bool
> +store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget);
> +bool load_modified_by_store_p (insn_info *load,
> +			  insn_info *store,
> +			  int &budget);
> +extern insn_info *
> +try_repurpose_store (insn_info *first,
> +		     insn_info *second,
> +		     const insn_range_info &move_range);
> +
> +void reset_debug_use (use_info *use);
> +
> +extern void
> +fixup_debug_uses (obstack_watermark &attempt,
> +		  insn_info *insns[2],
> +		  rtx orig_rtl[2],
> +		  insn_info *pair_dst,
> +		  insn_info *trailing_add,
> +		  bool load_p,
> +		  int writeback,
> +		  rtx writeback_effect,
> +		  unsigned base_regno);
> +
> +void
> +fixup_debug_uses_trailing_add (obstack_watermark &attempt,
> +			       insn_info *pair_dst,
> +			       insn_info *trailing_add,
> +			       rtx writeback_effect);
> +
> +
> +extern void
> +fixup_debug_use (obstack_watermark &attempt,
> +		 use_info *use,
> +		 def_info *def,
> +		 rtx base,
> +		 poly_int64 wb_offset);
> +
> +extern insn_info *
> +find_trailing_add (insn_info *insns[2],
> +		   const insn_range_info &pair_range,
> +		   int initial_writeback,
> +		   rtx *writeback_effect,
> +		   def_info **add_def,
> +		   def_info *base_def,
> +		   poly_int64 initial_offset,
> +		   unsigned access_size);
> +
> +rtx drop_writeback (rtx mem);
> +rtx pair_mem_strip_offset (rtx mem, poly_int64 *offset);
> +bool any_pre_modify_p (rtx x);
> +bool any_post_modify_p (rtx x);
> +int encode_lfs (lfs_fields fields);
> +extern insn_info * latest_hazard_before (insn_info *insn, rtx *ignore,
> +		      insn_info *ignore_insn = nullptr);
> +insn_info * first_hazard_after (insn_info *insn, rtx *ignore);
> +bool ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2);
> +insn_range_info get_def_range (def_info *def);
> +insn_range_info def_downwards_move_range (def_info *def);
> +insn_range_info def_upwards_move_range (def_info *def);
> +rtx gen_tombstone (void);
> +rtx filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr);
> +rtx combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p);
> +rtx extract_writebacks (bool load_p, rtx pats[2], int changed);
> +void do_alias_analysis (insn_info *alias_hazards[4],
> +		   alias_walker *walkers[4],
> +		   bool load_p);
> +int get_viable_bases (insn_info *insns[2],
> +		  vec<base_cand> &base_cands,
> +		  rtx cand_mems[2],
> +		  unsigned access_size,
> +		  bool reversed);
> +void dump_insn_list (FILE *f, const insn_list_t &l);
> +
>  splay_tree_node<access_record *> *
> -ldp_bb_info::node_alloc (access_record *access)
> +pair_fusion::node_alloc (access_record *access)
>  {
>    using T = splay_tree_node<access_record *>;
>    void *addr = obstack_alloc (&m_obstack, sizeof (T));
> @@ -224,7 +401,7 @@ ldp_bb_info::node_alloc (access_record *access)
>  // Given a mem MEM, if the address has side effects, return a MEM that accesses
>  // the same address but without the side effects.  Otherwise, return
>  // MEM unchanged.
> -static rtx
> +rtx
>  drop_writeback (rtx mem)
>  {
>    rtx addr = XEXP (mem, 0);
> @@ -261,8 +438,8 @@ drop_writeback (rtx mem)
>  // Convenience wrapper around strip_offset that can also look through
>  // RTX_AUTOINC addresses.  The interface is like strip_offset except we take a
>  // MEM so that we know the mode of the access.
> -static rtx
> -ldp_strip_offset (rtx mem, poly_int64 *offset)
> +rtx
> +pair_mem_strip_offset (rtx mem, poly_int64 *offset)
>  {
>    rtx addr = XEXP (mem, 0);
>  
> @@ -295,7 +472,7 @@ ldp_strip_offset (rtx mem, poly_int64 *offset)
>  }
>  
>  // Return true if X is a PRE_{INC,DEC,MODIFY} rtx.
> -static bool
> +bool
>  any_pre_modify_p (rtx x)
>  {
>    const auto code = GET_CODE (x);
> @@ -303,7 +480,7 @@ any_pre_modify_p (rtx x)
>  }
>  
>  // Return true if X is a POST_{INC,DEC,MODIFY} rtx.
> -static bool
> +bool
>  any_post_modify_p (rtx x)
>  {
>    const auto code = GET_CODE (x);
> @@ -332,9 +509,15 @@ ldp_operand_mode_ok_p (machine_mode mode)
>    return reload_completed || mode != TImode;
>  }
>  
> +bool
> +aarch64_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
> +{
> +  return (ldp_operand_mode_ok_p (mode));
> +}
> +
>  // Given LFS (load_p, fpsimd_p, size) fields in FIELDS, encode these
>  // into an integer for use as a hash table key.
> -static int
> +int
>  encode_lfs (lfs_fields fields)
>  {
>    int size_log2 = exact_log2 (fields.size);
> @@ -396,7 +579,7 @@ access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
>  // MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
>  // LFS is used as part of the key to the hash table, see track_access.
>  bool
> -ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
> +pair_fusion::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>  {
>    if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
>      return false;
> @@ -412,7 +595,7 @@ ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>    const machine_mode mem_mode = GET_MODE (mem);
>    const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
>  
> -  // Punt on misaligned offsets.  LDP/STP instructions require offsets to be a
> +  // Punt on misaligned offsets.  PAIR MEM  instructions require offsets to be a
>    // multiple of the access size, and we believe that misaligned offsets on
>    // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL bases.
>    if (!multiple_p (offset, mem_size))
> @@ -438,46 +621,38 @@ ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>  }
>  
>  // Main function to begin pair discovery.  Given a memory access INSN,
> -// determine whether it could be a candidate for fusing into an ldp/stp,
> +// determine whether it could be a candidate for fusing into an pair mem,
>  // and if so, track it in the appropriate data structure for this basic
>  // block.  LOAD_P is true if the access is a load, and MEM is the mem
>  // rtx that occurs in INSN.
>  void
> -ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
> +pair_fusion::track_access (insn_info *insn, bool load_p, rtx mem)
>  {
>    // We can't combine volatile MEMs, so punt on these.
>    if (MEM_VOLATILE_P (mem))
>      return;
>  
> -  // Ignore writeback accesses if the param says to do so.
> -  if (!aarch64_ldp_writeback
> +  // Ignore writeback accesses if the param says to do so
> +  if (pair_is_writeback ()
>        && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
>      return;
>  
>    const machine_mode mem_mode = GET_MODE (mem);
> -  if (!ldp_operand_mode_ok_p (mem_mode))
> +
> +  if (!pair_operand_mode_ok_p (mem_mode))
>      return;
>  
>    rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p);
>  
> -  // Ignore the access if the register operand isn't suitable for ldp/stp.
> -  if (load_p
> -      ? !aarch64_ldp_reg_operand (reg_op, mem_mode)
> -      : !aarch64_stp_reg_operand (reg_op, mem_mode))
> +  if (!pair_check_register_operand (load_p, reg_op, mem_mode))
>      return;
> -
>    // We want to segregate FP/SIMD accesses from GPR accesses.
>    //
>    // Before RA, we use the modes, noting that stores of constant zero
>    // operands use GPRs (even in non-integer modes).  After RA, we use
>    // the hard register numbers.
> -  const bool fpsimd_op_p
> -    = reload_completed
> -    ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
> -    : (GET_MODE_CLASS (mem_mode) != MODE_INT
> -       && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
> -
> -  // Note ldp_operand_mode_ok_p already rejected VL modes.
> + const bool fpsimd_op_p = is_fpsimd_op_p (reg_op, mem_mode, load_p);
> +  // Note pair_operand_mode_ok_p already rejected VL modes.
>    const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
>    const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size };
>  
> @@ -487,7 +662,7 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>    poly_int64 mem_off;
>    rtx addr = XEXP (mem, 0);
>    const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
> -  rtx base = ldp_strip_offset (mem, &mem_off);
> +  rtx base = pair_mem_strip_offset (mem, &mem_off);
>    if (!REG_P (base))
>      return;
>  
> @@ -507,7 +682,7 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>    // accesses until after RA.
>    //
>    // As it stands, addresses with offsets in range for LDR but not
> -  // in range for LDP/STP are currently reloaded inefficiently,
> +  // in range for PAIR MEM LOAD STORE  are currently reloaded inefficiently,
>    // ending up with a separate base register for each pair.
>    //
>    // In theory LRA should make use of
> @@ -519,8 +694,8 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>    // that calls targetm.legitimize_address_displacement.
>    //
>    // So for now, it's better to punt when we can't be sure that the
> -  // offset is in range for LDP/STP.  Out-of-range cases can then be
> -  // handled after RA by the out-of-range LDP/STP peepholes.  Eventually, it
> +  // offset is in range for PAIR MEM LOAD STORE.  Out-of-range cases can then be
> +  // handled after RA by the out-of-range PAIR MEM  peepholes.  Eventually, it
>    // would be nice to handle known out-of-range opportunities in the
>    // pass itself (for stack accesses, this would be in the post-RA pass).
>    if (!reload_completed
> @@ -573,7 +748,7 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>  	gcc_unreachable (); // Base defs should be unique.
>      }
>  
> -  // Punt on misaligned offsets.  LDP/STP require offsets to be a multiple of
> +  // Punt on misaligned offsets.  PAIR MEM  require offsets to be a multiple of
>    // the access size.
>    if (!multiple_p (mem_off, mem_size))
>      return;
> @@ -612,9 +787,9 @@ static bool no_ignore (insn_info *) { return false; }
>  //
>  // N.B. we ignore any defs/uses of memory here as we deal with that separately,
>  // making use of alias disambiguation.
> -static insn_info *
> +insn_info *
>  latest_hazard_before (insn_info *insn, rtx *ignore,
> -		      insn_info *ignore_insn = nullptr)
> +		      insn_info *ignore_insn)
>  {
>    insn_info *result = nullptr;
>  
> @@ -698,7 +873,7 @@ latest_hazard_before (insn_info *insn, rtx *ignore,
>  //
>  // N.B. we ignore any defs/uses of memory here as we deal with that separately,
>  // making use of alias disambiguation.
> -static insn_info *
> +insn_info *
>  first_hazard_after (insn_info *insn, rtx *ignore)
>  {
>    insn_info *result = nullptr;
> @@ -787,7 +962,7 @@ first_hazard_after (insn_info *insn, rtx *ignore)
>  }
>  
>  // Return true iff R1 and R2 overlap.
> -static bool
> +bool
>  ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
>  {
>    // If either range is empty, then their intersection is empty.
> @@ -801,7 +976,7 @@ ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
>  }
>  
>  // Get the range of insns that def feeds.
> -static insn_range_info get_def_range (def_info *def)
> +insn_range_info get_def_range (def_info *def)
>  {
>    insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn ();
>    return { def->insn (), last };
> @@ -809,7 +984,7 @@ static insn_range_info get_def_range (def_info *def)
>  
>  // Given a def (of memory), return the downwards range within which we
>  // can safely move this def.
> -static insn_range_info
> +insn_range_info
>  def_downwards_move_range (def_info *def)
>  {
>    auto range = get_def_range (def);
> @@ -827,7 +1002,7 @@ def_downwards_move_range (def_info *def)
>  
>  // Given a def (of memory), return the upwards range within which we can
>  // safely move this def.
> -static insn_range_info
> +insn_range_info
>  def_upwards_move_range (def_info *def)
>  {
>    def_info *prev = def->prev_def ();
> @@ -974,7 +1149,7 @@ private:
>  // Given candidate store insns FIRST and SECOND, see if we can re-purpose one
>  // of them (together with its def of memory) for the stp insn.  If so, return
>  // that insn.  Otherwise, return null.
> -static insn_info *
> +insn_info *
>  try_repurpose_store (insn_info *first,
>  		     insn_info *second,
>  		     const insn_range_info &move_range)
> @@ -1001,7 +1176,7 @@ try_repurpose_store (insn_info *first,
>  //
>  // These are deleted at the end of the pass and uses re-parented appropriately
>  // at this point.
> -static rtx
> +rtx
>  gen_tombstone (void)
>  {
>    return gen_rtx_CLOBBER (VOIDmode,
> @@ -1034,7 +1209,7 @@ aarch64_operand_mode_for_pair_mode (machine_mode mode)
>  // REG_EH_REGION note in the resulting list.  FR_EXPR is used to return any
>  // REG_FRAME_RELATED_EXPR note we find, as these can need special handling in
>  // combine_reg_notes.
> -static rtx
> +rtx
>  filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
>  {
>    for (; note; note = XEXP (note, 1))
> @@ -1084,7 +1259,7 @@ filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
>  
>  // Return the notes that should be attached to a combination of I1 and I2, where
>  // *I1 < *I2.  LOAD_P is true for loads.
> -static rtx
> +rtx
>  combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
>  {
>    // Temporary storage for REG_FRAME_RELATED_EXPR notes.
> @@ -1133,7 +1308,7 @@ combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
>  // relative to the initial value of the base register, and output these
>  // in PATS.  Return an rtx that represents the overall change to the
>  // base register.
> -static rtx
> +rtx
>  extract_writebacks (bool load_p, rtx pats[2], int changed)
>  {
>    rtx base_reg = NULL_RTX;
> @@ -1150,7 +1325,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
>        const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
>  
>        poly_int64 offset;
> -      rtx this_base = ldp_strip_offset (mem, &offset);
> +      rtx this_base = pair_mem_strip_offset (mem, &offset);
>        gcc_assert (REG_P (this_base));
>        if (base_reg)
>  	gcc_assert (rtx_equal_p (base_reg, this_base));
> @@ -1207,7 +1382,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
>  // base register.  If there is one, we choose the first such update after
>  // PAIR_DST that is still in the same BB as our pair.  We return the new def in
>  // *ADD_DEF and the resulting writeback effect in *WRITEBACK_EFFECT.
> -static insn_info *
> +insn_info *
>  find_trailing_add (insn_info *insns[2],
>  		   const insn_range_info &pair_range,
>  		   int initial_writeback,
> @@ -1286,7 +1461,7 @@ find_trailing_add (insn_info *insns[2],
>  
>    off_hwi /= access_size;
>  
> -  if (off_hwi < LDP_MIN_IMM || off_hwi > LDP_MAX_IMM)
> +  if (off_hwi < PAIR_MEM_MIN_IMM || off_hwi > PAIR_MEM_MAX_IMM)
>      return nullptr;
>  
>    auto dump_prefix = [&]()
> @@ -1328,7 +1503,7 @@ find_trailing_add (insn_info *insns[2],
>  // We just emitted a tombstone with uid UID, track it in a bitmap for
>  // this BB so we can easily identify it later when cleaning up tombstones.
>  void
> -ldp_bb_info::track_tombstone (int uid)
> +pair_fusion::track_tombstone (int uid)
>  {
>    if (!m_emitted_tombstone)
>      {
> @@ -1344,7 +1519,7 @@ ldp_bb_info::track_tombstone (int uid)
>  
>  // Reset the debug insn containing USE (the debug insn has been
>  // optimized away).
> -static void
> +void
>  reset_debug_use (use_info *use)
>  {
>    auto use_insn = use->insn ();
> @@ -1360,7 +1535,7 @@ reset_debug_use (use_info *use)
>  // is an update of the register BASE by a constant, given by WB_OFFSET,
>  // and we can preserve debug info by accounting for the change in side
>  // effects.
> -static void
> +void
>  fixup_debug_use (obstack_watermark &attempt,
>  		 use_info *use,
>  		 def_info *def,
> @@ -1463,7 +1638,7 @@ fixup_debug_use (obstack_watermark &attempt,
>  // is a trailing add insn which is being folded into the pair to make it
>  // use writeback addressing, and WRITEBACK_EFFECT is the pattern for
>  // TRAILING_ADD.
> -static void
> +void
>  fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>  			       insn_info *pair_dst,
>  			       insn_info *trailing_add,
> @@ -1500,7 +1675,7 @@ fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>  // writeback, and WRITEBACK_EFFECT is an rtx describing the overall update to
>  // the base register in the final pair (if any).  BASE_REGNO gives the register
>  // number of the base register used in the final pair.
> -static void
> +void
>  fixup_debug_uses (obstack_watermark &attempt,
>  		  insn_info *insns[2],
>  		  rtx orig_rtl[2],
> @@ -1528,7 +1703,7 @@ fixup_debug_uses (obstack_watermark &attempt,
>  	  gcc_checking_assert (GET_RTX_CLASS (GET_CODE (XEXP (mem, 0)))
>  			       == RTX_AUTOINC);
>  
> -	  base = ldp_strip_offset (mem, &offset);
> +	  base = pair_mem_strip_offset (mem, &offset);
>  	  gcc_checking_assert (REG_P (base) && REGNO (base) == base_regno);
>  	}
>        fixup_debug_use (attempt, use, def, base, offset);
> @@ -1664,7 +1839,7 @@ fixup_debug_uses (obstack_watermark &attempt,
>  // BASE gives the chosen base candidate for the pair and MOVE_RANGE is
>  // a singleton range which says where to place the pair.
>  bool
> -ldp_bb_info::fuse_pair (bool load_p,
> +pair_fusion::fuse_pair (bool load_p,
>  			unsigned access_size,
>  			int writeback,
>  			insn_info *i1, insn_info *i2,
> @@ -1684,6 +1859,9 @@ ldp_bb_info::fuse_pair (bool load_p,
>  						   insn_change::DELETE);
>      };
>  
> +  if (*i1 > *i2)
> +    return false;
> +
>    insn_info *first = (*i1 < *i2) ? i1 : i2;
>    insn_info *second = (first == i1) ? i2 : i1;
>  
> @@ -1800,7 +1978,7 @@ ldp_bb_info::fuse_pair (bool load_p,
>      {
>        if (dump_file)
>  	fprintf (dump_file,
> -		 "  ldp: i%d has wb but subsequent i%d has non-wb "
> +		 "  pair_mem: i%d has wb but subsequent i%d has non-wb "
>  		 "update of base (r%d), dropping wb\n",
>  		 insns[0]->uid (), insns[1]->uid (), base_regno);
>        gcc_assert (writeback_effect);
> @@ -1823,7 +2001,7 @@ ldp_bb_info::fuse_pair (bool load_p,
>      }
>  
>    // If either of the original insns had writeback, but the resulting pair insn
> -  // does not (can happen e.g. in the ldp edge case above, or if the writeback
> +  // does not (can happen e.g. in the pair mem  edge case above, or if the writeback
>    // effects cancel out), then drop the def(s) of the base register as
>    // appropriate.
>    //
> @@ -1842,7 +2020,7 @@ ldp_bb_info::fuse_pair (bool load_p,
>    // update of the base register and try and fold it in to make this into a
>    // writeback pair.
>    insn_info *trailing_add = nullptr;
> -  if (aarch64_ldp_writeback > 1
> +  if (pair_trailing_writeback_p ()
>        && !writeback_effect
>        && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1,
>  					 XEXP (pats[0], 0), nullptr)
> @@ -1863,14 +2041,14 @@ ldp_bb_info::fuse_pair (bool load_p,
>      }
>  
>    // Now that we know what base mem we're going to use, check if it's OK
> -  // with the ldp/stp policy.
> +  // with the pair mem  policy.
>    rtx first_mem = XEXP (pats[0], load_p);
> -  if (!aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
> -						load_p,
> -						GET_MODE (first_mem)))
> +  if (!pair_mem_ok_policy (first_mem,
> +			  load_p,
> +			  GET_MODE (first_mem)))
>      {
>        if (dump_file)
> -	fprintf (dump_file, "punting on pair (%d,%d), ldp/stp policy says no\n",
> +	fprintf (dump_file, "punting on pair (%d,%d), pair mem  policy says no\n",
>  		 i1->uid (), i2->uid ());
>        return false;
>      }
> @@ -1878,20 +2056,12 @@ ldp_bb_info::fuse_pair (bool load_p,
>    rtx reg_notes = combine_reg_notes (first, second, load_p);
>  
>    rtx pair_pat;
> -  if (writeback_effect)
> -    {
> -      auto patvec = gen_rtvec (3, writeback_effect, pats[0], pats[1]);
> -      pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
> -    }
> -  else if (load_p)
> -    pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
> -				      XEXP (pats[1], 0),
> -				      XEXP (pats[0], 1));
> -  else
> -    pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
> -				       XEXP (pats[0], 1),
> -				       XEXP (pats[1], 1));
>  
> +  set_multiword_subreg (first, second, load_p);
> +
> +  pair_pat = gen_load_store_pair (pats, writeback_effect, load_p);
> +  if (pair_pat == NULL_RTX)
> +    return false;
>    insn_change *pair_change = nullptr;
>    auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
>      rtx_insn *rti = change->insn ()->rtl ();
> @@ -2075,7 +2245,7 @@ ldp_bb_info::fuse_pair (bool load_p,
>  
>  // Return true if STORE_INSN may modify mem rtx MEM.  Make sure we keep
>  // within our BUDGET for alias analysis.
> -static bool
> +bool
>  store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
>  {
>    if (!budget)
> @@ -2098,7 +2268,7 @@ store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
>  
>  // Return true if LOAD may be modified by STORE.  Make sure we keep
>  // within our BUDGET for alias analysis.
> -static bool
> +bool
>  load_modified_by_store_p (insn_info *load,
>  			  insn_info *store,
>  			  int &budget)
> @@ -2133,15 +2303,6 @@ load_modified_by_store_p (insn_info *load,
>    return false;
>  }
>  
> -// Virtual base class for load/store walkers used in alias analysis.
> -struct alias_walker
> -{
> -  virtual bool conflict_p (int &budget) const = 0;
> -  virtual insn_info *insn () const = 0;
> -  virtual bool valid () const  = 0;
> -  virtual void advance () = 0;
> -};
> -
>  // Implement some common functionality used by both store_walker
>  // and load_walker.
>  template<bool reverse>
> @@ -2259,13 +2420,13 @@ public:
>  //
>  // We try to maintain the invariant that if a walker becomes invalid, we
>  // set its pointer to null.
> -static void
> -do_alias_analysis (insn_info *alias_hazards[4],
> +void
> +pair_fusion::do_alias_analysis (insn_info *alias_hazards[4],
>  		   alias_walker *walkers[4],
>  		   bool load_p)
>  {
>    const int n_walkers = 2 + (2 * !load_p);
> -  int budget = aarch64_ldp_alias_check_limit;
> +  int budget = pair_mem_alias_check_limit();
>  
>    auto next_walker = [walkers,n_walkers](int current) -> int {
>      for (int j = 1; j <= n_walkers; j++)
> @@ -2350,7 +2511,7 @@ do_alias_analysis (insn_info *alias_hazards[4],
>  //
>  // Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback
>  // addressing.
> -static int
> +int
>  get_viable_bases (insn_info *insns[2],
>  		  vec<base_cand> &base_cands,
>  		  rtx cand_mems[2],
> @@ -2365,7 +2526,7 @@ get_viable_bases (insn_info *insns[2],
>      {
>        const bool is_lower = (i == reversed);
>        poly_int64 poly_off;
> -      rtx base = ldp_strip_offset (cand_mems[i], &poly_off);
> +      rtx base = pair_mem_strip_offset (cand_mems[i], &poly_off);
>        if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC)
>  	writeback |= (1 << i);
>  
> @@ -2373,7 +2534,7 @@ get_viable_bases (insn_info *insns[2],
>  	continue;
>  
>        // Punt on accesses relative to eliminable regs.  See the comment in
> -      // ldp_bb_info::track_access for a detailed explanation of this.
> +      // pair_fusion::track_access for a detailed explanation of this.
>        if (!reload_completed
>  	  && (REGNO (base) == FRAME_POINTER_REGNUM
>  	      || REGNO (base) == ARG_POINTER_REGNUM))
> @@ -2397,7 +2558,7 @@ get_viable_bases (insn_info *insns[2],
>        if (!is_lower)
>  	base_off--;
>  
> -      if (base_off < LDP_MIN_IMM || base_off > LDP_MAX_IMM)
> +      if (base_off < PAIR_MEM_MIN_IMM || base_off > PAIR_MEM_MAX_IMM)
>  	continue;
>  
>        use_info *use = find_access (insns[i]->uses (), REGNO (base));
> @@ -2454,12 +2615,12 @@ get_viable_bases (insn_info *insns[2],
>  }
>  
>  // Given two adjacent memory accesses of the same size, I1 and I2, try
> -// and see if we can merge them into a ldp or stp.
> +// and see if we can merge them into a pair mem load and store.
>  //
>  // ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true
>  // if the accesses are both loads, otherwise they are both stores.
>  bool
> -ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
> +pair_fusion::try_fuse_pair (bool load_p, unsigned access_size,
>  			    insn_info *i1, insn_info *i2)
>  {
>    if (dump_file)
> @@ -2490,11 +2651,21 @@ ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
>        reg_ops[i] = XEXP (pats[i], !load_p);
>      }
>  
> +  if (!load_p && !fuseable_store_p (i1, i2))
> +    {
> +      if (dump_file)
> +	fprintf (dump_file,
> +		 "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n",
> +		 insns[0]->uid (), insns[1]->uid ());
> +      return false;
> +    }
> +
> +
>    if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
>      {
>        if (dump_file)
>  	fprintf (dump_file,
> -		 "punting on ldp due to reg conflcits (%d,%d)\n",
> +		 "punting on pair mem load  due to reg conflcits (%d,%d)\n",
>  		 insns[0]->uid (), insns[1]->uid ());
>        return false;
>      }
> @@ -2787,7 +2958,7 @@ ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
>  		    i1, i2, *base, range);
>  }
>  
> -static void
> +void
>  dump_insn_list (FILE *f, const insn_list_t &l)
>  {
>    fprintf (f, "(");
> @@ -2843,7 +3014,7 @@ debug (const insn_list_t &l)
>  // we can't re-order them anyway, so provided earlier passes have cleaned up
>  // redundant loads, we shouldn't miss opportunities by doing this.
>  void
> -ldp_bb_info::merge_pairs (insn_list_t &left_list,
> +pair_fusion::merge_pairs (insn_list_t &left_list,
>  			  insn_list_t &right_list,
>  			  bool load_p,
>  			  unsigned access_size)
> @@ -2890,8 +3061,8 @@ ldp_bb_info::merge_pairs (insn_list_t &left_list,
>  // of accesses.  If we find two sets of adjacent accesses, call
>  // merge_pairs.
>  void
> -ldp_bb_info::transform_for_base (int encoded_lfs,
> -				 access_group &group)
> +aarch64_pair_fusion::transform_for_base (int encoded_lfs,
> +					 access_group &group)
>  {
>    const auto lfs = decode_lfs (encoded_lfs);
>    const unsigned access_size = lfs.size;
> @@ -2919,7 +3090,7 @@ ldp_bb_info::transform_for_base (int encoded_lfs,
>  // and remove all the tombstone insns, being sure to reparent any uses
>  // of mem to previous defs when we do this.
>  void
> -ldp_bb_info::cleanup_tombstones ()
> +pair_fusion::cleanup_tombstones ()
>  {
>    // No need to do anything if we didn't emit a tombstone insn for this BB.
>    if (!m_emitted_tombstone)
> @@ -2947,7 +3118,7 @@ ldp_bb_info::cleanup_tombstones ()
>  
>  template<typename Map>
>  void
> -ldp_bb_info::traverse_base_map (Map &map)
> +pair_fusion::traverse_base_map (Map &map)
>  {
>    for (auto kv : map)
>      {
> @@ -2958,7 +3129,7 @@ ldp_bb_info::traverse_base_map (Map &map)
>  }
>  
>  void
> -ldp_bb_info::transform ()
> +pair_fusion::transform ()
>  {
>    traverse_base_map (expr_map);
>    traverse_base_map (def_map);
> @@ -3174,7 +3345,9 @@ void ldp_fusion_bb (bb_info *bb)
>    const bool track_stores
>      = aarch64_tune_params.stp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
>  
> -  ldp_bb_info bb_state (bb);
> +  pair_fusion *bb_state;
> +  aarch64_pair_fusion derived (bb);
> +  bb_state = &derived;
>  
>    for (auto insn : bb->nondebug_insns ())
>      {
> @@ -3194,13 +3367,13 @@ void ldp_fusion_bb (bb_info *bb)
>  	continue;
>  
>        if (track_stores && MEM_P (XEXP (pat, 0)))
> -	bb_state.track_access (insn, false, XEXP (pat, 0));
> +	bb_state->track_access (insn, false, XEXP (pat, 0));
>        else if (track_loads && MEM_P (XEXP (pat, 1)))
> -	bb_state.track_access (insn, true, XEXP (pat, 1));
> +	bb_state->track_access (insn, true, XEXP (pat, 1));
>      }
>  
> -  bb_state.transform ();
> -  bb_state.cleanup_tombstones ();
> +  bb_state->transform ();
> +  bb_state->cleanup_tombstones ();
>  }
>  
>  void ldp_fusion ()
Alex Coplan April 3, 2024, 3:21 p.m. UTC | #2
On 23/02/2024 16:41, Ajit Agarwal wrote:
> Hello Richard/Alex/Segher:

Hi Ajit,

Sorry for the delay and thanks for working on this.

Generally this looks like the right sort of approach (IMO) but I've left
some comments below.

I'll start with a meta comment: in the subject line you have marked this
as 0/2, but usually 0/n is reserved for the cover letter of a patch
series and wouldn't contain an actual patch.  I think this might have
confused the Linaro CI suitably such that it didn't run regression tests
on the patch.

> 
> This patch adds the changed code for target independent and
> dependent code for load store fusion.
> 
> Common infrastructure of load store pair fusion is
> divided into target independent and target dependent
> changed code.
> 
> Target independent code is the Generic code with
> pure virtual function to interface betwwen target
> independent and dependent code.
> 
> Target dependent code is the implementation of pure
> virtual function for aarch64 target and the call
> to target independent code.
> 
> Bootstrapped for aarch64-linux-gnu.
> 
> Thanks & Regards
> Ajit
> 
> aarch64: Place target independent and dependent changed code in one file.
> 
> Common infrastructure of load store pair fusion is
> divided into target independent and target dependent
> changed code.
> 
> Target independent code is the Generic code with
> pure virtual function to interface betwwen target
> independent and dependent code.
> 
> Target dependent code is the implementation of pure
> virtual function for aarch64 target and the call
> to target independent code.
> 
> 2024-02-23  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-ldp-fusion.cc: Place target
> 	independent and dependent changed code.
> ---
>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 437 ++++++++++++++++-------
>  1 file changed, 305 insertions(+), 132 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> index 22ed95eb743..2ef22ff1e96 100644
> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> @@ -40,10 +40,10 @@
>  
>  using namespace rtl_ssa;
>  
> -static constexpr HOST_WIDE_INT LDP_IMM_BITS = 7;
> -static constexpr HOST_WIDE_INT LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1));
> -static constexpr HOST_WIDE_INT LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1;
> -static constexpr HOST_WIDE_INT LDP_MIN_IMM = -LDP_MAX_IMM - 1;
> +static constexpr HOST_WIDE_INT PAIR_MEM_IMM_BITS = 7;
> +static constexpr HOST_WIDE_INT PAIR_MEM_IMM_SIGN_BIT = (1 << (PAIR_MEM_IMM_BITS - 1));
> +static constexpr HOST_WIDE_INT PAIR_MEM_MAX_IMM = PAIR_MEM_IMM_SIGN_BIT - 1;
> +static constexpr HOST_WIDE_INT PAIR_MEM_MIN_IMM = -PAIR_MEM_MAX_IMM - 1;

These constants shouldn't be renamed: they are specific to aarch64 so the
original names should be preserved in this file.

I expect we want to introduce virtual functions to validate an offset
for a paired access.  The aarch64 code could then implement it by
comparing the offset against LDP_{MIN,MAX}_IMM, and the generic code
could use that hook to replace the code that queries those constants
directly (i.e. in find_trailing_add and get_viable_bases).

>  
>  // We pack these fields (load_p, fpsimd_p, and size) into an integer
>  // (LFS) which we use as part of the key into the main hash tables.
> @@ -138,8 +138,18 @@ struct alt_base
>    poly_int64 offset;
>  };
>  
> +// Virtual base class for load/store walkers used in alias analysis.
> +struct alias_walker
> +{
> +  virtual bool conflict_p (int &budget) const = 0;
> +  virtual insn_info *insn () const = 0;
> +  virtual bool valid () const  = 0;
> +  virtual void advance () = 0;
> +};
> +
> +
>  // State used by the pass for a given basic block.
> -struct ldp_bb_info
> +struct pair_fusion

As a comment on the high-level design, I think we want a generic class
for the overall pass, not just for the BB-specific structure.

That is because naturally we want the ldp_fusion_bb function itself to
be a member of such a class, so that it can access virtual functions to
query the target e.g. about the load/store pair policy, and whether to
try and promote writeback pairs.

If we keep all of the virtual functions in such an outer class, then we
can keep the ldp_fusion_bb class generic (not needing an override for
each target) and that inner class can perhaps be given a pointer or
reference to the outer class when it is instantiated in ldp_fusion_bb.

>  {
>    using def_hash = nofree_ptr_hash<def_info>;
>    using expr_key_t = pair_hash<tree_operand_hash, int_hash<int, -1, -2>>;
> @@ -161,13 +171,13 @@ struct ldp_bb_info
>    static const size_t obstack_alignment = sizeof (void *);
>    bb_info *m_bb;
>  
> -  ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
> +  pair_fusion (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
>    {
>      obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
>  				obstack_alignment, obstack_chunk_alloc,
>  				obstack_chunk_free);
>    }
> -  ~ldp_bb_info ()
> +  ~pair_fusion ()
>    {
>      obstack_free (&m_obstack, nullptr);
>  
> @@ -177,10 +187,50 @@ struct ldp_bb_info
>  	bitmap_obstack_release (&m_bitmap_obstack);
>        }
>    }
> +  void track_access (insn_info *, bool load, rtx mem);
> +  void transform ();
> +  void cleanup_tombstones ();
> +  virtual void set_multiword_subreg (insn_info *i1, insn_info *i2,
> +				     bool load_p) = 0;

These virtual functions could do with a comment to describe what their
purpose is.  What is set_multiword_subreg supposed to do?  I guess it is
something specifically needed for rs6000 as the aarch64 implementation
in this patch isn't particularly illuminating.

In general for hooks that are needed for rs6000 but aren't for aarch64 I
think it would be better to defer adding them to a later patch.  This
patch should just focus on making the aarch64/generic split.

> +  virtual rtx gen_load_store_pair (rtx *pats,  rtx writeback,
> +				   bool load_p) = 0;

Nit: naming, how about just "gen_mem_pair" for this?

> +  void merge_pairs (insn_list_t &, insn_list_t &,
> +		    bool load_p, unsigned access_size);
> +  virtual void transform_for_base (int load_size, access_group &group) = 0;

I don't think transform_for_base wants to be a pure virtual function,
can't we just share the same implementation for all targets?

> +
> +  bool try_fuse_pair (bool load_p, unsigned access_size,
> +			     insn_info *i1, insn_info *i2);
> +
> +  bool fuse_pair (bool load_p, unsigned access_size,
> +		  int writeback,
> +		  insn_info *i1, insn_info *i2,
> +		  base_cand &base,
> +		  const insn_range_info &move_range);
> +
> +  void do_alias_analysis (insn_info *alias_hazards[4],
> +			  alias_walker *walkers[4],
> +			  bool load_p);
> +
> +  void track_tombstone (int uid);
> +
> +  bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
> +
> +  virtual bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode,
> +			       bool load_p) = 0;

Minor nit: the "is_" part of the name here is redundant.

> +
> +  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
> +  virtual bool pair_trailing_writeback_p () = 0;
> +  virtual bool pair_check_register_operand (bool load_p, rtx reg_op,
> +					    machine_mode mem_mode) = 0;

How about pair_reg_operand_ok_p for consistency with
pair_operand_mode_ok_p?

> +  virtual int pair_mem_alias_check_limit () = 0;
> +  virtual bool pair_is_writeback () = 0 ;

Looking at the implementation below, I don't think this is a good name.
With this name it seems like the function would be checking if a given
pair insn is a writeback access, but this function is really just asking
the target if we should handle any writeback opportunities.

How about something like handle_writeback_opportunities () instead?

> +  virtual bool pair_mem_ok_policy (rtx first_mem, bool load_p,
> +				   machine_mode mode) = 0;

Nit: a name like pair_mem_ok_with_policy might be better (to closer
match the underlying aarch64 name).

> +  virtual bool fuseable_store_p (insn_info *i1, insn_info *i2) = 0;
> +  virtual bool fuseable_load_p (insn_info *info) = 0;

Similar comment as on set_multiword_subreg: I think it would be better
to defer adding these hooks (that are only needed for rs6000) to a later
patch.

>  
> -  inline void track_access (insn_info *, bool load, rtx mem);
> -  inline void transform ();
> -  inline void cleanup_tombstones ();
> +  template<typename Map>
> +    void traverse_base_map (Map &map);
>  
>  private:
>    obstack m_obstack;
> @@ -191,30 +241,157 @@ private:
>    bool m_emitted_tombstone;
>  
>    inline splay_tree_node<access_record *> *node_alloc (access_record *);
> +};
>  
> -  template<typename Map>
> -  inline void traverse_base_map (Map &map);
> -  inline void transform_for_base (int load_size, access_group &group);
> -
> -  inline void merge_pairs (insn_list_t &, insn_list_t &,
> -			   bool load_p, unsigned access_size);
> +struct  aarch64_pair_fusion : public pair_fusion

Nit: excess whitespace (double space after struct).

> +{
> +public:
> +  aarch64_pair_fusion (bb_info *bb) : pair_fusion (bb) {};
> +  bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode, bool load_p)

Nit: the virtual function implementations in this class should probably
be marked override (and arguably final too, although that seems less
important).

> +  {
> +    const bool fpsimd_op_p
> +      = reload_completed
> +      ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
> +      : (GET_MODE_CLASS (mem_mode) != MODE_INT
> +	 && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
> +    return fpsimd_op_p;
> +  }
>  
> -  inline bool try_fuse_pair (bool load_p, unsigned access_size,
> -			     insn_info *i1, insn_info *i2);
> +  bool pair_mem_ok_policy (rtx first_mem, bool load_p, machine_mode mode)
> +  {
> +    return aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
> +						     load_p,
> +						     mode);
> +  }
> +  bool pair_operand_mode_ok_p (machine_mode mode);
>  
> -  inline bool fuse_pair (bool load_p, unsigned access_size,
> -			 int writeback,
> -			 insn_info *i1, insn_info *i2,
> -			 base_cand &base,
> -			 const insn_range_info &move_range);
> +  void transform_for_base (int encoded_lfs,
> +			   access_group &group);
> +  rtx gen_load_store_pair (rtx *pats,
> +			   rtx writeback,
> +			   bool load_p)
> +  {
> +    rtx pair_pat;
>  
> -  inline void track_tombstone (int uid);
> +    if (writeback)
> +      {
> +	auto patvec = gen_rtvec (3, writeback, pats[0], pats[1]);
> +	pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
> +      }
> +    else if (load_p)
> +      pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
> +					XEXP (pats[1], 0),
> +					XEXP (pats[0], 1));
> +    else
> +      pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
> +					 XEXP (pats[0], 1),
> +					 XEXP (pats[1], 1));
> +     return pair_pat;
> +  }

Minor nit: for readability, I'm not sure we want member functions
implemented inline like this if the function is more than a few lines
long.  But I don't feel strongly on this and would defer to a
maintainer.

>  
> -  inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
> +  void set_multiword_subreg (insn_info *i1, insn_info *i2, bool load_p)
> +  {
> +    if (i1 || i2 || load_p)
> +      return;
> +    return;
> +  }

It looks like this could just have an empty function body and the code
you've included here is dead/pointless.

Nit: I think we want whitespace between the inline member
implementations here.

> +  bool pair_trailing_writeback_p  ()
> +  {
> +    return aarch64_ldp_writeback > 1;
> +  }
> +  bool pair_check_register_operand (bool load_p, rtx reg_op, machine_mode mem_mode)
> +  {
> +    return  (load_p
> +	     ? aarch64_ldp_reg_operand (reg_op, mem_mode)
> +	     : aarch64_stp_reg_operand (reg_op, mem_mode));
> +  }
> +  int pair_mem_alias_check_limit ()
> +  {
> +    return aarch64_ldp_alias_check_limit;
> +  }

This seems reasonable for the time being, but I wonder if eventually we
want to move to a single generic param shared by all targets using this
pass (more of an open question than a request for change).

> +  bool fuseable_store_p (insn_info *i1, insn_info *i2) { return i1 || i2;}
> +  bool fuseable_load_p (insn_info *insn) { return insn;}

It looks like both of these could just return true for aarch64.
Could you explain in which scenarios you expect to reject pairs with
these hooks?  As mentioned above, comments on the virtual function
declarations would be good.

Naively it seems that both of these hooks should be given both insns.
Why should fuseable_load_p only care about one candidate insn?  If both
take both insns then they can be collapsed to a single hook which takes
a load_p parameter to disambiguate the load/store cases.

It also looks like the fuseable_load_p function is currently unused.

> +  bool pair_is_writeback ()
> +  {
> +    return !aarch64_ldp_writeback;

I think you want to drop the negation here, we want to return true iff
we should handle writeback opportunities, no?  You'll then need to drop
the negation in callers, too.

> +  }
>  };
>  
> +bool
> +store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget);
> +bool load_modified_by_store_p (insn_info *load,
> +			  insn_info *store,
> +			  int &budget);
> +extern insn_info *
> +try_repurpose_store (insn_info *first,
> +		     insn_info *second,
> +		     const insn_range_info &move_range);
> +
> +void reset_debug_use (use_info *use);
> +
> +extern void
> +fixup_debug_uses (obstack_watermark &attempt,
> +		  insn_info *insns[2],
> +		  rtx orig_rtl[2],
> +		  insn_info *pair_dst,
> +		  insn_info *trailing_add,
> +		  bool load_p,
> +		  int writeback,
> +		  rtx writeback_effect,
> +		  unsigned base_regno);
> +
> +void
> +fixup_debug_uses_trailing_add (obstack_watermark &attempt,
> +			       insn_info *pair_dst,
> +			       insn_info *trailing_add,
> +			       rtx writeback_effect);
> +
> +
> +extern void
> +fixup_debug_use (obstack_watermark &attempt,
> +		 use_info *use,
> +		 def_info *def,
> +		 rtx base,
> +		 poly_int64 wb_offset);
> +
> +extern insn_info *
> +find_trailing_add (insn_info *insns[2],
> +		   const insn_range_info &pair_range,
> +		   int initial_writeback,
> +		   rtx *writeback_effect,
> +		   def_info **add_def,
> +		   def_info *base_def,
> +		   poly_int64 initial_offset,
> +		   unsigned access_size);
> +
> +rtx drop_writeback (rtx mem);
> +rtx pair_mem_strip_offset (rtx mem, poly_int64 *offset);
> +bool any_pre_modify_p (rtx x);
> +bool any_post_modify_p (rtx x);
> +int encode_lfs (lfs_fields fields);
> +extern insn_info * latest_hazard_before (insn_info *insn, rtx *ignore,
> +		      insn_info *ignore_insn = nullptr);
> +insn_info * first_hazard_after (insn_info *insn, rtx *ignore);
> +bool ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2);
> +insn_range_info get_def_range (def_info *def);
> +insn_range_info def_downwards_move_range (def_info *def);
> +insn_range_info def_upwards_move_range (def_info *def);
> +rtx gen_tombstone (void);
> +rtx filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr);
> +rtx combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p);
> +rtx extract_writebacks (bool load_p, rtx pats[2], int changed);
> +void do_alias_analysis (insn_info *alias_hazards[4],
> +		   alias_walker *walkers[4],
> +		   bool load_p);
> +int get_viable_bases (insn_info *insns[2],
> +		  vec<base_cand> &base_cands,
> +		  rtx cand_mems[2],
> +		  unsigned access_size,
> +		  bool reversed);
> +void dump_insn_list (FILE *f, const insn_list_t &l);

Is there really a need to forward-declare all of these?

> +
>  splay_tree_node<access_record *> *
> -ldp_bb_info::node_alloc (access_record *access)
> +pair_fusion::node_alloc (access_record *access)
>  {
>    using T = splay_tree_node<access_record *>;
>    void *addr = obstack_alloc (&m_obstack, sizeof (T));
> @@ -224,7 +401,7 @@ ldp_bb_info::node_alloc (access_record *access)
>  // Given a mem MEM, if the address has side effects, return a MEM that accesses
>  // the same address but without the side effects.  Otherwise, return
>  // MEM unchanged.
> -static rtx
> +rtx

Is there a reason to change the linkage here?  Presumably when this (and
many other such functions) get moved to a generic file then they will
only be called from within that file, so they could stay having static
linkage.

That should remove a fair bit of noise from this patch.

>  drop_writeback (rtx mem)
>  {
>    rtx addr = XEXP (mem, 0);
> @@ -261,8 +438,8 @@ drop_writeback (rtx mem)
>  // Convenience wrapper around strip_offset that can also look through
>  // RTX_AUTOINC addresses.  The interface is like strip_offset except we take a
>  // MEM so that we know the mode of the access.
> -static rtx
> -ldp_strip_offset (rtx mem, poly_int64 *offset)
> +rtx
> +pair_mem_strip_offset (rtx mem, poly_int64 *offset)
>  {
>    rtx addr = XEXP (mem, 0);
>  
> @@ -295,7 +472,7 @@ ldp_strip_offset (rtx mem, poly_int64 *offset)
>  }
>  
>  // Return true if X is a PRE_{INC,DEC,MODIFY} rtx.
> -static bool
> +bool

As above: presumably we don't really need to change the linkage here?

>  any_pre_modify_p (rtx x)
>  {
>    const auto code = GET_CODE (x);
> @@ -303,7 +480,7 @@ any_pre_modify_p (rtx x)
>  }
>  
>  // Return true if X is a POST_{INC,DEC,MODIFY} rtx.
> -static bool
> +bool

Likewise.

>  any_post_modify_p (rtx x)
>  {
>    const auto code = GET_CODE (x);
> @@ -332,9 +509,15 @@ ldp_operand_mode_ok_p (machine_mode mode)
>    return reload_completed || mode != TImode;
>  }
>  
> +bool
> +aarch64_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
> +{
> +  return (ldp_operand_mode_ok_p (mode));

Nit: redundant extra parens around the function call here.

> +}
> +
>  // Given LFS (load_p, fpsimd_p, size) fields in FIELDS, encode these
>  // into an integer for use as a hash table key.
> -static int
> +int
>  encode_lfs (lfs_fields fields)
>  {
>    int size_log2 = exact_log2 (fields.size);
> @@ -396,7 +579,7 @@ access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
>  // MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
>  // LFS is used as part of the key to the hash table, see track_access.
>  bool
> -ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
> +pair_fusion::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>  {
>    if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
>      return false;
> @@ -412,7 +595,7 @@ ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>    const machine_mode mem_mode = GET_MODE (mem);
>    const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
>  
> -  // Punt on misaligned offsets.  LDP/STP instructions require offsets to be a
> +  // Punt on misaligned offsets.  PAIR MEM  instructions require offsets to be a

Nit: how about "Paired memory accesses" as the generic term (assuming
this is true for the rs6000 insns too).

>    // multiple of the access size, and we believe that misaligned offsets on
>    // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL bases.
>    if (!multiple_p (offset, mem_size))
> @@ -438,46 +621,38 @@ ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>  }
>  
>  // Main function to begin pair discovery.  Given a memory access INSN,
> -// determine whether it could be a candidate for fusing into an ldp/stp,
> +// determine whether it could be a candidate for fusing into an pair mem,
>  // and if so, track it in the appropriate data structure for this basic
>  // block.  LOAD_P is true if the access is a load, and MEM is the mem
>  // rtx that occurs in INSN.
>  void
> -ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
> +pair_fusion::track_access (insn_info *insn, bool load_p, rtx mem)
>  {
>    // We can't combine volatile MEMs, so punt on these.
>    if (MEM_VOLATILE_P (mem))
>      return;
>  
> -  // Ignore writeback accesses if the param says to do so.
> -  if (!aarch64_ldp_writeback
> +  // Ignore writeback accesses if the param says to do so
> +  if (pair_is_writeback ()

See my comment on the implementation of pair_is_writeback.

>        && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
>      return;
>  
>    const machine_mode mem_mode = GET_MODE (mem);
> -  if (!ldp_operand_mode_ok_p (mem_mode))
> +
> +  if (!pair_operand_mode_ok_p (mem_mode))
>      return;
>  
>    rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p);
>  
> -  // Ignore the access if the register operand isn't suitable for ldp/stp.
> -  if (load_p
> -      ? !aarch64_ldp_reg_operand (reg_op, mem_mode)
> -      : !aarch64_stp_reg_operand (reg_op, mem_mode))
> +  if (!pair_check_register_operand (load_p, reg_op, mem_mode))
>      return;
> -
>    // We want to segregate FP/SIMD accesses from GPR accesses.
>    //
>    // Before RA, we use the modes, noting that stores of constant zero
>    // operands use GPRs (even in non-integer modes).  After RA, we use
>    // the hard register numbers.
> -  const bool fpsimd_op_p
> -    = reload_completed
> -    ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
> -    : (GET_MODE_CLASS (mem_mode) != MODE_INT
> -       && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
> -
> -  // Note ldp_operand_mode_ok_p already rejected VL modes.
> + const bool fpsimd_op_p = is_fpsimd_op_p (reg_op, mem_mode, load_p);

Nit: indentation looks off here.

> +  // Note pair_operand_mode_ok_p already rejected VL modes.
>    const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
>    const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size };
>  
> @@ -487,7 +662,7 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>    poly_int64 mem_off;
>    rtx addr = XEXP (mem, 0);
>    const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
> -  rtx base = ldp_strip_offset (mem, &mem_off);
> +  rtx base = pair_mem_strip_offset (mem, &mem_off);

For review purposes: it might be easier if you split off patches that
just do a simple rename into a separate patch.  So all of the ldp/stp ->
pair mem renaming could be done in a preparatory patch.  That would
reduce the noise in reviewing any subsequent patches.

>    if (!REG_P (base))
>      return;
>  
> @@ -507,7 +682,7 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>    // accesses until after RA.
>    //
>    // As it stands, addresses with offsets in range for LDR but not
> -  // in range for LDP/STP are currently reloaded inefficiently,
> +  // in range for PAIR MEM LOAD STORE  are currently reloaded inefficiently,

How about: "addresses in range for an individual load/store but not
for a paired access"?

>    // ending up with a separate base register for each pair.
>    //
>    // In theory LRA should make use of
> @@ -519,8 +694,8 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>    // that calls targetm.legitimize_address_displacement.
>    //
>    // So for now, it's better to punt when we can't be sure that the
> -  // offset is in range for LDP/STP.  Out-of-range cases can then be
> -  // handled after RA by the out-of-range LDP/STP peepholes.  Eventually, it
> +  // offset is in range for PAIR MEM LOAD STORE.  Out-of-range cases can then be

"in range for the paired access".

> +  // handled after RA by the out-of-range PAIR MEM  peepholes.  Eventually, it

Does this part of the comment really apply to rs6000?  I.e. does rs6000
currently have peepholes for forming vector load/store pair insns?

I'm also not sure to what extent the part about relaxed memory
constraints applies to rs6000.  If some parts aren't relevant then
perhaps a caveat mentioning this in the comment could be a good idea.

>    // would be nice to handle known out-of-range opportunities in the
>    // pass itself (for stack accesses, this would be in the post-RA pass).
>    if (!reload_completed
> @@ -573,7 +748,7 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>  	gcc_unreachable (); // Base defs should be unique.
>      }
>  
> -  // Punt on misaligned offsets.  LDP/STP require offsets to be a multiple of
> +  // Punt on misaligned offsets.  PAIR MEM  require offsets to be a multiple of

How about "Paired memory accesses require ..." (assuming that is true of
rs6000).

>    // the access size.
>    if (!multiple_p (mem_off, mem_size))
>      return;
> @@ -612,9 +787,9 @@ static bool no_ignore (insn_info *) { return false; }
>  //
>  // N.B. we ignore any defs/uses of memory here as we deal with that separately,
>  // making use of alias disambiguation.
> -static insn_info *
> +insn_info *

Similar comment as on drop_writeback above about changing the linkage.

>  latest_hazard_before (insn_info *insn, rtx *ignore,
> -		      insn_info *ignore_insn = nullptr)
> +		      insn_info *ignore_insn)
>  {
>    insn_info *result = nullptr;
>  
> @@ -698,7 +873,7 @@ latest_hazard_before (insn_info *insn, rtx *ignore,
>  //
>  // N.B. we ignore any defs/uses of memory here as we deal with that separately,
>  // making use of alias disambiguation.
> -static insn_info *
> +insn_info *
>  first_hazard_after (insn_info *insn, rtx *ignore)
>  {
>    insn_info *result = nullptr;
> @@ -787,7 +962,7 @@ first_hazard_after (insn_info *insn, rtx *ignore)
>  }
>  
>  // Return true iff R1 and R2 overlap.
> -static bool
> +bool
>  ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
>  {
>    // If either range is empty, then their intersection is empty.
> @@ -801,7 +976,7 @@ ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
>  }
>  
>  // Get the range of insns that def feeds.
> -static insn_range_info get_def_range (def_info *def)
> +insn_range_info get_def_range (def_info *def)
>  {
>    insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn ();
>    return { def->insn (), last };
> @@ -809,7 +984,7 @@ static insn_range_info get_def_range (def_info *def)
>  
>  // Given a def (of memory), return the downwards range within which we
>  // can safely move this def.
> -static insn_range_info
> +insn_range_info
>  def_downwards_move_range (def_info *def)
>  {
>    auto range = get_def_range (def);
> @@ -827,7 +1002,7 @@ def_downwards_move_range (def_info *def)
>  
>  // Given a def (of memory), return the upwards range within which we can
>  // safely move this def.
> -static insn_range_info
> +insn_range_info
>  def_upwards_move_range (def_info *def)
>  {
>    def_info *prev = def->prev_def ();
> @@ -974,7 +1149,7 @@ private:
>  // Given candidate store insns FIRST and SECOND, see if we can re-purpose one
>  // of them (together with its def of memory) for the stp insn.  If so, return
>  // that insn.  Otherwise, return null.
> -static insn_info *
> +insn_info *
>  try_repurpose_store (insn_info *first,
>  		     insn_info *second,
>  		     const insn_range_info &move_range)
> @@ -1001,7 +1176,7 @@ try_repurpose_store (insn_info *first,
>  //
>  // These are deleted at the end of the pass and uses re-parented appropriately
>  // at this point.
> -static rtx
> +rtx
>  gen_tombstone (void)
>  {
>    return gen_rtx_CLOBBER (VOIDmode,
> @@ -1034,7 +1209,7 @@ aarch64_operand_mode_for_pair_mode (machine_mode mode)
>  // REG_EH_REGION note in the resulting list.  FR_EXPR is used to return any
>  // REG_FRAME_RELATED_EXPR note we find, as these can need special handling in
>  // combine_reg_notes.
> -static rtx
> +rtx
>  filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
>  {
>    for (; note; note = XEXP (note, 1))
> @@ -1084,7 +1259,7 @@ filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
>  
>  // Return the notes that should be attached to a combination of I1 and I2, where
>  // *I1 < *I2.  LOAD_P is true for loads.
> -static rtx
> +rtx
>  combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
>  {
>    // Temporary storage for REG_FRAME_RELATED_EXPR notes.
> @@ -1133,7 +1308,7 @@ combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
>  // relative to the initial value of the base register, and output these
>  // in PATS.  Return an rtx that represents the overall change to the
>  // base register.
> -static rtx
> +rtx
>  extract_writebacks (bool load_p, rtx pats[2], int changed)
>  {
>    rtx base_reg = NULL_RTX;
> @@ -1150,7 +1325,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
>        const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
>  
>        poly_int64 offset;
> -      rtx this_base = ldp_strip_offset (mem, &offset);
> +      rtx this_base = pair_mem_strip_offset (mem, &offset);
>        gcc_assert (REG_P (this_base));
>        if (base_reg)
>  	gcc_assert (rtx_equal_p (base_reg, this_base));
> @@ -1207,7 +1382,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
>  // base register.  If there is one, we choose the first such update after
>  // PAIR_DST that is still in the same BB as our pair.  We return the new def in
>  // *ADD_DEF and the resulting writeback effect in *WRITEBACK_EFFECT.
> -static insn_info *
> +insn_info *
>  find_trailing_add (insn_info *insns[2],
>  		   const insn_range_info &pair_range,
>  		   int initial_writeback,
> @@ -1286,7 +1461,7 @@ find_trailing_add (insn_info *insns[2],
>  
>    off_hwi /= access_size;
>  
> -  if (off_hwi < LDP_MIN_IMM || off_hwi > LDP_MAX_IMM)
> +  if (off_hwi < PAIR_MEM_MIN_IMM || off_hwi > PAIR_MEM_MAX_IMM)
>      return nullptr;
>  
>    auto dump_prefix = [&]()
> @@ -1328,7 +1503,7 @@ find_trailing_add (insn_info *insns[2],
>  // We just emitted a tombstone with uid UID, track it in a bitmap for
>  // this BB so we can easily identify it later when cleaning up tombstones.
>  void
> -ldp_bb_info::track_tombstone (int uid)
> +pair_fusion::track_tombstone (int uid)
>  {
>    if (!m_emitted_tombstone)
>      {
> @@ -1344,7 +1519,7 @@ ldp_bb_info::track_tombstone (int uid)
>  
>  // Reset the debug insn containing USE (the debug insn has been
>  // optimized away).
> -static void
> +void
>  reset_debug_use (use_info *use)
>  {
>    auto use_insn = use->insn ();
> @@ -1360,7 +1535,7 @@ reset_debug_use (use_info *use)
>  // is an update of the register BASE by a constant, given by WB_OFFSET,
>  // and we can preserve debug info by accounting for the change in side
>  // effects.
> -static void
> +void
>  fixup_debug_use (obstack_watermark &attempt,
>  		 use_info *use,
>  		 def_info *def,
> @@ -1463,7 +1638,7 @@ fixup_debug_use (obstack_watermark &attempt,
>  // is a trailing add insn which is being folded into the pair to make it
>  // use writeback addressing, and WRITEBACK_EFFECT is the pattern for
>  // TRAILING_ADD.
> -static void
> +void
>  fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>  			       insn_info *pair_dst,
>  			       insn_info *trailing_add,
> @@ -1500,7 +1675,7 @@ fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>  // writeback, and WRITEBACK_EFFECT is an rtx describing the overall update to
>  // the base register in the final pair (if any).  BASE_REGNO gives the register
>  // number of the base register used in the final pair.
> -static void
> +void
>  fixup_debug_uses (obstack_watermark &attempt,
>  		  insn_info *insns[2],
>  		  rtx orig_rtl[2],
> @@ -1528,7 +1703,7 @@ fixup_debug_uses (obstack_watermark &attempt,
>  	  gcc_checking_assert (GET_RTX_CLASS (GET_CODE (XEXP (mem, 0)))
>  			       == RTX_AUTOINC);
>  
> -	  base = ldp_strip_offset (mem, &offset);
> +	  base = pair_mem_strip_offset (mem, &offset);
>  	  gcc_checking_assert (REG_P (base) && REGNO (base) == base_regno);
>  	}
>        fixup_debug_use (attempt, use, def, base, offset);
> @@ -1664,7 +1839,7 @@ fixup_debug_uses (obstack_watermark &attempt,
>  // BASE gives the chosen base candidate for the pair and MOVE_RANGE is
>  // a singleton range which says where to place the pair.
>  bool
> -ldp_bb_info::fuse_pair (bool load_p,
> +pair_fusion::fuse_pair (bool load_p,
>  			unsigned access_size,
>  			int writeback,
>  			insn_info *i1, insn_info *i2,
> @@ -1684,6 +1859,9 @@ ldp_bb_info::fuse_pair (bool load_p,
>  						   insn_change::DELETE);
>      };
>  
> +  if (*i1 > *i2)
> +    return false;
> +

I don't think we want this: we want to be able to handle the case
where the insns in offset order are not in program order.  What goes
wrong in this case for you?

>    insn_info *first = (*i1 < *i2) ? i1 : i2;
>    insn_info *second = (first == i1) ? i2 : i1;
>  
> @@ -1800,7 +1978,7 @@ ldp_bb_info::fuse_pair (bool load_p,
>      {
>        if (dump_file)
>  	fprintf (dump_file,
> -		 "  ldp: i%d has wb but subsequent i%d has non-wb "
> +		 "  pair_mem: i%d has wb but subsequent i%d has non-wb "

How about "load pair: "?

>  		 "update of base (r%d), dropping wb\n",
>  		 insns[0]->uid (), insns[1]->uid (), base_regno);
>        gcc_assert (writeback_effect);
> @@ -1823,7 +2001,7 @@ ldp_bb_info::fuse_pair (bool load_p,
>      }
>  
>    // If either of the original insns had writeback, but the resulting pair insn
> -  // does not (can happen e.g. in the ldp edge case above, or if the writeback
> +  // does not (can happen e.g. in the pair mem  edge case above, or if the writeback

s/pair mem/load pair/, also nit: excess whitespace after mem.

>    // effects cancel out), then drop the def(s) of the base register as
>    // appropriate.
>    //
> @@ -1842,7 +2020,7 @@ ldp_bb_info::fuse_pair (bool load_p,
>    // update of the base register and try and fold it in to make this into a
>    // writeback pair.
>    insn_info *trailing_add = nullptr;
> -  if (aarch64_ldp_writeback > 1
> +  if (pair_trailing_writeback_p ()
>        && !writeback_effect
>        && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1,
>  					 XEXP (pats[0], 0), nullptr)
> @@ -1863,14 +2041,14 @@ ldp_bb_info::fuse_pair (bool load_p,
>      }
>  
>    // Now that we know what base mem we're going to use, check if it's OK
> -  // with the ldp/stp policy.
> +  // with the pair mem  policy.
>    rtx first_mem = XEXP (pats[0], load_p);
> -  if (!aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
> -						load_p,
> -						GET_MODE (first_mem)))
> +  if (!pair_mem_ok_policy (first_mem,
> +			  load_p,
> +			  GET_MODE (first_mem)))
>      {
>        if (dump_file)
> -	fprintf (dump_file, "punting on pair (%d,%d), ldp/stp policy says no\n",
> +	fprintf (dump_file, "punting on pair (%d,%d), pair mem  policy says no\n",

Nit: excess whitespace after mem.

>  		 i1->uid (), i2->uid ());
>        return false;
>      }
> @@ -1878,20 +2056,12 @@ ldp_bb_info::fuse_pair (bool load_p,
>    rtx reg_notes = combine_reg_notes (first, second, load_p);
>  
>    rtx pair_pat;
> -  if (writeback_effect)
> -    {
> -      auto patvec = gen_rtvec (3, writeback_effect, pats[0], pats[1]);
> -      pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
> -    }
> -  else if (load_p)
> -    pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
> -				      XEXP (pats[1], 0),
> -				      XEXP (pats[0], 1));
> -  else
> -    pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
> -				       XEXP (pats[0], 1),
> -				       XEXP (pats[1], 1));
>  
> +  set_multiword_subreg (first, second, load_p);
> +
> +  pair_pat = gen_load_store_pair (pats, writeback_effect, load_p);
> +  if (pair_pat == NULL_RTX)
> +    return false;

I don't think this function should be allowed to fail (if we want to
reject a pair, we should do it earlier), so this should either assert
the result is non-NULL or we should just drop the check altogether.

>    insn_change *pair_change = nullptr;
>    auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
>      rtx_insn *rti = change->insn ()->rtl ();
> @@ -2075,7 +2245,7 @@ ldp_bb_info::fuse_pair (bool load_p,
>  
>  // Return true if STORE_INSN may modify mem rtx MEM.  Make sure we keep
>  // within our BUDGET for alias analysis.
> -static bool
> +bool
>  store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
>  {
>    if (!budget)
> @@ -2098,7 +2268,7 @@ store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
>  
>  // Return true if LOAD may be modified by STORE.  Make sure we keep
>  // within our BUDGET for alias analysis.
> -static bool
> +bool
>  load_modified_by_store_p (insn_info *load,
>  			  insn_info *store,
>  			  int &budget)
> @@ -2133,15 +2303,6 @@ load_modified_by_store_p (insn_info *load,
>    return false;
>  }
>  
> -// Virtual base class for load/store walkers used in alias analysis.
> -struct alias_walker
> -{
> -  virtual bool conflict_p (int &budget) const = 0;
> -  virtual insn_info *insn () const = 0;
> -  virtual bool valid () const  = 0;
> -  virtual void advance () = 0;
> -};
> -
>  // Implement some common functionality used by both store_walker
>  // and load_walker.
>  template<bool reverse>
> @@ -2259,13 +2420,13 @@ public:
>  //
>  // We try to maintain the invariant that if a walker becomes invalid, we
>  // set its pointer to null.
> -static void
> -do_alias_analysis (insn_info *alias_hazards[4],
> +void
> +pair_fusion::do_alias_analysis (insn_info *alias_hazards[4],
>  		   alias_walker *walkers[4],
>  		   bool load_p)
>  {
>    const int n_walkers = 2 + (2 * !load_p);
> -  int budget = aarch64_ldp_alias_check_limit;
> +  int budget = pair_mem_alias_check_limit();

Nit: space before the open paren, contrib/check_GNU_style.py can help
catch such things.

>  
>    auto next_walker = [walkers,n_walkers](int current) -> int {
>      for (int j = 1; j <= n_walkers; j++)
> @@ -2350,7 +2511,7 @@ do_alias_analysis (insn_info *alias_hazards[4],
>  //
>  // Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback
>  // addressing.
> -static int
> +int
>  get_viable_bases (insn_info *insns[2],
>  		  vec<base_cand> &base_cands,
>  		  rtx cand_mems[2],
> @@ -2365,7 +2526,7 @@ get_viable_bases (insn_info *insns[2],
>      {
>        const bool is_lower = (i == reversed);
>        poly_int64 poly_off;
> -      rtx base = ldp_strip_offset (cand_mems[i], &poly_off);
> +      rtx base = pair_mem_strip_offset (cand_mems[i], &poly_off);
>        if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC)
>  	writeback |= (1 << i);
>  
> @@ -2373,7 +2534,7 @@ get_viable_bases (insn_info *insns[2],
>  	continue;
>  
>        // Punt on accesses relative to eliminable regs.  See the comment in
> -      // ldp_bb_info::track_access for a detailed explanation of this.
> +      // pair_fusion::track_access for a detailed explanation of this.
>        if (!reload_completed
>  	  && (REGNO (base) == FRAME_POINTER_REGNUM
>  	      || REGNO (base) == ARG_POINTER_REGNUM))
> @@ -2397,7 +2558,7 @@ get_viable_bases (insn_info *insns[2],
>        if (!is_lower)
>  	base_off--;
>  
> -      if (base_off < LDP_MIN_IMM || base_off > LDP_MAX_IMM)
> +      if (base_off < PAIR_MEM_MIN_IMM || base_off > PAIR_MEM_MAX_IMM)

As mentioned at the top of the review, these checks should be replaced
with a call to a virtual function that validates the offset instead of
just renaming the aarch64 constants.

>  	continue;
>  
>        use_info *use = find_access (insns[i]->uses (), REGNO (base));
> @@ -2454,12 +2615,12 @@ get_viable_bases (insn_info *insns[2],
>  }
>  
>  // Given two adjacent memory accesses of the same size, I1 and I2, try
> -// and see if we can merge them into a ldp or stp.
> +// and see if we can merge them into a pair mem load and store.

How about: "into a paired access"?

>  //
>  // ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true
>  // if the accesses are both loads, otherwise they are both stores.
>  bool
> -ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
> +pair_fusion::try_fuse_pair (bool load_p, unsigned access_size,
>  			    insn_info *i1, insn_info *i2)
>  {
>    if (dump_file)
> @@ -2490,11 +2651,21 @@ ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
>        reg_ops[i] = XEXP (pats[i], !load_p);
>      }
>  
> +  if (!load_p && !fuseable_store_p (i1, i2))
> +    {
> +      if (dump_file)
> +	fprintf (dump_file,
> +		 "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n",
> +		 insns[0]->uid (), insns[1]->uid ());
> +      return false;
> +    }
> +

As above, please can you explain under which circumstances you plan to
reject store pairs with this hook?  In any case I think we should defer
adding hooks that are only needed for rs6000 to a later patch.

> +

Nit: excess whitespace.

>    if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
>      {
>        if (dump_file)
>  	fprintf (dump_file,
> -		 "punting on ldp due to reg conflcits (%d,%d)\n",
> +		 "punting on pair mem load  due to reg conflcits (%d,%d)\n",
>  		 insns[0]->uid (), insns[1]->uid ());
>        return false;
>      }
> @@ -2787,7 +2958,7 @@ ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
>  		    i1, i2, *base, range);
>  }
>  
> -static void
> +void
>  dump_insn_list (FILE *f, const insn_list_t &l)
>  {
>    fprintf (f, "(");
> @@ -2843,7 +3014,7 @@ debug (const insn_list_t &l)
>  // we can't re-order them anyway, so provided earlier passes have cleaned up
>  // redundant loads, we shouldn't miss opportunities by doing this.
>  void
> -ldp_bb_info::merge_pairs (insn_list_t &left_list,
> +pair_fusion::merge_pairs (insn_list_t &left_list,
>  			  insn_list_t &right_list,
>  			  bool load_p,
>  			  unsigned access_size)
> @@ -2890,8 +3061,8 @@ ldp_bb_info::merge_pairs (insn_list_t &left_list,
>  // of accesses.  If we find two sets of adjacent accesses, call
>  // merge_pairs.
>  void
> -ldp_bb_info::transform_for_base (int encoded_lfs,
> -				 access_group &group)
> +aarch64_pair_fusion::transform_for_base (int encoded_lfs,
> +					 access_group &group)
>  {
>    const auto lfs = decode_lfs (encoded_lfs);
>    const unsigned access_size = lfs.size;
> @@ -2919,7 +3090,7 @@ ldp_bb_info::transform_for_base (int encoded_lfs,
>  // and remove all the tombstone insns, being sure to reparent any uses
>  // of mem to previous defs when we do this.
>  void
> -ldp_bb_info::cleanup_tombstones ()
> +pair_fusion::cleanup_tombstones ()
>  {
>    // No need to do anything if we didn't emit a tombstone insn for this BB.
>    if (!m_emitted_tombstone)
> @@ -2947,7 +3118,7 @@ ldp_bb_info::cleanup_tombstones ()
>  
>  template<typename Map>
>  void
> -ldp_bb_info::traverse_base_map (Map &map)
> +pair_fusion::traverse_base_map (Map &map)
>  {
>    for (auto kv : map)
>      {
> @@ -2958,7 +3129,7 @@ ldp_bb_info::traverse_base_map (Map &map)
>  }
>  
>  void
> -ldp_bb_info::transform ()
> +pair_fusion::transform ()
>  {
>    traverse_base_map (expr_map);
>    traverse_base_map (def_map);
> @@ -3174,7 +3345,9 @@ void ldp_fusion_bb (bb_info *bb)
>    const bool track_stores
>      = aarch64_tune_params.stp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
>  
> -  ldp_bb_info bb_state (bb);
> +  pair_fusion *bb_state;
> +  aarch64_pair_fusion derived (bb);
> +  bb_state = &derived;

See my comment towards the top of this review about the high-level
structure: I think this function (ldp_fusion_bb) should be a member of
an outer class where all the virtual functions live, and then the
ldp_bb_info class shouldn't need to be specialised to a given target
(but we can just pass a reference or pointer to the outer class).

>  
>    for (auto insn : bb->nondebug_insns ())
>      {
> @@ -3194,13 +3367,13 @@ void ldp_fusion_bb (bb_info *bb)
>  	continue;
>  
>        if (track_stores && MEM_P (XEXP (pat, 0)))
> -	bb_state.track_access (insn, false, XEXP (pat, 0));
> +	bb_state->track_access (insn, false, XEXP (pat, 0));
>        else if (track_loads && MEM_P (XEXP (pat, 1)))
> -	bb_state.track_access (insn, true, XEXP (pat, 1));
> +	bb_state->track_access (insn, true, XEXP (pat, 1));
>      }
>  
> -  bb_state.transform ();
> -  bb_state.cleanup_tombstones ();
> +  bb_state->transform ();
> +  bb_state->cleanup_tombstones ();
>  }
>  
>  void ldp_fusion ()
> -- 
> 2.39.3
> 

Thanks,
Alex
Richard Sandiford April 3, 2024, 9:09 p.m. UTC | #3
Alex Coplan <alex.coplan@arm.com> writes:
> On 23/02/2024 16:41, Ajit Agarwal wrote:
>> Hello Richard/Alex/Segher:
>
> Hi Ajit,
>
> Sorry for the delay and thanks for working on this.
>
> Generally this looks like the right sort of approach (IMO) but I've left
> some comments below.
>
> I'll start with a meta comment: in the subject line you have marked this
> as 0/2, but usually 0/n is reserved for the cover letter of a patch
> series and wouldn't contain an actual patch.  I think this might have
> confused the Linaro CI suitably such that it didn't run regression tests
> on the patch.

Alex, thanks for the thorough and in-depth review.  I agree with all the
comments FWIW.  Just to add a couple of things:

> > @@ -138,8 +138,18 @@ struct alt_base
> >    poly_int64 offset;
> >  };
> >  
> > +// Virtual base class for load/store walkers used in alias analysis.
> > +struct alias_walker
> > +{
> > +  virtual bool conflict_p (int &budget) const = 0;
> > +  virtual insn_info *insn () const = 0;
> > +  virtual bool valid () const  = 0;
> > +  virtual void advance () = 0;
> > +};
> > +
> > +
> >  // State used by the pass for a given basic block.
> > -struct ldp_bb_info
> > +struct pair_fusion
>
> As a comment on the high-level design, I think we want a generic class
> for the overall pass, not just for the BB-specific structure.
>
> That is because naturally we want the ldp_fusion_bb function itself to
> be a member of such a class, so that it can access virtual functions to
> query the target e.g. about the load/store pair policy, and whether to
> try and promote writeback pairs.
>
> If we keep all of the virtual functions in such an outer class, then we
> can keep the ldp_fusion_bb class generic (not needing an override for
> each target) and that inner class can perhaps be given a pointer or
> reference to the outer class when it is instantiated in ldp_fusion_bb.

I agree that in general, the new virtual methods should belong to a pass
class rather than the per-bb class.

In principle, if we need to virtualise existing members of ldp_bb_info
(or code contained within existing members of ldp_bb_info), and if that
code accesses members of the bb info, then it might make sense to have
target-specific derivatives of the bb info structure too, with a virtual
function to create the bb info structure for a given bb.

However, it looks like all but one of the virtual functions in the patch
are self-contained (in the sense of depending only on their arguments
and on globals).  The one exception is transform_for_base, but Alex
asked whether that needs to be virtualised.  If it doesn't, then like
Alex says, it seems that all virtuals could belong to the pass class
rather than to the bb info.

>> [...]
>> +  }
>>  };
>>  
>> +bool
>> +store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget);
>> +bool load_modified_by_store_p (insn_info *load,
>> +			  insn_info *store,
>> +			  int &budget);
>> +extern insn_info *
>> +try_repurpose_store (insn_info *first,
>> +		     insn_info *second,
>> +		     const insn_range_info &move_range);
>> +
>> +void reset_debug_use (use_info *use);
>> +
>> +extern void
>> +fixup_debug_uses (obstack_watermark &attempt,
>> +		  insn_info *insns[2],
>> +		  rtx orig_rtl[2],
>> +		  insn_info *pair_dst,
>> +		  insn_info *trailing_add,
>> +		  bool load_p,
>> +		  int writeback,
>> +		  rtx writeback_effect,
>> +		  unsigned base_regno);
>> +
>> +void
>> +fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>> +			       insn_info *pair_dst,
>> +			       insn_info *trailing_add,
>> +			       rtx writeback_effect);
>> +
>> +
>> +extern void
>> +fixup_debug_use (obstack_watermark &attempt,
>> +		 use_info *use,
>> +		 def_info *def,
>> +		 rtx base,
>> +		 poly_int64 wb_offset);
>> +
>> +extern insn_info *
>> +find_trailing_add (insn_info *insns[2],
>> +		   const insn_range_info &pair_range,
>> +		   int initial_writeback,
>> +		   rtx *writeback_effect,
>> +		   def_info **add_def,
>> +		   def_info *base_def,
>> +		   poly_int64 initial_offset,
>> +		   unsigned access_size);
>> +
>> +rtx drop_writeback (rtx mem);
>> +rtx pair_mem_strip_offset (rtx mem, poly_int64 *offset);
>> +bool any_pre_modify_p (rtx x);
>> +bool any_post_modify_p (rtx x);
>> +int encode_lfs (lfs_fields fields);
>> +extern insn_info * latest_hazard_before (insn_info *insn, rtx *ignore,
>> +		      insn_info *ignore_insn = nullptr);
>> +insn_info * first_hazard_after (insn_info *insn, rtx *ignore);
>> +bool ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2);
>> +insn_range_info get_def_range (def_info *def);
>> +insn_range_info def_downwards_move_range (def_info *def);
>> +insn_range_info def_upwards_move_range (def_info *def);
>> +rtx gen_tombstone (void);
>> +rtx filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr);
>> +rtx combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p);
>> +rtx extract_writebacks (bool load_p, rtx pats[2], int changed);
>> +void do_alias_analysis (insn_info *alias_hazards[4],
>> +		   alias_walker *walkers[4],
>> +		   bool load_p);
>> +int get_viable_bases (insn_info *insns[2],
>> +		  vec<base_cand> &base_cands,
>> +		  rtx cand_mems[2],
>> +		  unsigned access_size,
>> +		  bool reversed);
>> +void dump_insn_list (FILE *f, const insn_list_t &l);
>
> Is there really a need to forward-declare all of these?
>
>> +
>>  splay_tree_node<access_record *> *
>> -ldp_bb_info::node_alloc (access_record *access)
>> +pair_fusion::node_alloc (access_record *access)
>>  {
>>    using T = splay_tree_node<access_record *>;
>>    void *addr = obstack_alloc (&m_obstack, sizeof (T));
>> @@ -224,7 +401,7 @@ ldp_bb_info::node_alloc (access_record *access)
>>  // Given a mem MEM, if the address has side effects, return a MEM that accesses
>>  // the same address but without the side effects.  Otherwise, return
>>  // MEM unchanged.
>> -static rtx
>> +rtx
>
> Is there a reason to change the linkage here?  Presumably when this (and
> many other such functions) get moved to a generic file then they will
> only be called from within that file, so they could stay having static
> linkage.
>
> That should remove a fair bit of noise from this patch.

Yeah, agreed.  In general, I don't think we should create any more true
globals as part of this series.  If functions need to be exposed to targets,
I think it should by via static or non-static member functions of the
pass class that Alex described above.

>> [...]
>> @@ -487,7 +662,7 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>>    poly_int64 mem_off;
>>    rtx addr = XEXP (mem, 0);
>>    const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
>> -  rtx base = ldp_strip_offset (mem, &mem_off);
>> +  rtx base = pair_mem_strip_offset (mem, &mem_off);
>
> For review purposes: it might be easier if you split off patches that
> just do a simple rename into a separate patch.  So all of the ldp/stp ->
> pair mem renaming could be done in a preparatory patch.  That would
> reduce the noise in reviewing any subsequent patches.

Yeah, I agree this would help.  Sorry Ajit for making the submission more
complex in terms of number of patches (though hopefully less complex in
terms of following the changes).

>> [...]
>> @@ -1684,6 +1859,9 @@ ldp_bb_info::fuse_pair (bool load_p,
>>  						   insn_change::DELETE);
>>      };
>>  
>> +  if (*i1 > *i2)
>> +    return false;
>> +
>
> I don't think we want this: we want to be able to handle the case
> where the insns in offset order are not in program order.  What goes
> wrong in this case for you?

Yeah.  It's OK/good for this patch to move existing code verbatim into
virtual member functions.  But please don't add new tests or conditions
as part of this patch.  Those would be better as separate pre-patches or
post-patches, with an explanation for why each change is needed.

Thanks,
Richard
Ajit Agarwal April 5, 2024, 8:38 a.m. UTC | #4
Hello Alex:

On 03/04/24 8:51 pm, Alex Coplan wrote:
> On 23/02/2024 16:41, Ajit Agarwal wrote:
>> Hello Richard/Alex/Segher:
> 
> Hi Ajit,
> 
> Sorry for the delay and thanks for working on this.
> 
> Generally this looks like the right sort of approach (IMO) but I've left
> some comments below.
> 
> I'll start with a meta comment: in the subject line you have marked this
> as 0/2, but usually 0/n is reserved for the cover letter of a patch
> series and wouldn't contain an actual patch.  I think this might have
> confused the Linaro CI suitably such that it didn't run regression tests
> on the patch.
> 
Sorry for that. I have changed that in latest patch.
>>
>> This patch adds the changed code for target independent and
>> dependent code for load store fusion.
>>
>> Common infrastructure of load store pair fusion is
>> divided into target independent and target dependent
>> changed code.
>>
>> Target independent code is the Generic code with
>> pure virtual function to interface betwwen target
>> independent and dependent code.
>>
>> Target dependent code is the implementation of pure
>> virtual function for aarch64 target and the call
>> to target independent code.
>>
>> Bootstrapped for aarch64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>> aarch64: Place target independent and dependent changed code in one file.
>>
>> Common infrastructure of load store pair fusion is
>> divided into target independent and target dependent
>> changed code.
>>
>> Target independent code is the Generic code with
>> pure virtual function to interface betwwen target
>> independent and dependent code.
>>
>> Target dependent code is the implementation of pure
>> virtual function for aarch64 target and the call
>> to target independent code.
>>
>> 2024-02-23  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>
>>
>> gcc/ChangeLog:
>>
>> 	* config/aarch64/aarch64-ldp-fusion.cc: Place target
>> 	independent and dependent changed code.
>> ---
>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 437 ++++++++++++++++-------
>>  1 file changed, 305 insertions(+), 132 deletions(-)
>>
>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> index 22ed95eb743..2ef22ff1e96 100644
>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> @@ -40,10 +40,10 @@
>>  
>>  using namespace rtl_ssa;
>>  
>> -static constexpr HOST_WIDE_INT LDP_IMM_BITS = 7;
>> -static constexpr HOST_WIDE_INT LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1));
>> -static constexpr HOST_WIDE_INT LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1;
>> -static constexpr HOST_WIDE_INT LDP_MIN_IMM = -LDP_MAX_IMM - 1;
>> +static constexpr HOST_WIDE_INT PAIR_MEM_IMM_BITS = 7;
>> +static constexpr HOST_WIDE_INT PAIR_MEM_IMM_SIGN_BIT = (1 << (PAIR_MEM_IMM_BITS - 1));
>> +static constexpr HOST_WIDE_INT PAIR_MEM_MAX_IMM = PAIR_MEM_IMM_SIGN_BIT - 1;
>> +static constexpr HOST_WIDE_INT PAIR_MEM_MIN_IMM = -PAIR_MEM_MAX_IMM - 1;
> 
> These constants shouldn't be renamed: they are specific to aarch64 so the
> original names should be preserved in this file.
> 
> I expect we want to introduce virtual functions to validate an offset
> for a paired access.  The aarch64 code could then implement it by
> comparing the offset against LDP_{MIN,MAX}_IMM, and the generic code
> could use that hook to replace the code that queries those constants
> directly (i.e. in find_trailing_add and get_viable_bases).
> 
>>  
>>  // We pack these fields (load_p, fpsimd_p, and size) into an integer
>>  // (LFS) which we use as part of the key into the main hash tables.
>> @@ -138,8 +138,18 @@ struct alt_base
>>    poly_int64 offset;
>>  };
>>  
>> +// Virtual base class for load/store walkers used in alias analysis.
>> +struct alias_walker
>> +{
>> +  virtual bool conflict_p (int &budget) const = 0;
>> +  virtual insn_info *insn () const = 0;
>> +  virtual bool valid () const  = 0;
>> +  virtual void advance () = 0;
>> +};
>> +
>> +
>>  // State used by the pass for a given basic block.
>> -struct ldp_bb_info
>> +struct pair_fusion
> 
> As a comment on the high-level design, I think we want a generic class
> for the overall pass, not just for the BB-specific structure.
> 
> That is because naturally we want the ldp_fusion_bb function itself to
> be a member of such a class, so that it can access virtual functions to
> query the target e.g. about the load/store pair policy, and whether to
> try and promote writeback pairs.
> 
> If we keep all of the virtual functions in such an outer class, then we
> can keep the ldp_fusion_bb class generic (not needing an override for
> each target) and that inner class can perhaps be given a pointer or
> reference to the outer class when it is instantiated in ldp_fusion_bb.
> 

Above comments is addressed in latest patch.

>>  {
>>    using def_hash = nofree_ptr_hash<def_info>;
>>    using expr_key_t = pair_hash<tree_operand_hash, int_hash<int, -1, -2>>;
>> @@ -161,13 +171,13 @@ struct ldp_bb_info
>>    static const size_t obstack_alignment = sizeof (void *);
>>    bb_info *m_bb;
>>  
>> -  ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
>> +  pair_fusion (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
>>    {
>>      obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
>>  				obstack_alignment, obstack_chunk_alloc,
>>  				obstack_chunk_free);
>>    }
>> -  ~ldp_bb_info ()
>> +  ~pair_fusion ()
>>    {
>>      obstack_free (&m_obstack, nullptr);
>>  
>> @@ -177,10 +187,50 @@ struct ldp_bb_info
>>  	bitmap_obstack_release (&m_bitmap_obstack);
>>        }
>>    }
>> +  void track_access (insn_info *, bool load, rtx mem);
>> +  void transform ();
>> +  void cleanup_tombstones ();
>> +  virtual void set_multiword_subreg (insn_info *i1, insn_info *i2,
>> +				     bool load_p) = 0;
> 
> These virtual functions could do with a comment to describe what their
> purpose is.  What is set_multiword_subreg supposed to do?  I guess it is
> something specifically needed for rs6000 as the aarch64 implementation
> in this patch isn't particularly illuminating.
> 
> In general for hooks that are needed for rs6000 but aren't for aarch64 I
> think it would be better to defer adding them to a later patch.  This
> patch should just focus on making the aarch64/generic split.
>

This is required for rs6000 to set subreg to generate sequential
register pairs. I have excluded the above hooks in latest patch
and would update the above hooks in subsequent patches.

 
>> +  virtual rtx gen_load_store_pair (rtx *pats,  rtx writeback,
>> +				   bool load_p) = 0;
> 
> Nit: naming, how about just "gen_mem_pair" for this?
> 
>> +  void merge_pairs (insn_list_t &, insn_list_t &,
>> +		    bool load_p, unsigned access_size);
>> +  virtual void transform_for_base (int load_size, access_group &group) = 0;
> 
> I don't think transform_for_base wants to be a pure virtual function,
> can't we just share the same implementation for all targets?
> 
>> +
>> +  bool try_fuse_pair (bool load_p, unsigned access_size,
>> +			     insn_info *i1, insn_info *i2);
>> +
>> +  bool fuse_pair (bool load_p, unsigned access_size,
>> +		  int writeback,
>> +		  insn_info *i1, insn_info *i2,
>> +		  base_cand &base,
>> +		  const insn_range_info &move_range);
>> +
>> +  void do_alias_analysis (insn_info *alias_hazards[4],
>> +			  alias_walker *walkers[4],
>> +			  bool load_p);
>> +
>> +  void track_tombstone (int uid);
>> +
>> +  bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
>> +
>> +  virtual bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode,
>> +			       bool load_p) = 0;
> 
> Minor nit: the "is_" part of the name here is redundant.
>

Addressed.
 
>> +
>> +  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
>> +  virtual bool pair_trailing_writeback_p () = 0;
>> +  virtual bool pair_check_register_operand (bool load_p, rtx reg_op,
>> +					    machine_mode mem_mode) = 0;
> 
> How about pair_reg_operand_ok_p for consistency with
> pair_operand_mode_ok_p?
> 

Addressed.

>> +  virtual int pair_mem_alias_check_limit () = 0;
>> +  virtual bool pair_is_writeback () = 0 ;
> 
> Looking at the implementation below, I don't think this is a good name.
> With this name it seems like the function would be checking if a given
> pair insn is a writeback access, but this function is really just asking
> the target if we should handle any writeback opportunities.
>

Addressed.
 
> How about something like handle_writeback_opportunities () instead?
> 
>> +  virtual bool pair_mem_ok_policy (rtx first_mem, bool load_p,
>> +				   machine_mode mode) = 0;
> 
> Nit: a name like pair_mem_ok_with_policy might be better (to closer
> match the underlying aarch64 name).
> 
>> +  virtual bool fuseable_store_p (insn_info *i1, insn_info *i2) = 0;
>> +  virtual bool fuseable_load_p (insn_info *info) = 0;
> 
> Similar comment as on set_multiword_subreg: I think it would be better
> to defer adding these hooks (that are only needed for rs6000) to a later
> patch.
> 

Addressed.

>>  
>> -  inline void track_access (insn_info *, bool load, rtx mem);
>> -  inline void transform ();
>> -  inline void cleanup_tombstones ();
>> +  template<typename Map>
>> +    void traverse_base_map (Map &map);
>>  
>>  private:
>>    obstack m_obstack;
>> @@ -191,30 +241,157 @@ private:
>>    bool m_emitted_tombstone;
>>  
>>    inline splay_tree_node<access_record *> *node_alloc (access_record *);
>> +};
>>  
>> -  template<typename Map>
>> -  inline void traverse_base_map (Map &map);
>> -  inline void transform_for_base (int load_size, access_group &group);
>> -
>> -  inline void merge_pairs (insn_list_t &, insn_list_t &,
>> -			   bool load_p, unsigned access_size);
>> +struct  aarch64_pair_fusion : public pair_fusion
> 
> Nit: excess whitespace (double space after struct).
>

Addressed.

 
>> +{
>> +public:
>> +  aarch64_pair_fusion (bb_info *bb) : pair_fusion (bb) {};
>> +  bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode, bool load_p)
> 
> Nit: the virtual function implementations in this class should probably
> be marked override (and arguably final too, although that seems less
> important).
> 
>> +  {
>> +    const bool fpsimd_op_p
>> +      = reload_completed
>> +      ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
>> +      : (GET_MODE_CLASS (mem_mode) != MODE_INT
>> +	 && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
>> +    return fpsimd_op_p;
>> +  }
>>  
>> -  inline bool try_fuse_pair (bool load_p, unsigned access_size,
>> -			     insn_info *i1, insn_info *i2);
>> +  bool pair_mem_ok_policy (rtx first_mem, bool load_p, machine_mode mode)
>> +  {
>> +    return aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
>> +						     load_p,
>> +						     mode);
>> +  }
>> +  bool pair_operand_mode_ok_p (machine_mode mode);
>>  
>> -  inline bool fuse_pair (bool load_p, unsigned access_size,
>> -			 int writeback,
>> -			 insn_info *i1, insn_info *i2,
>> -			 base_cand &base,
>> -			 const insn_range_info &move_range);
>> +  void transform_for_base (int encoded_lfs,
>> +			   access_group &group);
>> +  rtx gen_load_store_pair (rtx *pats,
>> +			   rtx writeback,
>> +			   bool load_p)
>> +  {
>> +    rtx pair_pat;
>>  
>> -  inline void track_tombstone (int uid);
>> +    if (writeback)
>> +      {
>> +	auto patvec = gen_rtvec (3, writeback, pats[0], pats[1]);
>> +	pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
>> +      }
>> +    else if (load_p)
>> +      pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
>> +					XEXP (pats[1], 0),
>> +					XEXP (pats[0], 1));
>> +    else
>> +      pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
>> +					 XEXP (pats[0], 1),
>> +					 XEXP (pats[1], 1));
>> +     return pair_pat;
>> +  }
> 
> Minor nit: for readability, I'm not sure we want member functions
> implemented inline like this if the function is more than a few lines
> long.  But I don't feel strongly on this and would defer to a
> maintainer.
>

Addressed.
 
>>  
>> -  inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
>> +  void set_multiword_subreg (insn_info *i1, insn_info *i2, bool load_p)
>> +  {
>> +    if (i1 || i2 || load_p)
>> +      return;
>> +    return;
>> +  }
> 
> It looks like this could just have an empty function body and the code
> you've included here is dead/pointless.
> 

Addressed.

> Nit: I think we want whitespace between the inline member
> implementations here.
> 
>> +  bool pair_trailing_writeback_p  ()
>> +  {
>> +    return aarch64_ldp_writeback > 1;
>> +  }
>> +  bool pair_check_register_operand (bool load_p, rtx reg_op, machine_mode mem_mode)
>> +  {
>> +    return  (load_p
>> +	     ? aarch64_ldp_reg_operand (reg_op, mem_mode)
>> +	     : aarch64_stp_reg_operand (reg_op, mem_mode));
>> +  }
>> +  int pair_mem_alias_check_limit ()
>> +  {
>> +    return aarch64_ldp_alias_check_limit;
>> +  }
> 
> This seems reasonable for the time being, but I wonder if eventually we
> want to move to a single generic param shared by all targets using this
> pass (more of an open question than a request for change).
>
>> +  bool fuseable_store_p (insn_info *i1, insn_info *i2) { return i1 || i2;}
>> +  bool fuseable_load_p (insn_info *insn) { return insn;}
>

Addressed.
 
> It looks like both of these could just return true for aarch64.
> Could you explain in which scenarios you expect to reject pairs with
> these hooks?  As mentioned above, comments on the virtual function
> declarations would be good.
> 
> Naively it seems that both of these hooks should be given both insns.
> Why should fuseable_load_p only care about one candidate insn?  If both
> take both insns then they can be collapsed to a single hook which takes
> a load_p parameter to disambiguate the load/store cases.
> 

Yes this should be true for aarch64. This is required for rs6000
to avoid certain store fusion.

> It also looks like the fuseable_load_p function is currently unused.
> 
>> +  bool pair_is_writeback ()
>> +  {
>> +    return !aarch64_ldp_writeback;
> 
> I think you want to drop the negation here, we want to return true iff
> we should handle writeback opportunities, no?  You'll then need to drop
> the negation in callers, too.
>

Addressed.
 
>> +  }
>>  };
>>  
>> +bool
>> +store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget);
>> +bool load_modified_by_store_p (insn_info *load,
>> +			  insn_info *store,
>> +			  int &budget);
>> +extern insn_info *
>> +try_repurpose_store (insn_info *first,
>> +		     insn_info *second,
>> +		     const insn_range_info &move_range);
>> +
>> +void reset_debug_use (use_info *use);
>> +
>> +extern void
>> +fixup_debug_uses (obstack_watermark &attempt,
>> +		  insn_info *insns[2],
>> +		  rtx orig_rtl[2],
>> +		  insn_info *pair_dst,
>> +		  insn_info *trailing_add,
>> +		  bool load_p,
>> +		  int writeback,
>> +		  rtx writeback_effect,
>> +		  unsigned base_regno);
>> +
>> +void
>> +fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>> +			       insn_info *pair_dst,
>> +			       insn_info *trailing_add,
>> +			       rtx writeback_effect);
>> +
>> +
>> +extern void
>> +fixup_debug_use (obstack_watermark &attempt,
>> +		 use_info *use,
>> +		 def_info *def,
>> +		 rtx base,
>> +		 poly_int64 wb_offset);
>> +
>> +extern insn_info *
>> +find_trailing_add (insn_info *insns[2],
>> +		   const insn_range_info &pair_range,
>> +		   int initial_writeback,
>> +		   rtx *writeback_effect,
>> +		   def_info **add_def,
>> +		   def_info *base_def,
>> +		   poly_int64 initial_offset,
>> +		   unsigned access_size);
>> +
>> +rtx drop_writeback (rtx mem);
>> +rtx pair_mem_strip_offset (rtx mem, poly_int64 *offset);
>> +bool any_pre_modify_p (rtx x);
>> +bool any_post_modify_p (rtx x);
>> +int encode_lfs (lfs_fields fields);
>> +extern insn_info * latest_hazard_before (insn_info *insn, rtx *ignore,
>> +		      insn_info *ignore_insn = nullptr);
>> +insn_info * first_hazard_after (insn_info *insn, rtx *ignore);
>> +bool ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2);
>> +insn_range_info get_def_range (def_info *def);
>> +insn_range_info def_downwards_move_range (def_info *def);
>> +insn_range_info def_upwards_move_range (def_info *def);
>> +rtx gen_tombstone (void);
>> +rtx filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr);
>> +rtx combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p);
>> +rtx extract_writebacks (bool load_p, rtx pats[2], int changed);
>> +void do_alias_analysis (insn_info *alias_hazards[4],
>> +		   alias_walker *walkers[4],
>> +		   bool load_p);
>> +int get_viable_bases (insn_info *insns[2],
>> +		  vec<base_cand> &base_cands,
>> +		  rtx cand_mems[2],
>> +		  unsigned access_size,
>> +		  bool reversed);
>> +void dump_insn_list (FILE *f, const insn_list_t &l);
> 
> Is there really a need to forward-declare all of these?
> 

Not required. Addressed in latest patch.
>> +
>>  splay_tree_node<access_record *> *
>> -ldp_bb_info::node_alloc (access_record *access)
>> +pair_fusion::node_alloc (access_record *access)
>>  {
>>    using T = splay_tree_node<access_record *>;
>>    void *addr = obstack_alloc (&m_obstack, sizeof (T));
>> @@ -224,7 +401,7 @@ ldp_bb_info::node_alloc (access_record *access)
>>  // Given a mem MEM, if the address has side effects, return a MEM that accesses
>>  // the same address but without the side effects.  Otherwise, return
>>  // MEM unchanged.
>> -static rtx
>> +rtx
> 
> Is there a reason to change the linkage here?  Presumably when this (and
> many other such functions) get moved to a generic file then they will
> only be called from within that file, so they could stay having static
> linkage.
> 

Not required addressed in latest patch.
> That should remove a fair bit of noise from this patch.
> 
>>  drop_writeback (rtx mem)
>>  {
>>    rtx addr = XEXP (mem, 0);
>> @@ -261,8 +438,8 @@ drop_writeback (rtx mem)
>>  // Convenience wrapper around strip_offset that can also look through
>>  // RTX_AUTOINC addresses.  The interface is like strip_offset except we take a
>>  // MEM so that we know the mode of the access.
>> -static rtx
>> -ldp_strip_offset (rtx mem, poly_int64 *offset)
>> +rtx
>> +pair_mem_strip_offset (rtx mem, poly_int64 *offset)
>>  {
>>    rtx addr = XEXP (mem, 0);
>>  
>> @@ -295,7 +472,7 @@ ldp_strip_offset (rtx mem, poly_int64 *offset)
>>  }
>>  
>>  // Return true if X is a PRE_{INC,DEC,MODIFY} rtx.
>> -static bool
>> +bool
> 
> As above: presumably we don't really need to change the linkage here?
> 
>>  any_pre_modify_p (rtx x)
>>  {
>>    const auto code = GET_CODE (x);
>> @@ -303,7 +480,7 @@ any_pre_modify_p (rtx x)
>>  }
>>  
>>  // Return true if X is a POST_{INC,DEC,MODIFY} rtx.
>> -static bool
>> +bool
> 
> Likewise.
> 

Addressed.

>>  any_post_modify_p (rtx x)
>>  {
>>    const auto code = GET_CODE (x);
>> @@ -332,9 +509,15 @@ ldp_operand_mode_ok_p (machine_mode mode)
>>    return reload_completed || mode != TImode;
>>  }
>>  
>> +bool
>> +aarch64_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
>> +{
>> +  return (ldp_operand_mode_ok_p (mode));
> 
> Nit: redundant extra parens around the function call here.
>

Addressed.

 
>> +}
>> +
>>  // Given LFS (load_p, fpsimd_p, size) fields in FIELDS, encode these
>>  // into an integer for use as a hash table key.
>> -static int
>> +int
>>  encode_lfs (lfs_fields fields)
>>  {
>>    int size_log2 = exact_log2 (fields.size);
>> @@ -396,7 +579,7 @@ access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
>>  // MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
>>  // LFS is used as part of the key to the hash table, see track_access.
>>  bool
>> -ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>> +pair_fusion::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>>  {
>>    if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
>>      return false;
>> @@ -412,7 +595,7 @@ ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>>    const machine_mode mem_mode = GET_MODE (mem);
>>    const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
>>  
>> -  // Punt on misaligned offsets.  LDP/STP instructions require offsets to be a
>> +  // Punt on misaligned offsets.  PAIR MEM  instructions require offsets to be a
> 
> Nit: how about "Paired memory accesses" as the generic term (assuming
> this is true for the rs6000 insns too).
> 
>>    // multiple of the access size, and we believe that misaligned offsets on
>>    // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL bases.
>>    if (!multiple_p (offset, mem_size))
>> @@ -438,46 +621,38 @@ ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>>  }
>>  
>>  // Main function to begin pair discovery.  Given a memory access INSN,
>> -// determine whether it could be a candidate for fusing into an ldp/stp,
>> +// determine whether it could be a candidate for fusing into an pair mem,
>>  // and if so, track it in the appropriate data structure for this basic
>>  // block.  LOAD_P is true if the access is a load, and MEM is the mem
>>  // rtx that occurs in INSN.
>>  void
>> -ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>> +pair_fusion::track_access (insn_info *insn, bool load_p, rtx mem)
>>  {
>>    // We can't combine volatile MEMs, so punt on these.
>>    if (MEM_VOLATILE_P (mem))
>>      return;
>>  
>> -  // Ignore writeback accesses if the param says to do so.
>> -  if (!aarch64_ldp_writeback
>> +  // Ignore writeback accesses if the param says to do so
>> +  if (pair_is_writeback ()
> 
> See my comment on the implementation of pair_is_writeback.
> 
>>        && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
>>      return;
>>  
>>    const machine_mode mem_mode = GET_MODE (mem);
>> -  if (!ldp_operand_mode_ok_p (mem_mode))
>> +
>> +  if (!pair_operand_mode_ok_p (mem_mode))
>>      return;
>>  
>>    rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p);
>>  
>> -  // Ignore the access if the register operand isn't suitable for ldp/stp.
>> -  if (load_p
>> -      ? !aarch64_ldp_reg_operand (reg_op, mem_mode)
>> -      : !aarch64_stp_reg_operand (reg_op, mem_mode))
>> +  if (!pair_check_register_operand (load_p, reg_op, mem_mode))
>>      return;
>> -
>>    // We want to segregate FP/SIMD accesses from GPR accesses.
>>    //
>>    // Before RA, we use the modes, noting that stores of constant zero
>>    // operands use GPRs (even in non-integer modes).  After RA, we use
>>    // the hard register numbers.
>> -  const bool fpsimd_op_p
>> -    = reload_completed
>> -    ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
>> -    : (GET_MODE_CLASS (mem_mode) != MODE_INT
>> -       && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
>> -
>> -  // Note ldp_operand_mode_ok_p already rejected VL modes.
>> + const bool fpsimd_op_p = is_fpsimd_op_p (reg_op, mem_mode, load_p);
> 
> Nit: indentation looks off here.
>

Addressed.
 
>> +  // Note pair_operand_mode_ok_p already rejected VL modes.
>>    const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
>>    const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size };
>>  
>> @@ -487,7 +662,7 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>>    poly_int64 mem_off;
>>    rtx addr = XEXP (mem, 0);
>>    const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
>> -  rtx base = ldp_strip_offset (mem, &mem_off);
>> +  rtx base = pair_mem_strip_offset (mem, &mem_off);
> 
> For review purposes: it might be easier if you split off patches that
> just do a simple rename into a separate patch.  So all of the ldp/stp ->
> pair mem renaming could be done in a preparatory patch.  That would
> reduce the noise in reviewing any subsequent patches.
> 
>>    if (!REG_P (base))
>>      return;
>>  
>> @@ -507,7 +682,7 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>>    // accesses until after RA.
>>    //
>>    // As it stands, addresses with offsets in range for LDR but not
>> -  // in range for LDP/STP are currently reloaded inefficiently,
>> +  // in range for PAIR MEM LOAD STORE  are currently reloaded inefficiently,
> 
> How about: "addresses in range for an individual load/store but not
> for a paired access"?
> 
>>    // ending up with a separate base register for each pair.
>>    //
>>    // In theory LRA should make use of
>> @@ -519,8 +694,8 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>>    // that calls targetm.legitimize_address_displacement.
>>    //
>>    // So for now, it's better to punt when we can't be sure that the
>> -  // offset is in range for LDP/STP.  Out-of-range cases can then be
>> -  // handled after RA by the out-of-range LDP/STP peepholes.  Eventually, it
>> +  // offset is in range for PAIR MEM LOAD STORE.  Out-of-range cases can then be
> 
> "in range for the paired access".
> 
>> +  // handled after RA by the out-of-range PAIR MEM  peepholes.  Eventually, it
> 
> Does this part of the comment really apply to rs6000?  I.e. does rs6000
> currently have peepholes for forming vector load/store pair insns?
> 

We dont do peehole for load store pair insns in rs6000.

> I'm also not sure to what extent the part about relaxed memory
> constraints applies to rs6000.  If some parts aren't relevant then
> perhaps a caveat mentioning this in the comment could be a good idea.
> 
>>    // would be nice to handle known out-of-range opportunities in the
>>    // pass itself (for stack accesses, this would be in the post-RA pass).
>>    if (!reload_completed
>> @@ -573,7 +748,7 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>>  	gcc_unreachable (); // Base defs should be unique.
>>      }
>>  
>> -  // Punt on misaligned offsets.  LDP/STP require offsets to be a multiple of
>> +  // Punt on misaligned offsets.  PAIR MEM  require offsets to be a multiple of
> 
> How about "Paired memory accesses require ..." (assuming that is true of
> rs6000).
>

Addressed.
 
>>    // the access size.
>>    if (!multiple_p (mem_off, mem_size))
>>      return;
>> @@ -612,9 +787,9 @@ static bool no_ignore (insn_info *) { return false; }
>>  //
>>  // N.B. we ignore any defs/uses of memory here as we deal with that separately,
>>  // making use of alias disambiguation.
>> -static insn_info *
>> +insn_info *
> 
> Similar comment as on drop_writeback above about changing the linkage.
> 
>>  latest_hazard_before (insn_info *insn, rtx *ignore,
>> -		      insn_info *ignore_insn = nullptr)
>> +		      insn_info *ignore_insn)
>>  {
>>    insn_info *result = nullptr;
>>  
>> @@ -698,7 +873,7 @@ latest_hazard_before (insn_info *insn, rtx *ignore,
>>  //
>>  // N.B. we ignore any defs/uses of memory here as we deal with that separately,
>>  // making use of alias disambiguation.
>> -static insn_info *
>> +insn_info *
>>  first_hazard_after (insn_info *insn, rtx *ignore)
>>  {
>>    insn_info *result = nullptr;
>> @@ -787,7 +962,7 @@ first_hazard_after (insn_info *insn, rtx *ignore)
>>  }
>>  
>>  // Return true iff R1 and R2 overlap.
>> -static bool
>> +bool
>>  ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
>>  {
>>    // If either range is empty, then their intersection is empty.
>> @@ -801,7 +976,7 @@ ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
>>  }
>>  
>>  // Get the range of insns that def feeds.
>> -static insn_range_info get_def_range (def_info *def)
>> +insn_range_info get_def_range (def_info *def)
>>  {
>>    insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn ();
>>    return { def->insn (), last };
>> @@ -809,7 +984,7 @@ static insn_range_info get_def_range (def_info *def)
>>  
>>  // Given a def (of memory), return the downwards range within which we
>>  // can safely move this def.
>> -static insn_range_info
>> +insn_range_info
>>  def_downwards_move_range (def_info *def)
>>  {
>>    auto range = get_def_range (def);
>> @@ -827,7 +1002,7 @@ def_downwards_move_range (def_info *def)
>>  
>>  // Given a def (of memory), return the upwards range within which we can
>>  // safely move this def.
>> -static insn_range_info
>> +insn_range_info
>>  def_upwards_move_range (def_info *def)
>>  {
>>    def_info *prev = def->prev_def ();
>> @@ -974,7 +1149,7 @@ private:
>>  // Given candidate store insns FIRST and SECOND, see if we can re-purpose one
>>  // of them (together with its def of memory) for the stp insn.  If so, return
>>  // that insn.  Otherwise, return null.
>> -static insn_info *
>> +insn_info *
>>  try_repurpose_store (insn_info *first,
>>  		     insn_info *second,
>>  		     const insn_range_info &move_range)
>> @@ -1001,7 +1176,7 @@ try_repurpose_store (insn_info *first,
>>  //
>>  // These are deleted at the end of the pass and uses re-parented appropriately
>>  // at this point.
>> -static rtx
>> +rtx
>>  gen_tombstone (void)
>>  {
>>    return gen_rtx_CLOBBER (VOIDmode,
>> @@ -1034,7 +1209,7 @@ aarch64_operand_mode_for_pair_mode (machine_mode mode)
>>  // REG_EH_REGION note in the resulting list.  FR_EXPR is used to return any
>>  // REG_FRAME_RELATED_EXPR note we find, as these can need special handling in
>>  // combine_reg_notes.
>> -static rtx
>> +rtx
>>  filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
>>  {
>>    for (; note; note = XEXP (note, 1))
>> @@ -1084,7 +1259,7 @@ filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
>>  
>>  // Return the notes that should be attached to a combination of I1 and I2, where
>>  // *I1 < *I2.  LOAD_P is true for loads.
>> -static rtx
>> +rtx
>>  combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
>>  {
>>    // Temporary storage for REG_FRAME_RELATED_EXPR notes.
>> @@ -1133,7 +1308,7 @@ combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
>>  // relative to the initial value of the base register, and output these
>>  // in PATS.  Return an rtx that represents the overall change to the
>>  // base register.
>> -static rtx
>> +rtx
>>  extract_writebacks (bool load_p, rtx pats[2], int changed)
>>  {
>>    rtx base_reg = NULL_RTX;
>> @@ -1150,7 +1325,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
>>        const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
>>  
>>        poly_int64 offset;
>> -      rtx this_base = ldp_strip_offset (mem, &offset);
>> +      rtx this_base = pair_mem_strip_offset (mem, &offset);
>>        gcc_assert (REG_P (this_base));
>>        if (base_reg)
>>  	gcc_assert (rtx_equal_p (base_reg, this_base));
>> @@ -1207,7 +1382,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
>>  // base register.  If there is one, we choose the first such update after
>>  // PAIR_DST that is still in the same BB as our pair.  We return the new def in
>>  // *ADD_DEF and the resulting writeback effect in *WRITEBACK_EFFECT.
>> -static insn_info *
>> +insn_info *
>>  find_trailing_add (insn_info *insns[2],
>>  		   const insn_range_info &pair_range,
>>  		   int initial_writeback,
>> @@ -1286,7 +1461,7 @@ find_trailing_add (insn_info *insns[2],
>>  
>>    off_hwi /= access_size;
>>  
>> -  if (off_hwi < LDP_MIN_IMM || off_hwi > LDP_MAX_IMM)
>> +  if (off_hwi < PAIR_MEM_MIN_IMM || off_hwi > PAIR_MEM_MAX_IMM)
>>      return nullptr;
>>  
>>    auto dump_prefix = [&]()
>> @@ -1328,7 +1503,7 @@ find_trailing_add (insn_info *insns[2],
>>  // We just emitted a tombstone with uid UID, track it in a bitmap for
>>  // this BB so we can easily identify it later when cleaning up tombstones.
>>  void
>> -ldp_bb_info::track_tombstone (int uid)
>> +pair_fusion::track_tombstone (int uid)
>>  {
>>    if (!m_emitted_tombstone)
>>      {
>> @@ -1344,7 +1519,7 @@ ldp_bb_info::track_tombstone (int uid)
>>  
>>  // Reset the debug insn containing USE (the debug insn has been
>>  // optimized away).
>> -static void
>> +void
>>  reset_debug_use (use_info *use)
>>  {
>>    auto use_insn = use->insn ();
>> @@ -1360,7 +1535,7 @@ reset_debug_use (use_info *use)
>>  // is an update of the register BASE by a constant, given by WB_OFFSET,
>>  // and we can preserve debug info by accounting for the change in side
>>  // effects.
>> -static void
>> +void
>>  fixup_debug_use (obstack_watermark &attempt,
>>  		 use_info *use,
>>  		 def_info *def,
>> @@ -1463,7 +1638,7 @@ fixup_debug_use (obstack_watermark &attempt,
>>  // is a trailing add insn which is being folded into the pair to make it
>>  // use writeback addressing, and WRITEBACK_EFFECT is the pattern for
>>  // TRAILING_ADD.
>> -static void
>> +void
>>  fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>>  			       insn_info *pair_dst,
>>  			       insn_info *trailing_add,
>> @@ -1500,7 +1675,7 @@ fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>>  // writeback, and WRITEBACK_EFFECT is an rtx describing the overall update to
>>  // the base register in the final pair (if any).  BASE_REGNO gives the register
>>  // number of the base register used in the final pair.
>> -static void
>> +void
>>  fixup_debug_uses (obstack_watermark &attempt,
>>  		  insn_info *insns[2],
>>  		  rtx orig_rtl[2],
>> @@ -1528,7 +1703,7 @@ fixup_debug_uses (obstack_watermark &attempt,
>>  	  gcc_checking_assert (GET_RTX_CLASS (GET_CODE (XEXP (mem, 0)))
>>  			       == RTX_AUTOINC);
>>  
>> -	  base = ldp_strip_offset (mem, &offset);
>> +	  base = pair_mem_strip_offset (mem, &offset);
>>  	  gcc_checking_assert (REG_P (base) && REGNO (base) == base_regno);
>>  	}
>>        fixup_debug_use (attempt, use, def, base, offset);
>> @@ -1664,7 +1839,7 @@ fixup_debug_uses (obstack_watermark &attempt,
>>  // BASE gives the chosen base candidate for the pair and MOVE_RANGE is
>>  // a singleton range which says where to place the pair.
>>  bool
>> -ldp_bb_info::fuse_pair (bool load_p,
>> +pair_fusion::fuse_pair (bool load_p,
>>  			unsigned access_size,
>>  			int writeback,
>>  			insn_info *i1, insn_info *i2,
>> @@ -1684,6 +1859,9 @@ ldp_bb_info::fuse_pair (bool load_p,
>>  						   insn_change::DELETE);
>>      };
>>  
>> +  if (*i1 > *i2)
>> +    return false;
>> +
> 

This is required for rs6000 where the offset in first instruction
gets greater than second for some testcases in testsuite.
Currently I have removed from latest aarch64 patch and add
in subsequent patches for rs6000.

> I don't think we want this: we want to be able to handle the case
> where the insns in offset order are not in program order.  What goes
> wrong in this case for you?
> 
>>    insn_info *first = (*i1 < *i2) ? i1 : i2;
>>    insn_info *second = (first == i1) ? i2 : i1;
>>  
>> @@ -1800,7 +1978,7 @@ ldp_bb_info::fuse_pair (bool load_p,
>>      {
>>        if (dump_file)
>>  	fprintf (dump_file,
>> -		 "  ldp: i%d has wb but subsequent i%d has non-wb "
>> +		 "  pair_mem: i%d has wb but subsequent i%d has non-wb "
> 
> How about "load pair: "?
> 
>>  		 "update of base (r%d), dropping wb\n",
>>  		 insns[0]->uid (), insns[1]->uid (), base_regno);
>>        gcc_assert (writeback_effect);
>> @@ -1823,7 +2001,7 @@ ldp_bb_info::fuse_pair (bool load_p,
>>      }
>>  
>>    // If either of the original insns had writeback, but the resulting pair insn
>> -  // does not (can happen e.g. in the ldp edge case above, or if the writeback
>> +  // does not (can happen e.g. in the pair mem  edge case above, or if the writeback
> 
> s/pair mem/load pair/, also nit: excess whitespace after mem.
>

Done.
 
>>    // effects cancel out), then drop the def(s) of the base register as
>>    // appropriate.
>>    //
>> @@ -1842,7 +2020,7 @@ ldp_bb_info::fuse_pair (bool load_p,
>>    // update of the base register and try and fold it in to make this into a
>>    // writeback pair.
>>    insn_info *trailing_add = nullptr;
>> -  if (aarch64_ldp_writeback > 1
>> +  if (pair_trailing_writeback_p ()
>>        && !writeback_effect
>>        && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1,
>>  					 XEXP (pats[0], 0), nullptr)
>> @@ -1863,14 +2041,14 @@ ldp_bb_info::fuse_pair (bool load_p,
>>      }
>>  
>>    // Now that we know what base mem we're going to use, check if it's OK
>> -  // with the ldp/stp policy.
>> +  // with the pair mem  policy.
>>    rtx first_mem = XEXP (pats[0], load_p);
>> -  if (!aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
>> -						load_p,
>> -						GET_MODE (first_mem)))
>> +  if (!pair_mem_ok_policy (first_mem,
>> +			  load_p,
>> +			  GET_MODE (first_mem)))
>>      {
>>        if (dump_file)
>> -	fprintf (dump_file, "punting on pair (%d,%d), ldp/stp policy says no\n",
>> +	fprintf (dump_file, "punting on pair (%d,%d), pair mem  policy says no\n",
> 
> Nit: excess whitespace after mem.
> 

Done.

>>  		 i1->uid (), i2->uid ());
>>        return false;
>>      }
>> @@ -1878,20 +2056,12 @@ ldp_bb_info::fuse_pair (bool load_p,
>>    rtx reg_notes = combine_reg_notes (first, second, load_p);
>>  
>>    rtx pair_pat;
>> -  if (writeback_effect)
>> -    {
>> -      auto patvec = gen_rtvec (3, writeback_effect, pats[0], pats[1]);
>> -      pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
>> -    }
>> -  else if (load_p)
>> -    pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
>> -				      XEXP (pats[1], 0),
>> -				      XEXP (pats[0], 1));
>> -  else
>> -    pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
>> -				       XEXP (pats[0], 1),
>> -				       XEXP (pats[1], 1));
>>  
>> +  set_multiword_subreg (first, second, load_p);
>> +
>> +  pair_pat = gen_load_store_pair (pats, writeback_effect, load_p);
>> +  if (pair_pat == NULL_RTX)
>> +    return false;
> 

Done.

> I don't think this function should be allowed to fail (if we want to
> reject a pair, we should do it earlier), so this should either assert
> the result is non-NULL or we should just drop the check altogether.
> 
>>    insn_change *pair_change = nullptr;
>>    auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
>>      rtx_insn *rti = change->insn ()->rtl ();
>> @@ -2075,7 +2245,7 @@ ldp_bb_info::fuse_pair (bool load_p,
>>  
>>  // Return true if STORE_INSN may modify mem rtx MEM.  Make sure we keep
>>  // within our BUDGET for alias analysis.
>> -static bool
>> +bool
>>  store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
>>  {
>>    if (!budget)
>> @@ -2098,7 +2268,7 @@ store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
>>  
>>  // Return true if LOAD may be modified by STORE.  Make sure we keep
>>  // within our BUDGET for alias analysis.
>> -static bool
>> +bool
>>  load_modified_by_store_p (insn_info *load,
>>  			  insn_info *store,
>>  			  int &budget)
>> @@ -2133,15 +2303,6 @@ load_modified_by_store_p (insn_info *load,
>>    return false;
>>  }
>>  
>> -// Virtual base class for load/store walkers used in alias analysis.
>> -struct alias_walker
>> -{
>> -  virtual bool conflict_p (int &budget) const = 0;
>> -  virtual insn_info *insn () const = 0;
>> -  virtual bool valid () const  = 0;
>> -  virtual void advance () = 0;
>> -};
>> -
>>  // Implement some common functionality used by both store_walker
>>  // and load_walker.
>>  template<bool reverse>
>> @@ -2259,13 +2420,13 @@ public:
>>  //
>>  // We try to maintain the invariant that if a walker becomes invalid, we
>>  // set its pointer to null.
>> -static void
>> -do_alias_analysis (insn_info *alias_hazards[4],
>> +void
>> +pair_fusion::do_alias_analysis (insn_info *alias_hazards[4],
>>  		   alias_walker *walkers[4],
>>  		   bool load_p)
>>  {
>>    const int n_walkers = 2 + (2 * !load_p);
>> -  int budget = aarch64_ldp_alias_check_limit;
>> +  int budget = pair_mem_alias_check_limit();
> 
> Nit: space before the open paren, contrib/check_GNU_style.py can help
> catch such things.
>

Done.
 
>>  
>>    auto next_walker = [walkers,n_walkers](int current) -> int {
>>      for (int j = 1; j <= n_walkers; j++)
>> @@ -2350,7 +2511,7 @@ do_alias_analysis (insn_info *alias_hazards[4],
>>  //
>>  // Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback
>>  // addressing.
>> -static int
>> +int
>>  get_viable_bases (insn_info *insns[2],
>>  		  vec<base_cand> &base_cands,
>>  		  rtx cand_mems[2],
>> @@ -2365,7 +2526,7 @@ get_viable_bases (insn_info *insns[2],
>>      {
>>        const bool is_lower = (i == reversed);
>>        poly_int64 poly_off;
>> -      rtx base = ldp_strip_offset (cand_mems[i], &poly_off);
>> +      rtx base = pair_mem_strip_offset (cand_mems[i], &poly_off);
>>        if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC)
>>  	writeback |= (1 << i);
>>  
>> @@ -2373,7 +2534,7 @@ get_viable_bases (insn_info *insns[2],
>>  	continue;
>>  
>>        // Punt on accesses relative to eliminable regs.  See the comment in
>> -      // ldp_bb_info::track_access for a detailed explanation of this.
>> +      // pair_fusion::track_access for a detailed explanation of this.
>>        if (!reload_completed
>>  	  && (REGNO (base) == FRAME_POINTER_REGNUM
>>  	      || REGNO (base) == ARG_POINTER_REGNUM))
>> @@ -2397,7 +2558,7 @@ get_viable_bases (insn_info *insns[2],
>>        if (!is_lower)
>>  	base_off--;
>>  
>> -      if (base_off < LDP_MIN_IMM || base_off > LDP_MAX_IMM)
>> +      if (base_off < PAIR_MEM_MIN_IMM || base_off > PAIR_MEM_MAX_IMM)
> 
> As mentioned at the top of the review, these checks should be replaced
> with a call to a virtual function that validates the offset instead of
> just renaming the aarch64 constants.
> 
>>  	continue;
>>  
>>        use_info *use = find_access (insns[i]->uses (), REGNO (base));
>> @@ -2454,12 +2615,12 @@ get_viable_bases (insn_info *insns[2],
>>  }
>>  
>>  // Given two adjacent memory accesses of the same size, I1 and I2, try
>> -// and see if we can merge them into a ldp or stp.
>> +// and see if we can merge them into a pair mem load and store.
> 
> How about: "into a paired access"?
> 

Done.

>>  //
>>  // ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true
>>  // if the accesses are both loads, otherwise they are both stores.
>>  bool
>> -ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
>> +pair_fusion::try_fuse_pair (bool load_p, unsigned access_size,
>>  			    insn_info *i1, insn_info *i2)
>>  {
>>    if (dump_file)
>> @@ -2490,11 +2651,21 @@ ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
>>        reg_ops[i] = XEXP (pats[i], !load_p);
>>      }
>>  
>> +  if (!load_p && !fuseable_store_p (i1, i2))
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n",
>> +		 insns[0]->uid (), insns[1]->uid ());
>> +      return false;
>> +    }
>> +
> 
> As above, please can you explain under which circumstances you plan to
> reject store pairs with this hook?  In any case I think we should defer
> adding hooks that are only needed for rs6000 to a later patch.
>

We disable certain store fusion in rs6000 and this case
is required to disable them and currently I have removed
in aarch64 patch and later add in rs6000 patches and when
seperate the file.

 
>> +
> 
> Nit: excess whitespace.
> 
>>    if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
>>      {
>>        if (dump_file)
>>  	fprintf (dump_file,
>> -		 "punting on ldp due to reg conflcits (%d,%d)\n",
>> +		 "punting on pair mem load  due to reg conflcits (%d,%d)\n",
>>  		 insns[0]->uid (), insns[1]->uid ());
>>        return false;
>>      }
>> @@ -2787,7 +2958,7 @@ ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
>>  		    i1, i2, *base, range);
>>  }
>>  
>> -static void
>> +void
>>  dump_insn_list (FILE *f, const insn_list_t &l)
>>  {
>>    fprintf (f, "(");
>> @@ -2843,7 +3014,7 @@ debug (const insn_list_t &l)
>>  // we can't re-order them anyway, so provided earlier passes have cleaned up
>>  // redundant loads, we shouldn't miss opportunities by doing this.
>>  void
>> -ldp_bb_info::merge_pairs (insn_list_t &left_list,
>> +pair_fusion::merge_pairs (insn_list_t &left_list,
>>  			  insn_list_t &right_list,
>>  			  bool load_p,
>>  			  unsigned access_size)
>> @@ -2890,8 +3061,8 @@ ldp_bb_info::merge_pairs (insn_list_t &left_list,
>>  // of accesses.  If we find two sets of adjacent accesses, call
>>  // merge_pairs.
>>  void
>> -ldp_bb_info::transform_for_base (int encoded_lfs,
>> -				 access_group &group)
>> +aarch64_pair_fusion::transform_for_base (int encoded_lfs,
>> +					 access_group &group)
>>  {
>>    const auto lfs = decode_lfs (encoded_lfs);
>>    const unsigned access_size = lfs.size;
>> @@ -2919,7 +3090,7 @@ ldp_bb_info::transform_for_base (int encoded_lfs,
>>  // and remove all the tombstone insns, being sure to reparent any uses
>>  // of mem to previous defs when we do this.
>>  void
>> -ldp_bb_info::cleanup_tombstones ()
>> +pair_fusion::cleanup_tombstones ()
>>  {
>>    // No need to do anything if we didn't emit a tombstone insn for this BB.
>>    if (!m_emitted_tombstone)
>> @@ -2947,7 +3118,7 @@ ldp_bb_info::cleanup_tombstones ()
>>  
>>  template<typename Map>
>>  void
>> -ldp_bb_info::traverse_base_map (Map &map)
>> +pair_fusion::traverse_base_map (Map &map)
>>  {
>>    for (auto kv : map)
>>      {
>> @@ -2958,7 +3129,7 @@ ldp_bb_info::traverse_base_map (Map &map)
>>  }
>>  
>>  void
>> -ldp_bb_info::transform ()
>> +pair_fusion::transform ()
>>  {
>>    traverse_base_map (expr_map);
>>    traverse_base_map (def_map);
>> @@ -3174,7 +3345,9 @@ void ldp_fusion_bb (bb_info *bb)
>>    const bool track_stores
>>      = aarch64_tune_params.stp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
>>  
>> -  ldp_bb_info bb_state (bb);
>> +  pair_fusion *bb_state;
>> +  aarch64_pair_fusion derived (bb);
>> +  bb_state = &derived;
> 
> See my comment towards the top of this review about the high-level
> structure: I think this function (ldp_fusion_bb) should be a member of
> an outer class where all the virtual functions live, and then the
> ldp_bb_info class shouldn't need to be specialised to a given target
> (but we can just pass a reference or pointer to the outer class).
>

Done.

 
>>  
>>    for (auto insn : bb->nondebug_insns ())
>>      {
>> @@ -3194,13 +3367,13 @@ void ldp_fusion_bb (bb_info *bb)
>>  	continue;
>>  
>>        if (track_stores && MEM_P (XEXP (pat, 0)))
>> -	bb_state.track_access (insn, false, XEXP (pat, 0));
>> +	bb_state->track_access (insn, false, XEXP (pat, 0));
>>        else if (track_loads && MEM_P (XEXP (pat, 1)))
>> -	bb_state.track_access (insn, true, XEXP (pat, 1));
>> +	bb_state->track_access (insn, true, XEXP (pat, 1));
>>      }
>>  
>> -  bb_state.transform ();
>> -  bb_state.cleanup_tombstones ();
>> +  bb_state->transform ();
>> +  bb_state->cleanup_tombstones ();
>>  }
>>  
>>  void ldp_fusion ()
>> -- 
>> 2.39.3
>>
> 
> Thanks,
> Alex
Thanks & Regards
Ajit
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 22ed95eb743..2ef22ff1e96 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -40,10 +40,10 @@ 
 
 using namespace rtl_ssa;
 
-static constexpr HOST_WIDE_INT LDP_IMM_BITS = 7;
-static constexpr HOST_WIDE_INT LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1));
-static constexpr HOST_WIDE_INT LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1;
-static constexpr HOST_WIDE_INT LDP_MIN_IMM = -LDP_MAX_IMM - 1;
+static constexpr HOST_WIDE_INT PAIR_MEM_IMM_BITS = 7;
+static constexpr HOST_WIDE_INT PAIR_MEM_IMM_SIGN_BIT = (1 << (PAIR_MEM_IMM_BITS - 1));
+static constexpr HOST_WIDE_INT PAIR_MEM_MAX_IMM = PAIR_MEM_IMM_SIGN_BIT - 1;
+static constexpr HOST_WIDE_INT PAIR_MEM_MIN_IMM = -PAIR_MEM_MAX_IMM - 1;
 
 // We pack these fields (load_p, fpsimd_p, and size) into an integer
 // (LFS) which we use as part of the key into the main hash tables.
@@ -138,8 +138,18 @@  struct alt_base
   poly_int64 offset;
 };
 
+// Virtual base class for load/store walkers used in alias analysis.
+struct alias_walker
+{
+  virtual bool conflict_p (int &budget) const = 0;
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const  = 0;
+  virtual void advance () = 0;
+};
+
+
 // State used by the pass for a given basic block.
-struct ldp_bb_info
+struct pair_fusion
 {
   using def_hash = nofree_ptr_hash<def_info>;
   using expr_key_t = pair_hash<tree_operand_hash, int_hash<int, -1, -2>>;
@@ -161,13 +171,13 @@  struct ldp_bb_info
   static const size_t obstack_alignment = sizeof (void *);
   bb_info *m_bb;
 
-  ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
+  pair_fusion (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
   {
     obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
 				obstack_alignment, obstack_chunk_alloc,
 				obstack_chunk_free);
   }
-  ~ldp_bb_info ()
+  ~pair_fusion ()
   {
     obstack_free (&m_obstack, nullptr);
 
@@ -177,10 +187,50 @@  struct ldp_bb_info
 	bitmap_obstack_release (&m_bitmap_obstack);
       }
   }
+  void track_access (insn_info *, bool load, rtx mem);
+  void transform ();
+  void cleanup_tombstones ();
+  virtual void set_multiword_subreg (insn_info *i1, insn_info *i2,
+				     bool load_p) = 0;
+  virtual rtx gen_load_store_pair (rtx *pats,  rtx writeback,
+				   bool load_p) = 0;
+  void merge_pairs (insn_list_t &, insn_list_t &,
+		    bool load_p, unsigned access_size);
+  virtual void transform_for_base (int load_size, access_group &group) = 0;
+
+  bool try_fuse_pair (bool load_p, unsigned access_size,
+			     insn_info *i1, insn_info *i2);
+
+  bool fuse_pair (bool load_p, unsigned access_size,
+		  int writeback,
+		  insn_info *i1, insn_info *i2,
+		  base_cand &base,
+		  const insn_range_info &move_range);
+
+  void do_alias_analysis (insn_info *alias_hazards[4],
+			  alias_walker *walkers[4],
+			  bool load_p);
+
+  void track_tombstone (int uid);
+
+  bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
+
+  virtual bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode,
+			       bool load_p) = 0;
+
+  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
+  virtual bool pair_trailing_writeback_p () = 0;
+  virtual bool pair_check_register_operand (bool load_p, rtx reg_op,
+					    machine_mode mem_mode) = 0;
+  virtual int pair_mem_alias_check_limit () = 0;
+  virtual bool pair_is_writeback () = 0 ;
+  virtual bool pair_mem_ok_policy (rtx first_mem, bool load_p,
+				   machine_mode mode) = 0;
+  virtual bool fuseable_store_p (insn_info *i1, insn_info *i2) = 0;
+  virtual bool fuseable_load_p (insn_info *info) = 0;
 
-  inline void track_access (insn_info *, bool load, rtx mem);
-  inline void transform ();
-  inline void cleanup_tombstones ();
+  template<typename Map>
+    void traverse_base_map (Map &map);
 
 private:
   obstack m_obstack;
@@ -191,30 +241,157 @@  private:
   bool m_emitted_tombstone;
 
   inline splay_tree_node<access_record *> *node_alloc (access_record *);
+};
 
-  template<typename Map>
-  inline void traverse_base_map (Map &map);
-  inline void transform_for_base (int load_size, access_group &group);
-
-  inline void merge_pairs (insn_list_t &, insn_list_t &,
-			   bool load_p, unsigned access_size);
+struct  aarch64_pair_fusion : public pair_fusion
+{
+public:
+  aarch64_pair_fusion (bb_info *bb) : pair_fusion (bb) {};
+  bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode, bool load_p)
+  {
+    const bool fpsimd_op_p
+      = reload_completed
+      ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
+      : (GET_MODE_CLASS (mem_mode) != MODE_INT
+	 && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
+    return fpsimd_op_p;
+  }
 
-  inline bool try_fuse_pair (bool load_p, unsigned access_size,
-			     insn_info *i1, insn_info *i2);
+  bool pair_mem_ok_policy (rtx first_mem, bool load_p, machine_mode mode)
+  {
+    return aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
+						     load_p,
+						     mode);
+  }
+  bool pair_operand_mode_ok_p (machine_mode mode);
 
-  inline bool fuse_pair (bool load_p, unsigned access_size,
-			 int writeback,
-			 insn_info *i1, insn_info *i2,
-			 base_cand &base,
-			 const insn_range_info &move_range);
+  void transform_for_base (int encoded_lfs,
+			   access_group &group);
+  rtx gen_load_store_pair (rtx *pats,
+			   rtx writeback,
+			   bool load_p)
+  {
+    rtx pair_pat;
 
-  inline void track_tombstone (int uid);
+    if (writeback)
+      {
+	auto patvec = gen_rtvec (3, writeback, pats[0], pats[1]);
+	pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
+      }
+    else if (load_p)
+      pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
+					XEXP (pats[1], 0),
+					XEXP (pats[0], 1));
+    else
+      pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
+					 XEXP (pats[0], 1),
+					 XEXP (pats[1], 1));
+     return pair_pat;
+  }
 
-  inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
+  void set_multiword_subreg (insn_info *i1, insn_info *i2, bool load_p)
+  {
+    if (i1 || i2 || load_p)
+      return;
+    return;
+  }
+  bool pair_trailing_writeback_p  ()
+  {
+    return aarch64_ldp_writeback > 1;
+  }
+  bool pair_check_register_operand (bool load_p, rtx reg_op, machine_mode mem_mode)
+  {
+    return  (load_p
+	     ? aarch64_ldp_reg_operand (reg_op, mem_mode)
+	     : aarch64_stp_reg_operand (reg_op, mem_mode));
+  }
+  int pair_mem_alias_check_limit ()
+  {
+    return aarch64_ldp_alias_check_limit;
+  }
+  bool fuseable_store_p (insn_info *i1, insn_info *i2) { return i1 || i2;}
+  bool fuseable_load_p (insn_info *insn) { return insn;}
+  bool pair_is_writeback ()
+  {
+    return !aarch64_ldp_writeback;
+  }
 };
 
+bool
+store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget);
+bool load_modified_by_store_p (insn_info *load,
+			  insn_info *store,
+			  int &budget);
+extern insn_info *
+try_repurpose_store (insn_info *first,
+		     insn_info *second,
+		     const insn_range_info &move_range);
+
+void reset_debug_use (use_info *use);
+
+extern void
+fixup_debug_uses (obstack_watermark &attempt,
+		  insn_info *insns[2],
+		  rtx orig_rtl[2],
+		  insn_info *pair_dst,
+		  insn_info *trailing_add,
+		  bool load_p,
+		  int writeback,
+		  rtx writeback_effect,
+		  unsigned base_regno);
+
+void
+fixup_debug_uses_trailing_add (obstack_watermark &attempt,
+			       insn_info *pair_dst,
+			       insn_info *trailing_add,
+			       rtx writeback_effect);
+
+
+extern void
+fixup_debug_use (obstack_watermark &attempt,
+		 use_info *use,
+		 def_info *def,
+		 rtx base,
+		 poly_int64 wb_offset);
+
+extern insn_info *
+find_trailing_add (insn_info *insns[2],
+		   const insn_range_info &pair_range,
+		   int initial_writeback,
+		   rtx *writeback_effect,
+		   def_info **add_def,
+		   def_info *base_def,
+		   poly_int64 initial_offset,
+		   unsigned access_size);
+
+rtx drop_writeback (rtx mem);
+rtx pair_mem_strip_offset (rtx mem, poly_int64 *offset);
+bool any_pre_modify_p (rtx x);
+bool any_post_modify_p (rtx x);
+int encode_lfs (lfs_fields fields);
+extern insn_info * latest_hazard_before (insn_info *insn, rtx *ignore,
+		      insn_info *ignore_insn = nullptr);
+insn_info * first_hazard_after (insn_info *insn, rtx *ignore);
+bool ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2);
+insn_range_info get_def_range (def_info *def);
+insn_range_info def_downwards_move_range (def_info *def);
+insn_range_info def_upwards_move_range (def_info *def);
+rtx gen_tombstone (void);
+rtx filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr);
+rtx combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p);
+rtx extract_writebacks (bool load_p, rtx pats[2], int changed);
+void do_alias_analysis (insn_info *alias_hazards[4],
+		   alias_walker *walkers[4],
+		   bool load_p);
+int get_viable_bases (insn_info *insns[2],
+		  vec<base_cand> &base_cands,
+		  rtx cand_mems[2],
+		  unsigned access_size,
+		  bool reversed);
+void dump_insn_list (FILE *f, const insn_list_t &l);
+
 splay_tree_node<access_record *> *
-ldp_bb_info::node_alloc (access_record *access)
+pair_fusion::node_alloc (access_record *access)
 {
   using T = splay_tree_node<access_record *>;
   void *addr = obstack_alloc (&m_obstack, sizeof (T));
@@ -224,7 +401,7 @@  ldp_bb_info::node_alloc (access_record *access)
 // Given a mem MEM, if the address has side effects, return a MEM that accesses
 // the same address but without the side effects.  Otherwise, return
 // MEM unchanged.
-static rtx
+rtx
 drop_writeback (rtx mem)
 {
   rtx addr = XEXP (mem, 0);
@@ -261,8 +438,8 @@  drop_writeback (rtx mem)
 // Convenience wrapper around strip_offset that can also look through
 // RTX_AUTOINC addresses.  The interface is like strip_offset except we take a
 // MEM so that we know the mode of the access.
-static rtx
-ldp_strip_offset (rtx mem, poly_int64 *offset)
+rtx
+pair_mem_strip_offset (rtx mem, poly_int64 *offset)
 {
   rtx addr = XEXP (mem, 0);
 
@@ -295,7 +472,7 @@  ldp_strip_offset (rtx mem, poly_int64 *offset)
 }
 
 // Return true if X is a PRE_{INC,DEC,MODIFY} rtx.
-static bool
+bool
 any_pre_modify_p (rtx x)
 {
   const auto code = GET_CODE (x);
@@ -303,7 +480,7 @@  any_pre_modify_p (rtx x)
 }
 
 // Return true if X is a POST_{INC,DEC,MODIFY} rtx.
-static bool
+bool
 any_post_modify_p (rtx x)
 {
   const auto code = GET_CODE (x);
@@ -332,9 +509,15 @@  ldp_operand_mode_ok_p (machine_mode mode)
   return reload_completed || mode != TImode;
 }
 
+bool
+aarch64_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
+{
+  return (ldp_operand_mode_ok_p (mode));
+}
+
 // Given LFS (load_p, fpsimd_p, size) fields in FIELDS, encode these
 // into an integer for use as a hash table key.
-static int
+int
 encode_lfs (lfs_fields fields)
 {
   int size_log2 = exact_log2 (fields.size);
@@ -396,7 +579,7 @@  access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
 // MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
 // LFS is used as part of the key to the hash table, see track_access.
 bool
-ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
+pair_fusion::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
 {
   if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
     return false;
@@ -412,7 +595,7 @@  ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
   const machine_mode mem_mode = GET_MODE (mem);
   const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
 
-  // Punt on misaligned offsets.  LDP/STP instructions require offsets to be a
+  // Punt on misaligned offsets.  PAIR MEM  instructions require offsets to be a
   // multiple of the access size, and we believe that misaligned offsets on
   // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL bases.
   if (!multiple_p (offset, mem_size))
@@ -438,46 +621,38 @@  ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
 }
 
 // Main function to begin pair discovery.  Given a memory access INSN,
-// determine whether it could be a candidate for fusing into an ldp/stp,
+// determine whether it could be a candidate for fusing into an pair mem,
 // and if so, track it in the appropriate data structure for this basic
 // block.  LOAD_P is true if the access is a load, and MEM is the mem
 // rtx that occurs in INSN.
 void
-ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
+pair_fusion::track_access (insn_info *insn, bool load_p, rtx mem)
 {
   // We can't combine volatile MEMs, so punt on these.
   if (MEM_VOLATILE_P (mem))
     return;
 
-  // Ignore writeback accesses if the param says to do so.
-  if (!aarch64_ldp_writeback
+  // Ignore writeback accesses if the param says to do so
+  if (pair_is_writeback ()
       && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
     return;
 
   const machine_mode mem_mode = GET_MODE (mem);
-  if (!ldp_operand_mode_ok_p (mem_mode))
+
+  if (!pair_operand_mode_ok_p (mem_mode))
     return;
 
   rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p);
 
-  // Ignore the access if the register operand isn't suitable for ldp/stp.
-  if (load_p
-      ? !aarch64_ldp_reg_operand (reg_op, mem_mode)
-      : !aarch64_stp_reg_operand (reg_op, mem_mode))
+  if (!pair_check_register_operand (load_p, reg_op, mem_mode))
     return;
-
   // We want to segregate FP/SIMD accesses from GPR accesses.
   //
   // Before RA, we use the modes, noting that stores of constant zero
   // operands use GPRs (even in non-integer modes).  After RA, we use
   // the hard register numbers.
-  const bool fpsimd_op_p
-    = reload_completed
-    ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
-    : (GET_MODE_CLASS (mem_mode) != MODE_INT
-       && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
-
-  // Note ldp_operand_mode_ok_p already rejected VL modes.
+ const bool fpsimd_op_p = is_fpsimd_op_p (reg_op, mem_mode, load_p);
+  // Note pair_operand_mode_ok_p already rejected VL modes.
   const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
   const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size };
 
@@ -487,7 +662,7 @@  ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
   poly_int64 mem_off;
   rtx addr = XEXP (mem, 0);
   const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
-  rtx base = ldp_strip_offset (mem, &mem_off);
+  rtx base = pair_mem_strip_offset (mem, &mem_off);
   if (!REG_P (base))
     return;
 
@@ -507,7 +682,7 @@  ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
   // accesses until after RA.
   //
   // As it stands, addresses with offsets in range for LDR but not
-  // in range for LDP/STP are currently reloaded inefficiently,
+  // in range for PAIR MEM LOAD STORE  are currently reloaded inefficiently,
   // ending up with a separate base register for each pair.
   //
   // In theory LRA should make use of
@@ -519,8 +694,8 @@  ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
   // that calls targetm.legitimize_address_displacement.
   //
   // So for now, it's better to punt when we can't be sure that the
-  // offset is in range for LDP/STP.  Out-of-range cases can then be
-  // handled after RA by the out-of-range LDP/STP peepholes.  Eventually, it
+  // offset is in range for PAIR MEM LOAD STORE.  Out-of-range cases can then be
+  // handled after RA by the out-of-range PAIR MEM  peepholes.  Eventually, it
   // would be nice to handle known out-of-range opportunities in the
   // pass itself (for stack accesses, this would be in the post-RA pass).
   if (!reload_completed
@@ -573,7 +748,7 @@  ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
 	gcc_unreachable (); // Base defs should be unique.
     }
 
-  // Punt on misaligned offsets.  LDP/STP require offsets to be a multiple of
+  // Punt on misaligned offsets.  PAIR MEM  require offsets to be a multiple of
   // the access size.
   if (!multiple_p (mem_off, mem_size))
     return;
@@ -612,9 +787,9 @@  static bool no_ignore (insn_info *) { return false; }
 //
 // N.B. we ignore any defs/uses of memory here as we deal with that separately,
 // making use of alias disambiguation.
-static insn_info *
+insn_info *
 latest_hazard_before (insn_info *insn, rtx *ignore,
-		      insn_info *ignore_insn = nullptr)
+		      insn_info *ignore_insn)
 {
   insn_info *result = nullptr;
 
@@ -698,7 +873,7 @@  latest_hazard_before (insn_info *insn, rtx *ignore,
 //
 // N.B. we ignore any defs/uses of memory here as we deal with that separately,
 // making use of alias disambiguation.
-static insn_info *
+insn_info *
 first_hazard_after (insn_info *insn, rtx *ignore)
 {
   insn_info *result = nullptr;
@@ -787,7 +962,7 @@  first_hazard_after (insn_info *insn, rtx *ignore)
 }
 
 // Return true iff R1 and R2 overlap.
-static bool
+bool
 ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
 {
   // If either range is empty, then their intersection is empty.
@@ -801,7 +976,7 @@  ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
 }
 
 // Get the range of insns that def feeds.
-static insn_range_info get_def_range (def_info *def)
+insn_range_info get_def_range (def_info *def)
 {
   insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn ();
   return { def->insn (), last };
@@ -809,7 +984,7 @@  static insn_range_info get_def_range (def_info *def)
 
 // Given a def (of memory), return the downwards range within which we
 // can safely move this def.
-static insn_range_info
+insn_range_info
 def_downwards_move_range (def_info *def)
 {
   auto range = get_def_range (def);
@@ -827,7 +1002,7 @@  def_downwards_move_range (def_info *def)
 
 // Given a def (of memory), return the upwards range within which we can
 // safely move this def.
-static insn_range_info
+insn_range_info
 def_upwards_move_range (def_info *def)
 {
   def_info *prev = def->prev_def ();
@@ -974,7 +1149,7 @@  private:
 // Given candidate store insns FIRST and SECOND, see if we can re-purpose one
 // of them (together with its def of memory) for the stp insn.  If so, return
 // that insn.  Otherwise, return null.
-static insn_info *
+insn_info *
 try_repurpose_store (insn_info *first,
 		     insn_info *second,
 		     const insn_range_info &move_range)
@@ -1001,7 +1176,7 @@  try_repurpose_store (insn_info *first,
 //
 // These are deleted at the end of the pass and uses re-parented appropriately
 // at this point.
-static rtx
+rtx
 gen_tombstone (void)
 {
   return gen_rtx_CLOBBER (VOIDmode,
@@ -1034,7 +1209,7 @@  aarch64_operand_mode_for_pair_mode (machine_mode mode)
 // REG_EH_REGION note in the resulting list.  FR_EXPR is used to return any
 // REG_FRAME_RELATED_EXPR note we find, as these can need special handling in
 // combine_reg_notes.
-static rtx
+rtx
 filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
 {
   for (; note; note = XEXP (note, 1))
@@ -1084,7 +1259,7 @@  filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
 
 // Return the notes that should be attached to a combination of I1 and I2, where
 // *I1 < *I2.  LOAD_P is true for loads.
-static rtx
+rtx
 combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
 {
   // Temporary storage for REG_FRAME_RELATED_EXPR notes.
@@ -1133,7 +1308,7 @@  combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
 // relative to the initial value of the base register, and output these
 // in PATS.  Return an rtx that represents the overall change to the
 // base register.
-static rtx
+rtx
 extract_writebacks (bool load_p, rtx pats[2], int changed)
 {
   rtx base_reg = NULL_RTX;
@@ -1150,7 +1325,7 @@  extract_writebacks (bool load_p, rtx pats[2], int changed)
       const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
 
       poly_int64 offset;
-      rtx this_base = ldp_strip_offset (mem, &offset);
+      rtx this_base = pair_mem_strip_offset (mem, &offset);
       gcc_assert (REG_P (this_base));
       if (base_reg)
 	gcc_assert (rtx_equal_p (base_reg, this_base));
@@ -1207,7 +1382,7 @@  extract_writebacks (bool load_p, rtx pats[2], int changed)
 // base register.  If there is one, we choose the first such update after
 // PAIR_DST that is still in the same BB as our pair.  We return the new def in
 // *ADD_DEF and the resulting writeback effect in *WRITEBACK_EFFECT.
-static insn_info *
+insn_info *
 find_trailing_add (insn_info *insns[2],
 		   const insn_range_info &pair_range,
 		   int initial_writeback,
@@ -1286,7 +1461,7 @@  find_trailing_add (insn_info *insns[2],
 
   off_hwi /= access_size;
 
-  if (off_hwi < LDP_MIN_IMM || off_hwi > LDP_MAX_IMM)
+  if (off_hwi < PAIR_MEM_MIN_IMM || off_hwi > PAIR_MEM_MAX_IMM)
     return nullptr;
 
   auto dump_prefix = [&]()
@@ -1328,7 +1503,7 @@  find_trailing_add (insn_info *insns[2],
 // We just emitted a tombstone with uid UID, track it in a bitmap for
 // this BB so we can easily identify it later when cleaning up tombstones.
 void
-ldp_bb_info::track_tombstone (int uid)
+pair_fusion::track_tombstone (int uid)
 {
   if (!m_emitted_tombstone)
     {
@@ -1344,7 +1519,7 @@  ldp_bb_info::track_tombstone (int uid)
 
 // Reset the debug insn containing USE (the debug insn has been
 // optimized away).
-static void
+void
 reset_debug_use (use_info *use)
 {
   auto use_insn = use->insn ();
@@ -1360,7 +1535,7 @@  reset_debug_use (use_info *use)
 // is an update of the register BASE by a constant, given by WB_OFFSET,
 // and we can preserve debug info by accounting for the change in side
 // effects.
-static void
+void
 fixup_debug_use (obstack_watermark &attempt,
 		 use_info *use,
 		 def_info *def,
@@ -1463,7 +1638,7 @@  fixup_debug_use (obstack_watermark &attempt,
 // is a trailing add insn which is being folded into the pair to make it
 // use writeback addressing, and WRITEBACK_EFFECT is the pattern for
 // TRAILING_ADD.
-static void
+void
 fixup_debug_uses_trailing_add (obstack_watermark &attempt,
 			       insn_info *pair_dst,
 			       insn_info *trailing_add,
@@ -1500,7 +1675,7 @@  fixup_debug_uses_trailing_add (obstack_watermark &attempt,
 // writeback, and WRITEBACK_EFFECT is an rtx describing the overall update to
 // the base register in the final pair (if any).  BASE_REGNO gives the register
 // number of the base register used in the final pair.
-static void
+void
 fixup_debug_uses (obstack_watermark &attempt,
 		  insn_info *insns[2],
 		  rtx orig_rtl[2],
@@ -1528,7 +1703,7 @@  fixup_debug_uses (obstack_watermark &attempt,
 	  gcc_checking_assert (GET_RTX_CLASS (GET_CODE (XEXP (mem, 0)))
 			       == RTX_AUTOINC);
 
-	  base = ldp_strip_offset (mem, &offset);
+	  base = pair_mem_strip_offset (mem, &offset);
 	  gcc_checking_assert (REG_P (base) && REGNO (base) == base_regno);
 	}
       fixup_debug_use (attempt, use, def, base, offset);
@@ -1664,7 +1839,7 @@  fixup_debug_uses (obstack_watermark &attempt,
 // BASE gives the chosen base candidate for the pair and MOVE_RANGE is
 // a singleton range which says where to place the pair.
 bool
-ldp_bb_info::fuse_pair (bool load_p,
+pair_fusion::fuse_pair (bool load_p,
 			unsigned access_size,
 			int writeback,
 			insn_info *i1, insn_info *i2,
@@ -1684,6 +1859,9 @@  ldp_bb_info::fuse_pair (bool load_p,
 						   insn_change::DELETE);
     };
 
+  if (*i1 > *i2)
+    return false;
+
   insn_info *first = (*i1 < *i2) ? i1 : i2;
   insn_info *second = (first == i1) ? i2 : i1;
 
@@ -1800,7 +1978,7 @@  ldp_bb_info::fuse_pair (bool load_p,
     {
       if (dump_file)
 	fprintf (dump_file,
-		 "  ldp: i%d has wb but subsequent i%d has non-wb "
+		 "  pair_mem: i%d has wb but subsequent i%d has non-wb "
 		 "update of base (r%d), dropping wb\n",
 		 insns[0]->uid (), insns[1]->uid (), base_regno);
       gcc_assert (writeback_effect);
@@ -1823,7 +2001,7 @@  ldp_bb_info::fuse_pair (bool load_p,
     }
 
   // If either of the original insns had writeback, but the resulting pair insn
-  // does not (can happen e.g. in the ldp edge case above, or if the writeback
+  // does not (can happen e.g. in the pair mem  edge case above, or if the writeback
   // effects cancel out), then drop the def(s) of the base register as
   // appropriate.
   //
@@ -1842,7 +2020,7 @@  ldp_bb_info::fuse_pair (bool load_p,
   // update of the base register and try and fold it in to make this into a
   // writeback pair.
   insn_info *trailing_add = nullptr;
-  if (aarch64_ldp_writeback > 1
+  if (pair_trailing_writeback_p ()
       && !writeback_effect
       && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1,
 					 XEXP (pats[0], 0), nullptr)
@@ -1863,14 +2041,14 @@  ldp_bb_info::fuse_pair (bool load_p,
     }
 
   // Now that we know what base mem we're going to use, check if it's OK
-  // with the ldp/stp policy.
+  // with the pair mem  policy.
   rtx first_mem = XEXP (pats[0], load_p);
-  if (!aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
-						load_p,
-						GET_MODE (first_mem)))
+  if (!pair_mem_ok_policy (first_mem,
+			  load_p,
+			  GET_MODE (first_mem)))
     {
       if (dump_file)
-	fprintf (dump_file, "punting on pair (%d,%d), ldp/stp policy says no\n",
+	fprintf (dump_file, "punting on pair (%d,%d), pair mem  policy says no\n",
 		 i1->uid (), i2->uid ());
       return false;
     }
@@ -1878,20 +2056,12 @@  ldp_bb_info::fuse_pair (bool load_p,
   rtx reg_notes = combine_reg_notes (first, second, load_p);
 
   rtx pair_pat;
-  if (writeback_effect)
-    {
-      auto patvec = gen_rtvec (3, writeback_effect, pats[0], pats[1]);
-      pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
-    }
-  else if (load_p)
-    pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
-				      XEXP (pats[1], 0),
-				      XEXP (pats[0], 1));
-  else
-    pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
-				       XEXP (pats[0], 1),
-				       XEXP (pats[1], 1));
 
+  set_multiword_subreg (first, second, load_p);
+
+  pair_pat = gen_load_store_pair (pats, writeback_effect, load_p);
+  if (pair_pat == NULL_RTX)
+    return false;
   insn_change *pair_change = nullptr;
   auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
     rtx_insn *rti = change->insn ()->rtl ();
@@ -2075,7 +2245,7 @@  ldp_bb_info::fuse_pair (bool load_p,
 
 // Return true if STORE_INSN may modify mem rtx MEM.  Make sure we keep
 // within our BUDGET for alias analysis.
-static bool
+bool
 store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
 {
   if (!budget)
@@ -2098,7 +2268,7 @@  store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
 
 // Return true if LOAD may be modified by STORE.  Make sure we keep
 // within our BUDGET for alias analysis.
-static bool
+bool
 load_modified_by_store_p (insn_info *load,
 			  insn_info *store,
 			  int &budget)
@@ -2133,15 +2303,6 @@  load_modified_by_store_p (insn_info *load,
   return false;
 }
 
-// Virtual base class for load/store walkers used in alias analysis.
-struct alias_walker
-{
-  virtual bool conflict_p (int &budget) const = 0;
-  virtual insn_info *insn () const = 0;
-  virtual bool valid () const  = 0;
-  virtual void advance () = 0;
-};
-
 // Implement some common functionality used by both store_walker
 // and load_walker.
 template<bool reverse>
@@ -2259,13 +2420,13 @@  public:
 //
 // We try to maintain the invariant that if a walker becomes invalid, we
 // set its pointer to null.
-static void
-do_alias_analysis (insn_info *alias_hazards[4],
+void
+pair_fusion::do_alias_analysis (insn_info *alias_hazards[4],
 		   alias_walker *walkers[4],
 		   bool load_p)
 {
   const int n_walkers = 2 + (2 * !load_p);
-  int budget = aarch64_ldp_alias_check_limit;
+  int budget = pair_mem_alias_check_limit();
 
   auto next_walker = [walkers,n_walkers](int current) -> int {
     for (int j = 1; j <= n_walkers; j++)
@@ -2350,7 +2511,7 @@  do_alias_analysis (insn_info *alias_hazards[4],
 //
 // Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback
 // addressing.
-static int
+int
 get_viable_bases (insn_info *insns[2],
 		  vec<base_cand> &base_cands,
 		  rtx cand_mems[2],
@@ -2365,7 +2526,7 @@  get_viable_bases (insn_info *insns[2],
     {
       const bool is_lower = (i == reversed);
       poly_int64 poly_off;
-      rtx base = ldp_strip_offset (cand_mems[i], &poly_off);
+      rtx base = pair_mem_strip_offset (cand_mems[i], &poly_off);
       if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC)
 	writeback |= (1 << i);
 
@@ -2373,7 +2534,7 @@  get_viable_bases (insn_info *insns[2],
 	continue;
 
       // Punt on accesses relative to eliminable regs.  See the comment in
-      // ldp_bb_info::track_access for a detailed explanation of this.
+      // pair_fusion::track_access for a detailed explanation of this.
       if (!reload_completed
 	  && (REGNO (base) == FRAME_POINTER_REGNUM
 	      || REGNO (base) == ARG_POINTER_REGNUM))
@@ -2397,7 +2558,7 @@  get_viable_bases (insn_info *insns[2],
       if (!is_lower)
 	base_off--;
 
-      if (base_off < LDP_MIN_IMM || base_off > LDP_MAX_IMM)
+      if (base_off < PAIR_MEM_MIN_IMM || base_off > PAIR_MEM_MAX_IMM)
 	continue;
 
       use_info *use = find_access (insns[i]->uses (), REGNO (base));
@@ -2454,12 +2615,12 @@  get_viable_bases (insn_info *insns[2],
 }
 
 // Given two adjacent memory accesses of the same size, I1 and I2, try
-// and see if we can merge them into a ldp or stp.
+// and see if we can merge them into a pair mem load and store.
 //
 // ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true
 // if the accesses are both loads, otherwise they are both stores.
 bool
-ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
+pair_fusion::try_fuse_pair (bool load_p, unsigned access_size,
 			    insn_info *i1, insn_info *i2)
 {
   if (dump_file)
@@ -2490,11 +2651,21 @@  ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
       reg_ops[i] = XEXP (pats[i], !load_p);
     }
 
+  if (!load_p && !fuseable_store_p (i1, i2))
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
+
   if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
     {
       if (dump_file)
 	fprintf (dump_file,
-		 "punting on ldp due to reg conflcits (%d,%d)\n",
+		 "punting on pair mem load  due to reg conflcits (%d,%d)\n",
 		 insns[0]->uid (), insns[1]->uid ());
       return false;
     }
@@ -2787,7 +2958,7 @@  ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
 		    i1, i2, *base, range);
 }
 
-static void
+void
 dump_insn_list (FILE *f, const insn_list_t &l)
 {
   fprintf (f, "(");
@@ -2843,7 +3014,7 @@  debug (const insn_list_t &l)
 // we can't re-order them anyway, so provided earlier passes have cleaned up
 // redundant loads, we shouldn't miss opportunities by doing this.
 void
-ldp_bb_info::merge_pairs (insn_list_t &left_list,
+pair_fusion::merge_pairs (insn_list_t &left_list,
 			  insn_list_t &right_list,
 			  bool load_p,
 			  unsigned access_size)
@@ -2890,8 +3061,8 @@  ldp_bb_info::merge_pairs (insn_list_t &left_list,
 // of accesses.  If we find two sets of adjacent accesses, call
 // merge_pairs.
 void
-ldp_bb_info::transform_for_base (int encoded_lfs,
-				 access_group &group)
+aarch64_pair_fusion::transform_for_base (int encoded_lfs,
+					 access_group &group)
 {
   const auto lfs = decode_lfs (encoded_lfs);
   const unsigned access_size = lfs.size;
@@ -2919,7 +3090,7 @@  ldp_bb_info::transform_for_base (int encoded_lfs,
 // and remove all the tombstone insns, being sure to reparent any uses
 // of mem to previous defs when we do this.
 void
-ldp_bb_info::cleanup_tombstones ()
+pair_fusion::cleanup_tombstones ()
 {
   // No need to do anything if we didn't emit a tombstone insn for this BB.
   if (!m_emitted_tombstone)
@@ -2947,7 +3118,7 @@  ldp_bb_info::cleanup_tombstones ()
 
 template<typename Map>
 void
-ldp_bb_info::traverse_base_map (Map &map)
+pair_fusion::traverse_base_map (Map &map)
 {
   for (auto kv : map)
     {
@@ -2958,7 +3129,7 @@  ldp_bb_info::traverse_base_map (Map &map)
 }
 
 void
-ldp_bb_info::transform ()
+pair_fusion::transform ()
 {
   traverse_base_map (expr_map);
   traverse_base_map (def_map);
@@ -3174,7 +3345,9 @@  void ldp_fusion_bb (bb_info *bb)
   const bool track_stores
     = aarch64_tune_params.stp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
 
-  ldp_bb_info bb_state (bb);
+  pair_fusion *bb_state;
+  aarch64_pair_fusion derived (bb);
+  bb_state = &derived;
 
   for (auto insn : bb->nondebug_insns ())
     {
@@ -3194,13 +3367,13 @@  void ldp_fusion_bb (bb_info *bb)
 	continue;
 
       if (track_stores && MEM_P (XEXP (pat, 0)))
-	bb_state.track_access (insn, false, XEXP (pat, 0));
+	bb_state->track_access (insn, false, XEXP (pat, 0));
       else if (track_loads && MEM_P (XEXP (pat, 1)))
-	bb_state.track_access (insn, true, XEXP (pat, 1));
+	bb_state->track_access (insn, true, XEXP (pat, 1));
     }
 
-  bb_state.transform ();
-  bb_state.cleanup_tombstones ();
+  bb_state->transform ();
+  bb_state->cleanup_tombstones ();
 }
 
 void ldp_fusion ()