diff mbox

[MIPS] Enable load-load/store-store bonding

Message ID 55508D02.4050007@imgtec.com
State New
Headers show

Commit Message

sameera May 11, 2015, 11:05 a.m. UTC
On Tuesday 21 April 2015 12:39 AM, Matthew Fortune wrote:
> Sameera Deshpande <Sameera.Deshpande@imgtec.com> writes:
>> Gentle reminder!
>
> Thanks Sameera. Just a couple of comments inline below and a question
> for Catherine at the end.
>
>> - Thanks and regards,
>>     Sameera D.
>>
>> On Monday 30 March 2015 04:58 PM, sameera wrote:
>>> Hi!
>>>
>>> Sorry for delay in sending this patch for review.
>>> Please find attached updated patch.
>>>
>>> In P5600, 2 consecutive loads/stores of same type which access
>>> contiguous memory locations are bonded together by instruction issue
>>> unit to dispatch single load/store instruction which accesses both
>> locations. This allows 2X improvement in memory intensive code. This
>> optimization can be performed for LH, SH, LW, SW, LWC, SWC, LDC, SDC
>> instructions.
>>>
>>> This patch adds peephole2 patterns to identify such loads/stores, and
>>> put them in parallel, so that the scheduler will not split it -
>> thereby guaranteeing h/w level load/store bonding.
>>>
>>> The patch is tested with dejagnu for correctness, and tested on
>> hardware for performance.
>>> Ok for trunk?
>>>
>>> Changelog:
>>> gcc/
>>>           * config/mips/mips.md (JOIN_MODE): New mode iterator.
>>>       (join2_load_Store<JOIN_MODE:mode>): New pattern.
>>>       (join2_loadhi): Likewise.
>>>       (define_peehole2): Add peephole2 patterns to join 2 HI/SI/SF/DF-
>> mode
>>>       load-load and store-stores.
>>>       * config/mips/mips.opt (mload-store-pairs): New option.
>>>       (TARGET_LOAD_STORE_PAIRS): New macro.
>>>       *config/mips/mips.h (ENABLE_LD_ST_PAIRS): Likewise.
>>>       *config/mips/mips-protos.h (mips_load_store_bonding_p): New
>> prototype.
>>>       *config/mips/mips.c(mips_load_store_bonding_p): New function.
>
> I don't know if this has been corrupted by mail clients but a single
> space after '*' and a space before '('.
>
>> diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h
>> index b48e04f..244eb8d 100644
>> --- a/gcc/config/mips/mips-protos.h
>> +++ b/gcc/config/mips/mips-protos.h
>> @@ -360,6 +360,7 @@ extern bool mips_epilogue_uses (unsigned int);
>> extern void mips_final_prescan_insn (rtx_insn *, rtx *, int);
>> extern int mips_trampoline_code_size (void);
>> extern void mips_function_profiler (FILE *);
>> +extern bool mips_load_store_bonding_p (rtx *, machine_mode, bool);
>>
>> typedef rtx (*mulsidi3_gen_fn) (rtx, rtx, rtx);
>> #ifdef RTX_CODE
>> diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
>> index 1733457..85f0591 100644
>> --- a/gcc/config/mips/mips.c
>> +++ b/gcc/config/mips/mips.c
>> @@ -18241,6 +18241,64 @@ umips_load_store_pair_p_1 (bool load_p, bool swap_p,
>>    return true;
>> }
>>
>> +bool
>> +mips_load_store_bonding_p (rtx *operands, enum machine_mode mode, bool load_p)
>
> Remove enum from machine_mode.
>
>> +{
>> +  rtx reg1, reg2, mem1, mem2, base1, base2;
>> +  enum reg_class rc1, rc2;
>> +  HOST_WIDE_INT offset1, offset2;
>> +
>> +  if (load_p)
>> +    {
>> +      reg1 = operands[0];
>> +      reg2 = operands[2];
>> +      mem1 = operands[1];
>> +      mem2 = operands[3];
>> +    }
>> +  else
>> +    {
>> +      reg1 = operands[1];
>> +      reg2 = operands[3];
>> +      mem1 = operands[0];
>> +      mem2 = operands[2];
>> +    }
>> +
>> +  if (mips_address_insns (XEXP (mem1, 0), mode, false) == 0
>> +      || mips_address_insns (XEXP (mem2, 0), mode, false) == 0)
>> +    return false;
>> +
>> +  mips_split_plus (XEXP (mem1, 0), &base1, &offset1);
>> +  mips_split_plus (XEXP (mem2, 0), &base2, &offset2);
>> +
>> +  /* Base regs do not match.  */
>> +  if (!REG_P (base1) || !rtx_equal_p (base1, base2))
>> +    return false;
>> +
>> +  /* Either of the loads is clobbering base register.  */
>> +  if (load_p
>> +      && (REGNO (reg1) == REGNO (base1)
>> +	  || (REGNO (reg2) == REGNO (base1))))
>> +    return false;
>
> Can you add a comment saying that this case does not get bonded by
> any known hardware even though it could be valid to bond them if it
> is the second load that clobbers the base.
>
>> +  /* Loading in same registers.  */
>> +  if (load_p
>> +      && REGNO (reg1) == REGNO (reg2))
>> +    return false;
>> +
>> +  /* The loads/stores are not of same type.  */
>> +  rc1 = REGNO_REG_CLASS (REGNO (reg1));
>> +  rc2 = REGNO_REG_CLASS (REGNO (reg2));
>> +  if (rc1 != rc2
>> +      && !reg_class_subset_p (rc1, rc2)
>> +      && !reg_class_subset_p (rc2, rc1))
>> +    return false;
>> +
>> +  if (abs (offset1 - offset2) != GET_MODE_SIZE (mode))
>> +    return false;
>> +
>> +  return true;
>> +}
>> +
>> /* OPERANDS describes the operands to a pair of SETs, in the order
>>     dest1, src1, dest2, src2.  Return true if the operands can be used
>>     in an LWP or SWP instruction; LOAD_P says which.  */
>> diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h
>> index ec69ed5..1bd0dae 100644
>> --- a/gcc/config/mips/mips.h
>> +++ b/gcc/config/mips/mips.h
>> @@ -3147,3 +3147,7 @@ extern GTY(()) struct target_globals *mips16_globals;
>> #define STANDARD_STARTFILE_PREFIX_1 "/lib64/"
>> #define STANDARD_STARTFILE_PREFIX_2 "/usr/lib64/"
>> #endif
>> +
>> +#define ENABLE_LD_ST_PAIRS \
>> +  (TARGET_LOAD_STORE_PAIRS && TUNE_P5600 \
>> +   && !TARGET_MICROMIPS && !TARGET_FIX_24K)
>
> I've already forgotten why these extra micromips/fix24k conditions
> were required. Can you add a comment?
>
>> diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md
>> index 3672c0b..cfdb750 100644
>> --- a/gcc/config/mips/mips.md
>> +++ b/gcc/config/mips/mips.md
>> @@ -754,6 +754,11 @@
>>
>> (define_mode_iterator MOVEP1 [SI SF])
>> (define_mode_iterator MOVEP2 [SI SF])
>> +(define_mode_iterator JOIN_MODE [HI
>> +				 SI
>> +				 (SF "TARGET_HARD_FLOAT")
>> +				 (DF "TARGET_HARD_FLOAT
>> +				      && TARGET_DOUBLE_FLOAT")])
>>
>> ;; This mode iterator allows :HILO to be used as the mode of the
>> ;; concatenated HI and LO registers.
>> @@ -7419,6 +7424,108 @@
>>    { return MIPS_CALL ("jal", operands, 0, -1); }
>>    [(set_attr "type" "call")
>>     (set_attr "insn_count" "3")])
>> +
>> +(define_insn "*join2_load_store<JOIN_MODE:mode>"
>> +  [(set (match_operand:JOIN_MODE 0 "nonimmediate_operand" "=d,f,m,m")
>> +	(match_operand:JOIN_MODE 1 "nonimmediate_operand" "m,m,d,f"))
>> +   (set (match_operand:JOIN_MODE 2 "nonimmediate_operand" "=d,f,m,m")
>> +	(match_operand:JOIN_MODE 3 "nonimmediate_operand" "m,m,d,f"))]
>> +  "ENABLE_LD_ST_PAIRS && reload_completed"
>> +  {
>> +    bool load_p = (which_alternative == 0 || which_alternative == 1);
>> +    /* Reg-renaming pass reuses base register if it is dead after bonded loads.
>> +       Hardware does not bond those loads, even when they are consecutive.
>> +       However, order of the loads need to be checked for correctness.  */
>> +    if (!load_p || !reg_overlap_mentioned_p (operands[0], operands[1]))
>> +      {
>> +	output_asm_insn (mips_output_move (operands[0], operands[1]),
>> +			 operands);
>> +	output_asm_insn (mips_output_move (operands[2], operands[3]),
>> +			 &operands[2]);
>> +      }
>> +    else
>> +      {
>> +	output_asm_insn (mips_output_move (operands[2], operands[3]),
>> +			 &operands[2]);
>> +	output_asm_insn (mips_output_move (operands[0], operands[1]),
>> +			 operands);
>> +      }
>> +    return "";
>> +  }
>> +  [(set_attr "move_type" "load,fpload,store,fpstore")
>> +   (set_attr "insn_count" "2,2,2,2")])
>> +
>> +;; 2 HI/SI/SF/DF loads are joined.
>> +;; P5600 does not support bonding of two LBs, hence QI mode is not included.
>> +(define_peephole2
>> +  [(set (match_operand:JOIN_MODE 0 "register_operand")
>> +	(match_operand:JOIN_MODE 1 "non_volatile_mem_operand"))
>> +   (set (match_operand:JOIN_MODE 2 "register_operand")
>> +	(match_operand:JOIN_MODE 3 "non_volatile_mem_operand"))]
>
> Please can you comment that the loads must be non-volatile as they may get
> re-ordered.
>
>> +  "ENABLE_LD_ST_PAIRS &&
>
> && on the next line
>
>> +   mips_load_store_bonding_p (operands, <JOIN_MODE:MODE>mode, true)"
>> +  [(parallel [(set (match_dup 0)
>> +		   (match_dup 1))
>> +	      (set (match_dup 2)
>> +		   (match_dup 3))])]
>> +  "")
>> +
>> +;; 2 HI/SI/SF/DF stores are joined.
>> +;; P5600 does not support bonding of two SBs, hence QI mode is not included.
>> +(define_peephole2
>> +  [(set (match_operand:JOIN_MODE 0 "memory_operand")
>> +	(match_operand:JOIN_MODE 1 "register_operand"))
>> +   (set (match_operand:JOIN_MODE 2 "memory_operand")
>> +	(match_operand:JOIN_MODE 3 "register_operand"))]
>> +  "ENABLE_LD_ST_PAIRS &&
>
> && on the next line
>
>> +   mips_load_store_bonding_p (operands, <JOIN_MODE:MODE>mode, false)"
>> +  [(parallel [(set (match_dup 0)
>> +		   (match_dup 1))
>> +	      (set (match_dup 2)
>> +		   (match_dup 3))])]
>> +  "")
>> +
>> +(define_insn "*join2_loadhi"
>> +  [(set (match_operand:SI 0 "register_operand" "=r")
>> +	(any_extend:SI (match_operand:HI 1 "non_volatile_mem_operand" "m")))
>> +   (set (match_operand:SI 2 "register_operand" "=r")
>> +	(any_extend:SI (match_operand:HI 3 "non_volatile_mem_operand" "m")))]
>> +  "ENABLE_LD_ST_PAIRS && reload_completed"
>> +  {
>> +    /* Reg-renaming pass reuses base register if it is dead after bonded loads.
>> +       Hardware does not bond those loads, even when they are consecutive.
>> +       However, order of the loads need to be checked for correctness.  */
>> +    if (!reg_overlap_mentioned_p (operands[0], operands[1]))
>> +      {
>> +	output_asm_insn ("lh<u>\t%0,%1", operands);
>> +	output_asm_insn ("lh<u>\t%2,%3", operands);
>> +      }
>> +    else
>> +      {
>> +	output_asm_insn ("lh<u>\t%2,%3", operands);
>> +	output_asm_insn ("lh<u>\t%0,%1", operands);
>> +      }
>> +
>> +    return "";
>> +  }
>> +  [(set_attr "move_type" "load")
>> +   (set_attr "insn_count" "2")])
>> +
>> +
>> +;; 2 16 bit integer loads are joined.
>
> 2 HI mode loads
>
>> +(define_peephole2
>> +  [(set (match_operand:SI 0 "register_operand")
>> +	(any_extend:SI (match_operand:HI 1 "non_volatile_mem_operand")))
>> +   (set (match_operand:SI 2 "register_operand")
>> +	(any_extend:SI (match_operand:HI 3 "non_volatile_mem_operand")))]
>> +  "ENABLE_LD_ST_PAIRS &&
>> +   mips_load_store_bonding_p (operands, HImode, true)"
>> +  [(parallel [(set (match_dup 0)
>> +		   (any_extend:SI (match_dup 1)))
>> +	      (set (match_dup 2)
>> +		   (any_extend:SI (match_dup 3)))])]
>> +  "")
>> +
>>
>> ;; Synchronization instructions.
>>
>> diff --git a/gcc/config/mips/mips.opt b/gcc/config/mips/mips.opt
>> index 9e89aa9..a9baebe 100644
>> --- a/gcc/config/mips/mips.opt
>> +++ b/gcc/config/mips/mips.opt
>> @@ -418,3 +418,7 @@ Enable use of odd-numbered single-precision registers
>>
>> noasmopt
>> Driver
>> +
>> +mload-store-pairs
>> +Target Report Var(TARGET_LOAD_STORE_PAIRS) Init(1)
>> +Enable load/store bonding.
>
> Catherine: We have this option in place just as a get-out clause if
> there are any side effects that have been missed in this patch such
> that you can still tune for p5600 but with bonding disabled. Do you
> think this is OK? I'm not completely against this being either
> undocumented or removed entirely.
>
> Sameera: Assuming we keep it then it needs adding to the invoke doc.
>
> Thanks,
> Matthew
>

Hi Matthew, Catherine and Mike,

Thanks for your comments.

Please find attached the updated patch. I have added a testcase for bonding. However, unlike on other architectures, we do not emit a single
instruction for a bonded pair, which makes it difficult to check whether bonding is happening. Hence, the assembly file is generated with debug
dumps (-dp), and the bonded loads/stores are identified by their pattern names.

I am trying FUSION for MIPS as suggested by Mike, and testing the perf impact of it along with other mips specific options.

Changelog:
gcc/
         * config/mips/mips.md (JOIN_MODE): New mode iterator.
         (*join2_load_store<JOIN_MODE:mode>): New pattern.
         (*join2_loadhi): Likewise.
         (define_peephole2): Add peephole2 patterns to join 2 HI/SI/SF/DF-mode
         load-load and store-stores.
         * config/mips/mips.opt (mload-store-pairs): New option.
         (TARGET_LOAD_STORE_PAIRS): New macro.
         * config/mips/mips.h (ENABLE_LD_ST_PAIRS): Likewise.
         * config/mips/mips-protos.h (mips_load_store_bonding_p): New prototype.
         * config/mips/mips.c (mips_load_store_bonding_p): New function.

gcc/testsuite/
         * gcc.target/mips/p5600-bonding.c: New file.

- Thanks and regards,
   Sameera D.

Comments

Matthew Fortune May 11, 2015, 12:13 p.m. UTC | #1
Hi Sameera,

Sameera Deshpande <Sameera.Deshpande@imgtec.com> writes: 
> Changelog:

> gcc/

>          * config/mips/mips.md (JOIN_MODE): New mode iterator.

>          (join2_load_Store<JOIN_MODE:mode>): New pattern.

>          (join2_loadhi): Likewise.

>          (define_peehole2): Add peephole2 patterns to join 2

> HI/SI/SF/DF-mode

>          load-load and store-stores.

>          * config/mips/mips.opt (mload-store-pairs): New option.

>          (TARGET_LOAD_STORE_PAIRS): New macro.

>          * config/mips/mips.h (ENABLE_LD_ST_PAIRS): Likewise.

>          * config/mips/mips-protos.h (mips_load_store_bonding_p): New

> prototype.

>          * config/mips/mips.c (mips_load_store_bonding_p): New function.

> 

> gcc/testsuite/

>          * gcc.target/mips/p5600-bonding.c : New testcase to test

> bonding.


Just 'New file.' is fine for the changelog.

>diff --git a/gcc/testsuite/gcc.target/mips/p5600-bonding.c b/gcc/testsuite/gcc.target/mips/p5600-bonding.c

>new file mode 100644

>index 0000000..122b9f8

>--- /dev/null

>+++ b/gcc/testsuite/gcc.target/mips/p5600-bonding.c

>@@ -0,0 +1,19 @@

>+/* { dg-do compile } */

>+/* { dg-options "-dp -mtune=p5600  -mno-micromips -mno-mips16" } */

>+/* { dg-skip-if "Bonding needs peephole optimization." { *-*-* } { "-O0" "-O1" } { "" } } */

>+typedef int VINT32 __attribute__ ((vector_size((16))));

>+

>+void memory_operation_fun2_si(void * __restrict src, void * __restrict dest, int num)


Code style applies for testcases too, return type on line above, space
after function name, line length.

>+{

>+    VINT32 *vsrc = (VINT32 *)src;


Indentation.

>+    VINT32 *vdest = (VINT32 *)dest;

>+    int i;

>+

>+    for (i = 0; i < num - 1; i+=2)

>+    {


Indentation

>+      vdest[i] = (vdest[i] + vsrc[i]);


Unnecessary brackets.

>+      vdest[i + 1] = vdest[i + 1] + vsrc[i + 1];

>+    }

>+}

>+/* { dg-final { scan-assembler "join2_" } }  */

>+


OK with those changes.

Thanks,
Matthew
Mike Stump May 11, 2015, 4:39 p.m. UTC | #2
On May 11, 2015, at 4:05 AM, sameera <sameera.deshpande@imgtec.com> wrote:
>>> +(define_insn "*join2_loadhi"
>>> +  [(set (match_operand:SI 0 "register_operand" "=r")
>>> +	(any_extend:SI (match_operand:HI 1 "non_volatile_mem_operand" "m")))
>>> +   (set (match_operand:SI 2 "register_operand" "=r")
>>> +	(any_extend:SI (match_operand:HI 3 "non_volatile_mem_operand" "m")))]
>>> +  "ENABLE_LD_ST_PAIRS && reload_completed"
>>> +  {
>>> +    /* Reg-renaming pass reuses base register if it is dead after bonded loads.
>>> +       Hardware does not bond those loads, even when they are consecutive.
>>> +       However, order of the loads need to be checked for correctness.  */
>>> +    if (!reg_overlap_mentioned_p (operands[0], operands[1]))
>>> +      {
>>> +	output_asm_insn ("lh<u>\t%0,%1", operands);
>>> +	output_asm_insn ("lh<u>\t%2,%3", operands);
>>> +      }
>>> +    else
>>> +      {
>>> +	output_asm_insn ("lh<u>\t%2,%3", operands);
>>> +	output_asm_insn ("lh<u>\t%0,%1", operands);
>>> +      }
>>> +
>>> +    return "";
>>> +  }
>>> +  [(set_attr "move_type" "load")
>>> +   (set_attr "insn_count" "2")])

> However, unlike other architectures, we do not generate single instruction for bonded pair,

Actually, you do.  The above is 1 instruction pattern.  Doesn’t matter much what it prints as or what the CPU thinks of it.

> because of which it is difficult to check if bonding is happening or not. Hence, an assembly file is generated with debug dumps, and the bonded loads/stores are identified by their pattern names.

Nothing wrong with that approach.  Also, in the assembly, one can look for sequences of instructions if they want.
See gcc/testsuite/gcc.target/aarch64/fuse_adrp_add_1.c:

  /* { dg-final { scan-assembler "adrp\tx.*, fixed_regs\n\tadd\tx.*, x.*fixed_regs" } } */

in the test suite for example.

> I am trying FUSION for MIPS as suggested by Mike, and testing the perf impact of it along with other mips specific options.

I think you will discover it is virtually what you have now, and works better.  The fusion just can peephole over greater distances, that’s the only real difference.
sameera May 13, 2015, 6:45 a.m. UTC | #3
Hi Mike,

Thanks for your comments.
Please find my comments inlined.

- Thanks and regards,
   Sameera D.

On Monday 11 May 2015 10:09 PM, Mike Stump wrote:
> On May 11, 2015, at 4:05 AM, sameera <sameera.deshpande@imgtec.com> wrote:
>>>> +(define_insn "*join2_loadhi"
>>>> +  [(set (match_operand:SI 0 "register_operand" "=r")
>>>> +	(any_extend:SI (match_operand:HI 1 "non_volatile_mem_operand" "m")))
>>>> +   (set (match_operand:SI 2 "register_operand" "=r")
>>>> +	(any_extend:SI (match_operand:HI 3 "non_volatile_mem_operand" "m")))]
>>>> +  "ENABLE_LD_ST_PAIRS && reload_completed"
>>>> +  {
>>>> +    /* Reg-renaming pass reuses base register if it is dead after bonded loads.
>>>> +       Hardware does not bond those loads, even when they are consecutive.
>>>> +       However, order of the loads need to be checked for correctness.  */
>>>> +    if (!reg_overlap_mentioned_p (operands[0], operands[1]))
>>>> +      {
>>>> +	output_asm_insn ("lh<u>\t%0,%1", operands);
>>>> +	output_asm_insn ("lh<u>\t%2,%3", operands);
>>>> +      }
>>>> +    else
>>>> +      {
>>>> +	output_asm_insn ("lh<u>\t%2,%3", operands);
>>>> +	output_asm_insn ("lh<u>\t%0,%1", operands);
>>>> +      }
>>>> +
>>>> +    return "";
>>>> +  }
>>>> +  [(set_attr "move_type" "load")
>>>> +   (set_attr "insn_count" "2")])
>
>> However, unlike other architectures, we do not generate single instruction for bonded pair,
>
> Actually, you do.  The above is 1 instruction pattern.  Doesn’t matter much what it prints as or what the CPU thinks of it.
The pattern is single, however, the asm code will have multiple instructions generated for the pattern.
>
>> because of which it is difficult to check if bonding is happening or not. Hence, an assembly file is generated with debug dumps, and the bonded loads/stores are identified by their pattern names.
>
> Nothing wrong with that approach.  Also, in the assembly, one can look for sequences of instruction if they way.
Load/store bonding requires more than contiguous load/store instructions: the pair must also use the same base register, with offsets that differ by exactly the access size.
Hence, the approach you suggested might not always be useful, and I am matching on the pattern name instead.
> See gcc/testsuite/gcc.target/aarch64/fuse_adrp_add_1.c:
>
>    /* { dg-final { scan-assembler "adrp\tx.*, fixed_regs\n\tadd\tx.*, x.*fixed_regs" } } */
>
> in the test suite for example.
>
>> I am trying FUSION for MIPS as suggested by Mike, and testing the perf impact of it along with other mips specific options.
>
> I think you will discover it is virtually what you have now, and works better.  The fusion just can peephole over greater distances, that’s the only real difference.
Yes, in many cases I see clear improvement. However, it also tries to bring loads/stores together, which were split intentionally by msched-weight 
option, introduced for MIPS. I need to measure performance and do perf tuning (if needed) for that option before sending it for review.
>
diff mbox

Patch

diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h
index b48e04f..244eb8d 100644
--- a/gcc/config/mips/mips-protos.h
+++ b/gcc/config/mips/mips-protos.h
@@ -360,6 +360,7 @@  extern bool mips_epilogue_uses (unsigned int);
 extern void mips_final_prescan_insn (rtx_insn *, rtx *, int);
 extern int mips_trampoline_code_size (void);
 extern void mips_function_profiler (FILE *);
+extern bool mips_load_store_bonding_p (rtx *, machine_mode, bool);
 
 typedef rtx (*mulsidi3_gen_fn) (rtx, rtx, rtx);
 #ifdef RTX_CODE
diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index bf69850..4fc15c4 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -18241,6 +18241,76 @@  umips_load_store_pair_p_1 (bool load_p, bool swap_p,
   return true;
 }
 
+/* OPERANDS describes the operands to a pair of SETs, in the order
+   dest1, src1, dest2, src2.  Return true if the two MODE-mode accesses
+   use the same base register and adjacent offsets, so that they are
+   candidates for load/store bonding.  LOAD_P is true for a pair of
+   loads and false for a pair of stores.  */
+
+bool
+mips_load_store_bonding_p (rtx *operands, machine_mode mode, bool load_p)
+{
+  rtx reg1, reg2, mem1, mem2, base1, base2;
+  enum reg_class rc1, rc2;
+  HOST_WIDE_INT offset1, offset2;
+
+  if (load_p)
+    {
+      reg1 = operands[0];
+      reg2 = operands[2];
+      mem1 = operands[1];
+      mem2 = operands[3];
+    }
+  else
+    {
+      reg1 = operands[1];
+      reg2 = operands[3];
+      mem1 = operands[0];
+      mem2 = operands[2];
+    }
+
+  if (mips_address_insns (XEXP (mem1, 0), mode, false) == 0
+      || mips_address_insns (XEXP (mem2, 0), mode, false) == 0)
+    return false;
+
+  mips_split_plus (XEXP (mem1, 0), &base1, &offset1);
+  mips_split_plus (XEXP (mem2, 0), &base2, &offset2);
+
+  /* Base regs do not match.  */
+  if (!REG_P (base1) || !rtx_equal_p (base1, base2))
+    return false;
+
+  /* Either of the loads is clobbering base register.  It is legitimate to bond
+     loads if second load clobbers base register.  However, hardware does not
+     support such bonding.  */
+  if (load_p
+      && (REGNO (reg1) == REGNO (base1)
+	  || (REGNO (reg2) == REGNO (base1))))
+    return false;
+
+  /* Loading in same registers.  */
+  if (load_p
+      && REGNO (reg1) == REGNO (reg2))
+    return false;
+
+  /* The loads/stores are not of same type.  */
+  rc1 = REGNO_REG_CLASS (REGNO (reg1));
+  rc2 = REGNO_REG_CLASS (REGNO (reg2));
+  if (rc1 != rc2
+      && !reg_class_subset_p (rc1, rc2)
+      && !reg_class_subset_p (rc2, rc1))
+    return false;
+
+  /* The accesses must be exactly one MODE-sized element apart, in either
+     order.  Compare both differences directly rather than passing a
+     HOST_WIDE_INT through abs (), which takes an int.  */
+  if (offset1 - offset2 != GET_MODE_SIZE (mode)
+      && offset2 - offset1 != GET_MODE_SIZE (mode))
+    return false;
+
+  return true;
+}
+
 /* OPERANDS describes the operands to a pair of SETs, in the order
    dest1, src1, dest2, src2.  Return true if the operands can be used
    in an LWP or SWP instruction; LOAD_P says which.  */
diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h
index 4bd83f5..c9accd1 100644
--- a/gcc/config/mips/mips.h
+++ b/gcc/config/mips/mips.h
@@ -3162,3 +3162,10 @@  extern GTY(()) struct target_globals *mips16_globals;
 #define STANDARD_STARTFILE_PREFIX_1 "/lib64/"
 #define STANDARD_STARTFILE_PREFIX_2 "/usr/lib64/"
 #endif
+
+/* True if load/store bonding should be used: the option is enabled and we
+   are tuning for P5600.  Bonding is not supported by micromips and fix_24k
+   and can degrade performance there, so do not bond for either of them.  */
+#define ENABLE_LD_ST_PAIRS \
+  (TARGET_LOAD_STORE_PAIRS && TUNE_P5600 \
+   && !TARGET_MICROMIPS && !TARGET_FIX_24K)
diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md
index ed4c0ba..0e2b172 100644
--- a/gcc/config/mips/mips.md
+++ b/gcc/config/mips/mips.md
@@ -754,6 +754,11 @@ 
 
 (define_mode_iterator MOVEP1 [SI SF])
 (define_mode_iterator MOVEP2 [SI SF])
+(define_mode_iterator JOIN_MODE [HI
+				 SI
+				 (SF "TARGET_HARD_FLOAT")
+				 (DF "TARGET_HARD_FLOAT
+				      && TARGET_DOUBLE_FLOAT")])
 
 ;; This mode iterator allows :HILO to be used as the mode of the
 ;; concatenated HI and LO registers.
@@ -7407,6 +7412,112 @@ 
   { return MIPS_CALL ("jal", operands, 0, -1); }
   [(set_attr "type" "call")
    (set_attr "insn_count" "3")])
+
+;; Match a joined pair of HI/SI/SF/DFmode loads or stores and output both moves.
+(define_insn "*join2_load_store<JOIN_MODE:mode>"
+  [(set (match_operand:JOIN_MODE 0 "nonimmediate_operand" "=d,f,m,m")
+	(match_operand:JOIN_MODE 1 "nonimmediate_operand" "m,m,d,f"))
+   (set (match_operand:JOIN_MODE 2 "nonimmediate_operand" "=d,f,m,m")
+	(match_operand:JOIN_MODE 3 "nonimmediate_operand" "m,m,d,f"))]
+  "ENABLE_LD_ST_PAIRS && reload_completed"
+  {
+    bool load_p = (which_alternative == 0 || which_alternative == 1);
+    /* Reg-renaming may make the first load overwrite the (dead) base register.
+       Hardware does not bond such loads, but the second load must then be
+       output first so that its address is still valid.  */
+    if (!load_p || !reg_overlap_mentioned_p (operands[0], operands[1]))
+      {
+	output_asm_insn (mips_output_move (operands[0], operands[1]),
+			 operands);
+	output_asm_insn (mips_output_move (operands[2], operands[3]),
+			 &operands[2]);
+      }
+    else
+      {
+	output_asm_insn (mips_output_move (operands[2], operands[3]),
+			 &operands[2]);
+	output_asm_insn (mips_output_move (operands[0], operands[1]),
+			 operands);
+      }
+    return "";
+  }
+  [(set_attr "move_type" "load,fpload,store,fpstore")
+   (set_attr "insn_count" "2,2,2,2")])
+
+;; Join two adjacent HI/SI/SF/DF loads into one parallel for bonding.
+;; P5600 does not support bonding of two LBs, hence QI mode is not included.
+;; The loads must be non-volatile as the joined insn may output them in
+;; swapped order.
+(define_peephole2
+  [(set (match_operand:JOIN_MODE 0 "register_operand")
+	(match_operand:JOIN_MODE 1 "non_volatile_mem_operand"))
+   (set (match_operand:JOIN_MODE 2 "register_operand")
+	(match_operand:JOIN_MODE 3 "non_volatile_mem_operand"))]
+  "ENABLE_LD_ST_PAIRS
+   && mips_load_store_bonding_p (operands, <JOIN_MODE:MODE>mode, true)"
+  [(parallel [(set (match_dup 0)
+		   (match_dup 1))
+	      (set (match_dup 2)
+		   (match_dup 3))])]
+  "")
+
+;; Join two adjacent HI/SI/SF/DF stores into one parallel for bonding.
+;; P5600 does not support bonding of two SBs, hence QI mode is not included.
+(define_peephole2
+  [(set (match_operand:JOIN_MODE 0 "memory_operand")
+	(match_operand:JOIN_MODE 1 "register_operand"))
+   (set (match_operand:JOIN_MODE 2 "memory_operand")
+	(match_operand:JOIN_MODE 3 "register_operand"))]
+  "ENABLE_LD_ST_PAIRS
+   && mips_load_store_bonding_p (operands, <JOIN_MODE:MODE>mode, false)"
+  [(parallel [(set (match_dup 0)
+		   (match_dup 1))
+	      (set (match_dup 2)
+		   (match_dup 3))])]
+  "")
+
+;; Match a joined pair of extending HImode loads and output both loads.
+(define_insn "*join2_loadhi"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(any_extend:SI (match_operand:HI 1 "non_volatile_mem_operand" "m")))
+   (set (match_operand:SI 2 "register_operand" "=r")
+	(any_extend:SI (match_operand:HI 3 "non_volatile_mem_operand" "m")))]
+  "ENABLE_LD_ST_PAIRS && reload_completed"
+  {
+    /* Reg-renaming may make the first load overwrite the (dead) base register.
+       Hardware does not bond such loads, but the second load must then be
+       output first so that its address is still valid.  */
+    if (!reg_overlap_mentioned_p (operands[0], operands[1]))
+      {
+	output_asm_insn ("lh<u>\t%0,%1", operands);
+	output_asm_insn ("lh<u>\t%2,%3", operands);
+      }
+    else
+      {
+	output_asm_insn ("lh<u>\t%2,%3", operands);
+	output_asm_insn ("lh<u>\t%0,%1", operands);
+      }
+
+    return "";
+  }
+  [(set_attr "move_type" "load")
+   (set_attr "insn_count" "2")])
+
+
+;; Join two adjacent extending (any_extend) HImode loads for bonding.
+(define_peephole2
+  [(set (match_operand:SI 0 "register_operand")
+	(any_extend:SI (match_operand:HI 1 "non_volatile_mem_operand")))
+   (set (match_operand:SI 2 "register_operand")
+	(any_extend:SI (match_operand:HI 3 "non_volatile_mem_operand")))]
+  "ENABLE_LD_ST_PAIRS
+   && mips_load_store_bonding_p (operands, HImode, true)"
+  [(parallel [(set (match_dup 0)
+		   (any_extend:SI (match_dup 1)))
+	      (set (match_dup 2)
+		   (any_extend:SI (match_dup 3)))])]
+  "")
+
 
 ;; Synchronization instructions.
 
diff --git a/gcc/config/mips/mips.opt b/gcc/config/mips/mips.opt
index 9e89aa9..a9baebe 100644
--- a/gcc/config/mips/mips.opt
+++ b/gcc/config/mips/mips.opt
@@ -418,3 +418,7 @@  Enable use of odd-numbered single-precision registers
 
 noasmopt
 Driver
+
+mload-store-pairs
+Target Report Var(TARGET_LOAD_STORE_PAIRS) Init(1)
+Enable load/store bonding.
diff --git a/gcc/testsuite/gcc.target/mips/p5600-bonding.c b/gcc/testsuite/gcc.target/mips/p5600-bonding.c
new file mode 100644
index 0000000..122b9f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/p5600-bonding.c
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-dp -mtune=p5600  -mno-micromips -mno-mips16" } */
+/* { dg-skip-if "Bonding needs peephole optimization." { *-*-* } { "-O0" "-O1" } { "" } } */
+typedef int VINT32 __attribute__ ((vector_size((16))));
+
+void
+memory_operation_fun2_si (void * __restrict src, void * __restrict dest,
+			  int num)
+{
+  VINT32 *vsrc = (VINT32 *) src;
+  VINT32 *vdest = (VINT32 *) dest;
+  int i;
+
+  for (i = 0; i < num - 1; i += 2)
+    {
+      vdest[i] = vdest[i] + vsrc[i];
+      vdest[i + 1] = vdest[i + 1] + vsrc[i + 1];
+    }
+}
+/* { dg-final { scan-assembler "join2_" } }  */
+