diff mbox series

[2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

Message ID b0b14a63-ec38-89bc-5c0b-da87c3b28390@arm.com
State New
Headers show
Series None | expand

Commit Message

Stamatis Markianos-Wright Nov. 28, 2022, 12:13 p.m. UTC
On 11/15/22 15:51, Andre Vieira (lists) wrote:
>
> On 11/11/2022 17:40, Stam Markianos-Wright via Gcc-patches wrote:
>> Hi all,
>>
>> This is the 2/2 patch that contains the functional changes needed
>> for MVE Tail Predicated Low Overhead Loops.  See my previous email
>> for a general introduction of MVE LOLs.
>>
>> This support is added through the already existing loop-doloop
>> mechanisms that are used for non-MVE dls/le looping.
>>
>> Changes are:
>>
>> 1) Relax the loop-doloop mechanism in the mid-end to allow for
>>    decrement numbers other that -1 and for `count` to be an
>>    rtx containing the number of elements to be processed, rather
>>    than an expression for calculating the number of iterations.
>> 2) Add a `allow_elementwise_doloop` target hook. This allows the
>>    target backend to manipulate the iteration count as it needs:
>>    in our case to change it from a pre-calculation of the number
>>    of iterations to the number of elements to be processed.
>> 3) The doloop_end target-insn now had an additional parameter:
>>    the `count` (note: this is before it gets modified to just be
>>    the number of elements), so that the decrement value is
>>    extracted from that parameter.
>>
>> And many things in the backend to implement the above optimisation:
>>
>> 4)  Appropriate changes to the define_expand of doloop_end and new
>>     patterns for dlstp and letp.
>> 5) `arm_attempt_dlstp_transform`: (called from the define_expand of
>>     doloop_end) this function checks for the loop's suitability for
>>     dlstp/letp transformation and then implements it, if possible.
>> 6) `arm_mve_get_loop_unique_vctp`: A function that loops through
>>     the loop contents and returns the vctp VPR-genereting operation
>>     within the loop, if it is unique and there is exclusively one
>>     vctp within the loop.
>> 7) A couple of utility functions: `arm_mve_get_vctp_lanes` to map
>>    from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg`
>>    to check an insn to see if it requires the VPR or not.
>>
>> No regressions on arm-none-eabi with various targets and on
>> aarch64-none-elf. Thoughts on getting this into trunk?
>>
>> Thank you,
>> Stam Markianos-Wright
>>
>> gcc/ChangeLog:
>>
>>         * config/aarch64/aarch64.md: Add extra doloop_end arg.
>>         * config/arm/arm-protos.h (arm_attempt_dlstp_transform): New.
>>         * config/arm/arm.cc (TARGET_ALLOW_ELEMENTWISE_DOLOOP): New.
>>         (arm_mve_get_vctp_lanes): New.
>>         (arm_get_required_vpr_reg): New.
>>         (arm_mve_get_loop_unique_vctp): New.
>>         (arm_attempt_dlstp_transform): New.
>>         (arm_allow_elementwise_doloop): New.
>>         * config/arm/iterators.md:
>>         * config/arm/mve.md (*predicated_doloop_end_internal): New.
>>         (dlstp<mode1>_insn): New.
>>         * config/arm/thumb2.md (doloop_end): Update for MVE LOLs.
>>         * config/arm/unspecs.md: New unspecs.
>>         * config/ia64/ia64.md: Add extra doloop_end arg.
>>         * config/pru/pru.md: Add extra doloop_end arg.
>>         * config/rs6000/rs6000.md: Add extra doloop_end arg.
>>         * config/s390/s390.md: Add extra doloop_end arg.
>>         * config/v850/v850.md: Add extra doloop_end arg.
>>         * doc/tm.texi: Document new hook.
>>         * doc/tm.texi.in: Likewise.
>>         * loop-doloop.cc (doloop_condition_get): Relax conditions.
>>         (doloop_optimize): Add support for elementwise LoLs.
>>         * target-insns.def (doloop_end): Add extra arg.
>>         * target.def (allow_elementwise_doloop): New hook.
>>         * targhooks.cc (default_allow_elementwise_doloop): New.
>>         * targhooks.h (default_allow_elementwise_doloop): New.
>>
>> gcc/testsuite/ChangeLog:
>>
>>         * gcc.target/arm/lob.h: Update framework.
>>         * gcc.target/arm/lob1.c: Likewise.
>>         * gcc.target/arm/lob6.c: Likewise.
>>         * gcc.target/arm/dlstp-int16x8.c: New test.
>>         * gcc.target/arm/dlstp-int32x4.c: New test.
>>         * gcc.target/arm/dlstp-int64x2.c: New test.
>>         * gcc.target/arm/dlstp-int8x16.c: New test.
>>
>>
>> ### Inline copy of patch ###
>>
>> diff --git a/gcc/config/aarch64/aarch64.md 
>> b/gcc/config/aarch64/aarch64.md
>> index 
>> f2e3d905dbbeb2949f2947f5cfd68208c94c9272..7a6d24a80060b4a704a481ccd1a32d96e7b0f369 
>> 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -7366,7 +7366,8 @@
>>  ;; knows what to generate.
>>  (define_expand "doloop_end"
>>    [(use (match_operand 0 "" ""))      ; loop pseudo
>> -   (use (match_operand 1 "" ""))]     ; label
>> +   (use (match_operand 1 "" ""))      ; label
>> +   (use (match_operand 2 "" ""))]     ; decrement constant
>>    "optimize > 0 && flag_modulo_sched"
>>  {
>>    rtx s0;
>> diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
>> index 
>> 550272facd12e60a49bf8a3b20f811cc13765b3a..7684620f0f4d161dd9e9ad2d70308021ec3d3d34 
>> 100644
>> --- a/gcc/config/arm/arm-protos.h
>> +++ b/gcc/config/arm/arm-protos.h
>> @@ -63,7 +63,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx 
>> *, rtx *, rtx *, rtx *);
>>  extern bool arm_q_bit_access (void);
>>  extern bool arm_ge_bits_access (void);
>>  extern bool arm_target_insn_ok_for_lob (rtx);
>> -
>> +extern rtx arm_attempt_dlstp_transform (rtx, rtx);
>>  #ifdef RTX_CODE
>>  enum reg_class
>>  arm_mode_base_reg_class (machine_mode);
>> diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
>> index 
>> ee8f1babf8a1319e77e0db0fa55851c038048804..99e144d52c26597c64b982b3d4ae9a62a114cf18 
>> 100644
>> --- a/gcc/config/arm/arm.cc
>> +++ b/gcc/config/arm/arm.cc
>> @@ -470,6 +470,9 @@ static const struct attribute_spec 
>> arm_attribute_table[] =
>>  #undef TARGET_SCHED_REORDER
>>  #define TARGET_SCHED_REORDER arm_sched_reorder
>>
>> +#undef TARGET_ALLOW_ELEMENTWISE_DOLOOP
>> +#define TARGET_ALLOW_ELEMENTWISE_DOLOOP arm_allow_elementwise_doloop
>> +
> Just a nit but maybe keep the same naming scheme as the existing hook:
> TARGET_CAN_USE_ELEMENTWISE_DOLOOP_P ?

Done,

Also:

Thanks for taking time to review!

I've done a second draft as an attachment to this email. Let me know of any

further when you get to the second half of the patch.

>> +
>> +static rtx
>> +arm_get_required_vpr_reg (rtx_insn *insn)
>> +{
>> +  bool requires_vpr;
>> +
>> +  extract_constrain_insn (insn);
>> +  int n_operands = recog_data.n_operands;
>> +  if (recog_data.n_alternatives == 0)
>> +    return NULL_RTX;
>> +
>> +  /* Fill in recog_op_alt with information about the constraints of
>> +     this insn.  */
>> +  preprocess_constraints (insn);
>> +
>> +  for (int use = 0; use < n_operands; use++)
>> +    {
>> +      requires_vpr = true;
>> +      /* Iterate through alternatives of operand "use" in 
>> recog_op_alt and
>> +       * identify if the operand is required to be the VPR.  */
> Remove the * at the start of the new line.
Done.
>> +      for (int alt1 = 0; alt1 < recog_data.n_alternatives; alt1++)
>> +    {
>> +      const operand_alternative *op_alt1
>> +          = &recog_op_alt[alt1 * n_operands];
>> +      /* Fetch the reg_class for each entry and check it against the
>> +       * VPR_REG reg_class.  */
>> +      if (alternative_class (op_alt1, use) != VPR_REG)
>> +        requires_vpr = false;
>> +    }
>> +      /* If all alternatives of the insn require the VPR reg for 
>> this operand,
>> +     it means that either this is VPR-generating instruction, like a 
>> vctp,
>> +     vcmp, etc., or it is a VPT-predicated insruction.  Return the 
>> subrtx
>> +     of the VPR reg operand.  */
>> +      if (requires_vpr)
>> +    return recog_data.operand[use];
>> +    }
>> +  return NULL_RTX;
>> +}
>> +
>> +/* Scan the basic block of a loop body for a vctp instruction. If 
>> there is
>> +   exactly one unique vctp instruction, return its rtx_insn *. */
>> +
>> +static rtx_insn *
>> +arm_mve_get_loop_unique_vctp (basic_block bb)
>> +{
>> +  rtx_insn *insn = BB_HEAD (bb);
>> +  rtx_insn *vctp_op = NULL;
>> +
>> +  /* Now scan through all the instruction patterns and
>> +     pick out any MVE instructions.  */
>> +  FOR_BB_INSNS (bb, insn)
>> +    {
>> +      if (INSN_P (insn))
>> +    {
>> +      /* First check if this is a vctp instruction.  There needs to be
>> +         exactly one vctp instruction within the loop.  */
>> +      if (arm_mve_get_vctp_lanes (PATTERN (insn)) != 0)
>> +        {
>> +          /* If we already found one vctp instruction, then the
>> +         loop is not consistent internally.  */
>> +          if (vctp_op)
>> +        return NULL;
>> +
>> +          vctp_op = insn;
>> +        }
>> +    }
>> +    }
>> +  return vctp_op;
>> +}
>> +
>> +rtx
>> +arm_attempt_dlstp_transform (rtx label, rtx count)
>> +{
>> +  int decrementnum;
>> +  basic_block body = BLOCK_FOR_INSN (label)->prev_bb;
>> +  rtx initial_compare;
>> +  /* Doloop can only be done "elementwise" with predicated dlstp/letp
>> +     when the iteration counter gets deprecated by the number of MVE
> s/deprecated/decreased/ ? I think

"decremented" was what I was going for, so changed it to that :)


>> + lanes.  This can be exxtracted from the `count`, which is the 
>> expression
> s/exxtracted/extracted/
Done.
>> +     used to calculate the number of iterations that the loop would 
>> execute
>> +     for a standard dls/le loop.  Since we only support cases where 
>> this is a
>> +     power of 2, we can assume that this expression arrives here as:
>> +       (lshiftrt: (A) (const_int y))
>> +     Then we can extract the decrementnum from y.  */
>> +  if (GET_CODE (count) == LSHIFTRT && ARITHMETIC_P (XEXP (count, 0))
>> +      && (decrementnum = (1 << (INTVAL (XEXP (count, 1)))))
> Why are you calculating decrementnum inside the condition?

Ahh ok so this is an artifact from a previous revision, where I also 
included the:

`decrementnum != arm_mve_get_vctp_lanes (PATTERN (vctp_insn))`

within that condition, but since that is now done late, I can move this 
down.

>> +      /* There is one final condition that needs to be met for the 
>> loop to be
>> +     transformable: dlstp/letp will continue looping until there are
>> +     elements still to process.  This can only work if the looping ends
>> +     when the element counter reaches zero and not some other value
>> +     (e.g. n > 0 works, not n > 1), or we can incorrectly end up 
>> running
>> +     one additional iteration.  To by-pass any hoisting that the 
>> compiler
>> +     may have done with the `A` in `count` above, we can instead 
>> look up
>> +     to the bb before the loop preheader: this should end with a 
>> cmp+jump
>> +     pair, where the cmp needs to be with (const_int 0).  */
>
> I'm wondering whether it would be possible to subtract a non-zero 
> const from count. But that might be dangerous...
>
> Do you have an example/test case where you saw this happen?

So this can happen in situations like:

     while (n > 1)
     {
         mve_pred16_t p = vctp8q (n);
         uint8x16_t va = vldrbq_z_u8 (a, p);
         uint8x16_t vb = vldrbq_z_u8 (b, p);
         uint8x16_t vc = vaddq_x_u8 (va, vb, p);
         vstrbq_p_u8 (c, vc, p);
         n-=16;
         a+=16;
         b+=16;
         c+=16;
     }

IIUC you mean that we'd end up with something like:

         sub r3, r3, #1
         dlstp.8 lr, r3

to enable loops that aren't terminating at zero.

I believe the answer is `No`, because even though we would execute the 
correct

number of iterations, we would use an incorrect predicate value:

e.g. if n == r3 == 18:

we'd subtract a const 1: r3 == 17

Iteration 1: use all 16 lanes: Correct.
Iteration 2: use 1 lane: Incorrect -- from the user code it looks like 
we should use 2.


Because the number of iterations and the predicate value here are tied, 
I think we

can only safely transform loops that terminate with a `>0`


On standard dls/le loops we do already subtract the non-zero const.

>
>> + && loop_preheader_edge (body->loop_father)->src->prev_bb
>> +      && BB_END (loop_preheader_edge (body->loop_father)->src->prev_bb)
>> +      && PREV_INSN (BB_END (loop_preheader_edge (body->loop_father)
>> +                ->src->prev_bb))
>> +      && INSN_P (PREV_INSN (BB_END (loop_preheader_edge 
>> (body->loop_father)
>> +                    ->src->prev_bb)))
>> +      && (initial_compare
>> +      = PATTERN (PREV_INSN (BB_END (loop_preheader_edge 
>> (body->loop_father)
>> +                        ->src->prev_bb))))
>> +      && GET_CODE (initial_compare) == SET
>> +      && cc_register (XEXP (initial_compare, 0), VOIDmode)
>> +      && GET_CODE (XEXP (initial_compare, 1)) == COMPARE
>> +      && CONST_INT_P (XEXP (XEXP (initial_compare, 1), 1))
>> +      && INTVAL (XEXP (XEXP (initial_compare, 1), 1)) == 0)
>> +    {
>> +      /* Find the vctp predicate generation inside the loop body 
>> BB.  */
>> +      rtx_insn *vctp_insn = arm_mve_get_loop_unique_vctp (body);
>> +
>> +      /* If we have successfully found one exactly vctp 
>> predicate-generating
>> +     instruction within the loop and the number by which we 
>> deprecate the
>> +     loop counter in each iteration matches the number of lanes of the
>> +     vctp instruction, we can attempt to turn this into a dlstp/letp 
>> loop.
>> +     */
>> +      if (vctp_insn
>> +      && decrementnum == arm_mve_get_vctp_lanes (PATTERN (vctp_insn)))
>> +    {
> I would exit early here, so you don't need to indent the rest of the 
> code, by that I mean something like:
>
> if (!vectp_insn
>      || decrementnum != ...)
>   return GEN_INT (1);
>
> .. rest of the code.
Done, thanks!
>
>> + rtx_insn *insn = 0;
>> +      rtx_insn *cur_insn = 0;
>> +      rtx_insn *seq;
>> +      rtx vctp_vpr_generated = NULL_RTX;
>> +      rtx insn_vpr_reg_operand = NULL_RTX;
>> +      bool transform_worked = true;
> Won't need transform_worked.
Done
>> +      int new_icode;
>> +
>> +      /* Scan through the insns in the loop bb and emit the 
>> transformed bb
>> +         insns to a sequence.  */
>> +      start_sequence ();
>> +      FOR_BB_INSNS (body, insn)
>> +        {
>> +          if (INSN_P (insn))
> This also captures DEBUG_INSNs, which means passing -g disables this 
> feature.


Fixed: description of this is later on

>> +        {
>> +          /* When we find the vctp instruction: This may be followed by
>> +             a sign-extend insn to SImode.  If it is, then save the
>> +             sign-extended REG into vctp_vpr_generated.  If there is no
>> +             sign-extend, then store the raw output of the vctp.
>> +             For any VPT-predicated instructions we need to ensure that
>> +             the VPR they use is the same as the one given here and
>> +             they often consume the output of a subreg of the SImode
>> +             sign-extended VPR-reg.  As a result, comparing against the
>> +             output of the sign-extend is more likely to succeed.
>> +             This code also guarantees to us that the vctp comes before
>> +             any instructions that use the VPR within the loop, for the
>> +             dlstp/letp transform to succeed.  */
>> +          if (insn == vctp_insn)
>> +            {
>> +              if (GET_CODE (XEXP (PATTERN (NEXT_INSN (insn)), 1))
>> +                  == SIGN_EXTEND
>> +              && GET_CODE (XEXP (
>> +                 PATTERN (NEXT_INSN (NEXT_INSN (insn))), 1))
>> +                 == SUBREG)
>> +            vctp_vpr_generated
>> +                = XEXP (PATTERN (NEXT_INSN (NEXT_INSN (insn))), 0);
>> +              else
>> +            vctp_vpr_generated = XEXP (PATTERN (insn), 0);
>> +              /* Also emit a USE of the source register of the vctp.
>> +             This holds the number of elements being processed
>> +             by the loop.  This later gets stored into `count`.
>> +             */
>> +              emit_use (XVECEXP (XEXP (PATTERN (insn), 1), 0, 0));
> What if we get here but don't end up creating a predicated do-loop? 
> Will this use break something?

Nope, it won't break anything because of the use of 
start_sequence/end_sequence. The use and all

the transform insns get emitted to the sequence, and then only if we 
have fully succeeded, we

will wipe all the contents of the `bb` and put in the sequence (incl. 
the use).


The `use` gets deleted later in `arm_allow_elementwise_doloop_p` and 
really it's just a dodgy way

for me to preserve the REG used in the vctp, so that later I can make it 
the `count` or the number

of elements to be processed by the loop (the `dlstp lr, REG` register)

>> +       continue;
>> +            }
>> +          /* If the insn pattern requires the use of the VPR, then it
> Missing an is.
Done
>> +      a VPT-predicated instruction, so it will need to be
>> +             transformed into the non-predicated version of the
>> +             instruction.  */
> But this comment seems misplace here.
Adjusted this :)
>> + else if ((insn_vpr_reg_operand
>> +                = arm_get_required_vpr_reg (insn))
>> +               != NULL_RTX)
>> +            {
>> +              /* If the VPR value is different to the one generated by
>> +             the vctp, then fail the conversion.  */
>> +              if (!rtx_equal_p (vctp_vpr_generated,
>> +                    insn_vpr_reg_operand))
>> +            {
>> +              transform_worked = false;
>> +              break;
> return GEN_INT (1);
Done -- but also needed an end_sequece ();
>> +     }
>> +              /* Also ensure that it's a valid recog-ed instruction 
>> with
>> +             the mve_unpredicated_insn atrribute.  */
>> +              else if (recog_memoized (insn) >= 0
>> +                   && (new_icode
>> +                   = get_attr_mve_unpredicated_insn (insn)))
>> +            {
>> +              extract_insn (insn);
>> +              rtx arr[8];
>> +              int j = 0;
>> +
>> +              /* When transforming a VPT-predicated instruction
>> +                 into its unpredicated equivalent we need to drop
>> +                 the VPR operand and we may need to also drop a
>> +                 merge "vuninit" input operand, depending on the
>> +                 instruction pattern.  Here ensure that we have at
>> +                 most a two-operand difference between the two
>> +                 instrunctions.  */
>> +              int n_operands_diff
>> +                  = recog_data.n_operands
>> +                - insn_data[new_icode].n_operands;
>> +              gcc_assert (n_operands_diff > 0
>> +                      && n_operands_diff <= 2);
>> +
>> +              /* Then, loop through the operands of the predicated
>> +                 instruction, and retain the ones that map to the
>> +                 unpredicated instruction.  */
>> +              for (int i = 0; i < recog_data.n_operands; i++)
>> +                {
>> +                  /* Ignore the VPR and, if needed, the vuninit
>> +                 operand.  */
>> +                  if (insn_vpr_reg_operand == recog_data.operand[i]
>> +                  || (n_operands_diff == 2
>> +                      && !strcmp (recog_data.constraints[i],
>> +                          "0")))
>> +                continue;
>> +                  else
>> +                {
>> +                  arr[j] = recog_data.operand[i];
>> +                  j++;
>> +                }
>> +                }
>> +
>> +              /* Finally, emit the upredicated instruction.  */
>> +              switch (j)
>> +                {
>> +                  case 2:
>> +                emit_insn (GEN_FCN (new_icode) (arr[0],
>> +                                arr[1]));
>> +                break;
>> +                  case 3:
>> +                emit_insn (GEN_FCN (new_icode) (arr[0], arr[1],
>> +                                arr[2]));
>> +                break;
>> +                  default:
>> +                gcc_unreachable ();
>> +                }
>> +            }
>> +              /* If we can't identify the INSN as either being either
>> +             for deletion or to re-map, then we don't know how to
>> +             handle it, so fail the whole conversion.  */
>> +              else
>> +            {
>> +              transform_worked = false;
>> +              break;
> use
> return GEN_INT (1);
Done.
>> +     }
>> +            }
>> +          /* Instructions that dont's require the VPR can be carried
>> +             over as-is.  */
>> +          else
>> +            emit_insn (PATTERN (insn));
>> +        }
>> +        }
>> +      seq = get_insns ();
>> +      end_sequence ();
>> +
>> +      if (transform_worked)
>> +        {
> no need to check this, you can only get here if it worked.
Done.
>> + /* Re-write the entire BB contents with the transformed
>> +         sequence.  */
>> +          FOR_BB_INSNS_SAFE (body, insn, cur_insn)
>> +        if (INSN_P (insn))
>> +          delete_insn (insn);
> This will also delete DEBUG_INSN's! You'd probably want to delete only 
> NONDEBUG_INSN_P (insn). I'm not an expert in how DEBUG_INSNs work but 
> I suspect their order compared to non-debug insns are likely to be 
> important, so really you'd want change how you 'transform' the BB and 
> do inline insn replacement.
You are correct, thanks for spotting this!

So I do want to also delete debug_insns, because what I want to do is 
replace all the

bb contents with the previous sequence.

Instead, though I need to make sure that DEBUG_INSNs get put into the 
sequence

correctly (which wasn't working).


I also found a similar situation with NOTEs (which are !INSN_Ps), if 
there were any

in the bb other than the NOTE_INSN_BASIC_BLOCK, they would get filtered 
up to

the start of the bb. This should also be fixed now.

>> + for (insn = seq; NEXT_INSN (insn); insn = NEXT_INSN (insn))
>> +        emit_insn_after (PATTERN (insn), BB_END (body));
>> +          emit_jump_insn_after (PATTERN (insn), BB_END (body));
>> +          return GEN_INT (decrementnum);
>> +        }
>> +    }
>> +    }
>> +  /* Bail out: we can't use dlstp/letp, so return 1 to allow 
>> loop-doloop to try
>> +     the standard dls/le pair.  */
>> +  return GEN_INT (1);
>> +}
>>
> Only reviewed until here, will look at the rest later.
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f2e3d905dbb..7a6d24a8006 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -7366,7 +7366,8 @@ 
 ;; knows what to generate.
 (define_expand "doloop_end"
   [(use (match_operand 0 "" ""))      ; loop pseudo
-   (use (match_operand 1 "" ""))]     ; label
+   (use (match_operand 1 "" ""))      ; label
+   (use (match_operand 2 "" ""))]     ; decrement constant
   "optimize > 0 && flag_modulo_sched"
 {
   rtx s0;
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index 458d3edf716..a4b3d1addbf 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -4985,12 +4985,13 @@  archs4x, archs4xd"
 		    (pc)))
 	      (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1)))
 	      (unspec:SI [(const_int 0)] UNSPEC_ARC_LP)
-	      (clobber (match_dup 2))])]
+	      (clobber (match_dup 3))
+	      (match_operand 2 "" "")])]
   ""
 {
  if (GET_MODE (operands[0]) != SImode)
    FAIL;
- operands[2] = gen_rtx_SCRATCH (SImode);
+ operands[3] = gen_rtx_SCRATCH (SImode);
 })
 
 (define_insn "arc_lp"
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 550272facd1..7684620f0f4 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -63,7 +63,7 @@  extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
 extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern rtx arm_attempt_dlstp_transform (rtx, rtx);
 #ifdef RTX_CODE
 enum reg_class
 arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index ee8f1babf8a..72d0187eb00 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -470,6 +470,9 @@  static const struct attribute_spec arm_attribute_table[] =
 #undef TARGET_SCHED_REORDER
 #define TARGET_SCHED_REORDER arm_sched_reorder
 
+#undef TARGET_ALLOW_ELEMENTWISE_DOLOOP_P
+#define TARGET_ALLOW_ELEMENTWISE_DOLOOP_P arm_allow_elementwise_doloop_p
+
 #undef TARGET_REGISTER_MOVE_COST
 #define TARGET_REGISTER_MOVE_COST arm_register_move_cost
 
@@ -34138,8 +34141,370 @@  arm_target_insn_ok_for_lob (rtx insn)
 
   return single_succ_p (bb)
     && single_pred_p (bb)
-    && single_succ_edge (bb)->dest == single_pred_edge (bb)->src
-    && contains_no_active_insn_p (bb);
+    && single_succ_edge (bb)->dest == single_pred_edge (bb)->src;
+}
+
+static int
+arm_mve_get_vctp_lanes (rtx x)
+{
+  if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC)
+    {
+      switch (XINT (XEXP (x, 1), 1))
+	{
+	  case VCTP8Q:
+	    return 16;
+	  case VCTP16Q:
+	    return 8;
+	  case VCTP32Q:
+	    return 4;
+	  case VCTP64Q:
+	    return 2;
+	  default:
+	    break;
+	}
+    }
+  return 0;
+}
+
+/* Check if an insn requires the use of the VPR_REG, if it does, return the
+   sub-rtx of the matched operand.  If there are more than one operand (e.g. an
+   input operand and an output operand) that use VPR_REG, return the first
+   occurance, which is usually the output operand.  */
+
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn)
+{
+  if (!NONJUMP_INSN_P (insn))
+    return NULL_RTX;
+
+  bool requires_vpr;
+  extract_constrain_insn (insn);
+  int n_operands = recog_data.n_operands;
+  if (recog_data.n_alternatives == 0)
+    return NULL_RTX;
+
+  /* Fill in recog_op_alt with information about the constraints of
+     this insn.  */
+  preprocess_constraints (insn);
+
+  for (int use = 0; use < n_operands; use++)
+    {
+      requires_vpr = true;
+      /* Iterate through alternatives of operand "use" in recog_op_alt and
+	 identify if the operand is required to be the VPR.  */
+      for (int alt1 = 0; alt1 < recog_data.n_alternatives; alt1++)
+	{
+	  const operand_alternative *op_alt1
+	      = &recog_op_alt[alt1 * n_operands];
+	  /* Fetch the reg_class for each entry and check it against the
+	   * VPR_REG reg_class.  */
+	  if (alternative_class (op_alt1, use) != VPR_REG)
+	    requires_vpr = false;
+	}
+      /* If all alternatives of the insn require the VPR reg for this operand,
+	 it means that either this is VPR-generating instruction, like a vctp,
+	 vcmp, etc., or it is a VPT-predicated insruction.  Return the subrtx
+	 of the VPR reg operand.  */
+      if (requires_vpr)
+	return recog_data.operand[use];
+    }
+  return NULL_RTX;
+}
+
+/* Scan the basic block of a loop body for a vctp instruction.  If there is
+   exactly one unique vctp instruction, return its rtx_insn *.  */
+
+static rtx_insn *
+arm_mve_get_loop_unique_vctp (basic_block bb)
+{
+  rtx_insn *insn = BB_HEAD (bb);
+  rtx_insn *vctp_op = NULL;
+
+  /* Now scan through all the instruction patterns and
+     pick out any MVE instructions.  */
+  FOR_BB_INSNS (bb, insn)
+    {
+      if (INSN_P (insn))
+	{
+	  /* First check if this is a vctp instruction.  There needs to be
+	     exactly one vctp instruction within the loop.  */
+	  if (arm_mve_get_vctp_lanes (PATTERN (insn)) != 0)
+	    {
+	      /* If we already found one vctp instruction, then the
+		 loop is not consistent internally.  */
+	      if (vctp_op)
+		return NULL;
+
+	      vctp_op = insn;
+	    }
+	}
+    }
+  return vctp_op;
+}
+
+rtx
+arm_attempt_dlstp_transform (rtx label, rtx count)
+{
+  int decrementnum;
+  basic_block body = BLOCK_FOR_INSN (label)->prev_bb;
+  rtx initial_compare;
+  /* Doloop can only be done "elementwise" with predicated dlstp/letp
+     when the iteration counter gets decremented by the number of MVE
+     lanes.  This can be extracted from the `count`, which is the expression
+     used to calculate the number of iterations that the loop would execute
+     for a standard dls/le loop.  Since we only support cases where this is a
+     power of 2, we can assume that this expression arrives here as:
+       (lshiftrt: (A) (const_int y))
+     Then we can extract the decrementnum from y.  */
+  if (GET_CODE (count) == LSHIFTRT && ARITHMETIC_P (XEXP (count, 0))
+      /* There is one final condition that needs to be met for the loop to be
+	 transformable: dlstp/letp will continue looping until there are
+	 elements still to process.  This can only work if the looping ends
+	 when the element counter reaches zero and not some other value
+	 (e.g. n > 0 works, not n > 1), or we can incorrectly end up running
+	 one additional iteration.  To by-pass any hoisting that the compiler
+	 may have done with the `A` in `count` above, we can instead look up
+	 to the bb before the loop preheader: this should end with a cmp+jump
+	 pair, where the cmp needs to be with (const_int 0).  */
+      && loop_preheader_edge (body->loop_father)->src->prev_bb
+      && BB_END (loop_preheader_edge (body->loop_father)->src->prev_bb)
+      && PREV_INSN (BB_END (loop_preheader_edge (body->loop_father)
+				->src->prev_bb))
+      && INSN_P (PREV_INSN (BB_END (loop_preheader_edge (body->loop_father)
+					->src->prev_bb)))
+      && (initial_compare
+	  = PATTERN (PREV_INSN (BB_END (loop_preheader_edge (body->loop_father)
+					    ->src->prev_bb))))
+      && GET_CODE (initial_compare) == SET
+      && cc_register (XEXP (initial_compare, 0), VOIDmode)
+      && GET_CODE (XEXP (initial_compare, 1)) == COMPARE
+      && CONST_INT_P (XEXP (XEXP (initial_compare, 1), 1))
+      && INTVAL (XEXP (XEXP (initial_compare, 1), 1)) == 0)
+    {
+      /* Extract the integer decrement from the LSHIFTR condition.  */
+      decrementnum = (1 << (INTVAL (XEXP (count, 1))));
+      /* Find the vctp predicate generation inside the loop body BB.  */
+      rtx_insn *vctp_insn = arm_mve_get_loop_unique_vctp (body);
+      /* If we have successfully found one exactly vctp predicate-generating
+	 instruction within the loop and the number by which we deprecate the
+	 loop counter in each iteration matches the number of lanes of the
+	 vctp instruction, we can attempt to turn this into a dlstp/letp loop.
+	 */
+      if (!vctp_insn
+	  || decrementnum != arm_mve_get_vctp_lanes (PATTERN (vctp_insn)))
+	return GEN_INT (1);
+
+      rtx_insn *insn = 0;
+      rtx_insn *cur_insn = 0;
+      rtx_insn *seq;
+      rtx vctp_vpr_generated = NULL_RTX;
+      rtx insn_vpr_reg_operand = NULL_RTX;
+      int new_icode;
+
+      /* Scan through the insns in the loop bb and emit the transformed bb
+	 insns to a sequence.  */
+      start_sequence ();
+      FOR_BB_INSNS (body, insn)
+	{
+	  if (GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))
+	    continue;
+	  else if (NOTE_P (insn))
+	    emit_note ((enum insn_note) NOTE_KIND (insn));
+	  else if (!INSN_P (insn))
+	    {
+	      end_sequence ();
+	      return GEN_INT (1);
+	    }
+	     /* When we find the vctp instruction: This may be followed by
+	     a sign-extend insn to SImode.  If it is, then save the
+	     sign-extended REG into vctp_vpr_generated.  If there is no
+	     sign-extend, then store the raw output of the vctp.
+	     For any VPT-predicated instructions we need to ensure that
+	     the VPR they use is the same as the one given here and
+	     they often consume the output of a subreg of the SImode
+	     sign-extended VPR-reg.  As a result, comparing against the
+	     output of the sign-extend is more likely to succeed.
+	     This code also guarantees to us that the vctp comes before
+	     any instructions that use the VPR within the loop, for the
+	     dlstp/letp transform to succeed.  */
+	  else if (insn == vctp_insn)
+	    {
+	      if (GET_CODE (
+		      XEXP (PATTERN (next_nonnote_nondebug_insn_bb (insn)), 1))
+		      == SIGN_EXTEND
+		  && GET_CODE (
+			 XEXP (PATTERN (next_nonnote_nondebug_insn_bb (
+				   next_nonnote_nondebug_insn_bb (insn))),
+			       1))
+			 == SUBREG)
+		vctp_vpr_generated
+		    = XEXP (PATTERN (next_nonnote_nondebug_insn_bb (
+				next_nonnote_nondebug_insn_bb (insn))),
+			    0);
+	      else
+		vctp_vpr_generated = XEXP (PATTERN (insn), 0);
+	      /* Also emit a USE of the source register of the vctp.
+		 This holds the number of elements being processed
+		 by the loop.  This later gets stored into `count`.
+		 */
+	      emit_use (XVECEXP (XEXP (PATTERN (insn), 1), 0, 0));
+	      continue;
+	    }
+	  /* If the insn pattern requires the use of the VPR, then it
+	     is a VPT-predicated instruction.  */
+	  else if ((insn_vpr_reg_operand = arm_get_required_vpr_reg (insn))
+		   != NULL_RTX)
+	    {
+	      /* If the VPR value is different to the one generated by
+		 the vctp, then fail the conversion.  */
+	      if (!rtx_equal_p (vctp_vpr_generated, insn_vpr_reg_operand))
+		{
+		  end_sequence ();
+		  return GEN_INT (1);
+		}
+	      /* If the insn does use the same VPR as the one generated
+		 by the vctp, it will need to be transformed into its
+		 non-predicated version.  Also ensure that it's a valid
+		 recog-ed instruction with the mve_unpredicated_insn
+		 atrribute.  */
+	      else if (recog_memoized (insn) >= 0
+		       && (new_icode = get_attr_mve_unpredicated_insn (insn)))
+		{
+		  extract_insn (insn);
+		  rtx arr[8];
+		  int j = 0;
+
+		  /* When transforming a VPT-predicated instruction
+		     into its unpredicated equivalent we need to drop
+		     the VPR operand and we may need to also drop a
+		     merge "vuninit" input operand, depending on the
+		     instruction pattern.  Here ensure that we have at
+		     most a two-operand difference between the two
+		     instrunctions.  */
+		  int n_operands_diff = recog_data.n_operands
+					- insn_data[new_icode].n_operands;
+		  gcc_assert (n_operands_diff > 0 && n_operands_diff <= 2);
+
+		  /* Then, loop through the operands of the predicated
+		     instruction, and retain the ones that map to the
+		     unpredicated instruction.  */
+		  for (int i = 0; i < recog_data.n_operands; i++)
+		    {
+		      /* Ignore the VPR and, if needed, the vuninit
+			 operand.  */
+		      if (insn_vpr_reg_operand == recog_data.operand[i]
+			  || (n_operands_diff == 2
+			      && !strcmp (recog_data.constraints[i], "0")))
+			continue;
+		      else
+			{
+			  arr[j] = recog_data.operand[i];
+			  j++;
+			}
+		    }
+
+		  /* Finally, emit the upredicated instruction.  */
+		  switch (j)
+		    {
+		      case 1:
+			emit_insn (GEN_FCN (new_icode) (arr[0]));
+			break;
+		      case 2:
+			emit_insn (GEN_FCN (new_icode) (arr[0], arr[1]));
+			break;
+		      case 3:
+			emit_insn (GEN_FCN (new_icode) (arr[0], arr[1],
+							arr[2]));
+			break;
+		      case 4:
+			emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+							arr[3]));
+			break;
+		      case 5:
+			emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+							arr[3], arr[4]));
+			break;
+		      case 6:
+			emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+							arr[3], arr[4],
+							arr[5]));
+			break;
+		      case 7:
+			emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+							arr[3], arr[4], arr[5],
+							arr[6]));
+			break;
+		      default:
+			gcc_unreachable ();
+		    }
+		}
+	      /* If we can't identify the INSN as either being either
+		 for deletion or to re-map, then we don't know how to
+		 handle it, so fail the whole conversion.  */
+	      else
+		{
+		  end_sequence ();
+		  return GEN_INT (1);
+		}
+	    }
+	  /* Instructions that dont's require the VPR can be carried
+	     over as-is.  */
+	  else if (DEBUG_INSN_P (insn))
+	    emit_debug_insn (PATTERN (insn));
+	  else
+	    emit_insn (PATTERN (insn));
+	}
+      seq = get_insns ();
+      end_sequence ();
+
+      /* Re-write the entire BB contents with the transformed
+	 sequence.  */
+      FOR_BB_INSNS_SAFE (body, insn, cur_insn)
+	if (!(GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn)))
+	  delete_insn (insn);
+      for (insn = seq; NEXT_INSN (insn); insn = NEXT_INSN (insn))
+	if (NOTE_P (insn))
+	  emit_note_after ((enum insn_note) NOTE_KIND (insn),  BB_END (body));
+	else if (DEBUG_INSN_P (insn))
+	  emit_debug_insn_after (PATTERN (insn), BB_END (body));
+	else
+	  emit_insn_after (PATTERN (insn), BB_END (body));
+
+      emit_jump_insn_after (PATTERN (insn), BB_END (body));
+      return GEN_INT (decrementnum);
+    }
+  /* Bail out: we can't use dlstp/letp, so return 1 to allow loop-doloop to try
+     the standard dls/le pair.  */
+  return GEN_INT (1);
+}
+
+/* Target hook to the number of elements to be processed by a dlstp/letp loop
+   into `count` to intialise the counter register.  The number of elements was
+   previously extracted from the vctp insn and placed into a USE rtx.
+   We only check that the doloop_end pattern successfully decrements by a
+   number other than -1 for a valid dlstp/letp loop.  No other checking is
+   needed as that was done previously.  */
+
+rtx
+arm_allow_elementwise_doloop_p (rtx count, rtx label, rtx doloop)
+{
+  if (doloop
+      && INTVAL (XEXP (SET_SRC (XVECEXP (PATTERN (doloop), 0, 1)), 1)) != -1
+      && ARITHMETIC_P (XEXP (count, 0)))
+    {
+      basic_block body = BLOCK_FOR_INSN (label)->prev_bb;
+      rtx_insn* insn;
+      FOR_BB_INSNS (body, insn)
+	{
+	  if (INSN_P (insn) && GET_CODE (PATTERN (insn)) == USE)
+	    {
+	      rtx num_elem_reg = copy_rtx (XEXP (PATTERN (insn), 0));
+	      delete_insn (insn);
+	      return num_elem_reg;
+	    }
+	}
+    }
+  return count;
 }
 
 #if CHECKING_P
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 29062cd6fb3..aa2fdac22f3 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1464,7 +1464,9 @@ 
 
 (define_int_attr mode1 [(VCTP8Q "8") (VCTP16Q "16") (VCTP32Q "32")
 			(VCTP64Q "64") (VCTP8Q_M "8") (VCTP16Q_M "16")
-			(VCTP32Q_M "32") (VCTP64Q_M "64")])
+			(VCTP32Q_M "32") (VCTP64Q_M "64")
+			(DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32")
+			(DLSTP64 "64")])
 
 ;; Both kinds of return insn.
 (define_code_iterator RETURNS [return simple_return])
@@ -1773,6 +1775,8 @@ 
 (define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
 (define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
 (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
+(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32
+				   DLSTP64])
 
 ;; Define iterators for VCMLA operations
 (define_int_iterator VCMLA_OP [UNSPEC_VCMLA
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index b1c8c1c569f..3baddab5905 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -10837,3 +10837,38 @@ 
     }
   DONE;
 })
+
+;; Originally expanded by 'predicated_doloop_end'.
+(define_insn "*predicated_doloop_end_internal"
+  [(set (pc)
+	(if_then_else
+	   (ge (plus:SI (reg:SI LR_REGNUM)
+			(match_operand:SI 0 "const_int_operand" ""))
+		(const_int 0))
+	 (label_ref (match_operand 1 "" ""))
+	 (pc)))
+   (set (reg:SI LR_REGNUM)
+	(plus:SI (reg:SI LR_REGNUM) (match_dup 0)))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_32BIT && TARGET_HAVE_LOB && TARGET_HAVE_MVE && TARGET_THUMB2"
+  {
+    if (get_attr_length (insn) == 4)
+      return "letp\t%|lr, %l1";
+    else
+      return "subs\t%|lr, #%0;bgt\t%l1";
+  }
+  [(set (attr "length")
+	(if_then_else
+	   (ltu (minus (pc) (match_dup 1)) (const_int 1024))
+	    (const_int 4)
+	    (const_int 6)))
+   (set_attr "type" "branch")])
+
+(define_insn "dlstp<mode1>_insn"
+  [
+    (set (reg:SI LR_REGNUM)
+	 (unspec:SI [(match_operand:SI 0 "s_register_operand" "r")]
+	  DLSTP))
+  ]
+  "TARGET_32BIT && TARGET_HAVE_LOB && TARGET_HAVE_MVE && TARGET_THUMB2"
+  "dlstp.<mode1>\t%|lr, %0")
\ No newline at end of file
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index b2309a52165..a21e2909872 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1610,10 +1610,11 @@ 
 ;; knows what to generate.
 (define_expand "doloop_end"
   [(use (match_operand 0 "" ""))      ; loop pseudo
-   (use (match_operand 1 "" ""))]     ; label
+   (use (match_operand 1 "" ""))      ; label
+   (use (match_operand 2 "" ""))]     ; decrement constant
   "TARGET_32BIT"
   "
- {
+{
    /* Currently SMS relies on the do-loop pattern to recognize loops
       where (1) the control part consists of all insns defining and/or
       using a certain 'count' register and (2) the loop count can be
@@ -1623,41 +1624,68 @@ 
 
       Also used to implement the low over head loops feature, which is part of
       the Armv8.1-M Mainline Low Overhead Branch (LOB) extension.  */
-   if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
-   {
-     rtx s0;
-     rtx bcomp;
-     rtx loc_ref;
-     rtx cc_reg;
-     rtx insn;
-     rtx cmp;
-
-     if (GET_MODE (operands[0]) != SImode)
-       FAIL;
-
-     s0 = operands [0];
-
-     /* Low over head loop instructions require the first operand to be LR.  */
-     if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1]))
-       s0 = gen_rtx_REG (SImode, LR_REGNUM);
-
-     if (TARGET_THUMB2)
-       insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
-     else
-       insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
-
-     cmp = XVECEXP (PATTERN (insn), 0, 0);
-     cc_reg = SET_DEST (cmp);
-     bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
-     loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [1]);
-     emit_jump_insn (gen_rtx_SET (pc_rtx,
-                                  gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
-                                                        loc_ref, pc_rtx)));
-     DONE;
-   }
- else
-   FAIL;
- }")
+  if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
+    {
+      rtx s0;
+      rtx bcomp;
+      rtx loc_ref;
+      rtx cc_reg;
+      rtx insn;
+      rtx cmp;
+      rtx decrement_num;
+
+      if (GET_MODE (operands[0]) != SImode)
+	FAIL;
+
+      s0 = operands[0];
+
+       if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands[1]))
+	{
+	  s0 = gen_rtx_REG (SImode, LR_REGNUM);
+
+	  /* If we have a compatibe MVE target, try and analyse the loop
+	     contents to determine if we can use predicated dlstp/letp
+	     looping.  */
+	  if (TARGET_HAVE_MVE && TARGET_THUMB2
+	      && (decrement_num = arm_attempt_dlstp_transform (operands[1],
+							       operands[2]))
+	      && (INTVAL (decrement_num) != 1))
+	    {
+	      insn = emit_insn
+		      (gen_thumb2_addsi3_compare0
+			  (s0, s0, GEN_INT ((-1) * (INTVAL (decrement_num)))));
+	      cmp = XVECEXP (PATTERN (insn), 0, 0);
+	      cc_reg = SET_DEST (cmp);
+	      bcomp = gen_rtx_GE (VOIDmode, cc_reg, const0_rtx);
+	      loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+	      emit_jump_insn (gen_rtx_SET (pc_rtx,
+				       gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+							     loc_ref, pc_rtx)));
+	      DONE;
+	    }
+
+	  /* Otherwise, try standard decrement-by-one dls/le looping.  */
+	  if (TARGET_THUMB2)
+	    insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0,
+							  GEN_INT (-1)));
+	  else
+	    insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
+
+	  cmp = XVECEXP (PATTERN (insn), 0, 0);
+	  cc_reg = SET_DEST (cmp);
+	  bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
+	  loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+	  emit_jump_insn (gen_rtx_SET (pc_rtx,
+				       gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+							     loc_ref, pc_rtx)));
+	  DONE;
+	}
+      else
+	FAIL;
+    }
+  else
+    FAIL;
+}")
 
 (define_insn "*clear_apsr"
   [(unspec_volatile:SI [(const_int 0)] VUNSPEC_CLRM_APSR)
@@ -1755,7 +1783,37 @@ 
   {
     if (REGNO (operands[0]) == LR_REGNUM)
       {
-	emit_insn (gen_dls_insn (operands[0]));
+	/* Pick out the number by which we are decrementing the loop counter
+	   in every iteration.  If it's > 1, then use dlstp.  */
+	int const_int_dec_num
+	     = abs (INTVAL (XEXP (XEXP (XVECEXP (PATTERN (operands[1]), 0, 1),
+				  1),
+			    1)));
+	switch (const_int_dec_num)
+	  {
+	    case 16:
+	      emit_insn (gen_dlstp8_insn (operands[0]));
+	      break;
+
+	    case 8:
+	      emit_insn (gen_dlstp16_insn (operands[0]));
+	      break;
+
+	    case 4:
+	      emit_insn (gen_dlstp32_insn (operands[0]));
+	      break;
+
+	    case 2:
+	      emit_insn (gen_dlstp64_insn (operands[0]));
+	      break;
+
+	    case 1:
+	      emit_insn (gen_dls_insn (operands[0]));
+	      break;
+
+	    default:
+	      gcc_unreachable ();
+	  }
 	DONE;
       }
     else
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 7748e784379..744e7ab5731 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -579,6 +579,10 @@ 
   VCTP16Q
   VCTP32Q
   VCTP64Q
+  DLSTP8
+  DLSTP16
+  DLSTP32
+  DLSTP64
   VPNOT
   VCREATEQ_F
   VCVTQ_N_TO_F_S
diff --git a/gcc/config/bfin/bfin.md b/gcc/config/bfin/bfin.md
index 56b24726bc2..3b31ee3a1ee 100644
--- a/gcc/config/bfin/bfin.md
+++ b/gcc/config/bfin/bfin.md
@@ -1959,7 +1959,8 @@ 
 		   (plus:SI (match_dup 0)
 			    (const_int -1)))
 	      (unspec [(const_int 0)] UNSPEC_LSETUP_END)
-	      (clobber (match_dup 2))
+	      (clobber (match_dup 3))
+	      (match_operand 2 "" "")
 	      (clobber (reg:BI REG_CC))])] ; match_scratch
   ""
 {
@@ -1967,7 +1968,7 @@ 
   if (GET_MODE (operands[0]) != SImode)
     FAIL;
   bfin_hardware_loop ();
-  operands[2] = gen_rtx_SCRATCH (SImode);
+  operands[3] = gen_rtx_SCRATCH (SImode);
 })
 
 (define_insn "loop_end"
diff --git a/gcc/config/c6x/c6x.md b/gcc/config/c6x/c6x.md
index 60110410d0b..de4f4a56d99 100644
--- a/gcc/config/c6x/c6x.md
+++ b/gcc/config/c6x/c6x.md
@@ -1429,13 +1429,14 @@ 
 	      (set (match_dup 0)
 		   (plus:SI (match_dup 0)
 			    (const_int -1)))
-	      (clobber (match_dup 2))])] ; match_scratch
+	      (clobber (match_dup 3)) ; match_scratch
+	      (match_operand 2 "" "")])]
   "TARGET_INSNS_64PLUS && optimize"
 {
   /* The loop optimizer doesn't check the predicates... */
   if (GET_MODE (operands[0]) != SImode)
     FAIL;
-  operands[2] = gen_rtx_SCRATCH (SImode);
+  operands[3] = gen_rtx_SCRATCH (SImode);
 })
 
 (define_insn "mvilc"
diff --git a/gcc/config/ia64/ia64.md b/gcc/config/ia64/ia64.md
index 5d1d47da55b..5b24bd76ace 100644
--- a/gcc/config/ia64/ia64.md
+++ b/gcc/config/ia64/ia64.md
@@ -3956,7 +3956,8 @@ 
 
 (define_expand "doloop_end"
   [(use (match_operand 0 "" ""))	; loop pseudo
-   (use (match_operand 1 "" ""))]	; label
+   (use (match_operand 1 "" ""))	; label
+   (use (match_operand 2 "" ""))]     ; decrement constant
   ""
 {
   if (GET_MODE (operands[0]) != DImode)
diff --git a/gcc/config/pdp11/pdp11.md b/gcc/config/pdp11/pdp11.md
index a46efc1ef78..aa359955c3c 100644
--- a/gcc/config/pdp11/pdp11.md
+++ b/gcc/config/pdp11/pdp11.md
@@ -332,7 +332,8 @@ 
 		    (pc)))
 	      (set (match_dup 0)
 		   (plus:HI (match_dup 0)
-			 (const_int -1)))])]
+			 (const_int -1)))
+	      (match_operand 2 "" "")])]
   "TARGET_40_PLUS"
   "{
     if (GET_MODE (operands[0]) != HImode)
diff --git a/gcc/config/pru/pru.md b/gcc/config/pru/pru.md
index bdc5ad79ba0..990cdfe0974 100644
--- a/gcc/config/pru/pru.md
+++ b/gcc/config/pru/pru.md
@@ -1636,7 +1636,8 @@ 
 
 (define_expand "doloop_end"
   [(use (match_operand 0 "nonimmediate_operand"))
-   (use (label_ref (match_operand 1 "")))]
+   (use (label_ref (match_operand 1 "")))
+   (use (match_operand 2 "" ""))]     ; decrement constant
   "TARGET_OPT_LOOP"
 {
   if (GET_CODE (operands[0]) == REG && GET_MODE (operands[0]) == QImode)
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index ad5a4cf2ef8..fdaeb6a02ca 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -13422,7 +13422,8 @@ 
 
 (define_expand "doloop_end"
   [(use (match_operand 0))	; loop pseudo
-   (use (match_operand 1))]	; label
+   (use (match_operand 1))	; label
+   (use (match_operand 2 "" ""))]     ; decrement constant
   ""
 {
   if (GET_MODE (operands[0]) != Pmode)
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 962927c3112..bad317fdba7 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -9780,7 +9780,8 @@ 
 
 (define_expand "doloop_end"
   [(use (match_operand 0 "" ""))        ; loop pseudo
-   (use (match_operand 1 "" ""))]       ; label
+   (use (match_operand 1 "" ""))        ; label
+   (use (match_operand 2 "" ""))]       ; decrement constant
   ""
 {
   if (GET_MODE (operands[0]) == SImode)
diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md
index 59a7b216433..5b8d74d6029 100644
--- a/gcc/config/sh/sh.md
+++ b/gcc/config/sh/sh.md
@@ -6404,7 +6404,8 @@ 
 				 (pc)))
 	      (set (match_dup 0)
 		   (plus:SI (match_dup 0) (const_int -1)))
-	      (clobber (reg:SI T_REG))])]
+	      (clobber (reg:SI T_REG))
+	      (match_operand 2 "" "")])]
   "TARGET_SH2"
 {
   if (GET_MODE (operands[0]) != SImode)
diff --git a/gcc/config/v850/v850.md b/gcc/config/v850/v850.md
index 6ca31e3f43f..8d0812abac7 100644
--- a/gcc/config/v850/v850.md
+++ b/gcc/config/v850/v850.md
@@ -1434,7 +1434,8 @@ 
 
 (define_expand "doloop_end"
  [(use (match_operand 0 "" ""))        ; loop pseudo
-  (use (match_operand 1 "" ""))]       ; label
+  (use (match_operand 1 "" ""))        ; label
+  (use (match_operand 2 "" ""))]       ; decrement constant
   "TARGET_V850E3V5_UP && TARGET_LOOP"
   {
     rtx loop_cnt = operands[0];
diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 608110c20bc..57a4e2ca353 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -2016,13 +2016,14 @@ 
                    (plus:SI (match_dup 0)
                             (const_int -1)))
               (unspec [(const_int 0)] UNSPEC_LSETUP_END)
-              (clobber (match_dup 2))])] ; match_scratch
+              (clobber (match_dup 3)) ; match_scratch
+	      (match_operand 2 "" "")])]
   "TARGET_LOOPS && optimize"
 {
   /* The loop optimizer doesn't check the predicates... */
   if (GET_MODE (operands[0]) != SImode)
     FAIL;
-  operands[2] = gen_rtx_SCRATCH (SImode);
+  operands[3] = gen_rtx_SCRATCH (SImode);
 })
 
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 110f8dfa0a9..a8cfcd7497d 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11774,6 +11774,14 @@  loops, and will help ivopts to make some decisions.
 The default version of this hook returns false.
 @end deftypefn
 
+@deftypefn {Target Hook} rtx TARGET_ALLOW_ELEMENTWISE_DOLOOP_P (rtx @var{count}, rtx @var{label}, rtx @var{doloop})
+This target hook allows the target to support loop-doloop optimisations
+where the value that gets put into the loop counter register is not a
+pre-calculation of the number of iteration of the loop.  For instance,
+the value used can be the number of elements that the loop will process.
+The default version of this hook returns the same rtx it was given.
+@end deftypefn
+
 @deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P
 Return true if the target supports hardware count register for decrement
 and branch.
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 501ddf147e4..024711cfdb9 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7730,6 +7730,8 @@  to by @var{ce_info}.
 
 @hook TARGET_PREDICT_DOLOOP_P
 
+@hook TARGET_ALLOW_ELEMENTWISE_DOLOOP_P
+
 @hook TARGET_HAVE_COUNT_REG_DECR_P
 
 @hook TARGET_DOLOOP_COST_FOR_GENERIC
diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc
index 30b45c8071a..cbbc662f16b 100644
--- a/gcc/loop-doloop.cc
+++ b/gcc/loop-doloop.cc
@@ -85,29 +85,29 @@  doloop_condition_get (rtx_insn *doloop_pat)
      forms:
 
      1)  (parallel [(set (pc) (if_then_else (condition)
-	  			            (label_ref (label))
-				            (pc)))
-	             (set (reg) (plus (reg) (const_int -1)))
-	             (additional clobbers and uses)])
+					    (label_ref (label))
+					    (pc)))
+		     (set (reg) (plus (reg) (const_int -n)))
+		     (additional clobbers and uses)])
 
      The branch must be the first entry of the parallel (also required
      by jump.cc), and the second entry of the parallel must be a set of
      the loop counter register.  Some targets (IA-64) wrap the set of
      the loop counter in an if_then_else too.
 
-     2)  (set (reg) (plus (reg) (const_int -1))
-         (set (pc) (if_then_else (reg != 0)
-	                         (label_ref (label))
-			         (pc))).  
+     2)  (set (reg) (plus (reg) (const_int -n))
+	 (set (pc) (if_then_else (reg != 0)
+				 (label_ref (label))
+				 (pc))).
 
      Some targets (ARM) do the comparison before the branch, as in the
      following form:
 
-     3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0)))
-                   (set (reg) (plus (reg) (const_int -1)))])
-        (set (pc) (if_then_else (cc == NE)
-                                (label_ref (label))
-                                (pc))) */
+     3) (parallel [(set (cc) (compare ((plus (reg) (const_int -n), 0)))
+		   (set (reg) (plus (reg) (const_int -n)))])
+	(set (pc) (if_then_else (cc == NE)
+				(label_ref (label))
+				(pc))) */
 
   pattern = PATTERN (doloop_pat);
 
@@ -143,7 +143,7 @@  doloop_condition_get (rtx_insn *doloop_pat)
 	      || GET_CODE (cmp_arg1) != PLUS)
 	    return 0;
 	  reg_orig = XEXP (cmp_arg1, 0);
-	  if (XEXP (cmp_arg1, 1) != GEN_INT (-1) 
+	  if (!CONST_INT_P (XEXP (cmp_arg1, 1))
 	      || !REG_P (reg_orig))
 	    return 0;
 	  cc_reg = SET_DEST (cmp_orig);
@@ -156,7 +156,8 @@  doloop_condition_get (rtx_insn *doloop_pat)
 	{
 	  /* We expect the condition to be of the form (reg != 0)  */
 	  cond = XEXP (SET_SRC (cmp), 0);
-	  if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
+	  if ((GET_CODE (cond) != NE && GET_CODE (cond) != GE)
+	      || XEXP (cond, 1) != const0_rtx)
 	    return 0;
 	}
     }
@@ -173,14 +174,14 @@  doloop_condition_get (rtx_insn *doloop_pat)
   if (! REG_P (reg))
     return 0;
 
-  /* Check if something = (plus (reg) (const_int -1)).
+  /* Check if something = (plus (reg) (const_int -n)).
      On IA-64, this decrement is wrapped in an if_then_else.  */
   inc_src = SET_SRC (inc);
   if (GET_CODE (inc_src) == IF_THEN_ELSE)
     inc_src = XEXP (inc_src, 1);
   if (GET_CODE (inc_src) != PLUS
       || XEXP (inc_src, 0) != reg
-      || XEXP (inc_src, 1) != constm1_rtx)
+      || !CONST_INT_P (XEXP (inc_src, 1)))
     return 0;
 
   /* Check for (set (pc) (if_then_else (condition)
@@ -211,42 +212,49 @@  doloop_condition_get (rtx_insn *doloop_pat)
       || (GET_CODE (XEXP (condition, 0)) == PLUS
 	  && XEXP (XEXP (condition, 0), 0) == reg))
    {
-     if (GET_CODE (pattern) != PARALLEL)
      /*  For the second form we expect:
 
-         (set (reg) (plus (reg) (const_int -1))
-         (set (pc) (if_then_else (reg != 0)
-                                 (label_ref (label))
-                                 (pc))).
+	 (set (reg) (plus (reg) (const_int -n))
+	 (set (pc) (if_then_else (reg != 0)
+				 (label_ref (label))
+				 (pc))).
 
-         is equivalent to the following:
+	 If n == 1, that is equivalent to the following:
 
-         (parallel [(set (pc) (if_then_else (reg != 1)
-                                            (label_ref (label))
-                                            (pc)))
-                     (set (reg) (plus (reg) (const_int -1)))
-                     (additional clobbers and uses)])
+	 (parallel [(set (pc) (if_then_else (reg != 1)
+					    (label_ref (label))
+					    (pc)))
+		     (set (reg) (plus (reg) (const_int -1)))
+		     (additional clobbers and uses)])
 
-        For the third form we expect:
+	For the third form we expect:
 
-        (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0))
-                   (set (reg) (plus (reg) (const_int -1)))])
-        (set (pc) (if_then_else (cc == NE)
-                                (label_ref (label))
-                                (pc))) 
+	(parallel [(set (cc) (compare ((plus (reg) (const_int -n)), 0))
+		   (set (reg) (plus (reg) (const_int -n)))])
+	(set (pc) (if_then_else (cc == NE)
+				(label_ref (label))
+				(pc)))
 
-        which is equivalent to the following:
+	Which also for n == 1 is equivalent to the following:
 
-        (parallel [(set (cc) (compare (reg,  1))
-                   (set (reg) (plus (reg) (const_int -1)))
-                   (set (pc) (if_then_else (NE == cc)
-                                           (label_ref (label))
-                                           (pc))))])
+	(parallel [(set (cc) (compare (reg,  1))
+		   (set (reg) (plus (reg) (const_int -1)))
+		   (set (pc) (if_then_else (NE == cc)
+					   (label_ref (label))
+					   (pc))))])
 
-        So we return the second form instead for the two cases.
+	So we return the second form instead for the two cases.
 
+	For the "elementwise" form where the decrement number isn't -1,
+	the final value may be exceeded, so use GE instead of NE.
      */
-        condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
+     if (GET_CODE (pattern) != PARALLEL)
+       {
+	if (INTVAL (XEXP (inc_src, 1)) != -1)
+	  condition = gen_rtx_fmt_ee (GE, VOIDmode, inc_src, const0_rtx);
+	else
+	  condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);;
+       }
 
     return condition;
    }
@@ -685,17 +693,6 @@  doloop_optimize (class loop *loop)
       return false;
     }
 
-  max_cost
-    = COSTS_N_INSNS (param_max_iterations_computation_cost);
-  if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop))
-      > max_cost)
-    {
-      if (dump_file)
-	fprintf (dump_file,
-		 "Doloop: number of iterations too costly to compute.\n");
-      return false;
-    }
-
   if (desc->const_iter)
     iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode),
 				   UNSIGNED);
@@ -720,7 +717,25 @@  doloop_optimize (class loop *loop)
   count = copy_rtx (desc->niter_expr);
   start_label = block_label (desc->in_edge->dest);
   doloop_reg = gen_reg_rtx (mode);
-  rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
+  rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label,
+						 count);
+
+  /* Not all targets need to pre-calculate the number of the iterations of
+     the loop, they instead work by storing the number of elements in the
+     counter_reg and decrementing that.  Call the appropriate target hook to
+     change the value of count.  */
+  count = targetm.allow_elementwise_doloop_p (count, start_label, doloop_seq);
+
+  max_cost
+    = COSTS_N_INSNS (param_max_iterations_computation_cost);
+  if (set_src_cost (count, mode, optimize_loop_for_speed_p (loop))
+      > max_cost)
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "Doloop: number of iterations too costly to compute.\n");
+      return false;
+    }
 
   word_mode_size = GET_MODE_PRECISION (word_mode);
   word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1;
@@ -737,7 +752,7 @@  doloop_optimize (class loop *loop)
       else
 	count = lowpart_subreg (word_mode, count, mode);
       PUT_MODE (doloop_reg, word_mode);
-      doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
+      doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label, count);
     }
   if (! doloop_seq)
     {
diff --git a/gcc/target-insns.def b/gcc/target-insns.def
index de8c0092f98..b77b7972426 100644
--- a/gcc/target-insns.def
+++ b/gcc/target-insns.def
@@ -48,7 +48,7 @@  DEF_TARGET_INSN (casesi, (rtx x0, rtx x1, rtx x2, rtx x3, rtx x4))
 DEF_TARGET_INSN (check_stack, (rtx x0))
 DEF_TARGET_INSN (clear_cache, (rtx x0, rtx x1))
 DEF_TARGET_INSN (doloop_begin, (rtx x0, rtx x1))
-DEF_TARGET_INSN (doloop_end, (rtx x0, rtx x1))
+DEF_TARGET_INSN (doloop_end, (rtx x0, rtx x1, rtx x2))
 DEF_TARGET_INSN (eh_return, (rtx x0))
 DEF_TARGET_INSN (epilogue, (void))
 DEF_TARGET_INSN (exception_receiver, (void))
diff --git a/gcc/target.def b/gcc/target.def
index a3d3b04a165..e74724c8a13 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4392,6 +4392,16 @@  The default version of this hook returns false.",
  bool, (class loop *loop),
  default_predict_doloop_p)
 
+DEFHOOK
+(allow_elementwise_doloop_p,
+ "This target hook allows the target to support loop-doloop optimisations\n\
+where the value that gets put into the loop counter register is not a\n\
+pre-calculation of the number of iteration of the loop.  For instance,\n\
+the value used can be the number of elements that the loop will process.\n\
+The default version of this hook returns the same rtx it was given.",
+ rtx, (rtx count, rtx label, rtx doloop),
+ default_allow_elementwise_doloop_p)
+
 DEFHOOKPOD
 (have_count_reg_decr_p,
  "Return true if the target supports hardware count register for decrement\n\
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index d17d393baed..04d08056dd7 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -661,6 +661,12 @@  default_predict_doloop_p (class loop *loop ATTRIBUTE_UNUSED)
   return false;
 }
 
+rtx
+default_allow_elementwise_doloop_p (rtx count, rtx, rtx)
+{
+  return count;
+}
+
 /* By default, just use the input MODE itself.  */
 
 machine_mode
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index ecce55ebe79..f9aba2e0813 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -88,6 +88,7 @@  extern bool default_fixed_point_supported_p (void);
 extern bool default_has_ifunc_p (void);
 
 extern bool default_predict_doloop_p (class loop *);
+extern rtx default_allow_elementwise_doloop_p (rtx, rtx, rtx);
 extern machine_mode default_preferred_doloop_mode (machine_mode);
 extern const char * default_invalid_within_doloop (const rtx_insn *);
 
diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int16x8.c b/gcc/testsuite/gcc.target/arm/dlstp-int16x8.c
new file mode 100644
index 00000000000..a61f02ed3a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/dlstp-int16x8.c
@@ -0,0 +1,68 @@ 
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "lob.h"
+
+void  __attribute__ ((noinline)) test (int16_t *a, int16_t *b, int16_t *c, int n)
+{
+  while (n > 0)
+    {
+      mve_pred16_t p = vctp16q (n);
+      int16x8_t va = vldrhq_z_s16 (a, p);
+      int16x8_t vb = vldrhq_z_s16 (b, p);
+      int16x8_t vc = vaddq_x_s16 (va, vb, p);
+      vstrhq_p_s16 (c, vc, p);
+      c+=8;
+      a+=8;
+      b+=8;
+      n-=8;
+    }
+}
+
+int main ()
+{
+  int i;
+  int16_t temp1[N];
+  int16_t temp2[N];
+  int16_t temp3[N];
+  reset_data16 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 0);
+  check_plus16 (temp1, temp2, temp3, 0);
+
+  reset_data16 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 1);
+  check_plus16 (temp1, temp2, temp3, 1);
+
+  reset_data16 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 7);
+  check_plus16 (temp1, temp2, temp3, 7);
+
+  reset_data16 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 8);
+  check_plus16 (temp1, temp2, temp3, 8);
+
+  reset_data16 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 9);
+  check_plus16 (temp1, temp2, temp3, 9);
+
+  reset_data16 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 16);
+  check_plus16 (temp1, temp2, temp3, 16);
+
+  reset_data16 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 17);
+  check_plus16 (temp1, temp2, temp3, 17);
+
+  reset_data16 (temp1, temp2, temp3, N);
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.16\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp\t} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp\t" } } */
+/* { dg-final { scan-assembler-not "\tvpst\t" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int32x4.c b/gcc/testsuite/gcc.target/arm/dlstp-int32x4.c
new file mode 100644
index 00000000000..31a7264ae26
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/dlstp-int32x4.c
@@ -0,0 +1,68 @@ 
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "lob.h"
+
+void  __attribute__ ((noinline)) test (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+  while (n > 0)
+    {
+      mve_pred16_t p = vctp32q (n);
+      int32x4_t va = vldrwq_z_s32 (a, p);
+      int32x4_t vb = vldrwq_z_s32 (b, p);
+      int32x4_t vc = vaddq_x_s32 (va, vb, p);
+      vstrwq_p_s32 (c, vc, p);
+      c+=4;
+      a+=4;
+      b+=4;
+      n-=4;
+    }
+}
+
+int main ()
+{
+  int i;
+  int32_t temp1[N];
+  int32_t temp2[N];
+  int32_t temp3[N];
+  reset_data32 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 0);
+  check_plus32 (temp1, temp2, temp3, 0);
+
+  reset_data32 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 1);
+  check_plus32 (temp1, temp2, temp3, 1);
+
+  reset_data32 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 3);
+  check_plus32 (temp1, temp2, temp3, 3);
+
+  reset_data32 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 4);
+  check_plus32 (temp1, temp2, temp3, 4);
+
+  reset_data32 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 5);
+  check_plus32 (temp1, temp2, temp3, 5);
+
+  reset_data32 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 8);
+  check_plus32 (temp1, temp2, temp3, 8);
+
+  reset_data32 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 9);
+  check_plus32 (temp1, temp2, temp3, 9);
+
+  reset_data32 (temp1, temp2, temp3, N);
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.32\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp\t} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp\t" } } */
+/* { dg-final { scan-assembler-not "\tvpst\t" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int64x2.c b/gcc/testsuite/gcc.target/arm/dlstp-int64x2.c
new file mode 100644
index 00000000000..a09c01884da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/dlstp-int64x2.c
@@ -0,0 +1,68 @@ 
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "lob.h"
+
+void  __attribute__ ((noinline)) test (int64_t *a, int64_t *c, int n)
+{
+  while (n > 0)
+    {
+      mve_pred16_t p = vctp64q (n);
+      int64x2_t va = vldrdq_gather_offset_z_s64 (a, vcreateq_u64 (8, 0), p);
+      vstrdq_scatter_offset_p_s64 (c, vcreateq_u64 (8, 0), va, p);
+      c+=2;
+      a+=2;
+      n-=2;
+    }
+}
+
+int main ()
+{
+  int i;
+  int64_t temp1[N];
+  int64_t temp3[N];
+  reset_data64  (temp1, temp3, N);
+  test (temp1, temp3, 0);
+  check_memcpy64 (temp1, temp3, 0);
+
+  reset_data64  (temp1, temp3, N);
+  test (temp1, temp3, 1);
+  check_memcpy64 (temp1, temp3, 1);
+
+  reset_data64  (temp1, temp3, N);
+  test (temp1, temp3, 2);
+  check_memcpy64 (temp1, temp3, 2);
+
+  reset_data64  (temp1, temp3, N);
+  test (temp1, temp3, 3);
+  check_memcpy64 (temp1, temp3, 3);
+
+  reset_data64  (temp1, temp3, N);
+  test (temp1, temp3, 4);
+  check_memcpy64 (temp1, temp3, 4);
+
+  reset_data64  (temp1, temp3, N);
+  test (temp1, temp3, 5);
+  check_memcpy64 (temp1, temp3, 5);
+
+  reset_data64  (temp1, temp3, N);
+  test (temp1, temp3, 6);
+  check_memcpy64 (temp1, temp3, 6);
+
+  reset_data64  (temp1, temp3, N);
+  test (temp1, temp3, 7);
+  check_memcpy64 (temp1, temp3, 7);
+
+  reset_data64  (temp1, temp3, N);
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.64\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp\t} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp\t" } } */
+/* { dg-final { scan-assembler-not "\tvpst\t" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/dlstp-int8x16.c b/gcc/testsuite/gcc.target/arm/dlstp-int8x16.c
new file mode 100644
index 00000000000..49fbd4c16a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/dlstp-int8x16.c
@@ -0,0 +1,68 @@ 
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "lob.h"
+
+void  __attribute__ ((noinline)) test (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+  while (n > 0)
+    {
+      mve_pred16_t p = vctp8q (n);
+      int8x16_t va = vldrbq_z_s8 (a, p);
+      int8x16_t vb = vldrbq_z_s8 (b, p);
+      int8x16_t vc = vaddq_x_s8 (va, vb, p);
+      vstrbq_p_s8 (c, vc, p);
+      c+=16;
+      a+=16;
+      b+=16;
+      n-=16;
+    }
+}
+
+int main ()
+{
+  int i;
+  int8_t temp1[N];
+  int8_t temp2[N];
+  int8_t temp3[N];
+  reset_data8 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 0);
+  check_plus8 (temp1, temp2, temp3, 0);
+
+  reset_data8 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 1);
+  check_plus8 (temp1, temp2, temp3, 1);
+
+  reset_data8 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 15);
+  check_plus8 (temp1, temp2, temp3, 15);
+
+  reset_data8 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 16);
+  check_plus8 (temp1, temp2, temp3, 16);
+
+  reset_data8 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 17);
+  check_plus8 (temp1, temp2, temp3, 17);
+
+  reset_data8 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 32);
+  check_plus8 (temp1, temp2, temp3, 32);
+
+  reset_data8 (temp1, temp2, temp3, N);
+  test (temp1, temp2, temp3, 33);
+  check_plus8 (temp1, temp2, temp3, 33);
+
+  reset_data8 (temp1, temp2, temp3, N);
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.8\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp\t} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp\t" } } */
+/* { dg-final { scan-assembler-not "\tvpst\t" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h
index feaae7cc899..3941fe7a8b6 100644
--- a/gcc/testsuite/gcc.target/arm/lob.h
+++ b/gcc/testsuite/gcc.target/arm/lob.h
@@ -1,15 +1,131 @@ 
 #include <string.h>
-
+#include <stdint.h>
 /* Common code for lob tests.  */
 
 #define NO_LOB asm volatile ("@ clobber lr" : : : "lr" )
 
-#define N 10000
+#define N 100
+
+static void
+reset_data (int *a, int *b, int *c, int x)
+{
+  memset (a, -1, x * sizeof (*a));
+  memset (b, -1, x * sizeof (*b));
+  memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data8 (int8_t *a, int8_t *b, int8_t *c, int x)
+{
+  memset (a, -1, x * sizeof (*a));
+  memset (b, -1, x * sizeof (*b));
+  memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data16 (int16_t *a, int16_t *b, int16_t *c, int x)
+{
+  memset (a, -1, x * sizeof (*a));
+  memset (b, -1, x * sizeof (*b));
+  memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data32 (int32_t *a, int32_t *b, int32_t *c, int x)
+{
+  memset (a, -1, x * sizeof (*a));
+  memset (b, -1, x * sizeof (*b));
+  memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data64 (int64_t *a, int64_t *c, int x)
+{
+  memset (a, -1, x * sizeof (*a));
+  memset (c, 0, x * sizeof (*c));
+}
+
+static void
+check_plus (int *a, int *b, int *c, int x)
+{
+  for (int i = 0; i < N; i++)
+    {
+      NO_LOB;
+      if (i < x)
+	{
+	  if (c[i] != (a[i] + b[i])) abort ();
+	}
+      else
+	{
+	  if (c[i] != 0) abort ();
+	}
+    }
+}
+
+static void
+check_plus8 (int8_t *a, int8_t *b, int8_t *c, int x)
+{
+  for (int i = 0; i < N; i++)
+    {
+      NO_LOB;
+      if (i < x)
+	{
+	  if (c[i] != (a[i] + b[i])) abort ();
+	}
+      else
+	{
+	  if (c[i] != 0) abort ();
+	}
+    }
+}
+
+static void
+check_plus16 (int16_t *a, int16_t *b, int16_t *c, int x)
+{
+  for (int i = 0; i < N; i++)
+    {
+      NO_LOB;
+      if (i < x)
+	{
+	  if (c[i] != (a[i] + b[i])) abort ();
+	}
+      else
+	{
+	  if (c[i] != 0) abort ();
+	}
+    }
+}
+
+static void
+check_plus32 (int32_t *a, int32_t *b, int32_t *c, int x)
+{
+  for (int i = 0; i < N; i++)
+    {
+      NO_LOB;
+      if (i < x)
+	{
+	  if (c[i] != (a[i] + b[i])) abort ();
+	}
+      else
+	{
+	  if (c[i] != 0) abort ();
+	}
+    }
+}
 
 static void
-reset_data (int *a, int *b, int *c)
+check_memcpy64 (int64_t *a, int64_t *c, int x)
 {
-  memset (a, -1, N * sizeof (*a));
-  memset (b, -1, N * sizeof (*b));
-  memset (c, -1, N * sizeof (*c));
+  for (int i = 0; i < N; i++)
+    {
+      NO_LOB;
+      if (i < x)
+	{
+	  if (c[i] != a[i]) abort ();
+	}
+      else
+	{
+	  if (c[i] != 0) abort ();
+	}
+    }
 }
diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c
index ba5c82cd55c..c8ce653a5c3 100644
--- a/gcc/testsuite/gcc.target/arm/lob1.c
+++ b/gcc/testsuite/gcc.target/arm/lob1.c
@@ -54,29 +54,18 @@  loop3 (int *a, int *b, int *c)
     } while (i < N);
 }
 
-void
-check (int *a, int *b, int *c)
-{
-  for (int i = 0; i < N; i++)
-    {
-      NO_LOB;
-      if (c[i] != a[i] + b[i])
-	abort ();
-    }
-}
-
 int
 main (void)
 {
-  reset_data (a, b, c);
+  reset_data (a, b, c, N);
   loop1 (a, b ,c);
-  check (a, b ,c);
-  reset_data (a, b, c);
+  check_plus (a, b, c, N);
+  reset_data (a, b, c, N);
   loop2 (a, b ,c);
-  check (a, b ,c);
-  reset_data (a, b, c);
+  check_plus (a, b, c, N);
+  reset_data (a, b, c, N);
   loop3 (a, b ,c);
-  check (a, b ,c);
+  check_plus (a, b, c, N);
 
   return 0;
 }
diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c
index 17b6124295e..4fe116e2c2b 100644
--- a/gcc/testsuite/gcc.target/arm/lob6.c
+++ b/gcc/testsuite/gcc.target/arm/lob6.c
@@ -79,14 +79,14 @@  check (void)
 int
 main (void)
 {
-  reset_data (a1, b1, c1);
-  reset_data (a2, b2, c2);
+  reset_data (a1, b1, c1, N);
+  reset_data (a2, b2, c2, N);
   loop1 (a1, b1, c1);
   ref1 (a2, b2, c2);
   check ();
 
-  reset_data (a1, b1, c1);
-  reset_data (a2, b2, c2);
+  reset_data (a1, b1, c1, N);
+  reset_data (a2, b2, c2, N);
   loop2 (a1, b1, c1);
   ref2 (a2, b2, c2);
   check ();