diff mbox series

[V10] VECT: Add decrement IV support in Loop Vectorizer

Message ID 20230516091941.84280-1-juzhe.zhong@rivai.ai
State New
Headers show
Series [V10] VECT: Add decrement IV support in Loop Vectorizer | expand

Commit Message

juzhe.zhong@rivai.ai May 16, 2023, 9:19 a.m. UTC
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

This patch implements a decrementing IV for the length-based approach to loop control.

Address the comment from Kewen: incorporate the implementation into
"vect_set_loop_controls_directly" instead of adding a standalone function.

Address the comment from Richard: use MIN_EXPR to handle the following
three cases:
1. single rgroup.
2. multiple rgroup for SLP.
3. multiple rgroup for non-SLP (tested on vec_pack_trunc).


gcc/ChangeLog:

        * tree-vect-loop-manip.cc (vect_adjust_loop_lens): New function.
        (vect_set_loop_controls_directly): Add decrement IV support.
        (vect_set_loop_condition_partial_vectors): Ditto.
        * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): New variable.
        (vect_get_loop_len): Add decrement IV support.
        * tree-vect-stmts.cc (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro.
        (vect_get_loop_len): Add decrement IV support.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.c: New test.
        * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.h: New test.
        * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.c: New test.
        * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.h: New test.
        * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c: New test.
        * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c: New test.

---
 .../rvv/autovec/partial/multiple_rgroup-1.c   |   6 +
 .../rvv/autovec/partial/multiple_rgroup-1.h   | 304 ++++++++++
 .../rvv/autovec/partial/multiple_rgroup-2.c   |   6 +
 .../rvv/autovec/partial/multiple_rgroup-2.h   | 546 ++++++++++++++++++
 .../autovec/partial/multiple_rgroup_run-1.c   |  19 +
 .../autovec/partial/multiple_rgroup_run-2.c   |  19 +
 gcc/tree-vect-loop-manip.cc                   | 203 ++++++-
 gcc/tree-vect-loop.cc                         |  37 +-
 gcc/tree-vect-stmts.cc                        |   9 +-
 gcc/tree-vectorizer.h                         |  13 +-
 10 files changed, 1148 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.h
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.h
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.c
new file mode 100644
index 00000000000..69cc3be78f7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax" } */
+
+#include "multiple_rgroup-1.h"
+
+TEST_ALL (test_1)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.h b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.h
new file mode 100644
index 00000000000..fbc49f4855d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.h
@@ -0,0 +1,304 @@ 
+#include <stddef.h>
+#include <stdint-gcc.h>
+
+#define test_1(TYPE1, TYPE2)                                                   \
+  void __attribute__ ((noinline, noclone))                                     \
+  test_1_##TYPE1_##TYPE2 (TYPE1 *__restrict f, TYPE2 *__restrict d, TYPE1 x,   \
+			  TYPE1 x2, TYPE2 y, int n)                            \
+  {                                                                            \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+	f[i * 2 + 0] = x;                                                      \
+	f[i * 2 + 1] = x2;                                                     \
+	d[i] = y;                                                              \
+      }                                                                        \
+  }
+
+#define run_1(TYPE1, TYPE2)                                                    \
+  int n_1_##TYPE1_##TYPE2 = 1;                                                 \
+  TYPE1 x_1_##TYPE1 = 117;                                                     \
+  TYPE1 x2_1_##TYPE1 = 232;                                                    \
+  TYPE2 y_1_##TYPE2 = 9762;                                                    \
+  TYPE1 f_1_##TYPE1[2 * 2 + 1] = {0};                                          \
+  TYPE2 d_1_##TYPE2[2] = {0};                                                  \
+  test_1_##TYPE1_##TYPE2 (f_1_##TYPE1, d_1_##TYPE2, x_1_##TYPE1, x2_1_##TYPE1, \
+			  y_1_##TYPE2, n_1_##TYPE1_##TYPE2);                   \
+  for (int i = 0; i < n_1_##TYPE1_##TYPE2; ++i)                                \
+    {                                                                          \
+      if (f_1_##TYPE1[i * 2 + 0] != x_1_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_1_##TYPE1[i * 2 + 1] != x2_1_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_1_##TYPE2[i] != y_1_##TYPE2)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_1_##TYPE1_##TYPE2; i < n_1_##TYPE1_##TYPE2 + 1; ++i)          \
+    {                                                                          \
+      if (f_1_##TYPE1[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_1_##TYPE1[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_1_##TYPE2[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_2(TYPE1, TYPE2)                                                    \
+  int n_2_##TYPE1_##TYPE2 = 17;                                                \
+  TYPE1 x_2_##TYPE1 = 133;                                                     \
+  TYPE1 x2_2_##TYPE1 = 94;                                                     \
+  TYPE2 y_2_##TYPE2 = 8672;                                                    \
+  TYPE1 f_2_##TYPE1[18 * 2 + 1] = {0};                                         \
+  TYPE2 d_2_##TYPE2[18] = {0};                                                 \
+  test_1_##TYPE1_##TYPE2 (f_2_##TYPE1, d_2_##TYPE2, x_2_##TYPE1, x2_2_##TYPE1, \
+			  y_2_##TYPE2, n_2_##TYPE1_##TYPE2);                   \
+  for (int i = 0; i < n_2_##TYPE1_##TYPE2; ++i)                                \
+    {                                                                          \
+      if (f_2_##TYPE1[i * 2 + 0] != x_2_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_2_##TYPE1[i * 2 + 1] != x2_2_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_2_##TYPE2[i] != y_2_##TYPE2)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_2_##TYPE1_##TYPE2; i < n_2_##TYPE1_##TYPE2 + 1; ++i)          \
+    {                                                                          \
+      if (f_2_##TYPE1[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_2_##TYPE1[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_2_##TYPE2[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_3(TYPE1, TYPE2)                                                    \
+  int n_3_##TYPE1_##TYPE2 = 32;                                                \
+  TYPE1 x_3_##TYPE1 = 233;                                                     \
+  TYPE1 x2_3_##TYPE1 = 78;                                                     \
+  TYPE2 y_3_##TYPE2 = 1234;                                                    \
+  TYPE1 f_3_##TYPE1[33 * 2 + 1] = {0};                                         \
+  TYPE2 d_3_##TYPE2[33] = {0};                                                 \
+  test_1_##TYPE1_##TYPE2 (f_3_##TYPE1, d_3_##TYPE2, x_3_##TYPE1, x2_3_##TYPE1, \
+			  y_3_##TYPE2, n_3_##TYPE1_##TYPE2);                   \
+  for (int i = 0; i < n_3_##TYPE1_##TYPE2; ++i)                                \
+    {                                                                          \
+      if (f_3_##TYPE1[i * 2 + 0] != x_3_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_3_##TYPE1[i * 2 + 1] != x2_3_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_3_##TYPE2[i] != y_3_##TYPE2)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_3_##TYPE1_##TYPE2; i < n_3_##TYPE1_##TYPE2 + 1; ++i)          \
+    {                                                                          \
+      if (f_3_##TYPE1[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_3_##TYPE1[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_3_##TYPE2[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_4(TYPE1, TYPE2)                                                    \
+  int n_4_##TYPE1_##TYPE2 = 128;                                               \
+  TYPE1 x_4_##TYPE1 = 222;                                                     \
+  TYPE1 x2_4_##TYPE1 = 59;                                                     \
+  TYPE2 y_4_##TYPE2 = 4321;                                                    \
+  TYPE1 f_4_##TYPE1[129 * 2 + 1] = {0};                                        \
+  TYPE2 d_4_##TYPE2[129] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_4_##TYPE1, d_4_##TYPE2, x_4_##TYPE1, x2_4_##TYPE1, \
+			  y_4_##TYPE2, n_4_##TYPE1_##TYPE2);                   \
+  for (int i = 0; i < n_4_##TYPE1_##TYPE2; ++i)                                \
+    {                                                                          \
+      if (f_4_##TYPE1[i * 2 + 0] != x_4_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_4_##TYPE1[i * 2 + 1] != x2_4_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_4_##TYPE2[i] != y_4_##TYPE2)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_4_##TYPE1_##TYPE2; i < n_4_##TYPE1_##TYPE2 + 1; ++i)          \
+    {                                                                          \
+      if (f_4_##TYPE1[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_4_##TYPE1[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_4_##TYPE2[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_5(TYPE1, TYPE2)                                                    \
+  int n_5_##TYPE1_##TYPE2 = 177;                                               \
+  TYPE1 x_5_##TYPE1 = 111;                                                     \
+  TYPE1 x2_5_##TYPE1 = 189;                                                    \
+  TYPE2 y_5_##TYPE2 = 5555;                                                    \
+  TYPE1 f_5_##TYPE1[178 * 2 + 1] = {0};                                        \
+  TYPE2 d_5_##TYPE2[178] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_5_##TYPE1, d_5_##TYPE2, x_5_##TYPE1, x2_5_##TYPE1, \
+			  y_5_##TYPE2, n_5_##TYPE1_##TYPE2);                   \
+  for (int i = 0; i < n_5_##TYPE1_##TYPE2; ++i)                                \
+    {                                                                          \
+      if (f_5_##TYPE1[i * 2 + 0] != x_5_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_5_##TYPE1[i * 2 + 1] != x2_5_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_5_##TYPE2[i] != y_5_##TYPE2)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_5_##TYPE1_##TYPE2; i < n_5_##TYPE1_##TYPE2 + 1; ++i)          \
+    {                                                                          \
+      if (f_5_##TYPE1[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_5_##TYPE1[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_5_##TYPE2[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_6(TYPE1, TYPE2)                                                    \
+  int n_6_##TYPE1_##TYPE2 = 255;                                               \
+  TYPE1 x_6_##TYPE1 = 123;                                                     \
+  TYPE1 x2_6_##TYPE1 = 132;                                                    \
+  TYPE2 y_6_##TYPE2 = 6655;                                                    \
+  TYPE1 f_6_##TYPE1[256 * 2 + 1] = {0};                                        \
+  TYPE2 d_6_##TYPE2[256] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_6_##TYPE1, d_6_##TYPE2, x_6_##TYPE1, x2_6_##TYPE1, \
+			  y_6_##TYPE2, n_6_##TYPE1_##TYPE2);                   \
+  for (int i = 0; i < n_6_##TYPE1_##TYPE2; ++i)                                \
+    {                                                                          \
+      if (f_6_##TYPE1[i * 2 + 0] != x_6_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_6_##TYPE1[i * 2 + 1] != x2_6_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_6_##TYPE2[i] != y_6_##TYPE2)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_6_##TYPE1_##TYPE2; i < n_6_##TYPE1_##TYPE2 + 1; ++i)          \
+    {                                                                          \
+      if (f_6_##TYPE1[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_6_##TYPE1[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_6_##TYPE2[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_7(TYPE1, TYPE2)                                                    \
+  int n_7_##TYPE1_##TYPE2 = 333;                                               \
+  TYPE1 x_7_##TYPE1 = 39;                                                      \
+  TYPE1 x2_7_##TYPE1 = 59;                                                     \
+  TYPE2 y_7_##TYPE2 = 5968;                                                    \
+  TYPE1 f_7_##TYPE1[334 * 2 + 1] = {0};                                        \
+  TYPE2 d_7_##TYPE2[334] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_7_##TYPE1, d_7_##TYPE2, x_7_##TYPE1, x2_7_##TYPE1, \
+			  y_7_##TYPE2, n_7_##TYPE1_##TYPE2);                   \
+  for (int i = 0; i < n_7_##TYPE1_##TYPE2; ++i)                                \
+    {                                                                          \
+      if (f_7_##TYPE1[i * 2 + 0] != x_7_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_7_##TYPE1[i * 2 + 1] != x2_7_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_7_##TYPE2[i] != y_7_##TYPE2)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_7_##TYPE1_##TYPE2; i < n_7_##TYPE1_##TYPE2 + 1; ++i)          \
+    {                                                                          \
+      if (f_7_##TYPE1[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_7_##TYPE1[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_7_##TYPE2[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_8(TYPE1, TYPE2)                                                    \
+  int n_8_##TYPE1_##TYPE2 = 512;                                               \
+  TYPE1 x_8_##TYPE1 = 71;                                                      \
+  TYPE1 x2_8_##TYPE1 = 255;                                                    \
+  TYPE2 y_8_##TYPE2 = 3366;                                                    \
+  TYPE1 f_8_##TYPE1[513 * 2 + 1] = {0};                                        \
+  TYPE2 d_8_##TYPE2[513] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_8_##TYPE1, d_8_##TYPE2, x_8_##TYPE1, x2_8_##TYPE1, \
+			  y_8_##TYPE2, n_8_##TYPE1_##TYPE2);                   \
+  for (int i = 0; i < n_8_##TYPE1_##TYPE2; ++i)                                \
+    {                                                                          \
+      if (f_8_##TYPE1[i * 2 + 0] != x_8_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_8_##TYPE1[i * 2 + 1] != x2_8_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_8_##TYPE2[i] != y_8_##TYPE2)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_8_##TYPE1_##TYPE2; i < n_8_##TYPE1_##TYPE2 + 1; ++i)          \
+    {                                                                          \
+      if (f_8_##TYPE1[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_8_##TYPE1[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_8_##TYPE2[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_9(TYPE1, TYPE2)                                                    \
+  int n_9_##TYPE1_##TYPE2 = 637;                                               \
+  TYPE1 x_9_##TYPE1 = 157;                                                     \
+  TYPE1 x2_9_##TYPE1 = 89;                                                     \
+  TYPE2 y_9_##TYPE2 = 5511;                                                    \
+  TYPE1 f_9_##TYPE1[638 * 2 + 1] = {0};                                        \
+  TYPE2 d_9_##TYPE2[638] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_9_##TYPE1, d_9_##TYPE2, x_9_##TYPE1, x2_9_##TYPE1, \
+			  y_9_##TYPE2, n_9_##TYPE1_##TYPE2);                   \
+  for (int i = 0; i < n_9_##TYPE1_##TYPE2; ++i)                                \
+    {                                                                          \
+      if (f_9_##TYPE1[i * 2 + 0] != x_9_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_9_##TYPE1[i * 2 + 1] != x2_9_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_9_##TYPE2[i] != y_9_##TYPE2)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_9_##TYPE1_##TYPE2; i < n_9_##TYPE1_##TYPE2 + 1; ++i)          \
+    {                                                                          \
+      if (f_9_##TYPE1[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_9_##TYPE1[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_9_##TYPE2[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_10(TYPE1, TYPE2)                                                   \
+  int n_10_##TYPE1_##TYPE2 = 777;                                              \
+  TYPE1 x_10_##TYPE1 = 203;                                                    \
+  TYPE1 x2_10_##TYPE1 = 200;                                                   \
+  TYPE2 y_10_##TYPE2 = 2023;                                                   \
+  TYPE1 f_10_##TYPE1[778 * 2 + 1] = {0};                                       \
+  TYPE2 d_10_##TYPE2[778] = {0};                                               \
+  test_1_##TYPE1_##TYPE2 (f_10_##TYPE1, d_10_##TYPE2, x_10_##TYPE1,            \
+			  x2_10_##TYPE1, y_10_##TYPE2, n_10_##TYPE1_##TYPE2);  \
+  for (int i = 0; i < n_10_##TYPE1_##TYPE2; ++i)                               \
+    {                                                                          \
+      if (f_10_##TYPE1[i * 2 + 0] != x_10_##TYPE1)                             \
+	__builtin_abort ();                                                    \
+      if (f_10_##TYPE1[i * 2 + 1] != x2_10_##TYPE1)                            \
+	__builtin_abort ();                                                    \
+      if (d_10_##TYPE2[i] != y_10_##TYPE2)                                     \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_10_##TYPE1_##TYPE2; i < n_10_##TYPE1_##TYPE2 + 1; ++i)        \
+    {                                                                          \
+      if (f_10_##TYPE1[i * 2 + 0] != 0)                                        \
+	__builtin_abort ();                                                    \
+      if (f_10_##TYPE1[i * 2 + 1] != 0)                                        \
+	__builtin_abort ();                                                    \
+      if (d_10_##TYPE2[i] != 0)                                                \
+	__builtin_abort ();                                                    \
+    }
+
+#define TEST_ALL(T)                                                            \
+  T (int8_t, int16_t)                                                          \
+  T (uint8_t, uint16_t)                                                        \
+  T (int16_t, int32_t)                                                         \
+  T (uint16_t, uint32_t)                                                       \
+  T (int32_t, int64_t)                                                         \
+  T (uint32_t, uint64_t)                                                       \
+  T (float, double)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.c
new file mode 100644
index 00000000000..d1c41907547
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param riscv-autovec-preference=fixed-vlmax" } */
+
+#include "multiple_rgroup-2.h"
+
+TEST_ALL (test_1)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.h b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.h
new file mode 100644
index 00000000000..045a76de45f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.h
@@ -0,0 +1,546 @@ 
+#include <stddef.h>
+#include <stdint-gcc.h>
+
+#define test_1(TYPE1, TYPE2, TYPE3) /* Define the store kernel.  */            \
+  void __attribute__ ((noinline, noclone)) /* Kept out of line.  */            \
+  test_1_##TYPE1_##TYPE2 (TYPE1 *__restrict f, TYPE2 *__restrict d,            \
+			  TYPE3 *__restrict e, TYPE1 x, TYPE1 x2, TYPE1 x3,    \
+			  TYPE1 x4, TYPE2 y, TYPE2 y2, TYPE3 z, int n)         \
+  { /* NB: TYPE1_ is no parameter; name is test_1_TYPE1_<TYPE2>.  */           \
+    for (int i = 0; i < n; ++i)                                                \
+      { /* Per i: four f stores, two d stores, one e store.  */                \
+	f[i * 4 + 0] = x;                                                      \
+	f[i * 4 + 1] = x2;                                                     \
+	f[i * 4 + 2] = x3;                                                     \
+	f[i * 4 + 3] = x4;                                                     \
+	d[i * 2 + 0] = y;                                                      \
+	d[i * 2 + 1] = y2;                                                     \
+	e[i] = z;                                                              \
+      }                                                                        \
+  }
+
+#define run_1(TYPE1, TYPE2, TYPE3) /* Run test_1 with n = 1 and check.  */     \
+  int n_1_##TYPE1_##TYPE2_##TYPE3 = 1;                                         \
+  TYPE1 x_1_##TYPE1 = 117;                                                     \
+  TYPE1 x2_1_##TYPE1 = 232;                                                    \
+  TYPE1 x3_1_##TYPE1 = 127;                                                    \
+  TYPE1 x4_1_##TYPE1 = 11;                                                     \
+  TYPE2 y_1_##TYPE2 = 9762;                                                    \
+  TYPE2 y2_1_##TYPE2 = 6279;                                                   \
+  TYPE3 z_1_##TYPE3 = 5891663;                                                 \
+  TYPE1 f_1_##TYPE1[2 * 4 + 1] = {0}; /* Room for n + 1 groups, zeroed.  */    \
+  TYPE2 d_1_##TYPE2[2 * 2 + 1] = {0};                                          \
+  TYPE3 e_1_##TYPE3[2] = {0};                                                  \
+  test_1_##TYPE1_##TYPE2 (f_1_##TYPE1, d_1_##TYPE2, e_1_##TYPE3, x_1_##TYPE1,  \
+			  x2_1_##TYPE1, x3_1_##TYPE1, x4_1_##TYPE1,            \
+			  y_1_##TYPE2, y2_1_##TYPE2, z_1_##TYPE3,              \
+			  n_1_##TYPE1_##TYPE2_##TYPE3);                        \
+  for (int i = 0; i < n_1_##TYPE1_##TYPE2_##TYPE3; ++i)                        \
+    { /* Each stored element must equal its input.  */                         \
+      if (f_1_##TYPE1[i * 4 + 0] != x_1_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_1_##TYPE1[i * 4 + 1] != x2_1_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_1_##TYPE1[i * 4 + 2] != x3_1_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_1_##TYPE1[i * 4 + 3] != x4_1_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_1_##TYPE2[i * 2 + 0] != y_1_##TYPE2)                               \
+	__builtin_abort ();                                                    \
+      if (d_1_##TYPE2[i * 2 + 1] != y2_1_##TYPE2)                              \
+	__builtin_abort ();                                                    \
+      if (e_1_##TYPE3[i] != z_1_##TYPE3)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_1_##TYPE1_##TYPE2_##TYPE3;                                    \
+       i < n_1_##TYPE1_##TYPE2_##TYPE3 + 1; ++i)                               \
+    { /* Group i == n must still be zero.  */                                  \
+      if (f_1_##TYPE1[i * 4 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_1_##TYPE1[i * 4 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_1_##TYPE1[i * 4 + 2] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_1_##TYPE1[i * 4 + 3] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_1_##TYPE2[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_1_##TYPE2[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (e_1_##TYPE3[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_2(TYPE1, TYPE2, TYPE3) /* Run test_1 with n = 17 and check.  */    \
+  int n_2_##TYPE1_##TYPE2_##TYPE3 = 17;                                        \
+  TYPE1 x_2_##TYPE1 = 107;                                                     \
+  TYPE1 x2_2_##TYPE1 = 202;                                                    \
+  TYPE1 x3_2_##TYPE1 = 17;                                                     \
+  TYPE1 x4_2_##TYPE1 = 53;                                                     \
+  TYPE2 y_2_##TYPE2 = 5566;                                                    \
+  TYPE2 y2_2_##TYPE2 = 7926;                                                   \
+  TYPE3 z_2_##TYPE3 = 781545971;                                               \
+  TYPE1 f_2_##TYPE1[18 * 4 + 1] = {0}; /* Room for n + 1 groups, zeroed.  */   \
+  TYPE2 d_2_##TYPE2[18 * 2 + 1] = {0};                                         \
+  TYPE3 e_2_##TYPE3[18] = {0};                                                 \
+  test_1_##TYPE1_##TYPE2 (f_2_##TYPE1, d_2_##TYPE2, e_2_##TYPE3, x_2_##TYPE1,  \
+			  x2_2_##TYPE1, x3_2_##TYPE1, x4_2_##TYPE1,            \
+			  y_2_##TYPE2, y2_2_##TYPE2, z_2_##TYPE3,              \
+			  n_2_##TYPE1_##TYPE2_##TYPE3);                        \
+  for (int i = 0; i < n_2_##TYPE1_##TYPE2_##TYPE3; ++i)                        \
+    { /* Each stored element must equal its input.  */                         \
+      if (f_2_##TYPE1[i * 4 + 0] != x_2_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_2_##TYPE1[i * 4 + 1] != x2_2_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_2_##TYPE1[i * 4 + 2] != x3_2_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_2_##TYPE1[i * 4 + 3] != x4_2_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_2_##TYPE2[i * 2 + 0] != y_2_##TYPE2)                               \
+	__builtin_abort ();                                                    \
+      if (d_2_##TYPE2[i * 2 + 1] != y2_2_##TYPE2)                              \
+	__builtin_abort ();                                                    \
+      if (e_2_##TYPE3[i] != z_2_##TYPE3)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_2_##TYPE1_##TYPE2_##TYPE3;                                    \
+       i < n_2_##TYPE1_##TYPE2_##TYPE3 + 1; ++i)                               \
+    { /* Group i == n must still be zero.  */                                  \
+      if (f_2_##TYPE1[i * 4 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_2_##TYPE1[i * 4 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_2_##TYPE1[i * 4 + 2] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_2_##TYPE1[i * 4 + 3] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_2_##TYPE2[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_2_##TYPE2[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (e_2_##TYPE3[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_3(TYPE1, TYPE2, TYPE3) /* Run test_1 with n = 32 and check.  */    \
+  int n_3_##TYPE1_##TYPE2_##TYPE3 = 32;                                        \
+  TYPE1 x_3_##TYPE1 = 109;                                                     \
+  TYPE1 x2_3_##TYPE1 = 239;                                                    \
+  TYPE1 x3_3_##TYPE1 = 151;                                                    \
+  TYPE1 x4_3_##TYPE1 = 3;                                                      \
+  TYPE2 y_3_##TYPE2 = 1234;                                                    \
+  TYPE2 y2_3_##TYPE2 = 4321;                                                   \
+  TYPE3 z_3_##TYPE3 = 145615615;                                               \
+  TYPE1 f_3_##TYPE1[33 * 4 + 1] = {0}; /* Room for n + 1 groups, zeroed.  */   \
+  TYPE2 d_3_##TYPE2[33 * 2 + 1] = {0};                                         \
+  TYPE3 e_3_##TYPE3[33] = {0};                                                 \
+  test_1_##TYPE1_##TYPE2 (f_3_##TYPE1, d_3_##TYPE2, e_3_##TYPE3, x_3_##TYPE1,  \
+			  x2_3_##TYPE1, x3_3_##TYPE1, x4_3_##TYPE1,            \
+			  y_3_##TYPE2, y2_3_##TYPE2, z_3_##TYPE3,              \
+			  n_3_##TYPE1_##TYPE2_##TYPE3);                        \
+  for (int i = 0; i < n_3_##TYPE1_##TYPE2_##TYPE3; ++i)                        \
+    { /* Each stored element must equal its input.  */                         \
+      if (f_3_##TYPE1[i * 4 + 0] != x_3_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_3_##TYPE1[i * 4 + 1] != x2_3_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_3_##TYPE1[i * 4 + 2] != x3_3_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_3_##TYPE1[i * 4 + 3] != x4_3_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_3_##TYPE2[i * 2 + 0] != y_3_##TYPE2)                               \
+	__builtin_abort ();                                                    \
+      if (d_3_##TYPE2[i * 2 + 1] != y2_3_##TYPE2)                              \
+	__builtin_abort ();                                                    \
+      if (e_3_##TYPE3[i] != z_3_##TYPE3)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_3_##TYPE1_##TYPE2_##TYPE3;                                    \
+       i < n_3_##TYPE1_##TYPE2_##TYPE3 + 1; ++i)                               \
+    { /* Group i == n must still be zero.  */                                  \
+      if (f_3_##TYPE1[i * 4 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_3_##TYPE1[i * 4 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_3_##TYPE1[i * 4 + 2] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_3_##TYPE1[i * 4 + 3] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_3_##TYPE2[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_3_##TYPE2[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (e_3_##TYPE3[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_4(TYPE1, TYPE2, TYPE3) /* Run test_1 with n = 128 and check.  */   \
+  int n_4_##TYPE1_##TYPE2_##TYPE3 = 128;                                       \
+  TYPE1 x_4_##TYPE1 = 239;                                                     \
+  TYPE1 x2_4_##TYPE1 = 132;                                                    \
+  TYPE1 x3_4_##TYPE1 = 39;                                                     \
+  TYPE1 x4_4_##TYPE1 = 48;                                                     \
+  TYPE2 y_4_##TYPE2 = 1036;                                                    \
+  TYPE2 y2_4_##TYPE2 = 3665;                                                   \
+  TYPE3 z_4_##TYPE3 = 5145656;                                                 \
+  TYPE1 f_4_##TYPE1[129 * 4 + 1] = {0}; /* Room for n + 1 groups, zeroed.  */  \
+  TYPE2 d_4_##TYPE2[129 * 2 + 1] = {0};                                        \
+  TYPE3 e_4_##TYPE3[129] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_4_##TYPE1, d_4_##TYPE2, e_4_##TYPE3, x_4_##TYPE1,  \
+			  x2_4_##TYPE1, x3_4_##TYPE1, x4_4_##TYPE1,            \
+			  y_4_##TYPE2, y2_4_##TYPE2, z_4_##TYPE3,              \
+			  n_4_##TYPE1_##TYPE2_##TYPE3);                        \
+  for (int i = 0; i < n_4_##TYPE1_##TYPE2_##TYPE3; ++i)                        \
+    { /* Each stored element must equal its input.  */                         \
+      if (f_4_##TYPE1[i * 4 + 0] != x_4_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_4_##TYPE1[i * 4 + 1] != x2_4_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_4_##TYPE1[i * 4 + 2] != x3_4_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_4_##TYPE1[i * 4 + 3] != x4_4_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_4_##TYPE2[i * 2 + 0] != y_4_##TYPE2)                               \
+	__builtin_abort ();                                                    \
+      if (d_4_##TYPE2[i * 2 + 1] != y2_4_##TYPE2)                              \
+	__builtin_abort ();                                                    \
+      if (e_4_##TYPE3[i] != z_4_##TYPE3)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_4_##TYPE1_##TYPE2_##TYPE3;                                    \
+       i < n_4_##TYPE1_##TYPE2_##TYPE3 + 1; ++i)                               \
+    { /* Group i == n must still be zero.  */                                  \
+      if (f_4_##TYPE1[i * 4 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_4_##TYPE1[i * 4 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_4_##TYPE1[i * 4 + 2] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_4_##TYPE1[i * 4 + 3] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_4_##TYPE2[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_4_##TYPE2[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (e_4_##TYPE3[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_5(TYPE1, TYPE2, TYPE3) /* Run test_1 with n = 177 and check.  */   \
+  int n_5_##TYPE1_##TYPE2_##TYPE3 = 177;                                       \
+  TYPE1 x_5_##TYPE1 = 239;                                                     \
+  TYPE1 x2_5_##TYPE1 = 132;                                                    \
+  TYPE1 x3_5_##TYPE1 = 39;                                                     \
+  TYPE1 x4_5_##TYPE1 = 48;                                                     \
+  TYPE2 y_5_##TYPE2 = 1036;                                                    \
+  TYPE2 y2_5_##TYPE2 = 3665;                                                   \
+  TYPE3 z_5_##TYPE3 = 5145656;                                                 \
+  TYPE1 f_5_##TYPE1[178 * 4 + 1] = {0}; /* Room for n + 1 groups, zeroed.  */  \
+  TYPE2 d_5_##TYPE2[178 * 2 + 1] = {0};                                        \
+  TYPE3 e_5_##TYPE3[178] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_5_##TYPE1, d_5_##TYPE2, e_5_##TYPE3, x_5_##TYPE1,  \
+			  x2_5_##TYPE1, x3_5_##TYPE1, x4_5_##TYPE1,            \
+			  y_5_##TYPE2, y2_5_##TYPE2, z_5_##TYPE3,              \
+			  n_5_##TYPE1_##TYPE2_##TYPE3);                        \
+  for (int i = 0; i < n_5_##TYPE1_##TYPE2_##TYPE3; ++i)                        \
+    { /* Each stored element must equal its input.  */                         \
+      if (f_5_##TYPE1[i * 4 + 0] != x_5_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_5_##TYPE1[i * 4 + 1] != x2_5_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_5_##TYPE1[i * 4 + 2] != x3_5_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_5_##TYPE1[i * 4 + 3] != x4_5_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_5_##TYPE2[i * 2 + 0] != y_5_##TYPE2)                               \
+	__builtin_abort ();                                                    \
+      if (d_5_##TYPE2[i * 2 + 1] != y2_5_##TYPE2)                              \
+	__builtin_abort ();                                                    \
+      if (e_5_##TYPE3[i] != z_5_##TYPE3)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_5_##TYPE1_##TYPE2_##TYPE3;                                    \
+       i < n_5_##TYPE1_##TYPE2_##TYPE3 + 1; ++i)                               \
+    { /* Group i == n must still be zero.  */                                  \
+      if (f_5_##TYPE1[i * 4 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_5_##TYPE1[i * 4 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_5_##TYPE1[i * 4 + 2] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_5_##TYPE1[i * 4 + 3] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_5_##TYPE2[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_5_##TYPE2[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (e_5_##TYPE3[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_6(TYPE1, TYPE2, TYPE3) /* Run test_1 with n = 255 and check.  */   \
+  int n_6_##TYPE1_##TYPE2_##TYPE3 = 255;                                       \
+  TYPE1 x_6_##TYPE1 = 239;                                                     \
+  TYPE1 x2_6_##TYPE1 = 132;                                                    \
+  TYPE1 x3_6_##TYPE1 = 39;                                                     \
+  TYPE1 x4_6_##TYPE1 = 48;                                                     \
+  TYPE2 y_6_##TYPE2 = 1036;                                                    \
+  TYPE2 y2_6_##TYPE2 = 3665;                                                   \
+  TYPE3 z_6_##TYPE3 = 5145656;                                                 \
+  TYPE1 f_6_##TYPE1[256 * 4 + 1] = {0}; /* Room for n + 1 groups, zeroed.  */  \
+  TYPE2 d_6_##TYPE2[256 * 2 + 1] = {0};                                        \
+  TYPE3 e_6_##TYPE3[256] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_6_##TYPE1, d_6_##TYPE2, e_6_##TYPE3, x_6_##TYPE1,  \
+			  x2_6_##TYPE1, x3_6_##TYPE1, x4_6_##TYPE1,            \
+			  y_6_##TYPE2, y2_6_##TYPE2, z_6_##TYPE3,              \
+			  n_6_##TYPE1_##TYPE2_##TYPE3);                        \
+  for (int i = 0; i < n_6_##TYPE1_##TYPE2_##TYPE3; ++i)                        \
+    { /* Each stored element must equal its input.  */                         \
+      if (f_6_##TYPE1[i * 4 + 0] != x_6_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_6_##TYPE1[i * 4 + 1] != x2_6_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_6_##TYPE1[i * 4 + 2] != x3_6_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_6_##TYPE1[i * 4 + 3] != x4_6_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_6_##TYPE2[i * 2 + 0] != y_6_##TYPE2)                               \
+	__builtin_abort ();                                                    \
+      if (d_6_##TYPE2[i * 2 + 1] != y2_6_##TYPE2)                              \
+	__builtin_abort ();                                                    \
+      if (e_6_##TYPE3[i] != z_6_##TYPE3)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_6_##TYPE1_##TYPE2_##TYPE3;                                    \
+       i < n_6_##TYPE1_##TYPE2_##TYPE3 + 1; ++i)                               \
+    { /* Group i == n must still be zero.  */                                  \
+      if (f_6_##TYPE1[i * 4 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_6_##TYPE1[i * 4 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_6_##TYPE1[i * 4 + 2] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_6_##TYPE1[i * 4 + 3] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_6_##TYPE2[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_6_##TYPE2[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (e_6_##TYPE3[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_7(TYPE1, TYPE2, TYPE3) /* Run test_1 with n = 333 and check.  */   \
+  int n_7_##TYPE1_##TYPE2_##TYPE3 = 333;                                       \
+  TYPE1 x_7_##TYPE1 = 239;                                                     \
+  TYPE1 x2_7_##TYPE1 = 132;                                                    \
+  TYPE1 x3_7_##TYPE1 = 39;                                                     \
+  TYPE1 x4_7_##TYPE1 = 48;                                                     \
+  TYPE2 y_7_##TYPE2 = 1036;                                                    \
+  TYPE2 y2_7_##TYPE2 = 3665;                                                   \
+  TYPE3 z_7_##TYPE3 = 5145656;                                                 \
+  TYPE1 f_7_##TYPE1[334 * 4 + 1] = {0}; /* Room for n + 1 groups, zeroed.  */  \
+  TYPE2 d_7_##TYPE2[334 * 2 + 1] = {0};                                        \
+  TYPE3 e_7_##TYPE3[334] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_7_##TYPE1, d_7_##TYPE2, e_7_##TYPE3, x_7_##TYPE1,  \
+			  x2_7_##TYPE1, x3_7_##TYPE1, x4_7_##TYPE1,            \
+			  y_7_##TYPE2, y2_7_##TYPE2, z_7_##TYPE3,              \
+			  n_7_##TYPE1_##TYPE2_##TYPE3);                        \
+  for (int i = 0; i < n_7_##TYPE1_##TYPE2_##TYPE3; ++i)                        \
+    { /* Each stored element must equal its input.  */                         \
+      if (f_7_##TYPE1[i * 4 + 0] != x_7_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_7_##TYPE1[i * 4 + 1] != x2_7_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_7_##TYPE1[i * 4 + 2] != x3_7_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_7_##TYPE1[i * 4 + 3] != x4_7_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_7_##TYPE2[i * 2 + 0] != y_7_##TYPE2)                               \
+	__builtin_abort ();                                                    \
+      if (d_7_##TYPE2[i * 2 + 1] != y2_7_##TYPE2)                              \
+	__builtin_abort ();                                                    \
+      if (e_7_##TYPE3[i] != z_7_##TYPE3)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_7_##TYPE1_##TYPE2_##TYPE3;                                    \
+       i < n_7_##TYPE1_##TYPE2_##TYPE3 + 1; ++i)                               \
+    { /* Group i == n must still be zero.  */                                  \
+      if (f_7_##TYPE1[i * 4 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_7_##TYPE1[i * 4 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_7_##TYPE1[i * 4 + 2] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_7_##TYPE1[i * 4 + 3] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_7_##TYPE2[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_7_##TYPE2[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (e_7_##TYPE3[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_8(TYPE1, TYPE2, TYPE3) /* Run test_1 with n = 512 and check.  */   \
+  int n_8_##TYPE1_##TYPE2_##TYPE3 = 512;                                       \
+  TYPE1 x_8_##TYPE1 = 239;                                                     \
+  TYPE1 x2_8_##TYPE1 = 132;                                                    \
+  TYPE1 x3_8_##TYPE1 = 39;                                                     \
+  TYPE1 x4_8_##TYPE1 = 48;                                                     \
+  TYPE2 y_8_##TYPE2 = 1036;                                                    \
+  TYPE2 y2_8_##TYPE2 = 3665;                                                   \
+  TYPE3 z_8_##TYPE3 = 5145656;                                                 \
+  TYPE1 f_8_##TYPE1[513 * 4 + 1] = {0}; /* Room for n + 1 groups, zeroed.  */  \
+  TYPE2 d_8_##TYPE2[513 * 2 + 1] = {0};                                        \
+  TYPE3 e_8_##TYPE3[513] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_8_##TYPE1, d_8_##TYPE2, e_8_##TYPE3, x_8_##TYPE1,  \
+			  x2_8_##TYPE1, x3_8_##TYPE1, x4_8_##TYPE1,            \
+			  y_8_##TYPE2, y2_8_##TYPE2, z_8_##TYPE3,              \
+			  n_8_##TYPE1_##TYPE2_##TYPE3);                        \
+  for (int i = 0; i < n_8_##TYPE1_##TYPE2_##TYPE3; ++i)                        \
+    { /* Each stored element must equal its input.  */                         \
+      if (f_8_##TYPE1[i * 4 + 0] != x_8_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_8_##TYPE1[i * 4 + 1] != x2_8_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_8_##TYPE1[i * 4 + 2] != x3_8_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_8_##TYPE1[i * 4 + 3] != x4_8_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_8_##TYPE2[i * 2 + 0] != y_8_##TYPE2)                               \
+	__builtin_abort ();                                                    \
+      if (d_8_##TYPE2[i * 2 + 1] != y2_8_##TYPE2)                              \
+	__builtin_abort ();                                                    \
+      if (e_8_##TYPE3[i] != z_8_##TYPE3)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_8_##TYPE1_##TYPE2_##TYPE3;                                    \
+       i < n_8_##TYPE1_##TYPE2_##TYPE3 + 1; ++i)                               \
+    { /* Group i == n must still be zero.  */                                  \
+      if (f_8_##TYPE1[i * 4 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_8_##TYPE1[i * 4 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_8_##TYPE1[i * 4 + 2] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_8_##TYPE1[i * 4 + 3] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_8_##TYPE2[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_8_##TYPE2[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (e_8_##TYPE3[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_9(TYPE1, TYPE2, TYPE3) /* Check test_1 with n = 637.  */           \
+  int n_9_##TYPE1_##TYPE2_##TYPE3 = 637;                                       \
+  TYPE1 x_9_##TYPE1 = 222;                                                     \
+  TYPE1 x2_9_##TYPE1 = 111;                                                    \
+  TYPE1 x3_9_##TYPE1 = 11;                                                     \
+  TYPE1 x4_9_##TYPE1 = 7;                                                     \
+  TYPE2 y_9_##TYPE2 = 2034;                                                    \
+  TYPE2 y2_9_##TYPE2 = 6987;                                                   \
+  TYPE3 z_9_##TYPE3 = 1564616;                                                 \
+  TYPE1 f_9_##TYPE1[638 * 4 + 1] = {0};                                        \
+  TYPE2 d_9_##TYPE2[638 * 2 + 1] = {0};                                        \
+  TYPE3 e_9_##TYPE3[638] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_9_##TYPE1, d_9_##TYPE2, e_9_##TYPE3, x_9_##TYPE1,  \
+			  x2_9_##TYPE1, x3_9_##TYPE1, x4_9_##TYPE1,            \
+			  y_9_##TYPE2, y2_9_##TYPE2, z_9_##TYPE3,              \
+			  n_9_##TYPE1_##TYPE2_##TYPE3);                        \
+  for (int i = 0; i < n_9_##TYPE1_##TYPE2_##TYPE3; ++i)                        \
+    { /* Groups [0, n) must hold the values passed in.  */                     \
+      if (f_9_##TYPE1[i * 4 + 0] != x_9_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_9_##TYPE1[i * 4 + 1] != x2_9_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_9_##TYPE1[i * 4 + 2] != x3_9_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_9_##TYPE1[i * 4 + 3] != x4_9_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_9_##TYPE2[i * 2 + 0] != y_9_##TYPE2)                               \
+	__builtin_abort ();                                                    \
+      if (d_9_##TYPE2[i * 2 + 1] != y2_9_##TYPE2)                              \
+	__builtin_abort ();                                                    \
+      if (e_9_##TYPE3[i] != z_9_##TYPE3)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_9_##TYPE1_##TYPE2_##TYPE3;                                    \
+       i < n_9_##TYPE1_##TYPE2_##TYPE3 + 1; ++i)                               \
+    { /* The single group at index n must still be zero.  */                   \
+      if (f_9_##TYPE1[i * 4 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_9_##TYPE1[i * 4 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_9_##TYPE1[i * 4 + 2] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_9_##TYPE1[i * 4 + 3] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_9_##TYPE2[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_9_##TYPE2[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (e_9_##TYPE3[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define run_10(TYPE1, TYPE2, TYPE3) /* Check test_1 with n = 777.  */           \
+  int n_10_##TYPE1_##TYPE2_##TYPE3 = 777;                                       \
+  TYPE1 x_10_##TYPE1 = 222;                                                     \
+  TYPE1 x2_10_##TYPE1 = 111;                                                    \
+  TYPE1 x3_10_##TYPE1 = 11;                                                     \
+  TYPE1 x4_10_##TYPE1 = 7;                                                     \
+  TYPE2 y_10_##TYPE2 = 2034;                                                    \
+  TYPE2 y2_10_##TYPE2 = 6987;                                                   \
+  TYPE3 z_10_##TYPE3 = 1564616;                                                 \
+  TYPE1 f_10_##TYPE1[778 * 4 + 1] = {0};                                        \
+  TYPE2 d_10_##TYPE2[778 * 2 + 1] = {0};                                        \
+  TYPE3 e_10_##TYPE3[778] = {0};                                                \
+  test_1_##TYPE1_##TYPE2 (f_10_##TYPE1, d_10_##TYPE2, e_10_##TYPE3, x_10_##TYPE1,  \
+			  x2_10_##TYPE1, x3_10_##TYPE1, x4_10_##TYPE1,            \
+			  y_10_##TYPE2, y2_10_##TYPE2, z_10_##TYPE3,              \
+			  n_10_##TYPE1_##TYPE2_##TYPE3);                        \
+  for (int i = 0; i < n_10_##TYPE1_##TYPE2_##TYPE3; ++i)                        \
+    { /* Groups [0, n) must hold the values passed in.  */                     \
+      if (f_10_##TYPE1[i * 4 + 0] != x_10_##TYPE1)                               \
+	__builtin_abort ();                                                    \
+      if (f_10_##TYPE1[i * 4 + 1] != x2_10_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_10_##TYPE1[i * 4 + 2] != x3_10_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (f_10_##TYPE1[i * 4 + 3] != x4_10_##TYPE1)                              \
+	__builtin_abort ();                                                    \
+      if (d_10_##TYPE2[i * 2 + 0] != y_10_##TYPE2)                               \
+	__builtin_abort ();                                                    \
+      if (d_10_##TYPE2[i * 2 + 1] != y2_10_##TYPE2)                              \
+	__builtin_abort ();                                                    \
+      if (e_10_##TYPE3[i] != z_10_##TYPE3)                                       \
+	__builtin_abort ();                                                    \
+    }                                                                          \
+  for (int i = n_10_##TYPE1_##TYPE2_##TYPE3;                                    \
+       i < n_10_##TYPE1_##TYPE2_##TYPE3 + 1; ++i)                               \
+    { /* The single group at index n must still be zero.  */                   \
+      if (f_10_##TYPE1[i * 4 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_10_##TYPE1[i * 4 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_10_##TYPE1[i * 4 + 2] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (f_10_##TYPE1[i * 4 + 3] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_10_##TYPE2[i * 2 + 0] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (d_10_##TYPE2[i * 2 + 1] != 0)                                         \
+	__builtin_abort ();                                                    \
+      if (e_10_##TYPE3[i] != 0)                                                 \
+	__builtin_abort ();                                                    \
+    }
+
+#define TEST_ALL(T) /* Expand T once for each of four type triples.  */        \
+  T (int8_t, int16_t, int32_t)                                                 \
+  T (uint8_t, uint16_t, uint32_t)                                              \
+  T (int16_t, int32_t, int64_t)                                                \
+  T (uint16_t, uint32_t, uint64_t)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c
new file mode 100644
index 00000000000..d3e187eae68
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c
@@ -0,0 +1,19 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=fixed-vlmax" } */
+
+#include "multiple_rgroup-1.c"
+
+int main (void)
+{ /* Expand each run_N through TEST_ALL (both defined outside this file).  */
+  TEST_ALL (run_1)
+  TEST_ALL (run_2)
+  TEST_ALL (run_3)
+  TEST_ALL (run_4)
+  TEST_ALL (run_5)
+  TEST_ALL (run_6)
+  TEST_ALL (run_7)
+  TEST_ALL (run_8)
+  TEST_ALL (run_9)
+  TEST_ALL (run_10)
+  return 0; /* Presumably reached only if no run_N check aborted.  */
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c
new file mode 100644
index 00000000000..5166c9e35a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c
@@ -0,0 +1,19 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=fixed-vlmax" } */
+
+#include "multiple_rgroup-2.c"
+
+int main (void)
+{ /* Expand each run_N through TEST_ALL (both defined outside this file).  */
+  TEST_ALL (run_1)
+  TEST_ALL (run_2)
+  TEST_ALL (run_3)
+  TEST_ALL (run_4)
+  TEST_ALL (run_5)
+  TEST_ALL (run_6)
+  TEST_ALL (run_7)
+  TEST_ALL (run_8)
+  TEST_ALL (run_9)
+  TEST_ALL (run_10)
+  return 0; /* Presumably reached only if no run_N check aborted.  */
+}
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index ff6159e08d5..828490e85fc 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -385,6 +385,58 @@  vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
   return false;
 }
 
+/* Set the lengths of DEST_RGM, a non-SLP rgroup with several controls, by
+   splitting the length held in the first control of SRC_RGM.  Arithmetic
+   is done in IV_TYPE and the new statements are appended to SEQ.  E.g.:
+
+     _36 = MIN_EXPR <ivtmp_34, VF>;
+
+     First length (MIN (X, VF/N)):
+       loop_len_15 = MIN_EXPR <_36, VF/N>;
+
+     Second length:
+       tmp = _36 - loop_len_15;
+       loop_len_16 = MIN (tmp, VF/N);
+
+     Third length:
+       tmp2 = tmp - loop_len_16;
+       loop_len_17 = MIN (tmp2, VF/N);
+
+     Fourth length:
+       tmp3 = tmp2 - loop_len_17;
+       loop_len_18 = MIN (tmp3, VF/N);  */
+
+static void
+vect_adjust_loop_lens (tree iv_type, gimple_seq *seq, rgroup_controls *dest_rgm,
+		       rgroup_controls *src_rgm)
+{
+  tree ctrl_type = dest_rgm->type;
+  poly_uint64 nitems_per_ctrl
+    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
+  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
+  /* Items still uncovered.  Lengths are taken off this running remainder,
+     not off the total, so that they never sum to more than the total.  */
+  tree remain = NULL_TREE;
+  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
+    {
+      if (i == 0)
+	remain = src_rgm->controls[0];
+      else
+	{
+	  tree temp = make_ssa_name (iv_type);
+	  gassign *minus = gimple_build_assign (temp, MINUS_EXPR, remain,
+						dest_rgm->controls[i - 1]);
+	  gimple_seq_add_stmt (seq, minus);
+	  remain = temp;
+	}
+      /* MIN (remain, VF/N) keeps the length in the range [0, VF/N].  */
+      gassign *stmt = gimple_build_assign (dest_rgm->controls[i], MIN_EXPR,
+					   remain, length_limit);
+      gimple_seq_add_stmt (seq, stmt);
+    }
+}
+
 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
    for all the rgroup controls in RGC and return a control that is nonzero
    when the loop needs to iterate.  Add any new preheader statements to
@@ -467,13 +519,106 @@  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
   tree index_before_incr, index_after_incr;
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
+  tree ctrl;
+  unsigned int i;
+  tree test_index, test_limit, first_limit;
   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
-  create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
-	     loop, &incr_gsi, insert_after, &index_before_incr,
-	     &index_after_incr);
+  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+    {
+      nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
+      tree step = make_ssa_name (iv_type);
+      /* Create decrement IV.  */
+      create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
+		 insert_after, &index_before_incr, &index_after_incr);
+      tree temp = gimple_build (header_seq, MIN_EXPR, iv_type,
+				index_before_incr, nitems_step);
+      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, temp));
+
+      FOR_EACH_VEC_ELT (rgc->controls, i, ctrl)
+	{
+	  if (rgc->max_nscalars_per_iter == 1)
+	    {
+	      /* single rgroup:
+		 ...
+		 _10 = (unsigned long) count_12(D);
+		 ...
+		 # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
+		 _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
+		 ...
+		 vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
+		 ...
+		 ivtmp_35 = ivtmp_9 - _36;
+		 ...
+		 if (ivtmp_35 != 0)
+		   goto <bb 4>; [83.33%]
+		 else
+		   goto <bb 5>; [16.67%]
+	      */
+	      gassign *assign = gimple_build_assign (ctrl, step);
+	      gimple_seq_add_stmt (header_seq, assign);
+	    }
+	  else
+	    {
+	      /* Multiple rgroup (SLP):
+		 ...
+		 _38 = (unsigned long) bnd.7_29;
+		 _39 = _38 * 2;
+		 ...
+		 # ivtmp_41 = PHI <ivtmp_42(6), _39(5)>
+		 ...
+		 _43 = MIN_EXPR <ivtmp_41, 32>;
+		 loop_len_26 = MIN_EXPR <_43, 16>;
+		 loop_len_25 = _43 - loop_len_26;
+		 ...
+		 .LEN_STORE (_6, 8B, loop_len_26, ...);
+		 ...
+		 .LEN_STORE (_25, 8B, loop_len_25, ...);
+		 _33 = loop_len_26 / 2;
+		 ...
+		 .LEN_STORE (_8, 16B, _33, ...);
+		 _36 = loop_len_25 / 2;
+		 ...
+		 .LEN_STORE (_15, 16B, _36, ...);
+		 ivtmp_42 = ivtmp_41 - _43;
+		 ...
+		 if (ivtmp_42 != 0)
+		   goto <bb 4>; [83.33%]
+		 else
+		  goto <bb 5>; [16.67%]
+	      */
+	      if (i == 0)
+		{
+		  gassign *assign
+		    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
+		  gimple_seq_add_stmt (header_seq, assign);
+		}
+	      else if (i == rgc->controls.length () - 1)
+		{
+		  gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
+							 rgc->controls[i - 1]);
+		  gimple_seq_add_stmt (header_seq, assign);
+		}
+	      else
+		{
+		  step = gimple_build (header_seq, MINUS_EXPR, iv_type, step,
+				       rgc->controls[i - 1]);
+		  gassign *assign
+		    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
+		  gimple_seq_add_stmt (header_seq, assign);
+		}
+	    }
+	}
+      return index_after_incr;
+    }
+  else
+    {
+      /* Create increment IV.  */
+      create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
+		 loop, &incr_gsi, insert_after, &index_before_incr,
+		 &index_after_incr);
+    }
 
   tree zero_index = build_int_cst (compare_type, 0);
-  tree test_index, test_limit, first_limit;
   gimple_stmt_iterator *test_gsi;
   if (might_wrap_p)
     {
@@ -557,8 +702,6 @@  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 
   /* Provide a definition of each control in the group.  */
   tree next_ctrl = NULL_TREE;
-  tree ctrl;
-  unsigned int i;
   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
     {
       /* Previous controls will cover BIAS items.  This control covers the
@@ -704,6 +847,7 @@  vect_set_loop_condition_partial_vectors (class loop *loop,
 
   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
   tree orig_niters = niters;
 
@@ -753,6 +897,53 @@  vect_set_loop_condition_partial_vectors (class loop *loop,
 	      continue;
 	  }
 
+	if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
+	    && rgc->max_nscalars_per_iter == 1
+	    && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
+	  {
+	    /* Multiple rgroup (non-SLP):
+	      ...
+	      _38 = (unsigned long) n_12(D);
+	      ...
+	      # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
+	      ...
+	      _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
+	      loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
+	      _41 = _40 - loop_len_21;
+	      loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
+	      _42 = _40 - loop_len_20;
+	      loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
+	      _43 = _40 - loop_len_19;
+	      loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
+	      ...
+	      vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
+	      ...
+	      vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
+	      ...
+	      vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
+	      ...
+	      vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
+	      vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
+	      vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
+	      vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
+	      ...
+	      .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
+	      ivtmp_39 = ivtmp_38 - _40;
+	      ...
+	      if (ivtmp_39 != 0)
+		goto <bb 3>; [92.31%]
+	      else
+		goto <bb 4>; [7.69%]
+	    */
+	    rgroup_controls *sub_rgc
+	      = &(*controls)[nmasks / rgc->controls.length () - 1];
+	    if (!sub_rgc->controls.is_empty ())
+	      {
+		vect_adjust_loop_lens (iv_type, &header_seq, rgc, sub_rgc);
+		continue;
+	      }
+	  }
+
 	/* See whether zero-based IV would ever generate all-false masks
 	   or zero length before wrapping around.  */
 	bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index ed0166fedab..6f49bdee009 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -973,6 +973,7 @@  _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     vectorizable (false),
     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
     using_partial_vectors_p (false),
+    using_decrementing_iv_p (false),
     epil_using_partial_vectors_p (false),
     partial_load_store_bias (0),
     peeling_for_gaps (false),
@@ -2725,6 +2726,16 @@  start_over:
       && !vect_verify_loop_lens (loop_vinfo))
     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 
+  /* If we're vectorizing a loop that uses length "controls" and
+     can iterate more than once, we apply decrementing IV approach
+     in loop control.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
+      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+	   && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
+			LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
+    LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
+
   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
      to be able to handle fewer than VF scalars, or needs to have a lower VF
      than the main loop.  */
@@ -10364,12 +10375,14 @@  vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
 
 tree
-vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
-		   unsigned int nvectors, unsigned int index)
+vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+		   vec_loop_lens *lens, unsigned int nvectors, tree vectype,
+		   unsigned int index)
 {
   rgroup_controls *rgl = &(*lens)[nvectors - 1];
   bool use_bias_adjusted_len =
     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 
   /* Populate the rgroup's len array, if this is the first time we've
      used it.  */
@@ -10400,6 +10413,26 @@  vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
 
   if (use_bias_adjusted_len)
     return rgl->bias_adjusted_ctrl;
+  else if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+    {
+      tree loop_len = rgl->controls[index];
+      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
+      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
+      if (maybe_ne (nunits1, nunits2))
+	{
+	  /* A loop len for data type X can be reused for data type Y
+	     if X has N times more elements than Y and if Y's elements
+	     are N times bigger than X's.  */
+	  gcc_assert (multiple_p (nunits1, nunits2));
+	  unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
+	  gimple_seq seq = NULL;
+	  loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
+				   build_int_cst (iv_type, factor));
+	  if (seq)
+	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+	}
+      return loop_len;
+    }
   else
     return rgl->controls[index];
 }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 7313191b0db..b5e4bc59355 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8795,8 +8795,9 @@  vectorizable_store (vec_info *vinfo,
 	      else if (loop_lens)
 		{
 		  tree final_len
-		    = vect_get_loop_len (loop_vinfo, loop_lens,
-					 vec_num * ncopies, vec_num * j + i);
+		    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+					 vec_num * ncopies, vectype,
+					 vec_num * j + i);
 		  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
 		  machine_mode vmode = TYPE_MODE (vectype);
 		  opt_machine_mode new_ovmode
@@ -10151,8 +10152,8 @@  vectorizable_load (vec_info *vinfo,
 		    else if (loop_lens && memory_access_type != VMAT_INVARIANT)
 		      {
 			tree final_len
-			  = vect_get_loop_len (loop_vinfo, loop_lens,
-					       vec_num * ncopies,
+			  = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+					       vec_num * ncopies, vectype,
 					       vec_num * j + i);
 			tree ptr = build_int_cst (ref_type,
 						  align * BITS_PER_UNIT);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9cf2fb23fe3..8af3b35324e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -818,6 +818,13 @@  public:
      the vector loop can handle fewer than VF scalars.  */
   bool using_partial_vectors_p;
 
+  /* True if we've decided to use a decrementing loop control IV that counts
+     scalars.  This can be done for any loop that:
+
+	(a) uses length "controls"; and
+	(b) can iterate more than once.  */
+  bool using_decrementing_iv_p;
+
   /* True if we've decided to use partially-populated vectors for the
      epilogue of loop.  */
   bool epil_using_partial_vectors_p;
@@ -890,6 +897,7 @@  public:
 #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
 #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
 #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
+#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
 #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
   (L)->epil_using_partial_vectors_p
 #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
@@ -2293,8 +2301,9 @@  extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
 				unsigned int, tree, unsigned int);
 extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
 				  tree, unsigned int);
-extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
-			       unsigned int);
+extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
+			       vec_loop_lens *, unsigned int, tree,
+ 			       unsigned int);
 extern gimple_seq vect_gen_len (tree, tree, tree, tree);
 extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
 extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);