diff mbox

[00/10] vect: Reuse reduction accumulators between loops

Message ID mptpmvtrner.fsf@arm.com
State New
Headers show

Commit Message

Richard Sandiford July 8, 2021, 12:38 p.m. UTC
Quoting from the final patch in the series:

------------------------------------------------------------------------
This patch adds support for reusing a main loop's reduction accumulator
in an epilogue loop.  This in turn lets the loops share a single piece
of vector->scalar reduction code.

The patch has the following restrictions:

(1) The epilogue reduction can only operate on a single vector
    (e.g. ncopies must be 1 for non-SLP reductions, and the group size
    must be <= the element count for SLP reductions).

(2) Both loops must use the same vector mode for their accumulators.
    This means that the patch is restricted to targets that support
    --param vect-partial-vector-usage=1.

(3) The reduction must be a standard “tree code” reduction.

However, these restrictions could be lifted in future.  For example,
if the main loop operates on 128-bit vectors and the epilogue loop
operates on 64-bit vectors, we could in future reduce the 128-bit
vector by one stage and use the 64-bit result as the starting point
for the epilogue result.

The patch tries to handle chained SLP reductions, unchained SLP
reductions and non-SLP reductions.  It also handles cases in which
the epilogue loop is entered directly (rather than via the main loop)
and cases in which the epilogue loop can be skipped.
------------------------------------------------------------------------

However, it ended up being difficult to do that without some preparatory
clean-ups.  Some of them could probably stand on their own, but others
are a bit “meh” without the final patch to justify them.

The diff below shows the effect of the patch when compiling:

  unsigned short __attribute__((noipa))
  add_loop (unsigned short *x, int n)
  {
    unsigned short res = 0;
    for (int i = 0; i < n; ++i)
      res += x[i];
    return res;
  }

with -O3 --param vect-partial-vector-usage=1 on an SVE target:

add_loop:				add_loop:
.LFB0:					.LFB0:
	.cfi_startproc				.cfi_startproc
	mov	x4, x0		      <
	cmp	w1, 0				cmp	w1, 0
	ble	.L7				ble	.L7
	cnth	x0		      |		cnth	x4
	sub	w2, w1, #1			sub	w2, w1, #1
	sub	w3, w0, #1	      |		sub	w3, w4, #1
	cmp	w2, w3				cmp	w2, w3
	bcc	.L8				bcc	.L8
	sub	w0, w1, w0	      |		sub	w4, w1, w4
	mov	x3, 0				mov	x3, 0
	cnth	x5				cnth	x5
	mov	z0.b, #0			mov	z0.b, #0
	ptrue	p0.b, all			ptrue	p0.b, all
	.p2align 3,,7				.p2align 3,,7
.L4:					.L4:
	ld1h	z1.h, p0/z, [x4, x3,  |		ld1h	z1.h, p0/z, [x0, x3, 
	mov	x2, x3				mov	x2, x3
	add	x3, x3, x5			add	x3, x3, x5
	add	z0.h, z0.h, z1.h		add	z0.h, z0.h, z1.h
	cmp	w0, w3		      |		cmp	w4, w3
	bcs	.L4				bcs	.L4
	uaddv	d0, p0, z0.h	      <
	umov	w0, v0.h[0]	      <
	inch	x2				inch	x2
	and	w0, w0, 65535	      <
	cmp	w1, w2				cmp	w1, w2
	beq	.L2		      |		beq	.L6
.L3:					.L3:
	sub	w1, w1, w2			sub	w1, w1, w2
	mov	z1.b, #0	      |		add	x2, x0, w2, uxtw 1
	whilelo	p0.h, wzr, w1			whilelo	p0.h, wzr, w1
	add	x2, x4, w2, uxtw 1    |		ld1h	z1.h, p0/z, [x2]
	ptrue	p1.b, all	      |		add	z0.h, p0/m, z0.h, z1.
	ld1h	z0.h, p0/z, [x2]      |	.L6:
	sel	z0.h, p0, z0.h, z1.h  |		ptrue	p0.b, all
	uaddv	d0, p1, z0.h	      |		uaddv	d0, p0, z0.h
	fmov	x1, d0		      |		umov	w0, v0.h[0]
	add	w0, w0, w1, uxth      <
	and	w0, w0, 65535			and	w0, w0, 65535
.L2:				      <
	ret					ret
	.p2align 2,,3				.p2align 2,,3
.L7:					.L7:
	mov	w0, 0				mov	w0, 0
	ret					ret
.L8:					.L8:
	mov	w2, 0				mov	w2, 0
	mov	w0, 0		      |		mov	z0.b, #0
	b	.L3				b	.L3
	.cfi_endproc				.cfi_endproc

Kewen, could you give this a spin on Power 10 to see whether it
works/helps there?  I've attached a combined diff.

Series tested on aarch64-linux-gnu and x86_64-linux-gnu.

Richard

Comments

Kewen.Lin July 10, 2021, 2:11 a.m. UTC | #1
Hi Richard,

on 2021/7/8 下午8:38, Richard Sandiford via Gcc-patches wrote:
> Quoting from the final patch in the series:
> 
> ------------------------------------------------------------------------
> This patch adds support for reusing a main loop's reduction accumulator
> in an epilogue loop.  This in turn lets the loops share a single piece
> of vector->scalar reduction code.
> 
> The patch has the following restrictions:
> 
> (1) The epilogue reduction can only operate on a single vector
>     (e.g. ncopies must be 1 for non-SLP reductions, and the group size
>     must be <= the element count for SLP reductions).
> 
> (2) Both loops must use the same vector mode for their accumulators.
>     This means that the patch is restricted to targets that support
>     --param vect-partial-vector-usage=1.
> 
> (3) The reduction must be a standard “tree code” reduction.
> 
> However, these restrictions could be lifted in future.  For example,
> if the main loop operates on 128-bit vectors and the epilogue loop
> operates on 64-bit vectors, we could in future reduce the 128-bit
> vector by one stage and use the 64-bit result as the starting point
> for the epilogue result.
> 
> The patch tries to handle chained SLP reductions, unchained SLP
> reductions and non-SLP reductions.  It also handles cases in which
> the epilogue loop is entered directly (rather than via the main loop)
> and cases in which the epilogue loop can be skipped.
> ------------------------------------------------------------------------
> 
> However, it ended up being difficult to do that without some preparatory
> clean-ups.  Some of them could probably stand on their own, but others
> are a bit “meh” without the final patch to justify them.
> 
> The diff below shows the effect of the patch when compiling:
> 
>   unsigned short __attribute__((noipa))
>   add_loop (unsigned short *x, int n)
>   {
>     unsigned short res = 0;
>     for (int i = 0; i < n; ++i)
>       res += x[i];
>     return res;
>   }
> 
> with -O3 --param vect-partial-vector-usage=1 on an SVE target:
> 
> add_loop:				add_loop:
> .LFB0:					.LFB0:
> 	.cfi_startproc				.cfi_startproc
> 	mov	x4, x0		      <
> 	cmp	w1, 0				cmp	w1, 0
> 	ble	.L7				ble	.L7
> 	cnth	x0		      |		cnth	x4
> 	sub	w2, w1, #1			sub	w2, w1, #1
> 	sub	w3, w0, #1	      |		sub	w3, w4, #1
> 	cmp	w2, w3				cmp	w2, w3
> 	bcc	.L8				bcc	.L8
> 	sub	w0, w1, w0	      |		sub	w4, w1, w4
> 	mov	x3, 0				mov	x3, 0
> 	cnth	x5				cnth	x5
> 	mov	z0.b, #0			mov	z0.b, #0
> 	ptrue	p0.b, all			ptrue	p0.b, all
> 	.p2align 3,,7				.p2align 3,,7
> .L4:					.L4:
> 	ld1h	z1.h, p0/z, [x4, x3,  |		ld1h	z1.h, p0/z, [x0, x3, 
> 	mov	x2, x3				mov	x2, x3
> 	add	x3, x3, x5			add	x3, x3, x5
> 	add	z0.h, z0.h, z1.h		add	z0.h, z0.h, z1.h
> 	cmp	w0, w3		      |		cmp	w4, w3
> 	bcs	.L4				bcs	.L4
> 	uaddv	d0, p0, z0.h	      <
> 	umov	w0, v0.h[0]	      <
> 	inch	x2				inch	x2
> 	and	w0, w0, 65535	      <
> 	cmp	w1, w2				cmp	w1, w2
> 	beq	.L2		      |		beq	.L6
> .L3:					.L3:
> 	sub	w1, w1, w2			sub	w1, w1, w2
> 	mov	z1.b, #0	      |		add	x2, x0, w2, uxtw 1
> 	whilelo	p0.h, wzr, w1			whilelo	p0.h, wzr, w1
> 	add	x2, x4, w2, uxtw 1    |		ld1h	z1.h, p0/z, [x2]
> 	ptrue	p1.b, all	      |		add	z0.h, p0/m, z0.h, z1.
> 	ld1h	z0.h, p0/z, [x2]      |	.L6:
> 	sel	z0.h, p0, z0.h, z1.h  |		ptrue	p0.b, all
> 	uaddv	d0, p1, z0.h	      |		uaddv	d0, p0, z0.h
> 	fmov	x1, d0		      |		umov	w0, v0.h[0]
> 	add	w0, w0, w1, uxth      <
> 	and	w0, w0, 65535			and	w0, w0, 65535
> .L2:				      <
> 	ret					ret
> 	.p2align 2,,3				.p2align 2,,3
> .L7:					.L7:
> 	mov	w0, 0				mov	w0, 0
> 	ret					ret
> .L8:					.L8:
> 	mov	w2, 0				mov	w2, 0
> 	mov	w0, 0		      |		mov	z0.b, #0
> 	b	.L3				b	.L3
> 	.cfi_endproc				.cfi_endproc
> 
> Kewen, could you give this a spin on Power 10 to see whether it
> works/helps there?  I've attached a combined diff.
> 

Thanks for the combined diff file.

I'm sorry that the current length based partial vector doesn't support
reduction, there are no conditional operations for length, we have to
preprocess the inactive lanes for the intermediate operations or final
reduction operations as operation types since the inactive lane value
is supposed to be undefined, this seems to require an efficient way to
turn length to a mask vector, Power10 doesn't have the corresponding
instruction so we have to do some tricks, it's still on my TODO list.
I did a hacking to relax the check in vectorizable_operation for
operations involved for reduction, I can see this patch series takes
effect for length based partial vector, so I believe it will help
length based partial vector once we enable it for reduction later.
Thanks for improving this!

This patch series was bootstrapped and regress-tested on Power10, also
benchmarked with SPEC2017 based on r12-2179 at Ofast unroll, no
remarkable regression and improvement was observed.

BR,
Kewen
Richard Sandiford July 13, 2021, 9:27 a.m. UTC | #2
"Kewen.Lin" <linkw@linux.ibm.com> writes:
> Hi Richard,
>
> on 2021/7/8 下午8:38, Richard Sandiford via Gcc-patches wrote:
>> Quoting from the final patch in the series:
>> 
>> ------------------------------------------------------------------------
>> This patch adds support for reusing a main loop's reduction accumulator
>> in an epilogue loop.  This in turn lets the loops share a single piece
>> of vector->scalar reduction code.
>> 
>> The patch has the following restrictions:
>> 
>> (1) The epilogue reduction can only operate on a single vector
>>     (e.g. ncopies must be 1 for non-SLP reductions, and the group size
>>     must be <= the element count for SLP reductions).
>> 
>> (2) Both loops must use the same vector mode for their accumulators.
>>     This means that the patch is restricted to targets that support
>>     --param vect-partial-vector-usage=1.
>> 
>> (3) The reduction must be a standard “tree code” reduction.
>> 
>> However, these restrictions could be lifted in future.  For example,
>> if the main loop operates on 128-bit vectors and the epilogue loop
>> operates on 64-bit vectors, we could in future reduce the 128-bit
>> vector by one stage and use the 64-bit result as the starting point
>> for the epilogue result.
>> 
>> The patch tries to handle chained SLP reductions, unchained SLP
>> reductions and non-SLP reductions.  It also handles cases in which
>> the epilogue loop is entered directly (rather than via the main loop)
>> and cases in which the epilogue loop can be skipped.
>> ------------------------------------------------------------------------
>> 
>> However, it ended up being difficult to do that without some preparatory
>> clean-ups.  Some of them could probably stand on their own, but others
>> are a bit “meh” without the final patch to justify them.
>> 
>> The diff below shows the effect of the patch when compiling:
>> 
>>   unsigned short __attribute__((noipa))
>>   add_loop (unsigned short *x, int n)
>>   {
>>     unsigned short res = 0;
>>     for (int i = 0; i < n; ++i)
>>       res += x[i];
>>     return res;
>>   }
>> 
>> with -O3 --param vect-partial-vector-usage=1 on an SVE target:
>> 
>> add_loop:				add_loop:
>> .LFB0:					.LFB0:
>> 	.cfi_startproc				.cfi_startproc
>> 	mov	x4, x0		      <
>> 	cmp	w1, 0				cmp	w1, 0
>> 	ble	.L7				ble	.L7
>> 	cnth	x0		      |		cnth	x4
>> 	sub	w2, w1, #1			sub	w2, w1, #1
>> 	sub	w3, w0, #1	      |		sub	w3, w4, #1
>> 	cmp	w2, w3				cmp	w2, w3
>> 	bcc	.L8				bcc	.L8
>> 	sub	w0, w1, w0	      |		sub	w4, w1, w4
>> 	mov	x3, 0				mov	x3, 0
>> 	cnth	x5				cnth	x5
>> 	mov	z0.b, #0			mov	z0.b, #0
>> 	ptrue	p0.b, all			ptrue	p0.b, all
>> 	.p2align 3,,7				.p2align 3,,7
>> .L4:					.L4:
>> 	ld1h	z1.h, p0/z, [x4, x3,  |		ld1h	z1.h, p0/z, [x0, x3, 
>> 	mov	x2, x3				mov	x2, x3
>> 	add	x3, x3, x5			add	x3, x3, x5
>> 	add	z0.h, z0.h, z1.h		add	z0.h, z0.h, z1.h
>> 	cmp	w0, w3		      |		cmp	w4, w3
>> 	bcs	.L4				bcs	.L4
>> 	uaddv	d0, p0, z0.h	      <
>> 	umov	w0, v0.h[0]	      <
>> 	inch	x2				inch	x2
>> 	and	w0, w0, 65535	      <
>> 	cmp	w1, w2				cmp	w1, w2
>> 	beq	.L2		      |		beq	.L6
>> .L3:					.L3:
>> 	sub	w1, w1, w2			sub	w1, w1, w2
>> 	mov	z1.b, #0	      |		add	x2, x0, w2, uxtw 1
>> 	whilelo	p0.h, wzr, w1			whilelo	p0.h, wzr, w1
>> 	add	x2, x4, w2, uxtw 1    |		ld1h	z1.h, p0/z, [x2]
>> 	ptrue	p1.b, all	      |		add	z0.h, p0/m, z0.h, z1.
>> 	ld1h	z0.h, p0/z, [x2]      |	.L6:
>> 	sel	z0.h, p0, z0.h, z1.h  |		ptrue	p0.b, all
>> 	uaddv	d0, p1, z0.h	      |		uaddv	d0, p0, z0.h
>> 	fmov	x1, d0		      |		umov	w0, v0.h[0]
>> 	add	w0, w0, w1, uxth      <
>> 	and	w0, w0, 65535			and	w0, w0, 65535
>> .L2:				      <
>> 	ret					ret
>> 	.p2align 2,,3				.p2align 2,,3
>> .L7:					.L7:
>> 	mov	w0, 0				mov	w0, 0
>> 	ret					ret
>> .L8:					.L8:
>> 	mov	w2, 0				mov	w2, 0
>> 	mov	w0, 0		      |		mov	z0.b, #0
>> 	b	.L3				b	.L3
>> 	.cfi_endproc				.cfi_endproc
>> 
>> Kewen, could you give this a spin on Power 10 to see whether it
>> works/helps there?  I've attached a combined diff.
>> 
>
> Thanks for the combined diff file.
>
> I'm sorry that the current length based partial vector doesn't support
> reduction, there are no conditional operations for length, we have to
> preprocess the inactive lanes for the intermediate operations or final
> reduction operations as operation types since the inactive lane value
> is supposed to be undefined, this seems to require an efficient way to
> turn length to a mask vector, Power10 doesn't have the corresponding
> instruction so we have to do some tricks, it's still on my TODO list.

Ah, yeah, I'd forgotten about that, sorry.

> I did a hacking to relax the check in vectorizable_operation for
> operations involved for reduction, I can see this patch series takes
> effect for length based partial vector, so I believe it will help
> length based partial vector once we enable it for reduction later.
> Thanks for improving this!
>
> This patch series was bootstrapped and regress-tested on Power10, also
> benchmarked with SPEC2017 based on r12-2179 at Ofast unroll, no
> remarkable regression and improvement was observed.

Thanks for the testing.

Richard
diff mbox

Patch

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
new file mode 100644
index 00000000000..fb817b73d77
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
@@ -0,0 +1,77 @@ 
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < n; ++i)
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x, int n)
+{
+  unsigned short res = ~0;
+  for (int i = 0; i < n; ++i)
+    res = res < x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < n; ++i)
+    res = res > x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x, int n)
+{
+  unsigned short res = ~0;
+  for (int i = 0; i < n; ++i)
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < n; ++i)
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x, int n)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < n; ++i)
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
new file mode 100644
index 00000000000..1dd579be701
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c
@@ -0,0 +1,49 @@ 
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_10.c"
+
+int
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);
+
+  if (add_loop (x, 0) != 0
+      || add_loop (x, 11) != 572
+      || add_loop (x, 0x100) != 22016
+      || add_loop (x, 0xfff) != 20480
+      || max_loop (x, 0) != 0
+      || max_loop (x, 11) != 132
+      || max_loop (x, 0x100) != 65280
+      || max_loop (x, 0xfff) != 65504
+      || or_loop (x, 0) != 0
+      || or_loop (x, 11) != 0xfe
+      || or_loop (x, 0x80) != 0x7ffe
+      || or_loop (x, 0xb4) != 0x7ffe
+      || or_loop (x, 0xb5) != 0xfffe
+      || eor_loop (x, 0) != 0
+      || eor_loop (x, 11) != 0xe8
+      || eor_loop (x, 0x100) != 0xcf00
+      || eor_loop (x, 0xfff) != 0xa000)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];
+
+  if (min_loop (x, 0) != 65535
+      || min_loop (x, 11) != 65403
+      || min_loop (x, 0x100) != 255
+      || min_loop (x, 0xfff) != 31
+      || and_loop (x, 0) != 0xffff
+      || and_loop (x, 11) != 0xff01
+      || and_loop (x, 0x80) != 0x8001
+      || and_loop (x, 0xb4) != 0x8001
+      || and_loop (x, 0xb5) != 1)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
new file mode 100644
index 00000000000..f99ef4aa865
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c
@@ -0,0 +1,71 @@ 
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)
+    res = res < x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)
+    res = res > x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x, unsigned short res)
+{
+  for (int i = 0; i < 0xfff; ++i)
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
new file mode 100644
index 00000000000..5b41560d2ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c
@@ -0,0 +1,34 @@ 
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_11.c"
+
+int
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);
+
+  if (add_loop (x, 42) != 20522
+      || max_loop (x, 65503) != 65504
+      || max_loop (x, 65505) != 65505
+      || or_loop (x, 0) != 0xfffe
+      || or_loop (x, 1) != 0xffff
+      || eor_loop (x, 0) != 0xa000
+      || eor_loop (x, 0xbfff) != 0x1fff)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];
+
+  if (min_loop (x, 32) != 31
+      || min_loop (x, 30) != 30
+      || and_loop (x, 0xff) != 1
+      || and_loop (x, 0) != 0)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
new file mode 100644
index 00000000000..d32b81a61bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c
@@ -0,0 +1,71 @@ 
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)
+    res = res < x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)
+    res = res > x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x, int n, unsigned short res)
+{
+  for (int i = 0; i < n; ++i)
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
new file mode 100644
index 00000000000..929b81a9705
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c
@@ -0,0 +1,66 @@ 
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_12.c"
+
+int
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);
+
+  if (add_loop (x, 0, 10) != 10
+      || add_loop (x, 11, 42) != 614
+      || add_loop (x, 0x100, 84) != 22100
+      || add_loop (x, 0xfff, 20) != 20500
+      || max_loop (x, 0, 10) != 10
+      || max_loop (x, 11, 131) != 132
+      || max_loop (x, 11, 133) != 133
+      || max_loop (x, 0x100, 65279) != 65280
+      || max_loop (x, 0x100, 65281) != 65281
+      || max_loop (x, 0xfff, 65503) != 65504
+      || max_loop (x, 0xfff, 65505) != 65505
+      || or_loop (x, 0, 0x71) != 0x71
+      || or_loop (x, 11, 0) != 0xfe
+      || or_loop (x, 11, 0xb3c) != 0xbfe
+      || or_loop (x, 0x80, 0) != 0x7ffe
+      || or_loop (x, 0x80, 1) != 0x7fff
+      || or_loop (x, 0xb4, 0) != 0x7ffe
+      || or_loop (x, 0xb4, 1) != 0x7fff
+      || or_loop (x, 0xb5, 0) != 0xfffe
+      || or_loop (x, 0xb5, 1) != 0xffff
+      || eor_loop (x, 0, 0x3e) != 0x3e
+      || eor_loop (x, 11, 0) != 0xe8
+      || eor_loop (x, 11, 0x1ff) != 0x117
+      || eor_loop (x, 0x100, 0) != 0xcf00
+      || eor_loop (x, 0x100, 0xeee) != 0xc1ee
+      || eor_loop (x, 0xfff, 0) != 0xa000
+      || eor_loop (x, 0xfff, 0x8888) != 0x2888)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];
+
+  if (min_loop (x, 0, 10000) != 10000
+      || min_loop (x, 11, 65404) != 65403
+      || min_loop (x, 11, 65402) != 65402
+      || min_loop (x, 0x100, 256) != 255
+      || min_loop (x, 0x100, 254) != 254
+      || min_loop (x, 0xfff, 32) != 31
+      || min_loop (x, 0xfff, 30) != 30
+      || and_loop (x, 0, 0x1234) != 0x1234
+      || and_loop (x, 11, 0xffff) != 0xff01
+      || and_loop (x, 11, 0xcdef) != 0xcd01
+      || and_loop (x, 0x80, 0xffff) != 0x8001
+      || and_loop (x, 0x80, 0xfffe) != 0x8000
+      || and_loop (x, 0xb4, 0xffff) != 0x8001
+      || and_loop (x, 0xb4, 0xfffe) != 0x8000
+      || and_loop (x, 0xb5, 0xffff) != 1
+      || and_loop (x, 0xb5, 0xfffe) != 0)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
new file mode 100644
index 00000000000..ce2b8f2fcdc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c
@@ -0,0 +1,101 @@ 
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+void __attribute__((noipa))
+add_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)
+    {
+      res0 += x[i * 2];
+      res1 += x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+min_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)
+    {
+      res0 = res0 < x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+max_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)
+    {
+      res0 = res0 > x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+and_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)
+    {
+      res0 &= x[i * 2];
+      res1 &= x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+or_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)
+    {
+      res0 |= x[i * 2];
+      res1 |= x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+eor_loop (unsigned int *x, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < 0x7ff; ++i)
+    {
+      res0 ^= x[i * 2];
+      res1 ^= x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
new file mode 100644
index 00000000000..5514d8d6b3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c
@@ -0,0 +1,61 @@ 
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_13.c"
+
+int
+main (void)
+{
+  unsigned int x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
+
+  unsigned int add_res[2] = { 42, 1111 };
+  add_loop (x, add_res);
+  if (add_res[0] != 968538154
+      || add_res[1] != 964340823)
+    __builtin_abort ();
+
+  unsigned int max_res1[2] = { 0, 0 };
+  max_loop (x, max_res1);
+  if (max_res1[0] != 1048150
+      || max_res1[1] != 1045506)
+    __builtin_abort ();
+
+  unsigned int max_res2[2] = { 1048151, 1045507 };
+  max_loop (x, max_res2);
+  if (max_res2[0] != 1048151
+      || max_res2[1] != 1045507)
+    __builtin_abort ();
+
+  unsigned int or_res[2] = { 0x1000000, 0x2000000 };
+  or_loop (x, or_res);
+  if (or_res[0] != 0x10ffffe
+      || or_res[1] != 0x20ffffe)
+    __builtin_abort ();
+
+  unsigned int eor_res[2] = { 0x1000000, 0x2000000 };
+  eor_loop (x, eor_res);
+  if (eor_res[0] != 0x1010000
+      || eor_res[1] != 0x20b5000)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i] & 0xfffff;
+
+  unsigned int min_res1[2] = { 500, 4000 };
+  min_loop (x, min_res1);
+  if (min_res1[0] != 425
+      || min_res1[1] != 3069)
+    __builtin_abort ();
+
+  unsigned int min_res2[2] = { 424, 3068 };
+  min_loop (x, min_res2);
+  if (min_res2[0] != 424
+      || min_res2[1] != 3068)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
new file mode 100644
index 00000000000..3be611e4b37
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c
@@ -0,0 +1,107 @@ 
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+void __attribute__((noipa))
+add_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)
+    {
+      res0 += x[i * 2];
+      res1 += x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+min_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)
+    {
+      res0 = res0 < x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+max_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)
+    {
+      res0 = res0 > x[i * 2] ? res0 : x[i * 2];
+      res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+and_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)
+    {
+      res0 &= x[i * 2];
+      res1 &= x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+or_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)
+    {
+      res0 |= x[i * 2];
+      res1 |= x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+void __attribute__((noipa))
+eor_loop (unsigned int *x, int n, unsigned int *res)
+{
+  unsigned int res0 = res[0];
+  unsigned int res1 = res[1];
+  for (int i = 0; i < n; ++i)
+    {
+      res0 ^= x[i * 2];
+      res1 ^= x[i * 2 + 1];
+    }
+  res[0] = res0;
+  res[1] = res1;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 2 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
new file mode 100644
index 00000000000..ccaa770e9b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c
@@ -0,0 +1,187 @@ 
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_14.c"
+
+int
+main (void)
+{
+  unsigned int x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
+
+  unsigned int add_res1[2] = { 11, 22 };
+  add_loop (x, 0, add_res1);
+  if (add_res1[0] != 11
+      || add_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int add_res2[2] = { 10, 20 };
+  add_loop (x, 11, add_res2);
+  if (add_res2[0] != 1902
+      || add_res2[1] != 2176)
+    __builtin_abort ();
+
+  unsigned int add_res3[2] = { 15, 30 };
+  add_loop (x, 0x100, add_res3);
+  if (add_res3[0] != 22435087
+      || add_res3[1] != 22566686)
+    __builtin_abort ();
+
+  unsigned int add_res4[2] = { 100, 200 };
+  add_loop (x, 0x11f, add_res4);
+  if (add_res4[0] != 31602244
+      || add_res4[1] != 31767656)
+    __builtin_abort ();
+
+  unsigned int max_res1[2] = { 461, 500 };
+  max_loop (x, 11, max_res1);
+  if (max_res1[0] != 462
+      || max_res1[1] != 506)
+    __builtin_abort ();
+
+  unsigned int max_res2[2] = { 463, 507 };
+  max_loop (x, 11, max_res2);
+  if (max_res2[0] != 463
+      || max_res2[1] != 507)
+    __builtin_abort ();
+
+  unsigned int max_res3[2] = { 1000000, 1000000 };
+  max_loop (x, 0x200, max_res3);
+  if (max_res3[0] != 1047552
+      || max_res3[1] != 1045506)
+    __builtin_abort ();
+
+  unsigned int max_res4[2] = { 1047553, 1045507 };
+  max_loop (x, 0x200, max_res4);
+  if (max_res4[0] != 1047553
+      || max_res4[1] != 1045507)
+    __builtin_abort ();
+
+  unsigned int max_res5[2] = { 300000, 30000 };
+  max_loop (x, 0x11f, max_res5);
+  if (max_res5[0] != 328902
+      || max_res5[1] != 330050)
+    __builtin_abort ();
+
+  unsigned int max_res6[2] = { 328903, 330051 };
+  max_loop (x, 0x11f, max_res6);
+  if (max_res6[0] != 328903
+      || max_res6[1] != 330051)
+    __builtin_abort ();
+
+  unsigned int or_res1[2] = { 11, 22 };
+  or_loop (x, 0, or_res1);
+  if (or_res1[0] != 11
+      || or_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int or_res2[2] = { 0x200000, 0xe00000 };
+  or_loop (x, 11, or_res2);
+  if (or_res2[0] != 0x2001fe
+      || or_res2[1] != 0xe001fe)
+    __builtin_abort ();
+
+  unsigned int or_res3[2] = { 0x800000, 0x700000 };
+  or_loop (x, 0x40, or_res3);
+  if (or_res3[0] != 0x803ffe
+      || or_res3[1] != 0x707ffe)
+    __builtin_abort ();
+
+  unsigned int or_res4[2] = { 0x100001, 0x300000 };
+  or_loop (x, 0x4f, or_res4);
+  if (or_res4[0] != 0x107fff
+      || or_res4[1] != 0x307ffe)
+    __builtin_abort ();
+
+  unsigned int eor_res1[2] = { 11, 22 };
+  eor_loop (x, 0, eor_res1);
+  if (eor_res1[0] != 11
+      || eor_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int eor_res2[2] = { 0x2000ff, 0xe000ff };
+  eor_loop (x, 11, eor_res2);
+  if (eor_res2[0] != 0x2001cf
+      || eor_res2[1] != 0xe000b7)
+    __builtin_abort ();
+
+  unsigned int eor_res3[2] = { 0x805000, 0x70f000 };
+  eor_loop (x, 0x100, eor_res3);
+  if (eor_res3[0] != 0x824200
+      || eor_res3[1] != 0x77dc00)
+    __builtin_abort ();
+
+  unsigned int eor_res4[2] = { 0x101201, 0x300f00 };
+  eor_loop (x, 0x11f, eor_res4);
+  if (eor_res4[0] != 0x178801
+      || eor_res4[1] != 0x337240)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i] & 0xfffff;
+
+  unsigned int min_res1[2] = { 1048200, 1048100 };
+  min_loop (x, 11, min_res1);
+  if (min_res1[0] != 1048113
+      || min_res1[1] != 1048069)
+    __builtin_abort ();
+
+  unsigned int min_res2[2] = { 1048112, 1048068 };
+  min_loop (x, 11, min_res2);
+  if (min_res2[0] != 1048112
+      || min_res2[1] != 1048068)
+    __builtin_abort ();
+
+  unsigned int min_res3[2] = { 10000, 10000 };
+  min_loop (x, 0x200, min_res3);
+  if (min_res3[0] != 1023
+      || min_res3[1] != 3069)
+    __builtin_abort ();
+
+  unsigned int min_res4[2] = { 1022, 3068 };
+  min_loop (x, 0x200, min_res4);
+  if (min_res4[0] != 1022
+      || min_res4[1] != 3068)
+    __builtin_abort ();
+
+  unsigned int min_res5[2] = { 719680, 718530 };
+  min_loop (x, 0x11f, min_res5);
+  if (min_res5[0] != 719673
+      || min_res5[1] != 718525)
+    __builtin_abort ();
+
+  unsigned int min_res6[2] = { 719672, 718524 };
+  min_loop (x, 0x11f, min_res6);
+  if (min_res6[0] != 719672
+      || min_res6[1] != 718524)
+    __builtin_abort ();
+
+  unsigned int and_res1[2] = { 11, 22 };
+  and_loop (x, 0, and_res1);
+  if (and_res1[0] != 11
+      || and_res1[1] != 22)
+    __builtin_abort ();
+
+  unsigned int and_res2[2] = { 0xf5cff, 0xf78ff };
+  and_loop (x, 11, and_res2);
+  if (and_res2[0] != 0xf5c01
+      || and_res2[1] != 0xf7801)
+    __builtin_abort ();
+
+  unsigned int and_res3[2] = { 0x7efff, 0xecfff };
+  and_loop (x, 0x40, and_res3);
+  if (and_res3[0] != 0x7c001
+      || and_res3[1] != 0xe8001)
+    __builtin_abort ();
+
+  unsigned int and_res4[2] = { 0xffffff, 0xffffff };
+  and_loop (x, 0x4f, and_res4);
+  if (and_res4[0] != 0xf8001
+      || and_res4[1] != 0xf8001)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
new file mode 100644
index 00000000000..15b1ade30e2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c
@@ -0,0 +1,16 @@ 
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+int __attribute__((noipa))
+add_loop (int *x, int n, int res)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      res += x[i * 2];
+      res += x[i * 2 + 1];
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
new file mode 100644
index 00000000000..3207fce5be3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c
@@ -0,0 +1,22 @@ 
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_15.c"
+
+int
+main (void)
+{
+  int x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = ((i + 1) * (i + 2)) & 0xfffff;
+
+  if (add_loop (x, 0, 33) != 33
+      || add_loop (x, 11, 30) != 4078
+      || add_loop (x, 0x100, 45) != 45001773
+      || add_loop (x, 0x11f, 300) != 63369900)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
new file mode 100644
index 00000000000..b839821d6bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
@@ -0,0 +1,77 @@ 
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+unsigned short __attribute__((noipa))
+add_loop (unsigned short *x)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < 0xfff; ++i)
+    res += x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+min_loop (unsigned short *x)
+{
+  unsigned short res = ~0;
+  for (int i = 0; i < 0xfff; ++i)
+    res = res < x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+max_loop (unsigned short *x)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < 0xfff; ++i)
+    res = res > x[i] ? res : x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+and_loop (unsigned short *x)
+{
+  unsigned short res = ~0;
+  for (int i = 0; i < 0xfff; ++i)
+    res &= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+or_loop (unsigned short *x)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < 0xfff; ++i)
+    res |= x[i];
+  return res;
+}
+
+unsigned short __attribute__((noipa))
+eor_loop (unsigned short *x)
+{
+  unsigned short res = 0;
+  for (int i = 0; i < 0xfff; ++i)
+    res ^= x[i];
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\torv\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
new file mode 100644
index 00000000000..aa248f53eaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c
@@ -0,0 +1,29 @@ 
+/* { dg-do run { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */
+
+#define N 0x1100
+
+#include "reduc_9.c"
+
+int
+main (void)
+{
+  unsigned short x[N];
+  for (int i = 0; i < N; ++i)
+    x[i] = (i + 1) * (i + 2);
+
+  if (add_loop (x) != 20480
+      || max_loop (x) != 65504
+      || or_loop (x) != 0xfffe
+      || eor_loop (x) != 0xa000)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    x[i] = ~x[i];
+
+  if (min_loop (x) != 31
+      || and_loop (x) != 1)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 2909e8a0fc3..b7b0523e3c8 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -2457,6 +2457,31 @@  vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
   return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
 }
 
+/* LOOP_VINFO is an epilogue loop and MAIN_LOOP_VALUE is available on exit
+   from the corresponding main loop.  Return a value that is available in
+   LOOP_VINFO's preheader, using SKIP_VALUE if the main loop is skipped.
+   Passing a null SKIP_VALUE is equivalent to passing zero.  */
+
+tree
+vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
+			   tree skip_value)
+{
+  if (!loop_vinfo->main_loop_edge)
+    return main_loop_value;
+
+  if (!skip_value)
+    skip_value = build_zero_cst (TREE_TYPE (main_loop_value));
+
+  tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
+  basic_block bb = loop_vinfo->main_loop_edge->dest;
+  gphi *new_phi = create_phi_node (phi_result, bb);
+  add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
+	       UNKNOWN_LOCATION);
+  add_phi_arg (new_phi, skip_value,
+	       loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
+  return phi_result;
+}
+
 /* Function vect_do_peeling.
 
    Input:
@@ -2986,6 +3011,8 @@  vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 					   skip_vector ? anchor : guard_bb,
 					   prob_epilog.invert (),
 					   irred_flag);
+	  if (vect_epilogues)
+	    epilogue_vinfo->skip_this_loop_edge = guard_e;
 	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
 					      single_exit (epilog));
 	  /* Only need to handle basic block before epilog loop if it's not
@@ -3057,6 +3084,8 @@  vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	  add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
 		       UNKNOWN_LOCATION);
 	  niters = PHI_RESULT (new_phi);
+	  epilogue_vinfo->main_loop_edge = update_e;
+	  epilogue_vinfo->skip_main_loop_edge = skip_e;
 	}
 
       /* Set ADVANCE to the number of iterations performed by the previous
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index bc523d151c6..5e6c9b7c38a 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -19,6 +19,7 @@  You should have received a copy of the GNU General Public License
 along with GCC; see the file COPYING3.  If not see
 <http://www.gnu.org/licenses/>.  */
 
+#define INCLUDE_ALGORITHM
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -823,6 +824,10 @@  _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     th (0),
     versioning_threshold (0),
     vectorization_factor (0),
+    main_loop_edge (nullptr),
+    skip_main_loop_edge (nullptr),
+    skip_this_loop_edge (nullptr),
+    reusable_accumulators (),
     max_vectorization_factor (0),
     mask_skip_niters (NULL_TREE),
     rgroup_compare_type (NULL_TREE),
@@ -3248,23 +3253,15 @@  reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
     }
 }
 
-/* If there is a neutral value X such that SLP reduction NODE would not
-   be affected by the introduction of additional X elements, return that X,
-   otherwise return null.  CODE is the code of the reduction and VECTOR_TYPE
-   is the vector type that would hold element X.  REDUC_CHAIN is true if
-   the SLP statements perform a single reduction, false if each statement
-   performs an independent reduction.  */
+/* If there is a neutral value X such that a reduction would not be affected
+   by the introduction of additional X elements, return that X, otherwise
+   return null.  CODE is the code of the reduction and SCALAR_TYPE is type
+   of the scalar elements.  If the reduction has just a single initial value
+   then INITIAL_VALUE is that value, otherwise it is null.  */
 
 static tree
-neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
-			      tree_code code, bool reduc_chain)
+neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
 {
-  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
-  stmt_vec_info stmt_vinfo = stmts[0];
-  tree scalar_type = TREE_TYPE (vector_type);
-  class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
-  gcc_assert (loop);
-
   switch (code)
     {
     case WIDEN_SUM_EXPR:
@@ -3284,13 +3281,7 @@  neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
 
     case MAX_EXPR:
     case MIN_EXPR:
-      /* For MIN/MAX the initial values are neutral.  A reduction chain
-	 has only a single initial value, so that value is neutral for
-	 all statements.  */
-      if (reduc_chain)
-	return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
-				      loop_preheader_edge (loop));
-      return NULL_TREE;
+      return initial_value;
 
     default:
       return NULL_TREE;
@@ -4621,64 +4612,58 @@  vect_model_reduction_cost (loop_vec_info loop_vinfo,
                  prologue_cost, epilogue_cost);
 }
 
+/* SEQ is a sequence of instructions that initialize the reduction
+   described by REDUC_INFO.  Emit them in the appropriate place.  */
 
+static void
+vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
+				stmt_vec_info reduc_info, gimple *seq)
+{
+  if (reduc_info->reused_accumulator)
+    {
+      /* When reusing an accumulator from the main loop, we only need
+	 initialization instructions if the main loop can be skipped.
+	 In that case, emit the initialization instructions at the end
+	 of the guard block that does the skip.  */
+      edge skip_edge = loop_vinfo->skip_main_loop_edge;
+      gcc_assert (skip_edge);
+      gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
+      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
+    }
+  else
+    {
+      /* The normal case: emit the initialization instructions on the
+	 preheader edge.  */
+      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
+    }
+}
 
 /* Function get_initial_def_for_reduction
 
    Input:
-   STMT_VINFO - a stmt that performs a reduction operation in the loop.
+   REDUC_INFO - the info_for_reduction
    INIT_VAL - the initial value of the reduction variable
+   NEUTRAL_OP - a value that has no effect on the reduction, as per
+		neutral_op_for_reduction
 
    Output:
-   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
-        of the reduction (used for adjusting the epilog - see below).
    Return a vector variable, initialized according to the operation that
 	STMT_VINFO performs. This vector will be used as the initial value
 	of the vector of partial results.
 
-   Option1 (adjust in epilog): Initialize the vector as follows:
-     add/bit or/xor:    [0,0,...,0,0]
-     mult/bit and:      [1,1,...,1,1]
-     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
-   and when necessary (e.g. add/mult case) let the caller know
-   that it needs to adjust the result by init_val.
-
-   Option2: Initialize the vector as follows:
-     add/bit or/xor:    [init_val,0,0,...,0]
-     mult/bit and:      [init_val,1,1,...,1]
-     min/max/cond_expr: [init_val,init_val,...,init_val]
-   and no adjustments are needed.
-
-   For example, for the following code:
-
-   s = init_val;
-   for (i=0;i<n;i++)
-     s = s + a[i];
-
-   STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
-   For a vector of 4 units, we want to return either [0,0,0,init_val],
-   or [0,0,0,0] and let the caller know that it needs to adjust
-   the result at the end by 'init_val'.
-
-   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
-   initialization vector is simpler (same element in all entries), if
-   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
-
-   A cost model should help decide between these two schemes.  */
+   The value we need is a vector in which element 0 has value INIT_VAL
+   and every other element has value NEUTRAL_OP.  */
 
 static tree
 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
-			       stmt_vec_info stmt_vinfo,
-			       enum tree_code code, tree init_val,
-                               tree *adjustment_def)
+			       stmt_vec_info reduc_info,
+			       tree init_val, tree neutral_op)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree scalar_type = TREE_TYPE (init_val);
   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
-  tree def_for_init;
   tree init_def;
-  REAL_VALUE_TYPE real_init_val = dconst0;
-  int int_init_val = 0;
   gimple_seq stmts = NULL;
 
   gcc_assert (vectype);
@@ -4686,115 +4671,64 @@  get_initial_def_for_reduction (loop_vec_info loop_vinfo,
   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
 
-  gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
-	      || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
+  gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
+	      || loop == (gimple_bb (reduc_info->stmt))->loop_father);
 
-  /* ADJUSTMENT_DEF is NULL when called from
-     vect_create_epilog_for_reduction to vectorize double reduction.  */
-  if (adjustment_def)
-    *adjustment_def = NULL;
-
-  switch (code)
+  if (operand_equal_p (init_val, neutral_op))
     {
-    case WIDEN_SUM_EXPR:
-    case DOT_PROD_EXPR:
-    case SAD_EXPR:
-    case PLUS_EXPR:
-    case MINUS_EXPR:
-    case BIT_IOR_EXPR:
-    case BIT_XOR_EXPR:
-    case MULT_EXPR:
-    case BIT_AND_EXPR:
-      {
-        if (code == MULT_EXPR)
-          {
-            real_init_val = dconst1;
-            int_init_val = 1;
-          }
-
-        if (code == BIT_AND_EXPR)
-          int_init_val = -1;
-
-        if (SCALAR_FLOAT_TYPE_P (scalar_type))
-          def_for_init = build_real (scalar_type, real_init_val);
-        else
-          def_for_init = build_int_cst (scalar_type, int_init_val);
-
-	if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
-	  {
-	    /* Option1: the first element is '0' or '1' as well.  */
-	    if (!operand_equal_p (def_for_init, init_val, 0))
-	      *adjustment_def = init_val;
-	    init_def = gimple_build_vector_from_val (&stmts, vectype,
-						     def_for_init);
-	  }
-	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
-	  {
-	    /* Option2 (variable length): the first element is INIT_VAL.  */
-	    init_def = gimple_build_vector_from_val (&stmts, vectype,
-						     def_for_init);
-	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
-				     vectype, init_def, init_val);
-	  }
-	else
-	  {
-	    /* Option2: the first element is INIT_VAL.  */
-	    tree_vector_builder elts (vectype, 1, 2);
-	    elts.quick_push (init_val);
-	    elts.quick_push (def_for_init);
-	    init_def = gimple_build_vector (&stmts, &elts);
-	  }
-      }
-      break;
-
-    case MIN_EXPR:
-    case MAX_EXPR:
-    case COND_EXPR:
-      {
-	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
-	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
-      }
-      break;
-
-    default:
-      gcc_unreachable ();
+      /* If both elements are equal then the vector described above is
+	 just a splat.  */
+      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
+      init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
+    }
+  else
+    {
+      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
+      init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
+      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
+	{
+	  /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
+	     element 0.  */
+	  init_def = gimple_build_vector_from_val (&stmts, vectype,
+						   neutral_op);
+	  init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
+				   vectype, init_def, init_val);
+	}
+      else
+	{
+	  /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
+	  tree_vector_builder elts (vectype, 1, 2);
+	  elts.quick_push (init_val);
+	  elts.quick_push (neutral_op);
+	  init_def = gimple_build_vector (&stmts, &elts);
+	}
     }
 
   if (stmts)
-    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
   return init_def;
 }
 
-/* Get at the initial defs for the reduction PHIs in SLP_NODE.
-   NUMBER_OF_VECTORS is the number of vector defs to create.
-   If NEUTRAL_OP is nonnull, introducing extra elements of that
-   value will not change the result.  */
+/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
+   which performs a reduction involving GROUP_SIZE scalar statements.
+   NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
+   is nonnull, introducing extra elements of that value will not change the
+   result.  */
 
 static void
-get_initial_defs_for_reduction (vec_info *vinfo,
-				slp_tree slp_node,
+get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
+				stmt_vec_info reduc_info,
 				vec<tree> *vec_oprnds,
 				unsigned int number_of_vectors,
-				bool reduc_chain, tree neutral_op)
+				unsigned int group_size, tree neutral_op)
 {
-  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
-  stmt_vec_info stmt_vinfo = stmts[0];
+  vec<tree> &initial_values = reduc_info->reduc_initial_values;
   unsigned HOST_WIDE_INT nunits;
   unsigned j, number_of_places_left_in_vector;
-  tree vector_type;
-  unsigned int group_size = stmts.length ();
+  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
   unsigned int i;
-  class loop *loop;
-
-  vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
-
-  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
 
-  loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
-  gcc_assert (loop);
-  edge pe = loop_preheader_edge (loop);
-
-  gcc_assert (!reduc_chain || neutral_op);
+  gcc_assert (group_size == initial_values.length () || neutral_op);
 
   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
      created vectors. It is greater than 1 if unrolling is performed.
@@ -4824,18 +4758,13 @@  get_initial_defs_for_reduction (vec_info *vinfo,
     {
       tree op;
       i = j % group_size;
-      stmt_vinfo = stmts[i];
 
       /* Get the def before the loop.  In reduction chain we have only
 	 one initial value.  Else we have as many as PHIs in the group.  */
-      if (reduc_chain)
-	op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
-      else if (((vec_oprnds->length () + 1) * nunits
-		- number_of_places_left_in_vector >= group_size)
-	       && neutral_op)
+      if (i >= initial_values.length () || (j > i && neutral_op))
 	op = neutral_op;
       else
-	op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
+	op = initial_values[i];
 
       /* Create 'vect_ = {op0,op1,...,opn}'.  */
       number_of_places_left_in_vector--;
@@ -4871,8 +4800,8 @@  get_initial_defs_for_reduction (vec_info *vinfo,
 	    {
 	      /* First time round, duplicate ELTS to fill the
 		 required number of vectors.  */
-	      duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
-					number_of_vectors, *vec_oprnds);
+	      duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
+					elts, number_of_vectors, *vec_oprnds);
 	      break;
 	    }
 	  vec_oprnds->quick_push (init);
@@ -4884,7 +4813,7 @@  get_initial_defs_for_reduction (vec_info *vinfo,
 	}
     }
   if (ctor_seq != NULL)
-    gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
+    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
 }
 
 /* For a statement STMT_INFO taking part in a reduction operation return
@@ -4906,15 +4835,107 @@  info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
     }
   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
     {
-      edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
-      stmt_vec_info info
-	  = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
+      stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
 	stmt_info = info;
     }
   return stmt_info;
 }
 
+/* PHI is a reduction in LOOP_VINFO that we are going to vectorize using vector
+   type VECTYPE.  See if LOOP_VINFO is an epilogue loop whose main loop had a
+   matching reduction that we can build on.  Adjust REDUC_INFO and return true
+   if so, otherwise return false.  */
+
+static bool
+vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
+				stmt_vec_info reduc_info)
+{
+  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+  if (!main_loop_vinfo)
+    return false;
+
+  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
+    return false;
+
+  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
+  auto_vec<tree, 16> main_loop_results (num_phis);
+  auto_vec<tree, 16> initial_values (num_phis);
+  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
+    {
+      /* The epilogue loop can be entered either from the main loop or
+	 from an earlier guard block.  */
+      edge skip_edge = loop_vinfo->skip_main_loop_edge;
+      for (tree incoming_value : reduc_info->reduc_initial_values)
+	{
+	  /* Look for:
+
+	       INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
+				    INITIAL_VALUE(guard block)>.  */
+	  gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
+
+	  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
+	  gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
+
+	  tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
+	  tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
+
+	  main_loop_results.quick_push (from_main_loop);
+	  initial_values.quick_push (from_skip);
+	}
+    }
+  else
+    /* The main loop dominates the epilogue loop.  */
+    main_loop_results.splice (reduc_info->reduc_initial_values);
+
+  /* See if the main loop has the kind of accumulator we need.  */
+  vect_reusable_accumulator *accumulator
+    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
+  if (!accumulator
+      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
+      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
+		      accumulator->reduc_info->reduc_scalar_results.begin ()))
+    return false;
+
+  /* For now, only handle the case in which both loops are operating on the
+     same vector types.  In future we could reduce wider vectors to narrower
+     ones as well.  */
+  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
+  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
+  if (!useless_type_conversion_p (old_vectype, vectype))
+    return false;
+
+  /* Non-SLP reductions might apply an adjustment after the reduction
+     operation, in order to simplify the initialization of the accumulator.
+     If the epilogue loop carries on from where the main loop left off,
+     it should apply the same adjustment to the final reduction result.
+
+     If the epilogue loop can also be entered directly (rather than via
+     the main loop), we need to be able to handle that case in the same way,
+     with the same adjustment.  (In principle we could add a PHI node
+     to select the correct adjustment, but in practice that shouldn't be
+     necessary.)  */
+  tree main_adjustment
+    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
+  if (loop_vinfo->main_loop_edge && main_adjustment)
+    {
+      gcc_assert (num_phis == 1);
+      tree initial_value = initial_values[0];
+      /* Check that we can use INITIAL_VALUE as the adjustment and
+	 initialize the accumulator with a neutral value instead.  */
+      if (!operand_equal_p (initial_value, main_adjustment))
+	return false;
+      tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
+						    code, initial_value);
+    }
+  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
+  reduc_info->reduc_initial_values.truncate (0);
+  reduc_info->reduc_initial_values.splice (initial_values);
+  reduc_info->reused_accumulator = accumulator;
+  return true;
+}
+
 /* Function vect_create_epilog_for_reduction
 
    Create code at the loop-epilog to finalize the result of a reduction
@@ -5005,15 +5026,18 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   imm_use_iterator imm_iter, phi_imm_iter;
   use_operand_p use_p, phi_use_p;
   gimple *use_stmt;
-  bool nested_in_vect_loop = false;
-  auto_vec<gimple *> new_phis;
+  auto_vec<tree> reduc_inputs;
   int j, i;
-  auto_vec<tree> scalar_results;
+  vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
   unsigned int group_size = 1, k;
   auto_vec<gimple *> phis;
-  bool slp_reduc = false;
+  /* SLP reduction without reduction chain, e.g.,
+     # a1 = phi <a2, a0>
+     # b1 = phi <b2, b0>
+     a2 = operation (a1)
+     b2 = operation (b1)  */
+  bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
   bool direct_slp_reduc;
-  tree new_phi_result;
   tree induction_index = NULL_TREE;
 
   if (slp_node)
@@ -5023,38 +5047,39 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
     {
       outer_loop = loop;
       loop = loop->inner;
-      nested_in_vect_loop = true;
-      gcc_assert (!slp_node);
+      gcc_assert (!slp_node && double_reduc);
     }
-  gcc_assert (!nested_in_vect_loop || double_reduc);
 
   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
   gcc_assert (vectype);
   mode = TYPE_MODE (vectype);
 
-  tree initial_def = NULL;
   tree induc_val = NULL_TREE;
   tree adjustment_def = NULL;
   if (slp_node)
     ;
   else
     {
-      /* Get at the scalar def before the loop, that defines the initial value
-	 of the reduction variable.  */
-      initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
-					   loop_preheader_edge (loop));
       /* Optimize: for induction condition reduction, if we can't use zero
          for induc_val, use initial_def.  */
       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
 	induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
       else if (double_reduc)
 	;
-      else if (nested_in_vect_loop)
-	;
       else
 	adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
     }
 
+  stmt_vec_info single_live_out_stmt[] = { stmt_info };
+  array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
+  if (slp_reduc)
+    /* All statements produce live-out values.  */
+    live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+  else if (slp_node)
+    /* The last statement in the reduction chain produces the live-out
+       value.  */
+    single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
+
   unsigned vec_num;
   int ncopies;
   if (slp_node)
@@ -5205,31 +5230,28 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   if (double_reduc)
     loop = outer_loop;
   exit_bb = single_exit (loop)->dest;
-  new_phis.create (slp_node ? vec_num : ncopies);
+  exit_gsi = gsi_after_labels (exit_bb);
+  reduc_inputs.create (slp_node ? vec_num : ncopies);
   for (unsigned i = 0; i < vec_num; i++)
     {
+      gimple_seq stmts = NULL;
       if (slp_node)
 	def = vect_get_slp_vect_def (slp_node, i);
       else
 	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
       for (j = 0; j < ncopies; j++)
-        {
+	{
 	  tree new_def = copy_ssa_name (def);
-          phi = create_phi_node (new_def, exit_bb);
-          if (j == 0)
-            new_phis.quick_push (phi);
-          else
-	    {
-	      def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
-	      new_phis.quick_push (phi);
-	    }
-
-          SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
-        }
+	  phi = create_phi_node (new_def, exit_bb);
+	  if (j)
+	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
+	  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
+	  new_def = gimple_convert (&stmts, vectype, new_def);
+	  reduc_inputs.quick_push (new_def);
+	}
+      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
     }
 
-  exit_gsi = gsi_after_labels (exit_bb);
-
   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
          (i.e. when reduc_fn is not available) and in the final adjustment
 	 code (if needed).  Also get the original scalar reduction variable as
@@ -5253,13 +5275,6 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
   bitsize = TYPE_SIZE (scalar_type);
 
-  /* SLP reduction without reduction chain, e.g.,
-     # a1 = phi <a2, a0>
-     # b1 = phi <b2, b0>
-     a2 = operation (a1)
-     b2 = operation (b1)  */
-  slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
-
   /* True if we should implement SLP_REDUC using native reduction operations
      instead of scalar operations.  */
   direct_slp_reduc = (reduc_fn != IFN_LAST
@@ -5271,52 +5286,60 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
      a2 = operation (a1)
      a3 = operation (a2),
 
-     we may end up with more than one vector result.  Here we reduce them to
-     one vector.  */
-  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
+     we may end up with more than one vector result.  Here we reduce them
+     to one vector.
+
+     The same is true if we couldn't use a single defuse cycle.  */
+  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+      || direct_slp_reduc
+      || ncopies > 1)
     {
       gimple_seq stmts = NULL;
-      tree first_vect = PHI_RESULT (new_phis[0]);
-      first_vect = gimple_convert (&stmts, vectype, first_vect);
-      for (k = 1; k < new_phis.length (); k++)
-        {
-	  gimple *next_phi = new_phis[k];
-          tree second_vect = PHI_RESULT (next_phi);
-	  second_vect = gimple_convert (&stmts, vectype, second_vect);
-          first_vect = gimple_build (&stmts, code, vectype,
-				     first_vect, second_vect);
-        }
+      tree single_input = reduc_inputs[0];
+      for (k = 1; k < reduc_inputs.length (); k++)
+	single_input = gimple_build (&stmts, code, vectype,
+				     single_input, reduc_inputs[k]);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
-      new_phi_result = first_vect;
-      new_phis.truncate (0);
-      new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
+      reduc_inputs.truncate (0);
+      reduc_inputs.safe_push (single_input);
     }
-  /* Likewise if we couldn't use a single defuse cycle.  */
-  else if (ncopies > 1)
+
+  tree orig_reduc_input = reduc_inputs[0];
+
+  /* If this loop is an epilogue loop that can be skipped after the
+     main loop, we can only share a reduction operation between the
+     main loop and the epilogue if we put it at the target of the
+     skip edge.
+
+     We can still reuse accumulators if this check fails.  Doing so has
+     the minor(?) benefit of making the epilogue loop's scalar result
+     independent of the main loop's scalar result.  */
+  bool unify_with_main_loop_p = false;
+  if (reduc_info->reused_accumulator
+      && loop_vinfo->skip_this_loop_edge
+      && single_succ_p (exit_bb)
+      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
     {
-      gimple_seq stmts = NULL;
-      tree first_vect = PHI_RESULT (new_phis[0]);
-      first_vect = gimple_convert (&stmts, vectype, first_vect);
-      for (int k = 1; k < ncopies; ++k)
-	{
-	  tree second_vect = PHI_RESULT (new_phis[k]);
-	  second_vect = gimple_convert (&stmts, vectype, second_vect);
-	  first_vect = gimple_build (&stmts, code, vectype,
-				     first_vect, second_vect);
-	}
-      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
-      new_phi_result = first_vect;
-      new_phis.truncate (0);
-      new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
+      unify_with_main_loop_p = true;
+
+      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
+      reduc_inputs[0] = make_ssa_name (vectype);
+      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
+      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
+		   UNKNOWN_LOCATION);
+      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
+		   loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
+      exit_gsi = gsi_after_labels (reduc_block);
     }
-  else
-    new_phi_result = PHI_RESULT (new_phis[0]);
+
+  /* Shouldn't be used beyond this point.  */
+  exit_bb = nullptr;
 
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
       && reduc_fn != IFN_LAST)
     {
-      /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
+      /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
 	 various data values where the condition matched and another vector
 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
 	 need to extract the last matching index (which will be the index with
@@ -5346,10 +5369,6 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       /* Vector of {0, 0, 0,...}.  */
       tree zero_vec = build_zero_cst (vectype);
 
-      gimple_seq stmts = NULL;
-      new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
-      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
-
       /* Find maximum value from the vector of found indexes.  */
       tree max_index = make_ssa_name (index_scalar_type);
       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
@@ -5367,7 +5386,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
-	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
+	 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
 	 otherwise.  Only one value should match, resulting in a vector
 	 (VEC_COND) with one data value and the rest zeros.
 	 In the case where the loop never made any matches, every index will
@@ -5386,7 +5405,8 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	 zero.  */
       tree vec_cond = make_ssa_name (vectype);
       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
-						   vec_compare, new_phi_result,
+						   vec_compare,
+						   reduc_inputs[0],
 						   zero_vec);
       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
 
@@ -5416,7 +5436,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
       /* Convert the reduced value back to the result type and set as the
 	 result.  */
-      stmts = NULL;
+      gimple_seq stmts = NULL;
       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
 			       data_reduc);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
@@ -5434,7 +5454,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     val = data_reduc[i], idx_val = induction_index[i];
 	 return val;  */
 
-      tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
+      tree data_eltype = TREE_TYPE (vectype);
       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
@@ -5458,7 +5478,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
 					     build3 (BIT_FIELD_REF,
 						     data_eltype,
-						     new_phi_result,
+						     reduc_inputs[0],
 						     bitsize_int (el_size),
 						     bitsize_int (off)));
 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
@@ -5510,10 +5530,9 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 			 "Reduce using direct vector reduction.\n");
 
       gimple_seq stmts = NULL;
-      new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
-      vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
+      vec_elem_type = TREE_TYPE (vectype);
       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
-			       vec_elem_type, new_phi_result);
+			       vec_elem_type, reduc_inputs[0]);
       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
@@ -5526,6 +5545,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     the same as initial_def already.  */
 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
 				  induc_val);
+	  tree initial_def = reduc_info->reduc_initial_values[0];
 
 	  tmp = make_ssa_name (new_scalar_dest);
 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
@@ -5543,12 +5563,9 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	 neutral value.  We can then do a normal reduction on each vector.  */
 
       /* Enforced by vectorizable_reduction.  */
-      gcc_assert (new_phis.length () == 1);
+      gcc_assert (reduc_inputs.length () == 1);
       gcc_assert (pow2p_hwi (group_size));
 
-      slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
-      vec<stmt_vec_info> orig_phis
-	= SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
       gimple_seq seq = NULL;
 
       /* Build a vector {0, 1, 2, ...}, with the same number of elements
@@ -5571,10 +5588,11 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       tree neutral_op = NULL_TREE;
       if (slp_node)
 	{
-	  stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
-	  neutral_op
-	    = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
-					    vectype, code, first != NULL);
+	  tree initial_value = NULL_TREE;
+	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
+	    initial_value = reduc_info->reduc_initial_values[0];
+	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
+						 initial_value);
 	}
       if (neutral_op)
 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -5586,9 +5604,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     for MIN and MAX reduction, for example.  */
 	  if (!neutral_op)
 	    {
-	      tree scalar_value
-		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
-					 loop_preheader_edge (loop));
+	      tree scalar_value = reduc_info->reduc_initial_values[i];
 	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
 					     scalar_value);
 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -5599,7 +5615,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
 	     sel[j] = (index[j] == i);
 
-	     which selects the elements of NEW_PHI_RESULT that should
+	     which selects the elements of REDUC_INPUTS[0] that should
 	     be included in the result.  */
 	  tree compare_val = build_int_cst (index_elt_type, i);
 	  compare_val = build_vector_from_val (index_type, compare_val);
@@ -5608,11 +5624,11 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
 	  /* Calculate the equivalent of:
 
-	     vec = seq ? new_phi_result : vector_identity;
+	     vec = seq ? reduc_inputs[0] : vector_identity;
 
 	     VEC is now suitable for a full vector reduction.  */
 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
-				   sel, new_phi_result, vector_identity);
+				   sel, reduc_inputs[0], vector_identity);
 
 	  /* Do the reduction and convert it to the appropriate type.  */
 	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
@@ -5627,7 +5643,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       bool reduce_with_shift;
       tree vec_temp;
 
-      gcc_assert (slp_reduc || new_phis.length () == 1);
+      gcc_assert (slp_reduc || reduc_inputs.length () == 1);
 
       /* See if the target wants to do the final (shift) reduction
 	 in a vector mode of smaller size and first reduce upper/lower
@@ -5637,7 +5653,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
       unsigned nunits1 = nunits;
       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
-	  && new_phis.length () == 1)
+	  && reduc_inputs.length () == 1)
 	{
 	  nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
 	  /* For SLP reductions we have to make sure lanes match up, but
@@ -5669,7 +5685,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
       /* First reduce the vector to the desired vector size we should
 	 do shift reduction on by combining upper and lower halves.  */
-      new_temp = new_phi_result;
+      new_temp = reduc_inputs[0];
       while (nunits > nunits1)
 	{
 	  nunits /= 2;
@@ -5748,7 +5764,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  new_temp = make_ssa_name (vectype1);
 	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-	  new_phis[0] = epilog_stmt;
+	  reduc_inputs[0] = new_temp;
 	}
 
       if (reduce_with_shift && !slp_reduc)
@@ -5829,13 +5845,9 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  int element_bitsize = tree_to_uhwi (bitsize);
 	  tree compute_type = TREE_TYPE (vectype);
 	  gimple_seq stmts = NULL;
-          FOR_EACH_VEC_ELT (new_phis, i, new_phi)
+	  FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
             {
               int bit_offset;
-              if (gimple_code (new_phi) == GIMPLE_PHI)
-                vec_temp = PHI_RESULT (new_phi);
-              else
-                vec_temp = gimple_assign_lhs (new_phi);
 	      new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
 				       vec_temp, bitsize, bitsize_zero_node);
 
@@ -5882,6 +5894,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 					  first_res, res);
                   scalar_results[j % group_size] = new_res;
                 }
+	      scalar_results.truncate (group_size);
 	      for (k = 0; k < group_size; k++)
 		scalar_results[k] = gimple_convert (&stmts, scalar_type,
 						    scalar_results[k]);
@@ -5905,6 +5918,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	     the same as initial_def already.  */
 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
 				  induc_val);
+	  tree initial_def = reduc_info->reduc_initial_values[0];
 
 	  tree tmp = make_ssa_name (new_scalar_dest);
 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
@@ -5923,13 +5937,12 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
     {
       gcc_assert (!slp_reduc);
       gimple_seq stmts = NULL;
-      if (nested_in_vect_loop)
+      if (double_reduc)
 	{
-          new_phi = new_phis[0];
 	  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
 	  adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
 	  new_temp = gimple_build (&stmts, code, vectype,
-				   PHI_RESULT (new_phi), adjustment_def);
+				   reduc_inputs[0], adjustment_def);
 	}
       else
 	{
@@ -5942,21 +5955,17 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
       epilog_stmt = gimple_seq_last_stmt (stmts);
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
-      if (nested_in_vect_loop)
-        {
-          if (!double_reduc)
-            scalar_results.quick_push (new_temp);
-          else
-            scalar_results[0] = new_temp;
-        }
-      else
-        scalar_results[0] = new_temp;
-
-      new_phis[0] = epilog_stmt;
+      scalar_results[0] = new_temp;
     }
 
+  /* Record this operation if it could be reused by the epilogue loop.  */
+  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
+      && !double_reduc)
+    loop_vinfo->reusable_accumulators.put (scalar_results[0],
+					   { orig_reduc_input, reduc_info });
+
   if (double_reduc)
-    loop = loop->inner;
+    loop = outer_loop;
 
   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
           phis with new adjusted scalar results, i.e., replace use <s_out0>
@@ -5983,47 +5992,11 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
           use <s_out4>  
           use <s_out4> */
 
-
-  /* In SLP reduction chain we reduce vector results into one vector if
-     necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
-     LHS of the last stmt in the reduction chain, since we are looking for
-     the loop exit phi node.  */
-  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
+  gcc_assert (live_out_stmts.size () == scalar_results.length ());
+  for (k = 0; k < live_out_stmts.size (); k++)
     {
-      stmt_vec_info dest_stmt_info
-	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
-      scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
-      group_size = 1;
-    }
-
-  /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
-     case that REDUC_GROUP_SIZE is greater than vectorization factor).
-     Therefore, we need to match SCALAR_RESULTS with corresponding statements.
-     The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
-     correspond to the first vector stmt, etc.
-     (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
-  if (group_size > new_phis.length ())
-    gcc_assert (!(group_size % new_phis.length ()));
-
-  for (k = 0; k < group_size; k++)
-    {
-      if (slp_reduc)
-        {
-	  stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
-
-	  orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
-	  /* SLP statements can't participate in patterns.  */
-	  gcc_assert (!orig_stmt_info);
-	  scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
-        }
-
-      if (nested_in_vect_loop)
-        {
-          if (double_reduc)
-            loop = outer_loop;
-          else
-	    gcc_unreachable ();
-        }
+      stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
+      scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
 
       phis.create (3);
       /* Find the loop-closed-use at the loop exit of the original scalar
@@ -6058,6 +6031,17 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
         {
           /* Replace the uses:  */
           orig_name = PHI_RESULT (exit_phi);
+
+	  /* Look for a single use at the target of the skip edge.  */
+	  if (unify_with_main_loop_p)
+	    {
+	      use_operand_p use_p;
+	      gimple *user;
+	      if (!single_imm_use (orig_name, &use_p, &user))
+		gcc_unreachable ();
+	      orig_name = gimple_get_lhs (user);
+	    }
+
           scalar_result = scalar_results[k];
           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
 	    {
@@ -6830,10 +6814,7 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
       else if (cond_reduc_dt == vect_constant_def)
 	{
 	  enum vect_def_type cond_initial_dt;
-	  tree cond_initial_val
-	    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
-
-	  gcc_assert (cond_reduc_val != NULL_TREE);
+	  tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
 	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
 	  if (cond_initial_dt == vect_constant_def
 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
@@ -7026,9 +7007,13 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
   /* For SLP reductions, see if there is a neutral value we can use.  */
   tree neutral_op = NULL_TREE;
   if (slp_node)
-    neutral_op = neutral_op_for_slp_reduction
-      (slp_node_instance->reduc_phis, vectype_out, orig_code,
-       REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
+    {
+      tree initial_value = NULL_TREE;
+      if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
+	initial_value = vect_phi_initial_value (reduc_def_phi);
+      neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
+					     orig_code, initial_value);
+    }
 
   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
     {
@@ -7578,7 +7563,7 @@  vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 					       vectype_out);
 
   /* Get the loop-entry arguments.  */
-  tree vec_initial_def;
+  tree vec_initial_def = NULL_TREE;
   auto_vec<tree> vec_initial_defs;
   if (slp_node)
     {
@@ -7592,22 +7577,40 @@  vect_transform_cycle_phi (loop_vec_info loop_vinfo,
       else
 	{
 	  gcc_assert (slp_node == slp_node_instance->reduc_phis);
-	  stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
-	  tree neutral_op
-	      = neutral_op_for_slp_reduction (slp_node, vectype_out,
-					      STMT_VINFO_REDUC_CODE (reduc_info),
-					      first != NULL);
-	  get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
-					  &vec_initial_defs, vec_num,
-					  first != NULL, neutral_op);
+	  vec<tree> &initial_values = reduc_info->reduc_initial_values;
+	  vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+
+	  unsigned int num_phis = stmts.length ();
+	  if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
+	    num_phis = 1;
+	  initial_values.reserve (num_phis);
+	  for (unsigned int i = 0; i < num_phis; ++i)
+	    {
+	      gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
+	      initial_values.quick_push (vect_phi_initial_value (this_phi));
+	    }
+	  if (vec_num == 1)
+	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
+	  if (!initial_values.is_empty ())
+	    {
+	      tree initial_value
+		= (num_phis == 1 ? initial_values[0] : NULL_TREE);
+	      tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+	      tree neutral_op
+		= neutral_op_for_reduction (TREE_TYPE (vectype_out),
+					    code, initial_value);
+	      get_initial_defs_for_reduction (loop_vinfo, reduc_info,
+					      &vec_initial_defs, vec_num,
+					      stmts.length (), neutral_op);
+	    }
 	}
     }
   else
     {
       /* Get at the scalar def before the loop, that defines the initial
 	 value of the reduction variable.  */
-      tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
-						loop_preheader_edge (loop));
+      tree initial_def = vect_phi_initial_value (phi);
+      reduc_info->reduc_initial_values.safe_push (initial_def);
       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
 	 and we can't use zero for induc_val, use initial_def.  Similarly
 	 for REDUC_MIN and initial_def larger than the base.  */
@@ -7627,9 +7630,6 @@  vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
 	    }
 	  vec_initial_def = build_vector_from_val (vectype_out, induc_val);
-	  vec_initial_defs.create (ncopies);
-	  for (i = 0; i < ncopies; ++i)
-	    vec_initial_defs.quick_push (vec_initial_def);
 	}
       else if (nested_cycle)
 	{
@@ -7639,23 +7639,59 @@  vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 					 ncopies, initial_def,
 					 &vec_initial_defs);
 	}
+      else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
+	       || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
+	/* Fill the initial vector with the initial scalar value.  */
+	vec_initial_def
+	  = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
+					   initial_def, initial_def);
       else
 	{
-	  tree adjustment_def = NULL_TREE;
-	  tree *adjustment_defp = &adjustment_def;
-	  enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
-	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
-	    adjustment_defp = NULL;
-	  vec_initial_def
-	    = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
-					     initial_def, adjustment_defp);
-	  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
-	  vec_initial_defs.create (ncopies);
-	  for (i = 0; i < ncopies; ++i)
-	    vec_initial_defs.quick_push (vec_initial_def);
+	  if (ncopies == 1)
+	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
+	  if (!reduc_info->reduc_initial_values.is_empty ())
+	    {
+	      initial_def = reduc_info->reduc_initial_values[0];
+	      enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
+	      tree neutral_op
+		= neutral_op_for_reduction (TREE_TYPE (initial_def),
+					    code, initial_def);
+	      gcc_assert (neutral_op);
+	      /* Try to simplify the vector initialization by applying an
+		 adjustment after the reduction has been performed.  */
+	      if (!reduc_info->reused_accumulator
+		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+		  && !operand_equal_p (neutral_op, initial_def))
+		{
+		  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
+		    = initial_def;
+		  initial_def = neutral_op;
+		}
+	      vec_initial_def
+		= get_initial_def_for_reduction (loop_vinfo, reduc_info,
+						 initial_def, neutral_op);
+	    }
 	}
     }
 
+  if (vec_initial_def)
+    {
+      vec_initial_defs.create (ncopies);
+      for (i = 0; i < ncopies; ++i)
+	vec_initial_defs.quick_push (vec_initial_def);
+    }
+
+  if (auto *accumulator = reduc_info->reused_accumulator)
+    {
+      if (loop_vinfo->main_loop_edge)
+	vec_initial_defs[0]
+	  = vect_get_main_loop_result (loop_vinfo, accumulator->reduc_input,
+				       vec_initial_defs[0]);
+      else
+	vec_initial_defs.safe_push (accumulator->reduc_input);
+      gcc_assert (vec_initial_defs.length () == 1);
+    }
+
   /* Generate the reduction PHIs upfront.  */
   for (i = 0; i < vec_num; i++)
     {
@@ -8253,8 +8289,7 @@  vectorizable_induction (loop_vec_info loop_vinfo,
       return true;
     }
 
-  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
-				     loop_preheader_edge (iv_loop));
+  init_expr = vect_phi_initial_value (phi);
 
   gimple_seq stmts = NULL;
   if (!nested_in_vect_loop)
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 9748043f3ee..f1035a83826 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -694,6 +694,8 @@  vec_info::new_stmt_vec_info (gimple *stmt)
   STMT_VINFO_SLP_VECT_ONLY (res) = false;
   STMT_VINFO_SLP_VECT_ONLY_PATTERN (res) = false;
   STMT_VINFO_VEC_STMTS (res) = vNULL;
+  res->reduc_initial_values = vNULL;
+  res->reduc_scalar_results = vNULL;
 
   if (is_a <loop_vec_info> (this)
       && gimple_code (stmt) == GIMPLE_PHI
@@ -755,6 +757,8 @@  vec_info::free_stmt_vec_info (stmt_vec_info stmt_info)
 	release_ssa_name (lhs);
     }
 
+  stmt_info->reduc_initial_values.release ();
+  stmt_info->reduc_scalar_results.release ();
   STMT_VINFO_SIMD_CLONE_INFO (stmt_info).release ();
   STMT_VINFO_VEC_STMTS (stmt_info).release ();
   free (stmt_info);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index fa28336d429..ed7a7738880 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -27,7 +27,7 @@  typedef class _stmt_vec_info *stmt_vec_info;
 #include "tree-hash-traits.h"
 #include "target.h"
 #include "internal-fn.h"
-
+#include "tree-ssa-operands.h"
 
 /* Used for naming of new temporaries.  */
 enum vect_var_kind {
@@ -551,6 +551,18 @@  typedef auto_vec<rgroup_controls> vec_loop_lens;
 
 typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec;
 
+/* Information about a reduction accumulator from the main loop that could
+   conceivably be reused as the input to a reduction in an epilogue loop.  */
+struct vect_reusable_accumulator {
+  /* The final value of the accumulator, which forms the input to the
+     reduction operation.  */
+  tree reduc_input;
+
+  /* The stmt_vec_info that describes the reduction (i.e. the one for
+     which is_reduc_info is true).  */
+  stmt_vec_info reduc_info;
+};
+
 /*-----------------------------------------------------------------*/
 /* Info on vectorized loops.                                       */
 /*-----------------------------------------------------------------*/
@@ -588,6 +600,23 @@  public:
   /* Unrolling factor  */
   poly_uint64 vectorization_factor;
 
+  /* If this loop is an epilogue loop whose main loop can be skipped,
+     MAIN_LOOP_EDGE is the edge from the main loop to this loop's
+     preheader.  SKIP_MAIN_LOOP_EDGE is then the edge that skips the
+     main loop and goes straight to this loop's preheader.
+
+     Both fields are null otherwise.  */
+  edge main_loop_edge;
+  edge skip_main_loop_edge;
+
+  /* If this loop is an epilogue loop that might be skipped after executing
+     the main loop, this edge is the one that skips the epilogue.  */
+  edge skip_this_loop_edge;
+
+  /* After vectorization, maps live-out SSA names to information about
+     the reductions that generated them.  */
+  hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
+
   /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
      if there is no particular limit.  */
   unsigned HOST_WIDE_INT max_vectorization_factor;
@@ -1186,6 +1215,21 @@  public:
   /* The vector type for performing the actual reduction.  */
   tree reduc_vectype;
 
+  /* If IS_REDUC_INFO is true and if the reduction is operating on N
+     elements in parallel, this vector gives the initial values of these
+     N elements.  */
+  vec<tree> reduc_initial_values;
+
+  /* If IS_REDUC_INFO is true and if the reduction is operating on N
+     elements in parallel, this vector gives the scalar result of each
+     reduction.  */
+  vec<tree> reduc_scalar_results;
+
+  /* Only meaningful if IS_REDUC_INFO.  If non-null, the reduction is
+     being performed by an epilogue loop and we have decided to reuse
+     this accumulator from the main loop.  */
+  vect_reusable_accumulator *reused_accumulator;
+
   /* Whether we force a single cycle PHI during reduction vectorization.  */
   bool force_single_cycle;
 
@@ -1369,6 +1413,19 @@  nested_in_vect_loop_p (class loop *loop, stmt_vec_info stmt_info)
 	  && (loop->inner == (gimple_bb (stmt_info->stmt))->loop_father));
 }
 
+/* PHI is either a scalar reduction phi or a scalar induction phi.
+   Return the initial value of the variable on entry to the containing
+   loop.  */
+
+static inline tree
+vect_phi_initial_value (gphi *phi)
+{
+  basic_block bb = gimple_bb (phi);
+  edge pe = loop_preheader_edge (bb->loop_father);
+  gcc_assert (pe->dest == bb);
+  return PHI_ARG_DEF_FROM_EDGE (phi, pe);
+}
+
 /* Return true if STMT_INFO should produce a vector mask type rather than
    a normal nonmask type.  */
 
@@ -1799,6 +1856,7 @@  class loop *vect_loop_versioning (loop_vec_info, gimple *);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
 				    tree *, tree *, tree *, int, bool, bool,
 				    tree *);
+extern tree vect_get_main_loop_result (loop_vec_info, tree, tree = NULL_TREE);
 extern void vect_prepare_for_masked_peels (loop_vec_info);
 extern dump_user_location_t find_loop_location (class loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);