diff mbox series

[RFC] Support vectorization for Complex type.

Message ID 20220711034339.18450-1-hongtao.liu@intel.com
State New
Headers show
Series [RFC] Support vectorization for Complex type. | expand

Commit Message

Liu, Hongtao July 11, 2022, 3:43 a.m. UTC
The patch only handles load/store (including ctor/permutation, except
gather/scatter) for complex types; other operations don't need to be
handled since they will be lowered by the cplxlower pass. (MASK_LOAD is
not supported for complex types, so there is no need to handle it either.)

Instead of supporting vector(2) _Complex double, this patch takes vector(4)
double as the vector type of _Complex double. Since the vectorizer originally
takes TYPE_VECTOR_SUBPARTS as nunits, which is not true for complex
types, the patch handles nunits/ncopies/vf specially for complex types.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Also tested the patch on SPEC2017 and found complex type vectorization
in 510/549 (but no performance impact).

Any comments?

gcc/ChangeLog:

	PR tree-optimization/106010
	* tree-vect-data-refs.cc (vect_get_data_access_cost):
	Pass complex_p to vect_get_num_copies to avoid ICE.
	(vect_analyze_data_refs): Support vectorization for Complex
	type with vector scalar types.
	* tree-vect-loop.cc (vect_determine_vf_for_stmt_1): VF should
	be half of TYPE_VECTOR_SUBPARTS when complex_p.
	* tree-vect-slp.cc (vect_record_max_nunits): nunits should be
	half of TYPE_VECTOR_SUBPARTS when complex_p.
	(vect_optimize_slp): Support permutation for complex type.
	(vect_slp_analyze_node_operations_1): Double nunits in
	vect_get_num_vectors to get right SLP_TREE_NUMBER_OF_VEC_STMTS
	when complex_p.
	(vect_slp_analyze_node_operations): Ditto.
	(vect_create_constant_vectors): Support CTOR for complex type.
	(vect_transform_slp_perm_load): Support permutation for
	complex type.
	* tree-vect-stmts.cc (vect_init_vector): Support complex type.
	(vect_get_vec_defs_for_operand): Get vector type for
	complex type.
	(vectorizable_store): Get right ncopies/nunits for complex
	type, also return false when complex_p and
	!TYPE_VECTOR_SUBPARTS.is_constant ().
	(vectorizable_load): Ditto.
	(vect_get_vector_types_for_stmt): Get vector type for complex type.
	* tree-vectorizer.h (STMT_VINFO_COMPLEX_P): New macro.
	(vect_get_num_copies): New overload.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr106010-1a.c: New test.
	* gcc.target/i386/pr106010-1b.c: New test.
	* gcc.target/i386/pr106010-1c.c: New test.
	* gcc.target/i386/pr106010-2a.c: New test.
	* gcc.target/i386/pr106010-2b.c: New test.
	* gcc.target/i386/pr106010-2c.c: New test.
	* gcc.target/i386/pr106010-3a.c: New test.
	* gcc.target/i386/pr106010-3b.c: New test.
	* gcc.target/i386/pr106010-3c.c: New test.
	* gcc.target/i386/pr106010-4a.c: New test.
	* gcc.target/i386/pr106010-4b.c: New test.
	* gcc.target/i386/pr106010-4c.c: New test.
	* gcc.target/i386/pr106010-5a.c: New test.
	* gcc.target/i386/pr106010-5b.c: New test.
	* gcc.target/i386/pr106010-5c.c: New test.
	* gcc.target/i386/pr106010-6a.c: New test.
	* gcc.target/i386/pr106010-6b.c: New test.
	* gcc.target/i386/pr106010-6c.c: New test.
	* gcc.target/i386/pr106010-7a.c: New test.
	* gcc.target/i386/pr106010-7b.c: New test.
	* gcc.target/i386/pr106010-7c.c: New test.
	* gcc.target/i386/pr106010-8a.c: New test.
	* gcc.target/i386/pr106010-8b.c: New test.
	* gcc.target/i386/pr106010-8c.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 +++++++
 gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 +++++++
 gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
 gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +++++++++
 gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 +++++++
 gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
 gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 +++++++++
 gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 ++++++++
 gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 ++++++++
 gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 ++++++
 gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 +++++++++
 gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 +++++++
 gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 +++++++++
 gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 +++++++
 gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 +++++++
 gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
 gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 +++++++
 gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 ++++++
 gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
 gcc/tree-vect-data-refs.cc                  |  26 ++-
 gcc/tree-vect-loop.cc                       |   7 +-
 gcc/tree-vect-slp.cc                        | 174 +++++++++++++++-----
 gcc/tree-vect-stmts.cc                      | 135 ++++++++++++---
 gcc/tree-vectorizer.h                       |  13 ++
 29 files changed, 2064 insertions(+), 63 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c

Comments

Jeff Law July 11, 2022, 4:54 a.m. UTC | #1
On 7/10/2022 9:43 PM, liuhongt via Gcc-patches wrote:
> The patch only handles load/store(including ctor/permutation, except
> gather/scatter) for complex type, other operations don't needs to be
> handled since they will be lowered by pass cplxlower.(MASK_LOAD is not
> supported for complex type, so no need to handle either).
>
> Instead of support vector(2) _Complex double, this patch takes vector(4)
> double as vector type of _Complex double. Since vectorizer originally
> takes TYPE_VECTOR_SUBPARTS as nunits which is not true for complex
> type, the patch handles nunits/ncopies/vf specially for complex type.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Also test the patch for SPEC2017 and find there's complex type vectorization
> in 510/549(but no performance impact).
No comment on the implementation.  From a benchmarking standpoint you
might want to look at cam4 in speed mode, not rate mode.  I'd bet you'd
want -ffast-math or -fcx-limited-range to avoid divdc3 and have those
calls expanded inline, which may give you a better crack at exposing
vectorization opportunities in there.

jeff
Richard Biener July 11, 2022, 11:46 a.m. UTC | #2
On Mon, Jul 11, 2022 at 5:44 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> The patch only handles load/store(including ctor/permutation, except
> gather/scatter) for complex type, other operations don't needs to be
> handled since they will be lowered by pass cplxlower.(MASK_LOAD is not
> supported for complex type, so no need to handle either).

(*)

> Instead of support vector(2) _Complex double, this patch takes vector(4)
> double as vector type of _Complex double. Since vectorizer originally
> takes TYPE_VECTOR_SUBPARTS as nunits which is not true for complex
> type, the patch handles nunits/ncopies/vf specially for complex type.

For the limited set above (*), can you explain what's "special" about
vector(2) _Complex double vs. vector(4) double, and thus why we need to
have STMT_VINFO_COMPLEX_P at all?

I wonder to what extent your handling can be extended to support re-vectorizing
(with a higher VF for example) already vectorized code?  The vectorizer giving
up on vector(2) double looks quite obviously similar to it giving up
on _Complex double ...
It would be a shame to not use the same underlying mechanism for dealing with
both, where for the vector case obviously vector(4) would be supported as well.

In principle _Complex double operations should be two SLP lanes but it seems you
are handling them with classical interleaving as well?

Thanks,
Richard.

> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Also test the patch for SPEC2017 and find there's complex type vectorization
> in 510/549(but no performance impact).
>
> Any comments?
>
> gcc/ChangeLog:
>
>         PR tree-optimization/106010
>         * tree-vect-data-refs.cc (vect_get_data_access_cost):
>         Pass complex_p to vect_get_num_copies to avoid ICE.
>         (vect_analyze_data_refs): Support vectorization for Complex
>         type with vector scalar types.
>         * tree-vect-loop.cc (vect_determine_vf_for_stmt_1): VF should
>         be half of TYPE_VECTOR_SUBPARTS when complex_p.
>         * tree-vect-slp.cc (vect_record_max_nunits): nunits should be
>         half of TYPE_VECTOR_SUBPARTS when complex_p.
>         (vect_optimize_slp): Support permutation for complex type.
>         (vect_slp_analyze_node_operations_1): Double nunits in
>         vect_get_num_vectors to get right SLP_TREE_NUMBER_OF_VEC_STMTS
>         when complex_p.
>         (vect_slp_analyze_node_operations): Ditto.
>         (vect_create_constant_vectors): Support CTOR for complex type.
>         (vect_transform_slp_perm_load): Support permutation for
>         complex type.
>         * tree-vect-stmts.cc (vect_init_vector): Support complex type.
>         (vect_get_vec_defs_for_operand): Get vector type for
>         complex type.
>         (vectorizable_store): Get right ncopies/nunits for complex
>         type, also return false when complex_p and
>         !TYPE_VECTOR_SUBPARTS.is_constant ().
>         (vectorizable_load): Ditto.
>         (vect_get_vector_types_for_stmt): Get vector type for complex type.
>         * tree-vectorizer.h (STMT_VINFO_COMPLEX_P): New macro.
>         (vect_get_num_copies): New overload.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr106010-1a.c: New test.
>         * gcc.target/i386/pr106010-1b.c: New test.
>         * gcc.target/i386/pr106010-1c.c: New test.
>         * gcc.target/i386/pr106010-2a.c: New test.
>         * gcc.target/i386/pr106010-2b.c: New test.
>         * gcc.target/i386/pr106010-2c.c: New test.
>         * gcc.target/i386/pr106010-3a.c: New test.
>         * gcc.target/i386/pr106010-3b.c: New test.
>         * gcc.target/i386/pr106010-3c.c: New test.
>         * gcc.target/i386/pr106010-4a.c: New test.
>         * gcc.target/i386/pr106010-4b.c: New test.
>         * gcc.target/i386/pr106010-4c.c: New test.
>         * gcc.target/i386/pr106010-5a.c: New test.
>         * gcc.target/i386/pr106010-5b.c: New test.
>         * gcc.target/i386/pr106010-5c.c: New test.
>         * gcc.target/i386/pr106010-6a.c: New test.
>         * gcc.target/i386/pr106010-6b.c: New test.
>         * gcc.target/i386/pr106010-6c.c: New test.
>         * gcc.target/i386/pr106010-7a.c: New test.
>         * gcc.target/i386/pr106010-7b.c: New test.
>         * gcc.target/i386/pr106010-7c.c: New test.
>         * gcc.target/i386/pr106010-8a.c: New test.
>         * gcc.target/i386/pr106010-8b.c: New test.
>         * gcc.target/i386/pr106010-8c.c: New test.
> ---
>  gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 +++++++
>  gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 +++++++
>  gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
>  gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 +++++++
>  gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
>  gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 +++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 ++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 ++++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 ++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 ++++++
>  gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 +++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 +++++++
>  gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 +++++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 +++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 +++++++
>  gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 +++++++
>  gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
>  gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 +++++++
>  gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 ++++++
>  gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
>  gcc/tree-vect-data-refs.cc                  |  26 ++-
>  gcc/tree-vect-loop.cc                       |   7 +-
>  gcc/tree-vect-slp.cc                        | 174 +++++++++++++++-----
>  gcc/tree-vect-stmts.cc                      | 135 ++++++++++++---
>  gcc/tree-vectorizer.h                       |  13 ++
>  29 files changed, 2064 insertions(+), 63 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> new file mode 100644
> index 00000000000..b608f484934
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> +
> +#define N 10000
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> new file mode 100644
> index 00000000000..0f377c3a548
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> @@ -0,0 +1,63 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-1a.c"
> +
> +void
> +avx_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> +
> +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> +
> +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> +    p_init[i] = i;
> +
> +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> +
> +  foo_pd (pd_dst, pd_src);
> +  foo_ps (ps_dst, ps_src);
> +  foo_epi64 (epi64_dst, epi64_src);
> +  foo_epi32 (epi32_dst, epi32_src);
> +  foo_epi16 (epi16_dst, epi16_src);
> +  foo_epi8 (epi8_dst, epi8_src);
> +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> new file mode 100644
> index 00000000000..f07e9fb2d3d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> @@ -0,0 +1,41 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +#define N 10000
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> +
> +static void
> +do_test (void)
> +{
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> +
> +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> +
> +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> +    p_init[i] = i;
> +
> +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> +
> +  foo_ph (ph_dst, ph_src);
> +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> +    __builtin_abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> new file mode 100644
> index 00000000000..d2e2f8d4f43
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> @@ -0,0 +1,82 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> +
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +  a[2] = b[2];
> +  a[3] = b[3];
> +
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +  a[2] = b[2];
> +  a[3] = b[3];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +  a[2] = b[2];
> +  a[3] = b[3];
> +  a[4] = b[4];
> +  a[5] = b[5];
> +  a[6] = b[6];
> +  a[7] = b[7];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +  a[2] = b[2];
> +  a[3] = b[3];
> +  a[4] = b[4];
> +  a[5] = b[5];
> +  a[6] = b[6];
> +  a[7] = b[7];
> +  a[8] = b[8];
> +  a[9] = b[9];
> +  a[10] = b[10];
> +  a[11] = b[11];
> +  a[12] = b[12];
> +  a[13] = b[13];
> +  a[14] = b[14];
> +  a[15] = b[15];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> new file mode 100644
> index 00000000000..ac360752693
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> @@ -0,0 +1,62 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-2a.c"
> +
> +void
> +avx_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (32);
> +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> +  _Complex float* ps_src = (_Complex float*) malloc (32);
> +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> +  char* p = (char* ) malloc (32);
> +
> +  __builtin_memset (pd_dst, 0, 32);
> +  __builtin_memset (ps_dst, 0, 32);
> +  __builtin_memset (epi64_dst, 0, 32);
> +  __builtin_memset (epi32_dst, 0, 32);
> +  __builtin_memset (epi16_dst, 0, 32);
> +  __builtin_memset (epi8_dst, 0, 32);
> +
> +  for (int i = 0; i != 32; i++)
> +    p[i] = i;
> +  __builtin_memcpy (pd_src, p, 32);
> +  __builtin_memcpy (ps_src, p, 32);
> +  __builtin_memcpy (epi64_src, p, 32);
> +  __builtin_memcpy (epi32_src, p, 32);
> +  __builtin_memcpy (epi16_src, p, 32);
> +  __builtin_memcpy (epi8_src, p, 32);
> +
> +  foo_pd (pd_dst, pd_src);
> +  foo_ps (ps_dst, ps_src);
> +  foo_epi64 (epi64_dst, epi64_src);
> +  foo_epi32 (epi32_dst, epi32_src);
> +  foo_epi16 (epi16_dst, epi16_src);
> +  foo_epi8 (epi8_dst, epi8_src);
> +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> new file mode 100644
> index 00000000000..a002f209ec9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> @@ -0,0 +1,47 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +  a[2] = b[2];
> +  a[3] = b[3];
> +  a[4] = b[4];
> +  a[5] = b[5];
> +  a[6] = b[6];
> +  a[7] = b[7];
> +}
> +
> +void
> +do_test (void)
> +{
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> +  char* p = (char* ) malloc (32);
> +
> +   __builtin_memset (ph_dst, 0, 32);
> +
> +  for (int i = 0; i != 32; i++)
> +    p[i] = i;
> +  __builtin_memcpy (ph_src, p, 32);
> +
> +  foo_ph (ph_dst, ph_src);
> +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> new file mode 100644
> index 00000000000..c1b64b56b1c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> @@ -0,0 +1,80 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
> +
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* __restrict b)
> +{
> +  a[0] = b[1];
> +  a[1] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* __restrict b)
> +{
> +  a[0] = b[1];
> +  a[1] = b[0];
> +  a[2] = b[3];
> +  a[3] = b[2];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> +{
> +  a[0] = b[1];
> +  a[1] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> +{
> +  a[0] = b[3];
> +  a[1] = b[2];
> +  a[2] = b[1];
> +  a[3] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> +{
> +  a[0] = b[7];
> +  a[1] = b[6];
> +  a[2] = b[5];
> +  a[3] = b[4];
> +  a[4] = b[3];
> +  a[5] = b[2];
> +  a[6] = b[1];
> +  a[7] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> +{
> +  a[0] = b[7];
> +  a[1] = b[6];
> +  a[2] = b[5];
> +  a[3] = b[4];
> +  a[4] = b[3];
> +  a[5] = b[2];
> +  a[6] = b[1];
> +  a[7] = b[0];
> +  a[8] = b[15];
> +  a[9] = b[14];
> +  a[10] = b[13];
> +  a[11] = b[12];
> +  a[12] = b[11];
> +  a[13] = b[10];
> +  a[14] = b[9];
> +  a[15] = b[8];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> new file mode 100644
> index 00000000000..e4fa3f3a541
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> @@ -0,0 +1,126 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr106010-3a.c"
> +/* Run each foo_* kernel from pr106010-3a.c on 32-byte buffers; abort if a result differs from its expected image.  */
> +void
> +avx2_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (32);
> +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> +  _Complex double* pd_exp = (_Complex double*) malloc (32);
> +  _Complex float* ps_src = (_Complex float*) malloc (32);
> +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> +  _Complex float* ps_exp = (_Complex float*) malloc (32);
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> +  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> +  _Complex int* epi32_exp = (_Complex int*) malloc (32);
> +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> +  _Complex short* epi16_exp = (_Complex short*) malloc (32);
> +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> +  _Complex char* epi8_exp = (_Complex char*) malloc (32);
> +  char* p = (char* ) malloc (32);
> +  char* q = (char* ) malloc (32);
> +  /* Zero every destination buffer.  */
> +  __builtin_memset (pd_dst, 0, 32);
> +  __builtin_memset (ps_dst, 0, 32);
> +  __builtin_memset (epi64_dst, 0, 32);
> +  __builtin_memset (epi32_dst, 0, 32);
> +  __builtin_memset (epi16_dst, 0, 32);
> +  __builtin_memset (epi8_dst, 0, 32);
> +  /* Every source buffer receives the byte pattern 0, 1, ..., 31.  */
> +  for (int i = 0; i != 32; i++)
> +    p[i] = i;
> +  __builtin_memcpy (pd_src, p, 32);
> +  __builtin_memcpy (ps_src, p, 32);
> +  __builtin_memcpy (epi64_src, p, 32);
> +  __builtin_memcpy (epi32_src, p, 32);
> +  __builtin_memcpy (epi16_src, p, 32);
> +  __builtin_memcpy (epi8_src, p, 32);
> +  /* Expected image for pd/epi64: the two 16-byte halves of the source exchanged.  */
> +  for (int i = 0; i != 16; i++)
> +    {
> +      p[i] = i + 16;
> +      p[i + 16] = i;
> +    }
> +  __builtin_memcpy (pd_exp, p, 32);
> +  __builtin_memcpy (epi64_exp, p, 32);
> +  /* p: adjacent 8-byte chunks exchanged (ps); q: the four 8-byte chunks reversed (epi32).  */
> +  for (int i = 0; i != 8; i++)
> +    {
> +      p[i] = i + 8;
> +      p[i + 8] = i;
> +      p[i + 16] = i + 24;
> +      p[i + 24] = i + 16;
> +      q[i] = i + 24;
> +      q[i + 8] = i + 16;
> +      q[i + 16] = i + 8;
> +      q[i + 24] = i;
> +    }
> +  __builtin_memcpy (ps_exp, p, 32);
> +  __builtin_memcpy (epi32_exp, q, 32);
> +
> +  /* q: the eight 4-byte chunks reversed (epi16).  */
> +  for (int i = 0; i != 4; i++)
> +    {
> +      q[i] = i + 28;
> +      q[i + 4] = i + 24;
> +      q[i + 8] = i + 20;
> +      q[i + 12] = i + 16;
> +      q[i + 16] = i + 12;
> +      q[i + 20] = i + 8;
> +      q[i + 24] = i + 4;
> +      q[i + 28] = i;
> +    }
> +  __builtin_memcpy (epi16_exp, q, 32);
> +  /* q: 2-byte chunks reversed within each 16-byte half (epi8).  */
> +  for (int i = 0; i != 2; i++)
> +    {
> +      q[i] = i + 14;
> +      q[i + 2] = i + 12;
> +      q[i + 4] = i + 10;
> +      q[i + 6] = i + 8;
> +      q[i + 8] = i + 6;
> +      q[i + 10] = i + 4;
> +      q[i + 12] = i + 2;
> +      q[i + 14] = i;
> +      q[i + 16] = i + 30;
> +      q[i + 18] = i + 28;
> +      q[i + 20] = i + 26;
> +      q[i + 22] = i + 24;
> +      q[i + 24] = i + 22;
> +      q[i + 26] = i + 20;
> +      q[i + 28] = i + 18;
> +      q[i + 30] = i + 16;
> +    }
> +  __builtin_memcpy (epi8_exp, q, 32);
> +  /* Run the kernels, then compare all 32 bytes of each destination with its expected image.  */
> +  foo_pd (pd_dst, pd_src);
> +  foo_ps (ps_dst, ps_src);
> +  foo_epi64 (epi64_dst, epi64_src);
> +  foo_epi32 (epi32_dst, epi32_src);
> +  foo_epi16 (epi16_dst, epi16_src);
> +  foo_epi8 (epi8_dst, epi8_src);
> +  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> new file mode 100644
> index 00000000000..5a5a3d4b992
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> @@ -0,0 +1,69 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +/* Copy eight complex _Float16 values from B to A in the fixed order b[1], b[0], b[4], b[3], b[7], b[6], b[2], b[5].  */
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> +{
> +  a[0] = b[1];
> +  a[1] = b[0];
> +  a[2] = b[4];
> +  a[3] = b[3];
> +  a[4] = b[7];
> +  a[5] = b[6];
> +  a[6] = b[2];
> +  a[7] = b[5];
> +}
> +/* Run foo_ph on a 32-byte buffer and abort unless the result matches the byte image built in p.  */
> +void
> +do_test (void)
> +{
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> +  char* p = (char* ) malloc (32);
> +  char* q = (char* ) malloc (32);
> +
> +  __builtin_memset (ph_dst, 0, 32);
> +  /* The source receives the byte pattern 0, 1, ..., 31.  */
> +  for (int i = 0; i != 32; i++)
> +    p[i] = i;
> +  __builtin_memcpy (ph_src, p, 32);
> +  /* p: 4-byte chunks in order 1,0,4,3,7,6,2,5.  NOTE(review): q is filled here but never read afterwards.  */
> +  for (int i = 0; i != 4; i++)
> +    {
> +      p[i] = i + 4;
> +      p[i + 4] = i;
> +      p[i + 8] = i + 16;
> +      p[i + 12] = i + 12;
> +      p[i + 16] = i + 28;
> +      p[i + 20] = i + 24;
> +      p[i + 24] = i + 8;
> +      p[i + 28] = i + 20;
> +      q[i] = i + 28;
> +      q[i + 4] = i + 24;
> +      q[i + 8] = i + 20;
> +      q[i + 12] = i + 16;
> +      q[i + 16] = i + 12;
> +      q[i + 20] = i + 8;
> +      q[i + 24] = i + 4;
> +      q[i + 28] = i;
> +    }
> +  __builtin_memcpy (ph_exp, p, 32);
> +  /* Run the kernel and compare all 32 bytes with the expected image.  */
> +  foo_ph (ph_dst, ph_src);
> +  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> new file mode 100644
> index 00000000000..b7b0b532bb1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> @@ -0,0 +1,101 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> +/* Store the two by-value complex doubles B1, B2 into a[0], a[1].  */
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a,
> +       _Complex double b1,
> +       _Complex double b2)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +}
> +/* Store the four by-value complex floats B1..B4 into a[0..3] in argument order.  */
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a,
> +       _Complex float b1, _Complex float b2,
> +       _Complex float b3, _Complex float b4)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +  a[2] = b3;
> +  a[3] = b4;
> +}
> +/* Store the two by-value complex long longs B1, B2 into a[0], a[1].  */
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a,
> +          _Complex long long b1,
> +          _Complex long long b2)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +}
> +/* Store the four by-value complex ints B1..B4 into a[0..3] in argument order.  */
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a,
> +          _Complex int b1, _Complex int b2,
> +          _Complex int b3, _Complex int b4)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +  a[2] = b3;
> +  a[3] = b4;
> +}
> +/* Store the eight by-value complex shorts B1..B8 into a[0..7] in argument order.  */
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a,
> +          _Complex short b1, _Complex short b2,
> +          _Complex short b3, _Complex short b4,
> +          _Complex short b5, _Complex short b6,
> +          _Complex short b7,_Complex short b8)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +  a[2] = b3;
> +  a[3] = b4;
> +  a[4] = b5;
> +  a[5] = b6;
> +  a[6] = b7;
> +  a[7] = b8;
> +}
> +/* Store the sixteen by-value complex chars B1..B16 into a[0..15] in argument order.  */
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a,
> +         _Complex char b1, _Complex char b2,
> +         _Complex char b3, _Complex char b4,
> +         _Complex char b5, _Complex char b6,
> +         _Complex char b7,_Complex char b8,
> +         _Complex char b9, _Complex char b10,
> +         _Complex char b11, _Complex char b12,
> +         _Complex char b13, _Complex char b14,
> +         _Complex char b15,_Complex char b16)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +  a[2] = b3;
> +  a[3] = b4;
> +  a[4] = b5;
> +  a[5] = b6;
> +  a[6] = b7;
> +  a[7] = b8;
> +  a[8] = b9;
> +  a[9] = b10;
> +  a[10] = b11;
> +  a[11] = b12;
> +  a[12] = b13;
> +  a[13] = b14;
> +  a[14] = b15;
> +  a[15] = b16;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> new file mode 100644
> index 00000000000..e2e79508c4b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> @@ -0,0 +1,67 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-4a.c"
> +/* Pass source elements by value to each foo_* from pr106010-4a.c; abort unless every dst equals its src.  */
> +void
> +avx_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (32);
> +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> +  _Complex float* ps_src = (_Complex float*) malloc (32);
> +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> +  char* p = (char* ) malloc (32);
> +  /* Zero every destination buffer.  */
> +  __builtin_memset (pd_dst, 0, 32);
> +  __builtin_memset (ps_dst, 0, 32);
> +  __builtin_memset (epi64_dst, 0, 32);
> +  __builtin_memset (epi32_dst, 0, 32);
> +  __builtin_memset (epi16_dst, 0, 32);
> +  __builtin_memset (epi8_dst, 0, 32);
> +  /* Every source buffer receives the byte pattern 0, 1, ..., 31.  */
> +  for (int i = 0; i != 32; i++)
> +    p[i] = i;
> +  __builtin_memcpy (pd_src, p, 32);
> +  __builtin_memcpy (ps_src, p, 32);
> +  __builtin_memcpy (epi64_src, p, 32);
> +  __builtin_memcpy (epi32_src, p, 32);
> +  __builtin_memcpy (epi16_src, p, 32);
> +  __builtin_memcpy (epi8_src, p, 32);
> +  /* Each call passes the source elements in index order as by-value arguments.  */
> +  foo_pd (pd_dst, pd_src[0], pd_src[1]);
> +  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> +  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> +  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> +  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> +            epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> +  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> +           epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> +           epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> +           epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> +  /* Compare all 32 bytes of each destination directly against its source.  */
> +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> new file mode 100644
> index 00000000000..8e02aefe3b5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> @@ -0,0 +1,54 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +/* Store the eight by-value complex _Float16 arguments B1..B8 into a[0..7] in argument order.  */
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a,
> +       _Complex _Float16 b1, _Complex _Float16 b2,
> +       _Complex _Float16 b3, _Complex _Float16 b4,
> +       _Complex _Float16 b5, _Complex _Float16 b6,
> +       _Complex _Float16 b7,_Complex _Float16 b8)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +  a[2] = b3;
> +  a[3] = b4;
> +  a[4] = b5;
> +  a[5] = b6;
> +  a[6] = b7;
> +  a[7] = b8;
> +}
> +/* Pass the eight source elements by value to foo_ph; abort unless ph_dst equals ph_src.  */
> +void
> +do_test (void)
> +{
> +
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> +
> +  char* p = (char* ) malloc (32);
> +
> +  __builtin_memset (ph_dst, 0, 32);
> +  /* The source receives the byte pattern 0, 1, ..., 31.  */
> +  for (int i = 0; i != 32; i++)
> +    p[i] = i;
> +
> +  __builtin_memcpy (ph_src, p, 32);
> +
> +  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> +         ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> +  /* Compare all 32 bytes of the destination directly against the source.  */
> +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> +    __builtin_abort ();
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> new file mode 100644
> index 00000000000..9d4a6f9846b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> @@ -0,0 +1,117 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> +/* Copy four complex doubles from B to A with the lower and upper pairs exchanged.  */
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* __restrict b)
> +{
> +  a[0] = b[2];
> +  a[1] = b[3];
> +  a[2] = b[0];
> +  a[3] = b[1];
> +}
> +/* Copy eight complex floats from B to A with the two groups of four exchanged.  */
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* __restrict b)
> +{
> +  a[0] = b[4];
> +  a[1] = b[5];
> +  a[2] = b[6];
> +  a[3] = b[7];
> +  a[4] = b[0];
> +  a[5] = b[1];
> +  a[6] = b[2];
> +  a[7] = b[3];
> +}
> +/* Copy four complex long longs from B to A with the lower and upper pairs exchanged.  */
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> +{
> +  a[0] = b[2];
> +  a[1] = b[3];
> +  a[2] = b[0];
> +  a[3] = b[1];
> +}
> +/* Copy eight complex ints from B to A with the two groups of four exchanged.  */
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> +{
> +  a[0] = b[4];
> +  a[1] = b[5];
> +  a[2] = b[6];
> +  a[3] = b[7];
> +  a[4] = b[0];
> +  a[5] = b[1];
> +  a[6] = b[2];
> +  a[7] = b[3];
> +}
> +/* Copy sixteen complex shorts from B to A with the two groups of eight exchanged.  */
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> +{
> +  a[0] = b[8];
> +  a[1] = b[9];
> +  a[2] = b[10];
> +  a[3] = b[11];
> +  a[4] = b[12];
> +  a[5] = b[13];
> +  a[6] = b[14];
> +  a[7] = b[15];
> +  a[8] = b[0];
> +  a[9] = b[1];
> +  a[10] = b[2];
> +  a[11] = b[3];
> +  a[12] = b[4];
> +  a[13] = b[5];
> +  a[14] = b[6];
> +  a[15] = b[7];
> +}
> +/* Copy thirty-two complex chars from B to A with the two groups of sixteen exchanged.  */
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> +{
> +  a[0] = b[16];
> +  a[1] = b[17];
> +  a[2] = b[18];
> +  a[3] = b[19];
> +  a[4] = b[20];
> +  a[5] = b[21];
> +  a[6] = b[22];
> +  a[7] = b[23];
> +  a[8] = b[24];
> +  a[9] = b[25];
> +  a[10] = b[26];
> +  a[11] = b[27];
> +  a[12] = b[28];
> +  a[13] = b[29];
> +  a[14] = b[30];
> +  a[15] = b[31];
> +  a[16] = b[0];
> +  a[17] = b[1];
> +  a[18] = b[2];
> +  a[19] = b[3];
> +  a[20] = b[4];
> +  a[21] = b[5];
> +  a[22] = b[6];
> +  a[23] = b[7];
> +  a[24] = b[8];
> +  a[25] = b[9];
> +  a[26] = b[10];
> +  a[27] = b[11];
> +  a[28] = b[12];
> +  a[29] = b[13];
> +  a[30] = b[14];
> +  a[31] = b[15];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> new file mode 100644
> index 00000000000..d5c6ebeb5cf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> @@ -0,0 +1,80 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-5a.c"
> +/* Run each foo_* kernel from pr106010-5a.c on 64-byte buffers; abort if a result differs from the expected image.  */
> +void
> +avx_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (64);
> +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> +  _Complex float* ps_src = (_Complex float*) malloc (64);
> +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> +  char* p = (char* ) malloc (64);
> +  char* q = (char* ) malloc (64);
> +  /* Zero every destination buffer.  */
> +  __builtin_memset (pd_dst, 0, 64);
> +  __builtin_memset (ps_dst, 0, 64);
> +  __builtin_memset (epi64_dst, 0, 64);
> +  __builtin_memset (epi32_dst, 0, 64);
> +  __builtin_memset (epi16_dst, 0, 64);
> +  __builtin_memset (epi8_dst, 0, 64);
> +  /* p is the byte pattern 0..63; q is p with its two 32-byte halves exchanged.  */
> +  for (int i = 0; i != 64; i++)
> +    {
> +      p[i] = i;
> +      q[i] = (i + 32) % 64;
> +    }
> +  __builtin_memcpy (pd_src, p, 64);
> +  __builtin_memcpy (ps_src, p, 64);
> +  __builtin_memcpy (epi64_src, p, 64);
> +  __builtin_memcpy (epi32_src, p, 64);
> +  __builtin_memcpy (epi16_src, p, 64);
> +  __builtin_memcpy (epi8_src, p, 64);
> +  /* The same expected image q is used for every element type.  */
> +  __builtin_memcpy (pd_exp, q, 64);
> +  __builtin_memcpy (ps_exp, q, 64);
> +  __builtin_memcpy (epi64_exp, q, 64);
> +  __builtin_memcpy (epi32_exp, q, 64);
> +  __builtin_memcpy (epi16_exp, q, 64);
> +  __builtin_memcpy (epi8_exp, q, 64);
> +
> +  foo_pd (pd_dst, pd_src);
> +  foo_ps (ps_dst, ps_src);
> +  foo_epi64 (epi64_dst, epi64_src);
> +  foo_epi32 (epi32_dst, epi32_src);
> +  foo_epi16 (epi16_dst, epi16_src);
> +  foo_epi8 (epi8_dst, epi8_src);
> +  /* Compare all 64 bytes of each destination with its expected image.  */
> +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> new file mode 100644
> index 00000000000..9ce4e6dd5c0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> @@ -0,0 +1,62 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +/* Copy sixteen complex _Float16 values from B to A with the two groups of eight exchanged.  */
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> +{
> +  a[0] = b[8];
> +  a[1] = b[9];
> +  a[2] = b[10];
> +  a[3] = b[11];
> +  a[4] = b[12];
> +  a[5] = b[13];
> +  a[6] = b[14];
> +  a[7] = b[15];
> +  a[8] = b[0];
> +  a[9] = b[1];
> +  a[10] = b[2];
> +  a[11] = b[3];
> +  a[12] = b[4];
> +  a[13] = b[5];
> +  a[14] = b[6];
> +  a[15] = b[7];
> +}
> +/* Run foo_ph on a 64-byte buffer and abort unless the result matches the expected image q.  */
> +void
> +do_test (void)
> +{
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> +  char* p = (char* ) malloc (64);
> +  char* q = (char* ) malloc (64);
> +
> +  __builtin_memset (ph_dst, 0, 64);
> +  /* p is the byte pattern 0..63; q is p with its two 32-byte halves exchanged.  */
> +  for (int i = 0; i != 64; i++)
> +    {
> +      p[i] = i;
> +      q[i] = (i + 32) % 64;
> +    }
> +  __builtin_memcpy (ph_src, p, 64);
> +
> +  __builtin_memcpy (ph_exp, q, 64);
> +
> +  foo_ph (ph_dst, ph_src);
> +  /* Compare all 64 bytes of the destination with the expected image.  */
> +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> new file mode 100644
> index 00000000000..65a90d03684
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> @@ -0,0 +1,115 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> +/* Copy four complex doubles from B to A in reversed element order.  */
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* __restrict b)
> +{
> +  a[0] = b[3];
> +  a[1] = b[2];
> +  a[2] = b[1];
> +  a[3] = b[0];
> +}
> +/* Copy eight complex floats from B to A in reversed element order.  */
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* __restrict b)
> +{
> +  a[0] = b[7];
> +  a[1] = b[6];
> +  a[2] = b[5];
> +  a[3] = b[4];
> +  a[4] = b[3];
> +  a[5] = b[2];
> +  a[6] = b[1];
> +  a[7] = b[0];
> +}
> +/* Copy four complex long longs from B to A in reversed element order.  */
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> +{
> +  a[0] = b[3];
> +  a[1] = b[2];
> +  a[2] = b[1];
> +  a[3] = b[0];
> +}
> +/* Copy eight complex ints from B to A in reversed element order.  */
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> +{
> +  a[0] = b[7];
> +  a[1] = b[6];
> +  a[2] = b[5];
> +  a[3] = b[4];
> +  a[4] = b[3];
> +  a[5] = b[2];
> +  a[6] = b[1];
> +  a[7] = b[0];
> +}
> +/* Copy sixteen complex shorts from B to A in reversed element order.  */
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> +{
> +  a[0] = b[15];
> +  a[1] = b[14];
> +  a[2] = b[13];
> +  a[3] = b[12];
> +  a[4] = b[11];
> +  a[5] = b[10];
> +  a[6] = b[9];
> +  a[7] = b[8];
> +  a[8] = b[7];
> +  a[9] = b[6];
> +  a[10] = b[5];
> +  a[11] = b[4];
> +  a[12] = b[3];
> +  a[13] = b[2];
> +  a[14] = b[1];
> +  a[15] = b[0];
> +}
> +/* Copy thirty-two complex chars from B to A in reversed element order.  */
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> +{
> +  a[0] = b[31];
> +  a[1] = b[30];
> +  a[2] = b[29];
> +  a[3] = b[28];
> +  a[4] = b[27];
> +  a[5] = b[26];
> +  a[6] = b[25];
> +  a[7] = b[24];
> +  a[8] = b[23];
> +  a[9] = b[22];
> +  a[10] = b[21];
> +  a[11] = b[20];
> +  a[12] = b[19];
> +  a[13] = b[18];
> +  a[14] = b[17];
> +  a[15] = b[16];
> +  a[16] = b[15];
> +  a[17] = b[14];
> +  a[18] = b[13];
> +  a[19] = b[12];
> +  a[20] = b[11];
> +  a[21] = b[10];
> +  a[22] = b[9];
> +  a[23] = b[8];
> +  a[24] = b[7];
> +  a[25] = b[6];
> +  a[26] = b[5];
> +  a[27] = b[4];
> +  a[28] = b[3];
> +  a[29] = b[2];
> +  a[30] = b[1];
> +  a[31] = b[0];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> new file mode 100644
> index 00000000000..1c5bb020939
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> @@ -0,0 +1,157 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr106010-6a.c"
> +/* Run each foo_* kernel from pr106010-6a.c on 64-byte buffers; abort if a result differs from its expected image.  */
> +void
> +avx2_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (64);
> +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> +  _Complex float* ps_src = (_Complex float*) malloc (64);
> +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> +  char* p = (char* ) malloc (64);
> +  char* q = (char* ) malloc (64);
> +  /* Zero every destination buffer.  */
> +  __builtin_memset (pd_dst, 0, 64);
> +  __builtin_memset (ps_dst, 0, 64);
> +  __builtin_memset (epi64_dst, 0, 64);
> +  __builtin_memset (epi32_dst, 0, 64);
> +  __builtin_memset (epi16_dst, 0, 64);
> +  __builtin_memset (epi8_dst, 0, 64);
> +  /* Every source buffer receives the byte pattern 0, 1, ..., 63.  */
> +  for (int i = 0; i != 64; i++)
> +    p[i] = i;
> +
> +  __builtin_memcpy (pd_src, p, 64);
> +  __builtin_memcpy (ps_src, p, 64);
> +  __builtin_memcpy (epi64_src, p, 64);
> +  __builtin_memcpy (epi32_src, p, 64);
> +  __builtin_memcpy (epi16_src, p, 64);
> +  __builtin_memcpy (epi8_src, p, 64);
> +
> +  /* q: the four 16-byte chunks of the source reversed (pd/epi64).  */
> +  for (int i = 0; i != 16; i++)
> +    {
> +      q[i] = i + 48;
> +      q[i + 16] = i + 32;
> +      q[i + 32] = i + 16;
> +      q[i + 48] = i;
> +    }
> +
> +  __builtin_memcpy (pd_exp, q, 64);
> +  __builtin_memcpy (epi64_exp, q, 64);
> +  /* q: the eight 8-byte chunks of the source reversed (ps/epi32).  */
> +   for (int i = 0; i != 8; i++)
> +    {
> +      q[i] = i + 56;
> +      q[i + 8] = i + 48;
> +      q[i + 16] = i + 40;
> +      q[i + 24] = i + 32;
> +      q[i + 32] = i + 24;
> +      q[i + 40] = i + 16;
> +      q[i + 48] = i + 8;
> +      q[i + 56] = i;
> +    }
> +
> +  __builtin_memcpy (ps_exp, q, 64);
> +  __builtin_memcpy (epi32_exp, q, 64);
> +  /* q: the sixteen 4-byte chunks of the source reversed (epi16).  */
> +  for (int i = 0; i != 4; i++)
> +    {
> +      q[i] = i + 60;
> +      q[i + 4] = i + 56;
> +      q[i + 8] = i + 52;
> +      q[i + 12] = i + 48;
> +      q[i + 16] = i + 44;
> +      q[i + 20] = i + 40;
> +      q[i + 24] = i + 36;
> +      q[i + 28] = i + 32;
> +      q[i + 32] = i + 28;
> +      q[i + 36] = i + 24;
> +      q[i + 40] = i + 20;
> +      q[i + 44] = i + 16;
> +      q[i + 48] = i + 12;
> +      q[i + 52] = i + 8;
> +      q[i + 56] = i + 4;
> +      q[i + 60] = i;
> +    }
> +
> +  __builtin_memcpy (epi16_exp, q, 64);
> +  /* q: the thirty-two 2-byte chunks of the source reversed (epi8).  */
> +  for (int i = 0; i != 2; i++)
> +    {
> +      q[i] = i + 62;
> +      q[i + 2] = i + 60;
> +      q[i + 4] = i + 58;
> +      q[i + 6] = i + 56;
> +      q[i + 8] = i + 54;
> +      q[i + 10] = i + 52;
> +      q[i + 12] = i + 50;
> +      q[i + 14] = i + 48;
> +      q[i + 16] = i + 46;
> +      q[i + 18] = i + 44;
> +      q[i + 20] = i + 42;
> +      q[i + 22] = i + 40;
> +      q[i + 24] = i + 38;
> +      q[i + 26] = i + 36;
> +      q[i + 28] = i + 34;
> +      q[i + 30] = i + 32;
> +      q[i + 32] = i + 30;
> +      q[i + 34] = i + 28;
> +      q[i + 36] = i + 26;
> +      q[i + 38] = i + 24;
> +      q[i + 40] = i + 22;
> +      q[i + 42] = i + 20;
> +      q[i + 44] = i + 18;
> +      q[i + 46] = i + 16;
> +      q[i + 48] = i + 14;
> +      q[i + 50] = i + 12;
> +      q[i + 52] = i + 10;
> +      q[i + 54] = i + 8;
> +      q[i + 56] = i + 6;
> +      q[i + 58] = i + 4;
> +      q[i + 60] = i + 2;
> +      q[i + 62] = i;
> +    }
> +  __builtin_memcpy (epi8_exp, q, 64);
> +
> +  foo_pd (pd_dst, pd_src);
> +  foo_ps (ps_dst, ps_src);
> +  foo_epi64 (epi64_dst, epi64_src);
> +  foo_epi32 (epi32_dst, epi32_src);
> +  foo_epi16 (epi16_dst, epi16_src);
> +  foo_epi8 (epi8_dst, epi8_src);
> +  /* Compare all 64 bytes of each destination with its expected image.  */
> +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> new file mode 100644
> index 00000000000..b859d884a7f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> @@ -0,0 +1,80 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> +{
> +  a[0] = b[15];
> +  a[1] = b[14];
> +  a[2] = b[13];
> +  a[3] = b[12];
> +  a[4] = b[11];
> +  a[5] = b[10];
> +  a[6] = b[9];
> +  a[7] = b[8];
> +  a[8] = b[7];
> +  a[9] = b[6];
> +  a[10] = b[5];
> +  a[11] = b[4];
> +  a[12] = b[3];
> +  a[13] = b[2];
> +  a[14] = b[1];
> +  a[15] = b[0];
> +}
> +
> +void
> +do_test (void)
> +{
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> +  char* p = (char* ) malloc (64);
> +  char* q = (char* ) malloc (64);
> +
> +  __builtin_memset (ph_dst, 0, 64);
> +
> +  for (int i = 0; i != 64; i++)
> +    p[i] = i;
> +
> +  __builtin_memcpy (ph_src, p, 64);
> +
> +  for (int i = 0; i != 4; i++)
> +    {
> +      q[i] = i + 60;
> +      q[i + 4] = i + 56;
> +      q[i + 8] = i + 52;
> +      q[i + 12] = i + 48;
> +      q[i + 16] = i + 44;
> +      q[i + 20] = i + 40;
> +      q[i + 24] = i + 36;
> +      q[i + 28] = i + 32;
> +      q[i + 32] = i + 28;
> +      q[i + 36] = i + 24;
> +      q[i + 40] = i + 20;
> +      q[i + 44] = i + 16;
> +      q[i + 48] = i + 12;
> +      q[i + 52] = i + 8;
> +      q[i + 56] = i + 4;
> +      q[i + 60] = i;
> +    }
> +
> +  __builtin_memcpy (ph_exp, q, 64);
> +
> +  foo_ph (ph_dst, ph_src);
> +
> +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> new file mode 100644
> index 00000000000..2ea01fac927
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> +
> +#define N 10000
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> new file mode 100644
> index 00000000000..26482cc10f5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> @@ -0,0 +1,63 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-7a.c"
> +
> +void
> +avx_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> +
> +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> +
> +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> +    p_init[i] = i % 2 + 3;
> +
> +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> +  memcpy (ps_dst, p_init, 2 * N * sizeof (float));
> +  memcpy (epi64_dst, p_init, 2 * N * sizeof (long long));
> +  memcpy (epi32_dst, p_init, 2 * N * sizeof (int));
> +  memcpy (epi16_dst, p_init, 2 * N * sizeof (short));
> +  memcpy (epi8_dst, p_init, 2 * N * sizeof (char));
> +
> +  foo_pd (pd_dst, pd_src[0]);
> +  foo_ps (ps_dst, ps_src[0]);
> +  foo_epi64 (epi64_dst, epi64_src[0]);
> +  foo_epi32 (epi32_dst, epi32_src[0]);
> +  foo_epi16 (epi16_dst, epi16_src[0]);
> +  foo_epi8 (epi8_dst, epi8_src[0]);
> +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> new file mode 100644
> index 00000000000..7f4056a5ecc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> @@ -0,0 +1,41 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +#define N 10000
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> +
> +static void
> +do_test (void)
> +{
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> +
> +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> +
> +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> +    p_init[i] = i % 2 + 3;
> +
> +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> +
> +  foo_ph (ph_dst, ph_src[0]);
> +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> +    __builtin_abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> new file mode 100644
> index 00000000000..11054b60d30
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> +
> +#define N 10000
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1.0 + 2.0i;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1.0f + 2.0fi;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1 + 2i;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1 + 2i;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1 + 2i;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1 + 2i;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> new file mode 100644
> index 00000000000..6bb0073b691
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> @@ -0,0 +1,53 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-8a.c"
> +
> +void
> +avx_test (void)
> +{
> +  _Complex double pd_src = 1.0 + 2.0i;
> +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> +  _Complex float ps_src = 1.0 + 2.0i;
> +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> +  _Complex long long epi64_src = 1 + 2i;;
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> +  _Complex int epi32_src = 1 + 2i;
> +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> +  _Complex short epi16_src = 1 + 2i;
> +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> +  _Complex char epi8_src = 1 + 2i;
> +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> +
> +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> +
> +  foo_pd (pd_dst);
> +  foo_ps (ps_dst);
> +  foo_epi64 (epi64_dst);
> +  foo_epi32 (epi32_dst);
> +  foo_epi16 (epi16_dst);
> +  foo_epi8 (epi8_dst);
> +  for (int i = 0 ; i != N; i++)
> +    {
> +      if (pd_dst[i] != pd_src)
> +       __builtin_abort ();
> +      if (ps_dst[i] != ps_src)
> +       __builtin_abort ();
> +      if (epi64_dst[i] != epi64_src)
> +       __builtin_abort ();
> +      if (epi32_dst[i] != epi32_src)
> +       __builtin_abort ();
> +      if (epi16_dst[i] != epi16_src)
> +       __builtin_abort ();
> +      if (epi8_dst[i] != epi8_src)
> +       __builtin_abort ();
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> new file mode 100644
> index 00000000000..61ae131829d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> @@ -0,0 +1,38 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +#define N 10000
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1.0f16 + 2.0f16i;
> +}
> +
> +static void
> +do_test (void)
> +{
> +  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> +
> +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> +
> +  foo_ph (ph_dst);
> +  for (int i = 0; i != N; i++)
> +    {
> +      if (ph_dst[i] != ph_src)
> +       __builtin_abort ();
> +    }
> +}
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index d20a10a1524..42ee9df674c 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -1403,7 +1403,8 @@ vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
>    if (PURE_SLP_STMT (stmt_info))
>      ncopies = 1;
>    else
> -    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
> +    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info),
> +                                  STMT_VINFO_COMPLEX_P (stmt_info));
>
>    if (DR_IS_READ (dr_info->dr))
>      vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> @@ -4597,8 +4598,22 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
>
>        /* Set vectype for STMT.  */
>        scalar_type = TREE_TYPE (DR_REF (dr));
> -      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
> -      if (!vectype)
> +      tree adjust_scalar_type = scalar_type;
> +      /* Support Complex type access. Note that the complex type of load/store
> +        does not support gather/scatter.  */
> +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE
> +         && gatherscatter == SG_NONE)
> +       {
> +         adjust_scalar_type = TREE_TYPE (scalar_type);
> +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> +       }
> +      tree vectype = get_vectype_for_scalar_type (vinfo, adjust_scalar_type);
> +      unsigned HOST_WIDE_INT constant_nunits;
> +      if (!vectype
> +         /* For complex type, V1DI doesn't make sense.  */
> +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> +             && (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&constant_nunits)
> +                 || constant_nunits == 1)))
>          {
>            if (dump_enabled_p ())
>              {
> @@ -4635,8 +4650,11 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
>         }
>
>        /* Adjust the minimal vectorization factor according to the
> -        vector type.  */
> +        vector type. Note for complex type, VF is half of
> +        TYPE_VECTOR_SUBPARTS.  */
>        vf = TYPE_VECTOR_SUBPARTS (vectype);
> +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> +       vf = exact_div (vf, 2);
>        *min_vf = upper_bound (*min_vf, vf);
>
>        /* Leave the BB vectorizer to pick the vector type later, based on
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 3a70c15b593..365fa738022 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -200,7 +200,12 @@ vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
>      }
>
>    if (nunits_vectype)
> -    vect_update_max_nunits (vf, nunits_vectype);
> +    {
> +      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (nunits_vectype);
> +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> +       nunits = exact_div (nunits, 2);
> +      vect_update_max_nunits (vf, nunits);
> +    }
>
>    return opt_result::success ();
>  }
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index dab5daddcc5..5d66ea2f286 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -877,10 +877,14 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
>        return false;
>      }
>
> +  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> +    nunits = exact_div (nunits, 2);
> +
>    /* If populating the vector type requires unrolling then fail
>       before adjusting *max_nunits for basic-block vectorization.  */
>    if (is_a <bb_vec_info> (vinfo)
> -      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> +      && !multiple_p (group_size , nunits))
>      {
>        if (dump_enabled_p ())
>         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -891,7 +895,7 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
>      }
>
>    /* In case of multiple types we need to detect the smallest type.  */
> -  vect_update_max_nunits (max_nunits, vectype);
> +  vect_update_max_nunits (max_nunits, nunits);
>    return true;
>  }
>
> @@ -3720,22 +3724,54 @@ vect_optimize_slp (vec_info *vinfo)
>          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
>          when permuting constants and invariants keeping the permute
>          bijective.  */
> -      auto_sbitmap load_index (SLP_TREE_LANES (node));
> -      bitmap_clear (load_index);
> -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> -       bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> -      unsigned j;
> -      for (j = 0; j < SLP_TREE_LANES (node); ++j)
> -       if (!bitmap_bit_p (load_index, j))
> -         break;
> -      if (j != SLP_TREE_LANES (node))
> -       continue;
> +      /* Permutation of Complex type.  */
> +      if (STMT_VINFO_COMPLEX_P (dr_stmt))
> +       {
> +         auto_sbitmap load_index (SLP_TREE_LANES (node) * 2);
> +         bitmap_clear (load_index);
> +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> +           {
> +             unsigned bit = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> +             bitmap_set_bit (load_index, 2 * bit);
> +             bitmap_set_bit (load_index, 2 * bit + 1);
> +           }
> +         unsigned j;
> +         for (j = 0; j < SLP_TREE_LANES (node) * 2; ++j)
> +           if (!bitmap_bit_p (load_index, j))
> +             break;
> +         if (j != SLP_TREE_LANES (node) * 2)
> +           continue;
>
> -      vec<unsigned> perm = vNULL;
> -      perm.safe_grow (SLP_TREE_LANES (node), true);
> -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> -       perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> -      perms.safe_push (perm);
> +         vec<unsigned> perm = vNULL;
> +         perm.safe_grow (SLP_TREE_LANES (node) * 2, true);
> +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> +           {
> +             unsigned cidx = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> +             perm[2 * j] = 2 * cidx;
> +             perm[2 * j + 1] = 2 * cidx + 1;
> +           }
> +         perms.safe_push (perm);
> +       }
> +      else
> +       {
> +         auto_sbitmap load_index (SLP_TREE_LANES (node));
> +         bitmap_clear (load_index);
> +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> +           bitmap_set_bit (load_index,
> +                           SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> +         unsigned j;
> +         for (j = 0; j < SLP_TREE_LANES (node); ++j)
> +           if (!bitmap_bit_p (load_index, j))
> +             break;
> +         if (j != SLP_TREE_LANES (node))
> +           continue;
> +
> +         vec<unsigned> perm = vNULL;
> +         perm.safe_grow (SLP_TREE_LANES (node), true);
> +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> +           perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> +         perms.safe_push (perm);
> +       }
>        vertices[idx].perm_in = perms.length () - 1;
>        vertices[idx].perm_out = perms.length () - 1;
>      }
> @@ -4518,6 +4554,12 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
>         vf = loop_vinfo->vectorization_factor;
>        else
>         vf = 1;
> +      /* For complex type and SLP, double vf to get right vectype.
> +        i.e. vector(4) double for complex double, group size is 2, double vf
> +        to map vf * group_size to TYPE_VECTOR_SUBPARTS.  */
> +     if (STMT_VINFO_COMPLEX_P (stmt_info))
> +       vf *= 2;
> +
>        unsigned int group_size = SLP_TREE_LANES (node);
>        tree vectype = SLP_TREE_VECTYPE (node);
>        SLP_TREE_NUMBER_OF_VEC_STMTS (node)
> @@ -4763,10 +4805,17 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
>             }
>           unsigned group_size = SLP_TREE_LANES (child);
>           poly_uint64 vf = 1;
> +
>           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
>             vf = loop_vinfo->vectorization_factor;
> +
> +         /* V2SF is just 1 complex type, so multiply by 2
> +            to get the real number of vectors.  */
> +         unsigned cp
> +           = STMT_VINFO_COMPLEX_P (SLP_TREE_REPRESENTATIVE (node)) ? 2 : 1;
> +
>           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
> -           = vect_get_num_vectors (vf * group_size, vector_type);
> +           = vect_get_num_vectors (vf * group_size * cp, vector_type);
>           /* And cost them.  */
>           vect_prologue_cost_for_slp (child, cost_vec);
>         }
> @@ -6402,6 +6451,11 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
>
>    /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
>    vector_type = SLP_TREE_VECTYPE (op_node);
> +  unsigned int cp = 1;
> +  /* Handle Complex type vector init.
> +     SLP_TREE_REPRESENTATIVE (op_node) could be NULL.  */
> +  if (TREE_CODE (TREE_TYPE (op_node->ops[0])) == COMPLEX_TYPE)
> +    cp = 2;
>
>    unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
>    SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
> @@ -6426,9 +6480,9 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
>    /* When using duplicate_and_interleave, we just need one element for
>       each scalar statement.  */
>    if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
> -    nunits = group_size;
> +    nunits = group_size * cp;
>
> -  number_of_copies = nunits * number_of_vectors / group_size;
> +  number_of_copies = nunits * number_of_vectors / (group_size * cp);
>
>    number_of_places_left_in_vector = nunits;
>    constant_p = true;
> @@ -6460,8 +6514,23 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
>                         gcc_unreachable ();
>                     }
>                   else
> -                   op = fold_unary (VIEW_CONVERT_EXPR,
> -                                    TREE_TYPE (vector_type), op);
> +                   {
> +                     tree scalar_type = TREE_TYPE (vector_type);
> +                     /* For complex type, insert real and imag part
> +                        separately.  */
> +                     if (cp == 2)
> +                       {
> +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> +                                      == COMPLEX_TYPE)
> +                                     && (scalar_type
> +                                         == TREE_TYPE (TREE_TYPE (op))));
> +                         elts[number_of_places_left_in_vector--]
> +                           = fold_unary (IMAGPART_EXPR, scalar_type, op);
> +                         op = fold_unary (REALPART_EXPR, scalar_type, op);
> +                       }
> +                     else
> +                       op = fold_unary (VIEW_CONVERT_EXPR, scalar_type, op);
> +                   }
>                   gcc_assert (op && CONSTANT_CLASS_P (op));
>                 }
>               else
> @@ -6481,11 +6550,28 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
>                     }
>                   else
>                     {
> -                     op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
> -                                  op);
> -                     init_stmt
> -                       = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> -                                              op);
> +                     tree scalar_type = TREE_TYPE (vector_type);
> +                     if (cp == 2)
> +                       {
> +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> +                                      == COMPLEX_TYPE)
> +                                     && (scalar_type
> +                                         == TREE_TYPE (TREE_TYPE (op))));
> +                         tree imag = build1 (IMAGPART_EXPR, scalar_type, op);
> +                         op = build1 (REALPART_EXPR, scalar_type, op);
> +                         tree imag_temp = make_ssa_name (scalar_type);
> +                         elts[number_of_places_left_in_vector--] = imag_temp;
> +                         init_stmt = gimple_build_assign (imag_temp, imag);
> +                         gimple_seq_add_stmt (&ctor_seq, init_stmt);
> +                         init_stmt = gimple_build_assign (new_temp, op);
> +                       }
> +                     else
> +                       {
> +                         op = build1 (VIEW_CONVERT_EXPR, scalar_type, op);
> +                         init_stmt
> +                           = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> +                                                  op);
> +                       }
>                     }
>                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
>                   op = new_temp;
> @@ -6696,15 +6782,17 @@ vect_transform_slp_perm_load (vec_info *vinfo,
>    unsigned int nelts_to_build;
>    unsigned int nvectors_per_build;
>    unsigned int in_nlanes;
> +  unsigned int cp = STMT_VINFO_COMPLEX_P (stmt_info) ? 2 : 1;
>    bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
> -                     && multiple_p (nunits, group_size));
> +                     && multiple_p (nunits, group_size * cp));
>    if (repeating_p)
>      {
>        /* A single vector contains a whole number of copies of the node, so:
>          (a) all permutes can use the same mask; and
>          (b) the permutes only need a single vector input.  */
> -      mask.new_vector (nunits, group_size, 3);
> -      nelts_to_build = mask.encoded_nelts ();
> +      /* For complex type, mask size should be double of nelts_to_build.  */
> +      mask.new_vector (nunits, group_size * cp, 3);
> +      nelts_to_build = mask.encoded_nelts () / cp;
>        nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
>        in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
>      }
> @@ -6744,8 +6832,8 @@ vect_transform_slp_perm_load (vec_info *vinfo,
>         {
>           /* Enforced before the loop when !repeating_p.  */
>           unsigned int const_nunits = nunits.to_constant ();
> -         vec_index = i / const_nunits;
> -         mask_element = i % const_nunits;
> +         vec_index = i / (const_nunits / cp);
> +         mask_element = i % (const_nunits / cp);
>           if (vec_index == first_vec_index
>               || first_vec_index == -1)
>             {
> @@ -6755,7 +6843,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
>                    || second_vec_index == -1)
>             {
>               second_vec_index = vec_index;
> -             mask_element += const_nunits;
> +             mask_element += (const_nunits / cp);
>             }
>           else
>             {
> @@ -6768,14 +6856,24 @@ vect_transform_slp_perm_load (vec_info *vinfo,
>               return false;
>             }
>
> -         gcc_assert (mask_element < 2 * const_nunits);
> +         gcc_assert (mask_element < 2 * const_nunits / cp);
>         }
>
>        if (mask_element != index)
>         noop_p = false;
> -      mask[index++] = mask_element;
> +      /* Set index for complex type.
> +        i.e. mask like [1,0] is actually [2, 3, 0, 1]
> +        for vector scalar type.  */
> +      if (cp == 2)
> +       {
> +         mask[2 * index] = 2 * mask_element;
> +         mask[2 * index + 1] = 2 * mask_element + 1;
> +       }
> +      else
> +       mask[index] = mask_element;
> +      index++;
>
> -      if (index == count && !noop_p)
> +      if (index * cp == count && !noop_p)
>         {
>           indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
>           if (!can_vec_perm_const_p (mode, mode, indices))
> @@ -6799,7 +6897,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
>           ++*n_perms;
>         }
>
> -      if (index == count)
> +      if (index * cp == count)
>         {
>           if (!analyze_only)
>             {
> @@ -6869,7 +6967,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
>           bool load_seen = false;
>           for (unsigned i = 0; i < in_nlanes; ++i)
>             {
> -             if (i % const_nunits == 0)
> +             if (i % (const_nunits * cp) == 0)
>                 {
>                   if (load_seen)
>                     *n_loads += 1;
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 72107afc883..8af3b558be4 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1397,25 +1397,70 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
>  {
>    gimple *init_stmt;
>    tree new_temp;
> +  tree scalar_type = TREE_TYPE (type);
> +  gimple_seq stmts = NULL;
> +
> +  if (TREE_CODE (TREE_TYPE (val)) == COMPLEX_TYPE)
> +    {
> +      unsigned HOST_WIDE_INT nunits;
> +      gcc_assert (TYPE_VECTOR_SUBPARTS (type).is_constant (&nunits));
>
> +      tree_vector_builder elts (type, nunits, 1);
> +      tree imag, real;
> +      if (TREE_CODE (val) == COMPLEX_CST)
> +       {
> +         real = fold_unary (REALPART_EXPR, scalar_type, val);
> +         imag = fold_unary (IMAGPART_EXPR, scalar_type, val);
> +       }
> +      else
> +       {
> +         real = make_ssa_name (scalar_type);
> +         imag = make_ssa_name (scalar_type);
> +         init_stmt
> +           = gimple_build_assign (real,
> +                                  build1 (REALPART_EXPR, scalar_type, val));
> +         gimple_seq_add_stmt (&stmts, init_stmt);
> +         init_stmt
> +           = gimple_build_assign (imag,
> +                                  build1 (IMAGPART_EXPR, scalar_type, val));
> +         gimple_seq_add_stmt (&stmts, init_stmt);
> +       }
> +
> +      /* Build vector as [real,imag,real,imag,...].  */
> +      for (unsigned i = 0; i != nunits; i++)
> +       {
> +         if (i % 2)
> +           elts.quick_push (imag);
> +         else
> +           elts.quick_push (real);
> +       }
> +      val = gimple_build_vector (&stmts, &elts);
> +      if (!gimple_seq_empty_p (stmts))
> +       {
> +         if (gsi)
> +           gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> +         else
> +           vinfo->insert_seq_on_entry (stmt_info, stmts);
> +       }
> +    }
>    /* We abuse this function to push sth to a SSA name with initial 'val'.  */
> -  if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> +  else if (! useless_type_conversion_p (type, TREE_TYPE (val)))
>      {
>        gcc_assert (TREE_CODE (type) == VECTOR_TYPE);
> -      if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
> +      if (! types_compatible_p (scalar_type, TREE_TYPE (val)))
>         {
>           /* Scalar boolean value should be transformed into
>              all zeros or all ones value before building a vector.  */
>           if (VECTOR_BOOLEAN_TYPE_P (type))
>             {
> -             tree true_val = build_all_ones_cst (TREE_TYPE (type));
> -             tree false_val = build_zero_cst (TREE_TYPE (type));
> +             tree true_val = build_all_ones_cst (scalar_type);
> +             tree false_val = build_zero_cst (scalar_type);
>
>               if (CONSTANT_CLASS_P (val))
>                 val = integer_zerop (val) ? false_val : true_val;
>               else
>                 {
> -                 new_temp = make_ssa_name (TREE_TYPE (type));
> +                 new_temp = make_ssa_name (scalar_type);
>                   init_stmt = gimple_build_assign (new_temp, COND_EXPR,
>                                                    val, true_val, false_val);
>                   vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
> @@ -1424,14 +1469,13 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
>             }
>           else
>             {
> -             gimple_seq stmts = NULL;
>               if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
>                 val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
> -                                   TREE_TYPE (type), val);
> +                                   scalar_type, val);
>               else
>                 /* ???  Condition vectorization expects us to do
>                    promotion of invariant/external defs.  */
> -               val = gimple_convert (&stmts, TREE_TYPE (type), val);
> +               val = gimple_convert (&stmts, scalar_type, val);
>               for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
>                    !gsi_end_p (gsi2); )
>                 {
> @@ -1496,7 +1540,12 @@ vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
>                && VECTOR_BOOLEAN_TYPE_P (stmt_vectype))
>         vector_type = truth_type_for (stmt_vectype);
>        else
> -       vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
> +       {
> +         tree scalar_type = TREE_TYPE (op);
> +         if (STMT_VINFO_COMPLEX_P (stmt_vinfo))
> +           scalar_type = TREE_TYPE (scalar_type);
> +         vector_type = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
> +       }
>
>        gcc_assert (vector_type);
>        tree vop = vect_init_vector (vinfo, stmt_vinfo, op, vector_type, NULL);
> @@ -7509,8 +7558,17 @@ vectorizable_store (vec_info *vinfo,
>       same location twice.  */
>    gcc_assert (slp == PURE_SLP_STMT (stmt_info));
>
> +  if (!STMT_VINFO_DATA_REF (stmt_info))
> +    return false;
> +
>    tree vectype = STMT_VINFO_VECTYPE (stmt_info), rhs_vectype = NULL_TREE;
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> +    {
> +      if (!nunits.is_constant ())
> +       return false;
> +      nunits = exact_div (nunits, 2);
> +    }
>
>    if (loop_vinfo)
>      {
> @@ -7526,7 +7584,8 @@ vectorizable_store (vec_info *vinfo,
>    if (slp)
>      ncopies = 1;
>    else
> -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> +                                  STMT_VINFO_COMPLEX_P (stmt_info));
>
>    gcc_assert (ncopies >= 1);
>
> @@ -7546,9 +7605,6 @@ vectorizable_store (vec_info *vinfo,
>    elem_type = TREE_TYPE (vectype);
>    vec_mode = TYPE_MODE (vectype);
>
> -  if (!STMT_VINFO_DATA_REF (stmt_info))
> -    return false;
> -
>    vect_memory_access_type memory_access_type;
>    enum dr_alignment_support alignment_support_scheme;
>    int misalignment;
> @@ -8778,6 +8834,12 @@ vectorizable_load (vec_info *vinfo,
>
>    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> +    {
> +      if (!nunits.is_constant ())
> +       return false;
> +      nunits = exact_div (nunits, 2);
> +    }
>
>    if (loop_vinfo)
>      {
> @@ -8794,7 +8856,8 @@ vectorizable_load (vec_info *vinfo,
>    if (slp)
>      ncopies = 1;
>    else
> -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> +                                  STMT_VINFO_COMPLEX_P (stmt_info));
>
>    gcc_assert (ncopies >= 1);
>
> @@ -8870,8 +8933,11 @@ vectorizable_load (vec_info *vinfo,
>                 if (k > maxk)
>                   maxk = k;
>               tree vectype = SLP_TREE_VECTYPE (slp_node);
> +             /* For complex type, half the nunits.  */
>               if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> -                 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> +                 || maxk >= (DR_GROUP_SIZE (group_info)
> +                             & ~((STMT_VINFO_COMPLEX_P (group_info)
> +                                  ? nunits >> 1 : nunits) - 1)))
>                 {
>                   if (dump_enabled_p ())
>                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -12499,12 +12565,27 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>             dump_printf_loc (MSG_NOTE, vect_location,
>                              "get vectype for scalar type: %T\n", scalar_type);
>         }
> +
> +      tree orig_scalar_type = scalar_type;
> +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> +       {
> +         /* Set complex_p for BB vectorizer.  */
> +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> +         scalar_type = TREE_TYPE (scalar_type);
> +         /* Double group_size for BB vectorizer to make
> +            following 2 get_vectype_for_scalar_type return wanted vectype.
> +            Real group size is not changed, just make the "faked" input
> +            group_size.  */
> +         group_size *= 2;
> +       }
>        vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
> -      if (!vectype)
> +      if (!vectype
> +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> +             && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()))
>         return opt_result::failure_at (stmt,
>                                        "not vectorized:"
>                                        " unsupported data-type %T\n",
> -                                      scalar_type);
> +                                      orig_scalar_type);
>
>        if (dump_enabled_p ())
>         dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
> @@ -12529,16 +12610,30 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
>                                                    TREE_TYPE (vectype));
>        if (scalar_type != TREE_TYPE (vectype))
>         {
> -         if (dump_enabled_p ())
> +         tree orig_scalar_type = scalar_type;
> +         if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> +           {
> +             /* Set complex_p for Loop vectorizer.  */
> +             STMT_VINFO_COMPLEX_P (stmt_info) = true;
> +             scalar_type = TREE_TYPE (scalar_type);
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_NOTE, vect_location,
> +                            "get complex for smallest scalar type: %T\n",
> +                            scalar_type);
> +
> +           }
> +         else if (dump_enabled_p ())
>             dump_printf_loc (MSG_NOTE, vect_location,
>                              "get vectype for smallest scalar type: %T\n",
>                              scalar_type);
>           nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
>                                                         group_size);
> -         if (!nunits_vectype)
> +         if (!nunits_vectype
> +             || (STMT_VINFO_COMPLEX_P (stmt_info)
> +                 && !TYPE_VECTOR_SUBPARTS (nunits_vectype).is_constant ()))
>             return opt_result::failure_at
>               (stmt, "not vectorized: unsupported data-type %T\n",
> -              scalar_type);
> +              orig_scalar_type);
>           if (dump_enabled_p ())
>             dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
>                              nunits_vectype);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index e5fdc9e0a14..4a809e492c4 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -1161,6 +1161,9 @@ public:
>       vectorization.  */
>    bool vectorizable;
>
> +  /* The scalar type of the LHS of this statement is complex type.  */
> +  bool complex_p;
> +
>    /* The stmt to which this info struct refers to.  */
>    gimple *stmt;
>
> @@ -1395,6 +1398,7 @@ struct gather_scatter_info {
>  #define STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT(S) (S)->reduc_epilogue_adjustment
>  #define STMT_VINFO_REDUC_IDX(S)                   (S)->reduc_idx
>  #define STMT_VINFO_FORCE_SINGLE_CYCLE(S)   (S)->force_single_cycle
> +#define STMT_VINFO_COMPLEX_P(S)            (S)->complex_p
>
>  #define STMT_VINFO_DR_WRT_VEC_LOOP(S)      (S)->dr_wrt_vec_loop
>  #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_wrt_vec_loop.base_address
> @@ -1970,6 +1974,15 @@ vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype)
>    return vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo), vectype);
>  }
>
> +static inline unsigned int
> +vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype, bool complex_p)
> +{
> +  poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> +  if (complex_p)
> +    nunits *= 2;
> +  return vect_get_num_vectors (nunits, vectype);
> +}
> +
>  /* Update maximum unit count *MAX_NUNITS so that it accounts for
>     NUNITS.  *MAX_NUNITS can be 1 if we haven't yet recorded anything.  */
>
> --
> 2.18.1
>
Hongtao Liu July 12, 2022, 4:11 a.m. UTC | #3
On Mon, Jul 11, 2022 at 7:47 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Mon, Jul 11, 2022 at 5:44 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > The patch only handles load/store(including ctor/permutation, except
> > gather/scatter) for complex type, other operations don't needs to be
> > handled since they will be lowered by pass cplxlower.(MASK_LOAD is not
> > supported for complex type, so no need to handle either).
>
> (*)
>
> > Instead of support vector(2) _Complex double, this patch takes vector(4)
> > double as vector type of _Complex double. Since vectorizer originally
> > takes TYPE_VECTOR_SUBPARTS as nunits which is not true for complex
> > type, the patch handles nunits/ncopies/vf specially for complex type.
>
> For the limited set above(*) can you explain what's "special" about
> vector(2) _Complex
> vs. vector(4) double, thus why we need to have STMT_VINFO_COMPLEX_P at all?
Supporting a vector(2) complex type is the straightforward idea, just like
supporting any other scalar type in the vectorizer, but it requires more
effort (in both the backend and the frontend). Considering that most
operations on complex types are lowered into realpart and imagpart
operations, supporting vector(2) complex does not look that necessary.
That is why I came up with using vector(4) double instead (with
adjustments to vf/ctor/permutation): the vectorizer then only needs to
handle vectorization of move operations on the complex type. There is no
need to worry about wrongly mapping a vector(4) double multiplication to
a complex multiplication, since complex multiplication has already been
lowered before the vectorizer runs.
stmt_info does not record the scalar type. To avoid repeating work such as
fetching the lhs type from the stmt to determine whether it is a complex
type, the STMT_VINFO_COMPLEX_P bit is added; this bit is mainly
initialized in vect_analyze_data_refs and
vect_get_vector_types_for_stmt.
>
> I wonder to what extent your handling can be extended to support re-vectorizing
> (with a higher VF for example) already vectorized code?  The vectorizer giving
> up on vector(2) double looks quite obviously similar to it giving up
> on _Complex double ...
Yes, it can be extended to vector(2) double/float/int/... with a bit of
adjustment (extracting elements by using bit_field instead of
imagpart_expr/realpart_expr).
> It would be a shame to not use the same underlying mechanism for dealing with
> both, where for the vector case obviously vector(4) would be supported as well.
>
> In principle _Complex double operations should be two SLP lanes but it seems you
> are handling them with classical interleaving as well?
I'm only handling move operations; other operations will be lowered to
realpart and imagpart operations and thus become two SLP lanes.
>
> Thanks,
> Richard.
>
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Also test the patch for SPEC2017 and find there's complex type vectorization
> > in 510/549(but no performance impact).
> >
> > Any comments?
> >
> > gcc/ChangeLog:
> >
> >         PR tree-optimization/106010
> >         * tree-vect-data-refs.cc (vect_get_data_access_cost):
> >         Pass complex_p to vect_get_num_copies to avoid ICE.
> >         (vect_analyze_data_refs): Support vectorization for Complex
> >         type with vector scalar types.
> >         * tree-vect-loop.cc (vect_determine_vf_for_stmt_1): VF should
> >         be half of TYPE_VECTOR_SUBPARTS when complex_p.
> >         * tree-vect-slp.cc (vect_record_max_nunits): nunits should be
> >         half of TYPE_VECTOR_SUBPARTS when complex_p.
> >         (vect_optimize_slp): Support permutation for complex type.
> >         (vect_slp_analyze_node_operations_1): Double nunits in
> >         vect_get_num_vectors to get right SLP_TREE_NUMBER_OF_VEC_STMTS
> >         when complex_p.
> >         (vect_slp_analyze_node_operations): Ditto.
> >         (vect_create_constant_vectors): Support CTOR for complex type.
> >         (vect_transform_slp_perm_load): Support permutation for
> >         complex type.
> >         * tree-vect-stmts.cc (vect_init_vector): Support complex type.
> >         (vect_get_vec_defs_for_operand): Get vector type for
> >         complex type.
> >         (vectorizable_store): Get right ncopies/nunits for complex
> >         type, also return false when complex_p and
> >         !TYPE_VECTOR_SUBPARTS.is_constant ().
> >         (vectorizable_load): Ditto.
> >         (vect_get_vector_types_for_stmt): Get vector type for complex type.
> >         * tree-vectorizer.h (STMT_VINFO_COMPLEX_P): New macro.
> >         (vect_get_num_copies): New overload.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr106010-1a.c: New test.
> >         * gcc.target/i386/pr106010-1b.c: New test.
> >         * gcc.target/i386/pr106010-1c.c: New test.
> >         * gcc.target/i386/pr106010-2a.c: New test.
> >         * gcc.target/i386/pr106010-2b.c: New test.
> >         * gcc.target/i386/pr106010-2c.c: New test.
> >         * gcc.target/i386/pr106010-3a.c: New test.
> >         * gcc.target/i386/pr106010-3b.c: New test.
> >         * gcc.target/i386/pr106010-3c.c: New test.
> >         * gcc.target/i386/pr106010-4a.c: New test.
> >         * gcc.target/i386/pr106010-4b.c: New test.
> >         * gcc.target/i386/pr106010-4c.c: New test.
> >         * gcc.target/i386/pr106010-5a.c: New test.
> >         * gcc.target/i386/pr106010-5b.c: New test.
> >         * gcc.target/i386/pr106010-5c.c: New test.
> >         * gcc.target/i386/pr106010-6a.c: New test.
> >         * gcc.target/i386/pr106010-6b.c: New test.
> >         * gcc.target/i386/pr106010-6c.c: New test.
> >         * gcc.target/i386/pr106010-7a.c: New test.
> >         * gcc.target/i386/pr106010-7b.c: New test.
> >         * gcc.target/i386/pr106010-7c.c: New test.
> >         * gcc.target/i386/pr106010-8a.c: New test.
> >         * gcc.target/i386/pr106010-8b.c: New test.
> >         * gcc.target/i386/pr106010-8c.c: New test.
> > ---
> >  gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 +++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 +++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
> >  gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 +++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 +++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 ++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 ++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 +++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 +++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 +++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 +++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 +++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 +++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
> >  gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 +++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 ++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
> >  gcc/tree-vect-data-refs.cc                  |  26 ++-
> >  gcc/tree-vect-loop.cc                       |   7 +-
> >  gcc/tree-vect-slp.cc                        | 174 +++++++++++++++-----
> >  gcc/tree-vect-stmts.cc                      | 135 ++++++++++++---
> >  gcc/tree-vectorizer.h                       |  13 ++
> >  29 files changed, 2064 insertions(+), 63 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > new file mode 100644
> > index 00000000000..b608f484934
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > @@ -0,0 +1,58 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> > +
> > +#define N 10000
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > new file mode 100644
> > index 00000000000..0f377c3a548
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > @@ -0,0 +1,63 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-1a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > +
> > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > +
> > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > +    p_init[i] = i;
> > +
> > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > +
> > +  foo_pd (pd_dst, pd_src);
> > +  foo_ps (ps_dst, ps_src);
> > +  foo_epi64 (epi64_dst, epi64_src);
> > +  foo_epi32 (epi32_dst, epi32_src);
> > +  foo_epi16 (epi16_dst, epi16_src);
> > +  foo_epi8 (epi8_dst, epi8_src);
> > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > new file mode 100644
> > index 00000000000..f07e9fb2d3d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > @@ -0,0 +1,41 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +#define N 10000
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > +
> > +static void
> > +do_test (void)
> > +{
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > +
> > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > +
> > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > +    p_init[i] = i;
> > +
> > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > +
> > +  foo_ph (ph_dst, ph_src);
> > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > +    __builtin_abort ();
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > new file mode 100644
> > index 00000000000..d2e2f8d4f43
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > @@ -0,0 +1,82 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +  a[2] = b[2];
> > +  a[3] = b[3];
> > +
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +  a[2] = b[2];
> > +  a[3] = b[3];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +  a[2] = b[2];
> > +  a[3] = b[3];
> > +  a[4] = b[4];
> > +  a[5] = b[5];
> > +  a[6] = b[6];
> > +  a[7] = b[7];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +  a[2] = b[2];
> > +  a[3] = b[3];
> > +  a[4] = b[4];
> > +  a[5] = b[5];
> > +  a[6] = b[6];
> > +  a[7] = b[7];
> > +  a[8] = b[8];
> > +  a[9] = b[9];
> > +  a[10] = b[10];
> > +  a[11] = b[11];
> > +  a[12] = b[12];
> > +  a[13] = b[13];
> > +  a[14] = b[14];
> > +  a[15] = b[15];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > new file mode 100644
> > index 00000000000..ac360752693
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > @@ -0,0 +1,62 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-2a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > +  char* p = (char* ) malloc (32);
> > +
> > +  __builtin_memset (pd_dst, 0, 32);
> > +  __builtin_memset (ps_dst, 0, 32);
> > +  __builtin_memset (epi64_dst, 0, 32);
> > +  __builtin_memset (epi32_dst, 0, 32);
> > +  __builtin_memset (epi16_dst, 0, 32);
> > +  __builtin_memset (epi8_dst, 0, 32);
> > +
> > +  for (int i = 0; i != 32; i++)
> > +    p[i] = i;
> > +  __builtin_memcpy (pd_src, p, 32);
> > +  __builtin_memcpy (ps_src, p, 32);
> > +  __builtin_memcpy (epi64_src, p, 32);
> > +  __builtin_memcpy (epi32_src, p, 32);
> > +  __builtin_memcpy (epi16_src, p, 32);
> > +  __builtin_memcpy (epi8_src, p, 32);
> > +
> > +  foo_pd (pd_dst, pd_src);
> > +  foo_ps (ps_dst, ps_src);
> > +  foo_epi64 (epi64_dst, epi64_src);
> > +  foo_epi32 (epi32_dst, epi32_src);
> > +  foo_epi16 (epi16_dst, epi16_src);
> > +  foo_epi8 (epi8_dst, epi8_src);
> > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > new file mode 100644
> > index 00000000000..a002f209ec9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > @@ -0,0 +1,47 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +  a[2] = b[2];
> > +  a[3] = b[3];
> > +  a[4] = b[4];
> > +  a[5] = b[5];
> > +  a[6] = b[6];
> > +  a[7] = b[7];
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > +  char* p = (char* ) malloc (32);
> > +
> > +  __builtin_memset (ph_dst, 0, 32);
> > +
> > +  for (int i = 0; i != 32; i++)
> > +    p[i] = i;
> > +  __builtin_memcpy (ph_src, p, 32);
> > +
> > +  foo_ph (ph_dst, ph_src);
> > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > new file mode 100644
> > index 00000000000..c1b64b56b1c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > @@ -0,0 +1,80 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > +{
> > +  a[0] = b[1];
> > +  a[1] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > +{
> > +  a[0] = b[1];
> > +  a[1] = b[0];
> > +  a[2] = b[3];
> > +  a[3] = b[2];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > +{
> > +  a[0] = b[1];
> > +  a[1] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > +{
> > +  a[0] = b[3];
> > +  a[1] = b[2];
> > +  a[2] = b[1];
> > +  a[3] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > +{
> > +  a[0] = b[7];
> > +  a[1] = b[6];
> > +  a[2] = b[5];
> > +  a[3] = b[4];
> > +  a[4] = b[3];
> > +  a[5] = b[2];
> > +  a[6] = b[1];
> > +  a[7] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > +{
> > +  a[0] = b[7];
> > +  a[1] = b[6];
> > +  a[2] = b[5];
> > +  a[3] = b[4];
> > +  a[4] = b[3];
> > +  a[5] = b[2];
> > +  a[6] = b[1];
> > +  a[7] = b[0];
> > +  a[8] = b[15];
> > +  a[9] = b[14];
> > +  a[10] = b[13];
> > +  a[11] = b[12];
> > +  a[12] = b[11];
> > +  a[13] = b[10];
> > +  a[14] = b[9];
> > +  a[15] = b[8];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > new file mode 100644
> > index 00000000000..e4fa3f3a541
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > @@ -0,0 +1,126 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx2 } */
> > +
> > +#include "avx2-check.h"
> > +#include <string.h>
> > +#include "pr106010-3a.c"
> > +
> > +void
> > +avx2_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > +  _Complex double* pd_exp = (_Complex double*) malloc (32);
> > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > +  _Complex float* ps_exp = (_Complex float*) malloc (32);
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > +  _Complex int* epi32_exp = (_Complex int*) malloc (32);
> > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > +  _Complex short* epi16_exp = (_Complex short*) malloc (32);
> > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > +  _Complex char* epi8_exp = (_Complex char*) malloc (32);
> > +  char* p = (char* ) malloc (32);
> > +  char* q = (char* ) malloc (32);
> > +
> > +  __builtin_memset (pd_dst, 0, 32);
> > +  __builtin_memset (ps_dst, 0, 32);
> > +  __builtin_memset (epi64_dst, 0, 32);
> > +  __builtin_memset (epi32_dst, 0, 32);
> > +  __builtin_memset (epi16_dst, 0, 32);
> > +  __builtin_memset (epi8_dst, 0, 32);
> > +
> > +  for (int i = 0; i != 32; i++)
> > +    p[i] = i;
> > +  __builtin_memcpy (pd_src, p, 32);
> > +  __builtin_memcpy (ps_src, p, 32);
> > +  __builtin_memcpy (epi64_src, p, 32);
> > +  __builtin_memcpy (epi32_src, p, 32);
> > +  __builtin_memcpy (epi16_src, p, 32);
> > +  __builtin_memcpy (epi8_src, p, 32);
> > +
> > +  for (int i = 0; i != 16; i++)
> > +    {
> > +      p[i] = i + 16;
> > +      p[i + 16] = i;
> > +    }
> > +  __builtin_memcpy (pd_exp, p, 32);
> > +  __builtin_memcpy (epi64_exp, p, 32);
> > +
> > +  for (int i = 0; i != 8; i++)
> > +    {
> > +      p[i] = i + 8;
> > +      p[i + 8] = i;
> > +      p[i + 16] = i + 24;
> > +      p[i + 24] = i + 16;
> > +      q[i] = i + 24;
> > +      q[i + 8] = i + 16;
> > +      q[i + 16] = i + 8;
> > +      q[i + 24] = i;
> > +    }
> > +  __builtin_memcpy (ps_exp, p, 32);
> > +  __builtin_memcpy (epi32_exp, q, 32);
> > +
> > +
> > +  for (int i = 0; i != 4; i++)
> > +    {
> > +      q[i] = i + 28;
> > +      q[i + 4] = i + 24;
> > +      q[i + 8] = i + 20;
> > +      q[i + 12] = i + 16;
> > +      q[i + 16] = i + 12;
> > +      q[i + 20] = i + 8;
> > +      q[i + 24] = i + 4;
> > +      q[i + 28] = i;
> > +    }
> > +  __builtin_memcpy (epi16_exp, q, 32);
> > +
> > +  for (int i = 0; i != 2; i++)
> > +    {
> > +      q[i] = i + 14;
> > +      q[i + 2] = i + 12;
> > +      q[i + 4] = i + 10;
> > +      q[i + 6] = i + 8;
> > +      q[i + 8] = i + 6;
> > +      q[i + 10] = i + 4;
> > +      q[i + 12] = i + 2;
> > +      q[i + 14] = i;
> > +      q[i + 16] = i + 30;
> > +      q[i + 18] = i + 28;
> > +      q[i + 20] = i + 26;
> > +      q[i + 22] = i + 24;
> > +      q[i + 24] = i + 22;
> > +      q[i + 26] = i + 20;
> > +      q[i + 28] = i + 18;
> > +      q[i + 30] = i + 16;
> > +    }
> > +  __builtin_memcpy (epi8_exp, q, 32);
> > +
> > +  foo_pd (pd_dst, pd_src);
> > +  foo_ps (ps_dst, ps_src);
> > +  foo_epi64 (epi64_dst, epi64_src);
> > +  foo_epi32 (epi32_dst, epi32_src);
> > +  foo_epi16 (epi16_dst, epi16_src);
> > +  foo_epi8 (epi8_dst, epi8_src);
> > +  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > new file mode 100644
> > index 00000000000..5a5a3d4b992
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > @@ -0,0 +1,69 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > +{
> > +  a[0] = b[1];
> > +  a[1] = b[0];
> > +  a[2] = b[4];
> > +  a[3] = b[3];
> > +  a[4] = b[7];
> > +  a[5] = b[6];
> > +  a[6] = b[2];
> > +  a[7] = b[5];
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> > +  char* p = (char* ) malloc (32);
> > +  char* q = (char* ) malloc (32);
> > +
> > +  __builtin_memset (ph_dst, 0, 32);
> > +
> > +  for (int i = 0; i != 32; i++)
> > +    p[i] = i;
> > +  __builtin_memcpy (ph_src, p, 32);
> > +
> > +  for (int i = 0; i != 4; i++)
> > +    {
> > +      p[i] = i + 4;
> > +      p[i + 4] = i;
> > +      p[i + 8] = i + 16;
> > +      p[i + 12] = i + 12;
> > +      p[i + 16] = i + 28;
> > +      p[i + 20] = i + 24;
> > +      p[i + 24] = i + 8;
> > +      p[i + 28] = i + 20;
> > +      q[i] = i + 28;
> > +      q[i + 4] = i + 24;
> > +      q[i + 8] = i + 20;
> > +      q[i + 12] = i + 16;
> > +      q[i + 16] = i + 12;
> > +      q[i + 20] = i + 8;
> > +      q[i + 24] = i + 4;
> > +      q[i + 28] = i;
> > +    }
> > +  __builtin_memcpy (ph_exp, p, 32);
> > +
> > +  foo_ph (ph_dst, ph_src);
> > +  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > new file mode 100644
> > index 00000000000..b7b0b532bb1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > @@ -0,0 +1,101 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a,
> > +       _Complex double b1,
> > +       _Complex double b2)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a,
> > +       _Complex float b1, _Complex float b2,
> > +       _Complex float b3, _Complex float b4)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +  a[2] = b3;
> > +  a[3] = b4;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a,
> > +          _Complex long long b1,
> > +          _Complex long long b2)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a,
> > +          _Complex int b1, _Complex int b2,
> > +          _Complex int b3, _Complex int b4)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +  a[2] = b3;
> > +  a[3] = b4;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a,
> > +          _Complex short b1, _Complex short b2,
> > +          _Complex short b3, _Complex short b4,
> > +          _Complex short b5, _Complex short b6,
> > +          _Complex short b7,_Complex short b8)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +  a[2] = b3;
> > +  a[3] = b4;
> > +  a[4] = b5;
> > +  a[5] = b6;
> > +  a[6] = b7;
> > +  a[7] = b8;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a,
> > +         _Complex char b1, _Complex char b2,
> > +         _Complex char b3, _Complex char b4,
> > +         _Complex char b5, _Complex char b6,
> > +         _Complex char b7,_Complex char b8,
> > +         _Complex char b9, _Complex char b10,
> > +         _Complex char b11, _Complex char b12,
> > +         _Complex char b13, _Complex char b14,
> > +         _Complex char b15,_Complex char b16)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +  a[2] = b3;
> > +  a[3] = b4;
> > +  a[4] = b5;
> > +  a[5] = b6;
> > +  a[6] = b7;
> > +  a[7] = b8;
> > +  a[8] = b9;
> > +  a[9] = b10;
> > +  a[10] = b11;
> > +  a[11] = b12;
> > +  a[12] = b13;
> > +  a[13] = b14;
> > +  a[14] = b15;
> > +  a[15] = b16;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > new file mode 100644
> > index 00000000000..e2e79508c4b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > @@ -0,0 +1,67 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-4a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > +  char* p = (char* ) malloc (32);
> > +
> > +  __builtin_memset (pd_dst, 0, 32);
> > +  __builtin_memset (ps_dst, 0, 32);
> > +  __builtin_memset (epi64_dst, 0, 32);
> > +  __builtin_memset (epi32_dst, 0, 32);
> > +  __builtin_memset (epi16_dst, 0, 32);
> > +  __builtin_memset (epi8_dst, 0, 32);
> > +
> > +  for (int i = 0; i != 32; i++)
> > +    p[i] = i;
> > +  __builtin_memcpy (pd_src, p, 32);
> > +  __builtin_memcpy (ps_src, p, 32);
> > +  __builtin_memcpy (epi64_src, p, 32);
> > +  __builtin_memcpy (epi32_src, p, 32);
> > +  __builtin_memcpy (epi16_src, p, 32);
> > +  __builtin_memcpy (epi8_src, p, 32);
> > +
> > +  foo_pd (pd_dst, pd_src[0], pd_src[1]);
> > +  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> > +  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> > +  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> > +  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> > +            epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> > +  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> > +           epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> > +           epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> > +           epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> > +
> > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > new file mode 100644
> > index 00000000000..8e02aefe3b5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > @@ -0,0 +1,54 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a,
> > +       _Complex _Float16 b1, _Complex _Float16 b2,
> > +       _Complex _Float16 b3, _Complex _Float16 b4,
> > +       _Complex _Float16 b5, _Complex _Float16 b6,
> > +       _Complex _Float16 b7,_Complex _Float16 b8)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +  a[2] = b3;
> > +  a[3] = b4;
> > +  a[4] = b5;
> > +  a[5] = b6;
> > +  a[6] = b7;
> > +  a[7] = b8;
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > +
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > +
> > +  char* p = (char* ) malloc (32);
> > +
> > +  __builtin_memset (ph_dst, 0, 32);
> > +
> > +  for (int i = 0; i != 32; i++)
> > +    p[i] = i;
> > +
> > +  __builtin_memcpy (ph_src, p, 32);
> > +
> > +  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> > +         ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> > +
> > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > +    __builtin_abort ();
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > new file mode 100644
> > index 00000000000..9d4a6f9846b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > @@ -0,0 +1,117 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > +{
> > +  a[0] = b[2];
> > +  a[1] = b[3];
> > +  a[2] = b[0];
> > +  a[3] = b[1];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > +{
> > +  a[0] = b[4];
> > +  a[1] = b[5];
> > +  a[2] = b[6];
> > +  a[3] = b[7];
> > +  a[4] = b[0];
> > +  a[5] = b[1];
> > +  a[6] = b[2];
> > +  a[7] = b[3];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > +{
> > +  a[0] = b[2];
> > +  a[1] = b[3];
> > +  a[2] = b[0];
> > +  a[3] = b[1];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > +{
> > +  a[0] = b[4];
> > +  a[1] = b[5];
> > +  a[2] = b[6];
> > +  a[3] = b[7];
> > +  a[4] = b[0];
> > +  a[5] = b[1];
> > +  a[6] = b[2];
> > +  a[7] = b[3];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > +{
> > +  a[0] = b[8];
> > +  a[1] = b[9];
> > +  a[2] = b[10];
> > +  a[3] = b[11];
> > +  a[4] = b[12];
> > +  a[5] = b[13];
> > +  a[6] = b[14];
> > +  a[7] = b[15];
> > +  a[8] = b[0];
> > +  a[9] = b[1];
> > +  a[10] = b[2];
> > +  a[11] = b[3];
> > +  a[12] = b[4];
> > +  a[13] = b[5];
> > +  a[14] = b[6];
> > +  a[15] = b[7];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > +{
> > +  a[0] = b[16];
> > +  a[1] = b[17];
> > +  a[2] = b[18];
> > +  a[3] = b[19];
> > +  a[4] = b[20];
> > +  a[5] = b[21];
> > +  a[6] = b[22];
> > +  a[7] = b[23];
> > +  a[8] = b[24];
> > +  a[9] = b[25];
> > +  a[10] = b[26];
> > +  a[11] = b[27];
> > +  a[12] = b[28];
> > +  a[13] = b[29];
> > +  a[14] = b[30];
> > +  a[15] = b[31];
> > +  a[16] = b[0];
> > +  a[17] = b[1];
> > +  a[18] = b[2];
> > +  a[19] = b[3];
> > +  a[20] = b[4];
> > +  a[21] = b[5];
> > +  a[22] = b[6];
> > +  a[23] = b[7];
> > +  a[24] = b[8];
> > +  a[25] = b[9];
> > +  a[26] = b[10];
> > +  a[27] = b[11];
> > +  a[28] = b[12];
> > +  a[29] = b[13];
> > +  a[30] = b[14];
> > +  a[31] = b[15];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > new file mode 100644
> > index 00000000000..d5c6ebeb5cf
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > @@ -0,0 +1,80 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-5a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > +  char* p = (char* ) malloc (64);
> > +  char* q = (char* ) malloc (64);
> > +
> > +  __builtin_memset (pd_dst, 0, 64);
> > +  __builtin_memset (ps_dst, 0, 64);
> > +  __builtin_memset (epi64_dst, 0, 64);
> > +  __builtin_memset (epi32_dst, 0, 64);
> > +  __builtin_memset (epi16_dst, 0, 64);
> > +  __builtin_memset (epi8_dst, 0, 64);
> > +
> > +  for (int i = 0; i != 64; i++)
> > +    {
> > +      p[i] = i;
> > +      q[i] = (i + 32) % 64;
> > +    }
> > +  __builtin_memcpy (pd_src, p, 64);
> > +  __builtin_memcpy (ps_src, p, 64);
> > +  __builtin_memcpy (epi64_src, p, 64);
> > +  __builtin_memcpy (epi32_src, p, 64);
> > +  __builtin_memcpy (epi16_src, p, 64);
> > +  __builtin_memcpy (epi8_src, p, 64);
> > +
> > +  __builtin_memcpy (pd_exp, q, 64);
> > +  __builtin_memcpy (ps_exp, q, 64);
> > +  __builtin_memcpy (epi64_exp, q, 64);
> > +  __builtin_memcpy (epi32_exp, q, 64);
> > +  __builtin_memcpy (epi16_exp, q, 64);
> > +  __builtin_memcpy (epi8_exp, q, 64);
> > +
> > +  foo_pd (pd_dst, pd_src);
> > +  foo_ps (ps_dst, ps_src);
> > +  foo_epi64 (epi64_dst, epi64_src);
> > +  foo_epi32 (epi32_dst, epi32_src);
> > +  foo_epi16 (epi16_dst, epi16_src);
> > +  foo_epi8 (epi8_dst, epi8_src);
> > +
> > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > new file mode 100644
> > index 00000000000..9ce4e6dd5c0
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > @@ -0,0 +1,62 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > +{
> > +  a[0] = b[8];
> > +  a[1] = b[9];
> > +  a[2] = b[10];
> > +  a[3] = b[11];
> > +  a[4] = b[12];
> > +  a[5] = b[13];
> > +  a[6] = b[14];
> > +  a[7] = b[15];
> > +  a[8] = b[0];
> > +  a[9] = b[1];
> > +  a[10] = b[2];
> > +  a[11] = b[3];
> > +  a[12] = b[4];
> > +  a[13] = b[5];
> > +  a[14] = b[6];
> > +  a[15] = b[7];
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > +  char* p = (char* ) malloc (64);
> > +  char* q = (char* ) malloc (64);
> > +
> > +  __builtin_memset (ph_dst, 0, 64);
> > +
> > +  for (int i = 0; i != 64; i++)
> > +    {
> > +      p[i] = i;
> > +      q[i] = (i + 32) % 64;
> > +    }
> > +  __builtin_memcpy (ph_src, p, 64);
> > +
> > +  __builtin_memcpy (ph_exp, q, 64);
> > +
> > +  foo_ph (ph_dst, ph_src);
> > +
> > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > new file mode 100644
> > index 00000000000..65a90d03684
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > @@ -0,0 +1,115 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > +{
> > +  a[0] = b[3];
> > +  a[1] = b[2];
> > +  a[2] = b[1];
> > +  a[3] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > +{
> > +  a[0] = b[7];
> > +  a[1] = b[6];
> > +  a[2] = b[5];
> > +  a[3] = b[4];
> > +  a[4] = b[3];
> > +  a[5] = b[2];
> > +  a[6] = b[1];
> > +  a[7] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > +{
> > +  a[0] = b[3];
> > +  a[1] = b[2];
> > +  a[2] = b[1];
> > +  a[3] = b[0];
> > +}
> > +/* Store the eight complex ints of b into a in reverse element order.  */
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > +{
> > +  a[0] = b[7];
> > +  a[1] = b[6];
> > +  a[2] = b[5];
> > +  a[3] = b[4];
> > +  a[4] = b[3];
> > +  a[5] = b[2];
> > +  a[6] = b[1];
> > +  a[7] = b[0];
> > +}
> > +/* Store the sixteen complex shorts of b into a in reverse element order.  */
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > +{
> > +  a[0] = b[15];
> > +  a[1] = b[14];
> > +  a[2] = b[13];
> > +  a[3] = b[12];
> > +  a[4] = b[11];
> > +  a[5] = b[10];
> > +  a[6] = b[9];
> > +  a[7] = b[8];
> > +  a[8] = b[7];
> > +  a[9] = b[6];
> > +  a[10] = b[5];
> > +  a[11] = b[4];
> > +  a[12] = b[3];
> > +  a[13] = b[2];
> > +  a[14] = b[1];
> > +  a[15] = b[0];
> > +}
> > +/* Store the thirty-two complex chars of b into a in reverse element order.  */
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > +{
> > +  a[0] = b[31];
> > +  a[1] = b[30];
> > +  a[2] = b[29];
> > +  a[3] = b[28];
> > +  a[4] = b[27];
> > +  a[5] = b[26];
> > +  a[6] = b[25];
> > +  a[7] = b[24];
> > +  a[8] = b[23];
> > +  a[9] = b[22];
> > +  a[10] = b[21];
> > +  a[11] = b[20];
> > +  a[12] = b[19];
> > +  a[13] = b[18];
> > +  a[14] = b[17];
> > +  a[15] = b[16];
> > +  a[16] = b[15];
> > +  a[17] = b[14];
> > +  a[18] = b[13];
> > +  a[19] = b[12];
> > +  a[20] = b[11];
> > +  a[21] = b[10];
> > +  a[22] = b[9];
> > +  a[23] = b[8];
> > +  a[24] = b[7];
> > +  a[25] = b[6];
> > +  a[26] = b[5];
> > +  a[27] = b[4];
> > +  a[28] = b[3];
> > +  a[29] = b[2];
> > +  a[30] = b[1];
> > +  a[31] = b[0];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > new file mode 100644
> > index 00000000000..1c5bb020939
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > @@ -0,0 +1,157 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx2 } */
> > +
> > +#include "avx2-check.h"
> > +#include <string.h>
> > +#include "pr106010-6a.c"
> > +/* Run every foo_* reversal on 64-byte buffers and compare with hand-built images.  */
> > +void
> > +avx2_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > +  char* p = (char* ) malloc (64);
> > +  char* q = (char* ) malloc (64);
> > +
> > +  __builtin_memset (pd_dst, 0, 64);
> > +  __builtin_memset (ps_dst, 0, 64);
> > +  __builtin_memset (epi64_dst, 0, 64);
> > +  __builtin_memset (epi32_dst, 0, 64);
> > +  __builtin_memset (epi16_dst, 0, 64);
> > +  __builtin_memset (epi8_dst, 0, 64);
> > +  /* Source bytes are 0..63, so each byte value equals its own offset.  */
> > +  for (int i = 0; i != 64; i++)
> > +    p[i] = i;
> > +
> > +  __builtin_memcpy (pd_src, p, 64);
> > +  __builtin_memcpy (ps_src, p, 64);
> > +  __builtin_memcpy (epi64_src, p, 64);
> > +  __builtin_memcpy (epi32_src, p, 64);
> > +  __builtin_memcpy (epi16_src, p, 64);
> > +  __builtin_memcpy (epi8_src, p, 64);
> > +
> > +  /* Expected image with the 16-byte chunks reversed (used for pd and epi64).  */
> > +  for (int i = 0; i != 16; i++)
> > +    {
> > +      q[i] = i + 48;
> > +      q[i + 16] = i + 32;
> > +      q[i + 32] = i + 16;
> > +      q[i + 48] = i;
> > +    }
> > +
> > +  __builtin_memcpy (pd_exp, q, 64);
> > +  __builtin_memcpy (epi64_exp, q, 64);
> > +  /* Expected image with the 8-byte chunks reversed (used for ps and epi32).  */
> > +   for (int i = 0; i != 8; i++)
> > +    {
> > +      q[i] = i + 56;
> > +      q[i + 8] = i + 48;
> > +      q[i + 16] = i + 40;
> > +      q[i + 24] = i + 32;
> > +      q[i + 32] = i + 24;
> > +      q[i + 40] = i + 16;
> > +      q[i + 48] = i + 8;
> > +      q[i + 56] = i;
> > +    }
> > +
> > +  __builtin_memcpy (ps_exp, q, 64);
> > +  __builtin_memcpy (epi32_exp, q, 64);
> > +  /* Expected image with the 4-byte chunks reversed (used for epi16).  */
> > +  for (int i = 0; i != 4; i++)
> > +    {
> > +      q[i] = i + 60;
> > +      q[i + 4] = i + 56;
> > +      q[i + 8] = i + 52;
> > +      q[i + 12] = i + 48;
> > +      q[i + 16] = i + 44;
> > +      q[i + 20] = i + 40;
> > +      q[i + 24] = i + 36;
> > +      q[i + 28] = i + 32;
> > +      q[i + 32] = i + 28;
> > +      q[i + 36] = i + 24;
> > +      q[i + 40] = i + 20;
> > +      q[i + 44] = i + 16;
> > +      q[i + 48] = i + 12;
> > +      q[i + 52] = i + 8;
> > +      q[i + 56] = i + 4;
> > +      q[i + 60] = i;
> > +    }
> > +
> > +  __builtin_memcpy (epi16_exp, q, 64);
> > +  /* Expected image with the 2-byte chunks reversed (used for epi8).  */
> > +  for (int i = 0; i != 2; i++)
> > +    {
> > +      q[i] = i + 62;
> > +      q[i + 2] = i + 60;
> > +      q[i + 4] = i + 58;
> > +      q[i + 6] = i + 56;
> > +      q[i + 8] = i + 54;
> > +      q[i + 10] = i + 52;
> > +      q[i + 12] = i + 50;
> > +      q[i + 14] = i + 48;
> > +      q[i + 16] = i + 46;
> > +      q[i + 18] = i + 44;
> > +      q[i + 20] = i + 42;
> > +      q[i + 22] = i + 40;
> > +      q[i + 24] = i + 38;
> > +      q[i + 26] = i + 36;
> > +      q[i + 28] = i + 34;
> > +      q[i + 30] = i + 32;
> > +      q[i + 32] = i + 30;
> > +      q[i + 34] = i + 28;
> > +      q[i + 36] = i + 26;
> > +      q[i + 38] = i + 24;
> > +      q[i + 40] = i + 22;
> > +      q[i + 42] = i + 20;
> > +      q[i + 44] = i + 18;
> > +      q[i + 46] = i + 16;
> > +      q[i + 48] = i + 14;
> > +      q[i + 50] = i + 12;
> > +      q[i + 52] = i + 10;
> > +      q[i + 54] = i + 8;
> > +      q[i + 56] = i + 6;
> > +      q[i + 58] = i + 4;
> > +      q[i + 60] = i + 2;
> > +      q[i + 62] = i;
> > +    }
> > +  __builtin_memcpy (epi8_exp, q, 64);
> > +
> > +  foo_pd (pd_dst, pd_src);
> > +  foo_ps (ps_dst, ps_src);
> > +  foo_epi64 (epi64_dst, epi64_src);
> > +  foo_epi32 (epi32_dst, epi32_src);
> > +  foo_epi16 (epi16_dst, epi16_src);
> > +  foo_epi8 (epi8_dst, epi8_src);
> > +  /* Abort on any byte that differs from the expected image.  */
> > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > new file mode 100644
> > index 00000000000..b859d884a7f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > @@ -0,0 +1,80 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +/* Store the sixteen complex _Float16 values of b into a in reverse order.  */
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > +{
> > +  a[0] = b[15];
> > +  a[1] = b[14];
> > +  a[2] = b[13];
> > +  a[3] = b[12];
> > +  a[4] = b[11];
> > +  a[5] = b[10];
> > +  a[6] = b[9];
> > +  a[7] = b[8];
> > +  a[8] = b[7];
> > +  a[9] = b[6];
> > +  a[10] = b[5];
> > +  a[11] = b[4];
> > +  a[12] = b[3];
> > +  a[13] = b[2];
> > +  a[14] = b[1];
> > +  a[15] = b[0];
> > +}
> > +/* Run foo_ph on a 64-byte buffer and compare with a hand-built image.  */
> > +static void
> > +do_test (void)
> > +{
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > +  char* p = (char* ) malloc (64);
> > +  char* q = (char* ) malloc (64);
> > +
> > +  __builtin_memset (ph_dst, 0, 64);
> > +  /* Source bytes are 0..63, so each byte value equals its own offset.  */
> > +  for (int i = 0; i != 64; i++)
> > +    p[i] = i;
> > +
> > +  __builtin_memcpy (ph_src, p, 64);
> > +  /* Expected image: the 4-byte chunks of the source in reverse order.  */
> > +  for (int i = 0; i != 4; i++)
> > +    {
> > +      q[i] = i + 60;
> > +      q[i + 4] = i + 56;
> > +      q[i + 8] = i + 52;
> > +      q[i + 12] = i + 48;
> > +      q[i + 16] = i + 44;
> > +      q[i + 20] = i + 40;
> > +      q[i + 24] = i + 36;
> > +      q[i + 28] = i + 32;
> > +      q[i + 32] = i + 28;
> > +      q[i + 36] = i + 24;
> > +      q[i + 40] = i + 20;
> > +      q[i + 44] = i + 16;
> > +      q[i + 48] = i + 12;
> > +      q[i + 52] = i + 8;
> > +      q[i + 56] = i + 4;
> > +      q[i + 60] = i;
> > +    }
> > +
> > +  __builtin_memcpy (ph_exp, q, 64);
> > +
> > +  foo_ph (ph_dst, ph_src);
> > +  /* Abort on any byte that differs from the expected image.  */
> > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > new file mode 100644
> > index 00000000000..2ea01fac927
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > @@ -0,0 +1,58 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > +/* N is the trip count of every loop below.  foo_pd stores b to a[0..N-1].  */
> > +#define N 10000
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > +/* Store the complex float b to each of a[0..N-1].  */
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > +/* Store the complex long long b to each of a[0..N-1].  */
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > +/* Store the complex int b to each of a[0..N-1].  */
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > +/* Store the complex short b to each of a[0..N-1].  */
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > +/* Store the complex char b to each of a[0..N-1].  */
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > new file mode 100644
> > index 00000000000..26482cc10f5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > @@ -0,0 +1,63 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-7a.c"
> > +/* Check that each foo_* stores its scalar argument to all N elements.  */
> > +void
> > +avx_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > +  /* Zero the destinations before the kernels overwrite them.  */
> > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > +  /* Bytes alternate 3, 4, so every complex element has the same image.  */
> > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > +    p_init[i] = i % 2 + 3;
> > +  /* Seed the sources: the calls read *_src[0], the checks compare to *_src.  */
> > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > +
> > +  foo_pd (pd_dst, pd_src[0]);
> > +  foo_ps (ps_dst, ps_src[0]);
> > +  foo_epi64 (epi64_dst, epi64_src[0]);
> > +  foo_epi32 (epi32_dst, epi32_src[0]);
> > +  foo_epi16 (epi16_dst, epi16_src[0]);
> > +  foo_epi8 (epi8_dst, epi8_src[0]);
> > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > new file mode 100644
> > index 00000000000..7f4056a5ecc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > @@ -0,0 +1,41 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +#define N 10000
> > +/* Store the complex _Float16 b to each of a[0..N-1].  */
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > +/* Check that foo_ph stores ph_src[0] to all N elements of ph_dst.  */
> > +static void
> > +do_test (void)
> > +{
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > +
> > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > +  /* Bytes alternate 3, 4, so every complex element has the same image.  */
> > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > +    p_init[i] = i % 2 + 3;
> > +
> > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > +
> > +  foo_ph (ph_dst, ph_src[0]);
> > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > +    __builtin_abort ();
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > new file mode 100644
> > index 00000000000..11054b60d30
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > @@ -0,0 +1,58 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > +/* N is the trip count of every loop below.  foo_pd stores 1.0 + 2.0i to a[0..N-1].  */
> > +#define N 10000
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1.0 + 2.0i;
> > +}
> > +/* Store the constant 1.0f + 2.0fi to each of a[0..N-1].  */
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1.0f + 2.0fi;
> > +}
> > +/* Store the constant 1 + 2i to each complex long long in a[0..N-1].  */
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1 + 2i;
> > +}
> > +/* Store the constant 1 + 2i to each complex int in a[0..N-1].  */
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1 + 2i;
> > +}
> > +/* Store the constant 1 + 2i to each complex short in a[0..N-1].  */
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1 + 2i;
> > +}
> > +/* Store the constant 1 + 2i to each complex char in a[0..N-1].  */
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1 + 2i;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > new file mode 100644
> > index 00000000000..6bb0073b691
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > @@ -0,0 +1,53 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-8a.c"
> > +/* Check that each foo_* stores its constant to all N destination elements.  */
> > +void
> > +avx_test (void)
> > +{
> > +  _Complex double pd_src = 1.0 + 2.0i;
> > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > +  _Complex float ps_src = 1.0f + 2.0fi;
> > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > +  _Complex long long epi64_src = 1 + 2i;
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > +  _Complex int epi32_src = 1 + 2i;
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > +  _Complex short epi16_src = 1 + 2i;
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > +  _Complex char epi8_src = 1 + 2i;
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > +  /* Zero the destinations before the kernels overwrite them.  */
> > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > +
> > +  foo_pd (pd_dst);
> > +  foo_ps (ps_dst);
> > +  foo_epi64 (epi64_dst);
> > +  foo_epi32 (epi32_dst);
> > +  foo_epi16 (epi16_dst);
> > +  foo_epi8 (epi8_dst);
> > +  for (int i = 0 ; i != N; i++)
> > +    {
> > +      if (pd_dst[i] != pd_src)
> > +        __builtin_abort ();
> > +      if (ps_dst[i] != ps_src)
> > +        __builtin_abort ();
> > +      if (epi64_dst[i] != epi64_src)
> > +        __builtin_abort ();
> > +      if (epi32_dst[i] != epi32_src)
> > +        __builtin_abort ();
> > +      if (epi16_dst[i] != epi16_src)
> > +        __builtin_abort ();
> > +      if (epi8_dst[i] != epi8_src)
> > +        __builtin_abort ();
> > +    }
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > new file mode 100644
> > index 00000000000..61ae131829d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > @@ -0,0 +1,38 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +#define N 10000
> > +/* Store the constant 1.0f16 + 2.0f16i to each of a[0..N-1].  */
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1.0f16 + 2.0f16i;
> > +}
> > +/* Check that foo_ph stores 1.0f16 + 2.0f16i to all N elements of ph_dst.  */
> > +static void
> > +do_test (void)
> > +{
> > +  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > +
> > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > +
> > +  foo_ph (ph_dst);
> > +  for (int i = 0; i != N; i++)
> > +    {
> > +      if (ph_dst[i] != ph_src)
> > +       __builtin_abort ();
> > +    }
> > +}
> > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > index d20a10a1524..42ee9df674c 100644
> > --- a/gcc/tree-vect-data-refs.cc
> > +++ b/gcc/tree-vect-data-refs.cc
> > @@ -1403,7 +1403,8 @@ vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
> >    if (PURE_SLP_STMT (stmt_info))
> >      ncopies = 1;
> >    else
> > -    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
> > +    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info),
> > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> >
> >    if (DR_IS_READ (dr_info->dr))
> >      vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> > @@ -4597,8 +4598,22 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> >
> >        /* Set vectype for STMT.  */
> >        scalar_type = TREE_TYPE (DR_REF (dr));
> > -      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
> > -      if (!vectype)
> > +      tree adjust_scalar_type = scalar_type;
> > +      /* Support Complex type access. Note that the complex type of load/store
> > +        does not support gather/scatter.  */
> > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE
> > +         && gatherscatter == SG_NONE)
> > +       {
> > +         adjust_scalar_type = TREE_TYPE (scalar_type);
> > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > +       }
> > +      tree vectype = get_vectype_for_scalar_type (vinfo, adjust_scalar_type);
> > +      unsigned HOST_WIDE_INT constant_nunits;
> > +      if (!vectype
> > +         /* For complex type, V1DI doesn't make sense.  */
> > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > +             && (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&constant_nunits)
> > +                 || constant_nunits == 1)))
> >          {
> >            if (dump_enabled_p ())
> >              {
> > @@ -4635,8 +4650,11 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> >         }
> >
> >        /* Adjust the minimal vectorization factor according to the
> > -        vector type.  */
> > +        vector type. Note for complex type, VF is half of
> > +        TYPE_VECTOR_SUBPARTS.  */
> >        vf = TYPE_VECTOR_SUBPARTS (vectype);
> > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > +       vf = exact_div (vf, 2);
> >        *min_vf = upper_bound (*min_vf, vf);
> >
> >        /* Leave the BB vectorizer to pick the vector type later, based on
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 3a70c15b593..365fa738022 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -200,7 +200,12 @@ vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
> >      }
> >
> >    if (nunits_vectype)
> > -    vect_update_max_nunits (vf, nunits_vectype);
> > +    {
> > +      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (nunits_vectype);
> > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > +       nunits = exact_div (nunits, 2);
> > +      vect_update_max_nunits (vf, nunits);
> > +    }
> >
> >    return opt_result::success ();
> >  }
> > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > index dab5daddcc5..5d66ea2f286 100644
> > --- a/gcc/tree-vect-slp.cc
> > +++ b/gcc/tree-vect-slp.cc
> > @@ -877,10 +877,14 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> >        return false;
> >      }
> >
> > +  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > +    nunits = exact_div (nunits, 2);
> > +
> >    /* If populating the vector type requires unrolling then fail
> >       before adjusting *max_nunits for basic-block vectorization.  */
> >    if (is_a <bb_vec_info> (vinfo)
> > -      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> > +      && !multiple_p (group_size , nunits))
> >      {
> >        if (dump_enabled_p ())
> >         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > @@ -891,7 +895,7 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> >      }
> >
> >    /* In case of multiple types we need to detect the smallest type.  */
> > -  vect_update_max_nunits (max_nunits, vectype);
> > +  vect_update_max_nunits (max_nunits, nunits);
> >    return true;
> >  }
> >
> > @@ -3720,22 +3724,54 @@ vect_optimize_slp (vec_info *vinfo)
> >          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
> >          when permuting constants and invariants keeping the permute
> >          bijective.  */
> > -      auto_sbitmap load_index (SLP_TREE_LANES (node));
> > -      bitmap_clear (load_index);
> > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > -       bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > -      unsigned j;
> > -      for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > -       if (!bitmap_bit_p (load_index, j))
> > -         break;
> > -      if (j != SLP_TREE_LANES (node))
> > -       continue;
> > +      /* Permutation of Complex type.  */
> > +      if (STMT_VINFO_COMPLEX_P (dr_stmt))
> > +       {
> > +         auto_sbitmap load_index (SLP_TREE_LANES (node) * 2);
> > +         bitmap_clear (load_index);
> > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > +           {
> > +             unsigned bit = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > +             bitmap_set_bit (load_index, 2 * bit);
> > +             bitmap_set_bit (load_index, 2 * bit + 1);
> > +           }
> > +         unsigned j;
> > +         for (j = 0; j < SLP_TREE_LANES (node) * 2; ++j)
> > +           if (!bitmap_bit_p (load_index, j))
> > +             break;
> > +         if (j != SLP_TREE_LANES (node) * 2)
> > +           continue;
> >
> > -      vec<unsigned> perm = vNULL;
> > -      perm.safe_grow (SLP_TREE_LANES (node), true);
> > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > -       perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > -      perms.safe_push (perm);
> > +         vec<unsigned> perm = vNULL;
> > +         perm.safe_grow (SLP_TREE_LANES (node) * 2, true);
> > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > +           {
> > +             unsigned cidx = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > +             perm[2 * j] = 2 * cidx;
> > +             perm[2 * j + 1] = 2 * cidx + 1;
> > +           }
> > +         perms.safe_push (perm);
> > +       }
> > +      else
> > +       {
> > +         auto_sbitmap load_index (SLP_TREE_LANES (node));
> > +         bitmap_clear (load_index);
> > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > +           bitmap_set_bit (load_index,
> > +                           SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > +         unsigned j;
> > +         for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > +           if (!bitmap_bit_p (load_index, j))
> > +             break;
> > +         if (j != SLP_TREE_LANES (node))
> > +           continue;
> > +
> > +         vec<unsigned> perm = vNULL;
> > +         perm.safe_grow (SLP_TREE_LANES (node), true);
> > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > +           perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > +         perms.safe_push (perm);
> > +       }
> >        vertices[idx].perm_in = perms.length () - 1;
> >        vertices[idx].perm_out = perms.length () - 1;
> >      }
> > @@ -4518,6 +4554,12 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
> >         vf = loop_vinfo->vectorization_factor;
> >        else
> >         vf = 1;
> > +      /* For complex type and SLP, double vf to get the right vectype,
> > +        i.e. for vector(4) double holding complex double with group size 2,
> > +        double vf to map vf * group_size to TYPE_VECTOR_SUBPARTS.  */
> > +     if (STMT_VINFO_COMPLEX_P (stmt_info))
> > +       vf *= 2;
> > +
> >        unsigned int group_size = SLP_TREE_LANES (node);
> >        tree vectype = SLP_TREE_VECTYPE (node);
> >        SLP_TREE_NUMBER_OF_VEC_STMTS (node)
> > @@ -4763,10 +4805,17 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
> >             }
> >           unsigned group_size = SLP_TREE_LANES (child);
> >           poly_uint64 vf = 1;
> > +
> >           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
> >             vf = loop_vinfo->vectorization_factor;
> > +
> > +         /* V2SF holds just 1 complex value, so multiply by 2
> > +            to get the real number of vectors.  */
> > +         unsigned cp
> > +           = STMT_VINFO_COMPLEX_P (SLP_TREE_REPRESENTATIVE (node)) ? 2 : 1;
> > +
> >           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
> > -           = vect_get_num_vectors (vf * group_size, vector_type);
> > +           = vect_get_num_vectors (vf * group_size * cp, vector_type);
> >           /* And cost them.  */
> >           vect_prologue_cost_for_slp (child, cost_vec);
> >         }
> > @@ -6402,6 +6451,11 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> >
> >    /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
> >    vector_type = SLP_TREE_VECTYPE (op_node);
> > +  unsigned int cp = 1;
> > +  /* Handle Complex type vector init.
> > +     SLP_TREE_REPRESENTATIVE (op_node) could be NULL.  */
> > +  if (TREE_CODE (TREE_TYPE (op_node->ops[0])) == COMPLEX_TYPE)
> > +    cp = 2;
> >
> >    unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
> >    SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
> > @@ -6426,9 +6480,9 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> >    /* When using duplicate_and_interleave, we just need one element for
> >       each scalar statement.  */
> >    if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
> > -    nunits = group_size;
> > +    nunits = group_size * cp;
> >
> > -  number_of_copies = nunits * number_of_vectors / group_size;
> > +  number_of_copies = nunits * number_of_vectors / (group_size * cp);
> >
> >    number_of_places_left_in_vector = nunits;
> >    constant_p = true;
> > @@ -6460,8 +6514,23 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> >                         gcc_unreachable ();
> >                     }
> >                   else
> > -                   op = fold_unary (VIEW_CONVERT_EXPR,
> > -                                    TREE_TYPE (vector_type), op);
> > +                   {
> > +                     tree scalar_type = TREE_TYPE (vector_type);
> > +                     /* For complex type, insert real and imag part
> > +                        separately.  */
> > +                     if (cp == 2)
> > +                       {
> > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > +                                      == COMPLEX_TYPE)
> > +                                     && (scalar_type
> > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > +                         elts[number_of_places_left_in_vector--]
> > +                           = fold_unary (IMAGPART_EXPR, scalar_type, op);
> > +                         op = fold_unary (REALPART_EXPR, scalar_type, op);
> > +                       }
> > +                     else
> > +                       op = fold_unary (VIEW_CONVERT_EXPR, scalar_type, op);
> > +                   }
> >                   gcc_assert (op && CONSTANT_CLASS_P (op));
> >                 }
> >               else
> > @@ -6481,11 +6550,28 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> >                     }
> >                   else
> >                     {
> > -                     op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
> > -                                  op);
> > -                     init_stmt
> > -                       = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > -                                              op);
> > +                     tree scalar_type = TREE_TYPE (vector_type);
> > +                     if (cp == 2)
> > +                       {
> > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > +                                      == COMPLEX_TYPE)
> > +                                     && (scalar_type
> > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > +                         tree imag = build1 (IMAGPART_EXPR, scalar_type, op);
> > +                         op = build1 (REALPART_EXPR, scalar_type, op);
> > +                         tree imag_temp = make_ssa_name (scalar_type);
> > +                         elts[number_of_places_left_in_vector--] = imag_temp;
> > +                         init_stmt = gimple_build_assign (imag_temp, imag);
> > +                         gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > +                         init_stmt = gimple_build_assign (new_temp, op);
> > +                       }
> > +                     else
> > +                       {
> > +                         op = build1 (VIEW_CONVERT_EXPR, scalar_type, op);
> > +                         init_stmt
> > +                           = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > +                                                  op);
> > +                       }
> >                     }
> >                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
> >                   op = new_temp;
> > @@ -6696,15 +6782,17 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> >    unsigned int nelts_to_build;
> >    unsigned int nvectors_per_build;
> >    unsigned int in_nlanes;
> > +  unsigned int cp = STMT_VINFO_COMPLEX_P (stmt_info) ? 2 : 1;
> >    bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
> > -                     && multiple_p (nunits, group_size));
> > +                     && multiple_p (nunits, group_size * cp));
> >    if (repeating_p)
> >      {
> >        /* A single vector contains a whole number of copies of the node, so:
> >          (a) all permutes can use the same mask; and
> >          (b) the permutes only need a single vector input.  */
> > -      mask.new_vector (nunits, group_size, 3);
> > -      nelts_to_build = mask.encoded_nelts ();
> > +      /* For complex type, mask size should be double of nelts_to_build.  */
> > +      mask.new_vector (nunits, group_size * cp, 3);
> > +      nelts_to_build = mask.encoded_nelts () / cp;
> >        nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
> >        in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
> >      }
> > @@ -6744,8 +6832,8 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> >         {
> >           /* Enforced before the loop when !repeating_p.  */
> >           unsigned int const_nunits = nunits.to_constant ();
> > -         vec_index = i / const_nunits;
> > -         mask_element = i % const_nunits;
> > +         vec_index = i / (const_nunits / cp);
> > +         mask_element = i % (const_nunits / cp);
> >           if (vec_index == first_vec_index
> >               || first_vec_index == -1)
> >             {
> > @@ -6755,7 +6843,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> >                    || second_vec_index == -1)
> >             {
> >               second_vec_index = vec_index;
> > -             mask_element += const_nunits;
> > +             mask_element += (const_nunits / cp);
> >             }
> >           else
> >             {
> > @@ -6768,14 +6856,24 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> >               return false;
> >             }
> >
> > -         gcc_assert (mask_element < 2 * const_nunits);
> > +         gcc_assert (mask_element < 2 * const_nunits / cp);
> >         }
> >
> >        if (mask_element != index)
> >         noop_p = false;
> > -      mask[index++] = mask_element;
> > +      /* Set index for Complex _type.
> > +        i.e. mask like [1,0] is actually [2, 3, 0, 1]
> > +        for vector scalar type.  */
> > +      if (cp == 2)
> > +       {
> > +         mask[2 * index] = 2 * mask_element;
> > +         mask[2 * index + 1] = 2 * mask_element + 1;
> > +       }
> > +      else
> > +       mask[index] = mask_element;
> > +      index++;
> >
> > -      if (index == count && !noop_p)
> > +      if (index * cp == count && !noop_p)
> >         {
> >           indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
> >           if (!can_vec_perm_const_p (mode, mode, indices))
> > @@ -6799,7 +6897,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> >           ++*n_perms;
> >         }
> >
> > -      if (index == count)
> > +      if (index * cp == count)
> >         {
> >           if (!analyze_only)
> >             {
> > @@ -6869,7 +6967,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> >           bool load_seen = false;
> >           for (unsigned i = 0; i < in_nlanes; ++i)
> >             {
> > -             if (i % const_nunits == 0)
> > +             if (i % (const_nunits * cp) == 0)
> >                 {
> >                   if (load_seen)
> >                     *n_loads += 1;
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index 72107afc883..8af3b558be4 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -1397,25 +1397,70 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> >  {
> >    gimple *init_stmt;
> >    tree new_temp;
> > +  tree scalar_type = TREE_TYPE (type);
> > +  gimple_seq stmts = NULL;
> > +
> > +  if (TREE_CODE (TREE_TYPE (val)) == COMPLEX_TYPE)
> > +    {
> > +      unsigned HOST_WIDE_INT nunits;
> > +      gcc_assert (TYPE_VECTOR_SUBPARTS (type).is_constant (&nunits));
> >
> > +      tree_vector_builder elts (type, nunits, 1);
> > +      tree imag, real;
> > +      if (TREE_CODE (val) == COMPLEX_CST)
> > +       {
> > +         real = fold_unary (REALPART_EXPR, scalar_type, val);
> > +         imag = fold_unary (IMAGPART_EXPR, scalar_type, val);
> > +       }
> > +      else
> > +       {
> > +         real = make_ssa_name (scalar_type);
> > +         imag = make_ssa_name (scalar_type);
> > +         init_stmt
> > +           = gimple_build_assign (real,
> > +                                  build1 (REALPART_EXPR, scalar_type, val));
> > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > +         init_stmt
> > +           = gimple_build_assign (imag,
> > +                                  build1 (IMAGPART_EXPR, scalar_type, val));
> > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > +       }
> > +
> > +      /* Build vector as [real,imag,real,imag,...].  */
> > +      for (unsigned i = 0; i != nunits; i++)
> > +       {
> > +         if (i % 2)
> > +           elts.quick_push (imag);
> > +         else
> > +           elts.quick_push (real);
> > +       }
> > +      val = gimple_build_vector (&stmts, &elts);
> > +      if (!gimple_seq_empty_p (stmts))
> > +       {
> > +         if (gsi)
> > +           gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > +         else
> > +           vinfo->insert_seq_on_entry (stmt_info, stmts);
> > +       }
> > +    }
> >    /* We abuse this function to push sth to a SSA name with initial 'val'.  */
> > -  if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > +  else if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> >      {
> >        gcc_assert (TREE_CODE (type) == VECTOR_TYPE);
> > -      if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
> > +      if (! types_compatible_p (scalar_type, TREE_TYPE (val)))
> >         {
> >           /* Scalar boolean value should be transformed into
> >              all zeros or all ones value before building a vector.  */
> >           if (VECTOR_BOOLEAN_TYPE_P (type))
> >             {
> > -             tree true_val = build_all_ones_cst (TREE_TYPE (type));
> > -             tree false_val = build_zero_cst (TREE_TYPE (type));
> > +             tree true_val = build_all_ones_cst (scalar_type);
> > +             tree false_val = build_zero_cst (scalar_type);
> >
> >               if (CONSTANT_CLASS_P (val))
> >                 val = integer_zerop (val) ? false_val : true_val;
> >               else
> >                 {
> > -                 new_temp = make_ssa_name (TREE_TYPE (type));
> > +                 new_temp = make_ssa_name (scalar_type);
> >                   init_stmt = gimple_build_assign (new_temp, COND_EXPR,
> >                                                    val, true_val, false_val);
> >                   vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
> > @@ -1424,14 +1469,13 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> >             }
> >           else
> >             {
> > -             gimple_seq stmts = NULL;
> >               if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
> >                 val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
> > -                                   TREE_TYPE (type), val);
> > +                                   scalar_type, val);
> >               else
> >                 /* ???  Condition vectorization expects us to do
> >                    promotion of invariant/external defs.  */
> > -               val = gimple_convert (&stmts, TREE_TYPE (type), val);
> > +               val = gimple_convert (&stmts, scalar_type, val);
> >               for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
> >                    !gsi_end_p (gsi2); )
> >                 {
> > @@ -1496,7 +1540,12 @@ vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> >                && VECTOR_BOOLEAN_TYPE_P (stmt_vectype))
> >         vector_type = truth_type_for (stmt_vectype);
> >        else
> > -       vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
> > +       {
> > +         tree scalar_type = TREE_TYPE (op);
> > +         if (STMT_VINFO_COMPLEX_P (stmt_vinfo))
> > +           scalar_type = TREE_TYPE (scalar_type);
> > +         vector_type = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
> > +       }
> >
> >        gcc_assert (vector_type);
> >        tree vop = vect_init_vector (vinfo, stmt_vinfo, op, vector_type, NULL);
> > @@ -7509,8 +7558,17 @@ vectorizable_store (vec_info *vinfo,
> >       same location twice.  */
> >    gcc_assert (slp == PURE_SLP_STMT (stmt_info));
> >
> > +  if (!STMT_VINFO_DATA_REF (stmt_info))
> > +    return false;
> > +
> >    tree vectype = STMT_VINFO_VECTYPE (stmt_info), rhs_vectype = NULL_TREE;
> >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > +    {
> > +      if (!nunits.is_constant ())
> > +       return false;
> > +      nunits = exact_div (nunits, 2);
> > +    }
> >
> >    if (loop_vinfo)
> >      {
> > @@ -7526,7 +7584,8 @@ vectorizable_store (vec_info *vinfo,
> >    if (slp)
> >      ncopies = 1;
> >    else
> > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> >
> >    gcc_assert (ncopies >= 1);
> >
> > @@ -7546,9 +7605,6 @@ vectorizable_store (vec_info *vinfo,
> >    elem_type = TREE_TYPE (vectype);
> >    vec_mode = TYPE_MODE (vectype);
> >
> > -  if (!STMT_VINFO_DATA_REF (stmt_info))
> > -    return false;
> > -
> >    vect_memory_access_type memory_access_type;
> >    enum dr_alignment_support alignment_support_scheme;
> >    int misalignment;
> > @@ -8778,6 +8834,12 @@ vectorizable_load (vec_info *vinfo,
> >
> >    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > +    {
> > +      if (!nunits.is_constant ())
> > +       return false;
> > +      nunits = exact_div (nunits, 2);
> > +    }
> >
> >    if (loop_vinfo)
> >      {
> > @@ -8794,7 +8856,8 @@ vectorizable_load (vec_info *vinfo,
> >    if (slp)
> >      ncopies = 1;
> >    else
> > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> >
> >    gcc_assert (ncopies >= 1);
> >
> > @@ -8870,8 +8933,11 @@ vectorizable_load (vec_info *vinfo,
> >                 if (k > maxk)
> >                   maxk = k;
> >               tree vectype = SLP_TREE_VECTYPE (slp_node);
> > +             /* For complex type, half the nunits.  */
> >               if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> > -                 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> > +                 || maxk >= (DR_GROUP_SIZE (group_info)
> > +                             & ~((STMT_VINFO_COMPLEX_P (group_info)
> > +                                  ? nunits >> 1 : nunits) - 1)))
> >                 {
> >                   if (dump_enabled_p ())
> >                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > @@ -12499,12 +12565,27 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> >             dump_printf_loc (MSG_NOTE, vect_location,
> >                              "get vectype for scalar type: %T\n", scalar_type);
> >         }
> > +
> > +      tree orig_scalar_type = scalar_type;
> > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > +       {
> > +         /* Set complex_p for BB vectorizer.  */
> > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > +         scalar_type = TREE_TYPE (scalar_type);
> > +         /* Double group_size for BB vectorizer to make
> > +            following 2 get_vectype_for_scalar_type return wanted vectype.
> > +            Real group size is not changed, just make the "faked" input
> > +            group_size.  */
> > +         group_size *= 2;
> > +       }
> >        vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
> > -      if (!vectype)
> > +      if (!vectype
> > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > +             && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()))
> >         return opt_result::failure_at (stmt,
> >                                        "not vectorized:"
> >                                        " unsupported data-type %T\n",
> > -                                      scalar_type);
> > +                                      orig_scalar_type);
> >
> >        if (dump_enabled_p ())
> >         dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
> > @@ -12529,16 +12610,30 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> >                                                    TREE_TYPE (vectype));
> >        if (scalar_type != TREE_TYPE (vectype))
> >         {
> > -         if (dump_enabled_p ())
> > +         tree orig_scalar_type = scalar_type;
> > +         if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > +           {
> > +             /* Set complex_p for Loop vectorizer.  */
> > +             STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > +             scalar_type = TREE_TYPE (scalar_type);
> > +             if (dump_enabled_p ())
> > +               dump_printf_loc (MSG_NOTE, vect_location,
> > +                            "get complex for smallest scalar type: %T\n",
> > +                            scalar_type);
> > +
> > +           }
> > +         else if (dump_enabled_p ())
> >             dump_printf_loc (MSG_NOTE, vect_location,
> >                              "get vectype for smallest scalar type: %T\n",
> >                              scalar_type);
> >           nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
> >                                                         group_size);
> > -         if (!nunits_vectype)
> > +         if (!nunits_vectype
> > +             || (STMT_VINFO_COMPLEX_P (stmt_info)
> > +                 && !TYPE_VECTOR_SUBPARTS (nunits_vectype).is_constant ()))
> >             return opt_result::failure_at
> >               (stmt, "not vectorized: unsupported data-type %T\n",
> > -              scalar_type);
> > +              orig_scalar_type);
> >           if (dump_enabled_p ())
> >             dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
> >                              nunits_vectype);
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > index e5fdc9e0a14..4a809e492c4 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -1161,6 +1161,9 @@ public:
> >       vectorization.  */
> >    bool vectorizable;
> >
> > +  /* The scalar type of the LHS of this statement is complex type.  */
> > +  bool complex_p;
> > +
> >    /* The stmt to which this info struct refers to.  */
> >    gimple *stmt;
> >
> > @@ -1395,6 +1398,7 @@ struct gather_scatter_info {
> >  #define STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT(S) (S)->reduc_epilogue_adjustment
> >  #define STMT_VINFO_REDUC_IDX(S)                   (S)->reduc_idx
> >  #define STMT_VINFO_FORCE_SINGLE_CYCLE(S)   (S)->force_single_cycle
> > +#define STMT_VINFO_COMPLEX_P(S)            (S)->complex_p
> >
> >  #define STMT_VINFO_DR_WRT_VEC_LOOP(S)      (S)->dr_wrt_vec_loop
> >  #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_wrt_vec_loop.base_address
> > @@ -1970,6 +1974,15 @@ vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype)
> >    return vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo), vectype);
> >  }
> >
> > +static inline unsigned int
> > +vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype, bool complex_p)
> > +{
> > +  poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > +  if (complex_p)
> > +    nunits *= 2;
> > +  return vect_get_num_vectors (nunits, vectype);
> > +}
> > +
> >  /* Update maximum unit count *MAX_NUNITS so that it accounts for
> >     NUNITS.  *MAX_NUNITS can be 1 if we haven't yet recorded anything.  */
> >
> > --
> > 2.18.1
> >
Richard Biener July 12, 2022, 2:12 p.m. UTC | #4
On Tue, Jul 12, 2022 at 6:11 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, Jul 11, 2022 at 7:47 PM Richard Biener via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Mon, Jul 11, 2022 at 5:44 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > The patch only handles load/store(including ctor/permutation, except
> > > gather/scatter) for complex type, other operations don't needs to be
> > > handled since they will be lowered by pass cplxlower.(MASK_LOAD is not
> > > supported for complex type, so no need to handle either).
> >
> > (*)
> >
> > > Instead of support vector(2) _Complex double, this patch takes vector(4)
> > > double as vector type of _Complex double. Since vectorizer originally
> > > takes TYPE_VECTOR_SUBPARTS as nunits which is not true for complex
> > > type, the patch handles nunits/ncopies/vf specially for complex type.
> >
> > For the limited set above(*) can you explain what's "special" about
> > vector(2) _Complex
> > vs. vector(4) double, thus why we need to have STMT_VINFO_COMPLEX_P at all?
> Supporting a vector(2) complex is a straightforward idea, just like
> supporting other scalar types in the vectorizer, but it requires more
> effort (in the backend and frontend). Considering that most
> operations on complex types will be lowered into realpart and imagpart
> operations, supporting a vector(2) complex does not look that
> necessary. That led to the idea of supporting vector(4) double (with
> adjustment of vf/ctor/permutation), so the vectorizer only needs to
> handle the vectorization of the move operation of the complex type (no
> need to worry about wrongly mapping vector(4) double multiplication to
> complex type multiplication since it's already lowered before the
> vectorizer).
> stmt_info does not record the scalar type. To avoid duplicate
> operations, like getting the lhs type from the stmt to determine whether
> it is a complex type, the STMT_VINFO_COMPLEX_P bit is added. This bit is
> mainly initialized in vect_analyze_data_refs and
> vect_get_vector_types_for_stmt.
> >
> > I wonder to what extent your handling can be extended to support re-vectorizing
> > (with a higher VF for example) already vectorized code?  The vectorizer giving
> > up on vector(2) double looks quite obviously similar to it giving up
> > on _Complex double ...
> Yes, it can be extended to vector(2) double/float/int/... with a bit of
> adjustment (extracting elements using bit_field instead of
> imagpart_expr/realpart_expr).
> > It would be a shame to not use the same underlying mechanism for dealing with
> > both, where for the vector case obviously vector(4) would be supported as well.
> >
> > In principle _Complex double operations should be two SLP lanes but it seems you
> > are handling them with classical interleaving as well?
> I'm only handling move operations, for other operations it will be
> lowered to realpart and imagpart and thus two SLP lanes.

Yes, I understood that.

Doing it more generally (and IMHO better) would involve enhancing
how we represent dataref groups, maintaining the number of scalars
covered by each of the vinfos.  On the SLP representation side it
probably requires relying on the representative for access and not
on the scalar stmts (since those do not map properly to the lanes).

Ideally we'd be able to handle

struct { _Complex double c; double a; double b; } a[], b[];

void foo ()
{
   for (int i = 0; i < 100; ++i)
    {
      a[i].c = b[i].c;
      a[i].a = b[i].a;
      a[i].b = b[i].b;
    }
}

which I guess your patch doesn't handle with plain AVX vector
copies but instead uses interleaving for the _Complex and non-_Complex
parts?

Let me spend some time fleshing out what is necessary to make
this work "properly".  We can consider your special-casing of _Complex
memory ops if I can't manage to assess the complexity of the task.

Thanks,
Richard.

> >
> > Thanks,
> > Richard.
> >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > Also test the patch for SPEC2017 and find there's complex type vectorization
> > > in 510/549(but no performance impact).
> > >
> > > Any comments?
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR tree-optimization/106010
> > >         * tree-vect-data-refs.cc (vect_get_data_access_cost):
> > >         Pass complex_p to vect_get_num_copies to avoid ICE.
> > >         (vect_analyze_data_refs): Support vectorization for Complex
> > >         type with vector scalar types.
> > >         * tree-vect-loop.cc (vect_determine_vf_for_stmt_1): VF should
> > >         be half of TYPE_VECTOR_SUBPARTS when complex_p.
> > >         * tree-vect-slp.cc (vect_record_max_nunits): nunits should be
> > >         half of TYPE_VECTOR_SUBPARTS when complex_p.
> > >         (vect_optimize_slp): Support permutation for complex type.
> > >         (vect_slp_analyze_node_operations_1): Double nunits in
> > >         vect_get_num_vectors to get right SLP_TREE_NUMBER_OF_VEC_STMTS
> > >         when complex_p.
> > >         (vect_slp_analyze_node_operations): Ditto.
> > >         (vect_create_constant_vectors): Support CTOR for complex type.
> > >         (vect_transform_slp_perm_load): Support permutation for
> > >         complex type.
> > >         * tree-vect-stmts.cc (vect_init_vector): Support complex type.
> > >         (vect_get_vec_defs_for_operand): Get vector type for
> > >         complex type.
> > >         (vectorizable_store): Get right ncopies/nunits for complex
> > >         type, also return false when complex_p and
> > >         !TYPE_VECTOR_SUBPARTS.is_constant ().
> > >         (vectorizable_load): Ditto.
> > >         (vect_get_vector_types_for_stmt): Get vector type for complex type.
> > >         * tree-vectorizer.h (STMT_VINFO_COMPLEX_P): New macro.
> > >         (vect_get_num_copies): New overload.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/i386/pr106010-1a.c: New test.
> > >         * gcc.target/i386/pr106010-1b.c: New test.
> > >         * gcc.target/i386/pr106010-1c.c: New test.
> > >         * gcc.target/i386/pr106010-2a.c: New test.
> > >         * gcc.target/i386/pr106010-2b.c: New test.
> > >         * gcc.target/i386/pr106010-2c.c: New test.
> > >         * gcc.target/i386/pr106010-3a.c: New test.
> > >         * gcc.target/i386/pr106010-3b.c: New test.
> > >         * gcc.target/i386/pr106010-3c.c: New test.
> > >         * gcc.target/i386/pr106010-4a.c: New test.
> > >         * gcc.target/i386/pr106010-4b.c: New test.
> > >         * gcc.target/i386/pr106010-4c.c: New test.
> > >         * gcc.target/i386/pr106010-5a.c: New test.
> > >         * gcc.target/i386/pr106010-5b.c: New test.
> > >         * gcc.target/i386/pr106010-5c.c: New test.
> > >         * gcc.target/i386/pr106010-6a.c: New test.
> > >         * gcc.target/i386/pr106010-6b.c: New test.
> > >         * gcc.target/i386/pr106010-6c.c: New test.
> > >         * gcc.target/i386/pr106010-7a.c: New test.
> > >         * gcc.target/i386/pr106010-7b.c: New test.
> > >         * gcc.target/i386/pr106010-7c.c: New test.
> > >         * gcc.target/i386/pr106010-8a.c: New test.
> > >         * gcc.target/i386/pr106010-8b.c: New test.
> > >         * gcc.target/i386/pr106010-8c.c: New test.
> > > ---
> > >  gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 +++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 +++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +++++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 +++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 +++++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 ++++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 ++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 ++++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 ++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 +++++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 +++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 +++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 +++++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 +++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 +++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 +++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 ++++++
> > >  gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
> > >  gcc/tree-vect-data-refs.cc                  |  26 ++-
> > >  gcc/tree-vect-loop.cc                       |   7 +-
> > >  gcc/tree-vect-slp.cc                        | 174 +++++++++++++++-----
> > >  gcc/tree-vect-stmts.cc                      | 135 ++++++++++++---
> > >  gcc/tree-vectorizer.h                       |  13 ++
> > >  29 files changed, 2064 insertions(+), 63 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > >
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > new file mode 100644
> > > index 00000000000..b608f484934
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > @@ -0,0 +1,58 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> > > +
> > > +#define N 10000
> > > +void
> > > +__attribute__((noipa))
> > > +foo_pd (_Complex double* a, _Complex double* b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b[i];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ps (_Complex float* a, _Complex float* b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b[i];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi64 (_Complex long long* a, _Complex long long* b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b[i];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi32 (_Complex int* a, _Complex int* b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b[i];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi16 (_Complex short* a, _Complex short* b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b[i];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi8 (_Complex char* a, _Complex char* b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b[i];
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > new file mode 100644
> > > index 00000000000..0f377c3a548
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > @@ -0,0 +1,63 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > +/* { dg-require-effective-target avx } */
> > > +
> > > +#include "avx-check.h"
> > > +#include <string.h>
> > > +#include "pr106010-1a.c"
> > > +
> > > +void
> > > +avx_test (void)
> > > +{
> > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > +
> > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > +
> > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > +    p_init[i] = i;
> > > +
> > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > > +
> > > +  foo_pd (pd_dst, pd_src);
> > > +  foo_ps (ps_dst, ps_src);
> > > +  foo_epi64 (epi64_dst, epi64_src);
> > > +  foo_epi32 (epi32_dst, epi32_src);
> > > +  foo_epi16 (epi16_dst, epi16_src);
> > > +  foo_epi8 (epi8_dst, epi8_src);
> > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > +    __builtin_abort ();
> > > +
> > > +  return;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > new file mode 100644
> > > index 00000000000..f07e9fb2d3d
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > @@ -0,0 +1,41 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> > > +/* { dg-require-effective-target avx512fp16 } */
> > > +
> > > +#include <string.h>
> > > +
> > > +static void do_test (void);
> > > +
> > > +#define DO_TEST do_test
> > > +#define AVX512FP16
> > > +#include "avx512-check.h"
> > > +
> > > +#define N 10000
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b[i];
> > > +}
> > > +
> > > +static void
> > > +do_test (void)
> > > +{
> > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > +
> > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > +
> > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > +    p_init[i] = i;
> > > +
> > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > +
> > > +  foo_ph (ph_dst, ph_src);
> > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > +    __builtin_abort ();
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > new file mode 100644
> > > index 00000000000..d2e2f8d4f43
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > @@ -0,0 +1,82 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > +{
> > > +  a[0] = b[0];
> > > +  a[1] = b[1];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > +{
> > > +  a[0] = b[0];
> > > +  a[1] = b[1];
> > > +  a[2] = b[2];
> > > +  a[3] = b[3];
> > > +
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > +{
> > > +  a[0] = b[0];
> > > +  a[1] = b[1];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > +{
> > > +  a[0] = b[0];
> > > +  a[1] = b[1];
> > > +  a[2] = b[2];
> > > +  a[3] = b[3];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > +{
> > > +  a[0] = b[0];
> > > +  a[1] = b[1];
> > > +  a[2] = b[2];
> > > +  a[3] = b[3];
> > > +  a[4] = b[4];
> > > +  a[5] = b[5];
> > > +  a[6] = b[6];
> > > +  a[7] = b[7];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > +{
> > > +  a[0] = b[0];
> > > +  a[1] = b[1];
> > > +  a[2] = b[2];
> > > +  a[3] = b[3];
> > > +  a[4] = b[4];
> > > +  a[5] = b[5];
> > > +  a[6] = b[6];
> > > +  a[7] = b[7];
> > > +  a[8] = b[8];
> > > +  a[9] = b[9];
> > > +  a[10] = b[10];
> > > +  a[11] = b[11];
> > > +  a[12] = b[12];
> > > +  a[13] = b[13];
> > > +  a[14] = b[14];
> > > +  a[15] = b[15];
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > new file mode 100644
> > > index 00000000000..ac360752693
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > @@ -0,0 +1,62 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > +/* { dg-require-effective-target avx } */
> > > +
> > > +#include "avx-check.h"
> > > +#include <string.h>
> > > +#include "pr106010-2a.c"
> > > +
> > > +void
> > > +avx_test (void)
> > > +{
> > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > +  char* p = (char* ) malloc (32);
> > > +
> > > +  __builtin_memset (pd_dst, 0, 32);
> > > +  __builtin_memset (ps_dst, 0, 32);
> > > +  __builtin_memset (epi64_dst, 0, 32);
> > > +  __builtin_memset (epi32_dst, 0, 32);
> > > +  __builtin_memset (epi16_dst, 0, 32);
> > > +  __builtin_memset (epi8_dst, 0, 32);
> > > +
> > > +  for (int i = 0; i != 32; i++)
> > > +    p[i] = i;
> > > +  __builtin_memcpy (pd_src, p, 32);
> > > +  __builtin_memcpy (ps_src, p, 32);
> > > +  __builtin_memcpy (epi64_src, p, 32);
> > > +  __builtin_memcpy (epi32_src, p, 32);
> > > +  __builtin_memcpy (epi16_src, p, 32);
> > > +  __builtin_memcpy (epi8_src, p, 32);
> > > +
> > > +  foo_pd (pd_dst, pd_src);
> > > +  foo_ps (ps_dst, ps_src);
> > > +  foo_epi64 (epi64_dst, epi64_src);
> > > +  foo_epi32 (epi32_dst, epi32_src);
> > > +  foo_epi16 (epi16_dst, epi16_src);
> > > +  foo_epi8 (epi8_dst, epi8_src);
> > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +
> > > +  return;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > new file mode 100644
> > > index 00000000000..a002f209ec9
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > @@ -0,0 +1,47 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > +/* { dg-require-effective-target avx512fp16 } */
> > > +
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > +
> > > +#include <string.h>
> > > +
> > > +static void do_test (void);
> > > +#define DO_TEST do_test
> > > +#define AVX512FP16
> > > +#include "avx512-check.h"
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > +{
> > > +  a[0] = b[0];
> > > +  a[1] = b[1];
> > > +  a[2] = b[2];
> > > +  a[3] = b[3];
> > > +  a[4] = b[4];
> > > +  a[5] = b[5];
> > > +  a[6] = b[6];
> > > +  a[7] = b[7];
> > > +}
> > > +
> > > +void
> > > +do_test (void)
> > > +{
> > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > +  char* p = (char* ) malloc (32);
> > > +
> > > +   __builtin_memset (ph_dst, 0, 32);
> > > +
> > > +  for (int i = 0; i != 32; i++)
> > > +    p[i] = i;
> > > +  __builtin_memcpy (ph_src, p, 32);
> > > +
> > > +  foo_ph (ph_dst, ph_src);
> > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +
> > > +  return;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > new file mode 100644
> > > index 00000000000..c1b64b56b1c
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > @@ -0,0 +1,80 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > +{
> > > +  a[0] = b[1];
> > > +  a[1] = b[0];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > +{
> > > +  a[0] = b[1];
> > > +  a[1] = b[0];
> > > +  a[2] = b[3];
> > > +  a[3] = b[2];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > +{
> > > +  a[0] = b[1];
> > > +  a[1] = b[0];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > +{
> > > +  a[0] = b[3];
> > > +  a[1] = b[2];
> > > +  a[2] = b[1];
> > > +  a[3] = b[0];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > +{
> > > +  a[0] = b[7];
> > > +  a[1] = b[6];
> > > +  a[2] = b[5];
> > > +  a[3] = b[4];
> > > +  a[4] = b[3];
> > > +  a[5] = b[2];
> > > +  a[6] = b[1];
> > > +  a[7] = b[0];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > +{
> > > +  a[0] = b[7];
> > > +  a[1] = b[6];
> > > +  a[2] = b[5];
> > > +  a[3] = b[4];
> > > +  a[4] = b[3];
> > > +  a[5] = b[2];
> > > +  a[6] = b[1];
> > > +  a[7] = b[0];
> > > +  a[8] = b[15];
> > > +  a[9] = b[14];
> > > +  a[10] = b[13];
> > > +  a[11] = b[12];
> > > +  a[12] = b[11];
> > > +  a[13] = b[10];
> > > +  a[14] = b[9];
> > > +  a[15] = b[8];
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > new file mode 100644
> > > index 00000000000..e4fa3f3a541
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > @@ -0,0 +1,126 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > +/* { dg-require-effective-target avx2 } */
> > > +
> > > +#include "avx2-check.h"
> > > +#include <string.h>
> > > +#include "pr106010-3a.c"
> > > +
> > > +void
> > > +avx2_test (void)
> > > +{
> > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > +  _Complex double* pd_exp = (_Complex double*) malloc (32);
> > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > +  _Complex float* ps_exp = (_Complex float*) malloc (32);
> > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > +  _Complex int* epi32_exp = (_Complex int*) malloc (32);
> > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > +  _Complex short* epi16_exp = (_Complex short*) malloc (32);
> > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > +  _Complex char* epi8_exp = (_Complex char*) malloc (32);
> > > +  char* p = (char* ) malloc (32);
> > > +  char* q = (char* ) malloc (32);
> > > +
> > > +  __builtin_memset (pd_dst, 0, 32);
> > > +  __builtin_memset (ps_dst, 0, 32);
> > > +  __builtin_memset (epi64_dst, 0, 32);
> > > +  __builtin_memset (epi32_dst, 0, 32);
> > > +  __builtin_memset (epi16_dst, 0, 32);
> > > +  __builtin_memset (epi8_dst, 0, 32);
> > > +
> > > +  for (int i = 0; i != 32; i++)
> > > +    p[i] = i;
> > > +  __builtin_memcpy (pd_src, p, 32);
> > > +  __builtin_memcpy (ps_src, p, 32);
> > > +  __builtin_memcpy (epi64_src, p, 32);
> > > +  __builtin_memcpy (epi32_src, p, 32);
> > > +  __builtin_memcpy (epi16_src, p, 32);
> > > +  __builtin_memcpy (epi8_src, p, 32);
> > > +
> > > +  for (int i = 0; i != 16; i++)
> > > +    {
> > > +      p[i] = i + 16;
> > > +      p[i + 16] = i;
> > > +    }
> > > +  __builtin_memcpy (pd_exp, p, 32);
> > > +  __builtin_memcpy (epi64_exp, p, 32);
> > > +
> > > +  for (int i = 0; i != 8; i++)
> > > +    {
> > > +      p[i] = i + 8;
> > > +      p[i + 8] = i;
> > > +      p[i + 16] = i + 24;
> > > +      p[i + 24] = i + 16;
> > > +      q[i] = i + 24;
> > > +      q[i + 8] = i + 16;
> > > +      q[i + 16] = i + 8;
> > > +      q[i + 24] = i;
> > > +    }
> > > +  __builtin_memcpy (ps_exp, p, 32);
> > > +  __builtin_memcpy (epi32_exp, q, 32);
> > > +
> > > +
> > > +  for (int i = 0; i != 4; i++)
> > > +    {
> > > +      q[i] = i + 28;
> > > +      q[i + 4] = i + 24;
> > > +      q[i + 8] = i + 20;
> > > +      q[i + 12] = i + 16;
> > > +      q[i + 16] = i + 12;
> > > +      q[i + 20] = i + 8;
> > > +      q[i + 24] = i + 4;
> > > +      q[i + 28] = i;
> > > +    }
> > > +  __builtin_memcpy (epi16_exp, q, 32);
> > > +
> > > +  for (int i = 0; i != 2; i++)
> > > +    {
> > > +      q[i] = i + 14;
> > > +      q[i + 2] = i + 12;
> > > +      q[i + 4] = i + 10;
> > > +      q[i + 6] = i + 8;
> > > +      q[i + 8] = i + 6;
> > > +      q[i + 10] = i + 4;
> > > +      q[i + 12] = i + 2;
> > > +      q[i + 14] = i;
> > > +      q[i + 16] = i + 30;
> > > +      q[i + 18] = i + 28;
> > > +      q[i + 20] = i + 26;
> > > +      q[i + 22] = i + 24;
> > > +      q[i + 24] = i + 22;
> > > +      q[i + 26] = i + 20;
> > > +      q[i + 28] = i + 18;
> > > +      q[i + 30] = i + 16;
> > > +    }
> > > +  __builtin_memcpy (epi8_exp, q, 32);
> > > +
> > > +  foo_pd (pd_dst, pd_src);
> > > +  foo_ps (ps_dst, ps_src);
> > > +  foo_epi64 (epi64_dst, epi64_src);
> > > +  foo_epi32 (epi32_dst, epi32_src);
> > > +  foo_epi16 (epi16_dst, epi16_src);
> > > +  foo_epi8 (epi8_dst, epi8_src);
> > > +  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> > > +    __builtin_abort ();
> > > +
> > > +  return;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > new file mode 100644
> > > index 00000000000..5a5a3d4b992
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > @@ -0,0 +1,69 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > +/* { dg-require-effective-target avx512fp16 } */
> > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
> > > +
> > > +#include <string.h>
> > > +
> > > +static void do_test (void);
> > > +#define DO_TEST do_test
> > > +#define AVX512FP16
> > > +#include "avx512-check.h"
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > +{
> > > +  a[0] = b[1];
> > > +  a[1] = b[0];
> > > +  a[2] = b[4];
> > > +  a[3] = b[3];
> > > +  a[4] = b[7];
> > > +  a[5] = b[6];
> > > +  a[6] = b[2];
> > > +  a[7] = b[5];
> > > +}
> > > +
> > > +void
> > > +do_test (void)
> > > +{
> > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> > > +  char* p = (char* ) malloc (32);
> > > +  char* q = (char* ) malloc (32);
> > > +
> > > +  __builtin_memset (ph_dst, 0, 32);
> > > +
> > > +  for (int i = 0; i != 32; i++)
> > > +    p[i] = i;
> > > +  __builtin_memcpy (ph_src, p, 32);
> > > +
> > > +  for (int i = 0; i != 4; i++)
> > > +    {
> > > +      p[i] = i + 4;
> > > +      p[i + 4] = i;
> > > +      p[i + 8] = i + 16;
> > > +      p[i + 12] = i + 12;
> > > +      p[i + 16] = i + 28;
> > > +      p[i + 20] = i + 24;
> > > +      p[i + 24] = i + 8;
> > > +      p[i + 28] = i + 20;
> > > +      q[i] = i + 28;
> > > +      q[i + 4] = i + 24;
> > > +      q[i + 8] = i + 20;
> > > +      q[i + 12] = i + 16;
> > > +      q[i + 16] = i + 12;
> > > +      q[i + 20] = i + 8;
> > > +      q[i + 24] = i + 4;
> > > +      q[i + 28] = i;
> > > +    }
> > > +  __builtin_memcpy (ph_exp, p, 32);
> > > +
> > > +  foo_ph (ph_dst, ph_src);
> > > +  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> > > +    __builtin_abort ();
> > > +
> > > +  return;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > new file mode 100644
> > > index 00000000000..b7b0b532bb1
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > @@ -0,0 +1,101 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_pd (_Complex double* a,
> > > +       _Complex double b1,
> > > +       _Complex double b2)
> > > +{
> > > +  a[0] = b1;
> > > +  a[1] = b2;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ps (_Complex float* a,
> > > +       _Complex float b1, _Complex float b2,
> > > +       _Complex float b3, _Complex float b4)
> > > +{
> > > +  a[0] = b1;
> > > +  a[1] = b2;
> > > +  a[2] = b3;
> > > +  a[3] = b4;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi64 (_Complex long long* a,
> > > +          _Complex long long b1,
> > > +          _Complex long long b2)
> > > +{
> > > +  a[0] = b1;
> > > +  a[1] = b2;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi32 (_Complex int* a,
> > > +          _Complex int b1, _Complex int b2,
> > > +          _Complex int b3, _Complex int b4)
> > > +{
> > > +  a[0] = b1;
> > > +  a[1] = b2;
> > > +  a[2] = b3;
> > > +  a[3] = b4;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi16 (_Complex short* a,
> > > +          _Complex short b1, _Complex short b2,
> > > +          _Complex short b3, _Complex short b4,
> > > +          _Complex short b5, _Complex short b6,
> > > +          _Complex short b7,_Complex short b8)
> > > +{
> > > +  a[0] = b1;
> > > +  a[1] = b2;
> > > +  a[2] = b3;
> > > +  a[3] = b4;
> > > +  a[4] = b5;
> > > +  a[5] = b6;
> > > +  a[6] = b7;
> > > +  a[7] = b8;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi8 (_Complex char* a,
> > > +         _Complex char b1, _Complex char b2,
> > > +         _Complex char b3, _Complex char b4,
> > > +         _Complex char b5, _Complex char b6,
> > > +         _Complex char b7,_Complex char b8,
> > > +         _Complex char b9, _Complex char b10,
> > > +         _Complex char b11, _Complex char b12,
> > > +         _Complex char b13, _Complex char b14,
> > > +         _Complex char b15,_Complex char b16)
> > > +{
> > > +  a[0] = b1;
> > > +  a[1] = b2;
> > > +  a[2] = b3;
> > > +  a[3] = b4;
> > > +  a[4] = b5;
> > > +  a[5] = b6;
> > > +  a[6] = b7;
> > > +  a[7] = b8;
> > > +  a[8] = b9;
> > > +  a[9] = b10;
> > > +  a[10] = b11;
> > > +  a[11] = b12;
> > > +  a[12] = b13;
> > > +  a[13] = b14;
> > > +  a[14] = b15;
> > > +  a[15] = b16;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > new file mode 100644
> > > index 00000000000..e2e79508c4b
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > @@ -0,0 +1,67 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > +/* { dg-require-effective-target avx } */
> > > +
> > > +#include "avx-check.h"
> > > +#include <string.h>
> > > +#include "pr106010-4a.c"
> > > +
> > > +void
> > > +avx_test (void)
> > > +{
> > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > +  char* p = (char* ) malloc (32);
> > > +
> > > +  __builtin_memset (pd_dst, 0, 32);
> > > +  __builtin_memset (ps_dst, 0, 32);
> > > +  __builtin_memset (epi64_dst, 0, 32);
> > > +  __builtin_memset (epi32_dst, 0, 32);
> > > +  __builtin_memset (epi16_dst, 0, 32);
> > > +  __builtin_memset (epi8_dst, 0, 32);
> > > +
> > > +  for (int i = 0; i != 32; i++)
> > > +    p[i] = i;
> > > +  __builtin_memcpy (pd_src, p, 32);
> > > +  __builtin_memcpy (ps_src, p, 32);
> > > +  __builtin_memcpy (epi64_src, p, 32);
> > > +  __builtin_memcpy (epi32_src, p, 32);
> > > +  __builtin_memcpy (epi16_src, p, 32);
> > > +  __builtin_memcpy (epi8_src, p, 32);
> > > +
> > > +  foo_pd (pd_dst, pd_src[0], pd_src[1]);
> > > +  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> > > +  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> > > +  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> > > +  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> > > +            epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> > > +  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> > > +           epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> > > +           epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> > > +           epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> > > +
> > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +
> > > +  return;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > new file mode 100644
> > > index 00000000000..8e02aefe3b5
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > @@ -0,0 +1,54 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > +/* { dg-require-effective-target avx512fp16 } */
> > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> > > +
> > > +#include <string.h>
> > > +
> > > +static void do_test (void);
> > > +#define DO_TEST do_test
> > > +#define AVX512FP16
> > > +#include "avx512-check.h"
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ph (_Complex _Float16* a,
> > > +       _Complex _Float16 b1, _Complex _Float16 b2,
> > > +       _Complex _Float16 b3, _Complex _Float16 b4,
> > > +       _Complex _Float16 b5, _Complex _Float16 b6,
> > > +       _Complex _Float16 b7,_Complex _Float16 b8)
> > > +{
> > > +  a[0] = b1;
> > > +  a[1] = b2;
> > > +  a[2] = b3;
> > > +  a[3] = b4;
> > > +  a[4] = b5;
> > > +  a[5] = b6;
> > > +  a[6] = b7;
> > > +  a[7] = b8;
> > > +}
> > > +
> > > +void
> > > +do_test (void)
> > > +{
> > > +
> > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > +
> > > +  char* p = (char* ) malloc (32);
> > > +
> > > +  __builtin_memset (ph_dst, 0, 32);
> > > +
> > > +  for (int i = 0; i != 32; i++)
> > > +    p[i] = i;
> > > +
> > > +  __builtin_memcpy (ph_src, p, 32);
> > > +
> > > +  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> > > +         ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> > > +
> > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > +    __builtin_abort ();
> > > +  return;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > new file mode 100644
> > > index 00000000000..9d4a6f9846b
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > @@ -0,0 +1,117 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > +{
> > > +  a[0] = b[2];
> > > +  a[1] = b[3];
> > > +  a[2] = b[0];
> > > +  a[3] = b[1];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > +{
> > > +  a[0] = b[4];
> > > +  a[1] = b[5];
> > > +  a[2] = b[6];
> > > +  a[3] = b[7];
> > > +  a[4] = b[0];
> > > +  a[5] = b[1];
> > > +  a[6] = b[2];
> > > +  a[7] = b[3];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > +{
> > > +  a[0] = b[2];
> > > +  a[1] = b[3];
> > > +  a[2] = b[0];
> > > +  a[3] = b[1];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > +{
> > > +  a[0] = b[4];
> > > +  a[1] = b[5];
> > > +  a[2] = b[6];
> > > +  a[3] = b[7];
> > > +  a[4] = b[0];
> > > +  a[5] = b[1];
> > > +  a[6] = b[2];
> > > +  a[7] = b[3];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > +{
> > > +  a[0] = b[8];
> > > +  a[1] = b[9];
> > > +  a[2] = b[10];
> > > +  a[3] = b[11];
> > > +  a[4] = b[12];
> > > +  a[5] = b[13];
> > > +  a[6] = b[14];
> > > +  a[7] = b[15];
> > > +  a[8] = b[0];
> > > +  a[9] = b[1];
> > > +  a[10] = b[2];
> > > +  a[11] = b[3];
> > > +  a[12] = b[4];
> > > +  a[13] = b[5];
> > > +  a[14] = b[6];
> > > +  a[15] = b[7];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > +{
> > > +  a[0] = b[16];
> > > +  a[1] = b[17];
> > > +  a[2] = b[18];
> > > +  a[3] = b[19];
> > > +  a[4] = b[20];
> > > +  a[5] = b[21];
> > > +  a[6] = b[22];
> > > +  a[7] = b[23];
> > > +  a[8] = b[24];
> > > +  a[9] = b[25];
> > > +  a[10] = b[26];
> > > +  a[11] = b[27];
> > > +  a[12] = b[28];
> > > +  a[13] = b[29];
> > > +  a[14] = b[30];
> > > +  a[15] = b[31];
> > > +  a[16] = b[0];
> > > +  a[17] = b[1];
> > > +  a[18] = b[2];
> > > +  a[19] = b[3];
> > > +  a[20] = b[4];
> > > +  a[21] = b[5];
> > > +  a[22] = b[6];
> > > +  a[23] = b[7];
> > > +  a[24] = b[8];
> > > +  a[25] = b[9];
> > > +  a[26] = b[10];
> > > +  a[27] = b[11];
> > > +  a[28] = b[12];
> > > +  a[29] = b[13];
> > > +  a[30] = b[14];
> > > +  a[31] = b[15];
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > new file mode 100644
> > > index 00000000000..d5c6ebeb5cf
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > @@ -0,0 +1,80 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > +/* { dg-require-effective-target avx } */
> > > +
> > > +#include "avx-check.h"
> > > +#include <string.h>
> > > +#include "pr106010-5a.c"
> > > +
> > > +void
> > > +avx_test (void)
> > > +{
> > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > +  char* p = (char* ) malloc (64);
> > > +  char* q = (char* ) malloc (64);
> > > +
> > > +  __builtin_memset (pd_dst, 0, 64);
> > > +  __builtin_memset (ps_dst, 0, 64);
> > > +  __builtin_memset (epi64_dst, 0, 64);
> > > +  __builtin_memset (epi32_dst, 0, 64);
> > > +  __builtin_memset (epi16_dst, 0, 64);
> > > +  __builtin_memset (epi8_dst, 0, 64);
> > > +
> > > +  for (int i = 0; i != 64; i++)
> > > +    {
> > > +      p[i] = i;
> > > +      q[i] = (i + 32) % 64;
> > > +    }
> > > +  __builtin_memcpy (pd_src, p, 64);
> > > +  __builtin_memcpy (ps_src, p, 64);
> > > +  __builtin_memcpy (epi64_src, p, 64);
> > > +  __builtin_memcpy (epi32_src, p, 64);
> > > +  __builtin_memcpy (epi16_src, p, 64);
> > > +  __builtin_memcpy (epi8_src, p, 64);
> > > +
> > > +  __builtin_memcpy (pd_exp, q, 64);
> > > +  __builtin_memcpy (ps_exp, q, 64);
> > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > +
> > > +  foo_pd (pd_dst, pd_src);
> > > +  foo_ps (ps_dst, ps_src);
> > > +  foo_epi64 (epi64_dst, epi64_src);
> > > +  foo_epi32 (epi32_dst, epi32_src);
> > > +  foo_epi16 (epi16_dst, epi16_src);
> > > +  foo_epi8 (epi8_dst, epi8_src);
> > > +
> > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +
> > > +  return;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > new file mode 100644
> > > index 00000000000..9ce4e6dd5c0
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > @@ -0,0 +1,62 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > +/* { dg-require-effective-target avx512fp16 } */
> > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> > > +
> > > +#include <string.h>
> > > +
> > > +static void do_test (void);
> > > +#define DO_TEST do_test
> > > +#define AVX512FP16
> > > +#include "avx512-check.h"
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > +{
> > > +  a[0] = b[8];
> > > +  a[1] = b[9];
> > > +  a[2] = b[10];
> > > +  a[3] = b[11];
> > > +  a[4] = b[12];
> > > +  a[5] = b[13];
> > > +  a[6] = b[14];
> > > +  a[7] = b[15];
> > > +  a[8] = b[0];
> > > +  a[9] = b[1];
> > > +  a[10] = b[2];
> > > +  a[11] = b[3];
> > > +  a[12] = b[4];
> > > +  a[13] = b[5];
> > > +  a[14] = b[6];
> > > +  a[15] = b[7];
> > > +}
> > > +
> > > +void
> > > +do_test (void)
> > > +{
> > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > +  char* p = (char* ) malloc (64);
> > > +  char* q = (char* ) malloc (64);
> > > +
> > > +  __builtin_memset (ph_dst, 0, 64);
> > > +
> > > +  for (int i = 0; i != 64; i++)
> > > +    {
> > > +      p[i] = i;
> > > +      q[i] = (i + 32) % 64;
> > > +    }
> > > +  __builtin_memcpy (ph_src, p, 64);
> > > +
> > > +  __builtin_memcpy (ph_exp, q, 64);
> > > +
> > > +  foo_ph (ph_dst, ph_src);
> > > +
> > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +
> > > +  return;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > new file mode 100644
> > > index 00000000000..65a90d03684
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > @@ -0,0 +1,115 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > +{
> > > +  a[0] = b[3];
> > > +  a[1] = b[2];
> > > +  a[2] = b[1];
> > > +  a[3] = b[0];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > +{
> > > +  a[0] = b[7];
> > > +  a[1] = b[6];
> > > +  a[2] = b[5];
> > > +  a[3] = b[4];
> > > +  a[4] = b[3];
> > > +  a[5] = b[2];
> > > +  a[6] = b[1];
> > > +  a[7] = b[0];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > +{
> > > +  a[0] = b[3];
> > > +  a[1] = b[2];
> > > +  a[2] = b[1];
> > > +  a[3] = b[0];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > +{
> > > +  a[0] = b[7];
> > > +  a[1] = b[6];
> > > +  a[2] = b[5];
> > > +  a[3] = b[4];
> > > +  a[4] = b[3];
> > > +  a[5] = b[2];
> > > +  a[6] = b[1];
> > > +  a[7] = b[0];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > +{
> > > +  a[0] = b[15];
> > > +  a[1] = b[14];
> > > +  a[2] = b[13];
> > > +  a[3] = b[12];
> > > +  a[4] = b[11];
> > > +  a[5] = b[10];
> > > +  a[6] = b[9];
> > > +  a[7] = b[8];
> > > +  a[8] = b[7];
> > > +  a[9] = b[6];
> > > +  a[10] = b[5];
> > > +  a[11] = b[4];
> > > +  a[12] = b[3];
> > > +  a[13] = b[2];
> > > +  a[14] = b[1];
> > > +  a[15] = b[0];
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > +{
> > > +  a[0] = b[31];
> > > +  a[1] = b[30];
> > > +  a[2] = b[29];
> > > +  a[3] = b[28];
> > > +  a[4] = b[27];
> > > +  a[5] = b[26];
> > > +  a[6] = b[25];
> > > +  a[7] = b[24];
> > > +  a[8] = b[23];
> > > +  a[9] = b[22];
> > > +  a[10] = b[21];
> > > +  a[11] = b[20];
> > > +  a[12] = b[19];
> > > +  a[13] = b[18];
> > > +  a[14] = b[17];
> > > +  a[15] = b[16];
> > > +  a[16] = b[15];
> > > +  a[17] = b[14];
> > > +  a[18] = b[13];
> > > +  a[19] = b[12];
> > > +  a[20] = b[11];
> > > +  a[21] = b[10];
> > > +  a[22] = b[9];
> > > +  a[23] = b[8];
> > > +  a[24] = b[7];
> > > +  a[25] = b[6];
> > > +  a[26] = b[5];
> > > +  a[27] = b[4];
> > > +  a[28] = b[3];
> > > +  a[29] = b[2];
> > > +  a[30] = b[1];
> > > +  a[31] = b[0];
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > new file mode 100644
> > > index 00000000000..1c5bb020939
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > @@ -0,0 +1,157 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > +/* { dg-require-effective-target avx2 } */
> > > +
> > > +#include "avx2-check.h"
> > > +#include <string.h>
> > > +#include "pr106010-6a.c"
> > > +
> > > +void
> > > +avx2_test (void)
> > > +{
> > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > +  char* p = (char* ) malloc (64);
> > > +  char* q = (char* ) malloc (64);
> > > +
> > > +  __builtin_memset (pd_dst, 0, 64);
> > > +  __builtin_memset (ps_dst, 0, 64);
> > > +  __builtin_memset (epi64_dst, 0, 64);
> > > +  __builtin_memset (epi32_dst, 0, 64);
> > > +  __builtin_memset (epi16_dst, 0, 64);
> > > +  __builtin_memset (epi8_dst, 0, 64);
> > > +
> > > +  for (int i = 0; i != 64; i++)
> > > +    p[i] = i;
> > > +
> > > +  __builtin_memcpy (pd_src, p, 64);
> > > +  __builtin_memcpy (ps_src, p, 64);
> > > +  __builtin_memcpy (epi64_src, p, 64);
> > > +  __builtin_memcpy (epi32_src, p, 64);
> > > +  __builtin_memcpy (epi16_src, p, 64);
> > > +  __builtin_memcpy (epi8_src, p, 64);
> > > +
> > > +
> > > +  for (int i = 0; i != 16; i++)
> > > +    {
> > > +      q[i] = i + 48;
> > > +      q[i + 16] = i + 32;
> > > +      q[i + 32] = i + 16;
> > > +      q[i + 48] = i;
> > > +    }
> > > +
> > > +  __builtin_memcpy (pd_exp, q, 64);
> > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > +
> > > +   for (int i = 0; i != 8; i++)
> > > +    {
> > > +      q[i] = i + 56;
> > > +      q[i + 8] = i + 48;
> > > +      q[i + 16] = i + 40;
> > > +      q[i + 24] = i + 32;
> > > +      q[i + 32] = i + 24;
> > > +      q[i + 40] = i + 16;
> > > +      q[i + 48] = i + 8;
> > > +      q[i + 56] = i;
> > > +    }
> > > +
> > > +  __builtin_memcpy (ps_exp, q, 64);
> > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > +
> > > +  for (int i = 0; i != 4; i++)
> > > +    {
> > > +      q[i] = i + 60;
> > > +      q[i + 4] = i + 56;
> > > +      q[i + 8] = i + 52;
> > > +      q[i + 12] = i + 48;
> > > +      q[i + 16] = i + 44;
> > > +      q[i + 20] = i + 40;
> > > +      q[i + 24] = i + 36;
> > > +      q[i + 28] = i + 32;
> > > +      q[i + 32] = i + 28;
> > > +      q[i + 36] = i + 24;
> > > +      q[i + 40] = i + 20;
> > > +      q[i + 44] = i + 16;
> > > +      q[i + 48] = i + 12;
> > > +      q[i + 52] = i + 8;
> > > +      q[i + 56] = i + 4;
> > > +      q[i + 60] = i;
> > > +    }
> > > +
> > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > +
> > > +  for (int i = 0; i != 2; i++)
> > > +    {
> > > +      q[i] = i + 62;
> > > +      q[i + 2] = i + 60;
> > > +      q[i + 4] = i + 58;
> > > +      q[i + 6] = i + 56;
> > > +      q[i + 8] = i + 54;
> > > +      q[i + 10] = i + 52;
> > > +      q[i + 12] = i + 50;
> > > +      q[i + 14] = i + 48;
> > > +      q[i + 16] = i + 46;
> > > +      q[i + 18] = i + 44;
> > > +      q[i + 20] = i + 42;
> > > +      q[i + 22] = i + 40;
> > > +      q[i + 24] = i + 38;
> > > +      q[i + 26] = i + 36;
> > > +      q[i + 28] = i + 34;
> > > +      q[i + 30] = i + 32;
> > > +      q[i + 32] = i + 30;
> > > +      q[i + 34] = i + 28;
> > > +      q[i + 36] = i + 26;
> > > +      q[i + 38] = i + 24;
> > > +      q[i + 40] = i + 22;
> > > +      q[i + 42] = i + 20;
> > > +      q[i + 44] = i + 18;
> > > +      q[i + 46] = i + 16;
> > > +      q[i + 48] = i + 14;
> > > +      q[i + 50] = i + 12;
> > > +      q[i + 52] = i + 10;
> > > +      q[i + 54] = i + 8;
> > > +      q[i + 56] = i + 6;
> > > +      q[i + 58] = i + 4;
> > > +      q[i + 60] = i + 2;
> > > +      q[i + 62] = i;
> > > +    }
> > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > +
> > > +  foo_pd (pd_dst, pd_src);
> > > +  foo_ps (ps_dst, ps_src);
> > > +  foo_epi64 (epi64_dst, epi64_src);
> > > +  foo_epi32 (epi32_dst, epi32_src);
> > > +  foo_epi16 (epi16_dst, epi16_src);
> > > +  foo_epi8 (epi8_dst, epi8_src);
> > > +
> > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +
> > > +  return;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > new file mode 100644
> > > index 00000000000..b859d884a7f
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > @@ -0,0 +1,80 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > +/* { dg-require-effective-target avx512fp16 } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> > > +
> > > +#include <string.h>
> > > +
> > > +static void do_test (void);
> > > +#define DO_TEST do_test
> > > +#define AVX512FP16
> > > +#include "avx512-check.h"
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > +{
> > > +  a[0] = b[15];
> > > +  a[1] = b[14];
> > > +  a[2] = b[13];
> > > +  a[3] = b[12];
> > > +  a[4] = b[11];
> > > +  a[5] = b[10];
> > > +  a[6] = b[9];
> > > +  a[7] = b[8];
> > > +  a[8] = b[7];
> > > +  a[9] = b[6];
> > > +  a[10] = b[5];
> > > +  a[11] = b[4];
> > > +  a[12] = b[3];
> > > +  a[13] = b[2];
> > > +  a[14] = b[1];
> > > +  a[15] = b[0];
> > > +}
> > > +
> > > +void
> > > +do_test (void)
> > > +{
> > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > +  char* p = (char* ) malloc (64);
> > > +  char* q = (char* ) malloc (64);
> > > +
> > > +  __builtin_memset (ph_dst, 0, 64);
> > > +
> > > +  for (int i = 0; i != 64; i++)
> > > +    p[i] = i;
> > > +
> > > +  __builtin_memcpy (ph_src, p, 64);
> > > +
> > > +  for (int i = 0; i != 4; i++)
> > > +    {
> > > +      q[i] = i + 60;
> > > +      q[i + 4] = i + 56;
> > > +      q[i + 8] = i + 52;
> > > +      q[i + 12] = i + 48;
> > > +      q[i + 16] = i + 44;
> > > +      q[i + 20] = i + 40;
> > > +      q[i + 24] = i + 36;
> > > +      q[i + 28] = i + 32;
> > > +      q[i + 32] = i + 28;
> > > +      q[i + 36] = i + 24;
> > > +      q[i + 40] = i + 20;
> > > +      q[i + 44] = i + 16;
> > > +      q[i + 48] = i + 12;
> > > +      q[i + 52] = i + 8;
> > > +      q[i + 56] = i + 4;
> > > +      q[i + 60] = i;
> > > +    }
> > > +
> > > +  __builtin_memcpy (ph_exp, q, 64);
> > > +
> > > +  foo_ph (ph_dst, ph_src);
> > > +
> > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > +    __builtin_abort ();
> > > +
> > > +  return;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > new file mode 100644
> > > index 00000000000..2ea01fac927
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > @@ -0,0 +1,58 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > +
> > > +#define N 10000
> > > +void
> > > +__attribute__((noipa))
> > > +foo_pd (_Complex double* a, _Complex double b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ps (_Complex float* a, _Complex float b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi64 (_Complex long long* a, _Complex long long b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi32 (_Complex int* a, _Complex int b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi16 (_Complex short* a, _Complex short b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi8 (_Complex char* a, _Complex char b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > new file mode 100644
> > > index 00000000000..26482cc10f5
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > @@ -0,0 +1,63 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > +/* { dg-require-effective-target avx } */
> > > +
> > > +#include "avx-check.h"
> > > +#include <string.h>
> > > +#include "pr106010-7a.c"
> > > +
> > > +void
> > > +avx_test (void)
> > > +{
> > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > +
> > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > +
> > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > +    p_init[i] = i % 2 + 3;
> > > +
> > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > > +
> > > +  foo_pd (pd_dst, pd_src[0]);
> > > +  foo_ps (ps_dst, ps_src[0]);
> > > +  foo_epi64 (epi64_dst, epi64_src[0]);
> > > +  foo_epi32 (epi32_dst, epi32_src[0]);
> > > +  foo_epi16 (epi16_dst, epi16_src[0]);
> > > +  foo_epi8 (epi8_dst, epi8_src[0]);
> > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > +    __builtin_abort ();
> > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > +    __builtin_abort ();
> > > +
> > > +  return;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > new file mode 100644
> > > index 00000000000..7f4056a5ecc
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > @@ -0,0 +1,41 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > +/* { dg-require-effective-target avx512fp16 } */
> > > +
> > > +#include <string.h>
> > > +
> > > +static void do_test (void);
> > > +
> > > +#define DO_TEST do_test
> > > +#define AVX512FP16
> > > +#include "avx512-check.h"
> > > +
> > > +#define N 10000
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = b;
> > > +}
> > > +
> > > +static void
> > > +do_test (void)
> > > +{
> > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > +
> > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > +
> > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > +    p_init[i] = i % 2 + 3;
> > > +
> > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > +
> > > +  foo_ph (ph_dst, ph_src[0]);
> > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > +    __builtin_abort ();
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > new file mode 100644
> > > index 00000000000..11054b60d30
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > @@ -0,0 +1,58 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > +
> > > +#define N 10000
> > > +void
> > > +__attribute__((noipa))
> > > +foo_pd (_Complex double* a)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = 1.0 + 2.0i;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ps (_Complex float* a)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = 1.0f + 2.0fi;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi64 (_Complex long long* a)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = 1 + 2i;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi32 (_Complex int* a)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = 1 + 2i;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi16 (_Complex short* a)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = 1 + 2i;
> > > +}
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_epi8 (_Complex char* a)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = 1 + 2i;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > new file mode 100644
> > > index 00000000000..6bb0073b691
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > @@ -0,0 +1,53 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > +/* { dg-require-effective-target avx } */
> > > +
> > > +#include "avx-check.h"
> > > +#include <string.h>
> > > +#include "pr106010-8a.c"
> > > +
> > > +void
> > > +avx_test (void)
> > > +{
> > > +  _Complex double pd_src = 1.0 + 2.0i;
> > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > +  _Complex float ps_src = 1.0 + 2.0i;
> > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > +  _Complex long long epi64_src = 1 + 2i;;
> > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > +  _Complex int epi32_src = 1 + 2i;
> > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > +  _Complex short epi16_src = 1 + 2i;
> > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > +  _Complex char epi8_src = 1 + 2i;
> > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > +
> > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > +
> > > +  foo_pd (pd_dst);
> > > +  foo_ps (ps_dst);
> > > +  foo_epi64 (epi64_dst);
> > > +  foo_epi32 (epi32_dst);
> > > +  foo_epi16 (epi16_dst);
> > > +  foo_epi8 (epi8_dst);
> > > +  for (int i = 0 ; i != N; i++)
> > > +    {
> > > +      if (pd_dst[i] != pd_src)
> > > +       __builtin_abort ();
> > > +      if (ps_dst[i] != ps_src)
> > > +       __builtin_abort ();
> > > +      if (epi64_dst[i] != epi64_src)
> > > +       __builtin_abort ();
> > > +      if (epi32_dst[i] != epi32_src)
> > > +       __builtin_abort ();
> > > +      if (epi16_dst[i] != epi16_src)
> > > +       __builtin_abort ();
> > > +      if (epi8_dst[i] != epi8_src)
> > > +       __builtin_abort ();
> > > +    }
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > new file mode 100644
> > > index 00000000000..61ae131829d
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > @@ -0,0 +1,38 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > +/* { dg-require-effective-target avx512fp16 } */
> > > +
> > > +#include <string.h>
> > > +
> > > +static void do_test (void);
> > > +
> > > +#define DO_TEST do_test
> > > +#define AVX512FP16
> > > +#include "avx512-check.h"
> > > +
> > > +#define N 10000
> > > +
> > > +void
> > > +__attribute__((noipa))
> > > +foo_ph (_Complex _Float16* a)
> > > +{
> > > +  for (int i = 0; i != N; i++)
> > > +    a[i] = 1.0f16 + 2.0f16i;
> > > +}
> > > +
> > > +static void
> > > +do_test (void)
> > > +{
> > > +  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > +
> > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > +
> > > +  foo_ph (ph_dst);
> > > +  for (int i = 0; i != N; i++)
> > > +    {
> > > +      if (ph_dst[i] != ph_src)
> > > +       __builtin_abort ();
> > > +    }
> > > +}
> > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > > index d20a10a1524..42ee9df674c 100644
> > > --- a/gcc/tree-vect-data-refs.cc
> > > +++ b/gcc/tree-vect-data-refs.cc
> > > @@ -1403,7 +1403,8 @@ vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
> > >    if (PURE_SLP_STMT (stmt_info))
> > >      ncopies = 1;
> > >    else
> > > -    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
> > > +    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info),
> > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > >
> > >    if (DR_IS_READ (dr_info->dr))
> > >      vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> > > @@ -4597,8 +4598,22 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > >
> > >        /* Set vectype for STMT.  */
> > >        scalar_type = TREE_TYPE (DR_REF (dr));
> > > -      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
> > > -      if (!vectype)
> > > +      tree adjust_scalar_type = scalar_type;
> > > +      /* Support complex type access.  Note that complex loads/stores
> > > +        do not support gather/scatter.  */
> > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE
> > > +         && gatherscatter == SG_NONE)
> > > +       {
> > > +         adjust_scalar_type = TREE_TYPE (scalar_type);
> > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > +       }
> > > +      tree vectype = get_vectype_for_scalar_type (vinfo, adjust_scalar_type);
> > > +      unsigned HOST_WIDE_INT constant_nunits;
> > > +      if (!vectype
> > > +         /* For complex type, V1DI doesn't make sense.  */
> > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > +             && (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&constant_nunits)
> > > +                 || constant_nunits == 1)))
> > >          {
> > >            if (dump_enabled_p ())
> > >              {
> > > @@ -4635,8 +4650,11 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > >         }
> > >
> > >        /* Adjust the minimal vectorization factor according to the
> > > -        vector type.  */
> > > +        vector type. Note for complex type, VF is half of
> > > +        TYPE_VECTOR_SUBPARTS.  */
> > >        vf = TYPE_VECTOR_SUBPARTS (vectype);
> > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > +       vf = exact_div (vf, 2);
> > >        *min_vf = upper_bound (*min_vf, vf);
> > >
> > >        /* Leave the BB vectorizer to pick the vector type later, based on
> > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > index 3a70c15b593..365fa738022 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -200,7 +200,12 @@ vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
> > >      }
> > >
> > >    if (nunits_vectype)
> > > -    vect_update_max_nunits (vf, nunits_vectype);
> > > +    {
> > > +      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (nunits_vectype);
> > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > +       nunits = exact_div (nunits, 2);
> > > +      vect_update_max_nunits (vf, nunits);
> > > +    }
> > >
> > >    return opt_result::success ();
> > >  }
> > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > > index dab5daddcc5..5d66ea2f286 100644
> > > --- a/gcc/tree-vect-slp.cc
> > > +++ b/gcc/tree-vect-slp.cc
> > > @@ -877,10 +877,14 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > >        return false;
> > >      }
> > >
> > > +  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > +    nunits = exact_div (nunits, 2);
> > > +
> > >    /* If populating the vector type requires unrolling then fail
> > >       before adjusting *max_nunits for basic-block vectorization.  */
> > >    if (is_a <bb_vec_info> (vinfo)
> > > -      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> > > +      && !multiple_p (group_size , nunits))
> > >      {
> > >        if (dump_enabled_p ())
> > >         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > @@ -891,7 +895,7 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > >      }
> > >
> > >    /* In case of multiple types we need to detect the smallest type.  */
> > > -  vect_update_max_nunits (max_nunits, vectype);
> > > +  vect_update_max_nunits (max_nunits, nunits);
> > >    return true;
> > >  }
> > >
> > > @@ -3720,22 +3724,54 @@ vect_optimize_slp (vec_info *vinfo)
> > >          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
> > >          when permuting constants and invariants keeping the permute
> > >          bijective.  */
> > > -      auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > -      bitmap_clear (load_index);
> > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > -       bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > -      unsigned j;
> > > -      for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > -       if (!bitmap_bit_p (load_index, j))
> > > -         break;
> > > -      if (j != SLP_TREE_LANES (node))
> > > -       continue;
> > > +      /* Permutation of Complex type.  */
> > > +      if (STMT_VINFO_COMPLEX_P (dr_stmt))
> > > +       {
> > > +         auto_sbitmap load_index (SLP_TREE_LANES (node) * 2);
> > > +         bitmap_clear (load_index);
> > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > +           {
> > > +             unsigned bit = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > +             bitmap_set_bit (load_index, 2 * bit);
> > > +             bitmap_set_bit (load_index, 2 * bit + 1);
> > > +           }
> > > +         unsigned j;
> > > +         for (j = 0; j < SLP_TREE_LANES (node) * 2; ++j)
> > > +           if (!bitmap_bit_p (load_index, j))
> > > +             break;
> > > +         if (j != SLP_TREE_LANES (node) * 2)
> > > +           continue;
> > >
> > > -      vec<unsigned> perm = vNULL;
> > > -      perm.safe_grow (SLP_TREE_LANES (node), true);
> > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > -       perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > -      perms.safe_push (perm);
> > > +         vec<unsigned> perm = vNULL;
> > > +         perm.safe_grow (SLP_TREE_LANES (node) * 2, true);
> > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > +           {
> > > +             unsigned cidx = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > +             perm[2 * j] = 2 * cidx;
> > > +             perm[2 * j + 1] = 2 * cidx + 1;
> > > +           }
> > > +         perms.safe_push (perm);
> > > +       }
> > > +      else
> > > +       {
> > > +         auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > +         bitmap_clear (load_index);
> > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > +           bitmap_set_bit (load_index,
> > > +                           SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > +         unsigned j;
> > > +         for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > +           if (!bitmap_bit_p (load_index, j))
> > > +             break;
> > > +         if (j != SLP_TREE_LANES (node))
> > > +           continue;
> > > +
> > > +         vec<unsigned> perm = vNULL;
> > > +         perm.safe_grow (SLP_TREE_LANES (node), true);
> > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > +           perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > +         perms.safe_push (perm);
> > > +       }
> > >        vertices[idx].perm_in = perms.length () - 1;
> > >        vertices[idx].perm_out = perms.length () - 1;
> > >      }
> > > @@ -4518,6 +4554,12 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
> > >         vf = loop_vinfo->vectorization_factor;
> > >        else
> > >         vf = 1;
> > > +      /* For complex type and SLP, double vf to get the right vectype,
> > > +        i.e. vector(4) double for complex double: group size is 2, so double
> > > +        vf to map vf * group_size to TYPE_VECTOR_SUBPARTS.  */
> > > +     if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > +       vf *= 2;
> > > +
> > >        unsigned int group_size = SLP_TREE_LANES (node);
> > >        tree vectype = SLP_TREE_VECTYPE (node);
> > >        SLP_TREE_NUMBER_OF_VEC_STMTS (node)
> > > @@ -4763,10 +4805,17 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
> > >             }
> > >           unsigned group_size = SLP_TREE_LANES (child);
> > >           poly_uint64 vf = 1;
> > > +
> > >           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
> > >             vf = loop_vinfo->vectorization_factor;
> > > +
> > > +         /* V2SF is just 1 complex type, so multiply by 2
> > > +            to get the real number of vectors.  */
> > > +         unsigned cp
> > > +           = STMT_VINFO_COMPLEX_P (SLP_TREE_REPRESENTATIVE (node)) ? 2 : 1;
> > > +
> > >           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
> > > -           = vect_get_num_vectors (vf * group_size, vector_type);
> > > +           = vect_get_num_vectors (vf * group_size * cp, vector_type);
> > >           /* And cost them.  */
> > >           vect_prologue_cost_for_slp (child, cost_vec);
> > >         }
> > > @@ -6402,6 +6451,11 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > >
> > >    /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
> > >    vector_type = SLP_TREE_VECTYPE (op_node);
> > > +  unsigned int cp = 1;
> > > +  /* Handle Complex type vector init.
> > > +     SLP_TREE_REPRESENTATIVE (op_node) could be NULL.  */
> > > +  if (TREE_CODE (TREE_TYPE (op_node->ops[0])) == COMPLEX_TYPE)
> > > +    cp = 2;
> > >
> > >    unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
> > >    SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
> > > @@ -6426,9 +6480,9 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > >    /* When using duplicate_and_interleave, we just need one element for
> > >       each scalar statement.  */
> > >    if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
> > > -    nunits = group_size;
> > > +    nunits = group_size * cp;
> > >
> > > -  number_of_copies = nunits * number_of_vectors / group_size;
> > > +  number_of_copies = nunits * number_of_vectors / (group_size * cp);
> > >
> > >    number_of_places_left_in_vector = nunits;
> > >    constant_p = true;
> > > @@ -6460,8 +6514,23 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > >                         gcc_unreachable ();
> > >                     }
> > >                   else
> > > -                   op = fold_unary (VIEW_CONVERT_EXPR,
> > > -                                    TREE_TYPE (vector_type), op);
> > > +                   {
> > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > +                     /* For complex type, insert real and imag part
> > > +                        separately.  */
> > > +                     if (cp == 2)
> > > +                       {
> > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > +                                      == COMPLEX_TYPE)
> > > +                                     && (scalar_type
> > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > +                         elts[number_of_places_left_in_vector--]
> > > +                           = fold_unary (IMAGPART_EXPR, scalar_type, op);
> > > +                         op = fold_unary (REALPART_EXPR, scalar_type, op);
> > > +                       }
> > > +                     else
> > > +                       op = fold_unary (VIEW_CONVERT_EXPR, scalar_type, op);
> > > +                   }
> > >                   gcc_assert (op && CONSTANT_CLASS_P (op));
> > >                 }
> > >               else
> > > @@ -6481,11 +6550,28 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > >                     }
> > >                   else
> > >                     {
> > > -                     op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
> > > -                                  op);
> > > -                     init_stmt
> > > -                       = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > -                                              op);
> > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > +                     if (cp == 2)
> > > +                       {
> > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > +                                      == COMPLEX_TYPE)
> > > +                                     && (scalar_type
> > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > +                         tree imag = build1 (IMAGPART_EXPR, scalar_type, op);
> > > +                         op = build1 (REALPART_EXPR, scalar_type, op);
> > > +                         tree imag_temp = make_ssa_name (scalar_type);
> > > +                         elts[number_of_places_left_in_vector--] = imag_temp;
> > > +                         init_stmt = gimple_build_assign (imag_temp, imag);
> > > +                         gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > +                         init_stmt = gimple_build_assign (new_temp, op);
> > > +                       }
> > > +                     else
> > > +                       {
> > > +                         op = build1 (VIEW_CONVERT_EXPR, scalar_type, op);
> > > +                         init_stmt
> > > +                           = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > +                                                  op);
> > > +                       }
> > >                     }
> > >                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > >                   op = new_temp;
> > > @@ -6696,15 +6782,17 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > >    unsigned int nelts_to_build;
> > >    unsigned int nvectors_per_build;
> > >    unsigned int in_nlanes;
> > > +  unsigned int cp = STMT_VINFO_COMPLEX_P (stmt_info) ? 2 : 1;
> > >    bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
> > > -                     && multiple_p (nunits, group_size));
> > > +                     && multiple_p (nunits, group_size * cp));
> > >    if (repeating_p)
> > >      {
> > >        /* A single vector contains a whole number of copies of the node, so:
> > >          (a) all permutes can use the same mask; and
> > >          (b) the permutes only need a single vector input.  */
> > > -      mask.new_vector (nunits, group_size, 3);
> > > -      nelts_to_build = mask.encoded_nelts ();
> > > +      /* For complex type, mask size should be double of nelts_to_build.  */
> > > +      mask.new_vector (nunits, group_size * cp, 3);
> > > +      nelts_to_build = mask.encoded_nelts () / cp;
> > >        nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
> > >        in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
> > >      }
> > > @@ -6744,8 +6832,8 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > >         {
> > >           /* Enforced before the loop when !repeating_p.  */
> > >           unsigned int const_nunits = nunits.to_constant ();
> > > -         vec_index = i / const_nunits;
> > > -         mask_element = i % const_nunits;
> > > +         vec_index = i / (const_nunits / cp);
> > > +         mask_element = i % (const_nunits / cp);
> > >           if (vec_index == first_vec_index
> > >               || first_vec_index == -1)
> > >             {
> > > @@ -6755,7 +6843,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > >                    || second_vec_index == -1)
> > >             {
> > >               second_vec_index = vec_index;
> > > -             mask_element += const_nunits;
> > > +             mask_element += (const_nunits / cp);
> > >             }
> > >           else
> > >             {
> > > @@ -6768,14 +6856,24 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > >               return false;
> > >             }
> > >
> > > -         gcc_assert (mask_element < 2 * const_nunits);
> > > +         gcc_assert (mask_element < 2 * const_nunits / cp);
> > >         }
> > >
> > >        if (mask_element != index)
> > >         noop_p = false;
> > > -      mask[index++] = mask_element;
> > > +      /* Set index for _Complex type,
> > > +        i.e. a mask like [1, 0] is actually [2, 3, 0, 1]
> > > +        for the vector scalar type.  */
> > > +      if (cp == 2)
> > > +       {
> > > +         mask[2 * index] = 2 * mask_element;
> > > +         mask[2 * index + 1] = 2 * mask_element + 1;
> > > +       }
> > > +      else
> > > +       mask[index] = mask_element;
> > > +      index++;
> > >
> > > -      if (index == count && !noop_p)
> > > +      if (index * cp == count && !noop_p)
> > >         {
> > >           indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
> > >           if (!can_vec_perm_const_p (mode, mode, indices))
> > > @@ -6799,7 +6897,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > >           ++*n_perms;
> > >         }
> > >
> > > -      if (index == count)
> > > +      if (index * cp == count)
> > >         {
> > >           if (!analyze_only)
> > >             {
> > > @@ -6869,7 +6967,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > >           bool load_seen = false;
> > >           for (unsigned i = 0; i < in_nlanes; ++i)
> > >             {
> > > -             if (i % const_nunits == 0)
> > > +             if (i % (const_nunits * cp) == 0)
> > >                 {
> > >                   if (load_seen)
> > >                     *n_loads += 1;
> > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > index 72107afc883..8af3b558be4 100644
> > > --- a/gcc/tree-vect-stmts.cc
> > > +++ b/gcc/tree-vect-stmts.cc
> > > @@ -1397,25 +1397,70 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > >  {
> > >    gimple *init_stmt;
> > >    tree new_temp;
> > > +  tree scalar_type = TREE_TYPE (type);
> > > +  gimple_seq stmts = NULL;
> > > +
> > > +  if (TREE_CODE (TREE_TYPE (val)) == COMPLEX_TYPE)
> > > +    {
> > > +      unsigned HOST_WIDE_INT nunits;
> > > +      gcc_assert (TYPE_VECTOR_SUBPARTS (type).is_constant (&nunits));
> > >
> > > +      tree_vector_builder elts (type, nunits, 1);
> > > +      tree imag, real;
> > > +      if (TREE_CODE (val) == COMPLEX_CST)
> > > +       {
> > > +         real = fold_unary (REALPART_EXPR, scalar_type, val);
> > > +         imag = fold_unary (IMAGPART_EXPR, scalar_type, val);
> > > +       }
> > > +      else
> > > +       {
> > > +         real = make_ssa_name (scalar_type);
> > > +         imag = make_ssa_name (scalar_type);
> > > +         init_stmt
> > > +           = gimple_build_assign (real,
> > > +                                  build1 (REALPART_EXPR, scalar_type, val));
> > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > +         init_stmt
> > > +           = gimple_build_assign (imag,
> > > +                                  build1 (IMAGPART_EXPR, scalar_type, val));
> > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > +       }
> > > +
> > > +      /* Build vector as [real,imag,real,imag,...].  */
> > > +      for (unsigned i = 0; i != nunits; i++)
> > > +       {
> > > +         if (i % 2)
> > > +           elts.quick_push (imag);
> > > +         else
> > > +           elts.quick_push (real);
> > > +       }
> > > +      val = gimple_build_vector (&stmts, &elts);
> > > +      if (!gimple_seq_empty_p (stmts))
> > > +       {
> > > +         if (gsi)
> > > +           gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > > +         else
> > > +           vinfo->insert_seq_on_entry (stmt_info, stmts);
> > > +       }
> > > +    }
> > >    /* We abuse this function to push sth to a SSA name with initial 'val'.  */
> > > -  if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > +  else if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > >      {
> > >        gcc_assert (TREE_CODE (type) == VECTOR_TYPE);
> > > -      if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
> > > +      if (! types_compatible_p (scalar_type, TREE_TYPE (val)))
> > >         {
> > >           /* Scalar boolean value should be transformed into
> > >              all zeros or all ones value before building a vector.  */
> > >           if (VECTOR_BOOLEAN_TYPE_P (type))
> > >             {
> > > -             tree true_val = build_all_ones_cst (TREE_TYPE (type));
> > > -             tree false_val = build_zero_cst (TREE_TYPE (type));
> > > +             tree true_val = build_all_ones_cst (scalar_type);
> > > +             tree false_val = build_zero_cst (scalar_type);
> > >
> > >               if (CONSTANT_CLASS_P (val))
> > >                 val = integer_zerop (val) ? false_val : true_val;
> > >               else
> > >                 {
> > > -                 new_temp = make_ssa_name (TREE_TYPE (type));
> > > +                 new_temp = make_ssa_name (scalar_type);
> > >                   init_stmt = gimple_build_assign (new_temp, COND_EXPR,
> > >                                                    val, true_val, false_val);
> > >                   vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
> > > @@ -1424,14 +1469,13 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > >             }
> > >           else
> > >             {
> > > -             gimple_seq stmts = NULL;
> > >               if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
> > >                 val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
> > > -                                   TREE_TYPE (type), val);
> > > +                                   scalar_type, val);
> > >               else
> > >                 /* ???  Condition vectorization expects us to do
> > >                    promotion of invariant/external defs.  */
> > > -               val = gimple_convert (&stmts, TREE_TYPE (type), val);
> > > +               val = gimple_convert (&stmts, scalar_type, val);
> > >               for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
> > >                    !gsi_end_p (gsi2); )
> > >                 {
> > > @@ -1496,7 +1540,12 @@ vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> > >                && VECTOR_BOOLEAN_TYPE_P (stmt_vectype))
> > >         vector_type = truth_type_for (stmt_vectype);
> > >        else
> > > -       vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
> > > +       {
> > > +         tree scalar_type = TREE_TYPE (op);
> > > +         if (STMT_VINFO_COMPLEX_P (stmt_vinfo))
> > > +           scalar_type = TREE_TYPE (scalar_type);
> > > +         vector_type = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
> > > +       }
> > >
> > >        gcc_assert (vector_type);
> > >        tree vop = vect_init_vector (vinfo, stmt_vinfo, op, vector_type, NULL);
> > > @@ -7509,8 +7558,17 @@ vectorizable_store (vec_info *vinfo,
> > >       same location twice.  */
> > >    gcc_assert (slp == PURE_SLP_STMT (stmt_info));
> > >
> > > +  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > +    return false;
> > > +
> > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info), rhs_vectype = NULL_TREE;
> > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > +    {
> > > +      if (!nunits.is_constant ())
> > > +       return false;
> > > +      nunits = exact_div (nunits, 2);
> > > +    }
> > >
> > >    if (loop_vinfo)
> > >      {
> > > @@ -7526,7 +7584,8 @@ vectorizable_store (vec_info *vinfo,
> > >    if (slp)
> > >      ncopies = 1;
> > >    else
> > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > >
> > >    gcc_assert (ncopies >= 1);
> > >
> > > @@ -7546,9 +7605,6 @@ vectorizable_store (vec_info *vinfo,
> > >    elem_type = TREE_TYPE (vectype);
> > >    vec_mode = TYPE_MODE (vectype);
> > >
> > > -  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > -    return false;
> > > -
> > >    vect_memory_access_type memory_access_type;
> > >    enum dr_alignment_support alignment_support_scheme;
> > >    int misalignment;
> > > @@ -8778,6 +8834,12 @@ vectorizable_load (vec_info *vinfo,
> > >
> > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > +    {
> > > +      if (!nunits.is_constant ())
> > > +       return false;
> > > +      nunits = exact_div (nunits, 2);
> > > +    }
> > >
> > >    if (loop_vinfo)
> > >      {
> > > @@ -8794,7 +8856,8 @@ vectorizable_load (vec_info *vinfo,
> > >    if (slp)
> > >      ncopies = 1;
> > >    else
> > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > >
> > >    gcc_assert (ncopies >= 1);
> > >
> > > @@ -8870,8 +8933,11 @@ vectorizable_load (vec_info *vinfo,
> > >                 if (k > maxk)
> > >                   maxk = k;
> > >               tree vectype = SLP_TREE_VECTYPE (slp_node);
> > > +             /* For complex type, half the nunits.  */
> > >               if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> > > -                 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> > > +                 || maxk >= (DR_GROUP_SIZE (group_info)
> > > +                             & ~((STMT_VINFO_COMPLEX_P (group_info)
> > > +                                  ? nunits >> 1 : nunits) - 1)))
> > >                 {
> > >                   if (dump_enabled_p ())
> > >                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > @@ -12499,12 +12565,27 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > >             dump_printf_loc (MSG_NOTE, vect_location,
> > >                              "get vectype for scalar type: %T\n", scalar_type);
> > >         }
> > > +
> > > +      tree orig_scalar_type = scalar_type;
> > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > +       {
> > > +         /* Set complex_p for BB vectorizer.  */
> > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > +         scalar_type = TREE_TYPE (scalar_type);
> > > +         /* Double group_size for the BB vectorizer so that the
> > > +            following 2 get_vectype_for_scalar_type calls return the wanted
> > > +            vectype.  The real group size is not changed; only the "faked"
> > > +            input group_size is doubled.  */
> > > +         group_size *= 2;
> > > +       }
> > >        vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
> > > -      if (!vectype)
> > > +      if (!vectype
> > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > +             && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()))
> > >         return opt_result::failure_at (stmt,
> > >                                        "not vectorized:"
> > >                                        " unsupported data-type %T\n",
> > > -                                      scalar_type);
> > > +                                      orig_scalar_type);
> > >
> > >        if (dump_enabled_p ())
> > >         dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
> > > @@ -12529,16 +12610,30 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > >                                                    TREE_TYPE (vectype));
> > >        if (scalar_type != TREE_TYPE (vectype))
> > >         {
> > > -         if (dump_enabled_p ())
> > > +         tree orig_scalar_type = scalar_type;
> > > +         if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > +           {
> > > +             /* Set complex_p for Loop vectorizer.  */
> > > +             STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > +             scalar_type = TREE_TYPE (scalar_type);
> > > +             if (dump_enabled_p ())
> > > +               dump_printf_loc (MSG_NOTE, vect_location,
> > > +                            "get complex for smallest scalar type: %T\n",
> > > +                            scalar_type);
> > > +
> > > +           }
> > > +         else if (dump_enabled_p ())
> > >             dump_printf_loc (MSG_NOTE, vect_location,
> > >                              "get vectype for smallest scalar type: %T\n",
> > >                              scalar_type);
> > >           nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
> > >                                                         group_size);
> > > -         if (!nunits_vectype)
> > > +         if (!nunits_vectype
> > > +             || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > +                 && !TYPE_VECTOR_SUBPARTS (nunits_vectype).is_constant ()))
> > >             return opt_result::failure_at
> > >               (stmt, "not vectorized: unsupported data-type %T\n",
> > > -              scalar_type);
> > > +              orig_scalar_type);
> > >           if (dump_enabled_p ())
> > >             dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
> > >                              nunits_vectype);
> > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > index e5fdc9e0a14..4a809e492c4 100644
> > > --- a/gcc/tree-vectorizer.h
> > > +++ b/gcc/tree-vectorizer.h
> > > @@ -1161,6 +1161,9 @@ public:
> > >       vectorization.  */
> > >    bool vectorizable;
> > >
> > > +  /* The scalar type of the LHS of this statement is complex type.  */
> > > +  bool complex_p;
> > > +
> > >    /* The stmt to which this info struct refers to.  */
> > >    gimple *stmt;
> > >
> > > @@ -1395,6 +1398,7 @@ struct gather_scatter_info {
> > >  #define STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT(S) (S)->reduc_epilogue_adjustment
> > >  #define STMT_VINFO_REDUC_IDX(S)                   (S)->reduc_idx
> > >  #define STMT_VINFO_FORCE_SINGLE_CYCLE(S)   (S)->force_single_cycle
> > > +#define STMT_VINFO_COMPLEX_P(S)            (S)->complex_p
> > >
> > >  #define STMT_VINFO_DR_WRT_VEC_LOOP(S)      (S)->dr_wrt_vec_loop
> > >  #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_wrt_vec_loop.base_address
> > > @@ -1970,6 +1974,15 @@ vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype)
> > >    return vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo), vectype);
> > >  }
> > >
> > > +static inline unsigned int
> > > +vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype, bool complex_p)
> > > +{
> > > +  poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > +  if (complex_p)
> > > +    nunits *= 2;
> > > +  return vect_get_num_vectors (nunits, vectype);
> > > +}
> > > +
> > >  /* Update maximum unit count *MAX_NUNITS so that it accounts for
> > >     NUNITS.  *MAX_NUNITS can be 1 if we haven't yet recorded anything.  */
> > >
> > > --
> > > 2.18.1
> > >
>
>
>
> --
> BR,
> Hongtao
Hongtao Liu July 13, 2022, 4:46 a.m. UTC | #5
On Tue, Jul 12, 2022 at 10:12 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Tue, Jul 12, 2022 at 6:11 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Mon, Jul 11, 2022 at 7:47 PM Richard Biener via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > On Mon, Jul 11, 2022 at 5:44 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > >
> > > > > The patch only handles load/store (including ctor/permutation, except
> > > > > gather/scatter) for complex types; other operations don't need to be
> > > > > handled since they will be lowered by the cplxlower pass. (MASK_LOAD is
> > > > > not supported for complex types, so there is no need to handle it either.)
> > >
> > > (*)
> > >
> > > > > Instead of supporting vector(2) _Complex double, this patch takes vector(4)
> > > > > double as the vector type of _Complex double. Since the vectorizer originally
> > > > > takes TYPE_VECTOR_SUBPARTS as nunits, which is not true for complex
> > > > > types, the patch handles nunits/ncopies/vf specially for complex types.
> > >
> > > For the limited set above(*) can you explain what's "special" about
> > > vector(2) _Complex
> > > vs. vector(4) double, thus why we need to have STMT_VINFO_COMPLEX_P at all?
> > Supporting a vector(2) complex is a straightforward idea, just like
> > supporting any other scalar type in the vectorizer, but it requires more
> > effort (in the backend and frontend). Considering that most operations
> > on complex types will be lowered into realpart and imagpart
> > operations, supporting a vector(2) complex does not look that
> > necessary. That led to the idea of supporting vector(4) double (with
> > adjustment of vf/ctor/permutation): the vectorizer only needs to
> > handle the vectorization of move operations on the complex type (no
> > need to worry about wrongly mapping vector(4) double multiplication to
> > complex type multiplication, since it's already lowered before the
> > vectorizer).
> > stmt_info does not record the scalar type. To avoid duplicate
> > operations, like getting the lhs type from the stmt to determine whether
> > it is a complex type, the STMT_VINFO_COMPLEX_P bit is added; this bit is
> > mainly initialized in vect_analyze_data_refs and
> > vect_get_vector_types_for_stmt.
> > >
> > > I wonder to what extent your handling can be extended to support re-vectorizing
> > > (with a higher VF for example) already vectorized code?  The vectorizer giving
> > > up on vector(2) double looks quite obviously similar to it giving up
> > > on _Complex double ...
> > Yes, it can be extended to vector(2) double/float/int/... with a bit of
> > adjustment (extracting elements by using bit_field instead of
> > imagpart_expr/realpart_expr).
> > > It would be a shame to not use the same underlying mechanism for dealing with
> > > both, where for the vector case obviously vector(4) would be supported as well.
> > >
> > > In principle _Complex double operations should be two SLP lanes but it seems you
> > > are handling them with classical interleaving as well?
> > I'm only handling move operations; other operations will be
> > lowered to realpart and imagpart and thus become two SLP lanes.
>
> Yes, I understood that.
>
> Doing it more generally (and IMHO better) would involve enhancing
> how we represent dataref groups, maintaining the number of scalars
> covered by each of the vinfos.  On the SLP representation side it
> probably requires relying on the representative for access and not
> on the scalar stmts (since those do not map properly to the lanes).
>
> Ideally we'd be able to handle
>
> struct { _Complex double c; double a; double b; } a[], b[];
>
> void foo ()
> {
>    for (int i = 0; i < 100; ++i)
>     {
>       a[i].c = b[i].c;
>       a[i].a = b[i].a;
>       a[i].b = b[i].b;
>     }
> }
>
> which I guess your patch doesn't handle with plain AVX vector
> copies but instead uses interleaving for the _Complex and non-_Complex
> parts?
Indeed, it produces wrong code.
>
> Let me spend some time fleshing out what is necessary to make
> this work "properly".  We can consider your special-casing of _Complex
> memory ops if I can't manage to assess the complexity of the task.
>
> Thanks,
> Richard.
>
> > >
> > > Thanks,
> > > Richard.
> > >
> > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > > Also tested the patch on SPEC2017 and found complex type vectorization
> > > > > in 510/549 (but no performance impact).
> > > >
> > > > Any comments?
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >         PR tree-optimization/106010
> > > >         * tree-vect-data-refs.cc (vect_get_data_access_cost):
> > > >         Pass complex_p to vect_get_num_copies to avoid ICE.
> > > >         (vect_analyze_data_refs): Support vectorization for Complex
> > > >         type with vector scalar types.
> > > >         * tree-vect-loop.cc (vect_determine_vf_for_stmt_1): VF should
> > > >         be half of TYPE_VECTOR_SUBPARTS when complex_p.
> > > >         * tree-vect-slp.cc (vect_record_max_nunits): nunits should be
> > > >         half of TYPE_VECTOR_SUBPARTS when complex_p.
> > > >         (vect_optimize_slp): Support permutation for complex type.
> > > >         (vect_slp_analyze_node_operations_1): Double nunits in
> > > >         vect_get_num_vectors to get right SLP_TREE_NUMBER_OF_VEC_STMTS
> > > >         when complex_p.
> > > >         (vect_slp_analyze_node_operations): Ditto.
> > > >         (vect_create_constant_vectors): Support CTOR for complex type.
> > > >         (vect_transform_slp_perm_load): Support permutation for
> > > >         complex type.
> > > >         * tree-vect-stmts.cc (vect_init_vector): Support complex type.
> > > >         (vect_get_vec_defs_for_operand): Get vector type for
> > > >         complex type.
> > > >         (vectorizable_store): Get right ncopies/nunits for complex
> > > >         type, also return false when complex_p and
> > > >         !TYPE_VECTOR_SUBPARTS.is_constant ().
> > > >         (vectorizable_load): Ditto.
> > > >         (vect_get_vector_types_for_stmt): Get vector type for complex type.
> > > >         * tree-vectorizer.h (STMT_VINFO_COMPLEX_P): New macro.
> > > >         (vect_get_num_copies): New overload.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > >         * gcc.target/i386/pr106010-1a.c: New test.
> > > >         * gcc.target/i386/pr106010-1b.c: New test.
> > > >         * gcc.target/i386/pr106010-1c.c: New test.
> > > >         * gcc.target/i386/pr106010-2a.c: New test.
> > > >         * gcc.target/i386/pr106010-2b.c: New test.
> > > >         * gcc.target/i386/pr106010-2c.c: New test.
> > > >         * gcc.target/i386/pr106010-3a.c: New test.
> > > >         * gcc.target/i386/pr106010-3b.c: New test.
> > > >         * gcc.target/i386/pr106010-3c.c: New test.
> > > >         * gcc.target/i386/pr106010-4a.c: New test.
> > > >         * gcc.target/i386/pr106010-4b.c: New test.
> > > >         * gcc.target/i386/pr106010-4c.c: New test.
> > > >         * gcc.target/i386/pr106010-5a.c: New test.
> > > >         * gcc.target/i386/pr106010-5b.c: New test.
> > > >         * gcc.target/i386/pr106010-5c.c: New test.
> > > >         * gcc.target/i386/pr106010-6a.c: New test.
> > > >         * gcc.target/i386/pr106010-6b.c: New test.
> > > >         * gcc.target/i386/pr106010-6c.c: New test.
> > > >         * gcc.target/i386/pr106010-7a.c: New test.
> > > >         * gcc.target/i386/pr106010-7b.c: New test.
> > > >         * gcc.target/i386/pr106010-7c.c: New test.
> > > >         * gcc.target/i386/pr106010-8a.c: New test.
> > > >         * gcc.target/i386/pr106010-8b.c: New test.
> > > >         * gcc.target/i386/pr106010-8c.c: New test.
> > > > ---
> > > >  gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 +++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 +++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 +++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 +++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 ++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 ++++++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 ++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 ++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 +++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 +++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 +++++++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 +++++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 +++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 +++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 +++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 ++++++
> > > >  gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
> > > >  gcc/tree-vect-data-refs.cc                  |  26 ++-
> > > >  gcc/tree-vect-loop.cc                       |   7 +-
> > > >  gcc/tree-vect-slp.cc                        | 174 +++++++++++++++-----
> > > >  gcc/tree-vect-stmts.cc                      | 135 ++++++++++++---
> > > >  gcc/tree-vectorizer.h                       |  13 ++
> > > >  29 files changed, 2064 insertions(+), 63 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > >
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > new file mode 100644
> > > > index 00000000000..b608f484934
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > @@ -0,0 +1,58 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> > > > +
> > > > +#define N 10000
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_pd (_Complex double* a, _Complex double* b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b[i];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ps (_Complex float* a, _Complex float* b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b[i];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi64 (_Complex long long* a, _Complex long long* b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b[i];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi32 (_Complex int* a, _Complex int* b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b[i];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi16 (_Complex short* a, _Complex short* b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b[i];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi8 (_Complex char* a, _Complex char* b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b[i];
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > new file mode 100644
> > > > index 00000000000..0f377c3a548
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > @@ -0,0 +1,63 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > +/* { dg-require-effective-target avx } */
> > > > +
> > > > +#include "avx-check.h"
> > > > +#include <string.h>
> > > > +#include "pr106010-1a.c"
> > > > +
> > > > +void
> > > > +avx_test (void)
> > > > +{
> > > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > > +
> > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > +
> > > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > > +    p_init[i] = i;
> > > > +
> > > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > > > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > > > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > > > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > > > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > > > +
> > > > +  foo_pd (pd_dst, pd_src);
> > > > +  foo_ps (ps_dst, ps_src);
> > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > > +    __builtin_abort ();
> > > > +
> > > > +  return;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > new file mode 100644
> > > > index 00000000000..f07e9fb2d3d
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > @@ -0,0 +1,41 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > +
> > > > +#include <string.h>
> > > > +
> > > > +static void do_test (void);
> > > > +
> > > > +#define DO_TEST do_test
> > > > +#define AVX512FP16
> > > > +#include "avx512-check.h"
> > > > +
> > > > +#define N 10000
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b[i];
> > > > +}
> > > > +
> > > > +static void
> > > > +do_test (void)
> > > > +{
> > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > > +
> > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > +
> > > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > > +    p_init[i] = i;
> > > > +
> > > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > > +
> > > > +  foo_ph (ph_dst, ph_src);
> > > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > > +    __builtin_abort ();
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > new file mode 100644
> > > > index 00000000000..d2e2f8d4f43
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > @@ -0,0 +1,82 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > +{
> > > > +  a[0] = b[0];
> > > > +  a[1] = b[1];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > +{
> > > > +  a[0] = b[0];
> > > > +  a[1] = b[1];
> > > > +  a[2] = b[2];
> > > > +  a[3] = b[3];
> > > > +
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > +{
> > > > +  a[0] = b[0];
> > > > +  a[1] = b[1];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > +{
> > > > +  a[0] = b[0];
> > > > +  a[1] = b[1];
> > > > +  a[2] = b[2];
> > > > +  a[3] = b[3];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > +{
> > > > +  a[0] = b[0];
> > > > +  a[1] = b[1];
> > > > +  a[2] = b[2];
> > > > +  a[3] = b[3];
> > > > +  a[4] = b[4];
> > > > +  a[5] = b[5];
> > > > +  a[6] = b[6];
> > > > +  a[7] = b[7];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > +{
> > > > +  a[0] = b[0];
> > > > +  a[1] = b[1];
> > > > +  a[2] = b[2];
> > > > +  a[3] = b[3];
> > > > +  a[4] = b[4];
> > > > +  a[5] = b[5];
> > > > +  a[6] = b[6];
> > > > +  a[7] = b[7];
> > > > +  a[8] = b[8];
> > > > +  a[9] = b[9];
> > > > +  a[10] = b[10];
> > > > +  a[11] = b[11];
> > > > +  a[12] = b[12];
> > > > +  a[13] = b[13];
> > > > +  a[14] = b[14];
> > > > +  a[15] = b[15];
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > new file mode 100644
> > > > index 00000000000..ac360752693
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > @@ -0,0 +1,62 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > +/* { dg-require-effective-target avx } */
> > > > +
> > > > +#include "avx-check.h"
> > > > +#include <string.h>
> > > > +#include "pr106010-2a.c"
> > > > +
> > > > +void
> > > > +avx_test (void)
> > > > +{
> > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > +  char* p = (char* ) malloc (32);
> > > > +
> > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > +
> > > > +  for (int i = 0; i != 32; i++)
> > > > +    p[i] = i;
> > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > +
> > > > +  foo_pd (pd_dst, pd_src);
> > > > +  foo_ps (ps_dst, ps_src);
> > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +
> > > > +  return;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > new file mode 100644
> > > > index 00000000000..a002f209ec9
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > @@ -0,0 +1,47 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > +
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > +
> > > > +#include <string.h>
> > > > +
> > > > +static void do_test (void);
> > > > +#define DO_TEST do_test
> > > > +#define AVX512FP16
> > > > +#include "avx512-check.h"
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > +{
> > > > +  a[0] = b[0];
> > > > +  a[1] = b[1];
> > > > +  a[2] = b[2];
> > > > +  a[3] = b[3];
> > > > +  a[4] = b[4];
> > > > +  a[5] = b[5];
> > > > +  a[6] = b[6];
> > > > +  a[7] = b[7];
> > > > +}
> > > > +
> > > > +void
> > > > +do_test (void)
> > > > +{
> > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > +  char* p = (char* ) malloc (32);
> > > > +
> > > > +   __builtin_memset (ph_dst, 0, 32);
> > > > +
> > > > +  for (int i = 0; i != 32; i++)
> > > > +    p[i] = i;
> > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > +
> > > > +  foo_ph (ph_dst, ph_src);
> > > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +
> > > > +  return;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > new file mode 100644
> > > > index 00000000000..c1b64b56b1c
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > @@ -0,0 +1,80 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > +{
> > > > +  a[0] = b[1];
> > > > +  a[1] = b[0];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > +{
> > > > +  a[0] = b[1];
> > > > +  a[1] = b[0];
> > > > +  a[2] = b[3];
> > > > +  a[3] = b[2];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > +{
> > > > +  a[0] = b[1];
> > > > +  a[1] = b[0];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > +{
> > > > +  a[0] = b[3];
> > > > +  a[1] = b[2];
> > > > +  a[2] = b[1];
> > > > +  a[3] = b[0];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > +{
> > > > +  a[0] = b[7];
> > > > +  a[1] = b[6];
> > > > +  a[2] = b[5];
> > > > +  a[3] = b[4];
> > > > +  a[4] = b[3];
> > > > +  a[5] = b[2];
> > > > +  a[6] = b[1];
> > > > +  a[7] = b[0];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > +{
> > > > +  a[0] = b[7];
> > > > +  a[1] = b[6];
> > > > +  a[2] = b[5];
> > > > +  a[3] = b[4];
> > > > +  a[4] = b[3];
> > > > +  a[5] = b[2];
> > > > +  a[6] = b[1];
> > > > +  a[7] = b[0];
> > > > +  a[8] = b[15];
> > > > +  a[9] = b[14];
> > > > +  a[10] = b[13];
> > > > +  a[11] = b[12];
> > > > +  a[12] = b[11];
> > > > +  a[13] = b[10];
> > > > +  a[14] = b[9];
> > > > +  a[15] = b[8];
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > new file mode 100644
> > > > index 00000000000..e4fa3f3a541
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > @@ -0,0 +1,126 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > +/* { dg-require-effective-target avx2 } */
> > > > +
> > > > +#include "avx2-check.h"
> > > > +#include <string.h>
> > > > +#include "pr106010-3a.c"
> > > > +
> > > > +void
> > > > +avx2_test (void)
> > > > +{
> > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > +  _Complex double* pd_exp = (_Complex double*) malloc (32);
> > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > +  _Complex float* ps_exp = (_Complex float*) malloc (32);
> > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (32);
> > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (32);
> > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (32);
> > > > +  char* p = (char* ) malloc (32);
> > > > +  char* q = (char* ) malloc (32);
> > > > +
> > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > +
> > > > +  for (int i = 0; i != 32; i++)
> > > > +    p[i] = i;
> > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > +
> > > > +  for (int i = 0; i != 16; i++)
> > > > +    {
> > > > +      p[i] = i + 16;
> > > > +      p[i + 16] = i;
> > > > +    }
> > > > +  __builtin_memcpy (pd_exp, p, 32);
> > > > +  __builtin_memcpy (epi64_exp, p, 32);
> > > > +
> > > > +  for (int i = 0; i != 8; i++)
> > > > +    {
> > > > +      p[i] = i + 8;
> > > > +      p[i + 8] = i;
> > > > +      p[i + 16] = i + 24;
> > > > +      p[i + 24] = i + 16;
> > > > +      q[i] = i + 24;
> > > > +      q[i + 8] = i + 16;
> > > > +      q[i + 16] = i + 8;
> > > > +      q[i + 24] = i;
> > > > +    }
> > > > +  __builtin_memcpy (ps_exp, p, 32);
> > > > +  __builtin_memcpy (epi32_exp, q, 32);
> > > > +
> > > > +
> > > > +  for (int i = 0; i != 4; i++)
> > > > +    {
> > > > +      q[i] = i + 28;
> > > > +      q[i + 4] = i + 24;
> > > > +      q[i + 8] = i + 20;
> > > > +      q[i + 12] = i + 16;
> > > > +      q[i + 16] = i + 12;
> > > > +      q[i + 20] = i + 8;
> > > > +      q[i + 24] = i + 4;
> > > > +      q[i + 28] = i;
> > > > +    }
> > > > +  __builtin_memcpy (epi16_exp, q, 32);
> > > > +
> > > > +  for (int i = 0; i != 2; i++)
> > > > +    {
> > > > +      q[i] = i + 14;
> > > > +      q[i + 2] = i + 12;
> > > > +      q[i + 4] = i + 10;
> > > > +      q[i + 6] = i + 8;
> > > > +      q[i + 8] = i + 6;
> > > > +      q[i + 10] = i + 4;
> > > > +      q[i + 12] = i + 2;
> > > > +      q[i + 14] = i;
> > > > +      q[i + 16] = i + 30;
> > > > +      q[i + 18] = i + 28;
> > > > +      q[i + 20] = i + 26;
> > > > +      q[i + 22] = i + 24;
> > > > +      q[i + 24] = i + 22;
> > > > +      q[i + 26] = i + 20;
> > > > +      q[i + 28] = i + 18;
> > > > +      q[i + 30] = i + 16;
> > > > +    }
> > > > +  __builtin_memcpy (epi8_exp, q, 32);
> > > > +
> > > > +  foo_pd (pd_dst, pd_src);
> > > > +  foo_ps (ps_dst, ps_src);
> > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +
> > > > +  return;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > new file mode 100644
> > > > index 00000000000..5a5a3d4b992
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > @@ -0,0 +1,69 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
> > > > +
> > > > +#include <string.h>
> > > > +
> > > > +static void do_test (void);
> > > > +#define DO_TEST do_test
> > > > +#define AVX512FP16
> > > > +#include "avx512-check.h"
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > +{
> > > > +  a[0] = b[1];
> > > > +  a[1] = b[0];
> > > > +  a[2] = b[4];
> > > > +  a[3] = b[3];
> > > > +  a[4] = b[7];
> > > > +  a[5] = b[6];
> > > > +  a[6] = b[2];
> > > > +  a[7] = b[5];
> > > > +}
> > > > +
> > > > +void
> > > > +do_test (void)
> > > > +{
> > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> > > > +  char* p = (char* ) malloc (32);
> > > > +  char* q = (char* ) malloc (32);
> > > > +
> > > > +  __builtin_memset (ph_dst, 0, 32);
> > > > +
> > > > +  for (int i = 0; i != 32; i++)
> > > > +    p[i] = i;
> > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > +
> > > > +  for (int i = 0; i != 4; i++)
> > > > +    {
> > > > +      p[i] = i + 4;
> > > > +      p[i + 4] = i;
> > > > +      p[i + 8] = i + 16;
> > > > +      p[i + 12] = i + 12;
> > > > +      p[i + 16] = i + 28;
> > > > +      p[i + 20] = i + 24;
> > > > +      p[i + 24] = i + 8;
> > > > +      p[i + 28] = i + 20;
> > > > +      q[i] = i + 28;
> > > > +      q[i + 4] = i + 24;
> > > > +      q[i + 8] = i + 20;
> > > > +      q[i + 12] = i + 16;
> > > > +      q[i + 16] = i + 12;
> > > > +      q[i + 20] = i + 8;
> > > > +      q[i + 24] = i + 4;
> > > > +      q[i + 28] = i;
> > > > +    }
> > > > +  __builtin_memcpy (ph_exp, p, 32);
> > > > +
> > > > +  foo_ph (ph_dst, ph_src);
> > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +
> > > > +  return;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > new file mode 100644
> > > > index 00000000000..b7b0b532bb1
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > @@ -0,0 +1,101 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_pd (_Complex double* a,
> > > > +       _Complex double b1,
> > > > +       _Complex double b2)
> > > > +{
> > > > +  a[0] = b1;
> > > > +  a[1] = b2;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ps (_Complex float* a,
> > > > +       _Complex float b1, _Complex float b2,
> > > > +       _Complex float b3, _Complex float b4)
> > > > +{
> > > > +  a[0] = b1;
> > > > +  a[1] = b2;
> > > > +  a[2] = b3;
> > > > +  a[3] = b4;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi64 (_Complex long long* a,
> > > > +          _Complex long long b1,
> > > > +          _Complex long long b2)
> > > > +{
> > > > +  a[0] = b1;
> > > > +  a[1] = b2;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi32 (_Complex int* a,
> > > > +          _Complex int b1, _Complex int b2,
> > > > +          _Complex int b3, _Complex int b4)
> > > > +{
> > > > +  a[0] = b1;
> > > > +  a[1] = b2;
> > > > +  a[2] = b3;
> > > > +  a[3] = b4;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi16 (_Complex short* a,
> > > > +          _Complex short b1, _Complex short b2,
> > > > +          _Complex short b3, _Complex short b4,
> > > > +          _Complex short b5, _Complex short b6,
> > > > +          _Complex short b7,_Complex short b8)
> > > > +{
> > > > +  a[0] = b1;
> > > > +  a[1] = b2;
> > > > +  a[2] = b3;
> > > > +  a[3] = b4;
> > > > +  a[4] = b5;
> > > > +  a[5] = b6;
> > > > +  a[6] = b7;
> > > > +  a[7] = b8;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi8 (_Complex char* a,
> > > > +         _Complex char b1, _Complex char b2,
> > > > +         _Complex char b3, _Complex char b4,
> > > > +         _Complex char b5, _Complex char b6,
> > > > +         _Complex char b7,_Complex char b8,
> > > > +         _Complex char b9, _Complex char b10,
> > > > +         _Complex char b11, _Complex char b12,
> > > > +         _Complex char b13, _Complex char b14,
> > > > +         _Complex char b15,_Complex char b16)
> > > > +{
> > > > +  a[0] = b1;
> > > > +  a[1] = b2;
> > > > +  a[2] = b3;
> > > > +  a[3] = b4;
> > > > +  a[4] = b5;
> > > > +  a[5] = b6;
> > > > +  a[6] = b7;
> > > > +  a[7] = b8;
> > > > +  a[8] = b9;
> > > > +  a[9] = b10;
> > > > +  a[10] = b11;
> > > > +  a[11] = b12;
> > > > +  a[12] = b13;
> > > > +  a[13] = b14;
> > > > +  a[14] = b15;
> > > > +  a[15] = b16;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > new file mode 100644
> > > > index 00000000000..e2e79508c4b
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > @@ -0,0 +1,67 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > +/* { dg-require-effective-target avx } */
> > > > +
> > > > +#include "avx-check.h"
> > > > +#include <string.h>
> > > > +#include "pr106010-4a.c"
> > > > +
> > > > +void
> > > > +avx_test (void)
> > > > +{
> > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > +  char* p = (char* ) malloc (32);
> > > > +
> > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > +
> > > > +  for (int i = 0; i != 32; i++)
> > > > +    p[i] = i;
> > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > +
> > > > +  foo_pd (pd_dst, pd_src[0], pd_src[1]);
> > > > +  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> > > > +  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> > > > +  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> > > > +  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> > > > +            epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> > > > +  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> > > > +           epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> > > > +           epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> > > > +           epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> > > > +
> > > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +
> > > > +  return;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > new file mode 100644
> > > > index 00000000000..8e02aefe3b5
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > @@ -0,0 +1,54 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> > > > +
> > > > +#include <string.h>
> > > > +
> > > > +static void do_test (void);
> > > > +#define DO_TEST do_test
> > > > +#define AVX512FP16
> > > > +#include "avx512-check.h"
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ph (_Complex _Float16* a,
> > > > +       _Complex _Float16 b1, _Complex _Float16 b2,
> > > > +       _Complex _Float16 b3, _Complex _Float16 b4,
> > > > +       _Complex _Float16 b5, _Complex _Float16 b6,
> > > > +       _Complex _Float16 b7,_Complex _Float16 b8)
> > > > +{
> > > > +  a[0] = b1;
> > > > +  a[1] = b2;
> > > > +  a[2] = b3;
> > > > +  a[3] = b4;
> > > > +  a[4] = b5;
> > > > +  a[5] = b6;
> > > > +  a[6] = b7;
> > > > +  a[7] = b8;
> > > > +}
> > > > +
> > > > +void
> > > > +do_test (void)
> > > > +{
> > > > +
> > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > +
> > > > +  char* p = (char* ) malloc (32);
> > > > +
> > > > +  __builtin_memset (ph_dst, 0, 32);
> > > > +
> > > > +  for (int i = 0; i != 32; i++)
> > > > +    p[i] = i;
> > > > +
> > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > +
> > > > +  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> > > > +         ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> > > > +
> > > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > > +    __builtin_abort ();
> > > > +  return;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > new file mode 100644
> > > > index 00000000000..9d4a6f9846b
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > @@ -0,0 +1,117 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > +{
> > > > +  a[0] = b[2];
> > > > +  a[1] = b[3];
> > > > +  a[2] = b[0];
> > > > +  a[3] = b[1];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > +{
> > > > +  a[0] = b[4];
> > > > +  a[1] = b[5];
> > > > +  a[2] = b[6];
> > > > +  a[3] = b[7];
> > > > +  a[4] = b[0];
> > > > +  a[5] = b[1];
> > > > +  a[6] = b[2];
> > > > +  a[7] = b[3];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > +{
> > > > +  a[0] = b[2];
> > > > +  a[1] = b[3];
> > > > +  a[2] = b[0];
> > > > +  a[3] = b[1];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > +{
> > > > +  a[0] = b[4];
> > > > +  a[1] = b[5];
> > > > +  a[2] = b[6];
> > > > +  a[3] = b[7];
> > > > +  a[4] = b[0];
> > > > +  a[5] = b[1];
> > > > +  a[6] = b[2];
> > > > +  a[7] = b[3];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > +{
> > > > +  a[0] = b[8];
> > > > +  a[1] = b[9];
> > > > +  a[2] = b[10];
> > > > +  a[3] = b[11];
> > > > +  a[4] = b[12];
> > > > +  a[5] = b[13];
> > > > +  a[6] = b[14];
> > > > +  a[7] = b[15];
> > > > +  a[8] = b[0];
> > > > +  a[9] = b[1];
> > > > +  a[10] = b[2];
> > > > +  a[11] = b[3];
> > > > +  a[12] = b[4];
> > > > +  a[13] = b[5];
> > > > +  a[14] = b[6];
> > > > +  a[15] = b[7];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > +{
> > > > +  a[0] = b[16];
> > > > +  a[1] = b[17];
> > > > +  a[2] = b[18];
> > > > +  a[3] = b[19];
> > > > +  a[4] = b[20];
> > > > +  a[5] = b[21];
> > > > +  a[6] = b[22];
> > > > +  a[7] = b[23];
> > > > +  a[8] = b[24];
> > > > +  a[9] = b[25];
> > > > +  a[10] = b[26];
> > > > +  a[11] = b[27];
> > > > +  a[12] = b[28];
> > > > +  a[13] = b[29];
> > > > +  a[14] = b[30];
> > > > +  a[15] = b[31];
> > > > +  a[16] = b[0];
> > > > +  a[17] = b[1];
> > > > +  a[18] = b[2];
> > > > +  a[19] = b[3];
> > > > +  a[20] = b[4];
> > > > +  a[21] = b[5];
> > > > +  a[22] = b[6];
> > > > +  a[23] = b[7];
> > > > +  a[24] = b[8];
> > > > +  a[25] = b[9];
> > > > +  a[26] = b[10];
> > > > +  a[27] = b[11];
> > > > +  a[28] = b[12];
> > > > +  a[29] = b[13];
> > > > +  a[30] = b[14];
> > > > +  a[31] = b[15];
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > new file mode 100644
> > > > index 00000000000..d5c6ebeb5cf
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > @@ -0,0 +1,80 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > +/* { dg-require-effective-target avx } */
> > > > +
> > > > +#include "avx-check.h"
> > > > +#include <string.h>
> > > > +#include "pr106010-5a.c"
> > > > +
> > > > +void
> > > > +avx_test (void)
> > > > +{
> > > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > > +  char* p = (char* ) malloc (64);
> > > > +  char* q = (char* ) malloc (64);
> > > > +
> > > > +  __builtin_memset (pd_dst, 0, 64);
> > > > +  __builtin_memset (ps_dst, 0, 64);
> > > > +  __builtin_memset (epi64_dst, 0, 64);
> > > > +  __builtin_memset (epi32_dst, 0, 64);
> > > > +  __builtin_memset (epi16_dst, 0, 64);
> > > > +  __builtin_memset (epi8_dst, 0, 64);
> > > > +
> > > > +  for (int i = 0; i != 64; i++)
> > > > +    {
> > > > +      p[i] = i;
> > > > +      q[i] = (i + 32) % 64;
> > > > +    }
> > > > +  __builtin_memcpy (pd_src, p, 64);
> > > > +  __builtin_memcpy (ps_src, p, 64);
> > > > +  __builtin_memcpy (epi64_src, p, 64);
> > > > +  __builtin_memcpy (epi32_src, p, 64);
> > > > +  __builtin_memcpy (epi16_src, p, 64);
> > > > +  __builtin_memcpy (epi8_src, p, 64);
> > > > +
> > > > +  __builtin_memcpy (pd_exp, q, 64);
> > > > +  __builtin_memcpy (ps_exp, q, 64);
> > > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > > +
> > > > +  foo_pd (pd_dst, pd_src);
> > > > +  foo_ps (ps_dst, ps_src);
> > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > +
> > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +
> > > > +  return;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > new file mode 100644
> > > > index 00000000000..9ce4e6dd5c0
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > @@ -0,0 +1,62 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> > > > +
> > > > +#include <string.h>
> > > > +
> > > > +static void do_test (void);
> > > > +#define DO_TEST do_test
> > > > +#define AVX512FP16
> > > > +#include "avx512-check.h"
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > +{
> > > > +  a[0] = b[8];
> > > > +  a[1] = b[9];
> > > > +  a[2] = b[10];
> > > > +  a[3] = b[11];
> > > > +  a[4] = b[12];
> > > > +  a[5] = b[13];
> > > > +  a[6] = b[14];
> > > > +  a[7] = b[15];
> > > > +  a[8] = b[0];
> > > > +  a[9] = b[1];
> > > > +  a[10] = b[2];
> > > > +  a[11] = b[3];
> > > > +  a[12] = b[4];
> > > > +  a[13] = b[5];
> > > > +  a[14] = b[6];
> > > > +  a[15] = b[7];
> > > > +}
> > > > +
> > > > +void
> > > > +do_test (void)
> > > > +{
> > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > > +  char* p = (char* ) malloc (64);
> > > > +  char* q = (char* ) malloc (64);
> > > > +
> > > > +  __builtin_memset (ph_dst, 0, 64);
> > > > +
> > > > +  for (int i = 0; i != 64; i++)
> > > > +    {
> > > > +      p[i] = i;
> > > > +      q[i] = (i + 32) % 64;
> > > > +    }
> > > > +  __builtin_memcpy (ph_src, p, 64);
> > > > +
> > > > +  __builtin_memcpy (ph_exp, q, 64);
> > > > +
> > > > +  foo_ph (ph_dst, ph_src);
> > > > +
> > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +
> > > > +  return;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > new file mode 100644
> > > > index 00000000000..65a90d03684
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > @@ -0,0 +1,115 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > +{
> > > > +  a[0] = b[3];
> > > > +  a[1] = b[2];
> > > > +  a[2] = b[1];
> > > > +  a[3] = b[0];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > +{
> > > > +  a[0] = b[7];
> > > > +  a[1] = b[6];
> > > > +  a[2] = b[5];
> > > > +  a[3] = b[4];
> > > > +  a[4] = b[3];
> > > > +  a[5] = b[2];
> > > > +  a[6] = b[1];
> > > > +  a[7] = b[0];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > +{
> > > > +  a[0] = b[3];
> > > > +  a[1] = b[2];
> > > > +  a[2] = b[1];
> > > > +  a[3] = b[0];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > +{
> > > > +  a[0] = b[7];
> > > > +  a[1] = b[6];
> > > > +  a[2] = b[5];
> > > > +  a[3] = b[4];
> > > > +  a[4] = b[3];
> > > > +  a[5] = b[2];
> > > > +  a[6] = b[1];
> > > > +  a[7] = b[0];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > +{
> > > > +  a[0] = b[15];
> > > > +  a[1] = b[14];
> > > > +  a[2] = b[13];
> > > > +  a[3] = b[12];
> > > > +  a[4] = b[11];
> > > > +  a[5] = b[10];
> > > > +  a[6] = b[9];
> > > > +  a[7] = b[8];
> > > > +  a[8] = b[7];
> > > > +  a[9] = b[6];
> > > > +  a[10] = b[5];
> > > > +  a[11] = b[4];
> > > > +  a[12] = b[3];
> > > > +  a[13] = b[2];
> > > > +  a[14] = b[1];
> > > > +  a[15] = b[0];
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > +{
> > > > +  a[0] = b[31];
> > > > +  a[1] = b[30];
> > > > +  a[2] = b[29];
> > > > +  a[3] = b[28];
> > > > +  a[4] = b[27];
> > > > +  a[5] = b[26];
> > > > +  a[6] = b[25];
> > > > +  a[7] = b[24];
> > > > +  a[8] = b[23];
> > > > +  a[9] = b[22];
> > > > +  a[10] = b[21];
> > > > +  a[11] = b[20];
> > > > +  a[12] = b[19];
> > > > +  a[13] = b[18];
> > > > +  a[14] = b[17];
> > > > +  a[15] = b[16];
> > > > +  a[16] = b[15];
> > > > +  a[17] = b[14];
> > > > +  a[18] = b[13];
> > > > +  a[19] = b[12];
> > > > +  a[20] = b[11];
> > > > +  a[21] = b[10];
> > > > +  a[22] = b[9];
> > > > +  a[23] = b[8];
> > > > +  a[24] = b[7];
> > > > +  a[25] = b[6];
> > > > +  a[26] = b[5];
> > > > +  a[27] = b[4];
> > > > +  a[28] = b[3];
> > > > +  a[29] = b[2];
> > > > +  a[30] = b[1];
> > > > +  a[31] = b[0];
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > new file mode 100644
> > > > index 00000000000..1c5bb020939
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > @@ -0,0 +1,157 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > +/* { dg-require-effective-target avx2 } */
> > > > +
> > > > +#include "avx2-check.h"
> > > > +#include <string.h>
> > > > +#include "pr106010-6a.c"
> > > > +
> > > > +void
> > > > +avx2_test (void)
> > > > +{
> > > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > > +  char* p = (char* ) malloc (64);
> > > > +  char* q = (char* ) malloc (64);
> > > > +
> > > > +  __builtin_memset (pd_dst, 0, 64);
> > > > +  __builtin_memset (ps_dst, 0, 64);
> > > > +  __builtin_memset (epi64_dst, 0, 64);
> > > > +  __builtin_memset (epi32_dst, 0, 64);
> > > > +  __builtin_memset (epi16_dst, 0, 64);
> > > > +  __builtin_memset (epi8_dst, 0, 64);
> > > > +
> > > > +  for (int i = 0; i != 64; i++)
> > > > +    p[i] = i;
> > > > +
> > > > +  __builtin_memcpy (pd_src, p, 64);
> > > > +  __builtin_memcpy (ps_src, p, 64);
> > > > +  __builtin_memcpy (epi64_src, p, 64);
> > > > +  __builtin_memcpy (epi32_src, p, 64);
> > > > +  __builtin_memcpy (epi16_src, p, 64);
> > > > +  __builtin_memcpy (epi8_src, p, 64);
> > > > +
> > > > +
> > > > +  for (int i = 0; i != 16; i++)
> > > > +    {
> > > > +      q[i] = i + 48;
> > > > +      q[i + 16] = i + 32;
> > > > +      q[i + 32] = i + 16;
> > > > +      q[i + 48] = i;
> > > > +    }
> > > > +
> > > > +  __builtin_memcpy (pd_exp, q, 64);
> > > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > > +
> > > > +   for (int i = 0; i != 8; i++)
> > > > +    {
> > > > +      q[i] = i + 56;
> > > > +      q[i + 8] = i + 48;
> > > > +      q[i + 16] = i + 40;
> > > > +      q[i + 24] = i + 32;
> > > > +      q[i + 32] = i + 24;
> > > > +      q[i + 40] = i + 16;
> > > > +      q[i + 48] = i + 8;
> > > > +      q[i + 56] = i;
> > > > +    }
> > > > +
> > > > +  __builtin_memcpy (ps_exp, q, 64);
> > > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > > +
> > > > +  for (int i = 0; i != 4; i++)
> > > > +    {
> > > > +      q[i] = i + 60;
> > > > +      q[i + 4] = i + 56;
> > > > +      q[i + 8] = i + 52;
> > > > +      q[i + 12] = i + 48;
> > > > +      q[i + 16] = i + 44;
> > > > +      q[i + 20] = i + 40;
> > > > +      q[i + 24] = i + 36;
> > > > +      q[i + 28] = i + 32;
> > > > +      q[i + 32] = i + 28;
> > > > +      q[i + 36] = i + 24;
> > > > +      q[i + 40] = i + 20;
> > > > +      q[i + 44] = i + 16;
> > > > +      q[i + 48] = i + 12;
> > > > +      q[i + 52] = i + 8;
> > > > +      q[i + 56] = i + 4;
> > > > +      q[i + 60] = i;
> > > > +    }
> > > > +
> > > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > > +
> > > > +  for (int i = 0; i != 2; i++)
> > > > +    {
> > > > +      q[i] = i + 62;
> > > > +      q[i + 2] = i + 60;
> > > > +      q[i + 4] = i + 58;
> > > > +      q[i + 6] = i + 56;
> > > > +      q[i + 8] = i + 54;
> > > > +      q[i + 10] = i + 52;
> > > > +      q[i + 12] = i + 50;
> > > > +      q[i + 14] = i + 48;
> > > > +      q[i + 16] = i + 46;
> > > > +      q[i + 18] = i + 44;
> > > > +      q[i + 20] = i + 42;
> > > > +      q[i + 22] = i + 40;
> > > > +      q[i + 24] = i + 38;
> > > > +      q[i + 26] = i + 36;
> > > > +      q[i + 28] = i + 34;
> > > > +      q[i + 30] = i + 32;
> > > > +      q[i + 32] = i + 30;
> > > > +      q[i + 34] = i + 28;
> > > > +      q[i + 36] = i + 26;
> > > > +      q[i + 38] = i + 24;
> > > > +      q[i + 40] = i + 22;
> > > > +      q[i + 42] = i + 20;
> > > > +      q[i + 44] = i + 18;
> > > > +      q[i + 46] = i + 16;
> > > > +      q[i + 48] = i + 14;
> > > > +      q[i + 50] = i + 12;
> > > > +      q[i + 52] = i + 10;
> > > > +      q[i + 54] = i + 8;
> > > > +      q[i + 56] = i + 6;
> > > > +      q[i + 58] = i + 4;
> > > > +      q[i + 60] = i + 2;
> > > > +      q[i + 62] = i;
> > > > +    }
> > > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > > +
> > > > +  foo_pd (pd_dst, pd_src);
> > > > +  foo_ps (ps_dst, ps_src);
> > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > +
> > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +
> > > > +  return;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > new file mode 100644
> > > > index 00000000000..b859d884a7f
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > @@ -0,0 +1,80 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> > > > +
> > > > +#include <string.h>
> > > > +
> > > > +static void do_test (void);
> > > > +#define DO_TEST do_test
> > > > +#define AVX512FP16
> > > > +#include "avx512-check.h"
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > +{
> > > > +  a[0] = b[15];
> > > > +  a[1] = b[14];
> > > > +  a[2] = b[13];
> > > > +  a[3] = b[12];
> > > > +  a[4] = b[11];
> > > > +  a[5] = b[10];
> > > > +  a[6] = b[9];
> > > > +  a[7] = b[8];
> > > > +  a[8] = b[7];
> > > > +  a[9] = b[6];
> > > > +  a[10] = b[5];
> > > > +  a[11] = b[4];
> > > > +  a[12] = b[3];
> > > > +  a[13] = b[2];
> > > > +  a[14] = b[1];
> > > > +  a[15] = b[0];
> > > > +}
> > > > +
> > > > +void
> > > > +do_test (void)
> > > > +{
> > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > > +  char* p = (char* ) malloc (64);
> > > > +  char* q = (char* ) malloc (64);
> > > > +
> > > > +  __builtin_memset (ph_dst, 0, 64);
> > > > +
> > > > +  for (int i = 0; i != 64; i++)
> > > > +    p[i] = i;
> > > > +
> > > > +  __builtin_memcpy (ph_src, p, 64);
> > > > +
> > > > +  for (int i = 0; i != 4; i++)
> > > > +    {
> > > > +      q[i] = i + 60;
> > > > +      q[i + 4] = i + 56;
> > > > +      q[i + 8] = i + 52;
> > > > +      q[i + 12] = i + 48;
> > > > +      q[i + 16] = i + 44;
> > > > +      q[i + 20] = i + 40;
> > > > +      q[i + 24] = i + 36;
> > > > +      q[i + 28] = i + 32;
> > > > +      q[i + 32] = i + 28;
> > > > +      q[i + 36] = i + 24;
> > > > +      q[i + 40] = i + 20;
> > > > +      q[i + 44] = i + 16;
> > > > +      q[i + 48] = i + 12;
> > > > +      q[i + 52] = i + 8;
> > > > +      q[i + 56] = i + 4;
> > > > +      q[i + 60] = i;
> > > > +    }
> > > > +
> > > > +  __builtin_memcpy (ph_exp, q, 64);
> > > > +
> > > > +  foo_ph (ph_dst, ph_src);
> > > > +
> > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > > +    __builtin_abort ();
> > > > +
> > > > +  return;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > new file mode 100644
> > > > index 00000000000..2ea01fac927
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > @@ -0,0 +1,58 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > > +
> > > > +#define N 10000
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_pd (_Complex double* a, _Complex double b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ps (_Complex float* a, _Complex float b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi64 (_Complex long long* a, _Complex long long b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi32 (_Complex int* a, _Complex int b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi16 (_Complex short* a, _Complex short b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi8 (_Complex char* a, _Complex char b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > new file mode 100644
> > > > index 00000000000..26482cc10f5
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > @@ -0,0 +1,63 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > +/* { dg-require-effective-target avx } */
> > > > +
> > > > +#include "avx-check.h"
> > > > +#include <string.h>
> > > > +#include "pr106010-7a.c"
> > > > +
> > > > +void
> > > > +avx_test (void)
> > > > +{
> > > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > > +
> > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > +
> > > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > > +    p_init[i] = i % 2 + 3;
> > > > +
> > > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > > +  memcpy (ps_dst, p_init, 2 * N * sizeof (float));
> > > > +  memcpy (epi64_dst, p_init, 2 * N * sizeof (long long));
> > > > +  memcpy (epi32_dst, p_init, 2 * N * sizeof (int));
> > > > +  memcpy (epi16_dst, p_init, 2 * N * sizeof (short));
> > > > +  memcpy (epi8_dst, p_init, 2 * N * sizeof (char));
> > > > +
> > > > +  foo_pd (pd_dst, pd_src[0]);
> > > > +  foo_ps (ps_dst, ps_src[0]);
> > > > +  foo_epi64 (epi64_dst, epi64_src[0]);
> > > > +  foo_epi32 (epi32_dst, epi32_src[0]);
> > > > +  foo_epi16 (epi16_dst, epi16_src[0]);
> > > > +  foo_epi8 (epi8_dst, epi8_src[0]);
> > > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > > +    __builtin_abort ();
> > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > > +    __builtin_abort ();
> > > > +
> > > > +  return;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > new file mode 100644
> > > > index 00000000000..7f4056a5ecc
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > @@ -0,0 +1,41 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > +
> > > > +#include <string.h>
> > > > +
> > > > +static void do_test (void);
> > > > +
> > > > +#define DO_TEST do_test
> > > > +#define AVX512FP16
> > > > +#include "avx512-check.h"
> > > > +
> > > > +#define N 10000
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = b;
> > > > +}
> > > > +
> > > > +static void
> > > > +do_test (void)
> > > > +{
> > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > > +
> > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > +
> > > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > > +    p_init[i] = i % 2 + 3;
> > > > +
> > > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > > +
> > > > +  foo_ph (ph_dst, ph_src[0]);
> > > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > > +    __builtin_abort ();
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > new file mode 100644
> > > > index 00000000000..11054b60d30
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > @@ -0,0 +1,58 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > > +
> > > > +#define N 10000
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_pd (_Complex double* a)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = 1.0 + 2.0i;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ps (_Complex float* a)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = 1.0f + 2.0fi;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi64 (_Complex long long* a)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = 1 + 2i;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi32 (_Complex int* a)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = 1 + 2i;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi16 (_Complex short* a)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = 1 + 2i;
> > > > +}
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_epi8 (_Complex char* a)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = 1 + 2i;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > new file mode 100644
> > > > index 00000000000..6bb0073b691
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > @@ -0,0 +1,53 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > +/* { dg-require-effective-target avx } */
> > > > +
> > > > +#include "avx-check.h"
> > > > +#include <string.h>
> > > > +#include "pr106010-8a.c"
> > > > +
> > > > +void
> > > > +avx_test (void)
> > > > +{
> > > > +  _Complex double pd_src = 1.0 + 2.0i;
> > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > +  _Complex float ps_src = 1.0 + 2.0i;
> > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > +  _Complex long long epi64_src = 1 + 2i;;
> > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > +  _Complex int epi32_src = 1 + 2i;
> > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > +  _Complex short epi16_src = 1 + 2i;
> > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > +  _Complex char epi8_src = 1 + 2i;
> > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > +
> > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > +
> > > > +  foo_pd (pd_dst);
> > > > +  foo_ps (ps_dst);
> > > > +  foo_epi64 (epi64_dst);
> > > > +  foo_epi32 (epi32_dst);
> > > > +  foo_epi16 (epi16_dst);
> > > > +  foo_epi8 (epi8_dst);
> > > > +  for (int i = 0 ; i != N; i++)
> > > > +    {
> > > > +      if (pd_dst[i] != pd_src)
> > > > +       __builtin_abort ();
> > > > +      if (ps_dst[i] != ps_src)
> > > > +       __builtin_abort ();
> > > > +      if (epi64_dst[i] != epi64_src)
> > > > +       __builtin_abort ();
> > > > +      if (epi32_dst[i] != epi32_src)
> > > > +       __builtin_abort ();
> > > > +      if (epi16_dst[i] != epi16_src)
> > > > +       __builtin_abort ();
> > > > +      if (epi8_dst[i] != epi8_src)
> > > > +       __builtin_abort ();
> > > > +    }
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > new file mode 100644
> > > > index 00000000000..61ae131829d
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > @@ -0,0 +1,38 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > +
> > > > +#include <string.h>
> > > > +
> > > > +static void do_test (void);
> > > > +
> > > > +#define DO_TEST do_test
> > > > +#define AVX512FP16
> > > > +#include "avx512-check.h"
> > > > +
> > > > +#define N 10000
> > > > +
> > > > +void
> > > > +__attribute__((noipa))
> > > > +foo_ph (_Complex _Float16* a)
> > > > +{
> > > > +  for (int i = 0; i != N; i++)
> > > > +    a[i] = 1.0f16 + 2.0f16i;
> > > > +}
> > > > +
> > > > +static void
> > > > +do_test (void)
> > > > +{
> > > > +  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > +
> > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > +
> > > > +  foo_ph (ph_dst);
> > > > +  for (int i = 0; i != N; i++)
> > > > +    {
> > > > +      if (ph_dst[i] != ph_src)
> > > > +       __builtin_abort ();
> > > > +    }
> > > > +}
> > > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > > > index d20a10a1524..42ee9df674c 100644
> > > > --- a/gcc/tree-vect-data-refs.cc
> > > > +++ b/gcc/tree-vect-data-refs.cc
> > > > @@ -1403,7 +1403,8 @@ vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
> > > >    if (PURE_SLP_STMT (stmt_info))
> > > >      ncopies = 1;
> > > >    else
> > > > -    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
> > > > +    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info),
> > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > >
> > > >    if (DR_IS_READ (dr_info->dr))
> > > >      vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> > > > @@ -4597,8 +4598,22 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > > >
> > > >        /* Set vectype for STMT.  */
> > > >        scalar_type = TREE_TYPE (DR_REF (dr));
> > > > -      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
> > > > -      if (!vectype)
> > > > +      tree adjust_scalar_type = scalar_type;
> > > > +      /* Support complex type accesses.  Note that gather/scatter is not
> > > > +        supported for complex-type loads/stores.  */
> > > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE
> > > > +         && gatherscatter == SG_NONE)
> > > > +       {
> > > > +         adjust_scalar_type = TREE_TYPE (scalar_type);
> > > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > +       }
> > > > +      tree vectype = get_vectype_for_scalar_type (vinfo, adjust_scalar_type);
> > > > +      unsigned HOST_WIDE_INT constant_nunits;
> > > > +      if (!vectype
> > > > +         /* For complex type, V1DI doesn't make sense.  */
> > > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > +             && (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&constant_nunits)
> > > > +                 || constant_nunits == 1)))
> > > >          {
> > > >            if (dump_enabled_p ())
> > > >              {
> > > > @@ -4635,8 +4650,11 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > > >         }
> > > >
> > > >        /* Adjust the minimal vectorization factor according to the
> > > > -        vector type.  */
> > > > +        vector type.  Note that for complex types, VF is half of
> > > > +        TYPE_VECTOR_SUBPARTS.  */
> > > >        vf = TYPE_VECTOR_SUBPARTS (vectype);
> > > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > +       vf = exact_div (vf, 2);
> > > >        *min_vf = upper_bound (*min_vf, vf);
> > > >
> > > >        /* Leave the BB vectorizer to pick the vector type later, based on
> > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > > index 3a70c15b593..365fa738022 100644
> > > > --- a/gcc/tree-vect-loop.cc
> > > > +++ b/gcc/tree-vect-loop.cc
> > > > @@ -200,7 +200,12 @@ vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
> > > >      }
> > > >
> > > >    if (nunits_vectype)
> > > > -    vect_update_max_nunits (vf, nunits_vectype);
> > > > +    {
> > > > +      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (nunits_vectype);
> > > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > +       nunits = exact_div (nunits, 2);
> > > > +      vect_update_max_nunits (vf, nunits);
> > > > +    }
> > > >
> > > >    return opt_result::success ();
> > > >  }
> > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > > > index dab5daddcc5..5d66ea2f286 100644
> > > > --- a/gcc/tree-vect-slp.cc
> > > > +++ b/gcc/tree-vect-slp.cc
> > > > @@ -877,10 +877,14 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > > >        return false;
> > > >      }
> > > >
> > > > +  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > +    nunits = exact_div (nunits, 2);
> > > > +
> > > >    /* If populating the vector type requires unrolling then fail
> > > >       before adjusting *max_nunits for basic-block vectorization.  */
> > > >    if (is_a <bb_vec_info> (vinfo)
> > > > -      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> > > > +      && !multiple_p (group_size , nunits))
> > > >      {
> > > >        if (dump_enabled_p ())
> > > >         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > @@ -891,7 +895,7 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > > >      }
> > > >
> > > >    /* In case of multiple types we need to detect the smallest type.  */
> > > > -  vect_update_max_nunits (max_nunits, vectype);
> > > > +  vect_update_max_nunits (max_nunits, nunits);
> > > >    return true;
> > > >  }
> > > >
> > > > @@ -3720,22 +3724,54 @@ vect_optimize_slp (vec_info *vinfo)
> > > >          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
> > > >          when permuting constants and invariants keeping the permute
> > > >          bijective.  */
> > > > -      auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > > -      bitmap_clear (load_index);
> > > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > -       bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > > -      unsigned j;
> > > > -      for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > -       if (!bitmap_bit_p (load_index, j))
> > > > -         break;
> > > > -      if (j != SLP_TREE_LANES (node))
> > > > -       continue;
> > > > +      /* Permutation of Complex type.  */
> > > > +      if (STMT_VINFO_COMPLEX_P (dr_stmt))
> > > > +       {
> > > > +         auto_sbitmap load_index (SLP_TREE_LANES (node) * 2);
> > > > +         bitmap_clear (load_index);
> > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > +           {
> > > > +             unsigned bit = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > +             bitmap_set_bit (load_index, 2 * bit);
> > > > +             bitmap_set_bit (load_index, 2 * bit + 1);
> > > > +           }
> > > > +         unsigned j;
> > > > +         for (j = 0; j < SLP_TREE_LANES (node) * 2; ++j)
> > > > +           if (!bitmap_bit_p (load_index, j))
> > > > +             break;
> > > > +         if (j != SLP_TREE_LANES (node) * 2)
> > > > +           continue;
> > > >
> > > > -      vec<unsigned> perm = vNULL;
> > > > -      perm.safe_grow (SLP_TREE_LANES (node), true);
> > > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > -       perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > -      perms.safe_push (perm);
> > > > +         vec<unsigned> perm = vNULL;
> > > > +         perm.safe_grow (SLP_TREE_LANES (node) * 2, true);
> > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > +           {
> > > > +             unsigned cidx = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > +             perm[2 * j] = 2 * cidx;
> > > > +             perm[2 * j + 1] = 2 * cidx + 1;
> > > > +           }
> > > > +         perms.safe_push (perm);
> > > > +       }
> > > > +      else
> > > > +       {
> > > > +         auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > > +         bitmap_clear (load_index);
> > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > +           bitmap_set_bit (load_index,
> > > > +                           SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > > +         unsigned j;
> > > > +         for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > +           if (!bitmap_bit_p (load_index, j))
> > > > +             break;
> > > > +         if (j != SLP_TREE_LANES (node))
> > > > +           continue;
> > > > +
> > > > +         vec<unsigned> perm = vNULL;
> > > > +         perm.safe_grow (SLP_TREE_LANES (node), true);
> > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > +           perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > +         perms.safe_push (perm);
> > > > +       }
> > > >        vertices[idx].perm_in = perms.length () - 1;
> > > >        vertices[idx].perm_out = perms.length () - 1;
> > > >      }
> > > > @@ -4518,6 +4554,12 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
> > > >         vf = loop_vinfo->vectorization_factor;
> > > >        else
> > > >         vf = 1;
> > > > > +      /* For complex type and SLP, double vf to get the right vectype,
> > > > > +        i.e. vector(4) double for complex double: group size is 2, so
> > > > > +        double vf to map vf * group_size to TYPE_VECTOR_SUBPARTS.  */
> > > > +     if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > +       vf *= 2;
> > > > +
> > > >        unsigned int group_size = SLP_TREE_LANES (node);
> > > >        tree vectype = SLP_TREE_VECTYPE (node);
> > > >        SLP_TREE_NUMBER_OF_VEC_STMTS (node)
> > > > @@ -4763,10 +4805,17 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
> > > >             }
> > > >           unsigned group_size = SLP_TREE_LANES (child);
> > > >           poly_uint64 vf = 1;
> > > > +
> > > >           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
> > > >             vf = loop_vinfo->vectorization_factor;
> > > > +
> > > > > +         /* V2SF is just 1 complex type, so multiply by 2
> > > > > +            to get the real number of vectors.  */
> > > > +         unsigned cp
> > > > +           = STMT_VINFO_COMPLEX_P (SLP_TREE_REPRESENTATIVE (node)) ? 2 : 1;
> > > > +
> > > >           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
> > > > -           = vect_get_num_vectors (vf * group_size, vector_type);
> > > > +           = vect_get_num_vectors (vf * group_size * cp, vector_type);
> > > >           /* And cost them.  */
> > > >           vect_prologue_cost_for_slp (child, cost_vec);
> > > >         }
> > > > @@ -6402,6 +6451,11 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > >
> > > >    /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
> > > >    vector_type = SLP_TREE_VECTYPE (op_node);
> > > > +  unsigned int cp = 1;
> > > > +  /* Handle Complex type vector init.
> > > > +     SLP_TREE_REPRESENTATIVE (op_node) could be NULL.  */
> > > > +  if (TREE_CODE (TREE_TYPE (op_node->ops[0])) == COMPLEX_TYPE)
> > > > +    cp = 2;
> > > >
> > > >    unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
> > > >    SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
> > > > @@ -6426,9 +6480,9 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > >    /* When using duplicate_and_interleave, we just need one element for
> > > >       each scalar statement.  */
> > > >    if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
> > > > -    nunits = group_size;
> > > > +    nunits = group_size * cp;
> > > >
> > > > -  number_of_copies = nunits * number_of_vectors / group_size;
> > > > +  number_of_copies = nunits * number_of_vectors / (group_size * cp);
> > > >
> > > >    number_of_places_left_in_vector = nunits;
> > > >    constant_p = true;
> > > > @@ -6460,8 +6514,23 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > >                         gcc_unreachable ();
> > > >                     }
> > > >                   else
> > > > -                   op = fold_unary (VIEW_CONVERT_EXPR,
> > > > -                                    TREE_TYPE (vector_type), op);
> > > > +                   {
> > > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > > +                     /* For complex type, insert real and imag part
> > > > +                        separately.  */
> > > > +                     if (cp == 2)
> > > > +                       {
> > > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > > +                                      == COMPLEX_TYPE)
> > > > +                                     && (scalar_type
> > > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > > +                         elts[number_of_places_left_in_vector--]
> > > > +                           = fold_unary (IMAGPART_EXPR, scalar_type, op);
> > > > +                         op = fold_unary (REALPART_EXPR, scalar_type, op);
> > > > +                       }
> > > > +                     else
> > > > +                       op = fold_unary (VIEW_CONVERT_EXPR, scalar_type, op);
> > > > +                   }
> > > >                   gcc_assert (op && CONSTANT_CLASS_P (op));
> > > >                 }
> > > >               else
> > > > @@ -6481,11 +6550,28 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > >                     }
> > > >                   else
> > > >                     {
> > > > -                     op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
> > > > -                                  op);
> > > > -                     init_stmt
> > > > -                       = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > > -                                              op);
> > > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > > +                     if (cp == 2)
> > > > +                       {
> > > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > > +                                      == COMPLEX_TYPE)
> > > > +                                     && (scalar_type
> > > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > > +                         tree imag = build1 (IMAGPART_EXPR, scalar_type, op);
> > > > +                         op = build1 (REALPART_EXPR, scalar_type, op);
> > > > +                         tree imag_temp = make_ssa_name (scalar_type);
> > > > +                         elts[number_of_places_left_in_vector--] = imag_temp;
> > > > +                         init_stmt = gimple_build_assign (imag_temp, imag);
> > > > +                         gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > > +                         init_stmt = gimple_build_assign (new_temp, op);
> > > > +                       }
> > > > +                     else
> > > > +                       {
> > > > +                         op = build1 (VIEW_CONVERT_EXPR, scalar_type, op);
> > > > +                         init_stmt
> > > > +                           = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > > +                                                  op);
> > > > +                       }
> > > >                     }
> > > >                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > >                   op = new_temp;
> > > > @@ -6696,15 +6782,17 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > >    unsigned int nelts_to_build;
> > > >    unsigned int nvectors_per_build;
> > > >    unsigned int in_nlanes;
> > > > +  unsigned int cp = STMT_VINFO_COMPLEX_P (stmt_info) ? 2 : 1;
> > > >    bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
> > > > -                     && multiple_p (nunits, group_size));
> > > > +                     && multiple_p (nunits, group_size * cp));
> > > >    if (repeating_p)
> > > >      {
> > > >        /* A single vector contains a whole number of copies of the node, so:
> > > >          (a) all permutes can use the same mask; and
> > > >          (b) the permutes only need a single vector input.  */
> > > > -      mask.new_vector (nunits, group_size, 3);
> > > > -      nelts_to_build = mask.encoded_nelts ();
> > > > +      /* For complex type, mask size should be double of nelts_to_build.  */
> > > > +      mask.new_vector (nunits, group_size * cp, 3);
> > > > +      nelts_to_build = mask.encoded_nelts () / cp;
> > > >        nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
> > > >        in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
> > > >      }
> > > > @@ -6744,8 +6832,8 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > >         {
> > > >           /* Enforced before the loop when !repeating_p.  */
> > > >           unsigned int const_nunits = nunits.to_constant ();
> > > > -         vec_index = i / const_nunits;
> > > > -         mask_element = i % const_nunits;
> > > > +         vec_index = i / (const_nunits / cp);
> > > > +         mask_element = i % (const_nunits / cp);
> > > >           if (vec_index == first_vec_index
> > > >               || first_vec_index == -1)
> > > >             {
> > > > @@ -6755,7 +6843,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > >                    || second_vec_index == -1)
> > > >             {
> > > >               second_vec_index = vec_index;
> > > > -             mask_element += const_nunits;
> > > > +             mask_element += (const_nunits / cp);
> > > >             }
> > > >           else
> > > >             {
> > > > @@ -6768,14 +6856,24 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > >               return false;
> > > >             }
> > > >
> > > > -         gcc_assert (mask_element < 2 * const_nunits);
> > > > +         gcc_assert (mask_element < 2 * const_nunits / cp);
> > > >         }
> > > >
> > > >        if (mask_element != index)
> > > >         noop_p = false;
> > > > -      mask[index++] = mask_element;
> > > > +      /* Set index for Complex _type.
> > > > +        i.e. mask like [1,0] is actually [2, 3, 0, 1]
> > > > +        for vector scalar type.  */
> > > > +      if (cp == 2)
> > > > +       {
> > > > +         mask[2 * index] = 2 * mask_element;
> > > > +         mask[2 * index + 1] = 2 * mask_element + 1;
> > > > +       }
> > > > +      else
> > > > +       mask[index] = mask_element;
> > > > +      index++;
> > > >
> > > > -      if (index == count && !noop_p)
> > > > +      if (index * cp == count && !noop_p)
> > > >         {
> > > >           indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
> > > >           if (!can_vec_perm_const_p (mode, mode, indices))
> > > > @@ -6799,7 +6897,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > >           ++*n_perms;
> > > >         }
> > > >
> > > > -      if (index == count)
> > > > +      if (index * cp == count)
> > > >         {
> > > >           if (!analyze_only)
> > > >             {
> > > > @@ -6869,7 +6967,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > >           bool load_seen = false;
> > > >           for (unsigned i = 0; i < in_nlanes; ++i)
> > > >             {
> > > > -             if (i % const_nunits == 0)
> > > > +             if (i % (const_nunits * cp) == 0)
> > > >                 {
> > > >                   if (load_seen)
> > > >                     *n_loads += 1;
> > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > > index 72107afc883..8af3b558be4 100644
> > > > --- a/gcc/tree-vect-stmts.cc
> > > > +++ b/gcc/tree-vect-stmts.cc
> > > > @@ -1397,25 +1397,70 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > > >  {
> > > >    gimple *init_stmt;
> > > >    tree new_temp;
> > > > +  tree scalar_type = TREE_TYPE (type);
> > > > +  gimple_seq stmts = NULL;
> > > > +
> > > > +  if (TREE_CODE (TREE_TYPE (val)) == COMPLEX_TYPE)
> > > > +    {
> > > > +      unsigned HOST_WIDE_INT nunits;
> > > > +      gcc_assert (TYPE_VECTOR_SUBPARTS (type).is_constant (&nunits));
> > > >
> > > > +      tree_vector_builder elts (type, nunits, 1);
> > > > +      tree imag, real;
> > > > +      if (TREE_CODE (val) == COMPLEX_CST)
> > > > +       {
> > > > +         real = fold_unary (REALPART_EXPR, scalar_type, val);
> > > > +         imag = fold_unary (IMAGPART_EXPR, scalar_type, val);
> > > > +       }
> > > > +      else
> > > > +       {
> > > > +         real = make_ssa_name (scalar_type);
> > > > +         imag = make_ssa_name (scalar_type);
> > > > +         init_stmt
> > > > +           = gimple_build_assign (real,
> > > > +                                  build1 (REALPART_EXPR, scalar_type, val));
> > > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > > +         init_stmt
> > > > +           = gimple_build_assign (imag,
> > > > +                                  build1 (IMAGPART_EXPR, scalar_type, val));
> > > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > > +       }
> > > > +
> > > > +      /* Build vector as [real,imag,real,imag,...].  */
> > > > +      for (unsigned i = 0; i != nunits; i++)
> > > > +       {
> > > > +         if (i % 2)
> > > > +           elts.quick_push (imag);
> > > > +         else
> > > > +           elts.quick_push (real);
> > > > +       }
> > > > +      val = gimple_build_vector (&stmts, &elts);
> > > > +      if (!gimple_seq_empty_p (stmts))
> > > > +       {
> > > > +         if (gsi)
> > > > +           gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > > > +         else
> > > > +           vinfo->insert_seq_on_entry (stmt_info, stmts);
> > > > +       }
> > > > +    }
> > > >    /* We abuse this function to push sth to a SSA name with initial 'val'.  */
> > > > -  if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > > +  else if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > >      {
> > > >        gcc_assert (TREE_CODE (type) == VECTOR_TYPE);
> > > > -      if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
> > > > +      if (! types_compatible_p (scalar_type, TREE_TYPE (val)))
> > > >         {
> > > >           /* Scalar boolean value should be transformed into
> > > >              all zeros or all ones value before building a vector.  */
> > > >           if (VECTOR_BOOLEAN_TYPE_P (type))
> > > >             {
> > > > -             tree true_val = build_all_ones_cst (TREE_TYPE (type));
> > > > -             tree false_val = build_zero_cst (TREE_TYPE (type));
> > > > +             tree true_val = build_all_ones_cst (scalar_type);
> > > > +             tree false_val = build_zero_cst (scalar_type);
> > > >
> > > >               if (CONSTANT_CLASS_P (val))
> > > >                 val = integer_zerop (val) ? false_val : true_val;
> > > >               else
> > > >                 {
> > > > -                 new_temp = make_ssa_name (TREE_TYPE (type));
> > > > +                 new_temp = make_ssa_name (scalar_type);
> > > >                   init_stmt = gimple_build_assign (new_temp, COND_EXPR,
> > > >                                                    val, true_val, false_val);
> > > >                   vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
> > > > @@ -1424,14 +1469,13 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > > >             }
> > > >           else
> > > >             {
> > > > -             gimple_seq stmts = NULL;
> > > >               if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
> > > >                 val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
> > > > -                                   TREE_TYPE (type), val);
> > > > +                                   scalar_type, val);
> > > >               else
> > > >                 /* ???  Condition vectorization expects us to do
> > > >                    promotion of invariant/external defs.  */
> > > > -               val = gimple_convert (&stmts, TREE_TYPE (type), val);
> > > > +               val = gimple_convert (&stmts, scalar_type, val);
> > > >               for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
> > > >                    !gsi_end_p (gsi2); )
> > > >                 {
> > > > @@ -1496,7 +1540,12 @@ vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> > > >                && VECTOR_BOOLEAN_TYPE_P (stmt_vectype))
> > > >         vector_type = truth_type_for (stmt_vectype);
> > > >        else
> > > > -       vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
> > > > +       {
> > > > +         tree scalar_type = TREE_TYPE (op);
> > > > +         if (STMT_VINFO_COMPLEX_P (stmt_vinfo))
> > > > +           scalar_type = TREE_TYPE (scalar_type);
> > > > +         vector_type = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
> > > > +       }
> > > >
> > > >        gcc_assert (vector_type);
> > > >        tree vop = vect_init_vector (vinfo, stmt_vinfo, op, vector_type, NULL);
> > > > @@ -7509,8 +7558,17 @@ vectorizable_store (vec_info *vinfo,
> > > >       same location twice.  */
> > > >    gcc_assert (slp == PURE_SLP_STMT (stmt_info));
> > > >
> > > > +  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > > +    return false;
> > > > +
> > > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info), rhs_vectype = NULL_TREE;
> > > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > +    {
> > > > +      if (!nunits.is_constant ())
> > > > +       return false;
> > > > +      nunits = exact_div (nunits, 2);
> > > > +    }
> > > >
> > > >    if (loop_vinfo)
> > > >      {
> > > > @@ -7526,7 +7584,8 @@ vectorizable_store (vec_info *vinfo,
> > > >    if (slp)
> > > >      ncopies = 1;
> > > >    else
> > > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > >
> > > >    gcc_assert (ncopies >= 1);
> > > >
> > > > @@ -7546,9 +7605,6 @@ vectorizable_store (vec_info *vinfo,
> > > >    elem_type = TREE_TYPE (vectype);
> > > >    vec_mode = TYPE_MODE (vectype);
> > > >
> > > > -  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > > -    return false;
> > > > -
> > > >    vect_memory_access_type memory_access_type;
> > > >    enum dr_alignment_support alignment_support_scheme;
> > > >    int misalignment;
> > > > @@ -8778,6 +8834,12 @@ vectorizable_load (vec_info *vinfo,
> > > >
> > > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> > > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > +    {
> > > > +      if (!nunits.is_constant ())
> > > > +       return false;
> > > > +      nunits = exact_div (nunits, 2);
> > > > +    }
> > > >
> > > >    if (loop_vinfo)
> > > >      {
> > > > @@ -8794,7 +8856,8 @@ vectorizable_load (vec_info *vinfo,
> > > >    if (slp)
> > > >      ncopies = 1;
> > > >    else
> > > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > >
> > > >    gcc_assert (ncopies >= 1);
> > > >
> > > > @@ -8870,8 +8933,11 @@ vectorizable_load (vec_info *vinfo,
> > > >                 if (k > maxk)
> > > >                   maxk = k;
> > > >               tree vectype = SLP_TREE_VECTYPE (slp_node);
> > > > +             /* For complex type, half the nunits.  */
> > > >               if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> > > > -                 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> > > > +                 || maxk >= (DR_GROUP_SIZE (group_info)
> > > > +                             & ~((STMT_VINFO_COMPLEX_P (group_info)
> > > > +                                  ? nunits >> 1 : nunits) - 1)))
> > > >                 {
> > > >                   if (dump_enabled_p ())
> > > >                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > @@ -12499,12 +12565,27 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > > >             dump_printf_loc (MSG_NOTE, vect_location,
> > > >                              "get vectype for scalar type: %T\n", scalar_type);
> > > >         }
> > > > +
> > > > +      tree orig_scalar_type = scalar_type;
> > > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > > +       {
> > > > +         /* Set complex_p for BB vectorizer.  */
> > > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > +         scalar_type = TREE_TYPE (scalar_type);
> > > > +         /* Double group_size for BB vectorizer to make
> > > > +            following 2 get_vectype_for_scalar_type return wanted vectype.
> > > > +            Real group size is not changed, just make the "faked" input
> > > > +            group_size.  */
> > > > +         group_size *= 2;
> > > > +       }
> > > >        vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
> > > > -      if (!vectype)
> > > > +      if (!vectype
> > > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > +             && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()))
> > > >         return opt_result::failure_at (stmt,
> > > >                                        "not vectorized:"
> > > >                                        " unsupported data-type %T\n",
> > > > -                                      scalar_type);
> > > > +                                      orig_scalar_type);
> > > >
> > > >        if (dump_enabled_p ())
> > > >         dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
> > > > @@ -12529,16 +12610,30 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > > >                                                    TREE_TYPE (vectype));
> > > >        if (scalar_type != TREE_TYPE (vectype))
> > > >         {
> > > > -         if (dump_enabled_p ())
> > > > +         tree orig_scalar_type = scalar_type;
> > > > +         if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > > +           {
> > > > +             /* Set complex_p for Loop vectorizer.  */
> > > > +             STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > +             scalar_type = TREE_TYPE (scalar_type);
> > > > +             if (dump_enabled_p ())
> > > > +               dump_printf_loc (MSG_NOTE, vect_location,
> > > > +                            "get complex for smallest scalar type: %T\n",
> > > > +                            scalar_type);
> > > > +
> > > > +           }
> > > > +         else if (dump_enabled_p ())
> > > >             dump_printf_loc (MSG_NOTE, vect_location,
> > > >                              "get vectype for smallest scalar type: %T\n",
> > > >                              scalar_type);
> > > >           nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
> > > >                                                         group_size);
> > > > -         if (!nunits_vectype)
> > > > +         if (!nunits_vectype
> > > > +             || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > +                 && !TYPE_VECTOR_SUBPARTS (nunits_vectype).is_constant ()))
> > > >             return opt_result::failure_at
> > > >               (stmt, "not vectorized: unsupported data-type %T\n",
> > > > -              scalar_type);
> > > > +              orig_scalar_type);
> > > >           if (dump_enabled_p ())
> > > >             dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
> > > >                              nunits_vectype);
> > > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > > index e5fdc9e0a14..4a809e492c4 100644
> > > > --- a/gcc/tree-vectorizer.h
> > > > +++ b/gcc/tree-vectorizer.h
> > > > @@ -1161,6 +1161,9 @@ public:
> > > >       vectorization.  */
> > > >    bool vectorizable;
> > > >
> > > > +  /* The scalar type of the LHS of this statement is complex type.  */
> > > > +  bool complex_p;
> > > > +
> > > >    /* The stmt to which this info struct refers to.  */
> > > >    gimple *stmt;
> > > >
> > > > @@ -1395,6 +1398,7 @@ struct gather_scatter_info {
> > > >  #define STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT(S) (S)->reduc_epilogue_adjustment
> > > >  #define STMT_VINFO_REDUC_IDX(S)                   (S)->reduc_idx
> > > >  #define STMT_VINFO_FORCE_SINGLE_CYCLE(S)   (S)->force_single_cycle
> > > > +#define STMT_VINFO_COMPLEX_P(S)            (S)->complex_p
> > > >
> > > >  #define STMT_VINFO_DR_WRT_VEC_LOOP(S)      (S)->dr_wrt_vec_loop
> > > >  #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_wrt_vec_loop.base_address
> > > > @@ -1970,6 +1974,15 @@ vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype)
> > > >    return vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo), vectype);
> > > >  }
> > > >
> > > > +static inline unsigned int
> > > > +vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype, bool complex_p)
> > > > +{
> > > > +  poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > > +  if (complex_p)
> > > > +    nunits *= 2;
> > > > +  return vect_get_num_vectors (nunits, vectype);
> > > > +}
> > > > +
> > > >  /* Update maximum unit count *MAX_NUNITS so that it accounts for
> > > >     NUNITS.  *MAX_NUNITS can be 1 if we haven't yet recorded anything.  */
> > > >
> > > > --
> > > > 2.18.1
> > > >
> >
> >
> >
> > --
> > BR,
> > Hongtao
Richard Biener July 13, 2022, 7:34 a.m. UTC | #6
On Wed, Jul 13, 2022 at 6:47 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Tue, Jul 12, 2022 at 10:12 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Tue, Jul 12, 2022 at 6:11 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Mon, Jul 11, 2022 at 7:47 PM Richard Biener via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > > >
> > > > On Mon, Jul 11, 2022 at 5:44 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > >
> > > > > > The patch only handles load/store (including ctor/permutation, except
> > > > > > gather/scatter) for complex type; other operations don't need to be
> > > > > > handled since they will be lowered by pass cplxlower. (MASK_LOAD is not
> > > > > > supported for complex type, so no need to handle it either.)
> > > >
> > > > (*)
> > > >
> > > > > Instead of support vector(2) _Complex double, this patch takes vector(4)
> > > > > double as vector type of _Complex double. Since vectorizer originally
> > > > > takes TYPE_VECTOR_SUBPARTS as nunits which is not true for complex
> > > > > type, the patch handles nunits/ncopies/vf specially for complex type.
> > > >
> > > > For the limited set above(*) can you explain what's "special" about
> > > > vector(2) _Complex
> > > > vs. vector(4) double, thus why we need to have STMT_VINFO_COMPLEX_P at all?
> > > > Supporting a vector(2) complex is a straightforward idea, just like
> > > > supporting other scalar types in the vectorizer, but it requires more
> > > > effort (in the backend and frontend). Considering that most
> > > > operations on complex types will be lowered into realpart and imagpart
> > > > operations, supporting a vector(2) complex does not look that
> > > > necessary. That led to the idea of supporting vector(4) double (with
> > > > adjustment of vf/ctor/permutation): the vectorizer only needs to
> > > > handle the vectorization of the move operation of the complex type (no
> > > > need to worry about wrongly mapping vector(4) double multiplication to
> > > > complex type multiplication, since it's already lowered before the
> > > > vectorizer).
> > > > stmt_info does not record the scalar type.  In order to avoid duplicated
> > > > operations, like getting the lhs type from the stmt to determine whether
> > > > it is a complex type, the STMT_VINFO_COMPLEX_P bit is added; this bit is
> > > > mainly initialized in vect_analyze_data_refs and
> > > > vect_get_vector_types_for_stmt.
> > > >
> > > > I wonder to what extent your handling can be extended to support re-vectorizing
> > > > (with a higher VF for example) already vectorized code?  The vectorizer giving
> > > > up on vector(2) double looks quite obviously similar to it giving up
> > > > on _Complex double ...
> > > > Yes, it can be extended to vector(2) double/float/int/... with a bit
> > > > of adjustment (extracting elements by using bit_field instead of
> > > > imagpart_expr/realpart_expr).
> > > > It would be a shame to not use the same underlying mechanism for dealing with
> > > > both, where for the vector case obviously vector(4) would be supported as well.
> > > >
> > > > In principle _Complex double operations should be two SLP lanes but it seems you
> > > > are handling them with classical interleaving as well?
> > > I'm only handling move operations, for other operations it will be
> > > lowered to realpart and imagpart and thus two SLP lanes.
> >
> > Yes, I understood that.
> >
> > Doing it more general (and IMHO better) would involve enhancing
> > how we represent dataref groups, maintaining the number of scalars
> > covered by each of the vinfos.  On the SLP representation side it
> > probably requires to rely on the representative for access and not
> > on the scalar stmts (since those do not map properly to the lanes).
> >
> > Ideally we'd be able to handle
> >
> > struct { _Complex double c; double a; double b; } a[], b[];
> >
> > void foo ()
> > {
> >    for (int i = 0; i < 100; ++i)
> >     {
> >       a[i].c = b[i].c;
> >       a[i].a = b[i].a;
> >       a[i].b = b[i].b;
> >     }
> > }
> >
> > which I guess your patch doesn't handle with plain AVX vector
> > copies but instead uses interleaving for the _Complex and non-_Complex
> > parts?
> Indeed, it produces wrong code.

For _Complex, in case we don't get to the "true and only" solution it
might be easier to split the loads and stores when it's just memory
copies and we have vectorization enabled and a supported vector
mode that would surely re-assemble them (store-merging doesn't seem
to do that).

Btw, we seem to produce

        movsd   b(%rip), %xmm0
        movsd   %xmm0, a(%rip)
        movsd   b+8(%rip), %xmm0
        movsd   %xmm0, a+8(%rip)

for a _Complex double memory copy on x86 which means we lack
true DCmode support (pseudos get decomposed).  Not sure if we
can somehow check whether a target has DCmode load/store
support and key decomposing on that (maybe check the SET optab).

It might be possible to check

_Complex double a, b;
void bar()
{
  a = b;
}

for all targets with a cc1 cross compiler to see whether they somehow
get loads/stores _not_ decomposed (also check _Complex float; I wouldn't
worry about _Complex int or _Complex long double).

Richard.

> > Let me spend some time fleshing out what is necessary to make
> > this work "properly".  We can consider your special-casing of _Complex
> > memory ops if I can't manage to assess the complexity of the task.
> >
> > Thanks,
> > Richard.
> >
> > > >
> > > > Thanks,
> > > > Richard.
> > > >
> > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > > Also test the patch for SPEC2017 and find there's complex type vectorization
> > > > > in 510/549(but no performance impact).
> > > > >
> > > > > Any comments?
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > >         PR tree-optimization/106010
> > > > >         * tree-vect-data-refs.cc (vect_get_data_access_cost):
> > > > >         Pass complex_p to vect_get_num_copies to avoid ICE.
> > > > >         (vect_analyze_data_refs): Support vectorization for Complex
> > > > >         type with vector scalar types.
> > > > >         * tree-vect-loop.cc (vect_determine_vf_for_stmt_1): VF should
> > > > >         be half of TYPE_VECTOR_SUBPARTS when complex_p.
> > > > >         * tree-vect-slp.cc (vect_record_max_nunits): nunits should be
> > > > >         half of TYPE_VECTOR_SUBPARTS when complex_p.
> > > > >         (vect_optimize_slp): Support permutation for complex type.
> > > > >         (vect_slp_analyze_node_operations_1): Double nunits in
> > > > >         vect_get_num_vectors to get right SLP_TREE_NUMBER_OF_VEC_STMTS
> > > > >         when complex_p.
> > > > >         (vect_slp_analyze_node_operations): Ditto.
> > > > >         (vect_create_constant_vectors): Support CTOR for complex type.
> > > > >         (vect_transform_slp_perm_load): Support permutation for
> > > > >         complex type.
> > > > >         * tree-vect-stmts.cc (vect_init_vector): Support complex type.
> > > > >         (vect_get_vec_defs_for_operand): Get vector type for
> > > > >         complex type.
> > > > >         (vectorizable_store): Get right ncopies/nunits for complex
> > > > >         type, also return false when complex_p and
> > > > >         !TYPE_VECTOR_SUBPARTS.is_constant ().
> > > > >         (vectorizable_load): Ditto.
> > > > >         (vect_get_vector_types_for_stmt): Get vector type for complex type.
> > > > >         * tree-vectorizer.h (STMT_VINFO_COMPLEX_P): New macro.
> > > > >         (vect_get_num_copies): New overload.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > >         * gcc.target/i386/pr106010-1a.c: New test.
> > > > >         * gcc.target/i386/pr106010-1b.c: New test.
> > > > >         * gcc.target/i386/pr106010-1c.c: New test.
> > > > >         * gcc.target/i386/pr106010-2a.c: New test.
> > > > >         * gcc.target/i386/pr106010-2b.c: New test.
> > > > >         * gcc.target/i386/pr106010-2c.c: New test.
> > > > >         * gcc.target/i386/pr106010-3a.c: New test.
> > > > >         * gcc.target/i386/pr106010-3b.c: New test.
> > > > >         * gcc.target/i386/pr106010-3c.c: New test.
> > > > >         * gcc.target/i386/pr106010-4a.c: New test.
> > > > >         * gcc.target/i386/pr106010-4b.c: New test.
> > > > >         * gcc.target/i386/pr106010-4c.c: New test.
> > > > >         * gcc.target/i386/pr106010-5a.c: New test.
> > > > >         * gcc.target/i386/pr106010-5b.c: New test.
> > > > >         * gcc.target/i386/pr106010-5c.c: New test.
> > > > >         * gcc.target/i386/pr106010-6a.c: New test.
> > > > >         * gcc.target/i386/pr106010-6b.c: New test.
> > > > >         * gcc.target/i386/pr106010-6c.c: New test.
> > > > >         * gcc.target/i386/pr106010-7a.c: New test.
> > > > >         * gcc.target/i386/pr106010-7b.c: New test.
> > > > >         * gcc.target/i386/pr106010-7c.c: New test.
> > > > >         * gcc.target/i386/pr106010-8a.c: New test.
> > > > >         * gcc.target/i386/pr106010-8b.c: New test.
> > > > >         * gcc.target/i386/pr106010-8c.c: New test.
> > > > > ---
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 +++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 +++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 +++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 +++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 ++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 ++++++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 ++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 ++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 +++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 +++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 +++++++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 +++++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 +++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 +++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 +++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 ++++++
> > > > >  gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
> > > > >  gcc/tree-vect-data-refs.cc                  |  26 ++-
> > > > >  gcc/tree-vect-loop.cc                       |   7 +-
> > > > >  gcc/tree-vect-slp.cc                        | 174 +++++++++++++++-----
> > > > >  gcc/tree-vect-stmts.cc                      | 135 ++++++++++++---
> > > > >  gcc/tree-vectorizer.h                       |  13 ++
> > > > >  29 files changed, 2064 insertions(+), 63 deletions(-)
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > >
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > new file mode 100644
> > > > > index 00000000000..b608f484934
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > @@ -0,0 +1,58 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> > > > > +
> > > > > +#define N 10000
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_pd (_Complex double* a, _Complex double* b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b[i];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ps (_Complex float* a, _Complex float* b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b[i];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi64 (_Complex long long* a, _Complex long long* b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b[i];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi32 (_Complex int* a, _Complex int* b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b[i];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi16 (_Complex short* a, _Complex short* b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b[i];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi8 (_Complex char* a, _Complex char* b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b[i];
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > new file mode 100644
> > > > > index 00000000000..0f377c3a548
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > @@ -0,0 +1,63 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > +/* { dg-require-effective-target avx } */
> > > > > +
> > > > > +#include "avx-check.h"
> > > > > +#include <string.h>
> > > > > +#include "pr106010-1a.c"
> > > > > +
> > > > > +void
> > > > > +avx_test (void)
> > > > > +{
> > > > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > > > +
> > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > +
> > > > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > > > +    p_init[i] = i;
> > > > > +
> > > > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > > > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > > > > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > > > > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > > > > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > > > > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > > > > +
> > > > > +  foo_pd (pd_dst, pd_src);
> > > > > +  foo_ps (ps_dst, ps_src);
> > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +
> > > > > +  return;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > new file mode 100644
> > > > > index 00000000000..f07e9fb2d3d
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > @@ -0,0 +1,41 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > +
> > > > > +#include <string.h>
> > > > > +
> > > > > +static void do_test (void);
> > > > > +
> > > > > +#define DO_TEST do_test
> > > > > +#define AVX512FP16
> > > > > +#include "avx512-check.h"
> > > > > +
> > > > > +#define N 10000
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b[i];
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +do_test (void)
> > > > > +{
> > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > > > +
> > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > +
> > > > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > > > +    p_init[i] = i;
> > > > > +
> > > > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > > > +
> > > > > +  foo_ph (ph_dst, ph_src);
> > > > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > new file mode 100644
> > > > > index 00000000000..d2e2f8d4f43
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > @@ -0,0 +1,82 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[0];
> > > > > +  a[1] = b[1];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[0];
> > > > > +  a[1] = b[1];
> > > > > +  a[2] = b[2];
> > > > > +  a[3] = b[3];
> > > > > +
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[0];
> > > > > +  a[1] = b[1];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[0];
> > > > > +  a[1] = b[1];
> > > > > +  a[2] = b[2];
> > > > > +  a[3] = b[3];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[0];
> > > > > +  a[1] = b[1];
> > > > > +  a[2] = b[2];
> > > > > +  a[3] = b[3];
> > > > > +  a[4] = b[4];
> > > > > +  a[5] = b[5];
> > > > > +  a[6] = b[6];
> > > > > +  a[7] = b[7];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[0];
> > > > > +  a[1] = b[1];
> > > > > +  a[2] = b[2];
> > > > > +  a[3] = b[3];
> > > > > +  a[4] = b[4];
> > > > > +  a[5] = b[5];
> > > > > +  a[6] = b[6];
> > > > > +  a[7] = b[7];
> > > > > +  a[8] = b[8];
> > > > > +  a[9] = b[9];
> > > > > +  a[10] = b[10];
> > > > > +  a[11] = b[11];
> > > > > +  a[12] = b[12];
> > > > > +  a[13] = b[13];
> > > > > +  a[14] = b[14];
> > > > > +  a[15] = b[15];
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > new file mode 100644
> > > > > index 00000000000..ac360752693
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > @@ -0,0 +1,62 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > +/* { dg-require-effective-target avx } */
> > > > > +
> > > > > +#include "avx-check.h"
> > > > > +#include <string.h>
> > > > > +#include "pr106010-2a.c"
> > > > > +
> > > > > +void
> > > > > +avx_test (void)
> > > > > +{
> > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > +  char* p = (char* ) malloc (32);
> > > > > +
> > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > +
> > > > > +  for (int i = 0; i != 32; i++)
> > > > > +    p[i] = i;
> > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > +
> > > > > +  foo_pd (pd_dst, pd_src);
> > > > > +  foo_ps (ps_dst, ps_src);
> > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +
> > > > > +  return;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > new file mode 100644
> > > > > index 00000000000..a002f209ec9
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > @@ -0,0 +1,47 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > +
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > +
> > > > > +#include <string.h>
> > > > > +
> > > > > +static void do_test (void);
> > > > > +#define DO_TEST do_test
> > > > > +#define AVX512FP16
> > > > > +#include "avx512-check.h"
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[0];
> > > > > +  a[1] = b[1];
> > > > > +  a[2] = b[2];
> > > > > +  a[3] = b[3];
> > > > > +  a[4] = b[4];
> > > > > +  a[5] = b[5];
> > > > > +  a[6] = b[6];
> > > > > +  a[7] = b[7];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +do_test (void)
> > > > > +{
> > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > +  char* p = (char* ) malloc (32);
> > > > > +
> > > > > +   __builtin_memset (ph_dst, 0, 32);
> > > > > +
> > > > > +  for (int i = 0; i != 32; i++)
> > > > > +    p[i] = i;
> > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > +
> > > > > +  foo_ph (ph_dst, ph_src);
> > > > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +
> > > > > +  return;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > new file mode 100644
> > > > > index 00000000000..c1b64b56b1c
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > @@ -0,0 +1,80 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[1];
> > > > > +  a[1] = b[0];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[1];
> > > > > +  a[1] = b[0];
> > > > > +  a[2] = b[3];
> > > > > +  a[3] = b[2];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[1];
> > > > > +  a[1] = b[0];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[3];
> > > > > +  a[1] = b[2];
> > > > > +  a[2] = b[1];
> > > > > +  a[3] = b[0];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[7];
> > > > > +  a[1] = b[6];
> > > > > +  a[2] = b[5];
> > > > > +  a[3] = b[4];
> > > > > +  a[4] = b[3];
> > > > > +  a[5] = b[2];
> > > > > +  a[6] = b[1];
> > > > > +  a[7] = b[0];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[7];
> > > > > +  a[1] = b[6];
> > > > > +  a[2] = b[5];
> > > > > +  a[3] = b[4];
> > > > > +  a[4] = b[3];
> > > > > +  a[5] = b[2];
> > > > > +  a[6] = b[1];
> > > > > +  a[7] = b[0];
> > > > > +  a[8] = b[15];
> > > > > +  a[9] = b[14];
> > > > > +  a[10] = b[13];
> > > > > +  a[11] = b[12];
> > > > > +  a[12] = b[11];
> > > > > +  a[13] = b[10];
> > > > > +  a[14] = b[9];
> > > > > +  a[15] = b[8];
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > new file mode 100644
> > > > > index 00000000000..e4fa3f3a541
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > @@ -0,0 +1,126 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > +/* { dg-require-effective-target avx2 } */
> > > > > +
> > > > > +#include "avx2-check.h"
> > > > > +#include <string.h>
> > > > > +#include "pr106010-3a.c"
> > > > > +
> > > > > +void
> > > > > +avx2_test (void)
> > > > > +{
> > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (32);
> > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (32);
> > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (32);
> > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (32);
> > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (32);
> > > > > +  char* p = (char* ) malloc (32);
> > > > > +  char* q = (char* ) malloc (32);
> > > > > +
> > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > +
> > > > > +  for (int i = 0; i != 32; i++)
> > > > > +    p[i] = i;
> > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > +
> > > > > +  for (int i = 0; i != 16; i++)
> > > > > +    {
> > > > > +      p[i] = i + 16;
> > > > > +      p[i + 16] = i;
> > > > > +    }
> > > > > +  __builtin_memcpy (pd_exp, p, 32);
> > > > > +  __builtin_memcpy (epi64_exp, p, 32);
> > > > > +
> > > > > +  for (int i = 0; i != 8; i++)
> > > > > +    {
> > > > > +      p[i] = i + 8;
> > > > > +      p[i + 8] = i;
> > > > > +      p[i + 16] = i + 24;
> > > > > +      p[i + 24] = i + 16;
> > > > > +      q[i] = i + 24;
> > > > > +      q[i + 8] = i + 16;
> > > > > +      q[i + 16] = i + 8;
> > > > > +      q[i + 24] = i;
> > > > > +    }
> > > > > +  __builtin_memcpy (ps_exp, p, 32);
> > > > > +  __builtin_memcpy (epi32_exp, q, 32);
> > > > > +
> > > > > +
> > > > > +  for (int i = 0; i != 4; i++)
> > > > > +    {
> > > > > +      q[i] = i + 28;
> > > > > +      q[i + 4] = i + 24;
> > > > > +      q[i + 8] = i + 20;
> > > > > +      q[i + 12] = i + 16;
> > > > > +      q[i + 16] = i + 12;
> > > > > +      q[i + 20] = i + 8;
> > > > > +      q[i + 24] = i + 4;
> > > > > +      q[i + 28] = i;
> > > > > +    }
> > > > > +  __builtin_memcpy (epi16_exp, q, 32);
> > > > > +
> > > > > +  for (int i = 0; i != 2; i++)
> > > > > +    {
> > > > > +      q[i] = i + 14;
> > > > > +      q[i + 2] = i + 12;
> > > > > +      q[i + 4] = i + 10;
> > > > > +      q[i + 6] = i + 8;
> > > > > +      q[i + 8] = i + 6;
> > > > > +      q[i + 10] = i + 4;
> > > > > +      q[i + 12] = i + 2;
> > > > > +      q[i + 14] = i;
> > > > > +      q[i + 16] = i + 30;
> > > > > +      q[i + 18] = i + 28;
> > > > > +      q[i + 20] = i + 26;
> > > > > +      q[i + 22] = i + 24;
> > > > > +      q[i + 24] = i + 22;
> > > > > +      q[i + 26] = i + 20;
> > > > > +      q[i + 28] = i + 18;
> > > > > +      q[i + 30] = i + 16;
> > > > > +    }
> > > > > +  __builtin_memcpy (epi8_exp, q, 32);
> > > > > +
> > > > > +  foo_pd (pd_dst, pd_src);
> > > > > +  foo_ps (ps_dst, ps_src);
> > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +
> > > > > +  return;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > new file mode 100644
> > > > > index 00000000000..5a5a3d4b992
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > @@ -0,0 +1,69 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
> > > > > +
> > > > > +#include <string.h>
> > > > > +
> > > > > +static void do_test (void);
> > > > > +#define DO_TEST do_test
> > > > > +#define AVX512FP16
> > > > > +#include "avx512-check.h"
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[1];
> > > > > +  a[1] = b[0];
> > > > > +  a[2] = b[4];
> > > > > +  a[3] = b[3];
> > > > > +  a[4] = b[7];
> > > > > +  a[5] = b[6];
> > > > > +  a[6] = b[2];
> > > > > +  a[7] = b[5];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +do_test (void)
> > > > > +{
> > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> > > > > +  char* p = (char* ) malloc (32);
> > > > > +  char* q = (char* ) malloc (32);
> > > > > +
> > > > > +  __builtin_memset (ph_dst, 0, 32);
> > > > > +
> > > > > +  for (int i = 0; i != 32; i++)
> > > > > +    p[i] = i;
> > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > +
> > > > > +  for (int i = 0; i != 4; i++)
> > > > > +    {
> > > > > +      p[i] = i + 4;
> > > > > +      p[i + 4] = i;
> > > > > +      p[i + 8] = i + 16;
> > > > > +      p[i + 12] = i + 12;
> > > > > +      p[i + 16] = i + 28;
> > > > > +      p[i + 20] = i + 24;
> > > > > +      p[i + 24] = i + 8;
> > > > > +      p[i + 28] = i + 20;
> > > > > +      q[i] = i + 28;
> > > > > +      q[i + 4] = i + 24;
> > > > > +      q[i + 8] = i + 20;
> > > > > +      q[i + 12] = i + 16;
> > > > > +      q[i + 16] = i + 12;
> > > > > +      q[i + 20] = i + 8;
> > > > > +      q[i + 24] = i + 4;
> > > > > +      q[i + 28] = i;
> > > > > +    }
> > > > > +  __builtin_memcpy (ph_exp, p, 32);
> > > > > +
> > > > > +  foo_ph (ph_dst, ph_src);
> > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +
> > > > > +  return;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > new file mode 100644
> > > > > index 00000000000..b7b0b532bb1
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > @@ -0,0 +1,101 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_pd (_Complex double* a,
> > > > > +       _Complex double b1,
> > > > > +       _Complex double b2)
> > > > > +{
> > > > > +  a[0] = b1;
> > > > > +  a[1] = b2;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ps (_Complex float* a,
> > > > > +       _Complex float b1, _Complex float b2,
> > > > > +       _Complex float b3, _Complex float b4)
> > > > > +{
> > > > > +  a[0] = b1;
> > > > > +  a[1] = b2;
> > > > > +  a[2] = b3;
> > > > > +  a[3] = b4;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi64 (_Complex long long* a,
> > > > > +          _Complex long long b1,
> > > > > +          _Complex long long b2)
> > > > > +{
> > > > > +  a[0] = b1;
> > > > > +  a[1] = b2;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi32 (_Complex int* a,
> > > > > +          _Complex int b1, _Complex int b2,
> > > > > +          _Complex int b3, _Complex int b4)
> > > > > +{
> > > > > +  a[0] = b1;
> > > > > +  a[1] = b2;
> > > > > +  a[2] = b3;
> > > > > +  a[3] = b4;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi16 (_Complex short* a,
> > > > > +          _Complex short b1, _Complex short b2,
> > > > > +          _Complex short b3, _Complex short b4,
> > > > > +          _Complex short b5, _Complex short b6,
> > > > > +          _Complex short b7,_Complex short b8)
> > > > > +{
> > > > > +  a[0] = b1;
> > > > > +  a[1] = b2;
> > > > > +  a[2] = b3;
> > > > > +  a[3] = b4;
> > > > > +  a[4] = b5;
> > > > > +  a[5] = b6;
> > > > > +  a[6] = b7;
> > > > > +  a[7] = b8;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi8 (_Complex char* a,
> > > > > +         _Complex char b1, _Complex char b2,
> > > > > +         _Complex char b3, _Complex char b4,
> > > > > +         _Complex char b5, _Complex char b6,
> > > > > +         _Complex char b7,_Complex char b8,
> > > > > +         _Complex char b9, _Complex char b10,
> > > > > +         _Complex char b11, _Complex char b12,
> > > > > +         _Complex char b13, _Complex char b14,
> > > > > +         _Complex char b15,_Complex char b16)
> > > > > +{
> > > > > +  a[0] = b1;
> > > > > +  a[1] = b2;
> > > > > +  a[2] = b3;
> > > > > +  a[3] = b4;
> > > > > +  a[4] = b5;
> > > > > +  a[5] = b6;
> > > > > +  a[6] = b7;
> > > > > +  a[7] = b8;
> > > > > +  a[8] = b9;
> > > > > +  a[9] = b10;
> > > > > +  a[10] = b11;
> > > > > +  a[11] = b12;
> > > > > +  a[12] = b13;
> > > > > +  a[13] = b14;
> > > > > +  a[14] = b15;
> > > > > +  a[15] = b16;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > new file mode 100644
> > > > > index 00000000000..e2e79508c4b
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > @@ -0,0 +1,67 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > +/* { dg-require-effective-target avx } */
> > > > > +
> > > > > +#include "avx-check.h"
> > > > > +#include <string.h>
> > > > > +#include "pr106010-4a.c"
> > > > > +
> > > > > +void
> > > > > +avx_test (void)
> > > > > +{
> > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > +  char* p = (char* ) malloc (32);
> > > > > +
> > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > +
> > > > > +  for (int i = 0; i != 32; i++)
> > > > > +    p[i] = i;
> > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > +
> > > > > +  foo_pd (pd_dst, pd_src[0], pd_src[1]);
> > > > > +  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> > > > > +  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> > > > > +  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> > > > > +  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> > > > > +            epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> > > > > +  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> > > > > +           epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> > > > > +           epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> > > > > +           epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> > > > > +
> > > > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +
> > > > > +  return;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > new file mode 100644
> > > > > index 00000000000..8e02aefe3b5
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > @@ -0,0 +1,54 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> > > > > +
> > > > > +#include <string.h>
> > > > > +
> > > > > +static void do_test (void);
> > > > > +#define DO_TEST do_test
> > > > > +#define AVX512FP16
> > > > > +#include "avx512-check.h"
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ph (_Complex _Float16* a,
> > > > > +       _Complex _Float16 b1, _Complex _Float16 b2,
> > > > > +       _Complex _Float16 b3, _Complex _Float16 b4,
> > > > > +       _Complex _Float16 b5, _Complex _Float16 b6,
> > > > > +       _Complex _Float16 b7,_Complex _Float16 b8)
> > > > > +{
> > > > > +  a[0] = b1;
> > > > > +  a[1] = b2;
> > > > > +  a[2] = b3;
> > > > > +  a[3] = b4;
> > > > > +  a[4] = b5;
> > > > > +  a[5] = b6;
> > > > > +  a[6] = b7;
> > > > > +  a[7] = b8;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +do_test (void)
> > > > > +{
> > > > > +
> > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > +
> > > > > +  char* p = (char* ) malloc (32);
> > > > > +
> > > > > +  __builtin_memset (ph_dst, 0, 32);
> > > > > +
> > > > > +  for (int i = 0; i != 32; i++)
> > > > > +    p[i] = i;
> > > > > +
> > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > +
> > > > > +  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> > > > > +         ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> > > > > +
> > > > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  return;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > new file mode 100644
> > > > > index 00000000000..9d4a6f9846b
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > @@ -0,0 +1,117 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[2];
> > > > > +  a[1] = b[3];
> > > > > +  a[2] = b[0];
> > > > > +  a[3] = b[1];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[4];
> > > > > +  a[1] = b[5];
> > > > > +  a[2] = b[6];
> > > > > +  a[3] = b[7];
> > > > > +  a[4] = b[0];
> > > > > +  a[5] = b[1];
> > > > > +  a[6] = b[2];
> > > > > +  a[7] = b[3];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[2];
> > > > > +  a[1] = b[3];
> > > > > +  a[2] = b[0];
> > > > > +  a[3] = b[1];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[4];
> > > > > +  a[1] = b[5];
> > > > > +  a[2] = b[6];
> > > > > +  a[3] = b[7];
> > > > > +  a[4] = b[0];
> > > > > +  a[5] = b[1];
> > > > > +  a[6] = b[2];
> > > > > +  a[7] = b[3];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[8];
> > > > > +  a[1] = b[9];
> > > > > +  a[2] = b[10];
> > > > > +  a[3] = b[11];
> > > > > +  a[4] = b[12];
> > > > > +  a[5] = b[13];
> > > > > +  a[6] = b[14];
> > > > > +  a[7] = b[15];
> > > > > +  a[8] = b[0];
> > > > > +  a[9] = b[1];
> > > > > +  a[10] = b[2];
> > > > > +  a[11] = b[3];
> > > > > +  a[12] = b[4];
> > > > > +  a[13] = b[5];
> > > > > +  a[14] = b[6];
> > > > > +  a[15] = b[7];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[16];
> > > > > +  a[1] = b[17];
> > > > > +  a[2] = b[18];
> > > > > +  a[3] = b[19];
> > > > > +  a[4] = b[20];
> > > > > +  a[5] = b[21];
> > > > > +  a[6] = b[22];
> > > > > +  a[7] = b[23];
> > > > > +  a[8] = b[24];
> > > > > +  a[9] = b[25];
> > > > > +  a[10] = b[26];
> > > > > +  a[11] = b[27];
> > > > > +  a[12] = b[28];
> > > > > +  a[13] = b[29];
> > > > > +  a[14] = b[30];
> > > > > +  a[15] = b[31];
> > > > > +  a[16] = b[0];
> > > > > +  a[17] = b[1];
> > > > > +  a[18] = b[2];
> > > > > +  a[19] = b[3];
> > > > > +  a[20] = b[4];
> > > > > +  a[21] = b[5];
> > > > > +  a[22] = b[6];
> > > > > +  a[23] = b[7];
> > > > > +  a[24] = b[8];
> > > > > +  a[25] = b[9];
> > > > > +  a[26] = b[10];
> > > > > +  a[27] = b[11];
> > > > > +  a[28] = b[12];
> > > > > +  a[29] = b[13];
> > > > > +  a[30] = b[14];
> > > > > +  a[31] = b[15];
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > new file mode 100644
> > > > > index 00000000000..d5c6ebeb5cf
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > @@ -0,0 +1,80 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > +/* { dg-require-effective-target avx } */
> > > > > +
> > > > > +#include "avx-check.h"
> > > > > +#include <string.h>
> > > > > +#include "pr106010-5a.c"
> > > > > +
> > > > > +void
> > > > > +avx_test (void)
> > > > > +{
> > > > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > > > +  char* p = (char* ) malloc (64);
> > > > > +  char* q = (char* ) malloc (64);
> > > > > +
> > > > > +  __builtin_memset (pd_dst, 0, 64);
> > > > > +  __builtin_memset (ps_dst, 0, 64);
> > > > > +  __builtin_memset (epi64_dst, 0, 64);
> > > > > +  __builtin_memset (epi32_dst, 0, 64);
> > > > > +  __builtin_memset (epi16_dst, 0, 64);
> > > > > +  __builtin_memset (epi8_dst, 0, 64);
> > > > > +
> > > > > +  for (int i = 0; i != 64; i++)
> > > > > +    {
> > > > > +      p[i] = i;
> > > > > +      q[i] = (i + 32) % 64;
> > > > > +    }
> > > > > +  __builtin_memcpy (pd_src, p, 64);
> > > > > +  __builtin_memcpy (ps_src, p, 64);
> > > > > +  __builtin_memcpy (epi64_src, p, 64);
> > > > > +  __builtin_memcpy (epi32_src, p, 64);
> > > > > +  __builtin_memcpy (epi16_src, p, 64);
> > > > > +  __builtin_memcpy (epi8_src, p, 64);
> > > > > +
> > > > > +  __builtin_memcpy (pd_exp, q, 64);
> > > > > +  __builtin_memcpy (ps_exp, q, 64);
> > > > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > > > +
> > > > > +  foo_pd (pd_dst, pd_src);
> > > > > +  foo_ps (ps_dst, ps_src);
> > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > +
> > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +
> > > > > +  return;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > new file mode 100644
> > > > > index 00000000000..9ce4e6dd5c0
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > @@ -0,0 +1,62 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> > > > > +
> > > > > +#include <string.h>
> > > > > +
> > > > > +static void do_test (void);
> > > > > +#define DO_TEST do_test
> > > > > +#define AVX512FP16
> > > > > +#include "avx512-check.h"
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[8];
> > > > > +  a[1] = b[9];
> > > > > +  a[2] = b[10];
> > > > > +  a[3] = b[11];
> > > > > +  a[4] = b[12];
> > > > > +  a[5] = b[13];
> > > > > +  a[6] = b[14];
> > > > > +  a[7] = b[15];
> > > > > +  a[8] = b[0];
> > > > > +  a[9] = b[1];
> > > > > +  a[10] = b[2];
> > > > > +  a[11] = b[3];
> > > > > +  a[12] = b[4];
> > > > > +  a[13] = b[5];
> > > > > +  a[14] = b[6];
> > > > > +  a[15] = b[7];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +do_test (void)
> > > > > +{
> > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > > > +  char* p = (char* ) malloc (64);
> > > > > +  char* q = (char* ) malloc (64);
> > > > > +
> > > > > +  __builtin_memset (ph_dst, 0, 64);
> > > > > +
> > > > > +  for (int i = 0; i != 64; i++)
> > > > > +    {
> > > > > +      p[i] = i;
> > > > > +      q[i] = (i + 32) % 64;
> > > > > +    }
> > > > > +  __builtin_memcpy (ph_src, p, 64);
> > > > > +
> > > > > +  __builtin_memcpy (ph_exp, q, 64);
> > > > > +
> > > > > +  foo_ph (ph_dst, ph_src);
> > > > > +
> > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +
> > > > > +  return;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > new file mode 100644
> > > > > index 00000000000..65a90d03684
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > @@ -0,0 +1,115 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[3];
> > > > > +  a[1] = b[2];
> > > > > +  a[2] = b[1];
> > > > > +  a[3] = b[0];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[7];
> > > > > +  a[1] = b[6];
> > > > > +  a[2] = b[5];
> > > > > +  a[3] = b[4];
> > > > > +  a[4] = b[3];
> > > > > +  a[5] = b[2];
> > > > > +  a[6] = b[1];
> > > > > +  a[7] = b[0];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[3];
> > > > > +  a[1] = b[2];
> > > > > +  a[2] = b[1];
> > > > > +  a[3] = b[0];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[7];
> > > > > +  a[1] = b[6];
> > > > > +  a[2] = b[5];
> > > > > +  a[3] = b[4];
> > > > > +  a[4] = b[3];
> > > > > +  a[5] = b[2];
> > > > > +  a[6] = b[1];
> > > > > +  a[7] = b[0];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[15];
> > > > > +  a[1] = b[14];
> > > > > +  a[2] = b[13];
> > > > > +  a[3] = b[12];
> > > > > +  a[4] = b[11];
> > > > > +  a[5] = b[10];
> > > > > +  a[6] = b[9];
> > > > > +  a[7] = b[8];
> > > > > +  a[8] = b[7];
> > > > > +  a[9] = b[6];
> > > > > +  a[10] = b[5];
> > > > > +  a[11] = b[4];
> > > > > +  a[12] = b[3];
> > > > > +  a[13] = b[2];
> > > > > +  a[14] = b[1];
> > > > > +  a[15] = b[0];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[31];
> > > > > +  a[1] = b[30];
> > > > > +  a[2] = b[29];
> > > > > +  a[3] = b[28];
> > > > > +  a[4] = b[27];
> > > > > +  a[5] = b[26];
> > > > > +  a[6] = b[25];
> > > > > +  a[7] = b[24];
> > > > > +  a[8] = b[23];
> > > > > +  a[9] = b[22];
> > > > > +  a[10] = b[21];
> > > > > +  a[11] = b[20];
> > > > > +  a[12] = b[19];
> > > > > +  a[13] = b[18];
> > > > > +  a[14] = b[17];
> > > > > +  a[15] = b[16];
> > > > > +  a[16] = b[15];
> > > > > +  a[17] = b[14];
> > > > > +  a[18] = b[13];
> > > > > +  a[19] = b[12];
> > > > > +  a[20] = b[11];
> > > > > +  a[21] = b[10];
> > > > > +  a[22] = b[9];
> > > > > +  a[23] = b[8];
> > > > > +  a[24] = b[7];
> > > > > +  a[25] = b[6];
> > > > > +  a[26] = b[5];
> > > > > +  a[27] = b[4];
> > > > > +  a[28] = b[3];
> > > > > +  a[29] = b[2];
> > > > > +  a[30] = b[1];
> > > > > +  a[31] = b[0];
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > new file mode 100644
> > > > > index 00000000000..1c5bb020939
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > @@ -0,0 +1,157 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > +/* { dg-require-effective-target avx2 } */
> > > > > +
> > > > > +#include "avx2-check.h"
> > > > > +#include <string.h>
> > > > > +#include "pr106010-6a.c"
> > > > > +
> > > > > +void
> > > > > +avx2_test (void)
> > > > > +{
> > > > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > > > +  char* p = (char* ) malloc (64);
> > > > > +  char* q = (char* ) malloc (64);
> > > > > +
> > > > > +  __builtin_memset (pd_dst, 0, 64);
> > > > > +  __builtin_memset (ps_dst, 0, 64);
> > > > > +  __builtin_memset (epi64_dst, 0, 64);
> > > > > +  __builtin_memset (epi32_dst, 0, 64);
> > > > > +  __builtin_memset (epi16_dst, 0, 64);
> > > > > +  __builtin_memset (epi8_dst, 0, 64);
> > > > > +
> > > > > +  for (int i = 0; i != 64; i++)
> > > > > +    p[i] = i;
> > > > > +
> > > > > +  __builtin_memcpy (pd_src, p, 64);
> > > > > +  __builtin_memcpy (ps_src, p, 64);
> > > > > +  __builtin_memcpy (epi64_src, p, 64);
> > > > > +  __builtin_memcpy (epi32_src, p, 64);
> > > > > +  __builtin_memcpy (epi16_src, p, 64);
> > > > > +  __builtin_memcpy (epi8_src, p, 64);
> > > > > +
> > > > > +
> > > > > +  for (int i = 0; i != 16; i++)
> > > > > +    {
> > > > > +      q[i] = i + 48;
> > > > > +      q[i + 16] = i + 32;
> > > > > +      q[i + 32] = i + 16;
> > > > > +      q[i + 48] = i;
> > > > > +    }
> > > > > +
> > > > > +  __builtin_memcpy (pd_exp, q, 64);
> > > > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > > > +
> > > > > +   for (int i = 0; i != 8; i++)
> > > > > +    {
> > > > > +      q[i] = i + 56;
> > > > > +      q[i + 8] = i + 48;
> > > > > +      q[i + 16] = i + 40;
> > > > > +      q[i + 24] = i + 32;
> > > > > +      q[i + 32] = i + 24;
> > > > > +      q[i + 40] = i + 16;
> > > > > +      q[i + 48] = i + 8;
> > > > > +      q[i + 56] = i;
> > > > > +    }
> > > > > +
> > > > > +  __builtin_memcpy (ps_exp, q, 64);
> > > > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > > > +
> > > > > +  for (int i = 0; i != 4; i++)
> > > > > +    {
> > > > > +      q[i] = i + 60;
> > > > > +      q[i + 4] = i + 56;
> > > > > +      q[i + 8] = i + 52;
> > > > > +      q[i + 12] = i + 48;
> > > > > +      q[i + 16] = i + 44;
> > > > > +      q[i + 20] = i + 40;
> > > > > +      q[i + 24] = i + 36;
> > > > > +      q[i + 28] = i + 32;
> > > > > +      q[i + 32] = i + 28;
> > > > > +      q[i + 36] = i + 24;
> > > > > +      q[i + 40] = i + 20;
> > > > > +      q[i + 44] = i + 16;
> > > > > +      q[i + 48] = i + 12;
> > > > > +      q[i + 52] = i + 8;
> > > > > +      q[i + 56] = i + 4;
> > > > > +      q[i + 60] = i;
> > > > > +    }
> > > > > +
> > > > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > > > +
> > > > > +  for (int i = 0; i != 2; i++)
> > > > > +    {
> > > > > +      q[i] = i + 62;
> > > > > +      q[i + 2] = i + 60;
> > > > > +      q[i + 4] = i + 58;
> > > > > +      q[i + 6] = i + 56;
> > > > > +      q[i + 8] = i + 54;
> > > > > +      q[i + 10] = i + 52;
> > > > > +      q[i + 12] = i + 50;
> > > > > +      q[i + 14] = i + 48;
> > > > > +      q[i + 16] = i + 46;
> > > > > +      q[i + 18] = i + 44;
> > > > > +      q[i + 20] = i + 42;
> > > > > +      q[i + 22] = i + 40;
> > > > > +      q[i + 24] = i + 38;
> > > > > +      q[i + 26] = i + 36;
> > > > > +      q[i + 28] = i + 34;
> > > > > +      q[i + 30] = i + 32;
> > > > > +      q[i + 32] = i + 30;
> > > > > +      q[i + 34] = i + 28;
> > > > > +      q[i + 36] = i + 26;
> > > > > +      q[i + 38] = i + 24;
> > > > > +      q[i + 40] = i + 22;
> > > > > +      q[i + 42] = i + 20;
> > > > > +      q[i + 44] = i + 18;
> > > > > +      q[i + 46] = i + 16;
> > > > > +      q[i + 48] = i + 14;
> > > > > +      q[i + 50] = i + 12;
> > > > > +      q[i + 52] = i + 10;
> > > > > +      q[i + 54] = i + 8;
> > > > > +      q[i + 56] = i + 6;
> > > > > +      q[i + 58] = i + 4;
> > > > > +      q[i + 60] = i + 2;
> > > > > +      q[i + 62] = i;
> > > > > +    }
> > > > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > > > +
> > > > > +  foo_pd (pd_dst, pd_src);
> > > > > +  foo_ps (ps_dst, ps_src);
> > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > +
> > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +
> > > > > +  return;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > new file mode 100644
> > > > > index 00000000000..b859d884a7f
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > @@ -0,0 +1,80 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> > > > > +
> > > > > +#include <string.h>
> > > > > +
> > > > > +static void do_test (void);
> > > > > +#define DO_TEST do_test
> > > > > +#define AVX512FP16
> > > > > +#include "avx512-check.h"
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > +{
> > > > > +  a[0] = b[15];
> > > > > +  a[1] = b[14];
> > > > > +  a[2] = b[13];
> > > > > +  a[3] = b[12];
> > > > > +  a[4] = b[11];
> > > > > +  a[5] = b[10];
> > > > > +  a[6] = b[9];
> > > > > +  a[7] = b[8];
> > > > > +  a[8] = b[7];
> > > > > +  a[9] = b[6];
> > > > > +  a[10] = b[5];
> > > > > +  a[11] = b[4];
> > > > > +  a[12] = b[3];
> > > > > +  a[13] = b[2];
> > > > > +  a[14] = b[1];
> > > > > +  a[15] = b[0];
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +do_test (void)
> > > > > +{
> > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > > > +  char* p = (char* ) malloc (64);
> > > > > +  char* q = (char* ) malloc (64);
> > > > > +
> > > > > +  __builtin_memset (ph_dst, 0, 64);
> > > > > +
> > > > > +  for (int i = 0; i != 64; i++)
> > > > > +    p[i] = i;
> > > > > +
> > > > > +  __builtin_memcpy (ph_src, p, 64);
> > > > > +
> > > > > +  for (int i = 0; i != 4; i++)
> > > > > +    {
> > > > > +      q[i] = i + 60;
> > > > > +      q[i + 4] = i + 56;
> > > > > +      q[i + 8] = i + 52;
> > > > > +      q[i + 12] = i + 48;
> > > > > +      q[i + 16] = i + 44;
> > > > > +      q[i + 20] = i + 40;
> > > > > +      q[i + 24] = i + 36;
> > > > > +      q[i + 28] = i + 32;
> > > > > +      q[i + 32] = i + 28;
> > > > > +      q[i + 36] = i + 24;
> > > > > +      q[i + 40] = i + 20;
> > > > > +      q[i + 44] = i + 16;
> > > > > +      q[i + 48] = i + 12;
> > > > > +      q[i + 52] = i + 8;
> > > > > +      q[i + 56] = i + 4;
> > > > > +      q[i + 60] = i;
> > > > > +    }
> > > > > +
> > > > > +  __builtin_memcpy (ph_exp, q, 64);
> > > > > +
> > > > > +  foo_ph (ph_dst, ph_src);
> > > > > +
> > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > > > +    __builtin_abort ();
> > > > > +
> > > > > +  return;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > new file mode 100644
> > > > > index 00000000000..2ea01fac927
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > @@ -0,0 +1,58 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > > > +
> > > > > +#define N 10000
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_pd (_Complex double* a, _Complex double b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ps (_Complex float* a, _Complex float b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi64 (_Complex long long* a, _Complex long long b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi32 (_Complex int* a, _Complex int b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi16 (_Complex short* a, _Complex short b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi8 (_Complex char* a, _Complex char b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > new file mode 100644
> > > > > index 00000000000..26482cc10f5
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > @@ -0,0 +1,63 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > +/* { dg-require-effective-target avx } */
> > > > > +
> > > > > +#include "avx-check.h"
> > > > > +#include <string.h>
> > > > > +#include "pr106010-7a.c"
> > > > > +
> > > > > +void
> > > > > +avx_test (void)
> > > > > +{
> > > > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > > > +
> > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > +
> > > > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > > > +    p_init[i] = i % 2 + 3;
> > > > > +
> > > > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > > > > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > > > > > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > > > > > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > > > > > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > > > > > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > > > > +
> > > > > +  foo_pd (pd_dst, pd_src[0]);
> > > > > +  foo_ps (ps_dst, ps_src[0]);
> > > > > +  foo_epi64 (epi64_dst, epi64_src[0]);
> > > > > +  foo_epi32 (epi32_dst, epi32_src[0]);
> > > > > +  foo_epi16 (epi16_dst, epi16_src[0]);
> > > > > +  foo_epi8 (epi8_dst, epi8_src[0]);
> > > > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +
> > > > > +  return;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > new file mode 100644
> > > > > index 00000000000..7f4056a5ecc
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > @@ -0,0 +1,41 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > +
> > > > > +#include <string.h>
> > > > > +
> > > > > +static void do_test (void);
> > > > > +
> > > > > +#define DO_TEST do_test
> > > > > +#define AVX512FP16
> > > > > +#include "avx512-check.h"
> > > > > +
> > > > > +#define N 10000
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = b;
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +do_test (void)
> > > > > +{
> > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > > > +
> > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > +
> > > > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > > > +    p_init[i] = i % 2 + 3;
> > > > > +
> > > > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > > > +
> > > > > +  foo_ph (ph_dst, ph_src[0]);
> > > > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > > > +    __builtin_abort ();
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > new file mode 100644
> > > > > index 00000000000..11054b60d30
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > @@ -0,0 +1,58 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > > > +
> > > > > +#define N 10000
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_pd (_Complex double* a)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = 1.0 + 2.0i;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ps (_Complex float* a)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = 1.0f + 2.0fi;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi64 (_Complex long long* a)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = 1 + 2i;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi32 (_Complex int* a)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = 1 + 2i;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi16 (_Complex short* a)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = 1 + 2i;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_epi8 (_Complex char* a)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = 1 + 2i;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > new file mode 100644
> > > > > index 00000000000..6bb0073b691
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > @@ -0,0 +1,53 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > +/* { dg-require-effective-target avx } */
> > > > > +
> > > > > +#include "avx-check.h"
> > > > > +#include <string.h>
> > > > > +#include "pr106010-8a.c"
> > > > > +
> > > > > +void
> > > > > +avx_test (void)
> > > > > +{
> > > > > +  _Complex double pd_src = 1.0 + 2.0i;
> > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > +  _Complex float ps_src = 1.0 + 2.0i;
> > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > +  _Complex long long epi64_src = 1 + 2i;;
> > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > +  _Complex int epi32_src = 1 + 2i;
> > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > +  _Complex short epi16_src = 1 + 2i;
> > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > +  _Complex char epi8_src = 1 + 2i;
> > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > +
> > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > +
> > > > > +  foo_pd (pd_dst);
> > > > > +  foo_ps (ps_dst);
> > > > > +  foo_epi64 (epi64_dst);
> > > > > +  foo_epi32 (epi32_dst);
> > > > > +  foo_epi16 (epi16_dst);
> > > > > +  foo_epi8 (epi8_dst);
> > > > > +  for (int i = 0 ; i != N; i++)
> > > > > +    {
> > > > > +      if (pd_dst[i] != pd_src)
> > > > > +       __builtin_abort ();
> > > > > +      if (ps_dst[i] != ps_src)
> > > > > +       __builtin_abort ();
> > > > > +      if (epi64_dst[i] != epi64_src)
> > > > > +       __builtin_abort ();
> > > > > +      if (epi32_dst[i] != epi32_src)
> > > > > +       __builtin_abort ();
> > > > > +      if (epi16_dst[i] != epi16_src)
> > > > > +       __builtin_abort ();
> > > > > +      if (epi8_dst[i] != epi8_src)
> > > > > +       __builtin_abort ();
> > > > > +    }
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > new file mode 100644
> > > > > index 00000000000..61ae131829d
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > @@ -0,0 +1,38 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > +
> > > > > +#include <string.h>
> > > > > +
> > > > > +static void do_test (void);
> > > > > +
> > > > > +#define DO_TEST do_test
> > > > > +#define AVX512FP16
> > > > > +#include "avx512-check.h"
> > > > > +
> > > > > +#define N 10000
> > > > > +
> > > > > +void
> > > > > +__attribute__((noipa))
> > > > > +foo_ph (_Complex _Float16* a)
> > > > > +{
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    a[i] = 1.0f16 + 2.0f16i;
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +do_test (void)
> > > > > +{
> > > > > +  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > +
> > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > +
> > > > > +  foo_ph (ph_dst);
> > > > > +  for (int i = 0; i != N; i++)
> > > > > +    {
> > > > > +      if (ph_dst[i] != ph_src)
> > > > > +       __builtin_abort ();
> > > > > +    }
> > > > > +}
> > > > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > > > > index d20a10a1524..42ee9df674c 100644
> > > > > --- a/gcc/tree-vect-data-refs.cc
> > > > > +++ b/gcc/tree-vect-data-refs.cc
> > > > > @@ -1403,7 +1403,8 @@ vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
> > > > >    if (PURE_SLP_STMT (stmt_info))
> > > > >      ncopies = 1;
> > > > >    else
> > > > > -    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
> > > > > +    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info),
> > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > >
> > > > >    if (DR_IS_READ (dr_info->dr))
> > > > >      vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> > > > > @@ -4597,8 +4598,22 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > > > >
> > > > >        /* Set vectype for STMT.  */
> > > > >        scalar_type = TREE_TYPE (DR_REF (dr));
> > > > > -      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
> > > > > -      if (!vectype)
> > > > > +      tree adjust_scalar_type = scalar_type;
> > > > > +      /* Support Complex type access. Note that the complex type of load/store
> > > > > +        does not support gather/scatter.  */
> > > > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE
> > > > > +         && gatherscatter == SG_NONE)
> > > > > +       {
> > > > > +         adjust_scalar_type = TREE_TYPE (scalar_type);
> > > > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > +       }
> > > > > +      tree vectype = get_vectype_for_scalar_type (vinfo, adjust_scalar_type);
> > > > > +      unsigned HOST_WIDE_INT constant_nunits;
> > > > > +      if (!vectype
> > > > > +         /* For complex type, V1DI doesn't make sense.  */
> > > > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > +             && (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&constant_nunits)
> > > > > +                 || constant_nunits == 1)))
> > > > >          {
> > > > >            if (dump_enabled_p ())
> > > > >              {
> > > > > @@ -4635,8 +4650,11 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > > > >         }
> > > > >
> > > > >        /* Adjust the minimal vectorization factor according to the
> > > > > -        vector type.  */
> > > > > +        vector type. Note for complex type, VF is half of
> > > > > +        TYPE_VECTOR_SUBPARTS.  */
> > > > >        vf = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > +       vf = exact_div (vf, 2);
> > > > >        *min_vf = upper_bound (*min_vf, vf);
> > > > >
> > > > >        /* Leave the BB vectorizer to pick the vector type later, based on
> > > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > > > index 3a70c15b593..365fa738022 100644
> > > > > --- a/gcc/tree-vect-loop.cc
> > > > > +++ b/gcc/tree-vect-loop.cc
> > > > > @@ -200,7 +200,12 @@ vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > >      }
> > > > >
> > > > >    if (nunits_vectype)
> > > > > -    vect_update_max_nunits (vf, nunits_vectype);
> > > > > +    {
> > > > > +      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (nunits_vectype);
> > > > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > +       nunits = exact_div (nunits, 2);
> > > > > +      vect_update_max_nunits (vf, nunits);
> > > > > +    }
> > > > >
> > > > >    return opt_result::success ();
> > > > >  }
> > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > > > > index dab5daddcc5..5d66ea2f286 100644
> > > > > --- a/gcc/tree-vect-slp.cc
> > > > > +++ b/gcc/tree-vect-slp.cc
> > > > > @@ -877,10 +877,14 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > >        return false;
> > > > >      }
> > > > >
> > > > > +  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > +    nunits = exact_div (nunits, 2);
> > > > > +
> > > > >    /* If populating the vector type requires unrolling then fail
> > > > >       before adjusting *max_nunits for basic-block vectorization.  */
> > > > >    if (is_a <bb_vec_info> (vinfo)
> > > > > -      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> > > > > +      && !multiple_p (group_size , nunits))
> > > > >      {
> > > > >        if (dump_enabled_p ())
> > > > >         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > @@ -891,7 +895,7 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > >      }
> > > > >
> > > > >    /* In case of multiple types we need to detect the smallest type.  */
> > > > > -  vect_update_max_nunits (max_nunits, vectype);
> > > > > +  vect_update_max_nunits (max_nunits, nunits);
> > > > >    return true;
> > > > >  }
> > > > >
> > > > > @@ -3720,22 +3724,54 @@ vect_optimize_slp (vec_info *vinfo)
> > > > >          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
> > > > >          when permuting constants and invariants keeping the permute
> > > > >          bijective.  */
> > > > > -      auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > > > -      bitmap_clear (load_index);
> > > > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > -       bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > > > -      unsigned j;
> > > > > -      for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > -       if (!bitmap_bit_p (load_index, j))
> > > > > -         break;
> > > > > -      if (j != SLP_TREE_LANES (node))
> > > > > -       continue;
> > > > > +      /* Permutation of Complex type.  */
> > > > > +      if (STMT_VINFO_COMPLEX_P (dr_stmt))
> > > > > +       {
> > > > > +         auto_sbitmap load_index (SLP_TREE_LANES (node) * 2);
> > > > > +         bitmap_clear (load_index);
> > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > +           {
> > > > > +             unsigned bit = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > +             bitmap_set_bit (load_index, 2 * bit);
> > > > > +             bitmap_set_bit (load_index, 2 * bit + 1);
> > > > > +           }
> > > > > +         unsigned j;
> > > > > +         for (j = 0; j < SLP_TREE_LANES (node) * 2; ++j)
> > > > > +           if (!bitmap_bit_p (load_index, j))
> > > > > +             break;
> > > > > +         if (j != SLP_TREE_LANES (node) * 2)
> > > > > +           continue;
> > > > >
> > > > > -      vec<unsigned> perm = vNULL;
> > > > > -      perm.safe_grow (SLP_TREE_LANES (node), true);
> > > > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > -       perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > -      perms.safe_push (perm);
> > > > > +         vec<unsigned> perm = vNULL;
> > > > > +         perm.safe_grow (SLP_TREE_LANES (node) * 2, true);
> > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > +           {
> > > > > +             unsigned cidx = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > +             perm[2 * j] = 2 * cidx;
> > > > > +             perm[2 * j + 1] = 2 * cidx + 1;
> > > > > +           }
> > > > > +         perms.safe_push (perm);
> > > > > +       }
> > > > > +      else
> > > > > +       {
> > > > > +         auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > > > +         bitmap_clear (load_index);
> > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > +           bitmap_set_bit (load_index,
> > > > > +                           SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > > > +         unsigned j;
> > > > > +         for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > +           if (!bitmap_bit_p (load_index, j))
> > > > > +             break;
> > > > > +         if (j != SLP_TREE_LANES (node))
> > > > > +           continue;
> > > > > +
> > > > > +         vec<unsigned> perm = vNULL;
> > > > > +         perm.safe_grow (SLP_TREE_LANES (node), true);
> > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > +           perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > +         perms.safe_push (perm);
> > > > > +       }
> > > > >        vertices[idx].perm_in = perms.length () - 1;
> > > > >        vertices[idx].perm_out = perms.length () - 1;
> > > > >      }
> > > > > @@ -4518,6 +4554,12 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
> > > > >         vf = loop_vinfo->vectorization_factor;
> > > > >        else
> > > > >         vf = 1;
> > > > > > +      /* For complex type and SLP, double vf to get the right vectype,
> > > > > > +        i.e. for complex double with vector(4) double and group size 2,
> > > > > > +        double vf so that vf * group_size maps to TYPE_VECTOR_SUBPARTS.  */
> > > > > +     if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > +       vf *= 2;
> > > > > +
> > > > >        unsigned int group_size = SLP_TREE_LANES (node);
> > > > >        tree vectype = SLP_TREE_VECTYPE (node);
> > > > >        SLP_TREE_NUMBER_OF_VEC_STMTS (node)
> > > > > @@ -4763,10 +4805,17 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
> > > > >             }
> > > > >           unsigned group_size = SLP_TREE_LANES (child);
> > > > >           poly_uint64 vf = 1;
> > > > > +
> > > > >           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
> > > > >             vf = loop_vinfo->vectorization_factor;
> > > > > +
> > > > > > +         /* V2SF holds just 1 complex value, so multiply by 2
> > > > > > +            to get the real number of vectors.  */
> > > > > +         unsigned cp
> > > > > +           = STMT_VINFO_COMPLEX_P (SLP_TREE_REPRESENTATIVE (node)) ? 2 : 1;
> > > > > +
> > > > >           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
> > > > > -           = vect_get_num_vectors (vf * group_size, vector_type);
> > > > > +           = vect_get_num_vectors (vf * group_size * cp, vector_type);
> > > > >           /* And cost them.  */
> > > > >           vect_prologue_cost_for_slp (child, cost_vec);
> > > > >         }
> > > > > @@ -6402,6 +6451,11 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > >
> > > > >    /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
> > > > >    vector_type = SLP_TREE_VECTYPE (op_node);
> > > > > +  unsigned int cp = 1;
> > > > > +  /* Handle Complex type vector init.
> > > > > +     SLP_TREE_REPRESENTATIVE (op_node) could be NULL.  */
> > > > > +  if (TREE_CODE (TREE_TYPE (op_node->ops[0])) == COMPLEX_TYPE)
> > > > > +    cp = 2;
> > > > >
> > > > >    unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
> > > > >    SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
> > > > > @@ -6426,9 +6480,9 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > >    /* When using duplicate_and_interleave, we just need one element for
> > > > >       each scalar statement.  */
> > > > >    if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
> > > > > -    nunits = group_size;
> > > > > +    nunits = group_size * cp;
> > > > >
> > > > > -  number_of_copies = nunits * number_of_vectors / group_size;
> > > > > +  number_of_copies = nunits * number_of_vectors / (group_size * cp);
> > > > >
> > > > >    number_of_places_left_in_vector = nunits;
> > > > >    constant_p = true;
> > > > > @@ -6460,8 +6514,23 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > >                         gcc_unreachable ();
> > > > >                     }
> > > > >                   else
> > > > > -                   op = fold_unary (VIEW_CONVERT_EXPR,
> > > > > -                                    TREE_TYPE (vector_type), op);
> > > > > +                   {
> > > > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > > > +                     /* For complex type, insert real and imag part
> > > > > +                        separately.  */
> > > > > +                     if (cp == 2)
> > > > > +                       {
> > > > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > > > +                                      == COMPLEX_TYPE)
> > > > > +                                     && (scalar_type
> > > > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > > > +                         elts[number_of_places_left_in_vector--]
> > > > > +                           = fold_unary (IMAGPART_EXPR, scalar_type, op);
> > > > > +                         op = fold_unary (REALPART_EXPR, scalar_type, op);
> > > > > +                       }
> > > > > +                     else
> > > > > +                       op = fold_unary (VIEW_CONVERT_EXPR, scalar_type, op);
> > > > > +                   }
> > > > >                   gcc_assert (op && CONSTANT_CLASS_P (op));
> > > > >                 }
> > > > >               else
> > > > > @@ -6481,11 +6550,28 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > >                     }
> > > > >                   else
> > > > >                     {
> > > > > -                     op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
> > > > > -                                  op);
> > > > > -                     init_stmt
> > > > > -                       = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > > > -                                              op);
> > > > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > > > +                     if (cp == 2)
> > > > > +                       {
> > > > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > > > +                                      == COMPLEX_TYPE)
> > > > > +                                     && (scalar_type
> > > > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > > > +                         tree imag = build1 (IMAGPART_EXPR, scalar_type, op);
> > > > > +                         op = build1 (REALPART_EXPR, scalar_type, op);
> > > > > +                         tree imag_temp = make_ssa_name (scalar_type);
> > > > > +                         elts[number_of_places_left_in_vector--] = imag_temp;
> > > > > +                         init_stmt = gimple_build_assign (imag_temp, imag);
> > > > > +                         gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > > > +                         init_stmt = gimple_build_assign (new_temp, op);
> > > > > +                       }
> > > > > +                     else
> > > > > +                       {
> > > > > +                         op = build1 (VIEW_CONVERT_EXPR, scalar_type, op);
> > > > > +                         init_stmt
> > > > > +                           = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > > > +                                                  op);
> > > > > +                       }
> > > > >                     }
> > > > >                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > > >                   op = new_temp;
> > > > > @@ -6696,15 +6782,17 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > >    unsigned int nelts_to_build;
> > > > >    unsigned int nvectors_per_build;
> > > > >    unsigned int in_nlanes;
> > > > > +  unsigned int cp = STMT_VINFO_COMPLEX_P (stmt_info) ? 2 : 1;
> > > > >    bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
> > > > > -                     && multiple_p (nunits, group_size));
> > > > > +                     && multiple_p (nunits, group_size * cp));
> > > > >    if (repeating_p)
> > > > >      {
> > > > >        /* A single vector contains a whole number of copies of the node, so:
> > > > >          (a) all permutes can use the same mask; and
> > > > >          (b) the permutes only need a single vector input.  */
> > > > > -      mask.new_vector (nunits, group_size, 3);
> > > > > -      nelts_to_build = mask.encoded_nelts ();
> > > > > +      /* For complex type, mask size should be double of nelts_to_build.  */
> > > > > +      mask.new_vector (nunits, group_size * cp, 3);
> > > > > +      nelts_to_build = mask.encoded_nelts () / cp;
> > > > >        nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
> > > > >        in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
> > > > >      }
> > > > > @@ -6744,8 +6832,8 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > >         {
> > > > >           /* Enforced before the loop when !repeating_p.  */
> > > > >           unsigned int const_nunits = nunits.to_constant ();
> > > > > -         vec_index = i / const_nunits;
> > > > > -         mask_element = i % const_nunits;
> > > > > +         vec_index = i / (const_nunits / cp);
> > > > > +         mask_element = i % (const_nunits / cp);
> > > > >           if (vec_index == first_vec_index
> > > > >               || first_vec_index == -1)
> > > > >             {
> > > > > @@ -6755,7 +6843,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > >                    || second_vec_index == -1)
> > > > >             {
> > > > >               second_vec_index = vec_index;
> > > > > -             mask_element += const_nunits;
> > > > > +             mask_element += (const_nunits / cp);
> > > > >             }
> > > > >           else
> > > > >             {
> > > > > @@ -6768,14 +6856,24 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > >               return false;
> > > > >             }
> > > > >
> > > > > -         gcc_assert (mask_element < 2 * const_nunits);
> > > > > +         gcc_assert (mask_element < 2 * const_nunits / cp);
> > > > >         }
> > > > >
> > > > >        if (mask_element != index)
> > > > >         noop_p = false;
> > > > > -      mask[index++] = mask_element;
> > > > > +      /* Set index for Complex _type.
> > > > > +        i.e. mask like [1,0] is actually [2, 3, 0, 1]
> > > > > +        for vector scalar type.  */
> > > > > +      if (cp == 2)
> > > > > +       {
> > > > > +         mask[2 * index] = 2 * mask_element;
> > > > > +         mask[2 * index + 1] = 2 * mask_element + 1;
> > > > > +       }
> > > > > +      else
> > > > > +       mask[index] = mask_element;
> > > > > +      index++;
> > > > >
> > > > > -      if (index == count && !noop_p)
> > > > > +      if (index * cp == count && !noop_p)
> > > > >         {
> > > > >           indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
> > > > >           if (!can_vec_perm_const_p (mode, mode, indices))
> > > > > @@ -6799,7 +6897,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > >           ++*n_perms;
> > > > >         }
> > > > >
> > > > > -      if (index == count)
> > > > > +      if (index * cp == count)
> > > > >         {
> > > > >           if (!analyze_only)
> > > > >             {
> > > > > @@ -6869,7 +6967,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > >           bool load_seen = false;
> > > > >           for (unsigned i = 0; i < in_nlanes; ++i)
> > > > >             {
> > > > > -             if (i % const_nunits == 0)
> > > > > +             if (i % (const_nunits * cp) == 0)
> > > > >                 {
> > > > >                   if (load_seen)
> > > > >                     *n_loads += 1;
> > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > > > index 72107afc883..8af3b558be4 100644
> > > > > --- a/gcc/tree-vect-stmts.cc
> > > > > +++ b/gcc/tree-vect-stmts.cc
> > > > > @@ -1397,25 +1397,70 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > > > >  {
> > > > >    gimple *init_stmt;
> > > > >    tree new_temp;
> > > > > +  tree scalar_type = TREE_TYPE (type);
> > > > > +  gimple_seq stmts = NULL;
> > > > > +
> > > > > +  if (TREE_CODE (TREE_TYPE (val)) == COMPLEX_TYPE)
> > > > > +    {
> > > > > +      unsigned HOST_WIDE_INT nunits;
> > > > > +      gcc_assert (TYPE_VECTOR_SUBPARTS (type).is_constant (&nunits));
> > > > >
> > > > > +      tree_vector_builder elts (type, nunits, 1);
> > > > > +      tree imag, real;
> > > > > +      if (TREE_CODE (val) == COMPLEX_CST)
> > > > > +       {
> > > > > +         real = fold_unary (REALPART_EXPR, scalar_type, val);
> > > > > +         imag = fold_unary (IMAGPART_EXPR, scalar_type, val);
> > > > > +       }
> > > > > +      else
> > > > > +       {
> > > > > +         real = make_ssa_name (scalar_type);
> > > > > +         imag = make_ssa_name (scalar_type);
> > > > > +         init_stmt
> > > > > +           = gimple_build_assign (real,
> > > > > +                                  build1 (REALPART_EXPR, scalar_type, val));
> > > > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > > > +         init_stmt
> > > > > +           = gimple_build_assign (imag,
> > > > > +                                  build1 (IMAGPART_EXPR, scalar_type, val));
> > > > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > > > +       }
> > > > > +
> > > > > +      /* Build vector as [real,imag,real,imag,...].  */
> > > > > +      for (unsigned i = 0; i != nunits; i++)
> > > > > +       {
> > > > > +         if (i % 2)
> > > > > +           elts.quick_push (imag);
> > > > > +         else
> > > > > +           elts.quick_push (real);
> > > > > +       }
> > > > > +      val = gimple_build_vector (&stmts, &elts);
> > > > > +      if (!gimple_seq_empty_p (stmts))
> > > > > +       {
> > > > > +         if (gsi)
> > > > > +           gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > > > > +         else
> > > > > +           vinfo->insert_seq_on_entry (stmt_info, stmts);
> > > > > +       }
> > > > > +    }
> > > > >    /* We abuse this function to push sth to a SSA name with initial 'val'.  */
> > > > > -  if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > > > +  else if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > > >      {
> > > > >        gcc_assert (TREE_CODE (type) == VECTOR_TYPE);
> > > > > -      if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
> > > > > +      if (! types_compatible_p (scalar_type, TREE_TYPE (val)))
> > > > >         {
> > > > >           /* Scalar boolean value should be transformed into
> > > > >              all zeros or all ones value before building a vector.  */
> > > > >           if (VECTOR_BOOLEAN_TYPE_P (type))
> > > > >             {
> > > > > -             tree true_val = build_all_ones_cst (TREE_TYPE (type));
> > > > > -             tree false_val = build_zero_cst (TREE_TYPE (type));
> > > > > +             tree true_val = build_all_ones_cst (scalar_type);
> > > > > +             tree false_val = build_zero_cst (scalar_type);
> > > > >
> > > > >               if (CONSTANT_CLASS_P (val))
> > > > >                 val = integer_zerop (val) ? false_val : true_val;
> > > > >               else
> > > > >                 {
> > > > > -                 new_temp = make_ssa_name (TREE_TYPE (type));
> > > > > +                 new_temp = make_ssa_name (scalar_type);
> > > > >                   init_stmt = gimple_build_assign (new_temp, COND_EXPR,
> > > > >                                                    val, true_val, false_val);
> > > > >                   vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
> > > > > @@ -1424,14 +1469,13 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > > > >             }
> > > > >           else
> > > > >             {
> > > > > -             gimple_seq stmts = NULL;
> > > > >               if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
> > > > >                 val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
> > > > > -                                   TREE_TYPE (type), val);
> > > > > +                                   scalar_type, val);
> > > > >               else
> > > > >                 /* ???  Condition vectorization expects us to do
> > > > >                    promotion of invariant/external defs.  */
> > > > > -               val = gimple_convert (&stmts, TREE_TYPE (type), val);
> > > > > +               val = gimple_convert (&stmts, scalar_type, val);
> > > > >               for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
> > > > >                    !gsi_end_p (gsi2); )
> > > > >                 {
> > > > > @@ -1496,7 +1540,12 @@ vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> > > > >                && VECTOR_BOOLEAN_TYPE_P (stmt_vectype))
> > > > >         vector_type = truth_type_for (stmt_vectype);
> > > > >        else
> > > > > -       vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
> > > > > +       {
> > > > > +         tree scalar_type = TREE_TYPE (op);
> > > > > +         if (STMT_VINFO_COMPLEX_P (stmt_vinfo))
> > > > > +           scalar_type = TREE_TYPE (scalar_type);
> > > > > +         vector_type = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
> > > > > +       }
> > > > >
> > > > >        gcc_assert (vector_type);
> > > > >        tree vop = vect_init_vector (vinfo, stmt_vinfo, op, vector_type, NULL);
> > > > > @@ -7509,8 +7558,17 @@ vectorizable_store (vec_info *vinfo,
> > > > >       same location twice.  */
> > > > >    gcc_assert (slp == PURE_SLP_STMT (stmt_info));
> > > > >
> > > > > +  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > > > +    return false;
> > > > > +
> > > > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info), rhs_vectype = NULL_TREE;
> > > > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > +    {
> > > > > +      if (!nunits.is_constant ())
> > > > > +       return false;
> > > > > +      nunits = exact_div (nunits, 2);
> > > > > +    }
> > > > >
> > > > >    if (loop_vinfo)
> > > > >      {
> > > > > @@ -7526,7 +7584,8 @@ vectorizable_store (vec_info *vinfo,
> > > > >    if (slp)
> > > > >      ncopies = 1;
> > > > >    else
> > > > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > >
> > > > >    gcc_assert (ncopies >= 1);
> > > > >
> > > > > @@ -7546,9 +7605,6 @@ vectorizable_store (vec_info *vinfo,
> > > > >    elem_type = TREE_TYPE (vectype);
> > > > >    vec_mode = TYPE_MODE (vectype);
> > > > >
> > > > > -  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > > > -    return false;
> > > > > -
> > > > >    vect_memory_access_type memory_access_type;
> > > > >    enum dr_alignment_support alignment_support_scheme;
> > > > >    int misalignment;
> > > > > @@ -8778,6 +8834,12 @@ vectorizable_load (vec_info *vinfo,
> > > > >
> > > > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> > > > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > +    {
> > > > > +      if (!nunits.is_constant ())
> > > > > +       return false;
> > > > > +      nunits = exact_div (nunits, 2);
> > > > > +    }
> > > > >
> > > > >    if (loop_vinfo)
> > > > >      {
> > > > > @@ -8794,7 +8856,8 @@ vectorizable_load (vec_info *vinfo,
> > > > >    if (slp)
> > > > >      ncopies = 1;
> > > > >    else
> > > > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > >
> > > > >    gcc_assert (ncopies >= 1);
> > > > >
> > > > > @@ -8870,8 +8933,11 @@ vectorizable_load (vec_info *vinfo,
> > > > >                 if (k > maxk)
> > > > >                   maxk = k;
> > > > >               tree vectype = SLP_TREE_VECTYPE (slp_node);
> > > > > +             /* For complex type, half the nunits.  */
> > > > >               if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> > > > > -                 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> > > > > +                 || maxk >= (DR_GROUP_SIZE (group_info)
> > > > > +                             & ~((STMT_VINFO_COMPLEX_P (group_info)
> > > > > +                                  ? nunits >> 1 : nunits) - 1)))
> > > > >                 {
> > > > >                   if (dump_enabled_p ())
> > > > >                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > @@ -12499,12 +12565,27 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > >             dump_printf_loc (MSG_NOTE, vect_location,
> > > > >                              "get vectype for scalar type: %T\n", scalar_type);
> > > > >         }
> > > > > +
> > > > > +      tree orig_scalar_type = scalar_type;
> > > > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > > > +       {
> > > > > +         /* Set complex_p for BB vectorizer.  */
> > > > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > +         scalar_type = TREE_TYPE (scalar_type);
> > > > > +         /* Double group_size for BB vectorizer to make
> > > > > +            following 2 get_vectype_for_scalar_type return wanted vectype.
> > > > > +            Real group size is not changed, just make the "faked" input
> > > > > +            group_size.  */
> > > > > +         group_size *= 2;
> > > > > +       }
> > > > >        vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
> > > > > -      if (!vectype)
> > > > > +      if (!vectype
> > > > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > +             && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()))
> > > > >         return opt_result::failure_at (stmt,
> > > > >                                        "not vectorized:"
> > > > >                                        " unsupported data-type %T\n",
> > > > > -                                      scalar_type);
> > > > > +                                      orig_scalar_type);
> > > > >
> > > > >        if (dump_enabled_p ())
> > > > >         dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
> > > > > @@ -12529,16 +12610,30 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > >                                                    TREE_TYPE (vectype));
> > > > >        if (scalar_type != TREE_TYPE (vectype))
> > > > >         {
> > > > > -         if (dump_enabled_p ())
> > > > > +         tree orig_scalar_type = scalar_type;
> > > > > +         if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > > > +           {
> > > > > +             /* Set complex_p for Loop vectorizer.  */
> > > > > +             STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > +             scalar_type = TREE_TYPE (scalar_type);
> > > > > +             if (dump_enabled_p ())
> > > > > +               dump_printf_loc (MSG_NOTE, vect_location,
> > > > > +                            "get complex for smallest scalar type: %T\n",
> > > > > +                            scalar_type);
> > > > > +
> > > > > +           }
> > > > > +         else if (dump_enabled_p ())
> > > > >             dump_printf_loc (MSG_NOTE, vect_location,
> > > > >                              "get vectype for smallest scalar type: %T\n",
> > > > >                              scalar_type);
> > > > >           nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
> > > > >                                                         group_size);
> > > > > -         if (!nunits_vectype)
> > > > > +         if (!nunits_vectype
> > > > > +             || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > +                 && !TYPE_VECTOR_SUBPARTS (nunits_vectype).is_constant ()))
> > > > >             return opt_result::failure_at
> > > > >               (stmt, "not vectorized: unsupported data-type %T\n",
> > > > > -              scalar_type);
> > > > > +              orig_scalar_type);
> > > > >           if (dump_enabled_p ())
> > > > >             dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
> > > > >                              nunits_vectype);
> > > > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > > > index e5fdc9e0a14..4a809e492c4 100644
> > > > > --- a/gcc/tree-vectorizer.h
> > > > > +++ b/gcc/tree-vectorizer.h
> > > > > @@ -1161,6 +1161,9 @@ public:
> > > > >       vectorization.  */
> > > > >    bool vectorizable;
> > > > >
> > > > > +  /* The scalar type of the LHS of this statement is complex type.  */
> > > > > +  bool complex_p;
> > > > > +
> > > > >    /* The stmt to which this info struct refers to.  */
> > > > >    gimple *stmt;
> > > > >
> > > > > @@ -1395,6 +1398,7 @@ struct gather_scatter_info {
> > > > >  #define STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT(S) (S)->reduc_epilogue_adjustment
> > > > >  #define STMT_VINFO_REDUC_IDX(S)                   (S)->reduc_idx
> > > > >  #define STMT_VINFO_FORCE_SINGLE_CYCLE(S)   (S)->force_single_cycle
> > > > > +#define STMT_VINFO_COMPLEX_P(S)            (S)->complex_p
> > > > >
> > > > >  #define STMT_VINFO_DR_WRT_VEC_LOOP(S)      (S)->dr_wrt_vec_loop
> > > > >  #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_wrt_vec_loop.base_address
> > > > > @@ -1970,6 +1974,15 @@ vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype)
> > > > >    return vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo), vectype);
> > > > >  }
> > > > >
> > > > > +static inline unsigned int
> > > > > +vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype, bool complex_p)
> > > > > +{
> > > > > +  poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > > > +  if (complex_p)
> > > > > +    nunits *= 2;
> > > > > +  return vect_get_num_vectors (nunits, vectype);
> > > > > +}
> > > > > +
> > > > >  /* Update maximum unit count *MAX_NUNITS so that it accounts for
> > > > >     NUNITS.  *MAX_NUNITS can be 1 if we haven't yet recorded anything.  */
> > > > >
> > > > > --
> > > > > 2.18.1
> > > > >
> > >
> > >
> > >
> > > --
> > > BR,
> > > Hongtao
>
>
>
> --
> BR,
> Hongtao
Richard Biener July 14, 2022, 8:20 a.m. UTC | #7
On Wed, Jul 13, 2022 at 9:34 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, Jul 13, 2022 at 6:47 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Tue, Jul 12, 2022 at 10:12 PM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Tue, Jul 12, 2022 at 6:11 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Mon, Jul 11, 2022 at 7:47 PM Richard Biener via Gcc-patches
> > > > <gcc-patches@gcc.gnu.org> wrote:
> > > > >
> > > > > On Mon, Jul 11, 2022 at 5:44 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > > >
> > > > > > The patch only handles load/store(including ctor/permutation, except
> > > > > > gather/scatter) for complex type, other operations don't needs to be
> > > > > > handled since they will be lowered by pass cplxlower.(MASK_LOAD is not
> > > > > > supported for complex type, so no need to handle either).
> > > > >
> > > > > (*)
> > > > >
> > > > > > Instead of support vector(2) _Complex double, this patch takes vector(4)
> > > > > > double as vector type of _Complex double. Since vectorizer originally
> > > > > > takes TYPE_VECTOR_SUBPARTS as nunits which is not true for complex
> > > > > > type, the patch handles nunits/ncopies/vf specially for complex type.
> > > > >
> > > > > For the limited set above(*) can you explain what's "special" about
> > > > > vector(2) _Complex
> > > > > vs. vector(4) double, thus why we need to have STMT_VINFO_COMPLEX_P at all?
> > > > Supporting a vector(2) complex  is a straightforward idea, just like
> > > > supporting other scalar type in vectorizer, but it requires more
> > > > efforts(in the backend and frontend), considering that most of
> > > > operations of complex type will be lowered into realpart and imagpart
> > > > operations, supporting a vector(2) complex does not look that
> > > > necessary. Then it comes up with supporting vector(4) double(with
> > > > adjustment of vf/ctor/permutation), the vectorizer only needs to
> > > > handle the vectorization of the move operation of the complex type(no
> > > > need to worry about wrongly mapping vector(4) double multiplication to
> > > > complex type multiplication since it's already lowered before
> > > > vectorizer).
> > > > stmt_info does not record the scalar type, in order to avoid duplicate
> > > > operation like getting a lhs type from stmt to determine whether it is
> > > > a complex type, STMT_VINFO_COMPLEX_P bit is added, this bit is mainly
> > > > initialized in vect_analyze_data_refs and vect_get_vector_types_for_
> > > > stmt.
> > > > >
> > > > > I wonder to what extent your handling can be extended to support re-vectorizing
> > > > > (with a higher VF for example) already vectorized code?  The vectorizer giving
> > > > > up on vector(2) double looks quite obviously similar to it giving up
> > > > > on _Complex double ...
> > > > Yes, it can be extended to vector(2) double/float/int/.... with a bit
> > > > adjustment(exacting element by using bit_field instead of
> > > > imagpart_expr/realpart_expr).
> > > > > It would be a shame to not use the same underlying mechanism for dealing with
> > > > > both, where for the vector case obviously vector(4) would be supported as well.
> > > > >
> > > > > In principle _Complex double operations should be two SLP lanes but it seems you
> > > > > are handling them with classical interleaving as well?
> > > > I'm only handling move operations, for other operations it will be
> > > > lowered to realpart and imagpart and thus two SLP lanes.
> > >
> > > Yes, I understood that.
> > >
> > > Doing it more general (and IMHO better) would involve enhancing
> > > how we represent dataref groups, maintaining the number of scalars
> > > covered by each of the vinfos.  On the SLP representation side it
> > > probably requires to rely on the representative for access and not
> > > on the scalar stmts (since those do not map properly to the lanes).
> > >
> > > Ideally we'd be able to handle
> > >
> > > struct { _Complex double c; double a; double b; } a[], b[];
> > >
> > > void foo ()
> > > {
> > >    for (int i = 0; i < 100; ++i)
> > >     {
> > >       a[i].c = b[i].c;
> > >       a[i].a = b[i].a;
> > >       a[i].b = b[i].b;
> > >     }
> > > }
> > >
> > > which I guess your patch doesn't handle with plain AVX vector
> > > copies but instead uses interleaving for the _Complex and non-_Complex
> > > parts?
> > Indeed, it produces wrong code.
>
> For _Complex, in case we don't get to the "true and only" solution it
> might be easier to split the loads and stores when it's just memory
> copies and we have vectorization enabled and a supported vector
> mode that would surely re-assemble them (store-merging doesn't seem
> to do that).
>
> Btw, we seem to produce
>
>         movsd   b(%rip), %xmm0
>         movsd   %xmm0, a(%rip)
>         movsd   b+8(%rip), %xmm0
>         movsd   %xmm0, a+8(%rip)
>
> for a _Complex double memory copy on x86 which means we lack
> true DCmode support (pseudos get decomposed).  Not sure if we
> can somehow check whether a target has DCmode load/store
> support and key decomposing on that (maybe check the SET optab).
>
> It might be possible to check
>
> _Complex double a, b;
> void bar()
> {
>   a = b;
> }
>
> for all targets with a cc1 cross to see whether they somehow get
> loads/stores _not_ decomposed (also check _Complex float,
> I wouldn't worry for _Complex int or _Complex long double).

Btw, a point in favor of doing the above is that we already do it!  There
just needs to be an (unrelated) complex op in the function:

_Complex float a[2], b[2];
_Complex double foo(_Complex double x, _Complex double y)
{
  a[0] = b[0];
  a[1] = b[1];
  return x + y;
}

vs

void bar()
{
  a[0] = b[0];
  a[1] = b[1];
}

The key difference is that tree_lower_complex returns early here:

  if (!init_dont_simulate_again ())
    return 0;

init_dont_simulate_again returns whether it saw any complex op.

diff --git a/gcc/tree-complex.cc b/gcc/tree-complex.cc
index 61950a0f099..bdcb9968af1 100644
--- a/gcc/tree-complex.cc
+++ b/gcc/tree-complex.cc
@@ -297,6 +297,11 @@ init_dont_simulate_again (void)
                break;

              default:
+               /* When expand_complex_move would trigger make sure we
+                  perform lowering even when there is no actual complex
+                  operation.  This helps consistency and vectorization.  */
+               if (TREE_CODE (TREE_TYPE (gimple_op (stmt, 0))) == COMPLEX_TYPE)
+                 saw_a_complex_op = true;
                break;
              }

fixes that.  If this change tests OK (and fixes your set of new
vectorizer testcases) then I think that's the way to go for the
immediate issue of vectorizing _Complex.

Richard.

> Richard.
>
> > > Let me spend some time fleshing out what is necessary to make
> > > this work "properly".  We can consider your special-casing of _Complex
> > > memory ops if I can't manage to assess the complexity of the task.
> > >
> > > Thanks,
> > > Richard.
> > >
> > > > >
> > > > > Thanks,
> > > > > Richard.
> > > > >
> > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > > > Also test the patch for SPEC2017 and find there's complex type vectorization
> > > > > > in 510/549(but no performance impact).
> > > > > >
> > > > > > Any comments?
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > >         PR tree-optimization/106010
> > > > > >         * tree-vect-data-refs.cc (vect_get_data_access_cost):
> > > > > >         Pass complex_p to vect_get_num_copies to avoid ICE.
> > > > > >         (vect_analyze_data_refs): Support vectorization for Complex
> > > > > >         type with vector scalar types.
> > > > > >         * tree-vect-loop.cc (vect_determine_vf_for_stmt_1): VF should
> > > > > >         be half of TYPE_VECTOR_SUBPARTS when complex_p.
> > > > > >         * tree-vect-slp.cc (vect_record_max_nunits): nunits should be
> > > > > >         half of TYPE_VECTOR_SUBPARTS when complex_p.
> > > > > >         (vect_optimize_slp): Support permutation for complex type.
> > > > > >         (vect_slp_analyze_node_operations_1): Double nunits in
> > > > > >         vect_get_num_vectors to get right SLP_TREE_NUMBER_OF_VEC_STMTS
> > > > > >         when complex_p.
> > > > > >         (vect_slp_analyze_node_operations): Ditto.
> > > > > >         (vect_create_constant_vectors): Support CTOR for complex type.
> > > > > >         (vect_transform_slp_perm_load): Support permutation for
> > > > > >         complex type.
> > > > > >         * tree-vect-stmts.cc (vect_init_vector): Support complex type.
> > > > > >         (vect_get_vec_defs_for_operand): Get vector type for
> > > > > >         complex type.
> > > > > >         (vectorizable_store): Get right ncopies/nunits for complex
> > > > > >         type, also return false when complex_p and
> > > > > >         !TYPE_VECTOR_SUBPARTS.is_constant ().
> > > > > >         (vectorizable_load): Ditto.
> > > > > >         (vect_get_vector_types_for_stmt): Get vector type for complex type.
> > > > > >         * tree-vectorizer.h (STMT_VINFO_COMPLEX_P): New macro.
> > > > > >         (vect_get_num_copies): New overload.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > >         * gcc.target/i386/pr106010-1a.c: New test.
> > > > > >         * gcc.target/i386/pr106010-1b.c: New test.
> > > > > >         * gcc.target/i386/pr106010-1c.c: New test.
> > > > > >         * gcc.target/i386/pr106010-2a.c: New test.
> > > > > >         * gcc.target/i386/pr106010-2b.c: New test.
> > > > > >         * gcc.target/i386/pr106010-2c.c: New test.
> > > > > >         * gcc.target/i386/pr106010-3a.c: New test.
> > > > > >         * gcc.target/i386/pr106010-3b.c: New test.
> > > > > >         * gcc.target/i386/pr106010-3c.c: New test.
> > > > > >         * gcc.target/i386/pr106010-4a.c: New test.
> > > > > >         * gcc.target/i386/pr106010-4b.c: New test.
> > > > > >         * gcc.target/i386/pr106010-4c.c: New test.
> > > > > >         * gcc.target/i386/pr106010-5a.c: New test.
> > > > > >         * gcc.target/i386/pr106010-5b.c: New test.
> > > > > >         * gcc.target/i386/pr106010-5c.c: New test.
> > > > > >         * gcc.target/i386/pr106010-6a.c: New test.
> > > > > >         * gcc.target/i386/pr106010-6b.c: New test.
> > > > > >         * gcc.target/i386/pr106010-6c.c: New test.
> > > > > >         * gcc.target/i386/pr106010-7a.c: New test.
> > > > > >         * gcc.target/i386/pr106010-7b.c: New test.
> > > > > >         * gcc.target/i386/pr106010-7c.c: New test.
> > > > > >         * gcc.target/i386/pr106010-8a.c: New test.
> > > > > >         * gcc.target/i386/pr106010-8b.c: New test.
> > > > > >         * gcc.target/i386/pr106010-8c.c: New test.
> > > > > > ---
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 +++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 +++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 +++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 +++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 ++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 ++++++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 ++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 ++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 +++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 +++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 +++++++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 +++++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 +++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 +++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 +++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 ++++++
> > > > > >  gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
> > > > > >  gcc/tree-vect-data-refs.cc                  |  26 ++-
> > > > > >  gcc/tree-vect-loop.cc                       |   7 +-
> > > > > >  gcc/tree-vect-slp.cc                        | 174 +++++++++++++++-----
> > > > > >  gcc/tree-vect-stmts.cc                      | 135 ++++++++++++---
> > > > > >  gcc/tree-vectorizer.h                       |  13 ++
> > > > > >  29 files changed, 2064 insertions(+), 63 deletions(-)
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > >
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..b608f484934
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > > @@ -0,0 +1,58 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> > > > > > +
> > > > > > +#define N 10000
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_pd (_Complex double* a, _Complex double* b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b[i];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ps (_Complex float* a, _Complex float* b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b[i];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b[i];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi32 (_Complex int* a, _Complex int* b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b[i];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi16 (_Complex short* a, _Complex short* b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b[i];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi8 (_Complex char* a, _Complex char* b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b[i];
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..0f377c3a548
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > > @@ -0,0 +1,63 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > +/* { dg-require-effective-target avx } */
> > > > > > +
> > > > > > +#include "avx-check.h"
> > > > > > +#include <string.h>
> > > > > > +#include "pr106010-1a.c"
> > > > > > +
> > > > > > +void
> > > > > > +avx_test (void)
> > > > > > +{
> > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > > > > +
> > > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > > +
> > > > > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > > > > +    p_init[i] = i;
> > > > > > +
> > > > > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > > > > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > > > > > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > > > > > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > > > > > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > > > > > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > > > > > +
> > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +
> > > > > > +  return;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..f07e9fb2d3d
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > > @@ -0,0 +1,41 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > +
> > > > > > +#include <string.h>
> > > > > > +
> > > > > > +static void do_test (void);
> > > > > > +
> > > > > > +#define DO_TEST do_test
> > > > > > +#define AVX512FP16
> > > > > > +#include "avx512-check.h"
> > > > > > +
> > > > > > +#define N 10000
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b[i];
> > > > > > +}
> > > > > > +
> > > > > > +static void
> > > > > > +do_test (void)
> > > > > > +{
> > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > > > > +
> > > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > > +
> > > > > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > > > > +    p_init[i] = i;
> > > > > > +
> > > > > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > > > > +
> > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..d2e2f8d4f43
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > > @@ -0,0 +1,82 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[0];
> > > > > > +  a[1] = b[1];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[0];
> > > > > > +  a[1] = b[1];
> > > > > > +  a[2] = b[2];
> > > > > > +  a[3] = b[3];
> > > > > > +
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[0];
> > > > > > +  a[1] = b[1];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[0];
> > > > > > +  a[1] = b[1];
> > > > > > +  a[2] = b[2];
> > > > > > +  a[3] = b[3];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[0];
> > > > > > +  a[1] = b[1];
> > > > > > +  a[2] = b[2];
> > > > > > +  a[3] = b[3];
> > > > > > +  a[4] = b[4];
> > > > > > +  a[5] = b[5];
> > > > > > +  a[6] = b[6];
> > > > > > +  a[7] = b[7];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[0];
> > > > > > +  a[1] = b[1];
> > > > > > +  a[2] = b[2];
> > > > > > +  a[3] = b[3];
> > > > > > +  a[4] = b[4];
> > > > > > +  a[5] = b[5];
> > > > > > +  a[6] = b[6];
> > > > > > +  a[7] = b[7];
> > > > > > +  a[8] = b[8];
> > > > > > +  a[9] = b[9];
> > > > > > +  a[10] = b[10];
> > > > > > +  a[11] = b[11];
> > > > > > +  a[12] = b[12];
> > > > > > +  a[13] = b[13];
> > > > > > +  a[14] = b[14];
> > > > > > +  a[15] = b[15];
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..ac360752693
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > > @@ -0,0 +1,62 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > +/* { dg-require-effective-target avx } */
> > > > > > +
> > > > > > +#include "avx-check.h"
> > > > > > +#include <string.h>
> > > > > > +#include "pr106010-2a.c"
> > > > > > +
> > > > > > +void
> > > > > > +avx_test (void)
> > > > > > +{
> > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > > +  char* p = (char* ) malloc (32);
> > > > > > +
> > > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > > +
> > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > +    p[i] = i;
> > > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > > +
> > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +
> > > > > > +  return;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..a002f209ec9
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > > @@ -0,0 +1,47 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > +
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > +
> > > > > > +#include <string.h>
> > > > > > +
> > > > > > +static void do_test (void);
> > > > > > +#define DO_TEST do_test
> > > > > > +#define AVX512FP16
> > > > > > +#include "avx512-check.h"
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[0];
> > > > > > +  a[1] = b[1];
> > > > > > +  a[2] = b[2];
> > > > > > +  a[3] = b[3];
> > > > > > +  a[4] = b[4];
> > > > > > +  a[5] = b[5];
> > > > > > +  a[6] = b[6];
> > > > > > +  a[7] = b[7];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +do_test (void)
> > > > > > +{
> > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > > +  char* p = (char* ) malloc (32);
> > > > > > +
> > > > > > +   __builtin_memset (ph_dst, 0, 32);
> > > > > > +
> > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > +    p[i] = i;
> > > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > > +
> > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +
> > > > > > +  return;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..c1b64b56b1c
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > > @@ -0,0 +1,80 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[1];
> > > > > > +  a[1] = b[0];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[1];
> > > > > > +  a[1] = b[0];
> > > > > > +  a[2] = b[3];
> > > > > > +  a[3] = b[2];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[1];
> > > > > > +  a[1] = b[0];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[3];
> > > > > > +  a[1] = b[2];
> > > > > > +  a[2] = b[1];
> > > > > > +  a[3] = b[0];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[7];
> > > > > > +  a[1] = b[6];
> > > > > > +  a[2] = b[5];
> > > > > > +  a[3] = b[4];
> > > > > > +  a[4] = b[3];
> > > > > > +  a[5] = b[2];
> > > > > > +  a[6] = b[1];
> > > > > > +  a[7] = b[0];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[7];
> > > > > > +  a[1] = b[6];
> > > > > > +  a[2] = b[5];
> > > > > > +  a[3] = b[4];
> > > > > > +  a[4] = b[3];
> > > > > > +  a[5] = b[2];
> > > > > > +  a[6] = b[1];
> > > > > > +  a[7] = b[0];
> > > > > > +  a[8] = b[15];
> > > > > > +  a[9] = b[14];
> > > > > > +  a[10] = b[13];
> > > > > > +  a[11] = b[12];
> > > > > > +  a[12] = b[11];
> > > > > > +  a[13] = b[10];
> > > > > > +  a[14] = b[9];
> > > > > > +  a[15] = b[8];
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..e4fa3f3a541
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > > @@ -0,0 +1,126 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > +/* { dg-require-effective-target avx2 } */
> > > > > > +
> > > > > > +#include "avx2-check.h"
> > > > > > +#include <string.h>
> > > > > > +#include "pr106010-3a.c"
> > > > > > +
> > > > > > +void
> > > > > > +avx2_test (void)
> > > > > > +{
> > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (32);
> > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (32);
> > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (32);
> > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (32);
> > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (32);
> > > > > > +  char* p = (char* ) malloc (32);
> > > > > > +  char* q = (char* ) malloc (32);
> > > > > > +
> > > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > > +
> > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > +    p[i] = i;
> > > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > > +
> > > > > > +  for (int i = 0; i != 16; i++)
> > > > > > +    {
> > > > > > +      p[i] = i + 16;
> > > > > > +      p[i + 16] = i;
> > > > > > +    }
> > > > > > +  __builtin_memcpy (pd_exp, p, 32);
> > > > > > +  __builtin_memcpy (epi64_exp, p, 32);
> > > > > > +
> > > > > > +  for (int i = 0; i != 8; i++)
> > > > > > +    {
> > > > > > +      p[i] = i + 8;
> > > > > > +      p[i + 8] = i;
> > > > > > +      p[i + 16] = i + 24;
> > > > > > +      p[i + 24] = i + 16;
> > > > > > +      q[i] = i + 24;
> > > > > > +      q[i + 8] = i + 16;
> > > > > > +      q[i + 16] = i + 8;
> > > > > > +      q[i + 24] = i;
> > > > > > +    }
> > > > > > +  __builtin_memcpy (ps_exp, p, 32);
> > > > > > +  __builtin_memcpy (epi32_exp, q, 32);
> > > > > > +
> > > > > > +
> > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > +    {
> > > > > > +      q[i] = i + 28;
> > > > > > +      q[i + 4] = i + 24;
> > > > > > +      q[i + 8] = i + 20;
> > > > > > +      q[i + 12] = i + 16;
> > > > > > +      q[i + 16] = i + 12;
> > > > > > +      q[i + 20] = i + 8;
> > > > > > +      q[i + 24] = i + 4;
> > > > > > +      q[i + 28] = i;
> > > > > > +    }
> > > > > > +  __builtin_memcpy (epi16_exp, q, 32);
> > > > > > +
> > > > > > +  for (int i = 0; i != 2; i++)
> > > > > > +    {
> > > > > > +      q[i] = i + 14;
> > > > > > +      q[i + 2] = i + 12;
> > > > > > +      q[i + 4] = i + 10;
> > > > > > +      q[i + 6] = i + 8;
> > > > > > +      q[i + 8] = i + 6;
> > > > > > +      q[i + 10] = i + 4;
> > > > > > +      q[i + 12] = i + 2;
> > > > > > +      q[i + 14] = i;
> > > > > > +      q[i + 16] = i + 30;
> > > > > > +      q[i + 18] = i + 28;
> > > > > > +      q[i + 20] = i + 26;
> > > > > > +      q[i + 22] = i + 24;
> > > > > > +      q[i + 24] = i + 22;
> > > > > > +      q[i + 26] = i + 20;
> > > > > > +      q[i + 28] = i + 18;
> > > > > > +      q[i + 30] = i + 16;
> > > > > > +    }
> > > > > > +  __builtin_memcpy (epi8_exp, q, 32);
> > > > > > +
> > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +
> > > > > > +  return;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..5a5a3d4b992
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > > @@ -0,0 +1,69 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
> > > > > > +
> > > > > > +#include <string.h>
> > > > > > +
> > > > > > +static void do_test (void);
> > > > > > +#define DO_TEST do_test
> > > > > > +#define AVX512FP16
> > > > > > +#include "avx512-check.h"
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[1];
> > > > > > +  a[1] = b[0];
> > > > > > +  a[2] = b[4];
> > > > > > +  a[3] = b[3];
> > > > > > +  a[4] = b[7];
> > > > > > +  a[5] = b[6];
> > > > > > +  a[6] = b[2];
> > > > > > +  a[7] = b[5];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +do_test (void)
> > > > > > +{
> > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> > > > > > +  char* p = (char* ) malloc (32);
> > > > > > +  char* q = (char* ) malloc (32);
> > > > > > +
> > > > > > +  __builtin_memset (ph_dst, 0, 32);
> > > > > > +
> > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > +    p[i] = i;
> > > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > > +
> > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > +    {
> > > > > > +      p[i] = i + 4;
> > > > > > +      p[i + 4] = i;
> > > > > > +      p[i + 8] = i + 16;
> > > > > > +      p[i + 12] = i + 12;
> > > > > > +      p[i + 16] = i + 28;
> > > > > > +      p[i + 20] = i + 24;
> > > > > > +      p[i + 24] = i + 8;
> > > > > > +      p[i + 28] = i + 20;
> > > > > > +      q[i] = i + 28;
> > > > > > +      q[i + 4] = i + 24;
> > > > > > +      q[i + 8] = i + 20;
> > > > > > +      q[i + 12] = i + 16;
> > > > > > +      q[i + 16] = i + 12;
> > > > > > +      q[i + 20] = i + 8;
> > > > > > +      q[i + 24] = i + 4;
> > > > > > +      q[i + 28] = i;
> > > > > > +    }
> > > > > > +  __builtin_memcpy (ph_exp, p, 32);
> > > > > > +
> > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +
> > > > > > +  return;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..b7b0b532bb1
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > > @@ -0,0 +1,101 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_pd (_Complex double* a,
> > > > > > +       _Complex double b1,
> > > > > > +       _Complex double b2)
> > > > > > +{
> > > > > > +  a[0] = b1;
> > > > > > +  a[1] = b2;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ps (_Complex float* a,
> > > > > > +       _Complex float b1, _Complex float b2,
> > > > > > +       _Complex float b3, _Complex float b4)
> > > > > > +{
> > > > > > +  a[0] = b1;
> > > > > > +  a[1] = b2;
> > > > > > +  a[2] = b3;
> > > > > > +  a[3] = b4;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi64 (_Complex long long* a,
> > > > > > +          _Complex long long b1,
> > > > > > +          _Complex long long b2)
> > > > > > +{
> > > > > > +  a[0] = b1;
> > > > > > +  a[1] = b2;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi32 (_Complex int* a,
> > > > > > +          _Complex int b1, _Complex int b2,
> > > > > > +          _Complex int b3, _Complex int b4)
> > > > > > +{
> > > > > > +  a[0] = b1;
> > > > > > +  a[1] = b2;
> > > > > > +  a[2] = b3;
> > > > > > +  a[3] = b4;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi16 (_Complex short* a,
> > > > > > +          _Complex short b1, _Complex short b2,
> > > > > > +          _Complex short b3, _Complex short b4,
> > > > > > +          _Complex short b5, _Complex short b6,
> > > > > > +          _Complex short b7,_Complex short b8)
> > > > > > +{
> > > > > > +  a[0] = b1;
> > > > > > +  a[1] = b2;
> > > > > > +  a[2] = b3;
> > > > > > +  a[3] = b4;
> > > > > > +  a[4] = b5;
> > > > > > +  a[5] = b6;
> > > > > > +  a[6] = b7;
> > > > > > +  a[7] = b8;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi8 (_Complex char* a,
> > > > > > +         _Complex char b1, _Complex char b2,
> > > > > > +         _Complex char b3, _Complex char b4,
> > > > > > +         _Complex char b5, _Complex char b6,
> > > > > > +         _Complex char b7,_Complex char b8,
> > > > > > +         _Complex char b9, _Complex char b10,
> > > > > > +         _Complex char b11, _Complex char b12,
> > > > > > +         _Complex char b13, _Complex char b14,
> > > > > > +         _Complex char b15,_Complex char b16)
> > > > > > +{
> > > > > > +  a[0] = b1;
> > > > > > +  a[1] = b2;
> > > > > > +  a[2] = b3;
> > > > > > +  a[3] = b4;
> > > > > > +  a[4] = b5;
> > > > > > +  a[5] = b6;
> > > > > > +  a[6] = b7;
> > > > > > +  a[7] = b8;
> > > > > > +  a[8] = b9;
> > > > > > +  a[9] = b10;
> > > > > > +  a[10] = b11;
> > > > > > +  a[11] = b12;
> > > > > > +  a[12] = b13;
> > > > > > +  a[13] = b14;
> > > > > > +  a[14] = b15;
> > > > > > +  a[15] = b16;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..e2e79508c4b
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > > @@ -0,0 +1,67 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > +/* { dg-require-effective-target avx } */
> > > > > > +
> > > > > > +#include "avx-check.h"
> > > > > > +#include <string.h>
> > > > > > +#include "pr106010-4a.c"
> > > > > > +
> > > > > > +void
> > > > > > +avx_test (void)
> > > > > > +{
> > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > > +  char* p = (char* ) malloc (32);
> > > > > > +
> > > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > > +
> > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > +    p[i] = i;
> > > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > > +
> > > > > > +  foo_pd (pd_dst, pd_src[0], pd_src[1]);
> > > > > > +  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> > > > > > +  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> > > > > > +  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> > > > > > +  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> > > > > > +            epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> > > > > > +  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> > > > > > +           epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> > > > > > +           epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> > > > > > +           epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> > > > > > +
> > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +
> > > > > > +  return;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..8e02aefe3b5
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > > @@ -0,0 +1,54 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> > > > > > +
> > > > > > +#include <string.h>
> > > > > > +
> > > > > > +static void do_test (void);
> > > > > > +#define DO_TEST do_test
> > > > > > +#define AVX512FP16
> > > > > > +#include "avx512-check.h"
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ph (_Complex _Float16* a,
> > > > > > +       _Complex _Float16 b1, _Complex _Float16 b2,
> > > > > > +       _Complex _Float16 b3, _Complex _Float16 b4,
> > > > > > +       _Complex _Float16 b5, _Complex _Float16 b6,
> > > > > > +       _Complex _Float16 b7,_Complex _Float16 b8)
> > > > > > +{
> > > > > > +  a[0] = b1;
> > > > > > +  a[1] = b2;
> > > > > > +  a[2] = b3;
> > > > > > +  a[3] = b4;
> > > > > > +  a[4] = b5;
> > > > > > +  a[5] = b6;
> > > > > > +  a[6] = b7;
> > > > > > +  a[7] = b8;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +do_test (void)
> > > > > > +{
> > > > > > +
> > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > > +
> > > > > > +  char* p = (char* ) malloc (32);
> > > > > > +
> > > > > > +  __builtin_memset (ph_dst, 0, 32);
> > > > > > +
> > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > +    p[i] = i;
> > > > > > +
> > > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > > +
> > > > > > +  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> > > > > > +         ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> > > > > > +
> > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  return;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..9d4a6f9846b
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > > @@ -0,0 +1,117 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[2];
> > > > > > +  a[1] = b[3];
> > > > > > +  a[2] = b[0];
> > > > > > +  a[3] = b[1];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[4];
> > > > > > +  a[1] = b[5];
> > > > > > +  a[2] = b[6];
> > > > > > +  a[3] = b[7];
> > > > > > +  a[4] = b[0];
> > > > > > +  a[5] = b[1];
> > > > > > +  a[6] = b[2];
> > > > > > +  a[7] = b[3];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[2];
> > > > > > +  a[1] = b[3];
> > > > > > +  a[2] = b[0];
> > > > > > +  a[3] = b[1];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[4];
> > > > > > +  a[1] = b[5];
> > > > > > +  a[2] = b[6];
> > > > > > +  a[3] = b[7];
> > > > > > +  a[4] = b[0];
> > > > > > +  a[5] = b[1];
> > > > > > +  a[6] = b[2];
> > > > > > +  a[7] = b[3];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[8];
> > > > > > +  a[1] = b[9];
> > > > > > +  a[2] = b[10];
> > > > > > +  a[3] = b[11];
> > > > > > +  a[4] = b[12];
> > > > > > +  a[5] = b[13];
> > > > > > +  a[6] = b[14];
> > > > > > +  a[7] = b[15];
> > > > > > +  a[8] = b[0];
> > > > > > +  a[9] = b[1];
> > > > > > +  a[10] = b[2];
> > > > > > +  a[11] = b[3];
> > > > > > +  a[12] = b[4];
> > > > > > +  a[13] = b[5];
> > > > > > +  a[14] = b[6];
> > > > > > +  a[15] = b[7];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[16];
> > > > > > +  a[1] = b[17];
> > > > > > +  a[2] = b[18];
> > > > > > +  a[3] = b[19];
> > > > > > +  a[4] = b[20];
> > > > > > +  a[5] = b[21];
> > > > > > +  a[6] = b[22];
> > > > > > +  a[7] = b[23];
> > > > > > +  a[8] = b[24];
> > > > > > +  a[9] = b[25];
> > > > > > +  a[10] = b[26];
> > > > > > +  a[11] = b[27];
> > > > > > +  a[12] = b[28];
> > > > > > +  a[13] = b[29];
> > > > > > +  a[14] = b[30];
> > > > > > +  a[15] = b[31];
> > > > > > +  a[16] = b[0];
> > > > > > +  a[17] = b[1];
> > > > > > +  a[18] = b[2];
> > > > > > +  a[19] = b[3];
> > > > > > +  a[20] = b[4];
> > > > > > +  a[21] = b[5];
> > > > > > +  a[22] = b[6];
> > > > > > +  a[23] = b[7];
> > > > > > +  a[24] = b[8];
> > > > > > +  a[25] = b[9];
> > > > > > +  a[26] = b[10];
> > > > > > +  a[27] = b[11];
> > > > > > +  a[28] = b[12];
> > > > > > +  a[29] = b[13];
> > > > > > +  a[30] = b[14];
> > > > > > +  a[31] = b[15];
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..d5c6ebeb5cf
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > > @@ -0,0 +1,80 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > +/* { dg-require-effective-target avx } */
> > > > > > +
> > > > > > +#include "avx-check.h"
> > > > > > +#include <string.h>
> > > > > > +#include "pr106010-5a.c"
> > > > > > +
> > > > > > +void
> > > > > > +avx_test (void)
> > > > > > +{
> > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > > > > +  char* p = (char* ) malloc (64);
> > > > > > +  char* q = (char* ) malloc (64);
> > > > > > +
> > > > > > +  __builtin_memset (pd_dst, 0, 64);
> > > > > > +  __builtin_memset (ps_dst, 0, 64);
> > > > > > +  __builtin_memset (epi64_dst, 0, 64);
> > > > > > +  __builtin_memset (epi32_dst, 0, 64);
> > > > > > +  __builtin_memset (epi16_dst, 0, 64);
> > > > > > +  __builtin_memset (epi8_dst, 0, 64);
> > > > > > +
> > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > +    {
> > > > > > +      p[i] = i;
> > > > > > +      q[i] = (i + 32) % 64;
> > > > > > +    }
> > > > > > +  __builtin_memcpy (pd_src, p, 64);
> > > > > > +  __builtin_memcpy (ps_src, p, 64);
> > > > > > +  __builtin_memcpy (epi64_src, p, 64);
> > > > > > +  __builtin_memcpy (epi32_src, p, 64);
> > > > > > +  __builtin_memcpy (epi16_src, p, 64);
> > > > > > +  __builtin_memcpy (epi8_src, p, 64);
> > > > > > +
> > > > > > +  __builtin_memcpy (pd_exp, q, 64);
> > > > > > +  __builtin_memcpy (ps_exp, q, 64);
> > > > > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > > > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > > > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > > > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > > > > +
> > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > +
> > > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +
> > > > > > +  return;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..9ce4e6dd5c0
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > > @@ -0,0 +1,62 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> > > > > > +
> > > > > > +#include <string.h>
> > > > > > +
> > > > > > +static void do_test (void);
> > > > > > +#define DO_TEST do_test
> > > > > > +#define AVX512FP16
> > > > > > +#include "avx512-check.h"
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[8];
> > > > > > +  a[1] = b[9];
> > > > > > +  a[2] = b[10];
> > > > > > +  a[3] = b[11];
> > > > > > +  a[4] = b[12];
> > > > > > +  a[5] = b[13];
> > > > > > +  a[6] = b[14];
> > > > > > +  a[7] = b[15];
> > > > > > +  a[8] = b[0];
> > > > > > +  a[9] = b[1];
> > > > > > +  a[10] = b[2];
> > > > > > +  a[11] = b[3];
> > > > > > +  a[12] = b[4];
> > > > > > +  a[13] = b[5];
> > > > > > +  a[14] = b[6];
> > > > > > +  a[15] = b[7];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +do_test (void)
> > > > > > +{
> > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > > > > +  char* p = (char* ) malloc (64);
> > > > > > +  char* q = (char* ) malloc (64);
> > > > > > +
> > > > > > +  __builtin_memset (ph_dst, 0, 64);
> > > > > > +
> > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > +    {
> > > > > > +      p[i] = i;
> > > > > > +      q[i] = (i + 32) % 64;
> > > > > > +    }
> > > > > > +  __builtin_memcpy (ph_src, p, 64);
> > > > > > +
> > > > > > +  __builtin_memcpy (ph_exp, q, 64);
> > > > > > +
> > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > +
> > > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +
> > > > > > +  return;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..65a90d03684
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > > @@ -0,0 +1,115 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[3];
> > > > > > +  a[1] = b[2];
> > > > > > +  a[2] = b[1];
> > > > > > +  a[3] = b[0];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[7];
> > > > > > +  a[1] = b[6];
> > > > > > +  a[2] = b[5];
> > > > > > +  a[3] = b[4];
> > > > > > +  a[4] = b[3];
> > > > > > +  a[5] = b[2];
> > > > > > +  a[6] = b[1];
> > > > > > +  a[7] = b[0];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[3];
> > > > > > +  a[1] = b[2];
> > > > > > +  a[2] = b[1];
> > > > > > +  a[3] = b[0];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[7];
> > > > > > +  a[1] = b[6];
> > > > > > +  a[2] = b[5];
> > > > > > +  a[3] = b[4];
> > > > > > +  a[4] = b[3];
> > > > > > +  a[5] = b[2];
> > > > > > +  a[6] = b[1];
> > > > > > +  a[7] = b[0];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[15];
> > > > > > +  a[1] = b[14];
> > > > > > +  a[2] = b[13];
> > > > > > +  a[3] = b[12];
> > > > > > +  a[4] = b[11];
> > > > > > +  a[5] = b[10];
> > > > > > +  a[6] = b[9];
> > > > > > +  a[7] = b[8];
> > > > > > +  a[8] = b[7];
> > > > > > +  a[9] = b[6];
> > > > > > +  a[10] = b[5];
> > > > > > +  a[11] = b[4];
> > > > > > +  a[12] = b[3];
> > > > > > +  a[13] = b[2];
> > > > > > +  a[14] = b[1];
> > > > > > +  a[15] = b[0];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[31];
> > > > > > +  a[1] = b[30];
> > > > > > +  a[2] = b[29];
> > > > > > +  a[3] = b[28];
> > > > > > +  a[4] = b[27];
> > > > > > +  a[5] = b[26];
> > > > > > +  a[6] = b[25];
> > > > > > +  a[7] = b[24];
> > > > > > +  a[8] = b[23];
> > > > > > +  a[9] = b[22];
> > > > > > +  a[10] = b[21];
> > > > > > +  a[11] = b[20];
> > > > > > +  a[12] = b[19];
> > > > > > +  a[13] = b[18];
> > > > > > +  a[14] = b[17];
> > > > > > +  a[15] = b[16];
> > > > > > +  a[16] = b[15];
> > > > > > +  a[17] = b[14];
> > > > > > +  a[18] = b[13];
> > > > > > +  a[19] = b[12];
> > > > > > +  a[20] = b[11];
> > > > > > +  a[21] = b[10];
> > > > > > +  a[22] = b[9];
> > > > > > +  a[23] = b[8];
> > > > > > +  a[24] = b[7];
> > > > > > +  a[25] = b[6];
> > > > > > +  a[26] = b[5];
> > > > > > +  a[27] = b[4];
> > > > > > +  a[28] = b[3];
> > > > > > +  a[29] = b[2];
> > > > > > +  a[30] = b[1];
> > > > > > +  a[31] = b[0];
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..1c5bb020939
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > > @@ -0,0 +1,157 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > +/* { dg-require-effective-target avx2 } */
> > > > > > +
> > > > > > +#include "avx2-check.h"
> > > > > > +#include <string.h>
> > > > > > +#include "pr106010-6a.c"
> > > > > > +
> > > > > > +void
> > > > > > +avx2_test (void)
> > > > > > +{
> > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > > > > +  char* p = (char* ) malloc (64);
> > > > > > +  char* q = (char* ) malloc (64);
> > > > > > +
> > > > > > +  __builtin_memset (pd_dst, 0, 64);
> > > > > > +  __builtin_memset (ps_dst, 0, 64);
> > > > > > +  __builtin_memset (epi64_dst, 0, 64);
> > > > > > +  __builtin_memset (epi32_dst, 0, 64);
> > > > > > +  __builtin_memset (epi16_dst, 0, 64);
> > > > > > +  __builtin_memset (epi8_dst, 0, 64);
> > > > > > +
> > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > +    p[i] = i;
> > > > > > +
> > > > > > +  __builtin_memcpy (pd_src, p, 64);
> > > > > > +  __builtin_memcpy (ps_src, p, 64);
> > > > > > +  __builtin_memcpy (epi64_src, p, 64);
> > > > > > +  __builtin_memcpy (epi32_src, p, 64);
> > > > > > +  __builtin_memcpy (epi16_src, p, 64);
> > > > > > +  __builtin_memcpy (epi8_src, p, 64);
> > > > > > +
> > > > > > +
> > > > > > +  for (int i = 0; i != 16; i++)
> > > > > > +    {
> > > > > > +      q[i] = i + 48;
> > > > > > +      q[i + 16] = i + 32;
> > > > > > +      q[i + 32] = i + 16;
> > > > > > +      q[i + 48] = i;
> > > > > > +    }
> > > > > > +
> > > > > > +  __builtin_memcpy (pd_exp, q, 64);
> > > > > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > > > > +
> > > > > > +   for (int i = 0; i != 8; i++)
> > > > > > +    {
> > > > > > +      q[i] = i + 56;
> > > > > > +      q[i + 8] = i + 48;
> > > > > > +      q[i + 16] = i + 40;
> > > > > > +      q[i + 24] = i + 32;
> > > > > > +      q[i + 32] = i + 24;
> > > > > > +      q[i + 40] = i + 16;
> > > > > > +      q[i + 48] = i + 8;
> > > > > > +      q[i + 56] = i;
> > > > > > +    }
> > > > > > +
> > > > > > +  __builtin_memcpy (ps_exp, q, 64);
> > > > > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > > > > +
> > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > +    {
> > > > > > +      q[i] = i + 60;
> > > > > > +      q[i + 4] = i + 56;
> > > > > > +      q[i + 8] = i + 52;
> > > > > > +      q[i + 12] = i + 48;
> > > > > > +      q[i + 16] = i + 44;
> > > > > > +      q[i + 20] = i + 40;
> > > > > > +      q[i + 24] = i + 36;
> > > > > > +      q[i + 28] = i + 32;
> > > > > > +      q[i + 32] = i + 28;
> > > > > > +      q[i + 36] = i + 24;
> > > > > > +      q[i + 40] = i + 20;
> > > > > > +      q[i + 44] = i + 16;
> > > > > > +      q[i + 48] = i + 12;
> > > > > > +      q[i + 52] = i + 8;
> > > > > > +      q[i + 56] = i + 4;
> > > > > > +      q[i + 60] = i;
> > > > > > +    }
> > > > > > +
> > > > > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > > > > +
> > > > > > +  for (int i = 0; i != 2; i++)
> > > > > > +    {
> > > > > > +      q[i] = i + 62;
> > > > > > +      q[i + 2] = i + 60;
> > > > > > +      q[i + 4] = i + 58;
> > > > > > +      q[i + 6] = i + 56;
> > > > > > +      q[i + 8] = i + 54;
> > > > > > +      q[i + 10] = i + 52;
> > > > > > +      q[i + 12] = i + 50;
> > > > > > +      q[i + 14] = i + 48;
> > > > > > +      q[i + 16] = i + 46;
> > > > > > +      q[i + 18] = i + 44;
> > > > > > +      q[i + 20] = i + 42;
> > > > > > +      q[i + 22] = i + 40;
> > > > > > +      q[i + 24] = i + 38;
> > > > > > +      q[i + 26] = i + 36;
> > > > > > +      q[i + 28] = i + 34;
> > > > > > +      q[i + 30] = i + 32;
> > > > > > +      q[i + 32] = i + 30;
> > > > > > +      q[i + 34] = i + 28;
> > > > > > +      q[i + 36] = i + 26;
> > > > > > +      q[i + 38] = i + 24;
> > > > > > +      q[i + 40] = i + 22;
> > > > > > +      q[i + 42] = i + 20;
> > > > > > +      q[i + 44] = i + 18;
> > > > > > +      q[i + 46] = i + 16;
> > > > > > +      q[i + 48] = i + 14;
> > > > > > +      q[i + 50] = i + 12;
> > > > > > +      q[i + 52] = i + 10;
> > > > > > +      q[i + 54] = i + 8;
> > > > > > +      q[i + 56] = i + 6;
> > > > > > +      q[i + 58] = i + 4;
> > > > > > +      q[i + 60] = i + 2;
> > > > > > +      q[i + 62] = i;
> > > > > > +    }
> > > > > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > > > > +
> > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > +
> > > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +
> > > > > > +  return;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..b859d884a7f
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > > @@ -0,0 +1,80 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> > > > > > +
> > > > > > +#include <string.h>
> > > > > > +
> > > > > > +static void do_test (void);
> > > > > > +#define DO_TEST do_test
> > > > > > +#define AVX512FP16
> > > > > > +#include "avx512-check.h"
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > +{
> > > > > > +  a[0] = b[15];
> > > > > > +  a[1] = b[14];
> > > > > > +  a[2] = b[13];
> > > > > > +  a[3] = b[12];
> > > > > > +  a[4] = b[11];
> > > > > > +  a[5] = b[10];
> > > > > > +  a[6] = b[9];
> > > > > > +  a[7] = b[8];
> > > > > > +  a[8] = b[7];
> > > > > > +  a[9] = b[6];
> > > > > > +  a[10] = b[5];
> > > > > > +  a[11] = b[4];
> > > > > > +  a[12] = b[3];
> > > > > > +  a[13] = b[2];
> > > > > > +  a[14] = b[1];
> > > > > > +  a[15] = b[0];
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +do_test (void)
> > > > > > +{
> > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > > > > +  char* p = (char* ) malloc (64);
> > > > > > +  char* q = (char* ) malloc (64);
> > > > > > +
> > > > > > +  __builtin_memset (ph_dst, 0, 64);
> > > > > > +
> > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > +    p[i] = i;
> > > > > > +
> > > > > > +  __builtin_memcpy (ph_src, p, 64);
> > > > > > +
> > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > +    {
> > > > > > +      q[i] = i + 60;
> > > > > > +      q[i + 4] = i + 56;
> > > > > > +      q[i + 8] = i + 52;
> > > > > > +      q[i + 12] = i + 48;
> > > > > > +      q[i + 16] = i + 44;
> > > > > > +      q[i + 20] = i + 40;
> > > > > > +      q[i + 24] = i + 36;
> > > > > > +      q[i + 28] = i + 32;
> > > > > > +      q[i + 32] = i + 28;
> > > > > > +      q[i + 36] = i + 24;
> > > > > > +      q[i + 40] = i + 20;
> > > > > > +      q[i + 44] = i + 16;
> > > > > > +      q[i + 48] = i + 12;
> > > > > > +      q[i + 52] = i + 8;
> > > > > > +      q[i + 56] = i + 4;
> > > > > > +      q[i + 60] = i;
> > > > > > +    }
> > > > > > +
> > > > > > +  __builtin_memcpy (ph_exp, q, 64);
> > > > > > +
> > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > +
> > > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +
> > > > > > +  return;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..2ea01fac927
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > > @@ -0,0 +1,58 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > > > > +
> > > > > > +#define N 10000
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_pd (_Complex double* a, _Complex double b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ps (_Complex float* a, _Complex float b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi64 (_Complex long long* a, _Complex long long b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi32 (_Complex int* a, _Complex int b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi16 (_Complex short* a, _Complex short b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi8 (_Complex char* a, _Complex char b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..26482cc10f5
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > > @@ -0,0 +1,63 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > +/* { dg-require-effective-target avx } */
> > > > > > +
> > > > > > +#include "avx-check.h"
> > > > > > +#include <string.h>
> > > > > > +#include "pr106010-7a.c"
> > > > > > +
> > > > > > +void
> > > > > > +avx_test (void)
> > > > > > +{
> > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > > > > +
> > > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > > +
> > > > > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > > > > +    p_init[i] = i % 2 + 3;
> > > > > > +
> > > > > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > > > > +  memcpy (ps_dst, p_init, 2 * N * sizeof (float));
> > > > > > +  memcpy (epi64_dst, p_init, 2 * N * sizeof (long long));
> > > > > > +  memcpy (epi32_dst, p_init, 2 * N * sizeof (int));
> > > > > > +  memcpy (epi16_dst, p_init, 2 * N * sizeof (short));
> > > > > > +  memcpy (epi8_dst, p_init, 2 * N * sizeof (char));
> > > > > > +
> > > > > > +  foo_pd (pd_dst, pd_src[0]);
> > > > > > +  foo_ps (ps_dst, ps_src[0]);
> > > > > > +  foo_epi64 (epi64_dst, epi64_src[0]);
> > > > > > +  foo_epi32 (epi32_dst, epi32_src[0]);
> > > > > > +  foo_epi16 (epi16_dst, epi16_src[0]);
> > > > > > +  foo_epi8 (epi8_dst, epi8_src[0]);
> > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +
> > > > > > +  return;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..7f4056a5ecc
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > > @@ -0,0 +1,41 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > +
> > > > > > +#include <string.h>
> > > > > > +
> > > > > > +static void do_test (void);
> > > > > > +
> > > > > > +#define DO_TEST do_test
> > > > > > +#define AVX512FP16
> > > > > > +#include "avx512-check.h"
> > > > > > +
> > > > > > +#define N 10000
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = b;
> > > > > > +}
> > > > > > +
> > > > > > +static void
> > > > > > +do_test (void)
> > > > > > +{
> > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > > > > +
> > > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > > +
> > > > > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > > > > +    p_init[i] = i % 2 + 3;
> > > > > > +
> > > > > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > > > > +
> > > > > > +  foo_ph (ph_dst, ph_src[0]);
> > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > > > > +    __builtin_abort ();
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..11054b60d30
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > > @@ -0,0 +1,58 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > > > > +
> > > > > > +#define N 10000
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_pd (_Complex double* a)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = 1.0 + 2.0i;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ps (_Complex float* a)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = 1.0f + 2.0fi;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi64 (_Complex long long* a)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = 1 + 2i;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi32 (_Complex int* a)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = 1 + 2i;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi16 (_Complex short* a)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = 1 + 2i;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_epi8 (_Complex char* a)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = 1 + 2i;
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..6bb0073b691
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > > @@ -0,0 +1,53 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > +/* { dg-require-effective-target avx } */
> > > > > > +
> > > > > > +#include "avx-check.h"
> > > > > > +#include <string.h>
> > > > > > +#include "pr106010-8a.c"
> > > > > > +
> > > > > > +void
> > > > > > +avx_test (void)
> > > > > > +{
> > > > > > +  _Complex double pd_src = 1.0 + 2.0i;
> > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > +  _Complex float ps_src = 1.0 + 2.0i;
> > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > +  _Complex long long epi64_src = 1 + 2i;;
> > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > +  _Complex int epi32_src = 1 + 2i;
> > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > +  _Complex short epi16_src = 1 + 2i;
> > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > +  _Complex char epi8_src = 1 + 2i;
> > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > +
> > > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > > +
> > > > > > +  foo_pd (pd_dst);
> > > > > > +  foo_ps (ps_dst);
> > > > > > +  foo_epi64 (epi64_dst);
> > > > > > +  foo_epi32 (epi32_dst);
> > > > > > +  foo_epi16 (epi16_dst);
> > > > > > +  foo_epi8 (epi8_dst);
> > > > > > +  for (int i = 0 ; i != N; i++)
> > > > > > +    {
> > > > > > +      if (pd_dst[i] != pd_src)
> > > > > > +       __builtin_abort ();
> > > > > > +      if (ps_dst[i] != ps_src)
> > > > > > +       __builtin_abort ();
> > > > > > +      if (epi64_dst[i] != epi64_src)
> > > > > > +       __builtin_abort ();
> > > > > > +      if (epi32_dst[i] != epi32_src)
> > > > > > +       __builtin_abort ();
> > > > > > +      if (epi16_dst[i] != epi16_src)
> > > > > > +       __builtin_abort ();
> > > > > > +      if (epi8_dst[i] != epi8_src)
> > > > > > +       __builtin_abort ();
> > > > > > +    }
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > > new file mode 100644
> > > > > > index 00000000000..61ae131829d
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > > @@ -0,0 +1,38 @@
> > > > > > +/* { dg-do run } */
> > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > +
> > > > > > +#include <string.h>
> > > > > > +
> > > > > > +static void do_test (void);
> > > > > > +
> > > > > > +#define DO_TEST do_test
> > > > > > +#define AVX512FP16
> > > > > > +#include "avx512-check.h"
> > > > > > +
> > > > > > +#define N 10000
> > > > > > +
> > > > > > +void
> > > > > > +__attribute__((noipa))
> > > > > > +foo_ph (_Complex _Float16* a)
> > > > > > +{
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    a[i] = 1.0f16 + 2.0f16i;
> > > > > > +}
> > > > > > +
> > > > > > +static void
> > > > > > +do_test (void)
> > > > > > +{
> > > > > > +  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > +
> > > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > > +
> > > > > > +  foo_ph (ph_dst);
> > > > > > +  for (int i = 0; i != N; i++)
> > > > > > +    {
> > > > > > +      if (ph_dst[i] != ph_src)
> > > > > > +       __builtin_abort ();
> > > > > > +    }
> > > > > > +}
> > > > > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > > > > > index d20a10a1524..42ee9df674c 100644
> > > > > > --- a/gcc/tree-vect-data-refs.cc
> > > > > > +++ b/gcc/tree-vect-data-refs.cc
> > > > > > @@ -1403,7 +1403,8 @@ vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
> > > > > >    if (PURE_SLP_STMT (stmt_info))
> > > > > >      ncopies = 1;
> > > > > >    else
> > > > > > -    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
> > > > > > +    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info),
> > > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > > >
> > > > > >    if (DR_IS_READ (dr_info->dr))
> > > > > >      vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> > > > > > @@ -4597,8 +4598,22 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > > > > >
> > > > > >        /* Set vectype for STMT.  */
> > > > > >        scalar_type = TREE_TYPE (DR_REF (dr));
> > > > > > -      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
> > > > > > -      if (!vectype)
> > > > > > +      tree adjust_scalar_type = scalar_type;
> > > > > > > +      /* Support complex type accesses.  Note that loads/stores of complex
> > > > > > > +        type do not support gather/scatter.  */
> > > > > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE
> > > > > > +         && gatherscatter == SG_NONE)
> > > > > > +       {
> > > > > > +         adjust_scalar_type = TREE_TYPE (scalar_type);
> > > > > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > > +       }
> > > > > > +      tree vectype = get_vectype_for_scalar_type (vinfo, adjust_scalar_type);
> > > > > > +      unsigned HOST_WIDE_INT constant_nunits;
> > > > > > +      if (!vectype
> > > > > > > +         /* A one-element vector (e.g. V1DI) cannot hold a complex value.  */
> > > > > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > > +             && (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&constant_nunits)
> > > > > > +                 || constant_nunits == 1)))
> > > > > >          {
> > > > > >            if (dump_enabled_p ())
> > > > > >              {
> > > > > > @@ -4635,8 +4650,11 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > > > > >         }
> > > > > >
> > > > > >        /* Adjust the minimal vectorization factor according to the
> > > > > > -        vector type.  */
> > > > > > > +        vector type.  Note that for complex types the VF is half of
> > > > > > +        TYPE_VECTOR_SUBPARTS.  */
> > > > > >        vf = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > +       vf = exact_div (vf, 2);
> > > > > >        *min_vf = upper_bound (*min_vf, vf);
> > > > > >
> > > > > >        /* Leave the BB vectorizer to pick the vector type later, based on
> > > > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > > > > index 3a70c15b593..365fa738022 100644
> > > > > > --- a/gcc/tree-vect-loop.cc
> > > > > > +++ b/gcc/tree-vect-loop.cc
> > > > > > @@ -200,7 +200,12 @@ vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > >      }
> > > > > >
> > > > > >    if (nunits_vectype)
> > > > > > -    vect_update_max_nunits (vf, nunits_vectype);
> > > > > > +    {
> > > > > > +      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (nunits_vectype);
> > > > > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > +       nunits = exact_div (nunits, 2);
> > > > > > +      vect_update_max_nunits (vf, nunits);
> > > > > > +    }
> > > > > >
> > > > > >    return opt_result::success ();
> > > > > >  }
> > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > > > > > index dab5daddcc5..5d66ea2f286 100644
> > > > > > --- a/gcc/tree-vect-slp.cc
> > > > > > +++ b/gcc/tree-vect-slp.cc
> > > > > > @@ -877,10 +877,14 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > >        return false;
> > > > > >      }
> > > > > >
> > > > > > +  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > +    nunits = exact_div (nunits, 2);
> > > > > > +
> > > > > >    /* If populating the vector type requires unrolling then fail
> > > > > >       before adjusting *max_nunits for basic-block vectorization.  */
> > > > > >    if (is_a <bb_vec_info> (vinfo)
> > > > > > -      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> > > > > > +      && !multiple_p (group_size , nunits))
> > > > > >      {
> > > > > >        if (dump_enabled_p ())
> > > > > >         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > > @@ -891,7 +895,7 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > >      }
> > > > > >
> > > > > >    /* In case of multiple types we need to detect the smallest type.  */
> > > > > > -  vect_update_max_nunits (max_nunits, vectype);
> > > > > > +  vect_update_max_nunits (max_nunits, nunits);
> > > > > >    return true;
> > > > > >  }
> > > > > >
> > > > > > @@ -3720,22 +3724,54 @@ vect_optimize_slp (vec_info *vinfo)
> > > > > >          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
> > > > > >          when permuting constants and invariants keeping the permute
> > > > > >          bijective.  */
> > > > > > -      auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > > > > -      bitmap_clear (load_index);
> > > > > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > -       bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > > > > -      unsigned j;
> > > > > > -      for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > -       if (!bitmap_bit_p (load_index, j))
> > > > > > -         break;
> > > > > > -      if (j != SLP_TREE_LANES (node))
> > > > > > -       continue;
> > > > > > +      /* Permutation of Complex type.  */
> > > > > > +      if (STMT_VINFO_COMPLEX_P (dr_stmt))
> > > > > > +       {
> > > > > > +         auto_sbitmap load_index (SLP_TREE_LANES (node) * 2);
> > > > > > +         bitmap_clear (load_index);
> > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > +           {
> > > > > > +             unsigned bit = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > +             bitmap_set_bit (load_index, 2 * bit);
> > > > > > +             bitmap_set_bit (load_index, 2 * bit + 1);
> > > > > > +           }
> > > > > > +         unsigned j;
> > > > > > +         for (j = 0; j < SLP_TREE_LANES (node) * 2; ++j)
> > > > > > +           if (!bitmap_bit_p (load_index, j))
> > > > > > +             break;
> > > > > > +         if (j != SLP_TREE_LANES (node) * 2)
> > > > > > +           continue;
> > > > > >
> > > > > > -      vec<unsigned> perm = vNULL;
> > > > > > -      perm.safe_grow (SLP_TREE_LANES (node), true);
> > > > > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > -       perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > -      perms.safe_push (perm);
> > > > > > +         vec<unsigned> perm = vNULL;
> > > > > > +         perm.safe_grow (SLP_TREE_LANES (node) * 2, true);
> > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > +           {
> > > > > > +             unsigned cidx = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > +             perm[2 * j] = 2 * cidx;
> > > > > > +             perm[2 * j + 1] = 2 * cidx + 1;
> > > > > > +           }
> > > > > > +         perms.safe_push (perm);
> > > > > > +       }
> > > > > > +      else
> > > > > > +       {
> > > > > > +         auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > > > > +         bitmap_clear (load_index);
> > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > +           bitmap_set_bit (load_index,
> > > > > > +                           SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > > > > +         unsigned j;
> > > > > > +         for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > +           if (!bitmap_bit_p (load_index, j))
> > > > > > +             break;
> > > > > > +         if (j != SLP_TREE_LANES (node))
> > > > > > +           continue;
> > > > > > +
> > > > > > +         vec<unsigned> perm = vNULL;
> > > > > > +         perm.safe_grow (SLP_TREE_LANES (node), true);
> > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > +           perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > +         perms.safe_push (perm);
> > > > > > +       }
> > > > > >        vertices[idx].perm_in = perms.length () - 1;
> > > > > >        vertices[idx].perm_out = perms.length () - 1;
> > > > > >      }
> > > > > > @@ -4518,6 +4554,12 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
> > > > > >         vf = loop_vinfo->vectorization_factor;
> > > > > >        else
> > > > > >         vf = 1;
> > > > > > +      /* For complex type and SLP, double vf to get right vectype.
> > > > > > > +        i.e. vector(4) double for complex double, group size is 2, double vf
> > > > > > +        to map vf * group_size to TYPE_VECTOR_SUBPARTS.  */
> > > > > > +     if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > +       vf *= 2;
> > > > > > +
> > > > > >        unsigned int group_size = SLP_TREE_LANES (node);
> > > > > >        tree vectype = SLP_TREE_VECTYPE (node);
> > > > > >        SLP_TREE_NUMBER_OF_VEC_STMTS (node)
> > > > > > @@ -4763,10 +4805,17 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
> > > > > >             }
> > > > > >           unsigned group_size = SLP_TREE_LANES (child);
> > > > > >           poly_uint64 vf = 1;
> > > > > > +
> > > > > >           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
> > > > > >             vf = loop_vinfo->vectorization_factor;
> > > > > > +
> > > > > > > +         /* V2SF is just 1 complex type, so multiply by 2
> > > > > > +            to get release vector numbers.  */
> > > > > > +         unsigned cp
> > > > > > +           = STMT_VINFO_COMPLEX_P (SLP_TREE_REPRESENTATIVE (node)) ? 2 : 1;
> > > > > > +
> > > > > >           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
> > > > > > -           = vect_get_num_vectors (vf * group_size, vector_type);
> > > > > > +           = vect_get_num_vectors (vf * group_size * cp, vector_type);
> > > > > >           /* And cost them.  */
> > > > > >           vect_prologue_cost_for_slp (child, cost_vec);
> > > > > >         }
> > > > > > @@ -6402,6 +6451,11 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > >
> > > > > >    /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
> > > > > >    vector_type = SLP_TREE_VECTYPE (op_node);
> > > > > > +  unsigned int cp = 1;
> > > > > > +  /* Handle Complex type vector init.
> > > > > > +     SLP_TREE_REPRESENTATIVE (op_node) could be NULL.  */
> > > > > > +  if (TREE_CODE (TREE_TYPE (op_node->ops[0])) == COMPLEX_TYPE)
> > > > > > +    cp = 2;
> > > > > >
> > > > > >    unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
> > > > > >    SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
> > > > > > @@ -6426,9 +6480,9 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > >    /* When using duplicate_and_interleave, we just need one element for
> > > > > >       each scalar statement.  */
> > > > > >    if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
> > > > > > -    nunits = group_size;
> > > > > > +    nunits = group_size * cp;
> > > > > >
> > > > > > -  number_of_copies = nunits * number_of_vectors / group_size;
> > > > > > +  number_of_copies = nunits * number_of_vectors / (group_size * cp);
> > > > > >
> > > > > >    number_of_places_left_in_vector = nunits;
> > > > > >    constant_p = true;
> > > > > > @@ -6460,8 +6514,23 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > >                         gcc_unreachable ();
> > > > > >                     }
> > > > > >                   else
> > > > > > -                   op = fold_unary (VIEW_CONVERT_EXPR,
> > > > > > -                                    TREE_TYPE (vector_type), op);
> > > > > > +                   {
> > > > > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > > > > +                     /* For complex type, insert real and imag part
> > > > > > +                        separately.  */
> > > > > > +                     if (cp == 2)
> > > > > > +                       {
> > > > > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > > > > +                                      == COMPLEX_TYPE)
> > > > > > +                                     && (scalar_type
> > > > > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > > > > +                         elts[number_of_places_left_in_vector--]
> > > > > > +                           = fold_unary (IMAGPART_EXPR, scalar_type, op);
> > > > > > +                         op = fold_unary (REALPART_EXPR, scalar_type, op);
> > > > > > +                       }
> > > > > > +                     else
> > > > > > +                       op = fold_unary (VIEW_CONVERT_EXPR, scalar_type, op);
> > > > > > +                   }
> > > > > >                   gcc_assert (op && CONSTANT_CLASS_P (op));
> > > > > >                 }
> > > > > >               else
> > > > > > @@ -6481,11 +6550,28 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > >                     }
> > > > > >                   else
> > > > > >                     {
> > > > > > -                     op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
> > > > > > -                                  op);
> > > > > > -                     init_stmt
> > > > > > -                       = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > > > > -                                              op);
> > > > > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > > > > +                     if (cp == 2)
> > > > > > +                       {
> > > > > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > > > > +                                      == COMPLEX_TYPE)
> > > > > > +                                     && (scalar_type
> > > > > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > > > > +                         tree imag = build1 (IMAGPART_EXPR, scalar_type, op);
> > > > > > +                         op = build1 (REALPART_EXPR, scalar_type, op);
> > > > > > +                         tree imag_temp = make_ssa_name (scalar_type);
> > > > > > +                         elts[number_of_places_left_in_vector--] = imag_temp;
> > > > > > +                         init_stmt = gimple_build_assign (imag_temp, imag);
> > > > > > +                         gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > > > > +                         init_stmt = gimple_build_assign (new_temp, op);
> > > > > > +                       }
> > > > > > +                     else
> > > > > > +                       {
> > > > > > +                         op = build1 (VIEW_CONVERT_EXPR, scalar_type, op);
> > > > > > +                         init_stmt
> > > > > > +                           = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > > > > +                                                  op);
> > > > > > +                       }
> > > > > >                     }
> > > > > >                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > > > >                   op = new_temp;
> > > > > > @@ -6696,15 +6782,17 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > >    unsigned int nelts_to_build;
> > > > > >    unsigned int nvectors_per_build;
> > > > > >    unsigned int in_nlanes;
> > > > > > +  unsigned int cp = STMT_VINFO_COMPLEX_P (stmt_info) ? 2 : 1;
> > > > > >    bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
> > > > > > -                     && multiple_p (nunits, group_size));
> > > > > > +                     && multiple_p (nunits, group_size * cp));
> > > > > >    if (repeating_p)
> > > > > >      {
> > > > > >        /* A single vector contains a whole number of copies of the node, so:
> > > > > >          (a) all permutes can use the same mask; and
> > > > > >          (b) the permutes only need a single vector input.  */
> > > > > > -      mask.new_vector (nunits, group_size, 3);
> > > > > > -      nelts_to_build = mask.encoded_nelts ();
> > > > > > +      /* For complex type, mask size should be double of nelts_to_build.  */
> > > > > > +      mask.new_vector (nunits, group_size * cp, 3);
> > > > > > +      nelts_to_build = mask.encoded_nelts () / cp;
> > > > > >        nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
> > > > > >        in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
> > > > > >      }
> > > > > > @@ -6744,8 +6832,8 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > >         {
> > > > > >           /* Enforced before the loop when !repeating_p.  */
> > > > > >           unsigned int const_nunits = nunits.to_constant ();
> > > > > > -         vec_index = i / const_nunits;
> > > > > > -         mask_element = i % const_nunits;
> > > > > > +         vec_index = i / (const_nunits / cp);
> > > > > > +         mask_element = i % (const_nunits / cp);
> > > > > >           if (vec_index == first_vec_index
> > > > > >               || first_vec_index == -1)
> > > > > >             {
> > > > > > @@ -6755,7 +6843,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > >                    || second_vec_index == -1)
> > > > > >             {
> > > > > >               second_vec_index = vec_index;
> > > > > > -             mask_element += const_nunits;
> > > > > > +             mask_element += (const_nunits / cp);
> > > > > >             }
> > > > > >           else
> > > > > >             {
> > > > > > @@ -6768,14 +6856,24 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > >               return false;
> > > > > >             }
> > > > > >
> > > > > > -         gcc_assert (mask_element < 2 * const_nunits);
> > > > > > +         gcc_assert (mask_element < 2 * const_nunits / cp);
> > > > > >         }
> > > > > >
> > > > > >        if (mask_element != index)
> > > > > >         noop_p = false;
> > > > > > -      mask[index++] = mask_element;
> > > > > > +      /* Set index for Complex _type.
> > > > > > +        i.e. mask like [1,0] is actually [2, 3, 0, 1]
> > > > > > +        for vector scalar type.  */
> > > > > > +      if (cp == 2)
> > > > > > +       {
> > > > > > +         mask[2 * index] = 2 * mask_element;
> > > > > > +         mask[2 * index + 1] = 2 * mask_element + 1;
> > > > > > +       }
> > > > > > +      else
> > > > > > +       mask[index] = mask_element;
> > > > > > +      index++;
> > > > > >
> > > > > > -      if (index == count && !noop_p)
> > > > > > +      if (index * cp == count && !noop_p)
> > > > > >         {
> > > > > >           indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
> > > > > >           if (!can_vec_perm_const_p (mode, mode, indices))
> > > > > > @@ -6799,7 +6897,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > >           ++*n_perms;
> > > > > >         }
> > > > > >
> > > > > > -      if (index == count)
> > > > > > +      if (index * cp == count)
> > > > > >         {
> > > > > >           if (!analyze_only)
> > > > > >             {
> > > > > > @@ -6869,7 +6967,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > >           bool load_seen = false;
> > > > > >           for (unsigned i = 0; i < in_nlanes; ++i)
> > > > > >             {
> > > > > > -             if (i % const_nunits == 0)
> > > > > > +             if (i % (const_nunits * cp) == 0)
> > > > > >                 {
> > > > > >                   if (load_seen)
> > > > > >                     *n_loads += 1;
> > > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > > > > index 72107afc883..8af3b558be4 100644
> > > > > > --- a/gcc/tree-vect-stmts.cc
> > > > > > +++ b/gcc/tree-vect-stmts.cc
> > > > > > @@ -1397,25 +1397,70 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > > > > >  {
> > > > > >    gimple *init_stmt;
> > > > > >    tree new_temp;
> > > > > > +  tree scalar_type = TREE_TYPE (type);
> > > > > > +  gimple_seq stmts = NULL;
> > > > > > +
> > > > > > +  if (TREE_CODE (TREE_TYPE (val)) == COMPLEX_TYPE)
> > > > > > +    {
> > > > > > +      unsigned HOST_WIDE_INT nunits;
> > > > > > +      gcc_assert (TYPE_VECTOR_SUBPARTS (type).is_constant (&nunits));
> > > > > >
> > > > > > +      tree_vector_builder elts (type, nunits, 1);
> > > > > > +      tree imag, real;
> > > > > > +      if (TREE_CODE (val) == COMPLEX_CST)
> > > > > > +       {
> > > > > > +         real = fold_unary (REALPART_EXPR, scalar_type, val);
> > > > > > +         imag = fold_unary (IMAGPART_EXPR, scalar_type, val);
> > > > > > +       }
> > > > > > +      else
> > > > > > +       {
> > > > > > +         real = make_ssa_name (scalar_type);
> > > > > > +         imag = make_ssa_name (scalar_type);
> > > > > > +         init_stmt
> > > > > > +           = gimple_build_assign (real,
> > > > > > +                                  build1 (REALPART_EXPR, scalar_type, val));
> > > > > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > > > > +         init_stmt
> > > > > > +           = gimple_build_assign (imag,
> > > > > > +                                  build1 (IMAGPART_EXPR, scalar_type, val));
> > > > > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > > > > +       }
> > > > > > +
> > > > > > +      /* Build vector as [real,imag,real,imag,...].  */
> > > > > > +      for (unsigned i = 0; i != nunits; i++)
> > > > > > +       {
> > > > > > +         if (i % 2)
> > > > > > +           elts.quick_push (imag);
> > > > > > +         else
> > > > > > +           elts.quick_push (real);
> > > > > > +       }
> > > > > > +      val = gimple_build_vector (&stmts, &elts);
> > > > > > +      if (!gimple_seq_empty_p (stmts))
> > > > > > +       {
> > > > > > +         if (gsi)
> > > > > > +           gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > > > > > +         else
> > > > > > +           vinfo->insert_seq_on_entry (stmt_info, stmts);
> > > > > > +       }
> > > > > > +    }
> > > > > >    /* We abuse this function to push sth to a SSA name with initial 'val'.  */
> > > > > > -  if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > > > > +  else if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > > > >      {
> > > > > >        gcc_assert (TREE_CODE (type) == VECTOR_TYPE);
> > > > > > -      if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
> > > > > > +      if (! types_compatible_p (scalar_type, TREE_TYPE (val)))
> > > > > >         {
> > > > > >           /* Scalar boolean value should be transformed into
> > > > > >              all zeros or all ones value before building a vector.  */
> > > > > >           if (VECTOR_BOOLEAN_TYPE_P (type))
> > > > > >             {
> > > > > > -             tree true_val = build_all_ones_cst (TREE_TYPE (type));
> > > > > > -             tree false_val = build_zero_cst (TREE_TYPE (type));
> > > > > > +             tree true_val = build_all_ones_cst (scalar_type);
> > > > > > +             tree false_val = build_zero_cst (scalar_type);
> > > > > >
> > > > > >               if (CONSTANT_CLASS_P (val))
> > > > > >                 val = integer_zerop (val) ? false_val : true_val;
> > > > > >               else
> > > > > >                 {
> > > > > > -                 new_temp = make_ssa_name (TREE_TYPE (type));
> > > > > > +                 new_temp = make_ssa_name (scalar_type);
> > > > > >                   init_stmt = gimple_build_assign (new_temp, COND_EXPR,
> > > > > >                                                    val, true_val, false_val);
> > > > > >                   vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
> > > > > > @@ -1424,14 +1469,13 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > > > > >             }
> > > > > >           else
> > > > > >             {
> > > > > > -             gimple_seq stmts = NULL;
> > > > > >               if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
> > > > > >                 val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
> > > > > > -                                   TREE_TYPE (type), val);
> > > > > > +                                   scalar_type, val);
> > > > > >               else
> > > > > >                 /* ???  Condition vectorization expects us to do
> > > > > >                    promotion of invariant/external defs.  */
> > > > > > -               val = gimple_convert (&stmts, TREE_TYPE (type), val);
> > > > > > +               val = gimple_convert (&stmts, scalar_type, val);
> > > > > >               for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
> > > > > >                    !gsi_end_p (gsi2); )
> > > > > >                 {
> > > > > > @@ -1496,7 +1540,12 @@ vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> > > > > >                && VECTOR_BOOLEAN_TYPE_P (stmt_vectype))
> > > > > >         vector_type = truth_type_for (stmt_vectype);
> > > > > >        else
> > > > > > -       vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
> > > > > > +       {
> > > > > > +         tree scalar_type = TREE_TYPE (op);
> > > > > > +         if (STMT_VINFO_COMPLEX_P (stmt_vinfo))
> > > > > > +           scalar_type = TREE_TYPE (scalar_type);
> > > > > > +         vector_type = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
> > > > > > +       }
> > > > > >
> > > > > >        gcc_assert (vector_type);
> > > > > >        tree vop = vect_init_vector (vinfo, stmt_vinfo, op, vector_type, NULL);
> > > > > > @@ -7509,8 +7558,17 @@ vectorizable_store (vec_info *vinfo,
> > > > > >       same location twice.  */
> > > > > >    gcc_assert (slp == PURE_SLP_STMT (stmt_info));
> > > > > >
> > > > > > +  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > > > > +    return false;
> > > > > > +
> > > > > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info), rhs_vectype = NULL_TREE;
> > > > > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > +    {
> > > > > > +      if (!nunits.is_constant ())
> > > > > > +       return false;
> > > > > > +      nunits = exact_div (nunits, 2);
> > > > > > +    }
> > > > > >
> > > > > >    if (loop_vinfo)
> > > > > >      {
> > > > > > @@ -7526,7 +7584,8 @@ vectorizable_store (vec_info *vinfo,
> > > > > >    if (slp)
> > > > > >      ncopies = 1;
> > > > > >    else
> > > > > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > > > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > > >
> > > > > >    gcc_assert (ncopies >= 1);
> > > > > >
> > > > > > @@ -7546,9 +7605,6 @@ vectorizable_store (vec_info *vinfo,
> > > > > >    elem_type = TREE_TYPE (vectype);
> > > > > >    vec_mode = TYPE_MODE (vectype);
> > > > > >
> > > > > > -  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > > > > -    return false;
> > > > > > -
> > > > > >    vect_memory_access_type memory_access_type;
> > > > > >    enum dr_alignment_support alignment_support_scheme;
> > > > > >    int misalignment;
> > > > > > @@ -8778,6 +8834,12 @@ vectorizable_load (vec_info *vinfo,
> > > > > >
> > > > > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> > > > > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > +    {
> > > > > > +      if (!nunits.is_constant ())
> > > > > > +       return false;
> > > > > > +      nunits = exact_div (nunits, 2);
> > > > > > +    }
> > > > > >
> > > > > >    if (loop_vinfo)
> > > > > >      {
> > > > > > @@ -8794,7 +8856,8 @@ vectorizable_load (vec_info *vinfo,
> > > > > >    if (slp)
> > > > > >      ncopies = 1;
> > > > > >    else
> > > > > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > > > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > > >
> > > > > >    gcc_assert (ncopies >= 1);
> > > > > >
> > > > > > @@ -8870,8 +8933,11 @@ vectorizable_load (vec_info *vinfo,
> > > > > >                 if (k > maxk)
> > > > > >                   maxk = k;
> > > > > >               tree vectype = SLP_TREE_VECTYPE (slp_node);
> > > > > > +             /* For complex type, half the nunits.  */
> > > > > >               if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> > > > > > -                 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> > > > > > +                 || maxk >= (DR_GROUP_SIZE (group_info)
> > > > > > +                             & ~((STMT_VINFO_COMPLEX_P (group_info)
> > > > > > +                                  ? nunits >> 1 : nunits) - 1)))
> > > > > >                 {
> > > > > >                   if (dump_enabled_p ())
> > > > > >                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > > @@ -12499,12 +12565,27 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > >             dump_printf_loc (MSG_NOTE, vect_location,
> > > > > >                              "get vectype for scalar type: %T\n", scalar_type);
> > > > > >         }
> > > > > > +
> > > > > > +      tree orig_scalar_type = scalar_type;
> > > > > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > > > > +       {
> > > > > > +         /* Set complex_p for BB vectorizer.  */
> > > > > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > > +         scalar_type = TREE_TYPE (scalar_type);
> > > > > > +         /* Double group_size for BB vectorizer to make
> > > > > > +            following 2 get_vectype_for_scalar_type return wanted vectype.
> > > > > > +            Real group size is not changed, just make the "faked" input
> > > > > > +            group_size.  */
> > > > > > +         group_size *= 2;
> > > > > > +       }
> > > > > >        vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
> > > > > > -      if (!vectype)
> > > > > > +      if (!vectype
> > > > > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > > +             && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()))
> > > > > >         return opt_result::failure_at (stmt,
> > > > > >                                        "not vectorized:"
> > > > > >                                        " unsupported data-type %T\n",
> > > > > > -                                      scalar_type);
> > > > > > +                                      orig_scalar_type);
> > > > > >
> > > > > >        if (dump_enabled_p ())
> > > > > >         dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
> > > > > > @@ -12529,16 +12610,30 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > >                                                    TREE_TYPE (vectype));
> > > > > >        if (scalar_type != TREE_TYPE (vectype))
> > > > > >         {
> > > > > > -         if (dump_enabled_p ())
> > > > > > +         tree orig_scalar_type = scalar_type;
> > > > > > +         if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > > > > +           {
> > > > > > +             /* Set complex_p for Loop vectorizer.  */
> > > > > > +             STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > > +             scalar_type = TREE_TYPE (scalar_type);
> > > > > > +             if (dump_enabled_p ())
> > > > > > +               dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > +                            "get complex for smallest scalar type: %T\n",
> > > > > > +                            scalar_type);
> > > > > > +
> > > > > > +           }
> > > > > > +         else if (dump_enabled_p ())
> > > > > >             dump_printf_loc (MSG_NOTE, vect_location,
> > > > > >                              "get vectype for smallest scalar type: %T\n",
> > > > > >                              scalar_type);
> > > > > >           nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
> > > > > >                                                         group_size);
> > > > > > -         if (!nunits_vectype)
> > > > > > +         if (!nunits_vectype
> > > > > > +             || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > > +                 && !TYPE_VECTOR_SUBPARTS (nunits_vectype).is_constant ()))
> > > > > >             return opt_result::failure_at
> > > > > >               (stmt, "not vectorized: unsupported data-type %T\n",
> > > > > > -              scalar_type);
> > > > > > +              orig_scalar_type);
> > > > > >           if (dump_enabled_p ())
> > > > > >             dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
> > > > > >                              nunits_vectype);
> > > > > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > > > > index e5fdc9e0a14..4a809e492c4 100644
> > > > > > --- a/gcc/tree-vectorizer.h
> > > > > > +++ b/gcc/tree-vectorizer.h
> > > > > > @@ -1161,6 +1161,9 @@ public:
> > > > > >       vectorization.  */
> > > > > >    bool vectorizable;
> > > > > >
> > > > > > +  /* The scalar type of the LHS of this statement is complex type.  */
> > > > > > +  bool complex_p;
> > > > > > +
> > > > > >    /* The stmt to which this info struct refers to.  */
> > > > > >    gimple *stmt;
> > > > > >
> > > > > > @@ -1395,6 +1398,7 @@ struct gather_scatter_info {
> > > > > >  #define STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT(S) (S)->reduc_epilogue_adjustment
> > > > > >  #define STMT_VINFO_REDUC_IDX(S)                   (S)->reduc_idx
> > > > > >  #define STMT_VINFO_FORCE_SINGLE_CYCLE(S)   (S)->force_single_cycle
> > > > > > +#define STMT_VINFO_COMPLEX_P(S)            (S)->complex_p
> > > > > >
> > > > > >  #define STMT_VINFO_DR_WRT_VEC_LOOP(S)      (S)->dr_wrt_vec_loop
> > > > > >  #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_wrt_vec_loop.base_address
> > > > > > @@ -1970,6 +1974,15 @@ vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype)
> > > > > >    return vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo), vectype);
> > > > > >  }
> > > > > >
> > > > > > +static inline unsigned int
> > > > > > +vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype, bool complex_p)
> > > > > > +{
> > > > > > +  poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > > > > +  if (complex_p)
> > > > > > +    nunits *= 2;
> > > > > > +  return vect_get_num_vectors (nunits, vectype);
> > > > > > +}
> > > > > > +
> > > > > >  /* Update maximum unit count *MAX_NUNITS so that it accounts for
> > > > > >     NUNITS.  *MAX_NUNITS can be 1 if we haven't yet recorded anything.  */
> > > > > >
> > > > > > --
> > > > > > 2.18.1
> > > > > >
> > > >
> > > >
> > > >
> > > > --
> > > > BR,
> > > > Hongtao
> >
> >
> >
> > --
> > BR,
> > Hongtao
Hongtao Liu July 14, 2022, 8:53 a.m. UTC | #8
On Thu, Jul 14, 2022 at 4:20 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, Jul 13, 2022 at 9:34 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Wed, Jul 13, 2022 at 6:47 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Tue, Jul 12, 2022 at 10:12 PM Richard Biener
> > > <richard.guenther@gmail.com> wrote:
> > > >
> > > > On Tue, Jul 12, 2022 at 6:11 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jul 11, 2022 at 7:47 PM Richard Biener via Gcc-patches
> > > > > <gcc-patches@gcc.gnu.org> wrote:
> > > > > >
> > > > > > On Mon, Jul 11, 2022 at 5:44 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > > > >
> > > > > > > The patch only handles load/store(including ctor/permutation, except
> > > > > > > > gather/scatter) for complex type, other operations don't need to be
> > > > > > > handled since they will be lowered by pass cplxlower.(MASK_LOAD is not
> > > > > > > supported for complex type, so no need to handle either).
> > > > > >
> > > > > > (*)
> > > > > >
> > > > > > > > Instead of supporting vector(2) _Complex double, this patch takes vector(4)
> > > > > > > double as vector type of _Complex double. Since vectorizer originally
> > > > > > > takes TYPE_VECTOR_SUBPARTS as nunits which is not true for complex
> > > > > > > type, the patch handles nunits/ncopies/vf specially for complex type.
> > > > > >
> > > > > > For the limited set above(*) can you explain what's "special" about
> > > > > > vector(2) _Complex
> > > > > > vs. vector(4) double, thus why we need to have STMT_VINFO_COMPLEX_P at all?
> > > > > Supporting a vector(2) complex  is a straightforward idea, just like
> > > > > supporting other scalar type in vectorizer, but it requires more
> > > > > efforts(in the backend and frontend), considering that most of
> > > > > operations of complex type will be lowered into realpart and imagpart
> > > > > operations, supporting a vector(2) complex does not look that
> > > > > necessary. Then it comes up with supporting vector(4) double(with
> > > > > adjustment of vf/ctor/permutation), the vectorizer only needs to
> > > > > handle the vectorization of the move operation of the complex type(no
> > > > > need to worry about wrongly mapping vector(4) double multiplication to
> > > > > complex type multiplication since it's already lowered before
> > > > > vectorizer).
> > > > > > stmt_info does not record the scalar type; to avoid repeated
> > > > > > operations like getting the lhs type from the stmt to determine whether
> > > > > > it is a complex type, a STMT_VINFO_COMPLEX_P bit is added. This bit is
> > > > > > mainly initialized in vect_analyze_data_refs and
> > > > > > vect_get_vector_types_for_stmt.
> > > > > >
> > > > > > I wonder to what extent your handling can be extended to support re-vectorizing
> > > > > > (with a higher VF for example) already vectorized code?  The vectorizer giving
> > > > > > up on vector(2) double looks quite obviously similar to it giving up
> > > > > > on _Complex double ...
> > > > > > Yes, it can be extended to vector(2) double/float/int/... with a bit
> > > > > > of adjustment (extracting elements by using bit_field instead of
> > > > > > imagpart_expr/realpart_expr).
> > > > > > It would be a shame to not use the same underlying mechanism for dealing with
> > > > > > both, where for the vector case obviously vector(4) would be supported as well.
> > > > > >
> > > > > > In principle _Complex double operations should be two SLP lanes but it seems you
> > > > > > are handling them with classical interleaving as well?
> > > > > I'm only handling move operations, for other operations it will be
> > > > > lowered to realpart and imagpart and thus two SLP lanes.
> > > >
> > > > Yes, I understood that.
> > > >
> > > > Doing it more general (and IMHO better) would involve enhancing
> > > > how we represent dataref groups, maintaining the number of scalars
> > > > covered by each of the vinfos.  On the SLP representation side it
> > > > probably requires to rely on the representative for access and not
> > > > on the scalar stmts (since those do not map properly to the lanes).
> > > >
> > > > Ideally we'd be able to handle
> > > >
> > > > struct { _Complex double c; double a; double b; } a[], b[];
> > > >
> > > > void foo ()
> > > > {
> > > >    for (int i = 0; i < 100; ++i)
> > > >     {
> > > >       a[i].c = b[i].c;
> > > >       a[i].a = b[i].a;
> > > >       a[i].b = b[i].b;
> > > >     }
> > > > }
> > > >
> > > > which I guess your patch doesn't handle with plain AVX vector
> > > > copies but instead uses interleaving for the _Complex and non-_Complex
> > > > parts?
> > > Indeed, it produces wrong code.
> >
> > For _Complex, in case we don't get to the "true and only" solution it
> > might be easier to split the loads and stores when it's just memory
> > copies and we have vectorization enabled and a supported vector
> > mode that would surely re-assemble them (store-merging doesn't seem
> > to do that).
> >
> > Btw, we seem to produce
> >
> >         movsd   b(%rip), %xmm0
> >         movsd   %xmm0, a(%rip)
> >         movsd   b+8(%rip), %xmm0
> >         movsd   %xmm0, a+8(%rip)
> >
> > for a _Complex double memory copy on x86 which means we lack
> > true DCmode support (pseudos get decomposed).  Not sure if we
> > can somehow check whether a target has DCmode load/store
> > support and key decomposing on that (maybe check the SET optab).
> >
> > It might be possible to check
> >
> > _Complex double a, b;
> > void bar()
> > {
> >   a = b;
> > }
> >
> > for all targets with a cc1 cross to see whether they somehow get
> > loads/stores _not_ decomposed (also check _Complex float,
> > I wouldn't worry for _Complex int or _Complex long double).
>
> Btw, a point for doing the above is that we already do it!  There just
> needs to be an (unrelated) complex op in the function:
>
> _Complex float a[2], b[2];
> _Complex double foo(_Complex double x, _Complex double y)
> {
>   a[0] = b[0];
>   a[1] = b[1];
>   return x + y;
> }
>
> vs
>
> void bar()
> {
>   a[0] = b[0];
>   a[1] = b[1];
> }
>
> the key difference is that tree_lower_complex returns early here:
>
>   if (!init_dont_simulate_again ())
>     return 0;
>
> that returns whether it saw any complex op.
>
> diff --git a/gcc/tree-complex.cc b/gcc/tree-complex.cc
> index 61950a0f099..bdcb9968af1 100644
> --- a/gcc/tree-complex.cc
> +++ b/gcc/tree-complex.cc
> @@ -297,6 +297,11 @@ init_dont_simulate_again (void)
>                 break;
>
>               default:
> +               /* When expand_complex_move would trigger make sure we
> +                  perform lowering even when there is no actual complex
> +                  operation.  This helps consistency and vectorization.  */
> +               if (TREE_CODE (TREE_TYPE (gimple_op (stmt, 0))) == COMPLEX_TYPE)
> +                 saw_a_complex_op = true;
>                 break;
>               }
>
Let me try this.
> fixes that.  If this change tests OK (and fixes your set of new
> vectorizer testcases)
The direct purpose of my patch is to support vectorization of the
complex type move, and the indirect purpose is to support automatic
vectorization of complex-type libmvec functions. For example,
vectorization of the following case:
void
foo (_Complex double* a, _Complex double* b)
{
  for (int i = 0; i != 100; i++)
  a[i] = csin (b[i]);
}

GCC already supports vectorization for sin, but not for csin.
> then I think that's the way to go for the immediate issue of
> vectorizing _Complex.
>
> Richard.
>
> > Richard.
> >
> > > > Let me spend some time fleshing out what is necessary to make
> > > > this work "properly".  We can consider your special-casing of _Complex
> > > > memory ops if I can't manage to assess the complexity of the task.
> > > >
> > > > Thanks,
> > > > Richard.
> > > >
> > > > > >
> > > > > > Thanks,
> > > > > > Richard.
> > > > > >
> > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > > > > Also test the patch for SPEC2017 and find there's complex type vectorization
> > > > > > > in 510/549(but no performance impact).
> > > > > > >
> > > > > > > Any comments?
> > > > > > >
> > > > > > > gcc/ChangeLog:
> > > > > > >
> > > > > > >         PR tree-optimization/106010
> > > > > > >         * tree-vect-data-refs.cc (vect_get_data_access_cost):
> > > > > > >         Pass complex_p to vect_get_num_copies to avoid ICE.
> > > > > > >         (vect_analyze_data_refs): Support vectorization for Complex
> > > > > > >         type with vector scalar types.
> > > > > > >         * tree-vect-loop.cc (vect_determine_vf_for_stmt_1): VF should
> > > > > > >         be half of TYPE_VECTOR_SUBPARTS when complex_p.
> > > > > > >         * tree-vect-slp.cc (vect_record_max_nunits): nunits should be
> > > > > > >         half of TYPE_VECTOR_SUBPARTS when complex_p.
> > > > > > >         (vect_optimize_slp): Support permutation for complex type.
> > > > > > >         (vect_slp_analyze_node_operations_1): Double nunits in
> > > > > > >         vect_get_num_vectors to get right SLP_TREE_NUMBER_OF_VEC_STMTS
> > > > > > >         when complex_p.
> > > > > > >         (vect_slp_analyze_node_operations): Ditto.
> > > > > > >         (vect_create_constant_vectors): Support CTOR for complex type.
> > > > > > >         (vect_transform_slp_perm_load): Support permutation for
> > > > > > >         complex type.
> > > > > > >         * tree-vect-stmts.cc (vect_init_vector): Support complex type.
> > > > > > >         (vect_get_vec_defs_for_operand): Get vector type for
> > > > > > >         complex type.
> > > > > > >         (vectorizable_store): Get right ncopies/nunits for complex
> > > > > > >         type, also return false when complex_p and
> > > > > > >         !TYPE_VECTOR_SUBPARTS.is_constant ().
> > > > > > >         (vectorizable_load): Ditto.
> > > > > > >         (vect_get_vector_types_for_stmt): Get vector type for complex type.
> > > > > > >         * tree-vectorizer.h (STMT_VINFO_COMPLEX_P): New macro.
> > > > > > >         (vect_get_num_copies): New overload.
> > > > > > >
> > > > > > > gcc/testsuite/ChangeLog:
> > > > > > >
> > > > > > >         * gcc.target/i386/pr106010-1a.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-1b.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-1c.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-2a.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-2b.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-2c.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-3a.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-3b.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-3c.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-4a.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-4b.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-4c.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-5a.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-5b.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-5c.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-6a.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-6b.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-6c.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-7a.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-7b.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-7c.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-8a.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-8b.c: New test.
> > > > > > >         * gcc.target/i386/pr106010-8c.c: New test.
> > > > > > > ---
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 +++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 +++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +++++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 +++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 +++++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 ++++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 ++++++++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 ++++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 ++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 +++++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 +++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 +++++++++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 +++++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 +++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 +++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 +++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 ++++++
> > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
> > > > > > >  gcc/tree-vect-data-refs.cc                  |  26 ++-
> > > > > > >  gcc/tree-vect-loop.cc                       |   7 +-
> > > > > > >  gcc/tree-vect-slp.cc                        | 174 +++++++++++++++-----
> > > > > > >  gcc/tree-vect-stmts.cc                      | 135 ++++++++++++---
> > > > > > >  gcc/tree-vectorizer.h                       |  13 ++
> > > > > > >  29 files changed, 2064 insertions(+), 63 deletions(-)
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > > >
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..b608f484934
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > > > @@ -0,0 +1,58 @@
> > > > > > > +/* { dg-do compile } */
> > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> > > > > > > +
> > > > > > > +#define N 10000
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_pd (_Complex double* a, _Complex double* b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b[i];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ps (_Complex float* a, _Complex float* b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b[i];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b[i];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi32 (_Complex int* a, _Complex int* b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b[i];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi16 (_Complex short* a, _Complex short* b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b[i];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi8 (_Complex char* a, _Complex char* b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b[i];
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..0f377c3a548
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > > > @@ -0,0 +1,63 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > +
> > > > > > > +#include "avx-check.h"
> > > > > > > +#include <string.h>
> > > > > > > +#include "pr106010-1a.c"
> > > > > > > +
> > > > > > > +void
> > > > > > > +avx_test (void)
> > > > > > > +{
> > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > > > > > +
> > > > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > > > > > +    p_init[i] = i;
> > > > > > > +
> > > > > > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > > > > > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > > > > > > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > > > > > > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > > > > > > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > > > > > > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > > > > > > +
> > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +
> > > > > > > +  return;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..f07e9fb2d3d
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > > > @@ -0,0 +1,41 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > +
> > > > > > > +#include <string.h>
> > > > > > > +
> > > > > > > +static void do_test (void);
> > > > > > > +
> > > > > > > +#define DO_TEST do_test
> > > > > > > +#define AVX512FP16
> > > > > > > +#include "avx512-check.h"
> > > > > > > +
> > > > > > > +#define N 10000
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b[i];
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void
> > > > > > > +do_test (void)
> > > > > > > +{
> > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > > > > > +
> > > > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > > > > > +    p_init[i] = i;
> > > > > > > +
> > > > > > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > > > > > +
> > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..d2e2f8d4f43
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > > > @@ -0,0 +1,82 @@
> > > > > > > +/* { dg-do compile } */
> > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[0];
> > > > > > > +  a[1] = b[1];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[0];
> > > > > > > +  a[1] = b[1];
> > > > > > > +  a[2] = b[2];
> > > > > > > +  a[3] = b[3];
> > > > > > > +
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[0];
> > > > > > > +  a[1] = b[1];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[0];
> > > > > > > +  a[1] = b[1];
> > > > > > > +  a[2] = b[2];
> > > > > > > +  a[3] = b[3];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[0];
> > > > > > > +  a[1] = b[1];
> > > > > > > +  a[2] = b[2];
> > > > > > > +  a[3] = b[3];
> > > > > > > +  a[4] = b[4];
> > > > > > > +  a[5] = b[5];
> > > > > > > +  a[6] = b[6];
> > > > > > > +  a[7] = b[7];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[0];
> > > > > > > +  a[1] = b[1];
> > > > > > > +  a[2] = b[2];
> > > > > > > +  a[3] = b[3];
> > > > > > > +  a[4] = b[4];
> > > > > > > +  a[5] = b[5];
> > > > > > > +  a[6] = b[6];
> > > > > > > +  a[7] = b[7];
> > > > > > > +  a[8] = b[8];
> > > > > > > +  a[9] = b[9];
> > > > > > > +  a[10] = b[10];
> > > > > > > +  a[11] = b[11];
> > > > > > > +  a[12] = b[12];
> > > > > > > +  a[13] = b[13];
> > > > > > > +  a[14] = b[14];
> > > > > > > +  a[15] = b[15];
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..ac360752693
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > > > @@ -0,0 +1,62 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > +
> > > > > > > +#include "avx-check.h"
> > > > > > > +#include <string.h>
> > > > > > > +#include "pr106010-2a.c"
> > > > > > > +
> > > > > > > +void
> > > > > > > +avx_test (void)
> > > > > > > +{
> > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > +
> > > > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > +    p[i] = i;
> > > > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > > > +
> > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +
> > > > > > > +  return;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..a002f209ec9
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > > > @@ -0,0 +1,47 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > +
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > > +
> > > > > > > +#include <string.h>
> > > > > > > +
> > > > > > > +static void do_test (void);
> > > > > > > +#define DO_TEST do_test
> > > > > > > +#define AVX512FP16
> > > > > > > +#include "avx512-check.h"
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[0];
> > > > > > > +  a[1] = b[1];
> > > > > > > +  a[2] = b[2];
> > > > > > > +  a[3] = b[3];
> > > > > > > +  a[4] = b[4];
> > > > > > > +  a[5] = b[5];
> > > > > > > +  a[6] = b[6];
> > > > > > > +  a[7] = b[7];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +do_test (void)
> > > > > > > +{
> > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > +
> > > > > > > +   __builtin_memset (ph_dst, 0, 32);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > +    p[i] = i;
> > > > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > > > +
> > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +
> > > > > > > +  return;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..c1b64b56b1c
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > > > @@ -0,0 +1,80 @@
> > > > > > > +/* { dg-do compile } */
> > > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[1];
> > > > > > > +  a[1] = b[0];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[1];
> > > > > > > +  a[1] = b[0];
> > > > > > > +  a[2] = b[3];
> > > > > > > +  a[3] = b[2];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[1];
> > > > > > > +  a[1] = b[0];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[3];
> > > > > > > +  a[1] = b[2];
> > > > > > > +  a[2] = b[1];
> > > > > > > +  a[3] = b[0];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[7];
> > > > > > > +  a[1] = b[6];
> > > > > > > +  a[2] = b[5];
> > > > > > > +  a[3] = b[4];
> > > > > > > +  a[4] = b[3];
> > > > > > > +  a[5] = b[2];
> > > > > > > +  a[6] = b[1];
> > > > > > > +  a[7] = b[0];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[7];
> > > > > > > +  a[1] = b[6];
> > > > > > > +  a[2] = b[5];
> > > > > > > +  a[3] = b[4];
> > > > > > > +  a[4] = b[3];
> > > > > > > +  a[5] = b[2];
> > > > > > > +  a[6] = b[1];
> > > > > > > +  a[7] = b[0];
> > > > > > > +  a[8] = b[15];
> > > > > > > +  a[9] = b[14];
> > > > > > > +  a[10] = b[13];
> > > > > > > +  a[11] = b[12];
> > > > > > > +  a[12] = b[11];
> > > > > > > +  a[13] = b[10];
> > > > > > > +  a[14] = b[9];
> > > > > > > +  a[15] = b[8];
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..e4fa3f3a541
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > > > @@ -0,0 +1,126 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-require-effective-target avx2 } */
> > > > > > > +
> > > > > > > +#include "avx2-check.h"
> > > > > > > +#include <string.h>
> > > > > > > +#include "pr106010-3a.c"
> > > > > > > +
> > > > > > > +void
> > > > > > > +avx2_test (void)
> > > > > > > +{
> > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (32);
> > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (32);
> > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (32);
> > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (32);
> > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (32);
> > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > +  char* q = (char* ) malloc (32);
> > > > > > > +
> > > > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > +    p[i] = i;
> > > > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 16; i++)
> > > > > > > +    {
> > > > > > > +      p[i] = i + 16;
> > > > > > > +      p[i + 16] = i;
> > > > > > > +    }
> > > > > > > +  __builtin_memcpy (pd_exp, p, 32);
> > > > > > > +  __builtin_memcpy (epi64_exp, p, 32);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 8; i++)
> > > > > > > +    {
> > > > > > > +      p[i] = i + 8;
> > > > > > > +      p[i + 8] = i;
> > > > > > > +      p[i + 16] = i + 24;
> > > > > > > +      p[i + 24] = i + 16;
> > > > > > > +      q[i] = i + 24;
> > > > > > > +      q[i + 8] = i + 16;
> > > > > > > +      q[i + 16] = i + 8;
> > > > > > > +      q[i + 24] = i;
> > > > > > > +    }
> > > > > > > +  __builtin_memcpy (ps_exp, p, 32);
> > > > > > > +  __builtin_memcpy (epi32_exp, q, 32);
> > > > > > > +
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > > +    {
> > > > > > > +      q[i] = i + 28;
> > > > > > > +      q[i + 4] = i + 24;
> > > > > > > +      q[i + 8] = i + 20;
> > > > > > > +      q[i + 12] = i + 16;
> > > > > > > +      q[i + 16] = i + 12;
> > > > > > > +      q[i + 20] = i + 8;
> > > > > > > +      q[i + 24] = i + 4;
> > > > > > > +      q[i + 28] = i;
> > > > > > > +    }
> > > > > > > +  __builtin_memcpy (epi16_exp, q, 32);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 2; i++)
> > > > > > > +    {
> > > > > > > +      q[i] = i + 14;
> > > > > > > +      q[i + 2] = i + 12;
> > > > > > > +      q[i + 4] = i + 10;
> > > > > > > +      q[i + 6] = i + 8;
> > > > > > > +      q[i + 8] = i + 6;
> > > > > > > +      q[i + 10] = i + 4;
> > > > > > > +      q[i + 12] = i + 2;
> > > > > > > +      q[i + 14] = i;
> > > > > > > +      q[i + 16] = i + 30;
> > > > > > > +      q[i + 18] = i + 28;
> > > > > > > +      q[i + 20] = i + 26;
> > > > > > > +      q[i + 22] = i + 24;
> > > > > > > +      q[i + 24] = i + 22;
> > > > > > > +      q[i + 26] = i + 20;
> > > > > > > +      q[i + 28] = i + 18;
> > > > > > > +      q[i + 30] = i + 16;
> > > > > > > +    }
> > > > > > > +  __builtin_memcpy (epi8_exp, q, 32);
> > > > > > > +
> > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +
> > > > > > > +  return;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..5a5a3d4b992
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > > > @@ -0,0 +1,69 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
> > > > > > > +
> > > > > > > +#include <string.h>
> > > > > > > +
> > > > > > > +static void do_test (void);
> > > > > > > +#define DO_TEST do_test
> > > > > > > +#define AVX512FP16
> > > > > > > +#include "avx512-check.h"
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[1];
> > > > > > > +  a[1] = b[0];
> > > > > > > +  a[2] = b[4];
> > > > > > > +  a[3] = b[3];
> > > > > > > +  a[4] = b[7];
> > > > > > > +  a[5] = b[6];
> > > > > > > +  a[6] = b[2];
> > > > > > > +  a[7] = b[5];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +do_test (void)
> > > > > > > +{
> > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > +  char* q = (char* ) malloc (32);
> > > > > > > +
> > > > > > > +  __builtin_memset (ph_dst, 0, 32);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > +    p[i] = i;
> > > > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > > +    {
> > > > > > > +      p[i] = i + 4;
> > > > > > > +      p[i + 4] = i;
> > > > > > > +      p[i + 8] = i + 16;
> > > > > > > +      p[i + 12] = i + 12;
> > > > > > > +      p[i + 16] = i + 28;
> > > > > > > +      p[i + 20] = i + 24;
> > > > > > > +      p[i + 24] = i + 8;
> > > > > > > +      p[i + 28] = i + 20;
> > > > > > > +      q[i] = i + 28;
> > > > > > > +      q[i + 4] = i + 24;
> > > > > > > +      q[i + 8] = i + 20;
> > > > > > > +      q[i + 12] = i + 16;
> > > > > > > +      q[i + 16] = i + 12;
> > > > > > > +      q[i + 20] = i + 8;
> > > > > > > +      q[i + 24] = i + 4;
> > > > > > > +      q[i + 28] = i;
> > > > > > > +    }
> > > > > > > +  __builtin_memcpy (ph_exp, p, 32);
> > > > > > > +
> > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +
> > > > > > > +  return;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..b7b0b532bb1
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > > > @@ -0,0 +1,101 @@
> > > > > > > +/* { dg-do compile } */
> > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_pd (_Complex double* a,
> > > > > > > +       _Complex double b1,
> > > > > > > +       _Complex double b2)
> > > > > > > +{
> > > > > > > +  a[0] = b1;
> > > > > > > +  a[1] = b2;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ps (_Complex float* a,
> > > > > > > +       _Complex float b1, _Complex float b2,
> > > > > > > +       _Complex float b3, _Complex float b4)
> > > > > > > +{
> > > > > > > +  a[0] = b1;
> > > > > > > +  a[1] = b2;
> > > > > > > +  a[2] = b3;
> > > > > > > +  a[3] = b4;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi64 (_Complex long long* a,
> > > > > > > +          _Complex long long b1,
> > > > > > > +          _Complex long long b2)
> > > > > > > +{
> > > > > > > +  a[0] = b1;
> > > > > > > +  a[1] = b2;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi32 (_Complex int* a,
> > > > > > > +          _Complex int b1, _Complex int b2,
> > > > > > > +          _Complex int b3, _Complex int b4)
> > > > > > > +{
> > > > > > > +  a[0] = b1;
> > > > > > > +  a[1] = b2;
> > > > > > > +  a[2] = b3;
> > > > > > > +  a[3] = b4;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi16 (_Complex short* a,
> > > > > > > +          _Complex short b1, _Complex short b2,
> > > > > > > +          _Complex short b3, _Complex short b4,
> > > > > > > +          _Complex short b5, _Complex short b6,
> > > > > > > +          _Complex short b7,_Complex short b8)
> > > > > > > +{
> > > > > > > +  a[0] = b1;
> > > > > > > +  a[1] = b2;
> > > > > > > +  a[2] = b3;
> > > > > > > +  a[3] = b4;
> > > > > > > +  a[4] = b5;
> > > > > > > +  a[5] = b6;
> > > > > > > +  a[6] = b7;
> > > > > > > +  a[7] = b8;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi8 (_Complex char* a,
> > > > > > > +         _Complex char b1, _Complex char b2,
> > > > > > > +         _Complex char b3, _Complex char b4,
> > > > > > > +         _Complex char b5, _Complex char b6,
> > > > > > > +         _Complex char b7,_Complex char b8,
> > > > > > > +         _Complex char b9, _Complex char b10,
> > > > > > > +         _Complex char b11, _Complex char b12,
> > > > > > > +         _Complex char b13, _Complex char b14,
> > > > > > > +         _Complex char b15,_Complex char b16)
> > > > > > > +{
> > > > > > > +  a[0] = b1;
> > > > > > > +  a[1] = b2;
> > > > > > > +  a[2] = b3;
> > > > > > > +  a[3] = b4;
> > > > > > > +  a[4] = b5;
> > > > > > > +  a[5] = b6;
> > > > > > > +  a[6] = b7;
> > > > > > > +  a[7] = b8;
> > > > > > > +  a[8] = b9;
> > > > > > > +  a[9] = b10;
> > > > > > > +  a[10] = b11;
> > > > > > > +  a[11] = b12;
> > > > > > > +  a[12] = b13;
> > > > > > > +  a[13] = b14;
> > > > > > > +  a[14] = b15;
> > > > > > > +  a[15] = b16;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..e2e79508c4b
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > > > @@ -0,0 +1,67 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > +
> > > > > > > +#include "avx-check.h"
> > > > > > > +#include <string.h>
> > > > > > > +#include "pr106010-4a.c"
> > > > > > > +
> > > > > > > +void
> > > > > > > +avx_test (void)
> > > > > > > +{
> > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > +
> > > > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > +    p[i] = i;
> > > > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > > > +
> > > > > > > +  foo_pd (pd_dst, pd_src[0], pd_src[1]);
> > > > > > > +  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> > > > > > > +  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> > > > > > > +  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> > > > > > > +  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> > > > > > > +            epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> > > > > > > +  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> > > > > > > +           epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> > > > > > > +           epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> > > > > > > +           epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> > > > > > > +
> > > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +
> > > > > > > +  return;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..8e02aefe3b5
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > > > @@ -0,0 +1,54 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> > > > > > > +
> > > > > > > +#include <string.h>
> > > > > > > +
> > > > > > > +static void do_test (void);
> > > > > > > +#define DO_TEST do_test
> > > > > > > +#define AVX512FP16
> > > > > > > +#include "avx512-check.h"
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ph (_Complex _Float16* a,
> > > > > > > +       _Complex _Float16 b1, _Complex _Float16 b2,
> > > > > > > +       _Complex _Float16 b3, _Complex _Float16 b4,
> > > > > > > +       _Complex _Float16 b5, _Complex _Float16 b6,
> > > > > > > +       _Complex _Float16 b7,_Complex _Float16 b8)
> > > > > > > +{
> > > > > > > +  a[0] = b1;
> > > > > > > +  a[1] = b2;
> > > > > > > +  a[2] = b3;
> > > > > > > +  a[3] = b4;
> > > > > > > +  a[4] = b5;
> > > > > > > +  a[5] = b6;
> > > > > > > +  a[6] = b7;
> > > > > > > +  a[7] = b8;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +do_test (void)
> > > > > > > +{
> > > > > > > +
> > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > > > +
> > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > +
> > > > > > > +  __builtin_memset (ph_dst, 0, 32);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > +    p[i] = i;
> > > > > > > +
> > > > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > > > +
> > > > > > > +  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> > > > > > > +         ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> > > > > > > +
> > > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  return;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..9d4a6f9846b
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > > > @@ -0,0 +1,117 @@
> > > > > > > +/* { dg-do compile } */
> > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[2];
> > > > > > > +  a[1] = b[3];
> > > > > > > +  a[2] = b[0];
> > > > > > > +  a[3] = b[1];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[4];
> > > > > > > +  a[1] = b[5];
> > > > > > > +  a[2] = b[6];
> > > > > > > +  a[3] = b[7];
> > > > > > > +  a[4] = b[0];
> > > > > > > +  a[5] = b[1];
> > > > > > > +  a[6] = b[2];
> > > > > > > +  a[7] = b[3];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[2];
> > > > > > > +  a[1] = b[3];
> > > > > > > +  a[2] = b[0];
> > > > > > > +  a[3] = b[1];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[4];
> > > > > > > +  a[1] = b[5];
> > > > > > > +  a[2] = b[6];
> > > > > > > +  a[3] = b[7];
> > > > > > > +  a[4] = b[0];
> > > > > > > +  a[5] = b[1];
> > > > > > > +  a[6] = b[2];
> > > > > > > +  a[7] = b[3];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[8];
> > > > > > > +  a[1] = b[9];
> > > > > > > +  a[2] = b[10];
> > > > > > > +  a[3] = b[11];
> > > > > > > +  a[4] = b[12];
> > > > > > > +  a[5] = b[13];
> > > > > > > +  a[6] = b[14];
> > > > > > > +  a[7] = b[15];
> > > > > > > +  a[8] = b[0];
> > > > > > > +  a[9] = b[1];
> > > > > > > +  a[10] = b[2];
> > > > > > > +  a[11] = b[3];
> > > > > > > +  a[12] = b[4];
> > > > > > > +  a[13] = b[5];
> > > > > > > +  a[14] = b[6];
> > > > > > > +  a[15] = b[7];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[16];
> > > > > > > +  a[1] = b[17];
> > > > > > > +  a[2] = b[18];
> > > > > > > +  a[3] = b[19];
> > > > > > > +  a[4] = b[20];
> > > > > > > +  a[5] = b[21];
> > > > > > > +  a[6] = b[22];
> > > > > > > +  a[7] = b[23];
> > > > > > > +  a[8] = b[24];
> > > > > > > +  a[9] = b[25];
> > > > > > > +  a[10] = b[26];
> > > > > > > +  a[11] = b[27];
> > > > > > > +  a[12] = b[28];
> > > > > > > +  a[13] = b[29];
> > > > > > > +  a[14] = b[30];
> > > > > > > +  a[15] = b[31];
> > > > > > > +  a[16] = b[0];
> > > > > > > +  a[17] = b[1];
> > > > > > > +  a[18] = b[2];
> > > > > > > +  a[19] = b[3];
> > > > > > > +  a[20] = b[4];
> > > > > > > +  a[21] = b[5];
> > > > > > > +  a[22] = b[6];
> > > > > > > +  a[23] = b[7];
> > > > > > > +  a[24] = b[8];
> > > > > > > +  a[25] = b[9];
> > > > > > > +  a[26] = b[10];
> > > > > > > +  a[27] = b[11];
> > > > > > > +  a[28] = b[12];
> > > > > > > +  a[29] = b[13];
> > > > > > > +  a[30] = b[14];
> > > > > > > +  a[31] = b[15];
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..d5c6ebeb5cf
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > > > @@ -0,0 +1,80 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > +
> > > > > > > +#include "avx-check.h"
> > > > > > > +#include <string.h>
> > > > > > > +#include "pr106010-5a.c"
> > > > > > > +
> > > > > > > +void
> > > > > > > +avx_test (void)
> > > > > > > +{
> > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > > > > > +  char* p = (char* ) malloc (64);
> > > > > > > +  char* q = (char* ) malloc (64);
> > > > > > > +
> > > > > > > +  __builtin_memset (pd_dst, 0, 64);
> > > > > > > +  __builtin_memset (ps_dst, 0, 64);
> > > > > > > +  __builtin_memset (epi64_dst, 0, 64);
> > > > > > > +  __builtin_memset (epi32_dst, 0, 64);
> > > > > > > +  __builtin_memset (epi16_dst, 0, 64);
> > > > > > > +  __builtin_memset (epi8_dst, 0, 64);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > > +    {
> > > > > > > +      p[i] = i;
> > > > > > > +      q[i] = (i + 32) % 64;
> > > > > > > +    }
> > > > > > > +  __builtin_memcpy (pd_src, p, 64);
> > > > > > > +  __builtin_memcpy (ps_src, p, 64);
> > > > > > > +  __builtin_memcpy (epi64_src, p, 64);
> > > > > > > +  __builtin_memcpy (epi32_src, p, 64);
> > > > > > > +  __builtin_memcpy (epi16_src, p, 64);
> > > > > > > +  __builtin_memcpy (epi8_src, p, 64);
> > > > > > > +
> > > > > > > +  __builtin_memcpy (pd_exp, q, 64);
> > > > > > > +  __builtin_memcpy (ps_exp, q, 64);
> > > > > > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > > > > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > > > > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > > > > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > > > > > +
> > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > +
> > > > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +
> > > > > > > +  return;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..9ce4e6dd5c0
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > > > @@ -0,0 +1,62 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> > > > > > > +
> > > > > > > +#include <string.h>
> > > > > > > +
> > > > > > > +static void do_test (void);
> > > > > > > +#define DO_TEST do_test
> > > > > > > +#define AVX512FP16
> > > > > > > +#include "avx512-check.h"
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[8];
> > > > > > > +  a[1] = b[9];
> > > > > > > +  a[2] = b[10];
> > > > > > > +  a[3] = b[11];
> > > > > > > +  a[4] = b[12];
> > > > > > > +  a[5] = b[13];
> > > > > > > +  a[6] = b[14];
> > > > > > > +  a[7] = b[15];
> > > > > > > +  a[8] = b[0];
> > > > > > > +  a[9] = b[1];
> > > > > > > +  a[10] = b[2];
> > > > > > > +  a[11] = b[3];
> > > > > > > +  a[12] = b[4];
> > > > > > > +  a[13] = b[5];
> > > > > > > +  a[14] = b[6];
> > > > > > > +  a[15] = b[7];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +do_test (void)
> > > > > > > +{
> > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > > > > > +  char* p = (char* ) malloc (64);
> > > > > > > +  char* q = (char* ) malloc (64);
> > > > > > > +
> > > > > > > +  __builtin_memset (ph_dst, 0, 64);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > > +    {
> > > > > > > +      p[i] = i;
> > > > > > > +      q[i] = (i + 32) % 64;
> > > > > > > +    }
> > > > > > > +  __builtin_memcpy (ph_src, p, 64);
> > > > > > > +
> > > > > > > +  __builtin_memcpy (ph_exp, q, 64);
> > > > > > > +
> > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > +
> > > > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +
> > > > > > > +  return;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..65a90d03684
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > > > @@ -0,0 +1,115 @@
> > > > > > > +/* { dg-do compile } */
> > > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[3];
> > > > > > > +  a[1] = b[2];
> > > > > > > +  a[2] = b[1];
> > > > > > > +  a[3] = b[0];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[7];
> > > > > > > +  a[1] = b[6];
> > > > > > > +  a[2] = b[5];
> > > > > > > +  a[3] = b[4];
> > > > > > > +  a[4] = b[3];
> > > > > > > +  a[5] = b[2];
> > > > > > > +  a[6] = b[1];
> > > > > > > +  a[7] = b[0];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[3];
> > > > > > > +  a[1] = b[2];
> > > > > > > +  a[2] = b[1];
> > > > > > > +  a[3] = b[0];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[7];
> > > > > > > +  a[1] = b[6];
> > > > > > > +  a[2] = b[5];
> > > > > > > +  a[3] = b[4];
> > > > > > > +  a[4] = b[3];
> > > > > > > +  a[5] = b[2];
> > > > > > > +  a[6] = b[1];
> > > > > > > +  a[7] = b[0];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[15];
> > > > > > > +  a[1] = b[14];
> > > > > > > +  a[2] = b[13];
> > > > > > > +  a[3] = b[12];
> > > > > > > +  a[4] = b[11];
> > > > > > > +  a[5] = b[10];
> > > > > > > +  a[6] = b[9];
> > > > > > > +  a[7] = b[8];
> > > > > > > +  a[8] = b[7];
> > > > > > > +  a[9] = b[6];
> > > > > > > +  a[10] = b[5];
> > > > > > > +  a[11] = b[4];
> > > > > > > +  a[12] = b[3];
> > > > > > > +  a[13] = b[2];
> > > > > > > +  a[14] = b[1];
> > > > > > > +  a[15] = b[0];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[31];
> > > > > > > +  a[1] = b[30];
> > > > > > > +  a[2] = b[29];
> > > > > > > +  a[3] = b[28];
> > > > > > > +  a[4] = b[27];
> > > > > > > +  a[5] = b[26];
> > > > > > > +  a[6] = b[25];
> > > > > > > +  a[7] = b[24];
> > > > > > > +  a[8] = b[23];
> > > > > > > +  a[9] = b[22];
> > > > > > > +  a[10] = b[21];
> > > > > > > +  a[11] = b[20];
> > > > > > > +  a[12] = b[19];
> > > > > > > +  a[13] = b[18];
> > > > > > > +  a[14] = b[17];
> > > > > > > +  a[15] = b[16];
> > > > > > > +  a[16] = b[15];
> > > > > > > +  a[17] = b[14];
> > > > > > > +  a[18] = b[13];
> > > > > > > +  a[19] = b[12];
> > > > > > > +  a[20] = b[11];
> > > > > > > +  a[21] = b[10];
> > > > > > > +  a[22] = b[9];
> > > > > > > +  a[23] = b[8];
> > > > > > > +  a[24] = b[7];
> > > > > > > +  a[25] = b[6];
> > > > > > > +  a[26] = b[5];
> > > > > > > +  a[27] = b[4];
> > > > > > > +  a[28] = b[3];
> > > > > > > +  a[29] = b[2];
> > > > > > > +  a[30] = b[1];
> > > > > > > +  a[31] = b[0];
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..1c5bb020939
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > > > @@ -0,0 +1,157 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-require-effective-target avx2 } */
> > > > > > > +
> > > > > > > +#include "avx2-check.h"
> > > > > > > +#include <string.h>
> > > > > > > +#include "pr106010-6a.c"
> > > > > > > +
> > > > > > > +void
> > > > > > > +avx2_test (void)
> > > > > > > +{
> > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > > > > > +  char* p = (char* ) malloc (64);
> > > > > > > +  char* q = (char* ) malloc (64);
> > > > > > > +
> > > > > > > +  __builtin_memset (pd_dst, 0, 64);
> > > > > > > +  __builtin_memset (ps_dst, 0, 64);
> > > > > > > +  __builtin_memset (epi64_dst, 0, 64);
> > > > > > > +  __builtin_memset (epi32_dst, 0, 64);
> > > > > > > +  __builtin_memset (epi16_dst, 0, 64);
> > > > > > > +  __builtin_memset (epi8_dst, 0, 64);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > > +    p[i] = i;
> > > > > > > +
> > > > > > > +  __builtin_memcpy (pd_src, p, 64);
> > > > > > > +  __builtin_memcpy (ps_src, p, 64);
> > > > > > > +  __builtin_memcpy (epi64_src, p, 64);
> > > > > > > +  __builtin_memcpy (epi32_src, p, 64);
> > > > > > > +  __builtin_memcpy (epi16_src, p, 64);
> > > > > > > +  __builtin_memcpy (epi8_src, p, 64);
> > > > > > > +
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 16; i++)
> > > > > > > +    {
> > > > > > > +      q[i] = i + 48;
> > > > > > > +      q[i + 16] = i + 32;
> > > > > > > +      q[i + 32] = i + 16;
> > > > > > > +      q[i + 48] = i;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  __builtin_memcpy (pd_exp, q, 64);
> > > > > > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > > > > > +
> > > > > > > +   for (int i = 0; i != 8; i++)
> > > > > > > +    {
> > > > > > > +      q[i] = i + 56;
> > > > > > > +      q[i + 8] = i + 48;
> > > > > > > +      q[i + 16] = i + 40;
> > > > > > > +      q[i + 24] = i + 32;
> > > > > > > +      q[i + 32] = i + 24;
> > > > > > > +      q[i + 40] = i + 16;
> > > > > > > +      q[i + 48] = i + 8;
> > > > > > > +      q[i + 56] = i;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  __builtin_memcpy (ps_exp, q, 64);
> > > > > > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > > +    {
> > > > > > > +      q[i] = i + 60;
> > > > > > > +      q[i + 4] = i + 56;
> > > > > > > +      q[i + 8] = i + 52;
> > > > > > > +      q[i + 12] = i + 48;
> > > > > > > +      q[i + 16] = i + 44;
> > > > > > > +      q[i + 20] = i + 40;
> > > > > > > +      q[i + 24] = i + 36;
> > > > > > > +      q[i + 28] = i + 32;
> > > > > > > +      q[i + 32] = i + 28;
> > > > > > > +      q[i + 36] = i + 24;
> > > > > > > +      q[i + 40] = i + 20;
> > > > > > > +      q[i + 44] = i + 16;
> > > > > > > +      q[i + 48] = i + 12;
> > > > > > > +      q[i + 52] = i + 8;
> > > > > > > +      q[i + 56] = i + 4;
> > > > > > > +      q[i + 60] = i;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 2; i++)
> > > > > > > +    {
> > > > > > > +      q[i] = i + 62;
> > > > > > > +      q[i + 2] = i + 60;
> > > > > > > +      q[i + 4] = i + 58;
> > > > > > > +      q[i + 6] = i + 56;
> > > > > > > +      q[i + 8] = i + 54;
> > > > > > > +      q[i + 10] = i + 52;
> > > > > > > +      q[i + 12] = i + 50;
> > > > > > > +      q[i + 14] = i + 48;
> > > > > > > +      q[i + 16] = i + 46;
> > > > > > > +      q[i + 18] = i + 44;
> > > > > > > +      q[i + 20] = i + 42;
> > > > > > > +      q[i + 22] = i + 40;
> > > > > > > +      q[i + 24] = i + 38;
> > > > > > > +      q[i + 26] = i + 36;
> > > > > > > +      q[i + 28] = i + 34;
> > > > > > > +      q[i + 30] = i + 32;
> > > > > > > +      q[i + 32] = i + 30;
> > > > > > > +      q[i + 34] = i + 28;
> > > > > > > +      q[i + 36] = i + 26;
> > > > > > > +      q[i + 38] = i + 24;
> > > > > > > +      q[i + 40] = i + 22;
> > > > > > > +      q[i + 42] = i + 20;
> > > > > > > +      q[i + 44] = i + 18;
> > > > > > > +      q[i + 46] = i + 16;
> > > > > > > +      q[i + 48] = i + 14;
> > > > > > > +      q[i + 50] = i + 12;
> > > > > > > +      q[i + 52] = i + 10;
> > > > > > > +      q[i + 54] = i + 8;
> > > > > > > +      q[i + 56] = i + 6;
> > > > > > > +      q[i + 58] = i + 4;
> > > > > > > +      q[i + 60] = i + 2;
> > > > > > > +      q[i + 62] = i;
> > > > > > > +    }
> > > > > > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > > > > > +
> > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > +
> > > > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +
> > > > > > > +  return;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..b859d884a7f
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > > > @@ -0,0 +1,80 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> > > > > > > +
> > > > > > > +#include <string.h>
> > > > > > > +
> > > > > > > +static void do_test (void);
> > > > > > > +#define DO_TEST do_test
> > > > > > > +#define AVX512FP16
> > > > > > > +#include "avx512-check.h"
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > > +{
> > > > > > > +  a[0] = b[15];
> > > > > > > +  a[1] = b[14];
> > > > > > > +  a[2] = b[13];
> > > > > > > +  a[3] = b[12];
> > > > > > > +  a[4] = b[11];
> > > > > > > +  a[5] = b[10];
> > > > > > > +  a[6] = b[9];
> > > > > > > +  a[7] = b[8];
> > > > > > > +  a[8] = b[7];
> > > > > > > +  a[9] = b[6];
> > > > > > > +  a[10] = b[5];
> > > > > > > +  a[11] = b[4];
> > > > > > > +  a[12] = b[3];
> > > > > > > +  a[13] = b[2];
> > > > > > > +  a[14] = b[1];
> > > > > > > +  a[15] = b[0];
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +do_test (void)
> > > > > > > +{
> > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > > > > > +  char* p = (char* ) malloc (64);
> > > > > > > +  char* q = (char* ) malloc (64);
> > > > > > > +
> > > > > > > +  __builtin_memset (ph_dst, 0, 64);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > > +    p[i] = i;
> > > > > > > +
> > > > > > > +  __builtin_memcpy (ph_src, p, 64);
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > > +    {
> > > > > > > +      q[i] = i + 60;
> > > > > > > +      q[i + 4] = i + 56;
> > > > > > > +      q[i + 8] = i + 52;
> > > > > > > +      q[i + 12] = i + 48;
> > > > > > > +      q[i + 16] = i + 44;
> > > > > > > +      q[i + 20] = i + 40;
> > > > > > > +      q[i + 24] = i + 36;
> > > > > > > +      q[i + 28] = i + 32;
> > > > > > > +      q[i + 32] = i + 28;
> > > > > > > +      q[i + 36] = i + 24;
> > > > > > > +      q[i + 40] = i + 20;
> > > > > > > +      q[i + 44] = i + 16;
> > > > > > > +      q[i + 48] = i + 12;
> > > > > > > +      q[i + 52] = i + 8;
> > > > > > > +      q[i + 56] = i + 4;
> > > > > > > +      q[i + 60] = i;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  __builtin_memcpy (ph_exp, q, 64);
> > > > > > > +
> > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > +
> > > > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +
> > > > > > > +  return;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..2ea01fac927
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > > > @@ -0,0 +1,58 @@
> > > > > > > +/* { dg-do compile } */
> > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > > > > > +
> > > > > > > +#define N 10000
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_pd (_Complex double* a, _Complex double b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ps (_Complex float* a, _Complex float b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi32 (_Complex int* a, _Complex int b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi16 (_Complex short* a, _Complex short b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi8 (_Complex char* a, _Complex char b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..26482cc10f5
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > > > @@ -0,0 +1,63 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > +
> > > > > > > +#include "avx-check.h"
> > > > > > > +#include <string.h>
> > > > > > > +#include "pr106010-7a.c"
> > > > > > > +
> > > > > > > +void
> > > > > > > +avx_test (void)
> > > > > > > +{
> > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > > > > > +
> > > > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > > > > > +    p_init[i] = i % 2 + 3;
> > > > > > > +
> > > > > > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > > > > > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > > > > > > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > > > > > > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > > > > > > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > > > > > > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > > > > > > +
> > > > > > > +  foo_pd (pd_dst, pd_src[0]);
> > > > > > > +  foo_ps (ps_dst, ps_src[0]);
> > > > > > > +  foo_epi64 (epi64_dst, epi64_src[0]);
> > > > > > > +  foo_epi32 (epi32_dst, epi32_src[0]);
> > > > > > > +  foo_epi16 (epi16_dst, epi16_src[0]);
> > > > > > > +  foo_epi8 (epi8_dst, epi8_src[0]);
> > > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +
> > > > > > > +  return;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..7f4056a5ecc
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > > > @@ -0,0 +1,41 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > +
> > > > > > > +#include <string.h>
> > > > > > > +
> > > > > > > +static void do_test (void);
> > > > > > > +
> > > > > > > +#define DO_TEST do_test
> > > > > > > +#define AVX512FP16
> > > > > > > +#include "avx512-check.h"
> > > > > > > +
> > > > > > > +#define N 10000
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = b;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void
> > > > > > > +do_test (void)
> > > > > > > +{
> > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > > > > > +
> > > > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > > > +
> > > > > > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > > > > > +    p_init[i] = i % 2 + 3;
> > > > > > > +
> > > > > > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > > > > > +
> > > > > > > +  foo_ph (ph_dst, ph_src[0]);
> > > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > > > > > +    __builtin_abort ();
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..11054b60d30
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > > > @@ -0,0 +1,58 @@
> > > > > > > +/* { dg-do compile } */
> > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > > > > > +
> > > > > > > +#define N 10000
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_pd (_Complex double* a)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = 1.0 + 2.0i;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ps (_Complex float* a)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = 1.0f + 2.0fi;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi64 (_Complex long long* a)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = 1 + 2i;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi32 (_Complex int* a)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = 1 + 2i;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi16 (_Complex short* a)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = 1 + 2i;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_epi8 (_Complex char* a)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = 1 + 2i;
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..6bb0073b691
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > > > @@ -0,0 +1,53 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > +
> > > > > > > +#include "avx-check.h"
> > > > > > > +#include <string.h>
> > > > > > > +#include "pr106010-8a.c"
> > > > > > > +
> > > > > > > +void
> > > > > > > +avx_test (void)
> > > > > > > +{
> > > > > > > +  _Complex double pd_src = 1.0 + 2.0i;
> > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > +  _Complex float ps_src = 1.0 + 2.0i;
> > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > +  _Complex long long epi64_src = 1 + 2i;
> > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > +  _Complex int epi32_src = 1 + 2i;
> > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > +  _Complex short epi16_src = 1 + 2i;
> > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > +  _Complex char epi8_src = 1 + 2i;
> > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > +
> > > > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > > > +
> > > > > > > +  foo_pd (pd_dst);
> > > > > > > +  foo_ps (ps_dst);
> > > > > > > +  foo_epi64 (epi64_dst);
> > > > > > > +  foo_epi32 (epi32_dst);
> > > > > > > +  foo_epi16 (epi16_dst);
> > > > > > > +  foo_epi8 (epi8_dst);
> > > > > > > +  for (int i = 0 ; i != N; i++)
> > > > > > > +    {
> > > > > > > +      if (pd_dst[i] != pd_src)
> > > > > > > +       __builtin_abort ();
> > > > > > > +      if (ps_dst[i] != ps_src)
> > > > > > > +       __builtin_abort ();
> > > > > > > +      if (epi64_dst[i] != epi64_src)
> > > > > > > +       __builtin_abort ();
> > > > > > > +      if (epi32_dst[i] != epi32_src)
> > > > > > > +       __builtin_abort ();
> > > > > > > +      if (epi16_dst[i] != epi16_src)
> > > > > > > +       __builtin_abort ();
> > > > > > > +      if (epi8_dst[i] != epi8_src)
> > > > > > > +       __builtin_abort ();
> > > > > > > +    }
> > > > > > > +}
> > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > > > new file mode 100644
> > > > > > > index 00000000000..61ae131829d
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > > > @@ -0,0 +1,38 @@
> > > > > > > +/* { dg-do run } */
> > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > +
> > > > > > > +#include <string.h>
> > > > > > > +
> > > > > > > +static void do_test (void);
> > > > > > > +
> > > > > > > +#define DO_TEST do_test
> > > > > > > +#define AVX512FP16
> > > > > > > +#include "avx512-check.h"
> > > > > > > +
> > > > > > > +#define N 10000
> > > > > > > +
> > > > > > > +void
> > > > > > > +__attribute__((noipa))
> > > > > > > +foo_ph (_Complex _Float16* a)
> > > > > > > +{
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    a[i] = 1.0f16 + 2.0f16i;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void
> > > > > > > +do_test (void)
> > > > > > > +{
> > > > > > > +  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > +
> > > > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > > > +
> > > > > > > +  foo_ph (ph_dst);
> > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > +    {
> > > > > > > +      if (ph_dst[i] != ph_src)
> > > > > > > +       __builtin_abort ();
> > > > > > > +    }
> > > > > > > +}
> > > > > > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > > > > > > index d20a10a1524..42ee9df674c 100644
> > > > > > > --- a/gcc/tree-vect-data-refs.cc
> > > > > > > +++ b/gcc/tree-vect-data-refs.cc
> > > > > > > @@ -1403,7 +1403,8 @@ vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
> > > > > > >    if (PURE_SLP_STMT (stmt_info))
> > > > > > >      ncopies = 1;
> > > > > > >    else
> > > > > > > -    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
> > > > > > > +    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info),
> > > > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > > > >
> > > > > > >    if (DR_IS_READ (dr_info->dr))
> > > > > > >      vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> > > > > > > @@ -4597,8 +4598,22 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > > > > > >
> > > > > > >        /* Set vectype for STMT.  */
> > > > > > >        scalar_type = TREE_TYPE (DR_REF (dr));
> > > > > > > -      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
> > > > > > > -      if (!vectype)
> > > > > > > +      tree adjust_scalar_type = scalar_type;
> > > > > > > +      /* Support Complex type access. Note that the complex type of load/store
> > > > > > > +        does not support gather/scatter.  */
> > > > > > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE
> > > > > > > +         && gatherscatter == SG_NONE)
> > > > > > > +       {
> > > > > > > +         adjust_scalar_type = TREE_TYPE (scalar_type);
> > > > > > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > > > +       }
> > > > > > > +      tree vectype = get_vectype_for_scalar_type (vinfo, adjust_scalar_type);
> > > > > > > +      unsigned HOST_WIDE_INT constant_nunits;
> > > > > > > +      if (!vectype
> > > > > > > +         /* For complex type, V1DI doesn't make sense.  */
> > > > > > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > > > +             && (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&constant_nunits)
> > > > > > > +                 || constant_nunits == 1)))
> > > > > > >          {
> > > > > > >            if (dump_enabled_p ())
> > > > > > >              {
> > > > > > > @@ -4635,8 +4650,11 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > > > > > >         }
> > > > > > >
> > > > > > >        /* Adjust the minimal vectorization factor according to the
> > > > > > > -        vector type.  */
> > > > > > > +        vector type.  Note that for complex types, VF is half of
> > > > > > > +        TYPE_VECTOR_SUBPARTS.  */
> > > > > > >        vf = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > +       vf = exact_div (vf, 2);
> > > > > > >        *min_vf = upper_bound (*min_vf, vf);
> > > > > > >
> > > > > > >        /* Leave the BB vectorizer to pick the vector type later, based on
> > > > > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > > > > > index 3a70c15b593..365fa738022 100644
> > > > > > > --- a/gcc/tree-vect-loop.cc
> > > > > > > +++ b/gcc/tree-vect-loop.cc
> > > > > > > @@ -200,7 +200,12 @@ vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > >      }
> > > > > > >
> > > > > > >    if (nunits_vectype)
> > > > > > > -    vect_update_max_nunits (vf, nunits_vectype);
> > > > > > > +    {
> > > > > > > +      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (nunits_vectype);
> > > > > > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > +       nunits = exact_div (nunits, 2);
> > > > > > > +      vect_update_max_nunits (vf, nunits);
> > > > > > > +    }
> > > > > > >
> > > > > > >    return opt_result::success ();
> > > > > > >  }
> > > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > > > > > > index dab5daddcc5..5d66ea2f286 100644
> > > > > > > --- a/gcc/tree-vect-slp.cc
> > > > > > > +++ b/gcc/tree-vect-slp.cc
> > > > > > > @@ -877,10 +877,14 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > >        return false;
> > > > > > >      }
> > > > > > >
> > > > > > > +  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > +    nunits = exact_div (nunits, 2);
> > > > > > > +
> > > > > > >    /* If populating the vector type requires unrolling then fail
> > > > > > >       before adjusting *max_nunits for basic-block vectorization.  */
> > > > > > >    if (is_a <bb_vec_info> (vinfo)
> > > > > > > -      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> > > > > > > +      && !multiple_p (group_size , nunits))
> > > > > > >      {
> > > > > > >        if (dump_enabled_p ())
> > > > > > >         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > > > @@ -891,7 +895,7 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > >      }
> > > > > > >
> > > > > > >    /* In case of multiple types we need to detect the smallest type.  */
> > > > > > > -  vect_update_max_nunits (max_nunits, vectype);
> > > > > > > +  vect_update_max_nunits (max_nunits, nunits);
> > > > > > >    return true;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -3720,22 +3724,54 @@ vect_optimize_slp (vec_info *vinfo)
> > > > > > >          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
> > > > > > >          when permuting constants and invariants keeping the permute
> > > > > > >          bijective.  */
> > > > > > > -      auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > > > > > -      bitmap_clear (load_index);
> > > > > > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > -       bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > > > > > -      unsigned j;
> > > > > > > -      for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > -       if (!bitmap_bit_p (load_index, j))
> > > > > > > -         break;
> > > > > > > -      if (j != SLP_TREE_LANES (node))
> > > > > > > -       continue;
> > > > > > > +      /* Permutation of Complex type.  */
> > > > > > > +      if (STMT_VINFO_COMPLEX_P (dr_stmt))
> > > > > > > +       {
> > > > > > > +         auto_sbitmap load_index (SLP_TREE_LANES (node) * 2);
> > > > > > > +         bitmap_clear (load_index);
> > > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > +           {
> > > > > > > +             unsigned bit = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > > +             bitmap_set_bit (load_index, 2 * bit);
> > > > > > > +             bitmap_set_bit (load_index, 2 * bit + 1);
> > > > > > > +           }
> > > > > > > +         unsigned j;
> > > > > > > +         for (j = 0; j < SLP_TREE_LANES (node) * 2; ++j)
> > > > > > > +           if (!bitmap_bit_p (load_index, j))
> > > > > > > +             break;
> > > > > > > +         if (j != SLP_TREE_LANES (node) * 2)
> > > > > > > +           continue;
> > > > > > >
> > > > > > > -      vec<unsigned> perm = vNULL;
> > > > > > > -      perm.safe_grow (SLP_TREE_LANES (node), true);
> > > > > > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > -       perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > > -      perms.safe_push (perm);
> > > > > > > +         vec<unsigned> perm = vNULL;
> > > > > > > +         perm.safe_grow (SLP_TREE_LANES (node) * 2, true);
> > > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > +           {
> > > > > > > +             unsigned cidx = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > > +             perm[2 * j] = 2 * cidx;
> > > > > > > +             perm[2 * j + 1] = 2 * cidx + 1;
> > > > > > > +           }
> > > > > > > +         perms.safe_push (perm);
> > > > > > > +       }
> > > > > > > +      else
> > > > > > > +       {
> > > > > > > +         auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > > > > > +         bitmap_clear (load_index);
> > > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > +           bitmap_set_bit (load_index,
> > > > > > > +                           SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > > > > > +         unsigned j;
> > > > > > > +         for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > +           if (!bitmap_bit_p (load_index, j))
> > > > > > > +             break;
> > > > > > > +         if (j != SLP_TREE_LANES (node))
> > > > > > > +           continue;
> > > > > > > +
> > > > > > > +         vec<unsigned> perm = vNULL;
> > > > > > > +         perm.safe_grow (SLP_TREE_LANES (node), true);
> > > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > +           perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > > +         perms.safe_push (perm);
> > > > > > > +       }
> > > > > > >        vertices[idx].perm_in = perms.length () - 1;
> > > > > > >        vertices[idx].perm_out = perms.length () - 1;
> > > > > > >      }
> > > > > > > @@ -4518,6 +4554,12 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
> > > > > > >         vf = loop_vinfo->vectorization_factor;
> > > > > > >        else
> > > > > > >         vf = 1;
> > > > > > > +      /* For complex type and SLP, double vf to get right vectype.
> > > > > > > +        i.e. vector(4) double for complex double, group size is 2, double vf
> > > > > > > +        to map vf * group_size to TYPE_VECTOR_SUBPARTS.  */
> > > > > > > +     if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > +       vf *= 2;
> > > > > > > +
> > > > > > >        unsigned int group_size = SLP_TREE_LANES (node);
> > > > > > >        tree vectype = SLP_TREE_VECTYPE (node);
> > > > > > >        SLP_TREE_NUMBER_OF_VEC_STMTS (node)
> > > > > > > @@ -4763,10 +4805,17 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
> > > > > > >             }
> > > > > > >           unsigned group_size = SLP_TREE_LANES (child);
> > > > > > >           poly_uint64 vf = 1;
> > > > > > > +
> > > > > > >           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
> > > > > > >             vf = loop_vinfo->vectorization_factor;
> > > > > > > +
> > > > > > > +         /* V2SF is just 1 complex type, so multiply by 2
> > > > > > > +            to get the real number of vectors.  */
> > > > > > > +         unsigned cp
> > > > > > > +           = STMT_VINFO_COMPLEX_P (SLP_TREE_REPRESENTATIVE (node)) ? 2 : 1;
> > > > > > > +
> > > > > > >           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
> > > > > > > -           = vect_get_num_vectors (vf * group_size, vector_type);
> > > > > > > +           = vect_get_num_vectors (vf * group_size * cp, vector_type);
> > > > > > >           /* And cost them.  */
> > > > > > >           vect_prologue_cost_for_slp (child, cost_vec);
> > > > > > >         }
> > > > > > > @@ -6402,6 +6451,11 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > > >
> > > > > > >    /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
> > > > > > >    vector_type = SLP_TREE_VECTYPE (op_node);
> > > > > > > +  unsigned int cp = 1;
> > > > > > > +  /* Handle Complex type vector init.
> > > > > > > +     SLP_TREE_REPRESENTATIVE (op_node) could be NULL.  */
> > > > > > > +  if (TREE_CODE (TREE_TYPE (op_node->ops[0])) == COMPLEX_TYPE)
> > > > > > > +    cp = 2;
> > > > > > >
> > > > > > >    unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
> > > > > > >    SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
> > > > > > > @@ -6426,9 +6480,9 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > > >    /* When using duplicate_and_interleave, we just need one element for
> > > > > > >       each scalar statement.  */
> > > > > > >    if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
> > > > > > > -    nunits = group_size;
> > > > > > > +    nunits = group_size * cp;
> > > > > > >
> > > > > > > -  number_of_copies = nunits * number_of_vectors / group_size;
> > > > > > > +  number_of_copies = nunits * number_of_vectors / (group_size * cp);
> > > > > > >
> > > > > > >    number_of_places_left_in_vector = nunits;
> > > > > > >    constant_p = true;
> > > > > > > @@ -6460,8 +6514,23 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > > >                         gcc_unreachable ();
> > > > > > >                     }
> > > > > > >                   else
> > > > > > > -                   op = fold_unary (VIEW_CONVERT_EXPR,
> > > > > > > -                                    TREE_TYPE (vector_type), op);
> > > > > > > +                   {
> > > > > > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > > > > > +                     /* For complex type, insert real and imag part
> > > > > > > +                        separately.  */
> > > > > > > +                     if (cp == 2)
> > > > > > > +                       {
> > > > > > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > > > > > +                                      == COMPLEX_TYPE)
> > > > > > > +                                     && (scalar_type
> > > > > > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > > > > > +                         elts[number_of_places_left_in_vector--]
> > > > > > > +                           = fold_unary (IMAGPART_EXPR, scalar_type, op);
> > > > > > > +                         op = fold_unary (REALPART_EXPR, scalar_type, op);
> > > > > > > +                       }
> > > > > > > +                     else
> > > > > > > +                       op = fold_unary (VIEW_CONVERT_EXPR, scalar_type, op);
> > > > > > > +                   }
> > > > > > >                   gcc_assert (op && CONSTANT_CLASS_P (op));
> > > > > > >                 }
> > > > > > >               else
> > > > > > > @@ -6481,11 +6550,28 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > > >                     }
> > > > > > >                   else
> > > > > > >                     {
> > > > > > > -                     op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
> > > > > > > -                                  op);
> > > > > > > -                     init_stmt
> > > > > > > -                       = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > > > > > -                                              op);
> > > > > > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > > > > > +                     if (cp == 2)
> > > > > > > +                       {
> > > > > > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > > > > > +                                      == COMPLEX_TYPE)
> > > > > > > +                                     && (scalar_type
> > > > > > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > > > > > +                         tree imag = build1 (IMAGPART_EXPR, scalar_type, op);
> > > > > > > +                         op = build1 (REALPART_EXPR, scalar_type, op);
> > > > > > > +                         tree imag_temp = make_ssa_name (scalar_type);
> > > > > > > +                         elts[number_of_places_left_in_vector--] = imag_temp;
> > > > > > > +                         init_stmt = gimple_build_assign (imag_temp, imag);
> > > > > > > +                         gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > > > > > +                         init_stmt = gimple_build_assign (new_temp, op);
> > > > > > > +                       }
> > > > > > > +                     else
> > > > > > > +                       {
> > > > > > > +                         op = build1 (VIEW_CONVERT_EXPR, scalar_type, op);
> > > > > > > +                         init_stmt
> > > > > > > +                           = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > > > > > +                                                  op);
> > > > > > > +                       }
> > > > > > >                     }
> > > > > > >                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > > > > >                   op = new_temp;
> > > > > > > @@ -6696,15 +6782,17 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > >    unsigned int nelts_to_build;
> > > > > > >    unsigned int nvectors_per_build;
> > > > > > >    unsigned int in_nlanes;
> > > > > > > +  unsigned int cp = STMT_VINFO_COMPLEX_P (stmt_info) ? 2 : 1;
> > > > > > >    bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
> > > > > > > -                     && multiple_p (nunits, group_size));
> > > > > > > +                     && multiple_p (nunits, group_size * cp));
> > > > > > >    if (repeating_p)
> > > > > > >      {
> > > > > > >        /* A single vector contains a whole number of copies of the node, so:
> > > > > > >          (a) all permutes can use the same mask; and
> > > > > > >          (b) the permutes only need a single vector input.  */
> > > > > > > -      mask.new_vector (nunits, group_size, 3);
> > > > > > > -      nelts_to_build = mask.encoded_nelts ();
> > > > > > > +      /* For complex type, mask size should be double of nelts_to_build.  */
> > > > > > > +      mask.new_vector (nunits, group_size * cp, 3);
> > > > > > > +      nelts_to_build = mask.encoded_nelts () / cp;
> > > > > > >        nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
> > > > > > >        in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
> > > > > > >      }
> > > > > > > @@ -6744,8 +6832,8 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > >         {
> > > > > > >           /* Enforced before the loop when !repeating_p.  */
> > > > > > >           unsigned int const_nunits = nunits.to_constant ();
> > > > > > > -         vec_index = i / const_nunits;
> > > > > > > -         mask_element = i % const_nunits;
> > > > > > > +         vec_index = i / (const_nunits / cp);
> > > > > > > +         mask_element = i % (const_nunits / cp);
> > > > > > >           if (vec_index == first_vec_index
> > > > > > >               || first_vec_index == -1)
> > > > > > >             {
> > > > > > > @@ -6755,7 +6843,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > >                    || second_vec_index == -1)
> > > > > > >             {
> > > > > > >               second_vec_index = vec_index;
> > > > > > > -             mask_element += const_nunits;
> > > > > > > +             mask_element += (const_nunits / cp);
> > > > > > >             }
> > > > > > >           else
> > > > > > >             {
> > > > > > > @@ -6768,14 +6856,24 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > >               return false;
> > > > > > >             }
> > > > > > >
> > > > > > > -         gcc_assert (mask_element < 2 * const_nunits);
> > > > > > > +         gcc_assert (mask_element < 2 * const_nunits / cp);
> > > > > > >         }
> > > > > > >
> > > > > > >        if (mask_element != index)
> > > > > > >         noop_p = false;
> > > > > > > -      mask[index++] = mask_element;
> > > > > > > +      /* Set index for _Complex type.
> > > > > > > +        i.e. mask like [1,0] is actually [2, 3, 0, 1]
> > > > > > > +        for vector scalar type.  */
> > > > > > > +      if (cp == 2)
> > > > > > > +       {
> > > > > > > +         mask[2 * index] = 2 * mask_element;
> > > > > > > +         mask[2 * index + 1] = 2 * mask_element + 1;
> > > > > > > +       }
> > > > > > > +      else
> > > > > > > +       mask[index] = mask_element;
> > > > > > > +      index++;
> > > > > > >
> > > > > > > -      if (index == count && !noop_p)
> > > > > > > +      if (index * cp == count && !noop_p)
> > > > > > >         {
> > > > > > >           indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
> > > > > > >           if (!can_vec_perm_const_p (mode, mode, indices))
> > > > > > > @@ -6799,7 +6897,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > >           ++*n_perms;
> > > > > > >         }
> > > > > > >
> > > > > > > -      if (index == count)
> > > > > > > +      if (index * cp == count)
> > > > > > >         {
> > > > > > >           if (!analyze_only)
> > > > > > >             {
> > > > > > > @@ -6869,7 +6967,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > >           bool load_seen = false;
> > > > > > >           for (unsigned i = 0; i < in_nlanes; ++i)
> > > > > > >             {
> > > > > > > -             if (i % const_nunits == 0)
> > > > > > > +             if (i % (const_nunits * cp) == 0)
> > > > > > >                 {
> > > > > > >                   if (load_seen)
> > > > > > >                     *n_loads += 1;
> > > > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > > > > > index 72107afc883..8af3b558be4 100644
> > > > > > > --- a/gcc/tree-vect-stmts.cc
> > > > > > > +++ b/gcc/tree-vect-stmts.cc
> > > > > > > @@ -1397,25 +1397,70 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > > > > > >  {
> > > > > > >    gimple *init_stmt;
> > > > > > >    tree new_temp;
> > > > > > > +  tree scalar_type = TREE_TYPE (type);
> > > > > > > +  gimple_seq stmts = NULL;
> > > > > > > +
> > > > > > > +  if (TREE_CODE (TREE_TYPE (val)) == COMPLEX_TYPE)
> > > > > > > +    {
> > > > > > > +      unsigned HOST_WIDE_INT nunits;
> > > > > > > +      gcc_assert (TYPE_VECTOR_SUBPARTS (type).is_constant (&nunits));
> > > > > > >
> > > > > > > +      tree_vector_builder elts (type, nunits, 1);
> > > > > > > +      tree imag, real;
> > > > > > > +      if (TREE_CODE (val) == COMPLEX_CST)
> > > > > > > +       {
> > > > > > > +         real = fold_unary (REALPART_EXPR, scalar_type, val);
> > > > > > > +         imag = fold_unary (IMAGPART_EXPR, scalar_type, val);
> > > > > > > +       }
> > > > > > > +      else
> > > > > > > +       {
> > > > > > > +         real = make_ssa_name (scalar_type);
> > > > > > > +         imag = make_ssa_name (scalar_type);
> > > > > > > +         init_stmt
> > > > > > > +           = gimple_build_assign (real,
> > > > > > > +                                  build1 (REALPART_EXPR, scalar_type, val));
> > > > > > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > > > > > +         init_stmt
> > > > > > > +           = gimple_build_assign (imag,
> > > > > > > +                                  build1 (IMAGPART_EXPR, scalar_type, val));
> > > > > > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > > > > > +       }
> > > > > > > +
> > > > > > > +      /* Build vector as [real,imag,real,imag,...].  */
> > > > > > > +      for (unsigned i = 0; i != nunits; i++)
> > > > > > > +       {
> > > > > > > +         if (i % 2)
> > > > > > > +           elts.quick_push (imag);
> > > > > > > +         else
> > > > > > > +           elts.quick_push (real);
> > > > > > > +       }
> > > > > > > +      val = gimple_build_vector (&stmts, &elts);
> > > > > > > +      if (!gimple_seq_empty_p (stmts))
> > > > > > > +       {
> > > > > > > +         if (gsi)
> > > > > > > +           gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > > > > > > +         else
> > > > > > > +           vinfo->insert_seq_on_entry (stmt_info, stmts);
> > > > > > > +       }
> > > > > > > +    }
> > > > > > >    /* We abuse this function to push sth to a SSA name with initial 'val'.  */
> > > > > > > -  if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > > > > > +  else if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > > > > >      {
> > > > > > >        gcc_assert (TREE_CODE (type) == VECTOR_TYPE);
> > > > > > > -      if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
> > > > > > > +      if (! types_compatible_p (scalar_type, TREE_TYPE (val)))
> > > > > > >         {
> > > > > > >           /* Scalar boolean value should be transformed into
> > > > > > >              all zeros or all ones value before building a vector.  */
> > > > > > >           if (VECTOR_BOOLEAN_TYPE_P (type))
> > > > > > >             {
> > > > > > > -             tree true_val = build_all_ones_cst (TREE_TYPE (type));
> > > > > > > -             tree false_val = build_zero_cst (TREE_TYPE (type));
> > > > > > > +             tree true_val = build_all_ones_cst (scalar_type);
> > > > > > > +             tree false_val = build_zero_cst (scalar_type);
> > > > > > >
> > > > > > >               if (CONSTANT_CLASS_P (val))
> > > > > > >                 val = integer_zerop (val) ? false_val : true_val;
> > > > > > >               else
> > > > > > >                 {
> > > > > > > -                 new_temp = make_ssa_name (TREE_TYPE (type));
> > > > > > > +                 new_temp = make_ssa_name (scalar_type);
> > > > > > >                   init_stmt = gimple_build_assign (new_temp, COND_EXPR,
> > > > > > >                                                    val, true_val, false_val);
> > > > > > >                   vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
> > > > > > > @@ -1424,14 +1469,13 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > > > > > >             }
> > > > > > >           else
> > > > > > >             {
> > > > > > > -             gimple_seq stmts = NULL;
> > > > > > >               if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
> > > > > > >                 val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
> > > > > > > -                                   TREE_TYPE (type), val);
> > > > > > > +                                   scalar_type, val);
> > > > > > >               else
> > > > > > >                 /* ???  Condition vectorization expects us to do
> > > > > > >                    promotion of invariant/external defs.  */
> > > > > > > -               val = gimple_convert (&stmts, TREE_TYPE (type), val);
> > > > > > > +               val = gimple_convert (&stmts, scalar_type, val);
> > > > > > >               for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
> > > > > > >                    !gsi_end_p (gsi2); )
> > > > > > >                 {
> > > > > > > @@ -1496,7 +1540,12 @@ vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> > > > > > >                && VECTOR_BOOLEAN_TYPE_P (stmt_vectype))
> > > > > > >         vector_type = truth_type_for (stmt_vectype);
> > > > > > >        else
> > > > > > > -       vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
> > > > > > > +       {
> > > > > > > +         tree scalar_type = TREE_TYPE (op);
> > > > > > > +         if (STMT_VINFO_COMPLEX_P (stmt_vinfo))
> > > > > > > +           scalar_type = TREE_TYPE (scalar_type);
> > > > > > > +         vector_type = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
> > > > > > > +       }
> > > > > > >
> > > > > > >        gcc_assert (vector_type);
> > > > > > >        tree vop = vect_init_vector (vinfo, stmt_vinfo, op, vector_type, NULL);
> > > > > > > @@ -7509,8 +7558,17 @@ vectorizable_store (vec_info *vinfo,
> > > > > > >       same location twice.  */
> > > > > > >    gcc_assert (slp == PURE_SLP_STMT (stmt_info));
> > > > > > >
> > > > > > > +  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > > > > > +    return false;
> > > > > > > +
> > > > > > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info), rhs_vectype = NULL_TREE;
> > > > > > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > +    {
> > > > > > > +      if (!nunits.is_constant ())
> > > > > > > +       return false;
> > > > > > > +      nunits = exact_div (nunits, 2);
> > > > > > > +    }
> > > > > > >
> > > > > > >    if (loop_vinfo)
> > > > > > >      {
> > > > > > > @@ -7526,7 +7584,8 @@ vectorizable_store (vec_info *vinfo,
> > > > > > >    if (slp)
> > > > > > >      ncopies = 1;
> > > > > > >    else
> > > > > > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > > > > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > > > >
> > > > > > >    gcc_assert (ncopies >= 1);
> > > > > > >
> > > > > > > @@ -7546,9 +7605,6 @@ vectorizable_store (vec_info *vinfo,
> > > > > > >    elem_type = TREE_TYPE (vectype);
> > > > > > >    vec_mode = TYPE_MODE (vectype);
> > > > > > >
> > > > > > > -  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > > > > > -    return false;
> > > > > > > -
> > > > > > >    vect_memory_access_type memory_access_type;
> > > > > > >    enum dr_alignment_support alignment_support_scheme;
> > > > > > >    int misalignment;
> > > > > > > @@ -8778,6 +8834,12 @@ vectorizable_load (vec_info *vinfo,
> > > > > > >
> > > > > > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> > > > > > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > +    {
> > > > > > > +      if (!nunits.is_constant ())
> > > > > > > +       return false;
> > > > > > > +      nunits = exact_div (nunits, 2);
> > > > > > > +    }
> > > > > > >
> > > > > > >    if (loop_vinfo)
> > > > > > >      {
> > > > > > > @@ -8794,7 +8856,8 @@ vectorizable_load (vec_info *vinfo,
> > > > > > >    if (slp)
> > > > > > >      ncopies = 1;
> > > > > > >    else
> > > > > > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > > > > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > > > >
> > > > > > >    gcc_assert (ncopies >= 1);
> > > > > > >
> > > > > > > @@ -8870,8 +8933,11 @@ vectorizable_load (vec_info *vinfo,
> > > > > > >                 if (k > maxk)
> > > > > > >                   maxk = k;
> > > > > > >               tree vectype = SLP_TREE_VECTYPE (slp_node);
> > > > > > > +             /* For complex type, half the nunits.  */
> > > > > > >               if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> > > > > > > -                 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> > > > > > > +                 || maxk >= (DR_GROUP_SIZE (group_info)
> > > > > > > +                             & ~((STMT_VINFO_COMPLEX_P (group_info)
> > > > > > > +                                  ? nunits >> 1 : nunits) - 1)))
> > > > > > >                 {
> > > > > > >                   if (dump_enabled_p ())
> > > > > > >                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > > > @@ -12499,12 +12565,27 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > >             dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > >                              "get vectype for scalar type: %T\n", scalar_type);
> > > > > > >         }
> > > > > > > +
> > > > > > > +      tree orig_scalar_type = scalar_type;
> > > > > > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > > > > > +       {
> > > > > > > +         /* Set complex_p for BB vectorizer.  */
> > > > > > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > > > +         scalar_type = TREE_TYPE (scalar_type);
> > > > > > > +         /* Double group_size for BB vectorizer to make
> > > > > > > +            following 2 get_vectype_for_scalar_type return wanted vectype.
> > > > > > > +            Real group size is not changed, just make the "faked" input
> > > > > > > +            group_size.  */
> > > > > > > +         group_size *= 2;
> > > > > > > +       }
> > > > > > >        vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
> > > > > > > -      if (!vectype)
> > > > > > > +      if (!vectype
> > > > > > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > > > +             && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()))
> > > > > > >         return opt_result::failure_at (stmt,
> > > > > > >                                        "not vectorized:"
> > > > > > >                                        " unsupported data-type %T\n",
> > > > > > > -                                      scalar_type);
> > > > > > > +                                      orig_scalar_type);
> > > > > > >
> > > > > > >        if (dump_enabled_p ())
> > > > > > >         dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
> > > > > > > @@ -12529,16 +12610,30 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > >                                                    TREE_TYPE (vectype));
> > > > > > >        if (scalar_type != TREE_TYPE (vectype))
> > > > > > >         {
> > > > > > > -         if (dump_enabled_p ())
> > > > > > > +         tree orig_scalar_type = scalar_type;
> > > > > > > +         if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > > > > > +           {
> > > > > > > +             /* Set complex_p for Loop vectorizer.  */
> > > > > > > +             STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > > > +             scalar_type = TREE_TYPE (scalar_type);
> > > > > > > +             if (dump_enabled_p ())
> > > > > > > +               dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > > +                            "get complex for smallest scalar type: %T\n",
> > > > > > > +                            scalar_type);
> > > > > > > +
> > > > > > > +           }
> > > > > > > +         else if (dump_enabled_p ())
> > > > > > >             dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > >                              "get vectype for smallest scalar type: %T\n",
> > > > > > >                              scalar_type);
> > > > > > >           nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
> > > > > > >                                                         group_size);
> > > > > > > -         if (!nunits_vectype)
> > > > > > > +         if (!nunits_vectype
> > > > > > > +             || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > > > +                 && !TYPE_VECTOR_SUBPARTS (nunits_vectype).is_constant ()))
> > > > > > >             return opt_result::failure_at
> > > > > > >               (stmt, "not vectorized: unsupported data-type %T\n",
> > > > > > > -              scalar_type);
> > > > > > > +              orig_scalar_type);
> > > > > > >           if (dump_enabled_p ())
> > > > > > >             dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
> > > > > > >                              nunits_vectype);
> > > > > > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > > > > > index e5fdc9e0a14..4a809e492c4 100644
> > > > > > > --- a/gcc/tree-vectorizer.h
> > > > > > > +++ b/gcc/tree-vectorizer.h
> > > > > > > @@ -1161,6 +1161,9 @@ public:
> > > > > > >       vectorization.  */
> > > > > > >    bool vectorizable;
> > > > > > >
> > > > > > > +  /* The scalar type of the LHS of this statement is complex type.  */
> > > > > > > +  bool complex_p;
> > > > > > > +
> > > > > > >    /* The stmt to which this info struct refers to.  */
> > > > > > >    gimple *stmt;
> > > > > > >
> > > > > > > @@ -1395,6 +1398,7 @@ struct gather_scatter_info {
> > > > > > >  #define STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT(S) (S)->reduc_epilogue_adjustment
> > > > > > >  #define STMT_VINFO_REDUC_IDX(S)                   (S)->reduc_idx
> > > > > > >  #define STMT_VINFO_FORCE_SINGLE_CYCLE(S)   (S)->force_single_cycle
> > > > > > > +#define STMT_VINFO_COMPLEX_P(S)            (S)->complex_p
> > > > > > >
> > > > > > >  #define STMT_VINFO_DR_WRT_VEC_LOOP(S)      (S)->dr_wrt_vec_loop
> > > > > > >  #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_wrt_vec_loop.base_address
> > > > > > > @@ -1970,6 +1974,15 @@ vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype)
> > > > > > >    return vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo), vectype);
> > > > > > >  }
> > > > > > >
> > > > > > > +static inline unsigned int
> > > > > > > +vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype, bool complex_p)
> > > > > > > +{
> > > > > > > +  poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > > > > > +  if (complex_p)
> > > > > > > +    nunits *= 2;
> > > > > > > +  return vect_get_num_vectors (nunits, vectype);
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* Update maximum unit count *MAX_NUNITS so that it accounts for
> > > > > > >     NUNITS.  *MAX_NUNITS can be 1 if we haven't yet recorded anything.  */
> > > > > > >
> > > > > > > --
> > > > > > > 2.18.1
> > > > > > >
> > > > >
> > > > >
> > > > >
> > > > > --
> > > > > BR,
> > > > > Hongtao
> > >
> > >
> > >
> > > --
> > > BR,
> > > Hongtao
Hongtao Liu July 14, 2022, 9:26 a.m. UTC | #9
On Thu, Jul 14, 2022 at 4:53 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Thu, Jul 14, 2022 at 4:20 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Wed, Jul 13, 2022 at 9:34 AM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Wed, Jul 13, 2022 at 6:47 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Tue, Jul 12, 2022 at 10:12 PM Richard Biener
> > > > <richard.guenther@gmail.com> wrote:
> > > > >
> > > > > On Tue, Jul 12, 2022 at 6:11 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Jul 11, 2022 at 7:47 PM Richard Biener via Gcc-patches
> > > > > > <gcc-patches@gcc.gnu.org> wrote:
> > > > > > >
> > > > > > > On Mon, Jul 11, 2022 at 5:44 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > > > > >
> > > > > > > > The patch only handles load/store(including ctor/permutation, except
> > > > > > > > > gather/scatter) for complex type, other operations don't need to be
> > > > > > > > handled since they will be lowered by pass cplxlower.(MASK_LOAD is not
> > > > > > > > supported for complex type, so no need to handle either).
> > > > > > >
> > > > > > > (*)
> > > > > > >
> > > > > > > > > Instead of supporting vector(2) _Complex double, this patch takes vector(4)
> > > > > > > > double as vector type of _Complex double. Since vectorizer originally
> > > > > > > > takes TYPE_VECTOR_SUBPARTS as nunits which is not true for complex
> > > > > > > > type, the patch handles nunits/ncopies/vf specially for complex type.
> > > > > > >
> > > > > > > For the limited set above(*) can you explain what's "special" about
> > > > > > > vector(2) _Complex
> > > > > > > vs. vector(4) double, thus why we need to have STMT_VINFO_COMPLEX_P at all?
> > > > > > Supporting a vector(2) complex  is a straightforward idea, just like
> > > > > > supporting other scalar type in vectorizer, but it requires more
> > > > > > efforts(in the backend and frontend), considering that most of
> > > > > > operations of complex type will be lowered into realpart and imagpart
> > > > > > operations, supporting a vector(2) complex does not look that
> > > > > > necessary. Then it comes up with supporting vector(4) double(with
> > > > > > adjustment of vf/ctor/permutation), the vectorizer only needs to
> > > > > > handle the vectorization of the move operation of the complex type(no
> > > > > > need to worry about wrongly mapping vector(4) double multiplication to
> > > > > > complex type multiplication since it's already lowered before
> > > > > > vectorizer).
> > > > > > stmt_info does not record the scalar type, in order to avoid duplicate
> > > > > > operation like getting a lhs type from stmt to determine whether it is
> > > > > > a complex type, STMT_VINFO_COMPLEX_P bit is added, this bit is mainly
> > > > > > > initialized in vect_analyze_data_refs and
> > > > > > > vect_get_vector_types_for_stmt.
> > > > > > >
> > > > > > > I wonder to what extent your handling can be extended to support re-vectorizing
> > > > > > > (with a higher VF for example) already vectorized code?  The vectorizer giving
> > > > > > > up on vector(2) double looks quite obviously similar to it giving up
> > > > > > > on _Complex double ...
> > > > > > > Yes, it can be extended to vector(2) double/float/int/... with a bit of
> > > > > > > adjustment (extracting elements by using bit_field instead of
> > > > > > > imagpart_expr/realpart_expr).
> > > > > > > It would be a shame to not use the same underlying mechanism for dealing with
> > > > > > > both, where for the vector case obviously vector(4) would be supported as well.
> > > > > > >
> > > > > > > In principle _Complex double operations should be two SLP lanes but it seems you
> > > > > > > are handling them with classical interleaving as well?
> > > > > > I'm only handling move operations, for other operations it will be
> > > > > > lowered to realpart and imagpart and thus two SLP lanes.
> > > > >
> > > > > Yes, I understood that.
> > > > >
> > > > > Doing it more general (and IMHO better) would involve enhancing
> > > > > how we represent dataref groups, maintaining the number of scalars
> > > > > covered by each of the vinfos.  On the SLP representation side it
> > > > > probably requires to rely on the representative for access and not
> > > > > on the scalar stmts (since those do not map properly to the lanes).
> > > > >
> > > > > Ideally we'd be able to handle
> > > > >
> > > > > struct { _Complex double c; double a; double b; } a[], b[];
> > > > >
> > > > > void foo ()
> > > > > {
> > > > >    for (int i = 0; i < 100; ++i)
> > > > >     {
> > > > >       a[i].c = b[i].c;
> > > > >       a[i].a = b[i].a;
> > > > >       a[i].b = b[i].b;
> > > > >     }
> > > > > }
> > > > >
> > > > > which I guess your patch doesn't handle with plain AVX vector
> > > > > copies but instead uses interleaving for the _Complex and non-_Complex
> > > > > parts?
> > > > Indeed, it produces wrong code.
> > >
> > > For _Complex, in case we don't get to the "true and only" solution it
> > > might be easier to split the loads and stores when it's just memory
> > > copies and we have vectorization enabled and a supported vector
> > > mode that would surely re-assemble them (store-merging doesn't seem
> > > to do that).
> > >
> > > Btw, we seem to produce
> > >
> > >         movsd   b(%rip), %xmm0
> > >         movsd   %xmm0, a(%rip)
> > >         movsd   b+8(%rip), %xmm0
> > >         movsd   %xmm0, a+8(%rip)
> > >
> > > for a _Complex double memory copy on x86 which means we lack
> > > true DCmode support (pseudos get decomposed).  Not sure if we
> > > can somehow check whether a target has DCmode load/store
> > > support and key decomposing on that (maybe check the SET optab).
> > >
> > > It might be possible to check
> > >
> > > _Complex double a, b;
> > > void bar()
> > > {
> > >   a = b;
> > > }
> > >
> > > for all targets with a cc1 cross to see whether they somehow get
> > > loads/stores _not_ decomposed (also check _Complex float,
> > > I wouldn't worry for _Complex int or _Complex long double).
> >
> > Btw, a point for doing the above is that we already do it!  There just
> > needs to be an (unrelated) complex op in the function:
> >
> > _Complex float a[2], b[2];
> > _Complex double foo(_Complex double x, _Complex double y)
> > {
> >   a[0] = b[0];
> >   a[1] = b[1];
> >   return x + y;
> > }
> >
> > vs
> >
> > void bar()
> > {
> >   a[0] = b[0];
> >   a[1] = b[1];
> > }
> >
> > the key difference is that tree_lower_complex returns early here:
> >
> >   if (!init_dont_simulate_again ())
> >     return 0;
> >
> > that returns whether it saw any complex op.
> >
> > diff --git a/gcc/tree-complex.cc b/gcc/tree-complex.cc
> > index 61950a0f099..bdcb9968af1 100644
> > --- a/gcc/tree-complex.cc
> > +++ b/gcc/tree-complex.cc
> > @@ -297,6 +297,11 @@ init_dont_simulate_again (void)
> >                 break;
> >
> >               default:
> > +               /* When expand_complex_move would trigger make sure we
> > +                  perform lowering even when there is no actual complex
> > +                  operation.  This helps consistency and vectorization.  */
> > +               if (TREE_CODE (TREE_TYPE (gimple_op (stmt, 0))) == COMPLEX_TYPE)
> > +                 saw_a_complex_op = true;
> >                 break;
> >               }
> >
> Let me try this.
> > fixes that.  If this change tests OK (and fixes your set of new
> > vectorizer testcases)
> The direct purpose of my patch is to support vectorization of the
> complex type move, and the indirect purpose is to support automatic
> vectorization of the complex type libmvec. For example, vectorization
> of the following case:
> void
> foo (_Complex double* a, _Complex double* b)
> {
>   for (int i = 0; i != 100; i++)
>   a[i] = csin[b[i]];
> }
>
7918  _8 = REALPART_EXPR <*_3>;
7919  _7 = IMAGPART_EXPR <*_3>;
7920  _4 = COMPLEX_EXPR <_8, _7>;
7921  _5 = a_11(D) + _2;
7922  _6 = csin (_4);
7923  _15 = REALPART_EXPR <_6>;
7924  _14 = IMAGPART_EXPR <_6>;

There is still a complex type in the loop, and the vectorizer will fail
to get the corresponding vector type.

11464get_related_vectype_for_scalar_type (machine_mode prevailing_mode,
11465                                     tree scalar_type, poly_uint64 nunits)
11466{
11467  tree orig_scalar_type = scalar_type;
11468  scalar_mode inner_mode;
11469  machine_mode simd_mode;
11470  tree vectype;
11471
11472  if (!is_int_mode (TYPE_MODE (scalar_type), &inner_mode)
11473      && !is_float_mode (TYPE_MODE (scalar_type), &inner_mode))
11474    return NULL_TREE; ------------ here.

I'm not sure if there's a good way to handle that.

> GCC already supports vectorization for sin, but not for csin.
> > then I think that's the way to go for the immediate issue of
> > vectorizing _Complex.
> >
> > Richard.
> >
> > > Richard.
> > >
> > > > > Let me spend some time fleshing out what is necessary to make
> > > > > this work "properly".  We can consider your special-casing of _Complex
> > > > > memory ops if I can't manage to assess the complexity of the task.
> > > > >
> > > > > Thanks,
> > > > > Richard.
> > > > >
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Richard.
> > > > > > >
> > > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > > > > > Also test the patch for SPEC2017 and find there's complex type vectorization
> > > > > > > > in 510/549(but no performance impact).
> > > > > > > >
> > > > > > > > Any comments?
> > > > > > > >
> > > > > > > > gcc/ChangeLog:
> > > > > > > >
> > > > > > > >         PR tree-optimization/106010
> > > > > > > >         * tree-vect-data-refs.cc (vect_get_data_access_cost):
> > > > > > > >         Pass complex_p to vect_get_num_copies to avoid ICE.
> > > > > > > >         (vect_analyze_data_refs): Support vectorization for Complex
> > > > > > > >         type with vector scalar types.
> > > > > > > >         * tree-vect-loop.cc (vect_determine_vf_for_stmt_1): VF should
> > > > > > > >         be half of TYPE_VECTOR_SUBPARTS when complex_p.
> > > > > > > >         * tree-vect-slp.cc (vect_record_max_nunits): nunits should be
> > > > > > > >         half of TYPE_VECTOR_SUBPARTS when complex_p.
> > > > > > > >         (vect_optimize_slp): Support permutation for complex type.
> > > > > > > >         (vect_slp_analyze_node_operations_1): Double nunits in
> > > > > > > >         vect_get_num_vectors to get right SLP_TREE_NUMBER_OF_VEC_STMTS
> > > > > > > >         when complex_p.
> > > > > > > >         (vect_slp_analyze_node_operations): Ditto.
> > > > > > > >         (vect_create_constant_vectors): Support CTOR for complex type.
> > > > > > > >         (vect_transform_slp_perm_load): Support permutation for
> > > > > > > >         complex type.
> > > > > > > >         * tree-vect-stmts.cc (vect_init_vector): Support complex type.
> > > > > > > >         (vect_get_vec_defs_for_operand): Get vector type for
> > > > > > > >         complex type.
> > > > > > > >         (vectorizable_store): Get right ncopies/nunits for complex
> > > > > > > >         type, also return false when complex_p and
> > > > > > > >         !TYPE_VECTOR_SUBPARTS.is_constant ().
> > > > > > > >         (vectorizable_load): Ditto.
> > > > > > > >         (vect_get_vector_types_for_stmt): Get vector type for complex type.
> > > > > > > >         * tree-vectorizer.h (STMT_VINFO_COMPLEX_P): New macro.
> > > > > > > >         (vect_get_num_copies): New overload.
> > > > > > > >
> > > > > > > > gcc/testsuite/ChangeLog:
> > > > > > > >
> > > > > > > >         * gcc.target/i386/pr106010-1a.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-1b.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-1c.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-2a.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-2b.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-2c.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-3a.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-3b.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-3c.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-4a.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-4b.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-4c.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-5a.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-5b.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-5c.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-6a.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-6b.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-6c.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-7a.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-7b.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-7c.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-8a.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-8b.c: New test.
> > > > > > > >         * gcc.target/i386/pr106010-8c.c: New test.
> > > > > > > > ---
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 +++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 +++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +++++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 +++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 +++++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 ++++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 ++++++++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 ++++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 ++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 +++++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 +++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 +++++++++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 +++++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 +++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 +++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 +++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 ++++++
> > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
> > > > > > > >  gcc/tree-vect-data-refs.cc                  |  26 ++-
> > > > > > > >  gcc/tree-vect-loop.cc                       |   7 +-
> > > > > > > >  gcc/tree-vect-slp.cc                        | 174 +++++++++++++++-----
> > > > > > > >  gcc/tree-vect-stmts.cc                      | 135 ++++++++++++---
> > > > > > > >  gcc/tree-vectorizer.h                       |  13 ++
> > > > > > > >  29 files changed, 2064 insertions(+), 63 deletions(-)
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > > > >
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..b608f484934
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > > > > @@ -0,0 +1,58 @@
> > > > > > > > +/* { dg-do compile } */
> > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> > > > > > > > +
> > > > > > > > +#define N 10000
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_pd (_Complex double* a, _Complex double* b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b[i];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ps (_Complex float* a, _Complex float* b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b[i];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b[i];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi32 (_Complex int* a, _Complex int* b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b[i];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi16 (_Complex short* a, _Complex short* b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b[i];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi8 (_Complex char* a, _Complex char* b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b[i];
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..0f377c3a548
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > > > > @@ -0,0 +1,63 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > > +
> > > > > > > > +#include "avx-check.h"
> > > > > > > > +#include <string.h>
> > > > > > > > +#include "pr106010-1a.c"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +avx_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > > > > > > +
> > > > > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > > > > > > +    p_init[i] = i;
> > > > > > > > +
> > > > > > > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > > > > > > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > > > > > > > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > > > > > > > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > > > > > > > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > > > > > > > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > > > > > > > +
> > > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +
> > > > > > > > +  return;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..f07e9fb2d3d
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > > > > @@ -0,0 +1,41 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > +
> > > > > > > > +#include <string.h>
> > > > > > > > +
> > > > > > > > +static void do_test (void);
> > > > > > > > +
> > > > > > > > +#define DO_TEST do_test
> > > > > > > > +#define AVX512FP16
> > > > > > > > +#include "avx512-check.h"
> > > > > > > > +
> > > > > > > > +#define N 10000
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b[i];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void
> > > > > > > > +do_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > +
> > > > > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > > > > > > +    p_init[i] = i;
> > > > > > > > +
> > > > > > > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > > > > > > +
> > > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..d2e2f8d4f43
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > > > > @@ -0,0 +1,82 @@
> > > > > > > > +/* { dg-do compile } */
> > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[0];
> > > > > > > > +  a[1] = b[1];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[0];
> > > > > > > > +  a[1] = b[1];
> > > > > > > > +  a[2] = b[2];
> > > > > > > > +  a[3] = b[3];
> > > > > > > > +
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[0];
> > > > > > > > +  a[1] = b[1];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[0];
> > > > > > > > +  a[1] = b[1];
> > > > > > > > +  a[2] = b[2];
> > > > > > > > +  a[3] = b[3];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[0];
> > > > > > > > +  a[1] = b[1];
> > > > > > > > +  a[2] = b[2];
> > > > > > > > +  a[3] = b[3];
> > > > > > > > +  a[4] = b[4];
> > > > > > > > +  a[5] = b[5];
> > > > > > > > +  a[6] = b[6];
> > > > > > > > +  a[7] = b[7];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[0];
> > > > > > > > +  a[1] = b[1];
> > > > > > > > +  a[2] = b[2];
> > > > > > > > +  a[3] = b[3];
> > > > > > > > +  a[4] = b[4];
> > > > > > > > +  a[5] = b[5];
> > > > > > > > +  a[6] = b[6];
> > > > > > > > +  a[7] = b[7];
> > > > > > > > +  a[8] = b[8];
> > > > > > > > +  a[9] = b[9];
> > > > > > > > +  a[10] = b[10];
> > > > > > > > +  a[11] = b[11];
> > > > > > > > +  a[12] = b[12];
> > > > > > > > +  a[13] = b[13];
> > > > > > > > +  a[14] = b[14];
> > > > > > > > +  a[15] = b[15];
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..ac360752693
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > > > > @@ -0,0 +1,62 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > > +
> > > > > > > > +#include "avx-check.h"
> > > > > > > > +#include <string.h>
> > > > > > > > +#include "pr106010-2a.c"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +avx_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > > +
> > > > > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > > +    p[i] = i;
> > > > > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > > > > +
> > > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +
> > > > > > > > +  return;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..a002f209ec9
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > > > > @@ -0,0 +1,47 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > +
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > > > +
> > > > > > > > +#include <string.h>
> > > > > > > > +
> > > > > > > > +static void do_test (void);
> > > > > > > > +#define DO_TEST do_test
> > > > > > > > +#define AVX512FP16
> > > > > > > > +#include "avx512-check.h"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[0];
> > > > > > > > +  a[1] = b[1];
> > > > > > > > +  a[2] = b[2];
> > > > > > > > +  a[3] = b[3];
> > > > > > > > +  a[4] = b[4];
> > > > > > > > +  a[5] = b[5];
> > > > > > > > +  a[6] = b[6];
> > > > > > > > +  a[7] = b[7];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +do_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > > +
> > > > > > > > +   __builtin_memset (ph_dst, 0, 32);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > > +    p[i] = i;
> > > > > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > > > > +
> > > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +
> > > > > > > > +  return;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..c1b64b56b1c
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > > > > @@ -0,0 +1,80 @@
> > > > > > > > +/* { dg-do compile } */
> > > > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[1];
> > > > > > > > +  a[1] = b[0];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[1];
> > > > > > > > +  a[1] = b[0];
> > > > > > > > +  a[2] = b[3];
> > > > > > > > +  a[3] = b[2];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[1];
> > > > > > > > +  a[1] = b[0];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[3];
> > > > > > > > +  a[1] = b[2];
> > > > > > > > +  a[2] = b[1];
> > > > > > > > +  a[3] = b[0];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[7];
> > > > > > > > +  a[1] = b[6];
> > > > > > > > +  a[2] = b[5];
> > > > > > > > +  a[3] = b[4];
> > > > > > > > +  a[4] = b[3];
> > > > > > > > +  a[5] = b[2];
> > > > > > > > +  a[6] = b[1];
> > > > > > > > +  a[7] = b[0];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[7];
> > > > > > > > +  a[1] = b[6];
> > > > > > > > +  a[2] = b[5];
> > > > > > > > +  a[3] = b[4];
> > > > > > > > +  a[4] = b[3];
> > > > > > > > +  a[5] = b[2];
> > > > > > > > +  a[6] = b[1];
> > > > > > > > +  a[7] = b[0];
> > > > > > > > +  a[8] = b[15];
> > > > > > > > +  a[9] = b[14];
> > > > > > > > +  a[10] = b[13];
> > > > > > > > +  a[11] = b[12];
> > > > > > > > +  a[12] = b[11];
> > > > > > > > +  a[13] = b[10];
> > > > > > > > +  a[14] = b[9];
> > > > > > > > +  a[15] = b[8];
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..e4fa3f3a541
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > > > > @@ -0,0 +1,126 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-require-effective-target avx2 } */
> > > > > > > > +
> > > > > > > > +#include "avx2-check.h"
> > > > > > > > +#include <string.h>
> > > > > > > > +#include "pr106010-3a.c"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +avx2_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (32);
> > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (32);
> > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (32);
> > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (32);
> > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (32);
> > > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > > +  char* q = (char* ) malloc (32);
> > > > > > > > +
> > > > > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > > +    p[i] = i;
> > > > > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 16; i++)
> > > > > > > > +    {
> > > > > > > > +      p[i] = i + 16;
> > > > > > > > +      p[i + 16] = i;
> > > > > > > > +    }
> > > > > > > > +  __builtin_memcpy (pd_exp, p, 32);
> > > > > > > > +  __builtin_memcpy (epi64_exp, p, 32);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 8; i++)
> > > > > > > > +    {
> > > > > > > > +      p[i] = i + 8;
> > > > > > > > +      p[i + 8] = i;
> > > > > > > > +      p[i + 16] = i + 24;
> > > > > > > > +      p[i + 24] = i + 16;
> > > > > > > > +      q[i] = i + 24;
> > > > > > > > +      q[i + 8] = i + 16;
> > > > > > > > +      q[i + 16] = i + 8;
> > > > > > > > +      q[i + 24] = i;
> > > > > > > > +    }
> > > > > > > > +  __builtin_memcpy (ps_exp, p, 32);
> > > > > > > > +  __builtin_memcpy (epi32_exp, q, 32);
> > > > > > > > +
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > > > +    {
> > > > > > > > +      q[i] = i + 28;
> > > > > > > > +      q[i + 4] = i + 24;
> > > > > > > > +      q[i + 8] = i + 20;
> > > > > > > > +      q[i + 12] = i + 16;
> > > > > > > > +      q[i + 16] = i + 12;
> > > > > > > > +      q[i + 20] = i + 8;
> > > > > > > > +      q[i + 24] = i + 4;
> > > > > > > > +      q[i + 28] = i;
> > > > > > > > +    }
> > > > > > > > +  __builtin_memcpy (epi16_exp, q, 32);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 2; i++)
> > > > > > > > +    {
> > > > > > > > +      q[i] = i + 14;
> > > > > > > > +      q[i + 2] = i + 12;
> > > > > > > > +      q[i + 4] = i + 10;
> > > > > > > > +      q[i + 6] = i + 8;
> > > > > > > > +      q[i + 8] = i + 6;
> > > > > > > > +      q[i + 10] = i + 4;
> > > > > > > > +      q[i + 12] = i + 2;
> > > > > > > > +      q[i + 14] = i;
> > > > > > > > +      q[i + 16] = i + 30;
> > > > > > > > +      q[i + 18] = i + 28;
> > > > > > > > +      q[i + 20] = i + 26;
> > > > > > > > +      q[i + 22] = i + 24;
> > > > > > > > +      q[i + 24] = i + 22;
> > > > > > > > +      q[i + 26] = i + 20;
> > > > > > > > +      q[i + 28] = i + 18;
> > > > > > > > +      q[i + 30] = i + 16;
> > > > > > > > +    }
> > > > > > > > +  __builtin_memcpy (epi8_exp, q, 32);
> > > > > > > > +
> > > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +
> > > > > > > > +  return;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..5a5a3d4b992
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > > > > @@ -0,0 +1,69 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
> > > > > > > > +
> > > > > > > > +#include <string.h>
> > > > > > > > +
> > > > > > > > +static void do_test (void);
> > > > > > > > +#define DO_TEST do_test
> > > > > > > > +#define AVX512FP16
> > > > > > > > +#include "avx512-check.h"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[1];
> > > > > > > > +  a[1] = b[0];
> > > > > > > > +  a[2] = b[4];
> > > > > > > > +  a[3] = b[3];
> > > > > > > > +  a[4] = b[7];
> > > > > > > > +  a[5] = b[6];
> > > > > > > > +  a[6] = b[2];
> > > > > > > > +  a[7] = b[5];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +do_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> > > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > > +  char* q = (char* ) malloc (32);
> > > > > > > > +
> > > > > > > > +  __builtin_memset (ph_dst, 0, 32);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > > +    p[i] = i;
> > > > > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > > > +    {
> > > > > > > > +      p[i] = i + 4;
> > > > > > > > +      p[i + 4] = i;
> > > > > > > > +      p[i + 8] = i + 16;
> > > > > > > > +      p[i + 12] = i + 12;
> > > > > > > > +      p[i + 16] = i + 28;
> > > > > > > > +      p[i + 20] = i + 24;
> > > > > > > > +      p[i + 24] = i + 8;
> > > > > > > > +      p[i + 28] = i + 20;
> > > > > > > > +      q[i] = i + 28;
> > > > > > > > +      q[i + 4] = i + 24;
> > > > > > > > +      q[i + 8] = i + 20;
> > > > > > > > +      q[i + 12] = i + 16;
> > > > > > > > +      q[i + 16] = i + 12;
> > > > > > > > +      q[i + 20] = i + 8;
> > > > > > > > +      q[i + 24] = i + 4;
> > > > > > > > +      q[i + 28] = i;
> > > > > > > > +    }
> > > > > > > > +  __builtin_memcpy (ph_exp, p, 32);
> > > > > > > > +
> > > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +
> > > > > > > > +  return;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..b7b0b532bb1
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > > > > @@ -0,0 +1,101 @@
> > > > > > > > +/* { dg-do compile } */
> > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_pd (_Complex double* a,
> > > > > > > > +       _Complex double b1,
> > > > > > > > +       _Complex double b2)
> > > > > > > > +{
> > > > > > > > +  a[0] = b1;
> > > > > > > > +  a[1] = b2;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ps (_Complex float* a,
> > > > > > > > +       _Complex float b1, _Complex float b2,
> > > > > > > > +       _Complex float b3, _Complex float b4)
> > > > > > > > +{
> > > > > > > > +  a[0] = b1;
> > > > > > > > +  a[1] = b2;
> > > > > > > > +  a[2] = b3;
> > > > > > > > +  a[3] = b4;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi64 (_Complex long long* a,
> > > > > > > > +          _Complex long long b1,
> > > > > > > > +          _Complex long long b2)
> > > > > > > > +{
> > > > > > > > +  a[0] = b1;
> > > > > > > > +  a[1] = b2;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi32 (_Complex int* a,
> > > > > > > > +          _Complex int b1, _Complex int b2,
> > > > > > > > +          _Complex int b3, _Complex int b4)
> > > > > > > > +{
> > > > > > > > +  a[0] = b1;
> > > > > > > > +  a[1] = b2;
> > > > > > > > +  a[2] = b3;
> > > > > > > > +  a[3] = b4;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi16 (_Complex short* a,
> > > > > > > > +          _Complex short b1, _Complex short b2,
> > > > > > > > +          _Complex short b3, _Complex short b4,
> > > > > > > > +          _Complex short b5, _Complex short b6,
> > > > > > > > +          _Complex short b7,_Complex short b8)
> > > > > > > > +{
> > > > > > > > +  a[0] = b1;
> > > > > > > > +  a[1] = b2;
> > > > > > > > +  a[2] = b3;
> > > > > > > > +  a[3] = b4;
> > > > > > > > +  a[4] = b5;
> > > > > > > > +  a[5] = b6;
> > > > > > > > +  a[6] = b7;
> > > > > > > > +  a[7] = b8;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi8 (_Complex char* a,
> > > > > > > > +         _Complex char b1, _Complex char b2,
> > > > > > > > +         _Complex char b3, _Complex char b4,
> > > > > > > > +         _Complex char b5, _Complex char b6,
> > > > > > > > +         _Complex char b7,_Complex char b8,
> > > > > > > > +         _Complex char b9, _Complex char b10,
> > > > > > > > +         _Complex char b11, _Complex char b12,
> > > > > > > > +         _Complex char b13, _Complex char b14,
> > > > > > > > +         _Complex char b15,_Complex char b16)
> > > > > > > > +{
> > > > > > > > +  a[0] = b1;
> > > > > > > > +  a[1] = b2;
> > > > > > > > +  a[2] = b3;
> > > > > > > > +  a[3] = b4;
> > > > > > > > +  a[4] = b5;
> > > > > > > > +  a[5] = b6;
> > > > > > > > +  a[6] = b7;
> > > > > > > > +  a[7] = b8;
> > > > > > > > +  a[8] = b9;
> > > > > > > > +  a[9] = b10;
> > > > > > > > +  a[10] = b11;
> > > > > > > > +  a[11] = b12;
> > > > > > > > +  a[12] = b13;
> > > > > > > > +  a[13] = b14;
> > > > > > > > +  a[14] = b15;
> > > > > > > > +  a[15] = b16;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..e2e79508c4b
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > > > > @@ -0,0 +1,67 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > > +
> > > > > > > > +#include "avx-check.h"
> > > > > > > > +#include <string.h>
> > > > > > > > +#include "pr106010-4a.c"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +avx_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > > +
> > > > > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > > +    p[i] = i;
> > > > > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > > > > +
> > > > > > > > +  foo_pd (pd_dst, pd_src[0], pd_src[1]);
> > > > > > > > +  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> > > > > > > > +  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> > > > > > > > +  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> > > > > > > > +  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> > > > > > > > +            epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> > > > > > > > +  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> > > > > > > > +           epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> > > > > > > > +           epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> > > > > > > > +           epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> > > > > > > > +
> > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +
> > > > > > > > +  return;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..8e02aefe3b5
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > > > > @@ -0,0 +1,54 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> > > > > > > > +
> > > > > > > > +#include <string.h>
> > > > > > > > +
> > > > > > > > +static void do_test (void);
> > > > > > > > +#define DO_TEST do_test
> > > > > > > > +#define AVX512FP16
> > > > > > > > +#include "avx512-check.h"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ph (_Complex _Float16* a,
> > > > > > > > +       _Complex _Float16 b1, _Complex _Float16 b2,
> > > > > > > > +       _Complex _Float16 b3, _Complex _Float16 b4,
> > > > > > > > +       _Complex _Float16 b5, _Complex _Float16 b6,
> > > > > > > > +       _Complex _Float16 b7,_Complex _Float16 b8)
> > > > > > > > +{
> > > > > > > > +  a[0] = b1;
> > > > > > > > +  a[1] = b2;
> > > > > > > > +  a[2] = b3;
> > > > > > > > +  a[3] = b4;
> > > > > > > > +  a[4] = b5;
> > > > > > > > +  a[5] = b6;
> > > > > > > > +  a[6] = b7;
> > > > > > > > +  a[7] = b8;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +do_test (void)
> > > > > > > > +{
> > > > > > > > +
> > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > > > > +
> > > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > > +
> > > > > > > > +  __builtin_memset (ph_dst, 0, 32);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > > +    p[i] = i;
> > > > > > > > +
> > > > > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > > > > +
> > > > > > > > +  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> > > > > > > > +         ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> > > > > > > > +
> > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  return;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..9d4a6f9846b
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > > > > @@ -0,0 +1,117 @@
> > > > > > > > +/* { dg-do compile } */
> > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[2];
> > > > > > > > +  a[1] = b[3];
> > > > > > > > +  a[2] = b[0];
> > > > > > > > +  a[3] = b[1];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[4];
> > > > > > > > +  a[1] = b[5];
> > > > > > > > +  a[2] = b[6];
> > > > > > > > +  a[3] = b[7];
> > > > > > > > +  a[4] = b[0];
> > > > > > > > +  a[5] = b[1];
> > > > > > > > +  a[6] = b[2];
> > > > > > > > +  a[7] = b[3];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[2];
> > > > > > > > +  a[1] = b[3];
> > > > > > > > +  a[2] = b[0];
> > > > > > > > +  a[3] = b[1];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[4];
> > > > > > > > +  a[1] = b[5];
> > > > > > > > +  a[2] = b[6];
> > > > > > > > +  a[3] = b[7];
> > > > > > > > +  a[4] = b[0];
> > > > > > > > +  a[5] = b[1];
> > > > > > > > +  a[6] = b[2];
> > > > > > > > +  a[7] = b[3];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[8];
> > > > > > > > +  a[1] = b[9];
> > > > > > > > +  a[2] = b[10];
> > > > > > > > +  a[3] = b[11];
> > > > > > > > +  a[4] = b[12];
> > > > > > > > +  a[5] = b[13];
> > > > > > > > +  a[6] = b[14];
> > > > > > > > +  a[7] = b[15];
> > > > > > > > +  a[8] = b[0];
> > > > > > > > +  a[9] = b[1];
> > > > > > > > +  a[10] = b[2];
> > > > > > > > +  a[11] = b[3];
> > > > > > > > +  a[12] = b[4];
> > > > > > > > +  a[13] = b[5];
> > > > > > > > +  a[14] = b[6];
> > > > > > > > +  a[15] = b[7];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[16];
> > > > > > > > +  a[1] = b[17];
> > > > > > > > +  a[2] = b[18];
> > > > > > > > +  a[3] = b[19];
> > > > > > > > +  a[4] = b[20];
> > > > > > > > +  a[5] = b[21];
> > > > > > > > +  a[6] = b[22];
> > > > > > > > +  a[7] = b[23];
> > > > > > > > +  a[8] = b[24];
> > > > > > > > +  a[9] = b[25];
> > > > > > > > +  a[10] = b[26];
> > > > > > > > +  a[11] = b[27];
> > > > > > > > +  a[12] = b[28];
> > > > > > > > +  a[13] = b[29];
> > > > > > > > +  a[14] = b[30];
> > > > > > > > +  a[15] = b[31];
> > > > > > > > +  a[16] = b[0];
> > > > > > > > +  a[17] = b[1];
> > > > > > > > +  a[18] = b[2];
> > > > > > > > +  a[19] = b[3];
> > > > > > > > +  a[20] = b[4];
> > > > > > > > +  a[21] = b[5];
> > > > > > > > +  a[22] = b[6];
> > > > > > > > +  a[23] = b[7];
> > > > > > > > +  a[24] = b[8];
> > > > > > > > +  a[25] = b[9];
> > > > > > > > +  a[26] = b[10];
> > > > > > > > +  a[27] = b[11];
> > > > > > > > +  a[28] = b[12];
> > > > > > > > +  a[29] = b[13];
> > > > > > > > +  a[30] = b[14];
> > > > > > > > +  a[31] = b[15];
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..d5c6ebeb5cf
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > > > > @@ -0,0 +1,80 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > > +
> > > > > > > > +#include "avx-check.h"
> > > > > > > > +#include <string.h>
> > > > > > > > +#include "pr106010-5a.c"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +avx_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > > > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > > > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > > > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > > > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > > > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > > > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > > > > > > +  char* p = (char* ) malloc (64);
> > > > > > > > +  char* q = (char* ) malloc (64);
> > > > > > > > +
> > > > > > > > +  __builtin_memset (pd_dst, 0, 64);
> > > > > > > > +  __builtin_memset (ps_dst, 0, 64);
> > > > > > > > +  __builtin_memset (epi64_dst, 0, 64);
> > > > > > > > +  __builtin_memset (epi32_dst, 0, 64);
> > > > > > > > +  __builtin_memset (epi16_dst, 0, 64);
> > > > > > > > +  __builtin_memset (epi8_dst, 0, 64);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > > > +    {
> > > > > > > > +      p[i] = i;
> > > > > > > > +      q[i] = (i + 32) % 64;
> > > > > > > > +    }
> > > > > > > > +  __builtin_memcpy (pd_src, p, 64);
> > > > > > > > +  __builtin_memcpy (ps_src, p, 64);
> > > > > > > > +  __builtin_memcpy (epi64_src, p, 64);
> > > > > > > > +  __builtin_memcpy (epi32_src, p, 64);
> > > > > > > > +  __builtin_memcpy (epi16_src, p, 64);
> > > > > > > > +  __builtin_memcpy (epi8_src, p, 64);
> > > > > > > > +
> > > > > > > > +  __builtin_memcpy (pd_exp, q, 64);
> > > > > > > > +  __builtin_memcpy (ps_exp, q, 64);
> > > > > > > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > > > > > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > > > > > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > > > > > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > > > > > > +
> > > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > > +
> > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +
> > > > > > > > +  return;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..9ce4e6dd5c0
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > > > > @@ -0,0 +1,62 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> > > > > > > > +
> > > > > > > > +#include <string.h>
> > > > > > > > +
> > > > > > > > +static void do_test (void);
> > > > > > > > +#define DO_TEST do_test
> > > > > > > > +#define AVX512FP16
> > > > > > > > +#include "avx512-check.h"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[8];
> > > > > > > > +  a[1] = b[9];
> > > > > > > > +  a[2] = b[10];
> > > > > > > > +  a[3] = b[11];
> > > > > > > > +  a[4] = b[12];
> > > > > > > > +  a[5] = b[13];
> > > > > > > > +  a[6] = b[14];
> > > > > > > > +  a[7] = b[15];
> > > > > > > > +  a[8] = b[0];
> > > > > > > > +  a[9] = b[1];
> > > > > > > > +  a[10] = b[2];
> > > > > > > > +  a[11] = b[3];
> > > > > > > > +  a[12] = b[4];
> > > > > > > > +  a[13] = b[5];
> > > > > > > > +  a[14] = b[6];
> > > > > > > > +  a[15] = b[7];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +do_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > > > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > > > > > > +  char* p = (char* ) malloc (64);
> > > > > > > > +  char* q = (char* ) malloc (64);
> > > > > > > > +
> > > > > > > > +  __builtin_memset (ph_dst, 0, 64);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > > > +    {
> > > > > > > > +      p[i] = i;
> > > > > > > > +      q[i] = (i + 32) % 64;
> > > > > > > > +    }
> > > > > > > > +  __builtin_memcpy (ph_src, p, 64);
> > > > > > > > +
> > > > > > > > +  __builtin_memcpy (ph_exp, q, 64);
> > > > > > > > +
> > > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > > +
> > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +
> > > > > > > > +  return;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..65a90d03684
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > > > > @@ -0,0 +1,115 @@
> > > > > > > > +/* { dg-do compile } */
> > > > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[3];
> > > > > > > > +  a[1] = b[2];
> > > > > > > > +  a[2] = b[1];
> > > > > > > > +  a[3] = b[0];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[7];
> > > > > > > > +  a[1] = b[6];
> > > > > > > > +  a[2] = b[5];
> > > > > > > > +  a[3] = b[4];
> > > > > > > > +  a[4] = b[3];
> > > > > > > > +  a[5] = b[2];
> > > > > > > > +  a[6] = b[1];
> > > > > > > > +  a[7] = b[0];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[3];
> > > > > > > > +  a[1] = b[2];
> > > > > > > > +  a[2] = b[1];
> > > > > > > > +  a[3] = b[0];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[7];
> > > > > > > > +  a[1] = b[6];
> > > > > > > > +  a[2] = b[5];
> > > > > > > > +  a[3] = b[4];
> > > > > > > > +  a[4] = b[3];
> > > > > > > > +  a[5] = b[2];
> > > > > > > > +  a[6] = b[1];
> > > > > > > > +  a[7] = b[0];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[15];
> > > > > > > > +  a[1] = b[14];
> > > > > > > > +  a[2] = b[13];
> > > > > > > > +  a[3] = b[12];
> > > > > > > > +  a[4] = b[11];
> > > > > > > > +  a[5] = b[10];
> > > > > > > > +  a[6] = b[9];
> > > > > > > > +  a[7] = b[8];
> > > > > > > > +  a[8] = b[7];
> > > > > > > > +  a[9] = b[6];
> > > > > > > > +  a[10] = b[5];
> > > > > > > > +  a[11] = b[4];
> > > > > > > > +  a[12] = b[3];
> > > > > > > > +  a[13] = b[2];
> > > > > > > > +  a[14] = b[1];
> > > > > > > > +  a[15] = b[0];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[31];
> > > > > > > > +  a[1] = b[30];
> > > > > > > > +  a[2] = b[29];
> > > > > > > > +  a[3] = b[28];
> > > > > > > > +  a[4] = b[27];
> > > > > > > > +  a[5] = b[26];
> > > > > > > > +  a[6] = b[25];
> > > > > > > > +  a[7] = b[24];
> > > > > > > > +  a[8] = b[23];
> > > > > > > > +  a[9] = b[22];
> > > > > > > > +  a[10] = b[21];
> > > > > > > > +  a[11] = b[20];
> > > > > > > > +  a[12] = b[19];
> > > > > > > > +  a[13] = b[18];
> > > > > > > > +  a[14] = b[17];
> > > > > > > > +  a[15] = b[16];
> > > > > > > > +  a[16] = b[15];
> > > > > > > > +  a[17] = b[14];
> > > > > > > > +  a[18] = b[13];
> > > > > > > > +  a[19] = b[12];
> > > > > > > > +  a[20] = b[11];
> > > > > > > > +  a[21] = b[10];
> > > > > > > > +  a[22] = b[9];
> > > > > > > > +  a[23] = b[8];
> > > > > > > > +  a[24] = b[7];
> > > > > > > > +  a[25] = b[6];
> > > > > > > > +  a[26] = b[5];
> > > > > > > > +  a[27] = b[4];
> > > > > > > > +  a[28] = b[3];
> > > > > > > > +  a[29] = b[2];
> > > > > > > > +  a[30] = b[1];
> > > > > > > > +  a[31] = b[0];
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..1c5bb020939
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > > > > @@ -0,0 +1,157 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-require-effective-target avx2 } */
> > > > > > > > +
> > > > > > > > +#include "avx2-check.h"
> > > > > > > > +#include <string.h>
> > > > > > > > +#include "pr106010-6a.c"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +avx2_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > > > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > > > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > > > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > > > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > > > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > > > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > > > > > > +  char* p = (char* ) malloc (64);
> > > > > > > > +  char* q = (char* ) malloc (64);
> > > > > > > > +
> > > > > > > > +  __builtin_memset (pd_dst, 0, 64);
> > > > > > > > +  __builtin_memset (ps_dst, 0, 64);
> > > > > > > > +  __builtin_memset (epi64_dst, 0, 64);
> > > > > > > > +  __builtin_memset (epi32_dst, 0, 64);
> > > > > > > > +  __builtin_memset (epi16_dst, 0, 64);
> > > > > > > > +  __builtin_memset (epi8_dst, 0, 64);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > > > +    p[i] = i;
> > > > > > > > +
> > > > > > > > +  __builtin_memcpy (pd_src, p, 64);
> > > > > > > > +  __builtin_memcpy (ps_src, p, 64);
> > > > > > > > +  __builtin_memcpy (epi64_src, p, 64);
> > > > > > > > +  __builtin_memcpy (epi32_src, p, 64);
> > > > > > > > +  __builtin_memcpy (epi16_src, p, 64);
> > > > > > > > +  __builtin_memcpy (epi8_src, p, 64);
> > > > > > > > +
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 16; i++)
> > > > > > > > +    {
> > > > > > > > +      q[i] = i + 48;
> > > > > > > > +      q[i + 16] = i + 32;
> > > > > > > > +      q[i + 32] = i + 16;
> > > > > > > > +      q[i + 48] = i;
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +  __builtin_memcpy (pd_exp, q, 64);
> > > > > > > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > > > > > > +
> > > > > > > > +   for (int i = 0; i != 8; i++)
> > > > > > > > +    {
> > > > > > > > +      q[i] = i + 56;
> > > > > > > > +      q[i + 8] = i + 48;
> > > > > > > > +      q[i + 16] = i + 40;
> > > > > > > > +      q[i + 24] = i + 32;
> > > > > > > > +      q[i + 32] = i + 24;
> > > > > > > > +      q[i + 40] = i + 16;
> > > > > > > > +      q[i + 48] = i + 8;
> > > > > > > > +      q[i + 56] = i;
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +  __builtin_memcpy (ps_exp, q, 64);
> > > > > > > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > > > +    {
> > > > > > > > +      q[i] = i + 60;
> > > > > > > > +      q[i + 4] = i + 56;
> > > > > > > > +      q[i + 8] = i + 52;
> > > > > > > > +      q[i + 12] = i + 48;
> > > > > > > > +      q[i + 16] = i + 44;
> > > > > > > > +      q[i + 20] = i + 40;
> > > > > > > > +      q[i + 24] = i + 36;
> > > > > > > > +      q[i + 28] = i + 32;
> > > > > > > > +      q[i + 32] = i + 28;
> > > > > > > > +      q[i + 36] = i + 24;
> > > > > > > > +      q[i + 40] = i + 20;
> > > > > > > > +      q[i + 44] = i + 16;
> > > > > > > > +      q[i + 48] = i + 12;
> > > > > > > > +      q[i + 52] = i + 8;
> > > > > > > > +      q[i + 56] = i + 4;
> > > > > > > > +      q[i + 60] = i;
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 2; i++)
> > > > > > > > +    {
> > > > > > > > +      q[i] = i + 62;
> > > > > > > > +      q[i + 2] = i + 60;
> > > > > > > > +      q[i + 4] = i + 58;
> > > > > > > > +      q[i + 6] = i + 56;
> > > > > > > > +      q[i + 8] = i + 54;
> > > > > > > > +      q[i + 10] = i + 52;
> > > > > > > > +      q[i + 12] = i + 50;
> > > > > > > > +      q[i + 14] = i + 48;
> > > > > > > > +      q[i + 16] = i + 46;
> > > > > > > > +      q[i + 18] = i + 44;
> > > > > > > > +      q[i + 20] = i + 42;
> > > > > > > > +      q[i + 22] = i + 40;
> > > > > > > > +      q[i + 24] = i + 38;
> > > > > > > > +      q[i + 26] = i + 36;
> > > > > > > > +      q[i + 28] = i + 34;
> > > > > > > > +      q[i + 30] = i + 32;
> > > > > > > > +      q[i + 32] = i + 30;
> > > > > > > > +      q[i + 34] = i + 28;
> > > > > > > > +      q[i + 36] = i + 26;
> > > > > > > > +      q[i + 38] = i + 24;
> > > > > > > > +      q[i + 40] = i + 22;
> > > > > > > > +      q[i + 42] = i + 20;
> > > > > > > > +      q[i + 44] = i + 18;
> > > > > > > > +      q[i + 46] = i + 16;
> > > > > > > > +      q[i + 48] = i + 14;
> > > > > > > > +      q[i + 50] = i + 12;
> > > > > > > > +      q[i + 52] = i + 10;
> > > > > > > > +      q[i + 54] = i + 8;
> > > > > > > > +      q[i + 56] = i + 6;
> > > > > > > > +      q[i + 58] = i + 4;
> > > > > > > > +      q[i + 60] = i + 2;
> > > > > > > > +      q[i + 62] = i;
> > > > > > > > +    }
> > > > > > > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > > > > > > +
> > > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > > +
> > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +
> > > > > > > > +  return;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..b859d884a7f
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > > > > @@ -0,0 +1,80 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> > > > > > > > +
> > > > > > > > +#include <string.h>
> > > > > > > > +
> > > > > > > > +static void do_test (void);
> > > > > > > > +#define DO_TEST do_test
> > > > > > > > +#define AVX512FP16
> > > > > > > > +#include "avx512-check.h"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > > > +{
> > > > > > > > +  a[0] = b[15];
> > > > > > > > +  a[1] = b[14];
> > > > > > > > +  a[2] = b[13];
> > > > > > > > +  a[3] = b[12];
> > > > > > > > +  a[4] = b[11];
> > > > > > > > +  a[5] = b[10];
> > > > > > > > +  a[6] = b[9];
> > > > > > > > +  a[7] = b[8];
> > > > > > > > +  a[8] = b[7];
> > > > > > > > +  a[9] = b[6];
> > > > > > > > +  a[10] = b[5];
> > > > > > > > +  a[11] = b[4];
> > > > > > > > +  a[12] = b[3];
> > > > > > > > +  a[13] = b[2];
> > > > > > > > +  a[14] = b[1];
> > > > > > > > +  a[15] = b[0];
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +do_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > > > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > > > > > > +  char* p = (char* ) malloc (64);
> > > > > > > > +  char* q = (char* ) malloc (64);
> > > > > > > > +
> > > > > > > > +  __builtin_memset (ph_dst, 0, 64);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > > > +    p[i] = i;
> > > > > > > > +
> > > > > > > > +  __builtin_memcpy (ph_src, p, 64);
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > > > +    {
> > > > > > > > +      q[i] = i + 60;
> > > > > > > > +      q[i + 4] = i + 56;
> > > > > > > > +      q[i + 8] = i + 52;
> > > > > > > > +      q[i + 12] = i + 48;
> > > > > > > > +      q[i + 16] = i + 44;
> > > > > > > > +      q[i + 20] = i + 40;
> > > > > > > > +      q[i + 24] = i + 36;
> > > > > > > > +      q[i + 28] = i + 32;
> > > > > > > > +      q[i + 32] = i + 28;
> > > > > > > > +      q[i + 36] = i + 24;
> > > > > > > > +      q[i + 40] = i + 20;
> > > > > > > > +      q[i + 44] = i + 16;
> > > > > > > > +      q[i + 48] = i + 12;
> > > > > > > > +      q[i + 52] = i + 8;
> > > > > > > > +      q[i + 56] = i + 4;
> > > > > > > > +      q[i + 60] = i;
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > > +  __builtin_memcpy (ph_exp, q, 64);
> > > > > > > > +
> > > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > > +
> > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +
> > > > > > > > +  return;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..2ea01fac927
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > > > > @@ -0,0 +1,58 @@
> > > > > > > > +/* { dg-do compile } */
> > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > > > > > > +
> > > > > > > > +#define N 10000
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_pd (_Complex double* a, _Complex double b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ps (_Complex float* a, _Complex float b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi32 (_Complex int* a, _Complex int b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi16 (_Complex short* a, _Complex short b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi8 (_Complex char* a, _Complex char b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..26482cc10f5
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > > > > @@ -0,0 +1,63 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > > +
> > > > > > > > +#include "avx-check.h"
> > > > > > > > +#include <string.h>
> > > > > > > > +#include "pr106010-7a.c"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +avx_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > > > > > > +
> > > > > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > > > > > > +    p_init[i] = i % 2 + 3;
> > > > > > > > +  /* Seed every *_src buffer; the *_dst buffers stay zeroed until foo_* runs.  */
> > > > > > > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > > > > > > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > > > > > > > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > > > > > > > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > > > > > > > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > > > > > > > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > > > > > > > +
> > > > > > > > +  foo_pd (pd_dst, pd_src[0]);
> > > > > > > > +  foo_ps (ps_dst, ps_src[0]);
> > > > > > > > +  foo_epi64 (epi64_dst, epi64_src[0]);
> > > > > > > > +  foo_epi32 (epi32_dst, epi32_src[0]);
> > > > > > > > +  foo_epi16 (epi16_dst, epi16_src[0]);
> > > > > > > > +  foo_epi8 (epi8_dst, epi8_src[0]);
> > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +
> > > > > > > > +  return;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..7f4056a5ecc
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > > > > @@ -0,0 +1,41 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > +
> > > > > > > > +#include <string.h>
> > > > > > > > +
> > > > > > > > +static void do_test (void);
> > > > > > > > +
> > > > > > > > +#define DO_TEST do_test
> > > > > > > > +#define AVX512FP16
> > > > > > > > +#include "avx512-check.h"
> > > > > > > > +
> > > > > > > > +#define N 10000
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = b;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void
> > > > > > > > +do_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > +
> > > > > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > > > > +
> > > > > > > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > > > > > > +    p_init[i] = i % 2 + 3;
> > > > > > > > +
> > > > > > > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > > > > > > +
> > > > > > > > +  foo_ph (ph_dst, ph_src[0]);
> > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > > > > > > +    __builtin_abort ();
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..11054b60d30
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > > > > @@ -0,0 +1,58 @@
> > > > > > > > +/* { dg-do compile } */
> > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > > > > > > +
> > > > > > > > +#define N 10000
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_pd (_Complex double* a)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = 1.0 + 2.0i;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ps (_Complex float* a)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = 1.0f + 2.0fi;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi64 (_Complex long long* a)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = 1 + 2i;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi32 (_Complex int* a)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = 1 + 2i;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi16 (_Complex short* a)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = 1 + 2i;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_epi8 (_Complex char* a)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = 1 + 2i;
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..6bb0073b691
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > > > > @@ -0,0 +1,53 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > > +
> > > > > > > > +#include "avx-check.h"
> > > > > > > > +#include <string.h>
> > > > > > > > +#include "pr106010-8a.c"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +avx_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex double pd_src = 1.0 + 2.0i;
> > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > > +  _Complex float ps_src = 1.0 + 2.0i;
> > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > > +  _Complex long long epi64_src = 1 + 2i;;
> > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > > +  _Complex int epi32_src = 1 + 2i;
> > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > > +  _Complex short epi16_src = 1 + 2i;
> > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > > +  _Complex char epi8_src = 1 + 2i;
> > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > > +
> > > > > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > > > > +
> > > > > > > > +  foo_pd (pd_dst);
> > > > > > > > +  foo_ps (ps_dst);
> > > > > > > > +  foo_epi64 (epi64_dst);
> > > > > > > > +  foo_epi32 (epi32_dst);
> > > > > > > > +  foo_epi16 (epi16_dst);
> > > > > > > > +  foo_epi8 (epi8_dst);
> > > > > > > > +  for (int i = 0 ; i != N; i++)
> > > > > > > > +    {
> > > > > > > > +      if (pd_dst[i] != pd_src)
> > > > > > > > +       __builtin_abort ();
> > > > > > > > +      if (ps_dst[i] != ps_src)
> > > > > > > > +       __builtin_abort ();
> > > > > > > > +      if (epi64_dst[i] != epi64_src)
> > > > > > > > +       __builtin_abort ();
> > > > > > > > +      if (epi32_dst[i] != epi32_src)
> > > > > > > > +       __builtin_abort ();
> > > > > > > > +      if (epi16_dst[i] != epi16_src)
> > > > > > > > +       __builtin_abort ();
> > > > > > > > +      if (epi8_dst[i] != epi8_src)
> > > > > > > > +       __builtin_abort ();
> > > > > > > > +    }
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00000000000..61ae131829d
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > > > > @@ -0,0 +1,38 @@
> > > > > > > > +/* { dg-do run } */
> > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > +
> > > > > > > > +#include <string.h>
> > > > > > > > +
> > > > > > > > +static void do_test (void);
> > > > > > > > +
> > > > > > > > +#define DO_TEST do_test
> > > > > > > > +#define AVX512FP16
> > > > > > > > +#include "avx512-check.h"
> > > > > > > > +
> > > > > > > > +#define N 10000
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +__attribute__((noipa))
> > > > > > > > +foo_ph (_Complex _Float16* a)
> > > > > > > > +{
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    a[i] = 1.0f16 + 2.0f16i;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void
> > > > > > > > +do_test (void)
> > > > > > > > +{
> > > > > > > > +  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > +
> > > > > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > > > > +
> > > > > > > > +  foo_ph (ph_dst);
> > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > +    {
> > > > > > > > +      if (ph_dst[i] != ph_src)
> > > > > > > > +       __builtin_abort ();
> > > > > > > > +    }
> > > > > > > > +}
> > > > > > > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > > > > > > > index d20a10a1524..42ee9df674c 100644
> > > > > > > > --- a/gcc/tree-vect-data-refs.cc
> > > > > > > > +++ b/gcc/tree-vect-data-refs.cc
> > > > > > > > @@ -1403,7 +1403,8 @@ vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
> > > > > > > >    if (PURE_SLP_STMT (stmt_info))
> > > > > > > >      ncopies = 1;
> > > > > > > >    else
> > > > > > > > -    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
> > > > > > > > +    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info),
> > > > > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > > > > >
> > > > > > > >    if (DR_IS_READ (dr_info->dr))
> > > > > > > >      vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> > > > > > > > @@ -4597,8 +4598,22 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > > > > > > >
> > > > > > > >        /* Set vectype for STMT.  */
> > > > > > > >        scalar_type = TREE_TYPE (DR_REF (dr));
> > > > > > > > -      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
> > > > > > > > -      if (!vectype)
> > > > > > > > +      tree adjust_scalar_type = scalar_type;
> > > > > > > > +      /* Support Complex type access. Note that the complex type of load/store
> > > > > > > > +        does not support gather/scatter.  */
> > > > > > > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE
> > > > > > > > +         && gatherscatter == SG_NONE)
> > > > > > > > +       {
> > > > > > > > +         adjust_scalar_type = TREE_TYPE (scalar_type);
> > > > > > > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > > > > +       }
> > > > > > > > +      tree vectype = get_vectype_for_scalar_type (vinfo, adjust_scalar_type);
> > > > > > > > +      unsigned HOST_WIDE_INT constant_nunits;
> > > > > > > > +      if (!vectype
> > > > > > > > +         /* For complex type, V1DI doesn't make sense.  */
> > > > > > > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > > > > +             && (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&constant_nunits)
> > > > > > > > +                 || constant_nunits == 1)))
> > > > > > > >          {
> > > > > > > >            if (dump_enabled_p ())
> > > > > > > >              {
> > > > > > > > @@ -4635,8 +4650,11 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > > > > > > >         }
> > > > > > > >
> > > > > > > >        /* Adjust the minimal vectorization factor according to the
> > > > > > > > -        vector type.  */
> > > > > > > > +        vector type. Note for complex type, VF is half of
> > > > > > > > +        TYPE_VECTOR_SUBPARTS.  */
> > > > > > > >        vf = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > > +       vf = exact_div (vf, 2);
> > > > > > > >        *min_vf = upper_bound (*min_vf, vf);
> > > > > > > >
> > > > > > > >        /* Leave the BB vectorizer to pick the vector type later, based on
> > > > > > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > > > > > > index 3a70c15b593..365fa738022 100644
> > > > > > > > --- a/gcc/tree-vect-loop.cc
> > > > > > > > +++ b/gcc/tree-vect-loop.cc
> > > > > > > > @@ -200,7 +200,12 @@ vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > > >      }
> > > > > > > >
> > > > > > > >    if (nunits_vectype)
> > > > > > > > -    vect_update_max_nunits (vf, nunits_vectype);
> > > > > > > > +    {
> > > > > > > > +      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (nunits_vectype);
> > > > > > > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > > +       nunits = exact_div (nunits, 2);
> > > > > > > > +      vect_update_max_nunits (vf, nunits);
> > > > > > > > +    }
> > > > > > > >
> > > > > > > >    return opt_result::success ();
> > > > > > > >  }
> > > > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > > > > > > > index dab5daddcc5..5d66ea2f286 100644
> > > > > > > > --- a/gcc/tree-vect-slp.cc
> > > > > > > > +++ b/gcc/tree-vect-slp.cc
> > > > > > > > @@ -877,10 +877,14 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > > >        return false;
> > > > > > > >      }
> > > > > > > >
> > > > > > > > +  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > > +    nunits = exact_div (nunits, 2);
> > > > > > > > +
> > > > > > > >    /* If populating the vector type requires unrolling then fail
> > > > > > > >       before adjusting *max_nunits for basic-block vectorization.  */
> > > > > > > >    if (is_a <bb_vec_info> (vinfo)
> > > > > > > > -      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> > > > > > > > +      && !multiple_p (group_size , nunits))
> > > > > > > >      {
> > > > > > > >        if (dump_enabled_p ())
> > > > > > > >         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > > > > @@ -891,7 +895,7 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > > >      }
> > > > > > > >
> > > > > > > >    /* In case of multiple types we need to detect the smallest type.  */
> > > > > > > > -  vect_update_max_nunits (max_nunits, vectype);
> > > > > > > > +  vect_update_max_nunits (max_nunits, nunits);
> > > > > > > >    return true;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -3720,22 +3724,54 @@ vect_optimize_slp (vec_info *vinfo)
> > > > > > > >          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
> > > > > > > >          when permuting constants and invariants keeping the permute
> > > > > > > >          bijective.  */
> > > > > > > > -      auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > > > > > > -      bitmap_clear (load_index);
> > > > > > > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > -       bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > > > > > > -      unsigned j;
> > > > > > > > -      for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > -       if (!bitmap_bit_p (load_index, j))
> > > > > > > > -         break;
> > > > > > > > -      if (j != SLP_TREE_LANES (node))
> > > > > > > > -       continue;
> > > > > > > > +      /* Permutation of Complex type.  */
> > > > > > > > +      if (STMT_VINFO_COMPLEX_P (dr_stmt))
> > > > > > > > +       {
> > > > > > > > +         auto_sbitmap load_index (SLP_TREE_LANES (node) * 2);
> > > > > > > > +         bitmap_clear (load_index);
> > > > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > +           {
> > > > > > > > +             unsigned bit = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > > > +             bitmap_set_bit (load_index, 2 * bit);
> > > > > > > > +             bitmap_set_bit (load_index, 2 * bit + 1);
> > > > > > > > +           }
> > > > > > > > +         unsigned j;
> > > > > > > > +         for (j = 0; j < SLP_TREE_LANES (node) * 2; ++j)
> > > > > > > > +           if (!bitmap_bit_p (load_index, j))
> > > > > > > > +             break;
> > > > > > > > +         if (j != SLP_TREE_LANES (node) * 2)
> > > > > > > > +           continue;
> > > > > > > >
> > > > > > > > -      vec<unsigned> perm = vNULL;
> > > > > > > > -      perm.safe_grow (SLP_TREE_LANES (node), true);
> > > > > > > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > -       perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > > > -      perms.safe_push (perm);
> > > > > > > > +         vec<unsigned> perm = vNULL;
> > > > > > > > +         perm.safe_grow (SLP_TREE_LANES (node) * 2, true);
> > > > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > +           {
> > > > > > > > +             unsigned cidx = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > > > +             perm[2 * j] = 2 * cidx;
> > > > > > > > +             perm[2 * j + 1] = 2 * cidx + 1;
> > > > > > > > +           }
> > > > > > > > +         perms.safe_push (perm);
> > > > > > > > +       }
> > > > > > > > +      else
> > > > > > > > +       {
> > > > > > > > +         auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > > > > > > +         bitmap_clear (load_index);
> > > > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > +           bitmap_set_bit (load_index,
> > > > > > > > +                           SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > > > > > > +         unsigned j;
> > > > > > > > +         for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > +           if (!bitmap_bit_p (load_index, j))
> > > > > > > > +             break;
> > > > > > > > +         if (j != SLP_TREE_LANES (node))
> > > > > > > > +           continue;
> > > > > > > > +
> > > > > > > > +         vec<unsigned> perm = vNULL;
> > > > > > > > +         perm.safe_grow (SLP_TREE_LANES (node), true);
> > > > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > +           perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > > > +         perms.safe_push (perm);
> > > > > > > > +       }
> > > > > > > >        vertices[idx].perm_in = perms.length () - 1;
> > > > > > > >        vertices[idx].perm_out = perms.length () - 1;
> > > > > > > >      }
> > > > > > > > @@ -4518,6 +4554,12 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
> > > > > > > >         vf = loop_vinfo->vectorization_factor;
> > > > > > > >        else
> > > > > > > >         vf = 1;
> > > > > > > > +      /* For complex type and SLP, double vf to get the right vectype,
> > > > > > > > +        i.e. for vector(4) double holding complex double with group size 2,
> > > > > > > > +        double vf to map vf * group_size to TYPE_VECTOR_SUBPARTS.  */
> > > > > > > > +     if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > > +       vf *= 2;
> > > > > > > > +
> > > > > > > >        unsigned int group_size = SLP_TREE_LANES (node);
> > > > > > > >        tree vectype = SLP_TREE_VECTYPE (node);
> > > > > > > >        SLP_TREE_NUMBER_OF_VEC_STMTS (node)
> > > > > > > > @@ -4763,10 +4805,17 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
> > > > > > > >             }
> > > > > > > >           unsigned group_size = SLP_TREE_LANES (child);
> > > > > > > >           poly_uint64 vf = 1;
> > > > > > > > +
> > > > > > > >           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
> > > > > > > >             vf = loop_vinfo->vectorization_factor;
> > > > > > > > +
> > > > > > > > +         /* V2SF holds just 1 complex element, so multiply by 2
> > > > > > > > +            to get the real number of vectors.  */
> > > > > > > > +         unsigned cp
> > > > > > > > +           = STMT_VINFO_COMPLEX_P (SLP_TREE_REPRESENTATIVE (node)) ? 2 : 1;
> > > > > > > > +
> > > > > > > >           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
> > > > > > > > -           = vect_get_num_vectors (vf * group_size, vector_type);
> > > > > > > > +           = vect_get_num_vectors (vf * group_size * cp, vector_type);
> > > > > > > >           /* And cost them.  */
> > > > > > > >           vect_prologue_cost_for_slp (child, cost_vec);
> > > > > > > >         }
> > > > > > > > @@ -6402,6 +6451,11 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > > > >
> > > > > > > >    /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
> > > > > > > >    vector_type = SLP_TREE_VECTYPE (op_node);
> > > > > > > > +  unsigned int cp = 1;
> > > > > > > > +  /* Handle Complex type vector init.
> > > > > > > > +     SLP_TREE_REPRESENTATIVE (op_node) could be NULL.  */
> > > > > > > > +  if (TREE_CODE (TREE_TYPE (op_node->ops[0])) == COMPLEX_TYPE)
> > > > > > > > +    cp = 2;
> > > > > > > >
> > > > > > > >    unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
> > > > > > > >    SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
> > > > > > > > @@ -6426,9 +6480,9 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > > > >    /* When using duplicate_and_interleave, we just need one element for
> > > > > > > >       each scalar statement.  */
> > > > > > > >    if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
> > > > > > > > -    nunits = group_size;
> > > > > > > > +    nunits = group_size * cp;
> > > > > > > >
> > > > > > > > -  number_of_copies = nunits * number_of_vectors / group_size;
> > > > > > > > +  number_of_copies = nunits * number_of_vectors / (group_size * cp);
> > > > > > > >
> > > > > > > >    number_of_places_left_in_vector = nunits;
> > > > > > > >    constant_p = true;
> > > > > > > > @@ -6460,8 +6514,23 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > > > >                         gcc_unreachable ();
> > > > > > > >                     }
> > > > > > > >                   else
> > > > > > > > -                   op = fold_unary (VIEW_CONVERT_EXPR,
> > > > > > > > -                                    TREE_TYPE (vector_type), op);
> > > > > > > > +                   {
> > > > > > > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > > > > > > +                     /* For complex type, insert real and imag part
> > > > > > > > +                        separately.  */
> > > > > > > > +                     if (cp == 2)
> > > > > > > > +                       {
> > > > > > > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > > > > > > +                                      == COMPLEX_TYPE)
> > > > > > > > +                                     && (scalar_type
> > > > > > > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > > > > > > +                         elts[number_of_places_left_in_vector--]
> > > > > > > > +                           = fold_unary (IMAGPART_EXPR, scalar_type, op);
> > > > > > > > +                         op = fold_unary (REALPART_EXPR, scalar_type, op);
> > > > > > > > +                       }
> > > > > > > > +                     else
> > > > > > > > +                       op = fold_unary (VIEW_CONVERT_EXPR, scalar_type, op);
> > > > > > > > +                   }
> > > > > > > >                   gcc_assert (op && CONSTANT_CLASS_P (op));
> > > > > > > >                 }
> > > > > > > >               else
> > > > > > > > @@ -6481,11 +6550,28 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > > > >                     }
> > > > > > > >                   else
> > > > > > > >                     {
> > > > > > > > -                     op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
> > > > > > > > -                                  op);
> > > > > > > > -                     init_stmt
> > > > > > > > -                       = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > > > > > > -                                              op);
> > > > > > > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > > > > > > +                     if (cp == 2)
> > > > > > > > +                       {
> > > > > > > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > > > > > > +                                      == COMPLEX_TYPE)
> > > > > > > > +                                     && (scalar_type
> > > > > > > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > > > > > > +                         tree imag = build1 (IMAGPART_EXPR, scalar_type, op);
> > > > > > > > +                         op = build1 (REALPART_EXPR, scalar_type, op);
> > > > > > > > +                         tree imag_temp = make_ssa_name (scalar_type);
> > > > > > > > +                         elts[number_of_places_left_in_vector--] = imag_temp;
> > > > > > > > +                         init_stmt = gimple_build_assign (imag_temp, imag);
> > > > > > > > +                         gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > > > > > > +                         init_stmt = gimple_build_assign (new_temp, op);
> > > > > > > > +                       }
> > > > > > > > +                     else
> > > > > > > > +                       {
> > > > > > > > +                         op = build1 (VIEW_CONVERT_EXPR, scalar_type, op);
> > > > > > > > +                         init_stmt
> > > > > > > > +                           = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > > > > > > +                                                  op);
> > > > > > > > +                       }
> > > > > > > >                     }
> > > > > > > >                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > > > > > >                   op = new_temp;
> > > > > > > > @@ -6696,15 +6782,17 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > > >    unsigned int nelts_to_build;
> > > > > > > >    unsigned int nvectors_per_build;
> > > > > > > >    unsigned int in_nlanes;
> > > > > > > > +  unsigned int cp = STMT_VINFO_COMPLEX_P (stmt_info) ? 2 : 1;
> > > > > > > >    bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
> > > > > > > > -                     && multiple_p (nunits, group_size));
> > > > > > > > +                     && multiple_p (nunits, group_size * cp));
> > > > > > > >    if (repeating_p)
> > > > > > > >      {
> > > > > > > >        /* A single vector contains a whole number of copies of the node, so:
> > > > > > > >          (a) all permutes can use the same mask; and
> > > > > > > >          (b) the permutes only need a single vector input.  */
> > > > > > > > -      mask.new_vector (nunits, group_size, 3);
> > > > > > > > -      nelts_to_build = mask.encoded_nelts ();
> > > > > > > > +      /* For complex type, mask size should be double of nelts_to_build.  */
> > > > > > > > +      mask.new_vector (nunits, group_size * cp, 3);
> > > > > > > > +      nelts_to_build = mask.encoded_nelts () / cp;
> > > > > > > >        nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
> > > > > > > >        in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
> > > > > > > >      }
> > > > > > > > @@ -6744,8 +6832,8 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > > >         {
> > > > > > > >           /* Enforced before the loop when !repeating_p.  */
> > > > > > > >           unsigned int const_nunits = nunits.to_constant ();
> > > > > > > > -         vec_index = i / const_nunits;
> > > > > > > > -         mask_element = i % const_nunits;
> > > > > > > > +         vec_index = i / (const_nunits / cp);
> > > > > > > > +         mask_element = i % (const_nunits / cp);
> > > > > > > >           if (vec_index == first_vec_index
> > > > > > > >               || first_vec_index == -1)
> > > > > > > >             {
> > > > > > > > @@ -6755,7 +6843,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > > >                    || second_vec_index == -1)
> > > > > > > >             {
> > > > > > > >               second_vec_index = vec_index;
> > > > > > > > -             mask_element += const_nunits;
> > > > > > > > +             mask_element += (const_nunits / cp);
> > > > > > > >             }
> > > > > > > >           else
> > > > > > > >             {
> > > > > > > > @@ -6768,14 +6856,24 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > > >               return false;
> > > > > > > >             }
> > > > > > > >
> > > > > > > > -         gcc_assert (mask_element < 2 * const_nunits);
> > > > > > > > +         gcc_assert (mask_element < 2 * const_nunits / cp);
> > > > > > > >         }
> > > > > > > >
> > > > > > > >        if (mask_element != index)
> > > > > > > >         noop_p = false;
> > > > > > > > -      mask[index++] = mask_element;
> > > > > > > > +      /* Set index for Complex _type.
> > > > > > > > +        i.e. mask like [1,0] is actually [2, 3, 0, 1]
> > > > > > > > +        for vector scalar type.  */
> > > > > > > > +      if (cp == 2)
> > > > > > > > +       {
> > > > > > > > +         mask[2 * index] = 2 * mask_element;
> > > > > > > > +         mask[2 * index + 1] = 2 * mask_element + 1;
> > > > > > > > +       }
> > > > > > > > +      else
> > > > > > > > +       mask[index] = mask_element;
> > > > > > > > +      index++;
> > > > > > > >
> > > > > > > > -      if (index == count && !noop_p)
> > > > > > > > +      if (index * cp == count && !noop_p)
> > > > > > > >         {
> > > > > > > >           indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
> > > > > > > >           if (!can_vec_perm_const_p (mode, mode, indices))
> > > > > > > > @@ -6799,7 +6897,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > > >           ++*n_perms;
> > > > > > > >         }
> > > > > > > >
> > > > > > > > -      if (index == count)
> > > > > > > > +      if (index * cp == count)
> > > > > > > >         {
> > > > > > > >           if (!analyze_only)
> > > > > > > >             {
> > > > > > > > @@ -6869,7 +6967,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > > >           bool load_seen = false;
> > > > > > > >           for (unsigned i = 0; i < in_nlanes; ++i)
> > > > > > > >             {
> > > > > > > > -             if (i % const_nunits == 0)
> > > > > > > > +             if (i % (const_nunits * cp) == 0)
> > > > > > > >                 {
> > > > > > > >                   if (load_seen)
> > > > > > > >                     *n_loads += 1;
> > > > > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > > > > > > index 72107afc883..8af3b558be4 100644
> > > > > > > > --- a/gcc/tree-vect-stmts.cc
> > > > > > > > +++ b/gcc/tree-vect-stmts.cc
> > > > > > > > @@ -1397,25 +1397,70 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > > > > > > >  {
> > > > > > > >    gimple *init_stmt;
> > > > > > > >    tree new_temp;
> > > > > > > > +  tree scalar_type = TREE_TYPE (type);
> > > > > > > > +  gimple_seq stmts = NULL;
> > > > > > > > +
> > > > > > > > +  if (TREE_CODE (TREE_TYPE (val)) == COMPLEX_TYPE)
> > > > > > > > +    {
> > > > > > > > +      unsigned HOST_WIDE_INT nunits;
> > > > > > > > +      gcc_assert (TYPE_VECTOR_SUBPARTS (type).is_constant (&nunits));
> > > > > > > >
> > > > > > > > +      tree_vector_builder elts (type, nunits, 1);
> > > > > > > > +      tree imag, real;
> > > > > > > > +      if (TREE_CODE (val) == COMPLEX_CST)
> > > > > > > > +       {
> > > > > > > > +         real = fold_unary (REALPART_EXPR, scalar_type, val);
> > > > > > > > +         imag = fold_unary (IMAGPART_EXPR, scalar_type, val);
> > > > > > > > +       }
> > > > > > > > +      else
> > > > > > > > +       {
> > > > > > > > +         real = make_ssa_name (scalar_type);
> > > > > > > > +         imag = make_ssa_name (scalar_type);
> > > > > > > > +         init_stmt
> > > > > > > > +           = gimple_build_assign (real,
> > > > > > > > +                                  build1 (REALPART_EXPR, scalar_type, val));
> > > > > > > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > > > > > > +         init_stmt
> > > > > > > > +           = gimple_build_assign (imag,
> > > > > > > > +                                  build1 (IMAGPART_EXPR, scalar_type, val));
> > > > > > > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > > > > > > +       }
> > > > > > > > +
> > > > > > > > +      /* Build vector as [real,imag,real,imag,...].  */
> > > > > > > > +      for (unsigned i = 0; i != nunits; i++)
> > > > > > > > +       {
> > > > > > > > +         if (i % 2)
> > > > > > > > +           elts.quick_push (imag);
> > > > > > > > +         else
> > > > > > > > +           elts.quick_push (real);
> > > > > > > > +       }
> > > > > > > > +      val = gimple_build_vector (&stmts, &elts);
> > > > > > > > +      if (!gimple_seq_empty_p (stmts))
> > > > > > > > +       {
> > > > > > > > +         if (gsi)
> > > > > > > > +           gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > > > > > > > +         else
> > > > > > > > +           vinfo->insert_seq_on_entry (stmt_info, stmts);
> > > > > > > > +       }
> > > > > > > > +    }
> > > > > > > >    /* We abuse this function to push sth to a SSA name with initial 'val'.  */
> > > > > > > > -  if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > > > > > > +  else if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > > > > > >      {
> > > > > > > >        gcc_assert (TREE_CODE (type) == VECTOR_TYPE);
> > > > > > > > -      if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
> > > > > > > > +      if (! types_compatible_p (scalar_type, TREE_TYPE (val)))
> > > > > > > >         {
> > > > > > > >           /* Scalar boolean value should be transformed into
> > > > > > > >              all zeros or all ones value before building a vector.  */
> > > > > > > >           if (VECTOR_BOOLEAN_TYPE_P (type))
> > > > > > > >             {
> > > > > > > > -             tree true_val = build_all_ones_cst (TREE_TYPE (type));
> > > > > > > > -             tree false_val = build_zero_cst (TREE_TYPE (type));
> > > > > > > > +             tree true_val = build_all_ones_cst (scalar_type);
> > > > > > > > +             tree false_val = build_zero_cst (scalar_type);
> > > > > > > >
> > > > > > > >               if (CONSTANT_CLASS_P (val))
> > > > > > > >                 val = integer_zerop (val) ? false_val : true_val;
> > > > > > > >               else
> > > > > > > >                 {
> > > > > > > > -                 new_temp = make_ssa_name (TREE_TYPE (type));
> > > > > > > > +                 new_temp = make_ssa_name (scalar_type);
> > > > > > > >                   init_stmt = gimple_build_assign (new_temp, COND_EXPR,
> > > > > > > >                                                    val, true_val, false_val);
> > > > > > > >                   vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
> > > > > > > > @@ -1424,14 +1469,13 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > > > > > > >             }
> > > > > > > >           else
> > > > > > > >             {
> > > > > > > > -             gimple_seq stmts = NULL;
> > > > > > > >               if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
> > > > > > > >                 val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
> > > > > > > > -                                   TREE_TYPE (type), val);
> > > > > > > > +                                   scalar_type, val);
> > > > > > > >               else
> > > > > > > >                 /* ???  Condition vectorization expects us to do
> > > > > > > >                    promotion of invariant/external defs.  */
> > > > > > > > -               val = gimple_convert (&stmts, TREE_TYPE (type), val);
> > > > > > > > +               val = gimple_convert (&stmts, scalar_type, val);
> > > > > > > >               for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
> > > > > > > >                    !gsi_end_p (gsi2); )
> > > > > > > >                 {
> > > > > > > > @@ -1496,7 +1540,12 @@ vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> > > > > > > >                && VECTOR_BOOLEAN_TYPE_P (stmt_vectype))
> > > > > > > >         vector_type = truth_type_for (stmt_vectype);
> > > > > > > >        else
> > > > > > > > -       vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
> > > > > > > > +       {
> > > > > > > > +         tree scalar_type = TREE_TYPE (op);
> > > > > > > > +         if (STMT_VINFO_COMPLEX_P (stmt_vinfo))
> > > > > > > > +           scalar_type = TREE_TYPE (scalar_type);
> > > > > > > > +         vector_type = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
> > > > > > > > +       }
> > > > > > > >
> > > > > > > >        gcc_assert (vector_type);
> > > > > > > >        tree vop = vect_init_vector (vinfo, stmt_vinfo, op, vector_type, NULL);
> > > > > > > > @@ -7509,8 +7558,17 @@ vectorizable_store (vec_info *vinfo,
> > > > > > > >       same location twice.  */
> > > > > > > >    gcc_assert (slp == PURE_SLP_STMT (stmt_info));
> > > > > > > >
> > > > > > > > +  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > > > > > > +    return false;
> > > > > > > > +
> > > > > > > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info), rhs_vectype = NULL_TREE;
> > > > > > > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > > +    {
> > > > > > > > +      if (!nunits.is_constant ())
> > > > > > > > +       return false;
> > > > > > > > +      nunits = exact_div (nunits, 2);
> > > > > > > > +    }
> > > > > > > >
> > > > > > > >    if (loop_vinfo)
> > > > > > > >      {
> > > > > > > > @@ -7526,7 +7584,8 @@ vectorizable_store (vec_info *vinfo,
> > > > > > > >    if (slp)
> > > > > > > >      ncopies = 1;
> > > > > > > >    else
> > > > > > > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > > > > > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > > > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > > > > >
> > > > > > > >    gcc_assert (ncopies >= 1);
> > > > > > > >
> > > > > > > > @@ -7546,9 +7605,6 @@ vectorizable_store (vec_info *vinfo,
> > > > > > > >    elem_type = TREE_TYPE (vectype);
> > > > > > > >    vec_mode = TYPE_MODE (vectype);
> > > > > > > >
> > > > > > > > -  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > > > > > > -    return false;
> > > > > > > > -
> > > > > > > >    vect_memory_access_type memory_access_type;
> > > > > > > >    enum dr_alignment_support alignment_support_scheme;
> > > > > > > >    int misalignment;
> > > > > > > > @@ -8778,6 +8834,12 @@ vectorizable_load (vec_info *vinfo,
> > > > > > > >
> > > > > > > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> > > > > > > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > > +    {
> > > > > > > > +      if (!nunits.is_constant ())
> > > > > > > > +       return false;
> > > > > > > > +      nunits = exact_div (nunits, 2);
> > > > > > > > +    }
> > > > > > > >
> > > > > > > >    if (loop_vinfo)
> > > > > > > >      {
> > > > > > > > @@ -8794,7 +8856,8 @@ vectorizable_load (vec_info *vinfo,
> > > > > > > >    if (slp)
> > > > > > > >      ncopies = 1;
> > > > > > > >    else
> > > > > > > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > > > > > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > > > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > > > > >
> > > > > > > >    gcc_assert (ncopies >= 1);
> > > > > > > >
> > > > > > > > @@ -8870,8 +8933,11 @@ vectorizable_load (vec_info *vinfo,
> > > > > > > >                 if (k > maxk)
> > > > > > > >                   maxk = k;
> > > > > > > >               tree vectype = SLP_TREE_VECTYPE (slp_node);
> > > > > > > > +             /* For complex type, half the nunits.  */
> > > > > > > >               if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> > > > > > > > -                 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> > > > > > > > +                 || maxk >= (DR_GROUP_SIZE (group_info)
> > > > > > > > +                             & ~((STMT_VINFO_COMPLEX_P (group_info)
> > > > > > > > +                                  ? nunits >> 1 : nunits) - 1)))
> > > > > > > >                 {
> > > > > > > >                   if (dump_enabled_p ())
> > > > > > > >                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > > > > @@ -12499,12 +12565,27 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > > >             dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > > >                              "get vectype for scalar type: %T\n", scalar_type);
> > > > > > > >         }
> > > > > > > > +
> > > > > > > > +      tree orig_scalar_type = scalar_type;
> > > > > > > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > > > > > > +       {
> > > > > > > > +         /* Set complex_p for BB vectorizer.  */
> > > > > > > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > > > > +         scalar_type = TREE_TYPE (scalar_type);
> > > > > > > > +         /* Double group_size for BB vectorizer to make
> > > > > > > > +            following 2 get_vectype_for_scalar_type return wanted vectype.
> > > > > > > > +            Real group size is not changed, just make the "faked" input
> > > > > > > > +            group_size.  */
> > > > > > > > +         group_size *= 2;
> > > > > > > > +       }
> > > > > > > >        vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
> > > > > > > > -      if (!vectype)
> > > > > > > > +      if (!vectype
> > > > > > > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > > > > +             && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()))
> > > > > > > >         return opt_result::failure_at (stmt,
> > > > > > > >                                        "not vectorized:"
> > > > > > > >                                        " unsupported data-type %T\n",
> > > > > > > > -                                      scalar_type);
> > > > > > > > +                                      orig_scalar_type);
> > > > > > > >
> > > > > > > >        if (dump_enabled_p ())
> > > > > > > >         dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
> > > > > > > > @@ -12529,16 +12610,30 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > > >                                                    TREE_TYPE (vectype));
> > > > > > > >        if (scalar_type != TREE_TYPE (vectype))
> > > > > > > >         {
> > > > > > > > -         if (dump_enabled_p ())
> > > > > > > > +         tree orig_scalar_type = scalar_type;
> > > > > > > > +         if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > > > > > > +           {
> > > > > > > > +             /* Set complex_p for Loop vectorizer.  */
> > > > > > > > +             STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > > > > +             scalar_type = TREE_TYPE (scalar_type);
> > > > > > > > +             if (dump_enabled_p ())
> > > > > > > > +               dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > > > +                            "get complex for smallest scalar type: %T\n",
> > > > > > > > +                            scalar_type);
> > > > > > > > +
> > > > > > > > +           }
> > > > > > > > +         else if (dump_enabled_p ())
> > > > > > > >             dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > > >                              "get vectype for smallest scalar type: %T\n",
> > > > > > > >                              scalar_type);
> > > > > > > >           nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
> > > > > > > >                                                         group_size);
> > > > > > > > -         if (!nunits_vectype)
> > > > > > > > +         if (!nunits_vectype
> > > > > > > > +             || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > > > > +                 && !TYPE_VECTOR_SUBPARTS (nunits_vectype).is_constant ()))
> > > > > > > >             return opt_result::failure_at
> > > > > > > >               (stmt, "not vectorized: unsupported data-type %T\n",
> > > > > > > > -              scalar_type);
> > > > > > > > +              orig_scalar_type);
> > > > > > > >           if (dump_enabled_p ())
> > > > > > > >             dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
> > > > > > > >                              nunits_vectype);
> > > > > > > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > > > > > > index e5fdc9e0a14..4a809e492c4 100644
> > > > > > > > --- a/gcc/tree-vectorizer.h
> > > > > > > > +++ b/gcc/tree-vectorizer.h
> > > > > > > > @@ -1161,6 +1161,9 @@ public:
> > > > > > > >       vectorization.  */
> > > > > > > >    bool vectorizable;
> > > > > > > >
> > > > > > > > +  /* The scalar type of the LHS of this statement is complex type.  */
> > > > > > > > +  bool complex_p;
> > > > > > > > +
> > > > > > > >    /* The stmt to which this info struct refers to.  */
> > > > > > > >    gimple *stmt;
> > > > > > > >
> > > > > > > > @@ -1395,6 +1398,7 @@ struct gather_scatter_info {
> > > > > > > >  #define STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT(S) (S)->reduc_epilogue_adjustment
> > > > > > > >  #define STMT_VINFO_REDUC_IDX(S)                   (S)->reduc_idx
> > > > > > > >  #define STMT_VINFO_FORCE_SINGLE_CYCLE(S)   (S)->force_single_cycle
> > > > > > > > +#define STMT_VINFO_COMPLEX_P(S)            (S)->complex_p
> > > > > > > >
> > > > > > > >  #define STMT_VINFO_DR_WRT_VEC_LOOP(S)      (S)->dr_wrt_vec_loop
> > > > > > > >  #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_wrt_vec_loop.base_address
> > > > > > > > @@ -1970,6 +1974,15 @@ vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype)
> > > > > > > >    return vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo), vectype);
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static inline unsigned int
> > > > > > > > +vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype, bool complex_p)
> > > > > > > > +{
> > > > > > > > +  poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > > > > > > +  if (complex_p)
> > > > > > > > +    nunits *= 2;
> > > > > > > > +  return vect_get_num_vectors (nunits, vectype);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /* Update maximum unit count *MAX_NUNITS so that it accounts for
> > > > > > > >     NUNITS.  *MAX_NUNITS can be 1 if we haven't yet recorded anything.  */
> > > > > > > >
> > > > > > > > --
> > > > > > > > 2.18.1
> > > > > > > >
> > > > > >
> > > > > >
> > > > > >
> > > > > > --
> > > > > > BR,
> > > > > > Hongtao
> > > >
> > > >
> > > >
> > > > --
> > > > BR,
> > > > Hongtao
>
>
>
> --
> BR,
> Hongtao
Richard Biener July 14, 2022, 9:51 a.m. UTC | #10
On Thu, Jul 14, 2022 at 11:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Thu, Jul 14, 2022 at 4:53 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Thu, Jul 14, 2022 at 4:20 PM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Wed, Jul 13, 2022 at 9:34 AM Richard Biener
> > > <richard.guenther@gmail.com> wrote:
> > > >
> > > > On Wed, Jul 13, 2022 at 6:47 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Tue, Jul 12, 2022 at 10:12 PM Richard Biener
> > > > > <richard.guenther@gmail.com> wrote:
> > > > > >
> > > > > > On Tue, Jul 12, 2022 at 6:11 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > > >
> > > > > > > On Mon, Jul 11, 2022 at 7:47 PM Richard Biener via Gcc-patches
> > > > > > > <gcc-patches@gcc.gnu.org> wrote:
> > > > > > > >
> > > > > > > > On Mon, Jul 11, 2022 at 5:44 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > > > > > >
> > > > > > > > > > The patch only handles load/store (including ctor/permutation, except
> > > > > > > > > > gather/scatter) for complex type; other operations don't need to be
> > > > > > > > > > handled since they will be lowered by pass cplxlower. (MASK_LOAD is not
> > > > > > > > > > supported for complex type, so there is no need to handle it either.)
> > > > > > > >
> > > > > > > > (*)
> > > > > > > >
> > > > > > > > > > Instead of supporting vector(2) _Complex double, this patch takes vector(4)
> > > > > > > > > > double as the vector type of _Complex double. Since the vectorizer originally
> > > > > > > > > > takes TYPE_VECTOR_SUBPARTS as nunits, which is not true for complex
> > > > > > > > > > type, the patch handles nunits/ncopies/vf specially for complex type.
> > > > > > > >
> > > > > > > > For the limited set above(*) can you explain what's "special" about
> > > > > > > > vector(2) _Complex
> > > > > > > > vs. vector(4) double, thus why we need to have STMT_VINFO_COMPLEX_P at all?
> > > > > > > Supporting a vector(2) complex  is a straightforward idea, just like
> > > > > > > supporting other scalar type in vectorizer, but it requires more
> > > > > > > efforts(in the backend and frontend), considering that most of
> > > > > > > operations of complex type will be lowered into realpart and imagpart
> > > > > > > operations, supporting a vector(2) complex does not look that
> > > > > > > necessary. Then it comes up with supporting vector(4) double(with
> > > > > > > adjustment of vf/ctor/permutation), the vectorizer only needs to
> > > > > > > handle the vectorization of the move operation of the complex type(no
> > > > > > > need to worry about wrongly mapping vector(4) double multiplication to
> > > > > > > complex type multiplication since it's already lowered before
> > > > > > > vectorizer).
> > > > > > > stmt_info does not record the scalar type, in order to avoid duplicate
> > > > > > > operation like getting a lhs type from stmt to determine whether it is
> > > > > > > a complex type, STMT_VINFO_COMPLEX_P bit is added, this bit is mainly
> > > > > > > > initialized in vect_analyze_data_refs and
> > > > > > > > vect_get_vector_types_for_stmt.
> > > > > > > >
> > > > > > > > I wonder to what extent your handling can be extended to support re-vectorizing
> > > > > > > > (with a higher VF for example) already vectorized code?  The vectorizer giving
> > > > > > > > up on vector(2) double looks quite obviously similar to it giving up
> > > > > > > > on _Complex double ...
> > > > > > > > Yes, it can be extended to vector(2) double/float/int/... with a bit of
> > > > > > > > adjustment (extracting elements by using bit_field instead of
> > > > > > > > imagpart_expr/realpart_expr).
> > > > > > > > It would be a shame to not use the same underlying mechanism for dealing with
> > > > > > > > both, where for the vector case obviously vector(4) would be supported as well.
> > > > > > > >
> > > > > > > > In principle _Complex double operations should be two SLP lanes but it seems you
> > > > > > > > are handling them with classical interleaving as well?
> > > > > > > I'm only handling move operations, for other operations it will be
> > > > > > > lowered to realpart and imagpart and thus two SLP lanes.
> > > > > >
> > > > > > Yes, I understood that.
> > > > > >
> > > > > > Doing it more general (and IMHO better) would involve enhancing
> > > > > > how we represent dataref groups, maintaining the number of scalars
> > > > > > covered by each of the vinfos.  On the SLP representation side it
> > > > > > probably requires to rely on the representative for access and not
> > > > > > on the scalar stmts (since those do not map properly to the lanes).
> > > > > >
> > > > > > Ideally we'd be able to handle
> > > > > >
> > > > > > struct { _Complex double c; double a; double b; } a[], b[];
> > > > > >
> > > > > > void foo ()
> > > > > > {
> > > > > >    for (int i = 0; i < 100; ++i)
> > > > > >     {
> > > > > >       a[i].c = b[i].c;
> > > > > >       a[i].a = b[i].a;
> > > > > >       a[i].b = b[i].b;
> > > > > >     }
> > > > > > }
> > > > > >
> > > > > > which I guess your patch doesn't handle with plain AVX vector
> > > > > > copies but instead uses interleaving for the _Complex and non-_Complex
> > > > > > parts?
> > > > > Indeed, it produces wrong code.
> > > >
> > > > For _Complex, in case we don't get to the "true and only" solution it
> > > > might be easier to split the loads and stores when it's just memory
> > > > copies and we have vectorization enabled and a supported vector
> > > > mode that would surely re-assemble them (store-merging doesn't seem
> > > > to do that).
> > > >
> > > > Btw, we seem to produce
> > > >
> > > >         movsd   b(%rip), %xmm0
> > > >         movsd   %xmm0, a(%rip)
> > > >         movsd   b+8(%rip), %xmm0
> > > >         movsd   %xmm0, a+8(%rip)
> > > >
> > > > for a _Complex double memory copy on x86 which means we lack
> > > > true DCmode support (pseudos get decomposed).  Not sure if we
> > > > can somehow check whether a target has DCmode load/store
> > > > support and key decomposing on that (maybe check the SET optab).
> > > >
> > > > It might be possible to check
> > > >
> > > > _Complex double a, b;
> > > > void bar()
> > > > {
> > > >   a = b;
> > > > }
> > > >
> > > > for all targets with a cc1 cross to see whether they somehow get
> > > > loads/stores _not_ decomposed (also check _Complex float,
> > > > I wouldn't worry for _Complex int or _Complex long double).
> > >
> > > Btw, a point for doing the above is that we already do it!  There just
> > > needs to be an (unrelated) complex op in the function:
> > >
> > > _Complex float a[2], b[2];
> > > _Complex double foo(_Complex double x, _Complex double y)
> > > {
> > >   a[0] = b[0];
> > >   a[1] = b[1];
> > >   return x + y;
> > > }
> > >
> > > vs
> > >
> > > void bar()
> > > {
> > >   a[0] = b[0];
> > >   a[1] = b[1];
> > > }
> > >
> > > > the key difference is that tree_lower_complex returns early here:
> > >
> > >   if (!init_dont_simulate_again ())
> > >     return 0;
> > >
> > > that returns whether it saw any complex op.
> > >
> > > diff --git a/gcc/tree-complex.cc b/gcc/tree-complex.cc
> > > index 61950a0f099..bdcb9968af1 100644
> > > --- a/gcc/tree-complex.cc
> > > +++ b/gcc/tree-complex.cc
> > > @@ -297,6 +297,11 @@ init_dont_simulate_again (void)
> > >                 break;
> > >
> > >               default:
> > > +               /* When expand_complex_move would trigger make sure we
> > > +                  perform lowering even when there is no actual complex
> > > +                  operation.  This helps consistency and vectorization.  */
> > > +               if (TREE_CODE (TREE_TYPE (gimple_op (stmt, 0))) == COMPLEX_TYPE)
> > > +                 saw_a_complex_op = true;
> > >                 break;
> > >               }
> > >
> > Let me try this.
> > > fixes that.  If this change tests OK (and fixes your set of new
> > > vectorizer testcases)
> > The direct purpose of my patch is to support vectorization of the
> > complex type move, and the indirect purpose is to support automatic
> > vectorization of the complex type libmvec. For example, vectorization
> > > of the following case:
> > void
> > foo (_Complex double* a, _Complex double* b)
> > {
> >   for (int i = 0; i != 100; i++)
> >   a[i] = csin[b[i]];
> > }
> >
> 7918  _8 = REALPART_EXPR <*_3>;
> 7919  _7 = IMAGPART_EXPR <*_3>;
> 7920  _4 = COMPLEX_EXPR <_8, _7>;
> 7921  _5 = a_11(D) + _2;
> 7922  _6 = csin (_4);
> 7923  _15 = REALPART_EXPR <_6>;
> 7924  _14 = IMAGPART_EXPR <_6>;
>
> We still have a complex type in the loop, and will fail to get the
> corresponding vector type.

Yes, but that's a different issue from memory-memory ops.  The
prime example for this issue is, btw, sincos vectorization, which GCC
represents as a cexpi() call with a complex return type.

> 11464get_related_vectype_for_scalar_type (machine_mode prevailing_mode,
> 11465                                     tree scalar_type, poly_uint64 nunits)
> 11466{
> 11467  tree orig_scalar_type = scalar_type;
> 11468  scalar_mode inner_mode;
> 11469  machine_mode simd_mode;
> 11470  tree vectype;
> 11471
> 11472  if (!is_int_mode (TYPE_MODE (scalar_type), &inner_mode)
> 11473      && !is_float_mode (TYPE_MODE (scalar_type), &inner_mode))
> 11474    return NULL_TREE; ------------ here.
>
> I'm not sure if there's a good way to handle that.

That would need "scalars with multiple lanes" support to be
handled properly.  I was briefly playing with this but it will be
non-trivial (maybe the non-memory case is easier).

Richard.

> > GCC already supports vectorization for sin, but not for csin.
> > > then I think that's the way to go for the immediate issue of
> > > vectorizing _Complex.
> > >
> > > Richard.
> > >
> > > > Richard.
> > > >
> > > > > > Let me spend some time fleshing out what is necessary to make
> > > > > > this work "properly".  We can consider your special-casing of _Complex
> > > > > > memory ops if I can't manage to assess the complexity of the task.
> > > > > >
> > > > > > Thanks,
> > > > > > Richard.
> > > > > >
> > > > > > > >
> > > > > > > > Thanks,
> > > > > > > > Richard.
> > > > > > > >
> > > > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > > > > > > Also test the patch for SPEC2017 and find there's complex type vectorization
> > > > > > > > > in 510/549(but no performance impact).
> > > > > > > > >
> > > > > > > > > Any comments?
> > > > > > > > >
> > > > > > > > > gcc/ChangeLog:
> > > > > > > > >
> > > > > > > > >         PR tree-optimization/106010
> > > > > > > > >         * tree-vect-data-refs.cc (vect_get_data_access_cost):
> > > > > > > > >         Pass complex_p to vect_get_num_copies to avoid ICE.
> > > > > > > > >         (vect_analyze_data_refs): Support vectorization for Complex
> > > > > > > > >         type with vector scalar types.
> > > > > > > > >         * tree-vect-loop.cc (vect_determine_vf_for_stmt_1): VF should
> > > > > > > > >         be half of TYPE_VECTOR_SUBPARTS when complex_p.
> > > > > > > > >         * tree-vect-slp.cc (vect_record_max_nunits): nunits should be
> > > > > > > > >         half of TYPE_VECTOR_SUBPARTS when complex_p.
> > > > > > > > >         (vect_optimize_slp): Support permutation for complex type.
> > > > > > > > >         (vect_slp_analyze_node_operations_1): Double nunits in
> > > > > > > > >         vect_get_num_vectors to get right SLP_TREE_NUMBER_OF_VEC_STMTS
> > > > > > > > >         when complex_p.
> > > > > > > > >         (vect_slp_analyze_node_operations): Ditto.
> > > > > > > > >         (vect_create_constant_vectors): Support CTOR for complex type.
> > > > > > > > >         (vect_transform_slp_perm_load): Support permutation for
> > > > > > > > >         complex type.
> > > > > > > > >         * tree-vect-stmts.cc (vect_init_vector): Support complex type.
> > > > > > > > >         (vect_get_vec_defs_for_operand): Get vector type for
> > > > > > > > >         complex type.
> > > > > > > > >         (vectorizable_store): Get right ncopies/nunits for complex
> > > > > > > > >         type, also return false when complex_p and
> > > > > > > > >         !TYPE_VECTOR_SUBPARTS.is_constant ().
> > > > > > > > >         (vectorizable_load): Ditto.
> > > > > > > > >         (vect_get_vector_types_for_stmt): Get vector type for complex type.
> > > > > > > > >         * tree-vectorizer.h (STMT_VINFO_COMPLEX_P): New macro.
> > > > > > > > >         (vect_get_num_copies): New overload.
> > > > > > > > >
> > > > > > > > > gcc/testsuite/ChangeLog:
> > > > > > > > >
> > > > > > > > >         * gcc.target/i386/pr106010-1a.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-1b.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-1c.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-2a.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-2b.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-2c.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-3a.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-3b.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-3c.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-4a.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-4b.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-4c.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-5a.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-5b.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-5c.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-6a.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-6b.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-6c.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-7a.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-7b.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-7c.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-8a.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-8b.c: New test.
> > > > > > > > >         * gcc.target/i386/pr106010-8c.c: New test.
> > > > > > > > > ---
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 +++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 +++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 +++++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 +++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 +++++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 ++++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 ++++++++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 ++++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 ++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 +++++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 +++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 +++++++++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 +++++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 +++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 +++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 +++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 ++++++
> > > > > > > > >  gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
> > > > > > > > >  gcc/tree-vect-data-refs.cc                  |  26 ++-
> > > > > > > > >  gcc/tree-vect-loop.cc                       |   7 +-
> > > > > > > > >  gcc/tree-vect-slp.cc                        | 174 +++++++++++++++-----
> > > > > > > > >  gcc/tree-vect-stmts.cc                      | 135 ++++++++++++---
> > > > > > > > >  gcc/tree-vectorizer.h                       |  13 ++
> > > > > > > > >  29 files changed, 2064 insertions(+), 63 deletions(-)
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > > > > >
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..b608f484934
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > > > > > > > > @@ -0,0 +1,58 @@
> > > > > > > > > +/* { dg-do compile } */
> > > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> > > > > > > > > +
> > > > > > > > > +#define N 10000
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_pd (_Complex double* a, _Complex double* b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b[i];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ps (_Complex float* a, _Complex float* b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b[i];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b[i];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi32 (_Complex int* a, _Complex int* b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b[i];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi16 (_Complex short* a, _Complex short* b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b[i];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi8 (_Complex char* a, _Complex char* b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b[i];
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..0f377c3a548
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > > > > > > > > @@ -0,0 +1,63 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > > > +
> > > > > > > > > +#include "avx-check.h"
> > > > > > > > > +#include <string.h>
> > > > > > > > > +#include "pr106010-1a.c"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +avx_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > > > > > > > +    p_init[i] = i;
> > > > > > > > > +
> > > > > > > > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > > > > > > > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > > > > > > > > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > > > > > > > > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > > > > > > > > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > > > > > > > > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > > > > > > > > +
> > > > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +
> > > > > > > > > +  return;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..f07e9fb2d3d
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > > > > > > > > @@ -0,0 +1,41 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> > > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > > +
> > > > > > > > > +#include <string.h>
> > > > > > > > > +
> > > > > > > > > +static void do_test (void);
> > > > > > > > > +
> > > > > > > > > +#define DO_TEST do_test
> > > > > > > > > +#define AVX512FP16
> > > > > > > > > +#include "avx512-check.h"
> > > > > > > > > +
> > > > > > > > > +#define N 10000
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b[i];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static void
> > > > > > > > > +do_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > > > > > > > +    p_init[i] = i;
> > > > > > > > > +
> > > > > > > > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > > > > > > > +
> > > > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..d2e2f8d4f43
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > > > > > > > > @@ -0,0 +1,82 @@
> > > > > > > > > +/* { dg-do compile } */
> > > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[0];
> > > > > > > > > +  a[1] = b[1];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[0];
> > > > > > > > > +  a[1] = b[1];
> > > > > > > > > +  a[2] = b[2];
> > > > > > > > > +  a[3] = b[3];
> > > > > > > > > +
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[0];
> > > > > > > > > +  a[1] = b[1];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[0];
> > > > > > > > > +  a[1] = b[1];
> > > > > > > > > +  a[2] = b[2];
> > > > > > > > > +  a[3] = b[3];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[0];
> > > > > > > > > +  a[1] = b[1];
> > > > > > > > > +  a[2] = b[2];
> > > > > > > > > +  a[3] = b[3];
> > > > > > > > > +  a[4] = b[4];
> > > > > > > > > +  a[5] = b[5];
> > > > > > > > > +  a[6] = b[6];
> > > > > > > > > +  a[7] = b[7];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[0];
> > > > > > > > > +  a[1] = b[1];
> > > > > > > > > +  a[2] = b[2];
> > > > > > > > > +  a[3] = b[3];
> > > > > > > > > +  a[4] = b[4];
> > > > > > > > > +  a[5] = b[5];
> > > > > > > > > +  a[6] = b[6];
> > > > > > > > > +  a[7] = b[7];
> > > > > > > > > +  a[8] = b[8];
> > > > > > > > > +  a[9] = b[9];
> > > > > > > > > +  a[10] = b[10];
> > > > > > > > > +  a[11] = b[11];
> > > > > > > > > +  a[12] = b[12];
> > > > > > > > > +  a[13] = b[13];
> > > > > > > > > +  a[14] = b[14];
> > > > > > > > > +  a[15] = b[15];
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..ac360752693
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > > > > > > > > @@ -0,0 +1,62 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > > > +
> > > > > > > > > +#include "avx-check.h"
> > > > > > > > > +#include <string.h>
> > > > > > > > > +#include "pr106010-2a.c"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +avx_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > > > +    p[i] = i;
> > > > > > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > > > > > +
> > > > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +
> > > > > > > > > +  return;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..a002f209ec9
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > > > > > > > > @@ -0,0 +1,47 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > > +
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > > > > +
> > > > > > > > > +#include <string.h>
> > > > > > > > > +
> > > > > > > > > +static void do_test (void);
> > > > > > > > > +#define DO_TEST do_test
> > > > > > > > > +#define AVX512FP16
> > > > > > > > > +#include "avx512-check.h"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[0];
> > > > > > > > > +  a[1] = b[1];
> > > > > > > > > +  a[2] = b[2];
> > > > > > > > > +  a[3] = b[3];
> > > > > > > > > +  a[4] = b[4];
> > > > > > > > > +  a[5] = b[5];
> > > > > > > > > +  a[6] = b[6];
> > > > > > > > > +  a[7] = b[7];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +do_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > > > +
> > > > > > > > > +   __builtin_memset (ph_dst, 0, 32);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > > > +    p[i] = i;
> > > > > > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > > > > > +
> > > > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +
> > > > > > > > > +  return;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..c1b64b56b1c
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > > > > > > > > @@ -0,0 +1,80 @@
> > > > > > > > > +/* { dg-do compile } */
> > > > > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[1];
> > > > > > > > > +  a[1] = b[0];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[1];
> > > > > > > > > +  a[1] = b[0];
> > > > > > > > > +  a[2] = b[3];
> > > > > > > > > +  a[3] = b[2];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[1];
> > > > > > > > > +  a[1] = b[0];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[3];
> > > > > > > > > +  a[1] = b[2];
> > > > > > > > > +  a[2] = b[1];
> > > > > > > > > +  a[3] = b[0];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[7];
> > > > > > > > > +  a[1] = b[6];
> > > > > > > > > +  a[2] = b[5];
> > > > > > > > > +  a[3] = b[4];
> > > > > > > > > +  a[4] = b[3];
> > > > > > > > > +  a[5] = b[2];
> > > > > > > > > +  a[6] = b[1];
> > > > > > > > > +  a[7] = b[0];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[7];
> > > > > > > > > +  a[1] = b[6];
> > > > > > > > > +  a[2] = b[5];
> > > > > > > > > +  a[3] = b[4];
> > > > > > > > > +  a[4] = b[3];
> > > > > > > > > +  a[5] = b[2];
> > > > > > > > > +  a[6] = b[1];
> > > > > > > > > +  a[7] = b[0];
> > > > > > > > > +  a[8] = b[15];
> > > > > > > > > +  a[9] = b[14];
> > > > > > > > > +  a[10] = b[13];
> > > > > > > > > +  a[11] = b[12];
> > > > > > > > > +  a[12] = b[11];
> > > > > > > > > +  a[13] = b[10];
> > > > > > > > > +  a[14] = b[9];
> > > > > > > > > +  a[15] = b[8];
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..e4fa3f3a541
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > > > > > > > > @@ -0,0 +1,126 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-require-effective-target avx2 } */
> > > > > > > > > +
> > > > > > > > > +#include "avx2-check.h"
> > > > > > > > > +#include <string.h>
> > > > > > > > > +#include "pr106010-3a.c"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +avx2_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (32);
> > > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (32);
> > > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> > > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (32);
> > > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (32);
> > > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (32);
> > > > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > > > +  char* q = (char* ) malloc (32);
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > > > +    p[i] = i;
> > > > > > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 16; i++)
> > > > > > > > > +    {
> > > > > > > > > +      p[i] = i + 16;
> > > > > > > > > +      p[i + 16] = i;
> > > > > > > > > +    }
> > > > > > > > > +  __builtin_memcpy (pd_exp, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi64_exp, p, 32);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 8; i++)
> > > > > > > > > +    {
> > > > > > > > > +      p[i] = i + 8;
> > > > > > > > > +      p[i + 8] = i;
> > > > > > > > > +      p[i + 16] = i + 24;
> > > > > > > > > +      p[i + 24] = i + 16;
> > > > > > > > > +      q[i] = i + 24;
> > > > > > > > > +      q[i + 8] = i + 16;
> > > > > > > > > +      q[i + 16] = i + 8;
> > > > > > > > > +      q[i + 24] = i;
> > > > > > > > > +    }
> > > > > > > > > +  __builtin_memcpy (ps_exp, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi32_exp, q, 32);
> > > > > > > > > +
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > > > > +    {
> > > > > > > > > +      q[i] = i + 28;
> > > > > > > > > +      q[i + 4] = i + 24;
> > > > > > > > > +      q[i + 8] = i + 20;
> > > > > > > > > +      q[i + 12] = i + 16;
> > > > > > > > > +      q[i + 16] = i + 12;
> > > > > > > > > +      q[i + 20] = i + 8;
> > > > > > > > > +      q[i + 24] = i + 4;
> > > > > > > > > +      q[i + 28] = i;
> > > > > > > > > +    }
> > > > > > > > > +  __builtin_memcpy (epi16_exp, q, 32);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 2; i++)
> > > > > > > > > +    {
> > > > > > > > > +      q[i] = i + 14;
> > > > > > > > > +      q[i + 2] = i + 12;
> > > > > > > > > +      q[i + 4] = i + 10;
> > > > > > > > > +      q[i + 6] = i + 8;
> > > > > > > > > +      q[i + 8] = i + 6;
> > > > > > > > > +      q[i + 10] = i + 4;
> > > > > > > > > +      q[i + 12] = i + 2;
> > > > > > > > > +      q[i + 14] = i;
> > > > > > > > > +      q[i + 16] = i + 30;
> > > > > > > > > +      q[i + 18] = i + 28;
> > > > > > > > > +      q[i + 20] = i + 26;
> > > > > > > > > +      q[i + 22] = i + 24;
> > > > > > > > > +      q[i + 24] = i + 22;
> > > > > > > > > +      q[i + 26] = i + 20;
> > > > > > > > > +      q[i + 28] = i + 18;
> > > > > > > > > +      q[i + 30] = i + 16;
> > > > > > > > > +    }
> > > > > > > > > +  __builtin_memcpy (epi8_exp, q, 32);
> > > > > > > > > +
> > > > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +
> > > > > > > > > +  return;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..5a5a3d4b992
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > > > > > > > > @@ -0,0 +1,69 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
> > > > > > > > > +
> > > > > > > > > +#include <string.h>
> > > > > > > > > +
> > > > > > > > > +static void do_test (void);
> > > > > > > > > +#define DO_TEST do_test
> > > > > > > > > +#define AVX512FP16
> > > > > > > > > +#include "avx512-check.h"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[1];
> > > > > > > > > +  a[1] = b[0];
> > > > > > > > > +  a[2] = b[4];
> > > > > > > > > +  a[3] = b[3];
> > > > > > > > > +  a[4] = b[7];
> > > > > > > > > +  a[5] = b[6];
> > > > > > > > > +  a[6] = b[2];
> > > > > > > > > +  a[7] = b[5];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +do_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> > > > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > > > +  char* q = (char* ) malloc (32);
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (ph_dst, 0, 32);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > > > +    p[i] = i;
> > > > > > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > > > > +    {
> > > > > > > > > +      p[i] = i + 4;
> > > > > > > > > +      p[i + 4] = i;
> > > > > > > > > +      p[i + 8] = i + 16;
> > > > > > > > > +      p[i + 12] = i + 12;
> > > > > > > > > +      p[i + 16] = i + 28;
> > > > > > > > > +      p[i + 20] = i + 24;
> > > > > > > > > +      p[i + 24] = i + 8;
> > > > > > > > > +      p[i + 28] = i + 20;
> > > > > > > > > +      q[i] = i + 28;
> > > > > > > > > +      q[i + 4] = i + 24;
> > > > > > > > > +      q[i + 8] = i + 20;
> > > > > > > > > +      q[i + 12] = i + 16;
> > > > > > > > > +      q[i + 16] = i + 12;
> > > > > > > > > +      q[i + 20] = i + 8;
> > > > > > > > > +      q[i + 24] = i + 4;
> > > > > > > > > +      q[i + 28] = i;
> > > > > > > > > +    }
> > > > > > > > > +  __builtin_memcpy (ph_exp, p, 32);
> > > > > > > > > +
> > > > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +
> > > > > > > > > +  return;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..b7b0b532bb1
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > > > > > > > > @@ -0,0 +1,101 @@
> > > > > > > > > +/* { dg-do compile } */
> > > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_pd (_Complex double* a,
> > > > > > > > > +       _Complex double b1,
> > > > > > > > > +       _Complex double b2)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b1;
> > > > > > > > > +  a[1] = b2;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ps (_Complex float* a,
> > > > > > > > > +       _Complex float b1, _Complex float b2,
> > > > > > > > > +       _Complex float b3, _Complex float b4)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b1;
> > > > > > > > > +  a[1] = b2;
> > > > > > > > > +  a[2] = b3;
> > > > > > > > > +  a[3] = b4;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi64 (_Complex long long* a,
> > > > > > > > > +          _Complex long long b1,
> > > > > > > > > +          _Complex long long b2)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b1;
> > > > > > > > > +  a[1] = b2;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi32 (_Complex int* a,
> > > > > > > > > +          _Complex int b1, _Complex int b2,
> > > > > > > > > +          _Complex int b3, _Complex int b4)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b1;
> > > > > > > > > +  a[1] = b2;
> > > > > > > > > +  a[2] = b3;
> > > > > > > > > +  a[3] = b4;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi16 (_Complex short* a,
> > > > > > > > > +          _Complex short b1, _Complex short b2,
> > > > > > > > > +          _Complex short b3, _Complex short b4,
> > > > > > > > > +          _Complex short b5, _Complex short b6,
> > > > > > > > > +          _Complex short b7,_Complex short b8)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b1;
> > > > > > > > > +  a[1] = b2;
> > > > > > > > > +  a[2] = b3;
> > > > > > > > > +  a[3] = b4;
> > > > > > > > > +  a[4] = b5;
> > > > > > > > > +  a[5] = b6;
> > > > > > > > > +  a[6] = b7;
> > > > > > > > > +  a[7] = b8;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi8 (_Complex char* a,
> > > > > > > > > +         _Complex char b1, _Complex char b2,
> > > > > > > > > +         _Complex char b3, _Complex char b4,
> > > > > > > > > +         _Complex char b5, _Complex char b6,
> > > > > > > > > +         _Complex char b7,_Complex char b8,
> > > > > > > > > +         _Complex char b9, _Complex char b10,
> > > > > > > > > +         _Complex char b11, _Complex char b12,
> > > > > > > > > +         _Complex char b13, _Complex char b14,
> > > > > > > > > +         _Complex char b15,_Complex char b16)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b1;
> > > > > > > > > +  a[1] = b2;
> > > > > > > > > +  a[2] = b3;
> > > > > > > > > +  a[3] = b4;
> > > > > > > > > +  a[4] = b5;
> > > > > > > > > +  a[5] = b6;
> > > > > > > > > +  a[6] = b7;
> > > > > > > > > +  a[7] = b8;
> > > > > > > > > +  a[8] = b9;
> > > > > > > > > +  a[9] = b10;
> > > > > > > > > +  a[10] = b11;
> > > > > > > > > +  a[11] = b12;
> > > > > > > > > +  a[12] = b13;
> > > > > > > > > +  a[13] = b14;
> > > > > > > > > +  a[14] = b15;
> > > > > > > > > +  a[15] = b16;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..e2e79508c4b
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > > > > > > > > @@ -0,0 +1,67 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > > > +
> > > > > > > > > +#include "avx-check.h"
> > > > > > > > > +#include <string.h>
> > > > > > > > > +#include "pr106010-4a.c"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +avx_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > > > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (pd_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (ps_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (epi64_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (epi32_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (epi16_dst, 0, 32);
> > > > > > > > > +  __builtin_memset (epi8_dst, 0, 32);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > > > +    p[i] = i;
> > > > > > > > > +  __builtin_memcpy (pd_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (ps_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi64_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi32_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi16_src, p, 32);
> > > > > > > > > +  __builtin_memcpy (epi8_src, p, 32);
> > > > > > > > > +
> > > > > > > > > +  foo_pd (pd_dst, pd_src[0], pd_src[1]);
> > > > > > > > > +  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> > > > > > > > > +  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> > > > > > > > > +  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> > > > > > > > > +  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> > > > > > > > > +            epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> > > > > > > > > +  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> > > > > > > > > +           epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> > > > > > > > > +           epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> > > > > > > > > +           epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> > > > > > > > > +
> > > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +
> > > > > > > > > +  return;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..8e02aefe3b5
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > > > > > > > > @@ -0,0 +1,54 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> > > > > > > > > +
> > > > > > > > > +#include <string.h>
> > > > > > > > > +
> > > > > > > > > +static void do_test (void);
> > > > > > > > > +#define DO_TEST do_test
> > > > > > > > > +#define AVX512FP16
> > > > > > > > > +#include "avx512-check.h"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ph (_Complex _Float16* a,
> > > > > > > > > +       _Complex _Float16 b1, _Complex _Float16 b2,
> > > > > > > > > +       _Complex _Float16 b3, _Complex _Float16 b4,
> > > > > > > > > +       _Complex _Float16 b5, _Complex _Float16 b6,
> > > > > > > > > +       _Complex _Float16 b7,_Complex _Float16 b8)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b1;
> > > > > > > > > +  a[1] = b2;
> > > > > > > > > +  a[2] = b3;
> > > > > > > > > +  a[3] = b4;
> > > > > > > > > +  a[4] = b5;
> > > > > > > > > +  a[5] = b6;
> > > > > > > > > +  a[6] = b7;
> > > > > > > > > +  a[7] = b8;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +do_test (void)
> > > > > > > > > +{
> > > > > > > > > +
> > > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > > > > > > > > +
> > > > > > > > > +  char* p = (char* ) malloc (32);
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (ph_dst, 0, 32);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 32; i++)
> > > > > > > > > +    p[i] = i;
> > > > > > > > > +
> > > > > > > > > +  __builtin_memcpy (ph_src, p, 32);
> > > > > > > > > +
> > > > > > > > > +  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> > > > > > > > > +         ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> > > > > > > > > +
> > > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  return;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..9d4a6f9846b
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > > > > > > > > @@ -0,0 +1,117 @@
> > > > > > > > > +/* { dg-do compile } */
> > > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[2];
> > > > > > > > > +  a[1] = b[3];
> > > > > > > > > +  a[2] = b[0];
> > > > > > > > > +  a[3] = b[1];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[4];
> > > > > > > > > +  a[1] = b[5];
> > > > > > > > > +  a[2] = b[6];
> > > > > > > > > +  a[3] = b[7];
> > > > > > > > > +  a[4] = b[0];
> > > > > > > > > +  a[5] = b[1];
> > > > > > > > > +  a[6] = b[2];
> > > > > > > > > +  a[7] = b[3];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[2];
> > > > > > > > > +  a[1] = b[3];
> > > > > > > > > +  a[2] = b[0];
> > > > > > > > > +  a[3] = b[1];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[4];
> > > > > > > > > +  a[1] = b[5];
> > > > > > > > > +  a[2] = b[6];
> > > > > > > > > +  a[3] = b[7];
> > > > > > > > > +  a[4] = b[0];
> > > > > > > > > +  a[5] = b[1];
> > > > > > > > > +  a[6] = b[2];
> > > > > > > > > +  a[7] = b[3];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[8];
> > > > > > > > > +  a[1] = b[9];
> > > > > > > > > +  a[2] = b[10];
> > > > > > > > > +  a[3] = b[11];
> > > > > > > > > +  a[4] = b[12];
> > > > > > > > > +  a[5] = b[13];
> > > > > > > > > +  a[6] = b[14];
> > > > > > > > > +  a[7] = b[15];
> > > > > > > > > +  a[8] = b[0];
> > > > > > > > > +  a[9] = b[1];
> > > > > > > > > +  a[10] = b[2];
> > > > > > > > > +  a[11] = b[3];
> > > > > > > > > +  a[12] = b[4];
> > > > > > > > > +  a[13] = b[5];
> > > > > > > > > +  a[14] = b[6];
> > > > > > > > > +  a[15] = b[7];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[16];
> > > > > > > > > +  a[1] = b[17];
> > > > > > > > > +  a[2] = b[18];
> > > > > > > > > +  a[3] = b[19];
> > > > > > > > > +  a[4] = b[20];
> > > > > > > > > +  a[5] = b[21];
> > > > > > > > > +  a[6] = b[22];
> > > > > > > > > +  a[7] = b[23];
> > > > > > > > > +  a[8] = b[24];
> > > > > > > > > +  a[9] = b[25];
> > > > > > > > > +  a[10] = b[26];
> > > > > > > > > +  a[11] = b[27];
> > > > > > > > > +  a[12] = b[28];
> > > > > > > > > +  a[13] = b[29];
> > > > > > > > > +  a[14] = b[30];
> > > > > > > > > +  a[15] = b[31];
> > > > > > > > > +  a[16] = b[0];
> > > > > > > > > +  a[17] = b[1];
> > > > > > > > > +  a[18] = b[2];
> > > > > > > > > +  a[19] = b[3];
> > > > > > > > > +  a[20] = b[4];
> > > > > > > > > +  a[21] = b[5];
> > > > > > > > > +  a[22] = b[6];
> > > > > > > > > +  a[23] = b[7];
> > > > > > > > > +  a[24] = b[8];
> > > > > > > > > +  a[25] = b[9];
> > > > > > > > > +  a[26] = b[10];
> > > > > > > > > +  a[27] = b[11];
> > > > > > > > > +  a[28] = b[12];
> > > > > > > > > +  a[29] = b[13];
> > > > > > > > > +  a[30] = b[14];
> > > > > > > > > +  a[31] = b[15];
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..d5c6ebeb5cf
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > > > > > > > > @@ -0,0 +1,80 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > > > +
> > > > > > > > > +#include "avx-check.h"
> > > > > > > > > +#include <string.h>
> > > > > > > > > +#include "pr106010-5a.c"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +avx_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > > > > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > > > > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > > > > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > > > > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > > > > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > > > > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > > > > > > > +  char* p = (char* ) malloc (64);
> > > > > > > > > +  char* q = (char* ) malloc (64);
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (pd_dst, 0, 64);
> > > > > > > > > +  __builtin_memset (ps_dst, 0, 64);
> > > > > > > > > +  __builtin_memset (epi64_dst, 0, 64);
> > > > > > > > > +  __builtin_memset (epi32_dst, 0, 64);
> > > > > > > > > +  __builtin_memset (epi16_dst, 0, 64);
> > > > > > > > > +  __builtin_memset (epi8_dst, 0, 64);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > > > > +    {
> > > > > > > > > +      p[i] = i;
> > > > > > > > > +      q[i] = (i + 32) % 64;
> > > > > > > > > +    }
> > > > > > > > > +  __builtin_memcpy (pd_src, p, 64);
> > > > > > > > > +  __builtin_memcpy (ps_src, p, 64);
> > > > > > > > > +  __builtin_memcpy (epi64_src, p, 64);
> > > > > > > > > +  __builtin_memcpy (epi32_src, p, 64);
> > > > > > > > > +  __builtin_memcpy (epi16_src, p, 64);
> > > > > > > > > +  __builtin_memcpy (epi8_src, p, 64);
> > > > > > > > > +
> > > > > > > > > +  __builtin_memcpy (pd_exp, q, 64);
> > > > > > > > > +  __builtin_memcpy (ps_exp, q, 64);
> > > > > > > > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > > > > > > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > > > > > > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > > > > > > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > > > > > > > +
> > > > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > > > +
> > > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +
> > > > > > > > > +  return;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..9ce4e6dd5c0
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > > > > > > > > @@ -0,0 +1,62 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> > > > > > > > > +
> > > > > > > > > +#include <string.h>
> > > > > > > > > +
> > > > > > > > > +static void do_test (void);
> > > > > > > > > +#define DO_TEST do_test
> > > > > > > > > +#define AVX512FP16
> > > > > > > > > +#include "avx512-check.h"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[8];
> > > > > > > > > +  a[1] = b[9];
> > > > > > > > > +  a[2] = b[10];
> > > > > > > > > +  a[3] = b[11];
> > > > > > > > > +  a[4] = b[12];
> > > > > > > > > +  a[5] = b[13];
> > > > > > > > > +  a[6] = b[14];
> > > > > > > > > +  a[7] = b[15];
> > > > > > > > > +  a[8] = b[0];
> > > > > > > > > +  a[9] = b[1];
> > > > > > > > > +  a[10] = b[2];
> > > > > > > > > +  a[11] = b[3];
> > > > > > > > > +  a[12] = b[4];
> > > > > > > > > +  a[13] = b[5];
> > > > > > > > > +  a[14] = b[6];
> > > > > > > > > +  a[15] = b[7];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +do_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > > > > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > > > > > > > +  char* p = (char* ) malloc (64);
> > > > > > > > > +  char* q = (char* ) malloc (64);
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (ph_dst, 0, 64);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > > > > +    {
> > > > > > > > > +      p[i] = i;
> > > > > > > > > +      q[i] = (i + 32) % 64;
> > > > > > > > > +    }
> > > > > > > > > +  __builtin_memcpy (ph_src, p, 64);
> > > > > > > > > +
> > > > > > > > > +  __builtin_memcpy (ph_exp, q, 64);
> > > > > > > > > +
> > > > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > > > +
> > > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +
> > > > > > > > > +  return;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..65a90d03684
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > > > > > > > > @@ -0,0 +1,115 @@
> > > > > > > > > +/* { dg-do compile } */
> > > > > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[3];
> > > > > > > > > +  a[1] = b[2];
> > > > > > > > > +  a[2] = b[1];
> > > > > > > > > +  a[3] = b[0];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[7];
> > > > > > > > > +  a[1] = b[6];
> > > > > > > > > +  a[2] = b[5];
> > > > > > > > > +  a[3] = b[4];
> > > > > > > > > +  a[4] = b[3];
> > > > > > > > > +  a[5] = b[2];
> > > > > > > > > +  a[6] = b[1];
> > > > > > > > > +  a[7] = b[0];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[3];
> > > > > > > > > +  a[1] = b[2];
> > > > > > > > > +  a[2] = b[1];
> > > > > > > > > +  a[3] = b[0];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[7];
> > > > > > > > > +  a[1] = b[6];
> > > > > > > > > +  a[2] = b[5];
> > > > > > > > > +  a[3] = b[4];
> > > > > > > > > +  a[4] = b[3];
> > > > > > > > > +  a[5] = b[2];
> > > > > > > > > +  a[6] = b[1];
> > > > > > > > > +  a[7] = b[0];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[15];
> > > > > > > > > +  a[1] = b[14];
> > > > > > > > > +  a[2] = b[13];
> > > > > > > > > +  a[3] = b[12];
> > > > > > > > > +  a[4] = b[11];
> > > > > > > > > +  a[5] = b[10];
> > > > > > > > > +  a[6] = b[9];
> > > > > > > > > +  a[7] = b[8];
> > > > > > > > > +  a[8] = b[7];
> > > > > > > > > +  a[9] = b[6];
> > > > > > > > > +  a[10] = b[5];
> > > > > > > > > +  a[11] = b[4];
> > > > > > > > > +  a[12] = b[3];
> > > > > > > > > +  a[13] = b[2];
> > > > > > > > > +  a[14] = b[1];
> > > > > > > > > +  a[15] = b[0];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[31];
> > > > > > > > > +  a[1] = b[30];
> > > > > > > > > +  a[2] = b[29];
> > > > > > > > > +  a[3] = b[28];
> > > > > > > > > +  a[4] = b[27];
> > > > > > > > > +  a[5] = b[26];
> > > > > > > > > +  a[6] = b[25];
> > > > > > > > > +  a[7] = b[24];
> > > > > > > > > +  a[8] = b[23];
> > > > > > > > > +  a[9] = b[22];
> > > > > > > > > +  a[10] = b[21];
> > > > > > > > > +  a[11] = b[20];
> > > > > > > > > +  a[12] = b[19];
> > > > > > > > > +  a[13] = b[18];
> > > > > > > > > +  a[14] = b[17];
> > > > > > > > > +  a[15] = b[16];
> > > > > > > > > +  a[16] = b[15];
> > > > > > > > > +  a[17] = b[14];
> > > > > > > > > +  a[18] = b[13];
> > > > > > > > > +  a[19] = b[12];
> > > > > > > > > +  a[20] = b[11];
> > > > > > > > > +  a[21] = b[10];
> > > > > > > > > +  a[22] = b[9];
> > > > > > > > > +  a[23] = b[8];
> > > > > > > > > +  a[24] = b[7];
> > > > > > > > > +  a[25] = b[6];
> > > > > > > > > +  a[26] = b[5];
> > > > > > > > > +  a[27] = b[4];
> > > > > > > > > +  a[28] = b[3];
> > > > > > > > > +  a[29] = b[2];
> > > > > > > > > +  a[30] = b[1];
> > > > > > > > > +  a[31] = b[0];
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..1c5bb020939
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > > > > > > > > @@ -0,0 +1,157 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-require-effective-target avx2 } */
> > > > > > > > > +
> > > > > > > > > +#include "avx2-check.h"
> > > > > > > > > +#include <string.h>
> > > > > > > > > +#include "pr106010-6a.c"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +avx2_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > > > > > > > > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > > > > > > > > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > > > > > > > > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > > > > > > > > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > > > > > > > > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > > > > > > > > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > > > > > > > > +  char* p = (char* ) malloc (64);
> > > > > > > > > +  char* q = (char* ) malloc (64);
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (pd_dst, 0, 64);
> > > > > > > > > +  __builtin_memset (ps_dst, 0, 64);
> > > > > > > > > +  __builtin_memset (epi64_dst, 0, 64);
> > > > > > > > > +  __builtin_memset (epi32_dst, 0, 64);
> > > > > > > > > +  __builtin_memset (epi16_dst, 0, 64);
> > > > > > > > > +  __builtin_memset (epi8_dst, 0, 64);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > > > > +    p[i] = i;
> > > > > > > > > +
> > > > > > > > > +  __builtin_memcpy (pd_src, p, 64);
> > > > > > > > > +  __builtin_memcpy (ps_src, p, 64);
> > > > > > > > > +  __builtin_memcpy (epi64_src, p, 64);
> > > > > > > > > +  __builtin_memcpy (epi32_src, p, 64);
> > > > > > > > > +  __builtin_memcpy (epi16_src, p, 64);
> > > > > > > > > +  __builtin_memcpy (epi8_src, p, 64);
> > > > > > > > > +
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 16; i++)
> > > > > > > > > +    {
> > > > > > > > > +      q[i] = i + 48;
> > > > > > > > > +      q[i + 16] = i + 32;
> > > > > > > > > +      q[i + 32] = i + 16;
> > > > > > > > > +      q[i + 48] = i;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +  __builtin_memcpy (pd_exp, q, 64);
> > > > > > > > > +  __builtin_memcpy (epi64_exp, q, 64);
> > > > > > > > > +
> > > > > > > > > +   for (int i = 0; i != 8; i++)
> > > > > > > > > +    {
> > > > > > > > > +      q[i] = i + 56;
> > > > > > > > > +      q[i + 8] = i + 48;
> > > > > > > > > +      q[i + 16] = i + 40;
> > > > > > > > > +      q[i + 24] = i + 32;
> > > > > > > > > +      q[i + 32] = i + 24;
> > > > > > > > > +      q[i + 40] = i + 16;
> > > > > > > > > +      q[i + 48] = i + 8;
> > > > > > > > > +      q[i + 56] = i;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +  __builtin_memcpy (ps_exp, q, 64);
> > > > > > > > > +  __builtin_memcpy (epi32_exp, q, 64);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > > > > +    {
> > > > > > > > > +      q[i] = i + 60;
> > > > > > > > > +      q[i + 4] = i + 56;
> > > > > > > > > +      q[i + 8] = i + 52;
> > > > > > > > > +      q[i + 12] = i + 48;
> > > > > > > > > +      q[i + 16] = i + 44;
> > > > > > > > > +      q[i + 20] = i + 40;
> > > > > > > > > +      q[i + 24] = i + 36;
> > > > > > > > > +      q[i + 28] = i + 32;
> > > > > > > > > +      q[i + 32] = i + 28;
> > > > > > > > > +      q[i + 36] = i + 24;
> > > > > > > > > +      q[i + 40] = i + 20;
> > > > > > > > > +      q[i + 44] = i + 16;
> > > > > > > > > +      q[i + 48] = i + 12;
> > > > > > > > > +      q[i + 52] = i + 8;
> > > > > > > > > +      q[i + 56] = i + 4;
> > > > > > > > > +      q[i + 60] = i;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +  __builtin_memcpy (epi16_exp, q, 64);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 2; i++)
> > > > > > > > > +    {
> > > > > > > > > +      q[i] = i + 62;
> > > > > > > > > +      q[i + 2] = i + 60;
> > > > > > > > > +      q[i + 4] = i + 58;
> > > > > > > > > +      q[i + 6] = i + 56;
> > > > > > > > > +      q[i + 8] = i + 54;
> > > > > > > > > +      q[i + 10] = i + 52;
> > > > > > > > > +      q[i + 12] = i + 50;
> > > > > > > > > +      q[i + 14] = i + 48;
> > > > > > > > > +      q[i + 16] = i + 46;
> > > > > > > > > +      q[i + 18] = i + 44;
> > > > > > > > > +      q[i + 20] = i + 42;
> > > > > > > > > +      q[i + 22] = i + 40;
> > > > > > > > > +      q[i + 24] = i + 38;
> > > > > > > > > +      q[i + 26] = i + 36;
> > > > > > > > > +      q[i + 28] = i + 34;
> > > > > > > > > +      q[i + 30] = i + 32;
> > > > > > > > > +      q[i + 32] = i + 30;
> > > > > > > > > +      q[i + 34] = i + 28;
> > > > > > > > > +      q[i + 36] = i + 26;
> > > > > > > > > +      q[i + 38] = i + 24;
> > > > > > > > > +      q[i + 40] = i + 22;
> > > > > > > > > +      q[i + 42] = i + 20;
> > > > > > > > > +      q[i + 44] = i + 18;
> > > > > > > > > +      q[i + 46] = i + 16;
> > > > > > > > > +      q[i + 48] = i + 14;
> > > > > > > > > +      q[i + 50] = i + 12;
> > > > > > > > > +      q[i + 52] = i + 10;
> > > > > > > > > +      q[i + 54] = i + 8;
> > > > > > > > > +      q[i + 56] = i + 6;
> > > > > > > > > +      q[i + 58] = i + 4;
> > > > > > > > > +      q[i + 60] = i + 2;
> > > > > > > > > +      q[i + 62] = i;
> > > > > > > > > +    }
> > > > > > > > > +  __builtin_memcpy (epi8_exp, q, 64);
> > > > > > > > > +
> > > > > > > > > +  foo_pd (pd_dst, pd_src);
> > > > > > > > > +  foo_ps (ps_dst, ps_src);
> > > > > > > > > +  foo_epi64 (epi64_dst, epi64_src);
> > > > > > > > > +  foo_epi32 (epi32_dst, epi32_src);
> > > > > > > > > +  foo_epi16 (epi16_dst, epi16_src);
> > > > > > > > > +  foo_epi8 (epi8_dst, epi8_src);
> > > > > > > > > +
> > > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +
> > > > > > > > > +  return;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..b859d884a7f
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > > > > > > > > @@ -0,0 +1,80 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> > > > > > > > > +
> > > > > > > > > +#include <string.h>
> > > > > > > > > +
> > > > > > > > > +static void do_test (void);
> > > > > > > > > +#define DO_TEST do_test
> > > > > > > > > +#define AVX512FP16
> > > > > > > > > +#include "avx512-check.h"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > > > > > > > > +{
> > > > > > > > > +  a[0] = b[15];
> > > > > > > > > +  a[1] = b[14];
> > > > > > > > > +  a[2] = b[13];
> > > > > > > > > +  a[3] = b[12];
> > > > > > > > > +  a[4] = b[11];
> > > > > > > > > +  a[5] = b[10];
> > > > > > > > > +  a[6] = b[9];
> > > > > > > > > +  a[7] = b[8];
> > > > > > > > > +  a[8] = b[7];
> > > > > > > > > +  a[9] = b[6];
> > > > > > > > > +  a[10] = b[5];
> > > > > > > > > +  a[11] = b[4];
> > > > > > > > > +  a[12] = b[3];
> > > > > > > > > +  a[13] = b[2];
> > > > > > > > > +  a[14] = b[1];
> > > > > > > > > +  a[15] = b[0];
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +do_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > > > > > > > > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > > > > > > > > +  char* p = (char* ) malloc (64);
> > > > > > > > > +  char* q = (char* ) malloc (64);
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (ph_dst, 0, 64);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 64; i++)
> > > > > > > > > +    p[i] = i;
> > > > > > > > > +
> > > > > > > > > +  __builtin_memcpy (ph_src, p, 64);
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 4; i++)
> > > > > > > > > +    {
> > > > > > > > > +      q[i] = i + 60;
> > > > > > > > > +      q[i + 4] = i + 56;
> > > > > > > > > +      q[i + 8] = i + 52;
> > > > > > > > > +      q[i + 12] = i + 48;
> > > > > > > > > +      q[i + 16] = i + 44;
> > > > > > > > > +      q[i + 20] = i + 40;
> > > > > > > > > +      q[i + 24] = i + 36;
> > > > > > > > > +      q[i + 28] = i + 32;
> > > > > > > > > +      q[i + 32] = i + 28;
> > > > > > > > > +      q[i + 36] = i + 24;
> > > > > > > > > +      q[i + 40] = i + 20;
> > > > > > > > > +      q[i + 44] = i + 16;
> > > > > > > > > +      q[i + 48] = i + 12;
> > > > > > > > > +      q[i + 52] = i + 8;
> > > > > > > > > +      q[i + 56] = i + 4;
> > > > > > > > > +      q[i + 60] = i;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +  __builtin_memcpy (ph_exp, q, 64);
> > > > > > > > > +
> > > > > > > > > +  foo_ph (ph_dst, ph_src);
> > > > > > > > > +
> > > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +
> > > > > > > > > +  return;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..2ea01fac927
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > > > > > > > > @@ -0,0 +1,58 @@
> > > > > > > > > +/* { dg-do compile } */
> > > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > > > > > > > +
> > > > > > > > > +#define N 10000
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_pd (_Complex double* a, _Complex double b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ps (_Complex float* a, _Complex float b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi64 (_Complex long long* a, _Complex long long b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi32 (_Complex int* a, _Complex int b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi16 (_Complex short* a, _Complex short b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi8 (_Complex char* a, _Complex char b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..26482cc10f5
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > > > > > > > > @@ -0,0 +1,63 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > > > +
> > > > > > > > > +#include "avx-check.h"
> > > > > > > > > +#include <string.h>
> > > > > > > > > +#include "pr106010-7a.c"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +avx_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > > > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > > > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > > > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > > > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > > > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > > > > > > > > +    p_init[i] = i % 2 + 3;
> > > > > > > > > +
> > > > > > > > > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > > > > > > > > > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > > > > > > > > > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > > > > > > > > > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > > > > > > > > > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > > > > > > > > > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > > > > > > > > +
> > > > > > > > > +  foo_pd (pd_dst, pd_src[0]);
> > > > > > > > > +  foo_ps (ps_dst, ps_src[0]);
> > > > > > > > > +  foo_epi64 (epi64_dst, epi64_src[0]);
> > > > > > > > > +  foo_epi32 (epi32_dst, epi32_src[0]);
> > > > > > > > > +  foo_epi16 (epi16_dst, epi16_src[0]);
> > > > > > > > > +  foo_epi8 (epi8_dst, epi8_src[0]);
> > > > > > > > > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +
> > > > > > > > > +  return;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..7f4056a5ecc
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > > > > > > > > @@ -0,0 +1,41 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > > +
> > > > > > > > > +#include <string.h>
> > > > > > > > > +
> > > > > > > > > +static void do_test (void);
> > > > > > > > > +
> > > > > > > > > +#define DO_TEST do_test
> > > > > > > > > +#define AVX512FP16
> > > > > > > > > +#include "avx512-check.h"
> > > > > > > > > +
> > > > > > > > > +#define N 10000
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = b;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static void
> > > > > > > > > +do_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > > > > > +
> > > > > > > > > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > > > > > > > > +    p_init[i] = i % 2 + 3;
> > > > > > > > > +
> > > > > > > > > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > > > > > > > > +
> > > > > > > > > +  foo_ph (ph_dst, ph_src[0]);
> > > > > > > > > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > > > > > > > > +    __builtin_abort ();
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..11054b60d30
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > > > > > > > > @@ -0,0 +1,58 @@
> > > > > > > > > +/* { dg-do compile } */
> > > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > > > > > > > > +
> > > > > > > > > +#define N 10000
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_pd (_Complex double* a)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = 1.0 + 2.0i;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ps (_Complex float* a)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = 1.0f + 2.0fi;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi64 (_Complex long long* a)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = 1 + 2i;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi32 (_Complex int* a)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = 1 + 2i;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi16 (_Complex short* a)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = 1 + 2i;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_epi8 (_Complex char* a)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = 1 + 2i;
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..6bb0073b691
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > > > > > > > > @@ -0,0 +1,53 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > > > > > > > > +/* { dg-require-effective-target avx } */
> > > > > > > > > +
> > > > > > > > > +#include "avx-check.h"
> > > > > > > > > +#include <string.h>
> > > > > > > > > +#include "pr106010-8a.c"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +avx_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex double pd_src = 1.0 + 2.0i;
> > > > > > > > > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > > > > > > > > +  _Complex float ps_src = 1.0 + 2.0i;
> > > > > > > > > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > > > > > > > > +  _Complex long long epi64_src = 1 + 2i;;
> > > > > > > > > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > > > > > > > > +  _Complex int epi32_src = 1 + 2i;
> > > > > > > > > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > > > > > > > > +  _Complex short epi16_src = 1 + 2i;
> > > > > > > > > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > > > > > > > > +  _Complex char epi8_src = 1 + 2i;
> > > > > > > > > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > > > > > > > > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > > > > > > > > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > > > > > > > > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > > > > > > > > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > > > > > > > > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > > > > > > > > +
> > > > > > > > > +  foo_pd (pd_dst);
> > > > > > > > > +  foo_ps (ps_dst);
> > > > > > > > > +  foo_epi64 (epi64_dst);
> > > > > > > > > +  foo_epi32 (epi32_dst);
> > > > > > > > > +  foo_epi16 (epi16_dst);
> > > > > > > > > +  foo_epi8 (epi8_dst);
> > > > > > > > > +  for (int i = 0 ; i != N; i++)
> > > > > > > > > +    {
> > > > > > > > > +      if (pd_dst[i] != pd_src)
> > > > > > > > > +       __builtin_abort ();
> > > > > > > > > +      if (ps_dst[i] != ps_src)
> > > > > > > > > +       __builtin_abort ();
> > > > > > > > > +      if (epi64_dst[i] != epi64_src)
> > > > > > > > > +       __builtin_abort ();
> > > > > > > > > +      if (epi32_dst[i] != epi32_src)
> > > > > > > > > +       __builtin_abort ();
> > > > > > > > > +      if (epi16_dst[i] != epi16_src)
> > > > > > > > > +       __builtin_abort ();
> > > > > > > > > +      if (epi8_dst[i] != epi8_src)
> > > > > > > > > +       __builtin_abort ();
> > > > > > > > > +    }
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00000000000..61ae131829d
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > > > > > > > > @@ -0,0 +1,38 @@
> > > > > > > > > +/* { dg-do run } */
> > > > > > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > > > > > > > > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > > > > > > > > +/* { dg-require-effective-target avx512fp16 } */
> > > > > > > > > +
> > > > > > > > > +#include <string.h>
> > > > > > > > > +
> > > > > > > > > +static void do_test (void);
> > > > > > > > > +
> > > > > > > > > +#define DO_TEST do_test
> > > > > > > > > +#define AVX512FP16
> > > > > > > > > +#include "avx512-check.h"
> > > > > > > > > +
> > > > > > > > > +#define N 10000
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +__attribute__((noipa))
> > > > > > > > > +foo_ph (_Complex _Float16* a)
> > > > > > > > > +{
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    a[i] = 1.0f16 + 2.0f16i;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static void
> > > > > > > > > +do_test (void)
> > > > > > > > > +{
> > > > > > > > > +  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> > > > > > > > > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > > > > > > > > +
> > > > > > > > > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > > > > > > > > +
> > > > > > > > > +  foo_ph (ph_dst);
> > > > > > > > > +  for (int i = 0; i != N; i++)
> > > > > > > > > +    {
> > > > > > > > > +      if (ph_dst[i] != ph_src)
> > > > > > > > > +       __builtin_abort ();
> > > > > > > > > +    }
> > > > > > > > > +}
> > > > > > > > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > > > > > > > > index d20a10a1524..42ee9df674c 100644
> > > > > > > > > --- a/gcc/tree-vect-data-refs.cc
> > > > > > > > > +++ b/gcc/tree-vect-data-refs.cc
> > > > > > > > > @@ -1403,7 +1403,8 @@ vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
> > > > > > > > >    if (PURE_SLP_STMT (stmt_info))
> > > > > > > > >      ncopies = 1;
> > > > > > > > >    else
> > > > > > > > > -    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
> > > > > > > > > +    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info),
> > > > > > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > > > > > >
> > > > > > > > >    if (DR_IS_READ (dr_info->dr))
> > > > > > > > >      vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> > > > > > > > > @@ -4597,8 +4598,22 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > > > > > > > >
> > > > > > > > >        /* Set vectype for STMT.  */
> > > > > > > > >        scalar_type = TREE_TYPE (DR_REF (dr));
> > > > > > > > > -      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
> > > > > > > > > -      if (!vectype)
> > > > > > > > > +      tree adjust_scalar_type = scalar_type;
> > > > > > > > > > +      /* Support complex type access.  Note that loads/stores of complex
> > > > > > > > > > +        type do not support gather/scatter.  */
> > > > > > > > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE
> > > > > > > > > +         && gatherscatter == SG_NONE)
> > > > > > > > > +       {
> > > > > > > > > +         adjust_scalar_type = TREE_TYPE (scalar_type);
> > > > > > > > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > > > > > +       }
> > > > > > > > > +      tree vectype = get_vectype_for_scalar_type (vinfo, adjust_scalar_type);
> > > > > > > > > +      unsigned HOST_WIDE_INT constant_nunits;
> > > > > > > > > +      if (!vectype
> > > > > > > > > +         /* For complex type, V1DI doesn't make sense.  */
> > > > > > > > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > > > > > +             && (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&constant_nunits)
> > > > > > > > > +                 || constant_nunits == 1)))
> > > > > > > > >          {
> > > > > > > > >            if (dump_enabled_p ())
> > > > > > > > >              {
> > > > > > > > > @@ -4635,8 +4650,11 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > >        /* Adjust the minimal vectorization factor according to the
> > > > > > > > > -        vector type.  */
> > > > > > > > > +        vector type. Note for complex type, VF is half of
> > > > > > > > > +        TYPE_VECTOR_SUBPARTS.  */
> > > > > > > > >        vf = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > > > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > > > +       vf = exact_div (vf, 2);
> > > > > > > > >        *min_vf = upper_bound (*min_vf, vf);
> > > > > > > > >
> > > > > > > > >        /* Leave the BB vectorizer to pick the vector type later, based on
> > > > > > > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > > > > > > > index 3a70c15b593..365fa738022 100644
> > > > > > > > > --- a/gcc/tree-vect-loop.cc
> > > > > > > > > +++ b/gcc/tree-vect-loop.cc
> > > > > > > > > @@ -200,7 +200,12 @@ vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > > > >      }
> > > > > > > > >
> > > > > > > > >    if (nunits_vectype)
> > > > > > > > > -    vect_update_max_nunits (vf, nunits_vectype);
> > > > > > > > > +    {
> > > > > > > > > +      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (nunits_vectype);
> > > > > > > > > +      if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > > > +       nunits = exact_div (nunits, 2);
> > > > > > > > > +      vect_update_max_nunits (vf, nunits);
> > > > > > > > > +    }
> > > > > > > > >
> > > > > > > > >    return opt_result::success ();
> > > > > > > > >  }
> > > > > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > > > > > > > > index dab5daddcc5..5d66ea2f286 100644
> > > > > > > > > --- a/gcc/tree-vect-slp.cc
> > > > > > > > > +++ b/gcc/tree-vect-slp.cc
> > > > > > > > > @@ -877,10 +877,14 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > > > >        return false;
> > > > > > > > >      }
> > > > > > > > >
> > > > > > > > > +  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > > > +    nunits = exact_div (nunits, 2);
> > > > > > > > > +
> > > > > > > > >    /* If populating the vector type requires unrolling then fail
> > > > > > > > >       before adjusting *max_nunits for basic-block vectorization.  */
> > > > > > > > >    if (is_a <bb_vec_info> (vinfo)
> > > > > > > > > -      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> > > > > > > > > +      && !multiple_p (group_size , nunits))
> > > > > > > > >      {
> > > > > > > > >        if (dump_enabled_p ())
> > > > > > > > >         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > > > > > @@ -891,7 +895,7 @@ vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > > > >      }
> > > > > > > > >
> > > > > > > > >    /* In case of multiple types we need to detect the smallest type.  */
> > > > > > > > > -  vect_update_max_nunits (max_nunits, vectype);
> > > > > > > > > +  vect_update_max_nunits (max_nunits, nunits);
> > > > > > > > >    return true;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > @@ -3720,22 +3724,54 @@ vect_optimize_slp (vec_info *vinfo)
> > > > > > > > >          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
> > > > > > > > >          when permuting constants and invariants keeping the permute
> > > > > > > > >          bijective.  */
> > > > > > > > > -      auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > > > > > > > -      bitmap_clear (load_index);
> > > > > > > > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > > -       bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > > > > > > > -      unsigned j;
> > > > > > > > > -      for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > > -       if (!bitmap_bit_p (load_index, j))
> > > > > > > > > -         break;
> > > > > > > > > -      if (j != SLP_TREE_LANES (node))
> > > > > > > > > -       continue;
> > > > > > > > > +      /* Permutation of Complex type.  */
> > > > > > > > > +      if (STMT_VINFO_COMPLEX_P (dr_stmt))
> > > > > > > > > +       {
> > > > > > > > > +         auto_sbitmap load_index (SLP_TREE_LANES (node) * 2);
> > > > > > > > > +         bitmap_clear (load_index);
> > > > > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > > +           {
> > > > > > > > > +             unsigned bit = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > > > > +             bitmap_set_bit (load_index, 2 * bit);
> > > > > > > > > +             bitmap_set_bit (load_index, 2 * bit + 1);
> > > > > > > > > +           }
> > > > > > > > > +         unsigned j;
> > > > > > > > > +         for (j = 0; j < SLP_TREE_LANES (node) * 2; ++j)
> > > > > > > > > +           if (!bitmap_bit_p (load_index, j))
> > > > > > > > > +             break;
> > > > > > > > > +         if (j != SLP_TREE_LANES (node) * 2)
> > > > > > > > > +           continue;
> > > > > > > > >
> > > > > > > > > -      vec<unsigned> perm = vNULL;
> > > > > > > > > -      perm.safe_grow (SLP_TREE_LANES (node), true);
> > > > > > > > > -      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > > -       perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > > > > -      perms.safe_push (perm);
> > > > > > > > > +         vec<unsigned> perm = vNULL;
> > > > > > > > > +         perm.safe_grow (SLP_TREE_LANES (node) * 2, true);
> > > > > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > > +           {
> > > > > > > > > +             unsigned cidx = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > > > > +             perm[2 * j] = 2 * cidx;
> > > > > > > > > +             perm[2 * j + 1] = 2 * cidx + 1;
> > > > > > > > > +           }
> > > > > > > > > +         perms.safe_push (perm);
> > > > > > > > > +       }
> > > > > > > > > +      else
> > > > > > > > > +       {
> > > > > > > > > +         auto_sbitmap load_index (SLP_TREE_LANES (node));
> > > > > > > > > +         bitmap_clear (load_index);
> > > > > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > > +           bitmap_set_bit (load_index,
> > > > > > > > > +                           SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
> > > > > > > > > +         unsigned j;
> > > > > > > > > +         for (j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > > +           if (!bitmap_bit_p (load_index, j))
> > > > > > > > > +             break;
> > > > > > > > > +         if (j != SLP_TREE_LANES (node))
> > > > > > > > > +           continue;
> > > > > > > > > +
> > > > > > > > > +         vec<unsigned> perm = vNULL;
> > > > > > > > > +         perm.safe_grow (SLP_TREE_LANES (node), true);
> > > > > > > > > +         for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
> > > > > > > > > +           perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
> > > > > > > > > +         perms.safe_push (perm);
> > > > > > > > > +       }
> > > > > > > > >        vertices[idx].perm_in = perms.length () - 1;
> > > > > > > > >        vertices[idx].perm_out = perms.length () - 1;
> > > > > > > > >      }
> > > > > > > > > @@ -4518,6 +4554,12 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
> > > > > > > > >         vf = loop_vinfo->vectorization_factor;
> > > > > > > > >        else
> > > > > > > > >         vf = 1;
> > > > > > > > > +      /* For complex type and SLP, double vf to get right vectype.
> > > > > > > > > > +        i.e. vector(4) double for complex double, group size is 2, double vf
> > > > > > > > > +        to map vf * group_size to TYPE_VECTOR_SUBPARTS.  */
> > > > > > > > > +     if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > > > +       vf *= 2;
> > > > > > > > > +
> > > > > > > > >        unsigned int group_size = SLP_TREE_LANES (node);
> > > > > > > > >        tree vectype = SLP_TREE_VECTYPE (node);
> > > > > > > > >        SLP_TREE_NUMBER_OF_VEC_STMTS (node)
> > > > > > > > > @@ -4763,10 +4805,17 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
> > > > > > > > >             }
> > > > > > > > >           unsigned group_size = SLP_TREE_LANES (child);
> > > > > > > > >           poly_uint64 vf = 1;
> > > > > > > > > +
> > > > > > > > >           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
> > > > > > > > >             vf = loop_vinfo->vectorization_factor;
> > > > > > > > > +
> > > > > > > > > > +         /* V2SF is just 1 complex type, so multiply by 2
> > > > > > > > > > +            to get the real number of vectors.  */
> > > > > > > > > +         unsigned cp
> > > > > > > > > +           = STMT_VINFO_COMPLEX_P (SLP_TREE_REPRESENTATIVE (node)) ? 2 : 1;
> > > > > > > > > +
> > > > > > > > >           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
> > > > > > > > > -           = vect_get_num_vectors (vf * group_size, vector_type);
> > > > > > > > > +           = vect_get_num_vectors (vf * group_size * cp, vector_type);
> > > > > > > > >           /* And cost them.  */
> > > > > > > > >           vect_prologue_cost_for_slp (child, cost_vec);
> > > > > > > > >         }
> > > > > > > > > @@ -6402,6 +6451,11 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > > > > >
> > > > > > > > >    /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
> > > > > > > > >    vector_type = SLP_TREE_VECTYPE (op_node);
> > > > > > > > > +  unsigned int cp = 1;
> > > > > > > > > +  /* Handle Complex type vector init.
> > > > > > > > > +     SLP_TREE_REPRESENTATIVE (op_node) could be NULL.  */
> > > > > > > > > +  if (TREE_CODE (TREE_TYPE (op_node->ops[0])) == COMPLEX_TYPE)
> > > > > > > > > +    cp = 2;
> > > > > > > > >
> > > > > > > > >    unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
> > > > > > > > >    SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
> > > > > > > > > @@ -6426,9 +6480,9 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > > > > >    /* When using duplicate_and_interleave, we just need one element for
> > > > > > > > >       each scalar statement.  */
> > > > > > > > >    if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
> > > > > > > > > -    nunits = group_size;
> > > > > > > > > +    nunits = group_size * cp;
> > > > > > > > >
> > > > > > > > > -  number_of_copies = nunits * number_of_vectors / group_size;
> > > > > > > > > +  number_of_copies = nunits * number_of_vectors / (group_size * cp);
> > > > > > > > >
> > > > > > > > >    number_of_places_left_in_vector = nunits;
> > > > > > > > >    constant_p = true;
> > > > > > > > > @@ -6460,8 +6514,23 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > > > > >                         gcc_unreachable ();
> > > > > > > > >                     }
> > > > > > > > >                   else
> > > > > > > > > -                   op = fold_unary (VIEW_CONVERT_EXPR,
> > > > > > > > > -                                    TREE_TYPE (vector_type), op);
> > > > > > > > > +                   {
> > > > > > > > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > > > > > > > +                     /* For complex type, insert real and imag part
> > > > > > > > > +                        separately.  */
> > > > > > > > > +                     if (cp == 2)
> > > > > > > > > +                       {
> > > > > > > > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > > > > > > > +                                      == COMPLEX_TYPE)
> > > > > > > > > +                                     && (scalar_type
> > > > > > > > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > > > > > > > +                         elts[number_of_places_left_in_vector--]
> > > > > > > > > +                           = fold_unary (IMAGPART_EXPR, scalar_type, op);
> > > > > > > > > +                         op = fold_unary (REALPART_EXPR, scalar_type, op);
> > > > > > > > > +                       }
> > > > > > > > > +                     else
> > > > > > > > > +                       op = fold_unary (VIEW_CONVERT_EXPR, scalar_type, op);
> > > > > > > > > +                   }
> > > > > > > > >                   gcc_assert (op && CONSTANT_CLASS_P (op));
> > > > > > > > >                 }
> > > > > > > > >               else
> > > > > > > > > @@ -6481,11 +6550,28 @@ vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
> > > > > > > > >                     }
> > > > > > > > >                   else
> > > > > > > > >                     {
> > > > > > > > > -                     op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
> > > > > > > > > -                                  op);
> > > > > > > > > -                     init_stmt
> > > > > > > > > -                       = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > > > > > > > -                                              op);
> > > > > > > > > +                     tree scalar_type = TREE_TYPE (vector_type);
> > > > > > > > > +                     if (cp == 2)
> > > > > > > > > +                       {
> > > > > > > > > +                         gcc_assert ((TREE_CODE (TREE_TYPE (op))
> > > > > > > > > +                                      == COMPLEX_TYPE)
> > > > > > > > > +                                     && (scalar_type
> > > > > > > > > +                                         == TREE_TYPE (TREE_TYPE (op))));
> > > > > > > > > +                         tree imag = build1 (IMAGPART_EXPR, scalar_type, op);
> > > > > > > > > +                         op = build1 (REALPART_EXPR, scalar_type, op);
> > > > > > > > > +                         tree imag_temp = make_ssa_name (scalar_type);
> > > > > > > > > +                         elts[number_of_places_left_in_vector--] = imag_temp;
> > > > > > > > > +                         init_stmt = gimple_build_assign (imag_temp, imag);
> > > > > > > > > +                         gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > > > > > > > +                         init_stmt = gimple_build_assign (new_temp, op);
> > > > > > > > > +                       }
> > > > > > > > > +                     else
> > > > > > > > > +                       {
> > > > > > > > > +                         op = build1 (VIEW_CONVERT_EXPR, scalar_type, op);
> > > > > > > > > +                         init_stmt
> > > > > > > > > +                           = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
> > > > > > > > > +                                                  op);
> > > > > > > > > +                       }
> > > > > > > > >                     }
> > > > > > > > >                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
> > > > > > > > >                   op = new_temp;
> > > > > > > > > @@ -6696,15 +6782,17 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > > > >    unsigned int nelts_to_build;
> > > > > > > > >    unsigned int nvectors_per_build;
> > > > > > > > >    unsigned int in_nlanes;
> > > > > > > > > +  unsigned int cp = STMT_VINFO_COMPLEX_P (stmt_info) ? 2 : 1;
> > > > > > > > >    bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
> > > > > > > > > -                     && multiple_p (nunits, group_size));
> > > > > > > > > +                     && multiple_p (nunits, group_size * cp));
> > > > > > > > >    if (repeating_p)
> > > > > > > > >      {
> > > > > > > > >        /* A single vector contains a whole number of copies of the node, so:
> > > > > > > > >          (a) all permutes can use the same mask; and
> > > > > > > > >          (b) the permutes only need a single vector input.  */
> > > > > > > > > -      mask.new_vector (nunits, group_size, 3);
> > > > > > > > > -      nelts_to_build = mask.encoded_nelts ();
> > > > > > > > > +      /* For complex type, mask size should be double of nelts_to_build.  */
> > > > > > > > > +      mask.new_vector (nunits, group_size * cp, 3);
> > > > > > > > > +      nelts_to_build = mask.encoded_nelts () / cp;
> > > > > > > > >        nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
> > > > > > > > >        in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
> > > > > > > > >      }
> > > > > > > > > @@ -6744,8 +6832,8 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > > > >         {
> > > > > > > > >           /* Enforced before the loop when !repeating_p.  */
> > > > > > > > >           unsigned int const_nunits = nunits.to_constant ();
> > > > > > > > > -         vec_index = i / const_nunits;
> > > > > > > > > -         mask_element = i % const_nunits;
> > > > > > > > > +         vec_index = i / (const_nunits / cp);
> > > > > > > > > +         mask_element = i % (const_nunits / cp);
> > > > > > > > >           if (vec_index == first_vec_index
> > > > > > > > >               || first_vec_index == -1)
> > > > > > > > >             {
> > > > > > > > > @@ -6755,7 +6843,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > > > >                    || second_vec_index == -1)
> > > > > > > > >             {
> > > > > > > > >               second_vec_index = vec_index;
> > > > > > > > > -             mask_element += const_nunits;
> > > > > > > > > +             mask_element += (const_nunits / cp);
> > > > > > > > >             }
> > > > > > > > >           else
> > > > > > > > >             {
> > > > > > > > > @@ -6768,14 +6856,24 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > > > >               return false;
> > > > > > > > >             }
> > > > > > > > >
> > > > > > > > > -         gcc_assert (mask_element < 2 * const_nunits);
> > > > > > > > > +         gcc_assert (mask_element < 2 * const_nunits / cp);
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > >        if (mask_element != index)
> > > > > > > > >         noop_p = false;
> > > > > > > > > -      mask[index++] = mask_element;
> > > > > > > > > > +      /* Set index for complex type.
> > > > > > > > > +        i.e. mask like [1,0] is actually [2, 3, 0, 1]
> > > > > > > > > +        for vector scalar type.  */
> > > > > > > > > +      if (cp == 2)
> > > > > > > > > +       {
> > > > > > > > > +         mask[2 * index] = 2 * mask_element;
> > > > > > > > > +         mask[2 * index + 1] = 2 * mask_element + 1;
> > > > > > > > > +       }
> > > > > > > > > +      else
> > > > > > > > > +       mask[index] = mask_element;
> > > > > > > > > +      index++;
> > > > > > > > >
> > > > > > > > > -      if (index == count && !noop_p)
> > > > > > > > > +      if (index * cp == count && !noop_p)
> > > > > > > > >         {
> > > > > > > > >           indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
> > > > > > > > >           if (!can_vec_perm_const_p (mode, mode, indices))
> > > > > > > > > @@ -6799,7 +6897,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > > > >           ++*n_perms;
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > > -      if (index == count)
> > > > > > > > > +      if (index * cp == count)
> > > > > > > > >         {
> > > > > > > > >           if (!analyze_only)
> > > > > > > > >             {
> > > > > > > > > @@ -6869,7 +6967,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
> > > > > > > > >           bool load_seen = false;
> > > > > > > > >           for (unsigned i = 0; i < in_nlanes; ++i)
> > > > > > > > >             {
> > > > > > > > > -             if (i % const_nunits == 0)
> > > > > > > > > +             if (i % (const_nunits * cp) == 0)
> > > > > > > > >                 {
> > > > > > > > >                   if (load_seen)
> > > > > > > > >                     *n_loads += 1;
> > > > > > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > > > > > > > index 72107afc883..8af3b558be4 100644
> > > > > > > > > --- a/gcc/tree-vect-stmts.cc
> > > > > > > > > +++ b/gcc/tree-vect-stmts.cc
> > > > > > > > > @@ -1397,25 +1397,70 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > > > > > > > >  {
> > > > > > > > >    gimple *init_stmt;
> > > > > > > > >    tree new_temp;
> > > > > > > > > +  tree scalar_type = TREE_TYPE (type);
> > > > > > > > > +  gimple_seq stmts = NULL;
> > > > > > > > > +
> > > > > > > > > +  if (TREE_CODE (TREE_TYPE (val)) == COMPLEX_TYPE)
> > > > > > > > > +    {
> > > > > > > > > +      unsigned HOST_WIDE_INT nunits;
> > > > > > > > > +      gcc_assert (TYPE_VECTOR_SUBPARTS (type).is_constant (&nunits));
> > > > > > > > >
> > > > > > > > > +      tree_vector_builder elts (type, nunits, 1);
> > > > > > > > > +      tree imag, real;
> > > > > > > > > +      if (TREE_CODE (val) == COMPLEX_CST)
> > > > > > > > > +       {
> > > > > > > > > +         real = fold_unary (REALPART_EXPR, scalar_type, val);
> > > > > > > > > +         imag = fold_unary (IMAGPART_EXPR, scalar_type, val);
> > > > > > > > > +       }
> > > > > > > > > +      else
> > > > > > > > > +       {
> > > > > > > > > +         real = make_ssa_name (scalar_type);
> > > > > > > > > +         imag = make_ssa_name (scalar_type);
> > > > > > > > > +         init_stmt
> > > > > > > > > +           = gimple_build_assign (real,
> > > > > > > > > +                                  build1 (REALPART_EXPR, scalar_type, val));
> > > > > > > > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > > > > > > > +         init_stmt
> > > > > > > > > +           = gimple_build_assign (imag,
> > > > > > > > > +                                  build1 (IMAGPART_EXPR, scalar_type, val));
> > > > > > > > > +         gimple_seq_add_stmt (&stmts, init_stmt);
> > > > > > > > > +       }
> > > > > > > > > +
> > > > > > > > > +      /* Build vector as [real,imag,real,imag,...].  */
> > > > > > > > > +      for (unsigned i = 0; i != nunits; i++)
> > > > > > > > > +       {
> > > > > > > > > +         if (i % 2)
> > > > > > > > > +           elts.quick_push (imag);
> > > > > > > > > +         else
> > > > > > > > > +           elts.quick_push (real);
> > > > > > > > > +       }
> > > > > > > > > +      val = gimple_build_vector (&stmts, &elts);
> > > > > > > > > +      if (!gimple_seq_empty_p (stmts))
> > > > > > > > > +       {
> > > > > > > > > +         if (gsi)
> > > > > > > > > +           gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > > > > > > > > +         else
> > > > > > > > > +           vinfo->insert_seq_on_entry (stmt_info, stmts);
> > > > > > > > > +       }
> > > > > > > > > +    }
> > > > > > > > >    /* We abuse this function to push sth to a SSA name with initial 'val'.  */
> > > > > > > > > -  if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > > > > > > > +  else if (! useless_type_conversion_p (type, TREE_TYPE (val)))
> > > > > > > > >      {
> > > > > > > > >        gcc_assert (TREE_CODE (type) == VECTOR_TYPE);
> > > > > > > > > -      if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
> > > > > > > > > +      if (! types_compatible_p (scalar_type, TREE_TYPE (val)))
> > > > > > > > >         {
> > > > > > > > >           /* Scalar boolean value should be transformed into
> > > > > > > > >              all zeros or all ones value before building a vector.  */
> > > > > > > > >           if (VECTOR_BOOLEAN_TYPE_P (type))
> > > > > > > > >             {
> > > > > > > > > -             tree true_val = build_all_ones_cst (TREE_TYPE (type));
> > > > > > > > > -             tree false_val = build_zero_cst (TREE_TYPE (type));
> > > > > > > > > +             tree true_val = build_all_ones_cst (scalar_type);
> > > > > > > > > +             tree false_val = build_zero_cst (scalar_type);
> > > > > > > > >
> > > > > > > > >               if (CONSTANT_CLASS_P (val))
> > > > > > > > >                 val = integer_zerop (val) ? false_val : true_val;
> > > > > > > > >               else
> > > > > > > > >                 {
> > > > > > > > > -                 new_temp = make_ssa_name (TREE_TYPE (type));
> > > > > > > > > +                 new_temp = make_ssa_name (scalar_type);
> > > > > > > > >                   init_stmt = gimple_build_assign (new_temp, COND_EXPR,
> > > > > > > > >                                                    val, true_val, false_val);
> > > > > > > > >                   vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
> > > > > > > > > @@ -1424,14 +1469,13 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
> > > > > > > > >             }
> > > > > > > > >           else
> > > > > > > > >             {
> > > > > > > > > -             gimple_seq stmts = NULL;
> > > > > > > > >               if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
> > > > > > > > >                 val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
> > > > > > > > > -                                   TREE_TYPE (type), val);
> > > > > > > > > +                                   scalar_type, val);
> > > > > > > > >               else
> > > > > > > > >                 /* ???  Condition vectorization expects us to do
> > > > > > > > >                    promotion of invariant/external defs.  */
> > > > > > > > > -               val = gimple_convert (&stmts, TREE_TYPE (type), val);
> > > > > > > > > +               val = gimple_convert (&stmts, scalar_type, val);
> > > > > > > > >               for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
> > > > > > > > >                    !gsi_end_p (gsi2); )
> > > > > > > > >                 {
> > > > > > > > > @@ -1496,7 +1540,12 @@ vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> > > > > > > > >                && VECTOR_BOOLEAN_TYPE_P (stmt_vectype))
> > > > > > > > >         vector_type = truth_type_for (stmt_vectype);
> > > > > > > > >        else
> > > > > > > > > -       vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
> > > > > > > > > +       {
> > > > > > > > > +         tree scalar_type = TREE_TYPE (op);
> > > > > > > > > +         if (STMT_VINFO_COMPLEX_P (stmt_vinfo))
> > > > > > > > > +           scalar_type = TREE_TYPE (scalar_type);
> > > > > > > > > +         vector_type = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
> > > > > > > > > +       }
> > > > > > > > >
> > > > > > > > >        gcc_assert (vector_type);
> > > > > > > > >        tree vop = vect_init_vector (vinfo, stmt_vinfo, op, vector_type, NULL);
> > > > > > > > > @@ -7509,8 +7558,17 @@ vectorizable_store (vec_info *vinfo,
> > > > > > > > >       same location twice.  */
> > > > > > > > >    gcc_assert (slp == PURE_SLP_STMT (stmt_info));
> > > > > > > > >
> > > > > > > > > +  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > > > > > > > +    return false;
> > > > > > > > > +
> > > > > > > > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info), rhs_vectype = NULL_TREE;
> > > > > > > > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > > > +    {
> > > > > > > > > +      if (!nunits.is_constant ())
> > > > > > > > > +       return false;
> > > > > > > > > +      nunits = exact_div (nunits, 2);
> > > > > > > > > +    }
> > > > > > > > >
> > > > > > > > >    if (loop_vinfo)
> > > > > > > > >      {
> > > > > > > > > @@ -7526,7 +7584,8 @@ vectorizable_store (vec_info *vinfo,
> > > > > > > > >    if (slp)
> > > > > > > > >      ncopies = 1;
> > > > > > > > >    else
> > > > > > > > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > > > > > > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > > > > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > > > > > >
> > > > > > > > >    gcc_assert (ncopies >= 1);
> > > > > > > > >
> > > > > > > > > @@ -7546,9 +7605,6 @@ vectorizable_store (vec_info *vinfo,
> > > > > > > > >    elem_type = TREE_TYPE (vectype);
> > > > > > > > >    vec_mode = TYPE_MODE (vectype);
> > > > > > > > >
> > > > > > > > > -  if (!STMT_VINFO_DATA_REF (stmt_info))
> > > > > > > > > -    return false;
> > > > > > > > > -
> > > > > > > > >    vect_memory_access_type memory_access_type;
> > > > > > > > >    enum dr_alignment_support alignment_support_scheme;
> > > > > > > > >    int misalignment;
> > > > > > > > > @@ -8778,6 +8834,12 @@ vectorizable_load (vec_info *vinfo,
> > > > > > > > >
> > > > > > > > >    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> > > > > > > > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> > > > > > > > > +  if (STMT_VINFO_COMPLEX_P (stmt_info))
> > > > > > > > > +    {
> > > > > > > > > +      if (!nunits.is_constant ())
> > > > > > > > > +       return false;
> > > > > > > > > +      nunits = exact_div (nunits, 2);
> > > > > > > > > +    }
> > > > > > > > >
> > > > > > > > >    if (loop_vinfo)
> > > > > > > > >      {
> > > > > > > > > @@ -8794,7 +8856,8 @@ vectorizable_load (vec_info *vinfo,
> > > > > > > > >    if (slp)
> > > > > > > > >      ncopies = 1;
> > > > > > > > >    else
> > > > > > > > > -    ncopies = vect_get_num_copies (loop_vinfo, vectype);
> > > > > > > > > +    ncopies = vect_get_num_copies (loop_vinfo, vectype,
> > > > > > > > > +                                  STMT_VINFO_COMPLEX_P (stmt_info));
> > > > > > > > >
> > > > > > > > >    gcc_assert (ncopies >= 1);
> > > > > > > > >
> > > > > > > > > @@ -8870,8 +8933,11 @@ vectorizable_load (vec_info *vinfo,
> > > > > > > > >                 if (k > maxk)
> > > > > > > > >                   maxk = k;
> > > > > > > > >               tree vectype = SLP_TREE_VECTYPE (slp_node);
> > > > > > > > > +             /* For complex type, half the nunits.  */
> > > > > > > > >               if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> > > > > > > > > -                 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> > > > > > > > > +                 || maxk >= (DR_GROUP_SIZE (group_info)
> > > > > > > > > +                             & ~((STMT_VINFO_COMPLEX_P (group_info)
> > > > > > > > > +                                  ? nunits >> 1 : nunits) - 1)))
> > > > > > > > >                 {
> > > > > > > > >                   if (dump_enabled_p ())
> > > > > > > > >                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > > > > > @@ -12499,12 +12565,27 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > > > >             dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > > > >                              "get vectype for scalar type: %T\n", scalar_type);
> > > > > > > > >         }
> > > > > > > > > +
> > > > > > > > > +      tree orig_scalar_type = scalar_type;
> > > > > > > > > +      if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > > > > > > > +       {
> > > > > > > > > +         /* Set complex_p for BB vectorizer.  */
> > > > > > > > > +         STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > > > > > +         scalar_type = TREE_TYPE (scalar_type);
> > > > > > > > > +         /* Double group_size for BB vectorizer to make
> > > > > > > > > +            following 2 get_vectype_for_scalar_type return wanted vectype.
> > > > > > > > > +            Real group size is not changed, just make the "faked" input
> > > > > > > > > +            group_size.  */
> > > > > > > > > +         group_size *= 2;
> > > > > > > > > +       }
> > > > > > > > >        vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
> > > > > > > > > -      if (!vectype)
> > > > > > > > > +      if (!vectype
> > > > > > > > > +         || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > > > > > +             && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()))
> > > > > > > > >         return opt_result::failure_at (stmt,
> > > > > > > > >                                        "not vectorized:"
> > > > > > > > >                                        " unsupported data-type %T\n",
> > > > > > > > > -                                      scalar_type);
> > > > > > > > > +                                      orig_scalar_type);
> > > > > > > > >
> > > > > > > > >        if (dump_enabled_p ())
> > > > > > > > >         dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
> > > > > > > > > @@ -12529,16 +12610,30 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > > > > > >                                                    TREE_TYPE (vectype));
> > > > > > > > >        if (scalar_type != TREE_TYPE (vectype))
> > > > > > > > >         {
> > > > > > > > > -         if (dump_enabled_p ())
> > > > > > > > > +         tree orig_scalar_type = scalar_type;
> > > > > > > > > +         if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
> > > > > > > > > +           {
> > > > > > > > > +             /* Set complex_p for Loop vectorizer.  */
> > > > > > > > > +             STMT_VINFO_COMPLEX_P (stmt_info) = true;
> > > > > > > > > +             scalar_type = TREE_TYPE (scalar_type);
> > > > > > > > > +             if (dump_enabled_p ())
> > > > > > > > > +               dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > > > > +                            "get complex for smallest scalar type: %T\n",
> > > > > > > > > +                            scalar_type);
> > > > > > > > > +
> > > > > > > > > +           }
> > > > > > > > > +         else if (dump_enabled_p ())
> > > > > > > > >             dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > > > >                              "get vectype for smallest scalar type: %T\n",
> > > > > > > > >                              scalar_type);
> > > > > > > > >           nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
> > > > > > > > >                                                         group_size);
> > > > > > > > > -         if (!nunits_vectype)
> > > > > > > > > +         if (!nunits_vectype
> > > > > > > > > +             || (STMT_VINFO_COMPLEX_P (stmt_info)
> > > > > > > > > +                 && !TYPE_VECTOR_SUBPARTS (nunits_vectype).is_constant ()))
> > > > > > > > >             return opt_result::failure_at
> > > > > > > > >               (stmt, "not vectorized: unsupported data-type %T\n",
> > > > > > > > > -              scalar_type);
> > > > > > > > > +              orig_scalar_type);
> > > > > > > > >           if (dump_enabled_p ())
> > > > > > > > >             dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
> > > > > > > > >                              nunits_vectype);
> > > > > > > > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > > > > > > > index e5fdc9e0a14..4a809e492c4 100644
> > > > > > > > > --- a/gcc/tree-vectorizer.h
> > > > > > > > > +++ b/gcc/tree-vectorizer.h
> > > > > > > > > @@ -1161,6 +1161,9 @@ public:
> > > > > > > > >       vectorization.  */
> > > > > > > > >    bool vectorizable;
> > > > > > > > >
> > > > > > > > > +  /* The scalar type of the LHS of this statement is complex type.  */
> > > > > > > > > +  bool complex_p;
> > > > > > > > > +
> > > > > > > > >    /* The stmt to which this info struct refers to.  */
> > > > > > > > >    gimple *stmt;
> > > > > > > > >
> > > > > > > > > @@ -1395,6 +1398,7 @@ struct gather_scatter_info {
> > > > > > > > >  #define STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT(S) (S)->reduc_epilogue_adjustment
> > > > > > > > >  #define STMT_VINFO_REDUC_IDX(S)                   (S)->reduc_idx
> > > > > > > > >  #define STMT_VINFO_FORCE_SINGLE_CYCLE(S)   (S)->force_single_cycle
> > > > > > > > > +#define STMT_VINFO_COMPLEX_P(S)            (S)->complex_p
> > > > > > > > >
> > > > > > > > >  #define STMT_VINFO_DR_WRT_VEC_LOOP(S)      (S)->dr_wrt_vec_loop
> > > > > > > > >  #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_wrt_vec_loop.base_address
> > > > > > > > > @@ -1970,6 +1974,15 @@ vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype)
> > > > > > > > >    return vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo), vectype);
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static inline unsigned int
> > > > > > > > > +vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype, bool complex_p)
> > > > > > > > > +{
> > > > > > > > > +  poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > > > > > > > +  if (complex_p)
> > > > > > > > > +    nunits *= 2;
> > > > > > > > > +  return vect_get_num_vectors (nunits, vectype);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  /* Update maximum unit count *MAX_NUNITS so that it accounts for
> > > > > > > > >     NUNITS.  *MAX_NUNITS can be 1 if we haven't yet recorded anything.  */
> > > > > > > > >
> > > > > > > > > --
> > > > > > > > > 2.18.1
> > > > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > --
> > > > > > > BR,
> > > > > > > Hongtao
> > > > >
> > > > >
> > > > >
> > > > > --
> > > > > BR,
> > > > > Hongtao
> >
> >
> >
> > --
> > BR,
> > Hongtao
>
>
>
> --
> BR,
> Hongtao
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
new file mode 100644
index 00000000000..b608f484934
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
@@ -0,0 +1,58 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
new file mode 100644
index 00000000000..0f377c3a548
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
@@ -0,0 +1,63 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-1a.c"
+
+void
+avx_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
+  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
+  char* p_init = (char*) malloc (2 * N * sizeof (double));
+  /* Each buffer holds N complex values (2 * N scalars); clear the dsts.  */
+  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
+  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
+  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
+  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
+  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
+  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
+  /* p_init is sized like the double buffers; each store converts i to char.  */
+  for (int i = 0; i != 2 * N * sizeof (double); i++)
+    p_init[i] = i;
+  /* Seed every source from the start of p_init.  */
+  memcpy (pd_src, p_init, 2 * N * sizeof (double));
+  memcpy (ps_src, p_init, 2 * N * sizeof (float));
+  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
+  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
+  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
+  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
+  /* Run each copy loop, then abort unless dst matches src byte for byte.  */
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
new file mode 100644
index 00000000000..f07e9fb2d3d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
@@ -0,0 +1,41 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#include <string.h>
+
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+static void
+do_test (void)
+{
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
+  /* Buffers hold N complex values (2 * N scalars); clear the destination.  */
+  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
+  /* Build a byte pattern; each store converts the int index to char.  */
+  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
+    p_init[i] = i;
+  /* Seed the source from that pattern.  */
+  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
+  /* Run the copy loop, then abort unless dst matches src byte for byte.  */
+  foo_ph (ph_dst, ph_src);
+  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
new file mode 100644
index 00000000000..d2e2f8d4f43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
@@ -0,0 +1,82 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+  a[8] = b[8];
+  a[9] = b[9];
+  a[10] = b[10];
+  a[11] = b[11];
+  a[12] = b[12];
+  a[13] = b[13];
+  a[14] = b[14];
+  a[15] = b[15];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
new file mode 100644
index 00000000000..ac360752693
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
@@ -0,0 +1,62 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-2a.c"
+
+void
+avx_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (32);
+  _Complex double* pd_dst = (_Complex double*) malloc (32);
+  _Complex float* ps_src = (_Complex float*) malloc (32);
+  _Complex float* ps_dst = (_Complex float*) malloc (32);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
+  _Complex int* epi32_src = (_Complex int*) malloc (32);
+  _Complex int* epi32_dst = (_Complex int*) malloc (32);
+  _Complex short* epi16_src = (_Complex short*) malloc (32);
+  _Complex short* epi16_dst = (_Complex short*) malloc (32);
+  _Complex char* epi8_src = (_Complex char*) malloc (32);
+  _Complex char* epi8_dst = (_Complex char*) malloc (32);
+  char* p = (char* ) malloc (32);
+  /* All buffers are 32 bytes; clear each destination before the copy.  */
+  __builtin_memset (pd_dst, 0, 32);
+  __builtin_memset (ps_dst, 0, 32);
+  __builtin_memset (epi64_dst, 0, 32);
+  __builtin_memset (epi32_dst, 0, 32);
+  __builtin_memset (epi16_dst, 0, 32);
+  __builtin_memset (epi8_dst, 0, 32);
+  /* Seed every source with the same byte pattern 0, 1, ..., 31.  */
+  for (int i = 0; i != 32; i++)
+    p[i] = i;
+  __builtin_memcpy (pd_src, p, 32);
+  __builtin_memcpy (ps_src, p, 32);
+  __builtin_memcpy (epi64_src, p, 32);
+  __builtin_memcpy (epi32_src, p, 32);
+  __builtin_memcpy (epi16_src, p, 32);
+  __builtin_memcpy (epi8_src, p, 32);
+  /* Run each kernel, then abort unless dst matches src byte for byte.  */
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
new file mode 100644
index 00000000000..a002f209ec9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
@@ -0,0 +1,47 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
+/* { dg-require-effective-target avx512fp16 } */
+
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+static void
+do_test (void)
+{
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
+  char* p = (char* ) malloc (32);
+
+  __builtin_memset (ph_dst, 0, 32);
+  /* Seed the source with the byte pattern 0, 1, ..., 31.  */
+  for (int i = 0; i != 32; i++)
+    p[i] = i;
+  __builtin_memcpy (ph_src, p, 32);
+  /* Copy through the kernel; abort unless dst matches src byte for byte.  */
+  foo_ph (ph_dst, ph_src);
+  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
new file mode 100644
index 00000000000..c1b64b56b1c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
@@ -0,0 +1,80 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{
+  a[0] = b[1];
+  a[1] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{
+  a[0] = b[1];
+  a[1] = b[0];
+  a[2] = b[3];
+  a[3] = b[2];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{
+  a[0] = b[1];
+  a[1] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{
+  a[0] = b[3];
+  a[1] = b[2];
+  a[2] = b[1];
+  a[3] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{
+  a[0] = b[7];
+  a[1] = b[6];
+  a[2] = b[5];
+  a[3] = b[4];
+  a[4] = b[3];
+  a[5] = b[2];
+  a[6] = b[1];
+  a[7] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{
+  a[0] = b[7];
+  a[1] = b[6];
+  a[2] = b[5];
+  a[3] = b[4];
+  a[4] = b[3];
+  a[5] = b[2];
+  a[6] = b[1];
+  a[7] = b[0];
+  a[8] = b[15];
+  a[9] = b[14];
+  a[10] = b[13];
+  a[11] = b[12];
+  a[12] = b[11];
+  a[13] = b[10];
+  a[14] = b[9];
+  a[15] = b[8];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
new file mode 100644
index 00000000000..e4fa3f3a541
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
@@ -0,0 +1,126 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr106010-3a.c"
+
+void
+avx2_test (void)
+{ /* Run each pr106010-3a.c kernel on a known byte pattern; abort on mismatch.  */
+  _Complex double* pd_src = (_Complex double*) malloc (32);
+  _Complex double* pd_dst = (_Complex double*) malloc (32);
+  _Complex double* pd_exp = (_Complex double*) malloc (32);
+  _Complex float* ps_src = (_Complex float*) malloc (32);
+  _Complex float* ps_dst = (_Complex float*) malloc (32);
+  _Complex float* ps_exp = (_Complex float*) malloc (32);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
+  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
+  _Complex int* epi32_src = (_Complex int*) malloc (32);
+  _Complex int* epi32_dst = (_Complex int*) malloc (32);
+  _Complex int* epi32_exp = (_Complex int*) malloc (32);
+  _Complex short* epi16_src = (_Complex short*) malloc (32);
+  _Complex short* epi16_dst = (_Complex short*) malloc (32);
+  _Complex short* epi16_exp = (_Complex short*) malloc (32);
+  _Complex char* epi8_src = (_Complex char*) malloc (32);
+  _Complex char* epi8_dst = (_Complex char*) malloc (32);
+  _Complex char* epi8_exp = (_Complex char*) malloc (32);
+  char* p = (char* ) malloc (32);
+  char* q = (char* ) malloc (32);
+
+  __builtin_memset (pd_dst, 0, 32);
+  __builtin_memset (ps_dst, 0, 32);
+  __builtin_memset (epi64_dst, 0, 32);
+  __builtin_memset (epi32_dst, 0, 32);
+  __builtin_memset (epi16_dst, 0, 32);
+  __builtin_memset (epi8_dst, 0, 32);
+
+  for (int i = 0; i != 32; i++)  /* Source byte K holds the value K.  */
+    p[i] = i;
+  __builtin_memcpy (pd_src, p, 32);
+  __builtin_memcpy (ps_src, p, 32);
+  __builtin_memcpy (epi64_src, p, 32);
+  __builtin_memcpy (epi32_src, p, 32);
+  __builtin_memcpy (epi16_src, p, 32);
+  __builtin_memcpy (epi8_src, p, 32);
+
+  for (int i = 0; i != 16; i++)  /* Expected: the two 16-byte halves exchanged.  */
+    {
+      p[i] = i + 16;
+      p[i + 16] = i;
+    }
+  __builtin_memcpy (pd_exp, p, 32);
+  __builtin_memcpy (epi64_exp, p, 32);
+
+  for (int i = 0; i != 8; i++)
+    { /* p: 8-byte groups swapped pairwise; q: the four 8-byte groups reversed.  */
+      p[i] = i + 8;
+      p[i + 8] = i;
+      p[i + 16] = i + 24;
+      p[i + 24] = i + 16;
+      q[i] = i + 24;
+      q[i + 8] = i + 16;
+      q[i + 16] = i + 8;
+      q[i + 24] = i;
+    }
+  __builtin_memcpy (ps_exp, p, 32);
+  __builtin_memcpy (epi32_exp, q, 32);
+
+
+  for (int i = 0; i != 4; i++)  /* Expected: the eight 4-byte groups reversed.  */
+    {
+      q[i] = i + 28;
+      q[i + 4] = i + 24;
+      q[i + 8] = i + 20;
+      q[i + 12] = i + 16;
+      q[i + 16] = i + 12;
+      q[i + 20] = i + 8;
+      q[i + 24] = i + 4;
+      q[i + 28] = i;
+    }
+  __builtin_memcpy (epi16_exp, q, 32);
+
+  for (int i = 0; i != 2; i++)
+    { /* Expected: 2-byte groups reversed within each 16-byte half.  */
+      q[i] = i + 14;
+      q[i + 2] = i + 12;
+      q[i + 4] = i + 10;
+      q[i + 6] = i + 8;
+      q[i + 8] = i + 6;
+      q[i + 10] = i + 4;
+      q[i + 12] = i + 2;
+      q[i + 14] = i;
+      q[i + 16] = i + 30;
+      q[i + 18] = i + 28;
+      q[i + 20] = i + 26;
+      q[i + 22] = i + 24;
+      q[i + 24] = i + 22;
+      q[i + 26] = i + 20;
+      q[i + 28] = i + 18;
+      q[i + 30] = i + 16;
+    }
+  __builtin_memcpy (epi8_exp, q, 32);
+
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)  /* Byte-exact comparison.  */
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
new file mode 100644
index 00000000000..5a5a3d4b992
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
@@ -0,0 +1,69 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{ /* Copy b[0..7] to a in the fixed order 1, 0, 4, 3, 7, 6, 2, 5.  */
+  a[0] = b[1];
+  a[1] = b[0];
+  a[2] = b[4];
+  a[3] = b[3];
+  a[4] = b[7];
+  a[5] = b[6];
+  a[6] = b[2];
+  a[7] = b[5];
+}
+
+void
+do_test (void)
+{
+  /* foo_ph reads b[0..7] and writes a[0..7]; each buffer gets 32 bytes
+     and the expected image below is built in 4-byte groups.  */
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
+  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
+  char* p = (char* ) malloc (32);
+
+  __builtin_memset (ph_dst, 0, 32);
+
+  /* Source byte K holds the value K, so every output byte names the
+     source byte it was copied from.  */
+  for (int i = 0; i != 32; i++)
+    p[i] = i;
+  __builtin_memcpy (ph_src, p, 32);
+
+  /* Expected image: the 4-byte group at offset 4*J equals the source
+     group at offset 4*S, where foo_ph performs a[J] = b[S].  The
+     stores write constants only, so reusing P in place is safe.  */
+  for (int i = 0; i != 4; i++)
+    {
+      p[i] = i + 4;
+      p[i + 4] = i;
+      p[i + 8] = i + 16;
+      p[i + 12] = i + 12;
+      p[i + 16] = i + 28;
+      p[i + 20] = i + 24;
+      p[i + 24] = i + 8;
+      p[i + 28] = i + 20;
+    }
+  __builtin_memcpy (ph_exp, p, 32);
+
+  /* Run the kernel on the pattern; any byte that differs from the
+     expected image fails the test.  */
+  foo_ph (ph_dst, ph_src);
+  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
new file mode 100644
index 00000000000..b7b0b532bb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
@@ -0,0 +1,101 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a,
+	_Complex double b1,
+	_Complex double b2)
+{ /* Store the by-value arguments b1, b2 to a[0], a[1].  */
+  a[0] = b1;
+  a[1] = b2;
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a,
+	_Complex float b1, _Complex float b2,
+	_Complex float b3, _Complex float b4)
+{ /* Store the by-value arguments b1..b4 to a[0..3] in order.  */
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a,
+	   _Complex long long b1,
+	   _Complex long long b2)
+{ /* Store the by-value arguments b1, b2 to a[0], a[1].  */
+  a[0] = b1;
+  a[1] = b2;
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a,
+	   _Complex int b1, _Complex int b2,
+	   _Complex int b3, _Complex int b4)
+{ /* Store the by-value arguments b1..b4 to a[0..3] in order.  */
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a,
+	   _Complex short b1, _Complex short b2,
+	   _Complex short b3, _Complex short b4,
+	   _Complex short b5, _Complex short b6,
+	   _Complex short b7,_Complex short b8)
+{ /* Store the by-value arguments b1..b8 to a[0..7] in order.  */
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+  a[4] = b5;
+  a[5] = b6;
+  a[6] = b7;
+  a[7] = b8;
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a,
+	  _Complex char b1, _Complex char b2,
+	  _Complex char b3, _Complex char b4,
+	  _Complex char b5, _Complex char b6,
+	  _Complex char b7,_Complex char b8,
+	  _Complex char b9, _Complex char b10,
+	  _Complex char b11, _Complex char b12,
+	  _Complex char b13, _Complex char b14,
+	  _Complex char b15,_Complex char b16)
+{ /* Store the by-value arguments b1..b16 to a[0..15] in order.  */
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+  a[4] = b5;
+  a[5] = b6;
+  a[6] = b7;
+  a[7] = b8;
+  a[8] = b9;
+  a[9] = b10;
+  a[10] = b11;
+  a[11] = b12;
+  a[12] = b13;
+  a[13] = b14;
+  a[14] = b15;
+  a[15] = b16;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
new file mode 100644
index 00000000000..e2e79508c4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
@@ -0,0 +1,67 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-4a.c"
+
+void
+avx_test (void)
+{ /* Pass source elements by value to each pr106010-4a.c kernel; dst must equal src.  */
+  _Complex double* pd_src = (_Complex double*) malloc (32);
+  _Complex double* pd_dst = (_Complex double*) malloc (32);
+  _Complex float* ps_src = (_Complex float*) malloc (32);
+  _Complex float* ps_dst = (_Complex float*) malloc (32);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
+  _Complex int* epi32_src = (_Complex int*) malloc (32);
+  _Complex int* epi32_dst = (_Complex int*) malloc (32);
+  _Complex short* epi16_src = (_Complex short*) malloc (32);
+  _Complex short* epi16_dst = (_Complex short*) malloc (32);
+  _Complex char* epi8_src = (_Complex char*) malloc (32);
+  _Complex char* epi8_dst = (_Complex char*) malloc (32);
+  char* p = (char* ) malloc (32);
+
+  __builtin_memset (pd_dst, 0, 32);
+  __builtin_memset (ps_dst, 0, 32);
+  __builtin_memset (epi64_dst, 0, 32);
+  __builtin_memset (epi32_dst, 0, 32);
+  __builtin_memset (epi16_dst, 0, 32);
+  __builtin_memset (epi8_dst, 0, 32);
+
+  for (int i = 0; i != 32; i++)  /* Source byte K holds the value K.  */
+    p[i] = i;
+  __builtin_memcpy (pd_src, p, 32);
+  __builtin_memcpy (ps_src, p, 32);
+  __builtin_memcpy (epi64_src, p, 32);
+  __builtin_memcpy (epi32_src, p, 32);
+  __builtin_memcpy (epi16_src, p, 32);
+  __builtin_memcpy (epi8_src, p, 32);
+
+  foo_pd (pd_dst, pd_src[0], pd_src[1]);
+  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
+  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
+  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
+  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
+	     epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
+  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
+	    epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
+	    epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
+	    epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
+
+  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)  /* In-order stores: dst is a copy of src.  */
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
new file mode 100644
index 00000000000..8e02aefe3b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
@@ -0,0 +1,54 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a,
+	_Complex _Float16 b1, _Complex _Float16 b2,
+	_Complex _Float16 b3, _Complex _Float16 b4,
+	_Complex _Float16 b5, _Complex _Float16 b6,
+	_Complex _Float16 b7,_Complex _Float16 b8)
+{ /* Store the by-value arguments b1..b8 to a[0..7] in order.  */
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+  a[4] = b5;
+  a[5] = b6;
+  a[6] = b7;
+  a[7] = b8;
+}
+
+void
+do_test (void)
+{ /* Pass ph_src[0..7] by value to foo_ph; ph_dst must then equal ph_src.  */
+
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
+
+  char* p = (char* ) malloc (32);
+
+  __builtin_memset (ph_dst, 0, 32);
+
+  for (int i = 0; i != 32; i++)  /* Source byte K holds the value K.  */
+    p[i] = i;
+
+  __builtin_memcpy (ph_src, p, 32);
+
+  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
+	  ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
+
+  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)  /* Byte-exact comparison.  */
+    __builtin_abort ();
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
new file mode 100644
index 00000000000..9d4a6f9846b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
@@ -0,0 +1,117 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{ /* Copy b[0..3] to a with the two halves exchanged: a[i] = b[(i + 2) % 4].  */
+  a[0] = b[2];
+  a[1] = b[3];
+  a[2] = b[0];
+  a[3] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{ /* Copy b[0..7] to a with the two halves exchanged: a[i] = b[(i + 4) % 8].  */
+  a[0] = b[4];
+  a[1] = b[5];
+  a[2] = b[6];
+  a[3] = b[7];
+  a[4] = b[0];
+  a[5] = b[1];
+  a[6] = b[2];
+  a[7] = b[3];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{ /* Copy b[0..3] to a with the two halves exchanged: a[i] = b[(i + 2) % 4].  */
+  a[0] = b[2];
+  a[1] = b[3];
+  a[2] = b[0];
+  a[3] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{ /* Copy b[0..7] to a with the two halves exchanged: a[i] = b[(i + 4) % 8].  */
+  a[0] = b[4];
+  a[1] = b[5];
+  a[2] = b[6];
+  a[3] = b[7];
+  a[4] = b[0];
+  a[5] = b[1];
+  a[6] = b[2];
+  a[7] = b[3];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{ /* Copy b[0..15] to a with the two halves exchanged: a[i] = b[(i + 8) % 16].  */
+  a[0] = b[8];
+  a[1] = b[9];
+  a[2] = b[10];
+  a[3] = b[11];
+  a[4] = b[12];
+  a[5] = b[13];
+  a[6] = b[14];
+  a[7] = b[15];
+  a[8] = b[0];
+  a[9] = b[1];
+  a[10] = b[2];
+  a[11] = b[3];
+  a[12] = b[4];
+  a[13] = b[5];
+  a[14] = b[6];
+  a[15] = b[7];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{ /* Copy b[0..31] to a with the two halves exchanged: a[i] = b[(i + 16) % 32].  */
+  a[0] = b[16];
+  a[1] = b[17];
+  a[2] = b[18];
+  a[3] = b[19];
+  a[4] = b[20];
+  a[5] = b[21];
+  a[6] = b[22];
+  a[7] = b[23];
+  a[8] = b[24];
+  a[9] = b[25];
+  a[10] = b[26];
+  a[11] = b[27];
+  a[12] = b[28];
+  a[13] = b[29];
+  a[14] = b[30];
+  a[15] = b[31];
+  a[16] = b[0];
+  a[17] = b[1];
+  a[18] = b[2];
+  a[19] = b[3];
+  a[20] = b[4];
+  a[21] = b[5];
+  a[22] = b[6];
+  a[23] = b[7];
+  a[24] = b[8];
+  a[25] = b[9];
+  a[26] = b[10];
+  a[27] = b[11];
+  a[28] = b[12];
+  a[29] = b[13];
+  a[30] = b[14];
+  a[31] = b[15];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
new file mode 100644
index 00000000000..d5c6ebeb5cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
@@ -0,0 +1,80 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-5a.c"
+
+void
+avx_test (void)
+{ /* Run each pr106010-5a.c kernel on a known byte pattern; abort on mismatch.  */
+  _Complex double* pd_src = (_Complex double*) malloc (64);
+  _Complex double* pd_dst = (_Complex double*) malloc (64);
+  _Complex double* pd_exp = (_Complex double*) malloc (64);
+  _Complex float* ps_src = (_Complex float*) malloc (64);
+  _Complex float* ps_dst = (_Complex float*) malloc (64);
+  _Complex float* ps_exp = (_Complex float*) malloc (64);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
+  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
+  _Complex int* epi32_src = (_Complex int*) malloc (64);
+  _Complex int* epi32_dst = (_Complex int*) malloc (64);
+  _Complex int* epi32_exp = (_Complex int*) malloc (64);
+  _Complex short* epi16_src = (_Complex short*) malloc (64);
+  _Complex short* epi16_dst = (_Complex short*) malloc (64);
+  _Complex short* epi16_exp = (_Complex short*) malloc (64);
+  _Complex char* epi8_src = (_Complex char*) malloc (64);
+  _Complex char* epi8_dst = (_Complex char*) malloc (64);
+  _Complex char* epi8_exp = (_Complex char*) malloc (64);
+  char* p = (char* ) malloc (64);
+  char* q = (char* ) malloc (64);
+
+  __builtin_memset (pd_dst, 0, 64);
+  __builtin_memset (ps_dst, 0, 64);
+  __builtin_memset (epi64_dst, 0, 64);
+  __builtin_memset (epi32_dst, 0, 64);
+  __builtin_memset (epi16_dst, 0, 64);
+  __builtin_memset (epi8_dst, 0, 64);
+
+  for (int i = 0; i != 64; i++)
+    { /* p: source byte K holds K; q: expected image, 32-byte halves exchanged.  */
+      p[i] = i;
+      q[i] = (i + 32) % 64;
+    }
+  __builtin_memcpy (pd_src, p, 64);
+  __builtin_memcpy (ps_src, p, 64);
+  __builtin_memcpy (epi64_src, p, 64);
+  __builtin_memcpy (epi32_src, p, 64);
+  __builtin_memcpy (epi16_src, p, 64);
+  __builtin_memcpy (epi8_src, p, 64);
+
+  __builtin_memcpy (pd_exp, q, 64);  /* The same expected image serves every kernel.  */
+  __builtin_memcpy (ps_exp, q, 64);
+  __builtin_memcpy (epi64_exp, q, 64);
+  __builtin_memcpy (epi32_exp, q, 64);
+  __builtin_memcpy (epi16_exp, q, 64);
+  __builtin_memcpy (epi8_exp, q, 64);
+
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+
+  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)  /* Byte-exact comparison.  */
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
new file mode 100644
index 00000000000..9ce4e6dd5c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
@@ -0,0 +1,62 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{ /* Copy b[0..15] to a with the two halves exchanged: a[i] = b[(i + 8) % 16].  */
+  a[0] = b[8];
+  a[1] = b[9];
+  a[2] = b[10];
+  a[3] = b[11];
+  a[4] = b[12];
+  a[5] = b[13];
+  a[6] = b[14];
+  a[7] = b[15];
+  a[8] = b[0];
+  a[9] = b[1];
+  a[10] = b[2];
+  a[11] = b[3];
+  a[12] = b[4];
+  a[13] = b[5];
+  a[14] = b[6];
+  a[15] = b[7];
+}
+
+void
+do_test (void)
+{ /* Run foo_ph on a known byte pattern; abort if the result differs from ph_exp.  */
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
+  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
+  char* p = (char* ) malloc (64);
+  char* q = (char* ) malloc (64);
+
+  __builtin_memset (ph_dst, 0, 64);
+
+  for (int i = 0; i != 64; i++)
+    { /* p: source byte K holds K; q: expected image, 32-byte halves exchanged.  */
+      p[i] = i;
+      q[i] = (i + 32) % 64;
+    }
+  __builtin_memcpy (ph_src, p, 64);
+
+  __builtin_memcpy (ph_exp, q, 64);
+
+  foo_ph (ph_dst, ph_src);
+
+  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)  /* Byte-exact comparison.  */
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
new file mode 100644
index 00000000000..65a90d03684
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
@@ -0,0 +1,115 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{ /* Copy b[0..3] to a in reverse element order.  */
+  a[0] = b[3];
+  a[1] = b[2];
+  a[2] = b[1];
+  a[3] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{ /* Copy b[0..7] to a in reverse element order.  */
+  a[0] = b[7];
+  a[1] = b[6];
+  a[2] = b[5];
+  a[3] = b[4];
+  a[4] = b[3];
+  a[5] = b[2];
+  a[6] = b[1];
+  a[7] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{ /* Copy b[0..3] to a in reverse element order.  */
+  a[0] = b[3];
+  a[1] = b[2];
+  a[2] = b[1];
+  a[3] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{ /* Copy b[0..7] to a in reverse element order.  */
+  a[0] = b[7];
+  a[1] = b[6];
+  a[2] = b[5];
+  a[3] = b[4];
+  a[4] = b[3];
+  a[5] = b[2];
+  a[6] = b[1];
+  a[7] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{ /* Copy b[0..15] to a in reverse element order.  */
+  a[0] = b[15];
+  a[1] = b[14];
+  a[2] = b[13];
+  a[3] = b[12];
+  a[4] = b[11];
+  a[5] = b[10];
+  a[6] = b[9];
+  a[7] = b[8];
+  a[8] = b[7];
+  a[9] = b[6];
+  a[10] = b[5];
+  a[11] = b[4];
+  a[12] = b[3];
+  a[13] = b[2];
+  a[14] = b[1];
+  a[15] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{ /* Copy b[0..31] to a in reverse element order.  */
+  a[0] = b[31];
+  a[1] = b[30];
+  a[2] = b[29];
+  a[3] = b[28];
+  a[4] = b[27];
+  a[5] = b[26];
+  a[6] = b[25];
+  a[7] = b[24];
+  a[8] = b[23];
+  a[9] = b[22];
+  a[10] = b[21];
+  a[11] = b[20];
+  a[12] = b[19];
+  a[13] = b[18];
+  a[14] = b[17];
+  a[15] = b[16];
+  a[16] = b[15];
+  a[17] = b[14];
+  a[18] = b[13];
+  a[19] = b[12];
+  a[20] = b[11];
+  a[21] = b[10];
+  a[22] = b[9];
+  a[23] = b[8];
+  a[24] = b[7];
+  a[25] = b[6];
+  a[26] = b[5];
+  a[27] = b[4];
+  a[28] = b[3];
+  a[29] = b[2];
+  a[30] = b[1];
+  a[31] = b[0];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
new file mode 100644
index 00000000000..1c5bb020939
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
@@ -0,0 +1,157 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr106010-6a.c"
+
+void
+avx2_test (void)
+{ /* Run each pr106010-6a.c kernel on a known byte pattern; abort on mismatch.  */
+  _Complex double* pd_src = (_Complex double*) malloc (64);
+  _Complex double* pd_dst = (_Complex double*) malloc (64);
+  _Complex double* pd_exp = (_Complex double*) malloc (64);
+  _Complex float* ps_src = (_Complex float*) malloc (64);
+  _Complex float* ps_dst = (_Complex float*) malloc (64);
+  _Complex float* ps_exp = (_Complex float*) malloc (64);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
+  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
+  _Complex int* epi32_src = (_Complex int*) malloc (64);
+  _Complex int* epi32_dst = (_Complex int*) malloc (64);
+  _Complex int* epi32_exp = (_Complex int*) malloc (64);
+  _Complex short* epi16_src = (_Complex short*) malloc (64);
+  _Complex short* epi16_dst = (_Complex short*) malloc (64);
+  _Complex short* epi16_exp = (_Complex short*) malloc (64);
+  _Complex char* epi8_src = (_Complex char*) malloc (64);
+  _Complex char* epi8_dst = (_Complex char*) malloc (64);
+  _Complex char* epi8_exp = (_Complex char*) malloc (64);
+  char* p = (char* ) malloc (64);
+  char* q = (char* ) malloc (64);
+
+  __builtin_memset (pd_dst, 0, 64);
+  __builtin_memset (ps_dst, 0, 64);
+  __builtin_memset (epi64_dst, 0, 64);
+  __builtin_memset (epi32_dst, 0, 64);
+  __builtin_memset (epi16_dst, 0, 64);
+  __builtin_memset (epi8_dst, 0, 64);
+
+  for (int i = 0; i != 64; i++)  /* Source byte K holds the value K.  */
+    p[i] = i;
+
+  __builtin_memcpy (pd_src, p, 64);
+  __builtin_memcpy (ps_src, p, 64);
+  __builtin_memcpy (epi64_src, p, 64);
+  __builtin_memcpy (epi32_src, p, 64);
+  __builtin_memcpy (epi16_src, p, 64);
+  __builtin_memcpy (epi8_src, p, 64);
+
+
+  for (int i = 0; i != 16; i++)  /* Expected: the four 16-byte groups reversed.  */
+    {
+      q[i] = i + 48;
+      q[i + 16] = i + 32;
+      q[i + 32] = i + 16;
+      q[i + 48] = i;
+    }
+ 
+  __builtin_memcpy (pd_exp, q, 64);
+  __builtin_memcpy (epi64_exp, q, 64);
+
+   for (int i = 0; i != 8; i++)  /* Expected: the eight 8-byte groups reversed.  */
+    {
+      q[i] = i + 56;
+      q[i + 8] = i + 48;
+      q[i + 16] = i + 40;
+      q[i + 24] = i + 32;
+      q[i + 32] = i + 24;
+      q[i + 40] = i + 16;
+      q[i + 48] = i + 8;
+      q[i + 56] = i;
+    }
+
+  __builtin_memcpy (ps_exp, q, 64);
+  __builtin_memcpy (epi32_exp, q, 64);
+
+  for (int i = 0; i != 4; i++)  /* Expected: the sixteen 4-byte groups reversed.  */
+    {
+      q[i] = i + 60;
+      q[i + 4] = i + 56;
+      q[i + 8] = i + 52;
+      q[i + 12] = i + 48;
+      q[i + 16] = i + 44;
+      q[i + 20] = i + 40;
+      q[i + 24] = i + 36;
+      q[i + 28] = i + 32;
+      q[i + 32] = i + 28;
+      q[i + 36] = i + 24;
+      q[i + 40] = i + 20;
+      q[i + 44] = i + 16;
+      q[i + 48] = i + 12;
+      q[i + 52] = i + 8;
+      q[i + 56] = i + 4;
+      q[i + 60] = i;
+    }
+
+  __builtin_memcpy (epi16_exp, q, 64);
+
+  for (int i = 0; i != 2; i++)  /* Expected: the thirty-two 2-byte groups reversed.  */
+    {
+      q[i] = i + 62;
+      q[i + 2] = i + 60;
+      q[i + 4] = i + 58;
+      q[i + 6] = i + 56;
+      q[i + 8] = i + 54;
+      q[i + 10] = i + 52;
+      q[i + 12] = i + 50;
+      q[i + 14] = i + 48;
+      q[i + 16] = i + 46;
+      q[i + 18] = i + 44;
+      q[i + 20] = i + 42;
+      q[i + 22] = i + 40;
+      q[i + 24] = i + 38;
+      q[i + 26] = i + 36;
+      q[i + 28] = i + 34;
+      q[i + 30] = i + 32;
+      q[i + 32] = i + 30;
+      q[i + 34] = i + 28;
+      q[i + 36] = i + 26;
+      q[i + 38] = i + 24;
+      q[i + 40] = i + 22;
+      q[i + 42] = i + 20;
+      q[i + 44] = i + 18;
+      q[i + 46] = i + 16;
+      q[i + 48] = i + 14;
+      q[i + 50] = i + 12;
+      q[i + 52] = i + 10;
+      q[i + 54] = i + 8;
+      q[i + 56] = i + 6;
+      q[i + 58] = i + 4;
+      q[i + 60] = i + 2;
+      q[i + 62] = i;
+    }
+  __builtin_memcpy (epi8_exp, q, 64);
+
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+
+  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)  /* Byte-exact comparison.  */
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
new file mode 100644
index 00000000000..b859d884a7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
@@ -0,0 +1,80 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{ /* Copy b[0..15] to a in reverse element order.  */
+  a[0] = b[15];
+  a[1] = b[14];
+  a[2] = b[13];
+  a[3] = b[12];
+  a[4] = b[11];
+  a[5] = b[10];
+  a[6] = b[9];
+  a[7] = b[8];
+  a[8] = b[7];
+  a[9] = b[6];
+  a[10] = b[5];
+  a[11] = b[4];
+  a[12] = b[3];
+  a[13] = b[2];
+  a[14] = b[1];
+  a[15] = b[0];
+}
+
+void
+do_test (void)
+{ /* Run foo_ph on a known byte pattern; abort if the result differs from ph_exp.  */
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
+  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
+  char* p = (char* ) malloc (64);
+  char* q = (char* ) malloc (64);
+
+  __builtin_memset (ph_dst, 0, 64);
+
+  for (int i = 0; i != 64; i++)  /* Source byte K holds the value K.  */
+    p[i] = i;
+
+  __builtin_memcpy (ph_src, p, 64);
+
+  for (int i = 0; i != 4; i++)  /* Expected: the sixteen 4-byte groups reversed.  */
+    {
+      q[i] = i + 60;
+      q[i + 4] = i + 56;
+      q[i + 8] = i + 52;
+      q[i + 12] = i + 48;
+      q[i + 16] = i + 44;
+      q[i + 20] = i + 40;
+      q[i + 24] = i + 36;
+      q[i + 28] = i + 32;
+      q[i + 32] = i + 28;
+      q[i + 36] = i + 24;
+      q[i + 40] = i + 20;
+      q[i + 44] = i + 16;
+      q[i + 48] = i + 12;
+      q[i + 52] = i + 8;
+      q[i + 56] = i + 4;
+      q[i + 60] = i;
+    }
+
+  __builtin_memcpy (ph_exp, q, 64);
+
+  foo_ph (ph_dst, ph_src);
+  
+  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)  /* Byte-exact comparison.  */
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
new file mode 100644
index 00000000000..2ea01fac927
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
@@ -0,0 +1,58 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double b)
+{ /* Store the loop-invariant complex double B into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float b)
+{ /* Store the loop-invariant complex float B into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long b)
+{ /* Store the loop-invariant complex long long B into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int b)
+{ /* Store the loop-invariant complex int B into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short b)
+{ /* Store the loop-invariant complex short B into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char b)
+{ /* Store the loop-invariant complex char B into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
new file mode 100644
index 00000000000..26482cc10f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
@@ -0,0 +1,63 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-7a.c"
+
+void
+avx_test (void)
+{ /* Runtime check that each foo_* kernel stores its scalar argument N times.  */
+  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
+  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
+  char* p_init = (char*) malloc (2 * N * sizeof (double));
+
+  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
+  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
+  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
+  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
+  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
+  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
+  /* Bytes alternate 3, 4, so all elements of a given type are identical.  */
+  for (int i = 0; i != 2 * N * sizeof (double); i++)
+    p_init[i] = i % 2 + 3;
+  /* Seed every *_src buffer; the *_dst buffers must stay zero until the calls.  */
+  memcpy (pd_src, p_init, 2 * N * sizeof (double));
+  memcpy (ps_src, p_init, 2 * N * sizeof (float));
+  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
+  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
+  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
+  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
+
+  foo_pd (pd_dst, pd_src[0]);
+  foo_ps (ps_dst, ps_src[0]);
+  foo_epi64 (epi64_dst, epi64_src[0]);
+  foo_epi32 (epi32_dst, epi32_src[0]);
+  foo_epi16 (epi16_dst, epi16_src[0]);
+  foo_epi8 (epi8_dst, epi8_src[0]);
+  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
new file mode 100644
index 00000000000..7f4056a5ecc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
@@ -0,0 +1,41 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#include <string.h>
+
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16 b)
+{ /* Store the loop-invariant complex _Float16 B into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+
+static void
+do_test (void)
+{ /* Runtime check that foo_ph stores its scalar argument into every slot.  */
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
+
+  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
+
+  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
+    p_init[i] = i % 2 + 3;  /* Bytes alternate 3, 4.  */
+
+  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
+
+  foo_ph (ph_dst, ph_src[0]);  /* Broadcast the first source element.  */
+  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
new file mode 100644
index 00000000000..11054b60d30
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
@@ -0,0 +1,58 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a)
+{ /* Store the complex constant 1.0 + 2.0i into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1.0 + 2.0i;
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a)
+{ /* Store the complex constant 1.0f + 2.0fi into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1.0f + 2.0fi;
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a)
+{ /* Store the complex integer constant 1 + 2i into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1 + 2i;
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a)
+{ /* Store the complex integer constant 1 + 2i into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1 + 2i;
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a)
+{ /* Store the complex integer constant 1 + 2i into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1 + 2i;
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a)
+{ /* Store the complex integer constant 1 + 2i into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1 + 2i;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
new file mode 100644
index 00000000000..6bb0073b691
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
@@ -0,0 +1,53 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-8a.c"
+
+void
+avx_test (void)
+{ /* Runtime check that each foo_* kernel stores its constant into all N slots.  */
+  _Complex double pd_src = 1.0 + 2.0i;
+  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex float ps_src = 1.0f + 2.0fi;
+  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex long long epi64_src = 1 + 2i;
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex int epi32_src = 1 + 2i;
+  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex short epi16_src = 1 + 2i;
+  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex char epi8_src = 1 + 2i;
+  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
+
+  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
+  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
+  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
+  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
+  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
+  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
+
+  foo_pd (pd_dst);
+  foo_ps (ps_dst);
+  foo_epi64 (epi64_dst);
+  foo_epi32 (epi32_dst);
+  foo_epi16 (epi16_dst);
+  foo_epi8 (epi8_dst);
+  for (int i = 0; i != N; i++)  /* Every element must equal the reference value.  */
+    {
+      if (pd_dst[i] != pd_src)
+	__builtin_abort ();
+      if (ps_dst[i] != ps_src)
+	__builtin_abort ();
+      if (epi64_dst[i] != epi64_src)
+	__builtin_abort ();
+      if (epi32_dst[i] != epi32_src)
+	__builtin_abort ();
+      if (epi16_dst[i] != epi16_src)
+	__builtin_abort ();
+      if (epi8_dst[i] != epi8_src)
+	__builtin_abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
new file mode 100644
index 00000000000..61ae131829d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
@@ -0,0 +1,38 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#include <string.h>
+
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a)
+{ /* Store the complex constant 1.0f16 + 2.0f16i into a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1.0f16 + 2.0f16i;
+}
+
+static void
+do_test (void)
+{ /* Runtime check that foo_ph stores its constant into all N slots.  */
+  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;  /* Same constant foo_ph stores.  */
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+
+  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
+
+  foo_ph (ph_dst);
+  for (int i = 0; i != N; i++)  /* Every element must equal the reference value.  */
+    {
+      if (ph_dst[i] != ph_src)
+	__builtin_abort ();
+    }
+}
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index d20a10a1524..42ee9df674c 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -1403,7 +1403,8 @@  vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
   if (PURE_SLP_STMT (stmt_info))
     ncopies = 1;
   else
-    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
+    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info),
+				   STMT_VINFO_COMPLEX_P (stmt_info));
 
   if (DR_IS_READ (dr_info->dr))
     vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
@@ -4597,8 +4598,22 @@  vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
 
       /* Set vectype for STMT.  */
       scalar_type = TREE_TYPE (DR_REF (dr));
-      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
-      if (!vectype)
+      tree adjust_scalar_type = scalar_type;
+      /* Support complex type accesses.  Note that gather/scatter is not
+	 supported for complex type loads/stores.  */
+      if (TREE_CODE (scalar_type) == COMPLEX_TYPE
+	  && gatherscatter == SG_NONE)
+	{
+	  adjust_scalar_type = TREE_TYPE (scalar_type);
+	  STMT_VINFO_COMPLEX_P (stmt_info) = true;
+	}
+      tree vectype = get_vectype_for_scalar_type (vinfo, adjust_scalar_type);
+      unsigned HOST_WIDE_INT constant_nunits;
+      if (!vectype
+	  /* For complex type, V1DI doesn't make sense.  */
+	  || (STMT_VINFO_COMPLEX_P (stmt_info)
+	      && (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&constant_nunits)
+		  || constant_nunits == 1)))
         {
           if (dump_enabled_p ())
             {
@@ -4635,8 +4650,11 @@  vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
 	}
 
       /* Adjust the minimal vectorization factor according to the
-	 vector type.  */
+	 vector type.  Note that for complex types, VF is half of
+	 TYPE_VECTOR_SUBPARTS.  */
       vf = TYPE_VECTOR_SUBPARTS (vectype);
+      if (STMT_VINFO_COMPLEX_P (stmt_info))
+	vf = exact_div (vf, 2);
       *min_vf = upper_bound (*min_vf, vf);
 
       /* Leave the BB vectorizer to pick the vector type later, based on
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 3a70c15b593..365fa738022 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -200,7 +200,12 @@  vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
     }
 
   if (nunits_vectype)
-    vect_update_max_nunits (vf, nunits_vectype);
+    {
+      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (nunits_vectype);
+      if (STMT_VINFO_COMPLEX_P (stmt_info))
+	nunits = exact_div (nunits, 2);
+      vect_update_max_nunits (vf, nunits);
+    }
 
   return opt_result::success ();
 }
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index dab5daddcc5..5d66ea2f286 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -877,10 +877,14 @@  vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
       return false;
     }
 
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  if (STMT_VINFO_COMPLEX_P (stmt_info))
+    nunits = exact_div (nunits, 2);
+
   /* If populating the vector type requires unrolling then fail
      before adjusting *max_nunits for basic-block vectorization.  */
   if (is_a <bb_vec_info> (vinfo)
-      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
+      && !multiple_p (group_size , nunits))
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -891,7 +895,7 @@  vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
     }
 
   /* In case of multiple types we need to detect the smallest type.  */
-  vect_update_max_nunits (max_nunits, vectype);
+  vect_update_max_nunits (max_nunits, nunits);
   return true;
 }
 
@@ -3720,22 +3724,54 @@  vect_optimize_slp (vec_info *vinfo)
 	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
 	 when permuting constants and invariants keeping the permute
 	 bijective.  */
-      auto_sbitmap load_index (SLP_TREE_LANES (node));
-      bitmap_clear (load_index);
-      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
-	bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
-      unsigned j;
-      for (j = 0; j < SLP_TREE_LANES (node); ++j)
-	if (!bitmap_bit_p (load_index, j))
-	  break;
-      if (j != SLP_TREE_LANES (node))
-	continue;
+      /* Permutation of Complex type.  */
+      if (STMT_VINFO_COMPLEX_P (dr_stmt))
+	{
+	  auto_sbitmap load_index (SLP_TREE_LANES (node) * 2);
+	  bitmap_clear (load_index);
+	  for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
+	    {
+	      unsigned bit = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
+	      bitmap_set_bit (load_index, 2 * bit);
+	      bitmap_set_bit (load_index, 2 * bit + 1);
+	    }
+	  unsigned j;
+	  for (j = 0; j < SLP_TREE_LANES (node) * 2; ++j)
+	    if (!bitmap_bit_p (load_index, j))
+	      break;
+	  if (j != SLP_TREE_LANES (node) * 2)
+	    continue;
 
-      vec<unsigned> perm = vNULL;
-      perm.safe_grow (SLP_TREE_LANES (node), true);
-      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
-	perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
-      perms.safe_push (perm);
+	  vec<unsigned> perm = vNULL;
+	  perm.safe_grow (SLP_TREE_LANES (node) * 2, true);
+	  for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
+	    {
+	      unsigned cidx = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
+	      perm[2 * j] = 2 * cidx;
+	      perm[2 * j + 1] = 2 * cidx + 1;
+	    }
+	  perms.safe_push (perm);
+	}
+      else
+	{
+	  auto_sbitmap load_index (SLP_TREE_LANES (node));
+	  bitmap_clear (load_index);
+	  for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
+	    bitmap_set_bit (load_index,
+			    SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
+	  unsigned j;
+	  for (j = 0; j < SLP_TREE_LANES (node); ++j)
+	    if (!bitmap_bit_p (load_index, j))
+	      break;
+	  if (j != SLP_TREE_LANES (node))
+	    continue;
+
+	  vec<unsigned> perm = vNULL;
+	  perm.safe_grow (SLP_TREE_LANES (node), true);
+	  for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
+	    perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
+	  perms.safe_push (perm);
+	}
       vertices[idx].perm_in = perms.length () - 1;
       vertices[idx].perm_out = perms.length () - 1;
     }
@@ -4518,6 +4554,12 @@  vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
 	vf = loop_vinfo->vectorization_factor;
       else
 	vf = 1;
+      /* For complex type and SLP, double vf to get the right vectype,
+	 i.e. for vector(4) double holding complex double with group size 2,
+	 double vf to map vf * group_size to TYPE_VECTOR_SUBPARTS.  */
+     if (STMT_VINFO_COMPLEX_P (stmt_info))
+       vf *= 2;
+
       unsigned int group_size = SLP_TREE_LANES (node);
       tree vectype = SLP_TREE_VECTYPE (node);
       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
@@ -4763,10 +4805,17 @@  vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
 	    }
 	  unsigned group_size = SLP_TREE_LANES (child);
 	  poly_uint64 vf = 1;
+
 	  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
 	    vf = loop_vinfo->vectorization_factor;
+
+	  /* V2SF holds just 1 complex element, so multiply by 2
+	     to get the actual number of vectors.  */
+	  unsigned cp
+	    = STMT_VINFO_COMPLEX_P (SLP_TREE_REPRESENTATIVE (node)) ? 2 : 1;
+
 	  SLP_TREE_NUMBER_OF_VEC_STMTS (child)
-	    = vect_get_num_vectors (vf * group_size, vector_type);
+	    = vect_get_num_vectors (vf * group_size * cp, vector_type);
 	  /* And cost them.  */
 	  vect_prologue_cost_for_slp (child, cost_vec);
 	}
@@ -6402,6 +6451,11 @@  vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
 
   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
   vector_type = SLP_TREE_VECTYPE (op_node);
+  unsigned int cp = 1;
+  /* Handle complex type vector init.  SLP_TREE_REPRESENTATIVE (op_node)
+     could be NULL, so test the type of the first operand instead.  */
+  if (TREE_CODE (TREE_TYPE (op_node->ops[0])) == COMPLEX_TYPE)
+    cp = 2;
 
   unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
@@ -6426,9 +6480,9 @@  vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
   /* When using duplicate_and_interleave, we just need one element for
      each scalar statement.  */
   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
-    nunits = group_size;
+    nunits = group_size * cp;
 
-  number_of_copies = nunits * number_of_vectors / group_size;
+  number_of_copies = nunits * number_of_vectors / (group_size * cp);
 
   number_of_places_left_in_vector = nunits;
   constant_p = true;
@@ -6460,8 +6514,23 @@  vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
 			gcc_unreachable ();
 		    }
 		  else
-		    op = fold_unary (VIEW_CONVERT_EXPR,
-				     TREE_TYPE (vector_type), op);
+		    {
+		      tree scalar_type = TREE_TYPE (vector_type);
+		      /* For complex type, insert real and imag part
+			 separately.  */
+		      if (cp == 2)
+			{
+			  gcc_assert ((TREE_CODE (TREE_TYPE (op))
+				       == COMPLEX_TYPE)
+				      && (scalar_type
+					  == TREE_TYPE (TREE_TYPE (op))));
+			  elts[number_of_places_left_in_vector--]
+			    = fold_unary (IMAGPART_EXPR, scalar_type, op);
+			  op = fold_unary (REALPART_EXPR, scalar_type, op);
+			}
+		      else
+			op = fold_unary (VIEW_CONVERT_EXPR, scalar_type, op);
+		    }
 		  gcc_assert (op && CONSTANT_CLASS_P (op));
 		}
 	      else
@@ -6481,11 +6550,28 @@  vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
 		    }
 		  else
 		    {
-		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
-				   op);
-		      init_stmt
-			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
-					       op);
+		      tree scalar_type = TREE_TYPE (vector_type);
+		      if (cp == 2)
+			{
+			  gcc_assert ((TREE_CODE (TREE_TYPE (op))
+				       == COMPLEX_TYPE)
+				      && (scalar_type
+					  == TREE_TYPE (TREE_TYPE (op))));
+			  tree imag = build1 (IMAGPART_EXPR, scalar_type, op);
+			  op = build1 (REALPART_EXPR, scalar_type, op);
+			  tree imag_temp = make_ssa_name (scalar_type);
+			  elts[number_of_places_left_in_vector--] = imag_temp;
+			  init_stmt = gimple_build_assign (imag_temp, imag);
+			  gimple_seq_add_stmt (&ctor_seq, init_stmt);
+			  init_stmt = gimple_build_assign (new_temp, op);
+			}
+		      else
+			{
+			  op = build1 (VIEW_CONVERT_EXPR, scalar_type, op);
+			  init_stmt
+			    = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
+						   op);
+			}
 		    }
 		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
 		  op = new_temp;
@@ -6696,15 +6782,17 @@  vect_transform_slp_perm_load (vec_info *vinfo,
   unsigned int nelts_to_build;
   unsigned int nvectors_per_build;
   unsigned int in_nlanes;
+  unsigned int cp = STMT_VINFO_COMPLEX_P (stmt_info) ? 2 : 1;
   bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
-		      && multiple_p (nunits, group_size));
+		      && multiple_p (nunits, group_size * cp));
   if (repeating_p)
     {
       /* A single vector contains a whole number of copies of the node, so:
 	 (a) all permutes can use the same mask; and
 	 (b) the permutes only need a single vector input.  */
-      mask.new_vector (nunits, group_size, 3);
-      nelts_to_build = mask.encoded_nelts ();
+      /* For complex type, mask size should be double of nelts_to_build.  */
+      mask.new_vector (nunits, group_size * cp, 3);
+      nelts_to_build = mask.encoded_nelts () / cp;
       nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
       in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
     }
@@ -6744,8 +6832,8 @@  vect_transform_slp_perm_load (vec_info *vinfo,
 	{
 	  /* Enforced before the loop when !repeating_p.  */
 	  unsigned int const_nunits = nunits.to_constant ();
-	  vec_index = i / const_nunits;
-	  mask_element = i % const_nunits;
+	  vec_index = i / (const_nunits / cp);
+	  mask_element = i % (const_nunits / cp);
 	  if (vec_index == first_vec_index
 	      || first_vec_index == -1)
 	    {
@@ -6755,7 +6843,7 @@  vect_transform_slp_perm_load (vec_info *vinfo,
 		   || second_vec_index == -1)
 	    {
 	      second_vec_index = vec_index;
-	      mask_element += const_nunits;
+	      mask_element += (const_nunits / cp);
 	    }
 	  else
 	    {
@@ -6768,14 +6856,24 @@  vect_transform_slp_perm_load (vec_info *vinfo,
 	      return false;
 	    }
 
-	  gcc_assert (mask_element < 2 * const_nunits);
+	  gcc_assert (mask_element < 2 * const_nunits / cp);
 	}
 
       if (mask_element != index)
 	noop_p = false;
-      mask[index++] = mask_element;
+      /* Set the indices for a complex type,
+	 i.e. a mask like [1, 0] is actually [2, 3, 0, 1]
+	 in terms of the vector's scalar elements.  */
+      if (cp == 2)
+	{
+	  mask[2 * index] = 2 * mask_element;
+	  mask[2 * index + 1] = 2 * mask_element + 1;
+	}
+      else
+	mask[index] = mask_element;
+      index++;
 
-      if (index == count && !noop_p)
+      if (index * cp == count && !noop_p)
 	{
 	  indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
 	  if (!can_vec_perm_const_p (mode, mode, indices))
@@ -6799,7 +6897,7 @@  vect_transform_slp_perm_load (vec_info *vinfo,
 	  ++*n_perms;
 	}
 
-      if (index == count)
+      if (index * cp == count)
 	{
 	  if (!analyze_only)
 	    {
@@ -6869,7 +6967,7 @@  vect_transform_slp_perm_load (vec_info *vinfo,
 	  bool load_seen = false;
 	  for (unsigned i = 0; i < in_nlanes; ++i)
 	    {
-	      if (i % const_nunits == 0)
+	      if (i % (const_nunits * cp) == 0)
 		{
 		  if (load_seen)
 		    *n_loads += 1;
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 72107afc883..8af3b558be4 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1397,25 +1397,70 @@  vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
 {
   gimple *init_stmt;
   tree new_temp;
+  tree scalar_type = TREE_TYPE (type);
+  gimple_seq stmts = NULL;
+
+  if (TREE_CODE (TREE_TYPE (val)) == COMPLEX_TYPE)
+    {
+      unsigned HOST_WIDE_INT nunits;
+      gcc_assert (TYPE_VECTOR_SUBPARTS (type).is_constant (&nunits));
 
+      tree_vector_builder elts (type, nunits, 1);
+      tree imag, real;
+      if (TREE_CODE (val) == COMPLEX_CST)
+	{
+	  real = fold_unary (REALPART_EXPR, scalar_type, val);
+	  imag = fold_unary (IMAGPART_EXPR, scalar_type, val);
+	}
+      else
+	{
+	  real = make_ssa_name (scalar_type);
+	  imag = make_ssa_name (scalar_type);
+	  init_stmt
+	    = gimple_build_assign (real,
+				   build1 (REALPART_EXPR, scalar_type, val));
+	  gimple_seq_add_stmt (&stmts, init_stmt);
+	  init_stmt
+	    = gimple_build_assign (imag,
+				   build1 (IMAGPART_EXPR, scalar_type, val));
+	  gimple_seq_add_stmt (&stmts, init_stmt);
+	}
+
+      /* Build vector as [real,imag,real,imag,...].  */
+      for (unsigned i = 0; i != nunits; i++)
+	{
+	  if (i % 2)
+	    elts.quick_push (imag);
+	  else
+	    elts.quick_push (real);
+	}
+      val = gimple_build_vector (&stmts, &elts);
+      if (!gimple_seq_empty_p (stmts))
+	{
+	  if (gsi)
+	    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+	  else
+	    vinfo->insert_seq_on_entry (stmt_info, stmts);
+	}
+    }
   /* We abuse this function to push sth to a SSA name with initial 'val'.  */
-  if (! useless_type_conversion_p (type, TREE_TYPE (val)))
+  else if (! useless_type_conversion_p (type, TREE_TYPE (val)))
     {
       gcc_assert (TREE_CODE (type) == VECTOR_TYPE);
-      if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
+      if (! types_compatible_p (scalar_type, TREE_TYPE (val)))
 	{
 	  /* Scalar boolean value should be transformed into
 	     all zeros or all ones value before building a vector.  */
 	  if (VECTOR_BOOLEAN_TYPE_P (type))
 	    {
-	      tree true_val = build_all_ones_cst (TREE_TYPE (type));
-	      tree false_val = build_zero_cst (TREE_TYPE (type));
+	      tree true_val = build_all_ones_cst (scalar_type);
+	      tree false_val = build_zero_cst (scalar_type);
 
 	      if (CONSTANT_CLASS_P (val))
 		val = integer_zerop (val) ? false_val : true_val;
 	      else
 		{
-		  new_temp = make_ssa_name (TREE_TYPE (type));
+		  new_temp = make_ssa_name (scalar_type);
 		  init_stmt = gimple_build_assign (new_temp, COND_EXPR,
 						   val, true_val, false_val);
 		  vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
@@ -1424,14 +1469,13 @@  vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
 	    }
 	  else
 	    {
-	      gimple_seq stmts = NULL;
 	      if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
 		val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
-				    TREE_TYPE (type), val);
+				    scalar_type, val);
 	      else
 		/* ???  Condition vectorization expects us to do
 		   promotion of invariant/external defs.  */
-		val = gimple_convert (&stmts, TREE_TYPE (type), val);
+		val = gimple_convert (&stmts, scalar_type, val);
 	      for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
 		   !gsi_end_p (gsi2); )
 		{
@@ -1496,7 +1540,12 @@  vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
 	       && VECTOR_BOOLEAN_TYPE_P (stmt_vectype))
 	vector_type = truth_type_for (stmt_vectype);
       else
-	vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
+	{
+	  tree scalar_type = TREE_TYPE (op);
+	  if (STMT_VINFO_COMPLEX_P (stmt_vinfo))
+	    scalar_type = TREE_TYPE (scalar_type);
+	  vector_type = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
+	}
 
       gcc_assert (vector_type);
       tree vop = vect_init_vector (vinfo, stmt_vinfo, op, vector_type, NULL);
@@ -7509,8 +7558,17 @@  vectorizable_store (vec_info *vinfo,
      same location twice.  */
   gcc_assert (slp == PURE_SLP_STMT (stmt_info));
 
+  if (!STMT_VINFO_DATA_REF (stmt_info))
+    return false;
+
   tree vectype = STMT_VINFO_VECTYPE (stmt_info), rhs_vectype = NULL_TREE;
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  if (STMT_VINFO_COMPLEX_P (stmt_info))
+    {
+      if (!nunits.is_constant ())
+	return false;
+      nunits = exact_div (nunits, 2);
+    }
 
   if (loop_vinfo)
     {
@@ -7526,7 +7584,8 @@  vectorizable_store (vec_info *vinfo,
   if (slp)
     ncopies = 1;
   else
-    ncopies = vect_get_num_copies (loop_vinfo, vectype);
+    ncopies = vect_get_num_copies (loop_vinfo, vectype,
+				   STMT_VINFO_COMPLEX_P (stmt_info));
 
   gcc_assert (ncopies >= 1);
 
@@ -7546,9 +7605,6 @@  vectorizable_store (vec_info *vinfo,
   elem_type = TREE_TYPE (vectype);
   vec_mode = TYPE_MODE (vectype);
 
-  if (!STMT_VINFO_DATA_REF (stmt_info))
-    return false;
-
   vect_memory_access_type memory_access_type;
   enum dr_alignment_support alignment_support_scheme;
   int misalignment;
@@ -8778,6 +8834,12 @@  vectorizable_load (vec_info *vinfo,
 
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  if (STMT_VINFO_COMPLEX_P (stmt_info))
+    {
+      if (!nunits.is_constant ())
+	return false;
+      nunits = exact_div (nunits, 2);
+    }
 
   if (loop_vinfo)
     {
@@ -8794,7 +8856,8 @@  vectorizable_load (vec_info *vinfo,
   if (slp)
     ncopies = 1;
   else
-    ncopies = vect_get_num_copies (loop_vinfo, vectype);
+    ncopies = vect_get_num_copies (loop_vinfo, vectype,
+				   STMT_VINFO_COMPLEX_P (stmt_info));
 
   gcc_assert (ncopies >= 1);
 
@@ -8870,8 +8933,11 @@  vectorizable_load (vec_info *vinfo,
 		if (k > maxk)
 		  maxk = k;
 	      tree vectype = SLP_TREE_VECTYPE (slp_node);
+	      /* For complex type, half the nunits.  */
 	      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
-		  || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
+		  || maxk >= (DR_GROUP_SIZE (group_info)
+			      & ~((STMT_VINFO_COMPLEX_P (group_info)
+				   ? nunits >> 1 : nunits) - 1)))
 		{
 		  if (dump_enabled_p ())
 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -12499,12 +12565,27 @@  vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
 	    dump_printf_loc (MSG_NOTE, vect_location,
 			     "get vectype for scalar type: %T\n", scalar_type);
 	}
+
+      tree orig_scalar_type = scalar_type;
+      if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
+	{
+	  /* Set complex_p for BB vectorizer.  */
+	  STMT_VINFO_COMPLEX_P (stmt_info) = true;
+	  scalar_type = TREE_TYPE (scalar_type);
+	  /* Double group_size for the BB vectorizer so that the following
+	     two get_vectype_for_scalar_type calls return the wanted vectype.
+	     The real group size is not changed; only the "faked" input
+	     group_size passed to those calls is doubled.  */
+	  group_size *= 2;
+	}
       vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
-      if (!vectype)
+      if (!vectype
+	  || (STMT_VINFO_COMPLEX_P (stmt_info)
+	      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()))
 	return opt_result::failure_at (stmt,
 				       "not vectorized:"
 				       " unsupported data-type %T\n",
-				       scalar_type);
+				       orig_scalar_type);
 
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
@@ -12529,16 +12610,30 @@  vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
 						   TREE_TYPE (vectype));
       if (scalar_type != TREE_TYPE (vectype))
 	{
-	  if (dump_enabled_p ())
+	  tree orig_scalar_type = scalar_type;
+	  if (TREE_CODE (scalar_type) == COMPLEX_TYPE)
+	    {
+	      /* Set complex_p for Loop vectorizer.  */
+	      STMT_VINFO_COMPLEX_P (stmt_info) = true;
+	      scalar_type = TREE_TYPE (scalar_type);
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+			     "get complex for smallest scalar type: %T\n",
+			     scalar_type);
+
+	    }
+	  else if (dump_enabled_p ())
 	    dump_printf_loc (MSG_NOTE, vect_location,
 			     "get vectype for smallest scalar type: %T\n",
 			     scalar_type);
 	  nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
 							group_size);
-	  if (!nunits_vectype)
+	  if (!nunits_vectype
+	      || (STMT_VINFO_COMPLEX_P (stmt_info)
+		  && !TYPE_VECTOR_SUBPARTS (nunits_vectype).is_constant ()))
 	    return opt_result::failure_at
 	      (stmt, "not vectorized: unsupported data-type %T\n",
-	       scalar_type);
+	       orig_scalar_type);
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
 			     nunits_vectype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e5fdc9e0a14..4a809e492c4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1161,6 +1161,9 @@  public:
      vectorization.  */
   bool vectorizable;
 
+  /* True if the scalar type of this statement's LHS is a complex type.  */
+  bool complex_p;
+
   /* The stmt to which this info struct refers to.  */
   gimple *stmt;
 
@@ -1395,6 +1398,7 @@  struct gather_scatter_info {
 #define STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT(S) (S)->reduc_epilogue_adjustment
 #define STMT_VINFO_REDUC_IDX(S)		   (S)->reduc_idx
 #define STMT_VINFO_FORCE_SINGLE_CYCLE(S)   (S)->force_single_cycle
+#define STMT_VINFO_COMPLEX_P(S)            (S)->complex_p
 
 #define STMT_VINFO_DR_WRT_VEC_LOOP(S)      (S)->dr_wrt_vec_loop
 #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_wrt_vec_loop.base_address
@@ -1970,6 +1974,15 @@  vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype)
   return vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo), vectype);
 }
 
+static inline unsigned int
+vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype, bool complex_p)
+{
+  /* A complex element takes two lanes of VECTYPE, so scale the VF by two.  */
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  vf *= complex_p ? 2 : 1;
+  return vect_get_num_vectors (vf, vectype);
+}
+
 /* Update maximum unit count *MAX_NUNITS so that it accounts for
    NUNITS.  *MAX_NUNITS can be 1 if we haven't yet recorded anything.  */