Message ID | 20240611064901.37222-1-lin1.hu@intel.com |
---|---|
State | New |
Headers | show |
Series | [1/3,v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. | expand |
Ping this thread.
BRs,
Lin
-----Original Message-----
From: Hu, Lin1 <lin1.hu@intel.com>
Sent: Tuesday, June 11, 2024 2:49 PM
To: gcc-patches@gcc.gnu.org
Cc: Liu, Hongtao <hongtao.liu@intel.com>; ubizjak@gmail.com; rguenther@suse.de
Subject: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
I wrap a part of code about indirect conversion. The API refers to supportable_narrowing/widening_operations.
BRs,
Lin
gcc/ChangeLog:
PR target/107432
* tree-vect-generic.cc
(expand_vector_conversion): Support convert for int -> int,
float -> float and int <-> float.
* tree-vect-stmts.cc (vectorizable_conversion): Wrap the
indirect convert part.
(supportable_indirect_convert_operation): New function.
* tree-vectorizer.h (supportable_indirect_convert_operation):
Define the new function.
gcc/testsuite/ChangeLog:
PR target/107432
* gcc.target/i386/pr107432-1.c: New test.
* gcc.target/i386/pr107432-2.c: Ditto.
* gcc.target/i386/pr107432-3.c: Ditto.
* gcc.target/i386/pr107432-4.c: Ditto.
* gcc.target/i386/pr107432-5.c: Ditto.
* gcc.target/i386/pr107432-6.c: Ditto.
* gcc.target/i386/pr107432-7.c: Ditto.
---
gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 ++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++ gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 +++++ gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 +++++ gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 ++++++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++ gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 +++++++++++++
gcc/tree-vect-generic.cc | 33 ++-
gcc/tree-vect-stmts.cc | 244 +++++++++++++--------
gcc/tree-vectorizer.h | 9 +
10 files changed, 1011 insertions(+), 92 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c
new file mode 100644
index 00000000000..a4f37447eb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -0,0 +1,234 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } }
+} */
+/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } }
+} */
+/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include <x86intrin.h>
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef
+char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi
+__attribute__ ((__vector_size__ (4))); typedef char __v8qi
+__attribute__ ((__vector_size__ (8)));
+
+typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4)));
+typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8)));
+typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2)));
+typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4)));
+typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8)));
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) {
+ return __builtin_convertvector((__v2di)a, __v2si); }
+
+__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
+{
+ return (__m128i)__builtin_convertvector((__v4di)a, __v4si); }
+
+__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
+{
+ return (__m256i)__builtin_convertvector((__v8di)a, __v8si); }
+
+__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
+{
+ return __builtin_convertvector((__v2di)a, __v2hi); }
+
+__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
+{
+ return __builtin_convertvector((__v4di)a, __v4hi); }
+
+__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
+{
+ return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); }
+
+__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
+{
+ return __builtin_convertvector((__v2di)a, __v2qi); }
+
+__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
+{
+ return __builtin_convertvector((__v4di)a, __v4qi); }
+
+__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a)
+{
+ return __builtin_convertvector((__v8di)a, __v8qi); }
+
+__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a)
+{
+ return __builtin_convertvector((__v2si)a, __v2hi); }
+
+__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
+{
+ return __builtin_convertvector((__v4si)a, __v4hi); }
+
+__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a)
+{
+ return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); }
+
+__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a)
+{
+ return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); }
+
+__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a)
+{
+ return __builtin_convertvector((__v2si)a, __v2qi); }
+
+__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a)
+{
+ return __builtin_convertvector((__v4si)a, __v4qi); }
+
+__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a)
+{
+ return __builtin_convertvector((__v8si)a, __v8qi); }
+
+__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a)
+{
+ return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); }
+
+__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a)
+{
+ return __builtin_convertvector((__v2hi)a, __v2qi); }
+
+__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a)
+{
+ return __builtin_convertvector((__v8hi)a, __v8qi); }
+
+__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a)
+{
+ return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); }
+
+__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a)
+{
+ return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); }
+
+__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) {
+ return __builtin_convertvector((__v2du)a, __v2su); }
+
+__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a)
+{
+ return (__m128i)__builtin_convertvector((__v4du)a, __v4su); }
+
+__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a)
+{
+ return (__m256i)__builtin_convertvector((__v8du)a, __v8su); }
+
+__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a)
+{
+ return __builtin_convertvector((__v2du)a, __v2hu); }
+
+__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a)
+{
+ return __builtin_convertvector((__v4du)a, __v4hu); }
+
+__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a)
+{
+ return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); }
+
+__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a)
+{
+ return __builtin_convertvector((__v2du)a, __v2qu); }
+
+__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a)
+{
+ return __builtin_convertvector((__v4du)a, __v4qu); }
+
+__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a)
+{
+ return __builtin_convertvector((__v8du)a, __v8qu); }
+
+__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a)
+{
+ return __builtin_convertvector((__v2su)a, __v2hu); }
+
+__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a)
+{
+ return __builtin_convertvector((__v4su)a, __v4hu); }
+
+__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a)
+{
+ return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); }
+
+__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a)
+{
+ return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); }
+
+__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a)
+{
+ return __builtin_convertvector((__v2su)a, __v2qu); }
+
+__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a)
+{
+ return __builtin_convertvector((__v4su)a, __v4qu); }
+
+__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a)
+{
+ return __builtin_convertvector((__v8su)a, __v8qu); }
+
+__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a)
+{
+ return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); }
+
+__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a)
+{
+ return __builtin_convertvector((__v2hu)a, __v2qu); }
+
+__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a)
+{
+ return __builtin_convertvector((__v8hu)a, __v8qu); }
+
+__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a)
+{
+ return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); }
+
+__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a)
+{
+ return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); }
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c
new file mode 100644
index 00000000000..02ffd811cb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c
@@ -0,0 +1,105 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */
+
+#include <x86intrin.h>
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef
+char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi
+__attribute__ ((__vector_size__ (4))); typedef char __v8qi
+__attribute__ ((__vector_size__ (8)));
+
+__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) {
+ return __builtin_convertvector(a, __v2di); }
+
+__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a)
+{
+ return (__m256i)__builtin_convertvector(a, __v4di); }
+
+__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a)
+{
+ return (__m512i)__builtin_convertvector(a, __v8di); }
+
+__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) {
+ return __builtin_convertvector(a, __v2di); }
+
+__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a)
+{
+ return (__m256i)__builtin_convertvector(a, __v4di); }
+
+__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a)
+{
+ return (__m512i)__builtin_convertvector(a, __v8di); }
+
+__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) {
+ return __builtin_convertvector(a, __v2di); }
+
+__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a)
+{
+ return (__m256i)__builtin_convertvector(a, __v4di); }
+
+__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a)
+{
+ return (__m512i)__builtin_convertvector(a, __v8di); }
+
+__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) {
+ return (__m128i)__builtin_convertvector(a, __v4si); }
+
+__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a)
+{
+ return (__m256i)__builtin_convertvector(a, __v8si); }
+
+__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a)
+{
+ return (__m512i)__builtin_convertvector(a, __v16si); }
+
+__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) {
+ return (__m128i)__builtin_convertvector(a, __v4si); }
+
+__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a)
+{
+ return (__m256i)__builtin_convertvector(a, __v8si); }
+
+__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a)
+{
+ return (__m512i)__builtin_convertvector(a, __v16si); }
+
+__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) {
+ return (__m128i)__builtin_convertvector(a, __v8hi); }
+
+__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a)
+{
+ return (__m256i)__builtin_convertvector(a, __v16hi); }
+
+__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a)
+{
+ return __builtin_convertvector(a, __v32hi); }
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c
new file mode 100644
index 00000000000..30dc947b6dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c
@@ -0,0 +1,55 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */
+/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */
+/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */
+
+#include <x86intrin.h>
+
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef
+_Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+
+__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) {
+ return __builtin_convertvector(a, __v2sf); }
+
+__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a)
+{
+ return __builtin_convertvector(a, __v4sf); }
+
+__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a)
+{
+ return __builtin_convertvector(a, __v8sf); }
+
+__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) {
+ return __builtin_convertvector(a, __v2hf); }
+
+__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a)
+{
+ return __builtin_convertvector(a, __v4hf); }
+
+__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a)
+{
+ return __builtin_convertvector(a, __v8hf); }
+
+__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) {
+ return __builtin_convertvector(a, __v4hf); }
+
+__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a)
+{
+ return __builtin_convertvector(a, __v8hf); }
+
+__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a)
+{
+ return __builtin_convertvector(a, __v16hf); }
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c
new file mode 100644
index 00000000000..e537e7349e4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */
+/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */
+
+#include <x86intrin.h>
+
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef
+_Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+
+__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) {
+ return __builtin_convertvector(a, __v2df); }
+
+__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a)
+{
+ return __builtin_convertvector(a, __v4df); }
+
+__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a)
+{
+ return __builtin_convertvector(a, __v8df); }
+
+__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) {
+ return __builtin_convertvector(a, __v2df); }
+
+__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a)
+{
+ return __builtin_convertvector(a, __v4df); }
+
+__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a)
+{
+ return __builtin_convertvector(a, __v8df); }
+
+__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) {
+ return __builtin_convertvector(a, __v4sf); }
+
+__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a)
+{
+ return __builtin_convertvector(a, __v8sf); }
+
+__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a)
+{
+ return __builtin_convertvector(a, __v16sf); }
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c
new file mode 100644
index 00000000000..5a44ef9f3b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3"
+} */
+/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */
+/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */
+/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */
+
+#include <x86intrin.h>
+
+typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); typedef
+_Float16 __v4hf __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) {
+ return __builtin_convertvector(a, __v2si); }
+
+__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a)
+{
+ return __builtin_convertvector(a, __v4si); }
+
+__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a)
+{
+ return __builtin_convertvector(a, __v8si); }
+
+__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) {
+ return __builtin_convertvector(a, __v2di); }
+
+__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a)
+{
+ return __builtin_convertvector(a, __v4di); }
+
+__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a)
+{
+ return __builtin_convertvector(a, __v8di); }
+
+__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) {
+ return __builtin_convertvector(a, __v4si); }
+
+__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a)
+{
+ return __builtin_convertvector(a, __v8si); }
+
+__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a)
+{
+ return __builtin_convertvector(a, __v16si); }
+
+__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) {
+ return __builtin_convertvector(a, __v2di); }
+
+__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a)
+{
+ return __builtin_convertvector(a, __v4di); }
+
+__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a)
+{
+ return __builtin_convertvector(a, __v8di); }
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c
new file mode 100644
index 00000000000..4a68a10b089
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c
@@ -0,0 +1,139 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq
+-fno-trapping-math" } */
+/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32
+} } } } */
+/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32
+} } } } */
+/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */
+/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } }
+} */
+/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include <x86intrin.h>
+
+typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char
+__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi
+__attribute__ ((__vector_size__ (8))); typedef char __v16qi
+__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu
+__attribute__ ((vector_size (2))); typedef unsigned char __v4qu
+__attribute__ ((vector_size (4))); typedef unsigned char __v8qu
+__attribute__ ((vector_size (8))); typedef unsigned char __v16qu
+__attribute__ ((vector_size (16))); typedef _Float16 __v2hf
+__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf
+__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf
+__attribute__ ((__vector_size__ (16)));
+
+__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a)
+{
+ return __builtin_convertvector((__v2df)a, __v2qi); }
+
+__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a)
+{
+ return __builtin_convertvector((__v4df)a, __v4qi); }
+
+__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a)
+{
+ return __builtin_convertvector((__v8df)a, __v8qi); }
+
+__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a)
+{
+ return __builtin_convertvector((__v2df)a, __v2qu); }
+
+__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a)
+{
+ return __builtin_convertvector((__v4df)a, __v4qu); }
+
+__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a)
+{
+ return __builtin_convertvector((__v8df)a, __v8qu); }
+
+__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a)
+{
+ return __builtin_convertvector((__v2sf)a, __v2qi); }
+
+__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a)
+{
+ return __builtin_convertvector((__v4sf)a, __v4qi); }
+
+__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a)
+{
+ return __builtin_convertvector((__v8sf)a, __v8qi); }
+
+__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a)
+{
+ return __builtin_convertvector((__v16sf)a, __v16qi); }
+
+__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a)
+{
+ return __builtin_convertvector((__v2sf)a, __v2qu); }
+
+__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a)
+{
+ return __builtin_convertvector((__v4sf)a, __v4qu); }
+
+__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a)
+{
+ return __builtin_convertvector((__v8sf)a, __v8qu); }
+
+__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a)
+{
+ return __builtin_convertvector((__v16sf)a, __v16qu); }
+
+__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a)
+{
+ return __builtin_convertvector((__v2hf)a, __v2qi); }
+
+__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a)
+{
+ return __builtin_convertvector((__v8hf)a, __v8qi); }
+
+__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a)
+{
+ return __builtin_convertvector((__v16hf)a, __v16qi); }
+
+__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a)
+{
+ return __builtin_convertvector((__v32hf)a, __v32qi); }
+
+__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a)
+{
+ return __builtin_convertvector((__v2hf)a, __v2qu); }
+
+__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a)
+{
+ return __builtin_convertvector((__v8hf)a, __v8qu); }
+
+__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a)
+{
+ return __builtin_convertvector((__v16hf)a, __v16qu); }
+
+__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a)
+{
+ return __builtin_convertvector((__v32hf)a, __v32qu); }
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c
new file mode 100644
index 00000000000..0ff5a97ed1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c
@@ -0,0 +1,156 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq
+-fno-trapping-math" } */
+/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } }
+} */
+/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 }
+} } } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } }
+} } */
+/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 }
+} } } */
+
+#include <x86intrin.h>
+
+typedef char __v2qi __attribute__ ((__vector_size__ (2))); typedef char
+__v4qi __attribute__ ((__vector_size__ (4))); typedef char __v8qi
+__attribute__ ((__vector_size__ (8))); typedef char __v16qi
+__attribute__ ((__vector_size__ (16))); typedef unsigned char __v2qu
+__attribute__ ((vector_size (2))); typedef unsigned char __v4qu
+__attribute__ ((vector_size (4))); typedef unsigned char __v8qu
+__attribute__ ((vector_size (8))); typedef unsigned char __v16qu
+__attribute__ ((vector_size (16))); typedef _Float16 __v2hf
+__attribute__ ((__vector_size__ (4))); typedef _Float16 __v4hf
+__attribute__ ((__vector_size__ (8))); typedef _Float16 __v8hf
+__attribute__ ((__vector_size__ (16)));
+
+__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a)
+{
+ return __builtin_convertvector((__v2qi)a, __v2df); }
+
+__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a)
+{
+ return __builtin_convertvector((__v4qi)a, __v4df); }
+
+__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a)
+{
+ return __builtin_convertvector((__v8qi)a, __v8df); }
+
+__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a)
+{
+ return __builtin_convertvector((__v2qu)a, __v2df); }
+
+__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a)
+{
+ return __builtin_convertvector((__v4qu)a, __v4df); }
+
+__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a)
+{
+ return __builtin_convertvector((__v8qu)a, __v8df); }
+
+__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a)
+{
+ return __builtin_convertvector((__v2qi)a, __v2sf); }
+
+__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a)
+{
+ return __builtin_convertvector((__v4qi)a, __v4sf); }
+
+__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a)
+{
+ return __builtin_convertvector((__v8qi)a, __v8sf); }
+
+__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a)
+{
+ return __builtin_convertvector((__v16qi)a, __v16sf); }
+
+__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a)
+{
+ return __builtin_convertvector((__v2qu)a, __v2sf); }
+
+__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a)
+{
+ return __builtin_convertvector((__v4qu)a, __v4sf); }
+
+__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a)
+{
+ return __builtin_convertvector((__v8qu)a, __v8sf); }
+
+__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a)
+{
+ return __builtin_convertvector((__v16qu)a, __v16sf); }
+
+__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a)
+{
+ return __builtin_convertvector((__v2qi)a, __v2hf); }
+
+__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a)
+{
+ return __builtin_convertvector((__v4qi)a, __v4hf); }
+
+__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a)
+{
+ return __builtin_convertvector((__v8qi)a, __v8hf); }
+
+__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a)
+{
+ return __builtin_convertvector((__v16qi)a, __v16hf); }
+
+__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a)
+{
+ return __builtin_convertvector((__v32qi)a, __v32hf); }
+
+__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a)
+{
+ return __builtin_convertvector((__v2qu)a, __v2hf); }
+
+__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a)
+{
+ return __builtin_convertvector((__v4qu)a, __v4hf); }
+
+__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a)
+{
+ return __builtin_convertvector((__v8qu)a, __v8hf); }
+
+__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a)
+{
+ return __builtin_convertvector((__v16qu)a, __v16hf); }
+
+__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a)
+{
+ return __builtin_convertvector((__v32qu)a, __v32hf); }
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index ea0069f7a67..c38c0b9dda8 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not see #include "gimple-match.h"
#include "recog.h" /* FIXME: for insn_data */
#include "optabs-libfuncs.h"
+#include "cfgloop.h"
+#include "tree-vectorizer.h"
/* Build a ternary operation and gimplify it. Emit code before GSI.
@@ -1870,14 +1872,33 @@ expand_vector_conversion (gimple_stmt_iterator *gsi)
else if (ret_elt_bits > arg_elt_bits)
modifier = WIDEN;
+ if (supportable_convert_operation (code, ret_type, arg_type, &code1))
+ {
+ g = gimple_build_assign (lhs, code1, arg);
+ gsi_replace (gsi, g, false);
+ return;
+ }
+
+ code_helper code2 = ERROR_MARK, code3 = ERROR_MARK;
+ int multi_step_cvt = 0;
+ vec<tree> interm_types = vNULL;
+ if (supportable_indirect_convert_operation (NULL,
+ code,
+ ret_type, arg_type,
+ &code2, &code3,
+ &multi_step_cvt,
+ &interm_types, arg))
+ {
+ new_rhs = make_ssa_name (interm_types[0]);
+ g = gimple_build_assign (new_rhs, (tree_code) code3, arg);
+ gsi_insert_before (gsi, g, GSI_SAME_STMT);
+ g = gimple_build_assign (lhs, (tree_code) code2, new_rhs);
+ gsi_replace (gsi, g, false);
+ return;
+ }
+
if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR))
{
- if (supportable_convert_operation (code, ret_type, arg_type, &code1))
- {
- g = gimple_build_assign (lhs, code1, arg);
- gsi_replace (gsi, g, false);
- return;
- }
/* Can't use get_compute_type here, as supportable_convert_operation
doesn't necessarily use an optab and needs two arguments. */
tree vec_compute_type
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 05a169ecb2d..0aa608202ca 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo,
tree scalar_dest;
tree op0, op1 = NULL_TREE;
loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
- tree_code tc1, tc2;
+ tree_code tc1;
code_helper code, code1, code2;
code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK;
tree new_temp;
@@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo,
break;
}
- /* For conversions between float and integer types try whether
- we can use intermediate signed integer types to support the
- conversion. */
- if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode)
- && (code == FLOAT_EXPR ||
- (code == FIX_TRUNC_EXPR && !flag_trapping_math)))
- {
- bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode);
- bool float_expr_p = code == FLOAT_EXPR;
- unsigned short target_size;
- scalar_mode intermediate_mode;
- if (demotion)
- {
- intermediate_mode = lhs_mode;
- target_size = GET_MODE_SIZE (rhs_mode);
- }
- else
- {
- target_size = GET_MODE_SIZE (lhs_mode);
- if (!int_mode_for_size
- (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode))
- goto unsupported;
- }
- code1 = float_expr_p ? code : NOP_EXPR;
- codecvt1 = float_expr_p ? NOP_EXPR : code;
- opt_scalar_mode mode_iter;
- FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode)
- {
- intermediate_mode = mode_iter.require ();
-
- if (GET_MODE_SIZE (intermediate_mode) > target_size)
- break;
-
- scalar_mode cvt_mode;
- if (!int_mode_for_size
- (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode))
- break;
-
- cvt_type = build_nonstandard_integer_type
- (GET_MODE_BITSIZE (cvt_mode), 0);
-
- /* Check if the intermediate type can hold OP0's range.
- When converting from float to integer this is not necessary
- because values that do not fit the (smaller) target type are
- unspecified anyway. */
- if (demotion && float_expr_p)
- {
- wide_int op_min_value, op_max_value;
- if (!vect_get_range_info (op0, &op_min_value, &op_max_value))
- break;
-
- if (cvt_type == NULL_TREE
- || (wi::min_precision (op_max_value, SIGNED)
- > TYPE_PRECISION (cvt_type))
- || (wi::min_precision (op_min_value, SIGNED)
- > TYPE_PRECISION (cvt_type)))
- continue;
- }
-
- cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node);
- /* This should only happened for SLP as long as loop vectorizer
- only supports same-sized vector. */
- if (cvt_type == NULL_TREE
- || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in)
- || !supportable_convert_operation ((tree_code) code1,
- vectype_out,
- cvt_type, &tc1)
- || !supportable_convert_operation ((tree_code) codecvt1,
- cvt_type,
- vectype_in, &tc2))
- continue;
-
- found_mode = true;
- break;
- }
+ if (supportable_indirect_convert_operation (vinfo,
+ code,
+ vectype_out,
+ vectype_in,
+ &code1,
+ &codecvt1,
+ &multi_step_cvt,
+ &interm_types,
+ op0,slp_node))
+ break;
- if (found_mode)
- {
- multi_step_cvt++;
- interm_types.safe_push (cvt_type);
- cvt_type = NULL_TREE;
- code1 = tc1;
- codecvt1 = tc2;
- break;
- }
- }
/* FALLTHRU */
unsupported:
if (dump_enabled_p ())
@@ -14626,6 +14551,153 @@ supportable_narrowing_operation (code_helper code,
return false;
}
+/* Function supportable_indirect_convert_operation
+
+ Check whether an operation represented by the code CODE is two
+ convert operations that are supported by the target platform in
+ vector form (i.e., when operating on arguments of type VECTYPE_IN
+ producing a result of type VECTYPE_OUT).
+
+ Convert operations we currently support directly are FIX_TRUNC and FLOAT.
+ This function checks if these operations are supported
+ by the target platform directly (via vector tree-codes).
+
+ Output:
+ - CODE1 is the code of a vector operation to be used when
+ converting the operation in the first step, if available.
+ - CODE2 is the code of a vector operation to be used when
+ converting the operation in the second step, if available.
+ - MULTI_STEP_CVT determines the number of required intermediate steps in
+ case of multi-step conversion (like int->short->char - in that case
+ MULTI_STEP_CVT will be 1). In the function, it should be 1.
+ - INTERM_TYPES contains the intermediate type required to perform the
+ convert operation (short in the above example). */
+bool
+supportable_indirect_convert_operation (vec_info *vinfo,
+ code_helper code,
+ tree vectype_out,
+ tree vectype_in,
+ code_helper *code1,
+ code_helper *code2,
+ int *multi_step_cvt,
+ vec<tree> *interm_types,
+ tree op0,
+ slp_tree slp_node)
+{
+ bool found_mode = false;
+ scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out));
+ scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in));
+ opt_scalar_mode mode_iter;
+ tree_code tc1, tc2;
+
+ tree cvt_type = NULL_TREE;
+ poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in);
+
+ (*multi_step_cvt) = 0;
+ /* For conversions between float and integer types try whether
+ we can use intermediate signed integer types to support the
+ conversion. */
+ if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode)
+ && (code == FLOAT_EXPR
+ || (code == FIX_TRUNC_EXPR && !flag_trapping_math)))
+ {
+ bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode);
+ bool float_expr_p = code == FLOAT_EXPR;
+ unsigned short target_size;
+ scalar_mode intermediate_mode;
+ if (demotion)
+ {
+ intermediate_mode = lhs_mode;
+ target_size = GET_MODE_SIZE (rhs_mode);
+ }
+ else
+ {
+ target_size = GET_MODE_SIZE (lhs_mode);
+ if (!int_mode_for_size
+ (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode))
+ return false;
+ }
+ *code1 = float_expr_p ? code : NOP_EXPR;
+ *code2 = float_expr_p ? NOP_EXPR : code;
+ opt_scalar_mode mode_iter;
+ FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode)
+ {
+ intermediate_mode = mode_iter.require ();
+
+ if (GET_MODE_SIZE (intermediate_mode) > target_size)
+ break;
+
+ scalar_mode cvt_mode;
+ if (!int_mode_for_size
+ (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode))
+ break;
+
+ cvt_type = build_nonstandard_integer_type
+ (GET_MODE_BITSIZE (cvt_mode), 0);
+
+ /* Check if the intermediate type can hold OP0's range.
+ When converting from float to integer this is not necessary
+ because values that do not fit the (smaller) target type are
+ unspecified anyway. */
+ if (demotion && float_expr_p)
+ {
+ wide_int op_min_value, op_max_value;
+ /* For vector form, it looks like op0 doesn't have RANGE_INFO.
+ In the future, if it is supported, changes may need to be made
+ to this part, such as checking the RANGE of each element
+ in the vector. */
+ if (!SSA_NAME_RANGE_INFO (op0)
+ || !vect_get_range_info (op0, &op_min_value, &op_max_value))
+ break;
+
+ if (cvt_type == NULL_TREE
+ || (wi::min_precision (op_max_value, SIGNED)
+ > TYPE_PRECISION (cvt_type))
+ || (wi::min_precision (op_min_value, SIGNED)
+ > TYPE_PRECISION (cvt_type)))
+ continue;
+ }
+
+ if (vinfo != NULL && slp_node != NULL)
+ cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node);
+ else
+ {
+ bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out))
+ || TYPE_UNSIGNED (TREE_TYPE (vectype_in));
+ cvt_type = build_nonstandard_integer_type
+ (GET_MODE_BITSIZE (cvt_mode), uns);
+ cvt_type = build_vector_type (cvt_type, nelts);
+ }
+ /* This should only happened for SLP as long as loop vectorizer
+ only supports same-sized vector. */
+ if (cvt_type == NULL_TREE
+ || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts)
+ || !supportable_convert_operation ((tree_code) *code1,
+ vectype_out,
+ cvt_type, &tc1)
+ || !supportable_convert_operation ((tree_code) *code2,
+ cvt_type,
+ vectype_in, &tc2))
+ continue;
+
+ found_mode = true;
+ break;
+ }
+
+ if (found_mode)
+ {
+ (*multi_step_cvt)++;
+ interm_types->safe_push (cvt_type);
+ cvt_type = NULL_TREE;
+ *code1 = tc1;
+ *code2 = tc2;
+ return true;
+ }
+ }
+ interm_types->release ();
+ return false;
+}
+
/* Generate and return a vector mask of MASK_TYPE such that
mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I.
Add the statements to SEQ. */
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 97ec9c341e7..ad65ce71bb7 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2265,6 +2265,15 @@ extern bool supportable_widening_operation (vec_info*, code_helper, extern bool supportable_narrowing_operation (code_helper, tree, tree,
code_helper *, int *,
vec<tree> *);
+extern bool supportable_indirect_convert_operation (vec_info *,
+ code_helper,
+ tree, tree,
+ code_helper *,
+ code_helper *,
+ int *,
+ vec<tree> *,
+ tree = NULL_TREE,
+ slp_tree = NULL);
extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
enum vect_cost_for_stmt, stmt_vec_info,
--
2.31.1
On Tue, 11 Jun 2024, Hu, Lin1 wrote: > I wrap a part of code about indirect conversion. The API refers to > supportable_narrowing/widening_operations. Sorry for the delay - comments inline. > BRs, > Lin > > gcc/ChangeLog: > > PR target/107432 > * tree-vect-generic.cc > (expand_vector_conversion): Support convert for int -> int, > float -> float and int <-> float. > * tree-vect-stmts.cc (vectorizable_conversion): Wrap the > indirect convert part. > (supportable_indirect_convert_operation): New function. > * tree-vectorizer.h (supportable_indirect_convert_operation): > Define the new function. > > gcc/testsuite/ChangeLog: > > PR target/107432 > * gcc.target/i386/pr107432-1.c: New test. > * gcc.target/i386/pr107432-2.c: Ditto. > * gcc.target/i386/pr107432-3.c: Ditto. > * gcc.target/i386/pr107432-4.c: Ditto. > * gcc.target/i386/pr107432-5.c: Ditto. > * gcc.target/i386/pr107432-6.c: Ditto. > * gcc.target/i386/pr107432-7.c: Ditto. > --- > gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 ++++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +++++++++ > gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 +++++ > gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 +++++ > gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 ++++++ > gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 ++++++++++++ > gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 +++++++++++++ > gcc/tree-vect-generic.cc | 33 ++- > gcc/tree-vect-stmts.cc | 244 +++++++++++++-------- > gcc/tree-vectorizer.h | 9 + > 10 files changed, 1011 insertions(+), 92 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c > new file mode 100644 > index 00000000000..a4f37447eb4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c > @@ -0,0 +1,234 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > + > +#include <x86intrin.h> > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > + > +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); > +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); > +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); > +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); > +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); > +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); > + > +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2si); > +} > + > +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); > +} > + > +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); > +} > + > +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2hi); > +} > + > +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4di)a, __v4hi); > +} > + > +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); > +} > + > +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2di)a, __v2qi); > +} > + > +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4di)a, __v4qi); > +} > + > +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) > +{ > + return __builtin_convertvector((__v8di)a, __v8qi); > +} > + > +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector((__v2si)a, __v2hi); > +} > + > +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4si)a, __v4hi); > +} > + > +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); > +} > + > +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); > +} > + > +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector((__v2si)a, __v2qi); > +} > + > +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4si)a, __v4qi); > +} > + > +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v8si)a, __v8qi); > +} > + > +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); > +} > + > +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) > +{ > + return __builtin_convertvector((__v2hi)a, __v2qi); > +} > + > +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v8hi)a, __v8qi); > +} > + > +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); > +} > + > +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); > +} > + > +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2su); > +} > + > +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); > +} > + > +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); > +} > + > +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2hu); > +} > + > +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4du)a, __v4hu); > +} > + > +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); > +} > + > +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v2du)a, __v2qu); > +} > + > +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v4du)a, __v4qu); > +} > + > +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) > +{ > + return __builtin_convertvector((__v8du)a, __v8qu); > +} > + > +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) > +{ > + return __builtin_convertvector((__v2su)a, __v2hu); > +} > + > +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4su)a, __v4hu); > +} > + > +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); > +} > + > +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); > +} > + > +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) > +{ > + return __builtin_convertvector((__v2su)a, __v2qu); > +} > + > +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v4su)a, __v4qu); > +} > + > +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) > +{ > + return __builtin_convertvector((__v8su)a, __v8qu); > +} > + > +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) > +{ > + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); > +} > + > +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) > +{ > + return __builtin_convertvector((__v2hu)a, __v2qu); > +} > + > +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) > +{ > + return __builtin_convertvector((__v8hu)a, __v8qu); > +} > + > +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) > +{ > + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); > +} > + > +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) > +{ > + return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c > new file mode 100644 > index 00000000000..02ffd811cb4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c > @@ -0,0 +1,105 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > + > +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); > +} > + > +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); > +} > + > +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); > +} > + > +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); > +} > + > +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v4di); > +} > + > +__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v8di); > +} > + > +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) > +{ > + return (__m128i)__builtin_convertvector(a, __v4si); > +} > + > +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v8si); > +} > + > +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v16si); > +} > + > +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) > +{ > + return (__m128i)__builtin_convertvector(a, __v4si); > +} > + > +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v8si); > +} > + > +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) > +{ > + return (__m512i)__builtin_convertvector(a, __v16si); > +} > + > +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) > +{ > + return (__m128i)__builtin_convertvector(a, __v8hi); > +} > + > +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) > +{ > + return (__m256i)__builtin_convertvector(a, __v16hi); > +} > + > +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) > +{ > + return __builtin_convertvector(a, __v32hi); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c > new file mode 100644 > index 00000000000..30dc947b6dd > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c > @@ -0,0 +1,55 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector(a, __v2sf); > +} > + > +__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4sf); > +} > + > +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8sf); > +} > + > +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector(a, __v2hf); > +} > + > +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4hf); > +} > + > +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8hf); > +} > + > +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4hf); > +} > + > +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8hf); > +} > + > +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector(a, __v16hf); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c > new file mode 100644 > index 00000000000..e537e7349e4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c > @@ -0,0 +1,56 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector(a, __v2df); > +} > + > +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4df); > +} > + > +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8df); > +} > + > +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector(a, __v2df); > +} > + > +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4df); > +} > + > +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8df); > +} > + > +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4sf); > +} > + > +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8sf); > +} > + > +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector(a, __v16sf); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c > new file mode 100644 > index 00000000000..5a44ef9f3b9 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c > @@ -0,0 +1,72 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ > +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ > + > +#include <x86intrin.h> > + > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > + > +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector(a, __v2si); > +} > + > +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector(a, __v4si); > +} > + > +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector(a, __v8si); > +} > + > +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector(a, __v4di); > +} > + > +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector(a, __v8di); > +} > + > +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4si); > +} > + > +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8si); > +} > + > +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector(a, __v16si); > +} > + > +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector(a, __v2di); > +} > + > +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) > +{ > + return __builtin_convertvector(a, __v4di); > +} > + > +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector(a, __v8di); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c > new file mode 100644 > index 00000000000..4a68a10b089 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c > @@ -0,0 +1,139 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ > +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > + > +#include <x86intrin.h> > + > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > +typedef char __v16qi __attribute__ ((__vector_size__ (16))); > +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); > +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); > +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); > +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); > + > +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector((__v2df)a, __v2qi); > +} > + > +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector((__v4df)a, __v4qi); > +} > + > +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector((__v8df)a, __v8qi); > +} > + > +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) > +{ > + return __builtin_convertvector((__v2df)a, __v2qu); > +} > + > +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) > +{ > + return __builtin_convertvector((__v4df)a, __v4qu); > +} > + > +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) > +{ > + return __builtin_convertvector((__v8df)a, __v8qu); > +} > + > +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector((__v2sf)a, __v2qi); > +} > + > +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector((__v4sf)a, __v4qi); > +} > + > +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector((__v8sf)a, __v8qi); > +} > + > +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector((__v16sf)a, __v16qi); > +} > + > +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) > +{ > + return __builtin_convertvector((__v2sf)a, __v2qu); > +} > + > +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) > +{ > + return __builtin_convertvector((__v4sf)a, __v4qu); > +} > + > +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) > +{ > + return __builtin_convertvector((__v8sf)a, __v8qu); > +} > + > +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) > +{ > + return __builtin_convertvector((__v16sf)a, __v16qu); > +} > + > +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector((__v2hf)a, __v2qi); > +} > + > +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector((__v8hf)a, __v8qi); > +} > + > +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector((__v16hf)a, __v16qi); > +} > + > +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) > +{ > + return __builtin_convertvector((__v32hf)a, __v32qi); > +} > + > +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) > +{ > + return __builtin_convertvector((__v2hf)a, __v2qu); > +} > + > +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) > +{ > + return __builtin_convertvector((__v8hf)a, __v8qu); > +} > + > +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) > +{ > + return __builtin_convertvector((__v16hf)a, __v16qu); > +} > + > +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) > +{ > + return __builtin_convertvector((__v32hf)a, __v32qu); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c > new file mode 100644 > index 00000000000..0ff5a97ed1a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c > @@ -0,0 +1,156 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ > +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */ > + > +#include <x86intrin.h> > + > +typedef char __v2qi __attribute__ ((__vector_size__ (2))); > +typedef char __v4qi __attribute__ ((__vector_size__ (4))); > +typedef char __v8qi __attribute__ ((__vector_size__ (8))); > +typedef char __v16qi __attribute__ ((__vector_size__ (16))); > +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); > +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); > +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); > +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); > +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); > +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); > +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); > + > +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2df); > +} > + > +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4df); > +} > + > +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8df); > +} > + > +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2df); > +} > + > +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4df); > +} > + > +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8df); > +} > + > +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2sf); > +} > + > +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4sf); > +} > + > +__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8sf); > +} > + > +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) > +{ > + return __builtin_convertvector((__v16qi)a, __v16sf); > +} > + > +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2sf); > +} > + > +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4sf); > +} > + > +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8sf); > +} > + > +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) > +{ > + return __builtin_convertvector((__v16qu)a, __v16sf); > +} > + > +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) > +{ > + return __builtin_convertvector((__v2qi)a, __v2hf); > +} > + > +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) > +{ > + return __builtin_convertvector((__v4qi)a, __v4hf); > +} > + > +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) > +{ > + return __builtin_convertvector((__v8qi)a, __v8hf); > +} > + > +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) > +{ > + return __builtin_convertvector((__v16qi)a, __v16hf); > +} > + > +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) > +{ > + return __builtin_convertvector((__v32qi)a, __v32hf); > +} > + > +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) > +{ > + return __builtin_convertvector((__v2qu)a, __v2hf); > +} > + > +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) > +{ > + return __builtin_convertvector((__v4qu)a, __v4hf); > +} > + > +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) > +{ > + return __builtin_convertvector((__v8qu)a, __v8hf); > +} > + > +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) > +{ > + return __builtin_convertvector((__v16qu)a, __v16hf); > +} > + > +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) > +{ > + return __builtin_convertvector((__v32qu)a, __v32hf); > +} > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc > index ea0069f7a67..c38c0b9dda8 100644 > --- a/gcc/tree-vect-generic.cc > +++ b/gcc/tree-vect-generic.cc > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not see > #include "gimple-match.h" > #include "recog.h" /* FIXME: for insn_data */ > #include "optabs-libfuncs.h" > +#include "cfgloop.h" > +#include "tree-vectorizer.h" > > > /* Build a ternary operation and gimplify it. Emit code before GSI. > @@ -1870,14 +1872,33 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) > else if (ret_elt_bits > arg_elt_bits) > modifier = WIDEN; > > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > + { > + g = gimple_build_assign (lhs, code1, arg); > + gsi_replace (gsi, g, false); > + return; > + } Given the API change I suggest below it might make sense to have supportable_indirect_convert_operation do the above and represent it as single-step conversion? > + code_helper code2 = ERROR_MARK, code3 = ERROR_MARK; > + int multi_step_cvt = 0; > + vec<tree> interm_types = vNULL; > + if (supportable_indirect_convert_operation (NULL, > + code, > + ret_type, arg_type, > + &code2, &code3, > + &multi_step_cvt, > + &interm_types, arg)) > + { > + new_rhs = make_ssa_name (interm_types[0]); > + g = gimple_build_assign (new_rhs, (tree_code) code3, arg); > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > + g = gimple_build_assign (lhs, (tree_code) code2, new_rhs); > + gsi_replace (gsi, g, false); > + return; > + } > + > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) > { > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > - { > - g = gimple_build_assign (lhs, code1, arg); > - gsi_replace (gsi, g, false); > - return; > - } > /* Can't use get_compute_type here, as supportable_convert_operation > doesn't necessarily use an optab and needs two arguments. */ > tree vec_compute_type > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index 05a169ecb2d..0aa608202ca 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, > tree scalar_dest; > tree op0, op1 = NULL_TREE; > loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); > - tree_code tc1, tc2; > + tree_code tc1; > code_helper code, code1, code2; > code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; > tree new_temp; > @@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo, > break; > } > > - /* For conversions between float and integer types try whether > - we can use intermediate signed integer types to support the > - conversion. */ > - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > - && (code == FLOAT_EXPR || > - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > - { > - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); > - bool float_expr_p = code == FLOAT_EXPR; > - unsigned short target_size; > - scalar_mode intermediate_mode; > - if (demotion) > - { > - intermediate_mode = lhs_mode; > - target_size = GET_MODE_SIZE (rhs_mode); > - } > - else > - { > - target_size = GET_MODE_SIZE (lhs_mode); > - if (!int_mode_for_size > - (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) > - goto unsupported; > - } > - code1 = float_expr_p ? code : NOP_EXPR; > - codecvt1 = float_expr_p ? NOP_EXPR : code; > - opt_scalar_mode mode_iter; > - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > - { > - intermediate_mode = mode_iter.require (); > - > - if (GET_MODE_SIZE (intermediate_mode) > target_size) > - break; > - > - scalar_mode cvt_mode; > - if (!int_mode_for_size > - (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) > - break; > - > - cvt_type = build_nonstandard_integer_type > - (GET_MODE_BITSIZE (cvt_mode), 0); > - > - /* Check if the intermediate type can hold OP0's range. > - When converting from float to integer this is not necessary > - because values that do not fit the (smaller) target type are > - unspecified anyway. */ > - if (demotion && float_expr_p) > - { > - wide_int op_min_value, op_max_value; > - if (!vect_get_range_info (op0, &op_min_value, &op_max_value)) > - break; > - > - if (cvt_type == NULL_TREE > - || (wi::min_precision (op_max_value, SIGNED) > - > TYPE_PRECISION (cvt_type)) > - || (wi::min_precision (op_min_value, SIGNED) > - > TYPE_PRECISION (cvt_type))) > - continue; > - } > - > - cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > - /* This should only happened for SLP as long as loop vectorizer > - only supports same-sized vector. */ > - if (cvt_type == NULL_TREE > - || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in) > - || !supportable_convert_operation ((tree_code) code1, > - vectype_out, > - cvt_type, &tc1) > - || !supportable_convert_operation ((tree_code) codecvt1, > - cvt_type, > - vectype_in, &tc2)) > - continue; > - > - found_mode = true; > - break; > - } > + if (supportable_indirect_convert_operation (vinfo, > + code, > + vectype_out, > + vectype_in, > + &code1, > + &codecvt1, > + &multi_step_cvt, > + &interm_types, > + op0,slp_node)) > + break; > > - if (found_mode) > - { > - multi_step_cvt++; > - interm_types.safe_push (cvt_type); > - cvt_type = NULL_TREE; > - code1 = tc1; > - codecvt1 = tc2; > - break; > - } > - } > /* FALLTHRU */ > unsupported: > if (dump_enabled_p ()) > @@ -14626,6 +14551,153 @@ supportable_narrowing_operation (code_helper code, > return false; > } > > +/* Function supportable_indirect_convert_operation > + > + Check whether an operation represented by the code CODE is two > + convert operations that are supported by the target platform in > + vector form (i.e., when operating on arguments of type VECTYPE_IN > + producing a result of type VECTYPE_OUT). > + > + Convert operations we currently support directly are FIX_TRUNC and FLOAT. > + This function checks if these operations are supported > + by the target platform directly (via vector tree-codes). > + > + Output: > + - CODE1 is the code of a vector operation to be used when > + converting the operation in the first step, if available. > + - CODE2 is the code of a vector operation to be used when > + converting the operation in the second step, if available. > + - MULTI_STEP_CVT determines the number of required intermediate steps in > + case of multi-step conversion (like int->short->char - in that case > + MULTI_STEP_CVT will be 1). In the function, it should be 1. > + - INTERM_TYPES contains the intermediate type required to perform the > + convert operation (short in the above example). */ > +bool > +supportable_indirect_convert_operation (vec_info *vinfo, > + code_helper code, > + tree vectype_out, > + tree vectype_in, > + code_helper *code1, > + code_helper *code2, > + int *multi_step_cvt, > + vec<tree> *interm_types, This API is somewhat awkward, as we're inventing a new one I guess we can do better. I think we want vec<std::pair<tree, tree_code> > *converts, covering all code1, code2, multi_step_cvt and interm_types with the conversion sequence being converts[0].first tem0 = converts[0].second op0; converts[1].first tem1 = converts[1].second tem; ... while converts.length () determines the length of the chain, one being a direct conversion where then converts[0].first is vectype_out. That would allow double -> char to go double -> float -> int -> short -> char for example. > + tree op0, > + slp_tree slp_node) I would like to avoid passing VINFO and SLP_NODE here, see below. The same is true for OP0 where the existing use is wrong for SLP already, but I guess that can stay for now (I opened PR115538 about the wrong-code issue). > +{ > + bool found_mode = false; > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out)); > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in)); > + opt_scalar_mode mode_iter; > + tree_code tc1, tc2; > + > + tree cvt_type = NULL_TREE; > + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in); > + > + (*multi_step_cvt) = 0; > + /* For conversions between float and integer types try whether > + we can use intermediate signed integer types to support the > + conversion. */ > + if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > + && (code == FLOAT_EXPR > + || (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > + { > + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); > + bool float_expr_p = code == FLOAT_EXPR; > + unsigned short target_size; > + scalar_mode intermediate_mode; > + if (demotion) > + { > + intermediate_mode = lhs_mode; > + target_size = GET_MODE_SIZE (rhs_mode); > + } > + else > + { > + target_size = GET_MODE_SIZE (lhs_mode); > + if (!int_mode_for_size > + (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) > + return false; > + } > + *code1 = float_expr_p ? code : NOP_EXPR; > + *code2 = float_expr_p ? NOP_EXPR : code; > + opt_scalar_mode mode_iter; > + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > + { > + intermediate_mode = mode_iter.require (); > + > + if (GET_MODE_SIZE (intermediate_mode) > target_size) > + break; > + > + scalar_mode cvt_mode; > + if (!int_mode_for_size > + (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) > + break; > + > + cvt_type = build_nonstandard_integer_type > + (GET_MODE_BITSIZE (cvt_mode), 0); > + > + /* Check if the intermediate type can hold OP0's range. > + When converting from float to integer this is not necessary > + because values that do not fit the (smaller) target type are > + unspecified anyway. */ > + if (demotion && float_expr_p) > + { > + wide_int op_min_value, op_max_value; > + /* For vector form, it looks like op0 doesn't have RANGE_INFO. > + In the future, if it is supported, changes may need to be made > + to this part, such as checking the RANGE of each element > + in the vector. */ > + if (!SSA_NAME_RANGE_INFO (op0) > + || !vect_get_range_info (op0, &op_min_value, &op_max_value)) > + break; > + > + if (cvt_type == NULL_TREE > + || (wi::min_precision (op_max_value, SIGNED) > + > TYPE_PRECISION (cvt_type)) > + || (wi::min_precision (op_min_value, SIGNED) > + > TYPE_PRECISION (cvt_type))) > + continue; > + } > + > + if (vinfo != NULL && slp_node != NULL) > + cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > + else > + { > + bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out)) > + || TYPE_UNSIGNED (TREE_TYPE (vectype_in)); > + cvt_type = build_nonstandard_integer_type > + (GET_MODE_BITSIZE (cvt_mode), uns); > + cvt_type = build_vector_type (cvt_type, nelts); > + } So this would then become cvt_type = get_related_vectype_for_scalar_type (TYPE_MODE (vectype_in), cvt_type, TYPE_VECTOR_SUBPARTS (vectype_in)); > + /* This should only happened for SLP as long as loop vectorizer > + only supports same-sized vector. */ > + if (cvt_type == NULL_TREE > + || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts) > + || !supportable_convert_operation ((tree_code) *code1, > + vectype_out, > + cvt_type, &tc1) > + || !supportable_convert_operation ((tree_code) *code2, > + cvt_type, > + vectype_in, &tc2)) > + continue; > + > + found_mode = true; > + break; > + } > + > + if (found_mode) > + { > + (*multi_step_cvt)++; > + interm_types->safe_push (cvt_type); > + cvt_type = NULL_TREE; > + *code1 = tc1; > + *code2 = tc2; > + return true; > + } > + } > + interm_types->release (); Hmm, ownership of interm_types is somewhat unclear here - the caller should release it, or is the situation that the caller is confused by stray elements in it? In that case I'd suggest to instead do interm_types->truncate (0). > + return false; > +} > + > /* Generate and return a vector mask of MASK_TYPE such that > mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I. > Add the statements to SEQ. */ > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index 97ec9c341e7..ad65ce71bb7 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -2265,6 +2265,15 @@ extern bool supportable_widening_operation (vec_info*, code_helper, > extern bool supportable_narrowing_operation (code_helper, tree, tree, > code_helper *, int *, > vec<tree> *); > +extern bool supportable_indirect_convert_operation (vec_info *, > + code_helper, > + tree, tree, > + code_helper *, > + code_helper *, > + int *, > + vec<tree> *, > + tree = NULL_TREE, > + slp_tree = NULL); > > extern unsigned record_stmt_cost (stmt_vector_for_cost *, int, > enum vect_cost_for_stmt, stmt_vec_info, >
> > else if (ret_elt_bits > arg_elt_bits) > > modifier = WIDEN; > > > > + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > + { > > + g = gimple_build_assign (lhs, code1, arg); > > + gsi_replace (gsi, g, false); > > + return; > > + } > > Given the API change I suggest below it might make sense to have > supportable_indirect_convert_operation do the above and represent it as single- > step conversion? > OK, if you want to supportable_indirect_convert_operation can do something like supportable_convert_operation, I'll give it a try. This functionality is really the part that this function can cover. But this would require some changes not only the API change, because supportable_indirect_convert_operation originally only supported Float -> Int or Int ->Float. > > > + code_helper code2 = ERROR_MARK, code3 = ERROR_MARK; > > + int multi_step_cvt = 0; > > + vec<tree> interm_types = vNULL; > > + if (supportable_indirect_convert_operation (NULL, > > + code, > > + ret_type, arg_type, > > + &code2, &code3, > > + &multi_step_cvt, > > + &interm_types, arg)) > > + { > > + new_rhs = make_ssa_name (interm_types[0]); > > + g = gimple_build_assign (new_rhs, (tree_code) code3, arg); > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > + g = gimple_build_assign (lhs, (tree_code) code2, new_rhs); > > + gsi_replace (gsi, g, false); > > + return; > > + } > > + > > if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > FLOAT_EXPR)) > > { > > - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) > > - { > > - g = gimple_build_assign (lhs, code1, arg); > > - gsi_replace (gsi, g, false); > > - return; > > - } > > /* Can't use get_compute_type here, as supportable_convert_operation > > doesn't necessarily use an optab and needs two arguments. */ > > tree vec_compute_type > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index > > 05a169ecb2d..0aa608202ca 100644 > > --- a/gcc/tree-vect-stmts.cc > > +++ b/gcc/tree-vect-stmts.cc > > @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, > > tree scalar_dest; > > tree op0, op1 = NULL_TREE; > > loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); > > - tree_code tc1, tc2; > > + tree_code tc1; > > code_helper code, code1, code2; > > code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; > > tree new_temp; > > @@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo, > > break; > > } > > > > - /* For conversions between float and integer types try whether > > - we can use intermediate signed integer types to support the > > - conversion. */ > > - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > > - && (code == FLOAT_EXPR || > > - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > > - { > > - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE > (lhs_mode); > > - bool float_expr_p = code == FLOAT_EXPR; > > - unsigned short target_size; > > - scalar_mode intermediate_mode; > > - if (demotion) > > - { > > - intermediate_mode = lhs_mode; > > - target_size = GET_MODE_SIZE (rhs_mode); > > - } > > - else > > - { > > - target_size = GET_MODE_SIZE (lhs_mode); > > - if (!int_mode_for_size > > - (GET_MODE_BITSIZE (rhs_mode), 0).exists > (&intermediate_mode)) > > - goto unsupported; > > - } > > - code1 = float_expr_p ? code : NOP_EXPR; > > - codecvt1 = float_expr_p ? NOP_EXPR : code; > > - opt_scalar_mode mode_iter; > > - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > > - { > > - intermediate_mode = mode_iter.require (); > > - > > - if (GET_MODE_SIZE (intermediate_mode) > target_size) > > - break; > > - > > - scalar_mode cvt_mode; > > - if (!int_mode_for_size > > - (GET_MODE_BITSIZE (intermediate_mode), 0).exists > (&cvt_mode)) > > - break; > > - > > - cvt_type = build_nonstandard_integer_type > > - (GET_MODE_BITSIZE (cvt_mode), 0); > > - > > - /* Check if the intermediate type can hold OP0's range. > > - When converting from float to integer this is not necessary > > - because values that do not fit the (smaller) target type are > > - unspecified anyway. */ > > - if (demotion && float_expr_p) > > - { > > - wide_int op_min_value, op_max_value; > > - if (!vect_get_range_info (op0, &op_min_value, > &op_max_value)) > > - break; > > - > > - if (cvt_type == NULL_TREE > > - || (wi::min_precision (op_max_value, SIGNED) > > - > TYPE_PRECISION (cvt_type)) > > - || (wi::min_precision (op_min_value, SIGNED) > > - > TYPE_PRECISION (cvt_type))) > > - continue; > > - } > > - > > - cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > > - /* This should only happened for SLP as long as loop vectorizer > > - only supports same-sized vector. */ > > - if (cvt_type == NULL_TREE > > - || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in) > > - || !supportable_convert_operation ((tree_code) code1, > > - vectype_out, > > - cvt_type, &tc1) > > - || !supportable_convert_operation ((tree_code) codecvt1, > > - cvt_type, > > - vectype_in, &tc2)) > > - continue; > > - > > - found_mode = true; > > - break; > > - } > > + if (supportable_indirect_convert_operation (vinfo, > > + code, > > + vectype_out, > > + vectype_in, > > + &code1, > > + &codecvt1, > > + &multi_step_cvt, > > + &interm_types, > > + op0,slp_node)) > > + break; > > > > - if (found_mode) > > - { > > - multi_step_cvt++; > > - interm_types.safe_push (cvt_type); > > - cvt_type = NULL_TREE; > > - code1 = tc1; > > - codecvt1 = tc2; > > - break; > > - } > > - } > > /* FALLTHRU */ > > unsupported: > > if (dump_enabled_p ()) > > @@ -14626,6 +14551,153 @@ supportable_narrowing_operation > (code_helper code, > > return false; > > } > > > > +/* Function supportable_indirect_convert_operation > > + > > + Check whether an operation represented by the code CODE is two > > + convert operations that are supported by the target platform in > > + vector form (i.e., when operating on arguments of type VECTYPE_IN > > + producing a result of type VECTYPE_OUT). > > + > > + Convert operations we currently support directly are FIX_TRUNC and FLOAT. > > + This function checks if these operations are supported > > + by the target platform directly (via vector tree-codes). > > + > > + Output: > > + - CODE1 is the code of a vector operation to be used when > > + converting the operation in the first step, if available. > > + - CODE2 is the code of a vector operation to be used when > > + converting the operation in the second step, if available. > > + - MULTI_STEP_CVT determines the number of required intermediate steps > in > > + case of multi-step conversion (like int->short->char - in that case > > + MULTI_STEP_CVT will be 1). In the function, it should be 1. > > + - INTERM_TYPES contains the intermediate type required to perform the > > + convert operation (short in the above example). */ > > +bool > > +supportable_indirect_convert_operation (vec_info *vinfo, > > + code_helper code, > > + tree vectype_out, > > + tree vectype_in, > > + code_helper *code1, > > + code_helper *code2, > > + int *multi_step_cvt, > > + vec<tree> *interm_types, > > This API is somewhat awkward, as we're inventing a new one I guess we can do > better. I think we want > > vec<std::pair<tree, tree_code> > *converts, > > covering all code1, code2, multi_step_cvt and interm_types with the conversion > sequence being > > converts[0].first tem0 = converts[0].second op0; > converts[1].first tem1 = converts[1].second tem; > That's great, this really makes the function work better. > > ... while converts.length () determines the length of the chain, one being a direct > conversion where then converts[0].first is vectype_out. That would allow > double -> char to go double -> float -> int -> short -> char for example. > I'm trying to determine the requirements, do you want this function to support multiple conversions (the current implementation just does a two-step conversion, like double -> char, which becomes double -> int -> char). Actually we should be able to do all conversions in two steps, if we have some suitable instructions. I can't think of a scenario where multiple conversions are needed yet. Could you give me some examples? Of course, I could tweak this feature in advance if it is for future consideration. > > > + tree op0, > > + slp_tree slp_node) > > I would like to avoid passing VINFO and SLP_NODE here, see below. > The same is true for OP0 where the existing use is wrong for SLP already, but I > guess that can stay for now (I opened PR115538 about the wrong-code issue). > Thanks, I have removed them. > > > +{ > > + bool found_mode = false; > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out)); > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in)); > > + opt_scalar_mode mode_iter; > > + tree_code tc1, tc2; > > + > > + tree cvt_type = NULL_TREE; > > + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in); > > + > > + (*multi_step_cvt) = 0; > > + /* For conversions between float and integer types try whether > > + we can use intermediate signed integer types to support the > > + conversion. */ > > + if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > > + && (code == FLOAT_EXPR > > + || (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > > + { > > + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE > (lhs_mode); > > + bool float_expr_p = code == FLOAT_EXPR; > > + unsigned short target_size; > > + scalar_mode intermediate_mode; > > + if (demotion) > > + { > > + intermediate_mode = lhs_mode; > > + target_size = GET_MODE_SIZE (rhs_mode); > > + } > > + else > > + { > > + target_size = GET_MODE_SIZE (lhs_mode); > > + if (!int_mode_for_size > > + (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) > > + return false; > > + } > > + *code1 = float_expr_p ? code : NOP_EXPR; > > + *code2 = float_expr_p ? NOP_EXPR : code; > > + opt_scalar_mode mode_iter; > > + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > > + { > > + intermediate_mode = mode_iter.require (); > > + > > + if (GET_MODE_SIZE (intermediate_mode) > target_size) > > + break; > > + > > + scalar_mode cvt_mode; > > + if (!int_mode_for_size > > + (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) > > + break; > > + > > + cvt_type = build_nonstandard_integer_type > > + (GET_MODE_BITSIZE (cvt_mode), 0); > > + > > + /* Check if the intermediate type can hold OP0's range. > > + When converting from float to integer this is not necessary > > + because values that do not fit the (smaller) target type are > > + unspecified anyway. */ > > + if (demotion && float_expr_p) > > + { > > + wide_int op_min_value, op_max_value; > > + /* For vector form, it looks like op0 doesn't have RANGE_INFO. > > + In the future, if it is supported, changes may need to be made > > + to this part, such as checking the RANGE of each element > > + in the vector. */ > > + if (!SSA_NAME_RANGE_INFO (op0) > > + || !vect_get_range_info (op0, &op_min_value, > &op_max_value)) > > + break; > > + > > + if (cvt_type == NULL_TREE > > + || (wi::min_precision (op_max_value, SIGNED) > > + > TYPE_PRECISION (cvt_type)) > > + || (wi::min_precision (op_min_value, SIGNED) > > + > TYPE_PRECISION (cvt_type))) > > + continue; > > + } > > + > > + if (vinfo != NULL && slp_node != NULL) > > + cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); > > + else > > + { > > + bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out)) > > + || TYPE_UNSIGNED (TREE_TYPE (vectype_in)); > > + cvt_type = build_nonstandard_integer_type > > + (GET_MODE_BITSIZE (cvt_mode), uns); > > + cvt_type = build_vector_type (cvt_type, nelts); > > + } > > So this would then become > > cvt_type = get_related_vectype_for_scalar_type (TYPE_MODE > (vectype_in), cvt_type, TYPE_VECTOR_SUBPARTS (vectype_in)); > > > + /* This should only happened for SLP as long as loop vectorizer > > + only supports same-sized vector. */ > > + if (cvt_type == NULL_TREE > > + || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts) > > + || !supportable_convert_operation ((tree_code) *code1, > > + vectype_out, > > + cvt_type, &tc1) > > + || !supportable_convert_operation ((tree_code) *code2, > > + cvt_type, > > + vectype_in, &tc2)) > > + continue; > > + > > + found_mode = true; > > + break; > > + } > > + > > + if (found_mode) > > + { > > + (*multi_step_cvt)++; > > + interm_types->safe_push (cvt_type); > > + cvt_type = NULL_TREE; > > + *code1 = tc1; > > + *code2 = tc2; > > + return true; > > + } > > + } > > + interm_types->release (); > > Hmm, ownership of interm_types is somewhat unclear here - the caller should > release it, or is the situation that the caller is confused by stray elements in it? In > that case I'd suggest to instead do interm_types->truncate (0). > It's my fault, I just imitate supportable_narrowing/widening_operation, I think for this function, interm_types->release() is not needed. BRs, Lin
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 00000000000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); +} + +__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); +} + +__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi); +} + +__v2qi mm64_cvtepi32_epi8_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2qi); +} + +__v4qi mm_cvtepi32_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4qi); +} + +__v8qi mm256_cvtepi32_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8si)a, __v8qi); +} + +__m128i mm512_cvtepi32_epi8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16si)a, __v16qi); +} + +__v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector((__v2hi)a, __v2qi); +} + +__v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hi)a, __v8qi); +} + +__m128i mm256_cvtepi16_epi8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hi)a, __v16qi); +} + +__m256i mm512_cvtepi16_epi8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hi)a, __v32qi); +} + +__v2su mm_cvtepu64_epu32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2su); +} + +__m128i mm256_cvtepu64_epu32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4du)a, __v4su); +} + +__m256i mm512_cvtepu64_epu32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8du)a, __v8su); +} + +__v2hu mm_cvtepu64_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2hu); +} + +__v4hu mm256_cvtepu64_epu16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4hu); +} + +__m128i mm512_cvtepu64_epu16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8du)a, __v8hu); +} + +__v2qu mm_cvtepu64_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2du)a, __v2qu); +} + +__v4qu mm256_cvtepu64_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4du)a, __v4qu); +} + +__v8qu mm512_cvtepu64_epu8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8du)a, __v8qu); +} + +__v2hu mm32_cvtepu32_epu16_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2hu); +} + +__v4hu mm_cvtepu32_epu16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4hu); +} + +__m128i mm256_cvtepu32_epu16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8su)a, __v8hu); +} + +__m256i mm512_cvtepu32_epu16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16su)a, __v16hu); +} + +__v2qu mm32_cvtepu32_epu8_builtin_convertvector(__v2su a) +{ + return __builtin_convertvector((__v2su)a, __v2qu); +} + +__v4qu mm_cvtepu2_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4su)a, __v4qu); +} + +__v8qu mm256_cvtepu32_epu8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v8su)a, __v8qu); +} + +__m128i mm512_cvtepu32_epu8_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v16su)a, __v16qu); +} + +__v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) +{ + return __builtin_convertvector((__v2hu)a, __v2qu); +} + +__v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v8hu)a, __v8qu); +} + +__m128i mm256_cvtepu16_epu8_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v16hu)a, __v16qu); +} + +__m256i mm512_cvtepu16_epu8_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v32hu)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-2.c b/gcc/testsuite/gcc.target/i386/pr107432-2.c new file mode 100644 index 00000000000..02ffd811cb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-2.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovsxdq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbq" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxwd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 3 } } */ +/* { dg-final { scan-assembler-times "vpmovsxbw" 3 } } */ + +#include <x86intrin.h> + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +__m128i mm_cvtepi32_epi64_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi32_epi64_builtin_convertvector(__v4si a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi32_epi64_builtin_convertvector(__v8si a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi64_builtin_convertvector(__v2hi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi16_epi64_builtin_convertvector(__v4hi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi16_epi64_builtin_convertvector(__v8hi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi8_epi64_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector(a, __v2di); +} + +__m256i mm256_cvtepi8_epi64_builtin_convertvector(__v4qi a) +{ + return (__m256i)__builtin_convertvector(a, __v4di); +} + +__m512i mm512_cvtepi8_epi64_builtin_convertvector(__v8qi a) +{ + return (__m512i)__builtin_convertvector(a, __v8di); +} + +__m128i mm_cvtepi16_epi32_builtin_convertvector(__v4hi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi16_epi32_builtin_convertvector(__v8hi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi16_epi32_builtin_convertvector(__v16hi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi32_builtin_convertvector(__v4qi a) +{ + return (__m128i)__builtin_convertvector(a, __v4si); +} + +__m256i mm256_cvtepi8_epi32_builtin_convertvector(__v8qi a) +{ + return (__m256i)__builtin_convertvector(a, __v8si); +} + +__m512i mm512_cvtepi8_epi32_builtin_convertvector(__v16qi a) +{ + return (__m512i)__builtin_convertvector(a, __v16si); +} + +__m128i mm_cvtepi8_epi16_builtin_convertvector(__v8qi a) +{ + return (__m128i)__builtin_convertvector(a, __v8hi); +} + +__m256i mm256_cvtepi8_epi16_builtin_convertvector(__v16qi a) +{ + return (__m256i)__builtin_convertvector(a, __v16hi); +} + +__v32hi mm512_cvtepi8_epi16_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector(a, __v32hi); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-3.c b/gcc/testsuite/gcc.target/i386/pr107432-3.c new file mode 100644 index 00000000000..30dc947b6dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-3.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtpd2ps" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtpd2ph" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtps2ph" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2sf mm_cvtpd_ps_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2sf); +} + +__v4sf mm256_cvtpd_ps_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm512_cvtpd_ps_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v2hf mm_cvtpd_ph_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2hf); +} + +__v4hf mm256_cvtpd_ph_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm512_cvtpd_ph_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v4hf mm_cvtps_ph_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4hf); +} + +__v8hf mm256_cvtps_ph_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8hf); +} + +__v16hf mm512_cvtps_ph_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector(a, __v16hf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-4.c b/gcc/testsuite/gcc.target/i386/pr107432-4.c new file mode 100644 index 00000000000..e537e7349e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtps2pd" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtph2pd" 3 } } */ +/* { dg-final { scan-assembler-times "vcvtph2ps" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2df mm_cvtps_pd_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtps_pd_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtps_pd_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v2df mm_cvtph_pd_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2df); +} + +__v4df mm256_cvtph_pd_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4df); +} + +__v8df mm512_cvtph_pd_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8df); +} + +__v4sf mm_cvtph_ps_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4sf); +} + +__v8sf mm256_cvtph_ps_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8sf); +} + +__v16sf mm512_cvtph_ps_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16sf); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-5.c b/gcc/testsuite/gcc.target/i386/pr107432-5.c new file mode 100644 index 00000000000..5a44ef9f3b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-5.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512dq -mavx512fp16 -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2qq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2dq" 3 } } */ +/* { dg-final { scan-assembler-times "vcvttph2qq" 3 } } */ + +#include <x86intrin.h> + +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtpd_epi32_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector(a, __v2si); +} + +__v4si mm256_cvtpd_epi32_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm512_cvtpd_epi32_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v2di mm_cvtps_epi64_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtps_epi64_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtps_epi64_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector(a, __v8di); +} + +__v4si mm_cvtph_epi32_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4si); +} + +__v8si mm256_cvtph_epi32_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8si); +} + +__v16si mm512_cvtph_epi32_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector(a, __v16si); +} + +__v2di mm_cvtph_epi64_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector(a, __v2di); +} + +__v4di mm256_cvtph_epi64_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector(a, __v4di); +} + +__v8di mm512_cvtph_epi64_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector(a, __v8di); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c new file mode 100644 index 00000000000..4a68a10b089 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -0,0 +1,139 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2dq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttpd2udq" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2qi mm_cvtpd_epi8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qi); +} + +__v4qi mm256_cvtpd_epi8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qi); +} + +__v8qi mm512_cvtpd_epi8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qi); +} + +__v2qu mm_cvtpd_epu8_builtin_convertvector(__v2df a) +{ + return __builtin_convertvector((__v2df)a, __v2qu); +} + +__v4qu mm256_cvtpd_epu8_builtin_convertvector(__v4df a) +{ + return __builtin_convertvector((__v4df)a, __v4qu); +} + +__v8qu mm512_cvtpd_epu8_builtin_convertvector(__v8df a) +{ + return __builtin_convertvector((__v8df)a, __v8qu); +} + +__v2qi mm64_cvtps_epi8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qi); +} + +__v4qi mm128_cvtps_epi8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qi); +} + +__v8qi mm256_cvtps_epi8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qi); +} + +__v16qi mm512_cvtps_epi8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qi); +} + +__v2qu mm64_cvtps_epu8_builtin_convertvector(__v2sf a) +{ + return __builtin_convertvector((__v2sf)a, __v2qu); +} + +__v4qu mm128_cvtps_epu8_builtin_convertvector(__v4sf a) +{ + return __builtin_convertvector((__v4sf)a, __v4qu); +} + +__v8qu mm256_cvtps_epu8_builtin_convertvector(__v8sf a) +{ + return __builtin_convertvector((__v8sf)a, __v8qu); +} + +__v16qu mm512_cvtps_epu8_builtin_convertvector(__v16sf a) +{ + return __builtin_convertvector((__v16sf)a, __v16qu); +} + +__v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qi); +} + +__v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qi); +} + +__v16qi mm256_cvtph_epi8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qi); +} + +__v32qi mm512_cvtph_epi8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qi); +} + +__v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) +{ + return __builtin_convertvector((__v2hf)a, __v2qu); +} + +__v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) +{ + return __builtin_convertvector((__v8hf)a, __v8qu); +} + +__v16qu mm256_cvtph_epu8_builtin_convertvector(__v16hf a) +{ + return __builtin_convertvector((__v16hf)a, __v16qu); +} + +__v32qu mm512_cvtph_epu8_builtin_convertvector(__v32hf a) +{ + return __builtin_convertvector((__v32hf)a, __v32qu); +} diff --git a/gcc/testsuite/gcc.target/i386/pr107432-7.c b/gcc/testsuite/gcc.target/i386/pr107432-7.c new file mode 100644 index 00000000000..0ff5a97ed1a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-7.c @@ -0,0 +1,156 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mavx512bw -O2 -mavx512dq -fno-trapping-math" } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2pd" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2pd" 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtdq2ps" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 3 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtudq2ps" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtw2ph" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvtuw2ph" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovsxbd" 7 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 5 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovzxbd" 7 { target { ! ia32 } } } } */ + +#include <x86intrin.h> + +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v2qu __attribute__ ((vector_size (2))); +typedef unsigned char __v4qu __attribute__ ((vector_size (4))); +typedef unsigned char __v8qu __attribute__ ((vector_size (8))); +typedef unsigned char __v16qu __attribute__ ((vector_size (16))); +typedef _Float16 __v2hf __attribute__ ((__vector_size__ (4))); +typedef _Float16 __v4hf __attribute__ ((__vector_size__ (8))); +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); + +__v2df mm_cvtepi8_pd_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2df); +} + +__v4df mm256_cvtepi8_pd_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4df); +} + +__v8df mm512_cvtepi8_pd_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8df); +} + +__v2df mm_cvtepu8_pd_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2df); +} + +__v4df mm256_cvtepu8_pd_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4df); +} + +__v8df mm512_cvtepu8_pd_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8df); +} + +__v2sf mm64_cvtepi8_ps_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2sf); +} + +__v4sf mm128_cvtepi8_ps_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4sf); +} + +__v8sf mm256_cvtepi8_ps_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8sf); +} + +__v16sf mm512_cvtepi8_ps_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16sf); +} + +__v2sf mm64_cvtepu8_ps_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2sf); +} + +__v4sf mm128_cvtepu8_ps_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4sf); +} + +__v8sf mm256_cvtepu8_ps_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8sf); +} + +__v16sf mm512_cvtepu8_ps_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16sf); +} + +__v2hf mm32_cvtepi8_ph_builtin_convertvector(__v2qi a) +{ + return __builtin_convertvector((__v2qi)a, __v2hf); +} + +__v4hf mm64_cvtepi8_ph_builtin_convertvector(__v4qi a) +{ + return __builtin_convertvector((__v4qi)a, __v4hf); +} + +__v8hf mm128_cvtepi8_ph_builtin_convertvector(__v8qi a) +{ + return __builtin_convertvector((__v8qi)a, __v8hf); +} + +__v16hf mm256_cvtepi8_ph_builtin_convertvector(__v16qi a) +{ + return __builtin_convertvector((__v16qi)a, __v16hf); +} + +__v32hf mm512_cvtepi8_ph_builtin_convertvector(__v32qi a) +{ + return __builtin_convertvector((__v32qi)a, __v32hf); +} + +__v2hf mm32_cvtepu8_ph_builtin_convertvector(__v2qu a) +{ + return __builtin_convertvector((__v2qu)a, __v2hf); +} + +__v4hf mm64_cvtepu8_ph_builtin_convertvector(__v4qu a) +{ + return __builtin_convertvector((__v4qu)a, __v4hf); +} + +__v8hf mm128_cvtepu8_ph_builtin_convertvector(__v8qu a) +{ + return __builtin_convertvector((__v8qu)a, __v8hf); +} + +__v16hf mm256_cvtepu8_ph_builtin_convertvector(__v16qu a) +{ + return __builtin_convertvector((__v16qu)a, __v16hf); +} + +__v32hf mm512_cvtepu8_ph_builtin_convertvector(__v32qu a) +{ + return __builtin_convertvector((__v32qu)a, __v32hf); +} diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index ea0069f7a67..c38c0b9dda8 100644 --- a/gcc/tree-vect-generic.cc +++ b/gcc/tree-vect-generic.cc @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not see #include "gimple-match.h" #include "recog.h" /* FIXME: for insn_data */ #include "optabs-libfuncs.h" +#include "cfgloop.h" +#include "tree-vectorizer.h" /* Build a ternary operation and gimplify it. Emit code before GSI. @@ -1870,14 +1872,33 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) else if (ret_elt_bits > arg_elt_bits) modifier = WIDEN; + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) + { + g = gimple_build_assign (lhs, code1, arg); + gsi_replace (gsi, g, false); + return; + } + + code_helper code2 = ERROR_MARK, code3 = ERROR_MARK; + int multi_step_cvt = 0; + vec<tree> interm_types = vNULL; + if (supportable_indirect_convert_operation (NULL, + code, + ret_type, arg_type, + &code2, &code3, + &multi_step_cvt, + &interm_types, arg)) + { + new_rhs = make_ssa_name (interm_types[0]); + g = gimple_build_assign (new_rhs, (tree_code) code3, arg); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (lhs, (tree_code) code2, new_rhs); + gsi_replace (gsi, g, false); + return; + } + if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) { - if (supportable_convert_operation (code, ret_type, arg_type, &code1)) - { - g = gimple_build_assign (lhs, code1, arg); - gsi_replace (gsi, g, false); - return; - } /* Can't use get_compute_type here, as supportable_convert_operation doesn't necessarily use an optab and needs two arguments. */ tree vec_compute_type diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 05a169ecb2d..0aa608202ca 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, tree scalar_dest; tree op0, op1 = NULL_TREE; loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); - tree_code tc1, tc2; + tree_code tc1; code_helper code, code1, code2; code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; tree new_temp; @@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo, break; } - /* For conversions between float and integer types try whether - we can use intermediate signed integer types to support the - conversion. */ - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) - && (code == FLOAT_EXPR || - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) - { - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); - bool float_expr_p = code == FLOAT_EXPR; - unsigned short target_size; - scalar_mode intermediate_mode; - if (demotion) - { - intermediate_mode = lhs_mode; - target_size = GET_MODE_SIZE (rhs_mode); - } - else - { - target_size = GET_MODE_SIZE (lhs_mode); - if (!int_mode_for_size - (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) - goto unsupported; - } - code1 = float_expr_p ? code : NOP_EXPR; - codecvt1 = float_expr_p ? NOP_EXPR : code; - opt_scalar_mode mode_iter; - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) - { - intermediate_mode = mode_iter.require (); - - if (GET_MODE_SIZE (intermediate_mode) > target_size) - break; - - scalar_mode cvt_mode; - if (!int_mode_for_size - (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) - break; - - cvt_type = build_nonstandard_integer_type - (GET_MODE_BITSIZE (cvt_mode), 0); - - /* Check if the intermediate type can hold OP0's range. - When converting from float to integer this is not necessary - because values that do not fit the (smaller) target type are - unspecified anyway. */ - if (demotion && float_expr_p) - { - wide_int op_min_value, op_max_value; - if (!vect_get_range_info (op0, &op_min_value, &op_max_value)) - break; - - if (cvt_type == NULL_TREE - || (wi::min_precision (op_max_value, SIGNED) - > TYPE_PRECISION (cvt_type)) - || (wi::min_precision (op_min_value, SIGNED) - > TYPE_PRECISION (cvt_type))) - continue; - } - - cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); - /* This should only happened for SLP as long as loop vectorizer - only supports same-sized vector. */ - if (cvt_type == NULL_TREE - || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in) - || !supportable_convert_operation ((tree_code) code1, - vectype_out, - cvt_type, &tc1) - || !supportable_convert_operation ((tree_code) codecvt1, - cvt_type, - vectype_in, &tc2)) - continue; - - found_mode = true; - break; - } + if (supportable_indirect_convert_operation (vinfo, + code, + vectype_out, + vectype_in, + &code1, + &codecvt1, + &multi_step_cvt, + &interm_types, + op0,slp_node)) + break; - if (found_mode) - { - multi_step_cvt++; - interm_types.safe_push (cvt_type); - cvt_type = NULL_TREE; - code1 = tc1; - codecvt1 = tc2; - break; - } - } /* FALLTHRU */ unsupported: if (dump_enabled_p ()) @@ -14626,6 +14551,153 @@ supportable_narrowing_operation (code_helper code, return false; } +/* Function supportable_indirect_convert_operation + + Check whether an operation represented by the code CODE is two + convert operations that are supported by the target platform in + vector form (i.e., when operating on arguments of type VECTYPE_IN + producing a result of type VECTYPE_OUT). + + Convert operations we currently support directly are FIX_TRUNC and FLOAT. + This function checks if these operations are supported + by the target platform directly (via vector tree-codes). + + Output: + - CODE1 is the code of a vector operation to be used when + converting the operation in the first step, if available. + - CODE2 is the code of a vector operation to be used when + converting the operation in the second step, if available. + - MULTI_STEP_CVT determines the number of required intermediate steps in + case of multi-step conversion (like int->short->char - in that case + MULTI_STEP_CVT will be 1). In the function, it should be 1. + - INTERM_TYPES contains the intermediate type required to perform the + convert operation (short in the above example). */ +bool +supportable_indirect_convert_operation (vec_info *vinfo, + code_helper code, + tree vectype_out, + tree vectype_in, + code_helper *code1, + code_helper *code2, + int *multi_step_cvt, + vec<tree> *interm_types, + tree op0, + slp_tree slp_node) +{ + bool found_mode = false; + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out)); + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in)); + opt_scalar_mode mode_iter; + tree_code tc1, tc2; + + tree cvt_type = NULL_TREE; + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in); + + (*multi_step_cvt) = 0; + /* For conversions between float and integer types try whether + we can use intermediate signed integer types to support the + conversion. */ + if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) + && (code == FLOAT_EXPR + || (code == FIX_TRUNC_EXPR && !flag_trapping_math))) + { + bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode); + bool float_expr_p = code == FLOAT_EXPR; + unsigned short target_size; + scalar_mode intermediate_mode; + if (demotion) + { + intermediate_mode = lhs_mode; + target_size = GET_MODE_SIZE (rhs_mode); + } + else + { + target_size = GET_MODE_SIZE (lhs_mode); + if (!int_mode_for_size + (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode)) + return false; + } + *code1 = float_expr_p ? code : NOP_EXPR; + *code2 = float_expr_p ? NOP_EXPR : code; + opt_scalar_mode mode_iter; + FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) + { + intermediate_mode = mode_iter.require (); + + if (GET_MODE_SIZE (intermediate_mode) > target_size) + break; + + scalar_mode cvt_mode; + if (!int_mode_for_size + (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode)) + break; + + cvt_type = build_nonstandard_integer_type + (GET_MODE_BITSIZE (cvt_mode), 0); + + /* Check if the intermediate type can hold OP0's range. + When converting from float to integer this is not necessary + because values that do not fit the (smaller) target type are + unspecified anyway. */ + if (demotion && float_expr_p) + { + wide_int op_min_value, op_max_value; + /* For vector form, it looks like op0 doesn't have RANGE_INFO. + In the future, if it is supported, changes may need to be made + to this part, such as checking the RANGE of each element + in the vector. */ + if (!SSA_NAME_RANGE_INFO (op0) + || !vect_get_range_info (op0, &op_min_value, &op_max_value)) + break; + + if (cvt_type == NULL_TREE + || (wi::min_precision (op_max_value, SIGNED) + > TYPE_PRECISION (cvt_type)) + || (wi::min_precision (op_min_value, SIGNED) + > TYPE_PRECISION (cvt_type))) + continue; + } + + if (vinfo != NULL && slp_node != NULL) + cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node); + else + { + bool uns = TYPE_UNSIGNED (TREE_TYPE (vectype_out)) + || TYPE_UNSIGNED (TREE_TYPE (vectype_in)); + cvt_type = build_nonstandard_integer_type + (GET_MODE_BITSIZE (cvt_mode), uns); + cvt_type = build_vector_type (cvt_type, nelts); + } + /* This should only happened for SLP as long as loop vectorizer + only supports same-sized vector. */ + if (cvt_type == NULL_TREE + || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts) + || !supportable_convert_operation ((tree_code) *code1, + vectype_out, + cvt_type, &tc1) + || !supportable_convert_operation ((tree_code) *code2, + cvt_type, + vectype_in, &tc2)) + continue; + + found_mode = true; + break; + } + + if (found_mode) + { + (*multi_step_cvt)++; + interm_types->safe_push (cvt_type); + cvt_type = NULL_TREE; + *code1 = tc1; + *code2 = tc2; + return true; + } + } + interm_types->release (); + return false; +} + /* Generate and return a vector mask of MASK_TYPE such that mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I. Add the statements to SEQ. */ diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 97ec9c341e7..ad65ce71bb7 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2265,6 +2265,15 @@ extern bool supportable_widening_operation (vec_info*, code_helper, extern bool supportable_narrowing_operation (code_helper, tree, tree, code_helper *, int *, vec<tree> *); +extern bool supportable_indirect_convert_operation (vec_info *, + code_helper, + tree, tree, + code_helper *, + code_helper *, + int *, + vec<tree> *, + tree = NULL_TREE, + slp_tree = NULL); extern unsigned record_stmt_cost (stmt_vector_for_cost *, int, enum vect_cost_for_stmt, stmt_vec_info,