diff mbox series

[07/62] AVX512FP16: Add vaddph/vsubph/vdivph/vmulph.

Message ID 20210701061648.9447-8-hongtao.liu@intel.com
State New
Headers show
Series Support all AVX512FP16 intrinsics. | expand

Commit Message

Liu, Hongtao July 1, 2021, 6:15 a.m. UTC
gcc/ChangeLog:

	* config.gcc: Add avx512fp16vlintrin.h.
	* config/i386/avx512fp16intrin.h (_mm512_add_ph): New intrinsic.
	(_mm512_mask_add_ph): Likewise.
	(_mm512_maskz_add_ph): Likewise.
	(_mm512_sub_ph): Likewise.
	(_mm512_mask_sub_ph): Likewise.
	(_mm512_maskz_sub_ph): Likewise.
	(_mm512_mul_ph): Likewise.
	(_mm512_mask_mul_ph): Likewise.
	(_mm512_maskz_mul_ph): Likewise.
	(_mm512_div_ph): Likewise.
	(_mm512_mask_div_ph): Likewise.
	(_mm512_maskz_div_ph): Likewise.
	(_mm512_add_round_ph): Likewise.
	(_mm512_mask_add_round_ph): Likewise.
	(_mm512_maskz_add_round_ph): Likewise.
	(_mm512_sub_round_ph): Likewise.
	(_mm512_mask_sub_round_ph): Likewise.
	(_mm512_maskz_sub_round_ph): Likewise.
	(_mm512_mul_round_ph): Likewise.
	(_mm512_mask_mul_round_ph): Likewise.
	(_mm512_maskz_mul_round_ph): Likewise.
	(_mm512_div_round_ph): Likewise.
	(_mm512_mask_div_round_ph): Likewise.
	(_mm512_maskz_div_round_ph): Likewise.
	* config/i386/avx512fp16vlintrin.h: New header.
	* config/i386/i386-builtin-types.def (V16HF, V8HF, V32HF):
	Add new builtin types.
	* config/i386/i386-builtin.def: Add corresponding builtins.
	* config/i386/i386-expand.c
	(ix86_expand_args_builtin): Handle new builtin types.
	(ix86_expand_round_builtin): Likewise.
	* config/i386/immintrin.h: Include avx512fp16vlintrin.h.
	* config/i386/sse.md (VFH): New mode_iterator.
	(VF2H): Likewise.
	(avx512fmaskmode): Add HF vector modes.
	(avx512fmaskhalfmode): Likewise.
	(<plusminus_insn><mode>3<mask_name><round_name>): Adjust for
	HF vector modes.
	(*<plusminus_insn><mode>3<mask_name><round_name>): Likewise.
	(mul<mode>3<mask_name><round_name>): Likewise.
	(*mul<mode>3<mask_name><round_name>): Likewise.
	(div<mode>3): Likewise.
	(<sse>_div<mode>3<mask_name><round_name>): Likewise.
	* config/i386/subst.md (SUBST_V): Add HF vector modes.
	(SUBST_A): Likewise.
	(round_mode512bit_condition): Adjust for V32HFmode.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx-1.c: Add -mavx512vl and test for new intrinsics.
	* gcc.target/i386/avx-2.c: Add -mavx512vl.
	* gcc.target/i386/avx512fp16-11a.c: New test.
	* gcc.target/i386/avx512fp16-11b.c: Ditto.
	* gcc.target/i386/avx512vlfp16-11a.c: Ditto.
	* gcc.target/i386/avx512vlfp16-11b.c: Ditto.
	* gcc.target/i386/sse-13.c: Add test for new builtins.
	* gcc.target/i386/sse-23.c: Ditto.
	* gcc.target/i386/sse-14.c: Add test for new intrinsics.
	* gcc.target/i386/sse-22.c: Ditto.
---
 gcc/config.gcc                                |   2 +-
 gcc/config/i386/avx512fp16intrin.h            | 251 ++++++++++++++++++
 gcc/config/i386/avx512fp16vlintrin.h          | 219 +++++++++++++++
 gcc/config/i386/i386-builtin-types.def        |   7 +
 gcc/config/i386/i386-builtin.def              |  20 ++
 gcc/config/i386/i386-expand.c                 |   5 +
 gcc/config/i386/immintrin.h                   |   2 +
 gcc/config/i386/sse.md                        |  62 +++--
 gcc/config/i386/subst.md                      |   6 +-
 gcc/testsuite/gcc.target/i386/avx-1.c         |   8 +-
 gcc/testsuite/gcc.target/i386/avx-2.c         |   2 +-
 .../gcc.target/i386/avx512fp16-11a.c          |  36 +++
 .../gcc.target/i386/avx512fp16-11b.c          |  75 ++++++
 .../gcc.target/i386/avx512vlfp16-11a.c        |  68 +++++
 .../gcc.target/i386/avx512vlfp16-11b.c        |  96 +++++++
 gcc/testsuite/gcc.target/i386/sse-13.c        |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c        |  14 +
 gcc/testsuite/gcc.target/i386/sse-22.c        |  14 +
 gcc/testsuite/gcc.target/i386/sse-23.c        |   6 +
 19 files changed, 872 insertions(+), 27 deletions(-)
 create mode 100644 gcc/config/i386/avx512fp16vlintrin.h
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-11a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-11b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c

Comments

Hongtao Liu Sept. 9, 2021, 7:48 a.m. UTC | #1
On Thu, Jul 1, 2021 at 2:17 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> gcc/ChangeLog:
>
>         * config.gcc: Add avx512fp16vlintrin.h.
>         * config/i386/avx512fp16intrin.h (_mm512_add_ph): New intrinsic.
>         (_mm512_mask_add_ph): Likewise.
>         (_mm512_maskz_add_ph): Likewise.
>         (_mm512_sub_ph): Likewise.
>         (_mm512_mask_sub_ph): Likewise.
>         (_mm512_maskz_sub_ph): Likewise.
>         (_mm512_mul_ph): Likewise.
>         (_mm512_mask_mul_ph): Likewise.
>         (_mm512_maskz_mul_ph): Likewise.
>         (_mm512_div_ph): Likewise.
>         (_mm512_mask_div_ph): Likewise.
>         (_mm512_maskz_div_ph): Likewise.
>         (_mm512_add_round_ph): Likewise.
>         (_mm512_mask_add_round_ph): Likewise.
>         (_mm512_maskz_add_round_ph): Likewise.
>         (_mm512_sub_round_ph): Likewise.
>         (_mm512_mask_sub_round_ph): Likewise.
>         (_mm512_maskz_sub_round_ph): Likewise.
>         (_mm512_mul_round_ph): Likewise.
>         (_mm512_mask_mul_round_ph): Likewise.
>         (_mm512_maskz_mul_round_ph): Likewise.
>         (_mm512_div_round_ph): Likewise.
>         (_mm512_mask_div_round_ph): Likewise.
>         (_mm512_maskz_div_round_ph): Likewise.
>         * config/i386/avx512fp16vlintrin.h: New header.
>         * config/i386/i386-builtin-types.def (V16HF, V8HF, V32HF):
>         Add new builtin types.
>         * config/i386/i386-builtin.def: Add corresponding builtins.
>         * config/i386/i386-expand.c
>         (ix86_expand_args_builtin): Handle new builtin types.
>         (ix86_expand_round_builtin): Likewise.
>         * config/i386/immintrin.h: Include avx512fp16vlintrin.h.
>         * config/i386/sse.md (VFH): New mode_iterator.
>         (VF2H): Likewise.
>         (avx512fmaskmode): Add HF vector modes.
>         (avx512fmaskhalfmode): Likewise.
>         (<plusminus_insn><mode>3<mask_name><round_name>): Adjust for
>         HF vector modes.
>         (*<plusminus_insn><mode>3<mask_name><round_name>): Likewise.
>         (mul<mode>3<mask_name><round_name>): Likewise.
>         (*mul<mode>3<mask_name><round_name>): Likewise.
>         (div<mode>3): Likewise.
>         (<sse>_div<mode>3<mask_name><round_name>): Likewise.
>         * config/i386/subst.md (SUBST_V): Add HF vector modes.
>         (SUBST_A): Likewise.
>         (round_mode512bit_condition): Adjust for V32HFmode.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx-1.c: Add -mavx512vl and test for new intrinsics.
>         * gcc.target/i386/avx-2.c: Add -mavx512vl.
>         * gcc.target/i386/avx512fp16-11a.c: New test.
>         * gcc.target/i386/avx512fp16-11b.c: Ditto.
>         * gcc.target/i386/avx512vlfp16-11a.c: Ditto.
>         * gcc.target/i386/avx512vlfp16-11b.c: Ditto.
>         * gcc.target/i386/sse-13.c: Add test for new builtins.
>         * gcc.target/i386/sse-23.c: Ditto.
>         * gcc.target/i386/sse-14.c: Add test for new intrinsics.
>         * gcc.target/i386/sse-22.c: Ditto.
I'm going to check in 2 patches: this patch and [1] which contains
testcase for this patch.
Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Newly added runtime tests passed under sde.

[1]https://gcc.gnu.org/pipermail/gcc-patches/2021-July/574125.html

> ---
>  gcc/config.gcc                                |   2 +-
>  gcc/config/i386/avx512fp16intrin.h            | 251 ++++++++++++++++++
>  gcc/config/i386/avx512fp16vlintrin.h          | 219 +++++++++++++++
>  gcc/config/i386/i386-builtin-types.def        |   7 +
>  gcc/config/i386/i386-builtin.def              |  20 ++
>  gcc/config/i386/i386-expand.c                 |   5 +
>  gcc/config/i386/immintrin.h                   |   2 +
>  gcc/config/i386/sse.md                        |  62 +++--
>  gcc/config/i386/subst.md                      |   6 +-
>  gcc/testsuite/gcc.target/i386/avx-1.c         |   8 +-
>  gcc/testsuite/gcc.target/i386/avx-2.c         |   2 +-
>  .../gcc.target/i386/avx512fp16-11a.c          |  36 +++
>  .../gcc.target/i386/avx512fp16-11b.c          |  75 ++++++
>  .../gcc.target/i386/avx512vlfp16-11a.c        |  68 +++++
>  .../gcc.target/i386/avx512vlfp16-11b.c        |  96 +++++++
>  gcc/testsuite/gcc.target/i386/sse-13.c        |   6 +
>  gcc/testsuite/gcc.target/i386/sse-14.c        |  14 +
>  gcc/testsuite/gcc.target/i386/sse-22.c        |  14 +
>  gcc/testsuite/gcc.target/i386/sse-23.c        |   6 +
>  19 files changed, 872 insertions(+), 27 deletions(-)
>  create mode 100644 gcc/config/i386/avx512fp16vlintrin.h
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-11a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-11b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c
>
> diff --git a/gcc/config.gcc b/gcc/config.gcc
> index 5b4f894185a..d64a8b9407e 100644
> --- a/gcc/config.gcc
> +++ b/gcc/config.gcc
> @@ -416,7 +416,7 @@ i[34567]86-*-* | x86_64-*-*)
>                        tsxldtrkintrin.h amxtileintrin.h amxint8intrin.h
>                        amxbf16intrin.h x86gprintrin.h uintrintrin.h
>                        hresetintrin.h keylockerintrin.h avxvnniintrin.h
> -                      mwaitintrin.h avx512fp16intrin.h"
> +                      mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h"
>         ;;
>  ia64-*-*)
>         extra_headers=ia64intrin.h
> diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h
> index 3fc0770986e..3e9d676dc39 100644
> --- a/gcc/config/i386/avx512fp16intrin.h
> +++ b/gcc/config/i386/avx512fp16intrin.h
> @@ -217,6 +217,257 @@ _mm_store_sh (void *__P, __m128h __A)
>    *(_Float16 *) __P = ((__v8hf)__A)[0];
>  }
>
> +/* Intrinsics v[add,sub,mul,div]ph.  */
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_add_ph (__m512h __A, __m512h __B)
> +{
> +  return (__m512h) ((__v32hf) __A + (__v32hf) __B);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_add_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
> +{
> +  return __builtin_ia32_vaddph_v32hf_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_maskz_add_ph (__mmask32 __A, __m512h __B, __m512h __C)
> +{
> +  return __builtin_ia32_vaddph_v32hf_mask (__B, __C,
> +                                          _mm512_setzero_ph (), __A);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_sub_ph (__m512h __A, __m512h __B)
> +{
> +  return (__m512h) ((__v32hf) __A - (__v32hf) __B);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_sub_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
> +{
> +  return __builtin_ia32_vsubph_v32hf_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_maskz_sub_ph (__mmask32 __A, __m512h __B, __m512h __C)
> +{
> +  return __builtin_ia32_vsubph_v32hf_mask (__B, __C,
> +                                          _mm512_setzero_ph (), __A);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mul_ph (__m512h __A, __m512h __B)
> +{
> +  return (__m512h) ((__v32hf) __A * (__v32hf) __B);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_mul_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
> +{
> +  return __builtin_ia32_vmulph_v32hf_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_maskz_mul_ph (__mmask32 __A, __m512h __B, __m512h __C)
> +{
> +  return __builtin_ia32_vmulph_v32hf_mask (__B, __C,
> +                                          _mm512_setzero_ph (), __A);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_div_ph (__m512h __A, __m512h __B)
> +{
> +  return (__m512h) ((__v32hf) __A / (__v32hf) __B);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_div_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
> +{
> +  return __builtin_ia32_vdivph_v32hf_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_maskz_div_ph (__mmask32 __A, __m512h __B, __m512h __C)
> +{
> +  return __builtin_ia32_vdivph_v32hf_mask (__B, __C,
> +                                          _mm512_setzero_ph (), __A);
> +}
> +
> +#ifdef __OPTIMIZE__
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_add_round_ph (__m512h __A, __m512h __B, const int __C)
> +{
> +  return __builtin_ia32_vaddph_v32hf_mask_round (__A, __B,
> +                                                _mm512_setzero_ph (),
> +                                                (__mmask32) -1, __C);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_add_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
> +                         __m512h __D, const int __E)
> +{
> +  return __builtin_ia32_vaddph_v32hf_mask_round (__C, __D, __A, __B, __E);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_maskz_add_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
> +                          const int __D)
> +{
> +  return __builtin_ia32_vaddph_v32hf_mask_round (__B, __C,
> +                                                _mm512_setzero_ph (),
> +                                                __A, __D);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_sub_round_ph (__m512h __A, __m512h __B, const int __C)
> +{
> +  return __builtin_ia32_vsubph_v32hf_mask_round (__A, __B,
> +                                                _mm512_setzero_ph (),
> +                                                (__mmask32) -1, __C);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_sub_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
> +                         __m512h __D, const int __E)
> +{
> +  return __builtin_ia32_vsubph_v32hf_mask_round (__C, __D, __A, __B, __E);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_maskz_sub_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
> +                          const int __D)
> +{
> +  return __builtin_ia32_vsubph_v32hf_mask_round (__B, __C,
> +                                                _mm512_setzero_ph (),
> +                                                __A, __D);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mul_round_ph (__m512h __A, __m512h __B, const int __C)
> +{
> +  return __builtin_ia32_vmulph_v32hf_mask_round (__A, __B,
> +                                                _mm512_setzero_ph (),
> +                                                (__mmask32) -1, __C);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_mul_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
> +                         __m512h __D, const int __E)
> +{
> +  return __builtin_ia32_vmulph_v32hf_mask_round (__C, __D, __A, __B, __E);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_maskz_mul_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
> +                          const int __D)
> +{
> +  return __builtin_ia32_vmulph_v32hf_mask_round (__B, __C,
> +                                                _mm512_setzero_ph (),
> +                                                __A, __D);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_div_round_ph (__m512h __A, __m512h __B, const int __C)
> +{
> +  return __builtin_ia32_vdivph_v32hf_mask_round (__A, __B,
> +                                                _mm512_setzero_ph (),
> +                                                (__mmask32) -1, __C);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_div_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
> +                         __m512h __D, const int __E)
> +{
> +  return __builtin_ia32_vdivph_v32hf_mask_round (__C, __D, __A, __B, __E);
> +}
> +
> +extern __inline __m512h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
> +                          const int __D)
> +{
> +  return __builtin_ia32_vdivph_v32hf_mask_round (__B, __C,
> +                                                _mm512_setzero_ph (),
> +                                                __A, __D);
> +}
> +#else
> +#define _mm512_add_round_ph(A, B, C)                                   \
> +  ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((A), (B),           \
> +                                                  _mm512_setzero_ph (),\
> +                                                  (__mmask32)-1, (C)))
> +
> +#define _mm512_mask_add_round_ph(A, B, C, D, E)                        \
> +  ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((C), (D), (A), (B), (E)))
> +
> +#define _mm512_maskz_add_round_ph(A, B, C, D)                          \
> +  ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((B), (C),           \
> +                                                  _mm512_setzero_ph (),\
> +                                                  (A), (D)))
> +
> +#define _mm512_sub_round_ph(A, B, C)                                   \
> +  ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((A), (B),           \
> +                                                  _mm512_setzero_ph (),\
> +                                                  (__mmask32)-1, (C)))
> +
> +#define _mm512_mask_sub_round_ph(A, B, C, D, E)                        \
> +  ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((C), (D), (A), (B), (E)))
> +
> +#define _mm512_maskz_sub_round_ph(A, B, C, D)                          \
> +  ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((B), (C),           \
> +                                                  _mm512_setzero_ph (),\
> +                                                  (A), (D)))
> +
> +#define _mm512_mul_round_ph(A, B, C)                                   \
> +  ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((A), (B),           \
> +                                                  _mm512_setzero_ph (),\
> +                                                  (__mmask32)-1, (C)))
> +
> +#define _mm512_mask_mul_round_ph(A, B, C, D, E)                        \
> +  ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((C), (D), (A), (B), (E)))
> +
> +#define _mm512_maskz_mul_round_ph(A, B, C, D)                          \
> +  ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((B), (C),           \
> +                                                  _mm512_setzero_ph (),\
> +                                                  (A), (D)))
> +
> +#define _mm512_div_round_ph(A, B, C)                                   \
> +  ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((A), (B),           \
> +                                                  _mm512_setzero_ph (),\
> +                                                  (__mmask32)-1, (C)))
> +
> +#define _mm512_mask_div_round_ph(A, B, C, D, E)                        \
> +  ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((C), (D), (A), (B), (E)))
> +
> +#define _mm512_maskz_div_round_ph(A, B, C, D)                          \
> +  ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((B), (C),           \
> +                                                  _mm512_setzero_ph (),\
> +                                                  (A), (D)))
> +#endif  /* __OPTIMIZE__  */
> +
>  #ifdef __DISABLE_AVX512FP16__
>  #undef __DISABLE_AVX512FP16__
>  #pragma GCC pop_options
> diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h
> new file mode 100644
> index 00000000000..75fa9eb29e7
> --- /dev/null
> +++ b/gcc/config/i386/avx512fp16vlintrin.h
> @@ -0,0 +1,219 @@
> +/* Copyright (C) 2021 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   Under Section 7 of GPL version 3, you are granted additional
> +   permissions described in the GCC Runtime Library Exception, version
> +   3.1, as published by the Free Software Foundation.
> +
> +   You should have received a copy of the GNU General Public License and
> +   a copy of the GCC Runtime Library Exception along with this program;
> +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef _IMMINTRIN_H_INCLUDED
> +#error "Never use <avx512fp16vlintrin.h> directly; include <immintrin.h> instead."
> +#endif
> +
> +#ifndef __AVX512FP16VLINTRIN_H_INCLUDED
> +#define __AVX512FP16VLINTRIN_H_INCLUDED
> +
> +#if !defined(__AVX512VL__) || !defined(__AVX512FP16__)
> +#pragma GCC push_options
> +#pragma GCC target("avx512fp16,avx512vl")
> +#define __DISABLE_AVX512FP16VL__
> +#endif /* __AVX512FP16VL__ */
> +
> +/* Intrinsics v[add,sub,mul,div]ph.  */
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_add_ph (__m128h __A, __m128h __B)
> +{
> +  return (__m128h) ((__v8hf) __A + (__v8hf) __B);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_add_ph (__m256h __A, __m256h __B)
> +{
> +  return (__m256h) ((__v16hf) __A + (__v16hf) __B);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_add_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
> +{
> +  return __builtin_ia32_vaddph_v8hf_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_add_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
> +{
> +  return __builtin_ia32_vaddph_v16hf_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_add_ph (__mmask8 __A, __m128h __B, __m128h __C)
> +{
> +  return __builtin_ia32_vaddph_v8hf_mask (__B, __C, _mm_setzero_ph (),
> +                                         __A);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_maskz_add_ph (__mmask16 __A, __m256h __B, __m256h __C)
> +{
> +  return __builtin_ia32_vaddph_v16hf_mask (__B, __C,
> +                                          _mm256_setzero_ph (), __A);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_sub_ph (__m128h __A, __m128h __B)
> +{
> +  return (__m128h) ((__v8hf) __A - (__v8hf) __B);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_sub_ph (__m256h __A, __m256h __B)
> +{
> +  return (__m256h) ((__v16hf) __A - (__v16hf) __B);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_sub_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
> +{
> +  return __builtin_ia32_vsubph_v8hf_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_sub_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
> +{
> +  return __builtin_ia32_vsubph_v16hf_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_sub_ph (__mmask8 __A, __m128h __B, __m128h __C)
> +{
> +  return __builtin_ia32_vsubph_v8hf_mask (__B, __C, _mm_setzero_ph (),
> +                                         __A);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_maskz_sub_ph (__mmask16 __A, __m256h __B, __m256h __C)
> +{
> +  return __builtin_ia32_vsubph_v16hf_mask (__B, __C,
> +                                          _mm256_setzero_ph (), __A);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mul_ph (__m128h __A, __m128h __B)
> +{
> +  return (__m128h) ((__v8hf) __A * (__v8hf) __B);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mul_ph (__m256h __A, __m256h __B)
> +{
> +  return (__m256h) ((__v16hf) __A * (__v16hf) __B);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_mul_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
> +{
> +  return __builtin_ia32_vmulph_v8hf_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_mul_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
> +{
> +  return __builtin_ia32_vmulph_v16hf_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_mul_ph (__mmask8 __A, __m128h __B, __m128h __C)
> +{
> +  return __builtin_ia32_vmulph_v8hf_mask (__B, __C, _mm_setzero_ph (),
> +                                         __A);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_maskz_mul_ph (__mmask16 __A, __m256h __B, __m256h __C)
> +{
> +  return __builtin_ia32_vmulph_v16hf_mask (__B, __C,
> +                                          _mm256_setzero_ph (), __A);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_div_ph (__m128h __A, __m128h __B)
> +{
> +  return (__m128h) ((__v8hf) __A / (__v8hf) __B);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_div_ph (__m256h __A, __m256h __B)
> +{
> +  return (__m256h) ((__v16hf) __A / (__v16hf) __B);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_div_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
> +{
> +  return __builtin_ia32_vdivph_v8hf_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_div_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
> +{
> +  return __builtin_ia32_vdivph_v16hf_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_div_ph (__mmask8 __A, __m128h __B, __m128h __C)
> +{
> +  return __builtin_ia32_vdivph_v8hf_mask (__B, __C, _mm_setzero_ph (),
> +                                         __A);
> +}
> +
> +extern __inline __m256h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C)
> +{
> +  return __builtin_ia32_vdivph_v16hf_mask (__B, __C,
> +                                          _mm256_setzero_ph (), __A);
> +}
> +
> +#ifdef __DISABLE_AVX512FP16VL__
> +#undef __DISABLE_AVX512FP16VL__
> +#pragma GCC pop_options
> +#endif /* __DISABLE_AVX512FP16VL__ */
> +
> +#endif /* __AVX512FP16VLINTRIN_H_INCLUDED */
> diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
> index eb5153002ae..ee3b8c30589 100644
> --- a/gcc/config/i386/i386-builtin-types.def
> +++ b/gcc/config/i386/i386-builtin-types.def
> @@ -98,6 +98,7 @@ DEF_VECTOR_TYPE (V16UQI, UQI, V16QI)
>  # AVX vectors
>  DEF_VECTOR_TYPE (V4DF, DOUBLE)
>  DEF_VECTOR_TYPE (V8SF, FLOAT)
> +DEF_VECTOR_TYPE (V16HF, FLOAT16)
>  DEF_VECTOR_TYPE (V4DI, DI)
>  DEF_VECTOR_TYPE (V8SI, SI)
>  DEF_VECTOR_TYPE (V16HI, HI)
> @@ -108,6 +109,7 @@ DEF_VECTOR_TYPE (V16UHI, UHI, V16HI)
>
>  # AVX512F vectors
>  DEF_VECTOR_TYPE (V32SF, FLOAT)
> +DEF_VECTOR_TYPE (V32HF, FLOAT16)
>  DEF_VECTOR_TYPE (V16SF, FLOAT)
>  DEF_VECTOR_TYPE (V8DF, DOUBLE)
>  DEF_VECTOR_TYPE (V8DI, DI)
> @@ -1302,3 +1304,8 @@ DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID)
>
>  # FP16 builtins
>  DEF_FUNCTION_TYPE (V8HF, V8HI)
> +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI)
> +DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI)
> +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT)
> +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI)
> +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT)
> diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> index 1cc0cc6968c..b783d266dd8 100644
> --- a/gcc/config/i386/i386-builtin.def
> +++ b/gcc/config/i386/i386-builtin.def
> @@ -2774,6 +2774,20 @@ BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builti
>  BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
>  BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
>
> +/* AVX512FP16.  */
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_vaddph_v8hf_mask", IX86_BUILTIN_VADDPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_vaddph_v16hf_mask", IX86_BUILTIN_VADDPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, "__builtin_ia32_vaddph_v32hf_mask", IX86_BUILTIN_VADDPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_vsubph_v8hf_mask", IX86_BUILTIN_VSUBPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, "__builtin_ia32_vsubph_v16hf_mask", IX86_BUILTIN_VSUBPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_vsubph_v32hf_mask", IX86_BUILTIN_VSUBPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_vmulph_v8hf_mask", IX86_BUILTIN_VMULPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_vmulph_v16hf_mask", IX86_BUILTIN_VMULPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_vmulph_v32hf_mask", IX86_BUILTIN_VMULPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_vdivph_v8hf_mask", IX86_BUILTIN_VDIVPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_vdivph_v16hf_mask", IX86_BUILTIN_VDIVPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_vdivph_v32hf_mask", IX86_BUILTIN_VDIVPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
> +
>  /* Builtins with rounding support.  */
>  BDESC_END (ARGS, ROUND_ARGS)
>
> @@ -2973,6 +2987,12 @@ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_fixuns_truncv8dfv8di2_mask_round, "
>  BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv16sf_mask_round, "__builtin_ia32_rangeps512_mask", IX86_BUILTIN_RANGEPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT)
>  BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv8df_mask_round, "__builtin_ia32_rangepd512_mask", IX86_BUILTIN_RANGEPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT)
>
> +/* AVX512FP16.  */
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin_ia32_vaddph_v32hf_mask_round", IX86_BUILTIN_VADDPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_vsubph_v32hf_mask_round", IX86_BUILTIN_VSUBPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_vmulph_v32hf_mask_round", IX86_BUILTIN_VMULPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_vdivph_v32hf_mask_round", IX86_BUILTIN_VDIVPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
> +
>  BDESC_END (ROUND_ARGS, MULTI_ARG)
>
>  /* FMA4 and XOP.  */
> diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> index 5ce7163b241..39647eb2cf1 100644
> --- a/gcc/config/i386/i386-expand.c
> +++ b/gcc/config/i386/i386-expand.c
> @@ -9760,6 +9760,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
>      case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
>      case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
>      case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
> +    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
>      case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
>      case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
>      case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
> @@ -9777,6 +9778,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
>      case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
>      case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
>      case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
> +    case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
>      case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
>      case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
>      case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
> @@ -9784,6 +9786,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
>      case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
>      case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
>      case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
> +    case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
>      case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
>      case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
>      case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
> @@ -10460,6 +10463,7 @@ ix86_expand_round_builtin (const struct builtin_description *d,
>      case INT_FTYPE_V4SF_INT:
>        nargs = 2;
>        break;
> +    case V32HF_FTYPE_V32HF_V32HF_INT:
>      case V4SF_FTYPE_V4SF_UINT_INT:
>      case V4SF_FTYPE_V4SF_UINT64_INT:
>      case V2DF_FTYPE_V2DF_UINT64_INT:
> @@ -10500,6 +10504,7 @@ ix86_expand_round_builtin (const struct builtin_description *d,
>      case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
>      case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
>      case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
> +    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
>      case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
>      case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
>      case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
> diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
> index 5344e22c9c8..e08efb9dff3 100644
> --- a/gcc/config/i386/immintrin.h
> +++ b/gcc/config/i386/immintrin.h
> @@ -96,6 +96,8 @@
>
>  #include <avx512fp16intrin.h>
>
> +#include <avx512fp16vlintrin.h>
> +
>  #include <shaintrin.h>
>
>  #include <fmaintrin.h>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 1009d656cbb..2c1b6fbcd86 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -295,6 +295,13 @@ (define_mode_iterator VF
>    [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
>     (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
>
> +(define_mode_iterator VFH
> +  [(V32HF "TARGET_AVX512FP16")
> +   (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
> +   (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
> +   (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
> +   (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
> +
>  ;; 128- and 256-bit float vector modes
>  (define_mode_iterator VF_128_256
>    [(V8SF "TARGET_AVX") V4SF
> @@ -318,6 +325,13 @@ (define_mode_iterator VF1_128_256VL
>  (define_mode_iterator VF2
>    [(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF])
>
> +;; All DFmode & HFmode vector float modes
> +(define_mode_iterator VF2H
> +  [(V32HF "TARGET_AVX512FP16")
> +   (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
> +   (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
> +   (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF])
> +
>  ;; 128- and 256-bit DF vector modes
>  (define_mode_iterator VF2_128_256
>    [(V4DF "TARGET_AVX") V2DF])
> @@ -824,6 +838,7 @@ (define_mode_attr avx512fmaskmode
>     (V32HI "SI") (V16HI "HI") (V8HI  "QI") (V4HI "QI")
>     (V16SI "HI") (V8SI  "QI") (V4SI  "QI")
>     (V8DI  "QI") (V4DI  "QI") (V2DI  "QI")
> +   (V32HF "SI") (V16HF "HI") (V8HF  "QI")
>     (V16SF "HI") (V8SF  "QI") (V4SF  "QI")
>     (V8DF  "QI") (V4DF  "QI") (V2DF  "QI")])
>
> @@ -842,6 +857,7 @@ (define_mode_attr avx512fmaskhalfmode
>     (V32HI "HI") (V16HI "QI") (V8HI  "QI") (V4HI "QI")
>     (V16SI "QI") (V8SI  "QI") (V4SI  "QI")
>     (V8DI  "QI") (V4DI  "QI") (V2DI  "QI")
> +   (V32HF "HI") (V16HF "QI") (V8HF  "QI")
>     (V16SF "QI") (V8SF  "QI") (V4SF  "QI")
>     (V8DF  "QI") (V4DF  "QI") (V2DF  "QI")])
>
> @@ -1940,18 +1956,18 @@ (define_insn_and_split "*nabs<mode>2"
>    [(set_attr "isa" "noavx,noavx,avx,avx")])
>
>  (define_expand "<insn><mode>3<mask_name><round_name>"
> -  [(set (match_operand:VF 0 "register_operand")
> -       (plusminus:VF
> -         (match_operand:VF 1 "<round_nimm_predicate>")
> -         (match_operand:VF 2 "<round_nimm_predicate>")))]
> +  [(set (match_operand:VFH 0 "register_operand")
> +       (plusminus:VFH
> +         (match_operand:VFH 1 "<round_nimm_predicate>")
> +         (match_operand:VFH 2 "<round_nimm_predicate>")))]
>    "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
>    "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
>
>  (define_insn "*<insn><mode>3<mask_name><round_name>"
> -  [(set (match_operand:VF 0 "register_operand" "=x,v")
> -       (plusminus:VF
> -         (match_operand:VF 1 "<bcst_round_nimm_predicate>" "<comm>0,v")
> -         (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
> +  [(set (match_operand:VFH 0 "register_operand" "=x,v")
> +       (plusminus:VFH
> +         (match_operand:VFH 1 "<bcst_round_nimm_predicate>" "<comm>0,v")
> +         (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
>    "TARGET_SSE && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
>     && <mask_mode512bit_condition> && <round_mode512bit_condition>"
>    "@
> @@ -2002,18 +2018,18 @@ (define_insn "<sse>_vm<insn><mode>3<mask_scalar_name><round_scalar_name>"
>     (set_attr "mode" "<ssescalarmode>")])
>
>  (define_expand "mul<mode>3<mask_name><round_name>"
> -  [(set (match_operand:VF 0 "register_operand")
> -       (mult:VF
> -         (match_operand:VF 1 "<round_nimm_predicate>")
> -         (match_operand:VF 2 "<round_nimm_predicate>")))]
> +  [(set (match_operand:VFH 0 "register_operand")
> +       (mult:VFH
> +         (match_operand:VFH 1 "<round_nimm_predicate>")
> +         (match_operand:VFH 2 "<round_nimm_predicate>")))]
>    "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
>    "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);")
>
>  (define_insn "*mul<mode>3<mask_name><round_name>"
> -  [(set (match_operand:VF 0 "register_operand" "=x,v")
> -       (mult:VF
> -         (match_operand:VF 1 "<bcst_round_nimm_predicate>" "%0,v")
> -         (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
> +  [(set (match_operand:VFH 0 "register_operand" "=x,v")
> +       (mult:VFH
> +         (match_operand:VFH 1 "<bcst_round_nimm_predicate>" "%0,v")
> +         (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
>    "TARGET_SSE && ix86_binary_operator_ok (MULT, <MODE>mode, operands)
>     && <mask_mode512bit_condition> && <round_mode512bit_condition>"
>    "@
> @@ -2067,9 +2083,9 @@ (define_insn "<sse>_vm<multdiv_mnemonic><mode>3<mask_scalar_name><round_scalar_n
>     (set_attr "mode" "<ssescalarmode>")])
>
>  (define_expand "div<mode>3"
> -  [(set (match_operand:VF2 0 "register_operand")
> -       (div:VF2 (match_operand:VF2 1 "register_operand")
> -                (match_operand:VF2 2 "vector_operand")))]
> +  [(set (match_operand:VF2H 0 "register_operand")
> +       (div:VF2H (match_operand:VF2H 1 "register_operand")
> +                 (match_operand:VF2H 2 "vector_operand")))]
>    "TARGET_SSE2")
>
>  (define_expand "div<mode>3"
> @@ -2090,10 +2106,10 @@ (define_expand "div<mode>3"
>  })
>
>  (define_insn "<sse>_div<mode>3<mask_name><round_name>"
> -  [(set (match_operand:VF 0 "register_operand" "=x,v")
> -       (div:VF
> -         (match_operand:VF 1 "register_operand" "0,v")
> -         (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
> +  [(set (match_operand:VFH 0 "register_operand" "=x,v")
> +       (div:VFH
> +         (match_operand:VFH 1 "register_operand" "0,v")
> +         (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
>    "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
>    "@
>     div<ssemodesuffix>\t{%2, %0|%0, %2}
> diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
> index 477a89803fa..762383bfd11 100644
> --- a/gcc/config/i386/subst.md
> +++ b/gcc/config/i386/subst.md
> @@ -24,6 +24,7 @@ (define_mode_iterator SUBST_V
>     V32HI V16HI V8HI
>     V16SI V8SI  V4SI
>     V8DI  V4DI  V2DI
> +   V32HF V16HF V8HF
>     V16SF V8SF  V4SF
>     V8DF  V4DF  V2DF])
>
> @@ -35,6 +36,7 @@ (define_mode_iterator SUBST_A
>     V32HI V16HI V8HI
>     V16SI V8SI  V4SI
>     V8DI  V4DI  V2DI
> +   V32HF V16HF V8HF
>     V16SF V8SF  V4SF
>     V8DF  V4DF  V2DF
>     QI HI SI DI SF DF])
> @@ -142,7 +144,9 @@ (define_subst_attr "round_prefix" "round" "vex" "evex")
>  (define_subst_attr "round_mode512bit_condition" "round" "1" "(<MODE>mode == V16SFmode
>                                                               || <MODE>mode == V8DFmode
>                                                               || <MODE>mode == V8DImode
> -                                                             || <MODE>mode == V16SImode)")
> +                                                             || <MODE>mode == V16SImode
> +                                                             || <MODE>mode == V32HFmode)")
> +
>  (define_subst_attr "round_modev8sf_condition" "round" "1" "(<MODE>mode == V8SFmode)")
>  (define_subst_attr "round_modev4sf_condition" "round" "1" "(<MODE>mode == V4SFmode)")
>  (define_subst_attr "round_codefor" "round" "*" "")
> diff --git a/gcc/testsuite/gcc.target/i386/avx-1.c b/gcc/testsuite/gcc.target/i386/avx-1.c
> index f3676077743..1eaee861141 100644
> --- a/gcc/testsuite/gcc.target/i386/avx-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx-1.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512fp16" } */
> +/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512fp16 -mavx512vl" } */
>  /* { dg-add-options bind_pic_locally } */
>
>  #include <mm_malloc.h>
> @@ -685,6 +685,12 @@
>  #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1)
>  #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E)  __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E)
>
> +/* avx512fp16intrin.h */
> +#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8)
> +#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8)
> +#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8)
> +#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8)
> +
>  /* vpclmulqdqintrin.h */
>  #define __builtin_ia32_vpclmulqdq_v4di(A, B, C)  __builtin_ia32_vpclmulqdq_v4di(A, B, 1)
>  #define __builtin_ia32_vpclmulqdq_v2di(A, B, C)  __builtin_ia32_vpclmulqdq_v2di(A, B, 1)
> diff --git a/gcc/testsuite/gcc.target/i386/avx-2.c b/gcc/testsuite/gcc.target/i386/avx-2.c
> index 1751c52565c..642ae4d7bfb 100644
> --- a/gcc/testsuite/gcc.target/i386/avx-2.c
> +++ b/gcc/testsuite/gcc.target/i386/avx-2.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw -mavx512fp16" } */
> +/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw -mavx512fp16 -mavx512vl" } */
>  /* { dg-add-options bind_pic_locally } */
>
>  #include <mm_malloc.h>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c
> new file mode 100644
> index 00000000000..28492fa3f7b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512fp16" } */
> +
> +#include <immintrin.h>
> +__m512h
> +__attribute__ ((noinline, noclone))
> +vadd512 (__m512h a, __m512h b)
> +{
> +  return a + b;
> +}
> +
> +__m512h
> +__attribute__ ((noinline, noclone))
> +vsub512 (__m512h a, __m512h b)
> +{
> +  return a - b;
> +}
> +
> +__m512h
> +__attribute__ ((noinline, noclone))
> +vmul512 (__m512h a, __m512h b)
> +{
> +  return a * b;
> +}
> +
> +__m512h
> +__attribute__ ((noinline, noclone))
> +vdiv512 (__m512h a, __m512h b)
> +{
> +  return a / b;
> +}
> +
> +/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */
> +/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */
> +/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */
> +/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c b/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c
> new file mode 100644
> index 00000000000..fc105152d2f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c
> @@ -0,0 +1,75 @@
> +/* { dg-do run { target avx512fp16 } } */
> +/* { dg-options "-O2 -mavx512fp16" } */
> +
> +#include <string.h>
> +#include <stdlib.h>
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +#include "avx512fp16-11a.c"
> +
> +/* Get random float16 between -50.x to 50.x.  */
> +_Float16
> +get_float16_noround()
> +{
> +  return ((int) (100.0 * rand ()/ (RAND_MAX + 1.0)) - 50)
> +    + 0.1f * (int) (10 * rand() / (RAND_MAX + 1.0));
> +}
> +
> +static void
> +do_test (void)
> +{
> +  _Float16 x[32];
> +  _Float16 y[32];
> +  _Float16 res_add[32];
> +  _Float16 res_sub[32];
> +  _Float16 res_mul[32];
> +  _Float16 res_div[32];
> +  for (int i = 0 ; i != 32; i++)
> +    {
> +      x[i] = get_float16_noround ();
> +      y[i] = get_float16_noround ();
> +      if (y[i] == 0)
> +       y[i] = 1.0f;
> +      res_add[i] = x[i] + y[i];
> +      res_sub[i] = x[i] - y[i];
> +      res_mul[i] = x[i] * y[i];
> +      res_div[i] = x[i] / y[i];
> +
> +    }
> +
> +  union512h u512 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
> +      x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15],
> +      x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23],
> +      x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31] };
> +  union512h u512_1 = {y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
> +      y[8], y[9], y[10], y[11], y[12], y[13], y[14], y[15],
> +      y[16], y[17], y[18], y[19], y[20], y[21], y[22], y[23],
> +      y[24], y[25], y[26], y[27], y[28], y[29], y[30], y[31] };
> +
> +  __m512h v512;
> +  union512h a512;
> +
> +  memset (&v512, -1, sizeof (v512));
> +  v512 = vadd512 (u512.x, u512_1.x);
> +  a512.x = v512;
> +  if (check_union512h (a512, res_add))
> +    abort ();
> +  memset (&v512, -1, sizeof (v512));
> +  v512 = vsub512 (u512.x, u512_1.x);
> +  a512.x = v512;
> +  if (check_union512h (a512, res_sub))
> +    abort ();
> +  memset (&v512, -1, sizeof (v512));
> +  v512 = vmul512 (u512.x, u512_1.x);
> +  a512.x = v512;
> +  if (check_union512h (a512, res_mul))
> +    abort ();
> +  memset (&v512, -1, sizeof (v512));
> +  v512 = vdiv512 (u512.x, u512_1.x);
> +  a512.x = v512;
> +  if (check_union512h (a512, res_div))
> +    abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c
> new file mode 100644
> index 00000000000..a8c6296f504
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c
> @@ -0,0 +1,68 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> +
> +#include <immintrin.h>
> +__m128h
> +__attribute__ ((noinline, noclone))
> +vadd128 (__m128h a, __m128h b)
> +{
> +  return a + b;
> +}
> +
> +__m256h
> +__attribute__ ((noinline, noclone))
> +vadd256 (__m256h a, __m256h b)
> +{
> +  return a + b;
> +}
> +
> +__m128h
> +__attribute__ ((noinline, noclone))
> +vsub128 (__m128h a, __m128h b)
> +{
> +  return a - b;
> +}
> +
> +__m256h
> +__attribute__ ((noinline, noclone))
> +vsub256 (__m256h a, __m256h b)
> +{
> +  return a - b;
> +}
> +
> +__m128h
> +__attribute__ ((noinline, noclone))
> +vmul128 (__m128h a, __m128h b)
> +{
> +  return a * b;
> +}
> +
> +__m256h
> +__attribute__ ((noinline, noclone))
> +vmul256 (__m256h a, __m256h b)
> +{
> +  return a * b;
> +}
> +
> +__m128h
> +__attribute__ ((noinline, noclone))
> +vdiv128 (__m128h a, __m128h b)
> +{
> +  return a / b;
> +}
> +
> +__m256h
> +__attribute__ ((noinline, noclone))
> +vdiv256 (__m256h a, __m256h b)
> +{
> +  return a / b;
> +}
> +
> +/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */
> +/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */
> +/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */
> +/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */
> +/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */
> +/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */
> +/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */
> +/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c
> new file mode 100644
> index 00000000000..b8d3e8a4e96
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c
> @@ -0,0 +1,96 @@
> +/* { dg-do run { target avx512fp16 } } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> +
> +#include <string.h>
> +#include <stdlib.h>
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +#include "avx512vlfp16-11a.c"
> +
> +/* Get random float16 between -50.x to 50.x.  */
> +_Float16
> +get_float16_noround()
> +{
> +  return ((int) (100.0 * rand ()/ (RAND_MAX + 1.0)) - 50)
> +    + 0.1f * (int) (10 * rand() / (RAND_MAX + 1.0));
> +}
> +
> +static void
> +do_test (void)
> +{
> +  _Float16 x[16];
> +  _Float16 y[16];
> +  _Float16 res_add[16];
> +  _Float16 res_sub[16];
> +  _Float16 res_mul[16];
> +  _Float16 res_div[16];
> +  for (int i = 0 ; i != 16; i++)
> +    {
> +      x[i] = get_float16_noround ();
> +      y[i] = get_float16_noround ();
> +      if (y[i] == 0)
> +       y[i] = 1.0f;
> +      res_add[i] = x[i] + y[i];
> +      res_sub[i] = x[i] - y[i];
> +      res_mul[i] = x[i] * y[i];
> +      res_div[i] = x[i] / y[i];
> +
> +    }
> +
> +  union128h u128 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7] };
> +  union128h u128_1 = { y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7] };
> +  union256h u256 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
> +      x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] };
> +  union256h u256_1 = { y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
> +      y[8], y[9], y[10], y[11], y[12], y[13], y[14], y[15]};
> +
> +  __m128h v128;
> +  __m256h v256;
> +  union128h a128;
> +  union256h a256;
> +
> +  memset (&v128, -1, sizeof (v128));
> +  v128 = vadd128 (u128.x, u128_1.x);
> +  a128.x = v128;
> +  if (check_union128h (a128, res_add))
> +    abort ();
> +  memset (&v128, -1, sizeof (v128));
> +  v128 = vsub128 (u128.x, u128_1.x);
> +  a128.x = v128;
> +  if (check_union128h (a128, res_sub))
> +    abort ();
> +  memset (&v128, -1, sizeof (v128));
> +  v128 = vmul128 (u128.x, u128_1.x);
> +  a128.x = v128;
> +  if (check_union128h (a128, res_mul))
> +    abort ();
> +  memset (&v128, -1, sizeof (v128));
> +  v128 = vdiv128 (u128.x, u128_1.x);
> +  a128.x = v128;
> +  if (check_union128h (a128, res_div))
> +    abort ();
> +
> +  memset (&v256, -1, sizeof (v256));
> +  v256 = vadd256 (u256.x, u256_1.x);
> +  a256.x = v256;
> +  if (check_union256h (a256, res_add))
> +    abort ();
> +  memset (&v256, -1, sizeof (v256));
> +  v256 = vsub256 (u256.x, u256_1.x);
> +  a256.x = v256;
> +  if (check_union256h (a256, res_sub))
> +    abort ();
> +  memset (&v256, -1, sizeof (v256));
> +  v256 = vmul256 (u256.x, u256_1.x);
> +  a256.x = v256;
> +  if (check_union256h (a256, res_mul))
> +    abort ();
> +  memset (&v256, -1, sizeof (v256));
> +  v256 = vdiv256 (u256.x, u256_1.x);
> +  a256.x = v256;
> +  if (check_union256h (a256, res_div))
> +    abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c
> index f5f5c113612..50ed74cd6d6 100644
> --- a/gcc/testsuite/gcc.target/i386/sse-13.c
> +++ b/gcc/testsuite/gcc.target/i386/sse-13.c
> @@ -702,6 +702,12 @@
>  #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1)
>  #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E)  __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E)
>
> +/* avx512fp16intrin.h */
> +#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8)
> +#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8)
> +#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8)
> +#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8)
> +
>  /* vpclmulqdqintrin.h */
>  #define __builtin_ia32_vpclmulqdq_v4di(A, B, C)  __builtin_ia32_vpclmulqdq_v4di(A, B, 1)
>  #define __builtin_ia32_vpclmulqdq_v2di(A, B, C)  __builtin_ia32_vpclmulqdq_v2di(A, B, 1)
> diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c
> index 747d504cedb..26a5e94c7ca 100644
> --- a/gcc/testsuite/gcc.target/i386/sse-14.c
> +++ b/gcc/testsuite/gcc.target/i386/sse-14.c
> @@ -667,6 +667,20 @@ test_3 (_mm512_mask_rcp28_round_ps, __m512, __m512, __mmask16, __m512, 8)
>  test_3 (_mm512_mask_rsqrt28_round_pd, __m512d, __m512d, __mmask8, __m512d, 8)
>  test_3 (_mm512_mask_rsqrt28_round_ps, __m512, __m512, __mmask16, __m512, 8)
>
> +/* avx512fp16intrin.h */
> +test_2 (_mm512_add_round_ph, __m512h, __m512h, __m512h, 8)
> +test_2 (_mm512_sub_round_ph, __m512h, __m512h, __m512h, 8)
> +test_2 (_mm512_mul_round_ph, __m512h, __m512h, __m512h, 8)
> +test_2 (_mm512_div_round_ph, __m512h, __m512h, __m512h, 8)
> +test_3 (_mm512_maskz_add_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_3 (_mm512_maskz_sub_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_3 (_mm512_maskz_mul_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_3 (_mm512_maskz_div_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_4 (_mm512_mask_add_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_4 (_mm512_mask_sub_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_4 (_mm512_mask_mul_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_4 (_mm512_mask_div_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
> +
>  /* shaintrin.h */
>  test_2 (_mm_sha1rnds4_epu32, __m128i, __m128i, __m128i, 1)
>
> diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c
> index 33411969901..8d25effd724 100644
> --- a/gcc/testsuite/gcc.target/i386/sse-22.c
> +++ b/gcc/testsuite/gcc.target/i386/sse-22.c
> @@ -772,6 +772,20 @@ test_2 (_mm_rcp28_round_ss, __m128, __m128, __m128, 8)
>  test_2 (_mm_rsqrt28_round_sd, __m128d, __m128d, __m128d, 8)
>  test_2 (_mm_rsqrt28_round_ss, __m128, __m128, __m128, 8)
>
> +/* avx512fp16intrin.h */
> +test_2 (_mm512_add_round_ph, __m512h, __m512h, __m512h, 8)
> +test_2 (_mm512_sub_round_ph, __m512h, __m512h, __m512h, 8)
> +test_2 (_mm512_mul_round_ph, __m512h, __m512h, __m512h, 8)
> +test_2 (_mm512_div_round_ph, __m512h, __m512h, __m512h, 8)
> +test_3 (_mm512_maskz_add_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_3 (_mm512_maskz_sub_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_3 (_mm512_maskz_mul_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_3 (_mm512_maskz_div_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_4 (_mm512_mask_add_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_4 (_mm512_mask_sub_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_4 (_mm512_mask_mul_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
> +test_4 (_mm512_mask_div_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
> +
>  /* shaintrin.h */
>  test_2 (_mm_sha1rnds4_epu32, __m128i, __m128i, __m128i, 1)
>
> diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c
> index 86590ca5ffb..f7dd5d7495c 100644
> --- a/gcc/testsuite/gcc.target/i386/sse-23.c
> +++ b/gcc/testsuite/gcc.target/i386/sse-23.c
> @@ -703,6 +703,12 @@
>  #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1)
>  #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E)  __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E)
>
> +/* avx512fp16intrin.h */
> +#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8)
> +#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8)
> +#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8)
> +#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8)
> +
>  /* vpclmulqdqintrin.h */
>  #define __builtin_ia32_vpclmulqdq_v4di(A, B, C)  __builtin_ia32_vpclmulqdq_v4di(A, B, 1)
>  #define __builtin_ia32_vpclmulqdq_v2di(A, B, C)  __builtin_ia32_vpclmulqdq_v2di(A, B, 1)
> --
> 2.18.1
>


--
BR,
Hongtao
diff mbox series

Patch

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 5b4f894185a..d64a8b9407e 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -416,7 +416,7 @@  i[34567]86-*-* | x86_64-*-*)
 		       tsxldtrkintrin.h amxtileintrin.h amxint8intrin.h
 		       amxbf16intrin.h x86gprintrin.h uintrintrin.h
 		       hresetintrin.h keylockerintrin.h avxvnniintrin.h
-		       mwaitintrin.h avx512fp16intrin.h"
+		       mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h"
 	;;
 ia64-*-*)
 	extra_headers=ia64intrin.h
diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h
index 3fc0770986e..3e9d676dc39 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -217,6 +217,257 @@  _mm_store_sh (void *__P, __m128h __A)
   *(_Float16 *) __P = ((__v8hf)__A)[0];
 }
 
+/* Intrinsics v[add,sub,mul,div]ph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_ph (__m512h __A, __m512h __B)
+{
+  return (__m512h) ((__v32hf) __A + (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_vaddph_v32hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_vaddph_v32hf_mask (__B, __C,
+					   _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_ph (__m512h __A, __m512h __B)
+{
+  return (__m512h) ((__v32hf) __A - (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_vsubph_v32hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_vsubph_v32hf_mask (__B, __C,
+					   _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_ph (__m512h __A, __m512h __B)
+{
+  return (__m512h) ((__v32hf) __A * (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_vmulph_v32hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_vmulph_v32hf_mask (__B, __C,
+					   _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_ph (__m512h __A, __m512h __B)
+{
+  return (__m512h) ((__v32hf) __A / (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_vdivph_v32hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_vdivph_v32hf_mask (__B, __C,
+					   _mm512_setzero_ph (), __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_vaddph_v32hf_mask_round (__A, __B,
+						 _mm512_setzero_ph (),
+						 (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+			  __m512h __D, const int __E)
+{
+  return __builtin_ia32_vaddph_v32hf_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+			   const int __D)
+{
+  return __builtin_ia32_vaddph_v32hf_mask_round (__B, __C,
+						 _mm512_setzero_ph (),
+						 __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_vsubph_v32hf_mask_round (__A, __B,
+						 _mm512_setzero_ph (),
+						 (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+			  __m512h __D, const int __E)
+{
+  return __builtin_ia32_vsubph_v32hf_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+			   const int __D)
+{
+  return __builtin_ia32_vsubph_v32hf_mask_round (__B, __C,
+						 _mm512_setzero_ph (),
+						 __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_vmulph_v32hf_mask_round (__A, __B,
+						 _mm512_setzero_ph (),
+						 (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+			  __m512h __D, const int __E)
+{
+  return __builtin_ia32_vmulph_v32hf_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+			   const int __D)
+{
+  return __builtin_ia32_vmulph_v32hf_mask_round (__B, __C,
+						 _mm512_setzero_ph (),
+						 __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_vdivph_v32hf_mask_round (__A, __B,
+						 _mm512_setzero_ph (),
+						 (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+			  __m512h __D, const int __E)
+{
+  return __builtin_ia32_vdivph_v32hf_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+			   const int __D)
+{
+  return __builtin_ia32_vdivph_v32hf_mask_round (__B, __C,
+						 _mm512_setzero_ph (),
+						 __A, __D);
+}
+#else
+#define _mm512_add_round_ph(A, B, C)					\
+  ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((A), (B),		\
+						   _mm512_setzero_ph (),\
+						   (__mmask32)-1, (C)))
+
+#define _mm512_mask_add_round_ph(A, B, C, D, E)			\
+  ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_add_round_ph(A, B, C, D)				\
+  ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((B), (C),		\
+						   _mm512_setzero_ph (),\
+						   (A), (D)))
+
+#define _mm512_sub_round_ph(A, B, C)					\
+  ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((A), (B),		\
+						   _mm512_setzero_ph (),\
+						   (__mmask32)-1, (C)))
+
+#define _mm512_mask_sub_round_ph(A, B, C, D, E)			\
+  ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_sub_round_ph(A, B, C, D)				\
+  ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((B), (C),		\
+						   _mm512_setzero_ph (),\
+						   (A), (D)))
+
+#define _mm512_mul_round_ph(A, B, C)					\
+  ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((A), (B),		\
+						   _mm512_setzero_ph (),\
+						   (__mmask32)-1, (C)))
+
+#define _mm512_mask_mul_round_ph(A, B, C, D, E)			\
+  ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_mul_round_ph(A, B, C, D)				\
+  ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((B), (C),		\
+						   _mm512_setzero_ph (),\
+						   (A), (D)))
+
+#define _mm512_div_round_ph(A, B, C)					\
+  ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((A), (B),		\
+						   _mm512_setzero_ph (),\
+						   (__mmask32)-1, (C)))
+
+#define _mm512_mask_div_round_ph(A, B, C, D, E)			\
+  ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_div_round_ph(A, B, C, D)				\
+  ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((B), (C),		\
+						   _mm512_setzero_ph (),\
+						   (A), (D)))
+#endif  /* __OPTIMIZE__  */
+
 #ifdef __DISABLE_AVX512FP16__
 #undef __DISABLE_AVX512FP16__
 #pragma GCC pop_options
diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h
new file mode 100644
index 00000000000..75fa9eb29e7
--- /dev/null
+++ b/gcc/config/i386/avx512fp16vlintrin.h
@@ -0,0 +1,219 @@ 
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512fp16vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FP16VLINTRIN_H_INCLUDED
+#define __AVX512FP16VLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512FP16__)
+#pragma GCC push_options
+#pragma GCC target("avx512fp16,avx512vl")
+#define __DISABLE_AVX512FP16VL__
+#endif /* __AVX512FP16VL__ */
+
+/* Intrinsics v[add,sub,mul,div]ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ph (__m128h __A, __m128h __B)
+{
+  return (__m128h) ((__v8hf) __A + (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_ph (__m256h __A, __m256h __B)
+{
+  return (__m256h) ((__v16hf) __A + (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_vaddph_v8hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_vaddph_v16hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_vaddph_v8hf_mask (__B, __C, _mm_setzero_ph (),
+					  __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_vaddph_v16hf_mask (__B, __C,
+					   _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ph (__m128h __A, __m128h __B)
+{
+  return (__m128h) ((__v8hf) __A - (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_ph (__m256h __A, __m256h __B)
+{
+  return (__m256h) ((__v16hf) __A - (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_vsubph_v8hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_vsubph_v16hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_vsubph_v8hf_mask (__B, __C, _mm_setzero_ph (),
+					  __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_vsubph_v16hf_mask (__B, __C,
+					   _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ph (__m128h __A, __m128h __B)
+{
+  return (__m128h) ((__v8hf) __A * (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_ph (__m256h __A, __m256h __B)
+{
+  return (__m256h) ((__v16hf) __A * (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_vmulph_v8hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_vmulph_v16hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_vmulph_v8hf_mask (__B, __C, _mm_setzero_ph (),
+					  __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_vmulph_v16hf_mask (__B, __C,
+					   _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ph (__m128h __A, __m128h __B)
+{
+  return (__m128h) ((__v8hf) __A / (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_div_ph (__m256h __A, __m256h __B)
+{
+  return (__m256h) ((__v16hf) __A / (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_vdivph_v8hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_div_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_vdivph_v16hf_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_vdivph_v8hf_mask (__B, __C, _mm_setzero_ph (),
+					  __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_vdivph_v16hf_mask (__B, __C,
+					   _mm256_setzero_ph (), __A);
+}
+
+#ifdef __DISABLE_AVX512FP16VL__
+#undef __DISABLE_AVX512FP16VL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512FP16VL__ */
+
+#endif /* __AVX512FP16VLINTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index eb5153002ae..ee3b8c30589 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -98,6 +98,7 @@  DEF_VECTOR_TYPE (V16UQI, UQI, V16QI)
 # AVX vectors
 DEF_VECTOR_TYPE (V4DF, DOUBLE)
 DEF_VECTOR_TYPE (V8SF, FLOAT)
+DEF_VECTOR_TYPE (V16HF, FLOAT16)
 DEF_VECTOR_TYPE (V4DI, DI)
 DEF_VECTOR_TYPE (V8SI, SI)
 DEF_VECTOR_TYPE (V16HI, HI)
@@ -108,6 +109,7 @@  DEF_VECTOR_TYPE (V16UHI, UHI, V16HI)
 
 # AVX512F vectors
 DEF_VECTOR_TYPE (V32SF, FLOAT)
+DEF_VECTOR_TYPE (V32HF, FLOAT16)
 DEF_VECTOR_TYPE (V16SF, FLOAT)
 DEF_VECTOR_TYPE (V8DF, DOUBLE)
 DEF_VECTOR_TYPE (V8DI, DI)
@@ -1302,3 +1304,8 @@  DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID)
 
 # FP16 builtins
 DEF_FUNCTION_TYPE (V8HF, V8HI)
+DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI)
+DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI)
+DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT)
+DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI)
+DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 1cc0cc6968c..b783d266dd8 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2774,6 +2774,20 @@  BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builti
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
 
+/* AVX512FP16.  */
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_vaddph_v8hf_mask", IX86_BUILTIN_VADDPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_vaddph_v16hf_mask", IX86_BUILTIN_VADDPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, "__builtin_ia32_vaddph_v32hf_mask", IX86_BUILTIN_VADDPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_vsubph_v8hf_mask", IX86_BUILTIN_VSUBPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, "__builtin_ia32_vsubph_v16hf_mask", IX86_BUILTIN_VSUBPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_vsubph_v32hf_mask", IX86_BUILTIN_VSUBPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_vmulph_v8hf_mask", IX86_BUILTIN_VMULPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_vmulph_v16hf_mask", IX86_BUILTIN_VMULPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_vmulph_v32hf_mask", IX86_BUILTIN_VMULPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_vdivph_v8hf_mask", IX86_BUILTIN_VDIVPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_vdivph_v16hf_mask", IX86_BUILTIN_VDIVPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_vdivph_v32hf_mask", IX86_BUILTIN_VDIVPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+
 /* Builtins with rounding support.  */
 BDESC_END (ARGS, ROUND_ARGS)
 
@@ -2973,6 +2987,12 @@  BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_fixuns_truncv8dfv8di2_mask_round, "
 BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv16sf_mask_round, "__builtin_ia32_rangeps512_mask", IX86_BUILTIN_RANGEPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT)
 BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv8df_mask_round, "__builtin_ia32_rangepd512_mask", IX86_BUILTIN_RANGEPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT)
 
+/* AVX512FP16.  */
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin_ia32_vaddph_v32hf_mask_round", IX86_BUILTIN_VADDPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_vsubph_v32hf_mask_round", IX86_BUILTIN_VSUBPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_vmulph_v32hf_mask_round", IX86_BUILTIN_VMULPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_vdivph_v32hf_mask_round", IX86_BUILTIN_VDIVPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+
 BDESC_END (ROUND_ARGS, MULTI_ARG)
 
 /* FMA4 and XOP.  */
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 5ce7163b241..39647eb2cf1 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -9760,6 +9760,7 @@  ix86_expand_args_builtin (const struct builtin_description *d,
     case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
     case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
     case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
+    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
     case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
     case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
     case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
@@ -9777,6 +9778,7 @@  ix86_expand_args_builtin (const struct builtin_description *d,
     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
     case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
+    case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
     case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
     case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
     case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
@@ -9784,6 +9786,7 @@  ix86_expand_args_builtin (const struct builtin_description *d,
     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
     case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
     case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
+    case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
     case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
@@ -10460,6 +10463,7 @@  ix86_expand_round_builtin (const struct builtin_description *d,
     case INT_FTYPE_V4SF_INT:
       nargs = 2;
       break;
+    case V32HF_FTYPE_V32HF_V32HF_INT:
     case V4SF_FTYPE_V4SF_UINT_INT:
     case V4SF_FTYPE_V4SF_UINT64_INT:
     case V2DF_FTYPE_V2DF_UINT64_INT:
@@ -10500,6 +10504,7 @@  ix86_expand_round_builtin (const struct builtin_description *d,
     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
     case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
+    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
     case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
     case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
     case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
index 5344e22c9c8..e08efb9dff3 100644
--- a/gcc/config/i386/immintrin.h
+++ b/gcc/config/i386/immintrin.h
@@ -96,6 +96,8 @@ 
 
 #include <avx512fp16intrin.h>
 
+#include <avx512fp16vlintrin.h>
+
 #include <shaintrin.h>
 
 #include <fmaintrin.h>
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1009d656cbb..2c1b6fbcd86 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -295,6 +295,13 @@  (define_mode_iterator VF
   [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
    (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
 
+(define_mode_iterator VFH
+  [(V32HF "TARGET_AVX512FP16")
+   (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
+
 ;; 128- and 256-bit float vector modes
 (define_mode_iterator VF_128_256
   [(V8SF "TARGET_AVX") V4SF
@@ -318,6 +325,13 @@  (define_mode_iterator VF1_128_256VL
 (define_mode_iterator VF2
   [(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF])
 
+;; All DFmode & HFmode vector float modes
+(define_mode_iterator VF2H
+  [(V32HF "TARGET_AVX512FP16")
+   (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF])
+
 ;; 128- and 256-bit DF vector modes
 (define_mode_iterator VF2_128_256
   [(V4DF "TARGET_AVX") V2DF])
@@ -824,6 +838,7 @@  (define_mode_attr avx512fmaskmode
    (V32HI "SI") (V16HI "HI") (V8HI  "QI") (V4HI "QI")
    (V16SI "HI") (V8SI  "QI") (V4SI  "QI")
    (V8DI  "QI") (V4DI  "QI") (V2DI  "QI")
+   (V32HF "SI") (V16HF "HI") (V8HF  "QI")
    (V16SF "HI") (V8SF  "QI") (V4SF  "QI")
    (V8DF  "QI") (V4DF  "QI") (V2DF  "QI")])
 
@@ -842,6 +857,7 @@  (define_mode_attr avx512fmaskhalfmode
    (V32HI "HI") (V16HI "QI") (V8HI  "QI") (V4HI "QI")
    (V16SI "QI") (V8SI  "QI") (V4SI  "QI")
    (V8DI  "QI") (V4DI  "QI") (V2DI  "QI")
+   (V32HF "HI") (V16HF "QI") (V8HF  "QI")
    (V16SF "QI") (V8SF  "QI") (V4SF  "QI")
    (V8DF  "QI") (V4DF  "QI") (V2DF  "QI")])
 
@@ -1940,18 +1956,18 @@  (define_insn_and_split "*nabs<mode>2"
   [(set_attr "isa" "noavx,noavx,avx,avx")])
 
 (define_expand "<insn><mode>3<mask_name><round_name>"
-  [(set (match_operand:VF 0 "register_operand")
-	(plusminus:VF
-	  (match_operand:VF 1 "<round_nimm_predicate>")
-	  (match_operand:VF 2 "<round_nimm_predicate>")))]
+  [(set (match_operand:VFH 0 "register_operand")
+	(plusminus:VFH
+	  (match_operand:VFH 1 "<round_nimm_predicate>")
+	  (match_operand:VFH 2 "<round_nimm_predicate>")))]
   "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
   "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
 
 (define_insn "*<insn><mode>3<mask_name><round_name>"
-  [(set (match_operand:VF 0 "register_operand" "=x,v")
-	(plusminus:VF
-	  (match_operand:VF 1 "<bcst_round_nimm_predicate>" "<comm>0,v")
-	  (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
+  [(set (match_operand:VFH 0 "register_operand" "=x,v")
+	(plusminus:VFH
+	  (match_operand:VFH 1 "<bcst_round_nimm_predicate>" "<comm>0,v")
+	  (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
   "TARGET_SSE && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
    && <mask_mode512bit_condition> && <round_mode512bit_condition>"
   "@
@@ -2002,18 +2018,18 @@  (define_insn "<sse>_vm<insn><mode>3<mask_scalar_name><round_scalar_name>"
    (set_attr "mode" "<ssescalarmode>")])
 
 (define_expand "mul<mode>3<mask_name><round_name>"
-  [(set (match_operand:VF 0 "register_operand")
-	(mult:VF
-	  (match_operand:VF 1 "<round_nimm_predicate>")
-	  (match_operand:VF 2 "<round_nimm_predicate>")))]
+  [(set (match_operand:VFH 0 "register_operand")
+	(mult:VFH
+	  (match_operand:VFH 1 "<round_nimm_predicate>")
+	  (match_operand:VFH 2 "<round_nimm_predicate>")))]
   "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
   "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);")
 
 (define_insn "*mul<mode>3<mask_name><round_name>"
-  [(set (match_operand:VF 0 "register_operand" "=x,v")
-	(mult:VF
-	  (match_operand:VF 1 "<bcst_round_nimm_predicate>" "%0,v")
-	  (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
+  [(set (match_operand:VFH 0 "register_operand" "=x,v")
+	(mult:VFH
+	  (match_operand:VFH 1 "<bcst_round_nimm_predicate>" "%0,v")
+	  (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
   "TARGET_SSE && ix86_binary_operator_ok (MULT, <MODE>mode, operands)
    && <mask_mode512bit_condition> && <round_mode512bit_condition>"
   "@
@@ -2067,9 +2083,9 @@  (define_insn "<sse>_vm<multdiv_mnemonic><mode>3<mask_scalar_name><round_scalar_n
    (set_attr "mode" "<ssescalarmode>")])
 
 (define_expand "div<mode>3"
-  [(set (match_operand:VF2 0 "register_operand")
-	(div:VF2 (match_operand:VF2 1 "register_operand")
-		 (match_operand:VF2 2 "vector_operand")))]
+  [(set (match_operand:VF2H 0 "register_operand")
+	(div:VF2H (match_operand:VF2H 1 "register_operand")
+		  (match_operand:VF2H 2 "vector_operand")))]
   "TARGET_SSE2")
 
 (define_expand "div<mode>3"
@@ -2090,10 +2106,10 @@  (define_expand "div<mode>3"
 })
 
 (define_insn "<sse>_div<mode>3<mask_name><round_name>"
-  [(set (match_operand:VF 0 "register_operand" "=x,v")
-	(div:VF
-	  (match_operand:VF 1 "register_operand" "0,v")
-	  (match_operand:VF 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
+  [(set (match_operand:VFH 0 "register_operand" "=x,v")
+	(div:VFH
+	  (match_operand:VFH 1 "register_operand" "0,v")
+	  (match_operand:VFH 2 "<bcst_round_nimm_predicate>" "xBm,<bcst_round_constraint>")))]
   "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
   "@
    div<ssemodesuffix>\t{%2, %0|%0, %2}
diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
index 477a89803fa..762383bfd11 100644
--- a/gcc/config/i386/subst.md
+++ b/gcc/config/i386/subst.md
@@ -24,6 +24,7 @@  (define_mode_iterator SUBST_V
    V32HI V16HI V8HI
    V16SI V8SI  V4SI
    V8DI  V4DI  V2DI
+   V32HF V16HF V8HF
    V16SF V8SF  V4SF
    V8DF  V4DF  V2DF])
 
@@ -35,6 +36,7 @@  (define_mode_iterator SUBST_A
    V32HI V16HI V8HI
    V16SI V8SI  V4SI
    V8DI  V4DI  V2DI
+   V32HF V16HF V8HF
    V16SF V8SF  V4SF
    V8DF  V4DF  V2DF
    QI HI SI DI SF DF])
@@ -142,7 +144,9 @@  (define_subst_attr "round_prefix" "round" "vex" "evex")
 (define_subst_attr "round_mode512bit_condition" "round" "1" "(<MODE>mode == V16SFmode
 							      || <MODE>mode == V8DFmode
 							      || <MODE>mode == V8DImode
-							      || <MODE>mode == V16SImode)")
+							      || <MODE>mode == V16SImode
+							      || <MODE>mode == V32HFmode)")
+
 (define_subst_attr "round_modev8sf_condition" "round" "1" "(<MODE>mode == V8SFmode)")
 (define_subst_attr "round_modev4sf_condition" "round" "1" "(<MODE>mode == V4SFmode)")
 (define_subst_attr "round_codefor" "round" "*" "")
diff --git a/gcc/testsuite/gcc.target/i386/avx-1.c b/gcc/testsuite/gcc.target/i386/avx-1.c
index f3676077743..1eaee861141 100644
--- a/gcc/testsuite/gcc.target/i386/avx-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx-1.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512fp16" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512fp16 -mavx512vl" } */
 /* { dg-add-options bind_pic_locally } */
 
 #include <mm_malloc.h>
@@ -685,6 +685,12 @@ 
 #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1)
 #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E)  __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E)
 
+/* avx512fp16intrin.h */
+#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8)
+
 /* vpclmulqdqintrin.h */
 #define __builtin_ia32_vpclmulqdq_v4di(A, B, C)  __builtin_ia32_vpclmulqdq_v4di(A, B, 1) 
 #define __builtin_ia32_vpclmulqdq_v2di(A, B, C)  __builtin_ia32_vpclmulqdq_v2di(A, B, 1) 
diff --git a/gcc/testsuite/gcc.target/i386/avx-2.c b/gcc/testsuite/gcc.target/i386/avx-2.c
index 1751c52565c..642ae4d7bfb 100644
--- a/gcc/testsuite/gcc.target/i386/avx-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx-2.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw -mavx512fp16" } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw -mavx512fp16 -mavx512vl" } */
 /* { dg-add-options bind_pic_locally } */
 
 #include <mm_malloc.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c b/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c
new file mode 100644
index 00000000000..28492fa3f7b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-11a.c
@@ -0,0 +1,36 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16" } */
+
+#include <immintrin.h>
+__m512h
+__attribute__ ((noinline, noclone))
+vadd512 (__m512h a, __m512h b)
+{
+  return a + b;
+}
+
+__m512h
+__attribute__ ((noinline, noclone))
+vsub512 (__m512h a, __m512h b)
+{
+  return a - b;
+}
+
+__m512h
+__attribute__ ((noinline, noclone))
+vmul512 (__m512h a, __m512h b)
+{
+  return a * b;
+}
+
+__m512h
+__attribute__ ((noinline, noclone))
+vdiv512 (__m512h a, __m512h b)
+{
+  return a / b;
+}
+
+/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%zmm\[01\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c b/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c
new file mode 100644
index 00000000000..fc105152d2f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-11b.c
@@ -0,0 +1,75 @@ 
+/* { dg-do run { target avx512fp16 } } */
+/* { dg-options "-O2 -mavx512fp16" } */
+
+#include <string.h>
+#include <stdlib.h>
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+#include "avx512fp16-11a.c"
+
+/* Get a random _Float16 between -50.x and 50.x.  */
+_Float16
+get_float16_noround()
+{
+  return ((int) (100.0 * rand ()/ (RAND_MAX + 1.0)) - 50)
+    + 0.1f * (int) (10 * rand() / (RAND_MAX + 1.0));
+}
+
+static void
+do_test (void)
+{
+  _Float16 x[32];
+  _Float16 y[32];
+  _Float16 res_add[32];
+  _Float16 res_sub[32];
+  _Float16 res_mul[32];
+  _Float16 res_div[32];
+  for (int i = 0 ; i != 32; i++)
+    {
+      x[i] = get_float16_noround ();
+      y[i] = get_float16_noround ();
+      if (y[i] == 0)
+	y[i] = 1.0f;
+      res_add[i] = x[i] + y[i];
+      res_sub[i] = x[i] - y[i];
+      res_mul[i] = x[i] * y[i];
+      res_div[i] = x[i] / y[i];
+
+    }
+
+  union512h u512 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
+      x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15],
+      x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23],
+      x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31] };
+  union512h u512_1 = {y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
+      y[8], y[9], y[10], y[11], y[12], y[13], y[14], y[15],
+      y[16], y[17], y[18], y[19], y[20], y[21], y[22], y[23],
+      y[24], y[25], y[26], y[27], y[28], y[29], y[30], y[31] };
+
+  __m512h v512;
+  union512h a512;
+
+  memset (&v512, -1, sizeof (v512));
+  v512 = vadd512 (u512.x, u512_1.x);
+  a512.x = v512;
+  if (check_union512h (a512, res_add))
+    abort ();
+  memset (&v512, -1, sizeof (v512));
+  v512 = vsub512 (u512.x, u512_1.x);
+  a512.x = v512;
+  if (check_union512h (a512, res_sub))
+    abort ();
+  memset (&v512, -1, sizeof (v512));
+  v512 = vmul512 (u512.x, u512_1.x);
+  a512.x = v512;
+  if (check_union512h (a512, res_mul))
+    abort ();
+  memset (&v512, -1, sizeof (v512));
+  v512 = vdiv512 (u512.x, u512_1.x);
+  a512.x = v512;
+  if (check_union512h (a512, res_div))
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c
new file mode 100644
index 00000000000..a8c6296f504
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11a.c
@@ -0,0 +1,68 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+
+#include <immintrin.h>
+__m128h
+__attribute__ ((noinline, noclone))
+vadd128 (__m128h a, __m128h b)
+{
+  return a + b;
+}
+
+__m256h
+__attribute__ ((noinline, noclone))
+vadd256 (__m256h a, __m256h b)
+{
+  return a + b;
+}
+
+__m128h
+__attribute__ ((noinline, noclone))
+vsub128 (__m128h a, __m128h b)
+{
+  return a - b;
+}
+
+__m256h
+__attribute__ ((noinline, noclone))
+vsub256 (__m256h a, __m256h b)
+{
+  return a - b;
+}
+
+__m128h
+__attribute__ ((noinline, noclone))
+vmul128 (__m128h a, __m128h b)
+{
+  return a * b;
+}
+
+__m256h
+__attribute__ ((noinline, noclone))
+vmul256 (__m256h a, __m256h b)
+{
+  return a * b;
+}
+
+__m128h
+__attribute__ ((noinline, noclone))
+vdiv128 (__m128h a, __m128h b)
+{
+  return a / b;
+}
+
+__m256h
+__attribute__ ((noinline, noclone))
+vdiv256 (__m256h a, __m256h b)
+{
+  return a / b;
+}
+
+/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vaddph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vsubph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vmulph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%xmm\[01\]" 1 } } */
+/* { dg-final { scan-assembler-times "vdivph\[ \\t\]+\[^\n\r\]*%ymm\[01\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c
new file mode 100644
index 00000000000..b8d3e8a4e96
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vlfp16-11b.c
@@ -0,0 +1,96 @@ 
+/* { dg-do run { target avx512fp16 } } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+
+#include <string.h>
+#include <stdlib.h>
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+#include "avx512vlfp16-11a.c"
+
+/* Get a random _Float16 between -50.x and 50.x.  */
+_Float16
+get_float16_noround()
+{
+  return ((int) (100.0 * rand ()/ (RAND_MAX + 1.0)) - 50)
+    + 0.1f * (int) (10 * rand() / (RAND_MAX + 1.0));
+}
+
+static void
+do_test (void)
+{
+  _Float16 x[16];
+  _Float16 y[16];
+  _Float16 res_add[16];
+  _Float16 res_sub[16];
+  _Float16 res_mul[16];
+  _Float16 res_div[16];
+  for (int i = 0 ; i != 16; i++)
+    {
+      x[i] = get_float16_noround ();
+      y[i] = get_float16_noround ();
+      if (y[i] == 0)
+	y[i] = 1.0f;
+      res_add[i] = x[i] + y[i];
+      res_sub[i] = x[i] - y[i];
+      res_mul[i] = x[i] * y[i];
+      res_div[i] = x[i] / y[i];
+
+    }
+
+  union128h u128 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7] };
+  union128h u128_1 = { y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7] };
+  union256h u256 = { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
+      x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] };
+  union256h u256_1 = { y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
+      y[8], y[9], y[10], y[11], y[12], y[13], y[14], y[15]};
+
+  __m128h v128;
+  __m256h v256;
+  union128h a128;
+  union256h a256;
+
+  memset (&v128, -1, sizeof (v128));
+  v128 = vadd128 (u128.x, u128_1.x);
+  a128.x = v128;
+  if (check_union128h (a128, res_add))
+    abort ();
+  memset (&v128, -1, sizeof (v128));
+  v128 = vsub128 (u128.x, u128_1.x);
+  a128.x = v128;
+  if (check_union128h (a128, res_sub))
+    abort ();
+  memset (&v128, -1, sizeof (v128));
+  v128 = vmul128 (u128.x, u128_1.x);
+  a128.x = v128;
+  if (check_union128h (a128, res_mul))
+    abort ();
+  memset (&v128, -1, sizeof (v128));
+  v128 = vdiv128 (u128.x, u128_1.x);
+  a128.x = v128;
+  if (check_union128h (a128, res_div))
+    abort ();
+
+  memset (&v256, -1, sizeof (v256));
+  v256 = vadd256 (u256.x, u256_1.x);
+  a256.x = v256;
+  if (check_union256h (a256, res_add))
+    abort ();
+  memset (&v256, -1, sizeof (v256));
+  v256 = vsub256 (u256.x, u256_1.x);
+  a256.x = v256;
+  if (check_union256h (a256, res_sub))
+    abort ();
+  memset (&v256, -1, sizeof (v256));
+  v256 = vmul256 (u256.x, u256_1.x);
+  a256.x = v256;
+  if (check_union256h (a256, res_mul))
+    abort ();
+  memset (&v256, -1, sizeof (v256));
+  v256 = vdiv256 (u256.x, u256_1.x);
+  a256.x = v256;
+  if (check_union256h (a256, res_div))
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c
index f5f5c113612..50ed74cd6d6 100644
--- a/gcc/testsuite/gcc.target/i386/sse-13.c
+++ b/gcc/testsuite/gcc.target/i386/sse-13.c
@@ -702,6 +702,12 @@ 
 #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1)
 #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E)  __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E)
 
+/* avx512fp16intrin.h */
+#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8)
+
 /* vpclmulqdqintrin.h */
 #define __builtin_ia32_vpclmulqdq_v4di(A, B, C)  __builtin_ia32_vpclmulqdq_v4di(A, B, 1) 
 #define __builtin_ia32_vpclmulqdq_v2di(A, B, C)  __builtin_ia32_vpclmulqdq_v2di(A, B, 1) 
diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c
index 747d504cedb..26a5e94c7ca 100644
--- a/gcc/testsuite/gcc.target/i386/sse-14.c
+++ b/gcc/testsuite/gcc.target/i386/sse-14.c
@@ -667,6 +667,20 @@  test_3 (_mm512_mask_rcp28_round_ps, __m512, __m512, __mmask16, __m512, 8)
 test_3 (_mm512_mask_rsqrt28_round_pd, __m512d, __m512d, __mmask8, __m512d, 8)
 test_3 (_mm512_mask_rsqrt28_round_ps, __m512, __m512, __mmask16, __m512, 8)
 
+/* avx512fp16intrin.h */
+test_2 (_mm512_add_round_ph, __m512h, __m512h, __m512h, 8)
+test_2 (_mm512_sub_round_ph, __m512h, __m512h, __m512h, 8)
+test_2 (_mm512_mul_round_ph, __m512h, __m512h, __m512h, 8)
+test_2 (_mm512_div_round_ph, __m512h, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_add_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_sub_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_mul_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_div_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_add_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_sub_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_mul_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_div_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+
 /* shaintrin.h */
 test_2 (_mm_sha1rnds4_epu32, __m128i, __m128i, __m128i, 1)
 
diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c
index 33411969901..8d25effd724 100644
--- a/gcc/testsuite/gcc.target/i386/sse-22.c
+++ b/gcc/testsuite/gcc.target/i386/sse-22.c
@@ -772,6 +772,20 @@  test_2 (_mm_rcp28_round_ss, __m128, __m128, __m128, 8)
 test_2 (_mm_rsqrt28_round_sd, __m128d, __m128d, __m128d, 8)
 test_2 (_mm_rsqrt28_round_ss, __m128, __m128, __m128, 8)
 
+/* avx512fp16intrin.h */
+test_2 (_mm512_add_round_ph, __m512h, __m512h, __m512h, 8)
+test_2 (_mm512_sub_round_ph, __m512h, __m512h, __m512h, 8)
+test_2 (_mm512_mul_round_ph, __m512h, __m512h, __m512h, 8)
+test_2 (_mm512_div_round_ph, __m512h, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_add_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_sub_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_mul_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_3 (_mm512_maskz_div_round_ph, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_add_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_sub_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_mul_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+test_4 (_mm512_mask_div_round_ph, __m512h, __m512h, __mmask32, __m512h, __m512h, 8)
+
 /* shaintrin.h */
 test_2 (_mm_sha1rnds4_epu32, __m128i, __m128i, __m128i, 1)
 
diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c
index 86590ca5ffb..f7dd5d7495c 100644
--- a/gcc/testsuite/gcc.target/i386/sse-23.c
+++ b/gcc/testsuite/gcc.target/i386/sse-23.c
@@ -703,6 +703,12 @@ 
 #define __builtin_ia32_vpshld_v2di(A, B, C) __builtin_ia32_vpshld_v2di(A, B, 1)
 #define __builtin_ia32_vpshld_v2di_mask(A, B, C, D, E)  __builtin_ia32_vpshld_v2di_mask(A, B, 1, D, E)
 
+/* avx512fp16intrin.h */
+#define __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vaddph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vsubph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vmulph_v32hf_mask_round(A, B, C, D, 8)
+#define __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, E) __builtin_ia32_vdivph_v32hf_mask_round(A, B, C, D, 8)
+
 /* vpclmulqdqintrin.h */
 #define __builtin_ia32_vpclmulqdq_v4di(A, B, C)  __builtin_ia32_vpclmulqdq_v4di(A, B, 1) 
 #define __builtin_ia32_vpclmulqdq_v2di(A, B, C)  __builtin_ia32_vpclmulqdq_v2di(A, B, 1)