diff mbox series

[v2] libstdc++: add ARM SVE support to std::experimental::simd

Message ID 20240209142810.97817-1-vasu.srinivasvasu.14@gmail.com
State New
Headers show
Series [v2] libstdc++: add ARM SVE support to std::experimental::simd | expand

Commit Message

Srinivas Yadav Singanaboina Feb. 9, 2024, 2:28 p.m. UTC
Hi,

Thanks for review @Richard!. I have tried to address most of your comments in this patch.
The major updates include optimizing operator[] for masks, find_first_set and find_last_set.

My further comments on some of the pointed out issues are
a. regarding the coverage of types supported for sve : Yes, all the types are covered by 
mapping any type using simple two rules : the size of the type and signedness of it.
b. all the operator overloads now use infix operators. For division and remainder, 
the inactive elements are padded with 1 to avoid undefined behavior.
c. isnan is optimized to have only two cases i.e finite_math_only case or case where svcmpuo is used.
d. _S_load for masks (bool) now uses svld1 by reinterpret_casting the pointer to uint8_t pointer and then performing a svunpklo.
The same optimization is not done for masked_load and stores, as conversion of mask from a higher size type to lower size type is not optimal (sequential).
e. _S_unary_minus could not use svneg_x because it does not support unsigned types.
f. added specializations for reductions.
g. find_first_set and find_last_set are optimized using svclastb.


libstdc++-v3/ChangeLog:

        * include/Makefile.am: Add simd_sve.h.
        * include/Makefile.in: Add simd_sve.h.
        * include/experimental/bits/simd.h: Add new SveAbi.
        * include/experimental/bits/simd_builtin.h: Use
          __no_sve_deduce_t to support existing Neon Abi.
        * include/experimental/bits/simd_converter.h: Convert
          sequentially when sve is available.
        * include/experimental/bits/simd_detail.h: Define sve
          specific macro.
        * include/experimental/bits/simd_math.h: Fallback frexp
          to execute sequntially when sve is available, to handle
          fixed_size_simd return type that always uses sve.
        * include/experimental/simd: Include bits/simd_sve.h.
        * testsuite/experimental/simd/tests/bits/main.h: Enable
          testing for sve128, sve256, sve512.
        * include/experimental/bits/simd_sve.h: New file.

 Signed-off-by: Srinivas Yadav Singanaboina
 vasu.srinivasvasu.14@gmail.com
---
 libstdc++-v3/include/Makefile.am              |    1 +
 libstdc++-v3/include/Makefile.in              |    1 +
 libstdc++-v3/include/experimental/bits/simd.h |  131 +-
 .../include/experimental/bits/simd_builtin.h  |   35 +-
 .../experimental/bits/simd_converter.h        |   57 +-
 .../include/experimental/bits/simd_detail.h   |    7 +-
 .../include/experimental/bits/simd_math.h     |   14 +-
 .../include/experimental/bits/simd_sve.h      | 1863 +++++++++++++++++
 libstdc++-v3/include/experimental/simd        |    3 +
 .../experimental/simd/tests/bits/main.h       |    3 +
 10 files changed, 2084 insertions(+), 31 deletions(-)
 create mode 100644 libstdc++-v3/include/experimental/bits/simd_sve.h

Comments

Matthias Kretz March 8, 2024, 9:57 a.m. UTC | #1
Hi,

I applied and did extended testing on x86_64 (no regressions) and aarch64 
using qemu testing SVE 256, 512, and 1024. Looks good!

While going through the applied patch I noticed a few style issues that I 
simply turned into a patch (attached).

A few comments inline. Sorry for not seeing these before.

On Friday, 9 February 2024 15:28:10 CET Srinivas Yadav Singanaboina wrote:
> diff --git a/libstdc++-v3/include/experimental/bits/simd.h
> b/libstdc++-v3/include/experimental/bits/simd.h index
> 90523ea57dc..d274cd740fe 100644
> --- a/libstdc++-v3/include/experimental/bits/simd.h
> +++ b/libstdc++-v3/include/experimental/bits/simd.h
> @@ -39,12 +39,16 @@
>  #include <functional>
>  #include <iosfwd>
>  #include <utility>
> +#include <algorithm>
> 
>  #if _GLIBCXX_SIMD_X86INTRIN
>  #include <x86intrin.h>
>  #elif _GLIBCXX_SIMD_HAVE_NEON
>  #include <arm_neon.h>
>  #endif
> +#if _GLIBCXX_SIMD_HAVE_SVE
> +#include <arm_sve.h>
> +#endif
> 
>  /** @ingroup ts_simd
>   * @{
> @@ -83,6 +87,12 @@ using __m512d [[__gnu__::__vector_size__(64)]] = double;
>  using __m512i [[__gnu__::__vector_size__(64)]] = long long;
>  #endif
> 
> +#if _GLIBCXX_SIMD_HAVE_SVE
> +constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS /
> 8; +#else
> +constexpr inline int __sve_vectorized_size_bytes = 0;
> +#endif
> +
>  namespace simd_abi {
>  // simd_abi forward declarations {{{
>  // implementation details:
> @@ -108,6 +118,9 @@ template <int _UsedBytes>
>  template <int _UsedBytes>
>    struct _VecBltnBtmsk;
> 
> +template <int _UsedBytes, int _TotalBytes = __sve_vectorized_size_bytes>
> +  struct _SveAbi;
> +
>  template <typename _Tp, int _Np>
>    using _VecN = _VecBuiltin<sizeof(_Tp) * _Np>;
> 
> @@ -123,6 +136,9 @@ template <int _UsedBytes = 64>
>  template <int _UsedBytes = 16>
>    using _Neon = _VecBuiltin<_UsedBytes>;
> 
> +template <int _UsedBytes = __sve_vectorized_size_bytes>
> +  using _Sve = _SveAbi<_UsedBytes, __sve_vectorized_size_bytes>;
> +
>  // implementation-defined:
>  using __sse = _Sse<>;
>  using __avx = _Avx<>;
> @@ -130,6 +146,7 @@ using __avx512 = _Avx512<>;
>  using __neon = _Neon<>;
>  using __neon128 = _Neon<16>;
>  using __neon64 = _Neon<8>;
> +using __sve = _Sve<>;
> 
>  // standard:
>  template <typename _Tp, size_t _Np, typename...>
> @@ -250,6 +267,8 @@ constexpr inline bool __support_neon_float =
>    false;
>  #endif
> 
> +constexpr inline bool __have_sve = _GLIBCXX_SIMD_HAVE_SVE;
> +
>  #ifdef _ARCH_PWR10
>  constexpr inline bool __have_power10vec = true;
>  #else
> @@ -356,12 +375,13 @@ namespace __detail
> 
>  		 | (__have_avx512vnni         << 27)
>  		 | (__have_avx512vpopcntdq    << 28)
>  		 | (__have_avx512vp2intersect << 29);
> 
> -    else if constexpr (__have_neon)
> +    else if constexpr (__have_neon || __have_sve)
>        return __have_neon
> 
>  	       | (__have_neon_a32 << 1)
>  	       | (__have_neon_a64 << 2)
>  	       | (__have_neon_a64 << 2)
> 
> -	       | (__support_neon_float << 3);
> +	       | (__support_neon_float << 3)
> +         | (__have_sve << 4);

This is not enough. This should list all feature flags that might have a 
(significant enough) influence on code-gen in inline functions (that are not 
always_inline). AFAIU at least __ARM_FEATURE_SVE2 is necessary. But I assume 
__ARM_FEATURE_SVE2_BITPERM, __ARM_FEATURE_SVE_BITS, 
__ARM_FEATURE_SVE_MATMUL_INT8, and __ARM_FEATURE_SVE_VECTOR_OPERATORS are also 
relevant. Maybe more?

> [...]
bits/simd.h:

>  // fall back to fixed_size only if scalar and native ABIs don't match
>  template <typename _Tp, size_t _Np, typename = void>
>    struct __deduce_fixed_size_fallback {};
> 
> +template <typename _Tp, size_t _Np, typename = void>
> +  struct __no_sve_deduce_fixed_size_fallback {};
> +
>  template <typename _Tp, size_t _Np>
>    struct __deduce_fixed_size_fallback<_Tp, _Np,
>      enable_if_t<simd_abi::fixed_size<_Np>::template _S_is_valid_v<_Tp>>>
>    { using type = simd_abi::fixed_size<_Np>; };
> 
> +template <typename _Tp, size_t _Np>
> +  struct __no_sve_deduce_fixed_size_fallback<_Tp, _Np,
> +    enable_if_t<simd_abi::fixed_size<_Np>::template _S_is_valid_v<_Tp>>>
> +  { using type = simd_abi::fixed_size<_Np>; };
> +
>  template <typename _Tp, size_t _Np, typename>
>    struct __deduce_impl : public __deduce_fixed_size_fallback<_Tp, _Np> {};
> 
> +template <typename _Tp, size_t _Np, typename>
> +  struct __no_sve_deduce_impl : public
> __no_sve_deduce_fixed_size_fallback<_Tp, _Np> {};

I believe you don't need __no_sve_deduce_fixed_size_fallback. Simply derive 
__no_sve_deduce_impl from __deduce_fixed_size_fallback. No?


> diff --git a/libstdc++-v3/include/experimental/bits/simd_converter.h
> b/libstdc++-v3/include/experimental/bits/simd_converter.h index
> 3160e251632..b233d2c70a5 100644
> --- a/libstdc++-v3/include/experimental/bits/simd_converter.h
> +++ b/libstdc++-v3/include/experimental/bits/simd_converter.h
> @@ -28,6 +28,18 @@
>  #if __cplusplus >= 201703L
> 
>  _GLIBCXX_SIMD_BEGIN_NAMESPACE
> +
> +template <typename _Arg, typename _Ret, typename _To, size_t _Np>
> +_Ret __converter_fallback(_Arg __a)
> +  {
> +  _Ret __ret{};
> +  __execute_n_times<_Np>(
> +      [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
> +        __ret._M_set(__i, static_cast<_To>(__a[__i]));
> +    });
> +  return __ret;
> +  }
> +
>  // _SimdConverter scalar -> scalar {{{
>  template <typename _From, typename _To>
>    struct _SimdConverter<_From, simd_abi::scalar, _To, simd_abi::scalar,
> @@ -56,14 +68,16 @@ template <typename _From, typename _To, typename _Abi>
>    };
> 
>  // }}}
> -// _SimdConverter "native 1" -> "native 2" {{{
> +// _SimdConverter "native non-sve 1" -> "native non-sve 2" {{{
>  template <typename _From, typename _To, typename _AFrom, typename _ATo>
>    struct _SimdConverter<
>      _From, _AFrom, _To, _ATo,
>      enable_if_t<!disjunction_v<
>        __is_fixed_size_abi<_AFrom>, __is_fixed_size_abi<_ATo>,
>        is_same<_AFrom, simd_abi::scalar>, is_same<_ATo, simd_abi::scalar>,
> -      conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>>>>
> +      conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>>
> +	  && !(__is_sve_abi<_AFrom>() || __is_sve_abi<_ATo>())
> +	  >>
>    {
>      using _Arg = typename _AFrom::template __traits<_From>::_SimdMember;
>      using _Ret = typename _ATo::template __traits<_To>::_SimdMember;
> @@ -75,6 +89,26 @@ template <typename _From, typename _To, typename _AFrom,
> typename _ATo> { return __vector_convert<_V>(__a, __more...); }
>    };
> 
> +// }}}
> +// _SimdConverter "native 1" -> "native 2" {{{
> +template <typename _From, typename _To, typename _AFrom, typename _ATo>
> +  struct _SimdConverter<
> +    _From, _AFrom, _To, _ATo,
> +    enable_if_t<!disjunction_v<
> +      __is_fixed_size_abi<_AFrom>, __is_fixed_size_abi<_ATo>,
> +      is_same<_AFrom, simd_abi::scalar>, is_same<_ATo, simd_abi::scalar>,
> +      conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>>
> +	  && (__is_sve_abi<_AFrom>() || __is_sve_abi<_ATo>())
> +	  >>
> +  {
> +    using _Arg = typename _AFrom::template __traits<_From>::_SimdMember;
> +    using _Ret = typename _ATo::template __traits<_To>::_SimdMember;
> +
> +    _GLIBCXX_SIMD_INTRINSIC constexpr _Ret
> +    operator()(_Arg __x) const noexcept
> +    { return __converter_fallback<_Arg, _Ret, _To, simd_size_v<_To,
> _ATo>>(__x); } +  };
> +

I'd prefer if you could solve this with a constexpr-if in operator() instead 
of making the enable_if condition even longer. Feel free to 
static_assert(sizeof...(_More) == 0) in the SVE branch. (Why is it 
unnecessary, though?)

>  // }}}
>  // _SimdConverter scalar -> fixed_size<1> {{{1
>  template <typename _From, typename _To>
> @@ -111,6 +145,10 @@ template <typename _From, typename _To, int _Np>
>        if constexpr (is_same_v<_From, _To>)
>  	return __x;
> 
> +	  // fallback to sequential when sve is available
> +	  else if constexpr (__have_sve)
> +	return __converter_fallback<_Arg, _Ret, _To, _Np>(__x);
> +

At least the next three cases should all work, no? Or is the point that this 
fallback leads to better code-gen with SVE?

> diff --git a/libstdc++-v3/include/experimental/bits/simd_detail.h
> b/libstdc++-v3/include/experimental/bits/simd_detail.h index
> 1fb77866bb2..52fdf7149bf 100644
> --- a/libstdc++-v3/include/experimental/bits/simd_detail.h
> +++ b/libstdc++-v3/include/experimental/bits/simd_detail.h
> @@ -61,6 +61,11 @@
>  #else
>  #define _GLIBCXX_SIMD_HAVE_NEON_A64 0
>  #endif
> +#if (__ARM_FEATURE_SVE_BITS > 0 && __ARM_FEATURE_SVE_VECTOR_OPERATORS==1)
> +#define _GLIBCXX_SIMD_HAVE_SVE 1
> +#else
> +#define _GLIBCXX_SIMD_HAVE_SVE 0
> +#endif
>  //}}}
>  // x86{{{
>  #ifdef __MMX__
> @@ -267,7 +272,7 @@
>  #define _GLIBCXX_SIMD_IS_UNLIKELY(__x) __builtin_expect(__x, 0)
>  #define _GLIBCXX_SIMD_IS_LIKELY(__x) __builtin_expect(__x, 1)
> 
> -#if __STRICT_ANSI__ || defined __clang__
> +#if _GLIBCXX_SIMD_HAVE_SVE || __STRICT_ANSI__ || defined __clang__
>  #define _GLIBCXX_SIMD_CONSTEXPR
>  #define _GLIBCXX_SIMD_USE_CONSTEXPR_API const

This is something I'd like to see resolved. (But not necessary for this patch, 
IMHO.) Even if some parts of the SVE interface can't be used in constant 
expressions, it must be possible to work around those with `if 
(__builtin_is_constant_evaluated())` branches. For C++26 we will have to do 
this, because the std::simd interface is fully constexpr.

> diff --git a/libstdc++-v3/include/experimental/bits/simd_sve.h
> b/libstdc++-v3/include/experimental/bits/simd_sve.h new file mode 100644
> index 00000000000..123242a3a62
> --- /dev/null
> +++ b/libstdc++-v3/include/experimental/bits/simd_sve.h
[...]
> +template <typename _Tp, size_t _Np>
> +  struct __sve_vector_type
> +  {};
> +
> +template <typename _Tp, size_t _Np>
> +  using __sve_vector_type_t = typename __sve_vector_type<_Tp, _Np>::type;
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<int8_t, _Np>
> +  {
> +    typedef svint8_t __sve_vlst_type
> __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); +
> +    inline static __sve_vlst_type
> +    __sve_broadcast(int8_t __dup)
> +    { return svdup_s8(__dup); }
> +
> +    inline static __sve_bool_type
> +    __sve_active_mask()
> +    { return svwhilelt_b8(size_t(0), _Np); };
> +
> +    using type = __sve_vlst_type;
> +  };
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<uint8_t, _Np>
> +  {
> +    typedef svuint8_t __sve_vlst_type
> __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); +
> +    inline static __sve_vlst_type
> +    __sve_broadcast(uint8_t __dup)
> +    { return svdup_u8(__dup); }
> +
> +    inline static __sve_bool_type
> +    __sve_active_mask()
> +    { return svwhilelt_b8(size_t(0), _Np); };
> +
> +    using type = __sve_vlst_type;
> +  };
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<int16_t, _Np>
> +  {
> +    typedef svint16_t __sve_vlst_type
> __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); +
> +    inline static __sve_vlst_type
> +    __sve_broadcast(int16_t __dup)
> +    { return svdup_s16(__dup); }
> +
> +    inline static __sve_bool_type
> +    __sve_active_mask()
> +    { return svwhilelt_b16(size_t(0), _Np); };
> +
> +    using type = __sve_vlst_type;
> +  };
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<uint16_t, _Np>
> +  {
> +    typedef svuint16_t __sve_vlst_type
> __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); +
> +    inline static __sve_vlst_type
> +    __sve_broadcast(uint16_t __dup)
> +    { return svdup_u16(__dup); }
> +
> +    inline static __sve_bool_type
> +    __sve_active_mask()
> +    { return svwhilelt_b16(size_t(0), _Np); };
> +
> +    using type = __sve_vlst_type;
> +  };
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<int32_t, _Np>
> +  {
> +    typedef svint32_t __sve_vlst_type
> __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); +
> +    inline static __sve_vlst_type
> +    __sve_broadcast(int32_t __dup)
> +    { return svdup_s32(__dup); }
> +
> +    inline static __sve_bool_type
> +    __sve_active_mask()
> +    { return svwhilelt_b32(size_t(0), _Np); };
> +
> +    using type = __sve_vlst_type;
> +  };
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<uint32_t, _Np>
> +  {
> +    typedef svuint32_t __sve_vlst_type
> __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); +
> +    inline static __sve_vlst_type
> +    __sve_broadcast(uint32_t __dup)
> +    { return svdup_u32(__dup); }
> +
> +    inline static __sve_bool_type
> +    __sve_active_mask()
> +    { return svwhilelt_b32(size_t(0), _Np); };
> +
> +    using type = __sve_vlst_type;
> +  };
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<int64_t, _Np>
> +  {
> +    typedef svint64_t __sve_vlst_type
> __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); +
> +    inline static __sve_vlst_type
> +    __sve_broadcast(int64_t __dup)
> +    { return svdup_s64(__dup); }
> +
> +    inline static __sve_bool_type
> +    __sve_active_mask()
> +    { return svwhilelt_b64(size_t(0), _Np); };
> +
> +    using type = __sve_vlst_type;
> +  };
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<uint64_t, _Np>
> +  {
> +    typedef svuint64_t __sve_vlst_type
> __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); +
> +    inline static __sve_vlst_type
> +    __sve_broadcast(uint64_t __dup)
> +    { return svdup_u64(__dup); }
> +
> +    inline static __sve_bool_type
> +    __sve_active_mask()
> +    { return svwhilelt_b64(size_t(0), _Np); };
> +
> +    using type = __sve_vlst_type;
> +  };
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<float, _Np>
> +  {
> +    typedef svfloat32_t __sve_vlst_type
> __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); +
> +    inline static __sve_vlst_type
> +    __sve_broadcast(float __dup)
> +    { return svdup_f32(__dup); }
> +
> +    inline static __sve_bool_type
> +    __sve_active_mask()
> +    { return svwhilelt_b32(size_t(0), _Np); };
> +
> +    using type = __sve_vlst_type;
> +  };
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<double, _Np>
> +  {
> +    typedef svfloat64_t __sve_vlst_type
> __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); +
> +    inline static __sve_vlst_type
> +    __sve_broadcast(double __dup)
> +    { return svdup_f64(__dup); }
> +
> +    inline static __sve_bool_type
> +    __sve_active_mask()
> +    { return svwhilelt_b64(size_t(0), _Np); };
> +
> +    using type = __sve_vlst_type;
> +  };
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<char, _Np>
> +  : __sve_vector_type<__get_sve_value_type_t<char>, _Np>
> +  {};
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<char16_t, _Np>
> +  : __sve_vector_type<__get_sve_value_type_t<char16_t>, _Np>
> +  {};
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<wchar_t, _Np>
> +  : __sve_vector_type<__get_sve_value_type_t<wchar_t>, _Np>
> +  {};
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<char32_t, _Np>
> +  : __sve_vector_type<__get_sve_value_type_t<char32_t>, _Np>
> +  {};
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<long long int, _Np>
> +  : __sve_vector_type<__get_sve_value_type_t<long long int>, _Np>
> +  {};
> +
> +template <size_t _Np>
> +  struct __sve_vector_type<long long unsigned int, _Np>
> +  : __sve_vector_type<__get_sve_value_type_t<long long unsigned int>, _Np>
> +  {};

Please replace the last 6 partial specializations with a generic 
implementation of the primary template:

template <typename T, size_t _Np>
  struct __sve_vector_type
  : __sve_vector_type<__get_sve_value_type_t<T>, _Np>
  {};

This avoids issues on platforms that define (u)int64_t as (unsigned) long long 
and is simpler in any case.

[...]
> +  template <typename _Tp, typename _Up, size_t _Np>
> +    _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
> +    _S_load(const _Up* __p, _SveMaskWrapper<sizeof(_Tp), _Np> __k)
> +    {
> +      using _STp = __get_sve_value_type_t<_Tp>;
> +      using _SUp = __get_sve_value_type_t<_Up>;
> +      using _V = __sve_vector_type_t<_Tp, _Np>;
> +      const _SUp* __up = reinterpret_cast<const _SUp*>(__p);
> +
> +      if constexpr (std::is_same_v<_Tp, _Up>)
> +	return _V(svld1(__k._M_data, __up));
> +      if constexpr (std::is_integral_v<_Tp> && std::is_integral_v<_Up>
> +		      && (sizeof(_Tp) > sizeof(_Up)))
> +	{
> +	  if constexpr (std::is_same_v<_SUp, int8_t>)
> +	    {
> +	      if constexpr (std::is_same_v<_STp, int16_t>)
> +		return _V(svld1sb_s16(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, uint16_t>)
> +		return _V(svld1sb_u16(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, int32_t>)
> +		return _V(svld1sb_s32(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, uint32_t>)
> +		return _V(svld1sb_u32(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, int64_t>)
> +		return _V(svld1sb_s64(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, uint64_t>)
> +		return _V(svld1sb_u64(__k._M_data, __up));
> +	    }
> +	  if constexpr (std::is_same_v<_SUp, uint8_t>)
> +	    {
> +	      if constexpr (std::is_same_v<_STp, int16_t>)
> +		return _V(svld1ub_s16(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, uint16_t>)
> +		return _V(svld1ub_u16(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, int32_t>)
> +		return _V(svld1ub_s32(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, uint32_t>)
> +		return _V(svld1ub_u32(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, int64_t>)
> +		return _V(svld1ub_s64(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, uint64_t>)
> +		return _V(svld1ub_u64(__k._M_data, __up));
> +	    }
> +	  if constexpr (std::is_same_v<_SUp, int16_t>)
> +	    {
> +	      if constexpr (std::is_same_v<_STp, int32_t>)
> +		return _V(svld1sh_s32(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, uint32_t>)
> +		return _V(svld1sh_u32(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, int64_t>)
> +		return _V(svld1sh_s64(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, uint64_t>)
> +		return _V(svld1sh_u64(__k._M_data, __up));
> +	    }
> +	  if constexpr (std::is_same_v<_SUp, uint16_t>)
> +	    {
> +	      if constexpr (std::is_same_v<_STp, int32_t>)
> +		return _V(svld1uh_s32(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, uint32_t>)
> +		return _V(svld1uh_u32(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, int64_t>)
> +		return _V(svld1uh_s64(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, uint64_t>)
> +		return _V(svld1uh_u64(__k._M_data, __up));
> +	    }
> +	  if constexpr (std::is_same_v<_SUp, int32_t>)
> +	    {
> +	      if constexpr (std::is_same_v<_STp, int64_t>)
> +		return _V(svld1sw_s64(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, uint64_t>)
> +		return _V(svld1sw_u64(__k._M_data, __up));
> +	    }
> +	  if constexpr (std::is_same_v<_SUp, uint32_t>)
> +	    {
> +	      if constexpr (std::is_same_v<_STp, int64_t>)
> +		return _V(svld1uw_s64(__k._M_data, __up));
> +	      if constexpr (std::is_same_v<_STp, uint64_t>)
> +		return _V(svld1uw_u64(__k._M_data, __up));
> +	    }
> +	}
> +      return __generate_from_n_evaluations<_Np, __sve_vector_type_t<_Tp,
> _Np>>(
> +	       [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
> +		 return __k[__i] ? static_cast<_Tp>(__p[__i]) : _Tp{};
> +	       });

Fine for now, because this unlikely to be used much anyway. But I'd prefer to 
see masked vector load(s) + vector conversion(s) at some point.

> +    }
> +
> +  template <typename _Tp, typename _Up, size_t _Np>
> +    _GLIBCXX_SIMD_INTRINSIC static constexpr void
> +    _S_store(_Up* __p, _SveSimdWrapper<_Tp, _Np> __x,
> _SveMaskWrapper<sizeof(_Tp), _Np> __k) +    {
> +      using _SUp = __get_sve_value_type_t<_Up>;
> +      using _STp = __get_sve_value_type_t<_Tp>;
> +
> +      _SUp* __up = reinterpret_cast<_SUp*>(__p);
> +
> +      if constexpr (std::is_same_v<_Tp, _Up>)
> +	return svst1(__k._M_data, __up, __x);
> +      if constexpr (std::is_integral_v<_Tp> && std::is_integral_v<_Up>
> +		      && (sizeof(_Tp) > sizeof(_Up)))
> +	{
> +    if constexpr (std::is_same_v<_SUp, int8_t> && std::is_signed_v<_STp>)
> +      return svst1b(__k._M_data, __up, __x);
> +    if constexpr (std::is_same_v<_SUp, uint8_t> &&
> std::is_unsigned_v<_STp>) +      return svst1b(__k._M_data, __up, __x);
> +    if constexpr (std::is_same_v<_SUp, int16_t> && std::is_signed_v<_STp>)
> +      return svst1h(__k._M_data, __up, __x);
> +    if constexpr (std::is_same_v<_SUp, uint16_t> &&
> std::is_unsigned_v<_STp>) +      return svst1h(__k._M_data, __up, __x);
> +    if constexpr (std::is_same_v<_SUp, int32_t> && std::is_signed_v<_STp>)
> +      return svst1w(__k._M_data, __up, __x);
> +    if constexpr (std::is_same_v<_SUp, uint32_t> &&
> std::is_unsigned_v<_STp>) +      return svst1w(__k._M_data, __up, __x);
> +  }
> +
> +      __execute_n_times<_Np>([&](auto __i)
> _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { +	if (__k[__i])
> +	  __p[__i] = static_cast<_Up>(__x[__i]);
> +      });

Same as for converting masked loads...

> +    }
> +
> +  template <typename _Tp, size_t _Np>
> +    _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
> +    _S_blend(_SveMaskWrapper<sizeof(_Tp), _Np> __k, _SveSimdWrapper<_Tp,
> _Np> __at0, +	     _SveSimdWrapper<_Tp, _Np> __at1)
> +    { return svsel(__k._M_data, __at1._M_data, __at0._M_data); }
> +
> +  template <size_t _Np, bool _Sanitized>
> +    _GLIBCXX_SIMD_INTRINSIC static constexpr void
> +    _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem)
> +    {
> +      __execute_n_times<_Np>([&](auto __i)
> _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { +	__mem[__i] = __x[__i];
> +      });
> +    }
> +};
> +
> +template <typename _Abi, typename>
> +  struct _SimdImplSve
> +  {
> +    template <typename _Tp>
> +      using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
> +
> +    template <typename _Tp>
> +      using _SimdMember = typename _Abi::template
> __traits<_Tp>::_SimdMember; +
> +    using _CommonImpl = typename _Abi::_CommonImpl;
> +    using _SuperImpl = typename _Abi::_SimdImpl;
> +    using _MaskImpl = typename _Abi::_MaskImpl;
> +
> +    template <typename _Tp>
> +      static constexpr size_t _S_full_size = _Abi::template
> _S_full_size<_Tp>; +
> +    template <typename _Tp>
> +      static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
> +
> +    template <typename _Tp>
> +      using _TypeTag = _Tp*;
> +
> +    using abi_type = _Abi;
> +
> +    template <typename _Tp>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr auto
> +      _S_broadcast(_Tp __x) noexcept
> +      {
> +	return __sve_vector_type<_Tp, __sve_vectorized_size_bytes / 
sizeof(_Tp)>
> +		 ::__sve_broadcast(__x);
> +      }
> +
> +    template <typename _Fp, typename _Tp>
> +      inline static constexpr _SimdMember<_Tp>
> +      _S_generator(_Fp&& __gen, _TypeTag<_Tp>)
> +      {
> +	constexpr size_t _Np = _S_size<_Tp>;
> +	_SveSimdWrapper<_Tp, _Np> __ret;
> +	__execute_n_times<_S_size<_Tp>>(
> +	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 
__ret._M_set(__i,
> __gen(__i)); }); +	return __ret;
> +      }
> +
> +    template <typename _Tp, typename _Up>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
> +      _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept
> +      {
> +	constexpr size_t _Np = _S_size<_Tp>;
> +	_SimdMember<_Tp> __ret = _CommonImpl::template _S_load<_Tp, _Up, 
_Np>(
> +				   __mem, _SveMaskWrapper<sizeof(_Tp), 
_Np>{
> +				     __sve_vector_type<_Tp, 
_Np>::__sve_active_mask()});
> +	return __ret;
> +      }
> +
> +    template <typename _Tp, size_t _Np, typename _Up>
> +      static constexpr inline _SveSimdWrapper<_Tp, _Np>
> +      _S_masked_load(_SveSimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp>
> __k, const _Up* __mem) +      noexcept
> +      {
> +	__sve_vector_type_t<_Tp, _Np> __v
> +	  = _CommonImpl::template _S_load<_Tp, _Up, _Np>(__mem, __k);
> +	__sve_vector_type_t<_Tp, _Np> __ret = svsel(__k._M_data, __v,
> __merge._M_data); +	return __ret;
> +      }
> +
> +    template <typename _Tp, typename _Up>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr void
> +      _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept
> +      {
> +	constexpr size_t _Np = _S_size<_Tp>;
> +	_CommonImpl::template _S_store<_Tp, _Up, _Np>(
> +	  __mem, __v, __sve_vector_type<_Tp, _Np>::__sve_active_mask());
> +      }
> +
> +    template <typename _Tp, typename _Up, size_t _Np>
> +      static constexpr inline void
> +      _S_masked_store(const _SveSimdWrapper<_Tp, _Np> __v, _Up* __mem,
> +		      const _SveMaskWrapper<sizeof(_Tp), _Np> __k) noexcept
> +      { _CommonImpl::template _S_store<_Tp, _Up, _Np>(__mem, __v, __k); }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
> +      _S_negate(_SveSimdWrapper<_Tp, _Np> __x) noexcept
> +      {
> +	return svcmpeq(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data, +		       __sve_vector_type<_Tp,
> _Np>::__sve_broadcast(_Tp{}));
> +      }
> +
> +    template <typename _Tp, typename _BinaryOperation>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
> +      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
> +      {
> +	auto __x_data = __x._M_data;
> +	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
> +	using __sve_vec_t = __sve_vector_type_t<_Tp, _Np>;
> +	std::size_t __i = __x.size();
> +	for (; (__i % 2) != 1; __i /= 2)
> +	  {
> +	    __x_data = __binary_op(simd<_Tp, _Abi>(
> +				     __private_init, _SveSimdWrapper<_Tp, 
_Np>(
> +						       
__sve_vec_t(svuzp1(__x_data, __x_data)))),
> +				   simd<_Tp, _Abi>(
> +				     __private_init, _SveSimdWrapper<_Tp, 
_Np>(
> +						       
__sve_vec_t(svuzp2(__x_data, __x_data))))
> +				  )._M_data;
> +	  }
> +	_Tp __res = __x_data[0];
> +	for (size_t __ri = 1; __ri != __i; __ri++)
> +	  __res = __binary_op(__x_data[__ri], __res);
> +	return __res;
> +      }
> +
> +    template <typename _Tp>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
> +      _S_reduce(simd<_Tp, _Abi> __x, plus<>)
> +      {
> +    return svaddv(__sve_vector_type<_Tp,
> _S_size<_Tp>>::__sve_active_mask(), __x._M_data); +      }
> +
> +    template <typename _Tp>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
> +      _S_reduce(simd<_Tp, _Abi> __x, bit_and<>)
> +      {
> +    return svandv(__sve_vector_type<_Tp,
> _S_size<_Tp>>::__sve_active_mask(), __x._M_data); +      }
> +
> +    template <typename _Tp>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
> +      _S_reduce(simd<_Tp, _Abi> __x, bit_or<>)
> +      {
> +    return svorv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(),
> __x._M_data); +      }
> +
> +    template <typename _Tp>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
> +      _S_reduce(simd<_Tp, _Abi> __x, bit_xor<>)
> +      {
> +    return sveorv(__sve_vector_type<_Tp,
> _S_size<_Tp>>::__sve_active_mask(), __x._M_data); +      }
> +
> +    template <typename _Tp>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
> +      _S_reduce(simd<_Tp, _Abi> __x, __detail::_Maximum())
> +      {
> +    return svmaxv(__sve_vector_type<_Tp,
> _S_size<_Tp>>::__sve_active_mask(), __x._M_data); +      }
> +
> +    template <typename _Tp>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
> +      _S_reduce(simd<_Tp, _Abi> __x, __detail::_Minimum())
> +      {
> +    return svminv(__sve_vector_type<_Tp,
> _S_size<_Tp>>::__sve_active_mask(), __x._M_data); +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
> +      __sve_vector_type_t<_Tp, _Np>
> +      _S_min(_SveSimdWrapper<_Tp, _Np> __a, _SveSimdWrapper<_Tp, _Np> __b)
> +      {
> +	return svmin_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __a._M_data, __b._M_data); +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
> +      __sve_vector_type_t<_Tp, _Np>
> +      _S_max(_SveSimdWrapper<_Tp, _Np> __a, _SveSimdWrapper<_Tp, _Np> __b)
> +      {
> +	return svmax_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __a._M_data, __b._M_data); +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
> +      pair<_SveSimdWrapper<_Tp, _Np>, _SveSimdWrapper<_Tp, _Np>>
> +      _S_minmax(_SveSimdWrapper<_Tp, _Np> __a, _SveSimdWrapper<_Tp, _Np>
> __b) +      {
> +	return {
> +	  svmin_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), 
__a._M_data,
> __b._M_data), +	  svmax_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __a._M_data, __b._M_data) +	};
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp,
> _Np> +      _S_complement(_SveSimdWrapper<_Tp, _Np> __x) noexcept
> +      {
> +	if constexpr (is_floating_point_v<_Tp>)
> +	  {
> +	    using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
> +	    return __sve_reinterpret_cast<_Tp>(
> +		     svnot_z(__sve_vector_type<_Tp, 
_Np>::__sve_active_mask(),
> +			     __sve_reinterpret_cast<_Ip>(__x)));
> +	  }
> +	else
> +	  return svnot_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data); +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveSimdWrapper<_Tp, _Np>
> +      _S_unary_minus(_SveSimdWrapper<_Tp, _Np> __x) noexcept
> +      {
> +	return svmul_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data, +		       static_cast<_Tp>(-1));
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp,
> _Np> +      _S_plus(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) +      { return __x._M_data + __y._M_data; }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp,
> _Np> +      _S_minus(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) +      { return __x._M_data - __y._M_data; }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp,
> _Np> +      _S_multiplies(_SveSimdWrapper<_Tp, _Np> __x,
> _SveSimdWrapper<_Tp, _Np> __y) +      { return __x._M_data * __y._M_data; }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp,
> _Np> +      _S_divides(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) +      {
> +        __sve_vector_type_t<_Tp, _Np> __y_padded =
> svsel(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), +                  
>    __y._M_data, __sve_vector_type<_Tp, _Np>::__sve_broadcast(1)); +       
> return __x._M_data / __y_padded;
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp,
> _Np> +      _S_modulus(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) +      {
> +        __sve_vector_type_t<_Tp, _Np> __y_padded =
> svsel(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), +                  
>    __y._M_data, __sve_vector_type<_Tp, _Np>::__sve_broadcast(1)); +       
> return __x._M_data % __y_padded;
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp,
> _Np> +      _S_bit_and(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) +      {
> +	if constexpr (is_floating_point_v<_Tp>)
> +	  {
> +	    using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
> +	    return __sve_reinterpret_cast<_Tp>(
> +		     svand_x(__sve_vector_type<_Tp, 
_Np>::__sve_active_mask(),
> +			     __sve_reinterpret_cast<_Ip>(__x),
> __sve_reinterpret_cast<_Ip>(__y))); +	  }
> +	else
> +	  return svand_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> +			 __x._M_data, __y._M_data);
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp,
> _Np> +      _S_bit_or(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) +      {
> +	if constexpr (is_floating_point_v<_Tp>)
> +	  {
> +	    using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
> +	    return __sve_reinterpret_cast<_Tp>(
> +		     svorr_x(__sve_vector_type<_Tp, 
_Np>::__sve_active_mask(),
> +			     __sve_reinterpret_cast<_Ip>(__x),
> __sve_reinterpret_cast<_Ip>(__y))); +	  }
> +	else
> +	  return svorr_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> +			 __x._M_data, __y._M_data);
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp,
> _Np> +      _S_bit_xor(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) +      {
> +	if constexpr (is_floating_point_v<_Tp>)
> +	  {
> +	    using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
> +	    return __sve_reinterpret_cast<_Tp>(
> +		     sveor_x(__sve_vector_type<_Tp, 
_Np>::__sve_active_mask(),
> +			     __sve_reinterpret_cast<_Ip>(__x),
> __sve_reinterpret_cast<_Ip>(__y))); +	  }
> +	else
> +	  return sveor_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> +			 __x._M_data, __y._M_data);
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static __sve_vector_type_t<_Tp, _Np>
> +      _S_bit_shift_left(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) +      { return __x._M_data << __y._M_data; }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static __sve_vector_type_t<_Tp, _Np>
> +      _S_bit_shift_right(_SveSimdWrapper<_Tp, _Np> __x,
> _SveSimdWrapper<_Tp, _Np> __y) +      { return __x._M_data >> __y._M_data;
> }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp,
> _Np> +      _S_bit_shift_left(_SveSimdWrapper<_Tp, _Np> __x, int __y)
> +      { return __x._M_data << __y; }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp,
> _Np> +      _S_bit_shift_right(_SveSimdWrapper<_Tp, _Np> __x, int __y)
> +      { return __x._M_data >> __y; }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr void
> +      _S_increment(_SveSimdWrapper<_Tp, _Np>& __x)
> +      { __x = __x._M_data + 1; }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr void
> +      _S_decrement(_SveSimdWrapper<_Tp, _Np>& __x)
> +      { __x = __x._M_data - 1; }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
> +      _S_equal_to(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np>
> __y) +      {
> +	return svcmpeq(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data, __y._M_data); +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
> +      _S_not_equal_to(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) +      {
> +	return svcmpne(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data, __y._M_data); +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
> +      _S_less(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
> +      {
> +	return svcmplt(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data, __y._M_data); +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
> +      _S_less_equal(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) +      {
> +	return svcmple(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data, __y._M_data); +      }
> +
> +    // simd.math
> +#define _GLIBCXX_SIMD_MATH_FALLBACK(__name)                                
>                        \ +    template <typename _Tp, size_t _Np,
> typename... _More>                                         \ +      static
> _SveSimdWrapper<_Tp, _Np> _S_##__name(const _SveSimdWrapper<_Tp, _Np>& __x,
>           \ +						   const 
_More&... __more)                         \
> +      {                                                                    
>                        \ +	_SveSimdWrapper<_Tp, _Np> __r;                  
>                                           \
> +	__execute_n_times<_Np>([&](auto __i) 
_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
>                  \ +	  __r._M_set(__i, __name(__x[__i], __more[__i]...));  
>                                     \ +	});                                
>                                                        \ +	return __r;     
>                                                                           \
> +      }
> +
> +#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name)               
>                        \ +    template <typename _Tp, typename... _More>   
>                                                  \ +      static auto
> _S_##__name(const _Tp& __x, const _More&... __more)                        
>      \ +      {                                                            
>                                \ +	return 
__fixed_size_storage_t<_RetTp,
> _Tp::_S_size>::_S_generate(                          \ +		 [&]
(auto __meta)
> _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {                             \ +		
  
> return __meta._S_generator(                                                
>     \ +			    [&](auto __i) 
_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {           
>          \ +			      return __name(__x[__meta._S_offset + 
__i],            
>               \ +					    
__more[__meta._S_offset + __i]...);              
>      \ +			    }, static_cast<_RetTp*>(nullptr));                          
>           \ +		 });                                                        
>                       \ +      }
> +
> +    _GLIBCXX_SIMD_MATH_FALLBACK(acos)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(asin)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(atan)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(atan2)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(cos)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(sin)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(tan)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(acosh)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(asinh)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(atanh)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(cosh)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(sinh)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(tanh)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(exp)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(exp2)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(expm1)
> +    _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(log)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(log10)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(log1p)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(log2)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(logb)
> +
> +    // modf implemented in simd_math.h
> +    _GLIBCXX_SIMD_MATH_FALLBACK(scalbn)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(scalbln)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(cbrt)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(pow)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(erf)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(erfc)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(lgamma)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(tgamma)
> +
> +    _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint)
> +    _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint)
> +
> +    _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround)
> +    _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround)
> +
> +    _GLIBCXX_SIMD_MATH_FALLBACK(fmod)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(remainder)
> +
> +    template <typename _Tp, size_t _Np>
> +      static _SveSimdWrapper<_Tp, _Np>
> +      _S_remquo(const _SveSimdWrapper<_Tp, _Np> __x, const
> _SveSimdWrapper<_Tp, _Np> __y, +		__fixed_size_storage_t<int, _Np>* 
__z)
> +      {
> +	_SveSimdWrapper<_Tp, _Np> __r{};
> +	__execute_n_times<_Np>([&](auto __i) 
_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
> +	  int __tmp;
> +	  __r._M_set(__i, remquo(__x[__i], __y[__i], &__tmp));
> +	  __z->_M_set(__i, __tmp);
> +	});
> +	return __r;
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np>
> +      _S_fpclassify(_SveSimdWrapper<_Tp, _Np> __x)
> +      {
> +	__fixed_size_storage_t<int, _Np> __r{};
> +	__execute_n_times<_Np>([&](auto __i) 
_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
> +	  __r._M_set(__i, std::fpclassify(__x[__i]));
> +	});
> +	return __r;
> +      }
> +
> +    // copysign in simd_math.h
> +    _GLIBCXX_SIMD_MATH_FALLBACK(nextafter)
> +    _GLIBCXX_SIMD_MATH_FALLBACK(fdim)
> +
> +#undef _GLIBCXX_SIMD_MATH_FALLBACK
> +#undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET
> +
> +    template <typename _Tp, size_t _Np, typename _Op>
> +      static constexpr _MaskMember<_Tp>
> +      __fp_cmp(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np>
> __y, _Op __op) +      {
> +	using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
> +	using _VI = __sve_vector_type_t<_Ip, _Np>;
> +	using _WI = _SveSimdWrapper<_Ip, _Np>;
> +	const _WI __fmv = __sve_vector_type<_Ip,
> _Np>::__sve_broadcast(__finite_max_v<_Ip>); +	const _WI __zerov =
> __sve_vector_type<_Ip, _Np>::__sve_broadcast(0); +	const _WI __xn =
> _VI(__sve_reinterpret_cast<_Ip>(__x));
> +	const _WI __yn = _VI(__sve_reinterpret_cast<_Ip>(__y));
> +
> +	const _WI __xp
> +	  = svsel(_S_less(__xn, __zerov), _S_unary_minus(_WI(_S_bit_and(__xn,
> __fmv))), __xn); +	const _WI __yp
> +	  = svsel(_S_less(__yn, __zerov), _S_unary_minus(_WI(_S_bit_and(__yn,
> __fmv))), __yn); +	return svbic_z(__sve_vector_type<_Ip,
> _Np>::__sve_active_mask(), __op(__xp, __yp)._M_data, +		      
> _SuperImpl::_S_isunordered(__x, __y)._M_data);
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      static constexpr _MaskMember<_Tp>
> +      _S_isgreater(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np>
> __y) noexcept +      { return __fp_cmp(__x, __y, [](auto __xp, auto __yp) {
> return _S_less(__yp, __xp); }); } +
> +    template <typename _Tp, size_t _Np>
> +      static constexpr _MaskMember<_Tp>
> +      _S_isgreaterequal(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) noexcept +      { return __fp_cmp(__x, __y, [](auto __xp, auto
> __yp) { return _S_less_equal(__yp, __xp); }); } +
> +    template <typename _Tp, size_t _Np>
> +      static constexpr _MaskMember<_Tp>
> +      _S_isless(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np>
> __y) noexcept +      { return __fp_cmp(__x, __y, [](auto __xp, auto __yp) {
> return _S_less(__xp, __yp); }); } +
> +    template <typename _Tp, size_t _Np>
> +      static constexpr _MaskMember<_Tp>
> +      _S_islessequal(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) noexcept +      { return __fp_cmp(__x, __y, [](auto __xp, auto
> __yp) { return _S_less_equal(__xp, __yp); }); } +
> +    template <typename _Tp, size_t _Np>
> +      static constexpr _MaskMember<_Tp>
> +      _S_islessgreater(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) noexcept +      {
> +	return svbic_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> +		       _SuperImpl::_S_not_equal_to(__x, __y)._M_data,
> +		       _SuperImpl::_S_isunordered(__x, __y)._M_data);
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_abs(_SveSimdWrapper<_Tp, _Np> __x) noexcept
> +      { return svabs_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data); } +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_fabs(_SveSimdWrapper<_Tp, _Np> __x) noexcept
> +      { return svabs_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data); } +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_sqrt(_SveSimdWrapper<_Tp, _Np> __x) noexcept
> +      { return svsqrt_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data); } +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_ldexp(_SveSimdWrapper<_Tp, _Np> __x, __fixed_size_storage_t<int,
> _Np> __y) noexcept +      {
> +	auto __sve_register = __y.first;
> +	if constexpr (std::is_same_v<_Tp, float>)
> +	  return svscale_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data, +			   __sve_register._M_data);
> +	else
> +	  {
> +	    __sve_vector_type_t<int64_t, _Np> __sve_d_register =
> svunpklo(__sve_register); +	    return svscale_z(__sve_vector_type<_Tp,
> _Np>::__sve_active_mask(), __x._M_data, +			     
__sve_d_register);
> +	  }
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_fma(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y,
> +	     _SveSimdWrapper<_Tp, _Np> __z)
> +      {
> +	return svmad_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data, __y._M_data, +		       __z._M_data);
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_fmax(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
> +      {
> +  return svmaxnm_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data, __y._M_data); +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_fmin(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
> +      {
> +  return svminnm_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data, __y._M_data); +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
> +      _S_isfinite([[maybe_unused]] _SveSimdWrapper<_Tp, _Np> __x)
> +      {
> +#if __FINITE_MATH_ONLY__
> +	return __sve_vector_type_t<_Tp, _Np>::__sve_all_true_mask();
> +#else
> +	// if all exponent bits are set, __x is either inf or NaN
> +
> +	using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
> +	const __sve_vector_type_t<_Ip, _Np> __absn =
> __sve_reinterpret_cast<_Ip>(_S_abs(__x)); +	const 
__sve_vector_type_t<_Ip,
> _Np> __maxn
> +	  = __sve_reinterpret_cast<_Ip>(
> +	      __sve_vector_type<_Tp, 
_Np>::__sve_broadcast(__finite_max_v<_Tp>));
> +
> +	return _S_less_equal(_SveSimdWrapper<_Ip, _Np>{__absn},
> _SveSimdWrapper<_Ip, _Np>{__maxn}); +#endif
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
> +      _S_isinf([[maybe_unused]] _SveSimdWrapper<_Tp, _Np> __x)
> +      {
> +#if __FINITE_MATH_ONLY__
> +	return {}; // false
> +#else
> +	return _S_equal_to<_Tp, _Np>(_S_abs(__x),
> _S_broadcast(__infinity_v<_Tp>)); +#endif
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
> +      _S_isnan([[maybe_unused]] _SveSimdWrapper<_Tp, _Np> __x)
> +      {
> +#if __FINITE_MATH_ONLY__
> +	return {}; // false
> +#else
> +	return svcmpuo(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data, __x._M_data); +#endif
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
> +      _S_isnormal(_SveSimdWrapper<_Tp, _Np> __x)
> +      {
> +	using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
> +	using _V = __sve_vector_type_t<_Ip, _Np>;
> +	using _VW = _SveSimdWrapper<_Ip, _Np>;
> +
> +	const _V __absn = __sve_reinterpret_cast<_Ip>(_S_abs(__x));
> +	const _V __minn = __sve_reinterpret_cast<_Ip>(
> +			    __sve_vector_type<_Tp, 
_Np>::__sve_broadcast(__norm_min_v<_Tp>));
> +#if __FINITE_MATH_ONLY__
> +	return _S_greater_equal(_VW{__absn}, _VW{__minn});
> +#else
> +	const _V __maxn = __sve_reinterpret_cast<_Ip>(
> +			    __sve_vector_type<_Tp, 
_Np>::__sve_broadcast(__finite_max_v<_Tp>));
> +	return _MaskImpl::_S_bit_and(_S_less_equal(_VW{__minn}, _VW{__absn}),
> +				     _S_less_equal(_VW{__absn}, 
_VW{__maxn}));
> +#endif
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
> +      _S_signbit(_SveSimdWrapper<_Tp, _Np> __x)
> +      {
> +	using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
> +	using _V = __sve_vector_type_t<_Ip, _Np>;
> +	using _VW = _SveSimdWrapper<_Ip, _Np>;
> +
> +	const _V __xn = __sve_reinterpret_cast<_Ip>(__x);
> +	const _V __zeron = __sve_vector_type<_Ip, _Np>::__sve_broadcast(0);
> +	return _S_less(_VW{__xn}, _VW{__zeron});
> +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
> +      _S_isunordered(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp,
> _Np> __y) +      {
> +	return svcmpuo(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data, __y._M_data); +      }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_nearbyint(_SveSimdWrapper<_Tp, _Np> __x) noexcept
> +      { return svrinti_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data); } +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_rint(_SveSimdWrapper<_Tp, _Np> __x) noexcept
> +      { return _SuperImpl::_S_nearbyint(__x); }
> +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_trunc(_SveSimdWrapper<_Tp, _Np> __x) noexcept
> +      { return svrintz_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data); } +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_round(_SveSimdWrapper<_Tp, _Np> __x) noexcept
> +      { return svrinta_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data); } +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_floor(_SveSimdWrapper<_Tp, _Np> __x) noexcept
> +      { return svrintm_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data); } +
> +    template <typename _Tp, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
> +      _S_ceil(_SveSimdWrapper<_Tp, _Np> __x) noexcept
> +      { return svrintp_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
> __x._M_data); } +
> +    template <typename _Tp, size_t _Bits, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr void
> +      _S_masked_assign(_SveMaskWrapper<_Bits, _Np> __k,
> _SveSimdWrapper<_Tp, _Np>& __lhs, +		      
> __type_identity_t<_SveSimdWrapper<_Tp, _Np>> __rhs)
> +      { __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); }
> +
> +    template <typename _Tp, size_t _Bits, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr void
> +      _S_masked_assign(_SveMaskWrapper<_Bits, _Np> __k,
> _SveSimdWrapper<_Tp, _Np>& __lhs, +		       
__type_identity_t<_Tp> __rhs)
> +      { __lhs = _CommonImpl::_S_blend(__k, __lhs, __data(simd<_Tp,
> _Abi>(__rhs))); } +
> +    template <typename _Op, typename _Tp, size_t _Bits, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr void
> +      _S_masked_cassign(const _SveMaskWrapper<_Bits, _Np> __k,
> _SveSimdWrapper<_Tp, _Np>& __lhs, +			const
> __type_identity_t<_SveSimdWrapper<_Tp, _Np>> __rhs, _Op __op) +      {
> +	__lhs = _CommonImpl::_S_blend(__k, __lhs,
> +				      _SveSimdWrapper<_Tp, 
_Np>(__op(_SuperImpl{}, __lhs, __rhs)));
> +      }
> +
> +    template <typename _Op, typename _Tp, size_t _Bits, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr void
> +      _S_masked_cassign(const _SveMaskWrapper<_Bits, _Np> __k,
> _SveSimdWrapper<_Tp, _Np>& __lhs, +			const 
__type_identity_t<_Tp> __rhs,
> _Op __op)
> +      { _S_masked_cassign(__k, __lhs, _S_broadcast(__rhs), __op); }
> +
> +    template <typename _Tp, size_t _Np, typename _Up>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr void
> +      _S_set(_SveSimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept
> +      { __v._M_set(__i, static_cast<_Up&&>(__x)); }
> +
> +    template <template <typename> class _Op, typename _Tp, size_t _Bits,
> size_t _Np> +      _GLIBCXX_SIMD_INTRINSIC static constexpr
> _SveSimdWrapper<_Tp, _Np> +      _S_masked_unary(const
> _SveMaskWrapper<_Bits, _Np> __k, const _SveSimdWrapper<_Tp, _Np> __v) +    
>  {
> +	auto __vv = simd<_Tp, _Abi>{__private_init, __v};
> +	_Op<decltype(__vv)> __op;
> +	return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv)));
> +      }
> +  };
> +
> +template <typename _Abi, typename>
> +  struct _MaskImplSve
> +  {
> +    template <typename _Tp>
> +      using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
> +
> +    template <typename _Tp>
> +      using _TypeTag = _Tp*;
> +
> +    template <typename _Tp>
> +      static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
> +
> +    template <typename _Tp>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
> +      _S_broadcast(bool __x)
> +      {
> +	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
> +	__sve_bool_type __tr = __sve_vector_type<_Tp, 
_Np>::__sve_active_mask();
> +	__sve_bool_type __fl = svpfalse_b();;
> +	return __x ? __tr : __fl;
> +      }
> +
> +    template <typename _Tp>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
> +      _S_load(const bool* __mem)
> +      {
> +	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
> +  const uint8_t* __p = reinterpret_cast<const uint8_t*>(__mem);
> +  __sve_bool_type __u8_active_mask = __sve_vector_type<uint8_t,
> _Np>::__sve_active_mask(); +  __sve_vector_type_t<uint8_t, _Np>
> __u8_vec_mask_load = svld1(__u8_active_mask, __p); +  __sve_bool_type
> __u8_mask = svcmpne(__u8_active_mask, __u8_vec_mask_load, 0); +
> +  __sve_bool_type __tp_mask = __u8_mask;
> +  for (size_t __up_size = 1; __up_size != sizeof(_Tp); __up_size *= 2)
> +    {
> +  __tp_mask = svunpklo(__tp_mask);
> +    }
> +
> +	_SveMaskWrapper<sizeof(_Tp), simd_size_v<_Tp, _Abi>> __r{__tp_mask};
> +  return __r;
> +      }
> +
> +    template <size_t _Bits, size_t _Np>
> +      static inline _SveMaskWrapper<_Bits, _Np>
> +      _S_masked_load(_SveMaskWrapper<_Bits, _Np> __merge,
> _SveMaskWrapper<_Bits, _Np> __mask, +		     const bool* __mem) 
noexcept
> +      {
> +	_SveMaskWrapper<_Bits, _Np> __r;
> +
> +	__execute_n_times<_Np>([&](auto __i) 
_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
> +	  if (__mask[__i])
> +	    __r._M_set(__i, __mem[__i]);
> +	  else
> +	    __r._M_set(__i, __merge[__i]);
> +	});
> +
> +	return __r;
> +      }
> +
> +    template <size_t _Bits, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr void
> +      _S_store(_SveMaskWrapper<_Bits, _Np> __v, bool* __mem) noexcept
> +      {
> +	__execute_n_times<_Np>([&](auto __i)
> +			      _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 
__mem[__i] = __v[__i]; });
> +      }
> +
> +    template <size_t _Bits, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr void
> +      _S_masked_store(const _SveMaskWrapper<_Bits, _Np> __v, bool* __mem,
> +		      const _SveMaskWrapper<_Bits, _Np> __k) noexcept
> +      {
> +	__execute_n_times<_Np>([&](auto __i) 
_GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
> +	  if (__k[__i])
> +	    __mem[__i] = __v[__i];
> +	});
> +      }
> +
> +    template <size_t _Bits, size_t _Np>
> +      _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
> +      _S_to_bits(_SveMaskWrapper<_Bits, _Np> __x)
> +      {
> +	_ULLong __r = 0;
> +	__execute_n_times<_Np>(
> +	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __r |=
> _ULLong(__x[__i]) << __i; }); +	return __r;

With -msve-vector-bits=1024 (or larger) this can fail (UB on shift) and lose 
information. This function is needed on conversion to fixed_size_simd_mask. 
However, simd_fixed_size.h isn't ready for size > 64 either. While looking 
deeper I found that you didn't adjust max_fixed_size in bits/simd.h. For now 
please bump max_fixed_size to 64 for __ARM_FEATURE_SVE_BITS >= 512. Don't go 
higher than 64, even though the spec is asking for it:

  Additionally, for every supported simd<T, Abi> (see 9.6.1), where Abi
  is an ABI tag that is not a specialization of simd_abi::fixed_size,
  N == simd<T, Abi>::size() shall be supported.

I.e. the existence of simd<char, simd_abi::__sve> with simd_size_v<char, 
simd_abi> == 128 (for -msve-vector-bits=1024) asks for fixed_size_simd<char, 
128>. For now we can't conform.

[...]

From my side, with the noted changes the patch is ready for merging. 
@Jonathan, any chance for a green light before GCC 14.1?
Jonathan Wakely March 27, 2024, 9:50 a.m. UTC | #2
On Fri, 8 Mar 2024 at 09:58, Matthias Kretz wrote:
>
> Hi,
>
> I applied and did extended testing on x86_64 (no regressions) and aarch64
> using qemu testing SVE 256, 512, and 1024. Looks good!
>
> While going through the applied patch I noticed a few style issues that I
> simply turned into a patch (attached).
>
[...]
>
> From my side, with the noted changes the patch is ready for merging.
> @Jonathan, any chance for a green light before GCC 14.1?

As discussed on IRC, please push the revised patch with your
suggestions incorporated (and post to the lists for posterity).

Thanks, everybody, for the patches and the thorough review.
Richard Sandiford March 27, 2024, 10:07 a.m. UTC | #3
Jonathan Wakely <jwakely@redhat.com> writes:
> On Fri, 8 Mar 2024 at 09:58, Matthias Kretz wrote:
>>
>> Hi,
>>
>> I applied and did extended testing on x86_64 (no regressions) and aarch64
>> using qemu testing SVE 256, 512, and 1024. Looks good!
>>
>> While going through the applied patch I noticed a few style issues that I
>> simply turned into a patch (attached).
>>
> [...]
>>
>> From my side, with the noted changes the patch is ready for merging.
>> @Jonathan, any chance for a green light before GCC 14.1?
>
> As discussed on IRC, please push the revised patch with your
> suggestions incorporated (and post to the lists for posterity).
>
> Thanks, everybody, for the patches and the thorough review.

I'm still worried about:

  #if _GLIBCXX_SIMD_HAVE_SVE
  constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS / 8;
  #else
  constexpr inline int __sve_vectorized_size_bytes = 0;
  #endif

and the direct use __ARM_FEATURE_SVE_BITS elsewhere, for the reasons
discussed here (including possible ODR problems):

  https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640037.html
  https://gcc.gnu.org/pipermail/gcc-patches/2024-January/643734.html

Logically the vector length should be a template parameter rather than
an invariant.  Has this been resolved?  If not, it feels like a blocker
to me (sorry).

Thanks,
Richard
Matthias Kretz March 27, 2024, 10:30 a.m. UTC | #4
On Wednesday, 27 March 2024 11:07:14 CET Richard Sandiford wrote:
> I'm still worried about:
> 
>   #if _GLIBCXX_SIMD_HAVE_SVE
>   constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS
> / 8; #else
>   constexpr inline int __sve_vectorized_size_bytes = 0;
>   #endif
> 
> and the direct use __ARM_FEATURE_SVE_BITS elsewhere, for the reasons
> discussed here (including possible ODR problems):
> 
>   https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640037.html
>   https://gcc.gnu.org/pipermail/gcc-patches/2024-January/643734.html
> 
> Logically the vector length should be a template parameter rather than
> an invariant.  Has this been resolved?  If not, it feels like a blocker
> to me (sorry).

The vector length is always a template parameter to all user-facing API. Some 
examples

1. on aarch64 the following is independent of SVE flags (and status quo):

  simd<float> is an alias for
  simd<float, simd_abi::_VecBuiltin<16>

  fixed_size_simd<float, 4> is supposed to be ABI-stable anyway (passed via
  the stack, alignof == sizeof).

2. with -msve-vector-bits=512:

  native_simd<float> is an alias for
  simd<float, simd_abi::_SveAbi<64, 64>>

  simd<float, simd_abi::deduce_t<float, 4>> is an alias for
  simd<float, simd_abi::_SveAbi<16, 64>>

3. with -msve-vector-bits=256: 

  native_simd<float> is an alias for
  simd<float, simd_abi::_SveAbi<32, 32>>

  simd<float, simd_abi::deduce_t<float, 4>> is an alias for
  simd<float, simd_abi::_SveAbi<16, 32>>

Implementation functions are either [[gnu::always_inline]] or tagged with the 
ABI tag type and the __odr_helper template argument (to ensure not-inlined 
inline functions have unique names).

Does that make __ARM_FEATURE_SVE_BITS usage indirect enough?

Also for context, please consider that this is std::*experimental*::simd. The 
underlying ISO document will likely get retracted at some point and the whole 
API and implementation (hopefully) superseded by C++26. The main purpose of 
the spec and implementation is to gather experience.

Best,
  Matthias
Richard Sandiford March 27, 2024, 12:13 p.m. UTC | #5
Matthias Kretz <m.kretz@gsi.de> writes:
> On Wednesday, 27 March 2024 11:07:14 CET Richard Sandiford wrote:
>> I'm still worried about:
>> 
>>   #if _GLIBCXX_SIMD_HAVE_SVE
>>   constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS
>> / 8; #else
>>   constexpr inline int __sve_vectorized_size_bytes = 0;
>>   #endif
>> 
>> and the direct use __ARM_FEATURE_SVE_BITS elsewhere, for the reasons
>> discussed here (including possible ODR problems):
>> 
>>   https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640037.html
>>   https://gcc.gnu.org/pipermail/gcc-patches/2024-January/643734.html
>> 
>> Logically the vector length should be a template parameter rather than
>> an invariant.  Has this been resolved?  If not, it feels like a blocker
>> to me (sorry).
>
> The vector length is always a template parameter to all user-facing API. Some 
> examples
>
> 1. on aarch64 the following is independent of SVE flags (and status quo):
>
>   simd<float> is an alias for
>   simd<float, simd_abi::_VecBuiltin<16>
>
>   fixed_size_simd<float, 4> is supposed to be ABI-stable anyway (passed via
>   the stack, alignof == sizeof).
>
> 2. with -msve-vector-bits=512:
>
>   native_simd<float> is an alias for
>   simd<float, simd_abi::_SveAbi<64, 64>>
>
>   simd<float, simd_abi::deduce_t<float, 4>> is an alias for
>   simd<float, simd_abi::_SveAbi<16, 64>>
>
> 3. with -msve-vector-bits=256: 
>
>   native_simd<float> is an alias for
>   simd<float, simd_abi::_SveAbi<32, 32>>
>
>   simd<float, simd_abi::deduce_t<float, 4>> is an alias for
>   simd<float, simd_abi::_SveAbi<16, 32>>
>
> Implementation functions are either [[gnu::always_inline]] or tagged with the 
> ABI tag type and the __odr_helper template argument (to ensure not-inlined 
> inline functions have unique names).

Ah, thanks for the explanation.  I think the global native_float alias
is problematic for reasons that you touched on in your later message.
I'll reply more about that there.  But in other respects this looks good.

> Does that make __ARM_FEATURE_SVE_BITS usage indirect enough?

In principle, the only use of __ARM_FEATURE_SVE_BITS should be to determine
the definition of native_simd (with the caveats above).  But current
GCC restrictions might make that impractical.

> Also for context, please consider that this is std::*experimental*::simd. The 
> underlying ISO document will likely get retracted at some point and the whole 
> API and implementation (hopefully) superseded by C++26. The main purpose of 
> the spec and implementation is to gather experience.

Ah, ok.  If this is a deliberate experiment for evidence-gathering
purposes, rather than a long-term commitment, then I agree the barrier
should be lower.

So yeah, I'll withdraw my objection.  I've no problem with this going
into GCC 14 on the basis above.  Thanks again to you and Srinivas for
working on this.

Richard
Jonathan Wakely March 27, 2024, 12:47 p.m. UTC | #6
On Wed, 27 Mar 2024 at 12:13, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Matthias Kretz <m.kretz@gsi.de> writes:
> > On Wednesday, 27 March 2024 11:07:14 CET Richard Sandiford wrote:
> >> I'm still worried about:
> >>
> >>   #if _GLIBCXX_SIMD_HAVE_SVE
> >>   constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS
> >> / 8; #else
> >>   constexpr inline int __sve_vectorized_size_bytes = 0;
> >>   #endif
> >>
> >> and the direct use __ARM_FEATURE_SVE_BITS elsewhere, for the reasons
> >> discussed here (including possible ODR problems):
> >>
> >>   https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640037.html
> >>   https://gcc.gnu.org/pipermail/gcc-patches/2024-January/643734.html
> >>
> >> Logically the vector length should be a template parameter rather than
> >> an invariant.  Has this been resolved?  If not, it feels like a blocker
> >> to me (sorry).
> >
> > The vector length is always a template parameter to all user-facing API. Some
> > examples
> >
> > 1. on aarch64 the following is independent of SVE flags (and status quo):
> >
> >   simd<float> is an alias for
> >   simd<float, simd_abi::_VecBuiltin<16>
> >
> >   fixed_size_simd<float, 4> is supposed to be ABI-stable anyway (passed via
> >   the stack, alignof == sizeof).
> >
> > 2. with -msve-vector-bits=512:
> >
> >   native_simd<float> is an alias for
> >   simd<float, simd_abi::_SveAbi<64, 64>>
> >
> >   simd<float, simd_abi::deduce_t<float, 4>> is an alias for
> >   simd<float, simd_abi::_SveAbi<16, 64>>
> >
> > 3. with -msve-vector-bits=256:
> >
> >   native_simd<float> is an alias for
> >   simd<float, simd_abi::_SveAbi<32, 32>>
> >
> >   simd<float, simd_abi::deduce_t<float, 4>> is an alias for
> >   simd<float, simd_abi::_SveAbi<16, 32>>
> >
> > Implementation functions are either [[gnu::always_inline]] or tagged with the
> > ABI tag type and the __odr_helper template argument (to ensure not-inlined
> > inline functions have unique names).
>
> Ah, thanks for the explanation.  I think the global native_float alias
> is problematic for reasons that you touched on in your later message.
> I'll reply more about that there.  But in other respects this looks good.
>
> > Does that make __ARM_FEATURE_SVE_BITS usage indirect enough?
>
> In principle, the only use of __ARM_FEATURE_SVE_BITS should be to determine
> the definition of native_simd (with the caveats above).  But current
> GCC restrictions might make that impractical.
>
> > Also for context, please consider that this is std::*experimental*::simd. The
> > underlying ISO document will likely get retracted at some point and the whole
> > API and implementation (hopefully) superseded by C++26. The main purpose of
> > the spec and implementation is to gather experience.
>
> Ah, ok.  If this is a deliberate experiment for evidence-gathering
> purposes, rather than a long-term commitment, then I agree the barrier
> should be lower.

Yes, that's definitely what this code is for. The more feedback and
impl-experience we can get now with the std::experimental::simd
version, the better std::simd will be when that happens.

In practice, we probably won't ever actually remove the
<experimental/simd> header even when the experiment is over (e.g. we
still have <tr1/memory> with std::tr1::shared_ptr!), but we are likely
to consider it unmaintained and deprecated once it's superseded by
std::simd.

> So yeah, I'll withdraw my objection.  I've no problem with this going
> into GCC 14 on the basis above.  Thanks again to you and Srinivas for
> working on this.
>
> Richard
>
Matthias Kretz March 27, 2024, 2:18 p.m. UTC | #7
On Wednesday, 27 March 2024 10:50:41 CET Jonathan Wakely wrote:
> As discussed on IRC, please push the revised patch with your
> suggestions incorporated (and post to the lists for posterity).

The patch as pushed is attached.
diff mbox series

Patch

diff --git a/libstdc++-v3/include/Makefile.am b/libstdc++-v3/include/Makefile.am
index 6209f390e08..1170cb047a6 100644
--- a/libstdc++-v3/include/Makefile.am
+++ b/libstdc++-v3/include/Makefile.am
@@ -826,6 +826,7 @@  experimental_bits_headers = \
 	${experimental_bits_srcdir}/simd_neon.h \
 	${experimental_bits_srcdir}/simd_ppc.h \
 	${experimental_bits_srcdir}/simd_scalar.h \
+	${experimental_bits_srcdir}/simd_sve.h \
 	${experimental_bits_srcdir}/simd_x86.h \
 	${experimental_bits_srcdir}/simd_x86_conversions.h \
 	${experimental_bits_srcdir}/string_view.tcc \
diff --git a/libstdc++-v3/include/Makefile.in b/libstdc++-v3/include/Makefile.in
index 596fa0d2390..bc44582a2da 100644
--- a/libstdc++-v3/include/Makefile.in
+++ b/libstdc++-v3/include/Makefile.in
@@ -1172,6 +1172,7 @@  experimental_bits_headers = \
 	${experimental_bits_srcdir}/simd_neon.h \
 	${experimental_bits_srcdir}/simd_ppc.h \
 	${experimental_bits_srcdir}/simd_scalar.h \
+	${experimental_bits_srcdir}/simd_sve.h \
 	${experimental_bits_srcdir}/simd_x86.h \
 	${experimental_bits_srcdir}/simd_x86_conversions.h \
 	${experimental_bits_srcdir}/string_view.tcc \
diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h
index 90523ea57dc..d274cd740fe 100644
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -39,12 +39,16 @@ 
 #include <functional>
 #include <iosfwd>
 #include <utility>
+#include <algorithm>
 
 #if _GLIBCXX_SIMD_X86INTRIN
 #include <x86intrin.h>
 #elif _GLIBCXX_SIMD_HAVE_NEON
 #include <arm_neon.h>
 #endif
+#if _GLIBCXX_SIMD_HAVE_SVE
+#include <arm_sve.h>
+#endif
 
 /** @ingroup ts_simd
  * @{
@@ -83,6 +87,12 @@  using __m512d [[__gnu__::__vector_size__(64)]] = double;
 using __m512i [[__gnu__::__vector_size__(64)]] = long long;
 #endif
 
+#if _GLIBCXX_SIMD_HAVE_SVE
+constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS / 8;
+#else
+constexpr inline int __sve_vectorized_size_bytes = 0;
+#endif 
+
 namespace simd_abi {
 // simd_abi forward declarations {{{
 // implementation details:
@@ -108,6 +118,9 @@  template <int _UsedBytes>
 template <int _UsedBytes>
   struct _VecBltnBtmsk;
 
+template <int _UsedBytes, int _TotalBytes = __sve_vectorized_size_bytes>
+  struct _SveAbi;
+
 template <typename _Tp, int _Np>
   using _VecN = _VecBuiltin<sizeof(_Tp) * _Np>;
 
@@ -123,6 +136,9 @@  template <int _UsedBytes = 64>
 template <int _UsedBytes = 16>
   using _Neon = _VecBuiltin<_UsedBytes>;
 
+template <int _UsedBytes = __sve_vectorized_size_bytes>
+  using _Sve = _SveAbi<_UsedBytes, __sve_vectorized_size_bytes>;
+
 // implementation-defined:
 using __sse = _Sse<>;
 using __avx = _Avx<>;
@@ -130,6 +146,7 @@  using __avx512 = _Avx512<>;
 using __neon = _Neon<>;
 using __neon128 = _Neon<16>;
 using __neon64 = _Neon<8>;
+using __sve = _Sve<>;
 
 // standard:
 template <typename _Tp, size_t _Np, typename...>
@@ -250,6 +267,8 @@  constexpr inline bool __support_neon_float =
   false;
 #endif
 
+constexpr inline bool __have_sve = _GLIBCXX_SIMD_HAVE_SVE;
+
 #ifdef _ARCH_PWR10
 constexpr inline bool __have_power10vec = true;
 #else
@@ -356,12 +375,13 @@  namespace __detail
 		 | (__have_avx512vnni         << 27)
 		 | (__have_avx512vpopcntdq    << 28)
 		 | (__have_avx512vp2intersect << 29);
-    else if constexpr (__have_neon)
+    else if constexpr (__have_neon || __have_sve)
       return __have_neon
 	       | (__have_neon_a32 << 1)
 	       | (__have_neon_a64 << 2)
 	       | (__have_neon_a64 << 2)
-	       | (__support_neon_float << 3);
+	       | (__support_neon_float << 3)
+         | (__have_sve << 4);
     else if constexpr (__have_power_vmx)
       return __have_power_vmx
 	       | (__have_power_vsx  << 1)
@@ -733,6 +753,16 @@  template <typename _Abi>
     return _Bytes <= 16 && is_same_v<simd_abi::_VecBuiltin<_Bytes>, _Abi>;
   }
 
+// }}}
+// __is_sve_abi {{{
+template <typename _Abi>
+  constexpr bool
+  __is_sve_abi()
+  {
+    constexpr auto _Bytes = __abi_bytes_v<_Abi>;
+    return _Bytes <= __sve_vectorized_size_bytes && is_same_v<simd_abi::_Sve<_Bytes>, _Abi>;
+  }
+
 // }}}
 // __make_dependent_t {{{
 template <typename, typename _Up>
@@ -998,6 +1028,9 @@  template <typename _Tp>
 template <typename _Tp>
   using _SimdWrapper64 = _SimdWrapper<_Tp, 64 / sizeof(_Tp)>;
 
+template <typename _Tp, size_t _Width>
+  struct _SveSimdWrapper;
+
 // }}}
 // __is_simd_wrapper {{{
 template <typename _Tp>
@@ -2858,6 +2891,8 @@  template <typename _Tp>
     constexpr size_t __bytes = __vectorized_sizeof<_Tp>();
     if constexpr (__bytes == sizeof(_Tp))
       return static_cast<scalar*>(nullptr);
+    else if constexpr (__have_sve)
+      return  static_cast<_SveAbi<__sve_vectorized_size_bytes>*>(nullptr);
     else if constexpr (__have_avx512vl || (__have_avx512f && __bytes == 64))
       return static_cast<_VecBltnBtmsk<__bytes>*>(nullptr);
     else
@@ -2951,6 +2986,9 @@  template <typename _Tp, typename _Abi = simd_abi::__default_abi<_Tp>>
 template <typename _Tp, size_t _Np, typename = void>
   struct __deduce_impl;
 
+template <typename _Tp, size_t _Np, typename = void>
+  struct __no_sve_deduce_impl;
+
 namespace simd_abi {
 /**
  * @tparam _Tp   The requested `value_type` for the elements.
@@ -2965,6 +3003,12 @@  template <typename _Tp, size_t _Np, typename...>
 
 template <typename _Tp, size_t _Np, typename... _Abis>
   using deduce_t = typename deduce<_Tp, _Np, _Abis...>::type;
+
+template <typename _Tp, size_t _Np, typename...>
+  struct __no_sve_deduce : __no_sve_deduce_impl<_Tp, _Np> {};
+
+template <typename _Tp, size_t _Np, typename... _Abis>
+  using __no_sve_deduce_t = typename __no_sve_deduce<_Tp, _Np, _Abis...>::type;
 } // namespace simd_abi
 
 // }}}2
@@ -2974,13 +3018,23 @@  template <typename _Tp, typename _V, typename = void>
 
 template <typename _Tp, typename _Up, typename _Abi>
   struct rebind_simd<_Tp, simd<_Up, _Abi>,
-		     void_t<simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>
-  { using type = simd<_Tp, simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>; };
+		     void_t<std::conditional_t<!__is_sve_abi<_Abi>(),
+    simd_abi::__no_sve_deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>,
+    simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>>
+  { using type = simd<_Tp, std::conditional_t<!__is_sve_abi<_Abi>(),
+                  simd_abi::__no_sve_deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>,
+                  simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>;
+  };
 
 template <typename _Tp, typename _Up, typename _Abi>
   struct rebind_simd<_Tp, simd_mask<_Up, _Abi>,
-		     void_t<simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>
-  { using type = simd_mask<_Tp, simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>; };
+		     void_t<std::conditional_t<!__is_sve_abi<_Abi>(),
+    simd_abi::__no_sve_deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>,
+    simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>>
+  { using type = simd_mask<_Tp, std::conditional_t<!__is_sve_abi<_Abi>(),
+                  simd_abi::__no_sve_deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>,
+                  simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>;
+  };
 
 template <typename _Tp, typename _V>
   using rebind_simd_t = typename rebind_simd<_Tp, _V>::type;
@@ -3243,7 +3297,7 @@  template <typename _Tp, typename _Up, typename _Ap>
     else if constexpr (_Tp::size() == 1)
       return __x[0];
     else if constexpr (sizeof(_Tp) == sizeof(__x)
-		       && !__is_fixed_size_abi_v<_Ap>)
+		       && !__is_fixed_size_abi_v<_Ap> && !__is_sve_abi<_Ap>())
       return {__private_init,
 	      __vector_bitcast<typename _Tp::value_type, _Tp::size()>(
 		_Ap::_S_masked(__data(__x))._M_data)};
@@ -4004,18 +4058,29 @@  template <typename _V, typename _Ap,
   split(const simd<typename _V::value_type, _Ap>& __x)
   {
     using _Tp = typename _V::value_type;
+
+    auto __gen_fallback = [&]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+		 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+		   return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+			     { return __x[__i * _V::size() + __j]; });
+		 });
+    };
+
     if constexpr (_Parts == 1)
       {
 	return {simd_cast<_V>(__x)};
       }
     else if (__x._M_is_constprop())
       {
-	return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
-		 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
-		   return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
-			     { return __x[__i * _V::size() + __j]; });
-		 });
+  return __gen_fallback();
       }
+#if _GLIBCXX_SIMD_HAVE_SVE
+      else if constexpr(__is_sve_abi<_Ap>)
+      {
+  return __gen_fallback();
+      }
+#endif
     else if constexpr (
       __is_fixed_size_abi_v<_Ap>
       && (is_same_v<typename _V::abi_type, simd_abi::scalar>
@@ -4115,7 +4180,8 @@  template <size_t... _Sizes, typename _Tp, typename _Ap, typename>
     constexpr size_t _N0 = _SL::template _S_at<0>();
     using _V = __deduced_simd<_Tp, _N0>;
 
-    if (__x._M_is_constprop())
+    auto __gen_fallback = [&]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA 
+    {
       return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>(
 	       [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
 		 using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
@@ -4124,6 +4190,14 @@  template <size_t... _Sizes, typename _Tp, typename _Ap, typename>
 			  return __x[__offset + __j];
 			});
 	       });
+    };
+
+    if (__x._M_is_constprop())
+      __gen_fallback();
+#if _GLIBCXX_SIMD_HAVE_SVE
+    else if constexpr (__have_sve)
+      __gen_fallback();
+#endif
     else if constexpr (_Np == _N0)
       {
 	static_assert(sizeof...(_Sizes) == 1);
@@ -4510,8 +4584,10 @@  template <template <int> class _A0, template <int> class... _Rest>
 	    // 1. The ABI tag is valid for _Tp
 	    // 2. The storage overhead is no more than padding to fill the next
 	    //    power-of-2 number of bytes
-	    if constexpr (_A0<_Bytes>::template _S_is_valid_v<
-			    _Tp> && __fullsize / 2 < _Np)
+	    if constexpr (_A0<_Bytes>::template _S_is_valid_v<_Tp> 
+            && ((__is_sve_abi<_A0<_Bytes>>() && __have_sve && (_Np <= __sve_vectorized_size_bytes/sizeof(_Tp)))
+                || (__fullsize / 2 < _Np))
+        )
 	      return typename __decay_abi<_A0<_Bytes>>::type{};
 	    else
 	      {
@@ -4536,7 +4612,13 @@  template <template <int> class _A0, template <int> class... _Rest>
 // the following lists all native ABIs, which makes them accessible to
 // simd_abi::deduce and select_best_vector_type_t (for fixed_size). Order
 // matters: Whatever comes first has higher priority.
-using _AllNativeAbis = _AbiList<simd_abi::_VecBltnBtmsk, simd_abi::_VecBuiltin,
+using _AllNativeAbis = _AbiList<
+#if _GLIBCXX_SIMD_HAVE_SVE
+  simd_abi::_SveAbi,
+#endif
+   simd_abi::_VecBltnBtmsk, simd_abi::_VecBuiltin, __scalar_abi_wrapper>;
+
+using _NoSveAllNativeAbis = _AbiList<simd_abi::_VecBltnBtmsk, simd_abi::_VecBuiltin,
 				__scalar_abi_wrapper>;
 
 // valid _SimdTraits specialization {{{1
@@ -4551,18 +4633,35 @@  template <typename _Tp, size_t _Np>
     _Tp, _Np, enable_if_t<_AllNativeAbis::template _S_has_valid_abi<_Tp, _Np>>>
   { using type = _AllNativeAbis::_FirstValidAbi<_Tp, _Np>; };
 
+template <typename _Tp, size_t _Np>
+  struct __no_sve_deduce_impl<
+    _Tp, _Np, enable_if_t<_NoSveAllNativeAbis::template _S_has_valid_abi<_Tp, _Np>>>
+  { using type = _NoSveAllNativeAbis::_FirstValidAbi<_Tp, _Np>; };
+
 // fall back to fixed_size only if scalar and native ABIs don't match
 template <typename _Tp, size_t _Np, typename = void>
   struct __deduce_fixed_size_fallback {};
 
+template <typename _Tp, size_t _Np, typename = void>
+  struct __no_sve_deduce_fixed_size_fallback {};
+
 template <typename _Tp, size_t _Np>
   struct __deduce_fixed_size_fallback<_Tp, _Np,
     enable_if_t<simd_abi::fixed_size<_Np>::template _S_is_valid_v<_Tp>>>
   { using type = simd_abi::fixed_size<_Np>; };
 
+template <typename _Tp, size_t _Np>
+  struct __no_sve_deduce_fixed_size_fallback<_Tp, _Np,
+    enable_if_t<simd_abi::fixed_size<_Np>::template _S_is_valid_v<_Tp>>>
+  { using type = simd_abi::fixed_size<_Np>; };
+
 template <typename _Tp, size_t _Np, typename>
   struct __deduce_impl : public __deduce_fixed_size_fallback<_Tp, _Np> {};
 
+template <typename _Tp, size_t _Np, typename>
+  struct __no_sve_deduce_impl : public __no_sve_deduce_fixed_size_fallback<_Tp, _Np> {};
+
+
 //}}}1
 /// @endcond
 
diff --git a/libstdc++-v3/include/experimental/bits/simd_builtin.h b/libstdc++-v3/include/experimental/bits/simd_builtin.h
index 6ccc2fcec9c..bb5d4e3d1c5 100644
--- a/libstdc++-v3/include/experimental/bits/simd_builtin.h
+++ b/libstdc++-v3/include/experimental/bits/simd_builtin.h
@@ -1614,7 +1614,7 @@  template <typename _Abi, typename>
 	    static_assert(_UW_size <= _TV_size);
 	    using _UW = _SimdWrapper<_Up, _UW_size>;
 	    using _UV = __vector_type_t<_Up, _UW_size>;
-	    using _UAbi = simd_abi::deduce_t<_Up, _UW_size>;
+	    using _UAbi = simd_abi::__no_sve_deduce_t<_Up, _UW_size>;
 	    if constexpr (_UW_size == _TV_size) // one convert+store
 	      {
 		const _UW __converted = __convert<_UW>(__v);
@@ -1857,7 +1857,7 @@  template <typename _Abi, typename>
 	    else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
 					 plus<>>)
 	      {
-		using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
+		using _Ap = simd_abi::__no_sve_deduce_t<_Tp, __full_size>;
 		return _Ap::_SimdImpl::_S_reduce(
 		  simd<_Tp, _Ap>(__private_init,
 				 _Abi::_S_masked(__as_vector(__x))),
@@ -1866,7 +1866,7 @@  template <typename _Abi, typename>
 	    else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
 					 multiplies<>>)
 	      {
-		using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
+		using _Ap = simd_abi::__no_sve_deduce_t<_Tp, __full_size>;
 		using _TW = _SimdWrapper<_Tp, __full_size>;
 		_GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full
 		  = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector();
@@ -1882,7 +1882,7 @@  template <typename _Abi, typename>
 	      }
 	    else if constexpr (_Np & 1)
 	      {
-		using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>;
+		using _Ap = simd_abi::__no_sve_deduce_t<_Tp, _Np - 1>;
 		return __binary_op(
 		  simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce(
 		    simd<_Tp, _Ap>(
@@ -1936,7 +1936,7 @@  template <typename _Abi, typename>
 	  {
 	    static_assert(sizeof(__x) > __min_vector_size<_Tp>);
 	    static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2
-	    using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>;
+	    using _Ap = simd_abi::__no_sve_deduce_t<_Tp, _Np / 2>;
 	    using _V = simd<_Tp, _Ap>;
 	    return _Ap::_SimdImpl::_S_reduce(
 	      __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))),
@@ -2376,6 +2376,16 @@  template <typename _Abi, typename>
       _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np>
       _S_fpclassify(_SimdWrapper<_Tp, _Np> __x)
       {
+		if constexpr(__have_sve)
+		{
+		__fixed_size_storage_t<int, _Np> __r{};
+		__execute_n_times<_Np>(
+			[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+				__r._M_set(__i, std::fpclassify(__x[__i]));
+			});
+		return __r;
+		}
+		else {
 	using _I = __int_for_sizeof_t<_Tp>;
 	const auto __xn
 	  = __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x)));
@@ -2453,6 +2463,7 @@  template <typename _Abi, typename>
 					      })};
 	else
 	  __assert_unreachable<_Tp>();
+		}
       }
 
     // _S_increment & _S_decrement{{{2
@@ -2785,11 +2796,23 @@  template <typename _Abi, typename>
 	      return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits());
 	  }
 	else
+	{
+		if constexpr(__is_sve_abi<_UAbi>())
+		{
+			simd_mask<_Tp> __r(false);
+			constexpr size_t __min_size = std::min(__r.size(), __x.size());
+			__execute_n_times<__min_size>(
+			[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  			__r[__i] = __x[__i];
+			});
+			return __data(__r);			
+		}
+		else 
 	  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
 						       _S_size<_Tp>>(
 	    __data(__x));
       }
-
+	}
     // }}}
     // _S_masked_load {{{2
     template <typename _Tp, size_t _Np>
diff --git a/libstdc++-v3/include/experimental/bits/simd_converter.h b/libstdc++-v3/include/experimental/bits/simd_converter.h
index 3160e251632..b233d2c70a5 100644
--- a/libstdc++-v3/include/experimental/bits/simd_converter.h
+++ b/libstdc++-v3/include/experimental/bits/simd_converter.h
@@ -28,6 +28,18 @@ 
 #if __cplusplus >= 201703L
 
 _GLIBCXX_SIMD_BEGIN_NAMESPACE
+
+template <typename _Arg, typename _Ret, typename _To, size_t _Np>
+_Ret __converter_fallback(_Arg __a)
+  {
+  _Ret __ret{};
+  __execute_n_times<_Np>(
+      [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+        __ret._M_set(__i, static_cast<_To>(__a[__i]));
+    });
+  return __ret;
+  }
+
 // _SimdConverter scalar -> scalar {{{
 template <typename _From, typename _To>
   struct _SimdConverter<_From, simd_abi::scalar, _To, simd_abi::scalar,
@@ -56,14 +68,16 @@  template <typename _From, typename _To, typename _Abi>
   };
 
 // }}}
-// _SimdConverter "native 1" -> "native 2" {{{
+// _SimdConverter "native non-sve 1" -> "native non-sve 2" {{{
 template <typename _From, typename _To, typename _AFrom, typename _ATo>
   struct _SimdConverter<
     _From, _AFrom, _To, _ATo,
     enable_if_t<!disjunction_v<
       __is_fixed_size_abi<_AFrom>, __is_fixed_size_abi<_ATo>,
       is_same<_AFrom, simd_abi::scalar>, is_same<_ATo, simd_abi::scalar>,
-      conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>>>>
+      conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>>
+	  && !(__is_sve_abi<_AFrom>() || __is_sve_abi<_ATo>())
+	  >>
   {
     using _Arg = typename _AFrom::template __traits<_From>::_SimdMember;
     using _Ret = typename _ATo::template __traits<_To>::_SimdMember;
@@ -75,6 +89,26 @@  template <typename _From, typename _To, typename _AFrom, typename _ATo>
       { return __vector_convert<_V>(__a, __more...); }
   };
 
+// }}}
+// _SimdConverter "native 1" -> "native 2" {{{
+template <typename _From, typename _To, typename _AFrom, typename _ATo>
+  struct _SimdConverter<
+    _From, _AFrom, _To, _ATo,
+    enable_if_t<!disjunction_v<
+      __is_fixed_size_abi<_AFrom>, __is_fixed_size_abi<_ATo>,
+      is_same<_AFrom, simd_abi::scalar>, is_same<_ATo, simd_abi::scalar>,
+      conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>>
+	  && (__is_sve_abi<_AFrom>() || __is_sve_abi<_ATo>())
+	  >>
+  {
+    using _Arg = typename _AFrom::template __traits<_From>::_SimdMember;
+    using _Ret = typename _ATo::template __traits<_To>::_SimdMember;
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr _Ret
+    operator()(_Arg __x) const noexcept
+    { return __converter_fallback<_Arg, _Ret, _To, simd_size_v<_To, _ATo>>(__x); }
+  };
+
 // }}}
 // _SimdConverter scalar -> fixed_size<1> {{{1
 template <typename _From, typename _To>
@@ -111,6 +145,10 @@  template <typename _From, typename _To, int _Np>
       if constexpr (is_same_v<_From, _To>)
 	return __x;
 
+	  // fallback to sequential when sve is available
+	  else if constexpr (__have_sve)
+	return __converter_fallback<_Arg, _Ret, _To, _Np>(__x);
+
       // special case (optimize) int signedness casts
       else if constexpr (sizeof(_From) == sizeof(_To)
 			 && is_integral_v<_From> && is_integral_v<_To>)
@@ -275,11 +313,14 @@  template <typename _From, typename _Ap, typename _To, int _Np>
       "_SimdConverter to fixed_size only works for equal element counts");
 
     using _Ret = __fixed_size_storage_t<_To, _Np>;
+	using _Arg = typename _SimdTraits<_From, _Ap>::_SimdMember;
 
     _GLIBCXX_SIMD_INTRINSIC constexpr _Ret
-    operator()(typename _SimdTraits<_From, _Ap>::_SimdMember __x) const noexcept
+    operator()(_Arg __x) const noexcept
     {
-      if constexpr (_Ret::_S_tuple_size == 1)
+	  if constexpr (__have_sve)
+	return __converter_fallback<_Arg, _Ret, _To, _Np>(__x);
+      else if constexpr (_Ret::_S_tuple_size == 1)
 	return {__vector_convert<typename _Ret::_FirstType::_BuiltinType>(__x)};
       else
 	{
@@ -316,12 +357,14 @@  template <typename _From, int _Np, typename _To, typename _Ap>
       "_SimdConverter to fixed_size only works for equal element counts");
 
     using _Arg = __fixed_size_storage_t<_From, _Np>;
+	using _Ret = typename _SimdTraits<_To, _Ap>::_SimdMember;
 
     _GLIBCXX_SIMD_INTRINSIC constexpr
-      typename _SimdTraits<_To, _Ap>::_SimdMember
-      operator()(const _Arg& __x) const noexcept
+      _Ret operator()(const _Arg& __x) const noexcept
     {
-      if constexpr (_Arg::_S_tuple_size == 1)
+	  if constexpr(__have_sve)
+	return __converter_fallback<_Arg, _Ret, _To, _Np>(__x);
+      else if constexpr (_Arg::_S_tuple_size == 1)
 	return __vector_convert<__vector_type_t<_To, _Np>>(__x.first);
       else if constexpr (_Arg::_S_is_homogeneous)
 	return __call_with_n_evaluations<_Arg::_S_tuple_size>(
diff --git a/libstdc++-v3/include/experimental/bits/simd_detail.h b/libstdc++-v3/include/experimental/bits/simd_detail.h
index 1fb77866bb2..52fdf7149bf 100644
--- a/libstdc++-v3/include/experimental/bits/simd_detail.h
+++ b/libstdc++-v3/include/experimental/bits/simd_detail.h
@@ -61,6 +61,11 @@ 
 #else
 #define _GLIBCXX_SIMD_HAVE_NEON_A64 0
 #endif
+#if (__ARM_FEATURE_SVE_BITS > 0 && __ARM_FEATURE_SVE_VECTOR_OPERATORS==1)
+#define _GLIBCXX_SIMD_HAVE_SVE 1
+#else
+#define _GLIBCXX_SIMD_HAVE_SVE 0
+#endif
 //}}}
 // x86{{{
 #ifdef __MMX__
@@ -267,7 +272,7 @@ 
 #define _GLIBCXX_SIMD_IS_UNLIKELY(__x) __builtin_expect(__x, 0)
 #define _GLIBCXX_SIMD_IS_LIKELY(__x) __builtin_expect(__x, 1)
 
-#if __STRICT_ANSI__ || defined __clang__
+#if _GLIBCXX_SIMD_HAVE_SVE || __STRICT_ANSI__ || defined __clang__
 #define _GLIBCXX_SIMD_CONSTEXPR
 #define _GLIBCXX_SIMD_USE_CONSTEXPR_API const
 #else
diff --git a/libstdc++-v3/include/experimental/bits/simd_math.h b/libstdc++-v3/include/experimental/bits/simd_math.h
index c91f05fceb3..0e62a7f1650 100644
--- a/libstdc++-v3/include/experimental/bits/simd_math.h
+++ b/libstdc++-v3/include/experimental/bits/simd_math.h
@@ -652,6 +652,18 @@  template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
 	(*__exp)[0] = __tmp;
 	return __r;
       }
+      else if constexpr (__is_sve_abi<_Abi>())
+      {
+        simd<_Tp, _Abi> __r;
+        __execute_n_times<simd_size_v<_Tp, _Abi>>(
+        [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+            	int __tmp;
+              const auto __ri = std::frexp(__x[__i], &__tmp);
+              (*__exp)[__i] = __tmp;
+              __r[__i] = __ri;
+        });
+        return __r;
+      }
     else if constexpr (__is_fixed_size_abi_v<_Abi>)
       return {__private_init, _Abi::_SimdImpl::_S_frexp(__data(__x), __data(*__exp))};
 #if _GLIBCXX_SIMD_X86INTRIN
@@ -1135,7 +1147,7 @@  _GLIBCXX_SIMD_CVTING2(hypot)
 	    _GLIBCXX_SIMD_USE_CONSTEXPR_API _V __inf(__infinity_v<_Tp>);
 
 #ifndef __FAST_MATH__
-	    if constexpr (_V::size() > 1 && __have_neon && !__have_neon_a32)
+	    if constexpr (_V::size() > 1 && (__is_neon_abi<_Abi>() && __have_neon && !__have_neon_a32))
 	      { // With ARMv7 NEON, we have no subnormals and must use slightly
 		// different strategy
 		const _V __hi_exp = __hi & __inf;
diff --git a/libstdc++-v3/include/experimental/bits/simd_sve.h b/libstdc++-v3/include/experimental/bits/simd_sve.h
new file mode 100644
index 00000000000..123242a3a62
--- /dev/null
+++ b/libstdc++-v3/include/experimental/bits/simd_sve.h
@@ -0,0 +1,1863 @@ 
+// Simd SVE specific implementations -*- C++ -*-
+
+// Copyright The GNU Toolchain Authors.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+// <http://www.gnu.org/licenses/>.
+
+
+#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_SVE_H_
+#define _GLIBCXX_EXPERIMENTAL_SIMD_SVE_H_
+
+#if __cplusplus >= 201703L
+
+#if !_GLIBCXX_SIMD_HAVE_SVE
+#error "simd_sve.h may only be included when SVE on ARM is available"
+#endif
+
+_GLIBCXX_SIMD_BEGIN_NAMESPACE
+
+// Helper function mapping to sve supported types
+template <typename _Tp>
+  constexpr auto
+  __get_sve_value_type()
+  {
+    if constexpr (is_integral_v<_Tp>)
+      {
+	if constexpr (is_signed_v<_Tp>)
+	  {
+	    if constexpr (sizeof(_Tp) == 1)
+	      return int8_t{};
+	    else if constexpr (sizeof(_Tp) == 2)
+	      return int16_t{};
+	    else if constexpr (sizeof(_Tp) == 4)
+	      return int32_t{};
+	    else if constexpr (sizeof(_Tp) == 8)
+	      return int64_t{};
+	    else
+	      return _Tp{};
+	  }
+	else
+	  {
+	    if constexpr (sizeof(_Tp) == 1)
+	      return uint8_t{};
+	    else if constexpr (sizeof(_Tp) == 2)
+	      return uint16_t{};
+	    else if constexpr (sizeof(_Tp) == 4)
+	      return uint32_t{};
+	    else if constexpr (sizeof(_Tp) == 8)
+	      return uint64_t{};
+	    else
+	      return _Tp{};
+	  }
+      }
+    else
+      {
+	if constexpr (is_floating_point_v<_Tp>)
+	  {
+	    if constexpr (sizeof(_Tp) == 4)
+	      return float32_t{};
+	    else if constexpr (sizeof(_Tp) == 8)
+	      return float64_t{};
+	    else
+	      return _Tp{};
+	  }
+      }
+  }
+
+template <typename _Tp>
+  using __get_sve_value_type_t = decltype(__get_sve_value_type<_Tp>());
+
+typedef svbool_t __sve_bool_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+template <typename _Tp, size_t _Np>
+  struct __sve_vector_type
+  {};
+
+template <typename _Tp, size_t _Np>
+  using __sve_vector_type_t = typename __sve_vector_type<_Tp, _Np>::type;
+
+template <size_t _Np>
+  struct __sve_vector_type<int8_t, _Np>
+  {
+    typedef svint8_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static __sve_vlst_type
+    __sve_broadcast(int8_t __dup)
+    { return svdup_s8(__dup); }
+
+    inline static __sve_bool_type
+    __sve_active_mask()
+    { return svwhilelt_b8(size_t(0), _Np); };
+
+    using type = __sve_vlst_type;
+  };
+
+template <size_t _Np>
+  struct __sve_vector_type<uint8_t, _Np>
+  {
+    typedef svuint8_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static __sve_vlst_type
+    __sve_broadcast(uint8_t __dup)
+    { return svdup_u8(__dup); }
+
+    inline static __sve_bool_type
+    __sve_active_mask()
+    { return svwhilelt_b8(size_t(0), _Np); };
+
+    using type = __sve_vlst_type;
+  };
+
+template <size_t _Np>
+  struct __sve_vector_type<int16_t, _Np>
+  {
+    typedef svint16_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static __sve_vlst_type
+    __sve_broadcast(int16_t __dup)
+    { return svdup_s16(__dup); }
+
+    inline static __sve_bool_type
+    __sve_active_mask()
+    { return svwhilelt_b16(size_t(0), _Np); };
+
+    using type = __sve_vlst_type;
+  };
+
+template <size_t _Np>
+  struct __sve_vector_type<uint16_t, _Np>
+  {
+    typedef svuint16_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static __sve_vlst_type
+    __sve_broadcast(uint16_t __dup)
+    { return svdup_u16(__dup); }
+
+    inline static __sve_bool_type
+    __sve_active_mask()
+    { return svwhilelt_b16(size_t(0), _Np); };
+
+    using type = __sve_vlst_type;
+  };
+
+template <size_t _Np>
+  struct __sve_vector_type<int32_t, _Np>
+  {
+    typedef svint32_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static __sve_vlst_type
+    __sve_broadcast(int32_t __dup)
+    { return svdup_s32(__dup); }
+
+    inline static __sve_bool_type
+    __sve_active_mask()
+    { return svwhilelt_b32(size_t(0), _Np); };
+
+    using type = __sve_vlst_type;
+  };
+
+template <size_t _Np>
+  struct __sve_vector_type<uint32_t, _Np>
+  {
+    typedef svuint32_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static __sve_vlst_type
+    __sve_broadcast(uint32_t __dup)
+    { return svdup_u32(__dup); }
+
+    inline static __sve_bool_type
+    __sve_active_mask()
+    { return svwhilelt_b32(size_t(0), _Np); };
+
+    using type = __sve_vlst_type;
+  };
+
+template <size_t _Np>
+  struct __sve_vector_type<int64_t, _Np>
+  {
+    typedef svint64_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static __sve_vlst_type
+    __sve_broadcast(int64_t __dup)
+    { return svdup_s64(__dup); }
+
+    inline static __sve_bool_type
+    __sve_active_mask()
+    { return svwhilelt_b64(size_t(0), _Np); };
+
+    using type = __sve_vlst_type;
+  };
+
+template <size_t _Np>
+  struct __sve_vector_type<uint64_t, _Np>
+  {
+    typedef svuint64_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static __sve_vlst_type
+    __sve_broadcast(uint64_t __dup)
+    { return svdup_u64(__dup); }
+
+    inline static __sve_bool_type
+    __sve_active_mask()
+    { return svwhilelt_b64(size_t(0), _Np); };
+
+    using type = __sve_vlst_type;
+  };
+
+template <size_t _Np>
+  struct __sve_vector_type<float, _Np>
+  {
+    typedef svfloat32_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static __sve_vlst_type
+    __sve_broadcast(float __dup)
+    { return svdup_f32(__dup); }
+
+    inline static __sve_bool_type
+    __sve_active_mask()
+    { return svwhilelt_b32(size_t(0), _Np); };
+
+    using type = __sve_vlst_type;
+  };
+
+template <size_t _Np>
+  struct __sve_vector_type<double, _Np>
+  {
+    typedef svfloat64_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static __sve_vlst_type
+    __sve_broadcast(double __dup)
+    { return svdup_f64(__dup); }
+
+    inline static __sve_bool_type
+    __sve_active_mask()
+    { return svwhilelt_b64(size_t(0), _Np); };
+
+    using type = __sve_vlst_type;
+  };
+
+template <size_t _Np>
+  struct __sve_vector_type<char, _Np>
+  : __sve_vector_type<__get_sve_value_type_t<char>, _Np>
+  {};
+
+template <size_t _Np>
+  struct __sve_vector_type<char16_t, _Np>
+  : __sve_vector_type<__get_sve_value_type_t<char16_t>, _Np>
+  {};
+
+template <size_t _Np>
+  struct __sve_vector_type<wchar_t, _Np>
+  : __sve_vector_type<__get_sve_value_type_t<wchar_t>, _Np>
+  {};
+
+template <size_t _Np>
+  struct __sve_vector_type<char32_t, _Np>
+  : __sve_vector_type<__get_sve_value_type_t<char32_t>, _Np>
+  {};
+
+template <size_t _Np>
+  struct __sve_vector_type<long long int, _Np>
+  : __sve_vector_type<__get_sve_value_type_t<long long int>, _Np>
+  {};
+
+template <size_t _Np>
+  struct __sve_vector_type<long long unsigned int, _Np>
+  : __sve_vector_type<__get_sve_value_type_t<long long unsigned int>, _Np>
+  {};
+
+template <size_t _Size>
+  struct __sve_mask_type
+  {
+    static_assert((_Size & (_Size - 1)) != 0, "This trait may only be used for non-power-of-2 "
+					      "sizes. Power-of-2 sizes must be specialized.");
+
+    using type = typename __sve_mask_type<std::__bit_ceil(_Size)>::type;
+  };
+
+template <size_t _Size>
+  using __sve_mask_type_t = typename __sve_mask_type<_Size>::type;
+
+template <>
+  struct __sve_mask_type<1>
+  {
+    using type = __sve_bool_type;
+
+    using __sve_mask_uint_type = uint8_t;
+
+    typedef svuint8_t __sve_mask_vector_type
+    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static auto
+    __sve_mask_active_count(type __active_mask, type __pred)
+    { return svcntp_b8(__active_mask, __pred); }
+
+    inline static type
+    __sve_mask_first_true()
+    { return svptrue_pat_b8(SV_VL1); }
+
+    inline static type
+    __sve_mask_next_true(type __active_mask, type __pred)
+    { return svpnext_b8(__active_mask, __pred); }
+
+    inline static bool
+    __sve_mask_get(type __active_mask, size_t __i)
+    { return __sve_mask_vector_type(svdup_u8_z(__active_mask, 1))[__i]  != 0;}
+
+    inline static const __sve_mask_vector_type __index0123 = svindex_u8(0, 1);
+  };
+
+template <>
+  struct __sve_mask_type<2>
+  {
+    using type = __sve_bool_type;
+
+    using __sve_mask_uint_type = uint16_t;
+
+    typedef svuint16_t __sve_mask_vector_type
+    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static auto
+    __sve_mask_active_count(type __active_mask, type __pred)
+    { return svcntp_b16(__active_mask, __pred); }
+
+    inline static type
+    __sve_mask_first_true()
+    { return svptrue_pat_b16(SV_VL1); }
+
+    inline static type
+    __sve_mask_next_true(type __active_mask, type __pred)
+    { return svpnext_b16(__active_mask, __pred); }
+
+    inline static bool
+    __sve_mask_get(type __active_mask, size_t __i)
+    { return __sve_mask_vector_type(svdup_u16_z(__active_mask, 1))[__i] != 0;}
+
+    inline static const __sve_mask_vector_type __index0123 = svindex_u16(0, 1);
+  };
+
+template <>
+  struct __sve_mask_type<4>
+  {
+    using type = __sve_bool_type;
+
+    using __sve_mask_uint_type = uint32_t;
+
+    typedef svuint32_t __sve_mask_vector_type
+    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static auto
+    __sve_mask_active_count(type __active_mask, type __pred)
+    { return svcntp_b32(__active_mask, __pred); }
+
+    inline static type
+    __sve_mask_first_true()
+    { return svptrue_pat_b32(SV_VL1); }
+
+    inline static type
+    __sve_mask_next_true(type __active_mask, type __pred)
+    { return svpnext_b32(__active_mask, __pred); }
+
+    inline static bool
+    __sve_mask_get(type __active_mask, size_t __i)
+    { return __sve_mask_vector_type(svdup_u32_z(__active_mask, 1))[__i] != 0;}
+
+    inline static const __sve_mask_vector_type __index0123 = svindex_u32(0, 1);
+  };
+
+template <>
+  struct __sve_mask_type<8>
+  {
+    using type = __sve_bool_type;
+
+    using __sve_mask_uint_type = uint64_t;
+
+    typedef svuint64_t __sve_mask_vector_type
+    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+
+    inline static auto
+    __sve_mask_active_count(type __active_mask, type __pred)
+    { return svcntp_b64(__active_mask, __pred); }
+
+    inline static type
+    __sve_mask_first_true()
+    { return svptrue_pat_b64(SV_VL1); }
+
+    inline static type
+    __sve_mask_next_true(type __active_mask, type __pred)
+    { return svpnext_b64(__active_mask, __pred); }
+
+    inline static bool
+    __sve_mask_get(type __active_mask, size_t __i)
+    { return __sve_mask_vector_type(svdup_u64_z(__active_mask, 1))[__i] != 0;}
+
+    inline static const __sve_mask_vector_type __index0123 = svindex_u64(0, 1);
+  };
+
+template <typename _To, typename _From>
+  _GLIBCXX_SIMD_INTRINSIC constexpr auto
+  __sve_reinterpret_cast(_From __v)
+  {
+    if constexpr (std::is_same_v<_To, int32_t>)
+      return svreinterpret_s32(__v);
+    else if constexpr (std::is_same_v<_To, int64_t>)
+      return svreinterpret_s64(__v);
+    else if constexpr (std::is_same_v<_To, float32_t>)
+      return svreinterpret_f32(__v);
+    else if constexpr (std::is_same_v<_To, float64_t>)
+      return svreinterpret_f64(__v);
+    else
+      __assert_unreachable<_To>(); // add more cases if needed.
+  }
+
+template <typename _Tp, size_t _Width>
+  struct _SveSimdWrapper
+  {
+    static_assert(__is_vectorizable_v<_Tp>);
+
+    static_assert(_Width >= 2); // 1 doesn't make sense, use _Tp directly then
+
+    using _BuiltinType = __sve_vector_type_t<_Tp, _Width>;
+
+    using value_type = _Tp;
+
+    static inline constexpr size_t _S_full_size = sizeof(_BuiltinType) / sizeof(value_type);
+
+    static inline constexpr int _S_size = _Width;
+
+    static inline constexpr bool _S_is_partial = _S_full_size != _S_size;
+
+    _BuiltinType _M_data;
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr _SveSimdWrapper<_Tp, _S_full_size>
+    __as_full_vector() const
+    { return _M_data; }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr
+    _SveSimdWrapper(initializer_list<_Tp> __init)
+    : _M_data(__generate_from_n_evaluations<_Width, _BuiltinType>(
+		[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+		  return __init.begin()[__i.value];
+		}))
+    {}
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr
+    _SveSimdWrapper() = default;
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr
+    _SveSimdWrapper(const _SveSimdWrapper&) = default;
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr
+    _SveSimdWrapper(_SveSimdWrapper&&) = default;
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr _SveSimdWrapper&
+    operator=(const _SveSimdWrapper&) = default;
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr _SveSimdWrapper&
+    operator=(_SveSimdWrapper&&) = default;
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr
+    _SveSimdWrapper(__sve_vector_type_t<_Tp, _Width> __x)
+    : _M_data(__x)
+    {}
+
+    template <typename... _As, typename = enable_if_t<((is_same_v<simd_abi::scalar, _As> && ...)
+							 && sizeof...(_As) <= _Width)>>
+      _GLIBCXX_SIMD_INTRINSIC constexpr
+      operator _SimdTuple<_Tp, _As...>() const
+      {
+	return __generate_from_n_evaluations<sizeof...(_As), _SimdTuple<_Tp, _As...>>(
+		 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+		   return _M_data[int(__i)];
+		 });
+      }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr
+    operator const _BuiltinType&() const
+    { return _M_data; }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr
+    operator _BuiltinType&()
+    { return _M_data; }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
+    operator[](size_t __i) const
+    { return _M_data[__i]; }
+
+    template <size_t __i>
+      _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
+      operator[](_SizeConstant<__i>) const
+      { return _M_data[__i]; }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr void
+    _M_set(size_t __i, _Tp __x)
+    {
+      _M_data[__i] = __x;
+    }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr bool
+    _M_is_constprop() const
+    { return false; }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr bool
+    _M_is_constprop_none_of() const
+    { return false; }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr bool
+    _M_is_constprop_all_of() const
+    { return false; }
+  };
+
+template <size_t _Bits, size_t _Width>
+  struct _SveMaskWrapper
+  {
+    using _BuiltinSveMaskType = __sve_mask_type<_Bits>;
+
+    using _BuiltinSveVectorType = __sve_vector_type<__int_with_sizeof_t<_Bits>, _Width>;
+
+    using _BuiltinType = typename _BuiltinSveMaskType::type;
+
+    using value_type = bool;
+
+    static constexpr size_t _S_full_size = sizeof(_BuiltinType);
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr _SveMaskWrapper<_Bits, _S_full_size>
+    __as_full_vector() const
+    { return _M_data; }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr
+    _SveMaskWrapper() = default;
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr
+    _SveMaskWrapper(_BuiltinType __k)
+    : _M_data(__k)
+    {};
+
+    _GLIBCXX_SIMD_INTRINSIC
+    operator const _BuiltinType&() const
+    { return _M_data; }
+
+    _GLIBCXX_SIMD_INTRINSIC
+    operator _BuiltinType&()
+    { return _M_data; }
+
+    _GLIBCXX_SIMD_INTRINSIC _BuiltinType
+    __intrin() const
+    { return _M_data; }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr value_type
+    operator[](size_t __i) const
+    {
+      return _BuiltinSveMaskType::__sve_mask_get(_M_data, __i);
+    }
+
+    template <size_t __i>
+      _GLIBCXX_SIMD_INTRINSIC constexpr value_type
+      operator[](_SizeConstant<__i>) const
+      {
+        return _BuiltinSveMaskType::__sve_mask_get(_M_data, __i);
+      }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr void
+    _M_set(size_t __i, value_type __x)
+    {
+      _BuiltinType __index
+	= svcmpeq(_BuiltinSveVectorType::__sve_active_mask(), _BuiltinSveMaskType::__index0123,
+		  typename _BuiltinSveMaskType::__sve_mask_uint_type(__i));
+
+      if (__x)
+	_M_data = svorr_z(_BuiltinSveVectorType::__sve_active_mask(), _M_data, __index);
+      else
+	_M_data = svbic_z(_BuiltinSveVectorType::__sve_active_mask(), _M_data, __index);
+    }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr bool
+    _M_is_constprop() const
+    { return false; }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr bool
+    _M_is_constprop_none_of() const
+    { return false; }
+
+    _GLIBCXX_SIMD_INTRINSIC constexpr bool
+    _M_is_constprop_all_of() const
+    { return false; }
+
+    _BuiltinType _M_data;
+  };
+
+struct _CommonImplSve;
+
+template <typename _Abi, typename = __detail::__odr_helper>
+  struct _SimdImplSve;
+
+template <typename _Abi, typename = __detail::__odr_helper>
+  struct _MaskImplSve;
+
+template <int _UsedBytes, int>
+  struct simd_abi::_SveAbi
+  {
+    template <typename _Tp>
+      static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);
+
+    struct _IsValidAbiTag
+    : __bool_constant<(_UsedBytes > 1)>
+    {};
+
+    template <typename _Tp>
+      struct _IsValidSizeFor
+      : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1 && _UsedBytes % sizeof(_Tp) == 0
+			   && _UsedBytes <= __sve_vectorized_size_bytes)>
+      {};
+
+    template <typename _Tp>
+      struct _IsValid
+      : conjunction<_IsValidAbiTag, __bool_constant<__have_sve>,
+		    __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>,
+		    _IsValidSizeFor<_Tp>>
+      {};
+
+    template <typename _Tp>
+      static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;
+
+    using _CommonImpl = _CommonImplSve;
+
+    using _SimdImpl = _SimdImplSve<_SveAbi<_UsedBytes>>;
+
+    using _MaskImpl = _MaskImplSve<_SveAbi<_UsedBytes>>;
+
+    template <typename _Tp>
+      using _MaskMember = _SveMaskWrapper<sizeof(_Tp), _S_size<_Tp>>;
+
+    template <typename _Tp, bool = _S_is_valid_v<_Tp>>
+      struct __traits : _InvalidTraits
+      {};
+
+    template <typename _Tp>
+      struct __traits<_Tp, true>
+      {
+	using _IsValid = true_type;
+	using _SimdImpl = _SimdImplSve<_SveAbi<_UsedBytes>>;
+	using _MaskImpl = _MaskImplSve<_SveAbi<_UsedBytes>>;
+
+	using _SimdMember = _SveSimdWrapper<_Tp, _S_size<_Tp>>;         // sve vector type
+	using _MaskMember = _SveMaskWrapper<sizeof(_Tp), _S_size<_Tp>>; // sve mask type
+
+	static constexpr size_t _S_simd_align = alignof(_SimdMember);
+	static constexpr size_t _S_mask_align = alignof(_MaskMember);
+
+	static constexpr size_t _S_full_size = _SimdMember::_S_full_size;
+	static constexpr bool _S_is_partial = _SimdMember::_S_is_partial;
+
+	struct _SimdBase
+	{
+	  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
+	  operator __sve_vector_type_t<_Tp, _S_size<_Tp>>() const
+	  { return __data(*static_cast<const simd<_Tp, _SveAbi<_UsedBytes>>*>(this)); }
+	};
+
+	class _SimdCastType
+	{
+	  using _Ap = __sve_vector_type_t<_Tp, _S_size<_Tp>>;
+
+	  _SimdMember _M_data;
+
+	public:
+	  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
+	  _SimdCastType(_Ap __a)
+	  : _M_data(__a)
+	  {}
+
+	  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
+	  operator _SimdMember() const
+	  { return _M_data; }
+	};
+
+	struct _MaskBase
+	{
+	  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
+	  operator __sve_mask_type_t<sizeof(_Tp)>() const
+	  {
+	    return __data(*static_cast<const simd_mask<_Tp, _SveAbi<_UsedBytes>>*>(this));
+	  }
+	};
+
+	class _MaskCastType
+	{
+	  using _Ap = __sve_mask_type_t<sizeof(_Tp)>;
+
+	  _Ap _M_data;
+
+	public:
+	  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
+	  _MaskCastType(_Ap __a)
+	  : _M_data(__a)
+	  {}
+
+	  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
+	  operator _MaskMember() const
+	  { return _M_data; }
+	};
+      };
+
+    template <typename _Tp>
+      static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
+
+    template <typename _Tp>
+      static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;
+  };
+
+template <typename _Tp, size_t _Np>
+  using __sve_mask = __sve_mask_type<sizeof(_Tp)>;
+
+struct _CommonImplSve
+{
+  // _S_converts_via_decomposition
+  // This lists all cases where a __vector_convert needs to fall back to
+  // conversion of individual scalars (i.e. decompose the input vector into
+  // scalars, convert, compose output vector). In those cases, _S_masked_load &
+  // _S_masked_store prefer to use the _S_bit_iteration implementation.
+  template <typename _From, typename _To, size_t _ToSize>
+    static inline constexpr bool __converts_via_decomposition_v = sizeof(_From) != sizeof(_To);
+
+  template <typename _Tp, typename _Up, size_t _Np>
+    _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+    _S_load(const _Up* __p, _SveMaskWrapper<sizeof(_Tp), _Np> __k)
+    {
+      using _STp = __get_sve_value_type_t<_Tp>;
+      using _SUp = __get_sve_value_type_t<_Up>;
+      using _V = __sve_vector_type_t<_Tp, _Np>;
+      const _SUp* __up = reinterpret_cast<const _SUp*>(__p);
+
+      if constexpr (std::is_same_v<_Tp, _Up>)
+	return _V(svld1(__k._M_data, __up));
+      if constexpr (std::is_integral_v<_Tp> && std::is_integral_v<_Up>
+		      && (sizeof(_Tp) > sizeof(_Up)))
+	{
+	  if constexpr (std::is_same_v<_SUp, int8_t>)
+	    {
+	      if constexpr (std::is_same_v<_STp, int16_t>)
+		return _V(svld1sb_s16(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, uint16_t>)
+		return _V(svld1sb_u16(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, int32_t>)
+		return _V(svld1sb_s32(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, uint32_t>)
+		return _V(svld1sb_u32(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, int64_t>)
+		return _V(svld1sb_s64(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, uint64_t>)
+		return _V(svld1sb_u64(__k._M_data, __up));
+	    }
+	  if constexpr (std::is_same_v<_SUp, uint8_t>)
+	    {
+	      if constexpr (std::is_same_v<_STp, int16_t>)
+		return _V(svld1ub_s16(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, uint16_t>)
+		return _V(svld1ub_u16(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, int32_t>)
+		return _V(svld1ub_s32(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, uint32_t>)
+		return _V(svld1ub_u32(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, int64_t>)
+		return _V(svld1ub_s64(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, uint64_t>)
+		return _V(svld1ub_u64(__k._M_data, __up));
+	    }
+	  if constexpr (std::is_same_v<_SUp, int16_t>)
+	    {
+	      if constexpr (std::is_same_v<_STp, int32_t>)
+		return _V(svld1sh_s32(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, uint32_t>)
+		return _V(svld1sh_u32(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, int64_t>)
+		return _V(svld1sh_s64(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, uint64_t>)
+		return _V(svld1sh_u64(__k._M_data, __up));
+	    }
+	  if constexpr (std::is_same_v<_SUp, uint16_t>)
+	    {
+	      if constexpr (std::is_same_v<_STp, int32_t>)
+		return _V(svld1uh_s32(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, uint32_t>)
+		return _V(svld1uh_u32(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, int64_t>)
+		return _V(svld1uh_s64(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, uint64_t>)
+		return _V(svld1uh_u64(__k._M_data, __up));
+	    }
+	  if constexpr (std::is_same_v<_SUp, int32_t>)
+	    {
+	      if constexpr (std::is_same_v<_STp, int64_t>)
+		return _V(svld1sw_s64(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, uint64_t>)
+		return _V(svld1sw_u64(__k._M_data, __up));
+	    }
+	  if constexpr (std::is_same_v<_SUp, uint32_t>)
+	    {
+	      if constexpr (std::is_same_v<_STp, int64_t>)
+		return _V(svld1uw_s64(__k._M_data, __up));
+	      if constexpr (std::is_same_v<_STp, uint64_t>)
+		return _V(svld1uw_u64(__k._M_data, __up));
+	    }
+	}
+      return __generate_from_n_evaluations<_Np, __sve_vector_type_t<_Tp, _Np>>(
+	       [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+		 return __k[__i] ? static_cast<_Tp>(__p[__i]) : _Tp{};
+	       });
+    }
+
+  template <typename _Tp, typename _Up, size_t _Np>
+    _GLIBCXX_SIMD_INTRINSIC static constexpr void
+    _S_store(_Up* __p, _SveSimdWrapper<_Tp, _Np> __x, _SveMaskWrapper<sizeof(_Tp), _Np> __k)
+    {
+      using _SUp = __get_sve_value_type_t<_Up>;
+      using _STp = __get_sve_value_type_t<_Tp>;
+      
+      _SUp* __up = reinterpret_cast<_SUp*>(__p);
+
+      if constexpr (std::is_same_v<_Tp, _Up>)
+	return svst1(__k._M_data, __up, __x);
+      if constexpr (std::is_integral_v<_Tp> && std::is_integral_v<_Up>
+		      && (sizeof(_Tp) > sizeof(_Up)))
+	{
+    if constexpr (std::is_same_v<_SUp, int8_t> && std::is_signed_v<_STp>)
+      return svst1b(__k._M_data, __up, __x);
+    if constexpr (std::is_same_v<_SUp, uint8_t> && std::is_unsigned_v<_STp>)
+      return svst1b(__k._M_data, __up, __x);
+    if constexpr (std::is_same_v<_SUp, int16_t> && std::is_signed_v<_STp>)
+      return svst1h(__k._M_data, __up, __x);
+    if constexpr (std::is_same_v<_SUp, uint16_t> && std::is_unsigned_v<_STp>)
+      return svst1h(__k._M_data, __up, __x);
+    if constexpr (std::is_same_v<_SUp, int32_t> && std::is_signed_v<_STp>)
+      return svst1w(__k._M_data, __up, __x);
+    if constexpr (std::is_same_v<_SUp, uint32_t> && std::is_unsigned_v<_STp>)
+      return svst1w(__k._M_data, __up, __x);
+  }
+
+      __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	if (__k[__i])
+	  __p[__i] = static_cast<_Up>(__x[__i]);
+      });
+    }
+
+  template <typename _Tp, size_t _Np>
+    _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+    _S_blend(_SveMaskWrapper<sizeof(_Tp), _Np> __k, _SveSimdWrapper<_Tp, _Np> __at0,
+	     _SveSimdWrapper<_Tp, _Np> __at1)
+    { return svsel(__k._M_data, __at1._M_data, __at0._M_data); }
+
+  template <size_t _Np, bool _Sanitized>
+    _GLIBCXX_SIMD_INTRINSIC static constexpr void
+    _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem)
+    {
+      __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	__mem[__i] = __x[__i];
+      });
+    }
+};
+
+template <typename _Abi, typename>
+  struct _SimdImplSve
+  {
+    template <typename _Tp>
+      using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
+
+    template <typename _Tp>
+      using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
+
+    using _CommonImpl = typename _Abi::_CommonImpl;
+    using _SuperImpl = typename _Abi::_SimdImpl;
+    using _MaskImpl = typename _Abi::_MaskImpl;
+
+    template <typename _Tp>
+      static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
+
+    template <typename _Tp>
+      static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
+
+    template <typename _Tp>
+      using _TypeTag = _Tp*;
+
+    using abi_type = _Abi;
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr auto
+      _S_broadcast(_Tp __x) noexcept
+      {
+	return __sve_vector_type<_Tp, __sve_vectorized_size_bytes / sizeof(_Tp)>
+		 ::__sve_broadcast(__x);
+      }
+
+    template <typename _Fp, typename _Tp>
+      inline static constexpr _SimdMember<_Tp>
+      _S_generator(_Fp&& __gen, _TypeTag<_Tp>)
+      {
+	constexpr size_t _Np = _S_size<_Tp>;
+	_SveSimdWrapper<_Tp, _Np> __ret;
+	__execute_n_times<_S_size<_Tp>>(
+	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __ret._M_set(__i, __gen(__i)); });
+	return __ret;
+      }
+
+    template <typename _Tp, typename _Up>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
+      _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept
+      {
+	constexpr size_t _Np = _S_size<_Tp>;
+	_SimdMember<_Tp> __ret = _CommonImpl::template _S_load<_Tp, _Up, _Np>(
+				   __mem, _SveMaskWrapper<sizeof(_Tp), _Np>{
+				     __sve_vector_type<_Tp, _Np>::__sve_active_mask()});
+	return __ret;
+      }
+
+    template <typename _Tp, size_t _Np, typename _Up>
+      static constexpr inline _SveSimdWrapper<_Tp, _Np>
+      _S_masked_load(_SveSimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, const _Up* __mem)
+      noexcept
+      {
+	__sve_vector_type_t<_Tp, _Np> __v
+	  = _CommonImpl::template _S_load<_Tp, _Up, _Np>(__mem, __k);
+	__sve_vector_type_t<_Tp, _Np> __ret = svsel(__k._M_data, __v, __merge._M_data);
+	return __ret;
+      }
+
+    template <typename _Tp, typename _Up>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr void
+      _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept
+      {
+	constexpr size_t _Np = _S_size<_Tp>;
+	_CommonImpl::template _S_store<_Tp, _Up, _Np>(
+	  __mem, __v, __sve_vector_type<_Tp, _Np>::__sve_active_mask());
+      }
+
+    template <typename _Tp, typename _Up, size_t _Np>
+      static constexpr inline void
+      _S_masked_store(const _SveSimdWrapper<_Tp, _Np> __v, _Up* __mem,
+		      const _SveMaskWrapper<sizeof(_Tp), _Np> __k) noexcept
+      { _CommonImpl::template _S_store<_Tp, _Up, _Np>(__mem, __v, __k); }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
+      _S_negate(_SveSimdWrapper<_Tp, _Np> __x) noexcept
+      {
+	return svcmpeq(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data,
+		       __sve_vector_type<_Tp, _Np>::__sve_broadcast(_Tp{}));
+      }
+
+    template <typename _Tp, typename _BinaryOperation>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
+      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
+      {
+	auto __x_data = __x._M_data;
+	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
+	using __sve_vec_t = __sve_vector_type_t<_Tp, _Np>;
+	std::size_t __i = __x.size();
+	for (; (__i % 2) != 1; __i /= 2)
+	  {
+	    __x_data = __binary_op(simd<_Tp, _Abi>(
+				     __private_init, _SveSimdWrapper<_Tp, _Np>(
+						       __sve_vec_t(svuzp1(__x_data, __x_data)))),
+				   simd<_Tp, _Abi>(
+				     __private_init, _SveSimdWrapper<_Tp, _Np>(
+						       __sve_vec_t(svuzp2(__x_data, __x_data))))
+				  )._M_data;
+	  }
+	_Tp __res = __x_data[0];
+	for (size_t __ri = 1; __ri != __i; __ri++)
+	  __res = __binary_op(__x_data[__ri], __res);
+	return __res;
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
+      _S_reduce(simd<_Tp, _Abi> __x, plus<>)
+      {
+    return svaddv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(), __x._M_data);
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
+      _S_reduce(simd<_Tp, _Abi> __x, bit_and<>)
+      {
+    return svandv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(), __x._M_data);
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
+      _S_reduce(simd<_Tp, _Abi> __x, bit_or<>)
+      {
+    return svorv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(), __x._M_data);
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
+      _S_reduce(simd<_Tp, _Abi> __x, bit_xor<>)
+      {
+    return sveorv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(), __x._M_data);
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
+      _S_reduce(simd<_Tp, _Abi> __x, __detail::_Maximum())
+      {
+    return svmaxv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(), __x._M_data);
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
+      _S_reduce(simd<_Tp, _Abi> __x, __detail::_Minimum())
+      {
+    return svminv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(), __x._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
+      __sve_vector_type_t<_Tp, _Np>
+      _S_min(_SveSimdWrapper<_Tp, _Np> __a, _SveSimdWrapper<_Tp, _Np> __b)
+      {
+	return svmin_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __a._M_data, __b._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
+      __sve_vector_type_t<_Tp, _Np>
+      _S_max(_SveSimdWrapper<_Tp, _Np> __a, _SveSimdWrapper<_Tp, _Np> __b)
+      {
+	return svmax_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __a._M_data, __b._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
+      pair<_SveSimdWrapper<_Tp, _Np>, _SveSimdWrapper<_Tp, _Np>>
+      _S_minmax(_SveSimdWrapper<_Tp, _Np> __a, _SveSimdWrapper<_Tp, _Np> __b)
+      {
+	return {
+	  svmin_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __a._M_data, __b._M_data),
+	  svmax_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __a._M_data, __b._M_data)
+	};
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+      _S_complement(_SveSimdWrapper<_Tp, _Np> __x) noexcept
+      {
+	if constexpr (is_floating_point_v<_Tp>)
+	  {
+	    using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
+	    return __sve_reinterpret_cast<_Tp>(
+		     svnot_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
+			     __sve_reinterpret_cast<_Ip>(__x)));
+	  }
+	else
+	  return svnot_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveSimdWrapper<_Tp, _Np>
+      _S_unary_minus(_SveSimdWrapper<_Tp, _Np> __x) noexcept
+      {
+	return svmul_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data,
+		       static_cast<_Tp>(-1));
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+      _S_plus(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      { return __x._M_data + __y._M_data; } 
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+      _S_minus(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      { return __x._M_data - __y._M_data; }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+      _S_multiplies(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      { return __x._M_data * __y._M_data; }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+      _S_divides(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      { 
+        __sve_vector_type_t<_Tp, _Np> __y_padded = svsel(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
+                      __y._M_data, __sve_vector_type<_Tp, _Np>::__sve_broadcast(1));
+        return __x._M_data / __y_padded; 
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+      _S_modulus(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      { 
+        __sve_vector_type_t<_Tp, _Np> __y_padded = svsel(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
+                      __y._M_data, __sve_vector_type<_Tp, _Np>::__sve_broadcast(1));
+        return __x._M_data % __y_padded;
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+      _S_bit_and(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      {
+	if constexpr (is_floating_point_v<_Tp>)
+	  {
+	    using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
+	    return __sve_reinterpret_cast<_Tp>(
+		     svand_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
+			     __sve_reinterpret_cast<_Ip>(__x), __sve_reinterpret_cast<_Ip>(__y)));
+	  }
+	else
+	  return svand_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
+			 __x._M_data, __y._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+      _S_bit_or(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      {
+	if constexpr (is_floating_point_v<_Tp>)
+	  {
+	    using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
+	    return __sve_reinterpret_cast<_Tp>(
+		     svorr_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
+			     __sve_reinterpret_cast<_Ip>(__x), __sve_reinterpret_cast<_Ip>(__y)));
+	  }
+	else
+	  return svorr_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
+			 __x._M_data, __y._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+      _S_bit_xor(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      {
+	if constexpr (is_floating_point_v<_Tp>)
+	  {
+	    using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
+	    return __sve_reinterpret_cast<_Tp>(
+		     sveor_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
+			     __sve_reinterpret_cast<_Ip>(__x), __sve_reinterpret_cast<_Ip>(__y)));
+	  }
+	else
+	  return sveor_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
+			 __x._M_data, __y._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static __sve_vector_type_t<_Tp, _Np>
+      _S_bit_shift_left(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      { return __x._M_data << __y._M_data; }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static __sve_vector_type_t<_Tp, _Np>
+      _S_bit_shift_right(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      { return __x._M_data >> __y._M_data; }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+      _S_bit_shift_left(_SveSimdWrapper<_Tp, _Np> __x, int __y)
+      { return __x._M_data << __y; }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np>
+      _S_bit_shift_right(_SveSimdWrapper<_Tp, _Np> __x, int __y)
+      { return __x._M_data >> __y; }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr void
+      _S_increment(_SveSimdWrapper<_Tp, _Np>& __x)
+      { __x = __x._M_data + 1; }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr void
+      _S_decrement(_SveSimdWrapper<_Tp, _Np>& __x)
+      { __x = __x._M_data - 1; }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
+      _S_equal_to(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      {
+	return svcmpeq(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
+      _S_not_equal_to(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      {
+	return svcmpne(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
+      _S_less(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      {
+	return svcmplt(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
+      _S_less_equal(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      {
+	return svcmple(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data);
+      }
+
+    // simd.math
+#define _GLIBCXX_SIMD_MATH_FALLBACK(__name)                                                        \
+    template <typename _Tp, size_t _Np, typename... _More>                                         \
+      static _SveSimdWrapper<_Tp, _Np> _S_##__name(const _SveSimdWrapper<_Tp, _Np>& __x,           \
+						   const _More&... __more)                         \
+      {                                                                                            \
+	_SveSimdWrapper<_Tp, _Np> __r;                                                             \
+	__execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {                  \
+	  __r._M_set(__i, __name(__x[__i], __more[__i]...));                                       \
+	});                                                                                        \
+	return __r;                                                                                \
+      }
+
+#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name)                                       \
+    template <typename _Tp, typename... _More>                                                     \
+      static auto _S_##__name(const _Tp& __x, const _More&... __more)                              \
+      {                                                                                            \
+	return __fixed_size_storage_t<_RetTp, _Tp::_S_size>::_S_generate(                          \
+		 [&](auto __meta) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {                             \
+		   return __meta._S_generator(                                                     \
+			    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {                     \
+			      return __name(__x[__meta._S_offset + __i],                           \
+					    __more[__meta._S_offset + __i]...);                    \
+			    }, static_cast<_RetTp*>(nullptr));                                     \
+		 });                                                                               \
+      }
+
+    _GLIBCXX_SIMD_MATH_FALLBACK(acos)
+    _GLIBCXX_SIMD_MATH_FALLBACK(asin)
+    _GLIBCXX_SIMD_MATH_FALLBACK(atan)
+    _GLIBCXX_SIMD_MATH_FALLBACK(atan2)
+    _GLIBCXX_SIMD_MATH_FALLBACK(cos)
+    _GLIBCXX_SIMD_MATH_FALLBACK(sin)
+    _GLIBCXX_SIMD_MATH_FALLBACK(tan)
+    _GLIBCXX_SIMD_MATH_FALLBACK(acosh)
+    _GLIBCXX_SIMD_MATH_FALLBACK(asinh)
+    _GLIBCXX_SIMD_MATH_FALLBACK(atanh)
+    _GLIBCXX_SIMD_MATH_FALLBACK(cosh)
+    _GLIBCXX_SIMD_MATH_FALLBACK(sinh)
+    _GLIBCXX_SIMD_MATH_FALLBACK(tanh)
+    _GLIBCXX_SIMD_MATH_FALLBACK(exp)
+    _GLIBCXX_SIMD_MATH_FALLBACK(exp2)
+    _GLIBCXX_SIMD_MATH_FALLBACK(expm1)
+    _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb)
+    _GLIBCXX_SIMD_MATH_FALLBACK(log)
+    _GLIBCXX_SIMD_MATH_FALLBACK(log10)
+    _GLIBCXX_SIMD_MATH_FALLBACK(log1p)
+    _GLIBCXX_SIMD_MATH_FALLBACK(log2)
+    _GLIBCXX_SIMD_MATH_FALLBACK(logb)
+
+    // modf implemented in simd_math.h
+    _GLIBCXX_SIMD_MATH_FALLBACK(scalbn)
+    _GLIBCXX_SIMD_MATH_FALLBACK(scalbln)
+    _GLIBCXX_SIMD_MATH_FALLBACK(cbrt)
+    _GLIBCXX_SIMD_MATH_FALLBACK(pow)
+    _GLIBCXX_SIMD_MATH_FALLBACK(erf)
+    _GLIBCXX_SIMD_MATH_FALLBACK(erfc)
+    _GLIBCXX_SIMD_MATH_FALLBACK(lgamma)
+    _GLIBCXX_SIMD_MATH_FALLBACK(tgamma)
+
+    _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint)
+    _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint)
+
+    _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround)
+    _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround)
+
+    _GLIBCXX_SIMD_MATH_FALLBACK(fmod)
+    _GLIBCXX_SIMD_MATH_FALLBACK(remainder)
+
+    template <typename _Tp, size_t _Np>
+      static _SveSimdWrapper<_Tp, _Np>
+      _S_remquo(const _SveSimdWrapper<_Tp, _Np> __x, const _SveSimdWrapper<_Tp, _Np> __y,
+		__fixed_size_storage_t<int, _Np>* __z)
+      {
+	_SveSimdWrapper<_Tp, _Np> __r{};
+	__execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  int __tmp;
+	  __r._M_set(__i, remquo(__x[__i], __y[__i], &__tmp));
+	  __z->_M_set(__i, __tmp);
+	});
+	return __r;
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np>
+      _S_fpclassify(_SveSimdWrapper<_Tp, _Np> __x)
+      {
+	__fixed_size_storage_t<int, _Np> __r{};
+	__execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  __r._M_set(__i, std::fpclassify(__x[__i]));
+	});
+	return __r;
+      }
+
+    // copysign in simd_math.h
+    _GLIBCXX_SIMD_MATH_FALLBACK(nextafter)
+    _GLIBCXX_SIMD_MATH_FALLBACK(fdim)
+
+#undef _GLIBCXX_SIMD_MATH_FALLBACK
+#undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET
+
+    template <typename _Tp, size_t _Np, typename _Op>
+      static constexpr _MaskMember<_Tp>
+      __fp_cmp(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y, _Op __op)
+      {
+	using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
+	using _VI = __sve_vector_type_t<_Ip, _Np>;
+	using _WI = _SveSimdWrapper<_Ip, _Np>;
+	const _WI __fmv = __sve_vector_type<_Ip, _Np>::__sve_broadcast(__finite_max_v<_Ip>);
+	const _WI __zerov = __sve_vector_type<_Ip, _Np>::__sve_broadcast(0);
+	const _WI __xn = _VI(__sve_reinterpret_cast<_Ip>(__x));
+	const _WI __yn = _VI(__sve_reinterpret_cast<_Ip>(__y));
+
+	const _WI __xp
+	  = svsel(_S_less(__xn, __zerov), _S_unary_minus(_WI(_S_bit_and(__xn, __fmv))), __xn);
+	const _WI __yp
+	  = svsel(_S_less(__yn, __zerov), _S_unary_minus(_WI(_S_bit_and(__yn, __fmv))), __yn);
+	return svbic_z(__sve_vector_type<_Ip, _Np>::__sve_active_mask(), __op(__xp, __yp)._M_data,
+		       _SuperImpl::_S_isunordered(__x, __y)._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      static constexpr _MaskMember<_Tp>
+      _S_isgreater(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) noexcept
+      { return __fp_cmp(__x, __y, [](auto __xp, auto __yp) { return _S_less(__yp, __xp); }); }
+
+    template <typename _Tp, size_t _Np>
+      static constexpr _MaskMember<_Tp>
+      _S_isgreaterequal(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) noexcept
+      { return __fp_cmp(__x, __y, [](auto __xp, auto __yp) { return _S_less_equal(__yp, __xp); }); }
+
+    template <typename _Tp, size_t _Np>
+      static constexpr _MaskMember<_Tp>
+      _S_isless(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) noexcept
+      { return __fp_cmp(__x, __y, [](auto __xp, auto __yp) { return _S_less(__xp, __yp); }); }
+
+    template <typename _Tp, size_t _Np>
+      static constexpr _MaskMember<_Tp>
+      _S_islessequal(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) noexcept
+      { return __fp_cmp(__x, __y, [](auto __xp, auto __yp) { return _S_less_equal(__xp, __yp); }); }
+
+    template <typename _Tp, size_t _Np>
+      static constexpr _MaskMember<_Tp>
+      _S_islessgreater(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) noexcept
+      {
+	return svbic_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
+		       _SuperImpl::_S_not_equal_to(__x, __y)._M_data,
+		       _SuperImpl::_S_isunordered(__x, __y)._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_abs(_SveSimdWrapper<_Tp, _Np> __x) noexcept
+      { return svabs_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_fabs(_SveSimdWrapper<_Tp, _Np> __x) noexcept
+      { return svabs_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_sqrt(_SveSimdWrapper<_Tp, _Np> __x) noexcept
+      { return svsqrt_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_ldexp(_SveSimdWrapper<_Tp, _Np> __x, __fixed_size_storage_t<int, _Np> __y) noexcept
+      {
+	auto __sve_register = __y.first;
+	if constexpr (std::is_same_v<_Tp, float>)
+	  return svscale_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data,
+			   __sve_register._M_data);
+	else
+	  {
+	    __sve_vector_type_t<int64_t, _Np> __sve_d_register = svunpklo(__sve_register);
+	    return svscale_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data,
+			     __sve_d_register);
+	  }
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_fma(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y,
+	     _SveSimdWrapper<_Tp, _Np> __z)
+      {
+	return svmad_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data,
+		       __z._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_fmax(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      {
+  return svmaxnm_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_fmin(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      {
+  return svminnm_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
+      _S_isfinite([[maybe_unused]] _SveSimdWrapper<_Tp, _Np> __x)
+      {
+#if __FINITE_MATH_ONLY__
+	return __sve_vector_type_t<_Tp, _Np>::__sve_all_true_mask();
+#else
+	// if all exponent bits are set, __x is either inf or NaN
+
+	using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
+	const __sve_vector_type_t<_Ip, _Np> __absn = __sve_reinterpret_cast<_Ip>(_S_abs(__x));
+	const __sve_vector_type_t<_Ip, _Np> __maxn
+	  = __sve_reinterpret_cast<_Ip>(
+	      __sve_vector_type<_Tp, _Np>::__sve_broadcast(__finite_max_v<_Tp>));
+
+	return _S_less_equal(_SveSimdWrapper<_Ip, _Np>{__absn}, _SveSimdWrapper<_Ip, _Np>{__maxn});
+#endif
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
+      _S_isinf([[maybe_unused]] _SveSimdWrapper<_Tp, _Np> __x)
+      {
+#if __FINITE_MATH_ONLY__
+	return {}; // false
+#else
+	return _S_equal_to<_Tp, _Np>(_S_abs(__x), _S_broadcast(__infinity_v<_Tp>));
+#endif
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
+      _S_isnan([[maybe_unused]] _SveSimdWrapper<_Tp, _Np> __x)
+      {
+#if __FINITE_MATH_ONLY__
+	return {}; // false
+#else
+	return svcmpuo(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __x._M_data);
+#endif
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
+      _S_isnormal(_SveSimdWrapper<_Tp, _Np> __x)
+      {
+	using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
+	using _V = __sve_vector_type_t<_Ip, _Np>;
+	using _VW = _SveSimdWrapper<_Ip, _Np>;
+
+	const _V __absn = __sve_reinterpret_cast<_Ip>(_S_abs(__x));
+	const _V __minn = __sve_reinterpret_cast<_Ip>(
+			    __sve_vector_type<_Tp, _Np>::__sve_broadcast(__norm_min_v<_Tp>));
+#if __FINITE_MATH_ONLY__
+	return _S_greater_equal(_VW{__absn}, _VW{__minn});
+#else
+	const _V __maxn = __sve_reinterpret_cast<_Ip>(
+			    __sve_vector_type<_Tp, _Np>::__sve_broadcast(__finite_max_v<_Tp>));
+	return _MaskImpl::_S_bit_and(_S_less_equal(_VW{__minn}, _VW{__absn}),
+				     _S_less_equal(_VW{__absn}, _VW{__maxn}));
+#endif
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
+      _S_signbit(_SveSimdWrapper<_Tp, _Np> __x)
+      {
+	using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>;
+	using _V = __sve_vector_type_t<_Ip, _Np>;
+	using _VW = _SveSimdWrapper<_Ip, _Np>;
+
+	const _V __xn = __sve_reinterpret_cast<_Ip>(__x);
+	const _V __zeron = __sve_vector_type<_Ip, _Np>::__sve_broadcast(0);
+	return _S_less(_VW{__xn}, _VW{__zeron});
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
+      _S_isunordered(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y)
+      {
+	return svcmpuo(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data);
+      }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_nearbyint(_SveSimdWrapper<_Tp, _Np> __x) noexcept
+      { return svrinti_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_rint(_SveSimdWrapper<_Tp, _Np> __x) noexcept
+      { return _SuperImpl::_S_nearbyint(__x); }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_trunc(_SveSimdWrapper<_Tp, _Np> __x) noexcept
+      { return svrintz_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_round(_SveSimdWrapper<_Tp, _Np> __x) noexcept
+      { return svrinta_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_floor(_SveSimdWrapper<_Tp, _Np> __x) noexcept
+      { return svrintm_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); }
+
+    template <typename _Tp, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np>
+      _S_ceil(_SveSimdWrapper<_Tp, _Np> __x) noexcept
+      { return svrintp_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); }
+
+    template <typename _Tp, size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr void
+      _S_masked_assign(_SveMaskWrapper<_Bits, _Np> __k, _SveSimdWrapper<_Tp, _Np>& __lhs,
+		       __type_identity_t<_SveSimdWrapper<_Tp, _Np>> __rhs)
+      { __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); }
+
+    template <typename _Tp, size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr void
+      _S_masked_assign(_SveMaskWrapper<_Bits, _Np> __k, _SveSimdWrapper<_Tp, _Np>& __lhs,
+		       __type_identity_t<_Tp> __rhs)
+      { __lhs = _CommonImpl::_S_blend(__k, __lhs, __data(simd<_Tp, _Abi>(__rhs))); }
+
+    template <typename _Op, typename _Tp, size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr void
+      _S_masked_cassign(const _SveMaskWrapper<_Bits, _Np> __k, _SveSimdWrapper<_Tp, _Np>& __lhs,
+			const __type_identity_t<_SveSimdWrapper<_Tp, _Np>> __rhs, _Op __op)
+      {
+	__lhs = _CommonImpl::_S_blend(__k, __lhs,
+				      _SveSimdWrapper<_Tp, _Np>(__op(_SuperImpl{}, __lhs, __rhs)));
+      }
+
+    template <typename _Op, typename _Tp, size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr void
+      _S_masked_cassign(const _SveMaskWrapper<_Bits, _Np> __k, _SveSimdWrapper<_Tp, _Np>& __lhs,
+			const __type_identity_t<_Tp> __rhs, _Op __op)
+      { _S_masked_cassign(__k, __lhs, _S_broadcast(__rhs), __op); }
+
+    template <typename _Tp, size_t _Np, typename _Up>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr void
+      _S_set(_SveSimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept
+      { __v._M_set(__i, static_cast<_Up&&>(__x)); }
+
+    template <template <typename> class _Op, typename _Tp, size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveSimdWrapper<_Tp, _Np>
+      _S_masked_unary(const _SveMaskWrapper<_Bits, _Np> __k, const _SveSimdWrapper<_Tp, _Np> __v)
+      {
+	auto __vv = simd<_Tp, _Abi>{__private_init, __v};
+	_Op<decltype(__vv)> __op;
+	return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv)));
+      }
+  };
+
+template <typename _Abi, typename>
+  struct _MaskImplSve
+  {
+    template <typename _Tp>
+      using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
+
+    template <typename _Tp>
+      using _TypeTag = _Tp*;
+
+    template <typename _Tp>
+      static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
+      _S_broadcast(bool __x)
+      {
+	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
+	__sve_bool_type __tr = __sve_vector_type<_Tp, _Np>::__sve_active_mask();
+	__sve_bool_type __fl = svpfalse_b();;
+	return __x ? __tr : __fl;
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
+      _S_load(const bool* __mem)
+      { 
+	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
+  const uint8_t* __p = reinterpret_cast<const uint8_t*>(__mem);
+  __sve_bool_type __u8_active_mask = __sve_vector_type<uint8_t, _Np>::__sve_active_mask();
+  __sve_vector_type_t<uint8_t, _Np> __u8_vec_mask_load = svld1(__u8_active_mask, __p);
+  __sve_bool_type __u8_mask = svcmpne(__u8_active_mask, __u8_vec_mask_load, 0);
+
+  __sve_bool_type __tp_mask = __u8_mask;
+  for (size_t __up_size = 1; __up_size != sizeof(_Tp); __up_size *= 2)
+    {
+  __tp_mask = svunpklo(__tp_mask);
+    }
+
+	_SveMaskWrapper<sizeof(_Tp), simd_size_v<_Tp, _Abi>> __r{__tp_mask};
+  return __r;
+      }
+
+    template <size_t _Bits, size_t _Np>
+      static inline _SveMaskWrapper<_Bits, _Np>
+      _S_masked_load(_SveMaskWrapper<_Bits, _Np> __merge, _SveMaskWrapper<_Bits, _Np> __mask,
+		     const bool* __mem) noexcept
+      {
+	_SveMaskWrapper<_Bits, _Np> __r;
+
+	__execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  if (__mask[__i])
+	    __r._M_set(__i, __mem[__i]);
+	  else
+	    __r._M_set(__i, __merge[__i]);
+	});
+
+	return __r;
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr void
+      _S_store(_SveMaskWrapper<_Bits, _Np> __v, bool* __mem) noexcept
+      {
+	__execute_n_times<_Np>([&](auto __i)
+			      _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __mem[__i] = __v[__i]; });
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr void
+      _S_masked_store(const _SveMaskWrapper<_Bits, _Np> __v, bool* __mem,
+		      const _SveMaskWrapper<_Bits, _Np> __k) noexcept
+      {
+	__execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  if (__k[__i])
+	    __mem[__i] = __v[__i];
+	});
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
+      _S_to_bits(_SveMaskWrapper<_Bits, _Np> __x)
+      {
+	_ULLong __r = 0;
+	__execute_n_times<_Np>(
+	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __r |= _ULLong(__x[__i]) << __i; });
+	return __r;
+      }
+
+    template <size_t _Np, typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
+      _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
+      {
+	_SveMaskWrapper<sizeof(_Tp), _Np> __r;
+	__execute_n_times<_Np>([&](auto __i)
+			      _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __r._M_set(__i, __bits[__i]); });
+	return __r;
+      }
+
+    template <typename _Tp, typename _Up, typename _UAbi>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr auto
+      _S_convert(simd_mask<_Up, _UAbi> __x)
+      {
+	using _R = _SveMaskWrapper<sizeof(_Tp), simd_size_v<_Tp, _Abi>>;
+	if constexpr (__is_scalar_abi<_UAbi>())
+	  {
+	    _R __r{__sve_bool_type(svpfalse())};
+	    __r._M_set(0, __data(__x));
+	    return __r;
+	  }
+	if constexpr (__is_sve_abi<_UAbi>())
+	  {
+	    if constexpr (sizeof(_Up) == sizeof(_Tp))
+	      return __data(__x);
+	    if constexpr (sizeof(_Up) < sizeof(_Tp))
+	      {
+		__sve_bool_type __xmdata = __data(__x)._M_data;
+		__sve_bool_type __r = __xmdata;
+		for (size_t __up_size = sizeof(_Up); __up_size != sizeof(_Tp); __up_size *= 2)
+		  {
+		    __r = svunpklo(__r);
+		  }
+		return _R{__r};
+	      }
+	    else
+	      {
+		_R __r{__sve_bool_type(svpfalse())};
+		constexpr size_t __min_size
+		  = std::min(simd_size_v<_Tp, _Abi>, simd_mask<_Up, _UAbi>::size());
+		__execute_n_times<__min_size>(
+		  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __r._M_set(__i, __x[__i]); });
+		return __r;
+	      }
+	  }
+	if constexpr (__is_neon_abi<_UAbi>())
+	  {
+	    _R __r{__sve_bool_type(svpfalse())};
+	    constexpr size_t __min_size
+	      = std::min(simd_size_v<_Tp, _Abi>, simd_mask<_Up, _UAbi>::size());
+	    __execute_n_times<__min_size>(
+	      [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __r._M_set(__i, __x[__i]); });
+	    return __r;
+	  }
+	if constexpr (__is_fixed_size_abi<_UAbi>())
+	  {
+	    return _S_convert<_Tp>(__data(__x));
+	  }
+	return _R{};
+      }
+
+    template <typename _Tp, size_t _Np, bool _Sanitized>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
+      _S_convert(_BitMask<_Np, _Sanitized> __x)
+      {
+	_MaskMember<_Tp> __r{};
+	__execute_n_times<_Np>([&](auto __i)
+			      _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __r._M_set(__i, __x[__i]); });
+	return __r;
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveMaskWrapper<_Bits, _Np>
+      _S_logical_and(const _SveMaskWrapper<_Bits, _Np>& __x, const _SveMaskWrapper<_Bits, _Np>& __y)
+      {
+	return svand_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+		       __x._M_data, __y._M_data);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveMaskWrapper<_Bits, _Np>
+      _S_logical_or(const _SveMaskWrapper<_Bits, _Np>& __x, const _SveMaskWrapper<_Bits, _Np>& __y)
+      {
+	return svorr_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+		       __x._M_data, __y._M_data);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveMaskWrapper<_Bits, _Np>
+      _S_bit_not(const _SveMaskWrapper<_Bits, _Np>& __x)
+      {
+	return svnot_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+		       __x._M_data);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveMaskWrapper<_Bits, _Np>
+      _S_bit_and(const _SveMaskWrapper<_Bits, _Np>& __x, const _SveMaskWrapper<_Bits, _Np>& __y)
+      {
+	return svand_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+		       __x._M_data, __y._M_data);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveMaskWrapper<_Bits, _Np>
+      _S_bit_or(const _SveMaskWrapper<_Bits, _Np>& __x, const _SveMaskWrapper<_Bits, _Np>& __y)
+      {
+	return svorr_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+		       __x._M_data, __y._M_data);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveMaskWrapper<_Bits, _Np>
+      _S_bit_xor(const _SveMaskWrapper<_Bits, _Np>& __x, const _SveMaskWrapper<_Bits, _Np>& __y)
+      {
+	return sveor_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+		       __x._M_data, __y._M_data);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      static constexpr void
+      _S_set(_SveMaskWrapper<_Bits, _Np>& __k, int __i, bool __x) noexcept
+      {
+	auto __index = svcmpeq(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+			       __sve_mask_type<_Bits>::__index0123,
+			       typename __sve_mask_type<_Bits>::__sve_mask_uint_type(__i));
+	if (__x)
+	  __k._M_data = svorr_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+				__k._M_data, __index);
+	else
+	  __k._M_data = svbic_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+				__k._M_data, __index);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static void
+      _S_masked_assign(_SveMaskWrapper<_Bits, _Np> __k, _SveMaskWrapper<_Bits, _Np>& __lhs,
+		       _SveMaskWrapper<_Bits, _Np> __rhs)
+      { __lhs._M_data = svsel(__k._M_data, __rhs._M_data, __lhs._M_data); }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static void
+      _S_masked_assign(_SveMaskWrapper<_Bits, _Np> __k, _SveMaskWrapper<_Bits, _Np>& __lhs,
+		       bool __rhs)
+      {
+	__lhs._M_data
+	     = svsel(__k._M_data, _S_broadcast<__int_with_sizeof_t<_Bits>>(__rhs), __lhs._M_data);
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static int
+      _S_popcount(simd_mask<_Tp, _Abi> __k)
+      {
+	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
+
+	return __sve_mask_type<sizeof(_Tp)>::__sve_mask_active_count(
+		 __sve_vector_type<_Tp, _Np>::__sve_active_mask(), __k._M_data);
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static bool
+      _S_all_of(simd_mask<_Tp, _Abi> __k)
+      { return _S_popcount(__k) == simd_size_v<_Tp, _Abi>; }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static bool
+      _S_any_of(simd_mask<_Tp, _Abi> __k)
+      { return svptest_any(__sve_vector_type<_Tp, simd_size_v<_Tp, _Abi>>::__sve_active_mask(), __k._M_data); }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static bool
+      _S_none_of(simd_mask<_Tp, _Abi> __k)
+      { return !svptest_any(__sve_vector_type<_Tp, simd_size_v<_Tp, _Abi>>::__sve_active_mask(), __k._M_data); }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static bool
+      _S_some_of(simd_mask<_Tp, _Abi> __k)
+      {
+	int __msk_count = _S_popcount(__k);
+	return (__msk_count > 0) && (__msk_count < (int) simd_size_v<_Tp, _Abi>);
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static int
+      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
+      { return svclastb(svpfirst(__k._M_data, svpfalse()), -1, __sve_mask_type<sizeof(_Tp)>::__index0123); }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static int
+      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
+      { return svclastb(__k._M_data, -1, __sve_mask_type<sizeof(_Tp)>::__index0123); }
+  };
+
+_GLIBCXX_SIMD_END_NAMESPACE
+#endif // __cplusplus >= 201703L
+#endif // _GLIBCXX_EXPERIMENTAL_SIMD_SVE_H_
+// vim: sw=2 noet ts=8 sts=2 tw=100
diff --git a/libstdc++-v3/include/experimental/simd b/libstdc++-v3/include/experimental/simd
index 2b9bdf9239a..1cb70713ebc 100644
--- a/libstdc++-v3/include/experimental/simd
+++ b/libstdc++-v3/include/experimental/simd
@@ -80,6 +80,9 @@ 
 #include "bits/simd_x86.h"
 #elif _GLIBCXX_SIMD_HAVE_NEON
 #include "bits/simd_neon.h"
+#if _GLIBCXX_SIMD_HAVE_SVE
+#include "bits/simd_sve.h"
+#endif
 #elif __ALTIVEC__
 #include "bits/simd_ppc.h"
 #endif
diff --git a/libstdc++-v3/testsuite/experimental/simd/tests/bits/main.h b/libstdc++-v3/testsuite/experimental/simd/tests/bits/main.h
index 270b433aa17..880495cda34 100644
--- a/libstdc++-v3/testsuite/experimental/simd/tests/bits/main.h
+++ b/libstdc++-v3/testsuite/experimental/simd/tests/bits/main.h
@@ -29,6 +29,9 @@  template <class T>
     invoke_test<simd<T, simd_abi::scalar>>(int());
     invoke_test<simd<T, simd_abi::_VecBuiltin<16>>>(int());
     invoke_test<simd<T, simd_abi::_VecBltnBtmsk<64>>>(int());
+    invoke_test<simd<T, simd_abi::_SveAbi<16>>>(int());
+    invoke_test<simd<T, simd_abi::_SveAbi<32>>>(int());
+    invoke_test<simd<T, simd_abi::_SveAbi<64>>>(int());
 #elif EXTENDEDTESTS == 0
     invoke_test<simd<T, simd_abi::_VecBuiltin<8>>>(int());
     invoke_test<simd<T, simd_abi::_VecBuiltin<12>>>(int());