libstdc++: add ARM SVE support to std::experimental::simd

libstdc++-v3/ChangeLog:

	* include/Makefile.am: Add simd_sve.h.
	* include/Makefile.in: Add simd_sve.h.
	* include/experimental/bits/simd.h: Add new SveAbi.
	* include/experimental/bits/simd_builtin.h: Use
	__no_sve_deduce_t to support existing Neon Abi.
	* include/experimental/bits/simd_converter.h: Convert
	sequentially when sve is available.
	* include/experimental/bits/simd_detail.h: Define sve
	specific macro.
	* include/experimental/bits/simd_math.h: Fallback frexp
	to execute sequentially when sve is available, to handle
	fixed_size_simd return type that always uses sve.
	* include/experimental/simd: Include bits/simd_sve.h.
	* testsuite/experimental/simd/tests/bits/main.h: Enable
	testing for sve128, sve256, sve512.
	* include/experimental/bits/simd_sve.h: New file.

Signed-off-by: Srinivas Yadav Singanaboina <vasu.srinivasvasu.14@gmail.com>
This commit is contained in:
Srinivas Yadav Singanaboina 2024-03-16 19:04:35 +00:00 committed by Matthias Kretz
parent 0b02da5b99
commit 9ac3119fec
10 changed files with 2154 additions and 103 deletions

View File

@ -835,6 +835,7 @@ experimental_bits_headers = \
${experimental_bits_srcdir}/simd_neon.h \
${experimental_bits_srcdir}/simd_ppc.h \
${experimental_bits_srcdir}/simd_scalar.h \
${experimental_bits_srcdir}/simd_sve.h \
${experimental_bits_srcdir}/simd_x86.h \
${experimental_bits_srcdir}/simd_x86_conversions.h \
${experimental_bits_srcdir}/string_view.tcc \

View File

@ -1181,6 +1181,7 @@ experimental_bits_headers = \
${experimental_bits_srcdir}/simd_neon.h \
${experimental_bits_srcdir}/simd_ppc.h \
${experimental_bits_srcdir}/simd_scalar.h \
${experimental_bits_srcdir}/simd_sve.h \
${experimental_bits_srcdir}/simd_x86.h \
${experimental_bits_srcdir}/simd_x86_conversions.h \
${experimental_bits_srcdir}/string_view.tcc \

View File

@ -39,12 +39,16 @@
#include <functional>
#include <iosfwd>
#include <utility>
#include <algorithm>
#if _GLIBCXX_SIMD_X86INTRIN
#include <x86intrin.h>
#elif _GLIBCXX_SIMD_HAVE_NEON
#include <arm_neon.h>
#endif
#if _GLIBCXX_SIMD_HAVE_SVE
#include <arm_sve.h>
#endif
/** @ingroup ts_simd
* @{
@ -83,6 +87,12 @@ using __m512d [[__gnu__::__vector_size__(64)]] = double;
using __m512i [[__gnu__::__vector_size__(64)]] = long long;
#endif
#if _GLIBCXX_SIMD_HAVE_SVE
constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS / 8;
#else
constexpr inline int __sve_vectorized_size_bytes = 0;
#endif
namespace simd_abi {
// simd_abi forward declarations {{{
// implementation details:
@ -108,6 +118,9 @@ template <int _UsedBytes>
template <int _UsedBytes>
struct _VecBltnBtmsk;
template <int _UsedBytes, int _TotalBytes = __sve_vectorized_size_bytes>
struct _SveAbi;
template <typename _Tp, int _Np>
using _VecN = _VecBuiltin<sizeof(_Tp) * _Np>;
@ -123,6 +136,9 @@ template <int _UsedBytes = 64>
template <int _UsedBytes = 16>
using _Neon = _VecBuiltin<_UsedBytes>;
template <int _UsedBytes = __sve_vectorized_size_bytes>
using _Sve = _SveAbi<_UsedBytes, __sve_vectorized_size_bytes>;
// implementation-defined:
using __sse = _Sse<>;
using __avx = _Avx<>;
@ -130,6 +146,7 @@ using __avx512 = _Avx512<>;
using __neon = _Neon<>;
using __neon128 = _Neon<16>;
using __neon64 = _Neon<8>;
using __sve = _Sve<>;
// standard:
template <typename _Tp, size_t _Np, typename...>
@ -250,6 +267,9 @@ constexpr inline bool __support_neon_float =
false;
#endif
constexpr inline bool __have_sve = _GLIBCXX_SIMD_HAVE_SVE;
constexpr inline bool __have_sve2 = _GLIBCXX_SIMD_HAVE_SVE2;
#ifdef _ARCH_PWR10
constexpr inline bool __have_power10vec = true;
#else
@ -356,12 +376,14 @@ namespace __detail
| (__have_avx512vnni << 27)
| (__have_avx512vpopcntdq << 28)
| (__have_avx512vp2intersect << 29);
else if constexpr (__have_neon)
else if constexpr (__have_neon || __have_sve)
return __have_neon
| (__have_neon_a32 << 1)
| (__have_neon_a64 << 2)
| (__have_neon_a64 << 2)
| (__support_neon_float << 3);
| (__support_neon_float << 3)
| (__have_sve << 4)
| (__have_sve2 << 5);
else if constexpr (__have_power_vmx)
return __have_power_vmx
| (__have_power_vsx << 1)
@ -733,6 +755,16 @@ template <typename _Abi>
return _Bytes <= 16 && is_same_v<simd_abi::_VecBuiltin<_Bytes>, _Abi>;
}
// }}}
// __is_sve_abi {{{
// Trait check: true iff _Abi is the SVE ABI tag simd_abi::_Sve<_Bytes> and
// its byte width fits within the build-time SVE register size
// (__sve_vectorized_size_bytes; 0 when SVE is unavailable, so this is then
// always false).
template <typename _Abi>
constexpr bool
__is_sve_abi()
{
constexpr auto _Bytes = __abi_bytes_v<_Abi>;
return _Bytes <= __sve_vectorized_size_bytes && is_same_v<simd_abi::_Sve<_Bytes>, _Abi>;
}
// }}}
// __make_dependent_t {{{
template <typename, typename _Up>
@ -998,6 +1030,9 @@ template <typename _Tp>
template <typename _Tp>
using _SimdWrapper64 = _SimdWrapper<_Tp, 64 / sizeof(_Tp)>;
template <typename _Tp, size_t _Width>
struct _SveSimdWrapper;
// }}}
// __is_simd_wrapper {{{
template <typename _Tp>
@ -2830,7 +2865,8 @@ namespace simd_abi {
// most of simd_abi is defined in simd_detail.h
template <typename _Tp>
inline constexpr int max_fixed_size
= (__have_avx512bw && sizeof(_Tp) == 1) ? 64 : 32;
= ((__have_avx512bw && sizeof(_Tp) == 1)
|| (__have_sve && __sve_vectorized_size_bytes/sizeof(_Tp) >= 64)) ? 64 : 32;
// compatible {{{
#if defined __x86_64__ || defined __aarch64__
@ -2858,6 +2894,8 @@ template <typename _Tp>
constexpr size_t __bytes = __vectorized_sizeof<_Tp>();
if constexpr (__bytes == sizeof(_Tp))
return static_cast<scalar*>(nullptr);
else if constexpr (__have_sve)
return static_cast<_SveAbi<__sve_vectorized_size_bytes>*>(nullptr);
else if constexpr (__have_avx512vl || (__have_avx512f && __bytes == 64))
return static_cast<_VecBltnBtmsk<__bytes>*>(nullptr);
else
@ -2951,6 +2989,9 @@ template <typename _Tp, typename _Abi = simd_abi::__default_abi<_Tp>>
template <typename _Tp, size_t _Np, typename = void>
struct __deduce_impl;
template <typename _Tp, size_t _Np, typename = void>
struct __no_sve_deduce_impl;
namespace simd_abi {
/**
* @tparam _Tp The requested `value_type` for the elements.
@ -2965,6 +3006,12 @@ template <typename _Tp, size_t _Np, typename...>
template <typename _Tp, size_t _Np, typename... _Abis>
using deduce_t = typename deduce<_Tp, _Np, _Abis...>::type;
template <typename _Tp, size_t _Np, typename...>
struct __no_sve_deduce : __no_sve_deduce_impl<_Tp, _Np> {};
template <typename _Tp, size_t _Np, typename... _Abis>
using __no_sve_deduce_t = typename __no_sve_deduce<_Tp, _Np, _Abis...>::type;
} // namespace simd_abi
// }}}2
@ -2974,13 +3021,27 @@ template <typename _Tp, typename _V, typename = void>
template <typename _Tp, typename _Up, typename _Abi>
struct rebind_simd<_Tp, simd<_Up, _Abi>,
void_t<simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>
{ using type = simd<_Tp, simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>; };
void_t<std::conditional_t<!__is_sve_abi<_Abi>(),
simd_abi::__no_sve_deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>,
simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>>
{
using type = simd<_Tp, std::conditional_t<
!__is_sve_abi<_Abi>(),
simd_abi::__no_sve_deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>,
simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>;
};
template <typename _Tp, typename _Up, typename _Abi>
struct rebind_simd<_Tp, simd_mask<_Up, _Abi>,
void_t<simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>
{ using type = simd_mask<_Tp, simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>; };
void_t<std::conditional_t<!__is_sve_abi<_Abi>(),
simd_abi::__no_sve_deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>,
simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>>
{
using type = simd_mask<_Tp, std::conditional_t<
!__is_sve_abi<_Abi>(),
simd_abi::__no_sve_deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>,
simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>;
};
template <typename _Tp, typename _V>
using rebind_simd_t = typename rebind_simd<_Tp, _V>::type;
@ -3243,7 +3304,7 @@ template <typename _Tp, typename _Up, typename _Ap>
else if constexpr (_Tp::size() == 1)
return __x[0];
else if constexpr (sizeof(_Tp) == sizeof(__x)
&& !__is_fixed_size_abi_v<_Ap>)
&& !__is_fixed_size_abi_v<_Ap> && !__is_sve_abi<_Ap>())
return {__private_init,
__vector_bitcast<typename _Tp::value_type, _Tp::size()>(
_Ap::_S_masked(__data(__x))._M_data)};
@ -4004,18 +4065,29 @@ template <typename _V, typename _Ap,
split(const simd<typename _V::value_type, _Ap>& __x)
{
using _Tp = typename _V::value_type;
auto __gen_fallback = [&]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
{ return __x[__i * _V::size() + __j]; });
});
};
if constexpr (_Parts == 1)
{
return {simd_cast<_V>(__x)};
}
else if (__x._M_is_constprop())
{
return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
{ return __x[__i * _V::size() + __j]; });
});
return __gen_fallback();
}
#if _GLIBCXX_SIMD_HAVE_SVE
else if constexpr(__is_sve_abi<_Ap>)
{
return __gen_fallback();
}
#endif
else if constexpr (
__is_fixed_size_abi_v<_Ap>
&& (is_same_v<typename _V::abi_type, simd_abi::scalar>
@ -4115,7 +4187,8 @@ template <size_t... _Sizes, typename _Tp, typename _Ap, typename>
constexpr size_t _N0 = _SL::template _S_at<0>();
using _V = __deduced_simd<_Tp, _N0>;
if (__x._M_is_constprop())
auto __gen_fallback = [&]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
{
return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>(
[&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
@ -4124,6 +4197,14 @@ template <size_t... _Sizes, typename _Tp, typename _Ap, typename>
return __x[__offset + __j];
});
});
};
if (__x._M_is_constprop())
__gen_fallback();
#if _GLIBCXX_SIMD_HAVE_SVE
else if constexpr (__have_sve)
__gen_fallback();
#endif
else if constexpr (_Np == _N0)
{
static_assert(sizeof...(_Sizes) == 1);
@ -4510,8 +4591,11 @@ template <template <int> class _A0, template <int> class... _Rest>
// 1. The ABI tag is valid for _Tp
// 2. The storage overhead is no more than padding to fill the next
// power-of-2 number of bytes
if constexpr (_A0<_Bytes>::template _S_is_valid_v<
_Tp> && __fullsize / 2 < _Np)
if constexpr (_A0<_Bytes>::template _S_is_valid_v<_Tp>
&& ((__is_sve_abi<_A0<_Bytes>>() && __have_sve
&& (_Np <= __sve_vectorized_size_bytes/sizeof(_Tp)))
|| (__fullsize / 2 < _Np))
)
return typename __decay_abi<_A0<_Bytes>>::type{};
else
{
@ -4536,7 +4620,13 @@ template <template <int> class _A0, template <int> class... _Rest>
// the following lists all native ABIs, which makes them accessible to
// simd_abi::deduce and select_best_vector_type_t (for fixed_size). Order
// matters: Whatever comes first has higher priority.
using _AllNativeAbis = _AbiList<simd_abi::_VecBltnBtmsk, simd_abi::_VecBuiltin,
using _AllNativeAbis = _AbiList<
#if _GLIBCXX_SIMD_HAVE_SVE
simd_abi::_SveAbi,
#endif
simd_abi::_VecBltnBtmsk, simd_abi::_VecBuiltin, __scalar_abi_wrapper>;
using _NoSveAllNativeAbis = _AbiList<simd_abi::_VecBltnBtmsk, simd_abi::_VecBuiltin,
__scalar_abi_wrapper>;
// valid _SimdTraits specialization {{{1
@ -4551,6 +4641,11 @@ template <typename _Tp, size_t _Np>
_Tp, _Np, enable_if_t<_AllNativeAbis::template _S_has_valid_abi<_Tp, _Np>>>
{ using type = _AllNativeAbis::_FirstValidAbi<_Tp, _Np>; };
template <typename _Tp, size_t _Np>
struct __no_sve_deduce_impl<
_Tp, _Np, enable_if_t<_NoSveAllNativeAbis::template _S_has_valid_abi<_Tp, _Np>>>
{ using type = _NoSveAllNativeAbis::_FirstValidAbi<_Tp, _Np>; };
// fall back to fixed_size only if scalar and native ABIs don't match
template <typename _Tp, size_t _Np, typename = void>
struct __deduce_fixed_size_fallback {};
@ -4563,6 +4658,12 @@ template <typename _Tp, size_t _Np>
template <typename _Tp, size_t _Np, typename>
struct __deduce_impl : public __deduce_fixed_size_fallback<_Tp, _Np> {};
template <typename _Tp, size_t _Np, typename>
struct __no_sve_deduce_impl
: public __deduce_fixed_size_fallback<_Tp, _Np>
{};
//}}}1
/// @endcond

View File

@ -1614,7 +1614,7 @@ template <typename _Abi, typename>
static_assert(_UW_size <= _TV_size);
using _UW = _SimdWrapper<_Up, _UW_size>;
using _UV = __vector_type_t<_Up, _UW_size>;
using _UAbi = simd_abi::deduce_t<_Up, _UW_size>;
using _UAbi = simd_abi::__no_sve_deduce_t<_Up, _UW_size>;
if constexpr (_UW_size == _TV_size) // one convert+store
{
const _UW __converted = __convert<_UW>(__v);
@ -1857,7 +1857,7 @@ template <typename _Abi, typename>
else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
plus<>>)
{
using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
using _Ap = simd_abi::__no_sve_deduce_t<_Tp, __full_size>;
return _Ap::_SimdImpl::_S_reduce(
simd<_Tp, _Ap>(__private_init,
_Abi::_S_masked(__as_vector(__x))),
@ -1866,7 +1866,7 @@ template <typename _Abi, typename>
else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
multiplies<>>)
{
using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
using _Ap = simd_abi::__no_sve_deduce_t<_Tp, __full_size>;
using _TW = _SimdWrapper<_Tp, __full_size>;
_GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full
= _Abi::template _S_implicit_mask<_Tp>().__as_full_vector();
@ -1882,7 +1882,7 @@ template <typename _Abi, typename>
}
else if constexpr (_Np & 1)
{
using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>;
using _Ap = simd_abi::__no_sve_deduce_t<_Tp, _Np - 1>;
return __binary_op(
simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce(
simd<_Tp, _Ap>(
@ -1936,7 +1936,7 @@ template <typename _Abi, typename>
{
static_assert(sizeof(__x) > __min_vector_size<_Tp>);
static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2
using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>;
using _Ap = simd_abi::__no_sve_deduce_t<_Tp, _Np / 2>;
using _V = simd<_Tp, _Ap>;
return _Ap::_SimdImpl::_S_reduce(
__binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))),
@ -2376,83 +2376,95 @@ template <typename _Abi, typename>
_GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np>
_S_fpclassify(_SimdWrapper<_Tp, _Np> __x)
{
using _I = __int_for_sizeof_t<_Tp>;
const auto __xn
= __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x)));
constexpr size_t _NI = sizeof(__xn) / sizeof(_I);
_GLIBCXX_SIMD_USE_CONSTEXPR auto __minn
= __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>));
if constexpr(__have_sve)
{
__fixed_size_storage_t<int, _Np> __r{};
__execute_n_times<_Np>(
[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__r._M_set(__i, std::fpclassify(__x[__i]));
});
return __r;
}
else
{
using _I = __int_for_sizeof_t<_Tp>;
const auto __xn
= __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x)));
constexpr size_t _NI = sizeof(__xn) / sizeof(_I);
_GLIBCXX_SIMD_USE_CONSTEXPR auto __minn
= __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>));
_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal
= __vector_broadcast<_NI, _I>(FP_NORMAL);
_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal
= __vector_broadcast<_NI, _I>(FP_NORMAL);
#if !__FINITE_MATH_ONLY__
_GLIBCXX_SIMD_USE_CONSTEXPR auto __infn
= __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>));
_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan
= __vector_broadcast<_NI, _I>(FP_NAN);
_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite
= __vector_broadcast<_NI, _I>(FP_INFINITE);
_GLIBCXX_SIMD_USE_CONSTEXPR auto __infn
= __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>));
_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan
= __vector_broadcast<_NI, _I>(FP_NAN);
_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite
= __vector_broadcast<_NI, _I>(FP_INFINITE);
#endif
#ifndef __FAST_MATH__
_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal
= __vector_broadcast<_NI, _I>(FP_SUBNORMAL);
_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal
= __vector_broadcast<_NI, _I>(FP_SUBNORMAL);
#endif
_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero
= __vector_broadcast<_NI, _I>(FP_ZERO);
_GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero
= __vector_broadcast<_NI, _I>(FP_ZERO);
__vector_type_t<_I, _NI>
__tmp = __xn < __minn
#ifdef __FAST_MATH__
? __fp_zero
#else
? (__xn == 0 ? __fp_zero : __fp_subnormal)
#endif
#if __FINITE_MATH_ONLY__
: __fp_normal;
#else
: (__xn < __infn ? __fp_normal
: (__xn == __infn ? __fp_infinite : __fp_nan));
#endif
__vector_type_t<_I, _NI>
__tmp = __xn < __minn
#ifdef __FAST_MATH__
? __fp_zero
#else
? (__xn == 0 ? __fp_zero : __fp_subnormal)
#endif
#if __FINITE_MATH_ONLY__
: __fp_normal;
#else
: (__xn < __infn ? __fp_normal
: (__xn == __infn ? __fp_infinite : __fp_nan));
#endif
if constexpr (sizeof(_I) == sizeof(int))
{
using _FixedInt = __fixed_size_storage_t<int, _Np>;
const auto __as_int = __vector_bitcast<int, _Np>(__tmp);
if constexpr (_FixedInt::_S_tuple_size == 1)
return {__as_int};
else if constexpr (_FixedInt::_S_tuple_size == 2
&& is_same_v<
typename _FixedInt::_SecondType::_FirstAbi,
simd_abi::scalar>)
return {__extract<0, 2>(__as_int), __as_int[_Np - 1]};
else if constexpr (_FixedInt::_S_tuple_size == 2)
return {__extract<0, 2>(__as_int),
__auto_bitcast(__extract<1, 2>(__as_int))};
if constexpr (sizeof(_I) == sizeof(int))
{
using _FixedInt = __fixed_size_storage_t<int, _Np>;
const auto __as_int = __vector_bitcast<int, _Np>(__tmp);
if constexpr (_FixedInt::_S_tuple_size == 1)
return {__as_int};
else if constexpr (_FixedInt::_S_tuple_size == 2
&& is_same_v<
typename _FixedInt::_SecondType::_FirstAbi,
simd_abi::scalar>)
return {__extract<0, 2>(__as_int), __as_int[_Np - 1]};
else if constexpr (_FixedInt::_S_tuple_size == 2)
return {__extract<0, 2>(__as_int),
__auto_bitcast(__extract<1, 2>(__as_int))};
else
__assert_unreachable<_Tp>();
}
else if constexpr (_Np == 2 && sizeof(_I) == 8
&& __fixed_size_storage_t<int, _Np>::_S_tuple_size == 2)
{
const auto __aslong = __vector_bitcast<_LLong>(__tmp);
return {int(__aslong[0]), {int(__aslong[1])}};
}
#if _GLIBCXX_SIMD_X86INTRIN
else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32
&& __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)),
__to_intrin(__hi128(__tmp)))};
else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64
&& __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))};
#endif // _GLIBCXX_SIMD_X86INTRIN
else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
[](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return __make_wrapper<int>(__l...);
})};
else
__assert_unreachable<_Tp>();
}
else if constexpr (_Np == 2 && sizeof(_I) == 8
&& __fixed_size_storage_t<int, _Np>::_S_tuple_size == 2)
{
const auto __aslong = __vector_bitcast<_LLong>(__tmp);
return {int(__aslong[0]), {int(__aslong[1])}};
}
#if _GLIBCXX_SIMD_X86INTRIN
else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32
&& __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)),
__to_intrin(__hi128(__tmp)))};
else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64
&& __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))};
#endif // _GLIBCXX_SIMD_X86INTRIN
else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
[](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
return __make_wrapper<int>(__l...);
})};
else
__assert_unreachable<_Tp>();
}
// _S_increment & _S_decrement{{{2
@ -2785,11 +2797,23 @@ template <typename _Abi, typename>
return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits());
}
else
return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
_S_size<_Tp>>(
__data(__x));
}
{
if constexpr(__is_sve_abi<_UAbi>())
{
simd_mask<_Tp> __r(false);
constexpr size_t __min_size = std::min(__r.size(), __x.size());
__execute_n_times<__min_size>(
[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__r[__i] = __x[__i];
});
return __data(__r);
}
else
return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
_S_size<_Tp>>(
__data(__x));
}
}
// }}}
// _S_masked_load {{{2
template <typename _Tp, size_t _Np>

View File

@ -28,6 +28,18 @@
#if __cplusplus >= 201703L
_GLIBCXX_SIMD_BEGIN_NAMESPACE
// Sequential (element-by-element) conversion fallback: converts each of the
// _Np lanes of __a to _To via static_cast and stores it into a
// default-initialized _Ret with _M_set.  Used where no vectorized conversion
// between the two representations exists (per the uses below: whenever SVE
// is involved on either side of the conversion).
template <typename _Arg, typename _Ret, typename _To, size_t _Np>
_Ret __converter_fallback(_Arg __a)
{
_Ret __ret{};
__execute_n_times<_Np>(
[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
__ret._M_set(__i, static_cast<_To>(__a[__i]));
});
return __ret;
}
// _SimdConverter scalar -> scalar {{{
template <typename _From, typename _To>
struct _SimdConverter<_From, simd_abi::scalar, _To, simd_abi::scalar,
@ -56,14 +68,15 @@ template <typename _From, typename _To, typename _Abi>
};
// }}}
// _SimdConverter "native 1" -> "native 2" {{{
// _SimdConverter "native non-sve 1" -> "native non-sve 2" {{{
template <typename _From, typename _To, typename _AFrom, typename _ATo>
struct _SimdConverter<
_From, _AFrom, _To, _ATo,
enable_if_t<!disjunction_v<
__is_fixed_size_abi<_AFrom>, __is_fixed_size_abi<_ATo>,
is_same<_AFrom, simd_abi::scalar>, is_same<_ATo, simd_abi::scalar>,
conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>>>>
conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>>
&& !(__is_sve_abi<_AFrom>() || __is_sve_abi<_ATo>())>>
{
using _Arg = typename _AFrom::template __traits<_From>::_SimdMember;
using _Ret = typename _ATo::template __traits<_To>::_SimdMember;
@ -75,6 +88,26 @@ template <typename _From, typename _To, typename _AFrom, typename _ATo>
{ return __vector_convert<_V>(__a, __more...); }
};
// }}}
// _SimdConverter "native 1" -> "native 2" {{{
// _SimdConverter specialization selected when both ABIs are native
// (not fixed_size, not scalar, not the identity conversion) and at least one
// of them is an SVE ABI.  No vectorized conversion is implemented for this
// combination, so it delegates to the sequential __converter_fallback above.
template <typename _From, typename _To, typename _AFrom, typename _ATo>
struct _SimdConverter<
_From, _AFrom, _To, _ATo,
enable_if_t<!disjunction_v<
__is_fixed_size_abi<_AFrom>, __is_fixed_size_abi<_ATo>,
is_same<_AFrom, simd_abi::scalar>, is_same<_ATo, simd_abi::scalar>,
conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>>
&& (__is_sve_abi<_AFrom>() || __is_sve_abi<_ATo>())
>>
{
// ABI-specific storage types for the source and destination element types.
using _Arg = typename _AFrom::template __traits<_From>::_SimdMember;
using _Ret = typename _ATo::template __traits<_To>::_SimdMember;
_GLIBCXX_SIMD_INTRINSIC constexpr _Ret
operator()(_Arg __x) const noexcept
{ return __converter_fallback<_Arg, _Ret, _To, simd_size_v<_To, _ATo>>(__x); }
};
// }}}
// _SimdConverter scalar -> fixed_size<1> {{{1
template <typename _From, typename _To>
@ -111,6 +144,10 @@ template <typename _From, typename _To, int _Np>
if constexpr (is_same_v<_From, _To>)
return __x;
// fallback to sequential when sve is available
else if constexpr (__have_sve)
return __converter_fallback<_Arg, _Ret, _To, _Np>(__x);
// special case (optimize) int signedness casts
else if constexpr (sizeof(_From) == sizeof(_To)
&& is_integral_v<_From> && is_integral_v<_To>)
@ -275,11 +312,14 @@ template <typename _From, typename _Ap, typename _To, int _Np>
"_SimdConverter to fixed_size only works for equal element counts");
using _Ret = __fixed_size_storage_t<_To, _Np>;
using _Arg = typename _SimdTraits<_From, _Ap>::_SimdMember;
_GLIBCXX_SIMD_INTRINSIC constexpr _Ret
operator()(typename _SimdTraits<_From, _Ap>::_SimdMember __x) const noexcept
operator()(_Arg __x) const noexcept
{
if constexpr (_Ret::_S_tuple_size == 1)
if constexpr (__have_sve)
return __converter_fallback<_Arg, _Ret, _To, _Np>(__x);
else if constexpr (_Ret::_S_tuple_size == 1)
return {__vector_convert<typename _Ret::_FirstType::_BuiltinType>(__x)};
else
{
@ -316,12 +356,15 @@ template <typename _From, int _Np, typename _To, typename _Ap>
"_SimdConverter to fixed_size only works for equal element counts");
using _Arg = __fixed_size_storage_t<_From, _Np>;
using _Ret = typename _SimdTraits<_To, _Ap>::_SimdMember;
_GLIBCXX_SIMD_INTRINSIC constexpr
typename _SimdTraits<_To, _Ap>::_SimdMember
operator()(const _Arg& __x) const noexcept
_Ret
operator()(const _Arg& __x) const noexcept
{
if constexpr (_Arg::_S_tuple_size == 1)
if constexpr(__have_sve)
return __converter_fallback<_Arg, _Ret, _To, _Np>(__x);
else if constexpr (_Arg::_S_tuple_size == 1)
return __vector_convert<__vector_type_t<_To, _Np>>(__x.first);
else if constexpr (_Arg::_S_is_homogeneous)
return __call_with_n_evaluations<_Arg::_S_tuple_size>(

View File

@ -61,6 +61,16 @@
#else
#define _GLIBCXX_SIMD_HAVE_NEON_A64 0
#endif
#if (__ARM_FEATURE_SVE_BITS > 0 && __ARM_FEATURE_SVE_VECTOR_OPERATORS==1)
#define _GLIBCXX_SIMD_HAVE_SVE 1
#else
#define _GLIBCXX_SIMD_HAVE_SVE 0
#endif
#ifdef __ARM_FEATURE_SVE2
#define _GLIBCXX_SIMD_HAVE_SVE2 1
#else
#define _GLIBCXX_SIMD_HAVE_SVE2 0
#endif
//}}}
// x86{{{
#ifdef __MMX__
@ -267,7 +277,7 @@
#define _GLIBCXX_SIMD_IS_UNLIKELY(__x) __builtin_expect(__x, 0)
#define _GLIBCXX_SIMD_IS_LIKELY(__x) __builtin_expect(__x, 1)
#if __STRICT_ANSI__ || defined __clang__
#if _GLIBCXX_SIMD_HAVE_SVE || __STRICT_ANSI__ || defined __clang__
#define _GLIBCXX_SIMD_CONSTEXPR
#define _GLIBCXX_SIMD_USE_CONSTEXPR_API const
#else

View File

@ -652,6 +652,18 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
(*__exp)[0] = __tmp;
return __r;
}
else if constexpr (__is_sve_abi<_Abi>())
{
simd<_Tp, _Abi> __r;
__execute_n_times<simd_size_v<_Tp, _Abi>>(
[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
int __tmp;
const auto __ri = std::frexp(__x[__i], &__tmp);
(*__exp)[__i] = __tmp;
__r[__i] = __ri;
});
return __r;
}
else if constexpr (__is_fixed_size_abi_v<_Abi>)
return {__private_init, _Abi::_SimdImpl::_S_frexp(__data(__x), __data(*__exp))};
#if _GLIBCXX_SIMD_X86INTRIN
@ -1135,7 +1147,8 @@ _GLIBCXX_SIMD_CVTING2(hypot)
_GLIBCXX_SIMD_USE_CONSTEXPR_API _V __inf(__infinity_v<_Tp>);
#ifndef __FAST_MATH__
if constexpr (_V::size() > 1 && __have_neon && !__have_neon_a32)
if constexpr (_V::size() > 1
&& __is_neon_abi<_Abi>() && __have_neon && !__have_neon_a32)
{ // With ARMv7 NEON, we have no subnormals and must use slightly
// different strategy
const _V __hi_exp = __hi & __inf;

File diff suppressed because it is too large Load Diff

View File

@ -80,6 +80,9 @@
#include "bits/simd_x86.h"
#elif _GLIBCXX_SIMD_HAVE_NEON
#include "bits/simd_neon.h"
#if _GLIBCXX_SIMD_HAVE_SVE
#include "bits/simd_sve.h"
#endif
#elif __ALTIVEC__
#include "bits/simd_ppc.h"
#endif

View File

@ -29,6 +29,9 @@ template <class T>
invoke_test<simd<T, simd_abi::scalar>>(int());
invoke_test<simd<T, simd_abi::_VecBuiltin<16>>>(int());
invoke_test<simd<T, simd_abi::_VecBltnBtmsk<64>>>(int());
invoke_test<simd<T, simd_abi::_SveAbi<16>>>(int());
invoke_test<simd<T, simd_abi::_SveAbi<32>>>(int());
invoke_test<simd<T, simd_abi::_SveAbi<64>>>(int());
#elif EXTENDEDTESTS == 0
invoke_test<simd<T, simd_abi::_VecBuiltin<8>>>(int());
invoke_test<simd<T, simd_abi::_VecBuiltin<12>>>(int());