518 class vectorized_struct<T, N> :
public detail::simdize_template_arguments_t<T, N>
// Member type aliases and compile-time index sequences used by the members below.
// tuple_type is the nested `type` of detail::make_simd_tuple<T, N>.
520 using tuple_type =
typename detail::make_simd_tuple<T, N>::type;
// base_type names the same type this class publicly derives from.
521 using base_type = detail::simdize_template_arguments_t<T, N>;
// detail::flat_member_count_v<T> (presumably the number of flattened members of T)
// and an index sequence [0, _flat_member_count) over it.
523 static constexpr auto _flat_member_count = detail::flat_member_count_v<T>;
524 static constexpr auto _flat_member_idx_seq = std::make_index_sequence<_flat_member_count>();
// Index sequence [0, _struct_size); _struct_size is declared outside this excerpt.
526 static constexpr auto _struct_size_idx_seq = std::make_index_sequence<_struct_size>();
// Const accessor returning a `const base_type&`. It is used by the store path and the
// operator forwarders to reach the base_type interface of this object.
// NOTE(review): the body is not part of this excerpt; presumably it returns *this
// viewed as its base class - confirm.
532 constexpr base_type
const&
533 _as_base_type()
const
// Public member types and constants.
// value_type is the scalar struct type T that is being vectorized.
537 using value_type = T;
// mask_type is taken from flat element 0 of base_type.
538 using mask_type =
typename detail::flat_element_t<0, base_type>::mask_type;
// The element count N, exposed as the constant wrapper vir::cw<N>.
540 static constexpr auto size = vir::cw<N>;
// memory_alignment<U> is alignof(U); U defaults to T.
542 template <
typename U = T>
543 inline static constexpr std::size_t memory_alignment =
alignof(U);
// Forwarding constructor: participates in overload resolution only when base_type can
// be brace-initialized from the given arguments (see the requires-expression), and
// then brace-initializes the base with exactly those arguments.
// static_cast<Ts&&>(args) forwards each argument with its original value category.
545 template <
typename... Ts>
546 requires requires(Ts&&... args) { base_type{
static_cast<Ts&&
>(args)...}; }
547 VIR_ALWAYS_INLINE
constexpr
548 vectorized_struct(Ts&&... args)
549 : base_type{
static_cast<Ts&&
>(args)...}
// Non-explicit constructor taking a `const base_type&`.
// NOTE(review): the member-initializer and body are not part of this excerpt;
// presumably the base subobject is copy-initialized from `init` - confirm.
552 VIR_ALWAYS_INLINE
constexpr
553 vectorized_struct(
const base_type& init)
// Converting constructor from another reflectable struct U.
// Constraint (partially visible): a detail::test_all_of check with std::is_constructible,
// pairing std::tuple_element of tuple_type with struct_element of U over
// _struct_size_idx_seq, must hold.
// The constructor is explicit whenever the analogous std::is_convertible check
// (struct_element of U -> tuple_element of tuple_type) yields false.
// The base is initialized from the result of an immediately-invoked lambda expanded
// over _struct_size_idx_seq; the lambda body is not part of this excerpt.
558 template <reflectable_struct U>
560 and detail::test_all_of<std::is_constructible, std::tuple_element, tuple_type,
561 struct_element, U>(_struct_size_idx_seq).value)
563 explicit(not detail::test_all_of<std::is_convertible, struct_element, U,
564 std::tuple_element, tuple_type>(_struct_size_idx_seq).value)
565 vectorized_struct(
const U& init)
566 : base_type([&]<std::size_t... Is>(std::index_sequence<Is...>) {
568 }(_struct_size_idx_seq))
// Element access: returns a T by value (not a reference) that is brace-initialized
// from element `i` of every flat member, in flat-member order.
// The pack expansion over _flat_member_idx_seq yields one initializer per flat member.
571 VIR_ALWAYS_INLINE
constexpr T
572 operator[](std::size_t i)
const
574 return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
575 return T{detail::flat_get<Is>(*
this)[i]...};
576 }(_flat_member_idx_seq);
// Builds a base_type from the array of T objects starting at `addr`.
//
// When VIR_HAVE_WORKING_SHUFFLEVECTOR is set and the call is not constant-evaluated,
// several layout-specific paths read the array as raw bytes (memcpy) and de-interleave
// the members with vector shuffles/blends. In all other cases the generic fallback at
// the end of the function is used, which reads every member of every addr[i]
// individually.
// NOTE(review): several statements of this function are not part of this excerpt;
// the comments below only describe the visible lines.
580 static constexpr base_type
581 _load_elements_via_permute(
const T* addr)
583#if VIR_HAVE_WORKING_SHUFFLEVECTOR
// The shuffle paths are run-time only. They additionally require that N is a power of
// two greater than 1, that N objects of T occupy exactly sizeof(base_type) bytes, and
// that T has at least two flat members.
584 if (not std::is_constant_evaluated())
585 if constexpr (N > 1 and
sizeof(T) * N ==
sizeof(base_type) and _flat_member_count >= 2
586 and std::has_single_bit(
unsigned(N)))
// Byte view of the source array; used as the memcpy source in the paths below.
588 const std::byte* byte_ptr =
reinterpret_cast<const std::byte*
>(addr);
591 using V0 = detail::flat_element_t<0, tuple_type>;
592 using V1 = detail::flat_element_t<1, tuple_type>;
// Path for exactly two flat members and N > 2.
593 if constexpr (_flat_member_count == 2 and N > 2)
595 static_assert(N == V0::size());
596 constexpr int N2 = N / 2;
597 using U =
typename V0::value_type;
600 and std::has_single_bit(V0::size())
601 and V0::size() <= stdx::native_simd<U>::size())
603 using V [[gnu::vector_size(
sizeof(U) * N)]] = U;
// With Is = 0..N2-1 the two shuffles below select, from the concatenation of x0 and
// x1, the even-indexed elements (first result) and the odd-indexed elements (second
// result), i.e. they de-interleave the two members.
611 return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
614 __builtin_shufflevector(x0, x1, (Is * 2)..., (N + Is * 2)...)),
616 __builtin_shufflevector(x0, x1, (1 + Is * 2)..., (1 + N + Is * 2)...))
618 }(std::make_index_sequence<N2>());
// Special case: T decomposes (via vir::as_tuple_t) into exactly three floats.
654 else if constexpr (std::same_as<vir::as_tuple_t<T>, std::tuple<float, float, float>>)
// Variant for three identical, trivially copyable 8-element vectors.
656 if constexpr (std::same_as<tuple_type, std::tuple<V0, V0, V0>>
657 and V0::size() == 8 and std::is_trivially_copyable_v<V0>)
// 32-byte GNU vector of 8 floats.
669 using v8sf [[gnu::vector_size(32)]] = float;
672 v8sf x0, x1, x2, a0, b0, c0;
// Read three consecutive 32-byte chunks (96 bytes in total).
673 std::memcpy(&x0, byte_ptr + 0, 32);
674 std::memcpy(&x1, byte_ptr + 32, 32);
675 std::memcpy(&x2, byte_ptr + 64, 32);
// Each result is assembled by two successive detail::blend calls over x0, x1 and x2.
// NOTE(review): the exact lane-selection semantics of detail::blend<...> are defined
// elsewhere - confirm before changing the index lists.
677 a0 = detail::blend<1, 4, 7>(x0, x1);
678 a0 = detail::blend<2, 5>(a0, x2);
681 b0 = detail::blend<2, 5>(x0, x1);
682 b0 = detail::blend<0, 3, 6>(b0, x2);
685 c0 = detail::blend<0, 3, 6>(x0, x1);
686 c0 = detail::blend<1, 4, 7>(c0, x2);
689 V0 a = std::bit_cast<V0>(a0);
692 return base_type {a, b, c};
// A second 8-float variant follows; the lines that select between the two variants
// are not part of this excerpt.
697 std::memcpy(&a, byte_ptr + 0, 32);
698 std::memcpy(&b, byte_ptr + 32, 32);
699 std::memcpy(&c, byte_ptr + 64, 32);
// ac0 = low halves of a and c; ac1 = high halves of a and c (indices 8..15 address c).
702 v8sf ac0 = __builtin_shufflevector(a, c, 0, 1, 2, 3, 8, 9, 10, 11);
705 v8sf ac1 = __builtin_shufflevector(a, c, 4, 5, 6, 7, 12, 13, 14, 15);
708 V0 tmp0 = std::bit_cast<V0>(detail::blend<2, 5>(
709 detail::blend<1, 4, 7>(ac0, b), ac1));
712 V0 tmp1 = std::bit_cast<V0>(detail::blend<0, 3, 6>(
713 detail::blend<2, 5>(ac0, b), ac1));
716 V0 tmp2 = std::bit_cast<V0>(detail::blend<1, 4, 7>(
717 detail::blend<0, 3, 6>(ac0, b), ac1));
// Index lookup tables mapping a lane index i to a source lane; the enclosing
// constructs that consume them are not part of this excerpt.
721 return std::array{0, 3, 2, 1, 4, 7, 6, 5}[i];
724 return std::array{1, 0, 3, 2, 5, 4, 7, 6}[i];
727 return std::array{2, 1, 0, 3, 6, 5, 4, 7}[i];
// Variant for three identical, trivially copyable 4-element vectors.
732 if constexpr (std::same_as<tuple_type, std::tuple<V0, V0, V0>>
733 and V0::size() == 4 and std::is_trivially_copyable_v<V0>)
// 16-byte GNU vector of 4 floats.
739 using v4sf [[gnu::vector_size(16)]] = float;
740 v4sf abca, bcab, cabc, a, b, c;
// Read three consecutive 16-byte chunks (48 bytes in total). For consecutive
// {a, b, c} float triples the chunks hold the members in the order their names spell.
741 std::memcpy(&abca, byte_ptr + 0, 16);
742 std::memcpy(&bcab, byte_ptr + 16, 16);
743 std::memcpy(&cabc, byte_ptr + 32, 16);
746 a = __builtin_shufflevector(abca, detail::blend<2, 3>(cabc, bcab), 0, 3, 6, 5);
749 b = __builtin_shufflevector(detail::blend<0>(abca, bcab),
750 detail::blend<2>(bcab, cabc),
753 c = __builtin_shufflevector(detail::blend<2, 3>(bcab, abca),
756 return base_type {std::bit_cast<V0>(a), std::bit_cast<V0>(b),
757 std::bit_cast<V0>(c)};
// Generic path for exactly three flat members and N > 2.
760 if constexpr (_flat_member_count == 3 and N > 2)
762 using V2 = detail::flat_element_t<2, tuple_type>;
763 static_assert(N == V0::size());
764 static_assert(std::has_single_bit(
unsigned(N)));
765 using U =
typename V0::value_type;
// Only applicable when all three flat members of T have the same size as U and V0 has
// no more elements than stdx::native_simd<U>.
766 if constexpr (
sizeof(U) ==
sizeof(detail::flat_element_t<0, T>)
767 and
sizeof(U) ==
sizeof(detail::flat_element_t<1, T>)
768 and
sizeof(U) ==
sizeof(detail::flat_element_t<2, T>)
769 and V0::size() <= stdx::native_simd<U>::size())
// GNU vector of N elements of type U.
771 using V [[gnu::vector_size(
sizeof(U) * N)]] = U;
// Load three consecutive V-sized chunks from the source bytes.
775 std::memcpy(&x0, byte_ptr,
sizeof(V));
777 std::memcpy(&x1, byte_ptr +
sizeof(V),
sizeof(V));
779 std::memcpy(&x2, byte_ptr + 2 *
sizeof(V),
sizeof(V));
// For member k (0, 1, 2): the inner shuffle keeps, lane by lane, the element of x0 or
// x1 whose position in the x0|x1 concatenation is congruent to k modulo 3; all other
// lanes are -1 (don't care). The outer shuffle then combines that with x2 so that
// lane Is ends up holding input element 3 * Is + k of the x0|x1|x2 concatenation.
781 return [&]<
int... Is>(std::integer_sequence<int, Is...>)
782 VIR_LAMBDA_ALWAYS_INLINE {
785 __builtin_shufflevector(
786 __builtin_shufflevector(
787 x0, x1, (Is % 3 == 0 ? Is : (Is + N) % 3 == 0 ? Is + N : -1)...),
788 x2, (Is * 3 < N ? Is * 3 : Is * 3 - N)...)),
790 __builtin_shufflevector(
791 __builtin_shufflevector(
792 x0, x1, (Is % 3 == 1 ? Is : (Is + N) % 3 == 1 ? Is + N : -1)...),
793 x2, (Is * 3 + 1 < N ? Is * 3 + 1 : Is * 3 - N + 1)...)),
795 __builtin_shufflevector(
796 __builtin_shufflevector(
797 x0, x1, (Is % 3 == 2 ? Is : (Is + N) % 3 == 2 ? Is + N : -1)...),
798 x2, (Is * 3 + 2 < N ? Is * 3 + 2 : Is * 3 - N + 2)...))
800 }(std::make_integer_sequence<int, N>());
// Generic fallback: construct each flat member Is of the result from a generator that
// returns flat member Is of addr[i] for lane i.
807 return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
808 return base_type {detail::flat_element_t<Is, tuple_type>([&](
size_t i) {
809 return detail::flat_get<Is>(addr[i]);
811 }(_flat_member_idx_seq);
// Writes the elements of this object to the array of T starting at `addr`.
// The integer_sequence argument only carries the index pack Is... used by the
// shuffles; callers in this class pass std::make_integer_sequence<int, N>().
//
// When VIR_HAVE_WORKING_SHUFFLEVECTOR is set and the call is not constant-evaluated,
// a shuffle-based path interleaves three equally sized flat members and writes the
// result with three memcpy calls. The visible fallback at the end stores
// element by element via operator[].
// NOTE(review): several statements of this function (including the template head and
// some branch conditions) are not part of this excerpt.
815 VIR_ALWAYS_INLINE
constexpr void
816 _store_elements_via_permute(T* addr, std::integer_sequence<int, Is...>)
const
818#if VIR_HAVE_WORKING_SHUFFLEVECTOR
// Run-time only; requires N > 2, N a power of two, at least two flat members, and that
// N objects of T occupy exactly sizeof(base_type) bytes.
819 if (not std::is_constant_evaluated())
820 if constexpr (N > 2 and _flat_member_count >= 2
821 and
sizeof(T) * N ==
sizeof(base_type)
822 and std::has_single_bit(
unsigned(N)))
824 using V0 = detail::flat_element_t<0, tuple_type>;
826 static_assert(N == V0::size());
827 using U =
typename V0::value_type;
// GNU vector of N elements of type U.
828 using V [[gnu::vector_size(
sizeof(U) * N)]] = U;
// Byte view of the destination array; used as the memcpy target below.
829 std::byte* byte_ptr =
reinterpret_cast<std::byte*
>(addr);
// Three-member path, limited to vectors with no more elements than
// stdx::native_simd<U>.
831 if constexpr (_flat_member_count == 3
832 and V0::size() <= stdx::native_simd<U>::size())
// All three flat members of T must have the same size as U.
835 if constexpr (
sizeof(U) ==
sizeof(detail::flat_element_t<0, T>)
836 and
sizeof(U) ==
sizeof(detail::flat_element_t<1, T>)
837 and
sizeof(U) ==
sizeof(detail::flat_element_t<2, T>))
// Reinterpret the three flat members as GNU vectors of type V.
840 V a = std::bit_cast<V>(detail::flat_get<0>(_as_base_type()));
842 V b = std::bit_cast<V>(detail::flat_get<1>(_as_base_type()));
844 V c = std::bit_cast<V>(detail::flat_get<2>(_as_base_type()));
// idx computes one shuffle index per lane i for the first shuffle stage.
// NOTE(review): the conditions selecting between the three return statements are not
// part of this excerpt; the middle one adds N, i.e. it addresses the second shuffle
// operand.
845 constexpr auto idx = [](
int i,
int pos,
int a,
int b,
int c) {
846 constexpr int N3 = N / 3;
850 return i / 3 + a + pos * N3;
852 return i / 3 + b + pos * N3 + N;
854 return i / 3 + c + ((pos + N % 3) % 3) * N3;
856 __builtin_unreachable();
// N is a power of two greater than 2 here, so N % 3 is either 1 or 2 (never 0); the
// two cases need different index parameters and a different pairing in stage two.
859 if constexpr (N % 3 == 1)
// Stage one: build three intermediate vectors from pairs of members.
862 const V aba = __builtin_shufflevector(a, b, idx(Is, 0, 0, 0, 1)...);
864 const V bcb = __builtin_shufflevector(b, c, idx(Is, 1, 0, 0, 1)...);
866 const V cac = __builtin_shufflevector(c, a, idx(Is, 2, 0, 1, 0)...);
// Stage two: lanes with Is % 3 == 2 are taken from the second operand, all other
// lanes from the first. a, b, c are reused to hold the three output chunks.
868 a = __builtin_shufflevector(aba, cac, (Is % 3 != 2 ? Is : Is + N)...);
870 b = __builtin_shufflevector(bcb, aba, (Is % 3 != 2 ? Is : Is + N)...);
872 c = __builtin_shufflevector(cac, bcb, (Is % 3 != 2 ? Is : Is + N)...);
876 static_assert(N % 3 == 2);
// Same two stages for N % 3 == 2, with different idx arguments and operand pairing.
878 const V aba = __builtin_shufflevector(a, b, idx(Is, 0, 0, 0, 2)...);
880 const V cac = __builtin_shufflevector(c, a, idx(Is, 1, 0, 1, 0)...);
882 const V bcb = __builtin_shufflevector(b, c, idx(Is, 2, 1, 1, 1)...);
884 a = __builtin_shufflevector(aba, cac, (Is % 3 != 2 ? Is : Is + N)...);
886 b = __builtin_shufflevector(cac, bcb, (Is % 3 != 2 ? Is : Is + N)...);
888 c = __builtin_shufflevector(bcb, aba, (Is % 3 != 2 ? Is : Is + N)...);
// Write the three output chunks back-to-back to the destination.
890 std::memcpy(byte_ptr, &a,
sizeof(V));
891 std::memcpy(byte_ptr +
sizeof(V), &b,
sizeof(V));
892 std::memcpy(byte_ptr + 2 *
sizeof(V), &c,
sizeof(V));
// Generic fallback: store one T at a time, materialized through operator[].
898 for (
int i = 0; i < N; ++i)
899 addr[i] =
operator[](i);
// Load constructor: accepts a contiguous iterator whose value type is exactly T and
// initializes the base from _load_elements_via_permute applied to the iterator's
// underlying address. Flags defaults to stdx::element_aligned_tag.
// NOTE(review): the constructor's declarator line is not part of this excerpt.
907 template <std::contiguous_iterator It,
typename Flags = stdx::element_aligned_tag>
908 requires std::same_as<std::iter_value_t<It>, T>
911 : base_type(_load_elements_via_permute(
std::to_address(it)))
// Replaces the contents of this object with the elements loaded from `it`.
// `it` must be a contiguous iterator whose value type is exactly T. The Flags
// parameter is unnamed and therefore not used by this function; it defaults to
// stdx::element_aligned_tag.
// The assignment targets the base_type subobject of *this.
914 template <std::contiguous_iterator It,
typename Flags = stdx::element_aligned_tag>
915 requires std::same_as<std::iter_value_t<It>, T>
917 copy_from(It it, Flags = {})
918 {
static_cast<base_type&
>(*this) = _load_elements_via_permute(std::to_address(it)); }
// Store counterpart of copy_from: writes the elements of this object to the memory
// `it` refers to by calling _store_elements_via_permute with the iterator's underlying
// address and the index sequence 0..N-1.
// `it` must be a contiguous iterator that is also an output iterator for T.
// NOTE(review): the declarator line (function name and parameters) is not part of
// this excerpt; presumably this is copy_to(It it, Flags = {}) const - confirm.
925 template <std::contiguous_iterator It,
typename Flags = stdx::element_aligned_tag>
926 requires std::output_iterator<It, T>
929 { _store_elements_via_permute(std::to_address(it), std::make_integer_sequence<int, N>()); }
// Result-type mapping for forwarded operators: an operator result of type base_type
// is re-wrapped as vectorized_struct; any other result type R is passed through as is.
934 template <
typename R>
935 using _op_return_type = std::conditional_t<std::same_as<R, base_type>, vectorized_struct, R>;
// VIR_OPERATOR_FWD(op) defines three hidden-friend overloads of binary `operator op`:
// (vectorized_struct, vectorized_struct), (base_type, vectorized_struct) and
// (vectorized_struct, base_type). Each overload is only available when `x op x` is a
// valid expression for `base_type const&`. It applies `op` to the base_type views of
// its operands and static_casts the result to _op_return_type<decltype(result)>.
// The macro is undefined again after its uses. No comments are placed inside the
// definition because every line ends in a line-continuation backslash.
937#define VIR_OPERATOR_FWD(op) \
938 VIR_ALWAYS_INLINE friend constexpr auto \
939 operator op(vectorized_struct const& a, vectorized_struct const& b) \
940 requires requires(base_type const& x) { {x op x}; } \
942 return static_cast<_op_return_type<decltype(a._as_base_type() op b._as_base_type())>>( \
943 a._as_base_type() op b._as_base_type()); \
946 VIR_ALWAYS_INLINE friend constexpr auto \
947 operator op(base_type const& a, vectorized_struct const& b) \
948 requires requires(base_type const& x) { {x op x}; } \
950 return static_cast<_op_return_type<decltype(a op b._as_base_type())>>( \
951 a op b._as_base_type()); \
954 VIR_ALWAYS_INLINE friend constexpr auto \
955 operator op(vectorized_struct const& a, base_type const& b) \
956 requires requires(base_type const& x) { {x op x}; } \
958 return static_cast<_op_return_type<decltype(a._as_base_type() op b)>>( \
959 a._as_base_type() op b); \
978#undef VIR_OPERATOR_FWD