6#ifndef VIR_SIMD_EXECUTION_H_
7#define VIR_SIMD_EXECUTION_H_
17#if VIR_HAVE_SIMD_CONCEPTS and VIR_HAVE_SIMDIZE and VIR_HAVE_SIMD_CVT and VIR_HAVE_CONSTEXPR_WRAPPER
18#if DOXYGEN or not defined __clang_major__ or not defined _GLIBCXX_RELEASE \
19 or __clang_major__ >= 17 or _GLIBCXX_RELEASE < 13
20#define VIR_HAVE_SIMD_EXECUTION 1
31 using namespace vir::literals;
// Range overload of data_or_ptr: returns std::ranges::data(r) unchanged.
// The trailing decltype return type means the declared return type is exactly
// the type of that expression.
// NOTE(review): the template header introducing T is not visible here —
// presumably T is a deduced template parameter, in which case this overload
// only participates when std::ranges::data(r) is well-formed; confirm.
34 VIR_ALWAYS_INLINE
constexpr auto
35 data_or_ptr(T&& r) ->
decltype(std::ranges::data(r))
36 {
return std::ranges::data(r); }
39 VIR_ALWAYS_INLINE
constexpr T*
44 VIR_ALWAYS_INLINE
constexpr const T*
45 data_or_ptr(
const T* ptr)
// Builds one chunk per index in Is from consecutive V::size()-element positions
// starting at ptr (constructed with load flags f) and invokes fun once with all
// chunks as arguments.
// With write_back == true the chunks are constructed as V and, after fun has
// been invoked, copied back to the addresses they were loaded from; otherwise
// they are constructed as const V and nothing is stored.
50 template <
typename V,
bool write_back =
false,
typename Flags = stdx::element_aligned_tag,
52 VIR_ALWAYS_INLINE
constexpr void
53 simd_load_and_invoke(
auto&& fun,
auto ptr, Flags f, std::index_sequence<Is...>)
56 std::invoke(fun, chunks...);
57 if constexpr (write_back)
// Store every chunk back at the offset (V::size() * Is) it was loaded from.
58 (chunks.copy_to(ptr + (V::size() * Is), f), ...);
59 }(std::conditional_t<write_back, V, const V>(ptr + (V::size() * Is), f)...);
// Unary transform helper: constructs one V per index in Is from
// ptr + V::size() * Is (with flags f), applies unary_op to each chunk and passes
// each result to result_op together with the chunk's element offset.
// The offset is passed as std::integral_constant<std::size_t, V::size() * Is>,
// i.e. as a compile-time value.
62 template <
typename V,
typename Flags = stdx::element_aligned_tag, std::size_t... Is>
63 VIR_ALWAYS_INLINE
constexpr void
64 simd_load_and_invoke(
auto&& unary_op,
auto ptr,
auto&& result_op, Flags f, std::index_sequence<Is...>)
66 [&](
const auto... chunks) {
// One result_op call per chunk (fold over the comma operator).
67 (std::invoke(result_op, std::invoke(unary_op, chunks),
68 std::integral_constant<std::size_t, V::size() * Is>()), ...);
69 }(V(ptr + (V::size() * Is), f)...);
// Binary transform helper: for every index in Is, constructs a V1 from
// ptr1 + size * Is (flags f1) and a V2 from ptr2 + size * Is (flags f2), applies
// binary_op to the pair and passes the result to result_op together with the
// element offset as std::integral_constant<std::size_t, size * Is>.
// V1 and V2 must have the same number of elements (static_assert below).
72 template <
typename V1,
typename V2,
typename F1 = stdx::element_aligned_tag,
73 typename F2 = stdx::element_aligned_tag, std::size_t... Is>
74 VIR_ALWAYS_INLINE
constexpr void
75 simd_load_and_invoke(
auto&& binary_op,
auto ptr1,
auto ptr2,
auto&& result_op, F1 f1, F2 f2,
76 std::index_sequence<Is...>)
78 constexpr auto size = V1::size();
79 static_assert(size == V2::size());
// Two nested immediately-invoked lambdas turn the two sets of loaded vectors
// into packs (v1s, v2s) that can be expanded together with Is.
80 [&](
const auto&... v1s) {
81 [&](
const auto&... v2s) {
82 (std::invoke(result_op, std::invoke(binary_op, v1s, v2s),
83 std::integral_constant<std::size_t, size * Is>()), ...);
84 }(V2(ptr2 + (size * Is), f2)...);
85 }(V1(ptr1 + (size * Is), f1)...);
// Index sequence containing only the index 0. Passed as the Is pack of
// simd_load_and_invoke when exactly one chunk should be processed per call.
88 static constexpr auto no_unroll = std::make_index_sequence<1>();
90 template <
class V,
bool write_back, std::
size_t max_elements>
91 VIR_ALWAYS_INLINE
void
92 simd_for_each_jumptable_prologue(
auto&& fun,
auto ptr, std::size_t to_process)
96#define VIR_CASE(N, M) \
98 static_assert(std::has_single_bit(unsigned(N - M))); \
99 static_assert(N - M < M or M == 0); \
100 static_assert(((N - M) & M) == 0); \
103 if constexpr (N >= max_elements) \
105 simd_load_and_invoke<vir::simdize<typename V::value_type, N - M>, write_back>( \
106 fun, ptr, stdx::vector_aligned, no_unroll); \
// Invokes fun on to_process elements starting at ptr, decomposing to_process
// into power-of-two sized chunks: each recursion level tests the bit
// V::size() of to_process and then recurses with a vector type twice as wide,
// as long as that width stays below max_elements.
// All loads/stores use stdx::vector_aligned.
// Requirements (static_asserts): max_elements > 1 and a power of two;
// V::size() a power of two.
// NOTE(review): several statements (guarded returns, pointer advancement
// between chunks) are not visible in this excerpt; confirm against the full
// source before relying on details.
182 template <
class V,
bool write_back, std::
size_t max_elements>
183 VIR_ALWAYS_INLINE
constexpr void
184 simd_for_each_prologue(
auto&& fun,
auto ptr, std::size_t to_process)
186 static_assert(max_elements > 1);
187 static_assert(std::has_single_bit(max_elements));
188 static_assert(std::has_single_bit(
unsigned(V::size())));
// Counts of 0 or >= max_elements are treated specially; the guarded statement
// is not visible here (presumably an early return — TODO confirm).
194 if (to_process == 0 or to_process >= max_elements)
// With max_elements == 2 the only remaining possibility is a single chunk of V.
197 if constexpr (max_elements == 2)
199 simd_load_and_invoke<V, write_back>(fun, ptr, stdx::vector_aligned, no_unroll);
// Unless optimizing for size: outside of constant evaluation, the scalar entry
// level (V::size() == 1) with max_elements < 64 is delegated to the jump-table
// implementation.
202#if !__OPTIMIZE_SIZE__
203 if constexpr (V::size() == 1 and max_elements < 64)
204 if (not std::is_constant_evaluated())
205 return simd_for_each_jumptable_prologue<V, write_back, max_elements>(
206 fun, ptr, to_process);
// Bit V::size() is set in to_process: handle one chunk of exactly that width.
209 if (V::size() & to_process)
211 simd_load_and_invoke<V, write_back>(fun, ptr, stdx::vector_aligned, no_unroll);
// Continue with the next higher bit using a vector with twice the elements.
214 if constexpr (V::size() * 2 < max_elements)
216 simd_for_each_prologue<
vir::simdize<
typename V::value_type, V::size() * 2>,
217 write_back, max_elements>(fun, ptr, to_process);
221 template <
class V0,
bool write_back>
223 simd_for_each_jumptable_epilogue(
auto&& fun,
unsigned leftover,
auto last,
auto f)
228 auto ptr = std::to_address(last - leftover);
231#define VIR_CASE(N, M) \
235 if constexpr (N >= V0::size()) \
237 simd_load_and_invoke<vir::simdize<typename V0::value_type, N - M>, write_back>( \
238 fun, ptr, f, no_unroll); \
// Invokes fun on the `leftover` elements that end at `last` (chunk addresses
// are computed as last - leftover), using load/store flags f.
// Works from wide to narrow: V has half of bit_ceil(V0::size()) elements; if
// that bit is set in leftover, one V chunk is processed and leftover is reduced
// by V::size(); then the function recurses with V in place of V0 while
// V::size() > 1.
// Requires V0::size() > 1.
// NOTE(review): V1, the tail of the `V0::size() <= 64` condition and some
// guarded statements are not visible in this excerpt.
314 template <
class V0,
bool write_back>
315 VIR_ALWAYS_INLINE
constexpr void
316 simd_for_each_epilogue(
auto&& fun,
unsigned leftover,
auto last,
auto f)
322 static_assert(V0::size() > 1);
// Two-element vectors: the single chunk handled here is the element at last - 1.
324 if constexpr (V0::size() == 2)
326 simd_load_and_invoke<V1, write_back>(fun, std::to_address(last - 1), f, no_unroll);
// Small vectors: outside of constant evaluation use the jump-table variant.
329 if constexpr (V0::size() <= 4)
331 if (not std::is_constant_evaluated())
332 return simd_for_each_jumptable_epilogue<V0, write_back>(fun, leftover, last, f);
// Unless optimizing for size, the jump table is also used for wider vectors
// (condition only partially visible here).
334#if !__OPTIMIZE_SIZE__
337 if constexpr (V0::size() <= 64
341 if (not std::is_constant_evaluated())
342 return simd_for_each_jumptable_epilogue<V0, write_back>(fun, leftover, last, f);
// Next narrower power-of-two width below V0::size().
345 using V =
vir::simdize<
typename V0::value_type, std::bit_ceil(
unsigned(V0::size())) / 2>;
346 if (leftover & V::size())
348 static_assert(std::has_single_bit(
unsigned(V::size())));
349 simd_load_and_invoke<V, write_back>(fun, std::to_address(last - leftover), f,
// These V::size() elements are done; the rest is handled by narrower chunks.
351 leftover -= V::size();
353 if constexpr (V::size() > 1)
355 simd_for_each_epilogue<V, write_back>(fun, leftover, last, f);
// Empty tag types used as entries of simd_policy's Options pack; simd_policy
// detects each of them with std::same_as to set the corresponding flag.
// Tag behind simd_policy::_prefers_aligned.
358 struct simd_policy_prefer_aligned_t {};
// Tag behind simd_policy::_auto_prologue.
360 struct simd_policy_auto_prologue_t {};
// Tag behind simd_policy::_assume_matching_size.
362 struct simd_policy_assume_matching_size_t {};
365 struct simd_policy_unroll_by_t
368 template <
typename T>
369 struct simd_policy_unroll_value
370 : std::integral_constant<int, 0>
374 struct simd_policy_unroll_value<simd_policy_unroll_by_t<N>>
375 : std::integral_constant<int, N>
379 struct simd_policy_size_t
382 template <
typename T>
383 struct simd_policy_size_value
384 : std::integral_constant<int, 0>
388 struct simd_policy_size_value<simd_policy_size_t<N>>
389 : std::integral_constant<int, N>
392 template <
typename T>
393 struct is_simd_policy
398 template <
typename T>
404 template <
typename Rng>
412 template <
typename It>
424 template <
typename... Options>
// True if any entry of Options is detail::simd_policy_prefer_aligned_t.
427 static constexpr bool _prefers_aligned
428 = (
false or ... or std::same_as<Options, detail::simd_policy_prefer_aligned_t>);
// True if any entry of Options is detail::simd_policy_auto_prologue_t.
430 static constexpr bool _auto_prologue
431 = (
false or ... or std::same_as<Options, detail::simd_policy_auto_prologue_t>);
// True if any entry of Options is detail::simd_policy_assume_matching_size_t.
433 static constexpr bool _assume_matching_size
434 = (
false or ... or std::same_as<Options, detail::simd_policy_assume_matching_size_t>);
// Sum of detail::simd_policy_unroll_value over all Options: each
// simd_policy_unroll_by_t<N> contributes N, every other option contributes 0.
// 0 therefore means that no unroll factor was requested.
436 static constexpr int _unroll_by
437 = (0 + ... + detail::simd_policy_unroll_value<Options>::value);
// Sum of detail::simd_policy_size_value over all Options: each
// simd_policy_size_t<N> contributes N, every other option contributes 0.
// 0 therefore means that no explicit SIMD width was requested.
439 static constexpr int _size
440 = (0 + ... + detail::simd_policy_size_value<Options>::value);
450 static constexpr simd_policy<Options..., detail::simd_policy_prefer_aligned_t>
452 and not _assume_matching_size)
461 static constexpr simd_policy<Options..., detail::simd_policy_auto_prologue_t>
463 and not _assume_matching_size)
473 static constexpr simd_policy<Options..., detail::simd_policy_assume_matching_size_t>
475 and not _assume_matching_size)
491 static constexpr simd_policy<Options..., detail::simd_policy_unroll_by_t<N>>
494 static_assert(N > 1);
503 static constexpr simd_policy<Options..., detail::simd_policy_size_t<N>>
506 static_assert(N > 0);
533 template <
typename... Options>
534 struct is_simd_policy<
execution::simd_policy<Options...>>
// memory_alignment<V, T>: alignment (as a compile-time constant) associated
// with loading/storing V from/to memory of element type T.
// Primary template: falls back to alignof(T).
538 template <
typename V,
typename T =
typename V::value_type>
539 struct memory_alignment
540 : vir::constexpr_wrapper<alignof(T)>
// stdx::simd types: defer to stdx::memory_alignment.
543 template <
typename V,
typename T>
544 requires (stdx::is_simd_v<V>)
545 struct memory_alignment<V, T>
546 : stdx::memory_alignment<V, T>
// Non-stdx types that expose a static member template
// `memory_alignment<T>` of type std::size_t: use that value.
549 template <
typename V,
typename T>
550 requires (not stdx::is_simd_v<V>) and
requires {
551 {V::template memory_alignment<T>} -> std::same_as<std::size_t>;
553 struct memory_alignment<V, T>
554 : vir::constexpr_wrapper<V::template memory_alignment<T>>
// Convenience variable template for memory_alignment<V, T>::value.
557 template <
typename V,
typename T =
typename V::value_type>
558 inline constexpr std::size_t memory_alignment_v = memory_alignment<V, T>::value;
// Decides, for vector type V and the given execution policy, whether an
// alignment prologue runs before the main loop, and which load/store flags the
// main loop uses afterwards (see `flags`).
560 template <
typename V, simd_execution_policy ExecutionPolicy>
563 using T =
typename V::value_type;
565 static constexpr std::size_t size = V::size();
// Number of bytes consumed by one full-width iteration.
567 static constexpr auto bytes_per_iteration = size *
sizeof(T);
// A prologue is only considered when V holds more than one element, one
// iteration spans a whole multiple of V's memory alignment, and that alignment
// is stricter than the size of a single element.
569 static constexpr bool prologue_is_possible
570 = size > 1 and bytes_per_iteration % memory_alignment_v<V> == 0
571 and memory_alignment_v<V> >
sizeof(T);
// Policy asked for aligned access (prefer_aligned) and it is possible.
573 static constexpr bool use_aligned_loadstore
574 = ExecutionPolicy::_prefers_aligned and prologue_is_possible;
// Policy asked for auto_prologue and it is possible; whether the prologue
// actually runs is decided at run time in operator().
576 static constexpr bool maybe_execute_prologue_anyway
577 = ExecutionPolicy::_auto_prologue and prologue_is_possible;
// vector_aligned only when aligned load/store is in effect, else element_aligned.
579 static constexpr std::conditional_t<use_aligned_loadstore, stdx::vector_aligned_tag,
580 stdx::element_aligned_tag> flags{};
// Checks the alignment of iterator_to_align; if a prologue is executed,
// do_prologue(max_misalignment, to_process) is called, all
// iterators_to_advance are advanced by to_process and `remaining` is reduced
// by the same amount.
583 operator()(std::size_t& remaining,
auto iterator_to_align,
auto&& do_prologue,
584 auto&... iterators_to_advance)
const
586 static_assert(std::same_as<T, std::iter_value_t<
decltype(iterator_to_align)>>);
588 if constexpr (use_aligned_loadstore or maybe_execute_prologue_anyway)
// Number of T elements per alignment unit, as a compile-time constant.
590 constexpr auto max_misalignment = vir::cw<memory_alignment_v<V> /
sizeof(T)>;
// Byte offset of the address from the previous alignment boundary.
591 const auto misaligned_by_bytes
592 =
reinterpret_cast<std::uintptr_t
>(std::to_address(iterator_to_align))
593 % memory_alignment_v<V>;
// Already aligned; the guarded statement is not visible here (presumably an
// early return — TODO confirm).
594 if (misaligned_by_bytes == 0)
596 const auto misaligned_by_elements = misaligned_by_bytes /
sizeof(T);
// Elements between the current position and the next alignment boundary.
597 const auto to_process = max_misalignment - misaligned_by_elements;
// prefer_aligned: the prologue is unconditional.
598 if constexpr (use_aligned_loadstore)
599 do_prologue(max_misalignment, to_process);
// auto_prologue only: run it when the remaining range is larger than 4000
// bytes, or when every bit set in to_process is also set in remaining.
600 else if (remaining *
sizeof(T) > 4000
601 or (to_process & remaining) == to_process)
602 do_prologue(max_misalignment, to_process);
// NOTE(review): the branch for "no prologue executed" is not visible here;
// presumably it returns before the bookkeeping below — confirm.
605 ((iterators_to_advance += to_process), ...);
606 remaining -= to_process;
615 template <
typename It,
int size>
618 template <
typename... Its>
620 simdized_load_and_invoke(
auto op, vir::constexpr_value
auto size, Its... its)
623 op, iter_simdize_t<Its, size>(std::to_address(its), stdx::element_aligned)...);
626 template <
typename It0,
typename... Its>
628 simdized_load_flag1_and_invoke(
auto op, vir::constexpr_value
auto size,
auto flag0, It0 it0,
632 op, iter_simdize_t<It0, size>(std::to_address(it0), flag0),
633 iter_simdize_t<Its, size>(std::to_address(its), stdx::element_aligned)...);
// Prologue of the transform algorithm: applies op to to_process elements read
// from ptrs... and stores the results at dst_ptr with stdx::vector_aligned.
// to_process is decomposed bitwise: this level handles the bit R::size(), then
// recurses with a result type twice as wide while size * 2 < max_elements.
// Requirements (static_asserts): max_elements > 1 and a power of two;
// R::size() a power of two and smaller than max_elements.
// NOTE(review): some guarded statements (returns, dst_ptr advancement) are not
// visible in this excerpt.
636 template <
typename R,
typename... Ts>
637 VIR_ALWAYS_INLINE
constexpr void
638 simd_transform_prologue(
auto&& op,
auto dst_ptr, std::size_t to_process,
639 vir::constexpr_value
auto max_elements,
const Ts*... ptrs)
641 static_assert(max_elements > 1);
642 static_assert(std::has_single_bit(
unsigned(max_elements)));
643 constexpr auto size = vir::cw<R::size()>;
644 static_assert(std::has_single_bit(
unsigned(size)));
645 static_assert(size < max_elements);
// Counts of 0 or >= max_elements are treated specially; the guarded statement
// is not visible here (presumably an early return — TODO confirm).
652 if (to_process == 0 or to_process >= max_elements)
// Either there is only one possible chunk (max_elements == 2) or the bit for
// this chunk width is set in to_process.
655 if (max_elements == 2 or size & to_process)
657 const R& result = simdized_load_and_invoke(op, size, ptrs...);
658 result.copy_to(dst_ptr, stdx::vector_aligned);
659 if constexpr (max_elements == 2)
// Advance all input pointers past the chunk just processed.
661 ((ptrs += size), ...);
// Continue with the next higher bit, using a result type twice as wide.
664 if constexpr (size * 2 < max_elements)
666 simd_transform_prologue<vir::resize_simdize_t<size * 2, R>, Ts...>(
667 op, dst_ptr, to_process, max_elements, ptrs...);
// Epilogue of the transform algorithm: applies binary_op to the `leftover`
// elements that end at `last` (first input addressed as last - leftover, the
// remaining inputs via ptrs...) and stores the results at d_first using
// flags f.
// Each level processes a chunk of V::size() elements if that bit is set in
// leftover, advances ptrs and d_first past it, and recurses with the narrower
// types while V::size() > 1.
// Requires V0::size() > 1 and V0::size() == R0::size().
// NOTE(review): the declarations of `ptrs`, `V` and `R1` are not visible in
// this excerpt.
671 template <
typename V0,
typename R0,
typename... More>
672 VIR_ALWAYS_INLINE
constexpr void
673 simd_transform_epilogue(
auto&& binary_op,
unsigned leftover,
auto last,
auto d_first,
auto f,
682 constexpr unsigned size0 = V0::size();
683 static_assert(size0 > 1);
684 static_assert(size0 == R0::size());
685 using T0 =
typename V0::value_type;
// Two-element vectors: the chunk handled here is the single element at last - 1.
686 if constexpr (size0 == 2)
689 const R1& result = simdized_load_and_invoke(binary_op, 1_cw, last - 1, ptrs...);
690 result.copy_to(std::to_address(d_first), f);
// Result type with the same element count as the (narrower) input type V.
694 using R =
vir::simdize<
typename R0::value_type, V::size()>;
695 if (leftover & V::size())
697 static_assert(std::has_single_bit(
unsigned(V::size())));
698 const R& result = simdized_load_and_invoke(binary_op, vir::cw<V::size()>,
699 last - leftover, ptrs...);
700 result.copy_to(std::to_address(d_first), f);
// Book-keeping: V::size() elements are done on every input and on the output.
701 leftover -= V::size();
702 ((ptrs += V::size()), ...);
703 d_first += V::size();
705 if constexpr (V::size() > 1)
707 simd_transform_epilogue<V, R, More...>(binary_op, leftover, last, d_first, f, ptrs...);
// Shared implementation of the transform overloads: applies op to chunks
// loaded from [first1, last1) and the parallel inputs first2..., and stores the
// results starting at d_first.
// NOTE(review): the template header and the definitions of `distance`, `OutV`
// and `V1` are not visible in this excerpt.
713 transform(ExecutionPolicy, It1 first1, It1 last1, OutIt d_first, Operation op, It2... first2)
715 using T1 = std::iter_value_t<It1>;
716 using OutT = std::iter_value_t<OutIt>;
// SIMD width: the policy's explicit size if one was given, otherwise the
// largest iter_simdize_t<It, 0>::size() over all input iterators.
717 constexpr auto size = vir::cw<([] {
718 if constexpr (ExecutionPolicy::_size > 0)
719 return ExecutionPolicy::_size;
721 return std::max({int(iter_simdize_t<It1, 0>::size()),
722 int(iter_simdize_t<It2, 0>::size())...});
// Advances every given iterator by n.
730 auto advance = [](
int n,
auto&... it) { ((it += n), ...); };
733 constexpr bool assume_matching_size = ExecutionPolicy::_assume_matching_size;
// The policy promised that the range length is a multiple of the SIMD width;
// verify that promise.
734 if constexpr (assume_matching_size)
735 vir_simd_precondition_vaargs(
736 distance % size == 0,
"The explicit assumption, that the range size (%zu) is a multiple"
737 " of the SIMD width (%d), does not hold.", distance, size());
// Constant evaluation: full-width chunks followed by single-element chunks,
// all with element_aligned stores; no prologue and no unrolling.
739 if (std::is_constant_evaluated())
742 for (; first1 + (size - 1) < last1; advance(size, first1, d_first, first2...))
744 const OutV& result = simdized_load_and_invoke(op, size, first1, first2...);
745 result.copy_to(std::to_address(d_first), stdx::element_aligned);
748 if constexpr (!assume_matching_size)
750 for (; first1 < last1; advance(1, first1, d_first, first2...))
753 = simdized_load_and_invoke(op, 1_cw, first1, first2...);
754 result.copy_to(std::to_address(d_first), stdx::element_aligned);
// Run-time path. The prologue object aligns on the output iterator (d_first)
// and also determines the store flags used by the loops below.
766 constexpr prologue<OutV, ExecutionPolicy> p;
767 constexpr auto flags = p.flags;
768 if constexpr (!assume_matching_size)
770 p(distance, d_first, [&] (
auto max_elements,
auto to_process) {
771 simd_transform_prologue<vir::simdize<OutT, 1>, T1, std::iter_value_t<It2>...>(
772 op, std::to_address(d_first), to_process, max_elements, std::to_address(first1),
773 std::to_address(first2)...);
774 }, first1, d_first, first2...);
// Unrolled main loop: _unroll_by chunks per iteration; unroll2 receives one
// callable that loads and computes chunk i and one that stores its result.
777 if constexpr (ExecutionPolicy::_unroll_by > 1)
779 constexpr auto step = size * ExecutionPolicy::_unroll_by;
780 const auto unrolled_last = last1 - step;
781 for (; first1 <= unrolled_last; advance(step, first1, d_first, first2...))
783 unroll2<ExecutionPolicy::_unroll_by>([&,size](
auto i) {
784 return simdized_load_and_invoke(op, size, first1 + i * size,
785 first2 + i * size...);
786 }, [&,size](
auto i,
const OutV& result) {
787 result.copy_to(std::to_address(d_first + i * size), flags);
// Main loop, one full-width chunk per iteration. With assume_matching_size the
// loop runs until first1 reaches last1 exactly.
792 const auto simd_last = last1 - size;
793 for (; assume_matching_size ? first1 != last1 : first1 <= simd_last;
794 advance(size, first1, d_first, first2...))
796 const OutV& result = simdized_load_and_invoke(op, size, first1, first2...);
797 result.copy_to(std::to_address(d_first), flags);
// Remaining elements (fewer than one full chunk) are handled by the epilogue.
800 if constexpr (size > 1 and !assume_matching_size)
802 const auto leftover =
distance % size;
805 simd_transform_epilogue<V1, OutV, std::iter_value_t<It2>...>(
806 op, leftover, last1, d_first, flags, std::to_address(first2)...);
// Prologue of transform_reduce: folds to_process elements starting at first1
// (and the parallel inputs first2...) into the single-element accumulator acc1
// and returns the updated accumulator.
// to_process is decomposed bitwise: this level handles the bit `size`, then
// recurses with 2 * size while size < max_size.
// Both size and max_size must be powers of two (static_asserts).
814 template <
int size,
int max_size,
typename A1,
typename It1,
typename... It2>
815 VIR_ALWAYS_INLINE
constexpr A1
816 simd_transform_reduce_prologue(A1 acc1, It1 first1, std::size_t to_process,
817 auto reduce_op,
auto transform_op, It2... first2)
824 static_assert(std::has_single_bit(
unsigned(size)));
825 static_assert(std::has_single_bit(
unsigned(max_size)));
826 if (to_process & size)
// The bits of to_process below `size` are the element counts covered by the
// narrower recursion levels; this chunk starts right after them.
828 const std::size_t offset = to_process & (size - 1);
// Transform one chunk of `size` elements (first input loaded with
// vector_aligned) and reduce it before merging it into acc1.
829 const A1 x = reduce(simdized_load_flag1_and_invoke(
830 transform_op, vir::cw<size>, stdx::vector_aligned,
831 first1 + offset, first2 + offset...),
833 acc1 = std::invoke(reduce_op, acc1, x);
835 if constexpr (size < max_size)
836 return simd_transform_reduce_prologue<size * 2, max_size>(
837 acc1, first1, to_process, reduce_op, transform_op, first2...);
// Epilogue of transform_reduce for the case where a vector accumulator `acc`
// (A::size() a power of two) already exists next to the single-element
// accumulator acc1. Returns the final scalar (element 0 of the combined result).
// At each level: if bit A::size() is set in leftover, one more chunk of that
// width is transformed and merged into acc. Then either the reduction is
// finished (A::size() == 1, or no lower bits remain in leftover), or acc is
// split into two halves that are combined with reduce_op and the function
// recurses with the half-width accumulator.
// NOTE(review): the definition of A2 and several guarded statements are not
// visible in this excerpt.
846 template <
typename A1,
typename A,
typename It1,
typename... It2>
848 constexpr typename A1::value_type
849 simd_transform_reduce_epilogue(A1 acc1, A acc, It1 first1, std::size_t leftover,
850 auto reduce_op,
auto transform_op,
auto flags1, It2... first2)
855 static_assert(A1::size() == 1);
856 static_assert(std::has_single_bit(
unsigned(A::size())));
857 if (leftover & A::size())
// Merge one more chunk of A::size() transformed elements into acc.
859 acc = std::invoke(reduce_op, acc,
860 simdized_load_flag1_and_invoke(transform_op, vir::cw<A::size()>,
861 flags1, first1, first2...));
862 if constexpr (A::size() == 1)
864 return std::invoke(reduce_op, acc1, acc)[0];
// No narrower chunks remain: reduce acc and merge it with acc1.
865 else if ((leftover & (A::size() - 1)) == 0)
867 return std::invoke(reduce_op, acc1, A1(reduce(acc, reduce_op)))[0];
// Halve the accumulator and continue behind the chunk consumed above.
870 constexpr std::size_t size2 = A::size() / 2;
872 auto [lo, hi] = stdx::split<size2, size2>(acc);
873 A2 acc2 = std::invoke(reduce_op, lo, hi);
874 return simd_transform_reduce_epilogue(acc1, acc2, first1 + A::size(), leftover,
875 reduce_op, transform_op, flags1, first2 +
// Bit A::size() was not set in leftover: no chunk was consumed at this level.
879 else if ((leftover & (A::size() - 1)) == 0)
881 else if constexpr (A::size() > 1)
883 constexpr std::size_t size2 = A::size() / 2;
885 auto [lo, hi] = stdx::split<size2, size2>(acc);
886 A2 acc2 = std::invoke(reduce_op, lo, hi);
887 return simd_transform_reduce_epilogue(acc1, acc2, first1, leftover, reduce_op,
888 transform_op, flags1, first2...);
// Epilogue of transform_reduce for the case where no vector accumulator exists
// yet; `size` (a power of two) is the chunk width tried at this level. Returns
// the final scalar (element 0 of the combined result).
// When a chunk of `size` elements is loaded it becomes the accumulator `acc`;
// from there on the overload taking an accumulator is used. Otherwise the
// function recurses with half the width.
// NOTE(review): the definitions of A and A2 and the condition guarding the
// load are not visible in this excerpt.
897 template <std::size_t size,
typename A1,
typename It1,
typename... It2>
899 constexpr typename A1::value_type
900 simd_transform_reduce_epilogue(A1 acc1, It1 first1, std::size_t leftover,
901 auto reduce_op,
auto transform_op,
auto flags1, It2... first2)
906 static_assert(A1::size() == 1);
907 static_assert(std::has_single_bit(size));
// First chunk of this width: its transformed values initialise the accumulator.
911 A acc = simdized_load_flag1_and_invoke(transform_op, vir::cw<size>, flags1, first1,
913 if constexpr (size == 1)
915 return std::invoke(reduce_op, acc1, acc)[0];
// No narrower chunks remain: reduce acc and merge it with acc1.
916 else if ((leftover & (size - 1)) == 0)
918 return std::invoke(reduce_op, acc1, A1(reduce(acc, reduce_op)))[0];
// Halve the accumulator and continue with the accumulator-taking overload.
921 constexpr std::size_t size2 = size / 2;
923 auto [lo, hi] = stdx::split<size2, size2>(acc);
924 A2 acc2 = std::invoke(reduce_op, lo, hi);
925 return simd_transform_reduce_epilogue(acc1, acc2, first1 + A::size(), leftover,
926 reduce_op, transform_op, flags1, first2 +
930 else if ((leftover & (A::size() - 1)) == 0)
// Nothing loaded at this width: retry with half the width.
932 else if constexpr (size > 1)
934 constexpr std::size_t size2 = A::size() / 2;
935 return simd_transform_reduce_epilogue<size2>(acc1, first1, leftover, reduce_op,
936 transform_op, flags1, first2...);
// Shared implementation of the transform_reduce / reduce overloads: applies
// transform_op to chunks loaded from [first1, last1) and the parallel inputs
// first2..., and combines the results with reduce_op.
// NOTE(review): the template header and the definitions of `distance`, `acc1`,
// `A`, `A1`, `V1` and `acc2` are not visible in this excerpt.
948 transform_reduce(ExecutionPolicy, It1 first1, It1 last1, T init, BinaryReductionOp reduce_op,
949 TransformOp transform_op, It2... first2)
951 using T1 = std::iter_value_t<It1>;
// SIMD width: the policy's explicit size if one was given, otherwise the
// largest iter_simdize_t<It, 0>::size() over all input iterators.
952 constexpr int size = [] {
953 if constexpr (ExecutionPolicy::_size > 0)
954 return ExecutionPolicy::_size;
956 return std::max({int(iter_simdize_t<It1, 0>::size()),
957 int(iter_simdize_t<It2, 0>::size())...});
969 constexpr bool assume_matching_size = ExecutionPolicy::_assume_matching_size;
// The policy promised that the range length is a multiple of the SIMD width;
// verify that promise.
970 if constexpr (assume_matching_size)
971 vir_simd_precondition_vaargs(
972 distance % size == 0,
"The explicit assumption, that the range size (%zu) is a multiple"
973 " of the SIMD width (%d), does not hold.", distance, size);
// Constant evaluation: one element per iteration, merged into acc1.
975 if (std::is_constant_evaluated())
978 for (; first1 < last1; ((first1 += 1), ..., (first2 += 1)))
980 const A1 r = simdized_load_and_invoke(transform_op, 1_cw, first1, first2...);
981 acc1 = std::invoke(reduce_op, acc1, r);
// Run-time path. The prologue object aligns on first1 and also determines the
// load flags used for the first input below.
987 constexpr prologue<V1, ExecutionPolicy> p;
988 constexpr auto flags = p.flags;
989 p(distance, first1, [&] (vir::constexpr_value
auto max_elements,
auto to_process)
990 VIR_LAMBDA_ALWAYS_INLINE {
991 acc1 = simd_transform_reduce_prologue<1, max_elements>(
992 acc1, first1, to_process, reduce_op, transform_op, first2...);
993 }, first1, first2...);
995 const auto leftover =
distance % size;
// lo_size is the largest power of two below size (size / 2 when size itself is
// a power of two); hi_size is the rest.
997 constexpr int lo_size = std::bit_ceil(
unsigned(size)) / 2;
998 constexpr int hi_size = size - lo_size;
1000 const auto simd_last = last1 - size;
// At least one full-width chunk is available.
1003 if (assume_matching_size or first1 <= simd_last)
// Initialise the vector accumulator, optionally via an unrolled loop that
// keeps _unroll_by independent accumulators.
1005 A acc = [&]() VIR_LAMBDA_ALWAYS_INLINE {
1006 if constexpr (ExecutionPolicy::_unroll_by > 1)
1008 constexpr std::make_index_sequence<ExecutionPolicy::_unroll_by> unroll_idx_seq;
1009 constexpr auto step = size * ExecutionPolicy::_unroll_by;
1010 const auto unrolled_last = last1 - step;
// Only enter the unrolled path if two unrolled steps fit: one to initialise
// the accumulators and at least one iteration of the loop ending at
// `while (first1 <= unrolled_last)`.
1011 if (first1 + step <= unrolled_last)
1013 auto acc = [&]<std::size_t... Is>(std::index_sequence<Is...>)
1014 -> std::array<A, ExecutionPolicy::_unroll_by>
1015 VIR_LAMBDA_ALWAYS_INLINE
1018 [&](std::size_t offset) -> A VIR_LAMBDA_ALWAYS_INLINE {
1019 return simdized_load_flag1_and_invoke(
1020 transform_op, vir::cw<size>, flags,
1021 first1 + offset, first2 + offset...);
1026 ((first2 += step), ...);
1029 [&]<std::size_t... Is>(std::index_sequence<Is...>)
1030 VIR_LAMBDA_ALWAYS_INLINE {
1031 ([&](std::size_t i) VIR_LAMBDA_ALWAYS_INLINE {
1032 acc[i] = std::invoke(
1034 simdized_load_flag1_and_invoke(
1035 transform_op, vir::cw<size>, flags,
1036 first1 + i * size, first2 + i * size...));
1040 ((first2 += step), ...);
1042 while (first1 <= unrolled_last);
// Pairwise reduction of the accumulators: with stride j = 1, 2, 4, ...,
// acc[i] absorbs acc[i + j] for i a multiple of 2 * j.
1044 unroll<std::bit_width(
unsigned(ExecutionPolicy::_unroll_by - 1))>(
1046 constexpr int j = 1 << outer.value;
1047 unroll<ExecutionPolicy::_unroll_by / 2>([&](
auto ii) {
1048 constexpr int i = ii * 2 * j;
1049 if constexpr (i + j < ExecutionPolicy::_unroll_by)
1050 acc[i] = std::invoke(reduce_op, acc[i], acc[i+j]);
// Not unrolled (or too short for unrolling): start from a single chunk.
1056 auto ret = simdized_load_flag1_and_invoke(transform_op, vir::cw<size>, flags,
1059 ((first2 += size), ...);
// Main loop: one full-width chunk per iteration merged into acc.
1062 for (; assume_matching_size ? first1 != last1 : first1 <= simd_last;
1063 ((first1 += size), ..., (first2 += size)))
1065 acc = std::invoke(reduce_op, acc,
1066 simdized_load_flag1_and_invoke(transform_op, vir::cw<size>,
1067 flags, first1, first2...));
// Finish: either directly, or via the epilogue for the leftover elements.
1069 if constexpr (size == 1)
1070 return std::invoke(reduce_op, acc1, acc)[0];
1071 else if (assume_matching_size or leftover == 0)
1072 return std::invoke(reduce_op, acc1, A1(reduce(acc, reduce_op)))[0];
// Leftover elements exist: shrink acc to lo_size elements first. Equal halves
// are combined element-wise; otherwise the hi part is reduced into acc1.
1075 auto [lo, hi] = stdx::split<lo_size, hi_size>(acc);
1077 if constexpr (lo_size == hi_size)
1078 acc2 = std::invoke(reduce_op, lo, hi);
1080 acc1 = std::invoke(reduce_op, acc1, A1(reduce(hi, reduce_op)));
1081 return simd_transform_reduce_epilogue(acc1, acc2, first1, leftover, reduce_op,
1082 transform_op, flags, first2...);
// No full-width chunk available: the epilogue starts without a vector
// accumulator.
1085 else if constexpr (lo_size > 0)
1088 return simd_transform_reduce_epilogue<lo_size>(
1089 acc1, first1, leftover, reduce_op, transform_op, flags, first2...);
// Iterator overload of for_each: invokes fun with SIMD chunks of the elements
// in [first, last).
// NOTE(review): the template header and the definitions of `V` and `distance`
// are not visible in this excerpt.
1119 for_each([[maybe_unused]] ExecutionPolicy pol, It first, It last, F&& fun)
1121 using T = std::iter_value_t<It>;
1123 constexpr int size = V::size();
// Chunks are stored back after the call only if the iterator is writable and
// fun accepts a non-const lvalue V (invocable with V& but not with V&&).
1124 constexpr bool write_back = std::indirectly_writable<It, T>
1125 and std::invocable<F, V&> and not std::invocable<F, V&&>;
1130 constexpr bool assume_matching_size = ExecutionPolicy::_assume_matching_size;
// The policy promised that the range length is a multiple of the SIMD width;
// verify that promise.
1131 if constexpr (assume_matching_size)
1132 vir_simd_precondition_vaargs(
1133 distance % size == 0,
"The explicit assumption, that the range size (%zu) is a multiple"
1134 " of the SIMD width (%d), does not hold.", distance, size);
// Constant evaluation: full-width chunks, then the epilogue; always
// element_aligned, no prologue and no unrolling.
1136 if (std::is_constant_evaluated())
1139 for (; first + (size - 1) < last; first += size)
1140 detail::simd_load_and_invoke<V, write_back>(fun, std::to_address(first),
1141 stdx::element_aligned, detail::no_unroll);
1143 if constexpr (size > 1)
1144 detail::simd_for_each_epilogue<V, write_back>(fun, distance % size, last,
1145 stdx::element_aligned);
// Run-time path. The prologue object also determines the load/store flags
// used by the loops below.
1149 constexpr detail::prologue<V, ExecutionPolicy> prologue;
1150 constexpr auto flags = prologue.flags;
1151 prologue(distance, first, [&] (
auto max_elements,
auto to_process) {
1152 detail::simd_for_each_prologue<vir::simdize<T, 1>, write_back, max_elements>(
1153 fun, std::to_address(first), to_process);
1155 const auto leftover = distance % size;
// Unrolled main loop: fun receives _unroll_by chunks per call.
1157 if constexpr (ExecutionPolicy::_unroll_by > 1)
1159 constexpr auto step = size * ExecutionPolicy::_unroll_by;
1160 const auto unrolled_last = last - step;
1161 for (; first <= unrolled_last; first += step)
1163 detail::simd_load_and_invoke<V, write_back>(
1164 fun, std::to_address(first), flags,
1165 std::make_index_sequence<ExecutionPolicy::_unroll_by>());
// Main loop, one full-width chunk per call. With assume_matching_size the loop
// runs until first reaches last exactly.
1169 const auto simd_last = last - size;
1170 for (; assume_matching_size ? first != last : first <= simd_last; first += size)
1171 detail::simd_load_and_invoke<V, write_back>(fun, std::to_address(first), flags,
// Remaining elements (fewer than one full chunk) are handled by the epilogue.
1174 if constexpr (not assume_matching_size and size > 1)
1176 detail::simd_for_each_epilogue<V, write_back>(fun, leftover, last, flags);
1181 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range R,
1185 {
vir::for_each(pol, std::ranges::begin(rng), std::ranges::end(rng), std::forward<F>(fun)); }
// Unary transform, iterator overload: forwards to detail::transform with no
// additional input iterators.
1219 template<detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It1,
1220 detail::simd_execution_iterator OutIt,
typename UnaryOperation>
1222 transform(ExecutionPolicy pol, It1 first1, It1 last1, OutIt d_first, UnaryOperation unary_op)
1223 {
return detail::transform(pol, first1, last1, d_first, unary_op); }
// Binary transform, iterator overload: forwards to detail::transform.
// Note the argument order: detail::transform takes the output iterator and the
// operation before the second input iterator.
1226 template<detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It1,
1227 detail::simd_execution_iterator It2, detail::simd_execution_iterator OutIt,
1228 typename BinaryOperation>
1230 transform(ExecutionPolicy pol, It1 first1, It1 last1, It2 first2, OutIt d_first,
1231 BinaryOperation binary_op)
1232 {
return detail::transform(pol, first1, last1, d_first, binary_op, first2); }
// Unary transform, range overload: the input bounds come from r1; only the
// begin of d_rng is used, so the size of d_rng is not checked here.
1235 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range R1,
1236 detail::simd_execution_range R2,
typename UnaryOperation>
1238 transform(ExecutionPolicy pol, R1&& r1, R2& d_rng, UnaryOperation unary_op)
1240 return detail::transform(pol, std::ranges::begin(r1), std::ranges::end(r1),
1241 std::ranges::begin(d_rng), unary_op);
// Binary transform, range overload: the input bounds come from r1; only the
// begin of r2 and d_rng is used, so their sizes are not checked here.
1245 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range R1,
1246 detail::simd_execution_range R2, detail::simd_execution_range R3,
1247 typename BinaryOperation>
1249 transform(ExecutionPolicy pol, R1&& r1, R2&& r2, R3& d_rng, BinaryOperation binary_op)
1251 return detail::transform(pol, std::ranges::begin(r1), std::ranges::end(r1),
1252 std::ranges::begin(d_rng), binary_op, std::ranges::begin(r2));
1255#if __cpp_lib_ranges_zip >= 202110L
// transform overload for a zipped input (only compiled when
// __cpp_lib_ranges_zip >= 202110L): unpacks the zipped ranges into one pointer
// per underlying range and forwards to detail::transform. op is then called
// with the loaded chunks repacked into a std::pair (two inputs) or a
// std::tuple (any other count).
// NOTE(review): the function signature (pol, rs, d_rg, op) is not visible in
// this excerpt.
1257 template <detail::simd_execution_range... Rs>
1262 const auto size = std::ranges::size(rs);
1263 const auto it = std::ranges::begin(rs);
// Address of the first element of the first zipped range.
1264 const auto first0 = std::addressof(std::get<0>(*it));
1265 return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
1266 return detail::transform(
1267 pol, first0, first0 + size, std::ranges::begin(d_rg),
1268 [&op](
auto&&... vs) {
1269 if constexpr (
sizeof...(vs) == 2)
1270 return std::invoke(op, std::pair{
static_cast<decltype(vs)
>(vs)...});
1272 return std::invoke(op, std::tuple{
static_cast<decltype(vs)
>(vs)...});
// Addresses of the first elements of the remaining zipped ranges (1..N-1).
1274 std::addressof(std::get<1 + Is>(*it))...);
1275 }(std::make_index_sequence<
sizeof...(Rs) - 1>());
1341 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It1,
1342 detail::simd_execution_iterator It2,
typename T>
1346 return detail::transform_reduce(policy, first1, last1, init,
std::plus<>(),
1351 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It1,
1352 detail::simd_execution_iterator It2,
typename T,
typename BinaryReductionOp,
1353 typename BinaryTransformOp>
1356 BinaryReductionOp reduce_op, BinaryTransformOp transform_op)
1358 return detail::transform_reduce(policy, first1, last1, init, reduce_op, transform_op, first2);
1362 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It,
1363 typename T,
typename BinaryReductionOp,
typename UnaryTransformOp>
1366 BinaryReductionOp reduce_op, UnaryTransformOp transform_op)
1367 {
return detail::transform_reduce(policy, first1, last1, init, reduce_op, transform_op); }
1370 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range Rng1,
1371 detail::simd_execution_range Rng2,
typename T>
1375 return detail::transform_reduce(policy, std::ranges::begin(r1), std::ranges::end(r1), init,
1380 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range Rng1,
1381 detail::simd_execution_range Rng2,
typename T,
typename BinaryReductionOp,
1382 typename BinaryTransformOp>
1385 BinaryReductionOp reduce_op, BinaryTransformOp transform_op)
1387 return detail::transform_reduce(policy, std::ranges::begin(r1), std::ranges::end(r1), init,
1388 reduce_op, transform_op, std::ranges::begin(r2));
1392 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range Rng,
1393 typename T,
typename BinaryReductionOp,
typename UnaryTransformOp>
1396 UnaryTransformOp transform_op)
1398 return detail::transform_reduce(policy, std::ranges::begin(r1), std::ranges::end(r1), init,
1399 reduce_op, transform_op);
1422 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It>
1423 constexpr std::iter_value_t<It>
1424 reduce(ExecutionPolicy policy, It first, It last)
1426 return detail::transform_reduce(policy, first, last, std::iter_value_t<It>{},
1431 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It,
1434 reduce(ExecutionPolicy policy, It first, It last, T init)
1436 return detail::transform_reduce(policy, first, last, init,
std::plus<>(),
1437 [](
auto const& x) {
return x; });
// reduce with explicit init value and reduction operation (iterator overload):
// implemented as transform_reduce with an identity transform.
1441 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It,
1442 typename T,
typename BinaryReductionOp>
1444 reduce(ExecutionPolicy policy, It first, It last, T init, BinaryReductionOp op)
1446 return detail::transform_reduce(policy, first, last, init, op,
1447 [](
auto const& x) {
return x; });
1451 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range Rg>
1452 constexpr std::ranges::range_value_t<Rg>
1455 return detail::transform_reduce(policy, std::ranges::begin(rg), std::ranges::end(rg),
1457 [](
auto const& x) {
return x; });
1461 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range Rg,
1464 reduce(ExecutionPolicy policy, Rg&& rg, T init)
1466 return detail::transform_reduce(policy, std::ranges::begin(rg), std::ranges::end(rg), init,
// reduce with explicit init value and reduction operation (range overload):
// implemented as transform_reduce over [begin(rg), end(rg)) with an identity
// transform.
1471 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range Rg,
1472 typename T,
typename BinaryReductionOp>
1474 reduce(ExecutionPolicy policy, Rg&& rg, T init, BinaryReductionOp op)
1476 return detail::transform_reduce(policy, std::ranges::begin(rg), std::ranges::end(rg), init,
1477 op, [](
auto const& x) {
return x; });
// Counts the elements in [first, last) for which pred yields true (iterator
// overload). Implemented on top of vir::for_each: matches are accumulated in a
// scalar counter `count` and a SIMD counter `countv`, and the result is
// count + reduce(countv).
// NOTE(review): the definitions of `TV`, `count` and `countv` and the #else /
// #endif of the preprocessor branch are not visible in this excerpt.
1499 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It,
1502 count_if(ExecutionPolicy pol, It first, It last, F&& pred)
1504 using T = std::iter_value_t<It>;
// int-based SIMD type with as many elements as TV.
1506 using IV = detail::deduced_simd<int, TV::size()>;
// x is a pack because for_each may pass several chunks per call (unrolling).
1509 vir::for_each(pol, first, last, [&](
auto... x) VIR_LAMBDA_ALWAYS_INLINE {
1510#if __cpp_lib_experimental_parallel_simd >= 201803
// Constant evaluation: count via popcount of each mask into the scalar counter.
1511 if (std::is_constant_evaluated())
1512 count += (popcount(pred(x)) + ...);
1515 if constexpr (
sizeof...(x) == 1)
// Chunk as wide as countv: increment the lanes of countv where pred holds;
// otherwise fall back to popcount into the scalar counter.
1517 if constexpr ((x.size(), ...) == countv.size())
1518 ++where(vir::cvt(pred(x...)), countv);
1520 count += popcount(pred(x...));
// Several chunks per call: masked increment of countv for each of them.
1524 ((++where(vir::cvt(pred(x)), countv)), ...);
1527 return count +
reduce(countv);
1531 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range R,
1536 return vir::count_if(pol, std::ranges::begin(rg), std::ranges::end(rg),
1537 std::forward<F>(pred));
1549 template <vir::detail::simd_execution_policy ExecutionPolicy,
1550 vir::detail::simd_execution_iterator It,
typename F>
1552 for_each(ExecutionPolicy pol, It first, It last, F&& fun)
1560 typename UnaryOperation>
1562 transform(ExecutionPolicy pol, It1 first1, It1 last1, OutIt d_first, UnaryOperation unary_op)
1563 {
return vir::detail::transform(pol, first1, last1, d_first, unary_op); }
1572 transform(ExecutionPolicy pol, It1 first1, It1 last1, It2 first2, OutIt d_first,
1573 BinaryOperation binary_op)
1574 {
return vir::detail::transform(pol, first1, last1, d_first, binary_op, first2); }
1586 return vir::detail::transform_reduce(policy, first1, last1, init,
std::plus<>(),
1595 typename T,
typename BinaryReductionOp,
typename BinaryTransformOp>
1598 BinaryReductionOp reduce_op, BinaryTransformOp transform_op)
1600 return vir::detail::transform_reduce(policy, first1, last1, init, reduce_op, transform_op,
1609 typename UnaryTransformOp>
1612 BinaryReductionOp reduce_op, UnaryTransformOp transform_op)
1613 {
return vir::detail::transform_reduce(policy, first1, last1, init, reduce_op, transform_op); }
1620 constexpr std::iter_value_t<It>
1621 reduce(ExecutionPolicy policy, It first, It last)
1623 return vir::detail::transform_reduce(policy, first, last, std::iter_value_t<It>{},
1633 reduce(ExecutionPolicy policy, It first, It last, T init)
1635 return vir::detail::transform_reduce(policy, first, last, init,
std::plus<>(),
1636 [](
auto const& x) {
return x; });
1645 reduce(ExecutionPolicy policy, It first, It last, T init, BinaryReductionOp op)
1647 return vir::detail::transform_reduce(policy, first, last, init, op,
1648 [](
auto const& x) {
return x; });
// Forwarding overload: delegates to vir::count_if, perfectly forwarding the
// predicate.
// NOTE(review): the enclosing namespace and the template header are not
// visible in this excerpt.
1657 count_if(ExecutionPolicy pol, It first, It last, F&& fun)
1658 {
return vir::count_if(pol, first, last, std::forward<F>(fun)); }
Modelled if std::contiguous_iterator is modelled and the value-type of It can be transformed via vir:...
Definition simd_execution.h:413
Satisfied for valid specializations of vir::execution::simd_policy.
Definition simd_execution.h:399
Modelled if std::ranges::contiguous_range is modelled and the value-type of Rng can be transformed vi...
Definition simd_execution.h:405
constexpr int count_if(ExecutionPolicy pol, It first, It last, F &&pred)
Count the elements in the input range matching pred (iterator overload)
Definition simd_execution.h:1502
constexpr void for_each(ExecutionPolicy pol, It first, It last, F &&fun)
Iterate over the given range (iterator overload).
Definition simd_execution.h:1119
constexpr std::iter_value_t< It > reduce(ExecutionPolicy policy, It first, It last)
Sum the given range (iterator overload)
Definition simd_execution.h:1424
Definition simd_execution.h:422
constexpr simd_policy simd
SIMD execution policy.
Definition simd_execution.h:528
This namespace collects libraries and tools authored by Matthias Kretz.
Definition constexpr_wrapper.h:21
typename detail::simdize_impl< T, N >::type simdize
Apply a type transformation to a scalar type to produce a data-parallel type.
Definition simdize.h:1019
C++20 concepts extending the Parallelism TS 2 (which is limited to C++17).
Provides a type transformation for turning scalar user-defined types into simd types.
Type of the vir::execution::simd execution policy.
Definition simd_execution.h:426
static constexpr simd_policy< Options..., detail::simd_policy_unroll_by_t< N > > unroll_by()
Definition simd_execution.h:492
static constexpr simd_policy< Options..., detail::simd_policy_size_t< N > > prefer_size()
Definition simd_execution.h:504
static constexpr simd_policy< Options..., detail::simd_policy_auto_prologue_t > auto_prologue()
Definition simd_execution.h:462
static constexpr simd_policy< Options..., detail::simd_policy_prefer_aligned_t > prefer_aligned()
Definition simd_execution.h:451
static constexpr simd_policy< Options..., detail::simd_policy_assume_matching_size_t > assume_matching_size()
Definition simd_execution.h:474