vir-simd 0.4.189
Parallelism TS 2 extensions and simd fallback implementation
Loading...
Searching...
No Matches
simd_execution.h
Go to the documentation of this file.
1/* SPDX-License-Identifier: LGPL-3.0-or-later */
2/* Copyright © 2023–2024 GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
3 * Matthias Kretz <m.kretz@gsi.de>
4 */
5
6#ifndef VIR_SIMD_EXECUTION_H_
7#define VIR_SIMD_EXECUTION_H_
8
12
13#include "simd_concepts.h"
14#include "simd_cvt.h"
15#include "simdize.h"
16
17#if VIR_HAVE_SIMD_CONCEPTS and VIR_HAVE_SIMDIZE and VIR_HAVE_SIMD_CVT and VIR_HAVE_CONSTEXPR_WRAPPER
18#if DOXYGEN or not defined __clang_major__ or not defined _GLIBCXX_RELEASE \
19 or __clang_major__ >= 17 or _GLIBCXX_RELEASE < 13
20#define VIR_HAVE_SIMD_EXECUTION 1
21
22#include <ranges>
23#include <cstdint>
24#include <utility>
25
26namespace vir
27{
29 namespace detail
30 {
31 using namespace vir::literals;
32
33 template <typename T>
34 VIR_ALWAYS_INLINE constexpr auto
35 data_or_ptr(T&& r) -> decltype(std::ranges::data(r))
36 { return std::ranges::data(r); }
37
38 template <typename T>
39 VIR_ALWAYS_INLINE constexpr T*
40 data_or_ptr(T* ptr)
41 { return ptr; }
42
43 template <typename T>
44 VIR_ALWAYS_INLINE constexpr const T*
45 data_or_ptr(const T* ptr)
46 { return ptr; }
47
48 // Invokes fun(V&) or fun(const V&) with V copied from ptr.
49 // If write_back is true, copy it back to ptr.
50 template <typename V, bool write_back = false, typename Flags = stdx::element_aligned_tag,
51 std::size_t... Is>
52 VIR_ALWAYS_INLINE constexpr void
53 simd_load_and_invoke(auto&& fun, auto ptr, Flags f, std::index_sequence<Is...>)
54 {
55 [&](auto... chunks) {
56 std::invoke(fun, chunks...);
57 if constexpr (write_back)
58 (chunks.copy_to(ptr + (V::size() * Is), f), ...);
59 }(std::conditional_t<write_back, V, const V>(ptr + (V::size() * Is), f)...);
60 }
61
62 template <typename V, typename Flags = stdx::element_aligned_tag, std::size_t... Is>
63 VIR_ALWAYS_INLINE constexpr void
64 simd_load_and_invoke(auto&& unary_op, auto ptr, auto&& result_op, Flags f, std::index_sequence<Is...>)
65 {
66 [&](const auto... chunks) {
67 (std::invoke(result_op, std::invoke(unary_op, chunks),
68 std::integral_constant<std::size_t, V::size() * Is>()), ...);
69 }(V(ptr + (V::size() * Is), f)...);
70 }
71
72 template <typename V1, typename V2, typename F1 = stdx::element_aligned_tag,
73 typename F2 = stdx::element_aligned_tag, std::size_t... Is>
74 VIR_ALWAYS_INLINE constexpr void
75 simd_load_and_invoke(auto&& binary_op, auto ptr1, auto ptr2, auto&& result_op, F1 f1, F2 f2,
76 std::index_sequence<Is...>)
77 {
78 constexpr auto size = V1::size();
79 static_assert(size == V2::size());
80 [&](const auto&... v1s) {
81 [&](const auto&... v2s) {
82 (std::invoke(result_op, std::invoke(binary_op, v1s, v2s),
83 std::integral_constant<std::size_t, size * Is>()), ...);
84 }(V2(ptr2 + (size * Is), f2)...);
85 }(V1(ptr1 + (size * Is), f1)...);
86 }
87
88 static constexpr auto no_unroll = std::make_index_sequence<1>();
89
90 template <class V, bool write_back, std::size_t max_elements>
91 VIR_ALWAYS_INLINE void
92 simd_for_each_jumptable_prologue(auto&& fun, auto ptr, std::size_t to_process)
93 {
94 switch (to_process)
95 {
96#define VIR_CASE(N, M) \
97 case N: \
98 static_assert(std::has_single_bit(unsigned(N - M))); \
99 static_assert(N - M < M or M == 0); \
100 static_assert(((N - M) & M) == 0); \
101 goto case##N; \
102case##N: \
103 if constexpr (N >= max_elements) \
104 unreachable(); \
105 simd_load_and_invoke<vir::simdize<typename V::value_type, N - M>, write_back>( \
106 fun, ptr, stdx::vector_aligned, no_unroll); \
107 ptr += N - M; \
108 goto case##M
109
110 VIR_CASE(1, 0);
111 VIR_CASE(2, 0);
112 VIR_CASE(3, 2);
113 VIR_CASE(4, 0);
114 VIR_CASE(5, 4);
115 VIR_CASE(6, 4);
116 VIR_CASE(7, 6);
117 VIR_CASE(8, 0);
118 VIR_CASE(9, 8);
119 VIR_CASE(10, 8);
120 VIR_CASE(11, 10);
121 VIR_CASE(12, 8);
122 VIR_CASE(13, 12);
123 VIR_CASE(14, 12);
124 VIR_CASE(15, 14);
125 VIR_CASE(16, 0);
126 VIR_CASE(17, 16);
127 VIR_CASE(18, 16);
128 VIR_CASE(19, 18);
129 VIR_CASE(20, 16);
130 VIR_CASE(21, 20);
131 VIR_CASE(22, 20);
132 VIR_CASE(23, 22);
133 VIR_CASE(24, 16);
134 VIR_CASE(25, 24);
135 VIR_CASE(26, 24);
136 VIR_CASE(27, 26);
137 VIR_CASE(28, 24);
138 VIR_CASE(29, 28);
139 VIR_CASE(30, 28);
140 VIR_CASE(31, 30);
141 VIR_CASE(32, 0);
142 VIR_CASE(33, 32);
143 VIR_CASE(34, 32);
144 VIR_CASE(35, 34);
145 VIR_CASE(36, 32);
146 VIR_CASE(37, 36);
147 VIR_CASE(38, 34);
148 VIR_CASE(39, 38);
149 VIR_CASE(40, 32);
150 VIR_CASE(41, 40);
151 VIR_CASE(42, 40);
152 VIR_CASE(43, 42);
153 VIR_CASE(44, 40);
154 VIR_CASE(45, 44);
155 VIR_CASE(46, 44);
156 VIR_CASE(47, 46);
157 VIR_CASE(48, 32);
158 VIR_CASE(49, 48);
159 VIR_CASE(50, 48);
160 VIR_CASE(51, 50);
161 VIR_CASE(52, 48);
162 VIR_CASE(53, 52);
163 VIR_CASE(54, 52);
164 VIR_CASE(55, 54);
165 VIR_CASE(56, 48);
166 VIR_CASE(57, 56);
167 VIR_CASE(58, 56);
168 VIR_CASE(59, 58);
169 VIR_CASE(60, 56);
170 VIR_CASE(61, 60);
171 VIR_CASE(62, 60);
172 VIR_CASE(63, 62);
173#undef VIR_CASE
174
175 default:
176 unreachable();
177 }
178case0:
179 return;
180 }
181
182 template <class V, bool write_back, std::size_t max_elements>
183 VIR_ALWAYS_INLINE constexpr void
184 simd_for_each_prologue(auto&& fun, auto ptr, std::size_t to_process)
185 {
186 static_assert(max_elements > 1);
187 static_assert(std::has_single_bit(max_elements));
188 static_assert(std::has_single_bit(unsigned(V::size())));
189 // pre-conditions:
190 // * ptr is a valid pointer to a range [ptr, ptr + to_process)
191 // * to_process > 0
192 // * to_process < max_elements
193
194 if (to_process == 0 or to_process >= max_elements)
195 unreachable();
196
197 if constexpr (max_elements == 2) // implies to_process == 1
198 {
199 simd_load_and_invoke<V, write_back>(fun, ptr, stdx::vector_aligned, no_unroll);
200 return;
201 }
202#if !__OPTIMIZE_SIZE__
203 if constexpr (V::size() == 1 and max_elements < 64)
204 if (not std::is_constant_evaluated())
205 return simd_for_each_jumptable_prologue<V, write_back, max_elements>(
206 fun, ptr, to_process);
207#endif
208
209 if (V::size() & to_process)
210 {
211 simd_load_and_invoke<V, write_back>(fun, ptr, stdx::vector_aligned, no_unroll);
212 ptr += V::size();
213 }
214 if constexpr (V::size() * 2 < max_elements)
215 {
216 simd_for_each_prologue<vir::simdize<typename V::value_type, V::size() * 2>,
217 write_back, max_elements>(fun, ptr, to_process);
218 }
219 }
220
221 template <class V0, bool write_back>
222 void
223 simd_for_each_jumptable_epilogue(auto&& fun, unsigned leftover, auto last, auto f)
224 {
225 // pre-conditions:
226 // * leftover > 0
227 // * [last - leftover, last) is a valid range
228 auto ptr = std::to_address(last - leftover);
229 switch (leftover)
230 {
231#define VIR_CASE(N, M) \
232 case N: \
233 goto case##N; \
234case##N: \
235 if constexpr (N >= V0::size()) \
236 unreachable(); \
237 simd_load_and_invoke<vir::simdize<typename V0::value_type, N - M>, write_back>( \
238 fun, ptr, f, no_unroll); \
239 ptr += N - M; \
240 goto case##M
241
242 VIR_CASE(1, 0);
243 VIR_CASE(2, 0);
244 VIR_CASE(3, 1);
245 VIR_CASE(4, 0);
246 VIR_CASE(5, 1);
247 VIR_CASE(6, 2);
248 VIR_CASE(7, 3);
249 VIR_CASE(8, 0);
250 VIR_CASE(9, 1);
251 VIR_CASE(10, 2);
252 VIR_CASE(11, 3);
253 VIR_CASE(12, 4);
254 VIR_CASE(13, 5);
255 VIR_CASE(14, 6);
256 VIR_CASE(15, 7);
257 VIR_CASE(16, 0);
258 VIR_CASE(17, 1);
259 VIR_CASE(18, 2);
260 VIR_CASE(19, 3);
261 VIR_CASE(20, 4);
262 VIR_CASE(21, 5);
263 VIR_CASE(22, 6);
264 VIR_CASE(23, 7);
265 VIR_CASE(24, 8);
266 VIR_CASE(25, 9);
267 VIR_CASE(26, 10);
268 VIR_CASE(27, 11);
269 VIR_CASE(28, 12);
270 VIR_CASE(29, 13);
271 VIR_CASE(30, 14);
272 VIR_CASE(31, 15);
273 VIR_CASE(32, 0);
274 VIR_CASE(33, 1);
275 VIR_CASE(34, 2);
276 VIR_CASE(35, 3);
277 VIR_CASE(36, 4);
278 VIR_CASE(37, 5);
279 VIR_CASE(38, 6);
280 VIR_CASE(39, 7);
281 VIR_CASE(40, 8);
282 VIR_CASE(41, 9);
283 VIR_CASE(42, 10);
284 VIR_CASE(43, 11);
285 VIR_CASE(44, 12);
286 VIR_CASE(45, 13);
287 VIR_CASE(46, 14);
288 VIR_CASE(47, 15);
289 VIR_CASE(48, 16);
290 VIR_CASE(49, 17);
291 VIR_CASE(50, 18);
292 VIR_CASE(51, 19);
293 VIR_CASE(52, 20);
294 VIR_CASE(53, 21);
295 VIR_CASE(54, 22);
296 VIR_CASE(55, 23);
297 VIR_CASE(56, 24);
298 VIR_CASE(57, 25);
299 VIR_CASE(58, 26);
300 VIR_CASE(59, 27);
301 VIR_CASE(60, 28);
302 VIR_CASE(61, 29);
303 VIR_CASE(62, 30);
304 VIR_CASE(63, 31);
305#undef VIR_CASE
306
307 default:
308 unreachable();
309 }
310case0:
311 return;
312 }
313
314 template <class V0, bool write_back>
315 VIR_ALWAYS_INLINE constexpr void
316 simd_for_each_epilogue(auto&& fun, unsigned leftover, auto last, auto f)
317 {
318 // pre-conditions:
319 // * leftover > 0
320 // * leftover < V0::size()
321 // * [last - leftover, last) is a valid range
322 static_assert(V0::size() > 1);
324 if constexpr (V0::size() == 2) // implies leftover == 1
325 {
326 simd_load_and_invoke<V1, write_back>(fun, std::to_address(last - 1), f, no_unroll);
327 return;
328 }
329 if constexpr (V0::size() <= 4) // implies leftover ∈ [1, 3]
330 {
331 if (not std::is_constant_evaluated())
332 return simd_for_each_jumptable_epilogue<V0, write_back>(fun, leftover, last, f);
333 }
334#if !__OPTIMIZE_SIZE__
335 // The jumptable implementation for larger V0::size() can result in a lot of machine code.
336 // If the user wants to optimize for size then this isn't the right solution.
337 if constexpr (V0::size() <= 64
338 //and V0::size() <= vir::stdx::native_simd<typename V0::value_type>::size()
339 )
340 {
341 if (not std::is_constant_evaluated())
342 return simd_for_each_jumptable_epilogue<V0, write_back>(fun, leftover, last, f);
343 }
344#endif // !__OPTIMIZE_SIZE__
345 using V = vir::simdize<typename V0::value_type, std::bit_ceil(unsigned(V0::size())) / 2>;
346 if (leftover & V::size())
347 {
348 static_assert(std::has_single_bit(unsigned(V::size()))); // by construction
349 simd_load_and_invoke<V, write_back>(fun, std::to_address(last - leftover), f,
350 no_unroll);
351 leftover -= V::size();
352 }
353 if constexpr (V::size() > 1)
354 if (leftover)
355 simd_for_each_epilogue<V, write_back>(fun, leftover, last, f);
356 }
357
358 struct simd_policy_prefer_aligned_t {};
359
360 struct simd_policy_auto_prologue_t {};
361
362 struct simd_policy_assume_matching_size_t {};
363
364 template <int N>
365 struct simd_policy_unroll_by_t
366 {};
367
368 template <typename T>
369 struct simd_policy_unroll_value
370 : std::integral_constant<int, 0>
371 {};
372
373 template <int N>
374 struct simd_policy_unroll_value<simd_policy_unroll_by_t<N>>
375 : std::integral_constant<int, N>
376 {};
377
378 template <int N>
379 struct simd_policy_size_t
380 {};
381
382 template <typename T>
383 struct simd_policy_size_value
384 : std::integral_constant<int, 0>
385 {};
386
387 template <int N>
388 struct simd_policy_size_value<simd_policy_size_t<N>>
389 : std::integral_constant<int, N>
390 {};
391
392 template <typename T>
393 struct is_simd_policy
394 : std::false_type
395 {};
396
398 template <typename T>
399 concept simd_execution_policy = is_simd_policy<std::remove_cvref_t<T>>::value;
400
404 template <typename Rng>
408
412 template <typename It>
416 } // namespace detail
417
421 namespace execution
422 {
424 template <typename... Options>
426 {
427 static constexpr bool _prefers_aligned
428 = (false or ... or std::same_as<Options, detail::simd_policy_prefer_aligned_t>);
429
430 static constexpr bool _auto_prologue
431 = (false or ... or std::same_as<Options, detail::simd_policy_auto_prologue_t>);
432
433 static constexpr bool _assume_matching_size
434 = (false or ... or std::same_as<Options, detail::simd_policy_assume_matching_size_t>);
435
436 static constexpr int _unroll_by
437 = (0 + ... + detail::simd_policy_unroll_value<Options>::value);
438
439 static constexpr int _size
440 = (0 + ... + detail::simd_policy_size_value<Options>::value);
441
442 // The following three are mutually exclusive:
443
450 static constexpr simd_policy<Options..., detail::simd_policy_prefer_aligned_t>
451 prefer_aligned() requires(not _prefers_aligned and not _auto_prologue
452 and not _assume_matching_size)
453 { return {}; }
454
461 static constexpr simd_policy<Options..., detail::simd_policy_auto_prologue_t>
462 auto_prologue() requires(not _prefers_aligned and not _auto_prologue
463 and not _assume_matching_size)
464 { return {}; }
465
473 static constexpr simd_policy<Options..., detail::simd_policy_assume_matching_size_t>
474 assume_matching_size() requires(not _prefers_aligned and not _auto_prologue
475 and not _assume_matching_size)
476 { return {}; }
477
490 template <int N>
491 static constexpr simd_policy<Options..., detail::simd_policy_unroll_by_t<N>>
492 unroll_by() requires(_unroll_by == 0)
493 {
494 static_assert(N > 1);
495 return {};
496 }
497
502 template <int N>
503 static constexpr simd_policy<Options..., detail::simd_policy_size_t<N>>
504 prefer_size() requires(_size == 0)
505 {
506 static_assert(N > 0);
507 return {};
508 }
509 };
510
528 inline constexpr simd_policy<> simd {};
529 } // namespace execution
530
531 namespace detail
532 {
533 template <typename... Options>
534 struct is_simd_policy<execution::simd_policy<Options...>>
535 : std::true_type
536 {};
537
538 template <typename V, typename T = typename V::value_type>
539 struct memory_alignment
540 : vir::constexpr_wrapper<alignof(T)>
541 {};
542
543 template <typename V, typename T>
544 requires (stdx::is_simd_v<V>)
545 struct memory_alignment<V, T>
546 : stdx::memory_alignment<V, T>
547 {};
548
549 template <typename V, typename T>
550 requires (not stdx::is_simd_v<V>) and requires {
551 {V::template memory_alignment<T>} -> std::same_as<std::size_t>;
552 }
553 struct memory_alignment<V, T>
554 : vir::constexpr_wrapper<V::template memory_alignment<T>>
555 {};
556
557 template <typename V, typename T = typename V::value_type>
558 inline constexpr std::size_t memory_alignment_v = memory_alignment<V, T>::value;
559
560 template <typename V, simd_execution_policy ExecutionPolicy>
561 struct prologue
562 {
563 using T = typename V::value_type;
564
565 static constexpr std::size_t size = V::size();
566
567 static constexpr auto bytes_per_iteration = size * sizeof(T);
568
569 static constexpr bool prologue_is_possible
570 = size > 1 and bytes_per_iteration % memory_alignment_v<V> == 0
571 and memory_alignment_v<V> > sizeof(T);
572
573 static constexpr bool use_aligned_loadstore
574 = ExecutionPolicy::_prefers_aligned and prologue_is_possible;
575
576 static constexpr bool maybe_execute_prologue_anyway
577 = ExecutionPolicy::_auto_prologue and prologue_is_possible;
578
579 static constexpr std::conditional_t<use_aligned_loadstore, stdx::vector_aligned_tag,
580 stdx::element_aligned_tag> flags{};
581
582 constexpr void
583 operator()(std::size_t& remaining, auto iterator_to_align, auto&& do_prologue,
584 auto&... iterators_to_advance) const
585 {
586 static_assert(std::same_as<T, std::iter_value_t<decltype(iterator_to_align)>>);
587
588 if constexpr (use_aligned_loadstore or maybe_execute_prologue_anyway)
589 {
590 constexpr auto max_misalignment = vir::cw<memory_alignment_v<V> / sizeof(T)>;
591 const auto misaligned_by_bytes
592 = reinterpret_cast<std::uintptr_t>(std::to_address(iterator_to_align))
593 % memory_alignment_v<V>;
594 if (misaligned_by_bytes == 0)
595 return;
596 const auto misaligned_by_elements = misaligned_by_bytes / sizeof(T);
597 const auto to_process = max_misalignment - misaligned_by_elements;
598 if constexpr (use_aligned_loadstore)
599 do_prologue(max_misalignment, to_process);
600 else if (remaining * sizeof(T) > 4000
601 or (to_process & remaining) == to_process) // prologue replaces epilogue
602 do_prologue(max_misalignment, to_process);
603 else
604 return;
605 ((iterators_to_advance += to_process), ...);
606 remaining -= to_process;
607 }
608 }
609 };
610
615 template <typename It, int size>
616 using iter_simdize_t = vir::simdize<std::iter_value_t<It>, size>;
617
618 template <typename... Its>
619 constexpr auto
620 simdized_load_and_invoke(auto op, vir::constexpr_value auto size, Its... its)
621 {
622 return std::invoke(
623 op, iter_simdize_t<Its, size>(std::to_address(its), stdx::element_aligned)...);
624 }
625
626 template <typename It0, typename... Its>
627 constexpr auto
628 simdized_load_flag1_and_invoke(auto op, vir::constexpr_value auto size, auto flag0, It0 it0,
629 Its... its)
630 {
631 return std::invoke(
632 op, iter_simdize_t<It0, size>(std::to_address(it0), flag0),
633 iter_simdize_t<Its, size>(std::to_address(its), stdx::element_aligned)...);
634 }
635
636 template <typename R, typename... Ts>
637 VIR_ALWAYS_INLINE constexpr void
638 simd_transform_prologue(auto&& op, auto dst_ptr, std::size_t to_process,
639 vir::constexpr_value auto max_elements, const Ts*... ptrs)
640 {
641 static_assert(max_elements > 1);
642 static_assert(std::has_single_bit(unsigned(max_elements)));
643 constexpr auto size = vir::cw<R::size()>;
644 static_assert(std::has_single_bit(unsigned(size)));
645 static_assert(size < max_elements);
646 // pre-conditions:
647 // * ptr is a valid pointer to a range [ptr, ptr + to_process)
648 // * to_process > 0
649 // * to_process < max_elements
650 // => if max_elements == 2 then to_process == 1 and size == 1
651
652 if (to_process == 0 or to_process >= max_elements)
653 unreachable();
654
655 if (max_elements == 2 or size & to_process)
656 {
657 const R& result = simdized_load_and_invoke(op, size, ptrs...);
658 result.copy_to(dst_ptr, stdx::vector_aligned);
659 if constexpr (max_elements == 2)
660 return;
661 ((ptrs += size), ...);
662 dst_ptr += size;
663 }
664 if constexpr (size * 2 < max_elements)
665 {
666 simd_transform_prologue<vir::resize_simdize_t<size * 2, R>, Ts...>(
667 op, dst_ptr, to_process, max_elements, ptrs...);
668 }
669 }
670
671 template <typename V0, typename R0, typename... More>
672 VIR_ALWAYS_INLINE constexpr void
673 simd_transform_epilogue(auto&& binary_op, unsigned leftover, auto last, auto d_first, auto f,
674 const More*... ptrs)
675 {
676 // pre-conditions:
677 // * leftover > 0
678 // * leftover < V0::size()
679 // * [last - leftover, last) is a valid range
680 // * [ptr2, ptr2 + leftover) is a valid range
681 // * [d_first, d_first + leftover) is a valid range
682 constexpr unsigned size0 = V0::size();
683 static_assert(size0 > 1);
684 static_assert(size0 == R0::size());
685 using T0 = typename V0::value_type;
686 if constexpr (size0 == 2) // implies leftover == 1
687 {
689 const R1& result = simdized_load_and_invoke(binary_op, 1_cw, last - 1, ptrs...);
690 result.copy_to(std::to_address(d_first), f);
691 return;
692 }
693 using V = vir::simdize<T0, std::bit_ceil(size0) / 2>;
694 using R = vir::simdize<typename R0::value_type, V::size()>;
695 if (leftover & V::size())
696 {
697 static_assert(std::has_single_bit(unsigned(V::size()))); // by construction
698 const R& result = simdized_load_and_invoke(binary_op, vir::cw<V::size()>,
699 last - leftover, ptrs...);
700 result.copy_to(std::to_address(d_first), f);
701 leftover -= V::size();
702 ((ptrs += V::size()), ...);
703 d_first += V::size();
704 }
705 if constexpr (V::size() > 1)
706 if (leftover)
707 simd_transform_epilogue<V, R, More...>(binary_op, leftover, last, d_first, f, ptrs...);
708 }
709
710 template<simd_execution_policy ExecutionPolicy, simd_execution_iterator It1,
711 simd_execution_iterator OutIt, typename Operation, simd_execution_iterator... It2>
712 constexpr OutIt
713 transform(ExecutionPolicy, It1 first1, It1 last1, OutIt d_first, Operation op, It2... first2)
714 {
715 using T1 = std::iter_value_t<It1>;
716 using OutT = std::iter_value_t<OutIt>;
717 constexpr auto size = vir::cw<([] {
718 if constexpr (ExecutionPolicy::_size > 0)
719 return ExecutionPolicy::_size;
720 else
721 return std::max({int(iter_simdize_t<It1, 0>::size()),
722 int(iter_simdize_t<It2, 0>::size())...});
723 }())>;
724 using V1 = vir::simdize<T1, size>;
725 using OutV = vir::simdize<OutT, size>;
726
727 if (first1 == last1)
728 return d_first;
729
730 auto advance = [](int n, auto&... it) { ((it += n), ...); };
731
732 std::size_t distance = std::distance(first1, last1);
733 constexpr bool assume_matching_size = ExecutionPolicy::_assume_matching_size;
734 if constexpr (assume_matching_size)
735 vir_simd_precondition_vaargs(
736 distance % size == 0, "The explicit assumption, that the range size (%zu) is a multiple"
737 " of the SIMD width (%d), does not hold.", distance, size());
738
739 if (std::is_constant_evaluated())
740 {
741 // needs element_aligned because of GCC PR111302
742 for (; first1 + (size - 1) < last1; advance(size, first1, d_first, first2...))
743 {
744 const OutV& result = simdized_load_and_invoke(op, size, first1, first2...);
745 result.copy_to(std::to_address(d_first), stdx::element_aligned);
746 }
747
748 if constexpr (!assume_matching_size)
749 {
750 for (; first1 < last1; advance(1, first1, d_first, first2...))
751 {
752 const vir::simdize<OutT, 1>& result
753 = simdized_load_and_invoke(op, 1_cw, first1, first2...);
754 result.copy_to(std::to_address(d_first), stdx::element_aligned);
755 }
756 }
757 }
758 else
759 {
760 // There are three addresses and we can only ensure alignment on one of them. In
761 // principle, we could check at runtime whether two of three can be aligned. But the
762 // branching necessary to make this work - with relatively little gain (presumably) -
763 // renders this approach not interesting.
764 // Instead, let's simply focus on aligning the store address, avoiding pressure on the
765 // store execution ports (a factor of 2 throughput difference with AVX-512 vectors).
766 constexpr prologue<OutV, ExecutionPolicy> p;
767 constexpr auto flags = p.flags;
768 if constexpr (!assume_matching_size)
769 {
770 p(distance, d_first, [&] (auto max_elements, auto to_process) {
771 simd_transform_prologue<vir::simdize<OutT, 1>, T1, std::iter_value_t<It2>...>(
772 op, std::to_address(d_first), to_process, max_elements, std::to_address(first1),
773 std::to_address(first2)...);
774 }, first1, d_first, first2...);
775 }
776
777 if constexpr (ExecutionPolicy::_unroll_by > 1)
778 {
779 constexpr auto step = size * ExecutionPolicy::_unroll_by;
780 const auto unrolled_last = last1 - step;
781 for (; first1 <= unrolled_last; advance(step, first1, d_first, first2...))
782 {
783 unroll2<ExecutionPolicy::_unroll_by>([&,size](auto i) {
784 return simdized_load_and_invoke(op, size, first1 + i * size,
785 first2 + i * size...);
786 }, [&,size](auto i, const OutV& result) {
787 result.copy_to(std::to_address(d_first + i * size), flags);
788 });
789 }
790 }
791
792 const auto simd_last = last1 - size;
793 for (; assume_matching_size ? first1 != last1 : first1 <= simd_last;
794 advance(size, first1, d_first, first2...))
795 {
796 const OutV& result = simdized_load_and_invoke(op, size, first1, first2...);
797 result.copy_to(std::to_address(d_first), flags);
798 }
799
800 if constexpr (size > 1 and !assume_matching_size)
801 {
802 const auto leftover = distance % size;
803 if (leftover)
804 {
805 simd_transform_epilogue<V1, OutV, std::iter_value_t<It2>...>(
806 op, leftover, last1, d_first, flags, std::to_address(first2)...);
807 d_first += leftover;
808 }
809 }
810 }
811 return d_first;
812 }
813
814 template <int size, int max_size, typename A1, typename It1, typename... It2>
815 VIR_ALWAYS_INLINE constexpr A1
816 simd_transform_reduce_prologue(A1 acc1, It1 first1, std::size_t to_process,
817 auto reduce_op, auto transform_op, It2... first2)
818 {
819 // preconditions:
820 // - to_process > 0
821 // - to_process < max_size
822 // - [first1, first1 + to_process) is a valid range
823 // - [first2, first2 + to_process) is a valid range
824 static_assert(std::has_single_bit(unsigned(size)));
825 static_assert(std::has_single_bit(unsigned(max_size)));
826 if (to_process & size)
827 {
828 const std::size_t offset = to_process & (size - 1);
829 const A1 x = reduce(simdized_load_flag1_and_invoke(
830 transform_op, vir::cw<size>, stdx::vector_aligned,
831 first1 + offset, first2 + offset...),
832 reduce_op);
833 acc1 = std::invoke(reduce_op, acc1, x);
834 }
835 if constexpr (size < max_size)
836 return simd_transform_reduce_prologue<size * 2, max_size>(
837 acc1, first1, to_process, reduce_op, transform_op, first2...);
838 else
839 return acc1;
840 }
841
846 template <typename A1, typename A, typename It1, typename... It2>
847 VIR_ALWAYS_INLINE
848 constexpr typename A1::value_type
849 simd_transform_reduce_epilogue(A1 acc1, A acc, It1 first1, std::size_t leftover,
850 auto reduce_op, auto transform_op, auto flags1, It2... first2)
851 {
852 // preconditions:
853 // leftover > 0
854 // leftover & (A::size() * 2 - 1) != 0 (i.e. at least one more load+transform is necessary)
855 static_assert(A1::size() == 1);
856 static_assert(std::has_single_bit(unsigned(A::size())));
857 if (leftover & A::size())
858 {
859 acc = std::invoke(reduce_op, acc,
860 simdized_load_flag1_and_invoke(transform_op, vir::cw<A::size()>,
861 flags1, first1, first2...));
862 if constexpr (A::size() == 1)
863 // definetely no more work
864 return std::invoke(reduce_op, acc1, acc)[0];
865 else if ((leftover & (A::size() - 1)) == 0)
866 // no more work
867 return std::invoke(reduce_op, acc1, A1(reduce(acc, reduce_op)))[0];
868 else
869 {
870 constexpr std::size_t size2 = A::size() / 2;
872 auto [lo, hi] = stdx::split<size2, size2>(acc);
873 A2 acc2 = std::invoke(reduce_op, lo, hi);
874 return simd_transform_reduce_epilogue(acc1, acc2, first1 + A::size(), leftover,
875 reduce_op, transform_op, flags1, first2 +
876 A::size()...);
877 }
878 }
879 else if ((leftover & (A::size() - 1)) == 0)
880 unreachable(); // because we're only called if there is more work to be done
881 else if constexpr (A::size() > 1)
882 {
883 constexpr std::size_t size2 = A::size() / 2;
885 auto [lo, hi] = stdx::split<size2, size2>(acc);
886 A2 acc2 = std::invoke(reduce_op, lo, hi);
887 return simd_transform_reduce_epilogue(acc1, acc2, first1, leftover, reduce_op,
888 transform_op, flags1, first2...);
889 }
890 else
891 unreachable();
892 }
893
897 template <std::size_t size, typename A1, typename It1, typename... It2>
898 VIR_ALWAYS_INLINE
899 constexpr typename A1::value_type
900 simd_transform_reduce_epilogue(A1 acc1, It1 first1, std::size_t leftover,
901 auto reduce_op, auto transform_op, auto flags1, It2... first2)
902 {
903 // preconditions:
904 // leftover > 0
905 // leftover & (size * 2 - 1) != 0 (i.e. at least one more load+transform is necessary)
906 static_assert(A1::size() == 1);
907 static_assert(std::has_single_bit(size));
909 if (leftover & size)
910 {
911 A acc = simdized_load_flag1_and_invoke(transform_op, vir::cw<size>, flags1, first1,
912 first2...);
913 if constexpr (size == 1)
914 // definetely no more work
915 return std::invoke(reduce_op, acc1, acc)[0];
916 else if ((leftover & (size - 1)) == 0)
917 // no more work
918 return std::invoke(reduce_op, acc1, A1(reduce(acc, reduce_op)))[0];
919 else
920 {
921 constexpr std::size_t size2 = size / 2;
923 auto [lo, hi] = stdx::split<size2, size2>(acc);
924 A2 acc2 = std::invoke(reduce_op, lo, hi);
925 return simd_transform_reduce_epilogue(acc1, acc2, first1 + A::size(), leftover,
926 reduce_op, transform_op, flags1, first2 +
927 A::size()...);
928 }
929 }
930 else if ((leftover & (A::size() - 1)) == 0)
931 unreachable(); // because we're only called if there is more work to be done
932 else if constexpr (size > 1)
933 {
934 constexpr std::size_t size2 = A::size() / 2;
935 return simd_transform_reduce_epilogue<size2>(acc1, first1, leftover, reduce_op,
936 transform_op, flags1, first2...);
937 }
938 else
939 unreachable();
940 }
941
945 template <simd_execution_policy ExecutionPolicy, simd_execution_iterator It1, typename T,
946 typename BinaryReductionOp, typename TransformOp, simd_execution_iterator... It2>
947 constexpr T
948 transform_reduce(ExecutionPolicy, It1 first1, It1 last1, T init, BinaryReductionOp reduce_op,
949 TransformOp transform_op, It2... first2)
950 {
951 using T1 = std::iter_value_t<It1>;
952 constexpr int size = [] {
953 if constexpr (ExecutionPolicy::_size > 0)
954 return ExecutionPolicy::_size;
955 else
956 return std::max({int(iter_simdize_t<It1, 0>::size()),
957 int(iter_simdize_t<It2, 0>::size())...});
958 }();
959 using A1 = vir::simdize<T, 1>;
960 using A = vir::simdize<T, size>;
961 using V1 = vir::simdize<T1, size>;
962
963 if (first1 == last1)
964 return init;
965
966 A1 acc1 = init;
967
968 std::size_t distance = std::distance(first1, last1);
969 constexpr bool assume_matching_size = ExecutionPolicy::_assume_matching_size;
970 if constexpr (assume_matching_size)
971 vir_simd_precondition_vaargs(
972 distance % size == 0, "The explicit assumption, that the range size (%zu) is a multiple"
973 " of the SIMD width (%d), does not hold.", distance, size);
974
975 if (std::is_constant_evaluated())
976 {
977 // needs element_aligned because of GCC PR111302
978 for (; first1 < last1; ((first1 += 1), ..., (first2 += 1)))
979 {
980 const A1 r = simdized_load_and_invoke(transform_op, 1_cw, first1, first2...);
981 acc1 = std::invoke(reduce_op, acc1, r);
982 }
983 return acc1[0];
984 }
985 else
986 {
987 constexpr prologue<V1, ExecutionPolicy> p;
988 constexpr auto flags = p.flags;
989 p(distance, first1, [&] (vir::constexpr_value auto max_elements, auto to_process)
990 VIR_LAMBDA_ALWAYS_INLINE {
991 acc1 = simd_transform_reduce_prologue<1, max_elements>(
992 acc1, first1, to_process, reduce_op, transform_op, first2...);
993 }, first1, first2...);
994
995 const auto leftover = distance % size;
996
997 constexpr int lo_size = std::bit_ceil(unsigned(size)) / 2;
998 constexpr int hi_size = size - lo_size;
999
1000 const auto simd_last = last1 - size;
1001 // 'assume_matching_size' implies size is 0 or at least one simd width, and 0 has an
1002 // early return above
1003 if (assume_matching_size or first1 <= simd_last)
1004 {
1005 A acc = [&]() VIR_LAMBDA_ALWAYS_INLINE {
1006 if constexpr (ExecutionPolicy::_unroll_by > 1)
1007 {
1008 constexpr std::make_index_sequence<ExecutionPolicy::_unroll_by> unroll_idx_seq;
1009 constexpr auto step = size * ExecutionPolicy::_unroll_by;
1010 const auto unrolled_last = last1 - step;
1011 if (first1 + step <= unrolled_last)
1012 {
1013 auto acc = [&]<std::size_t... Is>(std::index_sequence<Is...>)
1014 -> std::array<A, ExecutionPolicy::_unroll_by>
1015 VIR_LAMBDA_ALWAYS_INLINE
1016 {
1017 return {
1018 [&](std::size_t offset) -> A VIR_LAMBDA_ALWAYS_INLINE {
1019 return simdized_load_flag1_and_invoke(
1020 transform_op, vir::cw<size>, flags,
1021 first1 + offset, first2 + offset...);
1022 }(Is * size)...
1023 };
1024 }(unroll_idx_seq);
1025 first1 += step;
1026 ((first2 += step), ...);
1027 do
1028 {
1029 [&]<std::size_t... Is>(std::index_sequence<Is...>)
1030 VIR_LAMBDA_ALWAYS_INLINE {
1031 ([&](std::size_t i) VIR_LAMBDA_ALWAYS_INLINE {
1032 acc[i] = std::invoke(
1033 reduce_op, acc[i],
1034 simdized_load_flag1_and_invoke(
1035 transform_op, vir::cw<size>, flags,
1036 first1 + i * size, first2 + i * size...));
1037 }(Is), ...);
1038 }(unroll_idx_seq);
1039 first1 += step;
1040 ((first2 += step), ...);
1041 }
1042 while (first1 <= unrolled_last);
1043 // tree reduction of acc into acc[0]
1044 unroll<std::bit_width(unsigned(ExecutionPolicy::_unroll_by - 1))>(
1045 [&](auto outer) {
1046 constexpr int j = 1 << outer.value; // j: 1 2 4 8 16 32 ...
1047 unroll<ExecutionPolicy::_unroll_by / 2>([&](auto ii) {
1048 constexpr int i = ii * 2 * j;
1049 if constexpr (i + j < ExecutionPolicy::_unroll_by)
1050 acc[i] = std::invoke(reduce_op, acc[i], acc[i+j]);
1051 });
1052 });
1053 return acc[0];
1054 }
1055 }
1056 auto ret = simdized_load_flag1_and_invoke(transform_op, vir::cw<size>, flags,
1057 first1, first2...);
1058 first1 += size;
1059 ((first2 += size), ...);
1060 return ret;
1061 }();
1062 for (; assume_matching_size ? first1 != last1 : first1 <= simd_last;
1063 ((first1 += size), ..., (first2 += size)))
1064 {
1065 acc = std::invoke(reduce_op, acc,
1066 simdized_load_flag1_and_invoke(transform_op, vir::cw<size>,
1067 flags, first1, first2...));
1068 }
1069 if constexpr (size == 1)
1070 return std::invoke(reduce_op, acc1, acc)[0];
1071 else if (assume_matching_size or leftover == 0)
1072 return std::invoke(reduce_op, acc1, A1(reduce(acc, reduce_op)))[0];
1073 else
1074 {
1075 auto [lo, hi] = stdx::split<lo_size, hi_size>(acc);
1076 vir::simdize<T, lo_size> acc2 = lo;
1077 if constexpr (lo_size == hi_size)
1078 acc2 = std::invoke(reduce_op, lo, hi);
1079 else
1080 acc1 = std::invoke(reduce_op, acc1, A1(reduce(hi, reduce_op)));
1081 return simd_transform_reduce_epilogue(acc1, acc2, first1, leftover, reduce_op,
1082 transform_op, flags, first2...);
1083 }
1084 }
1085 else if constexpr (lo_size > 0)
1086 {
1087 // leftover > 0 by construction
1088 return simd_transform_reduce_epilogue<lo_size>(
1089 acc1, first1, leftover, reduce_op, transform_op, flags, first2...);
1090 }
1091 else
1092 unreachable();
1093 }
1094 }
1095 } // namespace detail
1096
1117 typename F>
1118 constexpr void
1119 for_each([[maybe_unused]] ExecutionPolicy pol, It first, It last, F&& fun)
1120 {
1121 using T = std::iter_value_t<It>;
1123 constexpr int size = V::size();
1124 constexpr bool write_back = std::indirectly_writable<It, T>
1125 and std::invocable<F, V&> and not std::invocable<F, V&&>;
1126 if (first == last)
1127 return;
1128
1129 std::size_t distance = std::distance(first, last);
1130 constexpr bool assume_matching_size = ExecutionPolicy::_assume_matching_size;
1131 if constexpr (assume_matching_size)
1132 vir_simd_precondition_vaargs(
1133 distance % size == 0, "The explicit assumption, that the range size (%zu) is a multiple"
1134 " of the SIMD width (%d), does not hold.", distance, size);
1135
1136 if (std::is_constant_evaluated())
1137 {
1138 // needs element_aligned because of GCC PR111302
1139 for (; first + (size - 1) < last; first += size)
1140 detail::simd_load_and_invoke<V, write_back>(fun, std::to_address(first),
1141 stdx::element_aligned, detail::no_unroll);
1142
1143 if constexpr (size > 1)
1144 detail::simd_for_each_epilogue<V, write_back>(fun, distance % size, last,
1145 stdx::element_aligned);
1146 }
1147 else
1148 {
1149 constexpr detail::prologue<V, ExecutionPolicy> prologue;
1150 constexpr auto flags = prologue.flags;
1151 prologue(distance, first, [&] (auto max_elements, auto to_process) {
1152 detail::simd_for_each_prologue<vir::simdize<T, 1>, write_back, max_elements>(
1153 fun, std::to_address(first), to_process);
1154 }, first);
1155 const auto leftover = distance % size;
1156
1157 if constexpr (ExecutionPolicy::_unroll_by > 1)
1158 {
1159 constexpr auto step = size * ExecutionPolicy::_unroll_by;
1160 const auto unrolled_last = last - step;
1161 for (; first <= unrolled_last; first += step)
1162 {
1163 detail::simd_load_and_invoke<V, write_back>(
1164 fun, std::to_address(first), flags,
1165 std::make_index_sequence<ExecutionPolicy::_unroll_by>());
1166 }
1167 }
1168
1169 const auto simd_last = last - size;
1170 for (; assume_matching_size ? first != last : first <= simd_last; first += size)
1171 detail::simd_load_and_invoke<V, write_back>(fun, std::to_address(first), flags,
1172 detail::no_unroll);
1173
1174 if constexpr (not assume_matching_size and size > 1)
1175 if (leftover)
1176 detail::simd_for_each_epilogue<V, write_back>(fun, leftover, last, flags);
1177 }
1178 }
1179
1181 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range R,
1182 typename F>
1183 constexpr void
1184 for_each(ExecutionPolicy pol, R&& rng, F&& fun)
1185 { vir::for_each(pol, std::ranges::begin(rng), std::ranges::end(rng), std::forward<F>(fun)); }
1186
1188
1219 template<detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It1,
1220 detail::simd_execution_iterator OutIt, typename UnaryOperation>
1221 constexpr OutIt
1222 transform(ExecutionPolicy pol, It1 first1, It1 last1, OutIt d_first, UnaryOperation unary_op)
1223 { return detail::transform(pol, first1, last1, d_first, unary_op); }
1224
1226 template<detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It1,
1227 detail::simd_execution_iterator It2, detail::simd_execution_iterator OutIt,
1228 typename BinaryOperation>
1229 constexpr OutIt
1230 transform(ExecutionPolicy pol, It1 first1, It1 last1, It2 first2, OutIt d_first,
1231 BinaryOperation binary_op)
1232 { return detail::transform(pol, first1, last1, d_first, binary_op, first2); }
1233
1235 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range R1,
1236 detail::simd_execution_range R2, typename UnaryOperation>
1237 constexpr auto
1238 transform(ExecutionPolicy pol, R1&& r1, R2& d_rng, UnaryOperation unary_op)
1239 {
1240 return detail::transform(pol, std::ranges::begin(r1), std::ranges::end(r1),
1241 std::ranges::begin(d_rng), unary_op);
1242 }
1243
1245 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range R1,
1246 detail::simd_execution_range R2, detail::simd_execution_range R3,
1247 typename BinaryOperation>
1248 constexpr auto
1249 transform(ExecutionPolicy pol, R1&& r1, R2&& r2, R3& d_rng, BinaryOperation binary_op)
1250 {
1251 return detail::transform(pol, std::ranges::begin(r1), std::ranges::end(r1),
1252 std::ranges::begin(d_rng), binary_op, std::ranges::begin(r2));
1253 }
1254
1255#if __cpp_lib_ranges_zip >= 202110L
1257 template <detail::simd_execution_range... Rs>
1258 constexpr auto
1260 detail::simd_execution_range auto&& d_rg, auto op)
1261 {
1262 const auto size = std::ranges::size(rs);
1263 const auto it = std::ranges::begin(rs);
1264 const auto first0 = std::addressof(std::get<0>(*it));
1265 return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
1266 return detail::transform(
1267 pol, first0, first0 + size, std::ranges::begin(d_rg),
1268 [&op](auto&&... vs) {
1269 if constexpr (sizeof...(vs) == 2)
1270 return std::invoke(op, std::pair{static_cast<decltype(vs)>(vs)...});
1271 else
1272 return std::invoke(op, std::tuple{static_cast<decltype(vs)>(vs)...});
1273 },
1274 std::addressof(std::get<1 + Is>(*it))...);
1275 }(std::make_index_sequence<sizeof...(Rs) - 1>());
1276 }
1277#endif
1278
1280
1341 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It1,
1342 detail::simd_execution_iterator It2, typename T>
1343 constexpr T
1344 transform_reduce(ExecutionPolicy policy, It1 first1, It1 last1, It2 first2, T init)
1345 {
1346 return detail::transform_reduce(policy, first1, last1, init, std::plus<>(),
1347 std::multiplies<>(), first2);
1348 }
1349
1351 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It1,
1352 detail::simd_execution_iterator It2, typename T, typename BinaryReductionOp,
1353 typename BinaryTransformOp>
1354 constexpr T
1355 transform_reduce(ExecutionPolicy policy, It1 first1, It1 last1, It2 first2, T init,
1356 BinaryReductionOp reduce_op, BinaryTransformOp transform_op)
1357 {
1358 return detail::transform_reduce(policy, first1, last1, init, reduce_op, transform_op, first2);
1359 }
1360
1362 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It,
1363 typename T, typename BinaryReductionOp, typename UnaryTransformOp>
1364 constexpr T
1365 transform_reduce(ExecutionPolicy policy, It first1, It last1, T init,
1366 BinaryReductionOp reduce_op, UnaryTransformOp transform_op)
1367 { return detail::transform_reduce(policy, first1, last1, init, reduce_op, transform_op); }
1368
1370 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range Rng1,
1371 detail::simd_execution_range Rng2, typename T>
1372 constexpr T
1373 transform_reduce(ExecutionPolicy policy, Rng1&& r1, Rng2&& r2, T init)
1374 {
1375 return detail::transform_reduce(policy, std::ranges::begin(r1), std::ranges::end(r1), init,
1376 std::plus<>(), std::multiplies<>(), std::ranges::begin(r2));
1377 }
1378
1380 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range Rng1,
1381 detail::simd_execution_range Rng2, typename T, typename BinaryReductionOp,
1382 typename BinaryTransformOp>
1383 constexpr T
1384 transform_reduce(ExecutionPolicy policy, Rng1&& r1, Rng2&& r2, T init,
1385 BinaryReductionOp reduce_op, BinaryTransformOp transform_op)
1386 {
1387 return detail::transform_reduce(policy, std::ranges::begin(r1), std::ranges::end(r1), init,
1388 reduce_op, transform_op, std::ranges::begin(r2));
1389 }
1390
1392 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range Rng,
1393 typename T, typename BinaryReductionOp, typename UnaryTransformOp>
1394 constexpr T
1395 transform_reduce(ExecutionPolicy policy, Rng&& r1, T init, BinaryReductionOp reduce_op,
1396 UnaryTransformOp transform_op)
1397 {
1398 return detail::transform_reduce(policy, std::ranges::begin(r1), std::ranges::end(r1), init,
1399 reduce_op, transform_op);
1400 }
1401
1403
1422 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It>
1423 constexpr std::iter_value_t<It>
1424 reduce(ExecutionPolicy policy, It first, It last)
1425 {
1426 return detail::transform_reduce(policy, first, last, std::iter_value_t<It>{},
1427 std::plus<>(), [](auto const& x) { return x; });
1428 }
1429
1431 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It,
1432 typename T>
1433 constexpr T
1434 reduce(ExecutionPolicy policy, It first, It last, T init)
1435 {
1436 return detail::transform_reduce(policy, first, last, init, std::plus<>(),
1437 [](auto const& x) { return x; });
1438 }
1439
1441 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It,
1442 typename T, typename BinaryReductionOp>
1443 constexpr T
1444 reduce(ExecutionPolicy policy, It first, It last, T init, BinaryReductionOp op)
1445 {
1446 return detail::transform_reduce(policy, first, last, init, op,
1447 [](auto const& x) { return x; });
1448 }
1449
1451 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range Rg>
1452 constexpr std::ranges::range_value_t<Rg>
1453 reduce(ExecutionPolicy policy, Rg&& rg)
1454 {
1455 return detail::transform_reduce(policy, std::ranges::begin(rg), std::ranges::end(rg),
1456 std::ranges::range_value_t<Rg>{}, std::plus<>(),
1457 [](auto const& x) { return x; });
1458 }
1459
1461 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range Rg,
1462 typename T>
1463 constexpr T
1464 reduce(ExecutionPolicy policy, Rg&& rg, T init)
1465 {
1466 return detail::transform_reduce(policy, std::ranges::begin(rg), std::ranges::end(rg), init,
1467 std::plus<>(), [](auto const& x) { return x; });
1468 }
1469
1471 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range Rg,
1472 typename T, typename BinaryReductionOp>
1473 constexpr T
1474 reduce(ExecutionPolicy policy, Rg&& rg, T init, BinaryReductionOp op)
1475 {
1476 return detail::transform_reduce(policy, std::ranges::begin(rg), std::ranges::end(rg), init,
1477 op, [](auto const& x) { return x; });
1478 }
1479
1481
1499 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_iterator It,
1500 typename F>
1501 constexpr int
1502 count_if(ExecutionPolicy pol, It first, It last, F&& pred)
1503 {
1504 using T = std::iter_value_t<It>;
1506 using IV = detail::deduced_simd<int, TV::size()>;
1507 int count = 0;
1508 IV countv = 0;
1509 vir::for_each(pol, first, last, [&](auto... x) VIR_LAMBDA_ALWAYS_INLINE {
1510#if __cpp_lib_experimental_parallel_simd >= 201803
1511 if (std::is_constant_evaluated())
1512 count += (popcount(pred(x)) + ...);
1513 else
1514#endif
1515 if constexpr (sizeof...(x) == 1)
1516 {
1517 if constexpr ((x.size(), ...) == countv.size())
1518 ++where(vir::cvt(pred(x...)), countv);
1519 else
1520 count += popcount(pred(x...));
1521 }
1522 else
1523 {
1524 ((++where(vir::cvt(pred(x)), countv)), ...);
1525 }
1526 });
1527 return count + reduce(countv);
1528 }
1529
1531 template <detail::simd_execution_policy ExecutionPolicy, detail::simd_execution_range R,
1532 typename F>
1533 constexpr int
1534 count_if(ExecutionPolicy pol, R&& rg, F&& pred)
1535 {
1536 return vir::count_if(pol, std::ranges::begin(rg), std::ranges::end(rg),
1537 std::forward<F>(pred));
1538 }
1539
1541} // namespace vir
1542
1544namespace std
1545{
1549 template <vir::detail::simd_execution_policy ExecutionPolicy,
1550 vir::detail::simd_execution_iterator It, typename F>
1551 constexpr void
1552 for_each(ExecutionPolicy pol, It first, It last, F&& fun)
1553 { vir::for_each(pol, first, last, std::forward<F>(fun)); }
1554
1558 template<vir::detail::simd_execution_policy ExecutionPolicy,
1560 typename UnaryOperation>
1561 constexpr OutIt
1562 transform(ExecutionPolicy pol, It1 first1, It1 last1, OutIt d_first, UnaryOperation unary_op)
1563 { return vir::detail::transform(pol, first1, last1, d_first, unary_op); }
1564
1568 template<vir::detail::simd_execution_policy ExecutionPolicy,
1570 vir::detail::simd_execution_iterator OutIt, typename BinaryOperation>
1571 constexpr OutIt
1572 transform(ExecutionPolicy pol, It1 first1, It1 last1, It2 first2, OutIt d_first,
1573 BinaryOperation binary_op)
1574 { return vir::detail::transform(pol, first1, last1, d_first, binary_op, first2); }
1575
1576
1580 template <vir::detail::simd_execution_policy ExecutionPolicy,
1582 typename T>
1583 constexpr T
1584 transform_reduce(ExecutionPolicy policy, It1 first1, It1 last1, It2 first2, T init)
1585 {
1586 return vir::detail::transform_reduce(policy, first1, last1, init, std::plus<>(),
1587 std::multiplies<>(), first2);
1588 }
1589
1593 template <vir::detail::simd_execution_policy ExecutionPolicy,
1595 typename T, typename BinaryReductionOp, typename BinaryTransformOp>
1596 constexpr T
1597 transform_reduce(ExecutionPolicy policy, It1 first1, It1 last1, It2 first2, T init,
1598 BinaryReductionOp reduce_op, BinaryTransformOp transform_op)
1599 {
1600 return vir::detail::transform_reduce(policy, first1, last1, init, reduce_op, transform_op,
1601 first2);
1602 }
1603
1607 template <vir::detail::simd_execution_policy ExecutionPolicy,
1608 vir::detail::simd_execution_iterator It, typename T, typename BinaryReductionOp,
1609 typename UnaryTransformOp>
1610 constexpr T
1611 transform_reduce(ExecutionPolicy policy, It first1, It last1, T init,
1612 BinaryReductionOp reduce_op, UnaryTransformOp transform_op)
1613 { return vir::detail::transform_reduce(policy, first1, last1, init, reduce_op, transform_op); }
1614
1618 template <vir::detail::simd_execution_policy ExecutionPolicy,
1620 constexpr std::iter_value_t<It>
1621 reduce(ExecutionPolicy policy, It first, It last)
1622 {
1623 return vir::detail::transform_reduce(policy, first, last, std::iter_value_t<It>{},
1624 std::plus<>(), [](auto const& x) { return x; });
1625 }
1626
1630 template <vir::detail::simd_execution_policy ExecutionPolicy,
1632 constexpr T
1633 reduce(ExecutionPolicy policy, It first, It last, T init)
1634 {
1635 return vir::detail::transform_reduce(policy, first, last, init, std::plus<>(),
1636 [](auto const& x) { return x; });
1637 }
1638
1642 template <vir::detail::simd_execution_policy ExecutionPolicy,
1643 vir::detail::simd_execution_iterator It, typename T, typename BinaryReductionOp>
1644 constexpr T
1645 reduce(ExecutionPolicy policy, It first, It last, T init, BinaryReductionOp op)
1646 {
1647 return vir::detail::transform_reduce(policy, first, last, init, op,
1648 [](auto const& x) { return x; });
1649 }
1650
1654 template <vir::detail::simd_execution_policy ExecutionPolicy,
1656 constexpr int
1657 count_if(ExecutionPolicy pol, It first, It last, F&& fun)
1658 { return vir::count_if(pol, first, last, std::forward<F>(fun)); }
1659}
1660#endif // no Clang < 17
1661#endif // VIR_HAVE_SIMD_CONCEPTS
1662#endif // VIR_SIMD_EXECUTION_H_
1663
1664// vim: et cc=101 tw=100 sw=2 ts=8
Modelled if std::contiguous_iterator is modelled and the value-type of It can be transformed via vir:...
Definition simd_execution.h:413
Satisfied for valid specializations of vir::execution::simd_policy.
Definition simd_execution.h:399
Modelled if std::ranges::contiguous_range is modelled and the value-type of Rng can be transformed vi...
Definition simd_execution.h:405
constexpr int count_if(ExecutionPolicy pol, It first, It last, F &&pred)
Count the elements in the input range matching pred (iterator overload)
Definition simd_execution.h:1502
constexpr void for_each(ExecutionPolicy pol, It first, It last, F &&fun)
Iterate over the given range (iterator overload).
Definition simd_execution.h:1119
constexpr std::iter_value_t< It > reduce(ExecutionPolicy policy, It first, It last)
Sum the given range (iterator overload)
Definition simd_execution.h:1424
constexpr T transform_reduce(ExecutionPolicy policy, It1 first1, It1 last1, It2 first2, T init)
Inner product (iterator overload)
Definition simd_execution.h:1344
constexpr OutIt transform(ExecutionPolicy pol, It1 first1, It1 last1, OutIt d_first, UnaryOperation unary_op)
Unary transform (iterator overload)
Definition simd_execution.h:1222
multiplies
Definition simd_execution.h:422
constexpr simd_policy simd
SIMD execution policy.
Definition simd_execution.h:528
This namespace collects libraries and tools authored by Matthias Kretz.
Definition constexpr_wrapper.h:21
typename detail::simdize_impl< T, N >::type simdize
Apply a type transformation to a scalar type to produce a data-parallel type.
Definition simdize.h:1019
C++20 concepts extending the Parallelism TS 2 (which is limited to C++17).
Provides a type transformation for turning scalar user-defined types into a simd types.
Type of the vir::execution::simd execution policy.
Definition simd_execution.h:426
static constexpr simd_policy< Options..., detail::simd_policy_unroll_by_t< N > > unroll_by()
Definition simd_execution.h:492
static constexpr simd_policy< Options..., detail::simd_policy_size_t< N > > prefer_size()
Definition simd_execution.h:504
static constexpr simd_policy< Options..., detail::simd_policy_auto_prologue_t > auto_prologue()
Definition simd_execution.h:462
static constexpr simd_policy< Options..., detail::simd_policy_prefer_aligned_t > prefer_aligned()
Definition simd_execution.h:451
static constexpr simd_policy< Options..., detail::simd_policy_assume_matching_size_t > assume_matching_size()
Definition simd_execution.h:474
transform_reduce