vir-simd 0.4.189
Parallelism TS 2 extensions and simd fallback implementation
Loading...
Searching...
No Matches
simdize.h
Go to the documentation of this file.
1/* SPDX-License-Identifier: LGPL-3.0-or-later */
2/* Copyright © 2023–2024 GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
3 * Matthias Kretz <m.kretz@gsi.de>
4 */
5
6#ifndef VIR_SIMD_SIMDIZE_H_
7#define VIR_SIMD_SIMDIZE_H_
8
12
13#include "struct_reflect.h"
14#include "constexpr_wrapper.h"
15
16#if VIR_HAVE_STRUCT_REFLECT and VIR_HAVE_CONSTEXPR_WRAPPER \
17 and (defined DOXYGEN or not defined __clang_major__ or __clang_major__ > 14)
18#define VIR_HAVE_SIMDIZE 1
19
20#include <tuple>
21#include <iterator>
22#include "simd.h"
23#include "detail.h"
24#include "simd_concepts.h"
25#include "simd_permute.h"
26
27namespace vir
28{
34 // Implementation note: partial specialization via concepts is broken on clang < 16
35 template <typename T>
37 : vir::constexpr_wrapper<[]() -> int {
38#ifndef DOXYGEN
39 if constexpr (stdx::is_simd_v<T>)
40 return T::size();
41 else if constexpr (reflectable_struct<T> and []<std::size_t... Is>(
42 std::index_sequence<Is...>) {
44 == simdize_size<vir::struct_element_t<Is, T>>::value) and ...);
45 }(std::make_index_sequence<vir::struct_size_v<T>>()))
47 else
48 return 0;
49#endif
50 }()>
51 {};
52
54 template <typename T>
55 inline constexpr int simdize_size_v = simdize_size<T>::value;
56
/**
 * Primary template of simd_tuple.
 *
 * Every special member listed here is deleted, so an object of this primary template can
 * neither be created, copied, nor destroyed.
 */
template <typename T, int N>
  class simd_tuple
  {
  private:
    ~simd_tuple() = delete;

    simd_tuple(simd_tuple const&) = delete;

    simd_tuple() = delete;
  };
64
/**
 * Primary template of vectorized_struct.
 *
 * Every special member listed here is deleted, so an object of this primary template can
 * neither be created, copied, nor destroyed.
 */
template <typename T, int N>
  class vectorized_struct
  {
  private:
    ~vectorized_struct() = delete;

    vectorized_struct(vectorized_struct const&) = delete;

    vectorized_struct() = delete;
  };
72
73 namespace detail
74 {
// Compile-time constant of type `V::mask_type` built from the boolean pack `Values`.
// The pack is materialized in a temporary std::array<bool, sizeof...(Values)> whose data()
// pointer is passed to the mask_type constructor together with stdx::element_aligned.
// NOTE(review): presumably sizeof...(Values) is expected to equal V::size(); nothing in this
// definition enforces it - confirm at the call sites.
75 template <typename V, bool... Values>
76 inline constexpr typename V::mask_type mask_constant
77 = typename V::mask_type(std::array<bool, sizeof...(Values)>{Values...}.data(),
78 stdx::element_aligned);
79
80#if VIR_HAVE_WORKING_SHUFFLEVECTOR
84 template <int... Indexes, typename T>
85 VIR_ALWAYS_INLINE T
86 blend(T a, T b)
87 {
88 constexpr int N = sizeof(a) / sizeof(a[0]);
89 static_assert(sizeof...(Indexes) <= N);
90 static_assert(((Indexes <= N) && ...));
91 constexpr auto selected = [](int i) {
92 return ((i == Indexes) || ...);
93 };
94 return [&]<int... Is>(std::integer_sequence<int, Is...>) {
95 return __builtin_shufflevector(a, b, (selected(Is) ? Is + N : Is)...);
96 }(std::make_integer_sequence<int, N>());
97 }
98#endif
99
// Maps a type T and a requested width N to the member typedef `type`.
// Primary template: `type` is T itself, unchanged.
100 template <typename T, int N>
101 struct simdize_impl
102 { using type = T; };
103
// Specialization for `vectorizable` T: `type` is deduced_simd<T, W>, where W is N, or
// stdx::native_simd<T>::size() when N == 0. The specialization is only viable if
// stdx::simd_abi::deduce_t<T, W> names a type.
104 template <vectorizable T, int N>
105 requires requires {
106 typename stdx::simd_abi::deduce_t<T, N == 0 ? stdx::native_simd<T>::size() : N>;
107 }
108 struct simdize_impl<T, N>
109 { using type = deduced_simd<T, N == 0 ? stdx::native_simd<T>::size() : N>; };
110
// Boolean trait. Primary template: false.
111 template <typename T>
112 struct recursively_vectorizable
113 : std::false_type
114 {};
115
// A `vectorizable` type is recursively vectorizable.
116 template <vectorizable T>
117 struct recursively_vectorizable<T>
118 : std::true_type
119 {};
120
// A reflectable struct with exactly one data member inherits the answer of that member.
121 template <vir::reflectable_struct T>
122 requires (vir::struct_size_v<T> == 1)
123 struct recursively_vectorizable<T>
124 : recursively_vectorizable<vir::struct_element_t<0, T>>
125 {};
126
127 template <vir::reflectable_struct T>
128 requires (vir::struct_size_v<T> > 1)
129 struct recursively_vectorizable<T>
130 : std::bool_constant<
131 []<std::size_t... Is>(std::index_sequence<Is...>) {
132 return (... and recursively_vectorizable<
134 }(std::make_index_sequence<vir::struct_size_v<T>>())>
135 {};
136
// Trait providing the width used when simdize is asked for N == 0.
// Declared only; the specializations below provide `value`.
137 template <typename>
138 struct default_simdize_size;
139
// For a `vectorizable` T the default width is stdx::native_simd<T>::size().
140 template <vectorizable T>
141 struct default_simdize_size<T>
142 {
143 static inline constexpr int value = stdx::native_simd<T>::size();
144
145 static_assert(value > 0);
146 };
147
// For a non-vectorizable but recursively vectorizable type the default width is the largest
// size() among the member types simdized with N == 0.
148 template <typename Tup>
149 requires (not vectorizable<Tup>)
150 and recursively_vectorizable<Tup>::value
151 struct default_simdize_size<Tup>
152 {
153 static inline constexpr int value
154 = []<std::size_t... Is>(std::index_sequence<Is...>) {
155 return std::max({int(simdize_impl<vir::struct_element_t<Is, Tup>, 0>::type::size())...});
156 }(std::make_index_sequence<vir::struct_size_v<Tup>>());
157
158 static_assert(value > 0);
159 };
160
// Shorthand for default_simdize_size<Tup>::value.
161 template <typename Tup>
162 inline constexpr int default_simdize_size_v = default_simdize_size<Tup>::value;
163
// Computes the std::tuple type that holds one simdized object per data member of Tup.
// The third parameter defaults to an index sequence over the members and is only there to
// introduce the pack `Is` in the specialization below. The primary template is declared only.
164 template <reflectable_struct Tup, int N,
165 typename = std::make_index_sequence<vir::struct_size_v<Tup>>>
166 struct make_simd_tuple;
167
// Defined only if Tup has at least one member, N > 0, and every member type simdized with
// width N reports size() == N. `type` is the std::tuple of those simdized member types.
168 template <reflectable_struct Tup, int N, std::size_t... Is>
169 requires (vir::struct_size_v<Tup> > 0 and N > 0)
170 and ((simdize_impl<vir::struct_element_t<Is, Tup>, N>::type::size() == N) and ...)
171 struct make_simd_tuple<Tup, N, std::index_sequence<Is...>>
172 {
173 using type = std::tuple<typename simdize_impl<vir::struct_element_t<Is, Tup>, N>::type...>;
174 };
175
// Declaration-only overload set; no definitions appear here. Each overload takes an
// instantiation of a class template and names, as its return type, the same template with
// every type argument replaced by simdize_impl<Arg, N>::type. All overloads require the
// replaced types to report size() == N.
//
// Overload 1: class templates that take type parameters only.
179 template <std::size_t N, template <typename...> class Tpl, typename... Ts>
180 requires ((simdize_impl<Ts, N>::type::size() == N) and ...)
181 Tpl<typename simdize_impl<Ts, N>::type...>
182 simdize_template_arguments_impl(const Tpl<Ts...>&);
183
// Overload 2: one type parameter followed by at least one non-type argument; the non-type
// arguments X... are passed through unchanged.
184 template <std::size_t N, template <typename, auto...> class Tpl, typename T, auto... X>
185 requires(sizeof...(X) > 0)
186 and (simdize_impl<T, N>::type::size() == N)
187 Tpl<typename simdize_impl<T, N>::type, X...>
188 simdize_template_arguments_impl(const Tpl<T, X...>&);
189
// Overload 3: two type parameters followed by at least one non-type argument.
190 template <std::size_t N, template <typename, typename, auto...> class Tpl,
191 typename... Ts, auto... X>
192 requires(sizeof...(X) > 0)
193 and ((simdize_impl<Ts, N>::type::size() == N) and ...)
194 Tpl<typename simdize_impl<Ts, N>::type..., X...>
195 simdize_template_arguments_impl(const Tpl<Ts..., X...>&);
196
// Overload 4: three type parameters followed by at least one non-type argument.
197 template <std::size_t N, template <typename, typename, typename, auto...> class Tpl,
198 typename... Ts, auto... X>
199 requires(sizeof...(X) > 0)
200 and ((simdize_impl<Ts, N>::type::size() == N) and ...)
201 Tpl<typename simdize_impl<Ts, N>::type..., X...>
202 simdize_template_arguments_impl(const Tpl<Ts..., X...>&);
203
// Trait wrapping simdize_template_arguments_impl: `type` is the return type of the selected
// overload for an argument of type `const T&`. The primary template is declared only.
204 template <typename T, int N>
205 struct simdize_template_arguments;
206
// Explicit width N: defined only if a call simdize_template_arguments_impl<N>(tt) is valid.
207 template <typename T, int N>
208 requires requires(const T& tt) { simdize_template_arguments_impl<N>(tt); }
209 struct simdize_template_arguments<T, N>
210 { using type = decltype(simdize_template_arguments_impl<N>(std::declval<const T&>())); };
211
// N == 0: the width is taken from default_simdize_size<T>.
212 template <typename T>
213 requires requires(const T& tt) {
214 simdize_template_arguments_impl<default_simdize_size<T>::value>(tt);
215 }
216 struct simdize_template_arguments<T, 0>
217 {
218 using type = decltype(simdize_template_arguments_impl<default_simdize_size_v<T>>(
219 std::declval<const T&>()));
220 };
221
// Shorthand for simdize_template_arguments<T, N>::type; N defaults to 0.
222 template <typename T, int N = 0>
223 using simdize_template_arguments_t = typename simdize_template_arguments<T, N>::type;
224
// Number of data members of T after recursively expanding members that are themselves
// reflectable structs. The primary template is declared only.
230 template <typename T>
231 struct flat_member_count;
232
// Shorthand for flat_member_count<T>::value.
233 template <typename T>
234 inline constexpr int flat_member_count_v = flat_member_count<T>::value;
235
// A type that is not a reflectable struct counts as one member.
236 template <typename T>
237 requires (not vir::reflectable_struct<T>)
238 struct flat_member_count<T>
239 : vir::constexpr_wrapper<1>
240 {};
241
// A reflectable struct counts the sum of the flat member counts of its data members.
242 template <vir::reflectable_struct T>
243 struct flat_member_count<T>
244 {
245 static constexpr int value = []<int... Is>(std::integer_sequence<int, Is...>) {
246 return (flat_member_count_v<vir::struct_element_t<Is, T>> + ...);
247 }(std::make_integer_sequence<int, vir::struct_size_v<T>>());
248 };
249
// Type of the I-th member of T in flattened member order (see flat_member_count).
// The primary template is declared only.
253 template <int I, typename T>
254 struct flat_element;
255
// Shorthand for flat_element<I, T>::type.
256 template <int I, typename T>
257 using flat_element_t = typename flat_element<I, T>::type;
258
// A type that is not a reflectable struct is its own (only) flat element.
259 template <typename T>
260 requires (not vir::reflectable_struct<T>)
261 struct flat_element<0, T>
262 { using type = T; };
263
// No member expands to more than one flat member: flat index and member index coincide.
264 template <int I, vir::reflectable_struct T>
265 requires (flat_member_count_v<T> == struct_size_v<T> and I < struct_size_v<T>)
266 struct flat_element<I, T>
267 { using type = struct_element_t<I, T>; };
268
// At least one member expands further and I falls inside the first member: recurse into it.
// NOTE(review): no specialization visible here covers an index beyond the first member of a
// struct with nested members - confirm whether that case is handled elsewhere.
269 template <int I, vir::reflectable_struct T>
270 requires (flat_member_count_v<T> > struct_size_v<T>
271 and I < flat_member_count_v<struct_element_t<0, T>>)
272 struct flat_element<I, T>
273 { using type = flat_element_t<I, struct_element_t<0, T>>; };
274
278 template <int I, int Offset = 0, vir::reflectable_struct T>
279 constexpr decltype(auto)
280 flat_get(T&& s)
281 {
282 using TT = std::remove_cvref_t<T>;
283 if constexpr (flat_member_count_v<TT> == struct_size_v<TT>)
284 {
285 static_assert(I < struct_size_v<TT>);
286 static_assert(Offset == 0);
287 return vir::struct_get<I>(s);
288 }
289 else
290 {
291 static_assert(flat_member_count_v<TT> > struct_size_v<TT>);
292 constexpr auto size = flat_member_count_v<struct_element_t<Offset, TT>>;
293 if constexpr (I < size)
294 return flat_get<I>(vir::struct_get<Offset>(s));
295 else
296 return flat_get<I - size, size>(s);
297 }
298 }
299
// Returns std::bool_constant<B>, where B is the conjunction over all Is of
// Comp<Element1<Is, T1>::type, Element2<Is, T2>::type>::value.
// Only the index pack is deduced from the argument; everything else is given explicitly.
300 template <template <typename...> class Comp,
301 template <std::size_t, typename> class Element1, typename T1,
302 template <std::size_t, typename> class Element2, typename T2, std::size_t... Is>
303 constexpr std::bool_constant<(Comp<typename Element1<Is, T1>::type,
304 typename Element2<Is, T2>::type>::value and ...)>
305 test_all_of(std::index_sequence<Is...>)
306 { return {}; }
307
// Returns std::bool_constant<B>, where B is the conjunction over all Is of
// Trait::template value<Is>.
311 template <typename Trait, std::size_t... Is>
312 constexpr std::bool_constant<(Trait::template value<Is> and ...)>
313 test_all_of(std::index_sequence<Is...>)
314 { return {}; }
315
// Trait usable with the single-Trait overload of test_all_of: value<I> is true if the I-th
// flat element of the tuple-based vectorization of T (make_simd_tuple<T, N>::type) is the same
// type as the I-th flat element of the template-argument-based vectorization of T
// (simdize_template_arguments_t<T>). N defaults to default_simdize_size<T>::value.
319 template <typename T, int N = default_simdize_size<T>::value,
320 typename TTup = typename make_simd_tuple<T, N>::type,
321 typename TS = simdize_template_arguments_t<T>>
322 struct is_consistent_struct_vectorization
323 {
324 template <std::size_t I, typename L = flat_element_t<I, TTup>,
325 typename R = flat_element_t<I, TS>>
326 static constexpr bool value = std::same_as<L, R>;
327 };
328 }
329
334 template <typename T>
339 and bool(detail::test_all_of<detail::is_consistent_struct_vectorization<T>>(
340 std::make_index_sequence<vir::detail::flat_member_count_v<T>>()));
341
342 namespace detail
343 {
344 template <reflectable_struct Tup, int N>
346 and requires { default_simdize_size<Tup>::value; }
347 and std::is_destructible_v<simd_tuple<Tup, N == 0 ? default_simdize_size_v<Tup> : N>>
348 struct simdize_impl<Tup, N>
349 {
350 static_assert(requires { typename simdize_impl<vir::struct_element_t<0, Tup>, N>::type; });
351
352 using type = simd_tuple<Tup, N == 0 ? default_simdize_size_v<Tup> : N>;
353 };
354
// simdize_impl for types satisfying vectorizable_struct_template: `type` is
// vectorized_struct<T, W> with W = N, or default_simdize_size_v<T> when N == 0.
// Only viable if default_simdize_size<T>::value exists and vectorized_struct<T, W> is
// destructible (the primary vectorized_struct template deletes its destructor).
355 template <vectorizable_struct_template T, int N>
356 requires requires { default_simdize_size<T>::value; }
357 and std::is_destructible_v<vectorized_struct<T, N == 0 ? default_simdize_size_v<T> : N>>
358 struct simdize_impl<T, N>
359 { using type = vectorized_struct<T, N == 0 ? default_simdize_size_v<T> : N>; };
360 } // namespace detail
361
// simd_tuple<T, N> for a reflectable struct T: stores one simdized object per data member of
// T in a std::tuple (detail::make_simd_tuple<T, N>::type). Only viable if that tuple type
// exists.
367 template <reflectable_struct T, int N>
368 requires requires { typename detail::make_simd_tuple<T, N>::type; }
369 class simd_tuple<T, N>
370 {
371 using tuple_type = typename detail::make_simd_tuple<T, N>::type;
372
// the per-member data-parallel objects
373 tuple_type elements;
374
// index sequence 0 ... struct_size_v<T> - 1, used to expand over all members
375 static constexpr auto tuple_size_idx_seq = std::make_index_sequence<vir::struct_size_v<T>>();
376
377 public:
// The element type of this simd-like type.
379 using value_type = T;
380
// The mask type of the first tuple element.
382 using mask_type = typename std::tuple_element_t<0, tuple_type>::mask_type;
383
// The number of elements (of value_type) contained in objects of this type.
385 static constexpr auto size = vir::cw<N>;
386
// alignof(U), with U defaulting to T
387 template <typename U = T>
388 inline static constexpr std::size_t memory_alignment = alignof(U);
389
// Member-wise construction: one argument per tuple element, each tuple element must be
// constructible from the corresponding argument type.
390 template <typename... Ts>
391 requires (sizeof...(Ts) == std::tuple_size_v<tuple_type>
392 and detail::test_all_of<std::is_constructible, std::tuple_element, tuple_type,
393 std::tuple_element, std::tuple<Ts...>
394 >(tuple_size_idx_seq).value)
395 constexpr
396 simd_tuple(Ts&&... args)
397 : elements{static_cast<Ts&&>(args)...}
398 {}
399
// Construction from an already assembled tuple of simdized members.
400 constexpr
401 simd_tuple(const tuple_type& init)
402 : elements(init)
403 {}
404
// Construction from a reflectable struct U with the same number of members: every tuple
// element is constructed from the corresponding member of `init`. The constructor is explicit
// unless every member type of U is convertible to the corresponding tuple element type.
408 template <reflectable_struct U>
409 requires (struct_size_v<U> == struct_size_v<T>
410 and detail::test_all_of<std::is_constructible, std::tuple_element, tuple_type,
411 struct_element, U>(tuple_size_idx_seq).value)
412 constexpr
413 explicit(not detail::test_all_of<std::is_convertible, struct_element, U,
414 std::tuple_element, tuple_type>(tuple_size_idx_seq).value)
415 simd_tuple(const U& init)
416 : elements([&]<std::size_t... Is>(std::index_sequence<Is...>) {
417 return tuple_type {std::tuple_element_t<Is, tuple_type>(vir::struct_get<Is>(init))...};
418 }(tuple_size_idx_seq))
419 {}
420
// Conversion to a reflectable struct U with the same number of members: every member of U is
// initialized from a static_cast of the corresponding tuple element. The conversion is
// explicit unless the member types of U are identical to the tuple element types.
421 template <reflectable_struct U>
422 requires (struct_size_v<U> == struct_size_v<T>
423 and detail::test_all_of<std::is_constructible, struct_element, U,
424 std::tuple_element, tuple_type>(tuple_size_idx_seq)
425 .value)
426 constexpr
427 explicit(not detail::test_all_of<std::is_same, struct_element, U,
428 std::tuple_element, tuple_type>(tuple_size_idx_seq).value)
429 operator U() const
430 {
431 return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
432 return U {static_cast<struct_element_t<Is, U>>(std::get<Is>(elements))...};
433 }(tuple_size_idx_seq);
434 }
435
// Access to the underlying std::tuple (mutable and const).
436 constexpr tuple_type&
437 as_tuple()
438 { return elements; }
439
440 constexpr tuple_type const&
441 as_tuple() const
442 { return elements; }
443
// Returns a T assembled from element i of every tuple member. Not available if T is an
// array type.
444 constexpr auto
445 operator[](std::size_t i) const
446 requires (not std::is_array_v<T>)
447 {
448 return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
449 return T{std::get<Is>(elements)[i]...};
450 }(tuple_size_idx_seq);
451 }
452
// Load constructor: tuple member Is is constructed from a callable that returns member Is of
// it[i] for the index i it is called with. The Flags argument is accepted but not used.
458 template <std::contiguous_iterator It, typename Flags = stdx::element_aligned_tag>
459 requires std::same_as<std::iter_value_t<It>, T>
460 constexpr
461 simd_tuple(It it, Flags = {})
462 : elements([&]<std::size_t... Is>(std::index_sequence<Is...>) {
463 return tuple_type {std::tuple_element_t<Is, tuple_type>([&](size_t i) {
464 return struct_get<Is>(it[i]);
465 })...};
466 }(tuple_size_idx_seq))
467 {}
468
// Same as the load constructor, but assigns to the members of an existing object.
469 template <std::contiguous_iterator It, typename Flags = stdx::element_aligned_tag>
470 requires std::same_as<std::iter_value_t<It>, T>
471 constexpr void
472 copy_from(It it, Flags = {})
473 {
474 [&]<std::size_t... Is>(std::index_sequence<Is...>) {
475 ((std::get<Is>(elements) = std::tuple_element_t<Is, tuple_type>([&](size_t i) {
476 return struct_get<Is>(it[i]);
477 })), ...);
478 }(tuple_size_idx_seq);
479 }
480
// Store: writes operator[](i) to it[i] for every i in [0, size()). The Flags argument is
// accepted but not used.
486 template <std::contiguous_iterator It, typename Flags = stdx::element_aligned_tag>
487 requires std::output_iterator<It, T>
488 constexpr void
489 copy_to(It it, Flags = {}) const
490 {
491 for (std::size_t i = 0; i < size(); ++i)
492 it[i] = operator[](i);
493 }
494 };
495
501 template <std::size_t I, reflectable_struct T, int N>
503 constexpr decltype(auto)
504 get(const simd_tuple<T, N>& tup)
505 { return std::get<I>(tup.as_tuple()); }
506
// Tuple-like access for a mutable simd_tuple: returns std::get<I> of the underlying tuple.
// Not available if T satisfies vectorizable_struct_template.
507 template <std::size_t I, reflectable_struct T, int N>
508 requires (not vectorizable_struct_template<T>)
509 constexpr decltype(auto)
510 get(simd_tuple<T, N>& tup)
511 { return std::get<I>(tup.as_tuple()); }
512
516 template <vectorizable_struct_template T, int N>
517 requires requires { typename detail::simdize_template_arguments_t<T, N>; }
518 class vectorized_struct<T, N> : public detail::simdize_template_arguments_t<T, N>
519 {
520 using tuple_type = typename detail::make_simd_tuple<T, N>::type;
521 using base_type = detail::simdize_template_arguments_t<T, N>;
522
523 static constexpr auto _flat_member_count = detail::flat_member_count_v<T>;
524 static constexpr auto _flat_member_idx_seq = std::make_index_sequence<_flat_member_count>();
525 static constexpr auto _struct_size = struct_size_v<T>;
526 static constexpr auto _struct_size_idx_seq = std::make_index_sequence<_struct_size>();
527
528 constexpr base_type&
529 _as_base_type()
530 { return *this; }
531
532 constexpr base_type const&
533 _as_base_type() const
534 { return *this; }
535
536 public:
537 using value_type = T;
538 using mask_type = typename detail::flat_element_t<0, base_type>::mask_type;
539
540 static constexpr auto size = vir::cw<N>;
541
542 template <typename U = T>
543 inline static constexpr std::size_t memory_alignment = alignof(U);
544
545 template <typename... Ts>
546 requires requires(Ts&&... args) { base_type{static_cast<Ts&&>(args)...}; }
547 VIR_ALWAYS_INLINE constexpr
548 vectorized_struct(Ts&&... args)
549 : base_type{static_cast<Ts&&>(args)...}
550 {}
551
552 VIR_ALWAYS_INLINE constexpr
553 vectorized_struct(const base_type& init)
554 : base_type(init)
555 {}
556
557 // broadcast and or vector copy/conversion
558 template <reflectable_struct U>
559 requires (struct_size_v<U> == _struct_size
560 and detail::test_all_of<std::is_constructible, std::tuple_element, tuple_type,
561 struct_element, U>(_struct_size_idx_seq).value)
562 constexpr
563 explicit(not detail::test_all_of<std::is_convertible, struct_element, U,
564 std::tuple_element, tuple_type>(_struct_size_idx_seq).value)
565 vectorized_struct(const U& init)
566 : base_type([&]<std::size_t... Is>(std::index_sequence<Is...>) {
567 return base_type {std::tuple_element_t<Is, tuple_type>(vir::struct_get<Is>(init))...};
568 }(_struct_size_idx_seq))
569 {}
570
571 VIR_ALWAYS_INLINE constexpr T
572 operator[](std::size_t i) const
573 {
574 return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
575 return T{detail::flat_get<Is>(*this)[i]...};
576 }(_flat_member_idx_seq);
577 }
578
579 VIR_ALWAYS_INLINE
580 static constexpr base_type
581 _load_elements_via_permute(const T* addr)
582 {
583#if VIR_HAVE_WORKING_SHUFFLEVECTOR
584 if (not std::is_constant_evaluated())
585 if constexpr (N > 1 and sizeof(T) * N == sizeof(base_type) and _flat_member_count >= 2
586 and std::has_single_bit(unsigned(N)))
587 {
588 const std::byte* byte_ptr = reinterpret_cast<const std::byte*>(addr);
589 // struct_size_v == 2 doesn't need anything, the fallback works fine, unless
590 // we allow unordered access
591 using V0 = detail::flat_element_t<0, tuple_type>;
592 using V1 = detail::flat_element_t<1, tuple_type>;
593 if constexpr (_flat_member_count == 2 and N > 2)
594 {
595 static_assert(N == V0::size());
596 constexpr int N2 = N / 2;
597 using U = typename V0::value_type;
598 if constexpr (sizeof(U) == sizeof(vir::struct_element_t<0, T>)
599 and sizeof(U) == sizeof(vir::struct_element_t<1, T>)
600 and std::has_single_bit(V0::size())
601 and V0::size() <= stdx::native_simd<U>::size())
602 {
603 using V [[gnu::vector_size(sizeof(U) * N)]] = U;
604
605 V x0, x1;
606 // [a0 b0 a1 b1 a2 b2 a3 b3]
607 std::memcpy(&x0, std::addressof(vir::struct_get<0>(addr[0])), sizeof(V));
608 // [a4 b4 a5 b5 a6 b6 a7 b7]
609 std::memcpy(&x1, std::addressof(vir::struct_get<0>(addr[N2])), sizeof(V));
610
611 return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
612 return base_type {
613 std::bit_cast<V0>(
614 __builtin_shufflevector(x0, x1, (Is * 2)..., (N + Is * 2)...)),
615 std::bit_cast<V1>(
616 __builtin_shufflevector(x0, x1, (1 + Is * 2)..., (1 + N + Is * 2)...))
617 };
618 }(std::make_index_sequence<N2>());
619
620 /* if constexpr (sizeof(V) == 32)
621 [&]<std::size_t... Is>(std::index_sequence<Is...>) {
622 constexpr auto N4 = N2 / 2;
623 const auto tmp = x0;
624 // [a0 b0 a1 b1 a4 b4 a5 b5]
625 x0 = __builtin_shufflevector(tmp, x1, Is..., (Is + N)...);
626 // [a2 b2 a3 b3 a6 b6 a7 b7]
627 x1 = __builtin_shufflevector(tmp, x1, (Is + N2) ..., (Is + N + N2)...);
628 const lo0 = __builtin_shufflevector(x0, x1, ((Is & 1 ? N : 0) + Is / 2)...,
629 N2 + ((Is & 1 ? N : 0) + Is / 2)...);
630 const hi0 = __builtin_shufflevector(x0, x1, N4 + ((Is & 1 ? N : 0) + Is / 2)...,
631 N4 + N2 + ((Is & 1 ? N : 0) + Is / 2)...);
632 }(std::make_index_sequence<N2>());
633
634 V0 x0(std::addressof(vir::struct_get<0>(addr[0])), stdx::element_aligned);
635 V0 x1(std::addressof(vir::struct_get<0>(addr[V0::size()])), stdx::element_aligned);
636
637
638 // [b4 a4 b5 a5 b6 a6 b7 a7]
639 x1 = simd_permute(x1, simd_permutations::swap_neighbors<>);
640 // [0 1 0 1 0 1 0 1]
641 constexpr auto mask = []<std::size_t... Is>(std::index_sequence<Is...>) {
642 return detail::mask_constant<V0, (Is & 1)...>;
643 }(std::make_index_sequence<V0::size()>());
644 V0 tmp = x1;
645 // [b4 b0 b5 b1 b6 b2 b7 b3]
646 stdx::where(mask, x1) = x0;
647 // [b0 b4 b1 b5 b2 b6 b3 b7]
648 x1 = simd_permute(x1, simd_permutations::swap_neighbors<>);
649 // [a0 a4 a1 a5 a2 a6 a3 a7]
650 stdx::where(mask, x0) = tmp;
651 return base_type {x0, x1};*/
652 }
653 }
654 else if constexpr (std::same_as<vir::as_tuple_t<T>, std::tuple<float, float, float>>)
655 {
656 if constexpr (std::same_as<tuple_type, std::tuple<V0, V0, V0>>
657 and V0::size() == 8 and std::is_trivially_copyable_v<V0>)
658 {
659 // Implementation notes:
660 // 1. Three gather instructions with indexes 0, 3, 6, 9, 12, 15, 18, 21 is super
661 // slow
662 // 2. Eight 3/4-element loads -> concat to 8 elements -> unpack also much slower
663
664 // abc abc abc
665 // a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121
666 // b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211
667 // c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112
668
669 using v8sf [[gnu::vector_size(32)]] = float;
670 if constexpr (true) // allow_unordered
671 {
672 v8sf x0, x1, x2, a0, b0, c0;
673 std::memcpy(&x0, byte_ptr + 0, 32);
674 std::memcpy(&x1, byte_ptr + 32, 32);
675 std::memcpy(&x2, byte_ptr + 64, 32);
676
677 a0 = detail::blend<1, 4, 7>(x0, x1);
678 a0 = detail::blend<2, 5>(a0, x2);
679 // a0 a3 a6 a1 a4 a7 a2 a5
680
681 b0 = detail::blend<2, 5>(x0, x1);
682 b0 = detail::blend<0, 3, 6>(b0, x2);
683 // b5 b0 b3 b6 b1 b4 b7 b2
684
685 c0 = detail::blend<0, 3, 6>(x0, x1);
686 c0 = detail::blend<1, 4, 7>(c0, x2);
687 // c2 c5 c0 c3 c6 c1 c4 c7
688
689 V0 a = std::bit_cast<V0>(a0);
690 V0 b = simd_permute(std::bit_cast<V0>(b0), simd_permutations::rotate<1>);
691 V0 c = simd_permute(std::bit_cast<V0>(c0), simd_permutations::rotate<2>);
692 return base_type {a, b, c};
693 }
694 else
695 {
696 v8sf a, b, c;
697 std::memcpy(&a, byte_ptr + 0, 32);
698 std::memcpy(&b, byte_ptr + 32, 32);
699 std::memcpy(&c, byte_ptr + 64, 32);
700
701 // a0 b0 c0 a1 b5 c5 a6 b6
702 v8sf ac0 = __builtin_shufflevector(a, c, 0, 1, 2, 3, 8, 9, 10, 11);
703
704 // b1 c1 a2 b2 c6 a7 b7 c7
705 v8sf ac1 = __builtin_shufflevector(a, c, 4, 5, 6, 7, 12, 13, 14, 15);
706
707 // a0 a3 a2 a1 a4 a7 a6 a5
708 V0 tmp0 = std::bit_cast<V0>(detail::blend<2, 5>(
709 detail::blend<1, 4, 7>(ac0, b), ac1));
710
711 // b1 b0 b3 b2 b5 b4 b7 b6
712 V0 tmp1 = std::bit_cast<V0>(detail::blend<0, 3, 6>(
713 detail::blend<2, 5>(ac0, b), ac1));
714
715 // c2 c1 c0 c3 c6 c5 c4 c7
716 V0 tmp2 = std::bit_cast<V0>(detail::blend<1, 4, 7>(
717 detail::blend<0, 3, 6>(ac0, b), ac1));
718
719 return {
720 simd_permute(tmp0, [](size_t i) {
721 return std::array{0, 3, 2, 1, 4, 7, 6, 5}[i];
722 }),
723 simd_permute(tmp1, [](size_t i) {
724 return std::array{1, 0, 3, 2, 5, 4, 7, 6}[i];
725 }),
726 simd_permute(tmp2, [](size_t i) {
727 return std::array{2, 1, 0, 3, 6, 5, 4, 7}[i];
728 })
729 };
730 }
731 }
732 if constexpr (std::same_as<tuple_type, std::tuple<V0, V0, V0>>
733 and V0::size() == 4 and std::is_trivially_copyable_v<V0>)
734 {
735 // abca = [a0 b0 c0 a1]
736 // bcab = [b1 c1 a2 b2]
737 // cabc = [c2 a3 b3 c3]
738
739 using v4sf [[gnu::vector_size(16)]] = float;
740 v4sf abca, bcab, cabc, a, b, c;
741 std::memcpy(&abca, byte_ptr + 0, 16);
742 std::memcpy(&bcab, byte_ptr + 16, 16);
743 std::memcpy(&cabc, byte_ptr + 32, 16);
744
745 // [a0 a1 a2 a3]
746 a = __builtin_shufflevector(abca, detail::blend<2, 3>(cabc, bcab), 0, 3, 6, 5);
747
748 // [b0 b1 b2 b3]
749 b = __builtin_shufflevector(detail::blend<0>(abca, bcab), // [b1 b0 c0 a1]
750 detail::blend<2>(bcab, cabc), // [b1 c1 b3 b2]
751 1, 0, 7, 6);
752
753 c = __builtin_shufflevector(detail::blend<2, 3>(bcab, abca), // [b1 c1 c0 a1]
754 cabc, 2, 1, 4, 7);
755
756 return base_type {std::bit_cast<V0>(a), std::bit_cast<V0>(b),
757 std::bit_cast<V0>(c)};
758 }
759 }
760 if constexpr (_flat_member_count == 3 and N > 2)
761 {
762 using V2 = detail::flat_element_t<2, tuple_type>;
763 static_assert(N == V0::size());
764 static_assert(std::has_single_bit(unsigned(N)));
765 using U = typename V0::value_type;
766 if constexpr (sizeof(U) == sizeof(detail::flat_element_t<0, T>)
767 and sizeof(U) == sizeof(detail::flat_element_t<1, T>)
768 and sizeof(U) == sizeof(detail::flat_element_t<2, T>)
769 and V0::size() <= stdx::native_simd<U>::size())
770 {
771 using V [[gnu::vector_size(sizeof(U) * N)]] = U;
772
773 V x0, x1, x2;
774 // [a0 b0 c0 a1 b1 c1 a2 b2]
775 std::memcpy(&x0, byte_ptr, sizeof(V));
776 // [c2 a3 b3 c3 a4 b4 c4 a5]
777 std::memcpy(&x1, byte_ptr + sizeof(V), sizeof(V));
778 // [b5 c5 a6 b6 c6 a7 b7 c7]
779 std::memcpy(&x2, byte_ptr + 2 * sizeof(V), sizeof(V));
780
781 return [&]<int... Is>(std::integer_sequence<int, Is...>)
782 VIR_LAMBDA_ALWAYS_INLINE {
783 return base_type {
784 std::bit_cast<V0>(
785 __builtin_shufflevector(
786 __builtin_shufflevector(
787 x0, x1, (Is % 3 == 0 ? Is : (Is + N) % 3 == 0 ? Is + N : -1)...),
788 x2, (Is * 3 < N ? Is * 3 : Is * 3 - N)...)),
789 std::bit_cast<V1>(
790 __builtin_shufflevector(
791 __builtin_shufflevector(
792 x0, x1, (Is % 3 == 1 ? Is : (Is + N) % 3 == 1 ? Is + N : -1)...),
793 x2, (Is * 3 + 1 < N ? Is * 3 + 1 : Is * 3 - N + 1)...)),
794 std::bit_cast<V2>(
795 __builtin_shufflevector(
796 __builtin_shufflevector(
797 x0, x1, (Is % 3 == 2 ? Is : (Is + N) % 3 == 2 ? Is + N : -1)...),
798 x2, (Is * 3 + 2 < N ? Is * 3 + 2 : Is * 3 - N + 2)...))
799 };
800 }(std::make_integer_sequence<int, N>());
801 }
802 }
803 }
804#endif // VIR_HAVE_WORKING_SHUFFLEVECTOR
805
806 // not optimized fallback
807 return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
808 return base_type {detail::flat_element_t<Is, tuple_type>([&](size_t i) {
809 return detail::flat_get<Is>(addr[i]);
810 })...};
811 }(_flat_member_idx_seq);
812 }
813
814 template <int... Is>
815 VIR_ALWAYS_INLINE constexpr void
816 _store_elements_via_permute(T* addr, std::integer_sequence<int, Is...>) const
817 {
818#if VIR_HAVE_WORKING_SHUFFLEVECTOR
819 if (not std::is_constant_evaluated())
820 if constexpr (N > 2 and _flat_member_count >= 2
821 and sizeof(T) * N == sizeof(base_type)
822 and std::has_single_bit(unsigned(N)))
823 {
824 using V0 = detail::flat_element_t<0, tuple_type>;
825 //using V1 = detail::flat_element_t<1, tuple_type>;
826 static_assert(N == V0::size());
827 using U = typename V0::value_type;
828 using V [[gnu::vector_size(sizeof(U) * N)]] = U;
829 std::byte* byte_ptr = reinterpret_cast<std::byte*>(addr);
830
831 if constexpr (_flat_member_count == 3
832 and V0::size() <= stdx::native_simd<U>::size())
833 {
834 //using V2 = detail::flat_element_t<2, tuple_type>;
835 if constexpr (sizeof(U) == sizeof(detail::flat_element_t<0, T>)
836 and sizeof(U) == sizeof(detail::flat_element_t<1, T>)
837 and sizeof(U) == sizeof(detail::flat_element_t<2, T>))
838 {
839 // [a0 a1 a2 a3 a4 a5 a6 ...]
840 V a = std::bit_cast<V>(detail::flat_get<0>(_as_base_type()));
841 // [b0 b1 b2 b3 b4 b5 b6 ...]
842 V b = std::bit_cast<V>(detail::flat_get<1>(_as_base_type()));
843 // [c0 c1 c2 c3 c4 c5 c6 ...]
844 V c = std::bit_cast<V>(detail::flat_get<2>(_as_base_type()));
845 constexpr auto idx = [](int i, int pos, int a, int b, int c) {
846 constexpr int N3 = N / 3;
847 switch(i % 3)
848 {
849 case 0:
850 return i / 3 + a + pos * N3;
851 case 1:
852 return i / 3 + b + pos * N3 + N;
853 case 2:
854 return i / 3 + c + ((pos + N % 3) % 3) * N3;
855 default:
856 __builtin_unreachable();
857 }
858 };
859 if constexpr (N % 3 == 1)
860 {
861 // [a0 b0 a6 a1|b1 a7 a2 b2|a8 a3 b3 a9|a4 b4 a10 a5]
862 const V aba = __builtin_shufflevector(a, b, idx(Is, 0, 0, 0, 1)...);
863 // [b5 c5 b11 b6|c6 b12 b7 c7|b13 b8 c8 b14|b9 c9 b15 b10]
864 const V bcb = __builtin_shufflevector(b, c, idx(Is, 1, 0, 0, 1)...);
865 // [c10 a11 c0 c11|a12 c1 c12 a13|c2 c13 a14 c3|c14 a15 c4 c15]
866 const V cac = __builtin_shufflevector(c, a, idx(Is, 2, 0, 1, 0)...);
867 // [a0 b0 c0 a1|b1 c1 a2 b2|c2 a3 b3 c3|a4 b4 c4 a5]
868 a = __builtin_shufflevector(aba, cac, (Is % 3 != 2 ? Is : Is + N)...);
869 // [b5 c5 a6 b6|c6 a7 b7 c7|a8 b8 c8 a9|b9 c9 a10 b10]
870 b = __builtin_shufflevector(bcb, aba, (Is % 3 != 2 ? Is : Is + N)...);
871 // [c10 a11 b11 c11|a12 b12 c12 a13|b13 c13 a14 b14|c14 a15 b15 c15]
872 c = __builtin_shufflevector(cac, bcb, (Is % 3 != 2 ? Is : Is + N)...);
873 }
874 else
875 {
876 static_assert(N % 3 == 2);
877 // [a0 b0 a6 a1|b1 a7 a2 b2]
878 const V aba = __builtin_shufflevector(a, b, idx(Is, 0, 0, 0, 2)...);
879 // [c2 a3 c0 c3|a4 c1 c4 a5]
880 const V cac = __builtin_shufflevector(c, a, idx(Is, 1, 0, 1, 0)...);
881 // [b5 c5 b3 b6|c6 b4 b7 c7]
882 const V bcb = __builtin_shufflevector(b, c, idx(Is, 2, 1, 1, 1)...);
883 // [a0 b0 c0 a1|b1 c1 a2 b2]
884 a = __builtin_shufflevector(aba, cac, (Is % 3 != 2 ? Is : Is + N)...);
885 // [c2 a3 b3 c3|a4 b4 c4 a5]
886 b = __builtin_shufflevector(cac, bcb, (Is % 3 != 2 ? Is : Is + N)...);
887 // [b5 c5 a6 b6|c6 a7 b7 c7]
888 c = __builtin_shufflevector(bcb, aba, (Is % 3 != 2 ? Is : Is + N)...);
889 }
890 std::memcpy(byte_ptr, &a, sizeof(V));
891 std::memcpy(byte_ptr + sizeof(V), &b, sizeof(V));
892 std::memcpy(byte_ptr + 2 * sizeof(V), &c, sizeof(V));
893 return;
894 }
895 }
896 }
897#endif
898 for (int i = 0; i < N; ++i)
899 addr[i] = operator[](i);
900 }
901
907 template <std::contiguous_iterator It, typename Flags = stdx::element_aligned_tag>
908 requires std::same_as<std::iter_value_t<It>, T>
909 constexpr
910 vectorized_struct(It it, Flags = {})
911 : base_type(_load_elements_via_permute(std::to_address(it)))
912 {}
913
914 template <std::contiguous_iterator It, typename Flags = stdx::element_aligned_tag>
915 requires std::same_as<std::iter_value_t<It>, T>
916 constexpr void
917 copy_from(It it, Flags = {})
918 { static_cast<base_type&>(*this) = _load_elements_via_permute(std::to_address(it)); }
919
925 template <std::contiguous_iterator It, typename Flags = stdx::element_aligned_tag>
926 requires std::output_iterator<It, T>
927 constexpr void
928 copy_to(It it, Flags = {}) const
929 { _store_elements_via_permute(std::to_address(it), std::make_integer_sequence<int, N>()); }
930
931 // The following enables implicit conversions added by vectorized_struct. E.g.
932 // `simdize<Point> + Point` will broadcast the latter to a `simdize<Point>` before applying
933 // operator+.
934 template <typename R>
935 using _op_return_type = std::conditional_t<std::same_as<R, base_type>, vectorized_struct, R>;
936
937#define VIR_OPERATOR_FWD(op) \
938 VIR_ALWAYS_INLINE friend constexpr auto \
939 operator op(vectorized_struct const& a, vectorized_struct const& b) \
940 requires requires(base_type const& x) { {x op x}; } \
941 { \
942 return static_cast<_op_return_type<decltype(a._as_base_type() op b._as_base_type())>>( \
943 a._as_base_type() op b._as_base_type()); \
944 } \
945 \
946 VIR_ALWAYS_INLINE friend constexpr auto \
947 operator op(base_type const& a, vectorized_struct const& b) \
948 requires requires(base_type const& x) { {x op x}; } \
949 { \
950 return static_cast<_op_return_type<decltype(a op b._as_base_type())>>( \
951 a op b._as_base_type()); \
952 } \
953 \
954 VIR_ALWAYS_INLINE friend constexpr auto \
955 operator op(vectorized_struct const& a, base_type const& b) \
956 requires requires(base_type const& x) { {x op x}; } \
957 { \
958 return static_cast<_op_return_type<decltype(a._as_base_type() op b)>>( \
959 a._as_base_type() op b); \
960 }
961
962 VIR_OPERATOR_FWD(+)
963 VIR_OPERATOR_FWD(-)
964 VIR_OPERATOR_FWD(*)
965 VIR_OPERATOR_FWD(/)
966 VIR_OPERATOR_FWD(%)
967 VIR_OPERATOR_FWD(&)
968 VIR_OPERATOR_FWD(|)
969 VIR_OPERATOR_FWD(^)
970 VIR_OPERATOR_FWD(<<)
971 VIR_OPERATOR_FWD(>>)
972 VIR_OPERATOR_FWD(==)
973 VIR_OPERATOR_FWD(!=)
974 VIR_OPERATOR_FWD(>=)
975 VIR_OPERATOR_FWD(>)
976 VIR_OPERATOR_FWD(<=)
977 VIR_OPERATOR_FWD(<)
978#undef VIR_OPERATOR_FWD
979 };
980
// Tuple-like access for a const vectorized_struct: returns vir::struct_get<I> applied to the
// object viewed as its base class detail::simdize_template_arguments_t<T, N>.
986 template <std::size_t I, vectorizable_struct_template T, int N>
987 constexpr decltype(auto)
988 get(const vectorized_struct<T, N>& tup)
989 {
990 return vir::struct_get<I>(
991 static_cast<const detail::simdize_template_arguments_t<T, N>&>(tup));
992 }
993
// Same as above for a mutable vectorized_struct.
994 template <std::size_t I, vectorizable_struct_template T, int N>
995 constexpr decltype(auto)
996 get(vectorized_struct<T, N>& tup)
997 {
998 return vir::struct_get<I>(
999 static_cast<detail::simdize_template_arguments_t<T, N>&>(tup));
1000 }
1001
1006 template <typename T>
1008 = reflectable_struct<T> and detail::recursively_vectorizable<T>::value;
1009
// Apply a type transformation to a scalar type to produce a data-parallel type:
// alias for detail::simdize_impl<T, N>::type. N defaults to 0; the simdize_impl
// specializations replace N == 0 by a default width.
1018 template <typename T, int N = 0>
1019 using simdize = typename detail::simdize_impl<T, N>::type;
1020
// Re-applies simdize to the value_type of V with width N. V must provide a static size() and
// a nested value_type that is either a reflectable struct or vectorizable.
1021 template <int N, typename V>
1022 requires requires {
1023 V::size();
1024 typename V::value_type;
1025 } and (reflectable_struct<typename V::value_type> or vectorizable<typename V::value_type>)
1026 using resize_simdize_t = simdize<typename V::value_type, N>;
1027} // namespace vir
1028
// Tuple protocol for simd_tuple: the tuple size is the number of data members of T.
1032template <vir::reflectable_struct T, int N>
1033 struct std::tuple_size<vir::simd_tuple<T, N>>
1034 : std::integral_constant<std::size_t, vir::struct_size_v<T>>
1035 {};
1036
// Tuple protocol for simd_tuple: element I is element I of the underlying tuple type
// vir::detail::make_simd_tuple<T, N>::type.
1040template <std::size_t I, vir::reflectable_struct T, int N>
1041 struct std::tuple_element<I, vir::simd_tuple<T, N>>
1042 : std::tuple_element<I, typename vir::detail::make_simd_tuple<T, N>::type>
1043 {};
1044
// Tuple protocol for vectorized_struct: the tuple size is the number of data members of T.
1048template <vir::vectorizable_struct_template T, int N>
1049 struct std::tuple_size<vir::vectorized_struct<T, N>>
1050 : std::integral_constant<std::size_t, vir::struct_size_v<T>>
1051 {};
1052
// Tuple protocol for vectorized_struct: element I is vir::struct_element<I, Base>, where Base
// is vir::detail::simdize_template_arguments_t<T, N>.
1053template <std::size_t I, vir::vectorizable_struct_template T, int N>
1054 struct std::tuple_element<I, vir::vectorized_struct<T, N>>
1055 : vir::struct_element<I, vir::detail::simdize_template_arguments_t<T, N>>
1056 {};
1057
1058#endif // VIR_HAVE_STRUCT_REFLECT
1059#endif // VIR_SIMD_SIMDIZE_H_
1060
1061// vim: noet cc=101 tw=100 sw=2 ts=8
constexpr void copy_to(It it, Flags={}) const
Definition simdize.h:489
static constexpr auto size
The number of elements (of value_type) contained in objects of this type.
Definition simdize.h:385
constexpr simd_tuple(It it, Flags={})
Definition simdize.h:461
T value_type
The element type of this simd-like type.
Definition simdize.h:379
typename std::tuple_element_t< 0, tuple_type >::mask_type mask_type
The associated simd_mask type.
Definition simdize.h:382
constexpr vectorized_struct(It it, Flags={})
Definition simdize.h:910
constexpr void copy_to(It it, Flags={}) const
Definition simdize.h:928
Satisfied if T can be used with the following functions and types.
Definition struct_reflect.h:405
A type T is a vectorizable struct template if all of its data members can be vectorized via template ...
Definition simdize.h:336
Definition simdize.h:1008
constexpr Rotate< Offset > rotate
Rotate the elements by Offset.
Definition simd_permute.h:153
This namespace collects libraries and tools authored by Matthias Kretz.
Definition constexpr_wrapper.h:21
constexpr int simdize_size_v
Inline variable for the simdize_size trait.
Definition simdize.h:55
std::remove_reference_t< decltype(struct_get< N >(std::declval< T & >()))> struct_element_t
struct_element_t<N, T> is an alias for the type of the N -th non-static data member of T.
Definition struct_reflect.h:456
constexpr decltype(auto) struct_get(T &&obj)
Returns a cv-qualified reference to the N -th non-static data member in obj.
Definition struct_reflect.h:439
constexpr decltype(auto) get(const simd_tuple< T, N > &tup)
Definition simdize.h:504
constexpr stdx::resize_simd_t< N==0 ? V::size() :N, V > simd_permute(V const &v, F const idx_perm) noexcept
Permute the elements of v using the index permutation function idx_perm.
Definition simd_permute.h:181
typename detail::simdize_impl< T, N >::type simdize
Apply a type transformation to a scalar type to produce a data-parallel type.
Definition simdize.h:1019
constexpr std::size_t struct_size_v
The number of non-static data members of T.
Definition struct_reflect.h:425
C++20 concepts extending the Parallelism TS 2 (which is limited to C++17).
Permutation functions for the Parallelism TS 2 simd types.
Tools for data member reflection of aggregates.
Definition constexpr_wrapper.h:69
Definition simdize.h:37