HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
native_f64x4_avx.hpp
// Copyright Take Vos 2022, 2023.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "native_simd_utility.hpp"
8#include "../utility/module.hpp"
9#include <span>
10#include <array>
11#include <ostream>
12
13namespace hi { inline namespace v1 {
14
15#ifdef HI_HAS_AVX
16
33template<>
34struct native_simd<double,4> {
35 using value_type = double;
36 constexpr static size_t size = 4;
37 using array_type = std::array<value_type, size>;
38 using register_type = __m256d;
39
40 register_type v;
41
42 native_simd(native_simd const&) noexcept = default;
43 native_simd(native_simd&&) noexcept = default;
44 native_simd& operator=(native_simd const&) noexcept = default;
45 native_simd& operator=(native_simd&&) noexcept = default;
46
49 native_simd() noexcept : v(_mm256_setzero_pd()) {}
50
51 [[nodiscard]] explicit native_simd(register_type other) noexcept : v(other) {}
52
53 [[nodiscard]] explicit operator register_type() const noexcept
54 {
55 return v;
56 }
57
65 [[nodiscard]] native_simd(
66 value_type a,
67 value_type b = value_type{0},
68 value_type c = value_type{0},
69 value_type d = value_type{0}) noexcept :
70 v(_mm256_set_pd(d, c, b, a))
71 {
72 }
73
74 [[nodiscard]] explicit native_simd(value_type const *other) noexcept : v(_mm256_loadu_pd(other)) {}
75
76 void store(value_type *out) const noexcept
77 {
79 _mm256_storeu_pd(out, v);
80 }
81
82 [[nodiscard]] explicit native_simd(void const *other) noexcept : v(_mm256_loadu_pd(static_cast<value_type const *>(other))) {}
83
84 void store(void *out) const noexcept
85 {
87 _mm256_storeu_pd(static_cast<value_type *>(out), v);
88 }
89
90 [[nodiscard]] explicit native_simd(std::span<value_type const> other) noexcept
91 {
92 hi_axiom(other.size() >= size);
93 v = _mm256_loadu_pd(other.data());
94 }
95
96 void store(std::span<value_type> out) const noexcept
97 {
98 hi_axiom(out.size() >= size);
99 _mm256_storeu_pd(out.data(), v);
100 }
101
102 [[nodiscard]] explicit native_simd(array_type other) noexcept : v(_mm256_loadu_pd(other.data())) {}
103
104 [[nodiscard]] explicit operator array_type() const noexcept
105 {
106 auto r = array_type{};
107 _mm256_storeu_pd(r.data(), v);
108 return r;
109 }
110
111 [[nodiscard]] explicit native_simd(native_simd<float, 4> const& a) noexcept;
112 [[nodiscard]] explicit native_simd(native_simd<int32_t, 4> const& a) noexcept;
113 //[[nodiscard]] explicit native_simd(native_f64x2 const &a, native_f64x2 const &b) noexcept;
114
124 [[nodiscard]] static native_simd broadcast(value_type a) noexcept
125 {
126 return native_simd{_mm256_set1_pd(a)};
127 }
128
138 [[nodiscard]] static native_simd broadcast(native_simd a) noexcept
139 {
140#ifdef HI_HAS_AVX2
141 return native_simd{_mm256_permute4x64_pd(a.v, 0b00'00'00'00)};
142#else
143 hilet tmp = _mm256_permute_pd(a.v, 0b0000);
144 return native_simd{_mm256_permute2f128_pd(tmp, tmp, 0b0000'0000)};
145#endif
146 }
147
150 [[nodiscard]] static native_simd ones() noexcept
151 {
152#ifdef HI_HAS_AVX2
153 auto ones = _mm256_undefined_si256();
154 ones = _mm256_cmpeq_epi32(ones, ones);
155 return native_simd{_mm256_castsi256_pd(ones)};
156#else
157 auto ones = _mm256_setzero_pd();
158 ones = _mm256_cmpeq_pd(ones, ones);
159 return native_simd{ones};
160#endif
161 }
162
163 [[nodiscard]] static native_simd from_mask(size_t a) noexcept
164 {
165 hi_axiom(a <= 0b1111);
166
167 uint64_t a_ = a;
168
169 a_ <<= 31;
170 auto tmp = _mm_cvtsi32_si128(truncate<uint32_t>(a_));
171 a_ >>= 1;
172 tmp = _mm_insert_epi32(tmp, truncate<uint32_t>(a_), 1);
173 a_ >>= 1;
174 tmp = _mm_insert_epi32(tmp, truncate<uint32_t>(a_), 2);
175 a_ >>= 1;
176 tmp = _mm_insert_epi32(tmp, truncate<uint32_t>(a_), 3);
177
178 tmp = _mm_srai_epi32(tmp, 31);
179 return native_simd{_mm256_castsi256_pd(_mm256_cvtepi32_epi64(tmp))};
180 }
181
184 [[nodiscard]] size_t mask() const noexcept
185 {
186 return narrow_cast<size_t>(_mm256_movemask_pd(v));
187 }
188
195 [[nodiscard]] friend bool equal(native_simd a, native_simd b) noexcept
196 {
197 return _mm256_movemask_pd(_mm256_cmp_pd(a.v, b.v, _CMP_EQ_UQ)) == 0b1111;
198 }
199
200 [[nodiscard]] friend native_simd
201 almost_eq(native_simd a, native_simd b, value_type epsilon = std::numeric_limits<value_type>::epsilon()) noexcept
202 {
203 auto abs_diff = abs(a - b);
204 return abs_diff < broadcast(epsilon);
205 }
206
207 [[nodiscard]] friend bool
208 almost_equal(native_simd a, native_simd b, value_type epsilon = std::numeric_limits<value_type>::epsilon())
209 {
210 return almost_eq(a, b, epsilon).mask() == 0b1111;
211 }
212
213 [[nodiscard]] friend native_simd operator==(native_simd a, native_simd b) noexcept
214 {
215 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ)};
216 }
217
218 [[nodiscard]] friend native_simd operator!=(native_simd a, native_simd b) noexcept
219 {
220 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_NEQ_UQ)};
221 }
222
223 [[nodiscard]] friend native_simd operator<(native_simd a, native_simd b) noexcept
224 {
225 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_LT_OQ)};
226 }
227
228 [[nodiscard]] friend native_simd operator>(native_simd a, native_simd b) noexcept
229 {
230 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_GT_OQ)};
231 }
232
233 [[nodiscard]] friend native_simd operator<=(native_simd a, native_simd b) noexcept
234 {
235 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_LE_OQ)};
236 }
237
238 [[nodiscard]] friend native_simd operator>=(native_simd a, native_simd b) noexcept
239 {
240 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_GE_OQ)};
241 }
242
243 [[nodiscard]] friend native_simd operator+(native_simd a) noexcept
244 {
245 return a;
246 }
247
248 [[nodiscard]] friend native_simd operator+(native_simd a, native_simd b) noexcept
249 {
250 return native_simd{_mm256_add_pd(a.v, b.v)};
251 }
252
253 [[nodiscard]] friend native_simd operator-(native_simd a, native_simd b) noexcept
254 {
255 return native_simd{_mm256_sub_pd(a.v, b.v)};
256 }
257
258 [[nodiscard]] friend native_simd operator-(native_simd a) noexcept
259 {
260 return native_simd{} - a;
261 }
262
263 [[nodiscard]] friend native_simd operator*(native_simd a, native_simd b) noexcept
264 {
265 return native_simd{_mm256_mul_pd(a.v, b.v)};
266 }
267
268 [[nodiscard]] friend native_simd operator/(native_simd a, native_simd b) noexcept
269 {
270 return native_simd{_mm256_div_pd(a.v, b.v)};
271 }
272
273 [[nodiscard]] friend native_simd operator&(native_simd a, native_simd b) noexcept
274 {
275 return native_simd{_mm256_and_pd(a.v, b.v)};
276 }
277
278 [[nodiscard]] friend native_simd operator|(native_simd a, native_simd b) noexcept
279 {
280 return native_simd{_mm256_or_pd(a.v, b.v)};
281 }
282
283 [[nodiscard]] friend native_simd operator^(native_simd a, native_simd b) noexcept
284 {
285 return native_simd{_mm256_xor_pd(a.v, b.v)};
286 }
287
288 [[nodiscard]] friend native_simd operator~(native_simd a) noexcept
289 {
290 return not_and(a, ones());
291 }
292
293 [[nodiscard]] friend native_simd min(native_simd a, native_simd b) noexcept
294 {
295 return native_simd{_mm256_min_pd(a.v, b.v)};
296 }
297
298 [[nodiscard]] friend native_simd max(native_simd a, native_simd b) noexcept
299 {
300 return native_simd{_mm256_max_pd(a.v, b.v)};
301 }
302
303 [[nodiscard]] friend native_simd abs(native_simd a) noexcept
304 {
305 return not_and(broadcast(-0.0f), a);
306 }
307
308 [[nodiscard]] friend native_simd floor(native_simd a) noexcept
309 {
310 return native_simd{_mm256_floor_pd(a.v)};
311 }
312
313 [[nodiscard]] friend native_simd ceil(native_simd a) noexcept
314 {
315 return native_simd{_mm256_ceil_pd(a.v)};
316 }
317
318 template<native_rounding_mode Rounding = native_rounding_mode::current>
319 [[nodiscard]] friend native_simd round(native_simd a) noexcept
320 {
321 return native_simd{_mm256_round_pd(a.v, to_underlying(Rounding))};
322 }
323
326 [[nodiscard]] friend native_simd rcp(native_simd a) noexcept
327 {
328 return native_simd{_mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), a.v)};
329 }
330
333 [[nodiscard]] friend native_simd sqrt(native_simd a) noexcept
334 {
335 return native_simd{_mm256_sqrt_pd(a.v)};
336 }
337
344 [[nodiscard]] friend native_simd rsqrt(native_simd a) noexcept
345 {
346 return rcp(sqrt(a));
347 }
348
355 template<size_t Mask>
356 [[nodiscard]] friend native_simd set_zero(native_simd a) noexcept
357 {
358 static_assert(Mask <= 0b1111);
359 return blend<Mask>(a, native_simd{});
360 }
361
369 template<size_t Index>
370 [[nodiscard]] friend native_simd insert(native_simd a, value_type b) noexcept
371 {
372 static_assert(Index < 4);
373 return blend<1_uz << Index>(a, broadcast(b));
374 }
375
382 template<size_t Index>
383 [[nodiscard]] friend value_type get(native_simd a) noexcept
384 {
385 static_assert(Index < size);
386
387#ifdef HI_HAS_AVX2
388 return _mm256_cvtsd_f64(_mm256_permute4x64_pd(a.v, Index));
389
390#else
391 constexpr auto hi_index = Index / (size / 2);
392 constexpr auto lo_index = Index % (size / 2);
393
394 hilet hi = _mm256_extractf128_pd(a.v, hi_index);
395 hilet lo = _mm_permute_pd(hi, lo_index);
396 return _mm_cvtsd_f64(lo);
397#endif
398 }
399
408 template<size_t Mask>
409 [[nodiscard]] friend native_simd blend(native_simd a, native_simd b) noexcept
410 {
411 static_assert(Mask <= 0b1111);
412
413 if constexpr (Mask == 0b0000) {
414 return a;
415 } else if constexpr (Mask == 0b1111) {
416 return b;
417 } else {
418 return native_simd{_mm256_blend_pd(a.v, b.v, Mask)};
419 }
420 }
421
434 template<fixed_string SourceElements>
435 [[nodiscard]] friend native_simd permute(native_simd a) noexcept
436 {
437 static_assert(SourceElements.size() == size);
438 constexpr auto order = detail::native_swizzle_to_packed_indices<SourceElements, size>();
439
440#if HI_HAS_AVX2
441 if constexpr (order == 0b11'10'01'00) {
442 return a;
443 } else {
444 return native_simd{_mm256_permute4x64_pd(a.v, order)};
445 }
446
447#else
448 // clang-format off
449 constexpr auto hi_order =
450 ((order & 0b00'00'00'10) >> 1) |
451 ((order & 0b00'00'10'00) >> 2) |
452 ((order & 0b00'10'00'00) >> 3) |
453 ((order & 0b10'00'00'00) >> 4);
454 constexpr auto lo_order =
455 (order & 0b00'00'00'01) |
456 ((order & 0b00'00'01'00) >> 1) |
457 ((order & 0b00'01'00'00) >> 2) |
458 ((order & 0b01'00'00'00) >> 3);
459 // clang-format on
460
461 if constexpr (order == 0b11'10'01'00) {
462 return a;
463 } else if constexpr (order == 0b00'00'00'00) {
464 return broadcast(a);
465 } else if constexpr (hi_order == 0b1100) {
466 return native_simd{_mm256_permute_pd(a.v, lo_order)};
467 } else if constexpr (hi_order == 0b0011) {
468 hilet tmp = _mm256_permute2f128_pd(a.v, a.v, 0b0000'0001);
469 return native_simd{_mm256_permute_pd(tmp, lo_order)};
470 } else if constexpr (hi_order == 0b1111) {
471 hilet tmp = _mm256_permute2f128_pd(a.v, a.v, 0b0001'0001);
472 return native_simd{_mm256_permute_pd(tmp, lo_order)};
473 } else if constexpr (hi_order == 0b0000) {
474 hilet tmp = _mm256_permute2f128_pd(a.v, a.v, 0b0000'0000);
475 return native_simd{_mm256_permute_pd(tmp, lo_order)};
476 } else {
477 hilet hi_0 = _mm256_permute2f128_pd(a.v, a.v, 0b0000'0000);
478 hilet hi_1 = _mm256_permute2f128_pd(a.v, a.v, 0b0001'0001);
479 hilet lo_0 = _mm256_permute_pd(hi_0, lo_order);
480 hilet lo_1 = _mm256_permute_pd(hi_1, lo_order);
481 return native_simd{_mm256_blend_pd(lo_0, lo_1, hi_order)};
482 }
483#endif
484 }
485
502 template<fixed_string SourceElements>
503 [[nodiscard]] friend native_simd swizzle(native_simd a) noexcept
504 {
505 static_assert(SourceElements.size() == size);
506 constexpr auto one_mask = detail::native_swizzle_to_mask<SourceElements, size, '1'>();
507 constexpr auto zero_mask = detail::native_swizzle_to_mask<SourceElements, size, '0'>();
508 constexpr auto number_mask = one_mask | zero_mask;
509
510 if constexpr (number_mask == 0b1111) {
511 // Swizzle was /[01][01][01][01]/.
512 return swizzle_numbers<SourceElements>();
513
514 } else if constexpr (number_mask == 0b0000) {
515 // Swizzle was /[^01][^01][^01][^01]/.
516 return permute<SourceElements>(a);
517
518#ifdef HI_HAS_SSE4_1
519 } else if constexpr (number_mask == zero_mask) {
520 // Swizzle was /[^1][^1][^1][^1]/.
521 hilet ordered = permute<SourceElements>(a);
522 return set_zero<zero_mask>(ordered);
523#endif
524
525 } else {
526 hilet ordered = permute<SourceElements>(a);
527 hilet numbers = swizzle_numbers<SourceElements>();
528 return blend<number_mask>(ordered, numbers);
529 }
530 }
531
542 [[nodiscard]] friend native_simd horizontal_add(native_simd a, native_simd b) noexcept
543 {
544 return permute<"acbd">(native_simd{_mm256_hadd_pd(a.v, b.v)});
545 }
546
557 [[nodiscard]] friend native_simd horizontal_sub(native_simd a, native_simd b) noexcept
558 {
559 return permute<"acbd">(native_simd{_mm256_hsub_pd(a.v, b.v)});
560 }
561
568 [[nodiscard]] friend native_simd horizontal_sum(native_simd a) noexcept
569 {
570 hilet tmp = horizontal_add(a, a);
571 return native_simd{_mm256_hadd_pd(tmp.v, tmp.v)};
572 }
573
585 [[nodiscard]] friend native_simd interleaved_sub_add(native_simd a, native_simd b) noexcept
586 {
587 return native_simd{_mm256_addsub_pd(a.v, b.v)};
588 }
589
595 [[nodiscard]] friend native_simd not_and(native_simd a, native_simd b) noexcept
596 {
597 return native_simd{_mm256_andnot_pd(a.v, b.v)};
598 }
599
600 friend std::ostream& operator<<(std::ostream& a, native_simd b) noexcept
601 {
602 return a << "(" << get<0>(b) << ", " << get<1>(b) << ", " << get<2>(b) << ", " << get<3>(b) << ")";
603 }
604
605 template<fixed_string SourceElements>
606 [[nodiscard]] static native_simd swizzle_numbers() noexcept
607 {
608 constexpr auto one_mask = detail::native_swizzle_to_mask<SourceElements, size, '1'>();
609 constexpr auto zero_mask = detail::native_swizzle_to_mask<SourceElements, size, '0'>();
610 constexpr auto number_mask = one_mask | zero_mask;
611 constexpr auto alpha_mask = ~number_mask & 0b1111;
612
613 if constexpr ((zero_mask | alpha_mask) == 0b1111) {
614 return {};
615
616 } else if constexpr ((one_mask | alpha_mask) == 0b1111) {
617 return broadcast(1.0f);
618
619 } else {
620 return native_simd{
621 to_bool(one_mask & 0b0001) ? 1.0f : 0.0f,
622 to_bool(one_mask & 0b0010) ? 1.0f : 0.0f,
623 to_bool(one_mask & 0b0100) ? 1.0f : 0.0f,
624 to_bool(one_mask & 0b1000) ? 1.0f : 0.0f};
625 }
626 }
627};
628
629#endif
630
631}} // namespace hi::v1
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:253
#define hi_axiom_not_null(expression,...)
Assert if an expression is not nullptr.
Definition assert.hpp:272
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
@ round
The end cap of the line is round.
@ other
The gui_event does not have associated data.
DOXYGEN BUG.
Definition algorithm.hpp:13
geometry/margins.hpp
Definition cache.hpp:11
T ceil(T... args)
T equal(T... args)
T floor(T... args)
T max(T... args)
T min(T... args)
T operator!=(T... args)
T sqrt(T... args)