#include "native_simd_utility.hpp"
#include "../utility/module.hpp"

#include <array>
#include <cstddef>
#include <cstdint>
#include <ostream>
#include <span>
13namespace hi {
inline namespace v1 {
34struct native_simd<double,4> {
35 using value_type = double;
36 constexpr static size_t size = 4;
38 using register_type = __m256d;
42 native_simd(native_simd
const&)
noexcept =
default;
43 native_simd(native_simd&&) noexcept = default;
44 native_simd& operator=(native_simd const&) noexcept = default;
45 native_simd& operator=(native_simd&&) noexcept = default;
49 native_simd() noexcept : v(_mm256_setzero_pd()) {}
51 [[nodiscard]]
explicit native_simd(register_type
other) noexcept : v(
other) {}
53 [[nodiscard]]
explicit operator register_type() const noexcept
65 [[nodiscard]] native_simd(
67 value_type b = value_type{0},
68 value_type c = value_type{0},
69 value_type d = value_type{0})
noexcept :
70 v(_mm256_set_pd(d, c, b, a))
74 [[nodiscard]]
explicit native_simd(value_type
const *
other) noexcept : v(_mm256_loadu_pd(
other)) {}
76 void store(value_type *out)
const noexcept
79 _mm256_storeu_pd(out, v);
82 [[nodiscard]]
explicit native_simd(
void const *
other) noexcept : v(_mm256_loadu_pd(
static_cast<value_type
const *
>(
other))) {}
84 void store(
void *out)
const noexcept
87 _mm256_storeu_pd(
static_cast<value_type *
>(out), v);
90 [[nodiscard]]
explicit native_simd(std::span<value_type const>
other)
noexcept
93 v = _mm256_loadu_pd(
other.data());
96 void store(std::span<value_type> out)
const noexcept
99 _mm256_storeu_pd(out.data(), v);
102 [[nodiscard]]
explicit native_simd(array_type
other) noexcept : v(_mm256_loadu_pd(
other.data())) {}
104 [[nodiscard]]
explicit operator array_type() const noexcept
106 auto r = array_type{};
107 _mm256_storeu_pd(r.data(), v);
111 [[nodiscard]]
explicit native_simd(native_simd<float, 4>
const& a)
noexcept;
112 [[nodiscard]]
explicit native_simd(native_simd<int32_t, 4>
const& a)
noexcept;
124 [[nodiscard]]
static native_simd broadcast(value_type a)
noexcept
126 return native_simd{_mm256_set1_pd(a)};
138 [[nodiscard]]
static native_simd broadcast(native_simd a)
noexcept
141 return native_simd{_mm256_permute4x64_pd(a.v, 0b00'00'00'00)};
143 hilet tmp = _mm256_permute_pd(a.v, 0b0000);
144 return native_simd{_mm256_permute2f128_pd(tmp, tmp, 0b0000'0000)};
150 [[nodiscard]]
static native_simd ones() noexcept
153 auto ones = _mm256_undefined_si256();
154 ones = _mm256_cmpeq_epi32(ones, ones);
155 return native_simd{_mm256_castsi256_pd(ones)};
157 auto ones = _mm256_setzero_pd();
158 ones = _mm256_cmpeq_pd(ones, ones);
159 return native_simd{ones};
163 [[nodiscard]]
static native_simd from_mask(
size_t a)
noexcept
170 auto tmp = _mm_cvtsi32_si128(truncate<uint32_t>(a_));
172 tmp = _mm_insert_epi32(tmp, truncate<uint32_t>(a_), 1);
174 tmp = _mm_insert_epi32(tmp, truncate<uint32_t>(a_), 2);
176 tmp = _mm_insert_epi32(tmp, truncate<uint32_t>(a_), 3);
178 tmp = _mm_srai_epi32(tmp, 31);
179 return native_simd{_mm256_castsi256_pd(_mm256_cvtepi32_epi64(tmp))};
184 [[nodiscard]]
size_t mask() const noexcept
186 return narrow_cast<size_t>(_mm256_movemask_pd(v));
195 [[nodiscard]]
friend bool equal(native_simd a, native_simd b)
noexcept
197 return _mm256_movemask_pd(_mm256_cmp_pd(a.v, b.v, _CMP_EQ_UQ)) == 0b1111;
200 [[nodiscard]]
friend native_simd
203 auto abs_diff = abs(a - b);
204 return abs_diff < broadcast(epsilon);
207 [[nodiscard]]
friend bool
210 return almost_eq(a, b, epsilon).mask() == 0b1111;
213 [[nodiscard]]
friend native_simd operator==(native_simd a, native_simd b)
noexcept
215 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ)};
218 [[nodiscard]]
friend native_simd
operator!=(native_simd a, native_simd b)
noexcept
220 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_NEQ_UQ)};
223 [[nodiscard]]
friend native_simd operator<(native_simd a, native_simd b)
noexcept
225 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_LT_OQ)};
228 [[nodiscard]]
friend native_simd
operator>(native_simd a, native_simd b)
noexcept
230 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_GT_OQ)};
233 [[nodiscard]]
friend native_simd
operator<=(native_simd a, native_simd b)
noexcept
235 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_LE_OQ)};
238 [[nodiscard]]
friend native_simd
operator>=(native_simd a, native_simd b)
noexcept
240 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_GE_OQ)};
243 [[nodiscard]]
friend native_simd operator+(native_simd a)
noexcept
248 [[nodiscard]]
friend native_simd operator+(native_simd a, native_simd b)
noexcept
250 return native_simd{_mm256_add_pd(a.v, b.v)};
253 [[nodiscard]]
friend native_simd operator-(native_simd a, native_simd b)
noexcept
255 return native_simd{_mm256_sub_pd(a.v, b.v)};
258 [[nodiscard]]
friend native_simd operator-(native_simd a)
noexcept
260 return native_simd{} - a;
263 [[nodiscard]]
friend native_simd operator*(native_simd a, native_simd b)
noexcept
265 return native_simd{_mm256_mul_pd(a.v, b.v)};
268 [[nodiscard]]
friend native_simd operator/(native_simd a, native_simd b)
noexcept
270 return native_simd{_mm256_div_pd(a.v, b.v)};
273 [[nodiscard]]
friend native_simd operator&(native_simd a, native_simd b)
noexcept
275 return native_simd{_mm256_and_pd(a.v, b.v)};
278 [[nodiscard]]
friend native_simd operator|(native_simd a, native_simd b)
noexcept
280 return native_simd{_mm256_or_pd(a.v, b.v)};
283 [[nodiscard]]
friend native_simd operator^(native_simd a, native_simd b)
noexcept
285 return native_simd{_mm256_xor_pd(a.v, b.v)};
288 [[nodiscard]]
friend native_simd operator~(native_simd a)
noexcept
290 return not_and(a, ones());
293 [[nodiscard]]
friend native_simd
min(native_simd a, native_simd b)
noexcept
295 return native_simd{_mm256_min_pd(a.v, b.v)};
298 [[nodiscard]]
friend native_simd
max(native_simd a, native_simd b)
noexcept
300 return native_simd{_mm256_max_pd(a.v, b.v)};
303 [[nodiscard]]
friend native_simd abs(native_simd a)
noexcept
305 return not_and(broadcast(-0.0f), a);
308 [[nodiscard]]
friend native_simd
floor(native_simd a)
noexcept
310 return native_simd{_mm256_floor_pd(a.v)};
313 [[nodiscard]]
friend native_simd
ceil(native_simd a)
noexcept
315 return native_simd{_mm256_ceil_pd(a.v)};
318 template<native_rounding_mode Rounding = native_rounding_mode::current>
319 [[nodiscard]]
friend native_simd
round(native_simd a)
noexcept
321 return native_simd{_mm256_round_pd(a.v, to_underlying(Rounding))};
326 [[nodiscard]]
friend native_simd rcp(native_simd a)
noexcept
328 return native_simd{_mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), a.v)};
333 [[nodiscard]]
friend native_simd
sqrt(native_simd a)
noexcept
335 return native_simd{_mm256_sqrt_pd(a.v)};
344 [[nodiscard]]
friend native_simd rsqrt(native_simd a)
noexcept
355 template<
size_t Mask>
356 [[nodiscard]]
friend native_simd set_zero(native_simd a)
noexcept
358 static_assert(Mask <= 0b1111);
359 return blend<Mask>(a, native_simd{});
369 template<
size_t Index>
370 [[nodiscard]]
friend native_simd insert(native_simd a, value_type b)
noexcept
372 static_assert(Index < 4);
373 return blend<1_uz << Index>(a, broadcast(b));
382 template<
size_t Index>
383 [[nodiscard]]
friend value_type get(native_simd a)
noexcept
385 static_assert(Index < size);
388 return _mm256_cvtsd_f64(_mm256_permute4x64_pd(a.v, Index));
391 constexpr auto hi_index = Index / (size / 2);
392 constexpr auto lo_index = Index % (size / 2);
394 hilet hi = _mm256_extractf128_pd(a.v, hi_index);
395 hilet lo = _mm_permute_pd(
hi, lo_index);
396 return _mm_cvtsd_f64(lo);
408 template<
size_t Mask>
409 [[nodiscard]]
friend native_simd blend(native_simd a, native_simd b)
noexcept
411 static_assert(Mask <= 0b1111);
413 if constexpr (Mask == 0b0000) {
415 }
else if constexpr (Mask == 0b1111) {
418 return native_simd{_mm256_blend_pd(a.v, b.v, Mask)};
434 template<fixed_
string SourceElements>
435 [[nodiscard]]
friend native_simd permute(native_simd a)
noexcept
437 static_assert(SourceElements.size() == size);
438 constexpr auto order = detail::native_swizzle_to_packed_indices<SourceElements, size>();
441 if constexpr (order == 0b11'10'01'00) {
444 return native_simd{_mm256_permute4x64_pd(a.v, order)};
449 constexpr auto hi_order =
450 ((order & 0b00'00'00'10) >> 1) |
451 ((order & 0b00'00'10'00) >> 2) |
452 ((order & 0b00'10'00'00) >> 3) |
453 ((order & 0b10'00'00'00) >> 4);
454 constexpr auto lo_order =
455 (order & 0b00'00'00'01) |
456 ((order & 0b00'00'01'00) >> 1) |
457 ((order & 0b00'01'00'00) >> 2) |
458 ((order & 0b01'00'00'00) >> 3);
461 if constexpr (order == 0b11'10'01'00) {
463 }
else if constexpr (order == 0b00'00'00'00) {
465 }
else if constexpr (hi_order == 0b1100) {
466 return native_simd{_mm256_permute_pd(a.v, lo_order)};
467 }
else if constexpr (hi_order == 0b0011) {
468 hilet tmp = _mm256_permute2f128_pd(a.v, a.v, 0b0000'0001);
469 return native_simd{_mm256_permute_pd(tmp, lo_order)};
470 }
else if constexpr (hi_order == 0b1111) {
471 hilet tmp = _mm256_permute2f128_pd(a.v, a.v, 0b0001'0001);
472 return native_simd{_mm256_permute_pd(tmp, lo_order)};
473 }
else if constexpr (hi_order == 0b0000) {
474 hilet tmp = _mm256_permute2f128_pd(a.v, a.v, 0b0000'0000);
475 return native_simd{_mm256_permute_pd(tmp, lo_order)};
477 hilet hi_0 = _mm256_permute2f128_pd(a.v, a.v, 0b0000'0000);
478 hilet hi_1 = _mm256_permute2f128_pd(a.v, a.v, 0b0001'0001);
479 hilet lo_0 = _mm256_permute_pd(hi_0, lo_order);
480 hilet lo_1 = _mm256_permute_pd(hi_1, lo_order);
481 return native_simd{_mm256_blend_pd(lo_0, lo_1, hi_order)};
502 template<fixed_
string SourceElements>
503 [[nodiscard]]
friend native_simd swizzle(native_simd a)
noexcept
505 static_assert(SourceElements.size() == size);
506 constexpr auto one_mask = detail::native_swizzle_to_mask<SourceElements, size, '1'>();
507 constexpr auto zero_mask = detail::native_swizzle_to_mask<SourceElements, size, '0'>();
508 constexpr auto number_mask = one_mask | zero_mask;
510 if constexpr (number_mask == 0b1111) {
512 return swizzle_numbers<SourceElements>();
514 }
else if constexpr (number_mask == 0b0000) {
516 return permute<SourceElements>(a);
519 }
else if constexpr (number_mask == zero_mask) {
521 hilet ordered = permute<SourceElements>(a);
522 return set_zero<zero_mask>(ordered);
526 hilet ordered = permute<SourceElements>(a);
527 hilet numbers = swizzle_numbers<SourceElements>();
528 return blend<number_mask>(ordered, numbers);
542 [[nodiscard]]
friend native_simd horizontal_add(native_simd a, native_simd b)
noexcept
544 return permute<
"acbd">(native_simd{_mm256_hadd_pd(a.v, b.v)});
557 [[nodiscard]]
friend native_simd horizontal_sub(native_simd a, native_simd b)
noexcept
559 return permute<
"acbd">(native_simd{_mm256_hsub_pd(a.v, b.v)});
568 [[nodiscard]]
friend native_simd horizontal_sum(native_simd a)
noexcept
570 hilet tmp = horizontal_add(a, a);
571 return native_simd{_mm256_hadd_pd(tmp.v, tmp.v)};
585 [[nodiscard]]
friend native_simd interleaved_sub_add(native_simd a, native_simd b)
noexcept
587 return native_simd{_mm256_addsub_pd(a.v, b.v)};
595 [[nodiscard]]
friend native_simd not_and(native_simd a, native_simd b)
noexcept
597 return native_simd{_mm256_andnot_pd(a.v, b.v)};
602 return a <<
"(" << get<0>(b) <<
", " << get<1>(b) <<
", " << get<2>(b) <<
", " << get<3>(b) <<
")";
605 template<fixed_
string SourceElements>
606 [[nodiscard]]
static native_simd swizzle_numbers() noexcept
608 constexpr auto one_mask = detail::native_swizzle_to_mask<SourceElements, size, '1'>();
609 constexpr auto zero_mask = detail::native_swizzle_to_mask<SourceElements, size, '0'>();
610 constexpr auto number_mask = one_mask | zero_mask;
611 constexpr auto alpha_mask = ~number_mask & 0b1111;
613 if constexpr ((zero_mask | alpha_mask) == 0b1111) {
616 }
else if constexpr ((one_mask | alpha_mask) == 0b1111) {
617 return broadcast(1.0f);
621 to_bool(one_mask & 0b0001) ? 1.0f : 0.0f,
622 to_bool(one_mask & 0b0010) ? 1.0f : 0.0f,
623 to_bool(one_mask & 0b0100) ? 1.0f : 0.0f,
624 to_bool(one_mask & 0b1000) ? 1.0f : 0.0f};
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:253
#define hi_axiom_not_null(expression,...)
Assert if an expression is not nullptr.
Definition assert.hpp:272
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
@ round
The end cap of the line is round.
@ other
The gui_event does not have associated data.
DOXYGEN BUG.
Definition algorithm.hpp:13
geometry/margins.hpp
Definition cache.hpp:11