HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
native_f32x4_sse.hpp
1// Copyright Take Vos 2022, 2023.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "native_simd_utility.hpp"
8#include "../utility/module.hpp"
9#include <span>
10#include <array>
11#include <ostream>
12
13namespace hi { inline namespace v1 {
14
15#ifdef HI_HAS_SSE
16
33template<>
34struct native_simd<float, 4> {
35 using value_type = float;
36 constexpr static size_t size = 4;
37
38 using array_type = std::array<value_type, size>;
39 using register_type = __m128;
40
41 register_type v;
42
43 native_simd(native_simd const&) noexcept = default;
44 native_simd(native_simd&&) noexcept = default;
45 native_simd& operator=(native_simd const&) noexcept = default;
46 native_simd& operator=(native_simd&&) noexcept = default;
47
50 native_simd() noexcept : v(_mm_setzero_ps()) {}
51
52 [[nodiscard]] explicit native_simd(register_type other) noexcept : v(other) {}
53
54 [[nodiscard]] explicit operator register_type() const noexcept
55 {
56 return v;
57 }
58
63 [[nodiscard]] explicit native_simd(value_type a) noexcept : v(_mm_set_ss(a)) {}
64
72 [[nodiscard]] native_simd(value_type a, value_type b, value_type c = value_type{0}, value_type d = value_type{0}) noexcept :
73 v(_mm_set_ps(d, c, b, a))
74 {
75 }
76
77 [[nodiscard]] explicit native_simd(value_type const *other) noexcept : v(_mm_loadu_ps(other)) {}
78
79 void store(value_type *out) const noexcept
80 {
82 _mm_storeu_ps(out, v);
83 }
84
85 [[nodiscard]] explicit native_simd(void const *other) noexcept : v(_mm_loadu_ps(static_cast<value_type const *>(other))) {}
86
87 void store(void *out) const noexcept
88 {
90 _mm_storeu_ps(static_cast<value_type *>(out), v);
91 }
92
93 [[nodiscard]] explicit native_simd(std::span<value_type const> other) noexcept
94 {
95 hi_axiom(other.size() >= size);
96 v = _mm_loadu_ps(other.data());
97 }
98
99 void store(std::span<value_type> out) const noexcept
100 {
101 hi_axiom(out.size() >= size);
102 _mm_storeu_ps(out.data(), v);
103 }
104
105 [[nodiscard]] explicit native_simd(array_type other) noexcept : v(_mm_loadu_ps(other.data())) {}
106
107 [[nodiscard]] explicit operator array_type() const noexcept
108 {
109 auto r = array_type{};
110 _mm_storeu_ps(r.data(), v);
111 return r;
112 }
113
114#ifdef HI_HAS_SSE2
115 [[nodiscard]] explicit native_simd(native_simd<int32_t, 4> const& a) noexcept;
116#endif
117#ifdef HI_HAS_AVX
118 [[nodiscard]] explicit native_simd(native_simd<double, 4> const& a) noexcept;
119#endif
120
130 [[nodiscard]] static native_simd broadcast(value_type a) noexcept
131 {
132 return native_simd{_mm_set1_ps(a)};
133 }
134
144 [[nodiscard]] static native_simd broadcast(native_simd a) noexcept
145 {
146#ifdef HI_HAS_AVX2
147 return native_simd{_mm_broadcastss_ps(a.v)};
148#else
149 return native_simd{_mm_shuffle_ps(a.v, a.v, 0b00'00'00'00)};
150#endif
151 }
152
155 [[nodiscard]] static native_simd from_mask(size_t a) noexcept
156 {
157 hi_axiom(a <= 0b1111);
158
159 uint64_t a_ = a;
160
161 a_ <<= 31;
162 auto tmp = _mm_cvtsi32_si128(truncate<uint32_t>(a_));
163 a_ >>= 1;
164 tmp = _mm_insert_epi32(tmp, truncate<uint32_t>(a_), 1);
165 a_ >>= 1;
166 tmp = _mm_insert_epi32(tmp, truncate<uint32_t>(a_), 2);
167 a_ >>= 1;
168 tmp = _mm_insert_epi32(tmp, truncate<uint32_t>(a_), 3);
169
170 tmp = _mm_srai_epi32(tmp, 31);
171 return native_simd{_mm_castsi128_ps(tmp)};
172 }
173
176 [[nodiscard]] static native_simd ones() noexcept
177 {
178#ifdef HI_HAS_SSE2
179 auto ones = _mm_undefined_si128();
180 ones = _mm_cmpeq_epi32(ones, ones);
181 return native_simd{_mm_castsi128_ps(ones)};
182#else
183 auto ones = _mm_setzero_ps();
184 ones = _mm_cmpeq_ps(ones, ones);
185 return native_simd{ones};
186#endif
187 }
188
191 [[nodiscard]] size_t mask() const noexcept
192 {
193 return narrow_cast<size_t>(_mm_movemask_ps(v));
194 }
195
202 [[nodiscard]] friend bool equal(native_simd a, native_simd b) noexcept
203 {
204#ifdef HI_HAS_SSE2
205 return _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v))) == 0b1111'1111'1111'1111;
206#else
207 return static_cast<array_type>(a) == static_cast<array_type>(b);
208#endif
209 }
210
211 [[nodiscard]] friend native_simd
212 almost_eq(native_simd a, native_simd b, value_type epsilon = std::numeric_limits<value_type>::epsilon()) noexcept
213 {
214 hilet abs_diff = abs(a - b);
215 return abs_diff < broadcast(epsilon);
216 }
217
218 [[nodiscard]] friend bool
219 almost_equal(native_simd a, native_simd b, value_type epsilon = std::numeric_limits<value_type>::epsilon())
220 {
221 return almost_eq(a, b, epsilon).mask() == 0b1111;
222 }
223
224 [[nodiscard]] friend native_simd operator==(native_simd a, native_simd b) noexcept
225 {
226 return native_simd{_mm_cmpeq_ps(a.v, b.v)};
227 }
228
229 [[nodiscard]] friend native_simd operator!=(native_simd a, native_simd b) noexcept
230 {
231 return native_simd{_mm_cmpneq_ps(a.v, b.v)};
232 }
233
234 [[nodiscard]] friend native_simd operator<(native_simd a, native_simd b) noexcept
235 {
236 return native_simd{_mm_cmplt_ps(a.v, b.v)};
237 }
238
239 [[nodiscard]] friend native_simd operator>(native_simd a, native_simd b) noexcept
240 {
241 return native_simd{_mm_cmpgt_ps(a.v, b.v)};
242 }
243
244 [[nodiscard]] friend native_simd operator<=(native_simd a, native_simd b) noexcept
245 {
246 return native_simd{_mm_cmple_ps(a.v, b.v)};
247 }
248
249 [[nodiscard]] friend native_simd operator>=(native_simd a, native_simd b) noexcept
250 {
251 return native_simd{_mm_cmpge_ps(a.v, b.v)};
252 }
253
254 [[nodiscard]] friend native_simd operator+(native_simd a) noexcept
255 {
256 return a;
257 }
258
259 [[nodiscard]] friend native_simd operator+(native_simd a, native_simd b) noexcept
260 {
261 return native_simd{_mm_add_ps(a.v, b.v)};
262 }
263
264 [[nodiscard]] friend native_simd operator-(native_simd a, native_simd b) noexcept
265 {
266 return native_simd{_mm_sub_ps(a.v, b.v)};
267 }
268
269 [[nodiscard]] friend native_simd operator-(native_simd a) noexcept
270 {
271 return native_simd{} - a;
272 }
273
274 [[nodiscard]] friend native_simd operator*(native_simd a, native_simd b) noexcept
275 {
276 return native_simd{_mm_mul_ps(a.v, b.v)};
277 }
278
279 [[nodiscard]] friend native_simd operator/(native_simd a, native_simd b) noexcept
280 {
281 return native_simd{_mm_div_ps(a.v, b.v)};
282 }
283
284 [[nodiscard]] friend native_simd operator&(native_simd a, native_simd b) noexcept
285 {
286 return native_simd{_mm_and_ps(a.v, b.v)};
287 }
288
289 [[nodiscard]] friend native_simd operator|(native_simd a, native_simd b) noexcept
290 {
291 return native_simd{_mm_or_ps(a.v, b.v)};
292 }
293
294 [[nodiscard]] friend native_simd operator^(native_simd a, native_simd b) noexcept
295 {
296 return native_simd{_mm_xor_ps(a.v, b.v)};
297 }
298
299 [[nodiscard]] friend native_simd operator~(native_simd a) noexcept
300 {
301 return not_and(a, ones());
302 }
303
304 [[nodiscard]] friend native_simd min(native_simd a, native_simd b) noexcept
305 {
306 return native_simd{_mm_min_ps(a.v, b.v)};
307 }
308
309 [[nodiscard]] friend native_simd max(native_simd a, native_simd b) noexcept
310 {
311 return native_simd{_mm_max_ps(a.v, b.v)};
312 }
313
314 [[nodiscard]] friend native_simd abs(native_simd a) noexcept
315 {
316 return not_and(broadcast(-0.0f), a);
317 }
318
319#ifdef HI_HAS_SSE4_1
320 [[nodiscard]] friend native_simd floor(native_simd a) noexcept
321 {
322 return native_simd{_mm_floor_ps(a.v)};
323 }
324#endif
325
326#ifdef HI_HAS_SSE4_1
327 [[nodiscard]] friend native_simd ceil(native_simd a) noexcept
328 {
329 return native_simd{_mm_ceil_ps(a.v)};
330 }
331#endif
332
333#ifdef HI_HAS_SSE4_1
334 template<native_rounding_mode Rounding = native_rounding_mode::current>
335 [[nodiscard]] friend native_simd round(native_simd a) noexcept
336 {
337 return native_simd{_mm_round_ps(a.v, to_underlying(Rounding))};
338 }
339#endif
340
343 [[nodiscard]] friend native_simd rcp(native_simd a) noexcept
344 {
345 return native_simd{_mm_rcp_ps(a.v)};
346 }
347
350 [[nodiscard]] friend native_simd sqrt(native_simd a) noexcept
351 {
352 return native_simd{_mm_sqrt_ps(a.v)};
353 }
354
361 [[nodiscard]] friend native_simd rsqrt(native_simd a) noexcept
362 {
363 return native_simd{_mm_rsqrt_ps(a.v)};
364 }
365
372 template<size_t Mask>
373 [[nodiscard]] friend native_simd set_zero(native_simd a) noexcept
374 {
375 static_assert(Mask <= 0b1111);
376 if constexpr (Mask == 0b0000) {
377 return a;
378 } else if constexpr (Mask == 0b1111) {
379 return {};
380 } else {
381#ifdef HI_HAS_SSE4_1
382 return native_simd{_mm_insert_ps(a.v, a.v, Mask)};
383#else
384 hilet mask = from_mask(Mask);
385 return not_and(mask, a);
386#endif
387 }
388 }
389
397 template<size_t Index>
398 [[nodiscard]] friend native_simd insert(native_simd a, value_type b) noexcept
399 {
400 static_assert(Index < 4);
401
402#ifdef HI_HAS_SSE4_1
403 return native_simd{_mm_insert_ps(a.v, _mm_set1_ps(b), narrow_cast<int>(Index << 4))};
404#else
405 hilet mask = from_mask(1_uz << Index);
406 return not_and(mask, a) | (mask & broadcast(b));
407#endif
408 }
409
410 template<size_t SrcIndex, size_t DstIndex>
411 [[nodiscard]] friend native_simd insert(native_simd a, native_simd b) noexcept
412 {
413 static_assert(SrcIndex < size);
414 static_assert(DstIndex < size);
415#ifdef HI_HAS_SSE4_1
416 return native_simd{_mm_insert_ps(a.v, b.v, (SrcIndex << 6) | (DstIndex << 4))};
417#else
418 return insert<DstIndex>(a, get<SrcIndex>(b));
419#endif
420 }
421
428 template<size_t Index>
429 [[nodiscard]] friend value_type get(native_simd a) noexcept
430 {
431 static_assert(Index < size);
432
433 hilet tmp = _mm_shuffle_ps(a.v, a.v, Index);
434 return _mm_cvtss_f32(tmp);
435 }
436
445 template<size_t Mask>
446 [[nodiscard]] friend native_simd blend(native_simd a, native_simd b) noexcept
447 {
448 static_assert(Mask <= 0b1111);
449
450 if constexpr (Mask == 0b0000) {
451 return a;
452 } else if constexpr (Mask == 0b1111) {
453 return b;
454 } else {
455#ifdef HI_HAS_SSE4_1
456 return native_simd{_mm_blend_ps(a.v, b.v, Mask)};
457#else
458 hilet mask = from_mask(Mask);
459 return not_and(mask, a) | (mask & b);
460#endif
461 }
462 }
463
476 template<fixed_string SourceElements>
477 [[nodiscard]] friend native_simd permute(native_simd a) noexcept
478 {
479 static_assert(SourceElements.size() == size);
480 constexpr auto order = detail::native_swizzle_to_packed_indices<SourceElements, size>();
481
482 if constexpr (order == 0b11'10'01'00) {
483 return a;
484 } else if constexpr (order == 0b00'00'00'00) {
485 return broadcast(a);
486 } else {
487#ifdef HI_HAS_AVX
488 return native_simd{_mm_permute_ps(a.v, order)};
489#else
490 return native_simd{_mm_shuffle_ps(a.v, a.v, order)};
491#endif
492 }
493 }
494
495 [[nodiscard]] friend native_simd permute(native_simd a, native_simd<int32_t, 4> const& source_elements) noexcept;
496
513 template<fixed_string SourceElements>
514 [[nodiscard]] friend native_simd swizzle(native_simd a) noexcept
515 {
516 static_assert(SourceElements.size() == size);
517 constexpr auto one_mask = detail::native_swizzle_to_mask<SourceElements, size, '1'>();
518 constexpr auto zero_mask = detail::native_swizzle_to_mask<SourceElements, size, '0'>();
519 constexpr auto number_mask = one_mask | zero_mask;
520
521 if constexpr (number_mask == 0b1111) {
522 // Swizzle was /[01][01][01][01]/.
523 return swizzle_numbers<SourceElements>();
524
525 } else if constexpr (number_mask == 0b0000) {
526 // Swizzle was /[^01][^01][^01][^01]/.
527 return permute<SourceElements>(a);
528
529#ifdef HI_HAS_SSE4_1
530 } else if constexpr (number_mask == zero_mask) {
531 // Swizzle was /[^1][^1][^1][^1]/.
532 hilet ordered = permute<SourceElements>(a);
533 return set_zero<zero_mask>(ordered);
534#endif
535
536 } else {
537 hilet ordered = permute<SourceElements>(a);
538 hilet numbers = swizzle_numbers<SourceElements>();
539 return blend<number_mask>(ordered, numbers);
540 }
541 }
542
543#ifdef HI_HAS_SSE3
554 [[nodiscard]] friend native_simd horizontal_add(native_simd a, native_simd b) noexcept
555 {
556 return native_simd{_mm_hadd_ps(a.v, b.v)};
557 }
558#endif
559
560#ifdef HI_HAS_SSE3
571 [[nodiscard]] friend native_simd horizontal_sub(native_simd a, native_simd b) noexcept
572 {
573 return native_simd{_mm_hsub_ps(a.v, b.v)};
574 }
575#endif
576
583 [[nodiscard]] friend native_simd horizontal_sum(native_simd a) noexcept
584 {
585 hilet tmp = a + permute<"cdab">(a);
586 return tmp + permute<"badc">(tmp);
587 }
588
599 template<size_t SourceMask>
600 [[nodiscard]] friend native_simd dot_product(native_simd a, native_simd b) noexcept
601 {
602 static_assert(SourceMask <= 0b1111);
603#ifdef HI_HAS_SSE4_1
604 return native_simd{_mm_dp_ps(a.v, b.v, (SourceMask << 4) | 0b1111)};
605#else
606 return horizontal_sum(set_zero<~SourceMask & 0b1111>(a * b));
607#endif
608 }
609
610#ifdef HI_HAS_SSE3
622 [[nodiscard]] friend native_simd interleaved_sub_add(native_simd a, native_simd b) noexcept
623 {
624 return native_simd{_mm_addsub_ps(a.v, b.v)};
625 }
626#endif
627
633 [[nodiscard]] friend native_simd not_and(native_simd a, native_simd b) noexcept
634 {
635 return native_simd{_mm_andnot_ps(a.v, b.v)};
636 }
637
638 [[nodiscard]] friend std::array<native_simd, 4> transpose(native_simd a, native_simd b, native_simd c, native_simd d) noexcept
639 {
640 _MM_TRANSPOSE4_PS(a.v, b.v, c.v, d.v);
641 return {a, b, c, d};
642 }
643
644 friend std::ostream& operator<<(std::ostream& a, native_simd b) noexcept
645 {
646 return a << "(" << get<0>(b) << ", " << get<1>(b) << ", " << get<2>(b) << ", " << get<3>(b) << ")";
647 }
648
649 template<fixed_string SourceElements>
650 [[nodiscard]] static native_simd swizzle_numbers() noexcept
651 {
652 constexpr auto one_mask = detail::native_swizzle_to_mask<SourceElements, size, '1'>();
653 constexpr auto zero_mask = detail::native_swizzle_to_mask<SourceElements, size, '0'>();
654 constexpr auto number_mask = one_mask | zero_mask;
655 constexpr auto alpha_mask = ~number_mask & 0b1111;
656
657 if constexpr ((zero_mask | alpha_mask) == 0b1111) {
658 return {};
659
660 } else if constexpr ((one_mask | alpha_mask) == 0b1111) {
661 return broadcast(1.0f);
662
663 } else {
664 return native_simd{
665 to_bool(one_mask & 0b0001) ? 1.0f : 0.0f,
666 to_bool(one_mask & 0b0010) ? 1.0f : 0.0f,
667 to_bool(one_mask & 0b0100) ? 1.0f : 0.0f,
668 to_bool(one_mask & 0b1000) ? 1.0f : 0.0f};
669 }
670 }
671};
672
673#endif
674}} // namespace hi::v1
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:238
#define hi_axiom_not_null(expression,...)
Assert that an expression is not nullptr.
Definition assert.hpp:257
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
@ round
The end cap of the line is round.
@ other
The gui_event does not have associated data.
DOXYGEN BUG.
Definition algorithm.hpp:13
geometry/margins.hpp
Definition cache.hpp:11
T ceil(T... args)
T equal(T... args)
T floor(T... args)
T max(T... args)
T min(T... args)
T operator!=(T... args)
T sqrt(T... args)