#include "../aligned_array.hpp"

// The helpers below use SSE3/SSE4.1 and AVX intrinsics (_mm_permute_ps, _mm_testz_ps).
#include <array>
#include <immintrin.h>
[[nodiscard]] inline f32x4_raw f32x4_sse_ceil(f32x4_raw const &rhs) noexcept
{
    return f32x4_raw{_mm_ceil_ps(static_cast<__m128>(rhs))};
}

[[nodiscard]] inline f32x4_raw f32x4_sse_floor(f32x4_raw const &rhs) noexcept
{
    return f32x4_raw{_mm_floor_ps(static_cast<__m128>(rhs))};
}

[[nodiscard]] inline f32x4_raw f32x4_sse_round(f32x4_raw const &rhs) noexcept
{
    return f32x4_raw{_mm_round_ps(static_cast<__m128>(rhs), _MM_FROUND_CUR_DIRECTION)};
}

[[nodiscard]] inline f32x4_raw f32x4_sse_rcp(f32x4_raw const &rhs) noexcept
{
    return f32x4_raw{_mm_rcp_ps(static_cast<__m128>(rhs))};
}
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_clear(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0);

    if constexpr (Mask == 0b0000) {
        // No lanes selected; nothing to clear.
        return rhs;
    } else if constexpr (Mask == 0b1111) {
        return f32x4_raw{_mm_setzero_ps()};
    } else {
        // insert_ps with a zero-mask equal to Mask zeroes exactly the selected lanes.
        return f32x4_raw{_mm_insert_ps(static_cast<__m128>(rhs), static_cast<__m128>(rhs), Mask)};
    }
}
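// Usage sketch (illustrative): f32x4_sse_clear<0b0110>(v) returns {v[0], 0.0f, 0.0f, v[3]};
// each set bit in Mask names a lane that is zeroed, the remaining lanes pass through.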
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_make_sign() noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0);

    if constexpr (Mask == 0b0000) {
        return f32x4_raw{_mm_setzero_ps()};
    } else if constexpr (Mask == 0b0001) {
        return f32x4_raw{_mm_set_ss(-0.0f)};
    } else if constexpr (Mask == 0b1111) {
        return f32x4_raw{_mm_set_ps1(-0.0f)};
    } else {
        constexpr float x = (Mask & 0b0001) == 0 ? 0.0f : -0.0f;
        constexpr float y = (Mask & 0b0010) == 0 ? 0.0f : -0.0f;
        constexpr float z = (Mask & 0b0100) == 0 ? 0.0f : -0.0f;
        constexpr float w = (Mask & 0b1000) == 0 ? 0.0f : -0.0f;
        return f32x4_raw{_mm_set_ps(w, z, y, x)};
    }
}
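// The vector built here carries only sign bits (-0.0f in the selected lanes), so it can
// be XOR-ed into another vector to flip the sign of exactly those lanes, as
// f32x4_sse_neg below does.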
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_neg(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0);

    if constexpr (Mask == 0b0000) {
        // No lanes selected; nothing to negate.
        return rhs;
    } else {
        ttlet sign = static_cast<__m128>(f32x4_sse_make_sign<Mask>());
        return f32x4_raw{_mm_xor_ps(static_cast<__m128>(rhs), sign)};
    }
}
[[nodiscard]] inline f32x4_raw f32x4_sse_hadd(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    return f32x4_raw{_mm_hadd_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs))};
}

[[nodiscard]] inline f32x4_raw f32x4_sse_hsub(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    return f32x4_raw{_mm_hsub_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs))};
}
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_addsub(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");

    ttlet lhs_ = static_cast<__m128>(lhs);
    ttlet rhs_ = static_cast<__m128>(rhs);

    if constexpr (Mask == 0b0000) {
        return f32x4_raw{_mm_sub_ps(lhs_, rhs_)};
    } else if constexpr (Mask == 0b0101) {
        // _mm_addsub_ps natively subtracts lanes 0 and 2 and adds lanes 1 and 3, so to
        // add lanes 0 and 2 instead we negate rhs first.
        ttlet neg_rhs = static_cast<__m128>(f32x4_sse_neg<0b1111>(rhs));
        return f32x4_raw{_mm_addsub_ps(lhs_, neg_rhs)};
    } else if constexpr (Mask == 0b1010) {
        // Native addsub pattern: subtract lanes 0 and 2, add lanes 1 and 3.
        return f32x4_raw{_mm_addsub_ps(lhs_, rhs_)};
    } else if constexpr (Mask == 0b1111) {
        return f32x4_raw{_mm_add_ps(lhs_, rhs_)};
    } else {
        ttlet neg_rhs = static_cast<__m128>(f32x4_sse_neg<~Mask & 0xf>(rhs));
        return f32x4_raw{_mm_add_ps(lhs_, neg_rhs)};
    }
}
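// Mask convention, as implied by the 0b0000/0b1111 branches and the generic fallback:
// a set bit adds the corresponding lane of rhs, a clear bit subtracts it. For example,
// f32x4_sse_addsub<0b0011>(a, b) yields {a0+b0, a1+b1, a2-b2, a3-b3}.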
template<unsigned int Mask>
[[nodiscard]] float f32x4_sse_dot(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    // High nibble selects the lanes that participate; 0x1 stores the sum in lane 0.
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto tmp = _mm_dp_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs), imm8);
    return _mm_cvtss_f32(tmp);
}
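// A worked example of the imm8 encoding used above: for Mask == 0b0111 the immediate is
// 0b0111'0001, which tells _mm_dp_ps to multiply-and-sum lanes 0..2 (an xyz dot product)
// and to write the result to lane 0 only, zeroing the other lanes.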
template<unsigned int Mask>
[[nodiscard]] float f32x4_sse_hypot(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto _rhs = static_cast<__m128>(rhs);
    auto tmp = _mm_sqrt_ps(_mm_dp_ps(_rhs, _rhs, imm8));
    return _mm_cvtss_f32(tmp);
}
template<unsigned int Mask>
[[nodiscard]] float f32x4_sse_rcp_hypot(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto _rhs = static_cast<__m128>(rhs);
    // _mm_rsqrt_ps is a fast approximation with roughly 12 bits of precision.
    auto tmp = _mm_rsqrt_ps(_mm_dp_ps(_rhs, _rhs, imm8));
    return _mm_cvtss_f32(tmp);
}
template<unsigned int Mask>
[[nodiscard]] f32x4_raw f32x4_sse_normalize(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int dp_imm8 = (Mask << 4) | Mask;
    constexpr int zero_imm8 = ~Mask & 0xf;

    ttlet rhs_ = static_cast<__m128>(rhs);
    ttlet rcp_length = _mm_rsqrt_ps(_mm_dp_ps(rhs_, rhs_, dp_imm8));
    ttlet rcp_length_ = _mm_insert_ps(rcp_length, rcp_length, zero_imm8);
    return f32x4_raw{_mm_mul_ps(rhs_, rcp_length_)};
}
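// dp_imm8 broadcasts 1/length into exactly the lanes named by Mask, and zero_imm8 then
// clears the remaining lanes, so the final multiply leaves lanes outside Mask at zero.
// For example, normalizing with Mask == 0b0111 scales x, y and z and forces w to zero.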
[[nodiscard]] inline unsigned int f32x4_sse_eq_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpeq_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

[[nodiscard]] inline unsigned int f32x4_sse_ne_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpneq_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

[[nodiscard]] inline unsigned int f32x4_sse_lt_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmplt_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

[[nodiscard]] inline unsigned int f32x4_sse_gt_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpgt_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

[[nodiscard]] inline unsigned int f32x4_sse_le_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmple_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

[[nodiscard]] inline unsigned int f32x4_sse_ge_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpge_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

[[nodiscard]] inline bool f32x4_sse_eq(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpneq_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    // testz is non-zero only when no lane compared not-equal, i.e. all lanes are equal.
    return _mm_testz_ps(tmp, tmp) != 0;
}
[[nodiscard]] inline float f32x4_sse_viktor_cross(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    ttlet tmp1 = _mm_permute_ps(static_cast<__m128>(rhs), _MM_SHUFFLE(2, 3, 0, 1));
    ttlet tmp2 = _mm_mul_ps(static_cast<__m128>(lhs), tmp1);
    ttlet tmp3 = _mm_hsub_ps(tmp2, tmp2);
    return _mm_cvtss_f32(tmp3);
}
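// Step by step: tmp1 swaps the x/y (and z/w) pairs of rhs, tmp2 is then
// {lhs.x*rhs.y, lhs.y*rhs.x, ...}, and the horizontal subtract leaves the 2D cross
// product lhs.x*rhs.y - lhs.y*rhs.x in lane 0, which _mm_cvtss_f32 extracts.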
[[nodiscard]] inline f32x4_raw f32x4_sse_hamilton_cross(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    ttlet lhs_ = static_cast<__m128>(lhs);
    ttlet rhs_ = static_cast<__m128>(rhs);

    ttlet lhs_x = _mm_permute_ps(lhs_, _MM_SHUFFLE(0, 0, 0, 0));
    ttlet lhs_y = _mm_permute_ps(lhs_, _MM_SHUFFLE(1, 1, 1, 1));
    ttlet lhs_z = _mm_permute_ps(lhs_, _MM_SHUFFLE(2, 2, 2, 2));
    ttlet lhs_w = _mm_permute_ps(lhs_, _MM_SHUFFLE(3, 3, 3, 3));

    ttlet rhs_1 = _mm_permute_ps(rhs_, _MM_SHUFFLE(0, 1, 2, 3));
    ttlet rhs_2 = _mm_permute_ps(rhs_, _MM_SHUFFLE(1, 0, 3, 2));
    ttlet rhs_3 = _mm_permute_ps(rhs_, _MM_SHUFFLE(2, 3, 0, 1));

    ttlet w = _mm_mul_ps(lhs_w, rhs_);
    ttlet x = _mm_mul_ps(lhs_x, rhs_1);
    ttlet y = _mm_mul_ps(lhs_y, rhs_2);
    ttlet z = _mm_mul_ps(lhs_z, rhs_3);

    ttlet s0 = f32x4_sse_addsub<0b0101>(f32x4_raw{w}, f32x4_raw{x});
    ttlet s1 = f32x4_sse_addsub<0b0011>(s0, f32x4_raw{y});
    return f32x4_sse_addsub<0b0110>(s1, f32x4_raw{z});
}
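// Assuming the quaternion layout (x, y, z, w) in lanes 0..3, the three addsub steps apply
// the sign pattern of the Hamilton product: lane 3, for instance, accumulates
// lw*rw - lx*rx - ly*ry - lz*rz, matching the masks 0b0101, 0b0011 and 0b0110
// (set bit = add) used above.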
[[nodiscard]] inline f32x4_raw f32x4_sse_cross(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    ttlet a_left = _mm_permute_ps(static_cast<__m128>(lhs), _MM_SHUFFLE(3, 0, 2, 1));
    ttlet b_left = _mm_permute_ps(static_cast<__m128>(rhs), _MM_SHUFFLE(3, 1, 0, 2));
    ttlet left = _mm_mul_ps(a_left, b_left);

    ttlet a_right = _mm_permute_ps(static_cast<__m128>(lhs), _MM_SHUFFLE(3, 1, 0, 2));
    ttlet b_right = _mm_permute_ps(static_cast<__m128>(rhs), _MM_SHUFFLE(3, 0, 2, 1));
    ttlet right = _mm_mul_ps(a_right, b_right);
    return f32x4_raw{_mm_sub_ps(left, right)};
}
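// This is the usual two-shuffle 3D cross product: left = a.yzx * b.zxy and
// right = a.zxy * b.yzx, so left - right gives
// {a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0} with lane 3 cancelling out.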
// Note: the name and return type of this function are an assumed reconstruction; since
// the parameters are taken by const reference, the transposed columns are returned by value.
[[nodiscard]] inline std::array<f32x4_raw, 4> f32x4_sse_transpose(
    f32x4_raw const &col0,
    f32x4_raw const &col1,
    f32x4_raw const &col2,
    f32x4_raw const &col3) noexcept
{
    auto col0_ = static_cast<__m128>(col0);
    auto col1_ = static_cast<__m128>(col1);
    auto col2_ = static_cast<__m128>(col2);
    auto col3_ = static_cast<__m128>(col3);

    _MM_TRANSPOSE4_PS(col0_, col1_, col2_, col3_);

    return {f32x4_raw{col0_}, f32x4_raw{col1_}, f32x4_raw{col2_}, f32x4_raw{col3_}};
}
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_sse_permute_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    switch (A) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'00'00'01; break;
    case 2: r |= 0b00'00'00'10; break;
    case 3: r |= 0b00'00'00'11; break;
    case -1: r |= 0b00'00'00'00; break;
    case -2: r |= 0b00'00'00'00; break;
    }
    switch (B) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'00'01'00; break;
    case 2: r |= 0b00'00'10'00; break;
    case 3: r |= 0b00'00'11'00; break;
    case -1: r |= 0b00'00'01'00; break;
    case -2: r |= 0b00'00'01'00; break;
    }
    switch (C) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'01'00'00; break;
    case 2: r |= 0b00'10'00'00; break;
    case 3: r |= 0b00'11'00'00; break;
    case -1: r |= 0b00'10'00'00; break;
    case -2: r |= 0b00'10'00'00; break;
    }
    switch (D) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b01'00'00'00; break;
    case 2: r |= 0b10'00'00'00; break;
    case 3: r |= 0b11'00'00'00; break;
    case -1: r |= 0b11'00'00'00; break;
    case -2: r |= 0b11'00'00'00; break;
    }
    return r;
}
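// Swizzle index convention, as implied by f32x4_sse_not_one_mask and
// f32x4_sse_number_mask below: 0..3 copy that source lane, -1 stands for the literal
// 0.0f and -2 for the literal 1.0f. The lane values chosen above for -1/-2 are
// placeholders, because those lanes are overwritten by the insert/blend step in
// f32x4_sse_swizzle.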
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_sse_not_one_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    r |= (A == -2) ? 0 : 0b0001;
    r |= (B == -2) ? 0 : 0b0010;
    r |= (C == -2) ? 0 : 0b0100;
    r |= (D == -2) ? 0 : 0b1000;
    return r;
}
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_sse_number_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    r |= A < 0 ? 0b0001 : 0;
    r |= B < 0 ? 0b0010 : 0;
    r |= C < 0 ? 0b0100 : 0;
    r |= D < 0 ? 0b1000 : 0;
    return r;
}
template<ssize_t A = -1, ssize_t B = -1, ssize_t C = -1, ssize_t D = -1>
[[nodiscard]] f32x4_raw f32x4_sse_swizzle(f32x4_raw const &value) noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    constexpr int permute_mask = f32x4_sse_permute_mask<A, B, C, D>();
    constexpr int not_one_mask = f32x4_sse_not_one_mask<A, B, C, D>();
    constexpr int number_mask = f32x4_sse_number_mask<A, B, C, D>();

    // Reorder the source lanes; skip the permute when it is an identity.
    __m128 swizzled;
    if constexpr (permute_mask != 0b11'10'01'00) {
        swizzled = _mm_permute_ps(static_cast<__m128>(value), permute_mask);
    } else {
        swizzled = static_cast<__m128>(value);
    }

    // Build the vector of literal numbers: 1.0f where an element is -2, 0.0f elsewhere.
    __m128 numbers;
    if constexpr (not_one_mask == 0b0000) {
        numbers = _mm_set_ps1(1.0f);
    } else if constexpr (not_one_mask == 0b1111) {
        numbers = _mm_setzero_ps();
    } else if constexpr (not_one_mask == 0b1110) {
        numbers = _mm_set_ss(1.0f);
    } else {
        ttlet _1111 = _mm_set_ps1(1.0f);
        numbers = _mm_insert_ps(_1111, _1111, not_one_mask);
    }

    // Merge the literal numbers into the lanes that asked for them.
    __m128 result;
    if constexpr (number_mask == 0b0000) {
        result = swizzled;
    } else if constexpr (number_mask == 0b1111) {
        result = numbers;
    } else if constexpr (((not_one_mask | ~number_mask) & 0b1111) == 0b1111) {
        // Only zeroes are being inserted, which insert_ps can do in a single instruction.
        result = _mm_insert_ps(swizzled, swizzled, number_mask);
    } else {
        result = _mm_blend_ps(swizzled, numbers, number_mask);
    }
    return f32x4_raw{result};
}
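// Usage sketch (illustrative): f32x4_sse_swizzle<0, 0, -1, -2>(v) yields
// {v[0], v[0], 0.0f, 1.0f}; the permute duplicates lane 0, and the blend step then
// writes the literal 0 and 1 into the last two lanes.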