[[nodiscard]] inline f32x4_raw to_f32x4_raw(__m128 const &rhs) noexcept
{
    f32x4_raw r;
    _mm_storeu_ps(r.data(), rhs);
    return r;
}
[[nodiscard]] inline __m128 to_m128(f32x4_raw const &rhs) noexcept
{
    return _mm_loadu_ps(rhs.data());
}
[[nodiscard]] inline f32x4_raw f32x4_sse_ceil(f32x4_raw const &rhs) noexcept
{
    return to_f32x4_raw(_mm_ceil_ps(to_m128(rhs)));
}

[[nodiscard]] inline f32x4_raw f32x4_sse_floor(f32x4_raw const &rhs) noexcept
{
    return to_f32x4_raw(_mm_floor_ps(to_m128(rhs)));
}

[[nodiscard]] inline f32x4_raw f32x4_sse_round(f32x4_raw const &rhs) noexcept
{
    return to_f32x4_raw(_mm_round_ps(to_m128(rhs), _MM_FROUND_CUR_DIRECTION));
}

[[nodiscard]] inline f32x4_raw f32x4_sse_rcp(f32x4_raw const &rhs) noexcept
{
    return to_f32x4_raw(_mm_rcp_ps(to_m128(rhs)));
}
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_clear(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0);

    if constexpr (Mask == 0b0000) {
        // Nothing to clear.
        return rhs;
    } else if constexpr (Mask == 0b1111) {
        return to_f32x4_raw(_mm_setzero_ps());
    } else {
        // The low 4 bits of the insertps immediate form a zero-mask.
        return to_f32x4_raw(_mm_insert_ps(to_m128(rhs), to_m128(rhs), Mask));
    }
}
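// Illustrative example (not part of the original source): f32x4_sse_clear<0b0110>(v)
// should yield (v.x, 0.0f, 0.0f, v.w); each set bit in Mask zeroes that lane via the
// insertps zero-mask.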
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_make_sign() noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0);

    if constexpr (Mask == 0b0000) {
        return to_f32x4_raw(_mm_setzero_ps());
    } else if constexpr (Mask == 0b0001) {
        return to_f32x4_raw(_mm_set_ss(-0.0f));
    } else if constexpr (Mask == 0b1111) {
        return to_f32x4_raw(_mm_set_ps1(-0.0f));
    } else {
        constexpr float x = (Mask & 0b0001) == 0 ? 0.0f : -0.0f;
        constexpr float y = (Mask & 0b0010) == 0 ? 0.0f : -0.0f;
        constexpr float z = (Mask & 0b0100) == 0 ? 0.0f : -0.0f;
        constexpr float w = (Mask & 0b1000) == 0 ? 0.0f : -0.0f;
        return to_f32x4_raw(_mm_set_ps(w, z, y, x));
    }
}
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_neg(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0);

    if constexpr (Mask == 0b0000) {
        // Nothing to negate.
        return rhs;
    } else {
        ttlet sign = to_m128(f32x4_sse_make_sign<Mask>());
        return to_f32x4_raw(_mm_xor_ps(to_m128(rhs), sign));
    }
}
[[nodiscard]] inline f32x4_raw f32x4_sse_hadd(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    return to_f32x4_raw(_mm_hadd_ps(to_m128(lhs), to_m128(rhs)));
}

[[nodiscard]] inline f32x4_raw f32x4_sse_hsub(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    return to_f32x4_raw(_mm_hsub_ps(to_m128(lhs), to_m128(rhs)));
}
/** Add or subtract each lane of two vectors.
 * A set bit in Mask adds that lane, a clear bit subtracts it.
 */
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_addsub(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");

    ttlet lhs_ = to_m128(lhs);
    ttlet rhs_ = to_m128(rhs);

    if constexpr (Mask == 0b0000) {
        return to_f32x4_raw(_mm_sub_ps(lhs_, rhs_));
    } else if constexpr (Mask == 0b0101) {
        // Negate rhs so that _mm_addsub_ps() adds lanes 0 and 2 and subtracts lanes 1 and 3.
        ttlet neg_rhs = to_m128(f32x4_sse_neg<0b1111>(rhs));
        return to_f32x4_raw(_mm_addsub_ps(lhs_, neg_rhs));
    } else if constexpr (Mask == 0b1010) {
        // _mm_addsub_ps() natively subtracts lanes 0 and 2 and adds lanes 1 and 3.
        return to_f32x4_raw(_mm_addsub_ps(lhs_, rhs_));
    } else if constexpr (Mask == 0b1111) {
        return to_f32x4_raw(_mm_add_ps(lhs_, rhs_));
    } else {
        // Negate the rhs lanes that must be subtracted, then add everything.
        ttlet neg_rhs = to_m128(f32x4_sse_neg<~Mask & 0xf>(rhs));
        return to_f32x4_raw(_mm_add_ps(lhs_, neg_rhs));
    }
}
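// Illustrative example (not part of the original source): with Mask = 0b0110,
// f32x4_sse_addsub<0b0110>(a, b) is expected to produce
// (a.x - b.x, a.y + b.y, a.z + b.z, a.w - b.w).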
template<unsigned int Mask>
[[nodiscard]] float f32x4_sse_dot(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto tmp = _mm_dp_ps(to_m128(lhs), to_m128(rhs), imm8);
    // The dot product is written to lane 0 only; extract it.
    return _mm_cvtss_f32(tmp);
}
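// Illustrative example (not part of the original source): f32x4_sse_dot<0b0111>(a, b)
// should return a.x*b.x + a.y*b.y + a.z*b.z; the high nibble of imm8 selects the lanes
// that participate, while the low nibble 0x1 writes the sum to lane 0 only.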
template<unsigned int Mask>
[[nodiscard]] float f32x4_sse_hypot(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto _rhs = to_m128(rhs);
    auto tmp = _mm_sqrt_ps(_mm_dp_ps(_rhs, _rhs, imm8));
    return _mm_cvtss_f32(tmp);
}
template<unsigned int Mask>
[[nodiscard]] float f32x4_sse_rcp_hypot(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto _rhs = to_m128(rhs);
    auto tmp = _mm_rsqrt_ps(_mm_dp_ps(_rhs, _rhs, imm8));
    return _mm_cvtss_f32(tmp);
}
template<unsigned int Mask>
[[nodiscard]] f32x4_raw f32x4_sse_normalize(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int dp_imm8 = (Mask << 4) | Mask;
    constexpr int zero_imm8 = ~Mask & 0xf;

    ttlet rhs_ = to_m128(rhs);
    ttlet rcp_length = _mm_rsqrt_ps(_mm_dp_ps(rhs_, rhs_, dp_imm8));
    ttlet rcp_length_ = _mm_insert_ps(rcp_length, rcp_length, zero_imm8);
    return to_f32x4_raw(_mm_mul_ps(rhs_, rcp_length_));
}
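// Note: dp_imm8 selects the lanes that contribute to the squared length and broadcasts
// the result to those same lanes; zero_imm8 then clears the reciprocal length in the
// remaining lanes, so for finite inputs the lanes outside Mask come out as 0.0f after
// the multiply.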
[[nodiscard]] inline unsigned int f32x4_sse_eq_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpeq_ps(to_m128(lhs), to_m128(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

[[nodiscard]] inline unsigned int f32x4_sse_ne_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpneq_ps(to_m128(lhs), to_m128(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

[[nodiscard]] inline unsigned int f32x4_sse_lt_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmplt_ps(to_m128(lhs), to_m128(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

[[nodiscard]] inline unsigned int f32x4_sse_gt_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpgt_ps(to_m128(lhs), to_m128(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

[[nodiscard]] inline unsigned int f32x4_sse_le_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmple_ps(to_m128(lhs), to_m128(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

[[nodiscard]] inline unsigned int f32x4_sse_ge_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpge_ps(to_m128(lhs), to_m128(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}
[[nodiscard]] inline bool f32x4_sse_eq(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    // Equal when no lane compares not-equal.
    auto tmp = _mm_cmpneq_ps(to_m128(lhs), to_m128(rhs));
    return _mm_testz_ps(tmp, tmp);
}
[[nodiscard]] inline float f32x4_sse_viktor_cross(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    ttlet tmp1 = _mm_permute_ps(to_m128(rhs), _MM_SHUFFLE(2, 3, 0, 1));
    ttlet tmp2 = _mm_mul_ps(to_m128(lhs), tmp1);
    ttlet tmp3 = _mm_hsub_ps(tmp2, tmp2);
    return _mm_cvtss_f32(tmp3);
}
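// Note: this computes the scalar 2D cross product lhs.x * rhs.y - lhs.y * rhs.x;
// the z and w products land in the other lanes of the horizontal subtraction and
// are ignored.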
[[nodiscard]] inline f32x4_raw f32x4_sse_hamilton_cross(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    ttlet lhs_ = to_m128(lhs);
    ttlet rhs_ = to_m128(rhs);

    // Broadcast each element of lhs into its own vector.
    ttlet lhs_x = _mm_permute_ps(lhs_, _MM_SHUFFLE(0, 0, 0, 0));
    ttlet lhs_y = _mm_permute_ps(lhs_, _MM_SHUFFLE(1, 1, 1, 1));
    ttlet lhs_z = _mm_permute_ps(lhs_, _MM_SHUFFLE(2, 2, 2, 2));
    ttlet lhs_w = _mm_permute_ps(lhs_, _MM_SHUFFLE(3, 3, 3, 3));

    // Reorder rhs so that each partial product lines up with the output lanes.
    ttlet rhs_1 = _mm_permute_ps(rhs_, _MM_SHUFFLE(0, 1, 2, 3));
    ttlet rhs_2 = _mm_permute_ps(rhs_, _MM_SHUFFLE(1, 0, 3, 2));
    ttlet rhs_3 = _mm_permute_ps(rhs_, _MM_SHUFFLE(2, 3, 0, 1));

    ttlet w = _mm_mul_ps(lhs_w, rhs_);
    ttlet x = _mm_mul_ps(lhs_x, rhs_1);
    ttlet y = _mm_mul_ps(lhs_y, rhs_2);
    ttlet z = _mm_mul_ps(lhs_z, rhs_3);

    // Accumulate the partial products with the sign pattern of the quaternion product.
    ttlet s0 = f32x4_sse_addsub<0b0101>(to_f32x4_raw(w), to_f32x4_raw(x));
    ttlet s1 = f32x4_sse_addsub<0b0011>(s0, to_f32x4_raw(y));
    return f32x4_sse_addsub<0b0110>(s1, to_f32x4_raw(z));
}
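// Note: with the (x, y, z, w) element order used here this is the Hamilton
// (quaternion) product lhs * rhs; for example, multiplying any quaternion by the
// identity quaternion (0, 0, 0, 1) should return that quaternion unchanged.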
[[nodiscard]] inline f32x4_raw f32x4_sse_cross(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    ttlet a_left = _mm_permute_ps(to_m128(lhs), _MM_SHUFFLE(3, 0, 2, 1));
    ttlet b_left = _mm_permute_ps(to_m128(rhs), _MM_SHUFFLE(3, 1, 0, 2));
    ttlet left = _mm_mul_ps(a_left, b_left);

    ttlet a_right = _mm_permute_ps(to_m128(lhs), _MM_SHUFFLE(3, 1, 0, 2));
    ttlet b_right = _mm_permute_ps(to_m128(rhs), _MM_SHUFFLE(3, 0, 2, 1));
    ttlet right = _mm_mul_ps(a_right, b_right);

    return to_f32x4_raw(_mm_sub_ps(left, right));
}
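// Note: this is the usual shuffle formulation of the 3D cross product,
// (lhs.yzx * rhs.zxy) - (lhs.zxy * rhs.yzx); the w lane cancels to 0.0f.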
// Transpose four column vectors as a 4x4 matrix.
// (The function name and the std::array return type are assumed here; the original
// signature did not survive extraction.)
[[nodiscard]] inline std::array<f32x4_raw, 4> f32x4_sse_transpose(
    f32x4_raw const &col0,
    f32x4_raw const &col1,
    f32x4_raw const &col2,
    f32x4_raw const &col3) noexcept
{
    auto col0_ = to_m128(col0);
    auto col1_ = to_m128(col1);
    auto col2_ = to_m128(col2);
    auto col3_ = to_m128(col3);

    _MM_TRANSPOSE4_PS(col0_, col1_, col2_, col3_);

    return {to_f32x4_raw(col0_), to_f32x4_raw(col1_), to_f32x4_raw(col2_), to_f32x4_raw(col3_)};
}
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_sse_permute_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    // Build the _mm_permute_ps() selector; negative indices map a lane onto itself,
    // since that lane is overwritten with a literal later.
    int r = 0;
    switch (A) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'00'00'01; break;
    case 2: r |= 0b00'00'00'10; break;
    case 3: r |= 0b00'00'00'11; break;
    case -1: r |= 0b00'00'00'00; break;
    case -2: r |= 0b00'00'00'00; break;
    }
    switch (B) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'00'01'00; break;
    case 2: r |= 0b00'00'10'00; break;
    case 3: r |= 0b00'00'11'00; break;
    case -1: r |= 0b00'00'01'00; break;
    case -2: r |= 0b00'00'01'00; break;
    }
    switch (C) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'01'00'00; break;
    case 2: r |= 0b00'10'00'00; break;
    case 3: r |= 0b00'11'00'00; break;
    case -1: r |= 0b00'10'00'00; break;
    case -2: r |= 0b00'10'00'00; break;
    }
    switch (D) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b01'00'00'00; break;
    case 2: r |= 0b10'00'00'00; break;
    case 3: r |= 0b11'00'00'00; break;
    case -1: r |= 0b11'00'00'00; break;
    case -2: r |= 0b11'00'00'00; break;
    }
    return r;
}
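// Note (inferred from the mask helpers below, not stated in the original source):
// a swizzle index of 0..3 selects that element from the source vector, -1 selects the
// literal 0.0f and -2 selects the literal 1.0f; f32x4_sse_not_one_mask() and
// f32x4_sse_number_mask() encode which lanes hold such literals.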
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_sse_not_one_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    r |= (A == -2) ? 0 : 0b0001;
    r |= (B == -2) ? 0 : 0b0010;
    r |= (C == -2) ? 0 : 0b0100;
    r |= (D == -2) ? 0 : 0b1000;
    return r;
}
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_sse_number_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    r |= A < 0 ? 0b0001 : 0;
    r |= B < 0 ? 0b0010 : 0;
    r |= C < 0 ? 0b0100 : 0;
    r |= D < 0 ? 0b1000 : 0;
    return r;
}
template<ssize_t A = -1, ssize_t B = -1, ssize_t C = -1, ssize_t D = -1>
[[nodiscard]] f32x4_raw f32x4_sse_swizzle(f32x4_raw const &value) noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    constexpr int permute_mask = f32x4_sse_permute_mask<A, B, C, D>();
    constexpr int not_one_mask = f32x4_sse_not_one_mask<A, B, C, D>();
    constexpr int number_mask = f32x4_sse_number_mask<A, B, C, D>();

    // Reorder the source elements; skip the permute when it would be the identity.
    __m128 swizzled;
    if constexpr (permute_mask != 0b11'10'01'00) {
        swizzled = _mm_permute_ps(to_m128(value), permute_mask);
    } else {
        swizzled = to_m128(value);
    }

    // Build the vector of literals: 1.0f where requested, 0.0f elsewhere.
    __m128 numbers;
    if constexpr (not_one_mask == 0b0000) {
        numbers = _mm_set_ps1(1.0f);
    } else if constexpr (not_one_mask == 0b1111) {
        numbers = _mm_setzero_ps();
    } else if constexpr (not_one_mask == 0b1110) {
        numbers = _mm_set_ss(1.0f);
    } else {
        ttlet _1111 = _mm_set_ps1(1.0f);
        numbers = _mm_insert_ps(_1111, _1111, not_one_mask);
    }

    // Combine the swizzled elements with the literals.
    __m128 result;
    if constexpr (number_mask == 0b0000) {
        result = swizzled;
    } else if constexpr (number_mask == 0b1111) {
        result = numbers;
    } else if constexpr (((not_one_mask | ~number_mask) & 0b1111) == 0b1111) {
        // Only zeros need to be inserted; use the insertps zero-mask.
        result = _mm_insert_ps(swizzled, swizzled, number_mask);
    } else {
        result = _mm_blend_ps(swizzled, numbers, number_mask);
    }
    return to_f32x4_raw(result);
}
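// Illustrative examples (not part of the original source), using the index convention
// inferred above: f32x4_sse_swizzle<2, 1, 0, 3>(v) should yield (v.z, v.y, v.x, v.w),
// and f32x4_sse_swizzle<0, 1, -1, -2>(v) should yield (v.x, v.y, 0.0f, 1.0f).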