#include "raw_numeric_array.hpp"

#include <immintrin.h>
#include <array>
#include <cstddef>
#include <cstdint>
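/** Clear elements of a vector.
 *
 * @tparam Mask One bit per element; a '1' bit sets that element to zero
 *         (bit 0 = x, bit 1 = y, bit 2 = z, bit 3 = w).
 */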
template<unsigned int Mask>
[[nodiscard]] inline rf32x4 f32x4_x64v2_clear(rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");

    if constexpr (Mask == 0b0000) {
        return rhs;
    } else if constexpr (Mask == 0b1111) {
        return to_rf32x4(_mm_setzero_ps());
    } else {
        // The low 4 bits of the _mm_insert_ps() immediate form a zero-mask.
        return to_rf32x4(_mm_insert_ps(to_m128(rhs), to_m128(rhs), Mask));
    }
}
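/** Make a vector of sign bits, suitable for flipping signs with XOR.
 *
 * @tparam Mask One bit per element; a '1' bit yields -0.0f in that element,
 *         a '0' bit yields 0.0f.
 */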
template<unsigned int Mask>
[[nodiscard]] inline rf32x4 f32x4_x64v2_make_sign() noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");

    if constexpr (Mask == 0b0000) {
        return to_rf32x4(_mm_setzero_ps());
    } else if constexpr (Mask == 0b0001) {
        return to_rf32x4(_mm_set_ss(-0.0f));
    } else if constexpr (Mask == 0b1111) {
        return to_rf32x4(_mm_set_ps1(-0.0f));
    } else {
        constexpr float x = (Mask & 0b0001) == 0 ? 0.0f : -0.0f;
        constexpr float y = (Mask & 0b0010) == 0 ? 0.0f : -0.0f;
        constexpr float z = (Mask & 0b0100) == 0 ? 0.0f : -0.0f;
        constexpr float w = (Mask & 0b1000) == 0 ? 0.0f : -0.0f;
        return to_rf32x4(_mm_set_ps(w, z, y, x));
    }
}
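/** Negate elements of a vector.
 *
 * @tparam Mask One bit per element; a '1' bit negates that element.
 */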
template<unsigned int Mask>
[[nodiscard]] inline rf32x4 f32x4_x64v2_neg(rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");

    if constexpr (Mask == 0b0000) {
        return rhs;
    } else {
        // Flip the sign bits of the masked elements with an XOR.
        ttlet sign = to_m128(f32x4_x64v2_make_sign<Mask>());
        return to_rf32x4(_mm_xor_ps(to_m128(rhs), sign));
    }
}
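/** Add or subtract the elements of two vectors.
 *
 * @tparam Mask One bit per element; a '1' bit adds, a '0' bit subtracts,
 *         i.e. Mask == 0b0000 is a full subtract and Mask == 0b1111 a full add.
 */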
template<unsigned int Mask>
[[nodiscard]] inline rf32x4 f32x4_x64v2_addsub(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");

    ttlet lhs_ = to_m128(lhs);
    ttlet rhs_ = to_m128(rhs);

    if constexpr (Mask == 0b0000) {
        return to_rf32x4(_mm_sub_ps(lhs_, rhs_));
    } else if constexpr (Mask == 0b0101) {
        return to_rf32x4(_mm_addsub_ps(lhs_, rhs_));
    } else if constexpr (Mask == 0b1010) {
        // addsub on a negated rhs turns (sub, add, sub, add) into (add, sub, add, sub).
        ttlet neg_rhs = to_m128(f32x4_x64v2_neg<0b1111>(rhs));
        return to_rf32x4(_mm_addsub_ps(lhs_, neg_rhs));
    } else if constexpr (Mask == 0b1111) {
        return to_rf32x4(_mm_add_ps(lhs_, rhs_));
    } else {
        // General case: negate the elements to be subtracted, then add.
        ttlet neg_rhs = to_m128(f32x4_x64v2_neg<~Mask & 0xf>(rhs));
        return to_rf32x4(_mm_add_ps(lhs_, neg_rhs));
    }
}
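/** Dot product of two vectors.
 *
 * @tparam Mask One bit per element; only masked elements participate.
 * @return The dot product as a scalar.
 */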
template<unsigned int Mask>
[[nodiscard]] float f32x4_x64v2_dot(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");

    // High nibble selects the input elements; store the result in element 0 only.
    constexpr int imm8 = (Mask << 4) | 0x1;

    ttlet tmp = _mm_dp_ps(to_m128(lhs), to_m128(rhs), imm8);
    return _mm_cvtss_f32(tmp);
}
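/** Length of a vector: sqrt(dot(rhs, rhs)).
 *
 * @tparam Mask One bit per element; only masked elements participate.
 */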
template<unsigned int Mask>
[[nodiscard]] float f32x4_x64v2_hypot(rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    ttlet rhs_ = to_m128(rhs);
    ttlet tmp = _mm_sqrt_ps(_mm_dp_ps(rhs_, rhs_, imm8));
    return _mm_cvtss_f32(tmp);
}
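/** Reciprocal length of a vector: 1 / sqrt(dot(rhs, rhs)).
 *
 * Uses _mm_rsqrt_ps(), which is fast but has a relative error of about
 * 1.5 * 2^-12.
 *
 * @tparam Mask One bit per element; only masked elements participate.
 */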
template<unsigned int Mask>
[[nodiscard]] float f32x4_x64v2_rcp_hypot(rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    ttlet rhs_ = to_m128(rhs);
    ttlet tmp = _mm_rsqrt_ps(_mm_dp_ps(rhs_, rhs_, imm8));
    return _mm_cvtss_f32(tmp);
}
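/** Normalize a vector to unit length.
 *
 * Elements outside the mask are set to zero in the result.
 *
 * @tparam Mask One bit per element; only masked elements participate.
 */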
template<unsigned int Mask>
[[nodiscard]] rf32x4 f32x4_x64v2_normalize(rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");

    // Broadcast the dot product to every masked element, then zero the rest.
    constexpr int dp_imm8 = (Mask << 4) | Mask;
    constexpr int zero_imm8 = ~Mask & 0xf;

    ttlet rhs_ = to_m128(rhs);
    ttlet rcp_length = _mm_rsqrt_ps(_mm_dp_ps(rhs_, rhs_, dp_imm8));
    ttlet rcp_length_ = _mm_insert_ps(rcp_length, rcp_length, zero_imm8);
    return to_rf32x4(_mm_mul_ps(rhs_, rcp_length_));
}
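/** Compare two vectors for equality.
 *
 * @return True when every element of lhs equals the corresponding element of rhs.
 */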
[[nodiscard]] inline bool f32x4_x64v2_eq(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    // All elements are equal when no element compares not-equal.
    auto tmp = _mm_cmpneq_ps(to_m128(lhs), to_m128(rhs));
    return _mm_testz_ps(tmp, tmp);
}
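/** 2D cross product (perp-dot product) of the x and y elements.
 *
 * @return lhs.x * rhs.y - lhs.y * rhs.x
 */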
[[nodiscard]] inline float f32x4_x64v2_viktor_cross(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    ttlet tmp1 = _mm_permute_ps(to_m128(rhs), _MM_SHUFFLE(2, 3, 0, 1));
    ttlet tmp2 = _mm_mul_ps(to_m128(lhs), tmp1);
    ttlet tmp3 = _mm_hsub_ps(tmp2, tmp2);
    return _mm_cvtss_f32(tmp3);
}
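/** Hamilton product of two quaternions.
 *
 * Quaternions are stored as (x, y, z, w) with the real part in w.
 */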
[[nodiscard]] inline rf32x4 f32x4_x64v2_hamilton_cross(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    ttlet lhs_ = to_m128(lhs);
    ttlet rhs_ = to_m128(rhs);

    ttlet lhs_x = _mm_permute_ps(lhs_, _MM_SHUFFLE(0, 0, 0, 0));
    ttlet lhs_y = _mm_permute_ps(lhs_, _MM_SHUFFLE(1, 1, 1, 1));
    ttlet lhs_z = _mm_permute_ps(lhs_, _MM_SHUFFLE(2, 2, 2, 2));
    ttlet lhs_w = _mm_permute_ps(lhs_, _MM_SHUFFLE(3, 3, 3, 3));

    ttlet rhs_1 = _mm_permute_ps(rhs_, _MM_SHUFFLE(0, 1, 2, 3));
    ttlet rhs_2 = _mm_permute_ps(rhs_, _MM_SHUFFLE(1, 0, 3, 2));
    ttlet rhs_3 = _mm_permute_ps(rhs_, _MM_SHUFFLE(2, 3, 0, 1));

    ttlet w = _mm_mul_ps(lhs_w, rhs_);
    ttlet x = _mm_mul_ps(lhs_x, rhs_1);
    ttlet y = _mm_mul_ps(lhs_y, rhs_2);
    ttlet z = _mm_mul_ps(lhs_z, rhs_3);

    ttlet s0 = f32x4_x64v2_addsub<0b0101>(to_rf32x4(w), to_rf32x4(x));
    ttlet s1 = f32x4_x64v2_addsub<0b0011>(s0, to_rf32x4(y));
    return f32x4_x64v2_addsub<0b0110>(s1, to_rf32x4(z));
}
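/** 3D cross product of the x, y and z elements.
 *
 * The w element of the result is zero for finite inputs.
 */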
[[nodiscard]] inline rf32x4 f32x4_x64v2_cross(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    ttlet a_left = _mm_permute_ps(to_m128(lhs), _MM_SHUFFLE(3, 0, 2, 1));
    ttlet b_left = _mm_permute_ps(to_m128(rhs), _MM_SHUFFLE(3, 1, 0, 2));
    ttlet left = _mm_mul_ps(a_left, b_left);

    ttlet a_right = _mm_permute_ps(to_m128(lhs), _MM_SHUFFLE(3, 1, 0, 2));
    ttlet b_right = _mm_permute_ps(to_m128(rhs), _MM_SHUFFLE(3, 0, 2, 1));
    ttlet right = _mm_mul_ps(a_right, b_right);

    return to_rf32x4(_mm_sub_ps(left, right));
}
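/** Transpose a 4x4 matrix given as four column vectors.
 *
 * @return The four columns of the transposed matrix.
 */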
[[nodiscard]] inline std::array<rf32x4, 4>
f32x4_x64v2_transpose(rf32x4 const &col0, rf32x4 const &col1, rf32x4 const &col2, rf32x4 const &col3) noexcept
{
    auto col0_ = to_m128(col0);
    auto col1_ = to_m128(col1);
    auto col2_ = to_m128(col2);
    auto col3_ = to_m128(col3);

    _MM_TRANSPOSE4_PS(col0_, col1_, col2_, col3_);

    return {to_rf32x4(col0_), to_rf32x4(col1_), to_rf32x4(col2_), to_rf32x4(col3_)};
}
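/** Build the immediate for _mm_permute_ps() from four swizzle indices.
 *
 * Indices 0-3 select a source element; the literal indices -1 (zero) and
 * -2 (one) map each lane to its own position, so an identity permute can be
 * detected and elided by the caller.
 */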
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_x64v2_permute_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    switch (A) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'00'00'01; break;
    case 2: r |= 0b00'00'00'10; break;
    case 3: r |= 0b00'00'00'11; break;
    case -1: r |= 0b00'00'00'00; break;
    case -2: r |= 0b00'00'00'00; break;
    }
    switch (B) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'00'01'00; break;
    case 2: r |= 0b00'00'10'00; break;
    case 3: r |= 0b00'00'11'00; break;
    case -1: r |= 0b00'00'01'00; break;
    case -2: r |= 0b00'00'01'00; break;
    }
    switch (C) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'01'00'00; break;
    case 2: r |= 0b00'10'00'00; break;
    case 3: r |= 0b00'11'00'00; break;
    case -1: r |= 0b00'10'00'00; break;
    case -2: r |= 0b00'10'00'00; break;
    }
    switch (D) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b01'00'00'00; break;
    case 2: r |= 0b10'00'00'00; break;
    case 3: r |= 0b11'00'00'00; break;
    case -1: r |= 0b11'00'00'00; break;
    case -2: r |= 0b11'00'00'00; break;
    }
    return r;
}
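/** Build a mask of the lanes that are NOT the literal one.
 *
 * A '1' bit means the lane is not a literal 1.0f (index -2).
 */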
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_x64v2_not_one_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    r |= (A == -2) ? 0 : 0b0001;
    r |= (B == -2) ? 0 : 0b0010;
    r |= (C == -2) ? 0 : 0b0100;
    r |= (D == -2) ? 0 : 0b1000;
    return r;
}
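/** Build a mask of the lanes that hold literal numbers.
 *
 * A '1' bit means the lane is a literal (index -1 or -2) rather than an
 * element selected from the source vector.
 */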
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_x64v2_number_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    r |= A < 0 ? 0b0001 : 0;
    r |= B < 0 ? 0b0010 : 0;
    r |= C < 0 ? 0b0100 : 0;
    r |= D < 0 ? 0b1000 : 0;
    return r;
}
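/** Swizzle, zero and fill the elements of a __m128.
 *
 * @tparam A,B,C,D Index 0-3 selects that source element for the lane;
 *         -1 yields the literal 0.0f and -2 the literal 1.0f.
 */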
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] __m128 m128_x64v2_swizzle(__m128 const &value) noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    constexpr int permute_mask = f32x4_x64v2_permute_mask<A, B, C, D>();
    constexpr int not_one_mask = f32x4_x64v2_not_one_mask<A, B, C, D>();
    constexpr int number_mask = f32x4_x64v2_number_mask<A, B, C, D>();

    // Step 1: permute the source elements, skipping an identity permute.
    __m128 swizzled;
    if constexpr (permute_mask != 0b11'10'01'00) {
        swizzled = _mm_permute_ps(value, permute_mask);
    } else {
        swizzled = value;
    }

    // Step 2: build the vector of literals (0.0f and 1.0f lanes).
    __m128 numbers;
    if constexpr (not_one_mask == 0b0000) {
        numbers = _mm_set_ps1(1.0f);
    } else if constexpr (not_one_mask == 0b1111) {
        numbers = _mm_setzero_ps();
    } else if constexpr (not_one_mask == 0b1110) {
        numbers = _mm_set_ss(1.0f);
    } else {
        ttlet _1111 = _mm_set_ps1(1.0f);
        numbers = _mm_insert_ps(_1111, _1111, not_one_mask);
    }

    // Step 3: blend the literal lanes into the swizzled elements.
    __m128 result;
    if constexpr (number_mask == 0b0000) {
        result = swizzled;
    } else if constexpr (number_mask == 0b1111) {
        result = numbers;
    } else if constexpr (((not_one_mask | ~number_mask) & 0b1111) == 0b1111) {
        // Every literal lane is zero; a zeroing insert avoids the blend.
        result = _mm_insert_ps(swizzled, swizzled, number_mask);
    } else {
        result = _mm_blend_ps(swizzled, numbers, number_mask);
    }
    return result;
}
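/** Swizzle the 32-bit lanes of a __m128i via the float swizzle.
 *
 * Only bitwise moves are involved, so integer data passes through unchanged;
 * note that a -2 lane receives the bit pattern of 1.0f (0x3f800000), not the
 * integer 1.
 */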
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] __m128i m128i_x64v2_swizzle(__m128i const &value) noexcept
{
    return _mm_castps_si128(m128_x64v2_swizzle<A, B, C, D>(_mm_castsi128_ps(value)));
}
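// Typed wrappers around the generic __m128/__m128i swizzles.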
template<ssize_t A = -1, ssize_t B = -1, ssize_t C = -1, ssize_t D = -1>
[[nodiscard]] rf32x4 f32x4_x64v2_swizzle(rf32x4 const &value) noexcept
{
    return to_rf32x4(m128_x64v2_swizzle<A, B, C, D>(to_m128(value)));
}
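// Usage sketch (with a hypothetical vector v): swap x and y, zero z, set w to one:
//   f32x4_x64v2_swizzle<1, 0, -1, -2>(v) == (v.y, v.x, 0.0f, 1.0f)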
template<ssize_t A = -1, ssize_t B = -1, ssize_t C = -1, ssize_t D = -1>
[[nodiscard]] ri32x4 i32x4_x64v2_swizzle(ri32x4 const &value) noexcept
{
    return to_ri32x4(m128i_x64v2_swizzle<A, B, C, D>(to_m128i(value)));
}
template<ssize_t A = -1, ssize_t B = -1, ssize_t C = -1, ssize_t D = -1>
[[nodiscard]] ru32x4 u32x4_x64v2_swizzle(ru32x4 const &value) noexcept
{
    return to_ru32x4(m128i_x64v2_swizzle<A, B, C, D>(to_m128i(value)));
}
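/** Swizzle the two 64-bit lanes of a vector.
 *
 * Each 64-bit index is expanded into a pair of 32-bit float-swizzle indices;
 * -1 zeroes the lane, -2 fills both 32-bit halves with the 1.0f bit pattern.
 */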
template<ssize_t A = -1, ssize_t B = -1>
[[nodiscard]] ru64x2 u64x2_x64v2_swizzle(ru64x2 const &value) noexcept
{
    constexpr auto A1 = A >= 0 ? A * 2 : A;
    constexpr auto A2 = A >= 0 ? A1 + 1 : A1;
    constexpr auto B1 = B >= 0 ? B * 2 : B;
    constexpr auto B2 = B >= 0 ? B1 + 1 : B1;

    ttlet value_ = _mm_castsi128_ps(to_m128i(value));
    ttlet r = to_m128(f32x4_x64v2_swizzle<A1, A2, B1, B2>(to_rf32x4(value_)));
    return to_ru64x2(_mm_castps_si128(r));
}
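/** Insert an element of rhs into an element of lhs, optionally zeroing lanes.
 *
 * Maps directly onto the _mm_insert_ps() immediate: bits 7:6 select the
 * source element of rhs, bits 5:4 the destination element in lhs, and the
 * low 4 bits zero the corresponding result lanes.
 */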
template<size_t FromElement, size_t ToElement, size_t ZeroMask>
[[nodiscard]] rf32x4 f32x4_x64v2_insert(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    static_assert(FromElement < 4);
    static_assert(ToElement < 4);
    static_assert(ZeroMask < 16);

    constexpr uint8_t insert_mask = static_cast<uint8_t>((FromElement << 6) | (ToElement << 4) | ZeroMask);

    return to_rf32x4(_mm_insert_ps(to_m128(lhs), to_m128(rhs), insert_mask));
}
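/** Insert a 64-bit element of rhs into an element of lhs, optionally zeroing lanes.
 *
 * Without a zero-mask this is a single _mm_shuffle_pd(); otherwise it is
 * emulated with two 32-bit float inserts.
 */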
template<size_t FromElement, size_t ToElement, size_t ZeroMask>
[[nodiscard]] ru64x2 u64x2_x64v2_insert(ru64x2 const &lhs, ru64x2 const &rhs) noexcept
{
    static_assert(FromElement < 2);
    static_assert(ToElement < 2);
    static_assert(ZeroMask < 4);

    if constexpr (ZeroMask == 0) {
        auto lhs_ = _mm_castsi128_pd(to_m128i(lhs));
        auto rhs_ = _mm_castsi128_pd(to_m128i(rhs));

        __m128d r;
        if constexpr (FromElement == 0 and ToElement == 0) {
            r = _mm_shuffle_pd(rhs_, lhs_, 0b10);
        } else if constexpr (FromElement == 1 and ToElement == 0) {
            r = _mm_shuffle_pd(rhs_, lhs_, 0b11);
        } else if constexpr (FromElement == 0 and ToElement == 1) {
            r = _mm_shuffle_pd(lhs_, rhs_, 0b00);
        } else {
            r = _mm_shuffle_pd(lhs_, rhs_, 0b10);
        }
        return to_ru64x2(_mm_castpd_si128(r));

    } else {
        // Emulate the 64-bit insert as two 32-bit float inserts.
        constexpr size_t FromElement1 = FromElement * 2;
        constexpr size_t FromElement2 = FromElement1 + 1;
        constexpr size_t ToElement1 = ToElement * 2;
        constexpr size_t ToElement2 = ToElement1 + 1;
        constexpr size_t ZeroMask2 = (ZeroMask & 1 ? 0b11 : 0b00) | (ZeroMask & 2 ? 0b1100 : 0b0000);

        ttlet lhs_ = to_rf32x4(_mm_castsi128_ps(to_m128i(lhs)));
        ttlet rhs_ = to_rf32x4(_mm_castsi128_ps(to_m128i(rhs)));
        ttlet tmp = f32x4_x64v2_insert<FromElement1, ToElement1, 0>(lhs_, rhs_);
        ttlet r = f32x4_x64v2_insert<FromElement2, ToElement2, ZeroMask2>(tmp, rhs_);

        return to_ru64x2(_mm_castps_si128(to_m128(r)));
    }
}