HikoGUI
A low latency retained GUI
f32x4_x64v2.hpp
// Copyright Take Vos 2020-2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)

#pragma once

#include "raw_numeric_array.hpp"

#include <array>
#include <emmintrin.h>
#include <smmintrin.h>
#include <xmmintrin.h>
#include <pmmintrin.h>
#include <immintrin.h>

namespace tt {

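/** Clear the elements of a vector selected by a mask.
 *
 * @tparam Mask Bit i selects element i; selected elements are set to zero.
 */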
template<unsigned int Mask>
[[nodiscard]] inline rf32x4 f32x4_x64v2_clear(rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");

    if constexpr (Mask == 0b0000) {
        return rhs;
    } else if constexpr (Mask == 0b1111) {
        // 1 cycle
        return to_rf32x4(_mm_setzero_ps());
    } else {
        // 1 cycle
        return to_rf32x4(_mm_insert_ps(to_m128(rhs), to_m128(rhs), Mask));
    }
}

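/** Make a sign mask; selected elements are -0.0f, the rest +0.0f.
 *
 * Each selected element has only its sign bit set, so the result can be
 * XOR-ed with another vector to negate those elements.
 *
 * @tparam Mask Bit i selects element i.
 */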
template<unsigned int Mask>
[[nodiscard]] inline rf32x4 f32x4_x64v2_make_sign() noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");

    if constexpr (Mask == 0b0000) {
        return to_rf32x4(_mm_setzero_ps());

    } else if constexpr (Mask == 0b0001) {
        return to_rf32x4(_mm_set_ss(-0.0f));

    } else if constexpr (Mask == 0b1111) {
        return to_rf32x4(_mm_set_ps1(-0.0f));

    } else {
        constexpr float x = (Mask & 0b0001) == 0 ? 0.0f : -0.0f;
        constexpr float y = (Mask & 0b0010) == 0 ? 0.0f : -0.0f;
        constexpr float z = (Mask & 0b0100) == 0 ? 0.0f : -0.0f;
        constexpr float w = (Mask & 0b1000) == 0 ? 0.0f : -0.0f;
        return to_rf32x4(_mm_set_ps(w, z, y, x));
    }
}

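/** Negate the elements of a vector selected by a mask.
 *
 * @tparam Mask Bit i selects element i; selected elements are negated.
 */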
template<unsigned int Mask>
[[nodiscard]] inline rf32x4 f32x4_x64v2_neg(rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");

    if constexpr (Mask == 0b0000) {
        return rhs;

    } else {
        ttlet sign = to_m128(f32x4_x64v2_make_sign<Mask>());
        return to_rf32x4(_mm_xor_ps(to_m128(rhs), sign));
    }
}

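/** Add or subtract the elements of two vectors element-wise.
 *
 * @tparam Mask Selects the operation per element: bit i set yields
 *         lhs[i] + rhs[i], bit i clear yields lhs[i] - rhs[i].
 */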
template<unsigned int Mask>
[[nodiscard]] inline rf32x4 f32x4_x64v2_addsub(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");

    ttlet lhs_ = to_m128(lhs);
    ttlet rhs_ = to_m128(rhs);

    if constexpr (Mask == 0b0000) {
        return to_rf32x4(_mm_sub_ps(lhs_, rhs_));

    } else if constexpr (Mask == 0b0101) {
        // _mm_addsub_ps() subtracts the even and adds the odd elements;
        // negating rhs first turns this into add-even / subtract-odd.
        ttlet neg_rhs = to_m128(f32x4_x64v2_neg<0b1111>(rhs));
        return to_rf32x4(_mm_addsub_ps(lhs_, neg_rhs));

    } else if constexpr (Mask == 0b1010) {
        // _mm_addsub_ps() directly subtracts the even and adds the odd elements.
        return to_rf32x4(_mm_addsub_ps(lhs_, rhs_));

    } else if constexpr (Mask == 0b1111) {
        return to_rf32x4(_mm_add_ps(lhs_, rhs_));

    } else {
        // Negate the elements to be subtracted, then add everything.
        ttlet neg_rhs = to_m128(f32x4_x64v2_neg<~Mask & 0xf>(rhs));
        return to_rf32x4(_mm_add_ps(lhs_, neg_rhs));
    }
}

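/** Dot product over the elements selected by the mask.
 *
 * @tparam Mask Bit i selects element i to take part in the dot product.
 * @return The dot product as a scalar, read back from element 0.
 */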
template<unsigned int Mask>
[[nodiscard]] float f32x4_x64v2_dot(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto tmp = to_rf32x4(_mm_dp_ps(to_m128(lhs), to_m128(rhs), imm8));
    return get<0>(tmp);
}

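/** Length (hypotenuse) of a vector over the elements selected by the mask.
 *
 * @tparam Mask Bit i selects element i to take part in the length.
 */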
template<unsigned int Mask>
[[nodiscard]] float f32x4_x64v2_hypot(rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto _rhs = to_m128(rhs);
    auto tmp = to_rf32x4(_mm_sqrt_ps(_mm_dp_ps(_rhs, _rhs, imm8)));
    return get<0>(tmp);
}

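/** Reciprocal of the vector's length, over the elements selected by the mask.
 *
 * Uses _mm_rsqrt_ps(), so the result is a fast approximation.
 *
 * @tparam Mask Bit i selects element i to take part in the length.
 */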
template<unsigned int Mask>
[[nodiscard]] float f32x4_x64v2_rcp_hypot(rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto _rhs = to_m128(rhs);
    auto tmp = to_rf32x4(_mm_rsqrt_ps(_mm_dp_ps(_rhs, _rhs, imm8)));
    return get<0>(tmp);
}

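/** Normalize a vector to approximately unit length.
 *
 * Elements outside the mask are set to zero in the result.
 *
 * @tparam Mask Bit i selects element i to take part in the normalization.
 */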
template<unsigned int Mask>
[[nodiscard]] rf32x4 f32x4_x64v2_normalize(rf32x4 const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");
    constexpr int dp_imm8 = (Mask << 4) | Mask;
    constexpr int zero_imm8 = ~Mask & 0xf;

    ttlet rhs_ = to_m128(rhs);
    ttlet rcp_length = _mm_rsqrt_ps(_mm_dp_ps(rhs_, rhs_, dp_imm8));
    ttlet rcp_length_ = _mm_insert_ps(rcp_length, rcp_length, zero_imm8);
    return to_rf32x4(_mm_mul_ps(rhs_, rcp_length_));
}

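/** Compare two vectors; true when all four elements are equal.
 */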
[[nodiscard]] inline bool f32x4_x64v2_eq(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    // Example 1: lhs == rhs
    //    tmp -> (1.0, 1.0, 1.0, 1.0) != (1.0, 1.0, 1.0, 1.0) -> (0,0,0,0)
    //    return -> x == 0 && y == 0 && z == 0 && w == 0 -> true

    // Example 2: lhs != rhs
    //    tmp -> (0.0, 1.0, 1.0, 1.0) != (1.0, 1.0, 1.0, 1.0) -> (1,0,0,0)
    //    return -> x == 0 && y == 0 && z == 0 && w == 0 -> false

    // Example 3: lhs != rhs
    //    tmp -> (0.0, 0.0, 0.0, 0.0) != (1.0, 1.0, 1.0, 1.0) -> (1,1,1,1)
    //    return -> x == 0 && y == 0 && z == 0 && w == 0 -> false

    auto tmp = _mm_cmpneq_ps(to_m128(lhs), to_m128(rhs));
    return _mm_testz_ps(tmp, tmp);
}

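/** The 2D cross product: lhs.x * rhs.y - lhs.y * rhs.x.
 */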
[[nodiscard]] inline float f32x4_x64v2_viktor_cross(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    // a.x * b.y - a.y * b.x
    ttlet tmp1 = _mm_permute_ps(to_m128(rhs), _MM_SHUFFLE(2, 3, 0, 1));
    ttlet tmp2 = _mm_mul_ps(to_m128(lhs), tmp1);
    ttlet tmp3 = _mm_hsub_ps(tmp2, tmp2);
    return _mm_cvtss_f32(tmp3);
}

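/** Hamilton product of two quaternions stored as (x, y, z, w).
 */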
[[nodiscard]] inline rf32x4 f32x4_x64v2_hamilton_cross(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    ttlet lhs_ = to_m128(lhs);
    ttlet rhs_ = to_m128(rhs);

    ttlet lhs_x = _mm_permute_ps(lhs_, _MM_SHUFFLE(0, 0, 0, 0));
    ttlet lhs_y = _mm_permute_ps(lhs_, _MM_SHUFFLE(1, 1, 1, 1));
    ttlet lhs_z = _mm_permute_ps(lhs_, _MM_SHUFFLE(2, 2, 2, 2));
    ttlet lhs_w = _mm_permute_ps(lhs_, _MM_SHUFFLE(3, 3, 3, 3));

    ttlet rhs_1 = _mm_permute_ps(rhs_, _MM_SHUFFLE(0, 1, 2, 3));
    ttlet rhs_2 = _mm_permute_ps(rhs_, _MM_SHUFFLE(1, 0, 3, 2));
    ttlet rhs_3 = _mm_permute_ps(rhs_, _MM_SHUFFLE(2, 3, 0, 1));

    ttlet w = _mm_mul_ps(lhs_w, rhs_);
    ttlet x = _mm_mul_ps(lhs_x, rhs_1);
    ttlet y = _mm_mul_ps(lhs_y, rhs_2);
    ttlet z = _mm_mul_ps(lhs_z, rhs_3);

    ttlet s0 = f32x4_x64v2_addsub<0b0101>(to_rf32x4(w), to_rf32x4(x));
    ttlet s1 = f32x4_x64v2_addsub<0b0011>(s0, to_rf32x4(y));
    return f32x4_x64v2_addsub<0b0110>(s1, to_rf32x4(z));
}

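/** The 3D cross product of two vectors; element w of the result is zero.
 */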
[[nodiscard]] inline rf32x4 f32x4_x64v2_cross(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    ttlet a_left = _mm_permute_ps(to_m128(lhs), _MM_SHUFFLE(3, 0, 2, 1));
    ttlet b_left = _mm_permute_ps(to_m128(rhs), _MM_SHUFFLE(3, 1, 0, 2));
    ttlet left = _mm_mul_ps(a_left, b_left);

    ttlet a_right = _mm_permute_ps(to_m128(lhs), _MM_SHUFFLE(3, 1, 0, 2));
    ttlet b_right = _mm_permute_ps(to_m128(rhs), _MM_SHUFFLE(3, 0, 2, 1));
    ttlet right = _mm_mul_ps(a_right, b_right);
    return to_rf32x4(_mm_sub_ps(left, right));
}

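/** Transpose a 4x4 matrix given as four column vectors.
 */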
[[nodiscard]] inline std::array<rf32x4, 4>
f32x4_x64v2_transpose(rf32x4 const &col0, rf32x4 const &col1, rf32x4 const &col2, rf32x4 const &col3) noexcept
{
    auto col0_ = to_m128(col0);
    auto col1_ = to_m128(col1);
    auto col2_ = to_m128(col2);
    auto col3_ = to_m128(col3);

    _MM_TRANSPOSE4_PS(col0_, col1_, col2_, col3_);

    return {to_rf32x4(col0_), to_rf32x4(col1_), to_rf32x4(col2_), to_rf32x4(col3_)};
}

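/** Build the _mm_permute_ps() immediate from four element indices.
 *
 * Non-negative indices select a source element; negative indices (literal
 * zeros and ones) keep their own lane, which is overwritten afterwards.
 */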
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_x64v2_permute_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    switch (A) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'00'00'01; break;
    case 2: r |= 0b00'00'00'10; break;
    case 3: r |= 0b00'00'00'11; break;
    case -1: r |= 0b00'00'00'00; break;
    case -2: r |= 0b00'00'00'00; break;
    }
    switch (B) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'00'01'00; break;
    case 2: r |= 0b00'00'10'00; break;
    case 3: r |= 0b00'00'11'00; break;
    case -1: r |= 0b00'00'01'00; break;
    case -2: r |= 0b00'00'01'00; break;
    }
    switch (C) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'01'00'00; break;
    case 2: r |= 0b00'10'00'00; break;
    case 3: r |= 0b00'11'00'00; break;
    case -1: r |= 0b00'10'00'00; break;
    case -2: r |= 0b00'10'00'00; break;
    }
    switch (D) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b01'00'00'00; break;
    case 2: r |= 0b10'00'00'00; break;
    case 3: r |= 0b11'00'00'00; break;
    case -1: r |= 0b11'00'00'00; break;
    case -2: r |= 0b11'00'00'00; break;
    }
    return r;
}

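/** Build a mask of the elements that are not a literal one.
 *
 * Bit i is set when index i is not -2 ('one'); used as the zero-mask when
 * building the vector of literal constants.
 */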
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_x64v2_not_one_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    r |= (A == -2) ? 0 : 0b0001;
    r |= (B == -2) ? 0 : 0b0010;
    r |= (C == -2) ? 0 : 0b0100;
    r |= (D == -2) ? 0 : 0b1000;
    return r;
}

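/** Build a mask of the elements that are literal numbers (zero or one).
 *
 * Bit i is set when index i is negative.
 */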
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_x64v2_number_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    r |= A < 0 ? 0b0001 : 0;
    r |= B < 0 ? 0b0010 : 0;
    r |= C < 0 ? 0b0100 : 0;
    r |= D < 0 ? 0b1000 : 0;
    return r;
}

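/** Swizzle the elements of a vector, optionally inserting literal constants.
 *
 * Selects at compile time the cheapest combination of _mm_permute_ps(),
 * _mm_insert_ps() and _mm_blend_ps() for the given indices.
 *
 * @tparam A,B,C,D Source element index per destination element;
 *         -1 inserts a literal zero, -2 a literal one.
 */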
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] __m128 m128_x64v2_swizzle(__m128 const &value) noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    constexpr int permute_mask = f32x4_x64v2_permute_mask<A, B, C, D>();
    constexpr int not_one_mask = f32x4_x64v2_not_one_mask<A, B, C, D>();
    constexpr int number_mask = f32x4_x64v2_number_mask<A, B, C, D>();

    __m128 swizzled;
    // Clang is able to optimize these intrinsics, MSVC is not.
    if constexpr (permute_mask != 0b11'10'01'00) {
        swizzled = _mm_permute_ps(value, permute_mask);
    } else {
        swizzled = value;
    }

    __m128 numbers;
    if constexpr (not_one_mask == 0b0000) {
        numbers = _mm_set_ps1(1.0f);
    } else if constexpr (not_one_mask == 0b1111) {
        numbers = _mm_setzero_ps();
    } else if constexpr (not_one_mask == 0b1110) {
        numbers = _mm_set_ss(1.0f);
    } else {
        ttlet _1111 = _mm_set_ps1(1.0f);
        numbers = _mm_insert_ps(_1111, _1111, not_one_mask);
    }

    __m128 result;
    if constexpr (number_mask == 0b0000) {
        result = swizzled;
    } else if constexpr (number_mask == 0b1111) {
        result = numbers;
    } else if constexpr (((not_one_mask | ~number_mask) & 0b1111) == 0b1111) {
        result = _mm_insert_ps(swizzled, swizzled, number_mask);
    } else {
        result = _mm_blend_ps(swizzled, numbers, number_mask);
    }
    return result;
}

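/** Integer variant of m128_x64v2_swizzle(), implemented by bit-casting.
 */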
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] __m128i m128i_x64v2_swizzle(__m128i const &value) noexcept
{
    return _mm_castps_si128(m128_x64v2_swizzle<A, B, C, D>(_mm_castsi128_ps(value)));
}

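/** Swizzle the elements of a typed vector.
 *
 * For example f32x4_x64v2_swizzle<0, 0, -1, -2>(v) yields (v.x, v.x, 0.0, 1.0).
 */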
template<ssize_t A = -1, ssize_t B = -1, ssize_t C = -1, ssize_t D = -1>
[[nodiscard]] rf32x4 f32x4_x64v2_swizzle(rf32x4 const &value) noexcept
{
    return to_rf32x4(m128_x64v2_swizzle<A, B, C, D>(to_m128(value)));
}

template<ssize_t A = -1, ssize_t B = -1, ssize_t C = -1, ssize_t D = -1>
[[nodiscard]] ri32x4 i32x4_x64v2_swizzle(ri32x4 const &value) noexcept
{
    return to_ri32x4(m128i_x64v2_swizzle<A, B, C, D>(to_m128i(value)));
}

template<ssize_t A = -1, ssize_t B = -1, ssize_t C = -1, ssize_t D = -1>
[[nodiscard]] ru32x4 u32x4_x64v2_swizzle(ru32x4 const &value) noexcept
{
    return to_ru32x4(m128i_x64v2_swizzle<A, B, C, D>(to_m128i(value)));
}

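/** Swizzle the two 64-bit lanes of a vector, as a pair of 32-bit swizzles.
 */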
template<ssize_t A = -1, ssize_t B = -1>
[[nodiscard]] ru64x2 u64x2_x64v2_swizzle(ru64x2 const &value) noexcept
{
    constexpr auto A1 = A >= 0 ? A * 2 : A;
    constexpr auto A2 = A >= 0 ? A1 + 1 : A1;
    constexpr auto B1 = B >= 0 ? B * 2 : B;
    constexpr auto B2 = B >= 0 ? B1 + 1 : B1;

    ttlet value_ = _mm_castsi128_ps(to_m128i(value));
    ttlet r = to_m128(f32x4_x64v2_swizzle<A1, A2, B1, B2>(to_rf32x4(value_)));
    return to_ru64x2(_mm_castps_si128(r));
}

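/** Insert an element of rhs into an element of lhs, optionally zeroing elements.
 *
 * @tparam FromElement Source element in rhs.
 * @tparam ToElement Destination element in lhs.
 * @tparam ZeroMask Bit i set: element i of the result is set to zero.
 */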
template<size_t FromElement, size_t ToElement, size_t ZeroMask>
[[nodiscard]] rf32x4 f32x4_x64v2_insert(rf32x4 const &lhs, rf32x4 const &rhs) noexcept
{
    static_assert(FromElement < 4);
    static_assert(ToElement < 4);
    static_assert(ZeroMask < 16);

    constexpr uint8_t insert_mask = static_cast<uint8_t>((FromElement << 6) | (ToElement << 4) | ZeroMask);

    return to_rf32x4(_mm_insert_ps(to_m128(lhs), to_m128(rhs), insert_mask));
}

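/** Insert a 64-bit lane of rhs into a lane of lhs, optionally zeroing lanes.
 *
 * Implemented with _mm_shuffle_pd(); when a zero-mask is given, as a pair
 * of 32-bit inserts instead.
 *
 * @tparam FromElement Source lane in rhs.
 * @tparam ToElement Destination lane in lhs.
 * @tparam ZeroMask Bit i set: lane i of the result is set to zero.
 */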
template<size_t FromElement, size_t ToElement, size_t ZeroMask>
[[nodiscard]] ru64x2 u64x2_x64v2_insert(ru64x2 const &lhs, ru64x2 const &rhs) noexcept
{
    static_assert(FromElement < 2);
    static_assert(ToElement < 2);
    static_assert(ZeroMask < 4);

    if constexpr (ZeroMask == 0) {
        auto lhs_ = _mm_castsi128_pd(to_m128i(lhs));
        auto rhs_ = _mm_castsi128_pd(to_m128i(rhs));

        __m128d r;
        if constexpr (FromElement == 0 and ToElement == 0) {
            r = _mm_shuffle_pd(rhs_, lhs_, 0b10);
        } else if constexpr (FromElement == 1 and ToElement == 0) {
            r = _mm_shuffle_pd(rhs_, lhs_, 0b11);
        } else if constexpr (FromElement == 0 and ToElement == 1) {
            r = _mm_shuffle_pd(lhs_, rhs_, 0b00);
        } else {
            r = _mm_shuffle_pd(lhs_, rhs_, 0b10);
        }

        return to_ru64x2(_mm_castpd_si128(r));

    } else {
        constexpr size_t FromElement1 = FromElement * 2;
        constexpr size_t FromElement2 = FromElement1 + 1;
        constexpr size_t ToElement1 = ToElement * 2;
        constexpr size_t ToElement2 = ToElement1 + 1;
        constexpr size_t ZeroMask2 = (ZeroMask & 1 ? 0b11 : 0b00) | (ZeroMask & 2 ? 0b1100 : 0b0000);

        ttlet lhs_ = to_rf32x4(_mm_castsi128_ps(to_m128i(lhs)));
        ttlet rhs_ = to_rf32x4(_mm_castsi128_ps(to_m128i(rhs)));
        ttlet tmp = f32x4_x64v2_insert<FromElement1, ToElement1, 0>(lhs_, rhs_);
        ttlet r = f32x4_x64v2_insert<FromElement2, ToElement2, ZeroMask2>(tmp, rhs_);

        return to_ru64x2(_mm_castps_si128(to_m128(r)));
    }
}

} // namespace tt