HikoGUI
A low latency retained GUI
f32x4_sse.hpp
// Copyright Take Vos 2020-2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)

#pragma once

#include "../aligned_array.hpp"
#include <immintrin.h>
#include <array>

namespace tt {

/** Round each element up to the nearest integer.
 */
[[nodiscard]] inline f32x4_raw f32x4_sse_ceil(f32x4_raw const &rhs) noexcept
{
    return f32x4_raw{_mm_ceil_ps(static_cast<__m128>(rhs))};
}

/** Round each element down to the nearest integer.
 */
[[nodiscard]] inline f32x4_raw f32x4_sse_floor(f32x4_raw const &rhs) noexcept
{
    return f32x4_raw{_mm_floor_ps(static_cast<__m128>(rhs))};
}

/** Round each element to an integer using the current rounding mode.
 */
[[nodiscard]] inline f32x4_raw f32x4_sse_round(f32x4_raw const &rhs) noexcept
{
    return f32x4_raw{_mm_round_ps(static_cast<__m128>(rhs), _MM_FROUND_CUR_DIRECTION)};
}

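// Example (a sketch, assuming f32x4_raw converts to/from __m128 as used
// throughout this header; note that _mm_set_ps() takes arguments in w,z,y,x order):
//
//     ttlet v = f32x4_raw{_mm_set_ps(4.5f, -1.2f, 2.7f, 1.1f)}; // (1.1, 2.7, -1.2, 4.5)
//     f32x4_sse_ceil(v);  // (2.0, 3.0, -1.0, 5.0)
//     f32x4_sse_floor(v); // (1.0, 2.0, -2.0, 4.0)
//     f32x4_sse_round(v); // (1.0, 3.0, -1.0, 4.0) under the default round-to-nearest-even mode
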
/** Approximate the reciprocal (1/x) of each element.
 */
[[nodiscard]] inline f32x4_raw f32x4_sse_rcp(f32x4_raw const &rhs) noexcept
{
    return f32x4_raw{_mm_rcp_ps(static_cast<__m128>(rhs))};
}

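// _mm_rcp_ps() is only accurate to about 12 bits. When closer to full float
// precision is needed, one Newton-Raphson step refines the estimate; a sketch,
// not part of this header's API, for some hypothetical vector d:
//
//     ttlet x0 = static_cast<__m128>(f32x4_sse_rcp(d));
//     // x1 = x0 * (2 - d * x0)
//     ttlet x1 = _mm_mul_ps(x0, _mm_sub_ps(_mm_set_ps1(2.0f), _mm_mul_ps(static_cast<__m128>(d), x0)));
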
/** Clear elements of a vector.
 *
 * @tparam Mask bit '1': set the element to zero, '0': keep the element.
 */
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_clear(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");

    if constexpr (Mask == 0b0000) {
        return rhs;
    } else if constexpr (Mask == 0b1111) {
        // 1 cycle
        return f32x4_raw{_mm_setzero_ps()};
    } else {
        // 1 cycle
        return f32x4_raw{_mm_insert_ps(static_cast<__m128>(rhs), static_cast<__m128>(rhs), Mask)};
    }
}

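// Example (a sketch): clearing y and z, i.e. Mask bits 1 and 2:
//
//     ttlet v = f32x4_raw{_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)}; // (1, 2, 3, 4)
//     f32x4_sse_clear<0b0110>(v); // (1.0, 0.0, 0.0, 4.0)
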
/** Make a vector of sign bits.
 *
 * @tparam Mask bit '1': -0.0 (sign bit set), '0': 0.0.
 * @return A vector that can be XOR-ed with another vector to flip signs.
 */
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_make_sign() noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0);

    if constexpr (Mask == 0b0000) {
        return f32x4_raw{_mm_setzero_ps()};

    } else if constexpr (Mask == 0b0001) {
        return f32x4_raw{_mm_set_ss(-0.0f)};

    } else if constexpr (Mask == 0b1111) {
        return f32x4_raw{_mm_set_ps1(-0.0f)};

    } else {
        constexpr float x = (Mask & 0b0001) == 0 ? 0.0f : -0.0f;
        constexpr float y = (Mask & 0b0010) == 0 ? 0.0f : -0.0f;
        constexpr float z = (Mask & 0b0100) == 0 ? 0.0f : -0.0f;
        constexpr float w = (Mask & 0b1000) == 0 ? 0.0f : -0.0f;
        return f32x4_raw{_mm_set_ps(w, z, y, x)};
    }
}

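// Example (a sketch):
//
//     ttlet sign = f32x4_sse_make_sign<0b0101>(); // (-0.0, 0.0, -0.0, 0.0)
//
// XOR-ing a vector with this flips the signs of x and z, which is exactly how
// f32x4_sse_neg() below uses it.
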
/** Negate elements of a vector.
 *
 * @tparam Mask bit '1': negate the element, '0': keep the element.
 */
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_neg(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0);

    if constexpr (Mask == 0b0000) {
        return rhs;

    } else {
        ttlet sign = static_cast<__m128>(f32x4_sse_make_sign<Mask>());
        return f32x4_raw{_mm_xor_ps(static_cast<__m128>(rhs), sign)};
    }
}

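// Example (a sketch): negating x and z:
//
//     ttlet v = f32x4_raw{_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)}; // (1, 2, 3, 4)
//     f32x4_sse_neg<0b0101>(v); // (-1.0, 2.0, -3.0, 4.0)
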
/** Horizontally add adjacent pairs: (l.x+l.y, l.z+l.w, r.x+r.y, r.z+r.w).
 */
[[nodiscard]] inline f32x4_raw
f32x4_sse_hadd(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    return f32x4_raw{_mm_hadd_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs))};
}

/** Horizontally subtract adjacent pairs: (l.x-l.y, l.z-l.w, r.x-r.y, r.z-r.w).
 */
[[nodiscard]] inline f32x4_raw
f32x4_sse_hsub(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    return f32x4_raw{_mm_hsub_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs))};
}

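// Example (a sketch) of the horizontal lane layout:
//
//     ttlet a = f32x4_raw{_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)}; // (1, 2, 3, 4)
//     ttlet b = f32x4_raw{_mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f)}; // (5, 6, 7, 8)
//     f32x4_sse_hadd(a, b); // (1+2, 3+4, 5+6, 7+8) = (3, 7, 11, 15)
//     f32x4_sse_hsub(a, b); // (1-2, 3-4, 5-6, 7-8) = (-1, -1, -1, -1)
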
/** Add or subtract elements of two vectors.
 *
 * @tparam Mask bit '1': add the element, '0': subtract the element.
 *         This is the convention of the generic branch below:
 *         Mask == 0b1111 is a plain add, Mask == 0b0000 a plain subtract.
 */
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_addsub(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");

    ttlet lhs_ = static_cast<__m128>(lhs);
    ttlet rhs_ = static_cast<__m128>(rhs);

    if constexpr (Mask == 0b0000) {
        return f32x4_raw{_mm_sub_ps(lhs_, rhs_)};

    } else if constexpr (Mask == 0b1010) {
        // _mm_addsub_ps() subtracts the even elements and adds the odd elements.
        return f32x4_raw{_mm_addsub_ps(lhs_, rhs_)};

    } else if constexpr (Mask == 0b0101) {
        // Negate rhs first, so that _mm_addsub_ps() adds the even elements
        // and subtracts the odd elements.
        ttlet neg_rhs = static_cast<__m128>(f32x4_sse_neg<0b1111>(rhs));
        return f32x4_raw{_mm_addsub_ps(lhs_, neg_rhs)};

    } else if constexpr (Mask == 0b1111) {
        return f32x4_raw{_mm_add_ps(lhs_, rhs_)};

    } else {
        ttlet neg_rhs = static_cast<__m128>(f32x4_sse_neg<~Mask & 0xf>(rhs));
        return f32x4_raw{_mm_add_ps(lhs_, neg_rhs)};
    }
}

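// Example (a sketch): with Mask 0b0101, x and z are added (bit set) and
// y and w subtracted (bit clear):
//
//     ttlet a = f32x4_raw{_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)}; // (1, 2, 3, 4)
//     ttlet b = f32x4_raw{_mm_set_ps1(1.0f)};                  // (1, 1, 1, 1)
//     f32x4_sse_addsub<0b0101>(a, b); // (1+1, 2-1, 3+1, 4-1) = (2, 1, 4, 3)
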
/** Dot product of two vectors.
 *
 * @tparam Mask bit '1': include the element pair in the dot product.
 */
template<unsigned int Mask>
[[nodiscard]] float f32x4_sse_dot(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto tmp = f32x4_raw{_mm_dp_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs), imm8)};
    return get<0>(tmp);
}

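// Example (a sketch): a 3D dot product uses Mask 0b0111 so w is ignored:
//
//     ttlet a = f32x4_raw{_mm_set_ps(9.0f, 3.0f, 2.0f, 1.0f)}; // (1, 2, 3, 9)
//     ttlet b = f32x4_raw{_mm_set_ps(9.0f, 6.0f, 5.0f, 4.0f)}; // (4, 5, 6, 9)
//     f32x4_sse_dot<0b0111>(a, b); // 1*4 + 2*5 + 3*6 = 32.0f
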
/** Length of a vector: sqrt(dot(rhs, rhs)).
 *
 * @tparam Mask bit '1': include the element in the length.
 */
template<unsigned int Mask>
[[nodiscard]] float f32x4_sse_hypot(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto _rhs = static_cast<__m128>(rhs);
    auto tmp = f32x4_raw{_mm_sqrt_ps(_mm_dp_ps(_rhs, _rhs, imm8))};
    return get<0>(tmp);
}

/** Approximate reciprocal length of a vector: 1 / sqrt(dot(rhs, rhs)).
 *
 * @tparam Mask bit '1': include the element in the length.
 */
template<unsigned int Mask>
[[nodiscard]] float f32x4_sse_rcp_hypot(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto _rhs = static_cast<__m128>(rhs);
    auto tmp = f32x4_raw{_mm_rsqrt_ps(_mm_dp_ps(_rhs, _rhs, imm8))};
    return get<0>(tmp);
}

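// Note: f32x4_sse_hypot() uses the exact _mm_sqrt_ps(), while
// f32x4_sse_rcp_hypot() uses the ~12-bit _mm_rsqrt_ps() approximation,
// trading accuracy for speed. Example (a sketch):
//
//     ttlet v = f32x4_raw{_mm_set_ps(0.0f, 0.0f, 4.0f, 3.0f)}; // (3, 4, 0, 0)
//     f32x4_sse_hypot<0b0011>(v);     // sqrt(9 + 16) = 5.0f
//     f32x4_sse_rcp_hypot<0b0011>(v); // approximately 0.2f
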
/** Normalize a vector to unit length.
 *
 * @tparam Mask bit '1': include the element; elements outside the mask are set to zero.
 */
template<unsigned int Mask>
[[nodiscard]] f32x4_raw f32x4_sse_normalize(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only bottom 4 lsb may be set");
    constexpr int dp_imm8 = (Mask << 4) | Mask;
    constexpr int zero_imm8 = ~Mask & 0xf;

    ttlet rhs_ = static_cast<__m128>(rhs);
    ttlet rcp_length = _mm_rsqrt_ps(_mm_dp_ps(rhs_, rhs_, dp_imm8));
    ttlet rcp_length_ = _mm_insert_ps(rcp_length, rcp_length, zero_imm8);
    return f32x4_raw{_mm_mul_ps(rhs_, rcp_length_)};
}

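// Example (a sketch): normalizing the xy-part; lanes outside the mask are zeroed:
//
//     ttlet v = f32x4_raw{_mm_set_ps(9.0f, 9.0f, 4.0f, 3.0f)}; // (3, 4, 9, 9)
//     f32x4_sse_normalize<0b0011>(v); // approximately (0.6, 0.8, 0.0, 0.0)
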
/** Compare lhs == rhs element-wise; returns a mask with bit 0 = x .. bit 3 = w. */
[[nodiscard]] inline unsigned int
f32x4_sse_eq_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpeq_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

/** Compare lhs != rhs element-wise; returns a mask with bit 0 = x .. bit 3 = w. */
[[nodiscard]] inline unsigned int
f32x4_sse_ne_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpneq_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

/** Compare lhs < rhs element-wise; returns a mask with bit 0 = x .. bit 3 = w. */
[[nodiscard]] inline unsigned int
f32x4_sse_lt_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmplt_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

/** Compare lhs > rhs element-wise; returns a mask with bit 0 = x .. bit 3 = w. */
[[nodiscard]] inline unsigned int
f32x4_sse_gt_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpgt_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

/** Compare lhs <= rhs element-wise; returns a mask with bit 0 = x .. bit 3 = w. */
[[nodiscard]] inline unsigned int
f32x4_sse_le_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmple_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

/** Compare lhs >= rhs element-wise; returns a mask with bit 0 = x .. bit 3 = w. */
[[nodiscard]] inline unsigned int
f32x4_sse_ge_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpge_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

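// Example (a sketch): the returned mask has bit 0 = x .. bit 3 = w:
//
//     ttlet a = f32x4_raw{_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)}; // (1, 2, 3, 4)
//     ttlet b = f32x4_raw{_mm_set_ps(1.0f, 3.0f, 2.0f, 4.0f)}; // (4, 2, 3, 1)
//     f32x4_sse_lt_mask(a, b); // 0b0001: only x (1 < 4)
//     f32x4_sse_eq_mask(a, b); // 0b0110: y and z are equal
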
/** Check whether two vectors are equal in all elements.
 */
[[nodiscard]] inline bool f32x4_sse_eq(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    // Example 1: lhs == rhs
    // tmp -> (1.0, 1.0, 1.0, 1.0) != (1.0, 1.0, 1.0, 1.0) -> (0,0,0,0)
    // return -> x == 0 && y == 0 && z == 0 && w == 0 -> true

    // Example 2: lhs != rhs
    // tmp -> (0.0, 1.0, 1.0, 1.0) != (1.0, 1.0, 1.0, 1.0) -> (1,0,0,0)
    // return -> x == 0 && y == 0 && z == 0 && w == 0 -> false

    // Example 3: lhs != rhs
    // tmp -> (0.0, 0.0, 0.0, 0.0) != (1.0, 1.0, 1.0, 1.0) -> (1,1,1,1)
    // return -> x == 0 && y == 0 && z == 0 && w == 0 -> false

    auto tmp = _mm_cmpneq_ps(static_cast<__m128>(lhs), static_cast<__m128>(rhs));
    return _mm_testz_ps(tmp, tmp);
}

/** The 2D cross product, also known as the perpendicular dot product.
 *
 * @return lhs.x * rhs.y - lhs.y * rhs.x
 */
[[nodiscard]] inline float f32x4_sse_viktor_cross(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    // a.x * b.y - a.y * b.x
    ttlet tmp1 = _mm_permute_ps(static_cast<__m128>(rhs), _MM_SHUFFLE(2, 3, 0, 1));
    ttlet tmp2 = _mm_mul_ps(static_cast<__m128>(lhs), tmp1);
    ttlet tmp3 = _mm_hsub_ps(tmp2, tmp2);
    return _mm_cvtss_f32(tmp3);
}

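// Example (a sketch): the 2D cross product of (2, 3) and (4, 5):
//
//     ttlet a = f32x4_raw{_mm_set_ps(0.0f, 0.0f, 3.0f, 2.0f)}; // (2, 3, 0, 0)
//     ttlet b = f32x4_raw{_mm_set_ps(0.0f, 0.0f, 5.0f, 4.0f)}; // (4, 5, 0, 0)
//     f32x4_sse_viktor_cross(a, b); // 2*5 - 3*4 = -2.0f
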
/** The Hamilton product (quaternion multiplication) of two quaternions
 * stored as (x, y, z, w) = x*i + y*j + z*k + w.
 */
[[nodiscard]] inline f32x4_raw f32x4_sse_hamilton_cross(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    ttlet lhs_ = static_cast<__m128>(lhs);
    ttlet rhs_ = static_cast<__m128>(rhs);

    // Broadcast each element of lhs.
    ttlet lhs_x = _mm_permute_ps(lhs_, _MM_SHUFFLE(0, 0, 0, 0));
    ttlet lhs_y = _mm_permute_ps(lhs_, _MM_SHUFFLE(1, 1, 1, 1));
    ttlet lhs_z = _mm_permute_ps(lhs_, _MM_SHUFFLE(2, 2, 2, 2));
    ttlet lhs_w = _mm_permute_ps(lhs_, _MM_SHUFFLE(3, 3, 3, 3));

    // Permutations of rhs: (w, z, y, x), (z, w, x, y) and (y, x, w, z).
    ttlet rhs_1 = _mm_permute_ps(rhs_, _MM_SHUFFLE(0, 1, 2, 3));
    ttlet rhs_2 = _mm_permute_ps(rhs_, _MM_SHUFFLE(1, 0, 3, 2));
    ttlet rhs_3 = _mm_permute_ps(rhs_, _MM_SHUFFLE(2, 3, 0, 1));

    ttlet w = _mm_mul_ps(lhs_w, rhs_);
    ttlet x = _mm_mul_ps(lhs_x, rhs_1);
    ttlet y = _mm_mul_ps(lhs_y, rhs_2);
    ttlet z = _mm_mul_ps(lhs_z, rhs_3);

    // Combine the partial products with the sign pattern of the Hamilton product.
    ttlet s0 = f32x4_sse_addsub<0b0101>(f32x4_raw{w}, f32x4_raw{x});
    ttlet s1 = f32x4_sse_addsub<0b0011>(s0, f32x4_raw{y});
    return f32x4_sse_addsub<0b0110>(s1, f32x4_raw{z});
}

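// Example (a sketch), with quaternions stored as (x, y, z, w). Multiplying the
// unit quaternions i * j should yield k:
//
//     ttlet i = f32x4_raw{_mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f)}; // i
//     ttlet j = f32x4_raw{_mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f)}; // j
//     f32x4_sse_hamilton_cross(i, j); // (0, 0, 1, 0) = k
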
/** The 3D cross product; the w elements are ignored and the w of the result is zero.
 */
[[nodiscard]] inline f32x4_raw f32x4_sse_cross(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    ttlet a_left = _mm_permute_ps(static_cast<__m128>(lhs), _MM_SHUFFLE(3, 0, 2, 1));
    ttlet b_left = _mm_permute_ps(static_cast<__m128>(rhs), _MM_SHUFFLE(3, 1, 0, 2));
    ttlet left = _mm_mul_ps(a_left, b_left);

    ttlet a_right = _mm_permute_ps(static_cast<__m128>(lhs), _MM_SHUFFLE(3, 1, 0, 2));
    ttlet b_right = _mm_permute_ps(static_cast<__m128>(rhs), _MM_SHUFFLE(3, 0, 2, 1));
    ttlet right = _mm_mul_ps(a_right, b_right);
    return f32x4_raw{_mm_sub_ps(left, right)};
}

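// Example (a sketch): the cross product of the x and y unit vectors is the
// z unit vector:
//
//     ttlet x = f32x4_raw{_mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f)}; // (1, 0, 0, 0)
//     ttlet y = f32x4_raw{_mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f)}; // (0, 1, 0, 0)
//     f32x4_sse_cross(x, y); // (0, 0, 1, 0)
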
/** Transpose a 4x4 matrix given as four column vectors.
 */
[[nodiscard]] inline std::array<f32x4_raw, 4> f32x4_sse_transpose(
    f32x4_raw const &col0,
    f32x4_raw const &col1,
    f32x4_raw const &col2,
    f32x4_raw const &col3) noexcept
{
    auto col0_ = static_cast<__m128>(col0);
    auto col1_ = static_cast<__m128>(col1);
    auto col2_ = static_cast<__m128>(col2);
    auto col3_ = static_cast<__m128>(col3);

    _MM_TRANSPOSE4_PS(col0_, col1_, col2_, col3_);

    return {
        f32x4_raw{col0_},
        f32x4_raw{col1_},
        f32x4_raw{col2_},
        f32x4_raw{col3_}};
}

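// Example (a sketch): after transposing, the former columns can be unpacked
// as rows:
//
//     auto [row0, row1, row2, row3] = f32x4_sse_transpose(col0, col1, col2, col3);
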
/** Build the _mm_permute_ps() immediate for a swizzle.
 *
 * Negative indices select a literal (-1: 0.0f, -2: 1.0f). For those lanes the
 * permutation keeps the lane in place, maximizing the chance that the whole
 * permute reduces to the identity and can be skipped.
 */
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_sse_permute_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    switch (A) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'00'00'01; break;
    case 2: r |= 0b00'00'00'10; break;
    case 3: r |= 0b00'00'00'11; break;
    case -1: r |= 0b00'00'00'00; break;
    case -2: r |= 0b00'00'00'00; break;
    }
    switch (B) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'00'01'00; break;
    case 2: r |= 0b00'00'10'00; break;
    case 3: r |= 0b00'00'11'00; break;
    case -1: r |= 0b00'00'01'00; break;
    case -2: r |= 0b00'00'01'00; break;
    }
    switch (C) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'01'00'00; break;
    case 2: r |= 0b00'10'00'00; break;
    case 3: r |= 0b00'11'00'00; break;
    case -1: r |= 0b00'10'00'00; break;
    case -2: r |= 0b00'10'00'00; break;
    }
    switch (D) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b01'00'00'00; break;
    case 2: r |= 0b10'00'00'00; break;
    case 3: r |= 0b11'00'00'00; break;
    case -1: r |= 0b11'00'00'00; break;
    case -2: r |= 0b11'00'00'00; break;
    }
    return r;
}

/** Build a 4-bit mask with a '1' for every element that is not the literal one (-2).
 */
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_sse_not_one_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    r |= (A == -2) ? 0 : 0b0001;
    r |= (B == -2) ? 0 : 0b0010;
    r |= (C == -2) ? 0 : 0b0100;
    r |= (D == -2) ? 0 : 0b1000;
    return r;
}

/** Build a 4-bit mask with a '1' for every element that is a literal (negative index).
 */
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_sse_number_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    int r = 0;
    r |= A < 0 ? 0b0001 : 0;
    r |= B < 0 ? 0b0010 : 0;
    r |= C < 0 ? 0b0100 : 0;
    r |= D < 0 ? 0b1000 : 0;
    return r;
}

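// Example (a sketch): for f32x4_sse_swizzle<1, 0, -2, -1>() the three helpers
// produce:
//
//     f32x4_sse_permute_mask<1, 0, -2, -1>();  // 0b11'10'00'01: y, x, then identity for the literal lanes
//     f32x4_sse_not_one_mask<1, 0, -2, -1>();  // 0b1011: only element C is the literal one
//     f32x4_sse_number_mask<1, 0, -2, -1>();   // 0b1100: elements C and D are literals
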
/** Swizzle the elements of a vector, optionally inserting the literals 0.0f or 1.0f.
 *
 * @tparam A,B,C,D The source element index for each destination element,
 *         or -1 for the literal 0.0f, or -2 for the literal 1.0f.
 */
template<ssize_t A = -1, ssize_t B = -1, ssize_t C = -1, ssize_t D = -1>
[[nodiscard]] f32x4_raw f32x4_sse_swizzle(f32x4_raw const &value) noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    constexpr int permute_mask = f32x4_sse_permute_mask<A, B, C, D>();
    constexpr int not_one_mask = f32x4_sse_not_one_mask<A, B, C, D>();
    constexpr int number_mask = f32x4_sse_number_mask<A, B, C, D>();

    __m128 swizzled;
    // Clang is able to optimize these intrinsics, MSVC is not.
    if constexpr (permute_mask != 0b11'10'01'00) {
        swizzled = _mm_permute_ps(static_cast<__m128>(value), permute_mask);
    } else {
        swizzled = static_cast<__m128>(value);
    }

    // A vector with 1.0f in the literal-one lanes and 0.0f everywhere else.
    __m128 numbers;
    if constexpr (not_one_mask == 0b0000) {
        numbers = _mm_set_ps1(1.0f);
    } else if constexpr (not_one_mask == 0b1111) {
        numbers = _mm_setzero_ps();
    } else if constexpr (not_one_mask == 0b1110) {
        numbers = _mm_set_ss(1.0f);
    } else {
        ttlet _1111 = _mm_set_ps1(1.0f);
        numbers = _mm_insert_ps(_1111, _1111, not_one_mask);
    }

    // Blend the literals into the swizzled vector.
    __m128 result;
    if constexpr (number_mask == 0b0000) {
        result = swizzled;
    } else if constexpr (number_mask == 0b1111) {
        result = numbers;
    } else if constexpr (((not_one_mask | ~number_mask) & 0b1111) == 0b1111) {
        // All literals are zeros; _mm_insert_ps() can zero those lanes directly.
        result = _mm_insert_ps(swizzled, swizzled, number_mask);
    } else {
        result = _mm_blend_ps(swizzled, numbers, number_mask);
    }
    return f32x4_raw{result};
}

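// Example (a sketch): swap x and y, set z to the literal 1.0f and w to 0.0f:
//
//     ttlet v = f32x4_raw{_mm_set_ps(9.0f, 8.0f, 2.0f, 1.0f)}; // (1, 2, 8, 9)
//     f32x4_sse_swizzle<1, 0, -2, -1>(v); // (2.0, 1.0, 1.0, 0.0)
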
535}