HikoGUI
A low latency retained GUI
f32x4_sse.hpp
// Copyright Take Vos 2020-2021.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)

#pragma once

#include <array>
#include <smmintrin.h>
#include <xmmintrin.h>
#include <pmmintrin.h>
#include <immintrin.h>

namespace tt {

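// Note: ttlet is this project's macro for an immutable local variable
// (roughly `auto const`); it is defined elsewhere in the library.
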
using f32x4_raw = std::array<float, 4>;

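/** Store an SSE register into a raw array of four floats.
 */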
[[nodiscard]] inline f32x4_raw to_f32x4_raw(__m128 const &rhs) noexcept
{
    auto r = f32x4_raw{};
    _mm_storeu_ps(r.data(), rhs);
    return r;
}

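/** Load a raw array of four floats into an SSE register.
 */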
[[nodiscard]] inline __m128 to_m128(f32x4_raw const &rhs) noexcept
{
    return _mm_loadu_ps(rhs.data());
}

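/** Round each element up toward positive infinity.
 */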
[[nodiscard]] inline f32x4_raw f32x4_sse_ceil(f32x4_raw const &rhs) noexcept
{
    return to_f32x4_raw(_mm_ceil_ps(to_m128(rhs)));
}

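/** Round each element down toward negative infinity.
 */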
[[nodiscard]] inline f32x4_raw f32x4_sse_floor(f32x4_raw const &rhs) noexcept
{
    return to_f32x4_raw(_mm_floor_ps(to_m128(rhs)));
}

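/** Round each element to an integer using the current rounding mode.
 */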
[[nodiscard]] inline f32x4_raw f32x4_sse_round(f32x4_raw const &rhs) noexcept
{
    return to_f32x4_raw(_mm_round_ps(to_m128(rhs), _MM_FROUND_CUR_DIRECTION));
}

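/** Approximate the reciprocal (1/x) of each element.
 */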
[[nodiscard]] inline f32x4_raw f32x4_sse_rcp(f32x4_raw const &rhs) noexcept
{
    return to_f32x4_raw(_mm_rcp_ps(to_m128(rhs)));
}

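/** Set the elements selected by Mask to zero.
 *
 * @tparam Mask bit 0 clears the x-element, up to bit 3 which clears the w-element.
 */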
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_clear(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0);

    if constexpr (Mask == 0b0000) {
        return rhs;
    } else if constexpr (Mask == 0b1111) {
        // 1 cycle
        return to_f32x4_raw(_mm_setzero_ps());
    } else {
        // 1 cycle
        return to_f32x4_raw(_mm_insert_ps(to_m128(rhs), to_m128(rhs), Mask));
    }
}

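/** Create a vector with the sign bit (-0.0f) set for the elements selected by Mask.
 *
 * The result can be XOR-ed with another vector to negate the selected elements.
 */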
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_make_sign() noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0);

    if constexpr (Mask == 0b0000) {
        return to_f32x4_raw(_mm_setzero_ps());

    } else if constexpr (Mask == 0b0001) {
        return to_f32x4_raw(_mm_set_ss(-0.0f));

    } else if constexpr (Mask == 0b1111) {
        return to_f32x4_raw(_mm_set_ps1(-0.0f));

    } else {
        constexpr float x = (Mask & 0b0001) == 0 ? 0.0f : -0.0f;
        constexpr float y = (Mask & 0b0010) == 0 ? 0.0f : -0.0f;
        constexpr float z = (Mask & 0b0100) == 0 ? 0.0f : -0.0f;
        constexpr float w = (Mask & 0b1000) == 0 ? 0.0f : -0.0f;
        return to_f32x4_raw(_mm_set_ps(w, z, y, x));
    }
}

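/** Negate the elements selected by Mask.
 *
 * @tparam Mask bit 0 negates the x-element, up to bit 3 which negates the w-element.
 */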
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_neg(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0);

    if constexpr (Mask == 0b0000) {
        return rhs;

    } else {
        ttlet sign = to_m128(f32x4_sse_make_sign<Mask>());
        return to_f32x4_raw(_mm_xor_ps(to_m128(rhs), sign));
    }
}

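/** Horizontal add.
 *
 * @return (lhs.x + lhs.y, lhs.z + lhs.w, rhs.x + rhs.y, rhs.z + rhs.w)
 */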
[[nodiscard]] inline f32x4_raw
f32x4_sse_hadd(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    return to_f32x4_raw(_mm_hadd_ps(to_m128(lhs), to_m128(rhs)));
}

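/** Horizontal subtract.
 *
 * @return (lhs.x - lhs.y, lhs.z - lhs.w, rhs.x - rhs.y, rhs.z - rhs.w)
 */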
[[nodiscard]] inline f32x4_raw
f32x4_sse_hsub(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    return to_f32x4_raw(_mm_hsub_ps(to_m128(lhs), to_m128(rhs)));
}

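/** Add or subtract the elements of rhs to/from the elements of lhs.
 *
 * @tparam Mask a '1' bit adds the corresponding element of rhs, a '0' bit
 *         subtracts it; bit 0 controls the x-element, bit 3 the w-element.
 */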
template<unsigned int Mask>
[[nodiscard]] inline f32x4_raw f32x4_sse_addsub(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");

    ttlet lhs_ = to_m128(lhs);
    ttlet rhs_ = to_m128(rhs);

    if constexpr (Mask == 0b0000) {
        return to_f32x4_raw(_mm_sub_ps(lhs_, rhs_));

    } else if constexpr (Mask == 0b1010) {
        // _mm_addsub_ps() subtracts the even elements and adds the odd
        // elements, which matches a mask that adds only elements 1 and 3.
        return to_f32x4_raw(_mm_addsub_ps(lhs_, rhs_));

    } else if constexpr (Mask == 0b0101) {
        // Negating rhs inverts the subtract/add pattern of _mm_addsub_ps().
        ttlet neg_rhs = to_m128(f32x4_sse_neg<0b1111>(rhs));
        return to_f32x4_raw(_mm_addsub_ps(lhs_, neg_rhs));

    } else if constexpr (Mask == 0b1111) {
        return to_f32x4_raw(_mm_add_ps(lhs_, rhs_));

    } else {
        // Negate the elements of rhs that must be subtracted, then add.
        ttlet neg_rhs = to_m128(f32x4_sse_neg<~Mask & 0xf>(rhs));
        return to_f32x4_raw(_mm_add_ps(lhs_, neg_rhs));
    }
}

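/** Dot product of the elements selected by Mask.
 *
 * @tparam Mask elements with a '1' bit take part in the dot product.
 */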
template<unsigned int Mask>
[[nodiscard]] float f32x4_sse_dot(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto tmp = to_f32x4_raw(_mm_dp_ps(to_m128(lhs), to_m128(rhs), imm8));
    return std::get<0>(tmp);
}

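/** Length of the vector formed by the elements selected by Mask.
 */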
template<unsigned int Mask>
[[nodiscard]] float f32x4_sse_hypot(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto _rhs = to_m128(rhs);
    auto tmp = to_f32x4_raw(_mm_sqrt_ps(_mm_dp_ps(_rhs, _rhs, imm8)));
    return std::get<0>(tmp);
}

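/** Approximate reciprocal of the length of the vector formed by the elements selected by Mask.
 */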
template<unsigned int Mask>
[[nodiscard]] float f32x4_sse_rcp_hypot(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");
    constexpr int imm8 = (Mask << 4) | 0x1;

    auto _rhs = to_m128(rhs);
    auto tmp = to_f32x4_raw(_mm_rsqrt_ps(_mm_dp_ps(_rhs, _rhs, imm8)));
    return std::get<0>(tmp);
}

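/** Normalize the vector formed by the elements selected by Mask.
 *
 * Elements outside the mask are set to zero. Based on the approximate
 * reciprocal square root, so the result has limited precision.
 */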
template<unsigned int Mask>
[[nodiscard]] f32x4_raw f32x4_sse_normalize(f32x4_raw const &rhs) noexcept
{
    static_assert((Mask ^ (Mask & 0xf)) == 0, "Only the lower 4 bits may be set");
    constexpr int dp_imm8 = (Mask << 4) | Mask;
    constexpr int zero_imm8 = ~Mask & 0xf;

    ttlet rhs_ = to_m128(rhs);
    ttlet rcp_length = _mm_rsqrt_ps(_mm_dp_ps(rhs_, rhs_, dp_imm8));
    ttlet rcp_length_ = _mm_insert_ps(rcp_length, rcp_length, zero_imm8);
    return to_f32x4_raw(_mm_mul_ps(rhs_, rcp_length_));
}

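/** Compare lhs == rhs element by element.
 *
 * @return a 4-bit mask; bit 0 is the x-element, bit 3 the w-element.
 */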
[[nodiscard]] inline unsigned int
f32x4_sse_eq_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpeq_ps(to_m128(lhs), to_m128(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

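/** Compare lhs != rhs element by element; returns a 4-bit mask, bit 0 is the x-element.
 */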
[[nodiscard]] inline unsigned int
f32x4_sse_ne_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpneq_ps(to_m128(lhs), to_m128(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

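/** Compare lhs < rhs element by element; returns a 4-bit mask, bit 0 is the x-element.
 */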
[[nodiscard]] inline unsigned int
f32x4_sse_lt_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmplt_ps(to_m128(lhs), to_m128(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

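/** Compare lhs > rhs element by element; returns a 4-bit mask, bit 0 is the x-element.
 */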
[[nodiscard]] inline unsigned int
f32x4_sse_gt_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpgt_ps(to_m128(lhs), to_m128(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

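/** Compare lhs <= rhs element by element; returns a 4-bit mask, bit 0 is the x-element.
 */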
[[nodiscard]] inline unsigned int
f32x4_sse_le_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmple_ps(to_m128(lhs), to_m128(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

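/** Compare lhs >= rhs element by element; returns a 4-bit mask, bit 0 is the x-element.
 */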
[[nodiscard]] inline unsigned int
f32x4_sse_ge_mask(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    auto tmp = _mm_cmpge_ps(to_m128(lhs), to_m128(rhs));
    return static_cast<unsigned int>(_mm_movemask_ps(tmp));
}

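/** Check whether all elements of lhs and rhs are equal.
 */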
[[nodiscard]] inline bool f32x4_sse_eq(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    // Example 1: lhs == rhs
    // tmp -> (1.0, 1.0, 1.0, 1.0) != (1.0, 1.0, 1.0, 1.0) -> (0,0,0,0)
    // return -> x == 0 && y == 0 && z == 0 && w == 0 -> true

    // Example 2: lhs != rhs
    // tmp -> (0.0, 1.0, 1.0, 1.0) != (1.0, 1.0, 1.0, 1.0) -> (1,0,0,0)
    // return -> x == 0 && y == 0 && z == 0 && w == 0 -> false

    // Example 3: lhs != rhs
    // tmp -> (0.0, 0.0, 0.0, 0.0) != (1.0, 1.0, 1.0, 1.0) -> (1,1,1,1)
    // return -> x == 0 && y == 0 && z == 0 && w == 0 -> false

    auto tmp = _mm_cmpneq_ps(to_m128(lhs), to_m128(rhs));
    return _mm_testz_ps(tmp, tmp);
}

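/** 2D cross product of the x- and y-elements.
 *
 * @return lhs.x * rhs.y - lhs.y * rhs.x
 */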
[[nodiscard]] inline float f32x4_sse_viktor_cross(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    // a.x * b.y - a.y * b.x
    ttlet tmp1 = _mm_permute_ps(to_m128(rhs), _MM_SHUFFLE(2, 3, 0, 1));
    ttlet tmp2 = _mm_mul_ps(to_m128(lhs), tmp1);
    ttlet tmp3 = _mm_hsub_ps(tmp2, tmp2);
    return _mm_cvtss_f32(tmp3);
}

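/** Hamilton product of two quaternions.
 *
 * The elements are interpreted as the quaternion x*i + y*j + z*k + w.
 */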
[[nodiscard]] inline f32x4_raw f32x4_sse_hamilton_cross(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    ttlet lhs_ = to_m128(lhs);
    ttlet rhs_ = to_m128(rhs);

    ttlet lhs_x = _mm_permute_ps(lhs_, _MM_SHUFFLE(0, 0, 0, 0));
    ttlet lhs_y = _mm_permute_ps(lhs_, _MM_SHUFFLE(1, 1, 1, 1));
    ttlet lhs_z = _mm_permute_ps(lhs_, _MM_SHUFFLE(2, 2, 2, 2));
    ttlet lhs_w = _mm_permute_ps(lhs_, _MM_SHUFFLE(3, 3, 3, 3));

    ttlet rhs_1 = _mm_permute_ps(rhs_, _MM_SHUFFLE(0, 1, 2, 3));
    ttlet rhs_2 = _mm_permute_ps(rhs_, _MM_SHUFFLE(1, 0, 3, 2));
    ttlet rhs_3 = _mm_permute_ps(rhs_, _MM_SHUFFLE(2, 3, 0, 1));

    ttlet w = _mm_mul_ps(lhs_w, rhs_);
    ttlet x = _mm_mul_ps(lhs_x, rhs_1);
    ttlet y = _mm_mul_ps(lhs_y, rhs_2);
    ttlet z = _mm_mul_ps(lhs_z, rhs_3);

    ttlet s0 = f32x4_sse_addsub<0b0101>(to_f32x4_raw(w), to_f32x4_raw(x));
    ttlet s1 = f32x4_sse_addsub<0b0011>(s0, to_f32x4_raw(y));
    return f32x4_sse_addsub<0b0110>(s1, to_f32x4_raw(z));
}

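/** 3D cross product of the x-, y- and z-elements.
 *
 * The w-element of the result is always zero.
 */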
[[nodiscard]] inline f32x4_raw f32x4_sse_cross(f32x4_raw const &lhs, f32x4_raw const &rhs) noexcept
{
    ttlet a_left = _mm_permute_ps(to_m128(lhs), _MM_SHUFFLE(3, 0, 2, 1));
    ttlet b_left = _mm_permute_ps(to_m128(rhs), _MM_SHUFFLE(3, 1, 0, 2));
    ttlet left = _mm_mul_ps(a_left, b_left);

    ttlet a_right = _mm_permute_ps(to_m128(lhs), _MM_SHUFFLE(3, 1, 0, 2));
    ttlet b_right = _mm_permute_ps(to_m128(rhs), _MM_SHUFFLE(3, 0, 2, 1));
    ttlet right = _mm_mul_ps(a_right, b_right);
    return to_f32x4_raw(_mm_sub_ps(left, right));
}

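/** Transpose a 4x4 matrix given as four columns.
 */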
[[nodiscard]] inline std::array<f32x4_raw, 4> f32x4_sse_transpose(
    f32x4_raw const &col0,
    f32x4_raw const &col1,
    f32x4_raw const &col2,
    f32x4_raw const &col3) noexcept
{
    auto col0_ = to_m128(col0);
    auto col1_ = to_m128(col1);
    auto col2_ = to_m128(col2);
    auto col3_ = to_m128(col3);

    _MM_TRANSPOSE4_PS(col0_, col1_, col2_, col3_);

    return {
        to_f32x4_raw(col0_),
        to_f32x4_raw(col1_),
        to_f32x4_raw(col2_),
        to_f32x4_raw(col3_)};
}

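/** Build the immediate value for _mm_permute_ps() from four element selectors.
 *
 * Selectors 0-3 pick a source element; -1 (zero) and -2 (one) keep the
 * identity index for their position so the permute can be elided when possible.
 */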
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_sse_permute_mask() noexcept
{
    static_assert(A >= -2 && A < 4);
    static_assert(B >= -2 && B < 4);
    static_assert(C >= -2 && C < 4);
    static_assert(D >= -2 && D < 4);

    int r = 0;
    switch (A) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'00'00'01; break;
    case 2: r |= 0b00'00'00'10; break;
    case 3: r |= 0b00'00'00'11; break;
    case -1: r |= 0b00'00'00'00; break;
    case -2: r |= 0b00'00'00'00; break;
    }
    switch (B) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'00'01'00; break;
    case 2: r |= 0b00'00'10'00; break;
    case 3: r |= 0b00'00'11'00; break;
    case -1: r |= 0b00'00'01'00; break;
    case -2: r |= 0b00'00'01'00; break;
    }
    switch (C) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b00'01'00'00; break;
    case 2: r |= 0b00'10'00'00; break;
    case 3: r |= 0b00'11'00'00; break;
    case -1: r |= 0b00'10'00'00; break;
    case -2: r |= 0b00'10'00'00; break;
    }
    switch (D) {
    case 0: r |= 0b00'00'00'00; break;
    case 1: r |= 0b01'00'00'00; break;
    case 2: r |= 0b10'00'00'00; break;
    case 3: r |= 0b11'00'00'00; break;
    case -1: r |= 0b11'00'00'00; break;
    case -2: r |= 0b11'00'00'00; break;
    }
    return r;
}

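/** Build a 4-bit mask with a '1' for each selector that is not the literal one (-2).
 */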
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_sse_not_one_mask() noexcept
{
    static_assert(A >= -2 && A < 4);
    static_assert(B >= -2 && B < 4);
    static_assert(C >= -2 && C < 4);
    static_assert(D >= -2 && D < 4);

    int r = 0;
    r |= (A == -2) ? 0 : 0b0001;
    r |= (B == -2) ? 0 : 0b0010;
    r |= (C == -2) ? 0 : 0b0100;
    r |= (D == -2) ? 0 : 0b1000;
    return r;
}

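/** Build a 4-bit mask with a '1' for each selector that is a literal number (zero or one).
 */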
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int f32x4_sse_number_mask() noexcept
{
    static_assert(A >= -2 && A < 4);
    static_assert(B >= -2 && B < 4);
    static_assert(C >= -2 && C < 4);
    static_assert(D >= -2 && D < 4);

    int r = 0;
    r |= A < 0 ? 0b0001 : 0;
    r |= B < 0 ? 0b0010 : 0;
    r |= C < 0 ? 0b0100 : 0;
    r |= D < 0 ? 0b1000 : 0;
    return r;
}

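/** Shuffle the elements of a vector, optionally inserting literal zeroes and ones.
 *
 * @tparam A,B,C,D select the source element (0-3) for each destination
 *         element, or insert a literal: -1 inserts 0.0f, -2 inserts 1.0f.
 *
 * Illustrative example (not from the original documentation):
 *     f32x4_sse_swizzle<0, 0, -1, -2>(v) yields (v.x, v.x, 0.0f, 1.0f).
 */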
template<ssize_t A = -1, ssize_t B = -1, ssize_t C = -1, ssize_t D = -1>
[[nodiscard]] f32x4_raw f32x4_sse_swizzle(f32x4_raw const &value) noexcept
{
    static_assert(A >= -2 && A < 4);
    static_assert(B >= -2 && B < 4);
    static_assert(C >= -2 && C < 4);
    static_assert(D >= -2 && D < 4);

    constexpr int permute_mask = f32x4_sse_permute_mask<A, B, C, D>();
    constexpr int not_one_mask = f32x4_sse_not_one_mask<A, B, C, D>();
    constexpr int number_mask = f32x4_sse_number_mask<A, B, C, D>();

    __m128 swizzled;
    // Clang is able to optimize these intrinsics, MSVC is not.
    if constexpr (permute_mask != 0b11'10'01'00) {
        swizzled = _mm_permute_ps(to_m128(value), permute_mask);
    } else {
        swizzled = to_m128(value);
    }

    __m128 numbers;
    if constexpr (not_one_mask == 0b0000) {
        numbers = _mm_set_ps1(1.0f);
    } else if constexpr (not_one_mask == 0b1111) {
        numbers = _mm_setzero_ps();
    } else if constexpr (not_one_mask == 0b1110) {
        numbers = _mm_set_ss(1.0f);
    } else {
        ttlet _1111 = _mm_set_ps1(1.0f);
        numbers = _mm_insert_ps(_1111, _1111, not_one_mask);
    }

    __m128 result;
    if constexpr (number_mask == 0b0000) {
        result = swizzled;
    } else if constexpr (number_mask == 0b1111) {
        result = numbers;
    } else if constexpr (((not_one_mask | ~number_mask) & 0b1111) == 0b1111) {
        result = _mm_insert_ps(swizzled, swizzled, number_mask);
    } else {
        result = _mm_blend_ps(swizzled, numbers, number_mask);
    }
    return to_f32x4_raw(result);
}

} // namespace tt